diff --git a/src/simulator/api-gdext/src/ai.rs b/src/simulator/api-gdext/src/ai.rs index 02301142..22d24730 100644 --- a/src/simulator/api-gdext/src/ai.rs +++ b/src/simulator/api-gdext/src/ai.rs @@ -611,11 +611,17 @@ impl GdMcTreeController { static BACKEND: OnceLock = OnceLock::new(); let backend = BACKEND.get_or_init(AiBackend::probe); - // Tunable: 32 leaves per dispatch matches the GPU shader workgroup - // size and is the same number the existing `gpu_tree_integration` - // tests use. Smaller batches lose throughput; larger batches risk - // overshooting the rollout budget on the last dispatch. - const BATCH_SIZE: usize = 32; + // Tunable: 1024 leaves per dispatch matches the persistent-buffer + // MAX_BATCH in `mc_ai::gpu::inner`. p0-20 Phase B raised this from + // 32: each `iterate_gpu_batched` call ends up as one wgpu submit, + // and the per-submit overhead amortizes only when the batch is + // big. The kernel's @workgroup_size(64) lets 1024 leaves dispatch + // as 16 workgroups in one submit. Note: bumping batch size also + // means more leaves are selected against stale visit counts before + // the first backprop; tree shape can shift vs. the smaller-batch + // version. That's an algorithmic change measured in Phase C, not a + // bug. + const BATCH_SIZE: usize = 1024; let total_budget = self.rollout_budget as usize; let wall_budget = if self.budget_ms > 0 { diff --git a/src/simulator/crates/mc-ai/src/gpu/inner.rs b/src/simulator/crates/mc-ai/src/gpu/inner.rs index dbdc2585..2d0abc3d 100644 --- a/src/simulator/crates/mc-ai/src/gpu/inner.rs +++ b/src/simulator/crates/mc-ai/src/gpu/inner.rs @@ -15,7 +15,6 @@ use std::sync::OnceLock; use std::time::Duration; use pollster::block_on; -use wgpu::util::DeviceExt; use crate::abstract_state::{AbstractRolloutState, MAX_PLAYERS}; use crate::policy::PersonalityPriors; @@ -61,6 +60,17 @@ static GPU_SHARED: OnceLock> = OnceLock::new(); /// to compute the workgroup count. const WORKGROUP_SIZE_X: u32 = 64; +/// Maximum coalesced batch size. The persistent staging buffers in +/// [`GpuContext`] are allocated once at this capacity; per-dispatch we only +/// write/read the n-prefix. p0-20 Phase B: raised from a per-call alloc'd +/// 32-leaf shape to 1024 to amortize the wgpu submit cost across more +/// rollouts. Sized so the input + scores allocations stay well under the +/// `downlevel_defaults` 128 MB max-buffer-size floor: +/// * batch_pods = 1024 * 256 B = 256 KiB +/// * batch_priors = 1024 * 96 B = 96 KiB +/// * scores = 1024 * 4 B = 4 KiB +const MAX_BATCH: u32 = 1024; + /// Per-player priors block uploaded as a storage buffer entry. Mirrors the /// `PlayerPriors` WGSL struct exactly — 6 f32 per player, no padding needed /// (f32 has 4-byte alignment and 6 × 4 = 24 bytes is already 4-aligned). @@ -112,8 +122,16 @@ struct GpuUniforms { } /// Lazy-initialized wgpu context. Compile the pipeline once, reuse across -/// dispatches. Holds the device + queue + BGL so each dispatch only allocates -/// fresh buffers (not a fresh pipeline). +/// dispatches. Holds the device + queue + BGL plus a set of persistent, +/// MAX_BATCH-sized buffers so each dispatch only writes the n-prefix and +/// rebinds — no per-call allocation. +/// +/// p0-20 Phase B: the previous shape allocated `buf_pods`, `buf_priors`, +/// `buf_scores`, `buf_uniforms`, and a MAP_READ staging buffer **per +/// dispatch**. With BATCH_SIZE=32 in the outer loop, that allocation churn +/// dominated the GPU path's wall time vs. the rayon CPU reference. Holding +/// the buffers persistent + raising MAX_BATCH=1024 amortizes the per-submit +/// driver cost across ~32× more leaves. pub struct GpuContext { device: wgpu::Device, queue: wgpu::Queue, @@ -121,6 +139,17 @@ pub struct GpuContext { bind_group_layout: wgpu::BindGroupLayout, /// Backend string for diagnostics (`"Vulkan"`, `"Metal"`, `"Dx12"`, `"Gl"`). pub backend: String, + /// Persistent input pod buffer (MAX_BATCH * sizeof(AbstractRolloutState)). + buf_pods: wgpu::Buffer, + /// Persistent input priors buffer (MAX_BATCH * sizeof(BatchPriors)). + buf_priors: wgpu::Buffer, + /// Persistent compute-pass output buffer (MAX_BATCH * 4 bytes). + buf_scores: wgpu::Buffer, + /// Persistent uniform buffer (overwritten via Queue::write_buffer per call). + buf_uniforms: wgpu::Buffer, + /// Persistent MAP_READ staging buffer for score readback. Sized for + /// MAX_BATCH; per-call we copy/map only the n-prefix. + buf_staging: wgpu::Buffer, } impl GpuContext { @@ -286,12 +315,62 @@ impl GpuContext { }); if debug { eprintln!("[mc-ai gpu] try_init_inner: returning Some"); } + // ── Persistent buffer pool (p0-20 Phase B) ─────────────────────── + // + // Allocated once at MAX_BATCH capacity; reused across every + // `batch_simulate` call. All input buffers carry COPY_DST so we can + // overwrite their n-prefix via `Queue::write_buffer` per call. The + // scores buffer is `STORAGE | COPY_SRC` (kernel writes, encoder + // reads). The staging buffer is `MAP_READ | COPY_DST` for readback. + let pods_size: u64 = + (MAX_BATCH as u64) * (std::mem::size_of::() as u64); + let priors_size: u64 = + (MAX_BATCH as u64) * (std::mem::size_of::() as u64); + let scores_size: u64 = (MAX_BATCH as u64) * 4; + let uniforms_size: u64 = std::mem::size_of::() as u64; + + let buf_pods = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("mc_ai_rollout_pods_persistent"), + size: pods_size, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let buf_priors = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("mc_ai_rollout_priors_persistent"), + size: priors_size, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let buf_scores = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("mc_ai_rollout_scores_persistent"), + size: scores_size, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + let buf_uniforms = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("mc_ai_rollout_uniforms_persistent"), + size: uniforms_size, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let buf_staging = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("mc_ai_rollout_scores_staging_persistent"), + size: scores_size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + Some(Self { device, queue, pipeline, bind_group_layout, backend, + buf_pods, + buf_priors, + buf_scores, + buf_uniforms, + buf_staging, }) } @@ -304,7 +383,12 @@ impl GpuContext { /// # Panics /// /// Does not panic. All runtime GPU failures surface as `None`. - fn dispatch_batch( + /// Dispatch one ≤ MAX_BATCH chunk through the persistent GPU pipeline. + /// + /// Caller is responsible for chunking inputs > MAX_BATCH; see + /// [`Self::batch_simulate`] for the chunking loop. `inputs.len()` must + /// equal `priors_per_entry.len()` and must be in `1..=MAX_BATCH`. + fn dispatch_chunk( &self, inputs: &[AbstractRolloutState], priors_per_entry: &[[PersonalityPriors; MAX_PLAYERS]], @@ -312,12 +396,12 @@ impl GpuContext { horizon: u32, ) -> Option> { debug_assert_eq!(inputs.len(), priors_per_entry.len()); - if inputs.is_empty() { - return Some(Vec::new()); - } + debug_assert!(!inputs.is_empty()); + debug_assert!(inputs.len() <= MAX_BATCH as usize); let n = inputs.len() as u32; let dev = &self.device; + let queue = &self.queue; // Convert CPU priors into GPU-uploadable form. let gpu_priors: Vec = priors_per_entry @@ -343,24 +427,25 @@ impl GpuContext { _pad3: 0, }; - // Upload buffers. - let buf_pods = upload_storage_ro(dev, bytemuck::cast_slice(inputs), "rollout_batch_pods"); - let buf_priors = upload_storage_ro(dev, bytemuck::cast_slice(&gpu_priors), "rollout_priors"); - let buf_scores = create_storage_rw(dev, (n as usize) * 4, "rollout_scores"); - let buf_uniforms = dev.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("rollout_uniforms"), - contents: bytemuck::bytes_of(&uniforms), - usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_SRC, - }); + // Write only the n-prefix of each persistent input buffer. The + // kernel's `if entry_idx >= n_batch` early-return + our + // `n.div_ceil(WORKGROUP_SIZE_X)` dispatch keeps stale tail bytes + // from being read. + queue.write_buffer(&self.buf_pods, 0, bytemuck::cast_slice(inputs)); + queue.write_buffer(&self.buf_priors, 0, bytemuck::cast_slice(&gpu_priors)); + queue.write_buffer(&self.buf_uniforms, 0, bytemuck::bytes_of(&uniforms)); + // Bind group is rebuilt per call (cheap — descriptor only, no + // allocation). Could be cached but keeping it simple matches the + // advisor's "don't over-engineer" guidance. let bind_group = dev.create_bind_group(&wgpu::BindGroupDescriptor { label: Some("rollout_bg"), layout: &self.bind_group_layout, entries: &[ - wgpu::BindGroupEntry { binding: 0, resource: buf_pods.as_entire_binding() }, - wgpu::BindGroupEntry { binding: 1, resource: buf_priors.as_entire_binding() }, - wgpu::BindGroupEntry { binding: 2, resource: buf_scores.as_entire_binding() }, - wgpu::BindGroupEntry { binding: 3, resource: buf_uniforms.as_entire_binding() }, + wgpu::BindGroupEntry { binding: 0, resource: self.buf_pods.as_entire_binding() }, + wgpu::BindGroupEntry { binding: 1, resource: self.buf_priors.as_entire_binding() }, + wgpu::BindGroupEntry { binding: 2, resource: self.buf_scores.as_entire_binding() }, + wgpu::BindGroupEntry { binding: 3, resource: self.buf_uniforms.as_entire_binding() }, ], }); @@ -379,19 +464,16 @@ impl GpuContext { pass.dispatch_workgroups(workgroups, 1, 1); } - // Readback: copy scores buffer to a MAP_READ staging buffer. - let staging = dev.create_buffer(&wgpu::BufferDescriptor { - label: Some("rollout_scores_staging"), - size: (n as u64) * 4, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, - }); - encoder.copy_buffer_to_buffer(&buf_scores, 0, &staging, 0, (n as u64) * 4); + // Copy only the n-prefix of scores into the persistent staging + // buffer. The remainder of the staging buffer holds previous-call + // bytes which we never read (slice cap below is `..n*4`). + let n_bytes = (n as u64) * 4; + encoder.copy_buffer_to_buffer(&self.buf_scores, 0, &self.buf_staging, 0, n_bytes); - self.queue.submit(std::iter::once(encoder.finish())); + queue.submit(std::iter::once(encoder.finish())); - // Map + read. - let slice = staging.slice(..); + // Map only the n-prefix and read. + let slice = self.buf_staging.slice(0..n_bytes); let (tx, rx) = std::sync::mpsc::channel(); slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); @@ -402,7 +484,9 @@ impl GpuContext { let data = slice.get_mapped_range(); let scores: Vec = bytemuck::cast_slice(&data).to_vec(); drop(data); - staging.unmap(); + // Unmap the whole buffer (a slice unmap is the buffer unmap in + // wgpu — there is no per-slice unmap API). + self.buf_staging.unmap(); Some(scores) } @@ -430,13 +514,35 @@ impl GpuContext { if inputs.is_empty() { return Ok(Vec::new()); } - match self.dispatch_batch(inputs, priors_per_entry, seed, horizon) { - Some(scores) => Ok(scores), - None => Err(GpuError::DispatchFailed( - "wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)" - .to_owned(), - )), + + // Chunk over MAX_BATCH. The persistent buffer pool is sized to + // MAX_BATCH; inputs larger than that dispatch as multiple submits. + // Per-chunk seed is `seed + chunk_offset` so the per-entry stream + // (`seed + entry_idx_within_chunk`) reproduces the + // `batch_simulate_cpu` per-entry seed `seed + global_entry_idx` + // — preserving byte-equivalent parity across chunked + un-chunked + // dispatches. + let mut out: Vec = Vec::with_capacity(inputs.len()); + let max = MAX_BATCH as usize; + let mut offset = 0usize; + while offset < inputs.len() { + let end = (offset + max).min(inputs.len()); + let chunk_seed = seed.wrapping_add(offset as u64); + match self.dispatch_chunk( + &inputs[offset..end], + &priors_per_entry[offset..end], + chunk_seed, + horizon, + ) { + Some(scores) => out.extend(scores), + None => return Err(GpuError::DispatchFailed( + "wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)" + .to_owned(), + )), + } + offset = end; } + Ok(out) } } @@ -466,23 +572,6 @@ fn uniform_bgl_entry(binding: u32) -> wgpu::BindGroupLayoutEntry { } } -fn upload_storage_ro(dev: &wgpu::Device, data: &[u8], label: &str) -> wgpu::Buffer { - dev.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some(label), - contents: data, - usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, - }) -} - -fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu::Buffer { - dev.create_buffer(&wgpu::BufferDescriptor { - label: Some(label), - size: size_bytes as u64, - usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, - mapped_at_creation: false, - }) -} - #[cfg(test)] mod tests { use super::*;