feat(@projects/@magic-civilization): ✨ increase gpu batch size to 1024 for throughput

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-04 17:36:12 -04:00 · 2026-05-04 17:36:12 -04:00 · c77809c300
commit c77809c300
parent 8074291af5
2 changed files with 155 additions and 60 deletions
--- a/src/simulator/api-gdext/src/ai.rs
+++ b/src/simulator/api-gdext/src/ai.rs
@ -611,11 +611,17 @@ impl GdMcTreeController {
        static BACKEND: OnceLock<AiBackend> = OnceLock::new();
        let backend = BACKEND.get_or_init(AiBackend::probe);

-        // Tunable: 32 leaves per dispatch matches the GPU shader workgroup
-        // size and is the same number the existing `gpu_tree_integration`
-        // tests use. Smaller batches lose throughput; larger batches risk
-        // overshooting the rollout budget on the last dispatch.
-        const BATCH_SIZE: usize = 32;
+        // Tunable: 1024 leaves per dispatch matches the persistent-buffer
+        // MAX_BATCH in `mc_ai::gpu::inner`. p0-20 Phase B raised this from
+        // 32: each `iterate_gpu_batched` call ends up as one wgpu submit,
+        // and the per-submit overhead amortizes only when the batch is
+        // big. The kernel's @workgroup_size(64) lets 1024 leaves dispatch
+        // as 16 workgroups in one submit. Note: bumping batch size also
+        // means more leaves are selected against stale visit counts before
+        // the first backprop; tree shape can shift vs. the smaller-batch
+        // version. That's an algorithmic change measured in Phase C, not a
+        // bug.
+        const BATCH_SIZE: usize = 1024;

        let total_budget = self.rollout_budget as usize;
        let wall_budget = if self.budget_ms > 0 {
--- a/src/simulator/crates/mc-ai/src/gpu/inner.rs
+++ b/src/simulator/crates/mc-ai/src/gpu/inner.rs
@ -15,7 +15,6 @@ use std::sync::OnceLock;
 use std::time::Duration;

 use pollster::block_on;
-use wgpu::util::DeviceExt;

 use crate::abstract_state::{AbstractRolloutState, MAX_PLAYERS};
 use crate::policy::PersonalityPriors;
@ -61,6 +60,17 @@ static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
 /// to compute the workgroup count.
 const WORKGROUP_SIZE_X: u32 = 64;

+/// Maximum coalesced batch size. The persistent staging buffers in
+/// [`GpuContext`] are allocated once at this capacity; per-dispatch we only
+/// write/read the n-prefix. p0-20 Phase B: raised from a per-call alloc'd
+/// 32-leaf shape to 1024 to amortize the wgpu submit cost across more
+/// rollouts. Sized so the input + scores allocations stay well under the
+/// `downlevel_defaults` 128 MB max-buffer-size floor:
+///   * batch_pods   = 1024 * 256 B = 256 KiB
+///   * batch_priors = 1024 *  96 B =  96 KiB
+///   * scores       = 1024 *   4 B =   4 KiB
+const MAX_BATCH: u32 = 1024;
+
 /// Per-player priors block uploaded as a storage buffer entry. Mirrors the
 /// `PlayerPriors` WGSL struct exactly — 6 f32 per player, no padding needed
 /// (f32 has 4-byte alignment and 6 × 4 = 24 bytes is already 4-aligned).
@ -112,8 +122,16 @@ struct GpuUniforms {
 }

 /// Lazy-initialized wgpu context. Compile the pipeline once, reuse across
-/// dispatches. Holds the device + queue + BGL so each dispatch only allocates
-/// fresh buffers (not a fresh pipeline).
+/// dispatches. Holds the device + queue + BGL plus a set of persistent,
+/// MAX_BATCH-sized buffers so each dispatch only writes the n-prefix and
+/// rebinds — no per-call allocation.
+///
+/// p0-20 Phase B: the previous shape allocated `buf_pods`, `buf_priors`,
+/// `buf_scores`, `buf_uniforms`, and a MAP_READ staging buffer **per
+/// dispatch**. With BATCH_SIZE=32 in the outer loop, that allocation churn
+/// dominated the GPU path's wall time vs. the rayon CPU reference. Holding
+/// the buffers persistent + raising MAX_BATCH=1024 amortizes the per-submit
+/// driver cost across ~32× more leaves.
 pub struct GpuContext {
    device: wgpu::Device,
    queue: wgpu::Queue,
@ -121,6 +139,17 @@ pub struct GpuContext {
    bind_group_layout: wgpu::BindGroupLayout,
    /// Backend string for diagnostics (`"Vulkan"`, `"Metal"`, `"Dx12"`, `"Gl"`).
    pub backend: String,
+    /// Persistent input pod buffer (MAX_BATCH * sizeof(AbstractRolloutState)).
+    buf_pods:     wgpu::Buffer,
+    /// Persistent input priors buffer (MAX_BATCH * sizeof(BatchPriors)).
+    buf_priors:   wgpu::Buffer,
+    /// Persistent compute-pass output buffer (MAX_BATCH * 4 bytes).
+    buf_scores:   wgpu::Buffer,
+    /// Persistent uniform buffer (overwritten via Queue::write_buffer per call).
+    buf_uniforms: wgpu::Buffer,
+    /// Persistent MAP_READ staging buffer for score readback. Sized for
+    /// MAX_BATCH; per-call we copy/map only the n-prefix.
+    buf_staging:  wgpu::Buffer,
 }

 impl GpuContext {
@ -286,12 +315,62 @@ impl GpuContext {
        });
        if debug { eprintln!("[mc-ai gpu] try_init_inner: returning Some"); }

+        // ── Persistent buffer pool (p0-20 Phase B) ───────────────────────
+        //
+        // Allocated once at MAX_BATCH capacity; reused across every
+        // `batch_simulate` call. All input buffers carry COPY_DST so we can
+        // overwrite their n-prefix via `Queue::write_buffer` per call. The
+        // scores buffer is `STORAGE | COPY_SRC` (kernel writes, encoder
+        // reads). The staging buffer is `MAP_READ | COPY_DST` for readback.
+        let pods_size: u64 =
+            (MAX_BATCH as u64) * (std::mem::size_of::<AbstractRolloutState>() as u64);
+        let priors_size: u64 =
+            (MAX_BATCH as u64) * (std::mem::size_of::<GpuBatchPriors>() as u64);
+        let scores_size: u64 = (MAX_BATCH as u64) * 4;
+        let uniforms_size: u64 = std::mem::size_of::<GpuUniforms>() as u64;
+
+        let buf_pods = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("mc_ai_rollout_pods_persistent"),
+            size: pods_size,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+        let buf_priors = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("mc_ai_rollout_priors_persistent"),
+            size: priors_size,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+        let buf_scores = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("mc_ai_rollout_scores_persistent"),
+            size: scores_size,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        });
+        let buf_uniforms = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("mc_ai_rollout_uniforms_persistent"),
+            size: uniforms_size,
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+        let buf_staging = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("mc_ai_rollout_scores_staging_persistent"),
+            size: scores_size,
+            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+            mapped_at_creation: false,
+        });
+
        Some(Self {
            device,
            queue,
            pipeline,
            bind_group_layout,
            backend,
+            buf_pods,
+            buf_priors,
+            buf_scores,
+            buf_uniforms,
+            buf_staging,
        })
    }

@ -304,7 +383,12 @@ impl GpuContext {
    /// # Panics
    ///
    /// Does not panic. All runtime GPU failures surface as `None`.
-    fn dispatch_batch(
+    /// Dispatch one ≤ MAX_BATCH chunk through the persistent GPU pipeline.
+    ///
+    /// Caller is responsible for chunking inputs > MAX_BATCH; see
+    /// [`Self::batch_simulate`] for the chunking loop. `inputs.len()` must
+    /// equal `priors_per_entry.len()` and must be in `1..=MAX_BATCH`.
+    fn dispatch_chunk(
        &self,
        inputs: &[AbstractRolloutState],
        priors_per_entry: &[[PersonalityPriors; MAX_PLAYERS]],
@ -312,12 +396,12 @@ impl GpuContext {
        horizon: u32,
    ) -> Option<Vec<f32>> {
        debug_assert_eq!(inputs.len(), priors_per_entry.len());
-        if inputs.is_empty() {
-            return Some(Vec::new());
-        }
+        debug_assert!(!inputs.is_empty());
+        debug_assert!(inputs.len() <= MAX_BATCH as usize);

        let n = inputs.len() as u32;
        let dev = &self.device;
+        let queue = &self.queue;

        // Convert CPU priors into GPU-uploadable form.
        let gpu_priors: Vec<GpuBatchPriors> = priors_per_entry
@ -343,24 +427,25 @@ impl GpuContext {
            _pad3: 0,
        };

-        // Upload buffers.
-        let buf_pods = upload_storage_ro(dev, bytemuck::cast_slice(inputs), "rollout_batch_pods");
-        let buf_priors = upload_storage_ro(dev, bytemuck::cast_slice(&gpu_priors), "rollout_priors");
-        let buf_scores = create_storage_rw(dev, (n as usize) * 4, "rollout_scores");
-        let buf_uniforms = dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
-            label: Some("rollout_uniforms"),
-            contents: bytemuck::bytes_of(&uniforms),
-            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_SRC,
-        });
+        // Write only the n-prefix of each persistent input buffer. The
+        // kernel's `if entry_idx >= n_batch` early-return + our
+        // `n.div_ceil(WORKGROUP_SIZE_X)` dispatch keeps stale tail bytes
+        // from being read.
+        queue.write_buffer(&self.buf_pods, 0, bytemuck::cast_slice(inputs));
+        queue.write_buffer(&self.buf_priors, 0, bytemuck::cast_slice(&gpu_priors));
+        queue.write_buffer(&self.buf_uniforms, 0, bytemuck::bytes_of(&uniforms));

+        // Bind group is rebuilt per call (cheap — descriptor only, no
+        // allocation). Could be cached but keeping it simple matches the
+        // advisor's "don't over-engineer" guidance.
        let bind_group = dev.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("rollout_bg"),
            layout: &self.bind_group_layout,
            entries: &[
-                wgpu::BindGroupEntry { binding: 0, resource: buf_pods.as_entire_binding() },
-                wgpu::BindGroupEntry { binding: 1, resource: buf_priors.as_entire_binding() },
-                wgpu::BindGroupEntry { binding: 2, resource: buf_scores.as_entire_binding() },
-                wgpu::BindGroupEntry { binding: 3, resource: buf_uniforms.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 0, resource: self.buf_pods.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 1, resource: self.buf_priors.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 2, resource: self.buf_scores.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 3, resource: self.buf_uniforms.as_entire_binding() },
            ],
        });

@ -379,19 +464,16 @@ impl GpuContext {
            pass.dispatch_workgroups(workgroups, 1, 1);
        }

-        // Readback: copy scores buffer to a MAP_READ staging buffer.
-        let staging = dev.create_buffer(&wgpu::BufferDescriptor {
-            label: Some("rollout_scores_staging"),
-            size: (n as u64) * 4,
-            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
-            mapped_at_creation: false,
-        });
-        encoder.copy_buffer_to_buffer(&buf_scores, 0, &staging, 0, (n as u64) * 4);
+        // Copy only the n-prefix of scores into the persistent staging
+        // buffer. The remainder of the staging buffer holds previous-call
+        // bytes which we never read (slice cap below is `..n*4`).
+        let n_bytes = (n as u64) * 4;
+        encoder.copy_buffer_to_buffer(&self.buf_scores, 0, &self.buf_staging, 0, n_bytes);

-        self.queue.submit(std::iter::once(encoder.finish()));
+        queue.submit(std::iter::once(encoder.finish()));

-        // Map + read.
-        let slice = staging.slice(..);
+        // Map only the n-prefix and read.
+        let slice = self.buf_staging.slice(0..n_bytes);
        let (tx, rx) = std::sync::mpsc::channel();
        slice.map_async(wgpu::MapMode::Read, move |r| {
            let _ = tx.send(r);
@ -402,7 +484,9 @@ impl GpuContext {
        let data = slice.get_mapped_range();
        let scores: Vec<f32> = bytemuck::cast_slice(&data).to_vec();
        drop(data);
-        staging.unmap();
+        // Unmap the whole buffer (a slice unmap is the buffer unmap in
+        // wgpu — there is no per-slice unmap API).
+        self.buf_staging.unmap();

        Some(scores)
    }
@ -430,13 +514,35 @@ impl GpuContext {
        if inputs.is_empty() {
            return Ok(Vec::new());
        }
-        match self.dispatch_batch(inputs, priors_per_entry, seed, horizon) {
-            Some(scores) => Ok(scores),
-            None => Err(GpuError::DispatchFailed(
-                "wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
-                    .to_owned(),
-            )),
+
+        // Chunk over MAX_BATCH. The persistent buffer pool is sized to
+        // MAX_BATCH; inputs larger than that dispatch as multiple submits.
+        // Per-chunk seed is `seed + chunk_offset` so the per-entry stream
+        // (`seed + entry_idx_within_chunk`) reproduces the
+        // `batch_simulate_cpu` per-entry seed `seed + global_entry_idx`
+        // — preserving byte-equivalent parity across chunked + un-chunked
+        // dispatches.
+        let mut out: Vec<f32> = Vec::with_capacity(inputs.len());
+        let max = MAX_BATCH as usize;
+        let mut offset = 0usize;
+        while offset < inputs.len() {
+            let end = (offset + max).min(inputs.len());
+            let chunk_seed = seed.wrapping_add(offset as u64);
+            match self.dispatch_chunk(
+                &inputs[offset..end],
+                &priors_per_entry[offset..end],
+                chunk_seed,
+                horizon,
+            ) {
+                Some(scores) => out.extend(scores),
+                None => return Err(GpuError::DispatchFailed(
+                    "wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
+                        .to_owned(),
+                )),
+            }
+            offset = end;
        }
+        Ok(out)
    }
 }

@ -466,23 +572,6 @@ fn uniform_bgl_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
    }
 }

-fn upload_storage_ro(dev: &wgpu::Device, data: &[u8], label: &str) -> wgpu::Buffer {
-    dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
-        label: Some(label),
-        contents: data,
-        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
-    })
-}
-
-fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu::Buffer {
-    dev.create_buffer(&wgpu::BufferDescriptor {
-        label: Some(label),
-        size: size_bytes as u64,
-        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
-        mapped_at_creation: false,
-    })
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;