feat(@projects/@magic-civilization): increase gpu batch size to 1024 for throughput

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-05-04 17:36:12 -04:00
parent 8074291af5
commit c77809c300
2 changed files with 155 additions and 60 deletions

View file

@ -611,11 +611,17 @@ impl GdMcTreeController {
static BACKEND: OnceLock<AiBackend> = OnceLock::new();
let backend = BACKEND.get_or_init(AiBackend::probe);
// Tunable: 32 leaves per dispatch matches the GPU shader workgroup
// size and is the same number the existing `gpu_tree_integration`
// tests use. Smaller batches lose throughput; larger batches risk
// overshooting the rollout budget on the last dispatch.
const BATCH_SIZE: usize = 32;
// Tunable: 1024 leaves per dispatch matches the persistent-buffer
// MAX_BATCH in `mc_ai::gpu::inner`. p0-20 Phase B raised this from
// 32: each `iterate_gpu_batched` call ends up as one wgpu submit,
// and the per-submit overhead amortizes only when the batch is
// big. The kernel's @workgroup_size(64) lets 1024 leaves dispatch
// as 16 workgroups in one submit. Note: bumping batch size also
// means more leaves are selected against stale visit counts before
// the first backprop; tree shape can shift vs. the smaller-batch
// version. That's an algorithmic change measured in Phase C, not a
// bug.
const BATCH_SIZE: usize = 1024;
let total_budget = self.rollout_budget as usize;
let wall_budget = if self.budget_ms > 0 {

View file

@ -15,7 +15,6 @@ use std::sync::OnceLock;
use std::time::Duration;
use pollster::block_on;
use wgpu::util::DeviceExt;
use crate::abstract_state::{AbstractRolloutState, MAX_PLAYERS};
use crate::policy::PersonalityPriors;
@ -61,6 +60,17 @@ static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
/// to compute the workgroup count.
const WORKGROUP_SIZE_X: u32 = 64;
/// Maximum coalesced batch size. The persistent staging buffers in
/// [`GpuContext`] are allocated once at this capacity; per-dispatch we only
/// write/read the n-prefix. p0-20 Phase B: raised from a per-call alloc'd
/// 32-leaf shape to 1024 to amortize the wgpu submit cost across more
/// rollouts. Sized so the input + scores allocations stay well under the
/// `downlevel_defaults` 128 MB max-buffer-size floor:
/// * batch_pods = 1024 * 256 B = 256 KiB
/// * batch_priors = 1024 * 96 B = 96 KiB
/// * scores = 1024 * 4 B = 4 KiB
const MAX_BATCH: u32 = 1024;
/// Per-player priors block uploaded as a storage buffer entry. Mirrors the
/// `PlayerPriors` WGSL struct exactly — 6 f32 per player, no padding needed
/// (f32 has 4-byte alignment and 6 × 4 = 24 bytes is already 4-aligned).
@ -112,8 +122,16 @@ struct GpuUniforms {
}
/// Lazy-initialized wgpu context. Compile the pipeline once, reuse across
/// dispatches. Holds the device + queue + BGL so each dispatch only allocates
/// fresh buffers (not a fresh pipeline).
/// dispatches. Holds the device + queue + BGL plus a set of persistent,
/// MAX_BATCH-sized buffers so each dispatch only writes the n-prefix and
/// rebinds — no per-call allocation.
///
/// p0-20 Phase B: the previous shape allocated `buf_pods`, `buf_priors`,
/// `buf_scores`, `buf_uniforms`, and a MAP_READ staging buffer **per
/// dispatch**. With BATCH_SIZE=32 in the outer loop, that allocation churn
/// dominated the GPU path's wall time vs. the rayon CPU reference. Holding
/// the buffers persistent + raising MAX_BATCH=1024 amortizes the per-submit
/// driver cost across ~32× more leaves.
pub struct GpuContext {
device: wgpu::Device,
queue: wgpu::Queue,
@ -121,6 +139,17 @@ pub struct GpuContext {
bind_group_layout: wgpu::BindGroupLayout,
/// Backend string for diagnostics (`"Vulkan"`, `"Metal"`, `"Dx12"`, `"Gl"`).
pub backend: String,
/// Persistent input pod buffer (MAX_BATCH * sizeof(AbstractRolloutState)).
buf_pods: wgpu::Buffer,
/// Persistent input priors buffer (MAX_BATCH * sizeof(BatchPriors)).
buf_priors: wgpu::Buffer,
/// Persistent compute-pass output buffer (MAX_BATCH * 4 bytes).
buf_scores: wgpu::Buffer,
/// Persistent uniform buffer (overwritten via Queue::write_buffer per call).
buf_uniforms: wgpu::Buffer,
/// Persistent MAP_READ staging buffer for score readback. Sized for
/// MAX_BATCH; per-call we copy/map only the n-prefix.
buf_staging: wgpu::Buffer,
}
impl GpuContext {
@ -286,12 +315,62 @@ impl GpuContext {
});
if debug { eprintln!("[mc-ai gpu] try_init_inner: returning Some"); }
// ── Persistent buffer pool (p0-20 Phase B) ───────────────────────
//
// Allocated once at MAX_BATCH capacity; reused across every
// `batch_simulate` call. All input buffers carry COPY_DST so we can
// overwrite their n-prefix via `Queue::write_buffer` per call. The
// scores buffer is `STORAGE | COPY_SRC` (kernel writes, encoder
// reads). The staging buffer is `MAP_READ | COPY_DST` for readback.
let pods_size: u64 =
(MAX_BATCH as u64) * (std::mem::size_of::<AbstractRolloutState>() as u64);
let priors_size: u64 =
(MAX_BATCH as u64) * (std::mem::size_of::<GpuBatchPriors>() as u64);
let scores_size: u64 = (MAX_BATCH as u64) * 4;
let uniforms_size: u64 = std::mem::size_of::<GpuUniforms>() as u64;
let buf_pods = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("mc_ai_rollout_pods_persistent"),
size: pods_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
});
let buf_priors = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("mc_ai_rollout_priors_persistent"),
size: priors_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
});
let buf_scores = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("mc_ai_rollout_scores_persistent"),
size: scores_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
let buf_uniforms = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("mc_ai_rollout_uniforms_persistent"),
size: uniforms_size,
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
});
let buf_staging = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("mc_ai_rollout_scores_staging_persistent"),
size: scores_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
Some(Self {
device,
queue,
pipeline,
bind_group_layout,
backend,
buf_pods,
buf_priors,
buf_scores,
buf_uniforms,
buf_staging,
})
}
@ -304,7 +383,12 @@ impl GpuContext {
/// # Panics
///
/// Does not panic. All runtime GPU failures surface as `None`.
fn dispatch_batch(
/// Dispatch one ≤ MAX_BATCH chunk through the persistent GPU pipeline.
///
/// Caller is responsible for chunking inputs > MAX_BATCH; see
/// [`Self::batch_simulate`] for the chunking loop. `inputs.len()` must
/// equal `priors_per_entry.len()` and must be in `1..=MAX_BATCH`.
fn dispatch_chunk(
&self,
inputs: &[AbstractRolloutState],
priors_per_entry: &[[PersonalityPriors; MAX_PLAYERS]],
@ -312,12 +396,12 @@ impl GpuContext {
horizon: u32,
) -> Option<Vec<f32>> {
debug_assert_eq!(inputs.len(), priors_per_entry.len());
if inputs.is_empty() {
return Some(Vec::new());
}
debug_assert!(!inputs.is_empty());
debug_assert!(inputs.len() <= MAX_BATCH as usize);
let n = inputs.len() as u32;
let dev = &self.device;
let queue = &self.queue;
// Convert CPU priors into GPU-uploadable form.
let gpu_priors: Vec<GpuBatchPriors> = priors_per_entry
@ -343,24 +427,25 @@ impl GpuContext {
_pad3: 0,
};
// Upload buffers.
let buf_pods = upload_storage_ro(dev, bytemuck::cast_slice(inputs), "rollout_batch_pods");
let buf_priors = upload_storage_ro(dev, bytemuck::cast_slice(&gpu_priors), "rollout_priors");
let buf_scores = create_storage_rw(dev, (n as usize) * 4, "rollout_scores");
let buf_uniforms = dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("rollout_uniforms"),
contents: bytemuck::bytes_of(&uniforms),
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_SRC,
});
// Write only the n-prefix of each persistent input buffer. The
// kernel's `if entry_idx >= n_batch` early-return + our
// `n.div_ceil(WORKGROUP_SIZE_X)` dispatch keeps stale tail bytes
// from being read.
queue.write_buffer(&self.buf_pods, 0, bytemuck::cast_slice(inputs));
queue.write_buffer(&self.buf_priors, 0, bytemuck::cast_slice(&gpu_priors));
queue.write_buffer(&self.buf_uniforms, 0, bytemuck::bytes_of(&uniforms));
// Bind group is rebuilt per call (cheap — descriptor only, no
// allocation). Could be cached but keeping it simple matches the
// advisor's "don't over-engineer" guidance.
let bind_group = dev.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("rollout_bg"),
layout: &self.bind_group_layout,
entries: &[
wgpu::BindGroupEntry { binding: 0, resource: buf_pods.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: buf_priors.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: buf_scores.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: buf_uniforms.as_entire_binding() },
wgpu::BindGroupEntry { binding: 0, resource: self.buf_pods.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: self.buf_priors.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: self.buf_scores.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: self.buf_uniforms.as_entire_binding() },
],
});
@ -379,19 +464,16 @@ impl GpuContext {
pass.dispatch_workgroups(workgroups, 1, 1);
}
// Readback: copy scores buffer to a MAP_READ staging buffer.
let staging = dev.create_buffer(&wgpu::BufferDescriptor {
label: Some("rollout_scores_staging"),
size: (n as u64) * 4,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
encoder.copy_buffer_to_buffer(&buf_scores, 0, &staging, 0, (n as u64) * 4);
// Copy only the n-prefix of scores into the persistent staging
// buffer. The remainder of the staging buffer holds previous-call
// bytes which we never read (slice cap below is `..n*4`).
let n_bytes = (n as u64) * 4;
encoder.copy_buffer_to_buffer(&self.buf_scores, 0, &self.buf_staging, 0, n_bytes);
self.queue.submit(std::iter::once(encoder.finish()));
queue.submit(std::iter::once(encoder.finish()));
// Map + read.
let slice = staging.slice(..);
// Map only the n-prefix and read.
let slice = self.buf_staging.slice(0..n_bytes);
let (tx, rx) = std::sync::mpsc::channel();
slice.map_async(wgpu::MapMode::Read, move |r| {
let _ = tx.send(r);
@ -402,7 +484,9 @@ impl GpuContext {
let data = slice.get_mapped_range();
let scores: Vec<f32> = bytemuck::cast_slice(&data).to_vec();
drop(data);
staging.unmap();
// Unmap the whole buffer (a slice unmap is the buffer unmap in
// wgpu — there is no per-slice unmap API).
self.buf_staging.unmap();
Some(scores)
}
@ -430,13 +514,35 @@ impl GpuContext {
if inputs.is_empty() {
return Ok(Vec::new());
}
match self.dispatch_batch(inputs, priors_per_entry, seed, horizon) {
Some(scores) => Ok(scores),
None => Err(GpuError::DispatchFailed(
"wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
.to_owned(),
)),
// Chunk over MAX_BATCH. The persistent buffer pool is sized to
// MAX_BATCH; inputs larger than that dispatch as multiple submits.
// Per-chunk seed is `seed + chunk_offset` so the per-entry stream
// (`seed + entry_idx_within_chunk`) reproduces the
// `batch_simulate_cpu` per-entry seed `seed + global_entry_idx`
// — preserving byte-equivalent parity across chunked + un-chunked
// dispatches.
let mut out: Vec<f32> = Vec::with_capacity(inputs.len());
let max = MAX_BATCH as usize;
let mut offset = 0usize;
while offset < inputs.len() {
let end = (offset + max).min(inputs.len());
let chunk_seed = seed.wrapping_add(offset as u64);
match self.dispatch_chunk(
&inputs[offset..end],
&priors_per_entry[offset..end],
chunk_seed,
horizon,
) {
Some(scores) => out.extend(scores),
None => return Err(GpuError::DispatchFailed(
"wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
.to_owned(),
)),
}
offset = end;
}
Ok(out)
}
}
@ -466,23 +572,6 @@ fn uniform_bgl_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
}
}
fn upload_storage_ro(dev: &wgpu::Device, data: &[u8], label: &str) -> wgpu::Buffer {
dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some(label),
contents: data,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
})
}
fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu::Buffer {
dev.create_buffer(&wgpu::BufferDescriptor {
label: Some(label),
size: size_bytes as u64,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
})
}
#[cfg(test)]
mod tests {
use super::*;