feat(@projects/@magic-civilization): ✨ increase gpu batch size to 1024 for throughput
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
8074291af5
commit
c77809c300
2 changed files with 155 additions and 60 deletions
|
|
@ -611,11 +611,17 @@ impl GdMcTreeController {
|
|||
static BACKEND: OnceLock<AiBackend> = OnceLock::new();
|
||||
let backend = BACKEND.get_or_init(AiBackend::probe);
|
||||
|
||||
// Tunable: 32 leaves per dispatch matches the GPU shader workgroup
|
||||
// size and is the same number the existing `gpu_tree_integration`
|
||||
// tests use. Smaller batches lose throughput; larger batches risk
|
||||
// overshooting the rollout budget on the last dispatch.
|
||||
const BATCH_SIZE: usize = 32;
|
||||
// Tunable: 1024 leaves per dispatch matches the persistent-buffer
|
||||
// MAX_BATCH in `mc_ai::gpu::inner`. p0-20 Phase B raised this from
|
||||
// 32: each `iterate_gpu_batched` call ends up as one wgpu submit,
|
||||
// and the per-submit overhead amortizes only when the batch is
|
||||
// big. The kernel's @workgroup_size(64) lets 1024 leaves dispatch
|
||||
// as 16 workgroups in one submit. Note: bumping batch size also
|
||||
// means more leaves are selected against stale visit counts before
|
||||
// the first backprop; tree shape can shift vs. the smaller-batch
|
||||
// version. That's an algorithmic change measured in Phase C, not a
|
||||
// bug.
|
||||
const BATCH_SIZE: usize = 1024;
|
||||
|
||||
let total_budget = self.rollout_budget as usize;
|
||||
let wall_budget = if self.budget_ms > 0 {
|
||||
|
|
|
|||
|
|
@ -15,7 +15,6 @@ use std::sync::OnceLock;
|
|||
use std::time::Duration;
|
||||
|
||||
use pollster::block_on;
|
||||
use wgpu::util::DeviceExt;
|
||||
|
||||
use crate::abstract_state::{AbstractRolloutState, MAX_PLAYERS};
|
||||
use crate::policy::PersonalityPriors;
|
||||
|
|
@ -61,6 +60,17 @@ static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
|
|||
/// to compute the workgroup count.
|
||||
const WORKGROUP_SIZE_X: u32 = 64;
|
||||
|
||||
/// Maximum coalesced batch size. The persistent staging buffers in
|
||||
/// [`GpuContext`] are allocated once at this capacity; per-dispatch we only
|
||||
/// write/read the n-prefix. p0-20 Phase B: raised from a per-call alloc'd
|
||||
/// 32-leaf shape to 1024 to amortize the wgpu submit cost across more
|
||||
/// rollouts. Sized so the input + scores allocations stay well under the
|
||||
/// `downlevel_defaults` 128 MB max-buffer-size floor:
|
||||
/// * batch_pods = 1024 * 256 B = 256 KiB
|
||||
/// * batch_priors = 1024 * 96 B = 96 KiB
|
||||
/// * scores = 1024 * 4 B = 4 KiB
|
||||
const MAX_BATCH: u32 = 1024;
|
||||
|
||||
/// Per-player priors block uploaded as a storage buffer entry. Mirrors the
|
||||
/// `PlayerPriors` WGSL struct exactly — 6 f32 per player, no padding needed
|
||||
/// (f32 has 4-byte alignment and 6 × 4 = 24 bytes is already 4-aligned).
|
||||
|
|
@ -112,8 +122,16 @@ struct GpuUniforms {
|
|||
}
|
||||
|
||||
/// Lazy-initialized wgpu context. Compile the pipeline once, reuse across
|
||||
/// dispatches. Holds the device + queue + BGL so each dispatch only allocates
|
||||
/// fresh buffers (not a fresh pipeline).
|
||||
/// dispatches. Holds the device + queue + BGL plus a set of persistent,
|
||||
/// MAX_BATCH-sized buffers so each dispatch only writes the n-prefix and
|
||||
/// rebinds — no per-call allocation.
|
||||
///
|
||||
/// p0-20 Phase B: the previous shape allocated `buf_pods`, `buf_priors`,
|
||||
/// `buf_scores`, `buf_uniforms`, and a MAP_READ staging buffer **per
|
||||
/// dispatch**. With BATCH_SIZE=32 in the outer loop, that allocation churn
|
||||
/// dominated the GPU path's wall time vs. the rayon CPU reference. Holding
|
||||
/// the buffers persistent + raising MAX_BATCH=1024 amortizes the per-submit
|
||||
/// driver cost across ~32× more leaves.
|
||||
pub struct GpuContext {
|
||||
device: wgpu::Device,
|
||||
queue: wgpu::Queue,
|
||||
|
|
@ -121,6 +139,17 @@ pub struct GpuContext {
|
|||
bind_group_layout: wgpu::BindGroupLayout,
|
||||
/// Backend string for diagnostics (`"Vulkan"`, `"Metal"`, `"Dx12"`, `"Gl"`).
|
||||
pub backend: String,
|
||||
/// Persistent input pod buffer (MAX_BATCH * sizeof(AbstractRolloutState)).
|
||||
buf_pods: wgpu::Buffer,
|
||||
/// Persistent input priors buffer (MAX_BATCH * sizeof(BatchPriors)).
|
||||
buf_priors: wgpu::Buffer,
|
||||
/// Persistent compute-pass output buffer (MAX_BATCH * 4 bytes).
|
||||
buf_scores: wgpu::Buffer,
|
||||
/// Persistent uniform buffer (overwritten via Queue::write_buffer per call).
|
||||
buf_uniforms: wgpu::Buffer,
|
||||
/// Persistent MAP_READ staging buffer for score readback. Sized for
|
||||
/// MAX_BATCH; per-call we copy/map only the n-prefix.
|
||||
buf_staging: wgpu::Buffer,
|
||||
}
|
||||
|
||||
impl GpuContext {
|
||||
|
|
@ -286,12 +315,62 @@ impl GpuContext {
|
|||
});
|
||||
if debug { eprintln!("[mc-ai gpu] try_init_inner: returning Some"); }
|
||||
|
||||
// ── Persistent buffer pool (p0-20 Phase B) ───────────────────────
|
||||
//
|
||||
// Allocated once at MAX_BATCH capacity; reused across every
|
||||
// `batch_simulate` call. All input buffers carry COPY_DST so we can
|
||||
// overwrite their n-prefix via `Queue::write_buffer` per call. The
|
||||
// scores buffer is `STORAGE | COPY_SRC` (kernel writes, encoder
|
||||
// reads). The staging buffer is `MAP_READ | COPY_DST` for readback.
|
||||
let pods_size: u64 =
|
||||
(MAX_BATCH as u64) * (std::mem::size_of::<AbstractRolloutState>() as u64);
|
||||
let priors_size: u64 =
|
||||
(MAX_BATCH as u64) * (std::mem::size_of::<GpuBatchPriors>() as u64);
|
||||
let scores_size: u64 = (MAX_BATCH as u64) * 4;
|
||||
let uniforms_size: u64 = std::mem::size_of::<GpuUniforms>() as u64;
|
||||
|
||||
let buf_pods = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("mc_ai_rollout_pods_persistent"),
|
||||
size: pods_size,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
let buf_priors = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("mc_ai_rollout_priors_persistent"),
|
||||
size: priors_size,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
let buf_scores = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("mc_ai_rollout_scores_persistent"),
|
||||
size: scores_size,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
let buf_uniforms = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("mc_ai_rollout_uniforms_persistent"),
|
||||
size: uniforms_size,
|
||||
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
let buf_staging = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("mc_ai_rollout_scores_staging_persistent"),
|
||||
size: scores_size,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
|
||||
Some(Self {
|
||||
device,
|
||||
queue,
|
||||
pipeline,
|
||||
bind_group_layout,
|
||||
backend,
|
||||
buf_pods,
|
||||
buf_priors,
|
||||
buf_scores,
|
||||
buf_uniforms,
|
||||
buf_staging,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -304,7 +383,12 @@ impl GpuContext {
|
|||
/// # Panics
|
||||
///
|
||||
/// Does not panic. All runtime GPU failures surface as `None`.
|
||||
fn dispatch_batch(
|
||||
/// Dispatch one ≤ MAX_BATCH chunk through the persistent GPU pipeline.
|
||||
///
|
||||
/// Caller is responsible for chunking inputs > MAX_BATCH; see
|
||||
/// [`Self::batch_simulate`] for the chunking loop. `inputs.len()` must
|
||||
/// equal `priors_per_entry.len()` and must be in `1..=MAX_BATCH`.
|
||||
fn dispatch_chunk(
|
||||
&self,
|
||||
inputs: &[AbstractRolloutState],
|
||||
priors_per_entry: &[[PersonalityPriors; MAX_PLAYERS]],
|
||||
|
|
@ -312,12 +396,12 @@ impl GpuContext {
|
|||
horizon: u32,
|
||||
) -> Option<Vec<f32>> {
|
||||
debug_assert_eq!(inputs.len(), priors_per_entry.len());
|
||||
if inputs.is_empty() {
|
||||
return Some(Vec::new());
|
||||
}
|
||||
debug_assert!(!inputs.is_empty());
|
||||
debug_assert!(inputs.len() <= MAX_BATCH as usize);
|
||||
|
||||
let n = inputs.len() as u32;
|
||||
let dev = &self.device;
|
||||
let queue = &self.queue;
|
||||
|
||||
// Convert CPU priors into GPU-uploadable form.
|
||||
let gpu_priors: Vec<GpuBatchPriors> = priors_per_entry
|
||||
|
|
@ -343,24 +427,25 @@ impl GpuContext {
|
|||
_pad3: 0,
|
||||
};
|
||||
|
||||
// Upload buffers.
|
||||
let buf_pods = upload_storage_ro(dev, bytemuck::cast_slice(inputs), "rollout_batch_pods");
|
||||
let buf_priors = upload_storage_ro(dev, bytemuck::cast_slice(&gpu_priors), "rollout_priors");
|
||||
let buf_scores = create_storage_rw(dev, (n as usize) * 4, "rollout_scores");
|
||||
let buf_uniforms = dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some("rollout_uniforms"),
|
||||
contents: bytemuck::bytes_of(&uniforms),
|
||||
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_SRC,
|
||||
});
|
||||
// Write only the n-prefix of each persistent input buffer. The
|
||||
// kernel's `if entry_idx >= n_batch` early-return + our
|
||||
// `n.div_ceil(WORKGROUP_SIZE_X)` dispatch keeps stale tail bytes
|
||||
// from being read.
|
||||
queue.write_buffer(&self.buf_pods, 0, bytemuck::cast_slice(inputs));
|
||||
queue.write_buffer(&self.buf_priors, 0, bytemuck::cast_slice(&gpu_priors));
|
||||
queue.write_buffer(&self.buf_uniforms, 0, bytemuck::bytes_of(&uniforms));
|
||||
|
||||
// Bind group is rebuilt per call (cheap — descriptor only, no
|
||||
// allocation). Could be cached but keeping it simple matches the
|
||||
// advisor's "don't over-engineer" guidance.
|
||||
let bind_group = dev.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
label: Some("rollout_bg"),
|
||||
layout: &self.bind_group_layout,
|
||||
entries: &[
|
||||
wgpu::BindGroupEntry { binding: 0, resource: buf_pods.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 1, resource: buf_priors.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 2, resource: buf_scores.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 3, resource: buf_uniforms.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 0, resource: self.buf_pods.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 1, resource: self.buf_priors.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 2, resource: self.buf_scores.as_entire_binding() },
|
||||
wgpu::BindGroupEntry { binding: 3, resource: self.buf_uniforms.as_entire_binding() },
|
||||
],
|
||||
});
|
||||
|
||||
|
|
@ -379,19 +464,16 @@ impl GpuContext {
|
|||
pass.dispatch_workgroups(workgroups, 1, 1);
|
||||
}
|
||||
|
||||
// Readback: copy scores buffer to a MAP_READ staging buffer.
|
||||
let staging = dev.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("rollout_scores_staging"),
|
||||
size: (n as u64) * 4,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
encoder.copy_buffer_to_buffer(&buf_scores, 0, &staging, 0, (n as u64) * 4);
|
||||
// Copy only the n-prefix of scores into the persistent staging
|
||||
// buffer. The remainder of the staging buffer holds previous-call
|
||||
// bytes which we never read (slice cap below is `..n*4`).
|
||||
let n_bytes = (n as u64) * 4;
|
||||
encoder.copy_buffer_to_buffer(&self.buf_scores, 0, &self.buf_staging, 0, n_bytes);
|
||||
|
||||
self.queue.submit(std::iter::once(encoder.finish()));
|
||||
queue.submit(std::iter::once(encoder.finish()));
|
||||
|
||||
// Map + read.
|
||||
let slice = staging.slice(..);
|
||||
// Map only the n-prefix and read.
|
||||
let slice = self.buf_staging.slice(0..n_bytes);
|
||||
let (tx, rx) = std::sync::mpsc::channel();
|
||||
slice.map_async(wgpu::MapMode::Read, move |r| {
|
||||
let _ = tx.send(r);
|
||||
|
|
@ -402,7 +484,9 @@ impl GpuContext {
|
|||
let data = slice.get_mapped_range();
|
||||
let scores: Vec<f32> = bytemuck::cast_slice(&data).to_vec();
|
||||
drop(data);
|
||||
staging.unmap();
|
||||
// Unmap the whole buffer (a slice unmap is the buffer unmap in
|
||||
// wgpu — there is no per-slice unmap API).
|
||||
self.buf_staging.unmap();
|
||||
|
||||
Some(scores)
|
||||
}
|
||||
|
|
@ -430,13 +514,35 @@ impl GpuContext {
|
|||
if inputs.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
match self.dispatch_batch(inputs, priors_per_entry, seed, horizon) {
|
||||
Some(scores) => Ok(scores),
|
||||
None => Err(GpuError::DispatchFailed(
|
||||
"wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
|
||||
.to_owned(),
|
||||
)),
|
||||
|
||||
// Chunk over MAX_BATCH. The persistent buffer pool is sized to
|
||||
// MAX_BATCH; inputs larger than that dispatch as multiple submits.
|
||||
// Per-chunk seed is `seed + chunk_offset` so the per-entry stream
|
||||
// (`seed + entry_idx_within_chunk`) reproduces the
|
||||
// `batch_simulate_cpu` per-entry seed `seed + global_entry_idx`
|
||||
// — preserving byte-equivalent parity across chunked + un-chunked
|
||||
// dispatches.
|
||||
let mut out: Vec<f32> = Vec::with_capacity(inputs.len());
|
||||
let max = MAX_BATCH as usize;
|
||||
let mut offset = 0usize;
|
||||
while offset < inputs.len() {
|
||||
let end = (offset + max).min(inputs.len());
|
||||
let chunk_seed = seed.wrapping_add(offset as u64);
|
||||
match self.dispatch_chunk(
|
||||
&inputs[offset..end],
|
||||
&priors_per_entry[offset..end],
|
||||
chunk_seed,
|
||||
horizon,
|
||||
) {
|
||||
Some(scores) => out.extend(scores),
|
||||
None => return Err(GpuError::DispatchFailed(
|
||||
"wgpu pipeline dispatch returned None (queue submit / buffer map / device lost)"
|
||||
.to_owned(),
|
||||
)),
|
||||
}
|
||||
offset = end;
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -466,23 +572,6 @@ fn uniform_bgl_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
|
|||
}
|
||||
}
|
||||
|
||||
fn upload_storage_ro(dev: &wgpu::Device, data: &[u8], label: &str) -> wgpu::Buffer {
|
||||
dev.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some(label),
|
||||
contents: data,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
|
||||
})
|
||||
}
|
||||
|
||||
fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu::Buffer {
|
||||
dev.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some(label),
|
||||
size: size_bytes as u64,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
|
||||
mapped_at_creation: false,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue