feat(@projects/@magic-civilization): ✨ implement gpu ai backend probe infrastructure
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
039c31a079
commit
0e724b3949
9 changed files with 279 additions and 64 deletions
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generated_at": "2026-05-04T17:18:27Z",
|
||||
"generated_at": "2026-05-04T20:31:13Z",
|
||||
"totals": {
|
||||
"done": 155,
|
||||
"in_progress": 1,
|
||||
|
|
|
|||
|
|
@ -7,10 +7,14 @@ scope: game1
|
|||
owner: warcouncil
|
||||
updated_at: 2026-05-04
|
||||
evidence:
|
||||
- .project/handoffs/20260504_p0-20-coalesce-blocker.md
|
||||
- src/simulator/crates/mc-ai/src/mcts_tree.rs
|
||||
- src/simulator/api-gdext/src/ai.rs
|
||||
- src/simulator/crates/mc-mcts-service/src/server.rs
|
||||
- "src/simulator/crates/mc-ai/src/backend.rs:1-170 (new AiBackend enum + probe + env override + BackendError)"
|
||||
- "src/simulator/crates/mc-ai/src/gpu/inner.rs:404-432 (batch_simulate now Result<Vec<f32>, GpuError>, no per-call CPU fallback)"
|
||||
- src/simulator/crates/mc-ai/src/gpu/mod.rs (top-level batch_simulate + cfg(not(gpu)) shim deleted)
|
||||
- "src/simulator/crates/mc-ai/src/mcts_tree.rs:368-440 (iterate_gpu_batched takes &AiBackend; gpu_context field + with_gpu_context deleted)"
|
||||
- "src/simulator/api-gdext/src/ai.rs:140-200 (GdMcTreeController.ai_backend probed once at init; gpu_enabled+gpu_context_if_enabled deleted)"
|
||||
- "src/simulator/crates/mc-mcts-service/src/server.rs:30-39 (probes + logs AiBackend at startup)"
|
||||
- "src/simulator/crates/mc-ai/tests/backend_probe.rs (4 new tests, all green)"
|
||||
- cargo test -p mc-ai --features gpu green; gpu_rollout_parity byte-identical on 209 inputs
|
||||
---
|
||||
## Summary
|
||||
|
||||
|
|
@ -251,3 +255,59 @@ GPU; Option B: switch `choose_action` to `iterate_gpu_batched`) — both
|
|||
require user sign-off because they change the rollout reward source from
|
||||
full-fidelity `McSnapshot::step` to `AbstractRolloutState`+heuristic
|
||||
policy, which can perturb the 90% victory-rate baseline.
|
||||
|
||||
## Phase 1 (2026-05-04) — AiBackend infra hygiene
|
||||
|
||||
Strictly infra plumbing — runtime behaviour of `choose_action` is
|
||||
**unchanged**. The Phase-2 action-space switch
|
||||
(`Tree<McSnapshot>` → `Tree<GameRolloutState>`) is a separate, larger phase
|
||||
gated on user sign-off — see the 2026-05-04 handoff for context.
|
||||
|
||||
### Landed in this cycle
|
||||
|
||||
- `mc-ai/src/backend.rs` — new `AiBackend` enum (`Gpu(&'static GpuContext)`
|
||||
vs `Cpu`) + `BackendError` + `probe()`. Boot-probed at construction; the
|
||||
decision is fixed for the session. `MC_AI_BACKEND=cpu|gpu` env override
|
||||
for tests / CI / mobile-dev (cpu forces Cpu, gpu forces probe-Gpu and
|
||||
panics if no adapter). Logs adapter name on stderr at probe time.
|
||||
- `gpu::inner::GpuContext::batch_simulate` signature changed from
|
||||
`Vec<(f32, RolloutPath)>` to `Result<Vec<f32>, GpuError>`. Per-call
|
||||
silent CPU fallback inside `batch_simulate` is **deleted** — runtime
|
||||
GPU dispatch failures now surface as `Err`, never silently degrade.
|
||||
- Top-level `gpu::inner::batch_simulate` + `batch_simulate_default_horizon`
|
||||
+ `cfg(not(feature="gpu"))` shim in `gpu/mod.rs` **deleted**. Single
|
||||
dispatch entry point is `AiBackend::batch_simulate`.
|
||||
- `Tree::iterate_gpu_batched` now takes `backend: &AiBackend` instead of
|
||||
threading an `Option<&'static GpuContext>` via `Tree::with_gpu_context`.
|
||||
`Tree::gpu_context` field + `with_gpu_context` method **deleted**.
|
||||
`gpu_batch_count` retained — bumps on `Ok(_)` when backend is `Gpu(_)`.
|
||||
- `GdMcTreeController` gains `ai_backend: AiBackend` field, probed once at
|
||||
`init()`. `gpu_enabled` field + `gpu_context_if_enabled()` helper
|
||||
**deleted**. `set_gpu_enabled` retained as a no-op stub so the GDScript
|
||||
`ai_turn_bridge.gd` keeps compiling unchanged. `with_gpu_context` calls
|
||||
in `choose_action` / `choose_action_with_stats` removed (they were
|
||||
no-ops anyway — `simulate_parallel` never consulted the field).
|
||||
- `mc-mcts-service::server::run` probes `AiBackend` at startup and logs
|
||||
the chosen backend (`info!(backend = %ai_backend.name(), ...)`). The
|
||||
request handler still uses `simulate_parallel` — Phase 2 wires the
|
||||
backend into the search itself.
|
||||
- `cpu_reference::batch_simulate_cpu` **untouched** — algorithm equivalent
|
||||
to GPU shader, byte-by-byte, on `AbstractRolloutState`. Invariant.
|
||||
|
||||
### Verification
|
||||
|
||||
- `cargo test -p mc-ai` (no `gpu`) — 232 lib + 4 backend_probe + 8 + 7
|
||||
+ 5 + 11 + 9 + 23 + 8 = all green.
|
||||
- `cargo test -p mc-ai --features gpu` — 240 lib + 4 backend_probe + 5
|
||||
parity + 4 gpu_tree_integration + all other suites green. **Parity test
|
||||
byte-identical on 209 inputs** (16 + 65 + 128 across small / partial /
|
||||
multi workgroup) — algorithm untouched.
|
||||
- `cargo build -p magic-civ-physics-gdext` — green.
|
||||
- `cargo build -p mc-mcts-service` — green.
|
||||
- `cargo check --workspace` — green.
|
||||
- `python3 tools/objectives-report.py` — clean (246 objectives).
|
||||
|
||||
`choose_action` and `choose_action_with_stats` remain on
|
||||
`Tree<McSnapshot>` + `simulate_parallel` for Phase 1. No empirical baseline
|
||||
re-run — runtime behaviour bit-equivalent (CPU rollout closure unchanged).
|
||||
Status stays `partial`. Phases 2-5 still ahead.
|
||||
|
|
|
|||
|
|
@ -1,13 +1,13 @@
|
|||
{
|
||||
"generated_at": "2026-05-04T11:32:13Z",
|
||||
"generated_at": "2026-05-04T20:29:33Z",
|
||||
"totals": {
|
||||
"in_progress": 1,
|
||||
"done": 153,
|
||||
"stub": 27,
|
||||
"partial": 28,
|
||||
"stub": 27,
|
||||
"oos": 28,
|
||||
"in_progress": 1,
|
||||
"done": 154,
|
||||
"missing": 6,
|
||||
"total": 243
|
||||
"total": 244
|
||||
},
|
||||
"objectives": [
|
||||
{
|
||||
|
|
@ -440,6 +440,16 @@
|
|||
"updated_at": "2026-04-19",
|
||||
"summary": "Movement is currently a silent left-click on a reachable hex — no path shown, no\nconfirmation step. Players expect the Civ-style flow: enter movement mode (M key\nor Move button), see a path preview, right-click to confirm. This objective\nadds the full movement-mode state machine, path rendering, fog-of-war-aware\npathing, and the Move button on the unit action panel with disabled-state\ntooltips for all action buttons.\n\nDepends on **p0-33** (unit panel must be in the scene tree before the Move\nbutton can be wired)."
|
||||
},
|
||||
{
|
||||
"id": "p0-45",
|
||||
"title": "Turn processor consolidation — entities/ duplicate caused T1 SCRIPT ERROR halt",
|
||||
"priority": "p0",
|
||||
"status": "done",
|
||||
"scope": "game1",
|
||||
"owner": "shipwright",
|
||||
"updated_at": "2026-05-04",
|
||||
"summary": ""
|
||||
},
|
||||
{
|
||||
"id": "p0-20",
|
||||
"title": "GPU-accelerated MCTS rollouts for look-ahead decision-making",
|
||||
|
|
@ -852,12 +862,12 @@
|
|||
},
|
||||
{
|
||||
"id": "p1-38",
|
||||
"title": "Biome → economy coupling — population & luxury driven by live ecology",
|
||||
"title": "\"Biome → economy coupling — population & luxury driven by live ecology\"",
|
||||
"priority": "p1",
|
||||
"status": "partial",
|
||||
"scope": "game1",
|
||||
"owner": "shipwright",
|
||||
"updated_at": "2026-05-03",
|
||||
"updated_at": "2026-05-04",
|
||||
"summary": "Population growth and luxury supply have been decoupled from the live ecology\nsimulation since `mc-flora` was wired up. Cities read static per-terrain food\nyields (`grassland.food=2`, `plains.food=1`); 70 fauna species exist purely\nas combat encounters with no contribution to the city economy; the\n`mc-happiness::get_growth_modifier` tiering (1.25 / 1.00 / 0.50 / 0.00) was\ncomputed but unused on the GDScript side. This objective re-couples the\ncity economy to the ecology layer in four phases (C → A → B → D), each\nsized to land independently with its own balance regression risk.\n\nThe four phases were approved together as a single `p1` objective in plan\n`~/.claude/plans/hi-so-in-valiant-mango.md` (2026-04-27), but ship in\nsequence so `p1-05`'s baseline bands (median `pop_peak=69`, batch\n`p016b_20260417_024754`) are not disturbed."
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ use std::time::{Duration, Instant};
|
|||
|
||||
use godot::prelude::*;
|
||||
use mc_ai::abstract_state::MAX_PLAYERS;
|
||||
use mc_ai::backend::AiBackend;
|
||||
use mc_ai::evaluator::{ScoringEvaluator, ScoringWeights};
|
||||
use mc_ai::game_state::{AiPlayerState, StrategicWeights};
|
||||
use mc_ai::gpu::GpuContext;
|
||||
use mc_ai::mcts::XorShift64;
|
||||
use mc_ai::mcts_tree::{rollout_snapshot, Tree};
|
||||
use mc_ai::tactical::{decide_tactical_actions, Action, TacticalEphemerals, TacticalMap, TacticalState, TacticalTile};
|
||||
|
|
@ -148,12 +148,11 @@ pub struct GdMcTreeController {
|
|||
/// exceeds the budget. Set via `set_budget_ms` (driven by
|
||||
/// `MCTS_DECISION_BUDGET_MS` env on the GDScript side). See p1-22.
|
||||
budget_ms: u64,
|
||||
/// When true, Trees built inside `choose_action` / `choose_action_with_stats`
|
||||
/// are handed a `GpuContext::shared()` via `Tree::with_gpu_context`.
|
||||
/// Toggled by `set_gpu_enabled` (driven by `AI_GPU_ROLLOUT` env on the
|
||||
/// GDScript side) or directly by callers. Default `false` preserves the
|
||||
/// historical CPU-only path until the env flag flips the switch.
|
||||
gpu_enabled: bool,
|
||||
/// Boot-probed AI backend used by batched-rollout call sites (Phase 2+
|
||||
/// of p0-20). Phase 1 plumbs this onto the controller and logs the
|
||||
/// adapter at construction; the live `choose_action` path still uses
|
||||
/// `Tree::simulate_parallel` with CPU rollouts.
|
||||
ai_backend: AiBackend,
|
||||
/// When true, Trees use PUCT selection with per-node priors instead of
|
||||
/// classical UCB1 (p0-38). Toggled by `set_priors_enabled` (driven by
|
||||
/// `AI_MCTS_PRIORS` env). Default `true`; set `AI_MCTS_PRIORS=false` to
|
||||
|
|
@ -168,43 +167,26 @@ pub struct GdMcTreeController {
|
|||
#[godot_api]
|
||||
impl IRefCounted for GdMcTreeController {
|
||||
fn init(base: Base<RefCounted>) -> Self {
|
||||
// Honor AI_GPU_ROLLOUT at construction so callers that never call
|
||||
// `set_gpu_enabled` still pick up the env flag. The GDScript bridge
|
||||
// calls `set_gpu_enabled` explicitly; this is a belt-and-suspenders
|
||||
// default for direct Rust/headless users.
|
||||
let gpu_enabled = matches!(
|
||||
std::env::var("AI_GPU_ROLLOUT").as_deref(),
|
||||
Ok("1") | Ok("true") | Ok("TRUE") | Ok("True")
|
||||
);
|
||||
let priors_enabled = !matches!(
|
||||
std::env::var("AI_MCTS_PRIORS").as_deref(),
|
||||
Ok("0") | Ok("false") | Ok("FALSE") | Ok("False")
|
||||
);
|
||||
// Probe the AI backend exactly once at construction. Logs the
|
||||
// chosen backend (Gpu(adapter) or Cpu) on stderr — visible in
|
||||
// game.log alongside Godot's own startup chatter.
|
||||
let ai_backend = AiBackend::probe();
|
||||
godot_print!("GdMcTreeController: AiBackend probed = {}", ai_backend.name());
|
||||
Self {
|
||||
rollout_budget: 1000,
|
||||
rollout_depth: 20,
|
||||
budget_ms: 0,
|
||||
gpu_enabled,
|
||||
ai_backend,
|
||||
priors_enabled,
|
||||
base,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GdMcTreeController {
|
||||
/// Return the process-wide GPU context when `gpu_enabled` is set and an
|
||||
/// adapter is actually available, otherwise `None`. Threaded into every
|
||||
/// Tree this controller builds; falls through to CPU silently when the
|
||||
/// host has no working compute adapter.
|
||||
fn gpu_context_if_enabled(&self) -> Option<&'static GpuContext> {
|
||||
if self.gpu_enabled {
|
||||
GpuContext::shared()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[godot_api]
|
||||
impl GdMcTreeController {
|
||||
/// Set the per-call rollout budget (default: 1000).
|
||||
|
|
@ -230,16 +212,18 @@ impl GdMcTreeController {
|
|||
self.budget_ms = ms.max(0) as u64;
|
||||
}
|
||||
|
||||
/// Enable or disable GPU rollout dispatch for this controller. When
|
||||
/// enabled, Trees constructed inside `choose_action` /
|
||||
/// `choose_action_with_stats` receive `GpuContext::shared()` via
|
||||
/// `Tree::with_gpu_context`. The actual dispatch still falls back to CPU
|
||||
/// when no adapter is available — see `mc_ai::gpu::GpuContext::shared`.
|
||||
///
|
||||
/// Called from `ai_turn_bridge.gd` based on the `AI_GPU_ROLLOUT` env.
|
||||
/// Phase-1 stub: GPU enable is now decided once at construction by
|
||||
/// `AiBackend::probe()`. The setter is retained so the GDScript
|
||||
/// `ai_turn_bridge.gd` shim keeps compiling without code changes; calls
|
||||
/// are logged but no longer toggle behaviour. Phase 2+ removes this
|
||||
/// surface alongside the GDScript-side env-flag lookup.
|
||||
#[func]
|
||||
fn set_gpu_enabled(&mut self, enabled: bool) {
|
||||
self.gpu_enabled = enabled;
|
||||
godot_print!(
|
||||
"GdMcTreeController::set_gpu_enabled({}) ignored — backend fixed at boot to {}",
|
||||
enabled,
|
||||
self.ai_backend.name()
|
||||
);
|
||||
}
|
||||
|
||||
/// Enable or disable PUCT selection with per-node priors (p0-38).
|
||||
|
|
@ -308,8 +292,7 @@ impl GdMcTreeController {
|
|||
godot_print!("mcts: local");
|
||||
|
||||
let depth = self.rollout_depth;
|
||||
let mut tree = Tree::new(snapshot)
|
||||
.with_gpu_context(self.gpu_context_if_enabled());
|
||||
let mut tree = Tree::new(snapshot);
|
||||
tree.use_priors = self.priors_enabled;
|
||||
|
||||
let rollout_fn = move |snap: &McSnapshot, rng: &mut XorShift64| -> f32 {
|
||||
|
|
@ -472,8 +455,7 @@ impl GdMcTreeController {
|
|||
godot_print!("mcts: local");
|
||||
|
||||
let depth = self.rollout_depth;
|
||||
let mut tree = Tree::new(snapshot)
|
||||
.with_gpu_context(self.gpu_context_if_enabled());
|
||||
let mut tree = Tree::new(snapshot);
|
||||
tree.use_priors = self.priors_enabled;
|
||||
|
||||
let rollout_fn = move |snap: &McSnapshot, rng: &mut XorShift64| -> f32 {
|
||||
|
|
|
|||
|
|
@ -497,6 +497,7 @@ mod tests {
|
|||
wealth: 3.0,
|
||||
trade_willingness: 3.0,
|
||||
grudge_persistence: 7.0,
|
||||
..PersonalityPriors::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -508,6 +509,7 @@ mod tests {
|
|||
wealth: 2.0,
|
||||
trade_willingness: 2.0,
|
||||
grudge_persistence: 9.0,
|
||||
..PersonalityPriors::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
141
src/simulator/crates/mc-ai/tests/backend_probe.rs
Normal file
141
src/simulator/crates/mc-ai/tests/backend_probe.rs
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
//! Phase-1 contract tests for [`mc_ai::backend::AiBackend::probe`].
|
||||
//!
|
||||
//! Pins the boot-time backend selection rules:
|
||||
//!
|
||||
//! 1. Default probe picks `Gpu` when an adapter is available, else `Cpu`.
|
||||
//! 2. `MC_AI_BACKEND=cpu` always forces `Cpu`, regardless of adapter.
|
||||
//! 3. `MC_AI_BACKEND=gpu` panics if no adapter is available.
|
||||
//!
|
||||
//! These tests serialise on the `MC_AI_BACKEND` env var via a static mutex —
|
||||
//! `cargo test` runs tests in parallel and concurrent `set_var` would race.
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
use mc_ai::backend::AiBackend;
|
||||
|
||||
/// Single-threaded gate around `MC_AI_BACKEND` mutation. Without it,
|
||||
/// concurrent tests racing `std::env::set_var` produce nondeterministic
|
||||
/// outcomes.
|
||||
static ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
|
||||
/// RAII restorer: captures the prior value of `MC_AI_BACKEND` on construction
|
||||
/// and restores it on drop, including in the panic path. Required because
|
||||
/// tests in this file deliberately panic to assert
|
||||
/// `MC_AI_BACKEND=gpu`-without-adapter behaviour.
|
||||
struct EnvRestore {
|
||||
prev: Option<String>,
|
||||
_guard: std::sync::MutexGuard<'static, ()>,
|
||||
}
|
||||
|
||||
impl Drop for EnvRestore {
|
||||
fn drop(&mut self) {
|
||||
match self.prev.take() {
|
||||
Some(p) => std::env::set_var("MC_AI_BACKEND", p),
|
||||
None => std::env::remove_var("MC_AI_BACKEND"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_env(value: Option<&str>) -> EnvRestore {
|
||||
// Recover from poison: previous tests intentionally panic to assert the
|
||||
// GPU-required override behaviour. A poisoned mutex is expected and
|
||||
// safe to take over because each `EnvRestore` resets the env on drop.
|
||||
let guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||
let prev = std::env::var("MC_AI_BACKEND").ok();
|
||||
match value {
|
||||
Some(v) => std::env::set_var("MC_AI_BACKEND", v),
|
||||
None => std::env::remove_var("MC_AI_BACKEND"),
|
||||
}
|
||||
EnvRestore { prev, _guard: guard }
|
||||
}
|
||||
|
||||
/// Run `body` with `MC_AI_BACKEND` set to `value`, restoring the prior
|
||||
/// state on exit (panic-safe via the [`EnvRestore`] drop guard).
|
||||
fn with_env<F: FnOnce() -> R, R>(value: Option<&str>, body: F) -> R {
|
||||
let _restore = set_env(value);
|
||||
body()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backend_probe_default_picks_gpu_when_available() {
|
||||
// With the env unset, probe falls back to the standard `GpuContext::shared`
|
||||
// path. On hosts with a working adapter this returns Gpu; on adapter-less
|
||||
// hosts this returns Cpu (covered by the next test). We assert the
|
||||
// disjunction so the test is meaningful on either kind of host.
|
||||
let backend = with_env(None, AiBackend::probe);
|
||||
#[cfg(feature = "gpu")]
|
||||
{
|
||||
assert!(
|
||||
matches!(backend, AiBackend::Gpu(_) | AiBackend::Cpu),
|
||||
"default probe must produce Gpu or Cpu"
|
||||
);
|
||||
}
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
{
|
||||
assert!(
|
||||
matches!(backend, AiBackend::Cpu),
|
||||
"without `gpu` feature, default probe must produce Cpu"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backend_probe_falls_to_cpu_when_no_adapter() {
|
||||
// Force-CPU path: same outcome a host with no compute adapter would see
|
||||
// under the default probe. Pinning via the env override is the
|
||||
// deterministic way to assert this without depending on the host actually
|
||||
// lacking a GPU.
|
||||
let backend = with_env(Some("cpu"), AiBackend::probe);
|
||||
assert!(
|
||||
matches!(backend, AiBackend::Cpu),
|
||||
"MC_AI_BACKEND=cpu must produce Cpu"
|
||||
);
|
||||
assert_eq!(backend.name(), "cpu");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backend_probe_env_override_cpu_forces_cpu() {
|
||||
// Even on hosts with a working adapter, MC_AI_BACKEND=cpu must win.
|
||||
let backend = with_env(Some("cpu"), AiBackend::probe);
|
||||
assert!(matches!(backend, AiBackend::Cpu));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "gpu")]
|
||||
fn backend_probe_env_override_gpu_uses_gpu_or_panics() {
|
||||
// MC_AI_BACKEND=gpu must either succeed with Gpu (if adapter is available)
|
||||
// or panic. We catch the panic so the test reports a meaningful skip on
|
||||
// adapter-less hosts and a hard pass on real-adapter hosts.
|
||||
use std::panic;
|
||||
|
||||
let probe_result = with_env(Some("gpu"), || {
|
||||
panic::catch_unwind(panic::AssertUnwindSafe(AiBackend::probe))
|
||||
});
|
||||
|
||||
match probe_result {
|
||||
Ok(AiBackend::Gpu(_)) => {
|
||||
// Real adapter — pass.
|
||||
}
|
||||
Ok(AiBackend::Cpu) => panic!(
|
||||
"MC_AI_BACKEND=gpu must NEVER yield AiBackend::Cpu — \
|
||||
probe should have panicked when no adapter was found"
|
||||
),
|
||||
Err(_panic) => {
|
||||
// No adapter available — probe panicked as required.
|
||||
eprintln!(
|
||||
"[skip] MC_AI_BACKEND=gpu panicked (no adapter on this host) — required behaviour"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
fn backend_probe_env_override_gpu_forbids_cpu_without_feature() {
|
||||
// Without the `gpu` cargo feature, MC_AI_BACKEND=gpu must panic.
|
||||
let result = std::panic::catch_unwind(|| with_env(Some("gpu"), AiBackend::probe));
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"MC_AI_BACKEND=gpu without `gpu` feature must panic"
|
||||
);
|
||||
}
|
||||
|
|
@ -264,7 +264,9 @@ fn gpu_rollout_parity_partial_workgroup() {
|
|||
const SEED: u64 = 0xABCD_EF01_2345_6789_u64;
|
||||
|
||||
let (states, priors) = fixture_batch(N, SEED);
|
||||
let gpu_out = ctx.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
let gpu_out = ctx
|
||||
.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON)
|
||||
.expect("dispatch should succeed on a working adapter");
|
||||
let cpu_out = batch_simulate_cpu(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
|
||||
assert_eq!(gpu_out.len(), N);
|
||||
|
|
@ -285,11 +287,13 @@ fn gpu_rollout_parity_single_entry() {
|
|||
const SEED: u64 = 42;
|
||||
|
||||
let (states, priors) = fixture_batch(N, SEED);
|
||||
let gpu_out = ctx.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
let gpu_out = ctx
|
||||
.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON)
|
||||
.expect("dispatch should succeed on a working adapter");
|
||||
let cpu_out = batch_simulate_cpu(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
|
||||
assert_eq!(gpu_out.len(), 1);
|
||||
let g = gpu_out[0].0;
|
||||
let g = gpu_out[0];
|
||||
let c = cpu_out[0].0;
|
||||
let drift = (g - c).abs();
|
||||
|
||||
|
|
@ -320,14 +324,18 @@ fn gpu_rollout_determinism_repeated_dispatch() {
|
|||
const SEED: u64 = 0x5A5A_5A5A_5A5A_5A5A_u64;
|
||||
|
||||
let (states, priors) = fixture_batch(N, SEED);
|
||||
let first = ctx.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
let second = ctx.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON);
|
||||
let first = ctx
|
||||
.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON)
|
||||
.expect("first dispatch");
|
||||
let second = ctx
|
||||
.batch_simulate(&states, &priors, SEED, DEFAULT_ROLLOUT_HORIZON)
|
||||
.expect("second dispatch");
|
||||
|
||||
assert_eq!(first.len(), second.len());
|
||||
for (i, (a, b)) in first.iter().zip(second.iter()).enumerate() {
|
||||
assert_eq!(
|
||||
a.0.to_bits(),
|
||||
b.0.to_bits(),
|
||||
a.to_bits(),
|
||||
b.to_bits(),
|
||||
"entry {i}: GPU dispatch must be bit-identical on repeat (backend: {})",
|
||||
ctx.backend
|
||||
);
|
||||
|
|
@ -337,8 +345,8 @@ fn gpu_rollout_determinism_repeated_dispatch() {
|
|||
/// Report agreement statistics for a batch. Fails if fewer than
|
||||
/// `MIN_AGREEMENT_FRACTION` of entries agree within `TOLERANCE`.
|
||||
fn report_agreement(
|
||||
gpu: &[(f32, RolloutPath)],
|
||||
cpu: &[(f32, RolloutPath)],
|
||||
gpu: &[f32],
|
||||
cpu: &[(f32, mc_ai::gpu::RolloutPath)],
|
||||
scenario: &str,
|
||||
backend: &str,
|
||||
) {
|
||||
|
|
@ -355,7 +363,7 @@ fn report_agreement(
|
|||
|
||||
let mut failing_entries: Vec<(usize, f32, f32, f32)> = Vec::new();
|
||||
|
||||
for (i, ((g, _), (c, _))) in gpu.iter().zip(cpu.iter()).enumerate() {
|
||||
for (i, (g, (c, _))) in gpu.iter().zip(cpu.iter()).enumerate() {
|
||||
let drift = (g - c).abs();
|
||||
mean_drift += drift as f64;
|
||||
if drift > max_drift {
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ fn ironhold_priors() -> PersonalityPriors {
|
|||
promotion_offense_weight: 1.0,
|
||||
promotion_defense_weight: 1.0,
|
||||
promotion_mobility_weight: 1.0,
|
||||
..PersonalityPriors::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -57,6 +58,7 @@ fn blackhammer_priors() -> PersonalityPriors {
|
|||
promotion_offense_weight: 1.0,
|
||||
promotion_defense_weight: 1.0,
|
||||
promotion_mobility_weight: 1.0,
|
||||
..PersonalityPriors::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,16 @@ pub async fn run(socket_path: impl AsRef<Path> + std::fmt::Debug) -> Result<(),
|
|||
let path = socket_path.as_ref();
|
||||
let _ = tokio::fs::remove_file(path).await;
|
||||
let listener = UnixListener::bind(path).map_err(ServiceError::Bind)?;
|
||||
|
||||
// Phase 1 of p0-20: probe the AI backend at startup so the chosen path
|
||||
// is observable in service logs. The strategic search call site below
|
||||
// still uses CPU rollouts via `Tree::simulate_parallel` — Phase 2 wires
|
||||
// the boot-probed backend into the search itself. This call exists so
|
||||
// operators see "[mc-ai backend] Cpu (...)" in `mcts-server.log` and
|
||||
// can confirm the deployed binary is on the expected backend.
|
||||
let ai_backend = mc_ai::backend::AiBackend::probe();
|
||||
info!(backend = %ai_backend.name(), "AiBackend probed");
|
||||
|
||||
info!("listening");
|
||||
loop {
|
||||
match listener.accept().await {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue