From b00e965d66c8705e83ba744fecd779560ecb2ad9 Mon Sep 17 00:00:00 2001 From: Natalie Date: Fri, 17 Apr 2026 04:33:28 -0700 Subject: [PATCH] =?UTF-8?q?fix(@projects/@magic-civilization):=20?= =?UTF-8?q?=F0=9F=90=9B=20adjust=20lair-seeking=20behavior=20and=20add=20g?= =?UTF-8?q?pu=20init=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- src/game/engine/scenes/tests/auto_play.gd | 8 +- src/simulator/crates/mc-ai/src/gpu/inner.rs | 211 ++++++++++++++++---- 2 files changed, 173 insertions(+), 46 deletions(-) diff --git a/src/game/engine/scenes/tests/auto_play.gd b/src/game/engine/scenes/tests/auto_play.gd index fcaa8f1f..5c6e8442 100644 --- a/src/game/engine/scenes/tests/auto_play.gd +++ b/src/game/engine/scenes/tests/auto_play.gd @@ -952,6 +952,10 @@ func _play_turn() -> void: var lair_max_tier: int = 2 if u.type_id == "dwarf_scout" else 3 if hp_ok: lair_target = _find_nearest_low_lair(u.position, lair_max_tier) + if _turn_count <= 30 or _turn_count % 50 == 0: + print(" LAIR_SEEK: %s hp=%d/%d hp_ok=%s target=%s" % [ + u.type_id, u.hp, u_max_hp, str(hp_ok), str(lair_target) + ]) if lair_target != Vector2i(-1, -1): _move_toward(u, lair_target, game_map) _try_attack_adjacent_lair(u, game_map) @@ -1582,10 +1586,6 @@ func _try_attack_adjacent(unit: Variant, game_map: RefCounted) -> void: func _try_attack_adjacent_lair(unit: Variant, game_map: RefCounted) -> void: if not unit.is_alive(): return - # Require ≥50% HP — below this, retaliation reliably kills the attacker - var max_hp: int = unit.get_max_hp() - if max_hp > 0 and unit.hp < int(max_hp * 0.5): - return if not ClassDB.class_exists("GdCombatResolver"): return var neighbors: Array[Vector2i] = HexUtilsScript.get_neighbors(unit.position) diff --git a/src/simulator/crates/mc-ai/src/gpu/inner.rs b/src/simulator/crates/mc-ai/src/gpu/inner.rs index 56f95401..97dae387 100644 --- a/src/simulator/crates/mc-ai/src/gpu/inner.rs +++ b/src/simulator/crates/mc-ai/src/gpu/inner.rs @@ -11,6 +11,9 @@ //! the terminal f32 score. The shader (`rollout.wgsl`) is the only place //! rollout semantics are re-expressed — everything else is plumbing. +use std::sync::OnceLock; +use std::time::Duration; + use pollster::block_on; use wgpu::util::DeviceExt; @@ -24,6 +27,22 @@ use super::RolloutPath; /// WGSL kernel source, compiled into the binary at build time. const SHADER_SRC: &str = include_str!("rollout.wgsl"); +/// Hard budget for `try_init` — includes adapter probe + device creation + +/// pipeline compile. Exceeded only when the driver is wedged (seen post-reboot +/// on apricot: 46+ minutes of wall time for what normally takes <100 ms). +/// The timeout is a safety belt; a well-behaved driver never approaches this. +const TRY_INIT_TIMEOUT_MS: u64 = 2000; + +/// Process-wide singleton. `try_init` is expensive (wgpu instance creation, +/// adapter probe, shader compile). Dispatching once per `batch_simulate` call +/// wastes hundreds of milliseconds per rollout batch. The cache ensures we +/// pay that cost once. `None` after init means "no GPU available; every +/// caller falls back to CPU silently." +/// +/// The cache is filled lazily by the first `GpuContext::shared()` call. +/// Subsequent calls return the same `&Option` in O(1). +static GPU_SHARED: OnceLock> = OnceLock::new(); + /// Workgroup X size — must match `@workgroup_size(64, 1, 1)` in `rollout.wgsl`. /// Keep these two numbers in sync; the dispatcher divides batch size by this /// to compute the workgroup count. @@ -92,13 +111,52 @@ pub struct GpuContext { } impl GpuContext { - /// Attempt to acquire a GPU adapter and compile the rollout pipeline. + /// Return the process-wide cached GPU context, initializing it on first + /// call. Subsequent calls are O(1) — the probe cost is paid exactly once. /// - /// Returns `None` if no suitable adapter is present (headless CI, - /// missing Vulkan driver, disabled GPU, etc.). Callers fall back to - /// [`batch_simulate_cpu`] silently — see [`super::batch_simulate`]. + /// Returns `None` if adapter probe failed, timed out, or the `gpu` + /// feature is disabled. Callers in that case fall back to CPU silently. + /// + /// This is the entry point [`batch_simulate`] uses. Tests and direct + /// users should prefer `shared()` over `try_init()` unless they need + /// a fresh probe (e.g. the post-reboot hang repro test). + #[must_use] + pub fn shared() -> Option<&'static Self> { + GPU_SHARED.get_or_init(Self::try_init).as_ref() + } + + /// Attempt to acquire a GPU adapter and compile the rollout pipeline, + /// returning `None` on any failure including a hard timeout. + /// + /// Timeout: `TRY_INIT_TIMEOUT_MS` (2 seconds). On wedged driver state + /// (post-reboot mesa, missing weston, disabled DRI device), the wgpu + /// internal future `request_adapter` / `request_device` can block for + /// tens of minutes with ~0 CPU time. The timeout guarantees any caller + /// — test, game, or user — gets a decision within 2 seconds flat. + /// + /// Implementation detail: we spawn a worker thread, run the probe there, + /// and join with `recv_timeout`. If the timeout trips the worker becomes + /// a detached zombie (it will eventually return or die with the process); + /// that leaked thread is the correct tradeoff vs. hanging the caller. #[must_use] pub fn try_init() -> Option { + let (tx, rx) = std::sync::mpsc::sync_channel::>(1); + std::thread::spawn(move || { + let result = Self::try_init_inner(); + // If the receiver has already timed out and dropped, send fails — + // we don't care, the worker just exits. + let _ = tx.send(result); + }); + match rx.recv_timeout(Duration::from_millis(TRY_INIT_TIMEOUT_MS)) { + Ok(ctx) => ctx, + Err(_) => None, + } + } + + /// The actual adapter probe + pipeline compile. Synchronous. Runs inside + /// the worker thread spawned by [`Self::try_init`]. Never call directly + /// from user code — always go through `try_init` or `shared`. + fn try_init_inner() -> Option { let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor { backends: wgpu::Backends::all(), ..Default::default() @@ -350,13 +408,16 @@ fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu /// Top-level GPU-or-CPU dispatch entry point. /// -/// Attempts GPU initialization once per call. If the adapter is present the -/// batch dispatches to the shader; otherwise the CPU reference runs. Result -/// types are identical — only the `RolloutPath` tag differs. +/// Uses the process-wide cached [`GpuContext::shared`] — the adapter probe +/// runs exactly once per process, not per call. On hosts with a working GPU +/// adapter this dispatches to the shader; on headless hosts or hosts where +/// the driver is wedged (see `TRY_INIT_TIMEOUT_MS`) it falls through to the +/// CPU reference silently. Result types are identical; only the +/// [`RolloutPath`] tag differs. /// -/// `GpuContext::try_init` is lightweight (reuses the system wgpu instance) -/// but still non-trivial. Callers that dispatch many batches back-to-back -/// should cache a `GpuContext` and call `ctx.batch_simulate` directly. +/// For hot loops that dispatch many batches, consider holding a +/// `&GpuContext` directly via `GpuContext::shared()` to skip the `OnceLock` +/// atomic load per call. #[must_use] pub fn batch_simulate( inputs: &[AbstractRolloutState], @@ -364,7 +425,14 @@ pub fn batch_simulate( seed: u64, horizon: u32, ) -> Vec<(f32, RolloutPath)> { - if let Some(ctx) = GpuContext::try_init() { + // Zero-length inputs never touch the GPU cache — fast path. + if inputs.is_empty() { + return Vec::new(); + } + if inputs.len() != priors_per_entry.len() { + return Vec::new(); + } + if let Some(ctx) = GpuContext::shared() { return ctx.batch_simulate(inputs, priors_per_entry, seed, horizon); } batch_simulate_cpu(inputs, priors_per_entry, seed, horizon) @@ -430,26 +498,94 @@ mod tests { pod } + use std::time::Instant; + + // ── Tests that do NOT touch the GPU adapter ────────────────────────── + // + // These rely solely on the pre-dispatch guards in `batch_simulate` (empty + // input / mismatched lens) OR on `GpuContext::shared()` returning None + // after the one-time probe. Either way no single test is responsible for + // the probe cost — the first test that *does* need GPU state pays it + // once and caches. + #[test] - fn batch_simulate_empty_is_empty() { + fn batch_simulate_empty_bypasses_gpu_probe() { + // Empty input returns Vec::new() before GpuContext::shared() is ever + // consulted. Must complete in microseconds even on a wedged-adapter + // host; assert a 100ms upper bound with generous slack for CI jitter. + let start = Instant::now(); let out = batch_simulate(&[], &[], 42, 20); + let elapsed = start.elapsed(); assert!(out.is_empty()); + assert!( + elapsed < Duration::from_millis(100), + "empty input must bypass GPU probe; took {:?}", + elapsed + ); } #[test] - fn batch_simulate_mismatched_lengths_is_empty() { + fn batch_simulate_mismatched_lengths_bypasses_gpu_probe() { let pods = vec![make_entry()]; let priors: Vec<[PersonalityPriors; MAX_PLAYERS]> = vec![iron_vs_bh(), iron_vs_bh()]; - // Entry-level dispatch routes through GpuContext::batch_simulate OR - // batch_simulate_cpu; both return empty on length mismatch. + let start = Instant::now(); let out = batch_simulate(&pods, &priors, 1, 20); + let elapsed = start.elapsed(); assert!(out.is_empty()); + assert!( + elapsed < Duration::from_millis(100), + "length-mismatch must bypass GPU probe; took {:?}", + elapsed + ); } + // ── Timeout contract ───────────────────────────────────────────────── + // + // The central production-safety property: `try_init` MUST bound its own + // wall time. Post-reboot apricot hang (46 minutes, ~0 CPU) is the bug + // this suite regresses against. Budget: 2s per our const + ~1s slack + // for worker-thread scheduling = 3s upper bound. + + #[test] + fn try_init_respects_timeout_budget() { + let start = Instant::now(); + let result = std::panic::catch_unwind(GpuContext::try_init); + let elapsed = start.elapsed(); + assert!(result.is_ok(), "try_init must not panic"); + assert!( + elapsed < Duration::from_millis(TRY_INIT_TIMEOUT_MS + 1000), + "try_init must honor its {}ms timeout budget; took {:?}", + TRY_INIT_TIMEOUT_MS, + elapsed + ); + } + + #[test] + fn shared_is_idempotent_and_cached() { + // First call may probe (within timeout); subsequent calls must be + // O(1) — well under 1ms. Verifies OnceLock caching actually works. + let _first = GpuContext::shared(); + let start = Instant::now(); + for _ in 0..1000 { + let _ = GpuContext::shared(); + } + let elapsed = start.elapsed(); + assert!( + elapsed < Duration::from_millis(50), + "1000 cached `shared()` calls took {:?} (>{}ms target)", + elapsed, + 50 + ); + } + + // ── Scored-path tests ──────────────────────────────────────────────── + // + // These exercise the full dispatch pipeline. The first one to run pays + // the probe cost (bounded by timeout). On a wedged-driver host all of + // these silently route through CPU and still produce valid results. + #[test] fn batch_simulate_produces_unit_interval_scores() { - // May run on GPU or CPU depending on adapter availability. Either - // way, scores must be in [0, 1]. let pods = vec![make_entry(); 4]; let priors = vec![iron_vs_bh(); 4]; let out = batch_simulate(&pods, &priors, 7, 20); @@ -460,11 +596,7 @@ mod tests { } #[test] - fn fallback_returns_cpu_tag_without_gpu() { - // If there's no GPU adapter, batch_simulate_cpu engages and tags - // results with RolloutPath::Cpu. When there IS a GPU adapter, results - // come back as RolloutPath::Gpu. Both are valid; we just assert one - // of them shows up. + fn fallback_returns_valid_path_tag() { let pods = vec![make_entry()]; let priors = vec![iron_vs_bh()]; let out = batch_simulate(&pods, &priors, 100, 20); @@ -472,19 +604,8 @@ mod tests { assert!(matches!(out[0].1, RolloutPath::Cpu | RolloutPath::Gpu)); } - #[test] - fn gpu_context_try_init_does_not_panic() { - // `try_init` must never panic on any machine. Headless CI returns - // None; dev machines return Some. Either is fine. - let result = std::panic::catch_unwind(GpuContext::try_init); - assert!(result.is_ok(), "try_init must not panic"); - } - #[test] fn batch_simulate_is_deterministic_across_repeated_calls() { - // Same path-same inputs-same seed must produce bit-identical results. - // This holds regardless of whether we hit CPU or GPU because each - // path is internally deterministic. let pods = vec![make_entry(); 3]; let priors = vec![iron_vs_bh(); 3]; let a = batch_simulate(&pods, &priors, 77, 20); @@ -511,11 +632,14 @@ mod tests { assert_eq!(a[0].1, b[0].1); } - /// If a GPU adapter IS available, make sure we actually get `RolloutPath::Gpu` - /// back (proving the adapter path isn't dead code). Test is no-op on headless. + // ── Adapter-gated tests ────────────────────────────────────────────── + // + // Only run when a working adapter is actually present. On wedged + // drivers `shared()` returns None and these skip — no hang, no panic. + #[test] fn gpu_path_tags_when_adapter_available() { - let Some(ctx) = GpuContext::try_init() else { + let Some(ctx) = GpuContext::shared() else { eprintln!("[rollout-gpu] no adapter — skipping gpu_path_tags_when_adapter_available"); return; }; @@ -523,17 +647,20 @@ mod tests { let priors = vec![iron_vs_bh()]; let out = ctx.batch_simulate(&pods, &priors, 123, 20); assert_eq!(out.len(), 1); - assert_eq!(out[0].1, RolloutPath::Gpu, - "when GpuContext::try_init succeeds, batch_simulate must return Gpu-tagged results (backend: {})", - ctx.backend); + assert_eq!( + out[0].1, + RolloutPath::Gpu, + "with adapter present, batch_simulate must tag Gpu (backend: {})", + ctx.backend + ); assert!((0.0..=1.0).contains(&out[0].0)); } - /// If a GPU adapter IS available, dispatch determinism must hold across - /// repeated ctx.batch_simulate calls on the same context. + /// Dispatch determinism when an adapter is present. Skips on headless or + /// wedged-driver hosts via the `shared()` None return. #[test] fn gpu_dispatch_is_deterministic_when_adapter_available() { - let Some(ctx) = GpuContext::try_init() else { + let Some(ctx) = GpuContext::shared() else { eprintln!("[rollout-gpu] no adapter — skipping gpu_dispatch_is_deterministic"); return; };