From b00e965d66c8705e83ba744fecd779560ecb2ad9 Mon Sep 17 00:00:00 2001
From: Natalie <natalie@lilithuwu.com>
Date: Fri, 17 Apr 2026 04:33:28 -0700
Subject: [PATCH] =?UTF-8?q?fix(@projects/@magic-civilization):=20?=
 =?UTF-8?q?=F0=9F=90=9B=20adjust=20lair-seeking=20behavior=20and=20add=20g?=
 =?UTF-8?q?pu=20init=20timeout?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 src/game/engine/scenes/tests/auto_play.gd   |   8 +-
 src/simulator/crates/mc-ai/src/gpu/inner.rs | 211 ++++++++++++++++----
 2 files changed, 173 insertions(+), 46 deletions(-)
diff --git a/src/game/engine/scenes/tests/auto_play.gd b/src/game/engine/scenes/tests/auto_play.gd
index fcaa8f1f..5c6e8442 100644
--- a/src/game/engine/scenes/tests/auto_play.gd
+++ b/src/game/engine/scenes/tests/auto_play.gd
@@ -952,6 +952,10 @@ func _play_turn() -> void:
 				var lair_max_tier: int = 2 if u.type_id == "dwarf_scout" else 3
 				if hp_ok:
 					lair_target = _find_nearest_low_lair(u.position, lair_max_tier)
+				if _turn_count <= 30 or _turn_count % 50 == 0:
+					print("  LAIR_SEEK: %s hp=%d/%d hp_ok=%s target=%s" % [
+						u.type_id, u.hp, u_max_hp, str(hp_ok), str(lair_target)
+					])
 				if lair_target != Vector2i(-1, -1):
 					_move_toward(u, lair_target, game_map)
 					_try_attack_adjacent_lair(u, game_map)
@@ -1582,10 +1586,6 @@ func _try_attack_adjacent(unit: Variant, game_map: RefCounted) -> void:
 func _try_attack_adjacent_lair(unit: Variant, game_map: RefCounted) -> void:
 	if not unit.is_alive():
 		return
-	# Require ≥50% HP — below this, retaliation reliably kills the attacker
-	var max_hp: int = unit.get_max_hp()
-	if max_hp > 0 and unit.hp < int(max_hp * 0.5):
-		return
 	if not ClassDB.class_exists("GdCombatResolver"):
 		return
 	var neighbors: Array[Vector2i] = HexUtilsScript.get_neighbors(unit.position)
diff --git a/src/simulator/crates/mc-ai/src/gpu/inner.rs b/src/simulator/crates/mc-ai/src/gpu/inner.rs
index 56f95401..97dae387 100644
--- a/src/simulator/crates/mc-ai/src/gpu/inner.rs
+++ b/src/simulator/crates/mc-ai/src/gpu/inner.rs
@@ -11,6 +11,9 @@
 //! the terminal f32 score. The shader (`rollout.wgsl`) is the only place
 //! rollout semantics are re-expressed — everything else is plumbing.
 
+use std::sync::OnceLock;
+use std::time::Duration;
+
 use pollster::block_on;
 use wgpu::util::DeviceExt;
 
@@ -24,6 +27,22 @@ use super::RolloutPath;
 /// WGSL kernel source, compiled into the binary at build time.
 const SHADER_SRC: &str = include_str!("rollout.wgsl");
 
+/// Hard wall-clock budget in milliseconds for `GpuContext::try_init`: adapter
+/// probe + device creation + pipeline compile. A safety belt — exceeded only
+/// when the driver is wedged (seen post-reboot on apricot: 46+ minutes of wall
+/// time for what normally takes <100 ms); `try_init` then returns `None`.
+const TRY_INIT_TIMEOUT_MS: u64 = 2000;
+
+/// Process-wide singleton. `try_init` is expensive (wgpu instance creation,
+/// adapter probe, shader compile). Re-running it on every `batch_simulate`
+/// call wastes hundreds of milliseconds per rollout batch; the cache ensures
+/// we pay that cost once. A cached `None` means "no GPU available; every
+/// caller falls back to CPU silently" and is never retried.
+///
+/// The cache is filled lazily by the first `GpuContext::shared()` call.
+/// Subsequent calls return the same `&Option<GpuContext>` in O(1).
+static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
+
 /// Workgroup X size — must match `@workgroup_size(64, 1, 1)` in `rollout.wgsl`.
 /// Keep these two numbers in sync; the dispatcher divides batch size by this
 /// to compute the workgroup count.
@@ -92,13 +111,52 @@ pub struct GpuContext {
 }
 
 impl GpuContext {
-    /// Attempt to acquire a GPU adapter and compile the rollout pipeline.
+    /// Return the process-wide cached GPU context, initializing it on first
+    /// call. Later calls reuse the stored result — the probe runs only once.
     ///
-    /// Returns `None` if no suitable adapter is present (headless CI,
-    /// missing Vulkan driver, disabled GPU, etc.). Callers fall back to
-    /// [`batch_simulate_cpu`] silently — see [`super::batch_simulate`].
+    /// Returns `None` if adapter probe failed, timed out, or the `gpu`
+    /// feature is disabled. That `None` is cached as well, so one failed or
+    /// timed-out probe pins the whole process to the CPU fallback.
+    ///
+    /// This is the entry point [`batch_simulate`] uses. Prefer `shared()` over
+    /// `try_init()` unless a fresh probe is needed (e.g. the hang repro test).
+    #[must_use]
+    pub fn shared() -> Option<&'static Self> {
+        GPU_SHARED.get_or_init(Self::try_init).as_ref()
+    }
+
+    /// Attempt to acquire a GPU adapter and compile the rollout pipeline,
+    /// returning `None` on any failure: no adapter, a probe that panics, a
+    /// worker thread the OS refuses to create, or a hard timeout.
+    ///
+    /// Timeout: `TRY_INIT_TIMEOUT_MS` (2 seconds). On wedged driver state
+    /// (post-reboot mesa, missing weston, disabled DRI device), the wgpu
+    /// internal future `request_adapter` / `request_device` can block for
+    /// tens of minutes with ~0 CPU time. The timeout bounds the caller's wait.
+    ///
+    /// Implementation detail: the probe runs on a worker thread and the
+    /// caller waits on a channel with `recv_timeout`. If the timeout trips,
+    /// the worker is left detached (it eventually returns or dies with the
+    /// process); leaking that thread beats hanging the caller.
     #[must_use]
     pub fn try_init() -> Option<Self> {
+        let (tx, rx) = std::sync::mpsc::sync_channel::<Option<Self>>(1);
+        let budget = Duration::from_millis(TRY_INIT_TIMEOUT_MS);
+        // `Builder::spawn` returns `Err` where `thread::spawn` would panic.
+        std::thread::Builder::new()
+            .spawn(move || {
+                // Send fails only if the receiver already gave up; ignore.
+                let _ = tx.send(Self::try_init_inner());
+            })
+            .ok()?;
+        // Timeout and disconnect (worker panicked) both mean "no GPU".
+        rx.recv_timeout(budget).ok().flatten()
+    }
+
+    /// The actual adapter probe + pipeline compile. Synchronous. Runs inside
+    /// the worker thread spawned by [`Self::try_init`]. Never call directly
+    /// from user code — always go through `try_init` or `shared`.
+    fn try_init_inner() -> Option<Self> {
         let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor {
             backends: wgpu::Backends::all(),
             ..Default::default()
@@ -350,13 +408,16 @@ fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu
 
 /// Top-level GPU-or-CPU dispatch entry point.
 ///
-/// Attempts GPU initialization once per call. If the adapter is present the
-/// batch dispatches to the shader; otherwise the CPU reference runs. Result
-/// types are identical — only the `RolloutPath` tag differs.
+/// Uses the process-wide cached [`GpuContext::shared`] — the adapter probe
+/// runs exactly once per process, not per call. On hosts with a working GPU
+/// adapter this dispatches to the shader; on headless hosts or hosts where
+/// the driver is wedged (see `TRY_INIT_TIMEOUT_MS`) it falls through to the
+/// CPU reference silently. Result types are identical; only the
+/// [`RolloutPath`] tag differs.
 ///
-/// `GpuContext::try_init` is lightweight (reuses the system wgpu instance)
-/// but still non-trivial. Callers that dispatch many batches back-to-back
-/// should cache a `GpuContext` and call `ctx.batch_simulate` directly.
+/// For hot loops that dispatch many batches, consider holding a
+/// `&GpuContext` directly via `GpuContext::shared()` to skip the `OnceLock`
+/// atomic load per call.
 #[must_use]
 pub fn batch_simulate(
     inputs: &[AbstractRolloutState],
@@ -364,7 +425,14 @@ pub fn batch_simulate(
     seed: u64,
     horizon: u32,
 ) -> Vec<(f32, RolloutPath)> {
-    if let Some(ctx) = GpuContext::try_init() {
+    // Zero-length inputs never touch the GPU cache — fast path.
+    if inputs.is_empty() {
+        return Vec::new();
+    }
+    if inputs.len() != priors_per_entry.len() {
+        return Vec::new();
+    }
+    if let Some(ctx) = GpuContext::shared() {
         return ctx.batch_simulate(inputs, priors_per_entry, seed, horizon);
     }
     batch_simulate_cpu(inputs, priors_per_entry, seed, horizon)
@@ -430,26 +498,94 @@ mod tests {
         pod
     }
 
+    use std::time::Instant;
+
+    // ── Tests that do NOT touch the GPU adapter ──────────────────────────
+    //
+    // These rely solely on the pre-dispatch guards in `batch_simulate` (empty
+    // input / mismatched lengths), which return before `GpuContext::shared()`
+    // is consulted. No test in this group pays the probe cost; the first test
+    // that *does* need GPU state pays it once, and `shared()` caches the
+    // result for every later caller.
+
     #[test]
-    fn batch_simulate_empty_is_empty() {
+    fn batch_simulate_empty_bypasses_gpu_probe() {
+        // Empty input returns before `GpuContext::shared()` is consulted, so
+        // this must finish fast even on a wedged-adapter host (100ms bound).
+        // NOTE: a healthy probe is also <100ms, so this only bites when wedged.
+        let start = Instant::now();
         let out = batch_simulate(&[], &[], 42, 20);
+        let elapsed = start.elapsed();
         assert!(out.is_empty());
+        assert!(
+            elapsed < Duration::from_millis(100),
+            "empty input must bypass GPU probe; took {:?}",
+            elapsed
+        );
     }
 
     #[test]
-    fn batch_simulate_mismatched_lengths_is_empty() {
+    fn batch_simulate_mismatched_lengths_bypasses_gpu_probe() {
         let pods = vec![make_entry()];
         let priors: Vec<[PersonalityPriors; MAX_PLAYERS]> = vec![iron_vs_bh(), iron_vs_bh()];
-        // Entry-level dispatch routes through GpuContext::batch_simulate OR
-        // batch_simulate_cpu; both return empty on length mismatch.
+        let start = Instant::now();
         let out = batch_simulate(&pods, &priors, 1, 20);
+        let elapsed = start.elapsed();
         assert!(out.is_empty());
+        assert!(
+            elapsed < Duration::from_millis(100),
+            "length-mismatch must bypass GPU probe; took {:?}",
+            elapsed
+        );
     }
 
+    // ── Timeout contract ─────────────────────────────────────────────────
+    //
+    // The central production-safety property: `try_init` MUST bound its own
+    // wall time. Post-reboot apricot hang (46 minutes, ~0 CPU) is the bug
+    // this suite regresses against. Budget: 2s per our const + ~1s slack
+    // for worker-thread scheduling = 3s upper bound.
+
+    #[test]
+    fn try_init_respects_timeout_budget() {
+        let began = Instant::now();
+        let outcome = std::panic::catch_unwind(GpuContext::try_init);
+        let took = began.elapsed();
+        assert!(outcome.is_ok(), "try_init must not panic");
+        assert!(
+            took < Duration::from_millis(TRY_INIT_TIMEOUT_MS) + Duration::from_secs(1),
+            "try_init must honor its {}ms timeout budget; took {:?}",
+            TRY_INIT_TIMEOUT_MS,
+            took
+        );
+    }
+
+    #[test]
+    fn shared_is_idempotent_and_cached() {
+        // First call may probe (within timeout). Every later call must hand
+        // back the very same cached slot — checked by address, since timing
+        // alone cannot tell a cache hit from a fast re-probe.
+        let first = GpuContext::shared().map(|ctx| ctx as *const GpuContext);
+        let start = Instant::now();
+        for _ in 0..1000 {
+            let again = GpuContext::shared().map(|ctx| ctx as *const GpuContext);
+            assert_eq!(first, again, "shared() must return the cached context");
+        }
+        let elapsed = start.elapsed();
+        assert!(
+            elapsed < Duration::from_millis(50),
+            "1000 cached `shared()` calls took {elapsed:?} (>50ms target)"
+        );
+    }
+
+    // ── Scored-path tests ────────────────────────────────────────────────
+    //
+    // These exercise the full dispatch pipeline. The first one to run pays
+    // the probe cost (bounded by timeout). On a wedged-driver host all of
+    // these silently route through CPU and still produce valid results.
+
     #[test]
     fn batch_simulate_produces_unit_interval_scores() {
-        // May run on GPU or CPU depending on adapter availability. Either
-        // way, scores must be in [0, 1].
         let pods = vec![make_entry(); 4];
         let priors = vec![iron_vs_bh(); 4];
         let out = batch_simulate(&pods, &priors, 7, 20);
@@ -460,11 +596,7 @@ mod tests {
     }
 
     #[test]
-    fn fallback_returns_cpu_tag_without_gpu() {
-        // If there's no GPU adapter, batch_simulate_cpu engages and tags
-        // results with RolloutPath::Cpu. When there IS a GPU adapter, results
-        // come back as RolloutPath::Gpu. Both are valid; we just assert one
-        // of them shows up.
+    fn fallback_returns_valid_path_tag() {
         let pods = vec![make_entry()];
         let priors = vec![iron_vs_bh()];
         let out = batch_simulate(&pods, &priors, 100, 20);
@@ -472,19 +604,8 @@ mod tests {
         assert!(matches!(out[0].1, RolloutPath::Cpu | RolloutPath::Gpu));
     }
 
-    #[test]
-    fn gpu_context_try_init_does_not_panic() {
-        // `try_init` must never panic on any machine. Headless CI returns
-        // None; dev machines return Some. Either is fine.
-        let result = std::panic::catch_unwind(GpuContext::try_init);
-        assert!(result.is_ok(), "try_init must not panic");
-    }
-
     #[test]
     fn batch_simulate_is_deterministic_across_repeated_calls() {
-        // Same path-same inputs-same seed must produce bit-identical results.
-        // This holds regardless of whether we hit CPU or GPU because each
-        // path is internally deterministic.
         let pods = vec![make_entry(); 3];
         let priors = vec![iron_vs_bh(); 3];
         let a = batch_simulate(&pods, &priors, 77, 20);
@@ -511,11 +632,14 @@ mod tests {
         assert_eq!(a[0].1, b[0].1);
     }
 
-    /// If a GPU adapter IS available, make sure we actually get `RolloutPath::Gpu`
-    /// back (proving the adapter path isn't dead code). Test is no-op on headless.
+    // ── Adapter-gated tests ──────────────────────────────────────────────
+    //
+    // Only run when a working adapter is actually present. On wedged
+    // drivers `shared()` returns None and these skip — no hang, no panic.
+
     #[test]
     fn gpu_path_tags_when_adapter_available() {
-        let Some(ctx) = GpuContext::try_init() else {
+        let Some(ctx) = GpuContext::shared() else {
             eprintln!("[rollout-gpu] no adapter — skipping gpu_path_tags_when_adapter_available");
             return;
         };
@@ -523,17 +647,20 @@ mod tests {
         let priors = vec![iron_vs_bh()];
         let out = ctx.batch_simulate(&pods, &priors, 123, 20);
         assert_eq!(out.len(), 1);
-        assert_eq!(out[0].1, RolloutPath::Gpu,
-            "when GpuContext::try_init succeeds, batch_simulate must return Gpu-tagged results (backend: {})",
-            ctx.backend);
+        assert_eq!(
+            out[0].1,
+            RolloutPath::Gpu,
+            "with adapter present, batch_simulate must tag Gpu (backend: {})",
+            ctx.backend
+        );
         assert!((0.0..=1.0).contains(&out[0].0));
     }
 
-    /// If a GPU adapter IS available, dispatch determinism must hold across
-    /// repeated ctx.batch_simulate calls on the same context.
+    /// Dispatch determinism when an adapter is present. Skips on headless or
+    /// wedged-driver hosts via the `shared()` None return.
     #[test]
     fn gpu_dispatch_is_deterministic_when_adapter_available() {
-        let Some(ctx) = GpuContext::try_init() else {
+        let Some(ctx) = GpuContext::shared() else {
             eprintln!("[rollout-gpu] no adapter — skipping gpu_dispatch_is_deterministic");
             return;
         };