fix(@projects/@magic-civilization): 🐛 adjust lair-seeking behavior and add gpu init timeout

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-04-17 04:33:28 -07:00
parent 8107416177
commit b00e965d66
2 changed files with 173 additions and 46 deletions

View file

@ -952,6 +952,10 @@ func _play_turn() -> void:
var lair_max_tier: int = 2 if u.type_id == "dwarf_scout" else 3
if hp_ok:
lair_target = _find_nearest_low_lair(u.position, lair_max_tier)
if _turn_count <= 30 or _turn_count % 50 == 0:
print(" LAIR_SEEK: %s hp=%d/%d hp_ok=%s target=%s" % [
u.type_id, u.hp, u_max_hp, str(hp_ok), str(lair_target)
])
if lair_target != Vector2i(-1, -1):
_move_toward(u, lair_target, game_map)
_try_attack_adjacent_lair(u, game_map)
@ -1582,10 +1586,6 @@ func _try_attack_adjacent(unit: Variant, game_map: RefCounted) -> void:
func _try_attack_adjacent_lair(unit: Variant, game_map: RefCounted) -> void:
if not unit.is_alive():
return
# Require ≥50% HP — below this, retaliation reliably kills the attacker
var max_hp: int = unit.get_max_hp()
if max_hp > 0 and unit.hp < int(max_hp * 0.5):
return
if not ClassDB.class_exists("GdCombatResolver"):
return
var neighbors: Array[Vector2i] = HexUtilsScript.get_neighbors(unit.position)

View file

@ -11,6 +11,9 @@
//! the terminal f32 score. The shader (`rollout.wgsl`) is the only place
//! rollout semantics are re-expressed — everything else is plumbing.
use std::sync::OnceLock;
use std::time::Duration;
use pollster::block_on;
use wgpu::util::DeviceExt;
@ -24,6 +27,22 @@ use super::RolloutPath;
/// WGSL kernel source, compiled into the binary at build time.
const SHADER_SRC: &str = include_str!("rollout.wgsl");
/// Hard budget for `try_init` — includes adapter probe + device creation +
/// pipeline compile. Exceeded only when the driver is wedged (seen post-reboot
/// on apricot: 46+ minutes of wall time for what normally takes <100 ms).
/// The timeout is a safety belt; a well-behaved driver never approaches this.
const TRY_INIT_TIMEOUT_MS: u64 = 2000;
/// Process-wide singleton. `try_init` is expensive (wgpu instance creation,
/// adapter probe, shader compile). Re-running it on every `batch_simulate`
/// call wastes hundreds of milliseconds per rollout batch. The cache ensures
/// we pay that cost once. `None` after init means "no GPU available; every
/// caller falls back to CPU silently."
///
/// The cache is filled lazily by the first `GpuContext::shared()` call.
/// Subsequent calls read the stored `Option<GpuContext>` without probing
/// again and hand out an `Option<&'static GpuContext>` view of it.
static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
/// Workgroup X size — must match `@workgroup_size(64, 1, 1)` in `rollout.wgsl`.
/// Keep these two numbers in sync; the dispatcher divides batch size by this
/// to compute the workgroup count.
@ -92,13 +111,52 @@ pub struct GpuContext {
}
impl GpuContext {
/// Return the process-wide cached GPU context, probing on the first call
/// only.
///
/// The outcome of [`Self::try_init`] is stored in `GPU_SHARED` the first
/// time this runs; every later call hands back a reference to that stored
/// value without probing again.
///
/// Returns `None` if adapter probe failed, timed out, or the `gpu`
/// feature is disabled. Callers in that case fall back to CPU silently.
/// Because the outcome is cached, a `None` stays `None` for the rest of the
/// process.
///
/// This is the entry point [`batch_simulate`] uses. Tests and direct
/// users should prefer `shared()` over `try_init()` unless they need
/// a fresh probe (e.g. the post-reboot hang repro test).
#[must_use]
pub fn shared() -> Option<&'static Self> {
    let slot: &'static Option<Self> = GPU_SHARED.get_or_init(Self::try_init);
    slot.as_ref()
}
/// Attempt to acquire a GPU adapter and compile the rollout pipeline,
/// returning `None` on any failure including a hard timeout.
///
/// Timeout: `TRY_INIT_TIMEOUT_MS` (2 seconds). On wedged driver state
/// (post-reboot mesa, missing weston, disabled DRI device), the wgpu
/// internal future `request_adapter` / `request_device` can block for
/// tens of minutes with ~0 CPU time. The timeout guarantees any caller
/// — test, game, or user — gets a decision within 2 seconds flat.
///
/// Implementation detail: the probe runs on a named worker thread and the
/// caller waits for its answer with `recv_timeout`. If the timeout trips the
/// worker becomes a detached zombie (it will eventually return or die with
/// the process); that leaked thread is the correct tradeoff vs. hanging the
/// caller.
///
/// `None` is returned when:
/// - the OS refuses to create the worker thread (`Builder::spawn` reports
///   this as an error, whereas `std::thread::spawn` would panic),
/// - the worker does not report back within the budget,
/// - the worker unwinds from a panic before sending (its sender is dropped,
///   which disconnects the channel),
/// - the probe itself yields `None`.
#[must_use]
pub fn try_init() -> Option<Self> {
    let (tx, rx) = std::sync::mpsc::sync_channel::<Option<Self>>(1);
    let spawned = std::thread::Builder::new()
        .name("rollout-gpu-probe".to_owned())
        .spawn(move || {
            // If the receiver has already timed out and dropped, send fails —
            // we don't care, the worker just exits.
            let _ = tx.send(Self::try_init_inner());
        });
    // Thread creation failed: honor the "never panic, just fall back"
    // contract instead of aborting the caller.
    if spawned.is_err() {
        return None;
    }
    // Timeout and disconnect both collapse to `None`; a delivered value is
    // already the probe's own `Option`.
    rx.recv_timeout(Duration::from_millis(TRY_INIT_TIMEOUT_MS))
        .ok()
        .flatten()
}
/// The actual adapter probe + pipeline compile. Synchronous. Runs inside
/// the worker thread spawned by [`Self::try_init`]. Never call directly
/// from user code — always go through `try_init` or `shared`.
fn try_init_inner() -> Option<Self> {
let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor {
backends: wgpu::Backends::all(),
..Default::default()
@ -350,13 +408,16 @@ fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu
/// Top-level GPU-or-CPU dispatch entry point.
///
/// Attempts GPU initialization once per call. If the adapter is present the
/// batch dispatches to the shader; otherwise the CPU reference runs. Result
/// types are identical — only the `RolloutPath` tag differs.
/// Uses the process-wide cached [`GpuContext::shared`] — the adapter probe
/// runs exactly once per process, not per call. On hosts with a working GPU
/// adapter this dispatches to the shader; on headless hosts or hosts where
/// the driver is wedged (see `TRY_INIT_TIMEOUT_MS`) it falls through to the
/// CPU reference silently. Result types are identical; only the
/// [`RolloutPath`] tag differs.
///
/// `GpuContext::try_init` is lightweight (reuses the system wgpu instance)
/// but still non-trivial. Callers that dispatch many batches back-to-back
/// should cache a `GpuContext` and call `ctx.batch_simulate` directly.
/// For hot loops that dispatch many batches, consider holding a
/// `&GpuContext` directly via `GpuContext::shared()` to skip the `OnceLock`
/// atomic load per call.
#[must_use]
pub fn batch_simulate(
inputs: &[AbstractRolloutState],
@ -364,7 +425,14 @@ pub fn batch_simulate(
seed: u64,
horizon: u32,
) -> Vec<(f32, RolloutPath)> {
if let Some(ctx) = GpuContext::try_init() {
// Zero-length inputs never touch the GPU cache — fast path.
if inputs.is_empty() {
return Vec::new();
}
if inputs.len() != priors_per_entry.len() {
return Vec::new();
}
if let Some(ctx) = GpuContext::shared() {
return ctx.batch_simulate(inputs, priors_per_entry, seed, horizon);
}
batch_simulate_cpu(inputs, priors_per_entry, seed, horizon)
@ -430,26 +498,94 @@ mod tests {
pod
}
use std::time::Instant;
// ── Tests that do NOT touch the GPU adapter ──────────────────────────
//
// These rely solely on the pre-dispatch guards in `batch_simulate` (empty
// input / mismatched lens) OR on `GpuContext::shared()` returning None
// after the one-time probe. Either way no single test is responsible for
// the probe cost — the first test that *does* need GPU state pays it
// once and caches.
#[test]
fn batch_simulate_empty_is_empty() {
fn batch_simulate_empty_bypasses_gpu_probe() {
// Empty input returns Vec::new() before GpuContext::shared() is ever
// consulted. Must complete in microseconds even on a wedged-adapter
// host; assert a 100ms upper bound with generous slack for CI jitter.
let start = Instant::now();
let out = batch_simulate(&[], &[], 42, 20);
let elapsed = start.elapsed();
assert!(out.is_empty());
assert!(
elapsed < Duration::from_millis(100),
"empty input must bypass GPU probe; took {:?}",
elapsed
);
}
#[test]
fn batch_simulate_mismatched_lengths_is_empty() {
fn batch_simulate_mismatched_lengths_bypasses_gpu_probe() {
let pods = vec![make_entry()];
let priors: Vec<[PersonalityPriors; MAX_PLAYERS]> = vec![iron_vs_bh(), iron_vs_bh()];
// The length guard at the top of `batch_simulate` returns an empty Vec
// before `GpuContext::shared()` is consulted, so no adapter probe (and
// neither the GPU nor the CPU dispatcher) is reached on a mismatch.
let start = Instant::now();
let out = batch_simulate(&pods, &priors, 1, 20);
let elapsed = start.elapsed();
assert!(out.is_empty());
assert!(
elapsed < Duration::from_millis(100),
"length-mismatch must bypass GPU probe; took {:?}",
elapsed
);
}
// ── Timeout contract ─────────────────────────────────────────────────
//
// The central production-safety property: `try_init` MUST bound its own
// wall time. Post-reboot apricot hang (46 minutes, ~0 CPU) is the bug
// this suite regresses against. Budget: 2s per our const + ~1s slack
// for worker-thread scheduling = 3s upper bound.
#[test]
fn try_init_respects_timeout_budget() {
    // Upper bound: the configured budget plus one second of slack.
    let ceiling = Duration::from_millis(TRY_INIT_TIMEOUT_MS + 1000);

    let started = Instant::now();
    let outcome = std::panic::catch_unwind(GpuContext::try_init);
    let took = started.elapsed();

    // Either `Some` or `None` is acceptable; a panic is not.
    assert!(outcome.is_ok(), "try_init must not panic");
    assert!(
        took < ceiling,
        "try_init must honor its {}ms timeout budget; took {:?}",
        TRY_INIT_TIMEOUT_MS,
        took
    );
}
#[test]
fn shared_is_idempotent_and_cached() {
    // Warm the cache first: this call may run the (timeout-bounded) probe,
    // so it is deliberately kept outside the timed region.
    let _warmup = GpuContext::shared();

    // 1000 further lookups must all be served from the cache; the assertion
    // below allows 50ms in total for them.
    let timer = Instant::now();
    (0..1000).for_each(|_| {
        let _ = GpuContext::shared();
    });
    let spent = timer.elapsed();

    assert!(
        spent < Duration::from_millis(50),
        "1000 cached `shared()` calls took {:?} (>{}ms target)",
        spent,
        50
    );
}
// ── Scored-path tests ────────────────────────────────────────────────
//
// These exercise the full dispatch pipeline. The first one to run pays
// the probe cost (bounded by timeout). On a wedged-driver host all of
// these silently route through CPU and still produce valid results.
#[test]
fn batch_simulate_produces_unit_interval_scores() {
// May run on GPU or CPU depending on adapter availability. Either
// way, scores must be in [0, 1].
let pods = vec![make_entry(); 4];
let priors = vec![iron_vs_bh(); 4];
let out = batch_simulate(&pods, &priors, 7, 20);
@ -460,11 +596,7 @@ mod tests {
}
#[test]
fn fallback_returns_cpu_tag_without_gpu() {
// If there's no GPU adapter, batch_simulate_cpu engages and tags
// results with RolloutPath::Cpu. When there IS a GPU adapter, results
// come back as RolloutPath::Gpu. Both are valid; we just assert one
// of them shows up.
fn fallback_returns_valid_path_tag() {
let pods = vec![make_entry()];
let priors = vec![iron_vs_bh()];
let out = batch_simulate(&pods, &priors, 100, 20);
@ -472,19 +604,8 @@ mod tests {
assert!(matches!(out[0].1, RolloutPath::Cpu | RolloutPath::Gpu));
}
#[test]
fn gpu_context_try_init_does_not_panic() {
// `try_init` must never panic on any machine. Headless CI returns
// None; dev machines return Some. Either is fine.
let result = std::panic::catch_unwind(GpuContext::try_init);
assert!(result.is_ok(), "try_init must not panic");
}
#[test]
fn batch_simulate_is_deterministic_across_repeated_calls() {
// Same path-same inputs-same seed must produce bit-identical results.
// This holds regardless of whether we hit CPU or GPU because each
// path is internally deterministic.
let pods = vec![make_entry(); 3];
let priors = vec![iron_vs_bh(); 3];
let a = batch_simulate(&pods, &priors, 77, 20);
@ -511,11 +632,14 @@ mod tests {
assert_eq!(a[0].1, b[0].1);
}
/// If a GPU adapter IS available, make sure we actually get `RolloutPath::Gpu`
/// back (proving the adapter path isn't dead code). Test is no-op on headless.
// ── Adapter-gated tests ──────────────────────────────────────────────
//
// Only run when a working adapter is actually present. On wedged
// drivers `shared()` returns None and these skip — no hang, no panic.
#[test]
fn gpu_path_tags_when_adapter_available() {
let Some(ctx) = GpuContext::try_init() else {
let Some(ctx) = GpuContext::shared() else {
eprintln!("[rollout-gpu] no adapter — skipping gpu_path_tags_when_adapter_available");
return;
};
@ -523,17 +647,20 @@ mod tests {
let priors = vec![iron_vs_bh()];
let out = ctx.batch_simulate(&pods, &priors, 123, 20);
assert_eq!(out.len(), 1);
assert_eq!(out[0].1, RolloutPath::Gpu,
"when GpuContext::try_init succeeds, batch_simulate must return Gpu-tagged results (backend: {})",
ctx.backend);
assert_eq!(
out[0].1,
RolloutPath::Gpu,
"with adapter present, batch_simulate must tag Gpu (backend: {})",
ctx.backend
);
assert!((0.0..=1.0).contains(&out[0].0));
}
/// If a GPU adapter IS available, dispatch determinism must hold across
/// repeated ctx.batch_simulate calls on the same context.
/// Dispatch determinism when an adapter is present. Skips on headless or
/// wedged-driver hosts via the `shared()` None return.
#[test]
fn gpu_dispatch_is_deterministic_when_adapter_available() {
let Some(ctx) = GpuContext::try_init() else {
let Some(ctx) = GpuContext::shared() else {
eprintln!("[rollout-gpu] no adapter — skipping gpu_dispatch_is_deterministic");
return;
};