fix(@projects/@magic-civilization): 🐛 adjust lair-seeking behavior and add gpu init timeout
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
8107416177
commit
b00e965d66
2 changed files with 173 additions and 46 deletions
|
|
@ -952,6 +952,10 @@ func _play_turn() -> void:
|
|||
var lair_max_tier: int = 2 if u.type_id == "dwarf_scout" else 3
|
||||
if hp_ok:
|
||||
lair_target = _find_nearest_low_lair(u.position, lair_max_tier)
|
||||
if _turn_count <= 30 or _turn_count % 50 == 0:
|
||||
print(" LAIR_SEEK: %s hp=%d/%d hp_ok=%s target=%s" % [
|
||||
u.type_id, u.hp, u_max_hp, str(hp_ok), str(lair_target)
|
||||
])
|
||||
if lair_target != Vector2i(-1, -1):
|
||||
_move_toward(u, lair_target, game_map)
|
||||
_try_attack_adjacent_lair(u, game_map)
|
||||
|
|
@ -1582,10 +1586,6 @@ func _try_attack_adjacent(unit: Variant, game_map: RefCounted) -> void:
|
|||
func _try_attack_adjacent_lair(unit: Variant, game_map: RefCounted) -> void:
|
||||
if not unit.is_alive():
|
||||
return
|
||||
# Require ≥50% HP — below this, retaliation reliably kills the attacker
|
||||
var max_hp: int = unit.get_max_hp()
|
||||
if max_hp > 0 and unit.hp < int(max_hp * 0.5):
|
||||
return
|
||||
if not ClassDB.class_exists("GdCombatResolver"):
|
||||
return
|
||||
var neighbors: Array[Vector2i] = HexUtilsScript.get_neighbors(unit.position)
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@
|
|||
//! the terminal f32 score. The shader (`rollout.wgsl`) is the only place
|
||||
//! rollout semantics are re-expressed — everything else is plumbing.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
use std::time::Duration;
|
||||
|
||||
use pollster::block_on;
|
||||
use wgpu::util::DeviceExt;
|
||||
|
||||
|
|
@ -24,6 +27,22 @@ use super::RolloutPath;
|
|||
/// WGSL kernel source, compiled into the binary at build time.
|
||||
const SHADER_SRC: &str = include_str!("rollout.wgsl");
|
||||
|
||||
/// Hard budget for `try_init` — includes adapter probe + device creation +
|
||||
/// pipeline compile. Exceeded only when the driver is wedged (seen post-reboot
|
||||
/// on apricot: 46+ minutes of wall time for what normally takes <100 ms).
|
||||
/// The timeout is a safety belt; a well-behaved driver never approaches this.
|
||||
const TRY_INIT_TIMEOUT_MS: u64 = 2000;
|
||||
|
||||
/// Process-wide singleton. `try_init` is expensive (wgpu instance creation,
|
||||
/// adapter probe, shader compile). Dispatching once per `batch_simulate` call
|
||||
/// wastes hundreds of milliseconds per rollout batch. The cache ensures we
|
||||
/// pay that cost once. `None` after init means "no GPU available; every
|
||||
/// caller falls back to CPU silently."
|
||||
///
|
||||
/// The cache is filled lazily by the first `GpuContext::shared()` call.
|
||||
/// Subsequent calls return the same `&Option<GpuContext>` in O(1).
|
||||
static GPU_SHARED: OnceLock<Option<GpuContext>> = OnceLock::new();
|
||||
|
||||
/// Workgroup X size — must match `@workgroup_size(64, 1, 1)` in `rollout.wgsl`.
|
||||
/// Keep these two numbers in sync; the dispatcher divides batch size by this
|
||||
/// to compute the workgroup count.
|
||||
|
|
@ -92,13 +111,52 @@ pub struct GpuContext {
|
|||
}
|
||||
|
||||
impl GpuContext {
|
||||
/// Attempt to acquire a GPU adapter and compile the rollout pipeline.
|
||||
/// Return the process-wide cached GPU context, initializing it on first
|
||||
/// call. Subsequent calls are O(1) — the probe cost is paid exactly once.
|
||||
///
|
||||
/// Returns `None` if no suitable adapter is present (headless CI,
|
||||
/// missing Vulkan driver, disabled GPU, etc.). Callers fall back to
|
||||
/// [`batch_simulate_cpu`] silently — see [`super::batch_simulate`].
|
||||
/// Returns `None` if adapter probe failed, timed out, or the `gpu`
|
||||
/// feature is disabled. Callers in that case fall back to CPU silently.
|
||||
///
|
||||
/// This is the entry point [`batch_simulate`] uses. Tests and direct
|
||||
/// users should prefer `shared()` over `try_init()` unless they need
|
||||
/// a fresh probe (e.g. the post-reboot hang repro test).
|
||||
#[must_use]
|
||||
pub fn shared() -> Option<&'static Self> {
|
||||
GPU_SHARED.get_or_init(Self::try_init).as_ref()
|
||||
}
|
||||
|
||||
/// Attempt to acquire a GPU adapter and compile the rollout pipeline,
|
||||
/// returning `None` on any failure including a hard timeout.
|
||||
///
|
||||
/// Timeout: `TRY_INIT_TIMEOUT_MS` (2 seconds). On wedged driver state
|
||||
/// (post-reboot mesa, missing weston, disabled DRI device), the wgpu
|
||||
/// internal future `request_adapter` / `request_device` can block for
|
||||
/// tens of minutes with ~0 CPU time. The timeout guarantees any caller
|
||||
/// — test, game, or user — gets a decision within 2 seconds flat.
|
||||
///
|
||||
/// Implementation detail: we spawn a worker thread, run the probe there,
|
||||
/// and wait for its result with `recv_timeout`. If the timeout trips, the worker becomes
|
||||
/// a detached zombie (it will eventually return or die with the process);
|
||||
/// that leaked thread is the correct tradeoff vs. hanging the caller.
|
||||
#[must_use]
|
||||
pub fn try_init() -> Option<Self> {
|
||||
let (tx, rx) = std::sync::mpsc::sync_channel::<Option<Self>>(1);
|
||||
std::thread::spawn(move || {
|
||||
let result = Self::try_init_inner();
|
||||
// If the receiver has already timed out and dropped, send fails —
|
||||
// we don't care, the worker just exits.
|
||||
let _ = tx.send(result);
|
||||
});
|
||||
match rx.recv_timeout(Duration::from_millis(TRY_INIT_TIMEOUT_MS)) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// The actual adapter probe + pipeline compile. Synchronous. Runs inside
|
||||
/// the worker thread spawned by [`Self::try_init`]. Never call directly
|
||||
/// from user code — always go through `try_init` or `shared`.
|
||||
fn try_init_inner() -> Option<Self> {
|
||||
let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor {
|
||||
backends: wgpu::Backends::all(),
|
||||
..Default::default()
|
||||
|
|
@ -350,13 +408,16 @@ fn create_storage_rw(dev: &wgpu::Device, size_bytes: usize, label: &str) -> wgpu
|
|||
|
||||
/// Top-level GPU-or-CPU dispatch entry point.
|
||||
///
|
||||
/// Attempts GPU initialization once per call. If the adapter is present the
|
||||
/// batch dispatches to the shader; otherwise the CPU reference runs. Result
|
||||
/// types are identical — only the `RolloutPath` tag differs.
|
||||
/// Uses the process-wide cached [`GpuContext::shared`] — the adapter probe
|
||||
/// runs exactly once per process, not per call. On hosts with a working GPU
|
||||
/// adapter this dispatches to the shader; on headless hosts or hosts where
|
||||
/// the driver is wedged (see `TRY_INIT_TIMEOUT_MS`) it falls through to the
|
||||
/// CPU reference silently. Result types are identical; only the
|
||||
/// [`RolloutPath`] tag differs.
|
||||
///
|
||||
/// `GpuContext::try_init` is lightweight (reuses the system wgpu instance)
|
||||
/// but still non-trivial. Callers that dispatch many batches back-to-back
|
||||
/// should cache a `GpuContext` and call `ctx.batch_simulate` directly.
|
||||
/// For hot loops that dispatch many batches, consider holding a
|
||||
/// `&GpuContext` directly via `GpuContext::shared()` to skip the `OnceLock`
|
||||
/// atomic load per call.
|
||||
#[must_use]
|
||||
pub fn batch_simulate(
|
||||
inputs: &[AbstractRolloutState],
|
||||
|
|
@ -364,7 +425,14 @@ pub fn batch_simulate(
|
|||
seed: u64,
|
||||
horizon: u32,
|
||||
) -> Vec<(f32, RolloutPath)> {
|
||||
if let Some(ctx) = GpuContext::try_init() {
|
||||
// Zero-length inputs never touch the GPU cache — fast path.
|
||||
if inputs.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
if inputs.len() != priors_per_entry.len() {
|
||||
return Vec::new();
|
||||
}
|
||||
if let Some(ctx) = GpuContext::shared() {
|
||||
return ctx.batch_simulate(inputs, priors_per_entry, seed, horizon);
|
||||
}
|
||||
batch_simulate_cpu(inputs, priors_per_entry, seed, horizon)
|
||||
|
|
@ -430,26 +498,94 @@ mod tests {
|
|||
pod
|
||||
}
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
// ── Tests that do NOT touch the GPU adapter ──────────────────────────
|
||||
//
|
||||
// These rely solely on the pre-dispatch guards in `batch_simulate` (empty
|
||||
// input / mismatched lens) OR on `GpuContext::shared()` returning None
|
||||
// after the one-time probe. Either way no single test is responsible for
|
||||
// the probe cost — the first test that *does* need GPU state pays it
|
||||
// once and caches.
|
||||
|
||||
#[test]
|
||||
fn batch_simulate_empty_is_empty() {
|
||||
fn batch_simulate_empty_bypasses_gpu_probe() {
|
||||
// Empty input returns Vec::new() before GpuContext::shared() is ever
|
||||
// consulted. Must complete in microseconds even on a wedged-adapter
|
||||
// host; assert a 100ms upper bound with generous slack for CI jitter.
|
||||
let start = Instant::now();
|
||||
let out = batch_simulate(&[], &[], 42, 20);
|
||||
let elapsed = start.elapsed();
|
||||
assert!(out.is_empty());
|
||||
assert!(
|
||||
elapsed < Duration::from_millis(100),
|
||||
"empty input must bypass GPU probe; took {:?}",
|
||||
elapsed
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn batch_simulate_mismatched_lengths_is_empty() {
|
||||
fn batch_simulate_mismatched_lengths_bypasses_gpu_probe() {
|
||||
let pods = vec![make_entry()];
|
||||
let priors: Vec<[PersonalityPriors; MAX_PLAYERS]> = vec![iron_vs_bh(), iron_vs_bh()];
|
||||
// The length guard in `batch_simulate` returns an empty Vec before
|
||||
// `GpuContext::shared()` is consulted, so a mismatch never pays the probe.
|
||||
let start = Instant::now();
|
||||
let out = batch_simulate(&pods, &priors, 1, 20);
|
||||
let elapsed = start.elapsed();
|
||||
assert!(out.is_empty());
|
||||
assert!(
|
||||
elapsed < Duration::from_millis(100),
|
||||
"length-mismatch must bypass GPU probe; took {:?}",
|
||||
elapsed
|
||||
);
|
||||
}
|
||||
|
||||
// ── Timeout contract ─────────────────────────────────────────────────
|
||||
//
|
||||
// The central production-safety property: `try_init` MUST bound its own
|
||||
// wall time. Post-reboot apricot hang (46 minutes, ~0 CPU) is the bug
|
||||
// this suite regresses against. Budget: 2s per our const + ~1s slack
|
||||
// for worker-thread scheduling = 3s upper bound.
|
||||
|
||||
#[test]
|
||||
fn try_init_respects_timeout_budget() {
|
||||
let start = Instant::now();
|
||||
let result = std::panic::catch_unwind(GpuContext::try_init);
|
||||
let elapsed = start.elapsed();
|
||||
assert!(result.is_ok(), "try_init must not panic");
|
||||
assert!(
|
||||
elapsed < Duration::from_millis(TRY_INIT_TIMEOUT_MS + 1000),
|
||||
"try_init must honor its {}ms timeout budget; took {:?}",
|
||||
TRY_INIT_TIMEOUT_MS,
|
||||
elapsed
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shared_is_idempotent_and_cached() {
|
||||
// First call may probe (within timeout); subsequent calls must be
|
||||
// O(1) — well under 1ms. Verifies OnceLock caching actually works.
|
||||
let _first = GpuContext::shared();
|
||||
let start = Instant::now();
|
||||
for _ in 0..1000 {
|
||||
let _ = GpuContext::shared();
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
assert!(
|
||||
elapsed < Duration::from_millis(50),
|
||||
"1000 cached `shared()` calls took {:?} (>{}ms target)",
|
||||
elapsed,
|
||||
50
|
||||
);
|
||||
}
|
||||
|
||||
// ── Scored-path tests ────────────────────────────────────────────────
|
||||
//
|
||||
// These exercise the full dispatch pipeline. The first one to run pays
|
||||
// the probe cost (bounded by timeout). On a wedged-driver host all of
|
||||
// these silently route through CPU and still produce valid results.
|
||||
|
||||
#[test]
|
||||
fn batch_simulate_produces_unit_interval_scores() {
|
||||
// May run on GPU or CPU depending on adapter availability. Either
|
||||
// way, scores must be in [0, 1].
|
||||
let pods = vec![make_entry(); 4];
|
||||
let priors = vec![iron_vs_bh(); 4];
|
||||
let out = batch_simulate(&pods, &priors, 7, 20);
|
||||
|
|
@ -460,11 +596,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn fallback_returns_cpu_tag_without_gpu() {
|
||||
// If there's no GPU adapter, batch_simulate_cpu engages and tags
|
||||
// results with RolloutPath::Cpu. When there IS a GPU adapter, results
|
||||
// come back as RolloutPath::Gpu. Both are valid; we just assert one
|
||||
// of them shows up.
|
||||
fn fallback_returns_valid_path_tag() {
|
||||
let pods = vec![make_entry()];
|
||||
let priors = vec![iron_vs_bh()];
|
||||
let out = batch_simulate(&pods, &priors, 100, 20);
|
||||
|
|
@ -472,19 +604,8 @@ mod tests {
|
|||
assert!(matches!(out[0].1, RolloutPath::Cpu | RolloutPath::Gpu));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gpu_context_try_init_does_not_panic() {
|
||||
// `try_init` must never panic on any machine. Headless CI returns
|
||||
// None; dev machines return Some. Either is fine.
|
||||
let result = std::panic::catch_unwind(GpuContext::try_init);
|
||||
assert!(result.is_ok(), "try_init must not panic");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn batch_simulate_is_deterministic_across_repeated_calls() {
|
||||
// Same path, same inputs, and same seed must produce bit-identical results.
|
||||
// This holds regardless of whether we hit CPU or GPU because each
|
||||
// path is internally deterministic.
|
||||
let pods = vec![make_entry(); 3];
|
||||
let priors = vec![iron_vs_bh(); 3];
|
||||
let a = batch_simulate(&pods, &priors, 77, 20);
|
||||
|
|
@ -511,11 +632,14 @@ mod tests {
|
|||
assert_eq!(a[0].1, b[0].1);
|
||||
}
|
||||
|
||||
/// If a GPU adapter IS available, make sure we actually get `RolloutPath::Gpu`
|
||||
/// back (proving the adapter path isn't dead code). Test is no-op on headless.
|
||||
// ── Adapter-gated tests ──────────────────────────────────────────────
|
||||
//
|
||||
// Only run when a working adapter is actually present. On wedged
|
||||
// drivers `shared()` returns None and these skip — no hang, no panic.
|
||||
|
||||
#[test]
|
||||
fn gpu_path_tags_when_adapter_available() {
|
||||
let Some(ctx) = GpuContext::try_init() else {
|
||||
let Some(ctx) = GpuContext::shared() else {
|
||||
eprintln!("[rollout-gpu] no adapter — skipping gpu_path_tags_when_adapter_available");
|
||||
return;
|
||||
};
|
||||
|
|
@ -523,17 +647,20 @@ mod tests {
|
|||
let priors = vec![iron_vs_bh()];
|
||||
let out = ctx.batch_simulate(&pods, &priors, 123, 20);
|
||||
assert_eq!(out.len(), 1);
|
||||
assert_eq!(out[0].1, RolloutPath::Gpu,
|
||||
"when GpuContext::try_init succeeds, batch_simulate must return Gpu-tagged results (backend: {})",
|
||||
ctx.backend);
|
||||
assert_eq!(
|
||||
out[0].1,
|
||||
RolloutPath::Gpu,
|
||||
"with adapter present, batch_simulate must tag Gpu (backend: {})",
|
||||
ctx.backend
|
||||
);
|
||||
assert!((0.0..=1.0).contains(&out[0].0));
|
||||
}
|
||||
|
||||
/// If a GPU adapter IS available, dispatch determinism must hold across
|
||||
/// repeated ctx.batch_simulate calls on the same context.
|
||||
/// Dispatch determinism when an adapter is present. Skips on headless or
|
||||
/// wedged-driver hosts via the `shared()` None return.
|
||||
#[test]
|
||||
fn gpu_dispatch_is_deterministic_when_adapter_available() {
|
||||
let Some(ctx) = GpuContext::try_init() else {
|
||||
let Some(ctx) = GpuContext::shared() else {
|
||||
eprintln!("[rollout-gpu] no adapter — skipping gpu_dispatch_is_deterministic");
|
||||
return;
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue