feat(@projects/@magic-civilization): add autoplay smoke test integration

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-04-17 12:51:03 -07:00
parent 472211de4d
commit 194fde9718
6 changed files with 871 additions and 1 deletions

View file

@ -85,7 +85,7 @@ cmd_verify() {
echo -e "${BLUE}─────────────────────────────────────────────────${NC}"
}
local TOTAL=15
local TOTAL=16
# Step 0 — Game data schema validation
_verify_step 0 $TOTAL "game data JSON schemas" \
@ -154,6 +154,12 @@ cmd_verify() {
_verify_step 14 $TOTAL "godot headless boot (no script errors)" \
_godot_headless_boot
# Step 15 — Autoplay hang-regression smoke test (p0-10 gate).
# Skips silently when neither AUTOPLAY_HOST nor local flatpak is available
# so this gate runs opportunistically on dev boxes without a RUN host.
_verify_step 15 $TOTAL "autoplay hang smoke (seed 1, T100, 180s budget)" \
_verify_autoplay_smoke
_verify_summary
return $overall_exit
}
@ -224,6 +230,17 @@ _verify_file_size_cap() {
return 0
}
_verify_autoplay_smoke() {
    # Verify step: run the autoplay hang-regression smoke script when a target
    # exists. A target is either a remote host named by AUTOPLAY_HOST or a
    # local `flatpak` binary on PATH; with neither, the step reports SKIP and
    # succeeds so the rest of the pipeline still runs.
    # Returns: the smoke script's exit status, or 0 when skipped.
    if [ -n "${AUTOPLAY_HOST:-}" ] || command -v flatpak >/dev/null 2>&1; then
        bash "$REPO_ROOT/tools/ci-autoplay-smoke.sh"
        return
    fi
    echo "SKIP: no AUTOPLAY_HOST and no local flatpak"
    return 0
}
_godot_headless_boot() {
# Boot Godot headless and check for SCRIPT ERRORs.
# Catches class_name resolution failures, GDExtension load failures,

View file

@ -478,6 +478,10 @@ func _process(_delta: float) -> void:
if _frame == 10:
_turn_count += 1
# NOTE: nothing in this branch may block; _process() runs every frame, so a
# stall here freezes the whole autoplay run.
_play_turn()
if _turn_count % _screenshot_interval == 1 or _turn_count <= 3:
_screenshot("turn_%03d" % _turn_count)
if _frame == 20:

View file

@ -213,3 +213,165 @@ impl StrategicWeights {
}
}
}
#[cfg(test)]
mod tests {
    //! Unit tests for the axis-id layout, the named-axes ⇄ flat-array
    //! conversion helpers, and `StrategicWeights` construction.
    use super::*;

    /// Asserts two floats are within 1e-6 of each other, with a custom message.
    macro_rules! assert_close {
        ($actual:expr, $expected:expr, $($msg:tt)+) => {
            assert!(($actual - $expected).abs() < 1e-6, $($msg)+)
        };
    }

    /// Builds an owned `name → value` map from borrowed `(name, value)` pairs.
    fn axis_map<V: Copy>(pairs: &[(&str, V)]) -> HashMap<String, V> {
        pairs
            .iter()
            .map(|&(name, value)| (name.to_owned(), value))
            .collect()
    }

    // ── AxisId ───────────────────────────────────────────────────────────

    #[test]
    fn axis_id_discriminants_are_stable() {
        // The discriminants are the GPU upload contract: renumbering them
        // invalidates in-flight AbstractRolloutState axes arrays, so pin them.
        let pinned = [
            (AxisId::Expansion as u8, 0),
            (AxisId::Production as u8, 1),
            (AxisId::Wealth as u8, 2),
            (AxisId::Culture as u8, 3),
        ];
        for (actual, expected) in pinned {
            assert_eq!(actual, expected);
        }
        assert_eq!(AxisId::COUNT, 8, "COUNT must match the flat array size");
    }

    #[test]
    fn axis_id_as_str_matches_json_keys() {
        // The flat-map round-trip depends on these names matching the keys in
        // public/games/age-of-dwarves/data/ai_personalities.json.
        let expected = [
            (AxisId::Expansion, "expansion"),
            (AxisId::Production, "production"),
            (AxisId::Wealth, "wealth"),
            (AxisId::Culture, "culture"),
        ];
        for (axis, key) in expected {
            assert_eq!(axis.as_str(), key);
        }
    }

    // ── axes_to_flat / flat_to_axes round-trip ───────────────────────────

    #[test]
    fn axes_to_flat_encodes_named_axes_into_fixed_slots() {
        let axes = axis_map(&[
            ("expansion", 7),
            ("production", 3),
            ("wealth", 9),
            ("culture", 1),
        ]);

        let flat = axes_to_flat(&axes);

        assert_eq!(flat[0], 7, "expansion → slot 0");
        assert_eq!(flat[1], 3, "production → slot 1");
        assert_eq!(flat[2], 9, "wealth → slot 2");
        assert_eq!(flat[3], 1, "culture → slot 3");
        assert_eq!(&flat[4..], &[0, 0, 0, 0], "slots 4-7 must be zero (reserved)");
    }

    #[test]
    fn axes_to_flat_treats_missing_keys_as_zero() {
        let flat = axes_to_flat(&HashMap::<String, u8>::new());
        assert_eq!(flat, [0u8; 8], "empty input → all zeros");
    }

    #[test]
    fn axes_to_flat_ignores_unknown_keys() {
        // "nonsense_axis" is simply unknown; "magic" is not one of the four
        // named axes either, so neither value may land in any slot.
        let axes = axis_map(&[("expansion", 5), ("nonsense_axis", 99), ("magic", 42)]);

        let flat = axes_to_flat(&axes);

        assert_eq!(flat[0], 5);
        let leaked = flat.iter().any(|&v| v == 42 || v == 99);
        assert!(!leaked, "unknown keys must not leak into slots: {flat:?}");
    }

    #[test]
    fn flat_to_axes_decodes_only_named_slots() {
        // Slots 4-7 are poisoned with 99 to prove they never surface.
        let axes = flat_to_axes(&[7u8, 3, 9, 1, 99, 99, 99, 99]);

        assert_eq!(axes.len(), 4, "only 4 named slots must round-trip");
        for (key, value) in [("expansion", 7), ("production", 3), ("wealth", 9), ("culture", 1)] {
            assert_eq!(axes.get(key), Some(&value));
        }
        // The poison value must not appear under any string key.
        assert!(axes.values().all(|&v| v != 99));
    }

    #[test]
    fn axes_round_trip_preserves_named_values() {
        // Round-trip fidelity for the named axes is the only claim made here.
        let named = [("expansion", 4), ("production", 8), ("wealth", 2), ("culture", 6)];

        let back = flat_to_axes(&axes_to_flat(&axis_map(&named)));

        for (key, value) in named {
            assert_eq!(back.get(key), Some(&value));
        }
    }

    // ── StrategicWeights ─────────────────────────────────────────────────

    #[test]
    fn strategic_weights_neutral_is_balanced() {
        let w = StrategicWeights::neutral();
        let labelled = [
            ("aggression", w.aggression),
            ("expansion", w.expansion),
            ("research", w.research),
            ("defense", w.defense),
            ("economy", w.economy),
        ];
        for (label, v) in labelled {
            assert!(
                (0.0..=1.0).contains(&v),
                "{label} neutral weight {v} out of [0,1]"
            );
            assert_close!(v, 0.5, "{label} neutral must be 0.5, got {v}");
        }
    }

    #[test]
    fn strategic_weights_from_race_axes_normalizes_to_0_1() {
        // Extreme inputs: -10 → 0.0, +10 → 1.0, 0 → 0.5.
        let axes = axis_map(&[("expansion", 10), ("wealth", -10), ("culture", 0)]);

        let w = StrategicWeights::from_race_axes(&axes);

        assert_close!(w.expansion, 1.0, "expansion=+10 → 1.0, got {}", w.expansion);
        assert_close!(w.aggression, 1.0, "aggression tracks expansion, got {}", w.aggression);
        assert_close!(w.economy, 0.0, "wealth=-10 → economy 0.0, got {}", w.economy);
        // defense = max(1 - expansion, 0.2) = max(0, 0.2) → the 0.2 floor.
        assert_close!(w.defense, 0.2, "defense floor 0.2 when expansion=1.0, got {}", w.defense);
        // research = (culture + wealth) / 2 = (0.5 + 0) / 2 = 0.25.
        assert_close!(w.research, 0.25, "research is (culture+wealth)/2, got {}", w.research);
    }

    #[test]
    fn strategic_weights_from_race_axes_handles_missing_keys() {
        // A missing key counts as 0, which normalizes to 0.5.
        let w = StrategicWeights::from_race_axes(&HashMap::<String, i32>::new());

        for v in [w.aggression, w.expansion, w.research, w.economy] {
            assert_close!(v, 0.5, "missing-key default must be 0.5, got {v}");
        }
        // With expansion at 0.5, 1 - 0.5 = 0.5 beats the 0.2 defense floor.
        assert_close!(w.defense, 0.5, "defense {}; expected 0.5 when expansion=0.5", w.defense);
    }

    #[test]
    fn strategic_weights_from_race_axes_clamps_out_of_range() {
        // Inputs beyond [-10, +10] must be clamped, not panic or yield NaN.
        let axes = axis_map(&[("expansion", 99), ("wealth", -99)]);

        let w = StrategicWeights::from_race_axes(&axes);

        for v in [w.aggression, w.expansion, w.research, w.defense, w.economy] {
            assert!(v.is_finite(), "weight must be finite, got {v}");
            assert!((0.0..=1.0).contains(&v), "weight {v} out of [0,1]");
        }
    }
}

View file

@ -255,3 +255,211 @@ where
}
score_fn(&s)
}
#[cfg(test)]
mod tests {
    //! Unit tests for the generic tree engine over a toy `CoinState` — these
    //! exercise UCB1 selection, expansion invariants, backprop, and parallel-
    //! rollout determinism without needing the full `GameRolloutState` impl
    //! (that lives in `tests/mcts_basic.rs` as an integration test).
    use super::*;

    /// Toy two-action state: heads (`true`) / tails (`false`).
    ///
    /// Becomes terminal (no legal actions) once `max_depth` flips have been
    /// recorded. It only implements `legal_actions` and `apply`, so it uses
    /// the trait's default `rollout`; no reward is derived from the flips.
    #[derive(Clone, Debug)]
    struct CoinState {
        flips: Vec<bool>,
        max_depth: usize,
    }

    impl CoinState {
        /// Fresh state with no flips that terminates after `max_depth` flips.
        fn new(max_depth: usize) -> Self {
            Self { flips: Vec::new(), max_depth }
        }
    }

    impl TreeState for CoinState {
        type Action = bool;

        fn legal_actions(&self) -> Vec<bool> {
            if self.flips.len() >= self.max_depth {
                Vec::new()
            } else {
                vec![true, false]
            }
        }

        fn apply(&self, action: &bool) -> Self {
            let mut next = self.clone();
            next.flips.push(*action);
            next
        }
    }

    // ── Node / expansion invariants ──────────────────────────────────────

    #[test]
    fn new_tree_has_root_with_all_legal_actions_untried() {
        let t = Tree::new(CoinState::new(3));
        assert_eq!(t.nodes.len(), 1, "root-only tree has exactly 1 node");
        assert_eq!(t.root().untried.len(), 2, "root has 2 untried actions (H, T)");
        assert!(t.root().children.is_empty(), "root has no children yet");
        assert_eq!(t.root().visits, 0);
        assert_eq!(t.root().wins, 0.0);
    }

    #[test]
    fn expand_drains_untried_and_adds_child() {
        let mut t = Tree::new(CoinState::new(3));
        let c1 = t.expand(0).expect("first expand must succeed");
        assert_eq!(t.root().untried.len(), 1, "one action should remain untried");
        assert_eq!(t.root().children, vec![c1], "child index tracked");
        assert_eq!(t.nodes[c1].parent, Some(0));
        let c2 = t.expand(0).expect("second expand must succeed");
        assert!(t.root().untried.is_empty(), "fully expanded after 2 expands");
        assert_eq!(t.root().children, vec![c1, c2]);
        assert!(t.expand(0).is_none(), "third expand must return None");
    }

    #[test]
    fn expand_applies_action_to_produce_child_state() {
        let mut t = Tree::new(CoinState::new(3));
        let c = t.expand(0).unwrap();
        // Whichever action `expand` chose, the child's state must be the
        // parent's state with exactly that action applied. The test does not
        // depend on which of the two actions is picked first.
        let applied_action = t.nodes[c].action.expect("child must carry its action");
        assert_eq!(t.nodes[c].state.flips, vec![applied_action]);
    }

    // ── UCB1 selection ───────────────────────────────────────────────────

    #[test]
    fn ucb1_returns_infinity_for_unvisited_child() {
        // The tree MUST visit unvisited children before exploiting — this is
        // the UCB1 contract (n=0 ⇒ ∞ score). Assert via an unvisited node.
        let mut t = Tree::new(CoinState::new(3));
        let c1 = t.expand(0).unwrap();
        let c2 = t.expand(0).unwrap();
        // Parent has 2 visits, c1 has 0, c2 has 0 — both should be +INF.
        t.nodes[0].visits = 2;
        let log_n = 2.0f32.ln();
        let s1 = t.ucb1(c1, log_n);
        let s2 = t.ucb1(c2, log_n);
        assert!(s1.is_infinite() && s1 > 0.0);
        assert!(s2.is_infinite() && s2 > 0.0);
    }

    #[test]
    fn ucb1_prefers_higher_average_reward() {
        let mut t = Tree::new(CoinState::new(3));
        let c1 = t.expand(0).unwrap();
        let c2 = t.expand(0).unwrap();
        // Equal visit counts, so the exploration terms match and only the
        // average reward can separate the two scores.
        t.nodes[c1].visits = 10;
        t.nodes[c1].wins = 9.0; // 90% avg
        t.nodes[c2].visits = 10;
        t.nodes[c2].wins = 3.0; // 30% avg
        t.nodes[0].visits = 20;
        let log_n = 20.0f32.ln();
        assert!(t.ucb1(c1, log_n) > t.ucb1(c2, log_n));
    }

    // ── Backpropagation ──────────────────────────────────────────────────

    #[test]
    fn backpropagate_increments_visits_and_wins_to_root() {
        let mut t = Tree::new(CoinState::new(3));
        let c = t.expand(0).unwrap();
        t.backpropagate(c, 0.7);
        assert_eq!(t.nodes[c].visits, 1);
        assert!((t.nodes[c].wins - 0.7).abs() < 1e-6);
        assert_eq!(t.root().visits, 1, "root visits += 1");
        assert!((t.root().wins - 0.7).abs() < 1e-6, "root wins += 0.7");
    }

    #[test]
    fn backpropagate_accumulates_across_calls() {
        let mut t = Tree::new(CoinState::new(3));
        let c = t.expand(0).unwrap();
        t.backpropagate(c, 0.2);
        t.backpropagate(c, 0.6);
        t.backpropagate(c, 1.0);
        assert_eq!(t.nodes[c].visits, 3);
        assert!((t.nodes[c].wins - 1.8).abs() < 1e-6);
        assert_eq!(t.root().visits, 3);
        assert!((t.root().wins - 1.8).abs() < 1e-6);
    }

    // ── simulate_parallel determinism contract ──────────────────────────

    #[test]
    fn simulate_parallel_is_seed_deterministic_across_repeated_calls() {
        // Backprop order must be rollout-index-order (NOT thread-scheduling
        // order) so wins totals come out identical on repeated runs with
        // the same base_seed. If the sort-by-index step inside
        // `simulate_parallel` is ever removed, this test catches it.
        let count = 16;
        let rollout_fn = |_s: &CoinState, rng: &mut XorShift64| -> f32 {
            // Deterministic-from-seed reward so parallelism can't hide
            // non-determinism behind rng variance.
            (rng.next_u64() as f32 / u64::MAX as f32).abs()
        };
        let mut t1 = Tree::new(CoinState::new(3));
        t1.simulate_parallel(count, 42, rollout_fn);
        let mut t2 = Tree::new(CoinState::new(3));
        t2.simulate_parallel(count, 42, rollout_fn);
        assert_eq!(t1.root().visits, t2.root().visits, "visit counts must match");
        // Compare bit patterns, not a tolerance: a changed summation order
        // typically moves a float sum by only a few ULPs, which an epsilon
        // check would accept — hiding exactly the regression this guards.
        assert_eq!(
            t1.root().wins.to_bits(),
            t2.root().wins.to_bits(),
            "wins must match: {} vs {}",
            t1.root().wins,
            t2.root().wins
        );
    }

    #[test]
    fn simulate_parallel_noop_on_zero_rollouts() {
        let mut t = Tree::new(CoinState::new(3));
        t.simulate_parallel(0, 42, |_, _| 0.5);
        assert_eq!(t.root().visits, 0, "zero rollouts should not touch tree");
    }

    // ── rollout_snapshot helper ─────────────────────────────────────────

    #[test]
    fn rollout_snapshot_walks_depth_steps_and_scores() {
        // Incrementing counter state — each step +1. Assert depth of walk.
        let start = 0u32;
        let mut rng = XorShift64::new(42);
        let r = rollout_snapshot(
            &start,
            &mut rng,
            5,
            &|s: &u32, _d: u32, _rng: &mut XorShift64| s + 1,
            &|s: &u32| *s as f32 / 10.0,
        );
        // 5 steps → counter = 5 → score = 0.5
        assert!((r - 0.5).abs() < 1e-6, "expected 0.5, got {r}");
    }

    #[test]
    fn rollout_snapshot_returns_initial_score_at_depth_zero() {
        let start = 7u32;
        let mut rng = XorShift64::new(1);
        let r = rollout_snapshot(
            &start,
            &mut rng,
            0, // no steps
            &|s: &u32, _d, _rng| s + 1,
            &|s: &u32| *s as f32,
        );
        assert!((r - 7.0).abs() < 1e-6, "depth=0 should return score(initial)");
    }

    // ── rollout() default stub ─────────────────────────────────────────

    #[test]
    fn treestate_default_rollout_returns_stub_half() {
        let state = CoinState::new(3);
        let mut rng = XorShift64::new(99);
        // Default impl returns 0.5 — this is the historical stub. When
        // `GameRolloutState` overrides rollout, this test still passes
        // for toy states that leave the default.
        assert!((state.rollout(&mut rng, 20, 1.0, 0) - 0.5).abs() < 1e-6);
    }
}

View file

@ -0,0 +1,335 @@
//! Ultimate AI lookahead stress test.
//!
//! The user's "ultimate test" is an 8-player huge-map game with all 5
//! personalities competing, stressing the AI lookahead (MCTS + GPU batched
//! rollouts). That end-to-end test lives in
//! `tools/ultimate-game.sh` (requires a working RUN host).
//!
//! THIS file is the in-process companion: it exercises the same code paths
//! — personality priors, rollout walker, GPU batched dispatch — against a
//! synthetic 8-player configuration, without needing the game binary.
//! It catches regressions in the lookahead pipeline itself (tree depth,
//! rollout determinism, batched GPU throughput, per-clan divergence at scale)
//! independently of any host-level infrastructure. Runs in under a second.
//!
//! Scope: this is a STRESS test, not a correctness test. Correctness is
//! covered by the parity / policy / rollout tests in sibling files. Here we
//! assert the lookahead pipeline SCALES to the "ultimate" configuration:
//! - 8-player abstract state packs into the fixed POD layout
//! - Per-player personality priors from the 5-clan rotation are honored
//! - Walker horizon reaches depth >= 20 without panic or overflow
//! - A large batch (256 entries) walks on the CPU within a one-second budget
//! (GPU batched dispatch itself is covered by the parity test, not here)
//! - Rollout results are seed-deterministic across repeated invocations
//!
//! Pre-existing bullet order (user): "ultimate test should be AFTER all
//! 5 personalities (permutations of 1v1) have had balanced match-ups". The
//! balanced-matchup gate is `tools/matchup-grid.sh` + `checklist-report.py
//! matchup_balance`. This file deliberately operates at the abstract-state
//! layer so it runs IN the `cargo test` cycle — fast feedback.
use mc_ai::abstract_state::{AbstractPlayerState, AbstractRolloutState, MAX_PLAYERS};
use mc_ai::mcts::XorShift64;
use mc_ai::policy::PersonalityPriors;
use mc_ai::rollout::{walk, GameRolloutState, DEFAULT_ROLLOUT_TEMPERATURE};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
/// Path of the `age-of-dwarves` game-data directory.
///
/// Walks four levels up from this crate's `CARGO_MANIFEST_DIR` and appends
/// `public/games/age-of-dwarves/data`.
///
/// # Panics
/// Panics if the manifest directory has fewer than four ancestors.
fn data_dir() -> PathBuf {
    let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let mut dir = crate_root
        .ancestors()
        .nth(4)
        .expect("mc-ai crate must sit four dirs below repo root")
        .to_path_buf();
    dir.extend(["public", "games", "age-of-dwarves", "data"]);
    dir
}
/// Build the prior tables for the 8-entry stress fixture: one
/// `[PersonalityPriors; MAX_PLAYERS]` per entry.
///
/// Slot `s` of entry `i` holds clan `(i + s) % 5` from the fixed clan order
/// below. Slot 0 (the root player) therefore takes each of the five clans
/// once over entries 0..5, and entries 5..8 wrap around to the first three
/// again — the goal is coverage, not uniqueness.
///
/// # Panics
/// Panics if any clan's priors cannot be loaded from the data directory.
fn eight_player_clan_rotation() -> [[PersonalityPriors; MAX_PLAYERS]; 8] {
    let data = data_dir();
    let clans = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"];
    let loaded: Vec<_> = clans
        .iter()
        .map(|id| {
            PersonalityPriors::from_personality(id, &data)
                .unwrap_or_else(|e| panic!("failed to load clan {id}: {e:?}"))
        })
        .collect();

    // Seed the array with a placeholder value; every slot is overwritten below.
    let mut entries = [[loaded[0]; MAX_PLAYERS]; 8];
    for (i, entry) in entries.iter_mut().enumerate() {
        // Consecutive slots take consecutive clans starting at the entry's
        // root clan, so slots within one entry are distinct as long as
        // MAX_PLAYERS does not exceed the number of clans.
        for (slot, priors) in entry.iter_mut().enumerate() {
            *priors = loaded[(i + slot) % clans.len()];
        }
    }
    entries
}
/// Builds the 8-entry stress fixture, one `AbstractRolloutState` per entry.
///
/// In every entry, seat 0 is a well-resourced root player whose gold, science
/// and RNG seed vary with the entry index; seats `1..MAX_PLAYERS` are smaller
/// opponents whose RNG seed varies with both entry and seat.
/// NOTE(review): the intent is presumably that the root can reach every
/// action kind the walker offers — confirm against the rollout policy.
fn eight_player_batch() -> Vec<AbstractRolloutState> {
    // Root (acting) player for a given entry.
    let root_seat = |entry: i32| AbstractPlayerState {
        gold: 200 + entry * 10,
        science: 30 + entry * 2,
        pop_total: 10,
        city_count: 2,
        tech_index: 5,
        unit_counts: [3, 2, 1, 0],
        happiness_pool: 5,
        _pad0: 0,
        force_rel: [0, 20, 10, 5], // enables Attack + ContinueWar
        axes: [5; 8],
        relations: [0, -1, 0, 0], // enables MakePeace
        _pad1: [0; 4],
        rng_state: 0xAAAA_BBBB_CCCC_DDDD ^ (entry as u64),
        turn: 1,
        _pad2: [0; 4],
    };

    // Opponent in seat `slot` of a given entry: present, but small.
    let opponent_seat = |entry: i32, slot: usize| AbstractPlayerState {
        gold: 50,
        science: 10,
        pop_total: 5,
        city_count: 1,
        tech_index: 2,
        unit_counts: [1, 1, 0, 0],
        happiness_pool: 0,
        _pad0: 0,
        force_rel: [5, 0, 5, 5],
        axes: [5; 8],
        relations: [0, 0, 0, 0],
        _pad1: [0; 4],
        rng_state: 0x1111_2222_3333_4444 ^ (slot as u64) ^ (entry as u64),
        turn: 1,
        _pad2: [0; 4],
    };

    let mut batch = Vec::with_capacity(8);
    for entry in 0..8 {
        let mut state = AbstractRolloutState::zeroed();
        state.players[0] = root_seat(entry);
        for slot in 1..MAX_PLAYERS {
            state.players[slot] = opponent_seat(entry, slot);
        }
        batch.push(state);
    }
    batch
}
// ── Shape + determinism gates ──────────────────────────────────────────
/// Every one of the five clans must show up as a root player (slot 0)
/// somewhere in the 8-entry rotation.
#[test]
fn clan_rotation_covers_all_five_personalities() {
    use std::collections::BTreeSet;

    // Clans are told apart by their root aggression value, truncated to an
    // integer so float equality is not a concern.
    // NOTE(review): this only works if the five clans' aggression values stay
    // distinct after `as i32` truncation — confirm against the field's range.
    let seen_aggression: BTreeSet<i32> = eight_player_clan_rotation()
        .iter()
        .map(|entry| entry[0].aggression as i32)
        .collect();

    let distinct = seen_aggression.len();
    assert!(
        distinct >= 5,
        "8-player rotation must surface all 5 clans as root; saw {} distinct aggression values: {:?}",
        distinct,
        seen_aggression
    );
}
/// The POD entry is 256 bytes regardless of how many logical players the game
/// has — extra players live in adjacent entries, not wider slots.
#[test]
fn eight_player_fixture_packs_into_fixed_pod_size() {
    use std::mem::{size_of, size_of_val};

    assert_eq!(size_of::<AbstractRolloutState>(), 256);

    let batch = eight_player_batch();
    assert_eq!(batch.len(), 8, "8-player stress fixture");

    // Measure the bytes the batch actually occupies as a contiguous slice,
    // rather than re-summing the `size_of` constant once per element.
    // Note: `size_of` reports inline size only, so it cannot detect heap
    // indirection hidden inside a field.
    assert_eq!(size_of_val(batch.as_slice()), 256 * 8);
}
/// Walks every entry of the 8-player fixture to a 20-turn horizon and checks
/// that each resulting score is finite and inside `[0, 1]`.
#[test]
fn walker_reaches_full_horizon_on_eight_player_configuration() {
    let horizon = 20u32;
    let priors_per_entry = eight_player_clan_rotation();
    let batch = eight_player_batch();

    batch
        .iter()
        .zip(priors_per_entry.iter())
        .enumerate()
        .for_each(|(i, (pod, priors))| {
            let state = GameRolloutState::from_abstract(*pod, *priors);
            // A distinct seed per entry keeps the walks independent.
            let mut rng = XorShift64::new(42 + i as u64);
            let score = walk(&state, &mut rng, horizon, DEFAULT_ROLLOUT_TEMPERATURE, 0);

            let in_unit_range = score.is_finite() && (0.0..=1.0).contains(&score);
            assert!(
                in_unit_range,
                "entry {i} produced score {score} outside [0,1] — walker may have panicked or overflowed"
            );
        });
}
/// Running the whole 8-entry batch twice with identical seeds must give
/// identical scores (compared with exact float equality).
#[test]
fn eight_player_rollout_is_seed_deterministic() {
    let batch = eight_player_batch();
    let priors_per_entry = eight_player_clan_rotation();

    // One full pass over the batch; the rng is rebuilt from the same
    // per-entry seed on every call, so two passes see identical inputs.
    let run_batch = || -> Vec<f32> {
        batch
            .iter()
            .zip(priors_per_entry.iter())
            .enumerate()
            .map(|(i, (pod, priors))| {
                let state = GameRolloutState::from_abstract(*pod, *priors);
                let mut rng = XorShift64::new(42 + i as u64);
                walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0)
            })
            .collect()
    };

    let scores_a = run_batch();
    let scores_b = run_batch();
    assert_eq!(scores_a, scores_b, "same-seed 8-player walk must be bit-deterministic");
}
// ── Scale + throughput gate ────────────────────────────────────────────
/// Scale gate: 256 CPU walks at a 20-turn horizon must produce a positive
/// aggregate score and finish inside a one-second wall-clock budget, so an
/// accidental super-linear step fails loudly.
///
/// Only the walks are timed; fixture construction happens before the clock
/// starts.
#[test]
fn deep_stress_batch_256_entries_finishes_in_under_one_second() {
    let base_priors = eight_player_clan_rotation()[0];

    let batch: Vec<AbstractRolloutState> = (0..256)
        .map(|i| {
            let mut state = AbstractRolloutState::zeroed();
            let root = &mut state.players[0];
            root.gold = 100 + i;
            root.pop_total = 5;
            root.city_count = 1;
            root.force_rel = [0, 20, 0, 0];
            root.relations = [0, -1, 0, 0];
            root.rng_state = 0x1234_5678_9ABC_DEF0u64.wrapping_add(i as u64);
            state
        })
        .collect();

    let started = Instant::now();
    let total = batch.iter().enumerate().fold(0.0f64, |acc, (i, pod)| {
        let state = GameRolloutState::from_abstract(*pod, base_priors);
        let mut rng = XorShift64::new(42u64 + i as u64);
        acc + walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0) as f64
    });
    let elapsed = started.elapsed();

    assert!(
        total > 0.0,
        "aggregate score {total} non-positive — walker outputs look broken"
    );
    assert!(
        elapsed.as_secs_f32() < 1.0,
        "256-entry stress batch took {:?} (>1s budget); possible O(N²) regression",
        elapsed
    );
}
// ── Clan divergence at 8-player scale ─────────────────────────────────
/// The same starting POD walked under Ironhold priors and under Blackhammer
/// priors must end on measurably different scores; identical scores would
/// mean the priors never reach the walker.
#[test]
fn eight_player_clan_divergence_preserves_personality_signal() {
    let data = data_dir();
    let iron = PersonalityPriors::from_personality("ironhold", &data).unwrap();
    let black = PersonalityPriors::from_personality("blackhammer", &data).unwrap();

    let mut pod = AbstractRolloutState::zeroed();
    {
        let root = &mut pod.players[0];
        root.gold = 500;
        root.pop_total = 8;
        root.city_count = 2;
        root.force_rel = [0, 30, 20, 10];
        root.relations = [0, -1, 0, 0];
        root.rng_state = 0xDEAD_BEEF_CAFE_F00D;
    }

    // Same POD, same seed, same horizon: the priors are the only variable.
    let walk_with = |clan: PersonalityPriors| {
        let state = GameRolloutState::from_abstract(pod, [clan; MAX_PLAYERS]);
        let mut rng = XorShift64::new(7);
        walk(&state, &mut rng, 30, DEFAULT_ROLLOUT_TEMPERATURE, 0)
    };
    let iron_score = walk_with(iron);
    let black_score = walk_with(black);

    let gap = (iron_score - black_score).abs();
    assert!(
        gap > 1e-4,
        "Ironhold and Blackhammer MUST produce measurably different walk scores \
         at 8-player scale (got iron={iron_score} black={black_score}). \
         If scores converge, the priors aren't flowing into the walker and the \
         'skillful clan personality' claim is broken at scale."
    );
}
// ── Guard: 5-clan pool as exported in ai_personalities.json ───────────
/// Guard: `ai_personalities.json` must hold exactly the five known clans at
/// its top level. Adding a sixth without updating the matchup-grid harness
/// (`tools/matchup-grid.sh`) trips this test.
#[test]
fn ai_personalities_json_still_exports_exactly_five_clans() {
    let expected = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"];

    let json_path = data_dir().join("ai_personalities.json");
    let text = match std::fs::read_to_string(&json_path) {
        Ok(text) => text,
        Err(e) => panic!("failed to read {json_path:?}: {e}"),
    };
    let map: HashMap<String, serde_json::Value> = match serde_json::from_str(&text) {
        Ok(map) => map,
        Err(e) => panic!("{json_path:?} is not valid JSON: {e}"),
    };

    let clan_count = map.len();
    assert_eq!(
        clan_count,
        5,
        "expected exactly 5 clans in ai_personalities.json, found {}: {:?}",
        clan_count,
        map.keys().collect::<Vec<_>>()
    );
    expected.iter().for_each(|id| {
        assert!(
            map.contains_key(*id),
            "ai_personalities.json missing expected clan {id}"
        );
    });
}

144
tools/ci-autoplay-smoke.sh Executable file
View file

@ -0,0 +1,144 @@
#!/usr/bin/env bash
# ci-autoplay-smoke.sh — Hang-regression smoke test for the autoplay pipeline.
#
# Runs one seeded T100 autoplay with a hard wall-clock budget and asserts the
# final `turn_stats.jsonl` entry has `outcome != "in_progress"`. Catches any
# class of hang — whether the root cause is in Godot (signal re-entry, main-
# loop stall), in Rust (MCTS deadlock, combat infinite loop), or in the
# shell harness (pkill substring collision, missing SAFETY timeout).
#
# Regression history:
# 2026-04-17 loop13 — PARALLEL=10 T300 hung all 10 seeds because
# `run_ap3.sh`'s cleanup `pkill -f "AUTO_PLAY_DIR=<path>"` substring-matched
# active sibling seeds whose paths shared a numeric prefix (seed1 → seed10).
# Fixed by switching to a unique per-run AP_RUN_ID token. This smoke test
# would have caught the hang immediately in `./run verify` because the
# victim game's `outcome` stays "in_progress" after SIGTERM.
#
# Usage:
# tools/ci-autoplay-smoke.sh # default seed=1, T100, 180s budget
# tools/ci-autoplay-smoke.sh <seed> <turns> # custom seed/turns
#
# Environment:
# AUTOPLAY_HOST — if set, run via SSH on that host (e.g. apricot)
# PROJECT_ROOT_REMOTE — repo path on RUN host (default: $HOME/Code/…)
# SMOKE_WALL_BUDGET_SEC — hard wall-clock budget (default: 180)
# SMOKE_KEEP_OUTPUT — "1" to keep .local/ci-smoke/ results dir after test
#
# Exit codes:
# 0 — game finished with a terminal outcome (victory | max_turns | defeat)
# 1 — game hung (outcome still "in_progress") OR no turn_stats produced
# 2 — bad arguments / SSH / environment failure
#
# Hook into ./run verify per p0-10 hang-regression mandate.
# Unset variables and pipeline failures are errors, but `-e` is deliberately
# left off: exit statuses are inspected by hand further down.
set -uo pipefail

# Positional arguments and budget, each with a default.
SEED="${1:-1}"
TURNS="${2:-100}"
BUDGET="${SMOKE_WALL_BUDGET_SEC:-180}"

# The project root is the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Per-run results directory, named by timestamp and seed.
STAMP="$(date '+%Y%m%d_%H%M%S')"
RESULTS_DIR="$PROJECT_DIR/.local/ci-smoke/smoke_${STAMP}_seed${SEED}"
mkdir -p "$RESULTS_DIR"

printf '[ci-autoplay-smoke] seed=%s turns=%s budget=%ss\n' "$SEED" "$TURNS" "$BUDGET"
printf '[ci-autoplay-smoke] results: %s\n' "$RESULTS_DIR"
# EXIT trap: remove the local results directory after a successful run.
#
# The directory is kept when SMOKE_KEEP_OUTPUT=1, and also whenever the script
# is exiting non-zero: failure messages point at files inside it (game.log,
# turn_stats.jsonl), so deleting it would destroy the evidence they refer to.
_cleanup() {
    local status=$?   # exit status the script is terminating with
    if [ "${SMOKE_KEEP_OUTPUT:-0}" = "1" ]; then
        return
    fi
    if [ "$status" -ne 0 ]; then
        echo "[ci-autoplay-smoke] keeping $RESULTS_DIR for post-mortem (exit $status)" >&2
        return
    fi
    rm -rf "$RESULTS_DIR" 2>/dev/null || true
}
trap _cleanup EXIT
# Print a FAIL line built from all arguments to stderr, then exit 1.
_fail() {
    printf '[ci-autoplay-smoke] FAIL: %s\n' "$*" >&2
    exit 1
}
# ── Run autoplay ─────────────────────────────────────────────────────────────
if [ -n "${AUTOPLAY_HOST:-}" ]; then
    # Remote path — use the same runner autoplay-batch.sh uses.
    #
    # The defaults below carry a literal `$HOME` that must be expanded by the
    # REMOTE shell. They are therefore sent inside escaped double quotes:
    # single quotes would suppress the expansion, and the remote side would
    # then use a directory literally named `$HOME`.
    REMOTE_ROOT="${PROJECT_ROOT_REMOTE:-\$HOME/Code/@projects/@magic-civilization}"
    REMOTE_DIR="${REMOTE_ROOT}/.local/ci-smoke/smoke_${STAMP}_seed${SEED}"
    REMOTE_RUNNER="${REMOTE_RUNNER:-\$HOME/bin/run_ap3.sh}"
    RUN_ID="ci_smoke_${STAMP}_seed${SEED}"
    ssh "$AUTOPLAY_HOST" "
        set -uo pipefail
        mkdir -p \"$REMOTE_DIR\"
        AUTO_PLAY=true \
        AUTO_PLAY_SEED='$SEED' \
        AUTO_PLAY_TURN_LIMIT='$TURNS' \
        AUTO_PLAY_DIR=\"$REMOTE_DIR\" \
        AP_RUN_ID='$RUN_ID' \
        timeout '$BUDGET' bash \"$REMOTE_RUNNER\"
    " >"$RESULTS_DIR/game.log" 2>&1
    REMOTE_EXIT=$?

    # ssh itself reports 255 on a connection/transport error. That is an
    # environment failure (documented exit code 2), not a game hang.
    if [ "$REMOTE_EXIT" -eq 255 ]; then
        echo "[ci-autoplay-smoke] ERROR: ssh to $AUTOPLAY_HOST failed; last log lines:" >&2
        tail -n 20 "$RESULTS_DIR/game.log" >&2 2>/dev/null || true
        exit 2
    fi

    # Pull turn_stats + meta back. The remote auto_play writes either into
    # the AUTO_PLAY_DIR directly (if the caller named it `game_<stamp>_seed<N>`)
    # or into a `game_*` subdir. ssh-cat handles both shapes — globbing via
    # scp's non-quoted path ran into login-shell variations.
    ssh "$AUTOPLAY_HOST" "find \"$REMOTE_DIR\" -maxdepth 3 -name turn_stats.jsonl -print0 | xargs -0 -I{} cat {}" \
        >"$RESULTS_DIR/turn_stats.jsonl" 2>/dev/null || true
    ssh "$AUTOPLAY_HOST" "find \"$REMOTE_DIR\" -maxdepth 3 -name meta.json -print0 | xargs -0 -I{} cat {}" \
        >"$RESULTS_DIR/meta.json" 2>/dev/null || true

    # `timeout` exits 124 when the wall-clock budget expired.
    if [ "$REMOTE_EXIT" -eq 124 ]; then
        _fail "autoplay timed out after ${BUDGET}s — hang regression detected (SSH timeout path)"
    fi
else
    # Local path — flatpak Godot, Linux only.
    if ! command -v flatpak >/dev/null 2>&1; then
        echo "[ci-autoplay-smoke] SKIP: no flatpak locally and AUTOPLAY_HOST unset"
        exit 0
    fi
    cd "$PROJECT_DIR/src/game"
    timeout "$BUDGET" flatpak run --user \
        --filesystem=home \
        --env=AUTO_PLAY=true \
        --env=AUTO_PLAY_SEED="$SEED" \
        --env=AUTO_PLAY_TURN_LIMIT="$TURNS" \
        --env=AUTO_PLAY_DIR="$RESULTS_DIR" \
        --env=AP_RUN_ID="ci_smoke_${STAMP}_seed${SEED}" \
        org.godotengine.Godot --path . --rendering-method gl_compatibility --headless \
        >"$RESULTS_DIR/game.log" 2>&1
    LOCAL_EXIT=$?
    # `timeout` exits 124 when the wall-clock budget expired.
    if [ "$LOCAL_EXIT" -eq 124 ]; then
        _fail "autoplay timed out after ${BUDGET}s — hang regression detected"
    fi
fi
# ── Assert terminal outcome ──────────────────────────────────────────────────
# Locate the first turn_stats.jsonl under the results dir; it must exist and
# be non-empty, otherwise the game never logged a turn.
STATS_FILE="$(find "$RESULTS_DIR" -name 'turn_stats.jsonl' -type f 2>/dev/null | head -1)"
if ! { [ -n "$STATS_FILE" ] && [ -s "$STATS_FILE" ]; }; then
    _fail "no turn_stats.jsonl produced (autoplay never wrote a turn line)"
fi

# Read the `outcome` field from the last JSON line. The helper prints
# 'missing' when the key is absent and 'parse_error' when the line cannot be
# handled as JSON.
LAST_OUTCOME="$(tail -n 1 "$STATS_FILE" | python3 -c "
import json, sys
try:
    d = json.loads(sys.stdin.read())
    print(d.get('outcome', 'missing'))
except Exception as e:
    print('parse_error')
")"

# Terminal outcomes pass; "in_progress" means the run hung; anything else is
# reported as unexpected.
if [ "$LAST_OUTCOME" = "victory" ] || [ "$LAST_OUTCOME" = "max_turns" ] || [ "$LAST_OUTCOME" = "defeat" ]; then
    echo "[ci-autoplay-smoke] PASS — outcome=$LAST_OUTCOME"
    exit 0
elif [ "$LAST_OUTCOME" = "in_progress" ]; then
    _fail "outcome=in_progress — game hung mid-run (see $STATS_FILE)"
else
    _fail "outcome=$LAST_OUTCOME — unexpected terminal state (see $STATS_FILE)"
fi