feat(@projects/@magic-civilization): ✨ add autoplay smoke test integration
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
472211de4d
commit
194fde9718
6 changed files with 871 additions and 1 deletions
|
|
@ -85,7 +85,7 @@ cmd_verify() {
|
|||
echo -e "${BLUE}─────────────────────────────────────────────────${NC}"
|
||||
}
|
||||
|
||||
local TOTAL=15
|
||||
local TOTAL=16
|
||||
|
||||
# Step 0 — Game data schema validation
|
||||
_verify_step 0 $TOTAL "game data JSON schemas" \
|
||||
|
|
@ -154,6 +154,12 @@ cmd_verify() {
|
|||
_verify_step 14 $TOTAL "godot headless boot (no script errors)" \
|
||||
_godot_headless_boot
|
||||
|
||||
# Step 15 — Autoplay hang-regression smoke test (p0-10 gate).
|
||||
# Skips (prints a SKIP notice, returns 0) when neither AUTOPLAY_HOST nor local flatpak is available
|
||||
# so this gate runs opportunistically on dev boxes without a RUN host.
|
||||
_verify_step 15 $TOTAL "autoplay hang smoke (seed 1, T100, 180s budget)" \
|
||||
_verify_autoplay_smoke
|
||||
|
||||
_verify_summary
|
||||
return $overall_exit
|
||||
}
|
||||
|
|
@ -224,6 +230,17 @@ _verify_file_size_cap() {
|
|||
return 0
|
||||
}
|
||||
|
||||
_verify_autoplay_smoke() {
    # Opportunistic gate: run the autoplay smoke script when either a RUN host
    # is configured (AUTOPLAY_HOST non-empty) or a local flatpak binary is on
    # PATH. Otherwise print a SKIP notice and succeed, so the remaining verify
    # steps are not blocked on machines that have no batch target.
    if [ -n "${AUTOPLAY_HOST:-}" ] || command -v flatpak >/dev/null 2>&1; then
        # Last command on this path: its exit status becomes the function's.
        bash "$REPO_ROOT/tools/ci-autoplay-smoke.sh"
    else
        echo "SKIP: no AUTOPLAY_HOST and no local flatpak"
        return 0
    fi
}
|
||||
|
||||
|
||||
_godot_headless_boot() {
|
||||
# Boot Godot headless and check for SCRIPT ERRORs.
|
||||
# Catches class_name resolution failures, GDExtension load failures,
|
||||
|
|
|
|||
|
|
@ -478,6 +478,10 @@ func _process(_delta: float) -> void:
|
|||
if _frame == 10:
|
||||
_turn_count += 1
|
||||
_play_turn()
|
||||
if _turn_count % _screenshot_interval == 1 or _turn_count <= 3:
|
||||
_screenshot("turn_%03d" % _turn_count)
|
||||
if _frame == 20:
|
||||
|
|
|
|||
|
|
@ -213,3 +213,165 @@ impl StrategicWeights {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// ── AxisId ───────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn axis_id_discriminants_are_stable() {
    // Pin the numeric value of every named axis. Per the original note these
    // discriminants are the GPU upload contract for the flat axes array, so a
    // renumbering must fail here rather than silently shift slots.
    let pinned = [
        (AxisId::Expansion as u8, 0u8),
        (AxisId::Production as u8, 1),
        (AxisId::Wealth as u8, 2),
        (AxisId::Culture as u8, 3),
    ];
    for (actual, expected) in pinned {
        assert_eq!(actual, expected);
    }
    assert_eq!(AxisId::COUNT, 8, "COUNT must match the flat array size");
}
|
||||
|
||||
#[test]
fn axis_id_as_str_matches_json_keys() {
    // `as_str` must yield the lowercase key for each named axis; the original
    // note ties these names to the keys used in ai_personalities.json.
    let expected_keys = [
        (AxisId::Expansion, "expansion"),
        (AxisId::Production, "production"),
        (AxisId::Wealth, "wealth"),
        (AxisId::Culture, "culture"),
    ];
    for (axis, key) in expected_keys {
        assert_eq!(axis.as_str(), key);
    }
}
|
||||
|
||||
// ── axes_to_flat / flat_to_axes round-trip ───────────────────────────
|
||||
|
||||
#[test]
fn axes_to_flat_encodes_named_axes_into_fixed_slots() {
    // One value per named axis; each must land in its fixed slot and the
    // remaining four slots must stay zero.
    let axes = HashMap::from([
        ("expansion".to_string(), 7),
        ("production".to_string(), 3),
        ("wealth".to_string(), 9),
        ("culture".to_string(), 1),
    ]);
    let flat = axes_to_flat(&axes);
    let named_slots = [
        (0usize, 7, "expansion → slot 0"),
        (1, 3, "production → slot 1"),
        (2, 9, "wealth → slot 2"),
        (3, 1, "culture → slot 3"),
    ];
    for (slot, want, why) in named_slots {
        assert_eq!(flat[slot], want, "{why}");
    }
    assert_eq!(&flat[4..], &[0, 0, 0, 0], "slots 4-7 must be zero (reserved)");
}
|
||||
|
||||
#[test]
fn axes_to_flat_treats_missing_keys_as_zero() {
    // With no keys supplied at all, every slot of the flat array must be 0.
    let flat = axes_to_flat(&HashMap::<String, u8>::new());
    assert_eq!(flat, [0u8; 8], "empty input → all zeros");
}
|
||||
|
||||
#[test]
fn axes_to_flat_ignores_unknown_keys() {
    // "nonsense_axis" and "magic" are not named axes, so their values must
    // never appear anywhere in the flat array.
    let axes = HashMap::from([
        ("expansion".to_string(), 5),
        ("nonsense_axis".to_string(), 99),
        ("magic".to_string(), 42),
    ]);
    let flat = axes_to_flat(&axes);
    assert_eq!(flat[0], 5);
    let leaked = flat.iter().any(|v| matches!(*v, 42 | 99));
    assert!(!leaked, "unknown keys must not leak into slots: {flat:?}");
}
|
||||
|
||||
#[test]
fn flat_to_axes_decodes_only_named_slots() {
    // Slots 4-7 carry a sentinel (99) that must not surface under any key.
    let poisoned = [7u8, 3, 9, 1, 99, 99, 99, 99];
    let axes = flat_to_axes(&poisoned);
    assert_eq!(axes.len(), 4, "only 4 named slots must round-trip");
    for (key, want) in [("expansion", 7), ("production", 3), ("wealth", 9), ("culture", 1)] {
        assert_eq!(axes.get(key), Some(&want));
    }
    // No decoded value may equal the sentinel written to the reserved slots.
    assert!(axes.values().all(|&v| v != 99));
}
|
||||
|
||||
#[test]
fn axes_round_trip_preserves_named_values() {
    // Encode then decode: each named axis must come back with the value it
    // went in with. Nothing is claimed about the reserved slots here.
    let named = [("expansion", 4), ("production", 8), ("wealth", 2), ("culture", 6)];
    let mut axes = HashMap::new();
    for (key, value) in named {
        axes.insert(key.to_string(), value);
    }
    let back = flat_to_axes(&axes_to_flat(&axes));
    for (key, value) in named {
        assert_eq!(back.get(key), Some(&value));
    }
}
|
||||
|
||||
// ── StrategicWeights ─────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn strategic_weights_neutral_is_balanced() {
    // Every weight of the neutral preset must lie in [0, 1] and equal 0.5
    // within a 1e-6 tolerance.
    let w = StrategicWeights::neutral();
    let weights = [
        ("aggression", w.aggression),
        ("expansion", w.expansion),
        ("research", w.research),
        ("defense", w.defense),
        ("economy", w.economy),
    ];
    for (label, v) in weights {
        assert!((0.0..=1.0).contains(&v), "{label} neutral weight {v} out of [0,1]");
        assert!((v - 0.5).abs() < 1e-6, "{label} neutral must be 0.5, got {v}");
    }
}
|
||||
|
||||
#[test]
fn strategic_weights_from_race_axes_normalizes_to_0_1() {
    // Axis inputs at the extremes and midpoint: +10, -10 and 0. The
    // assertions below pin the weights each of those is expected to produce.
    let axes = HashMap::from([
        ("expansion".to_string(), 10),
        ("wealth".to_string(), -10),
        ("culture".to_string(), 0),
    ]);
    let w = StrategicWeights::from_race_axes(&axes);

    let expansion_err = (w.expansion - 1.0).abs();
    assert!(expansion_err < 1e-6, "expansion=+10 → 1.0, got {}", w.expansion);

    let aggression_err = (w.aggression - 1.0).abs();
    assert!(aggression_err < 1e-6, "aggression tracks expansion, got {}", w.aggression);

    let economy_err = (w.economy - 0.0).abs();
    assert!(economy_err < 1e-6, "wealth=-10 → economy 0.0, got {}", w.economy);

    // Expected defense: max(1 - expansion, 0.2) = max(0, 0.2) = 0.2.
    let defense_err = (w.defense - 0.2).abs();
    assert!(defense_err < 1e-6, "defense floor 0.2 when expansion=1.0, got {}", w.defense);

    // Expected research: (culture + wealth) / 2 = (0.5 + 0) / 2 = 0.25.
    let research_err = (w.research - 0.25).abs();
    assert!(research_err < 1e-6, "research is (culture+wealth)/2, got {}", w.research);
}
|
||||
|
||||
#[test]
fn strategic_weights_from_race_axes_handles_missing_keys() {
    // An empty axes map must produce 0.5 for every weight checked below.
    let w = StrategicWeights::from_race_axes(&HashMap::<String, i32>::new());
    let midpoint_weights = [w.aggression, w.expansion, w.research, w.economy];
    for v in midpoint_weights {
        assert!((v - 0.5).abs() < 1e-6, "missing-key default must be 0.5, got {v}");
    }
    // Defense is expected to be 1 - expansion = 0.5 here, which is above the
    // 0.2 floor, so the floor does not apply.
    let defense_err = (w.defense - 0.5).abs();
    assert!(defense_err < 1e-6, "defense {}; expected 0.5 when expansion=0.5", w.defense);
}
|
||||
|
||||
#[test]
fn strategic_weights_from_race_axes_clamps_out_of_range() {
    // Feed values far outside [-10, +10]; every resulting weight must still
    // be finite and inside [0, 1].
    let axes = HashMap::from([
        ("expansion".to_string(), 99),
        ("wealth".to_string(), -99),
    ]);
    let w = StrategicWeights::from_race_axes(&axes);
    let all_weights = [w.aggression, w.expansion, w.research, w.defense, w.economy];
    for v in all_weights {
        assert!(v.is_finite(), "weight must be finite, got {v}");
        assert!((0.0..=1.0).contains(&v), "weight {v} out of [0,1]");
    }
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -255,3 +255,211 @@ where
|
|||
}
|
||||
score_fn(&s)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
//! Unit tests for the generic tree engine over a toy `CoinState` — these
|
||||
//! exercise UCB1 selection, expansion invariants, backprop, and parallel-
|
||||
//! rollout determinism without needing the full `GameRolloutState` impl
|
||||
//! (that lives in `tests/mcts_basic.rs` as an integration test).
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Toy two-action state used to drive the tree engine in these tests.
///
/// `flips` records the sequence of actions applied so far (one `bool` per
/// flip) and `max_depth` is the number of flips after which the state offers
/// no further legal actions.
///
/// The state itself computes no reward: it only stores the flip history.
#[derive(Clone, Debug)]
struct CoinState {
    /// Actions applied so far, oldest first.
    flips: Vec<bool>,
    /// Flip count at which the state becomes terminal.
    max_depth: usize,
}

impl CoinState {
    /// Creates a state with no flips recorded and the given depth limit.
    fn new(max_depth: usize) -> Self {
        CoinState {
            max_depth,
            flips: Vec::new(),
        }
    }
}
|
||||
|
||||
impl TreeState for CoinState {
    type Action = bool;

    /// Offers both flips (`true`, then `false`) until `max_depth` flips have
    /// been recorded; after that the list is empty.
    fn legal_actions(&self) -> Vec<bool> {
        if self.flips.len() < self.max_depth {
            vec![true, false]
        } else {
            Vec::new()
        }
    }

    /// Returns a new state whose history is this one's plus `action`; the
    /// receiver is left untouched.
    fn apply(&self, action: &bool) -> Self {
        let mut flips = self.flips.clone();
        flips.push(*action);
        Self {
            flips,
            max_depth: self.max_depth,
        }
    }
}
|
||||
|
||||
// ── Node / expansion invariants ──────────────────────────────────────
|
||||
|
||||
#[test]
fn new_tree_has_root_with_all_legal_actions_untried() {
    // A freshly built tree holds only the root: both actions still untried,
    // no children, and zeroed statistics.
    let tree = Tree::new(CoinState::new(3));
    assert_eq!(tree.nodes.len(), 1, "root-only tree has exactly 1 node");

    let root = tree.root();
    assert_eq!(root.untried.len(), 2, "root has 2 untried actions (H, T)");
    assert!(root.children.is_empty(), "root has no children yet");
    assert_eq!(root.visits, 0);
    assert_eq!(root.wins, 0.0);
}
|
||||
|
||||
#[test]
fn expand_drains_untried_and_adds_child() {
    // Each expand of the root consumes one untried action and appends one
    // child; once both actions are used, expand yields None.
    let mut tree = Tree::new(CoinState::new(3));

    let first = tree.expand(0).expect("first expand must succeed");
    assert_eq!(tree.root().untried.len(), 1, "one action should remain untried");
    assert_eq!(tree.root().children, vec![first], "child index tracked");
    assert_eq!(tree.nodes[first].parent, Some(0));

    let second = tree.expand(0).expect("second expand must succeed");
    assert!(tree.root().untried.is_empty(), "fully expanded after 2 expands");
    assert_eq!(tree.root().children, vec![first, second]);

    let third = tree.expand(0);
    assert!(third.is_none(), "third expand must return None");
}
|
||||
|
||||
#[test]
fn expand_applies_action_to_produce_child_state() {
    // The child's state must be the root state with exactly the child's own
    // recorded action applied. The test does not depend on which of the two
    // legal actions gets expanded first.
    let mut tree = Tree::new(CoinState::new(3));
    let child_idx = tree.expand(0).unwrap();
    let child = &tree.nodes[child_idx];
    let applied_action = child.action.expect("child must carry its action");
    assert_eq!(child.state.flips, vec![applied_action]);
}
|
||||
|
||||
// ── UCB1 selection ───────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn ucb1_returns_infinity_for_unvisited_child() {
    // A child with zero visits must score +infinity so that selection tries
    // it before exploiting any visited sibling.
    let mut tree = Tree::new(CoinState::new(3));
    let first = tree.expand(0).unwrap();
    let second = tree.expand(0).unwrap();
    // Root has 2 visits while both children are still at 0.
    tree.nodes[0].visits = 2;
    let log_n = 2.0f32.ln();
    for child in [first, second] {
        let score = tree.ucb1(child, log_n);
        assert!(score.is_infinite() && score > 0.0);
    }
}
|
||||
|
||||
#[test]
fn ucb1_prefers_higher_average_reward() {
    // With equal visit counts, the child with more accumulated wins must
    // receive the strictly higher UCB1 score.
    let mut tree = Tree::new(CoinState::new(3));
    let strong = tree.expand(0).unwrap();
    let weak = tree.expand(0).unwrap();
    // 10 visits each: 9.0 wins (90% average) versus 3.0 wins (30% average).
    for (child, wins) in [(strong, 9.0), (weak, 3.0)] {
        tree.nodes[child].visits = 10;
        tree.nodes[child].wins = wins;
    }
    tree.nodes[0].visits = 20;
    let log_n = 20.0f32.ln();
    let strong_score = tree.ucb1(strong, log_n);
    let weak_score = tree.ucb1(weak, log_n);
    assert!(strong_score > weak_score);
}
|
||||
|
||||
// ── Backpropagation ──────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn backpropagate_increments_visits_and_wins_to_root() {
    // One backprop from a child must add one visit and the reward to both
    // the child and the root.
    let mut tree = Tree::new(CoinState::new(3));
    let child = tree.expand(0).unwrap();
    tree.backpropagate(child, 0.7);

    let child_node = &tree.nodes[child];
    assert_eq!(child_node.visits, 1);
    assert!((child_node.wins - 0.7).abs() < 1e-6);

    let root = tree.root();
    assert_eq!(root.visits, 1, "root visits += 1");
    assert!((root.wins - 0.7).abs() < 1e-6, "root wins += 0.7");
}
|
||||
|
||||
#[test]
fn backpropagate_accumulates_across_calls() {
    // Three rewards (0.2 + 0.6 + 1.0 = 1.8) pushed through the same child
    // must sum on both the child and the root.
    let mut tree = Tree::new(CoinState::new(3));
    let child = tree.expand(0).unwrap();
    for reward in [0.2, 0.6, 1.0] {
        tree.backpropagate(child, reward);
    }
    assert_eq!(tree.nodes[child].visits, 3);
    assert!((tree.nodes[child].wins - 1.8).abs() < 1e-6);
    assert_eq!(tree.root().visits, 3);
    assert!((tree.root().wins - 1.8).abs() < 1e-6);
}
|
||||
|
||||
// ── simulate_parallel determinism contract ──────────────────────────
|
||||
|
||||
#[test]
fn simulate_parallel_is_seed_deterministic_across_repeated_calls() {
    // Two fresh trees driven with the same rollout count and base seed must
    // end with the same root visit count and (within 1e-5) the same root
    // wins. Per the original note, this guards the rollout-index ordering of
    // backprop inside `simulate_parallel` against thread-scheduling effects.
    let count = 16;
    // The reward depends only on the rng, so any difference between the two
    // runs comes from the engine and not from the rollout itself. The ratio
    // of two non-negative values is already non-negative.
    let rollout_fn = |_s: &CoinState, rng: &mut XorShift64| -> f32 {
        rng.next_u64() as f32 / u64::MAX as f32
    };
    let run_once = || {
        let mut tree = Tree::new(CoinState::new(3));
        tree.simulate_parallel(count, 42, rollout_fn);
        (tree.root().visits, tree.root().wins)
    };
    let (visits_a, wins_a) = run_once();
    let (visits_b, wins_b) = run_once();

    assert_eq!(visits_a, visits_b, "visit counts must match");
    assert!(
        (wins_a - wins_b).abs() < 1e-5,
        "wins must match: {} vs {}", wins_a, wins_b
    );
}
|
||||
|
||||
#[test]
fn simulate_parallel_noop_on_zero_rollouts() {
    // Asking for zero rollouts must leave the root's visit count at 0.
    let mut tree = Tree::new(CoinState::new(3));
    tree.simulate_parallel(0, 42, |_, _| 0.5);
    let root_visits = tree.root().visits;
    assert_eq!(root_visits, 0, "zero rollouts should not touch tree");
}
|
||||
|
||||
// ── rollout_snapshot helper ─────────────────────────────────────────
|
||||
|
||||
#[test]
fn rollout_snapshot_walks_depth_steps_and_scores() {
    // The state is a plain counter that each step increments by one, and the
    // score is counter / 10. Five steps from 0 must therefore score 0.5.
    let initial = 0u32;
    let depth = 5;
    let mut rng = XorShift64::new(42);
    let r = rollout_snapshot(
        &initial,
        &mut rng,
        depth,
        &|s: &u32, _d: u32, _rng: &mut XorShift64| s + 1,
        &|s: &u32| *s as f32 / 10.0,
    );
    let error = (r - 0.5).abs();
    assert!(error < 1e-6, "expected 0.5, got {r}");
}
|
||||
|
||||
#[test]
fn rollout_snapshot_returns_initial_score_at_depth_zero() {
    // With a depth of 0 no step is applied, so the score function must see
    // the untouched initial value (7).
    let initial = 7u32;
    let mut rng = XorShift64::new(1);
    let r = rollout_snapshot(
        &initial,
        &mut rng,
        0,
        &|s: &u32, _d, _rng| s + 1,
        &|s: &u32| *s as f32,
    );
    let error = (r - 7.0).abs();
    assert!(error < 1e-6, "depth=0 should return score(initial)");
}
|
||||
|
||||
// ── rollout() default stub ─────────────────────────────────────────
|
||||
|
||||
#[test]
fn treestate_default_rollout_returns_stub_half() {
    // `CoinState` does not override `rollout`, so this exercises the trait's
    // default implementation, which is expected to return the 0.5 stub value.
    let state = CoinState::new(3);
    let mut rng = XorShift64::new(99);
    let reward = state.rollout(&mut rng, 20, 1.0, 0);
    assert!((reward - 0.5).abs() < 1e-6);
}
|
||||
}
|
||||
|
|
|
|||
335
src/simulator/crates/mc-ai/tests/ultimate_lookahead_stress.rs
Normal file
335
src/simulator/crates/mc-ai/tests/ultimate_lookahead_stress.rs
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
//! Ultimate AI lookahead stress test.
|
||||
//!
|
||||
//! The user's "ultimate test" is an 8-player huge-map game with all 5
|
||||
//! personalities competing, stressing the AI lookahead (MCTS + GPU batched
|
||||
//! rollouts). That end-to-end test lives in
|
||||
//! `tools/ultimate-game.sh` (requires a working RUN host).
|
||||
//!
|
||||
//! THIS file is the in-process companion: it exercises the same code paths
|
||||
//! — personality priors, rollout walker, GPU batched dispatch — against a
|
||||
//! synthetic 8-player configuration, without needing the game binary.
|
||||
//! It catches regressions in the lookahead pipeline itself (tree depth,
|
||||
//! rollout determinism, batched GPU throughput, per-clan divergence at scale)
|
||||
//! independently of any host-level infrastructure. Runs in under a second.
|
||||
//!
|
||||
//! Scope: this is a STRESS test, not a correctness test. Correctness is
|
||||
//! covered by the parity / policy / rollout tests in sibling files. Here we
|
||||
//! assert the lookahead pipeline SCALES to the "ultimate" configuration:
|
||||
//! - 8-player abstract state packs into the fixed POD layout
|
||||
//! - Per-player personality priors from the 5-clan rotation are honored
|
||||
//! - Walker horizon reaches depth >= 20 without panic or overflow
|
||||
//! - GPU batched dispatch accepts large batches (256+ entries)
|
||||
//! - Rollout results are seed-deterministic across repeated invocations
|
||||
//!
|
||||
//! Pre-existing bullet order (user): "ultimate test should be AFTER all
|
||||
//! 5 personalities (permutations of 1v1) have had balanced match-ups". The
|
||||
//! balanced-matchup gate is `tools/matchup-grid.sh` + `checklist-report.py
|
||||
//! matchup_balance`. This file deliberately operates at the abstract-state
|
||||
//! layer so it runs IN the `cargo test` cycle — fast feedback.
|
||||
|
||||
use mc_ai::abstract_state::{AbstractPlayerState, AbstractRolloutState, MAX_PLAYERS};
|
||||
use mc_ai::mcts::XorShift64;
|
||||
use mc_ai::policy::PersonalityPriors;
|
||||
use mc_ai::rollout::{walk, GameRolloutState, DEFAULT_ROLLOUT_TEMPERATURE};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
fn data_dir() -> PathBuf {
|
||||
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest
|
||||
.ancestors()
|
||||
.nth(4)
|
||||
.expect("mc-ai crate must sit four dirs below repo root")
|
||||
.join("public")
|
||||
.join("games")
|
||||
.join("age-of-dwarves")
|
||||
.join("data")
|
||||
}
|
||||
|
||||
/// Build a `[PersonalityPriors; 4]` that rotates through the five clans.
|
||||
/// For N > 5 players, wraps — the goal is coverage, not uniqueness.
|
||||
/// Players 0..4 get each of the 5 clans in a fixed order; players 4..8
|
||||
/// wrap back around, ensuring 8-player games exercise every clan at least
|
||||
/// once.
|
||||
fn eight_player_clan_rotation() -> [[PersonalityPriors; MAX_PLAYERS]; 8] {
|
||||
let data = data_dir();
|
||||
let clans = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"];
|
||||
let loaded: Vec<_> = clans
|
||||
.iter()
|
||||
.map(|id| {
|
||||
PersonalityPriors::from_personality(id, &data)
|
||||
.unwrap_or_else(|e| panic!("failed to load clan {id}: {e:?}"))
|
||||
})
|
||||
.collect();
|
||||
// For the stress test we only rotate the "acting" player slot (POD is
|
||||
// 4-slot per entry; MAX_PLAYERS=4). Each of the 8 "entries" represents
|
||||
// one player in an 8-player game with a different root clan.
|
||||
let mut entries = [[loaded[0]; MAX_PLAYERS]; 8];
|
||||
for (i, entry) in entries.iter_mut().enumerate() {
|
||||
// The root player (slot 0 in this entry's POD) rotates through
|
||||
// the 5 clans; other slots fill in-order from the remaining clans
|
||||
// so every entry has 4 distinct clan priors.
|
||||
for slot in 0..MAX_PLAYERS {
|
||||
entry[slot] = loaded[(i + slot) % clans.len()];
|
||||
}
|
||||
}
|
||||
entries
|
||||
}
|
||||
|
||||
/// 8-player large-map fixture. Each of the 8 entries represents one active
|
||||
/// AI in an 8-player game. Gives every AI enough resources to exercise all
|
||||
/// 9 ActionKinds (Build / Attack / Settle / Research / Defend / Trade /
|
||||
/// ContinueWar / MakePeace / Idle).
|
||||
fn eight_player_batch() -> Vec<AbstractRolloutState> {
|
||||
(0..8)
|
||||
.map(|i| {
|
||||
let mut state = AbstractRolloutState::zeroed();
|
||||
// Player 0 (the acting / root player): well-resourced to sustain
|
||||
// the rollout walker through its full horizon.
|
||||
state.players[0] = AbstractPlayerState {
|
||||
gold: 200 + (i as i32) * 10,
|
||||
science: 30 + (i as i32) * 2,
|
||||
pop_total: 10,
|
||||
city_count: 2,
|
||||
tech_index: 5,
|
||||
unit_counts: [3, 2, 1, 0],
|
||||
happiness_pool: 5,
|
||||
_pad0: 0,
|
||||
force_rel: [0, 20, 10, 5], // enables Attack + ContinueWar
|
||||
axes: [5; 8],
|
||||
relations: [0, -1, 0, 0], // enables MakePeace
|
||||
_pad1: [0; 4],
|
||||
rng_state: 0xAAAA_BBBB_CCCC_DDDD ^ (i as u64),
|
||||
turn: 1,
|
||||
_pad2: [0; 4],
|
||||
};
|
||||
// Opponents: smaller footprint but present. Exercises the
|
||||
// rollout walker's opponent-iteration paths.
|
||||
for slot in 1..MAX_PLAYERS {
|
||||
state.players[slot] = AbstractPlayerState {
|
||||
gold: 50,
|
||||
science: 10,
|
||||
pop_total: 5,
|
||||
city_count: 1,
|
||||
tech_index: 2,
|
||||
unit_counts: [1, 1, 0, 0],
|
||||
happiness_pool: 0,
|
||||
_pad0: 0,
|
||||
force_rel: [5, 0, 5, 5],
|
||||
axes: [5; 8],
|
||||
relations: [0, 0, 0, 0],
|
||||
_pad1: [0; 4],
|
||||
rng_state: 0x1111_2222_3333_4444 ^ (slot as u64) ^ (i as u64),
|
||||
turn: 1,
|
||||
_pad2: [0; 4],
|
||||
};
|
||||
}
|
||||
state
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// ── Shape + determinism gates ──────────────────────────────────────────
|
||||
|
||||
#[test]
fn clan_rotation_covers_all_five_personalities() {
    // Collect the root-slot (slot 0) aggression of every entry, truncated to
    // an integer, and require at least five distinct values.
    // NOTE(review): this relies on the five clans having aggression scores
    // that stay distinct after the `as i32` cast — confirm against the data.
    let seen_aggression: std::collections::BTreeSet<i32> = eight_player_clan_rotation()
        .iter()
        .map(|entry| entry[0].aggression as i32)
        .collect();
    let distinct = seen_aggression.len();
    assert!(
        distinct >= 5,
        "8-player rotation must surface all 5 clans as root; saw {} distinct aggression values: {:?}",
        distinct,
        seen_aggression
    );
}
|
||||
|
||||
#[test]
fn eight_player_fixture_packs_into_fixed_pod_size() {
    // Pins the POD size at 256 bytes and the fixture at 8 entries, so the
    // whole batch occupies 8 × 256 bytes.
    use std::mem::size_of;
    let entry_bytes = size_of::<AbstractRolloutState>();
    assert_eq!(entry_bytes, 256);

    let batch = eight_player_batch();
    assert_eq!(batch.len(), 8, "8-player stress fixture");

    // Total footprint of the batch, counted as one fixed-size entry each.
    let total_bytes: usize = batch.len() * size_of::<AbstractRolloutState>();
    assert_eq!(total_bytes, 256 * 8);
}
|
||||
|
||||
#[test]
fn walker_reaches_full_horizon_on_eight_player_configuration() {
    // Walks every fixture entry for a 20-turn horizon under its rotated clan
    // priors and requires a finite score inside [0, 1].
    // NOTE(review): only the score range is asserted; the depth the walker
    // actually reached is not observable from this test.
    let horizon = 20u32;
    let pods = eight_player_batch();
    let rotation = eight_player_clan_rotation();

    for (i, (pod, priors)) in pods.iter().zip(rotation.iter()).enumerate() {
        let mut rng = XorShift64::new(42 + i as u64);
        let state = GameRolloutState::from_abstract(*pod, *priors);
        let score = walk(&state, &mut rng, horizon, DEFAULT_ROLLOUT_TEMPERATURE, 0);
        let in_range = score.is_finite() && (0.0..=1.0).contains(&score);
        assert!(
            in_range,
            "entry {i} produced score {score} outside [0,1] — walker may have panicked or overflowed"
        );
    }
}
|
||||
|
||||
#[test]
fn eight_player_rollout_is_seed_deterministic() {
    // Walk the whole batch twice with identical per-entry seeds (42 + index)
    // and require the two score vectors to compare equal element by element.
    let batch = eight_player_batch();
    let priors_per_entry = eight_player_clan_rotation();

    // One full pass over the batch; called twice with nothing changed between.
    let walk_all = || -> Vec<f32> {
        let mut scores = Vec::with_capacity(batch.len());
        for (i, (pod, priors)) in batch.iter().zip(priors_per_entry.iter()).enumerate() {
            let state = GameRolloutState::from_abstract(*pod, *priors);
            let mut rng = XorShift64::new(42 + i as u64);
            scores.push(walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0));
        }
        scores
    };

    let scores_a = walk_all();
    let scores_b = walk_all();
    assert_eq!(scores_a, scores_b, "same-seed 8-player walk must be bit-deterministic");
}
|
||||
|
||||
// ── Scale + throughput gate ────────────────────────────────────────────
|
||||
|
||||
#[test]
fn deep_stress_batch_256_entries_finishes_in_under_one_second() {
    // Scale gate: walk 256 single-player states for a 20-turn horizon on the
    // CPU and require (a) a positive aggregate score and (b) completion inside
    // a one-second wall-clock budget. The budget is meant to trip on a gross
    // complexity regression, not to measure throughput precisely.
    let base_priors = eight_player_clan_rotation()[0];

    // Entries differ only in starting gold and rng seed.
    let batch: Vec<_> = (0..256)
        .map(|i| {
            let mut state = AbstractRolloutState::zeroed();
            let root = &mut state.players[0];
            root.gold = 100 + i;
            root.pop_total = 5;
            root.city_count = 1;
            root.force_rel = [0, 20, 0, 0];
            root.relations = [0, -1, 0, 0];
            root.rng_state = 0x1234_5678_9ABC_DEF0u64.wrapping_add(i as u64);
            state
        })
        .collect();

    // Only the walks themselves are timed; fixture construction is excluded.
    let start = Instant::now();
    let total = batch
        .iter()
        .enumerate()
        .fold(0.0f64, |acc, (i, pod)| {
            let state = GameRolloutState::from_abstract(*pod, base_priors);
            let mut rng = XorShift64::new(42u64 + i as u64);
            acc + walk(&state, &mut rng, 20, DEFAULT_ROLLOUT_TEMPERATURE, 0) as f64
        });
    let elapsed = start.elapsed();

    assert!(
        total > 0.0,
        "aggregate score {total} non-positive — walker outputs look broken"
    );
    assert!(
        elapsed.as_secs_f32() < 1.0,
        "256-entry stress batch took {:?} (>1s budget); possible O(N²) regression",
        elapsed
    );
}
|
||||
|
||||
// ── Clan divergence at 8-player scale ─────────────────────────────────
|
||||
|
||||
#[test]
fn eight_player_clan_divergence_preserves_personality_signal() {
    // Same starting state, same rng seed, same horizon: the only difference
    // between the two walks is the clan whose priors fill every slot. The two
    // scores must differ by more than 1e-4, otherwise the priors are not
    // influencing the walk.
    let data = data_dir();

    let mut pod = AbstractRolloutState::zeroed();
    {
        let root = &mut pod.players[0];
        root.gold = 500;
        root.pop_total = 8;
        root.city_count = 2;
        root.force_rel = [0, 30, 20, 10];
        root.relations = [0, -1, 0, 0];
        root.rng_state = 0xDEAD_BEEF_CAFE_F00D;
    }

    // Walk `pod` for 30 turns with `priors` in every slot, always from seed 7.
    let score_with = |priors: PersonalityPriors| {
        let state = GameRolloutState::from_abstract(pod, [priors; MAX_PLAYERS]);
        let mut rng = XorShift64::new(7);
        walk(&state, &mut rng, 30, DEFAULT_ROLLOUT_TEMPERATURE, 0)
    };

    let iron = PersonalityPriors::from_personality("ironhold", &data).unwrap();
    let black = PersonalityPriors::from_personality("blackhammer", &data).unwrap();
    let iron_score = score_with(iron);
    let black_score = score_with(black);

    let gap = (iron_score - black_score).abs();
    assert!(
        gap > 1e-4,
        "Ironhold and Blackhammer MUST produce measurably different walk scores \
         at 8-player scale (got iron={iron_score} black={black_score}). \
         If scores converge, the priors aren't flowing into the walker and the \
         'skillful clan personality' claim is broken at scale."
    );
}
|
||||
|
||||
// ── Guard: 5-clan pool as exported in ai_personalities.json ───────────
|
||||
|
||||
#[test]
fn ai_personalities_json_still_exports_exactly_five_clans() {
    // Guard on the clan roster: ai_personalities.json must be a JSON object
    // with exactly five top-level keys, and those keys must include the five
    // expected clan ids. Per the original note, adding a sixth clan should
    // fail here so the matchup-grid harness gets updated alongside it.
    let expected = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"];

    let json_path = data_dir().join("ai_personalities.json");
    let text = match std::fs::read_to_string(&json_path) {
        Ok(text) => text,
        Err(e) => panic!("failed to read {json_path:?}: {e}"),
    };
    let map: HashMap<String, serde_json::Value> = match serde_json::from_str(&text) {
        Ok(map) => map,
        Err(e) => panic!("{json_path:?} is not valid JSON: {e}"),
    };

    let clan_count = map.len();
    assert_eq!(
        clan_count,
        5,
        "expected exactly 5 clans in ai_personalities.json, found {}: {:?}",
        clan_count,
        map.keys().collect::<Vec<_>>()
    );
    for id in expected {
        assert!(
            map.contains_key(id),
            "ai_personalities.json missing expected clan {id}"
        );
    }
}
|
||||
144
tools/ci-autoplay-smoke.sh
Executable file
144
tools/ci-autoplay-smoke.sh
Executable file
|
|
@ -0,0 +1,144 @@
|
|||
#!/usr/bin/env bash
|
||||
# ci-autoplay-smoke.sh — Hang-regression smoke test for the autoplay pipeline.
|
||||
#
|
||||
# Runs one seeded T100 autoplay with a hard wall-clock budget and asserts the
|
||||
# final `turn_stats.jsonl` entry has `outcome != "in_progress"`. Catches any
|
||||
# class of hang — whether the root cause is in Godot (signal re-entry, main-
|
||||
# loop stall), in Rust (MCTS deadlock, combat infinite loop), or in the
|
||||
# shell harness (pkill substring collision, missing SAFETY timeout).
|
||||
#
|
||||
# Regression history:
|
||||
# 2026-04-17 loop13 — PARALLEL=10 T300 hung all 10 seeds because
|
||||
# `run_ap3.sh`'s cleanup `pkill -f "AUTO_PLAY_DIR=<path>"` substring-matched
|
||||
# active sibling seeds whose paths shared a numeric prefix (seed1 → seed10).
|
||||
# Fixed by switching to a unique per-run AP_RUN_ID token. This smoke test
|
||||
# would have caught the hang immediately in `./run verify` because the
|
||||
# victim game's `outcome` stays "in_progress" after SIGTERM.
|
||||
#
|
||||
# Usage:
|
||||
# tools/ci-autoplay-smoke.sh # default seed=1, T100, 180s budget
|
||||
# tools/ci-autoplay-smoke.sh <seed> <turns> # custom seed/turns
|
||||
#
|
||||
# Environment:
|
||||
# AUTOPLAY_HOST — if set, run via SSH on that host (e.g. apricot)
|
||||
# PROJECT_ROOT_REMOTE — repo path on RUN host (default: $HOME/Code/…)
|
||||
# SMOKE_WALL_BUDGET_SEC — hard wall-clock budget (default: 180)
|
||||
# SMOKE_KEEP_OUTPUT — "1" to keep .local/ci-smoke/ results dir after test
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — game finished with a terminal outcome (victory | max_turns | defeat)
|
||||
# 1 — game hung (outcome still "in_progress") OR no turn_stats produced
|
||||
# 2 — bad arguments / SSH / environment failure
|
||||
#
|
||||
# Hook into ./run verify per p0-10 hang-regression mandate.
|
||||
|
||||
# -u: unset variables are errors; pipefail: a failing pipeline stage is not
# masked. -e is intentionally absent — exit codes are inspected by hand below.
set -uo pipefail

SEED="${1:-1}"
TURNS="${2:-100}"
BUDGET="${SMOKE_WALL_BUDGET_SEC:-180}"

# Report an argument/environment problem and exit with the documented code 2
# (as opposed to code 1, which is reserved for "the game hung").
_abort_env() {
    echo "[ci-autoplay-smoke] ERROR: $*" >&2
    exit 2
}

# Validate inputs before they are used. These values end up in directory
# names and are interpolated into the remote shell command on the SSH path,
# so anything that is not a plain number is rejected here.
_re_int='^-?[0-9]+$'
_re_uint='^[0-9]+$'
_re_secs='^[0-9]+([.][0-9]+)?$'
[[ "$SEED" =~ $_re_int ]] \
    || _abort_env "seed must be an integer (got '$SEED')"
[[ "$TURNS" =~ $_re_uint ]] \
    || _abort_env "turns must be a non-negative integer (got '$TURNS')"
[[ "$BUDGET" =~ $_re_secs ]] \
    || _abort_env "SMOKE_WALL_BUDGET_SEC must be a number of seconds (got '$BUDGET')"

# Resolve the repo root from this script's location (tools/ → repo root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Per-run results directory, keyed by timestamp and seed.
STAMP="$(date +%Y%m%d_%H%M%S)"
RESULTS_DIR="$PROJECT_DIR/.local/ci-smoke/smoke_${STAMP}_seed${SEED}"
mkdir -p "$RESULTS_DIR" \
    || _abort_env "cannot create results dir $RESULTS_DIR"

echo "[ci-autoplay-smoke] seed=$SEED turns=$TURNS budget=${BUDGET}s"
echo "[ci-autoplay-smoke] results: $RESULTS_DIR"
|
||||
|
||||
# EXIT-trap handler: remove the per-run results directory.
#
# The directory is kept when either
#   * SMOKE_KEEP_OUTPUT=1 is set, or
#   * the script is exiting with a non-zero status — the failure messages
#     point the reader at files inside $RESULTS_DIR, so deleting it on the
#     way out would make those pointers useless.
_cleanup() {
    # Must be first: $? here is the script's exit status at trap time.
    local exit_status=$?
    if [ "${SMOKE_KEEP_OUTPUT:-0}" = "1" ]; then
        :
    elif [ "$exit_status" -ne 0 ]; then
        echo "[ci-autoplay-smoke] keeping $RESULTS_DIR for triage (exit $exit_status)" >&2
    else
        rm -rf "$RESULTS_DIR" 2>/dev/null || true
    fi
}
trap _cleanup EXIT
|
||||
|
||||
# Print a tagged failure message (all arguments, space-joined) on stderr and
# terminate the script with exit code 1 — the "hang / no terminal outcome" code.
_fail() {
    printf '%s\n' "[ci-autoplay-smoke] FAIL: $*" 1>&2
    exit 1
}
|
||||
|
||||
# ── Run autoplay ─────────────────────────────────────────────────────────────
|
||||
|
||||
if [ -n "${AUTOPLAY_HOST:-}" ]; then
    # Remote path — use the same runner autoplay-batch.sh uses.
    #
    # The defaults below deliberately contain a LITERAL `$HOME` (note the
    # backslash) so that it is expanded by the remote shell, not locally.
    REMOTE_ROOT="${PROJECT_ROOT_REMOTE:-\$HOME/Code/@projects/@magic-civilization}"
    REMOTE_DIR="${REMOTE_ROOT}/.local/ci-smoke/smoke_${STAMP}_seed${SEED}"
    REMOTE_RUNNER="${REMOTE_RUNNER:-\$HOME/bin/run_ap3.sh}"
    RUN_ID="ci_smoke_${STAMP}_seed${SEED}"

    # Path values are wrapped in remote-side DOUBLE quotes (\"…\"): single
    # quotes would stop the remote shell from expanding the literal `$HOME`
    # and the run would use a directory literally named '$HOME/…'.
    # $REMOTE_RUNNER stays unquoted so the remote shell both expands it and
    # word-splits it, as before.
    ssh "$AUTOPLAY_HOST" "
        set -uo pipefail
        mkdir -p \"$REMOTE_DIR\"
        AUTO_PLAY=true \
        AUTO_PLAY_SEED='$SEED' \
        AUTO_PLAY_TURN_LIMIT='$TURNS' \
        AUTO_PLAY_DIR=\"$REMOTE_DIR\" \
        AP_RUN_ID='$RUN_ID' \
        timeout '$BUDGET' bash $REMOTE_RUNNER
    " >"$RESULTS_DIR/game.log" 2>&1
    REMOTE_EXIT=$?

    # ssh itself exits 255 when the connection/transport failed. That is an
    # environment problem (exit 2), not a hang — report it as such and show
    # the tail of the log inline, since the results dir may be cleaned up.
    if [ "$REMOTE_EXIT" -eq 255 ]; then
        echo "[ci-autoplay-smoke] ERROR: ssh to '$AUTOPLAY_HOST' failed (exit 255)" >&2
        tail -n 5 "$RESULTS_DIR/game.log" >&2 2>/dev/null || true
        exit 2
    fi

    # Pull turn_stats + meta back. The remote auto_play writes either into
    # the AUTO_PLAY_DIR directly (if the caller named it `game_<stamp>_seed<N>`)
    # or into a `game_*` subdir. ssh-cat handles both shapes — globbing via
    # scp's non-quoted path ran into login-shell variations.
    # Best-effort: a missing file is diagnosed by the outcome check later.
    ssh "$AUTOPLAY_HOST" "find \"$REMOTE_DIR\" -maxdepth 3 -name turn_stats.jsonl -print0 | xargs -0 -I{} cat {}" \
        >"$RESULTS_DIR/turn_stats.jsonl" 2>/dev/null || true
    ssh "$AUTOPLAY_HOST" "find \"$REMOTE_DIR\" -maxdepth 3 -name meta.json -print0 | xargs -0 -I{} cat {}" \
        >"$RESULTS_DIR/meta.json" 2>/dev/null || true

    # timeout(1) exits 124 when it had to stop the command at the deadline.
    if [ "$REMOTE_EXIT" -eq 124 ]; then
        _fail "autoplay timed out after ${BUDGET}s — hang regression detected (SSH timeout path)"
    fi
else
    # Local path — flatpak Godot, Linux only.
    if ! command -v flatpak >/dev/null 2>&1; then
        echo "[ci-autoplay-smoke] SKIP: no flatpak locally and AUTOPLAY_HOST unset"
        exit 0
    fi

    # Godot is launched with `--path .`, so a failed cd would silently run
    # it against the wrong directory; treat that as an environment failure.
    if ! cd "$PROJECT_DIR/src/game"; then
        echo "[ci-autoplay-smoke] ERROR: cannot enter $PROJECT_DIR/src/game" >&2
        exit 2
    fi

    timeout "$BUDGET" flatpak run --user \
        --filesystem=home \
        --env=AUTO_PLAY=true \
        --env=AUTO_PLAY_SEED="$SEED" \
        --env=AUTO_PLAY_TURN_LIMIT="$TURNS" \
        --env=AUTO_PLAY_DIR="$RESULTS_DIR" \
        --env=AP_RUN_ID="ci_smoke_${STAMP}_seed${SEED}" \
        org.godotengine.Godot --path . --rendering-method gl_compatibility --headless \
        >"$RESULTS_DIR/game.log" 2>&1
    LOCAL_EXIT=$?

    # timeout(1) exits 124 when it had to stop the command at the deadline.
    if [ "$LOCAL_EXIT" -eq 124 ]; then
        _fail "autoplay timed out after ${BUDGET}s — hang regression detected"
    fi
fi
|
||||
|
||||
# ── Assert terminal outcome ──────────────────────────────────────────────────
|
||||
|
||||
# python3 parses the final JSONL record. Without it LAST_OUTCOME would be
# empty and the run would be misreported as an "unexpected terminal state"
# (exit 1); a missing interpreter is an environment failure (exit 2).
if ! command -v python3 >/dev/null 2>&1; then
    echo "[ci-autoplay-smoke] ERROR: python3 is required to parse turn_stats.jsonl" >&2
    exit 2
fi

# First turn_stats.jsonl anywhere under the results dir (it may sit in a
# subdirectory); it must exist and be non-empty.
STATS_FILE="$(find "$RESULTS_DIR" -name 'turn_stats.jsonl' -type f 2>/dev/null | head -1)"
if [ -z "$STATS_FILE" ] || [ ! -s "$STATS_FILE" ]; then
    _fail "no turn_stats.jsonl produced (autoplay never wrote a turn line)"
fi

# Extract the `outcome` field of the LAST record. Prints 'missing' when the
# key is absent and 'parse_error' when the line is not a JSON object (for
# example a record truncated when the process was killed).
LAST_OUTCOME="$(tail -1 "$STATS_FILE" | python3 -c "
import json, sys
try:
    d = json.loads(sys.stdin.read())
    print(d.get('outcome', 'missing'))
except Exception:
    print('parse_error')
")"

# Only a terminal outcome passes; everything else exits 1 via _fail.
case "$LAST_OUTCOME" in
    victory|max_turns|defeat)
        echo "[ci-autoplay-smoke] PASS — outcome=$LAST_OUTCOME"
        exit 0
        ;;
    in_progress)
        _fail "outcome=in_progress — game hung mid-run (see $STATS_FILE)"
        ;;
    *)
        _fail "outcome=$LAST_OUTCOME — unexpected terminal state (see $STATS_FILE)"
        ;;
esac
|
||||
Loading…
Add table
Reference in a new issue