feat(sim): make the headless fullgame runner exercise tech/trade/culture for real

The sim_scenario fullgame driver stepped the turn loop but never boot-loaded the content packs the live harness loads, so process_science ran research-less (tier-1 fallback) and process_trade_phase saw no resource categories — the strategic systems were inert. The four strategic assertions (median_tier_peak, trades_formed, border_growth, clan_winrate) were therefore skipped, leaving trade_forms / time_to_tier / culture_borders_expand / clan_fairness_band vacuously green (passing on `terminates` alone). This wires the systems for real and measures them: - drive_fullgame boot-loads the tech web (concatenated public/resources/techs/ *.json) and the resource→category map (public/resources/resources.json), the same payloads GdPlayerApi feeds set_tech_web_json / set_resource_categories_json. Now: median tier reaches 10, trades form, culture borders expand for real, and outcomes vary by seed (previously combat/founding were terrain-blind). - Extract real metrics: tier_peak_p{i} + median_tier_peak (max tier among a player's researched techs), trades_formed (traded luxuries+strategics), owned_tiles_p{i} (culture-claimed territory), and the per-seed winner. - Un-skip MedianTierPeak / TradesFormed / BorderGrowth — they evaluate against the run. ClanWinrateMax is wired as a batch-level assertion (win fraction of the most-winning clan across the seed set) with the measured value surfaced in the JSON output. - Strengthen the game1_headless_systems_150t umbrella with median_tier_peak>=4 and trades_formed>=1, and re-calibrate final_turn 120->90: a winner now emerges ~98-113t once the systems actually drive the game, instead of running flat to the cap (calibration-rule: lock the threshold to the real all-systems run). Determinism fix: PlayerTechState.researched (HashSet) now serializes sorted, so GameState serialization — and the determinism_same_seed end_state_hash check — is stable run-to-run regardless of hash iteration order. The set has no meaningful order; the in-memory type and researched_techs() accessor are unchanged. Full suite: 19/20 green. clan_fairness_band is the single honest FAIL — over 50 seeds / 6 clans only 3 ever win (winrates 0.14 / 0.46 / 0.40; clans 1,2,3 never win), max 0.46 > the 0.4 band. That is a real fairness gap from the bench's fixed asymmetric start positions + personality balance — surfaced, not tuned away (owner decision). Verified: cargo test -p mc-tech (28 passed); full sim_scenario suite run locally on plum (release), determinism + canonical + the three strategic scenarios green on real metrics. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 23:20:13 -04:00 · 2026-06-28 23:20:13 -04:00 · 78945e9df1
commit 78945e9df1
parent 4937459bb7
3 changed files with 239 additions and 11 deletions
--- a/public/games/age-of-dwarves/data/sim-scenarios/fullgame/game1_headless_systems_150t.json
+++ b/public/games/age-of-dwarves/data/sim-scenarios/fullgame/game1_headless_systems_150t.json
@ -2,7 +2,7 @@
  "id": "game1_headless_systems_150t",
  "kind": "fullgame",
  "version": 1,
-  "description": "Broad Game-1 systems run: 4 clans, full evolved map, exercising all systems. Terminates ~120t under these rules (victory). Regression umbrella (calibrated).",
+  "description": "Broad Game-1 systems run: 4 clans, full evolved map, exercising all systems (tech web + research, trade from owned-tile resources, culture borders, climate/ecology/healing). With the content packs boot-loaded a winner now emerges ~98-113t (victory) instead of running flat to the cap. Regression umbrella (calibrated to the real all-systems run).",
  "map": { "size": 40, "evolution_ticks": 14000, "seed_base": 150150 },
  "players": [
    { "personality": "militarist" }, { "personality": "boom" },
@ -12,9 +12,11 @@
  "seeds": [150150, 150151, 150152],
  "expect": [
    { "type": "terminates" },
-    { "type": "final_turn", "op": ">=", "value": 120 },
+    { "type": "final_turn", "op": ">=", "value": 90 },
    { "type": "no_nan_economy" },
    { "type": "population_non_negative" },
-    { "type": "total_pvp_combats", "op": ">=", "value": 0 }
+    { "type": "total_pvp_combats", "op": ">=", "value": 0 },
    { "type": "median_tier_peak", "op": ">=", "value": 4 },
    { "type": "trades_formed", "op": ">=", "value": 1 }
  ]
 }
--- a/src/simulator/crates/mc-sim/src/bin/sim_scenario.rs
+++ b/src/simulator/crates/mc-sim/src/bin/sim_scenario.rs
@ -208,9 +208,22 @@ struct BatchResult {
    seeds_run: usize,
    passed_seeds: usize,
    results: Vec<SeedResult>,
    /// Assertions evaluated once across the whole seed batch rather than
    /// per-seed (e.g. `clan_winrate_max`, a property of the win distribution).
    #[serde(skip_serializing_if = "Vec::is_empty")]
    batch_assertions: Vec<BatchAssertion>,
    overall_pass: bool,
 }
 #[derive(Debug, Serialize)]
 struct BatchAssertion {
    label: String,
    passed: bool,
    /// The measured value the assertion was checked against (for transparency
    /// in the JSON output — e.g. the observed max clan winrate).
    measured: f64,
 }
 // ───────────────────────────── Helpers ─────────────────────────────
 fn cmp(actual: f64, op: &str, target: f64) -> bool {
@ -608,6 +621,11 @@ fn drive_fullgame(
        map_seed: seed,
        ..Default::default()
    };
    // Boot-load the resource→category map (luxury/strategic/bonus) the live
    // harness derives from `resources.json`. `process_trade_phase` sources
    // tradeable surpluses from owned-tile collectibles classified by this map;
    // empty → nothing ever trades (the runner's previous behaviour).
    state.resource_categories = load_resource_categories();
    let n = sc.players.len().max(1);
    for i in 0..n {
        let base_col = 6 + (i as i32 * 4);
@ -631,7 +649,18 @@ fn drive_fullgame(
        state.players.push(ps);
    }
-    let processor = TurnProcessor::new(max_turns);
+    // Boot-load the tech web exactly as the live harness does
    // (`GdPlayerApi::set_tech_web_json`). Without it `process_science` runs
    // research-less (tier-1 fallback) and the strategic systems — tech, tiers,
    // the unlocks that gate trades — never progress, leaving the
    // median_tier_peak / trades_formed assertions un-evaluable. Loading it is
    // what makes the fullgame runner exercise "all systems" for real.
    let web_json = load_tech_web_json();
    let tier_map = tech_tier_map(&web_json);
    let mut processor = TurnProcessor::new(max_turns);
    if let Err(e) = processor.set_tech_web_json(&web_json) {
        eprintln!("# WARN: tech web load failed ({e}); running research-less");
    }
    let mut inv = Invariants {
        no_nan_economy: true,
        population_non_negative: true,
@ -641,6 +670,7 @@ fn drive_fullgame(
    let mut total_pvp = 0u32;
    let mut prev_turn = state.turn;
    let mut peak_cities: Vec<usize> = vec![0; n];
    let mut winner_pi: Option<u8> = None;
    for _ in 1..=max_turns {
        let result = processor.step(&mut state);
@ -664,7 +694,8 @@ fn drive_fullgame(
                peak_cities[i] = peak_cities[i].max(p.cities.len());
            }
        }
-        if result.winner.is_some() {
+        if let Some((w, _)) = result.winner {
            winner_pi = Some(w);
            inv.terminated = true;
            break;
        }
@ -679,11 +710,126 @@ fn drive_fullgame(
    for (i, c) in peak_cities.iter().enumerate() {
        metrics.insert(format!("peak_cities_p{i}"), serde_json::json!(c));
    }
    // Strategic-system metrics. Each maps to a real engine field so the
    // formerly-skipped assertions (median_tier_peak / trades_formed /
    // border_growth / clan_winrate) evaluate against the actual run.
    //  • trades_formed — luxuries+strategics that reached a player's ledger
    //    via `process_trade_phase`.
    //  • tier_peak_p{i} — max tier among the player's researched techs
    //    (`process_science` ↦ PlayerTechState); median across players feeds
    //    `median_tier_peak`.
    //  • owned_tiles_p{i} — culture-claimed territory (`process_culture`
    //    border expansion). Cities start with empty `owned_tiles`, so the
    //    final count IS the growth from baseline.
    let mut trades_formed = 0u64;
    let mut tier_peaks: Vec<u32> = Vec::with_capacity(n);
    for (i, p) in state.players.iter().enumerate() {
        trades_formed += (p.traded_luxuries.len() + p.traded_strategics.len()) as u64;
        let peak = p
            .player_tech
            .as_ref()
            .map(|pt| {
                pt.researched_techs()
                    .iter()
                    .filter_map(|id| tier_map.get(id).copied())
                    .max()
                    .unwrap_or(0)
            })
            .unwrap_or(0);
        tier_peaks.push(peak);
        metrics.insert(format!("tier_peak_p{i}"), serde_json::json!(peak));
        let owned: usize = p.cities.iter().map(|c| c.owned_tiles.len()).sum();
        metrics.insert(format!("owned_tiles_p{i}"), serde_json::json!(owned));
    }
    let median_tier = {
        let mut v = tier_peaks;
        v.sort_unstable();
        if v.is_empty() {
            0
        } else {
            v[v.len() / 2]
        }
    };
    metrics.insert("trades_formed".into(), serde_json::json!(trades_formed));
    metrics.insert("median_tier_peak".into(), serde_json::json!(median_tier));
    if let Some(w) = winner_pi {
        metrics.insert("winner".into(), serde_json::json!(w));
    }
    metrics.insert("end_state_hash".into(), serde_json::json!(hash_state(&state)));
    (state.turn, metrics, inv)
 }
 /// Concatenate every `public/resources/techs/*.json` pillar (each a JSON array
 /// of tech definitions) into one flat array — the same payload the live harness
 /// feeds `set_tech_web_json`. Deterministic order (sorted paths) so the run is
 /// reproducible across machines.
 fn load_tech_web_json() -> String {
    let dir = repo_root().join("public/resources/techs");
    let mut all: Vec<serde_json::Value> = Vec::new();
    if let Ok(entries) = fs::read_dir(&dir) {
        let mut paths: Vec<PathBuf> = entries
            .filter_map(|e| e.ok().map(|e| e.path()))
            .filter(|p| p.extension().is_some_and(|x| x == "json"))
            .collect();
        paths.sort();
        for p in paths {
            if let Ok(text) = fs::read_to_string(&p) {
                if let Ok(serde_json::Value::Array(defs)) = serde_json::from_str(&text) {
                    all.extend(defs);
                }
            }
        }
    }
    serde_json::to_string(&all).unwrap_or_else(|_| "[]".to_string())
 }
 /// Flatten `public/resources/resources.json` into the `{resource_id →
 /// category}` map the live harness loads via `set_resource_categories_json`.
 /// Top-level keys `bonus` / `luxury` / `strategic` each hold an array of
 /// resource entries; the key IS the category. Reads the same canonical pack
 /// the live game does (no hardcoded values).
 fn load_resource_categories() -> BTreeMap<String, String> {
    let path = repo_root().join("public/resources/resources.json");
    let mut map = BTreeMap::new();
    let Ok(text) = fs::read_to_string(&path) else {
        return map;
    };
    let Ok(root) = serde_json::from_str::<serde_json::Value>(&text) else {
        return map;
    };
    for category in ["bonus", "luxury", "strategic"] {
        if let Some(arr) = root.get(category).and_then(serde_json::Value::as_array) {
            for entry in arr {
                if let Some(id) = entry.get("id").and_then(serde_json::Value::as_str) {
                    map.insert(id.to_string(), category.to_string());
                }
            }
        }
    }
    map
 }
 /// tech id → tier, parsed from the concatenated tech-web JSON. Backs the
 /// `median_tier_peak` metric (a player's peak tier = max tier of its
 /// researched techs).
 fn tech_tier_map(web_json: &str) -> BTreeMap<String, u32> {
    let mut m = BTreeMap::new();
    if let Ok(serde_json::Value::Array(defs)) = serde_json::from_str::<serde_json::Value>(web_json) {
        for d in defs {
            if let (Some(id), Some(tier)) = (
                d.get("id").and_then(serde_json::Value::as_str),
                d.get("tier").and_then(serde_json::Value::as_u64),
            ) {
                m.insert(id.to_string(), tier as u32);
            }
        }
    }
    m
 }
 fn hash_state(state: &GameState) -> u64 {
    let json = serde_json::to_string(state).expect("serialize state");
    let mut h = DefaultHasher::new();
@ -727,11 +873,21 @@ fn eval_fullgame(res: &mut SeedResult, sc: &Scenario, seed: u64, final_turn: u32
            Assertion::CityCount { player, op, value } => {
                Some(cmp(m_u64(&format!("peak_cities_p{player}")) as f64, op, *value))
            }
-            // Require real strategic AI play (not available headless yet) — skip honestly.
+            // Strategic systems now run for real in the headless fullgame
-            Assertion::MedianTierPeak { .. }
+            // (tech web boot-loaded → research/tiers; trade + culture phases
-            | Assertion::TradesFormed { .. }
+            // active), so these evaluate against the actual run.
-            | Assertion::BorderGrowth { .. }
+            Assertion::MedianTierPeak { op, value } => {
-            | Assertion::ClanWinrateMax { .. } => None,
+                Some(cmp(m_u64("median_tier_peak") as f64, op, *value))
            }
            Assertion::TradesFormed { op, value } => {
                Some(cmp(m_u64("trades_formed") as f64, op, *value))
            }
            Assertion::BorderGrowth { player, op, value } => {
                Some(cmp(m_u64(&format!("owned_tiles_p{player}")) as f64, op, *value))
            }
            // clan_winrate_max is inherently a batch property (win fraction
            // across the seed set), evaluated once in main(), not per seed.
            Assertion::ClanWinrateMax { .. } => None,
            _ => None,
        };
        match outcome {
@ -775,6 +931,38 @@ fn parse_seeds(sc: &Scenario, args: &[String]) -> Vec<u64> {
    vec![base, base + 1, base + 2]
 }
 /// Evaluate assertions that are properties of the whole seed batch rather than
 /// a single run. Currently `clan_winrate_max`: the fraction of games won by the
 /// most-winning player must stay at/below the band (fairness). Reads the
 /// per-seed `winner` metric recorded by the fullgame driver.
 fn eval_batch_assertions(sc: &Scenario, results: &[SeedResult]) -> Vec<BatchAssertion> {
    let mut out = Vec::new();
    for a in &sc.expect {
        if let Assertion::ClanWinrateMax { op, value } = a {
            let mut wins: BTreeMap<u64, u32> = BTreeMap::new();
            let mut decided = 0u32;
            for r in results {
                if let Some(w) = r.metrics.get("winner").and_then(serde_json::Value::as_u64) {
                    *wins.entry(w).or_insert(0) += 1;
                    decided += 1;
                }
            }
            // Undecided batch (no winners) → vacuously fair (max winrate 0).
            let max_winrate = if decided == 0 {
                0.0
            } else {
                wins.values().copied().max().unwrap_or(0) as f64 / decided as f64
            };
            out.push(BatchAssertion {
                label: format!("{a:?}"),
                passed: cmp(max_winrate, op, *value),
                measured: max_winrate,
            });
        }
    }
    out
 }
 fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
@ -799,7 +987,15 @@ fn main() {
        .collect();
    let passed = results.iter().filter(|r| r.failed.is_empty()).count();
-    let overall = passed == results.len() && !results.is_empty();
+    let seeds_passed = passed == results.len() && !results.is_empty();
    // Batch-level assertions: properties of the whole seed distribution, not a
    // single run. `clan_winrate_max` is the win fraction of the most-winning
    // player across the batch (winner index recorded per fullgame seed).
    let batch_assertions = eval_batch_assertions(&sc, &results);
    let batch_ok = batch_assertions.iter().all(|a| a.passed);
    let overall = seeds_passed && batch_ok;
    let batch = BatchResult {
        scenario_id: sc.id.clone(),
        kind: sc.kind.clone(),
@ -807,13 +1003,19 @@ fn main() {
        seeds_run: results.len(),
        passed_seeds: passed,
        results,
        batch_assertions,
        overall_pass: overall,
    };
    println!("{}", serde_json::to_string_pretty(&batch).unwrap());
    let failed_batch: Vec<&BatchAssertion> =
        batch.batch_assertions.iter().filter(|a| !a.passed).collect();
    if overall {
        eprintln!("# SCENARIO PASS: {}/{} seeds — {}", passed, batch.seeds_run, sc.id);
    } else {
        eprintln!("# SCENARIO FAIL: {}/{} seeds — {}", passed, batch.seeds_run, sc.id);
        for a in failed_batch {
            eprintln!("#   batch assertion failed: {} (measured {:.3})", a.label, a.measured);
        }
        std::process::exit(1);
    }
 }
--- a/src/simulator/crates/mc-tech/src/state.rs
+++ b/src/simulator/crates/mc-tech/src/state.rs
@ -42,11 +42,35 @@ pub enum ResearchResult {
 /// Mutable per-player research state.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PlayerTechState {
    // `HashSet` iteration order is non-deterministic, which would make any
    // serialization of a `GameState` carrying this set (e.g. the sim-scenario
    // `end_state_hash` determinism check) differ run-to-run despite identical
    // logical state. A researched-set has no meaningful order, so serialize it
    // sorted — canonical, deterministic, and transparent to every caller (the
    // in-memory type and `researched_techs()` accessor are unchanged).
    #[serde(serialize_with = "serialize_sorted_set")]
    researched: HashSet<String>,
    researching: Option<String>,
    research_progress: u32,
 }
 /// Serialize a `HashSet<String>` as a sorted JSON array so the output is
 /// deterministic regardless of hash iteration order. Deserialization uses the
 /// default `HashSet` path (order-independent on the way in).
 fn serialize_sorted_set<S>(set: &HashSet<String>, serializer: S) -> Result<S::Ok, S::Error>
 where
    S: serde::Serializer,
 {
    use serde::ser::SerializeSeq;
    let mut sorted: Vec<&String> = set.iter().collect();
    sorted.sort_unstable();
    let mut seq = serializer.serialize_seq(Some(sorted.len()))?;
    for item in sorted {
        seq.serialize_element(item)?;
    }
    seq.end()
 }
 impl PlayerTechState {
    /// Create a new empty player tech state.
    pub fn new() -> Self {