feat(sim): make the headless fullgame runner exercise tech/trade/culture for real

The sim_scenario fullgame driver stepped the turn loop but never boot-loaded the content packs the live harness loads, so process_science ran research-less (tier-1 fallback) and process_trade_phase saw no resource categories — the strategic systems were inert. The four strategic assertions (median_tier_peak, trades_formed, border_growth, clan_winrate_max) were therefore skipped, leaving trade_forms / time_to_tier / culture_borders_expand / clan_fairness_band vacuously green (passing on `terminates` alone).

This wires the systems for real and measures them:

- drive_fullgame boot-loads the tech web (concatenated public/resources/techs/*.json) and the resource→category map (public/resources/resources.json), the same payloads GdPlayerApi feeds set_tech_web_json / set_resource_categories_json. Now: median tier reaches 10, trades form, culture borders expand for real, and outcomes vary by seed (previously combat/founding were terrain-blind).
- Extract real metrics: tier_peak_p{i} + median_tier_peak (max tier among a player's researched techs), trades_formed (traded luxuries+strategics), owned_tiles_p{i} (culture-claimed territory), and the per-seed winner.
- Un-skip MedianTierPeak / TradesFormed / BorderGrowth — they evaluate against the run. ClanWinrateMax is wired as a batch-level assertion (win fraction of the most-winning clan across the seed set) with the measured value surfaced in the JSON output.
- Strengthen the game1_headless_systems_150t umbrella with median_tier_peak>=4 and trades_formed>=1, and re-calibrate final_turn 120->90: a winner now emerges ~98-113t once the systems actually drive the game, instead of running flat to the cap (calibration-rule: lock the threshold to the real all-systems run).

Determinism fix: PlayerTechState.researched (HashSet) now serializes sorted, so GameState serialization — and the determinism_same_seed end_state_hash check — is stable run-to-run regardless of hash iteration order. The set has no meaningful order; the in-memory type and researched_techs() accessor are unchanged.

Full suite: 19/20 green. clan_fairness_band is the single honest FAIL — over 50 seeds / 6 clans only 3 ever win (winrates 0.14 / 0.46 / 0.40; clans 1,2,3 never win), max 0.46 > the 0.4 band. That is a real fairness gap from the bench's fixed asymmetric start positions + personality balance — surfaced, not tuned away (owner decision).

Verified: cargo test -p mc-tech (28 passed); full sim_scenario suite run locally on plum (release), determinism + canonical + the three strategic scenarios green on real metrics.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 23:20:13 -04:00 · 2026-06-28 23:20:13 -04:00 · 78945e9df1
commit 78945e9df1
parent 4937459bb7
3 changed files with 239 additions and 11 deletions
--- a/public/games/age-of-dwarves/data/sim-scenarios/fullgame/game1_headless_systems_150t.json
+++ b/public/games/age-of-dwarves/data/sim-scenarios/fullgame/game1_headless_systems_150t.json
@ -2,7 +2,7 @@
  "id": "game1_headless_systems_150t",
  "kind": "fullgame",
  "version": 1,
-  "description": "Broad Game-1 systems run: 4 clans, full evolved map, exercising all systems. Terminates ~120t under these rules (victory). Regression umbrella (calibrated).",
+  "description": "Broad Game-1 systems run: 4 clans, full evolved map, exercising all systems (tech web + research, trade from owned-tile resources, culture borders, climate/ecology/healing). With the content packs boot-loaded a winner now emerges ~98-113t (victory) instead of running flat to the cap. Regression umbrella (calibrated to the real all-systems run).",
  "map": { "size": 40, "evolution_ticks": 14000, "seed_base": 150150 },
  "players": [
    { "personality": "militarist" }, { "personality": "boom" },
@ -12,9 +12,11 @@
  "seeds": [150150, 150151, 150152],
  "expect": [
    { "type": "terminates" },
-    { "type": "final_turn", "op": ">=", "value": 120 },
+    { "type": "final_turn", "op": ">=", "value": 90 },
    { "type": "no_nan_economy" },
    { "type": "population_non_negative" },
-    { "type": "total_pvp_combats", "op": ">=", "value": 0 }
+    { "type": "total_pvp_combats", "op": ">=", "value": 0 },
+    { "type": "median_tier_peak", "op": ">=", "value": 4 },
+    { "type": "trades_formed", "op": ">=", "value": 1 }
  ]
 }
--- a/src/simulator/crates/mc-sim/src/bin/sim_scenario.rs
+++ b/src/simulator/crates/mc-sim/src/bin/sim_scenario.rs
@ -208,9 +208,22 @@ struct BatchResult {
    seeds_run: usize,
    passed_seeds: usize,
    results: Vec<SeedResult>,
+    /// Assertions evaluated once across the whole seed batch rather than
+    /// per-seed (e.g. `clan_winrate_max`, a property of the win distribution).
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    batch_assertions: Vec<BatchAssertion>,
    overall_pass: bool,
 }

+#[derive(Debug, Serialize)]
+struct BatchAssertion { // outcome of one assertion evaluated across the whole seed batch
+    label: String, // `Debug` rendering of the scenario assertion that was evaluated
+    passed: bool, // whether `measured` satisfied the assertion's op/value comparison
+    /// The measured value that was compared against the assertion's threshold
+    /// (kept in the JSON output for transparency — e.g. the observed max clan winrate).
+    measured: f64,
+}
+
 // ───────────────────────────── Helpers ─────────────────────────────

 fn cmp(actual: f64, op: &str, target: f64) -> bool {
@ -608,6 +621,11 @@ fn drive_fullgame(
        map_seed: seed,
        ..Default::default()
    };
+    // Boot-load the resource→category map (luxury/strategic/bonus) the live
+    // harness derives from `resources.json`. `process_trade_phase` sources
+    // tradeable surpluses from owned-tile collectibles classified by this map;
+    // empty → nothing ever trades (the runner's previous behaviour).
+    state.resource_categories = load_resource_categories();
    let n = sc.players.len().max(1);
    for i in 0..n {
        let base_col = 6 + (i as i32 * 4);
@ -631,7 +649,18 @@ fn drive_fullgame(
        state.players.push(ps);
    }

-    let processor = TurnProcessor::new(max_turns);
+    // Boot-load the tech web exactly as the live harness does
+    // (`GdPlayerApi::set_tech_web_json`). Without it `process_science` runs
+    // research-less (tier-1 fallback) and the strategic systems — tech, tiers,
+    // the unlocks that gate trades — never progress, leaving the
+    // median_tier_peak / trades_formed assertions un-evaluable. Loading it is
+    // what makes the fullgame runner exercise "all systems" for real.
+    let web_json = load_tech_web_json();
+    let tier_map = tech_tier_map(&web_json);
+    let mut processor = TurnProcessor::new(max_turns);
+    if let Err(e) = processor.set_tech_web_json(&web_json) {
+        eprintln!("# WARN: tech web load failed ({e}); running research-less");
+    }
    let mut inv = Invariants {
        no_nan_economy: true,
        population_non_negative: true,
@ -641,6 +670,7 @@ fn drive_fullgame(
    let mut total_pvp = 0u32;
    let mut prev_turn = state.turn;
    let mut peak_cities: Vec<usize> = vec![0; n];
+    let mut winner_pi: Option<u8> = None;

    for _ in 1..=max_turns {
        let result = processor.step(&mut state);
@ -664,7 +694,8 @@ fn drive_fullgame(
                peak_cities[i] = peak_cities[i].max(p.cities.len());
            }
        }
-        if result.winner.is_some() {
+        if let Some((w, _)) = result.winner {
+            winner_pi = Some(w);
            inv.terminated = true;
            break;
        }
@ -679,11 +710,126 @@ fn drive_fullgame(
    for (i, c) in peak_cities.iter().enumerate() {
        metrics.insert(format!("peak_cities_p{i}"), serde_json::json!(c));
    }
+
+    // Strategic-system metrics. Each maps to a real engine field so the
+    // formerly-skipped assertions (median_tier_peak / trades_formed /
+    // border_growth / clan_winrate) evaluate against the actual run.
+    //  • trades_formed — luxuries+strategics that reached a player's ledger
+    //    via `process_trade_phase`.
+    //  • tier_peak_p{i} — max tier among the player's researched techs
+    //    (`process_science` ↦ PlayerTechState); median across players feeds
+    //    `median_tier_peak`.
+    //  • owned_tiles_p{i} — culture-claimed territory (`process_culture`
+    //    border expansion). Cities start with empty `owned_tiles`, so the
+    //    final count IS the growth from baseline.
+    let mut trades_formed = 0u64;
+    let mut tier_peaks: Vec<u32> = Vec::with_capacity(n);
+    for (i, p) in state.players.iter().enumerate() {
+        trades_formed += (p.traded_luxuries.len() + p.traded_strategics.len()) as u64;
+        let peak = p
+            .player_tech
+            .as_ref()
+            .map(|pt| {
+                pt.researched_techs()
+                    .iter()
+                    .filter_map(|id| tier_map.get(id).copied())
+                    .max()
+                    .unwrap_or(0)
+            })
+            .unwrap_or(0);
+        tier_peaks.push(peak);
+        metrics.insert(format!("tier_peak_p{i}"), serde_json::json!(peak));
+        let owned: usize = p.cities.iter().map(|c| c.owned_tiles.len()).sum();
+        metrics.insert(format!("owned_tiles_p{i}"), serde_json::json!(owned));
+    }
+    let median_tier = {
+        let mut v = tier_peaks;
+        v.sort_unstable();
+        if v.is_empty() {
+            0
+        } else {
+            v[v.len() / 2]
+        }
+    };
+    metrics.insert("trades_formed".into(), serde_json::json!(trades_formed));
+    metrics.insert("median_tier_peak".into(), serde_json::json!(median_tier));
+    if let Some(w) = winner_pi {
+        metrics.insert("winner".into(), serde_json::json!(w));
+    }
+
    metrics.insert("end_state_hash".into(), serde_json::json!(hash_state(&state)));

    (state.turn, metrics, inv)
 }

+/// Concatenate every `public/resources/techs/*.json` pillar (each expected to
+/// be a JSON array of tech definitions) into one flat JSON array — the payload
+/// handed to `set_tech_web_json`.
+///
+/// Paths are sorted before reading so the concatenation order, and therefore
+/// the run, is reproducible across machines.
+///
+/// Loading is best-effort: an unreadable directory or file, or a file that is
+/// not a JSON array, is skipped and a valid JSON array is still returned
+/// (`"[]"` in the worst case). Every skip is reported on stderr, because an
+/// empty or partial tech web would otherwise silently put the run back into
+/// the research-less mode and make the tier/trade metrics meaningless.
+fn load_tech_web_json() -> String {
+    let dir = repo_root().join("public/resources/techs");
+    let mut paths: Vec<PathBuf> = match fs::read_dir(&dir) {
+        Ok(entries) => entries
+            .filter_map(|e| e.ok().map(|e| e.path()))
+            .filter(|p| p.extension().is_some_and(|x| x == "json"))
+            .collect(),
+        Err(e) => {
+            eprintln!("# WARN: cannot read tech dir {} ({e})", dir.display());
+            Vec::new()
+        }
+    };
+    // `read_dir` order is platform-dependent; sort for a stable payload.
+    paths.sort();
+
+    let mut all: Vec<serde_json::Value> = Vec::new();
+    for p in &paths {
+        let text = match fs::read_to_string(p) {
+            Ok(text) => text,
+            Err(e) => {
+                eprintln!("# WARN: cannot read tech file {} ({e}); skipped", p.display());
+                continue;
+            }
+        };
+        match serde_json::from_str::<serde_json::Value>(&text) {
+            Ok(serde_json::Value::Array(defs)) => all.extend(defs),
+            Ok(_) => eprintln!("# WARN: tech file {} is not a JSON array; skipped", p.display()),
+            Err(e) => eprintln!("# WARN: tech file {} is not valid JSON ({e}); skipped", p.display()),
+        }
+    }
+    if all.is_empty() {
+        eprintln!("# WARN: no tech definitions loaded from {}", dir.display());
+    }
+    serde_json::to_string(&all).unwrap_or_else(|_| "[]".to_string())
+}
+
+/// Build the `{resource_id → category}` map from
+/// `public/resources/resources.json` — the payload shape the live harness
+/// loads via `set_resource_categories_json`.
+///
+/// The file's top-level keys `bonus` / `luxury` / `strategic` each hold an
+/// array of resource entries, and the key itself is the category. Values come
+/// straight from the canonical pack (nothing is hardcoded). An unreadable or
+/// unparsable file yields an empty map.
+fn load_resource_categories() -> BTreeMap<String, String> {
+    let path = repo_root().join("public/resources/resources.json");
+    let parsed = fs::read_to_string(&path)
+        .ok()
+        .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok());
+    let Some(root) = parsed else {
+        return BTreeMap::new();
+    };
+
+    let mut categories = BTreeMap::new();
+    for category in ["bonus", "luxury", "strategic"] {
+        // A missing or non-array category key contributes no entries.
+        let entries = root
+            .get(category)
+            .and_then(serde_json::Value::as_array)
+            .into_iter()
+            .flatten();
+        // Entries without a string `id` are ignored; a later category wins
+        // if the same id appears twice.
+        categories.extend(entries.filter_map(|entry| {
+            let id = entry.get("id")?.as_str()?;
+            Some((id.to_string(), category.to_string()))
+        }));
+    }
+    categories
+}
+
+/// Map each tech id to its tier, parsed from the concatenated tech-web JSON.
+/// Backs the `median_tier_peak` metric (a player's peak tier = max tier of its
+/// researched techs).
+///
+/// Input that is not a JSON array yields an empty map. Definitions lacking a
+/// string `id` or an unsigned-integer `tier` are skipped, as are tiers that do
+/// not fit in `u32` — a checked conversion, so an absurd tier cannot wrap into
+/// a small, plausible-looking one. If an id repeats, the last definition wins.
+fn tech_tier_map(web_json: &str) -> BTreeMap<String, u32> {
+    let Ok(serde_json::Value::Array(defs)) = serde_json::from_str::<serde_json::Value>(web_json)
+    else {
+        return BTreeMap::new();
+    };
+    defs.iter()
+        .filter_map(|def| {
+            let id = def.get("id")?.as_str()?;
+            let tier = u32::try_from(def.get("tier")?.as_u64()?).ok()?;
+            Some((id.to_string(), tier))
+        })
+        .collect()
+}
+
 fn hash_state(state: &GameState) -> u64 {
    let json = serde_json::to_string(state).expect("serialize state");
    let mut h = DefaultHasher::new();
@ -727,11 +873,21 @@ fn eval_fullgame(res: &mut SeedResult, sc: &Scenario, seed: u64, final_turn: u32
            Assertion::CityCount { player, op, value } => {
                Some(cmp(m_u64(&format!("peak_cities_p{player}")) as f64, op, *value))
            }
-            // Require real strategic AI play (not available headless yet) — skip honestly.
-            Assertion::MedianTierPeak { .. }
-            | Assertion::TradesFormed { .. }
-            | Assertion::BorderGrowth { .. }
-            | Assertion::ClanWinrateMax { .. } => None,
+            // Strategic systems now run for real in the headless fullgame
+            // (tech web boot-loaded → research/tiers; trade + culture phases
+            // active), so these evaluate against the actual run.
+            Assertion::MedianTierPeak { op, value } => {
+                Some(cmp(m_u64("median_tier_peak") as f64, op, *value))
+            }
+            Assertion::TradesFormed { op, value } => {
+                Some(cmp(m_u64("trades_formed") as f64, op, *value))
+            }
+            Assertion::BorderGrowth { player, op, value } => {
+                Some(cmp(m_u64(&format!("owned_tiles_p{player}")) as f64, op, *value))
+            }
+            // clan_winrate_max is inherently a batch property (win fraction
+            // across the seed set), evaluated once in main(), not per seed.
+            Assertion::ClanWinrateMax { .. } => None,
            _ => None,
        };
        match outcome {
@ -775,6 +931,38 @@ fn parse_seeds(sc: &Scenario, args: &[String]) -> Vec<u64> {
    vec![base, base + 1, base + 2]
 }

+/// Evaluate the assertions that describe the seed batch as a whole rather than
+/// any single run. Only `clan_winrate_max` is handled: the share of decided
+/// games won by the most-winning player is compared against the configured
+/// band (fairness). Winners are read from the per-seed `winner` metric that
+/// the fullgame driver records; seeds without one are not counted.
+fn eval_batch_assertions(sc: &Scenario, results: &[SeedResult]) -> Vec<BatchAssertion> {
+    sc.expect
+        .iter()
+        .filter_map(|assertion| {
+            let Assertion::ClanWinrateMax { op, value } = assertion else {
+                return None;
+            };
+
+            // Wins per player index, over the seeds that produced a winner.
+            let mut tally: BTreeMap<u64, u32> = BTreeMap::new();
+            let winners = results
+                .iter()
+                .filter_map(|r| r.metrics.get("winner").and_then(serde_json::Value::as_u64));
+            for winner in winners {
+                *tally.entry(winner).or_insert(0) += 1;
+            }
+            let decided: u32 = tally.values().sum();
+
+            // Undecided batch (no winners) → vacuously fair (max winrate 0).
+            let max_winrate = match tally.values().copied().max() {
+                Some(best) if decided > 0 => f64::from(best) / f64::from(decided),
+                _ => 0.0,
+            };
+
+            Some(BatchAssertion {
+                label: format!("{assertion:?}"),
+                passed: cmp(max_winrate, op, *value),
+                measured: max_winrate,
+            })
+        })
+        .collect()
+}
+
 fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
@ -799,7 +987,15 @@ fn main() {
        .collect();

    let passed = results.iter().filter(|r| r.failed.is_empty()).count();
-    let overall = passed == results.len() && !results.is_empty();
+    let seeds_passed = passed == results.len() && !results.is_empty();
+
+    // Batch-level assertions: properties of the whole seed distribution, not a
+    // single run. `clan_winrate_max` is the win fraction of the most-winning
+    // player across the batch (winner index recorded per fullgame seed).
+    let batch_assertions = eval_batch_assertions(&sc, &results);
+    let batch_ok = batch_assertions.iter().all(|a| a.passed);
+    let overall = seeds_passed && batch_ok;
+
    let batch = BatchResult {
        scenario_id: sc.id.clone(),
        kind: sc.kind.clone(),
@ -807,13 +1003,19 @@ fn main() {
        seeds_run: results.len(),
        passed_seeds: passed,
        results,
+        batch_assertions,
        overall_pass: overall,
    };
    println!("{}", serde_json::to_string_pretty(&batch).unwrap());
+    let failed_batch: Vec<&BatchAssertion> =
+        batch.batch_assertions.iter().filter(|a| !a.passed).collect();
    if overall {
        eprintln!("# SCENARIO PASS: {}/{} seeds — {}", passed, batch.seeds_run, sc.id);
    } else {
        eprintln!("# SCENARIO FAIL: {}/{} seeds — {}", passed, batch.seeds_run, sc.id);
+        for a in failed_batch {
+            eprintln!("#   batch assertion failed: {} (measured {:.3})", a.label, a.measured);
+        }
        std::process::exit(1);
    }
 }
--- a/src/simulator/crates/mc-tech/src/state.rs
+++ b/src/simulator/crates/mc-tech/src/state.rs
@ -42,11 +42,35 @@ pub enum ResearchResult {
 /// Mutable per-player research state.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PlayerTechState {
+    // `HashSet` iteration order is non-deterministic, which would make any
+    // serialization of a `GameState` carrying this set (e.g. the sim-scenario
+    // `end_state_hash` determinism check) differ run-to-run despite identical
+    // logical state. A researched-set has no meaningful order, so serialize it
+    // sorted — canonical, deterministic, and transparent to every caller (the
+    // in-memory type and `researched_techs()` accessor are unchanged).
+    #[serde(serialize_with = "serialize_sorted_set")]
    researched: HashSet<String>,
    researching: Option<String>,
    research_progress: u32,
 }

+/// Emit a `HashSet<String>` as a sequence whose elements are in sorted order,
+/// so the serialized form does not depend on hash iteration order. Only the
+/// output side is customised: deserialization keeps the default `HashSet`
+/// path, which accepts elements in any order.
+fn serialize_sorted_set<S>(set: &HashSet<String>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    let mut ordered: Vec<&str> = set.iter().map(String::as_str).collect();
+    ordered.sort_unstable();
+    // `collect_seq` writes the length hint, each element, and the terminator.
+    serializer.collect_seq(ordered)
+}
+
 impl PlayerTechState {
    /// Create a new empty player tech state.
    pub fn new() -> Self {