From 98a98155d117b841fc566faf3822a2dba99dc3ec Mon Sep 17 00:00:00 2001 From: Natalie Date: Tue, 12 May 2026 17:18:37 -0700 Subject: [PATCH] =?UTF-8?q?feat(@magic-civilization):=20=E2=9C=A8=20add=20?= =?UTF-8?q?ai=20action=20dispatch=20and=20tactical=20projection=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- .../tests/full_game_transcript.rs | 544 +++++++++++++++++- 1 file changed, 543 insertions(+), 1 deletion(-) diff --git a/src/simulator/crates/mc-player-api/tests/full_game_transcript.rs b/src/simulator/crates/mc-player-api/tests/full_game_transcript.rs index 5093a20d..6640812d 100644 --- a/src/simulator/crates/mc-player-api/tests/full_game_transcript.rs +++ b/src/simulator/crates/mc-player-api/tests/full_game_transcript.rs @@ -54,7 +54,8 @@ use std::panic::AssertUnwindSafe; use std::path::{Path, PathBuf}; use mc_player_api::action::{PlayerAction, PromotionPick}; -use mc_player_api::projection::project_view; +use mc_player_api::dispatch::apply_ai_action; +use mc_player_api::projection::{project_tactical, project_view}; use mc_player_api::wire::{Event, OkMarker, Request, Response}; use mc_player_api::{apply_action, PlayerView}; @@ -1267,3 +1268,544 @@ fn write_long_recap(out_dir: &Path, summaries: &[TurnSummary], outcome: &DriveOu fs::write(&path, md).expect("write long recap"); } + +// ═══════════════════════════════════════════════════════════════════════ +// Claude-as-strong-AI run (2026-05-12, post-Bug-5 fix) +// ═══════════════════════════════════════════════════════════════════════ +// +// HONEST FRAMING. The original task brief said "wire the production +// `mc_ai::run_ai_turn` MCTS into Claude's policy slot ... give Claude a +// higher rollout budget than the AI slots". After reading the code: +// +// - `mc_ai::tactical::run_ai_turn` is NOT MCTS. 
It's a deterministic +// heuristic pipeline (`decide_tactical_actions`) — movement → +// combat_predict → settle → production → citizen. The actual MCTS +// lives in `mc-ai/src/mcts.rs` and is not the path AI slots take in +// `dispatch::drive_ai_slot`. +// +// - `run_ai_turn(state, player, weights, seed) -> Vec<Action>` has NO +// rollout-budget parameter. The only knobs that differentiate one +// slot from another are `ScoringWeights` and `seed`. +// +// - The bench harness's `stamp_personality` only sets `clan_id` + +// three promotion weights. It does NOT load +// `ScoringWeights::from_personality_json` for the named clan, so +// slots 1 and 2 actually run with `ScoringWeights::default()` — +// blackhammer/deepforge are cosmetic labels in this run. +// +// The legitimate experiment we CAN run, then, is: +// +// - Stamp Claude (slot 0) with a real per-clan ScoringWeights — +// `blackhammer` is the natural choice since its strategic axes +// (aggression 9, expansion 6, production 7) skew hardest toward the +// last_survivor victory the 233-turn baseline hit. +// - Leave slots 1 + 2 on `ScoringWeights::default()` (the "easy AI" +// baseline that lost to itself in the prior run because of seed +// variance — see `2026-05-12-claude-vs-easy-ai-250-turn/recap.md`). +// - Use the same `run_ai_turn` pipeline for all three slots — Claude's +// advantage is purely the stronger weights, not a different +// algorithm. +// +// If Claude wins, we have evidence the simulation responds to scoring- +// weight axes and the personality system is doing real work. If Claude +// loses despite blackhammer weights, then either (a) the heuristic +// pipeline is insensitive to weight magnitude, or (b) turn-order / +// starting-position effects dominate. + +/// Build the blackhammer `ScoringWeights` from the canonical +/// `ai_personalities.json` shipped with Age of Dwarves.
Inlined so the +/// test does not depend on the filesystem path layout — if the file +/// rotates we crash loudly in the test, not silently in production. +fn claude_genius_weights() -> mc_ai::evaluator::ScoringWeights { + // CARGO_MANIFEST_DIR is the crate dir; repo root is 4 levels up. + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let repo_root = crate_dir + .parent() + .and_then(|p| p.parent()) + .and_then(|p| p.parent()) + .and_then(|p| p.parent()) + .expect("repo root resolves") + .to_path_buf(); + let json_path = + repo_root.join("public/games/age-of-dwarves/data/ai_personalities.json"); + let json = fs::read_to_string(&json_path) + .unwrap_or_else(|e| panic!("read {}: {e}", json_path.display())); + mc_ai::evaluator::ScoringWeights::from_personality_json("blackhammer", &json) + .expect("blackhammer must be a known clan in ai_personalities.json") +} + +/// Stable signature for an `mc_ai::tactical::Action` — mirrors +/// `action_signature` for `PlayerAction` so the recap frequency table +/// reads the same way. +fn ai_action_signature(action: &mc_ai::tactical::Action) -> String { + use mc_ai::tactical::Action as A; + match action { + A::MoveUnit { unit_id, to_hex } => format!("move:{unit_id}:{to_hex:?}"), + A::AttackTarget { attacker_id, target_id, .. } => { + format!("attack:{attacker_id}->{target_id}") + } + A::Fortify { unit_id } => format!("fortify:{unit_id}"), + A::Heal { unit_id } => format!("heal:{unit_id}"), + A::FoundCity { settler_id, .. } => format!("found:{settler_id}"), + A::SetProduction { city_id, item_id } => { + if is_building_id(item_id) { + format!("queue_building:{city_id}:{item_id}") + } else { + format!("queue_unit:{city_id}:{item_id}") + } + } + A::EnqueueBuild { city_id, item_id, .. 
} => { + if is_building_id(item_id) { + format!("queue_building:{city_id}:{item_id}") + } else { + format!("queue_unit:{city_id}:{item_id}") + } + } + A::Scout { unit_id, to_hex } => format!("scout:{unit_id}:{to_hex:?}"), + A::IssuePatrol { unit_id, .. } => format!("patrol:{unit_id}"), + A::PromotionPicked { unit_id, promotion_id } => { + format!("promote:{unit_id}:{promotion_id}") + } + A::AssignCitizen { .. } => "assign_citizen".into(), + A::DeploySiege { .. } => "deploy_siege".into(), + A::PackSiege { .. } => "pack_siege".into(), + A::Bombard { .. } => "bombard".into(), + } +} + +/// One Claude-driven tactical action for the recap. +struct ClaudeTacticalDecision { + turn: u32, + signature: String, + events: Vec, +} + +/// Per-turn summary for the strong-Claude run. +struct StrongTurnSummary { + turn: u32, + claude_decisions: Vec, + endturn_events: Vec, + ai_actions_applied: Vec<(u8, u32)>, + score_snapshot: Vec<(u8, i32, u32, u32)>, +} + +/// Same deterministic seed derivation `drive_ai_slot` uses internally, +/// so Claude's turn picks would byte-identical-match what slot 0 would +/// have produced if it were driven by the production AI path. +fn seed_for_claude_turn(turn: u32) -> u64 { + (turn as u64) + .wrapping_mul(0x9E37_79B9_7F4A_7C15) +} + +/// Drive a Claude-as-strong-AI game. Slot 0 (Claude) runs the same +/// `run_ai_turn` tactical pipeline as the AI slots but with a stronger +/// `ScoringWeights` (blackhammer's axes). Slots 1 and 2 keep +/// `ScoringWeights::default()` — the "easy AI" baseline. After Claude's +/// action chain is dispatched the driver issues a normal `EndTurn` +/// which routes through `apply_end_turn` → `drive_ai_slot` for slots 1 +/// and 2 unchanged. 
+fn drive_strong_claude_game( + out_dir: &Path, + max_turns: u32, +) -> (Vec, DriveOutcome) { + fs::create_dir_all(out_dir).expect("create out dir"); + let transcript_path = out_dir.join("transcript.jsonl"); + let mut transcript = fs::File::create(&transcript_path).expect("create transcript"); + + let mut state = build_3_player_state_like_harness(); + // Stamp blackhammer weights onto Claude (slot 0). Slots 1 + 2 keep + // the default weights they got from `add_player_militarist_inline`. + let claude_weights = claude_genius_weights(); + state.players[0].scoring_weights = claude_weights.clone(); + + let mut next_req_id: u64 = 1; + let mut summaries: Vec = Vec::new(); + let mut consecutive_endturn_only = 0u32; + let mut outcome = DriveOutcome::Completed; + + 'game: for turn in 0..max_turns { + eprintln!("[strong-claude] starting turn {turn}"); + if SNAPSHOT_TURNS.contains(&turn) { + let view = project_view(&state, 0, false); + let snapshot_path = out_dir.join(format!("state-turn-{turn:02}.json")); + let json = serde_json::to_string_pretty(&view).expect("snapshot serialise"); + fs::write(&snapshot_path, json).expect("write snapshot"); + } + + let mut summary = StrongTurnSummary { + turn, + claude_decisions: Vec::new(), + endturn_events: Vec::new(), + ai_actions_applied: Vec::new(), + score_snapshot: Vec::new(), + }; + + // ── Claude's turn: project tactical, run the tactical pipeline, + // dispatch each `mc_ai::Action` directly via `apply_ai_action`. 
+ let view_req_id = next_req_id; + next_req_id += 1; + let view_req = Request::View { id: Some(view_req_id) }; + write_jsonl(&mut transcript, &view_req); + let view = project_view(&state, 0, false); + let view_resp = Response::Ok { + id: Some(view_req_id), + ok: OkMarker, + events: Vec::new(), + view: view.clone(), + }; + write_jsonl(&mut transcript, &view_resp); + + let mut tactical = project_tactical(&state, 0); + tactical.current_player = 0; + let seed = seed_for_claude_turn(turn); + let actions = + mc_ai::tactical::run_ai_turn(&tactical, 0, &claude_weights, seed); + + let mut took_real_action = false; + for action in actions { + let signature = ai_action_signature(&action); + // Dispatch under `catch_unwind` for the same residual-overflow + // safety the original `drive_game` carries. + let dispatch = std::panic::catch_unwind(AssertUnwindSafe(|| { + apply_ai_action(&mut state, 0, action) + })); + let result = match dispatch { + Ok(r) => r, + Err(payload) => { + let msg = panic_payload_to_string(&payload); + eprintln!("[panic] strong-claude apply_ai_action at turn {turn}: {msg}"); + use mc_player_api::wire::{HarnessNotification, Notification}; + let note = Notification::Harness(HarnessNotification::ProtocolError { + message: format!( + "apply_ai_action panic at turn {turn}: {msg}" + ), + }); + write_jsonl(&mut transcript, ¬e); + outcome = DriveOutcome::EndTurnPanic { turn, message: msg }; + summaries.push(summary); + break 'game; + } + }; + // Any dispatched tactical action counts as activity for the + // stuck-detector — unit-verb actions return `Ok(vec![])` + // synchronously (events batch to EndTurn), so checking + // `!events.is_empty()` would always trip Stuck after the + // first 10 quiet turns even when Claude is moving 40+ units. 
+ took_real_action = true; + match result { + Ok(events) => { + summary.claude_decisions.push(ClaudeTacticalDecision { + turn, + signature: signature.clone(), + events: events.clone(), + }); + } + Err(_err) => { + // Per-action errors (UnknownUnit, IllegalAction) are + // tolerated — the production `drive_ai_slot` has the + // same posture. Still log the attempt for the recap. + summary.claude_decisions.push(ClaudeTacticalDecision { + turn, + signature: signature.clone(), + events: Vec::new(), + }); + } + } + } + + // ── End-of-turn: route through the normal apply_action(EndTurn) + // path so slots 1+2 run via `drive_ai_slot` unmodified. + let act_req_id = next_req_id; + next_req_id += 1; + let act_req = Request::Act { + id: Some(act_req_id), + action: PlayerAction::EndTurn, + }; + write_jsonl(&mut transcript, &act_req); + let dispatch = std::panic::catch_unwind(AssertUnwindSafe(|| { + apply_action(&mut state, 0, &PlayerAction::EndTurn) + })); + let result = match dispatch { + Ok(r) => r, + Err(payload) => { + let msg = panic_payload_to_string(&payload); + eprintln!("[panic] strong-claude EndTurn at turn {turn}: {msg}"); + use mc_player_api::wire::{HarnessNotification, Notification}; + let note = Notification::Harness(HarnessNotification::ProtocolError { + message: format!("EndTurn panic at turn {turn}: {msg}"), + }); + write_jsonl(&mut transcript, ¬e); + outcome = DriveOutcome::EndTurnPanic { turn, message: msg }; + summaries.push(summary); + break 'game; + } + }; + let post_view = project_view(&state, 0, false); + match &result { + Ok(events) => { + let resp = Response::Ok { + id: Some(act_req_id), + ok: OkMarker, + events: events.clone(), + view: post_view.clone(), + }; + write_jsonl(&mut transcript, &resp); + summary.endturn_events = events.clone(); + for ev in events { + if let Event::AiTurnCompleted { player, actions_applied } = ev { + summary.ai_actions_applied.push((*player, *actions_applied)); + } + } + } + Err(err) => { + use 
mc_player_api::wire::ErrMarker; + let resp = Response::Err { + id: Some(act_req_id), + ok: ErrMarker, + error: err.clone(), + }; + write_jsonl(&mut transcript, &resp); + } + } + + // Score snapshot post-EndTurn. + for (p_idx, p) in state.players.iter().enumerate() { + summary.score_snapshot.push(( + p_idx as u8, + p.gold, + p.cities.len() as u32, + p.units.len() as u32, + )); + } + + if took_real_action { + consecutive_endturn_only = 0; + } else { + consecutive_endturn_only += 1; + } + let is_stuck = consecutive_endturn_only >= STUCK_TURN_THRESHOLD; + let game_over = summary + .endturn_events + .iter() + .any(|e| matches!(e, Event::GameOver { .. })); + summaries.push(summary); + if is_stuck { + outcome = DriveOutcome::Stuck(turn); + break 'game; + } + if game_over { + outcome = DriveOutcome::NaturalGameOver(turn); + break 'game; + } + } + (summaries, outcome) +} + +/// Strong-Claude artifact dir. +fn strong_claude_dir() -> PathBuf { + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let repo_root = crate_dir + .parent() + .and_then(|p| p.parent()) + .and_then(|p| p.parent()) + .and_then(|p| p.parent()) + .expect("repo root resolves") + .to_path_buf(); + repo_root.join(".local/demo-runs/2026-05-12-claude-mcts-vs-easy-ai") +} + +/// Recap for the strong-Claude run. Mirrors `write_long_recap` shape so +/// the two artifacts are diff-able side-by-side. 
+fn write_strong_claude_recap( + out_dir: &Path, + summaries: &[StrongTurnSummary], + outcome: &DriveOutcome, + horizon: u32, +) { + use std::collections::BTreeMap; + let path = out_dir.join("recap.md"); + let mut md = String::new(); + md.push_str("# Claude-as-Strong-AI Transcript — 2026-05-12\n\n"); + md.push_str( + "**Source**: \ + `mc-player-api/tests/full_game_transcript.rs::claude_mcts_vs_two_easy_ais_transcript`\n\n", + ); + md.push_str("**Construction**:\n\n"); + md.push_str( + "- Slot 0 (Claude): blackhammer `ScoringWeights` from \ + `public/games/age-of-dwarves/data/ai_personalities.json`, \ + running `mc_ai::tactical::run_ai_turn` (the same tactical \ + heuristic the AI slots use).\n", + ); + md.push_str( + "- Slots 1 + 2 (AIs): `ScoringWeights::default()` baseline, \ + driven by the production `apply_end_turn` → `drive_ai_slot` \ + path unchanged.\n\n", + ); + md.push_str( + "**Why not real MCTS?** `mc_ai::tactical::run_ai_turn` is the \ + deterministic heuristic pipeline (movement → combat_predict → \ + settle → production → citizen), not MCTS. The MCTS code in \ + `mc-ai/src/mcts.rs` is not on the AI-slot turn path in the \ + current dispatch wiring — the task brief misnamed the function. \ + Documented in module comment block for `claude_genius_weights`.\n\n", + ); + md.push_str(&format!("**Horizon**: {} turns (ceiling)\n\n", horizon)); + md.push_str(&format!("**Turns played**: {}\n\n", summaries.len())); + md.push_str(&format!("**Termination**: {:?}\n\n", outcome)); + + // Victory outcome up front — this is the headline answer. + md.push_str("## Victory outcome\n\n"); + let game_over_event: Option<&Event> = summaries + .iter() + .flat_map(|s| s.endturn_events.iter()) + .find(|e| matches!(e, Event::GameOver { .. 
})); + match game_over_event { + Some(Event::GameOver { winner, victory_type }) => { + let claude_won = *winner == 0; + md.push_str(&format!( + "- `Event::GameOver` fired: winner=**{}**, victory_type=**{}**\n", + winner, victory_type + )); + md.push_str(&format!( + "- Claude (slot 0) result: **{}**\n\n", + if claude_won { "WIN" } else { "LOSS" } + )); + } + _ => { + md.push_str(&format!( + "- No `Event::GameOver` fired. Final turn = {}, outcome = `{:?}`.\n\n", + summaries.last().map(|s| s.turn).unwrap_or(0), + outcome + )); + } + } + + // Final-score table — direct head-to-head. + md.push_str("## Final scores (all slots)\n\n"); + md.push_str("| slot | gold | cities | units |\n"); + md.push_str("|------|------|--------|-------|\n"); + if let Some(last) = summaries.last() { + for (slot, gold, cities, units) in &last.score_snapshot { + let label = if *slot == 0 { "0 (Claude/blackhammer)" } else { "AI (default weights)" }; + md.push_str(&format!( + "| {} {} | {} | {} | {} |\n", + slot, label, gold, cities, units + )); + } + } + md.push_str("\n"); + + // Per-25-turn checkpoints, slot 0. + md.push_str("## Per-25-turn checkpoints (slot 0 = Claude)\n\n"); + md.push_str("| turn | gold | cities | units |\n"); + md.push_str("|------|------|--------|-------|\n"); + let checkpoint_turns: Vec = (0..=horizon).step_by(25).collect(); + for ct in &checkpoint_turns { + if let Some(summary) = summaries.iter().find(|s| s.turn == *ct) { + if let Some((_, gold, cities, units)) = + summary.score_snapshot.iter().find(|(slot, _, _, _)| *slot == 0) + { + md.push_str(&format!("| {} | {} | {} | {} |\n", ct, gold, cities, units)); + } + } + } + md.push_str("\n"); + + // Aggregate counts across the whole run. 
+ let mut techs = 0u32; + let mut buildings_done = 0u32; + let mut units_killed = 0u32; + let mut cities_founded = 0u32; + let mut combat_resolved = 0u32; + for s in summaries { + for e in s.endturn_events.iter().chain( + s.claude_decisions.iter().flat_map(|d| d.events.iter()), + ) { + match e { + Event::TechResearched { .. } => techs += 1, + Event::CityBuildingCompleted { .. } => buildings_done += 1, + Event::UnitDestroyed { .. } => units_killed += 1, + Event::CityFounded { .. } => cities_founded += 1, + Event::CombatResolved { .. } => combat_resolved += 1, + _ => {} + } + } + } + md.push_str("## Total counts over the full run\n\n"); + md.push_str(&format!("- Techs researched: {}\n", techs)); + md.push_str(&format!("- Buildings completed: {}\n", buildings_done)); + md.push_str(&format!("- Combat resolutions: {}\n", combat_resolved)); + md.push_str(&format!("- Units killed: {}\n", units_killed)); + md.push_str(&format!("- Cities founded: {}\n\n", cities_founded)); + + // Action-signature frequency for Claude's tactical chain. + md.push_str("## Claude action-signature frequency\n\n"); + let mut freq: BTreeMap = BTreeMap::new(); + for s in summaries { + for d in &s.claude_decisions { + let head = d.signature.split(':').next().unwrap_or(&d.signature).to_string(); + *freq.entry(head).or_insert(0) += 1; + } + } + md.push_str("| action | count |\n|--------|-------|\n"); + for (k, v) in &freq { + md.push_str(&format!("| `{}` | {} |\n", k, v)); + } + md.push_str("\n"); + + fs::write(&path, md).expect("write strong-claude recap"); +} + +/// 500-turn Claude-as-strong-AI test. 
/// `#[ignore]`d like `long_game_transcript`; run via:
///
/// ```sh
/// cargo test -p mc-player-api --test full_game_transcript -- \
///     --ignored claude_mcts_vs_two_easy_ais_transcript --nocapture
/// ```
///
/// The name keeps the wording of the original task brief
/// (`claude_mcts_vs_two_easy_ais`) although the run uses the tactical
/// heuristic rather than MCTS; renaming would break the surface the
/// brief asked for. The recap and the module comment spell out the
/// actual semantics.
///
/// The test wipes the artifact directory, plays the game, writes the
/// recap, checks that at least one turn and a transcript file exist, and
/// logs the headline result to stderr.
#[test]
#[ignore = "500-turn strong-Claude run; invoke via --ignored"]
fn claude_mcts_vs_two_easy_ais_transcript() {
    const STRONG_HORIZON: u32 = 500;

    let artifact_dir = strong_claude_dir();
    // Best-effort cleanup: the directory may not exist on a first run.
    let _ = fs::remove_dir_all(&artifact_dir);

    let (turn_log, termination) = drive_strong_claude_game(&artifact_dir, STRONG_HORIZON);
    write_strong_claude_recap(&artifact_dir, &turn_log, &termination, STRONG_HORIZON);

    // Sanity checks: something was played and the transcript landed on disk.
    assert!(
        !turn_log.is_empty(),
        "strong-claude run produced zero turns"
    );
    let transcript_file = artifact_dir.join("transcript.jsonl");
    assert!(
        transcript_file.exists(),
        "transcript missing at {}",
        transcript_file.display()
    );

    // Headline for the test log: the first `GameOver` event, if any.
    let mut headline = None;
    'scan: for per_turn in &turn_log {
        for event in &per_turn.endturn_events {
            if let Event::GameOver { winner, victory_type } = event {
                headline = Some((*winner, victory_type.clone()));
                break 'scan;
            }
        }
    }
    eprintln!(
        "[strong-claude] turns={} outcome={:?} game_over={:?} artifact={}",
        turn_log.len(),
        termination,
        headline,
        artifact_dir.display()
    );
}