magicciv/tools/test_matchup_and_ultimate.py
Natalie b568d85966 feat(@projects/@magic-civilization): add game 3 objectives and testing tools
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 13:01:10 -07:00

215 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Tests for matchup_balance and ultimate_stress verdicts in checklist-report.py.
Covers the two new gates added 2026-04-17 alongside `tools/matchup-grid.sh`
(C(5,2)=10 1v1 pairings) and `tools/huge-map-5clan.sh` (5-clan huge-map
ultimate AI lookahead stress test). Both functions consume
`list[tuple[int, dict]]` rows exactly like the pre-existing
`personality_win_balance_verdict`, so the fixture style mirrors
`tools/test_personality_winrate.py`.
Run: python3 tools/test_matchup_and_ultimate.py
"""
from __future__ import annotations
import importlib.util as _iu
import sys
from pathlib import Path
# Directory that contains this script; tool modules are resolved relative to it.
_TOOLS = Path(__file__).parent
def _load(name: str, stem: str):
    """Import ``<stem>.py`` from the tools directory under the module name *name*.

    Going through a file-location spec allows loading files whose stem is not
    a valid Python identifier (for example one containing a hyphen). The
    module is executed and returned; it is not registered in ``sys.modules``.
    """
    module_spec = _iu.spec_from_file_location(name, _TOOLS / f"{stem}.py")
    module = _iu.module_from_spec(module_spec)  # type: ignore[arg-type]
    module_spec.loader.exec_module(module)  # type: ignore[union-attr]
    return module
# Module under test. The file name contains a hyphen, so it is loaded by path
# via _load rather than with a plain import statement.
cr = _load(name="checklist_report", stem="checklist-report")
def _row(outcome: str, clans: dict, winner_index: int = 1, turn: int = 200) -> dict:
    """Build one game-result row in the shape the fixtures feed to the verdicts.

    Args:
        outcome: Outcome label stored under ``"outcome"``.
        clans: Mapping of player slot (as a string) to clan id. Stored by
            reference under ``"player_clans"``, not copied.
        winner_index: Winning player slot; stored as given.
        turn: Turn number stored under ``"turn"``.

    Returns:
        A dict whose ``"winner_personality"`` is the clan found at
        ``str(winner_index)`` in *clans*, or ``""`` when that slot is absent.
    """
    winner_personality = clans.get(str(winner_index), "")
    row: dict = {"outcome": outcome, "winner_index": winner_index}
    row.update(
        winner_personality=winner_personality,
        player_clans=clans,
        turn=turn,
    )
    return row
def _grid_balanced_1v1() -> list:
    """Build an evenly split 1v1 grid over all C(5,2)=10 clan pairings.

    Each pairing (a, b) contributes four rows, in this order: a "victory" for
    a, a "victory" for b (both turn 180), then an "in_progress" row for a and
    one for b (both turn 300, winner_index -1). That yields 40 rows with
    seeds 0-39; every clan ends up with 4 victory and 4 in_progress rows.

    Returns:
        A list of ``(seed, row)`` tuples.
    """
    from itertools import combinations

    clan_ids = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"]
    # (outcome, winner_index, turn) emitted once per clan of every pairing.
    templates = (("victory", 1, 180), ("in_progress", -1, 300))

    rows: list = []
    for pairing in combinations(clan_ids, 2):
        for outcome, winner_index, turn in templates:
            for clan in pairing:
                # The seed is simply the running row count (0, 1, 2, ...).
                row = _row(outcome, {"0": "", "1": clan},
                           winner_index=winner_index, turn=turn)
                rows.append((len(rows), row))
    return rows
def _grid_dominant_clan() -> list:
    """Build a grid in which every "victory" row belongs to ironhold.

    For each of the four other clans, ten repetitions of an ironhold
    "victory" row (turn 150) followed by an "in_progress" row for that
    opponent (turn 300, winner_index -1). Seeds count up from 0.

    Returns:
        A list of ``(seed, row)`` tuples.
    """
    # NOTE(review): assumes the opponent's in_progress row is emitted on each
    # of the ten repetitions, not once per opponent -- confirm against the
    # original file's indentation.
    opponents = ("goldvein", "blackhammer", "deepforge", "runesmith")
    specs: list = []
    for opp in opponents:
        specs.extend(
            [("victory", "ironhold", 1, 150), ("in_progress", opp, -1, 300)] * 10
        )
    return [
        (seed, _row(outcome, {"0": "", "1": clan}, winner_index=winner_index, turn=turn))
        for seed, (outcome, clan, winner_index, turn) in enumerate(specs)
    ]
def _grid_undersampled() -> list:
    """Build a tiny grid with only two "victory" rows per clan.

    Produces 10 rows (seeds 0-9), all with winner_index 1 and turn 200.

    Returns:
        A list of ``(seed, row)`` tuples.
    """
    clan_ids = ("ironhold", "goldvein", "blackhammer", "deepforge", "runesmith")
    rows: list = []
    for clan in clan_ids:
        for _ in range(2):
            # The running row count reproduces seeds 2*i and 2*i + 1 per clan.
            row = _row("victory", {"0": "", "1": clan}, winner_index=1, turn=200)
            rows.append((len(rows), row))
    return rows
def _ultimate_balanced() -> list:
    """Build ten 5-clan "victory" rows whose winners are spread over four clans.

    Every row lists all five clans in slots "0"-"4" and ends on turn 350;
    the winner slot is the slot of the scheduled winning clan. Each row gets
    its own ``player_clans`` dict.

    Returns:
        A list of ``(seed, row)`` tuples with seeds 0-9.
    """
    roster = ("ironhold", "goldvein", "blackhammer", "deepforge", "runesmith")
    scheduled_winners = [
        "ironhold", "goldvein", "blackhammer", "deepforge", "ironhold",
        "goldvein", "blackhammer", "ironhold", "deepforge", "goldvein",
    ]
    rows: list = []
    for seed, winner in enumerate(scheduled_winners):
        clans = {str(slot): clan for slot, clan in enumerate(roster)}
        # A clan's position in the roster is its player slot.
        winner_slot = roster.index(winner)
        rows.append((seed, _row("victory", clans, winner_index=winner_slot, turn=350)))
    return rows
def _ultimate_degenerate_single_winner() -> list:
    """Build ten 5-clan "victory" rows that are all won by slot 0 (ironhold).

    All rows end on turn 350 and share one ``player_clans`` dict.

    Returns:
        A list of ``(seed, row)`` tuples with seeds 0-9.
    """
    roster = ("ironhold", "goldvein", "blackhammer", "deepforge", "runesmith")
    clans = {str(slot): clan for slot, clan in enumerate(roster)}
    rows: list = []
    for seed in range(10):
        rows.append((seed, _row("victory", clans, winner_index=0, turn=350)))
    return rows
def _ultimate_all_stalemates() -> list:
    """Build ten 5-clan rows that are all "in_progress" with no winner.

    Every row has winner_index -1 and turn 500, and all rows share one
    ``player_clans`` dict.

    Returns:
        A list of ``(seed, row)`` tuples with seeds 0-9.
    """
    roster = ("ironhold", "goldvein", "blackhammer", "deepforge", "runesmith")
    clans = {str(slot): clan for slot, clan in enumerate(roster)}
    rows: list = []
    for seed in range(10):
        rows.append((seed, _row("in_progress", clans, winner_index=-1, turn=500)))
    return rows
def _ultimate_snap_games() -> list:
    """Build ten 5-clan "victory" rows that end almost immediately.

    Winners alternate between ironhold and goldvein. Every row is created with
    turn 25, then the first row's turn is overwritten with 500, leaving one
    long game among nine very short ones. All rows share one ``player_clans``
    dict.

    Returns:
        A list of ``(seed, row)`` tuples with seeds 0-9.
    """
    clans = {
        "0": "ironhold",
        "1": "goldvein",
        "2": "blackhammer",
        "3": "deepforge",
        "4": "runesmith",
    }
    slot_of = {clan: int(slot) for slot, clan in clans.items()}
    rows = [
        (seed, _row("victory", clans, winner_index=slot_of[winner], turn=25))
        for seed, winner in enumerate(["ironhold", "goldvein"] * 5)
    ]
    # Single outlier: only the first game runs long.
    _, first_row = rows[0]
    first_row["turn"] = 500
    return rows
def _ultimate_too_few_samples() -> list:
    """Build only four 5-clan "victory" rows, all won by slot 0 on turn 300.

    All rows share one ``player_clans`` dict.

    Returns:
        A list of ``(seed, row)`` tuples with seeds 0-3.
    """
    roster = ("ironhold", "goldvein", "blackhammer", "deepforge", "runesmith")
    clans = {str(slot): clan for slot, clan in enumerate(roster)}
    rows: list = []
    for seed in range(4):
        rows.append((seed, _row("victory", clans, winner_index=0, turn=300)))
    return rows
# Names of the checks that have failed so far, in the order they ran.
_fails: list = []
# Number of checks that have passed so far.
_passes = 0


def _check(name: str, cond: bool, detail: str = "") -> None:
    """Record and print the result of one named check.

    Args:
        name: Label printed for the check and stored in ``_fails`` on failure.
        cond: Truthy when the check passed.
        detail: Optional diagnostic text; printed only when the check fails.

    A pass increments ``_passes``; a failure appends *name* to ``_fails``.
    """
    global _passes
    if cond:
        _passes += 1
        print(f" PASS {name}")
        return
    _fails.append(name)
    # Separate the detail from the name; previously the two were concatenated
    # directly, producing unreadable "FAIL <name><detail>" lines.
    suffix = f": {detail}" if detail else ""
    print(f" FAIL {name}{suffix}")
def _reasons_detail(verdict: dict) -> str:
    """Format a verdict's reasons as the diagnostic text for a failed check."""
    return f"reasons={verdict['reasons']}"


def _reason_mentions(verdict: dict, *needles: str) -> bool:
    """Return True if some reason string contains at least one of *needles*."""
    return any(
        any(needle in reason for needle in needles)
        for reason in verdict["reasons"]
    )


# --- matchup_balance_verdict: balanced, dominant-clan and undersampled grids ---
print("matchup_balance_verdict")

v = cr.matchup_balance_verdict(_grid_balanced_1v1())
_check("balanced grid passes", v["pass"], _reasons_detail(v))
_check("balanced grid reports 40 sample size", v["sample_size"] == 40)
_check(
    "balanced grid lists all 5 clans",
    set(cr.KNOWN_CLAN_IDS).issubset(v["clans"].keys()),
)
_check("balanced grid has no missing clans", v["missing_clans"] == [])
for clan_id in cr.KNOWN_CLAN_IDS:
    win_rate_pct = v["clans"][clan_id]["win_rate_pct"]
    _check(
        f"balanced: {clan_id} win_rate <= 50%",
        win_rate_pct <= 50,
        f"{clan_id} saw {win_rate_pct}%",
    )

v_dom = cr.matchup_balance_verdict(_grid_dominant_clan())
_check("dominant-clan grid fails", not v_dom["pass"])
_check(
    "dominant-clan reason mentions ironhold + 50%",
    # Both substrings must occur in the same reason string.
    any("ironhold" in reason and "50" in reason for reason in v_dom["reasons"]),
    _reasons_detail(v_dom),
)

v_under = cr.matchup_balance_verdict(_grid_undersampled())
_check("undersampled grid fails", not v_under["pass"])
_check(
    "undersampled reason mentions 'grid incomplete'",
    _reason_mentions(v_under, "grid incomplete", "appearances"),
    _reasons_detail(v_under),
)

# --- ultimate_stress_verdict: one passing sweep and four failing sweeps ---
print("\nultimate_stress_verdict")

v_ult = cr.ultimate_stress_verdict(_ultimate_balanced())
_check("balanced ultimate passes", v_ult["pass"], _reasons_detail(v_ult))
_check("balanced ultimate has 10 samples", v_ult["sample_size"] == 10)
_check("balanced ultimate victory_count == 10", v_ult["victory_count"] == 10)
_check(
    "balanced ultimate distinct_winners >= 2",
    len(v_ult["distinct_winners"]) >= 2,
    f"got {v_ult['distinct_winners']}",
)
_check("balanced ultimate median_turn > 0", v_ult["median_turn"] > 0)

v_single = cr.ultimate_stress_verdict(_ultimate_degenerate_single_winner())
_check("single-winner sweep fails", not v_single["pass"])
_check(
    "single-winner reason mentions degenerate",
    _reason_mentions(v_single, "degenerate"),
    _reasons_detail(v_single),
)

v_stale = cr.ultimate_stress_verdict(_ultimate_all_stalemates())
_check("all-stalemate fails", not v_stale["pass"])
_check(
    "all-stalemate reason mentions stalling",
    _reason_mentions(v_stale, "stalling", "decisive"),
    _reasons_detail(v_stale),
)

v_snap = cr.ultimate_stress_verdict(_ultimate_snap_games())
_check("snap-ending fails", not v_snap["pass"])
_check(
    "snap-ending reason mentions map not being used",
    _reason_mentions(v_snap, "snap-ending", "not being used", "median"),
    _reasons_detail(v_snap),
)

v_few = cr.ultimate_stress_verdict(_ultimate_too_few_samples())
_check("too-few-samples fails", not v_few["pass"])
_check(
    "too-few-samples reason mentions sample size",
    _reason_mentions(v_few, "SEEDS", "samples", "games"),
    _reasons_detail(v_few),
)

# --- summary: exit status 1 if any check failed ---
print()
if _fails:
    total_checks = _passes + len(_fails)
    print(f"FAILED ({len(_fails)} / {total_checks})")
    for failed_name in _fails:
        print(f" × {failed_name}")
    sys.exit(1)
print(f"PASS {_passes} / {_passes}")