magicciv/tools/test_matchup_and_ultimate.py

216 lines
7.7 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""Tests for matchup_balance and ultimate_stress verdicts in checklist-report.py.
Covers the two new gates added 2026-04-17 alongside `tools/matchup-grid.sh`
(C(5,2)=10 1v1 pairings) and `tools/huge-map-5clan.sh` (5-clan huge-map
ultimate AI lookahead stress test). Both functions consume
`list[tuple[int, dict]]` rows exactly like the pre-existing
`personality_win_balance_verdict`, so the fixture style mirrors
`tools/test_personality_winrate.py`.
Run: python3 tools/test_matchup_and_ultimate.py
"""
from __future__ import annotations
import importlib.util as _iu
import sys
from pathlib import Path
# Directory containing this test file; sibling tool scripts are loaded from here.
_TOOLS = Path(__file__).parent


def _load(name: str, stem: str):
    """Import ``<stem>.py`` from this file's directory as a module named *name*.

    Loading by file path makes it possible to import scripts whose filenames
    are not valid Python identifiers.

    Args:
        name: Module name assigned to the loaded module.
        stem: Filename without the ``.py`` suffix, resolved against ``_TOOLS``.

    Returns:
        The executed module object (it is not added to ``sys.modules``).

    Raises:
        ImportError: If no import spec or loader can be built for the path.
    """
    path = _TOOLS / f"{stem}.py"
    spec = _iu.spec_from_file_location(name, path)
    # Fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module {name!r} from {path}")
    mod = _iu.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
# The report script's filename contains a hyphen, so it cannot be pulled in
# with a plain `import` statement; load it by file path instead.
cr = _load("checklist_report", "checklist-report")
def _row(outcome: str, clans: dict, winner_index: int = 1, turn: int = 200) -> dict:
    """Build one game-result row in the shape the verdict functions are fed.

    Args:
        outcome: Outcome label stored under ``"outcome"``.
        clans: Mapping of player slot (as a string) to clan id. It is stored
            by reference under ``"player_clans"``, not copied.
        winner_index: Winning player slot; the fixtures use ``-1`` for rows
            without a winner.
        turn: Turn number stored under ``"turn"``.

    Returns:
        A dict with keys ``outcome``, ``winner_index``, ``winner_personality``,
        ``player_clans`` and ``turn``. ``winner_personality`` is the clan found
        at ``str(winner_index)`` in *clans*, or ``""`` when that slot is absent.
    """
    return {
        "outcome": outcome,
        "winner_index": winner_index,
        "winner_personality": clans.get(str(winner_index), ""),
        "player_clans": clans,
        "turn": turn,
    }
def _grid_balanced_1v1() -> list:
    """Build a 40-row 1v1 grid in which every clan wins half of its rows.

    For each of the 10 unordered pairs of the five clans, four rows are
    emitted in this order: a turn-180 victory for the first clan, a turn-180
    victory for the second clan, then a turn-300 ``in_progress`` row for each
    of them. Every clan is therefore the slot-1 clan in 8 rows and wins 4.

    Returns:
        ``(seed, row)`` tuples, with seeds 0..39 assigned in emission order.
    """
    clan_ids = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"]
    games: list = []
    for i, first in enumerate(clan_ids):
        # Pair each clan only with the clans after it: unordered pairs, no repeats.
        for second in clan_ids[i + 1:]:
            for outcome, winner_index, turn in (("victory", 1, 180), ("in_progress", -1, 300)):
                for clan in (first, second):
                    games.append(
                        _row(outcome, {"0": "", "1": clan}, winner_index=winner_index, turn=turn)
                    )
    # The seed is just the emission index, so number the rows in one pass.
    return list(enumerate(games))
def _grid_dominant_clan() -> list:
    """Build a grid in which ``ironhold`` wins every row it appears in.

    For each of the four other clans, ten rounds are emitted; each round is
    one turn-150 ``ironhold`` victory followed by one turn-300 ``in_progress``
    row for the opposing clan.

    NOTE(review): this assumes both rows belong to the inner 10-round loop
    (80 rows in total) -- confirm against the upstream file.

    Returns:
        ``(seed, row)`` tuples, with seeds assigned in emission order from 0.
    """
    opponents = ["goldvein", "blackhammer", "deepforge", "runesmith"]
    games: list = []
    for opp in opponents:
        for _ in range(10):
            games.append(_row("victory", {"0": "", "1": "ironhold"}, winner_index=1, turn=150))
            games.append(_row("in_progress", {"0": "", "1": opp}, winner_index=-1, turn=300))
    # The seed is just the emission index.
    return list(enumerate(games))
def _grid_undersampled() -> list:
    """Build a tiny grid with only two turn-200 victory rows per clan.

    Returns:
        Ten ``(seed, row)`` tuples; clan ``i`` receives seeds ``2*i`` and
        ``2*i + 1``.
    """
    clan_ids = ["ironhold", "goldvein", "blackhammer", "deepforge", "runesmith"]
    games: list = []
    for clan in clan_ids:
        # Two separate _row calls so the rows never share a dict object.
        games.append(_row("victory", {"0": "", "1": clan}, winner_index=1, turn=200))
        games.append(_row("victory", {"0": "", "1": clan}, winner_index=1, turn=200))
    return list(enumerate(games))
def _ultimate_balanced() -> list:
    """Build ten 5-clan turn-350 victories spread over four different winners.

    The winners are ironhold x3, goldvein x3, blackhammer x2 and deepforge x2;
    ``runesmith`` takes part in every game but never wins.

    Returns:
        ``(seed, row)`` tuples with seeds 0..9.
    """
    wins = ["ironhold", "goldvein", "blackhammer", "deepforge", "ironhold",
            "goldvein", "blackhammer", "ironhold", "deepforge", "goldvein"]
    rows: list = []
    for seed, winner in enumerate(wins):
        # A fresh mapping per row, so rows never share a player_clans dict.
        clans = {"0": "ironhold", "1": "goldvein", "2": "blackhammer",
                 "3": "deepforge", "4": "runesmith"}
        # Look up the slot key that holds the winning clan.
        w_slot = next(k for k, v in clans.items() if v == winner)
        rows.append((seed, _row("victory", clans, winner_index=int(w_slot), turn=350)))
    return rows
def _ultimate_degenerate_single_winner() -> list:
    """Build ten 5-clan turn-350 victories that are all won by slot 0 (ironhold).

    Returns:
        ``(seed, row)`` tuples with seeds 0..9; every row references the same
        ``player_clans`` mapping.
    """
    clans = {"0": "ironhold", "1": "goldvein", "2": "blackhammer",
             "3": "deepforge", "4": "runesmith"}
    return [(s, _row("victory", clans, winner_index=0, turn=350)) for s in range(10)]
def _ultimate_all_stalemates() -> list:
    """Build ten 5-clan games that are all still ``in_progress`` at turn 500.

    Returns:
        ``(seed, row)`` tuples with seeds 0..9 and ``winner_index`` -1; every
        row references the same ``player_clans`` mapping.
    """
    clans = {"0": "ironhold", "1": "goldvein", "2": "blackhammer",
             "3": "deepforge", "4": "runesmith"}
    return [(s, _row("in_progress", clans, winner_index=-1, turn=500)) for s in range(10)]
def _ultimate_snap_games() -> list:
    """Build ten 5-clan victories that almost all end at turn 25.

    Winners alternate between ironhold and goldvein. The first row's turn is
    then overwritten with 500, so nine of the ten turn values remain 25.

    Returns:
        ``(seed, row)`` tuples with seeds 0..9.
    """
    clans = {"0": "ironhold", "1": "goldvein", "2": "blackhammer",
             "3": "deepforge", "4": "runesmith"}
    wins = ["ironhold", "goldvein"] * 5
    rows: list = []
    for seed, winner in enumerate(wins):
        w_slot = next(k for k, v in clans.items() if v == winner)
        rows.append((seed, _row("victory", clans, winner_index=int(w_slot), turn=25)))
    # A single long outlier; the other nine games keep the short turn count.
    rows[0][1]["turn"] = 500
    return rows
def _ultimate_too_few_samples() -> list:
    """Build only four 5-clan turn-300 victories, all won by slot 0 (ironhold).

    Returns:
        ``(seed, row)`` tuples with seeds 0..3; every row references the same
        ``player_clans`` mapping.
    """
    clans = {"0": "ironhold", "1": "goldvein", "2": "blackhammer",
             "3": "deepforge", "4": "runesmith"}
    return [(s, _row("victory", clans, winner_index=0, turn=300)) for s in range(4)]
# Names of the checks that failed, in the order they ran.
_fails: list = []
# Number of checks that passed so far.
_passes = 0


def _check(name: str, cond: bool, detail: str = "") -> None:
    """Record the result of one named check and print a PASS/FAIL line.

    Args:
        name: Label for the check; appended to ``_fails`` when it fails.
        cond: Truthy when the check passed.
        detail: Optional diagnostic text, printed only on failure.
    """
    global _passes
    if cond:
        _passes += 1
        print(f" PASS {name}")
        return
    _fails.append(name)
    # Keep the diagnostic visually apart from the check name; without a
    # separator the two strings run together in the output.
    suffix = f" — {detail}" if detail else ""
    print(f" FAIL {name}{suffix}")
print("matchup_balance_verdict")

# Balanced grid: 40 rows, every known clan present, nobody above a 50% win rate.
v = cr.matchup_balance_verdict(_grid_balanced_1v1())
_check("balanced grid passes", v["pass"], f"reasons={v['reasons']}")
_check("balanced grid reports 40 sample size", v["sample_size"] == 40)
_check("balanced grid lists all 5 clans", set(v["clans"]) >= set(cr.KNOWN_CLAN_IDS))
_check("balanced grid has no missing clans", v["missing_clans"] == [])
for c in cr.KNOWN_CLAN_IDS:
    _check(
        f"balanced: {c} win_rate <= 50%",
        v["clans"][c]["win_rate_pct"] <= 50,
        f"{c} saw {v['clans'][c]['win_rate_pct']}%",
    )

# Dominant grid: ironhold wins every row it appears in, so the gate must fail.
v_dom = cr.matchup_balance_verdict(_grid_dominant_clan())
_check("dominant-clan grid fails", not v_dom["pass"])
_check(
    "dominant-clan reason mentions ironhold + 50%",
    any("ironhold" in r and "50" in r for r in v_dom["reasons"]),
    f"reasons={v_dom['reasons']}",
)

# Undersampled grid: only two rows per clan, so the gate must fail.
v_under = cr.matchup_balance_verdict(_grid_undersampled())
_check("undersampled grid fails", not v_under["pass"])
_check(
    "undersampled reason mentions 'grid incomplete'",
    any("grid incomplete" in r or "appearances" in r for r in v_under["reasons"]),
    f"reasons={v_under['reasons']}",
)
def _reasons_mention(reasons, *needles: str) -> bool:
    """Return True when any entry of *reasons* contains at least one needle."""
    return any(any(needle in reason for needle in needles) for reason in reasons)


print("\nultimate_stress_verdict")

# Healthy sweep: ten decisive games shared between several winners.
v_ult = cr.ultimate_stress_verdict(_ultimate_balanced())
_check("balanced ultimate passes", v_ult["pass"], f"reasons={v_ult['reasons']}")
_check("balanced ultimate has 10 samples", v_ult["sample_size"] == 10)
_check("balanced ultimate victory_count == 10", v_ult["victory_count"] == 10)
distinct = v_ult["distinct_winners"]
_check("balanced ultimate distinct_winners >= 2", len(distinct) >= 2, f"got {distinct}")
_check("balanced ultimate median_turn > 0", v_ult["median_turn"] > 0)

# One clan wins every game.
v_single = cr.ultimate_stress_verdict(_ultimate_degenerate_single_winner())
_check("single-winner sweep fails", not v_single["pass"])
single_reasons = v_single["reasons"]
_check(
    "single-winner reason mentions degenerate",
    _reasons_mention(single_reasons, "degenerate"),
    f"reasons={single_reasons}",
)

# No game ever reaches a victory.
v_stale = cr.ultimate_stress_verdict(_ultimate_all_stalemates())
_check("all-stalemate fails", not v_stale["pass"])
stale_reasons = v_stale["reasons"]
_check(
    "all-stalemate reason mentions stalling",
    _reasons_mention(stale_reasons, "stalling", "decisive"),
    f"reasons={stale_reasons}",
)

# Nearly every game ends at turn 25.
v_snap = cr.ultimate_stress_verdict(_ultimate_snap_games())
_check("snap-ending fails", not v_snap["pass"])
snap_reasons = v_snap["reasons"]
_check(
    "snap-ending reason mentions map not being used",
    _reasons_mention(snap_reasons, "snap-ending", "not being used", "median"),
    f"reasons={snap_reasons}",
)

# Only four games were supplied.
v_few = cr.ultimate_stress_verdict(_ultimate_too_few_samples())
_check("too-few-samples fails", not v_few["pass"])
few_reasons = v_few["reasons"]
_check(
    "too-few-samples reason mentions sample size",
    _reasons_mention(few_reasons, "SEEDS", "samples", "games"),
    f"reasons={few_reasons}",
)
# Final summary: list every failed check and exit non-zero, or report a clean run.
print("")
if _fails:
    print(f"FAILED ({len(_fails)} / {_passes + len(_fails)})")
    for n in _fails:
        print(f" × {n}")
    sys.exit(1)
print(f"PASS {_passes} / {_passes}")