magicciv/tools/quality-gates-report.py
Natalie 081b516af1 fix(@projects/@magic-civilization): 🐛 resolve gut cleanup and update objectives
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-26 00:37:04 -07:00

170 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""quality-gates-report.py — evaluate p0-01 calibrated quality sub-gates against an autoplay batch.
Usage:
python3 tools/quality-gates-report.py <batch-dir>
Gates (from .project/team-leads/warcouncil.md and p0-01-mcts-wiring.md:33-38):
- median winner_tier_peak >= 4
- median tier_peak_gap <= 4
- >=1 player peak_unit_tier >= 3 in >=7/10 seeds
- wonder_count >= 1 in >=5/10 seeds
- total_combats >= 20 median
Exit code: 0 if all gates pass, 1 if any fail.
"""
from __future__ import annotations
import json
import statistics
import sys
from pathlib import Path
# Calibrated quality sub-gates, keyed by metric name. Each value is a
# (comparison operator, threshold) pair mirroring the "Gates" list in the
# module docstring.
GATES = dict(
    median_winner_tier_peak=(">=", 4),
    median_tier_peak_gap=("<=", 4),
    seeds_max_peak_unit_ge3=(">=", 7),
    seeds_with_wonder=(">=", 5),
    median_total_combats=(">=", 20),
)
def _dict_or_empty(value: object) -> dict:
    """Return *value* when it is a dict, otherwise a fresh empty dict."""
    return value if isinstance(value, dict) else {}


def _stat(pdata: object, key: str):
    """Read a per-player stat, treating a missing or JSON-null entry as 0."""
    return _dict_or_empty(pdata).get(key) or 0


def collect(batch_dir: Path) -> list[dict]:
    """Summarise the final recorded turn of every game under *batch_dir*.

    Only sub-directories whose name starts with ``game_`` and that contain a
    ``turn_stats.jsonl`` file are considered, in name order. The last
    non-blank JSON line of that file is treated as the end-of-game record.

    Games whose stats file cannot be read or parsed, or whose final record is
    not a JSON object, are skipped with a warning on stderr rather than
    aborting the whole batch. Files with no records are skipped silently.

    Args:
        batch_dir: Directory holding one sub-directory per game.

    Returns:
        One dict per usable game with the keys ``seed``, ``turn``,
        ``outcome``, ``winner_personality``, ``winner_tp``, ``loser_tp``,
        ``gap``, ``alive_count``, ``max_peak_unit``, ``wonders`` and
        ``combats``. ``loser_tp`` and ``gap`` are ``None`` when fewer than two
        players still hold a city in the final record.
    """
    results: list[dict] = []
    for game_dir in sorted(batch_dir.iterdir(), key=lambda p: p.name):
        if not game_dir.is_dir() or not game_dir.name.startswith("game_"):
            continue
        stats_file = game_dir / "turn_stats.jsonl"
        if not stats_file.exists():
            continue
        try:
            records = [
                json.loads(line)
                for line in stats_file.read_text(encoding="utf-8").splitlines()
                if line.strip()
            ]
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"warning: skipping {stats_file}: {exc}", file=sys.stderr)
            continue
        if not records:
            continue
        final = records[-1]
        if not isinstance(final, dict):
            print(
                f"warning: skipping {stats_file}: final record is not a JSON object",
                file=sys.stderr,
            )
            continue

        players = list(_dict_or_empty(final.get("player_stats")).values())
        agg = _dict_or_empty(final.get("aggregate"))

        # A player is "alive" at game end if they still have >=1 city.
        # Eliminated players persist in player_stats with cities=0 and
        # tier_peak=0, which would contaminate a naive winner-vs-loser gap, so
        # the symmetry gap is computed across alive players only.
        tier_peaks_all = [_stat(p, "tier_peak") for p in players]
        alive_tier_peaks = [
            _stat(p, "tier_peak") for p in players if _stat(p, "cities") > 0
        ]
        # Fall back to every player's peak when nobody is alive; 0 when the
        # record lists no players at all.
        winner_tp = max(alive_tier_peaks or tier_peaks_all, default=0)

        # The symmetry gap needs >=2 alive players; otherwise the game ended
        # in domination and there is nothing to compare, so record None.
        if len(alive_tier_peaks) >= 2:
            loser_tp = min(alive_tier_peaks)
            gap = winner_tp - loser_tp
        else:
            loser_tp = None
            gap = None

        results.append({
            "seed": game_dir.name.split("seed")[-1],
            "turn": final.get("turn"),
            "outcome": final.get("outcome"),
            "winner_personality": final.get("winner_personality"),
            "winner_tp": winner_tp,
            "loser_tp": loser_tp,
            "gap": gap,
            "alive_count": len(alive_tier_peaks),
            "max_peak_unit": max(
                (_stat(p, "peak_unit_tier") for p in players), default=0
            ),
            "wonders": sum(_stat(p, "wonder_count") for p in players),
            "combats": agg.get("total_combats") or 0,
        })
    return results
def report(
    results: list[dict],
    *,
    gates: dict[str, tuple[str, int]] | None = None,
) -> int:
    """Print the per-game table and the gate verdicts; return the exit code.

    Args:
        results: Per-game summary dicts with the keys ``seed``, ``turn``,
            ``outcome``, ``winner_personality``, ``winner_tp``, ``gap``,
            ``alive_count``, ``max_peak_unit``, ``wonders`` and ``combats``
            (the shape built by ``collect``).
        gates: Optional override mapping gate name to an
            ``(operator, threshold)`` pair, using the same keys as the
            module-level ``GATES`` table. When omitted, ``GATES`` is used so
            the thresholds have a single source of truth.

    Returns:
        ``1`` when *results* is empty or any measurable gate fails, else ``0``.

    Raises:
        ValueError: If a gate's operator is neither ``">="`` nor ``"<="``.
    """
    if not results:
        print("No games found.")
        return 1
    thresholds = GATES if gates is None else gates

    print(
        f'{"seed":<5}{"turn":<6}{"outcome":<10}{"winner":<13}{"w_tp":<6}{"gap":<6}{"alive":<6}{"unit":<5}{"won":<5}{"comb":<7}'
    )
    print("-" * 76)
    for r in results:
        winner = str(r["winner_personality"] or "-")[:12]
        gap_str = "" if r["gap"] is None else str(r["gap"])
        print(
            f'{r["seed"]:<5}{str(r["turn"]):<6}{str(r["outcome"]):<10}{winner:<13}'
            f'{r["winner_tp"]:<6}{gap_str:<6}{r["alive_count"]:<6}{r["max_peak_unit"]:<5}{r["wonders"]:<5}{r["combats"]:<7}'
        )

    n = len(results)
    med_w_tp = statistics.median([r["winner_tp"] for r in results])
    # The tier_peak_gap median is taken only over games where >=2 players
    # survived; sole-survivor games are domination wins with no symmetry to
    # measure and carry gap=None.
    measurable_gaps = [r["gap"] for r in results if r["gap"] is not None]
    med_gap = statistics.median(measurable_gaps) if measurable_gaps else None
    n_measurable = len(measurable_gaps)
    nu = sum(1 for r in results if r["max_peak_unit"] >= 3)
    nw = sum(1 for r in results if r["wonders"] >= 1)
    med_combats = statistics.median([r["combats"] for r in results])

    print()
    print(f"GATES (n={n}):")
    failures = 0
    # (label, measured value, key into the thresholds table, is_measurable)
    # NOTE(review): the two seed-count gates compare absolute counts that the
    # module docstring states per 10 seeds; they are not rescaled for batches
    # of another size.
    gate_rows: list[tuple[str, object, str, bool]] = [
        ("median winner_tier_peak ", med_w_tp, "median_winner_tier_peak", True),
        # Gap is None when no game had >=2 alive players at end. Treat as
        # un-measurable rather than "fail": symmetry can't be measured when
        # all games end via domination.
        ("median tier_peak_gap ", med_gap, "median_tier_peak_gap", med_gap is not None),
        ("max_peak_unit>=3 seeds ", nu, "seeds_max_peak_unit_ge3", True),
        ("wonders>=1 seeds ", nw, "seeds_with_wonder", True),
        ("median total_combats ", med_combats, "median_total_combats", True),
    ]
    for label, value, gate_key, is_measurable in gate_rows:
        if not is_measurable:
            print(f" {label} = N/A (no game had ≥2 alive players — domination only) SKIP")
            continue
        gate_op, gate_val = thresholds[gate_key]
        if gate_op == ">=":
            passing = value >= gate_val
        elif gate_op == "<=":
            passing = value <= gate_val
        else:
            # Refuse to guess: a typo in the table must not silently flip a gate.
            raise ValueError(f"unsupported gate operator {gate_op!r} for {gate_key}")
        verdict = "PASS" if passing else "FAIL"
        if not passing:
            failures += 1
        print(f" {label} = {value} (gate {gate_op}{gate_val}) {verdict}")
    print(f" (tier_peak_gap measurable on {n_measurable}/{n} games — others were domination)")
    print()

    winners: dict[str, int] = {}
    for r in results:
        w = r["winner_personality"] or "none"
        winners[w] = winners.get(w, 0) + 1
    print(f" winner distribution: {winners}")
    print(f" victories: {sum(1 for r in results if r['outcome'] == 'victory')}/{n}")
    return 1 if failures else 0
def main(argv: list[str]) -> int:
    """Run the quality-gate report for the batch directory named in *argv*.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and
            exactly one further element, the batch directory, is expected.

    Returns:
        ``2`` on a usage error or when the path is not a directory; otherwise
        the value returned by ``report`` for the collected games.
    """
    positional = argv[1:]
    if len(positional) != 1:
        print(f"Usage: {argv[0]} <batch-dir>", file=sys.stderr)
        return 2

    batch_dir = Path(positional[0])
    if not batch_dir.is_dir():
        print(f"Not a directory: {batch_dir}", file=sys.stderr)
        return 2

    return report(collect(batch_dir))
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))