170 lines
6.5 KiB
Python
Executable file
170 lines
6.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""quality-gates-report.py — evaluate p0-01 calibrated quality sub-gates against an autoplay batch.
|
|
|
|
Usage:
|
|
python3 tools/quality-gates-report.py <batch-dir>
|
|
|
|
Gates (from .project/team-leads/warcouncil.md and p0-01-mcts-wiring.md:33-38):
|
|
- median winner_tier_peak >= 4
|
|
- median tier_peak_gap <= 4
|
|
- >=1 player peak_unit_tier >= 3 in >=7/10 seeds
|
|
- wonder_count >= 1 in >=5/10 seeds
|
|
- total_combats >= 20 median
|
|
|
|
Exit code: 0 if all gates pass, 1 if any fail.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import statistics
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Threshold table for the quality sub-gates.
# Maps a metric name to (comparison operator, threshold); the metric passes
# when `metric <operator> threshold` holds.
GATES: dict[str, tuple[str, int]] = {
    # Median, across games, of the winner's tier peak.
    "median_winner_tier_peak": (">=", 4),
    # Median, across games, of the winner-vs-weakest tier-peak gap.
    "median_tier_peak_gap": ("<=", 4),
    # Number of seeds in which at least one player reached unit tier 3.
    "seeds_max_peak_unit_ge3": (">=", 7),
    # Number of seeds in which at least one wonder was counted.
    "seeds_with_wonder": (">=", 5),
    # Median, across games, of the total combat count.
    "median_total_combats": (">=", 20),
}
|
|
|
|
|
|
def _summarise_final_record(game_name: str, final: dict) -> dict:
    """Reduce the last turn-stats record of one game to a flat summary row."""
    player_stats = final.get("player_stats") or {}
    aggregate = final.get("aggregate") or {}
    players = list(player_stats.values())

    # A player is "alive" at game end if they still have >=1 city.
    # Eliminated players persist in player_stats with cities=0 and
    # tier_peak=0, which contaminates a naive winner-vs-loser gap
    # computation. Compute the symmetry gap only across alive players
    # (warcouncil quality metric set per .project/team-leads/warcouncil.md:30-34
    # measures *symmetry at game end*, not victory margin against
    # eliminated foes).
    all_peaks = [p.get("tier_peak", 0) for p in players]
    alive_peaks = [p.get("tier_peak", 0) for p in players if p.get("cities", 0) > 0]

    if alive_peaks:
        winner_tp = max(alive_peaks)
    elif all_peaks:
        # Nobody holds a city: fall back to the best peak among all players.
        winner_tp = max(all_peaks)
    else:
        winner_tp = 0

    # Symmetry gap requires >=2 alive players. Otherwise the game ended in
    # domination — no meaningful symmetry to measure; record gap=None.
    if len(alive_peaks) >= 2:
        # winner_tp is max(alive_peaks) here, so the weakest survivor is
        # simply the minimum (equal to the winner when all peaks tie).
        loser_tp = min(alive_peaks)
        gap = winner_tp - loser_tp
    else:
        loser_tp = None
        gap = None

    peak_units = [p.get("peak_unit_tier", 0) for p in players]
    return {
        "seed": game_name.split("seed")[-1],
        "turn": final.get("turn"),
        "outcome": final.get("outcome"),
        "winner_personality": final.get("winner_personality"),
        "winner_tp": winner_tp,
        "loser_tp": loser_tp,
        "gap": gap,
        "alive_count": len(alive_peaks),
        "max_peak_unit": max(peak_units) if peak_units else 0,
        "wonders": sum(p.get("wonder_count", 0) for p in players),
        "combats": aggregate.get("total_combats", 0),
    }


def collect(batch_dir: Path) -> list[dict]:
    """Summarise the final turn of every game found in an autoplay batch.

    Scans *batch_dir* for sub-directories whose name starts with ``game_``,
    reads each one's ``turn_stats.jsonl`` (one JSON document per non-blank
    line) and reduces the last record to a summary row.

    Args:
        batch_dir: Directory containing the ``game_*`` sub-directories.

    Returns:
        One dict per usable game, ordered by directory name, with the keys
        ``seed``, ``turn``, ``outcome``, ``winner_personality``,
        ``winner_tp``, ``loser_tp``, ``gap``, ``alive_count``,
        ``max_peak_unit``, ``wonders`` and ``combats``. ``loser_tp`` and
        ``gap`` are ``None`` when fewer than two players still own a city.

    Games without a stats file, or with an empty one, are skipped quietly.
    Games whose stats file cannot be read or parsed, or whose last record is
    not a JSON object, are skipped with a warning on stderr so that a
    shrunken batch does not go unnoticed.
    """
    results: list[dict] = []
    for game_dir in sorted(batch_dir.iterdir(), key=lambda p: p.name):
        if not game_dir.is_dir() or not game_dir.name.startswith("game_"):
            continue
        stats_file = game_dir / "turn_stats.jsonl"
        if not stats_file.exists():
            continue
        try:
            # JSON text is UTF-8; do not depend on the locale's encoding.
            text = stats_file.read_text(encoding="utf-8")
            records = [json.loads(line) for line in text.splitlines() if line.strip()]
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"warning: skipping {stats_file}: {exc}", file=sys.stderr)
            continue
        if not records:
            continue
        final = records[-1]
        if not isinstance(final, dict):
            print(
                f"warning: skipping {stats_file}: last record is not a JSON object",
                file=sys.stderr,
            )
            continue
        results.append(_summarise_final_record(game_dir.name, final))
    return results
|
|
|
|
|
|
def report(results: list[dict], gates: dict[str, tuple[str, int]] | None = None) -> int:
    """Print the per-game table and the gate verdicts; return the exit status.

    Args:
        results: Summary rows with the keys ``seed``, ``turn``, ``outcome``,
            ``winner_personality``, ``winner_tp``, ``gap``, ``alive_count``,
            ``max_peak_unit``, ``wonders`` and ``combats``. ``gap`` may be
            ``None`` for games that ended with fewer than two players alive.
        gates: Optional threshold table mapping each metric name to
            ``(operator, threshold)`` where operator is ``">="`` or ``"<="``.
            Defaults to the module-level ``GATES`` so the thresholds are
            defined in exactly one place.

    Returns:
        0 when every evaluated gate passes, 1 when any gate fails or when
        *results* is empty.

    Raises:
        KeyError: If *gates* lacks one of the five metric names.
        ValueError: If a gate uses an operator other than ``">="``/``"<="``.
    """
    if not results:
        print("No games found.")
        return 1
    if gates is None:
        gates = GATES

    print(
        f'{"seed":<5}{"turn":<6}{"outcome":<10}{"winner":<13}{"w_tp":<6}{"gap":<6}{"alive":<6}{"unit":<5}{"won":<5}{"comb":<7}'
    )
    print("-" * 76)
    for r in results:
        winner = str(r["winner_personality"] or "-")[:12]
        gap_str = "—" if r["gap"] is None else str(r["gap"])
        print(
            f'{r["seed"]:<5}{str(r["turn"]):<6}{str(r["outcome"]):<10}{winner:<13}'
            f'{r["winner_tp"]:<6}{gap_str:<6}{r["alive_count"]:<6}{r["max_peak_unit"]:<5}{r["wonders"]:<5}{r["combats"]:<7}'
        )

    n = len(results)
    med_w_tp = statistics.median([r["winner_tp"] for r in results])
    # tier_peak_gap median computed only over games where >=2 players survived
    # (sole-survivor games are domination wins — no meaningful symmetry).
    measurable_gaps = [r["gap"] for r in results if r["gap"] is not None]
    med_gap = statistics.median(measurable_gaps) if measurable_gaps else None
    n_measurable = len(measurable_gaps)
    nu = sum(1 for r in results if r["max_peak_unit"] >= 3)
    nw = sum(1 for r in results if r["wonders"] >= 1)
    med_combats = statistics.median([r["combats"] for r in results])

    print()
    print(f"GATES (n={n}):")
    failures = 0
    # (display label, key into the gate table, measured value)
    gate_rows = [
        ("median winner_tier_peak ", "median_winner_tier_peak", med_w_tp),
        ("median tier_peak_gap ", "median_tier_peak_gap", med_gap),
        ("max_peak_unit>=3 seeds ", "seeds_max_peak_unit_ge3", nu),
        ("wonders>=1 seeds ", "seeds_with_wonder", nw),
        ("median total_combats ", "median_total_combats", med_combats),
    ]
    for label, gate_key, value in gate_rows:
        gate_op, gate_val = gates[gate_key]
        if value is None:
            # Only the gap median can be None: no game had >=2 alive players
            # at the end. Treat as un-measurable rather than "fail": symmetry
            # can't be measured when all games end via domination.
            print(f" {label} = N/A (no game had ≥2 alive players — domination only) SKIP")
            continue
        if gate_op == ">=":
            passing = value >= gate_val
        elif gate_op == "<=":
            passing = value <= gate_val
        else:
            raise ValueError(f"unsupported gate operator: {gate_op!r}")
        verdict = "PASS" if passing else "FAIL"
        if not passing:
            failures += 1
        print(f" {label} = {value} (gate {gate_op}{gate_val}) {verdict}")
    print(f" (tier_peak_gap measurable on {n_measurable}/{n} games — others were domination)")

    print()
    winners: dict[str, int] = {}
    for r in results:
        w = r["winner_personality"] or "none"
        winners[w] = winners.get(w, 0) + 1
    print(f" winner distribution: {winners}")
    print(f" victories: {sum(1 for r in results if r['outcome'] == 'victory')}/{n}")

    return 1 if failures else 0
|
|
|
|
|
|
def main(argv: list[str]) -> int:
    """Run the report for the batch directory named on the command line.

    Args:
        argv: Full argument vector: program name followed by exactly one
            batch-directory path.

    Returns:
        2 when the argument count is wrong or the path is not a directory
        (a message is written to stderr); otherwise whatever ``report``
        returns for the collected games.
    """
    if len(argv) == 2:
        batch_dir = Path(argv[1])
        if batch_dir.is_dir():
            return report(collect(batch_dir))
        print(f"Not a directory: {batch_dir}", file=sys.stderr)
    else:
        print(f"Usage: {argv[0]} <batch-dir>", file=sys.stderr)
    # Both failure paths share the usage-error status.
    return 2
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
|