magicciv/tools/tests/test_quality_metrics.py
Natalie 43989eed82 feat(@projects/@magic-civilization): add autoplay quality metrics validation
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 14:23:47 -07:00

221 lines
8.9 KiB
Python

"""p0-25 game-quality metrics round-trip tests.
Fabricates turn_stats.jsonl lines with and without the new quality-metric
fields (`tier_peak`, `peak_unit_tier`, `wonder_count`) and exercises both the
schema validator and the reporter's aggregation path. Ensures backward
compatibility with pre-p0-25 batches.
Run with:
python3 -m pytest tools/tests/test_quality_metrics.py -v
"""
from __future__ import annotations
import importlib.util
from pathlib import Path
from typing import Any
# Directory one level above the directory that contains this test file
# (``parents[0]`` is this file's own directory). The helper scripts loaded
# below are resolved relative to it.
TOOLS_DIR = Path(__file__).resolve().parents[1]
def _load_module(name: str, filename: str):
    """Import ``TOOLS_DIR / filename`` by path and return the module object.

    Loading by path lets callers import scripts whose file names are not
    valid Python identifiers (for example names containing a hyphen).

    Args:
        name: Module name to assign; also the key used in ``sys.modules``.
        filename: File name of the script, relative to ``TOOLS_DIR``.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
    """
    import sys

    path = TOOLS_DIR / filename
    spec = importlib.util.spec_from_file_location(name, path)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}")
    mod = importlib.util.module_from_spec(spec)
    # Register before executing, mirroring the importlib documentation's
    # load-by-path recipe, so code that looks the module up in ``sys.modules``
    # by name while its body runs can find it.
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module registered.
        sys.modules.pop(name, None)
        raise
    return mod
# Both tool scripts have hyphenated file names, which a regular ``import``
# statement cannot reference, so they are loaded by path when this test
# module is imported.
_validate_mod = _load_module("autoplay_validate", "autoplay-validate.py")
_report_mod = _load_module("autoplay_report", "autoplay-report.py")
def _base_player_stats(**overrides: Any) -> dict[str, Any]:
    """Build a fresh player_stats dict populated with the baseline fields.

    Any keyword argument replaces the baseline value of the same name, or is
    appended as an additional key when the baseline has no such field (this
    is how callers add ``tier_peak``, ``peak_unit_tier`` and ``wonder_count``).
    """
    defaults: dict[str, Any] = dict(
        pop=10,
        pop_peak=12,
        mil=3,
        cities=2,
        cities_captured=0,
        cities_lost=0,
        gold=100,
        gold_peak=150,
        gold_per_turn=5,
        techs=8,
        tiles=20,
        buildings=6,
        luxuries=1,
        happiness=5,
        food_total=12.0,
        production_total=8.0,
        kills=4,
        units_lost=2,
        turn_first_pop_3=10,
        turn_first_pop_4=20,
    )
    # Later mapping wins, so overrides take precedence over the defaults.
    return {**defaults, **overrides}
def _base_turn_line(
    *,
    turn: int = 200,
    winner_index: int = 0,
    outcome: str = "victory",
    p0_stats: dict[str, Any] | None = None,
    p1_stats: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Fabricate one parsed turn-stats line for a two-player game.

    Args:
        turn: Value stored under ``"turn"``.
        winner_index: Value stored under ``"winner_index"``; a negative value
            yields an empty ``"winner_personality"``.
        outcome: Value stored under ``"outcome"``; anything other than
            ``"victory"`` yields an empty ``"victory_type"``.
        p0_stats: Stats dict for player ``"0"``; defaults to
            ``_base_player_stats()`` when ``None``.
        p1_stats: Stats dict for player ``"1"``; defaults to
            ``_base_player_stats()`` when ``None``.

    Returns:
        The fabricated line as a plain dict.
    """
    personality = "ironhold" if winner_index >= 0 else ""
    victory_type = "domination" if outcome == "victory" else ""

    # Only build default stats for players the caller did not supply.
    if p0_stats is None:
        p0_stats = _base_player_stats()
    if p1_stats is None:
        p1_stats = _base_player_stats()

    aggregate = {
        "total_combats": 15,
        "total_cities_founded": 4,
        "total_cities_captured": 1,
        "turn_first_combat": 30,
        "turn_first_city_captured": 180,
    }
    line: dict[str, Any] = {
        "turn": turn,
        "outcome": outcome,
        "winner_index": winner_index,
        "winner_personality": personality,
        "victory_type": victory_type,
        "wall_clock_sec": 12.5,
        "aggregate": aggregate,
        "player_stats": {"0": p0_stats, "1": p1_stats},
        "invariant_violations": [],
    }
    return line
# -----------------------------------------------------------------------------
# Schema tests — both new (with quality fields) and old (without) must pass.
# -----------------------------------------------------------------------------
def test_schema_accepts_new_jsonl_with_quality_fields() -> None:
    """A line whose players carry all three quality fields validates cleanly."""
    turn_stats_schema = _validate_mod.load_schema("turn-stats-line")
    candidate = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(tier_peak=6, peak_unit_tier=5, wonder_count=2),
        p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
    )
    problems = _validate_mod.validate(candidate, turn_stats_schema)
    assert problems == [], f"new jsonl failed validation: {problems}"
def test_schema_accepts_old_jsonl_without_quality_fields() -> None:
    """Lines from pre-p0-25 batches, which lack the quality fields, still validate."""
    turn_stats_schema = _validate_mod.load_schema("turn-stats-line")
    # The default player stats carry none of tier_peak / peak_unit_tier /
    # wonder_count, so this line has the old shape.
    legacy_line = _base_turn_line()
    problems = _validate_mod.validate(legacy_line, turn_stats_schema)
    assert problems == [], f"old jsonl failed validation: {problems}"
def test_schema_rejects_tier_peak_above_max() -> None:
    """Out-of-range tier_peak values are reported by the validator.

    tier_peak is capped at 10 (per the CLAUDE.md 10-era scale), so 11 must
    produce a "maximum" error; the lower bound is exercised too, with -1
    expected to produce a "minimum" error.
    """
    turn_stats_schema = _validate_mod.load_schema("turn-stats-line")
    # (out-of-range tier_peak, keyword expected in at least one error message)
    out_of_range_cases = ((11, "maximum"), (-1, "minimum"))
    for bad_tier_peak, keyword in out_of_range_cases:
        player0 = _base_player_stats(
            tier_peak=bad_tier_peak, peak_unit_tier=5, wonder_count=0
        )
        found = _validate_mod.validate(
            _base_turn_line(p0_stats=player0), turn_stats_schema
        )
        assert any(keyword in message for message in found), (
            f"expected {keyword} violation for tier_peak={bad_tier_peak}, got: {found}"
        )
# -----------------------------------------------------------------------------
# Reporter round-trip — fields surface correctly through extract_row + medians.
# -----------------------------------------------------------------------------
def test_reporter_extracts_new_quality_fields() -> None:
    """extract_row surfaces every quality field for both players.

    All six ``p<N>_<field>`` columns are checked, so a regression in any
    single player/field combination is caught rather than only a sample.
    """
    expected = {
        "p0": {"tier_peak": 7, "peak_unit_tier": 4, "wonder_count": 3},
        "p1": {"tier_peak": 4, "peak_unit_tier": 3, "wonder_count": 1},
    }
    line = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(**expected["p0"]),
        p1_stats=_base_player_stats(**expected["p1"]),
    )
    row = _report_mod.extract_row(seed=1, data=line, event_counts={})
    for prefix, fields in expected.items():
        for field, value in fields.items():
            column = f"{prefix}_{field}"
            assert row[column] == value, (
                f"{column}: expected {value}, got {row[column]}"
            )
def test_reporter_emits_absent_sentinel_for_old_jsonl() -> None:
    """Old jsonl lacking quality fields surfaces the absent sentinel everywhere.

    Every ``p<N>_<field>`` quality column of both players must equal
    ``QUALITY_METRIC_ABSENT`` when the input line carries none of the fields.
    """
    line = _base_turn_line()  # default stats omit all three quality fields
    row = _report_mod.extract_row(seed=1, data=line, event_counts={})
    absent = _report_mod.QUALITY_METRIC_ABSENT
    for prefix in ("p0", "p1"):
        for field in ("tier_peak", "peak_unit_tier", "wonder_count"):
            column = f"{prefix}_{field}"
            assert row[column] == absent, (
                f"{column}: expected absent sentinel {absent}, got {row[column]}"
            )
def test_reporter_computes_quality_medians() -> None:
    """Three fabricated games yield the hand-computed quality medians."""
    field_names = ("tier_peak", "peak_unit_tier", "wonder_count")
    # (seed, winner_index, p0 values, p1 values); values follow field_names.
    games = (
        (1, 0, (7, 5, 3), (4, 2, 0)),  # p0 wins: winner_tp=7, loser_tp=4, gap=+3
        (2, 1, (3, 1, 0), (8, 6, 4)),  # p1 wins: winner_tp=8, loser_tp=3, gap=+5
        (3, 0, (6, 4, 2), (5, 3, 1)),  # p0 wins: winner_tp=6, loser_tp=5, gap=+1
    )
    rows = [
        _report_mod.extract_row(
            seed=seed,
            data=_base_turn_line(
                winner_index=winner,
                p0_stats=_base_player_stats(**dict(zip(field_names, p0_values))),
                p1_stats=_base_player_stats(**dict(zip(field_names, p1_values))),
            ),
            event_counts={},
        )
        for seed, winner, p0_values, p1_values in games
    ]
    metrics = _report_mod.build_quality_metrics(rows)
    expected_medians = {
        # winner tiers: [7, 8, 6] -> median 7
        "median_winner_tier_peak": 7.0,
        # loser tiers: [4, 3, 5] -> median 4
        "median_loser_tier_peak": 4.0,
        # per-game gaps: [3, 5, 1] -> median 3
        "median_tier_peak_gap": 3.0,
        # peak_unit_tier (all 6 players): [5, 2, 1, 6, 4, 3] -> median 3.5
        "median_peak_unit_tier": 3.5,
        # wonder_count (all 6 players): [3, 0, 0, 4, 2, 1] -> median 1.5
        "median_wonder_count_per_player": 1.5,
    }
    for metric, want in expected_medians.items():
        assert metrics[metric] == want, metric
def test_reporter_quality_medians_skip_old_jsonl() -> None:
    """A batch with no quality fields at all yields None medians, not a crash."""
    legacy_lines = (
        (1, _base_turn_line()),
        (2, _base_turn_line(winner_index=1)),
    )
    rows = [
        _report_mod.extract_row(seed=seed, data=line, event_counts={})
        for seed, line in legacy_lines
    ]
    metrics = _report_mod.build_quality_metrics(rows)
    for metric in (
        "median_winner_tier_peak",
        "median_loser_tier_peak",
        "median_tier_peak_gap",
        "median_peak_unit_tier",
        "median_wonder_count_per_player",
    ):
        assert metrics[metric] is None, metric
def test_reporter_quality_medians_mixed_batch() -> None:
    """A batch mixing new-format and old-format games aggregates cleanly."""
    # New-format game: both players carry the quality fields, p0 wins.
    new_format_row = _report_mod.extract_row(
        seed=1,
        data=_base_turn_line(
            winner_index=0,
            p0_stats=_base_player_stats(tier_peak=5, peak_unit_tier=4, wonder_count=2),
            p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
        ),
        event_counts={},
    )
    # Old-format game: quality fields absent for both players.
    old_format_row = _report_mod.extract_row(
        seed=2,
        data=_base_turn_line(winner_index=1),
        event_counts={},
    )
    metrics = _report_mod.build_quality_metrics([new_format_row, old_format_row])
    # Only the new-format game contributes, so the medians reflect game 1 alone.
    expected_medians = {
        "median_winner_tier_peak": 5.0,
        "median_loser_tier_peak": 3.0,
        "median_tier_peak_gap": 2.0,
        "median_peak_unit_tier": 3.0,  # median of [4, 2]
        "median_wonder_count_per_player": 1.0,  # median of [2, 0]
    }
    for metric, want in expected_medians.items():
        assert metrics[metric] == want, metric