"""Round-trip tests for the p0-25 game-quality metrics.

Builds synthetic turn_stats.jsonl lines, both with and without the new
quality-metric fields (`tier_peak`, `peak_unit_tier`, `wonder_count`), and
feeds them to the schema validator and to the reporter's aggregation path.
Batches written before p0-25 (which lack those fields) must keep working.

Run with:
    python3 -m pytest tools/tests/test_quality_metrics.py -v
"""

from __future__ import annotations

import importlib.util
from pathlib import Path
from typing import Any

TOOLS_DIR = Path(__file__).resolve().parents[1]


def _load_module(name: str, filename: str):
    """Import ``TOOLS_DIR / filename`` as a module object named ``name``.

    Loading by file path lets the tests import tool scripts whose filenames
    contain hyphens and therefore cannot be reached with a plain ``import``.
    """
    spec = importlib.util.spec_from_file_location(name, TOOLS_DIR / filename)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


_validate_mod = _load_module("autoplay_validate", "autoplay-validate.py")
_report_mod = _load_module("autoplay_report", "autoplay-report.py")


def _base_player_stats(**overrides: Any) -> dict[str, Any]:
    """Build a player_stats dict with every schema-required field filled in.

    Keyword arguments replace defaults of the same name; unknown names (such
    as the quality-metric fields) are appended as extra keys.
    """
    defaults: dict[str, Any] = {
        "pop": 10,
        "pop_peak": 12,
        "mil": 3,
        "cities": 2,
        "cities_captured": 0,
        "cities_lost": 0,
        "gold": 100,
        "gold_peak": 150,
        "gold_per_turn": 5,
        "techs": 8,
        "tiles": 20,
        "buildings": 6,
        "luxuries": 1,
        "happiness": 5,
        "food_total": 12.0,
        "production_total": 8.0,
        "kills": 4,
        "units_lost": 2,
        "turn_first_pop_3": 10,
        "turn_first_pop_4": 20,
    }
    return {**defaults, **overrides}


def _base_turn_line(
    *,
    turn: int = 200,
    winner_index: int = 0,
    outcome: str = "victory",
    p0_stats: dict[str, Any] | None = None,
    p1_stats: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build one two-player turn_stats.jsonl record.

    A player whose stats are not supplied gets a fresh ``_base_player_stats()``
    dict, i.e. one without any quality-metric fields.
    """
    if p0_stats is None:
        p0_stats = _base_player_stats()
    if p1_stats is None:
        p1_stats = _base_player_stats()

    has_winner = winner_index >= 0
    is_victory = outcome == "victory"

    aggregate = {
        "total_combats": 15,
        "total_cities_founded": 4,
        "total_cities_captured": 1,
        "turn_first_combat": 30,
        "turn_first_city_captured": 180,
    }
    return {
        "turn": turn,
        "outcome": outcome,
        "winner_index": winner_index,
        "winner_personality": "ironhold" if has_winner else "",
        "victory_type": "domination" if is_victory else "",
        "wall_clock_sec": 12.5,
        "aggregate": aggregate,
        "player_stats": {"0": p0_stats, "1": p1_stats},
        "invariant_violations": [],
    }


# -----------------------------------------------------------------------------
# Schema tests — both new (with quality fields) and old (without) must pass.
# -----------------------------------------------------------------------------


def test_schema_accepts_new_jsonl_with_quality_fields() -> None:
    """A line carrying all three quality fields produces no validation errors."""
    schema = _validate_mod.load_schema("turn-stats-line")
    line = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(tier_peak=6, peak_unit_tier=5, wonder_count=2),
        p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
    )
    errs = _validate_mod.validate(line, schema)
    assert errs == [], f"new jsonl failed validation: {errs}"


def test_schema_accepts_old_jsonl_without_quality_fields() -> None:
    """Pre-p0-25 lines (no tier_peak/peak_unit_tier/wonder_count) still validate."""
    schema = _validate_mod.load_schema("turn-stats-line")
    # The default fixture leaves out the three new fields entirely.
    errs = _validate_mod.validate(_base_turn_line(), schema)
    assert errs == [], f"old jsonl failed validation: {errs}"


def test_schema_rejects_tier_peak_above_max() -> None:
    """Out-of-range tier_peak values are rejected by the schema.

    tier_peak is capped at 10 per the CLAUDE.md 10-era scale, so 11 must
    report a "maximum" violation; -1 must report a "minimum" violation.
    """
    schema = _validate_mod.load_schema("turn-stats-line")

    def _errors_for(tier_peak: int):
        # Only player 0 gets the suspect value; everything else stays valid.
        stats = _base_player_stats(
            tier_peak=tier_peak, peak_unit_tier=5, wonder_count=0
        )
        return _validate_mod.validate(_base_turn_line(p0_stats=stats), schema)

    errs_hi = _errors_for(11)
    assert any("maximum" in e for e in errs_hi), (
        f"expected maximum violation for tier_peak=11, got: {errs_hi}"
    )

    errs_neg = _errors_for(-1)
    assert any("minimum" in e for e in errs_neg), (
        f"expected minimum violation for tier_peak=-1, got: {errs_neg}"
    )
# -----------------------------------------------------------------------------
# Reporter round-trip — fields surface correctly through extract_row + medians.
# -----------------------------------------------------------------------------


def test_reporter_extracts_new_quality_fields() -> None:
    """extract_row copies per-player quality fields into ``p<N>_<field>`` columns."""
    line = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(tier_peak=7, peak_unit_tier=4, wonder_count=3),
        p1_stats=_base_player_stats(tier_peak=4, peak_unit_tier=3, wonder_count=1),
    )
    row = _report_mod.extract_row(seed=1, data=line, event_counts={})

    expected = {
        "p0_tier_peak": 7,
        "p1_tier_peak": 4,
        "p0_peak_unit_tier": 4,
        "p1_wonder_count": 1,
    }
    for column, value in expected.items():
        assert row[column] == value


def test_reporter_emits_absent_sentinel_for_old_jsonl() -> None:
    """Old jsonl lacking quality fields surfaces the QUALITY_METRIC_ABSENT sentinel."""
    row = _report_mod.extract_row(seed=1, data=_base_turn_line(), event_counts={})

    for column in ("p0_tier_peak", "p1_peak_unit_tier", "p0_wonder_count"):
        assert row[column] == _report_mod.QUALITY_METRIC_ABSENT


def test_reporter_computes_quality_medians() -> None:
    """Medians over a three-game batch match the hand-computed values."""
    # (seed, winner_index, p0 quality fields, p1 quality fields)
    games = [
        # Game 1: p0 wins, winner_tp=7, loser_tp=4, gap=+3
        (
            1,
            0,
            {"tier_peak": 7, "peak_unit_tier": 5, "wonder_count": 3},
            {"tier_peak": 4, "peak_unit_tier": 2, "wonder_count": 0},
        ),
        # Game 2: p1 wins, winner_tp=8, loser_tp=3, gap=+5
        (
            2,
            1,
            {"tier_peak": 3, "peak_unit_tier": 1, "wonder_count": 0},
            {"tier_peak": 8, "peak_unit_tier": 6, "wonder_count": 4},
        ),
        # Game 3: p0 wins, winner_tp=6, loser_tp=5, gap=+1
        (
            3,
            0,
            {"tier_peak": 6, "peak_unit_tier": 4, "wonder_count": 2},
            {"tier_peak": 5, "peak_unit_tier": 3, "wonder_count": 1},
        ),
    ]
    rows = [
        _report_mod.extract_row(
            seed=seed,
            data=_base_turn_line(
                winner_index=winner,
                p0_stats=_base_player_stats(**p0_quality),
                p1_stats=_base_player_stats(**p1_quality),
            ),
            event_counts={},
        )
        for seed, winner, p0_quality, p1_quality in games
    ]

    q = _report_mod.build_quality_metrics(rows)

    # winner tiers: [7, 8, 6] → median 7
    assert q["median_winner_tier_peak"] == 7.0
    # loser tiers: [4, 3, 5] → median 4
    assert q["median_loser_tier_peak"] == 4.0
    # per-game gaps: [3, 5, 1] → median 3
    assert q["median_tier_peak_gap"] == 3.0
    # peak_unit_tier (all 6 players): [5,2,1,6,4,3] → median 3.5
    assert q["median_peak_unit_tier"] == 3.5
    # wonder_count (all 6 players): [3,0,0,4,2,1] → median 1.5
    assert q["median_wonder_count_per_player"] == 1.5


def test_reporter_quality_medians_skip_old_jsonl() -> None:
    """A batch with no quality fields yields None for every median — no crash."""
    old_lines = [(1, _base_turn_line()), (2, _base_turn_line(winner_index=1))]
    rows = [
        _report_mod.extract_row(seed=seed, data=line, event_counts={})
        for seed, line in old_lines
    ]

    q = _report_mod.build_quality_metrics(rows)

    for metric in (
        "median_winner_tier_peak",
        "median_loser_tier_peak",
        "median_tier_peak_gap",
        "median_peak_unit_tier",
        "median_wonder_count_per_player",
    ):
        assert q[metric] is None


def test_reporter_quality_medians_mixed_batch() -> None:
    """A batch mixing new-format and old-format games aggregates cleanly."""
    # One new-format game with quality fields on both players.
    new_format_row = _report_mod.extract_row(
        seed=1,
        data=_base_turn_line(
            winner_index=0,
            p0_stats=_base_player_stats(tier_peak=5, peak_unit_tier=4, wonder_count=2),
            p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
        ),
        event_counts={},
    )
    # One old-format game — fields absent.
    old_format_row = _report_mod.extract_row(
        seed=2,
        data=_base_turn_line(winner_index=1),
        event_counts={},
    )

    q = _report_mod.build_quality_metrics([new_format_row, old_format_row])

    # Only the new-format game contributes; medians reflect game 1 only.
    assert q["median_winner_tier_peak"] == 5.0
    assert q["median_loser_tier_peak"] == 3.0
    assert q["median_tier_peak_gap"] == 2.0
    assert q["median_peak_unit_tier"] == 3.0  # median of [4, 2]
    assert q["median_wonder_count_per_player"] == 1.0  # median of [2, 0]