221 lines
8.9 KiB
Python
221 lines
8.9 KiB
Python
"""p0-25 game-quality metrics round-trip tests.

Fabricates turn_stats.jsonl lines with and without the new quality-metric
fields (`tier_peak`, `peak_unit_tier`, `wonder_count`) and exercises both the
schema validator and the reporter's aggregation path. Ensures backward
compatibility with pre-p0-25 batches.

Run with:
    python3 -m pytest tools/tests/test_quality_metrics.py -v
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Parent of the directory holding this file (``tools/`` when the file sits at
# tools/tests/, as in the run command in the module docstring). _load_module
# resolves the tool scripts relative to this directory.
TOOLS_DIR = Path(__file__).resolve().parents[1]
|
|
|
|
|
|
def _load_module(name: str, filename: str):
    """Load a script from ``TOOLS_DIR`` by file path and return it as a module.

    The scripts loaded through this helper have hyphenated filenames
    (e.g. ``autoplay-validate.py``), which an ``import`` statement cannot
    name, so they are loaded from their path instead.

    Args:
        name: Module name to give the loaded module.
        filename: File name of the script, relative to ``TOOLS_DIR``.

    Returns:
        The module object after its top-level code has been executed.

    Raises:
        ImportError: If no import spec (or no loader) can be built for the
            path.
    """
    path = TOOLS_DIR / filename
    spec = importlib.util.spec_from_file_location(name, path)
    # Raise explicitly instead of using ``assert``: the check survives
    # ``python -O`` and the error names the file that could not be loaded.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"cannot build an import spec for {path}", name=name, path=str(path)
        )
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
|
|
|
|
|
|
# The tool scripts have hyphenated filenames, which a plain ``import``
# statement cannot name, so each is loaded by path under an importable alias.
_validate_mod = _load_module(name="autoplay_validate", filename="autoplay-validate.py")
_report_mod = _load_module(name="autoplay_report", filename="autoplay-report.py")
|
|
|
|
|
|
def _base_player_stats(**overrides: Any) -> dict[str, Any]:
    """Build one player's stats entry, with keyword overrides layered on top.

    The defaults are meant to cover every player field the schema requires.
    They leave out ``tier_peak``, ``peak_unit_tier`` and ``wonder_count``, so
    a bare call produces an old-format entry; pass those as keywords to get a
    new-format one.

    Args:
        **overrides: Fields to replace or add; added keys follow the defaults.

    Returns:
        A fresh dict on every call.
    """
    defaults: dict[str, Any] = {
        # population / military
        "pop": 10,
        "pop_peak": 12,
        "mil": 3,
        # cities
        "cities": 2,
        "cities_captured": 0,
        "cities_lost": 0,
        # economy
        "gold": 100,
        "gold_peak": 150,
        "gold_per_turn": 5,
        # development
        "techs": 8,
        "tiles": 20,
        "buildings": 6,
        "luxuries": 1,
        "happiness": 5,
        "food_total": 12.0,
        "production_total": 8.0,
        # combat
        "kills": 4,
        "units_lost": 2,
        # milestones
        "turn_first_pop_3": 10,
        "turn_first_pop_4": 20,
    }
    return {**defaults, **overrides}
|
|
|
|
|
|
def _base_turn_line(
    *,
    turn: int = 200,
    winner_index: int = 0,
    outcome: str = "victory",
    p0_stats: dict[str, Any] | None = None,
    p1_stats: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Fabricate one parsed turn_stats.jsonl line for a two-player game.

    All arguments are keyword-only.

    Args:
        turn: Stored under ``"turn"``.
        winner_index: Stored under ``"winner_index"``; a negative value
            leaves ``"winner_personality"`` empty.
        outcome: Stored under ``"outcome"``; any value other than
            ``"victory"`` leaves ``"victory_type"`` empty.
        p0_stats: Stats dict for player ``"0"``; defaults to a fresh
            ``_base_player_stats()``.
        p1_stats: Stats dict for player ``"1"``; defaults to a fresh
            ``_base_player_stats()``.

    Returns:
        The fabricated line as a dict.
    """
    if p0_stats is None:
        p0_stats = _base_player_stats()
    if p1_stats is None:
        p1_stats = _base_player_stats()

    has_winner = winner_index >= 0
    is_victory = outcome == "victory"

    aggregate = {
        "total_combats": 15,
        "total_cities_founded": 4,
        "total_cities_captured": 1,
        "turn_first_combat": 30,
        "turn_first_city_captured": 180,
    }
    return {
        "turn": turn,
        "outcome": outcome,
        "winner_index": winner_index,
        "winner_personality": "ironhold" if has_winner else "",
        "victory_type": "domination" if is_victory else "",
        "wall_clock_sec": 12.5,
        "aggregate": aggregate,
        "player_stats": {"0": p0_stats, "1": p1_stats},
        "invariant_violations": [],
    }
|
|
|
|
|
|
# -----------------------------------------------------------------------------
# Schema tests — both new (with quality fields) and old (without) must pass.
# -----------------------------------------------------------------------------
|
|
|
|
def test_schema_accepts_new_jsonl_with_quality_fields() -> None:
    """A line whose players carry all three quality fields validates cleanly."""
    line = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(tier_peak=6, peak_unit_tier=5, wonder_count=2),
        p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
    )
    errs = _validate_mod.validate(line, _validate_mod.load_schema("turn-stats-line"))
    assert errs == [], f"new jsonl failed validation: {errs}"
|
|
|
|
|
|
def test_schema_accepts_old_jsonl_without_quality_fields() -> None:
    """Pre-p0-25 lines (no tier_peak/peak_unit_tier/wonder_count) still validate."""
    # Default player stats omit the three quality fields, i.e. the old format.
    old_format_line = _base_turn_line()
    errs = _validate_mod.validate(
        old_format_line, _validate_mod.load_schema("turn-stats-line")
    )
    assert errs == [], f"old jsonl failed validation: {errs}"
|
|
|
|
|
|
def test_schema_rejects_tier_peak_above_max() -> None:
    """Out-of-range tier_peak values trip the schema's bound checks.

    Despite the name, both ends are covered: 11 must produce an error
    mentioning "maximum" and -1 one mentioning "minimum". (The original note
    attributes the cap of 10 to the 10-era scale described in CLAUDE.md.)
    """
    schema = _validate_mod.load_schema("turn-stats-line")
    for bad_value, bound in ((11, "maximum"), (-1, "minimum")):
        stats = _base_player_stats(
            tier_peak=bad_value, peak_unit_tier=5, wonder_count=0
        )
        errs = _validate_mod.validate(_base_turn_line(p0_stats=stats), schema)
        assert any(bound in e for e in errs), (
            f"expected {bound} violation for tier_peak={bad_value}, got: {errs}"
        )
|
|
|
|
|
|
# -----------------------------------------------------------------------------
# Reporter round-trip — fields surface correctly through extract_row + medians.
# -----------------------------------------------------------------------------
|
|
|
|
def test_reporter_extracts_new_quality_fields() -> None:
    """extract_row surfaces each quality field for both players as ``p{N}_{field}``.

    All six player/field combinations are checked, so a field dropped for
    only one of the players cannot slip through.
    """
    expected = {
        "0": {"tier_peak": 7, "peak_unit_tier": 4, "wonder_count": 3},
        "1": {"tier_peak": 4, "peak_unit_tier": 3, "wonder_count": 1},
    }
    line = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(**expected["0"]),
        p1_stats=_base_player_stats(**expected["1"]),
    )
    row = _report_mod.extract_row(seed=1, data=line, event_counts={})
    for player, fields in expected.items():
        for field, value in fields.items():
            key = f"p{player}_{field}"
            assert row[key] == value, f"{key}: expected {value}, got {row[key]}"
|
|
|
|
|
|
def test_reporter_emits_absent_sentinel_for_old_jsonl() -> None:
    """Old jsonl lacking quality fields surfaces the absent sentinel everywhere.

    Every ``p{N}_{field}`` column for both players must equal
    ``QUALITY_METRIC_ABSENT`` (described as -1 in the original note).
    """
    row = _report_mod.extract_row(seed=1, data=_base_turn_line(), event_counts={})
    absent = _report_mod.QUALITY_METRIC_ABSENT
    for player in ("p0", "p1"):
        for field in ("tier_peak", "peak_unit_tier", "wonder_count"):
            key = f"{player}_{field}"
            assert row[key] == absent, (
                f"{key}: expected absent sentinel {absent}, got {row[key]}"
            )
|
|
|
|
|
|
def test_reporter_computes_quality_medians() -> None:
    """Three new-format games: every quality median matches the hand-computed value."""
    fields = ("tier_peak", "peak_unit_tier", "wonder_count")
    # (seed, winner_index, p0 values, p1 values), values ordered as `fields`.
    games = (
        (1, 0, (7, 5, 3), (4, 2, 0)),  # p0 wins: winner_tp=7, loser_tp=4, gap=+3
        (2, 1, (3, 1, 0), (8, 6, 4)),  # p1 wins: winner_tp=8, loser_tp=3, gap=+5
        (3, 0, (6, 4, 2), (5, 3, 1)),  # p0 wins: winner_tp=6, loser_tp=5, gap=+1
    )
    rows = [
        _report_mod.extract_row(
            seed=seed,
            data=_base_turn_line(
                winner_index=winner,
                p0_stats=_base_player_stats(**dict(zip(fields, p0_values))),
                p1_stats=_base_player_stats(**dict(zip(fields, p1_values))),
            ),
            event_counts={},
        )
        for seed, winner, p0_values, p1_values in games
    ]

    q = _report_mod.build_quality_metrics(rows)

    expected = {
        # winner tiers: [7, 8, 6] → median 7
        "median_winner_tier_peak": 7.0,
        # loser tiers: [4, 3, 5] → median 4
        "median_loser_tier_peak": 4.0,
        # per-game gaps: [3, 5, 1] → median 3
        "median_tier_peak_gap": 3.0,
        # peak_unit_tier (all 6 players): [5,2,1,6,4,3] → median 3.5
        "median_peak_unit_tier": 3.5,
        # wonder_count (all 6 players): [3,0,0,4,2,1] → median 1.5
        "median_wonder_count_per_player": 1.5,
    }
    for key, value in expected.items():
        assert q[key] == value
|
|
|
|
|
|
def test_reporter_quality_medians_skip_old_jsonl() -> None:
    """A batch with no quality fields at all yields None for every median."""
    rows = [
        _report_mod.extract_row(
            seed=seed,
            data=_base_turn_line(winner_index=winner),
            event_counts={},
        )
        for seed, winner in ((1, 0), (2, 1))
    ]
    q = _report_mod.build_quality_metrics(rows)
    for key in (
        "median_winner_tier_peak",
        "median_loser_tier_peak",
        "median_tier_peak_gap",
        "median_peak_unit_tier",
        "median_wonder_count_per_player",
    ):
        assert q[key] is None
|
|
|
|
|
|
def test_reporter_quality_medians_mixed_batch() -> None:
    """One new-format plus one old-format game: medians reflect the new game only."""
    new_format = _base_turn_line(
        winner_index=0,
        p0_stats=_base_player_stats(tier_peak=5, peak_unit_tier=4, wonder_count=2),
        p1_stats=_base_player_stats(tier_peak=3, peak_unit_tier=2, wonder_count=0),
    )
    old_format = _base_turn_line(winner_index=1)  # quality fields absent
    rows = [
        _report_mod.extract_row(seed=seed, data=line, event_counts={})
        for seed, line in enumerate((new_format, old_format), start=1)
    ]

    q = _report_mod.build_quality_metrics(rows)

    # Only the new-format game contributes to any of the medians.
    expected = {
        "median_winner_tier_peak": 5.0,
        "median_loser_tier_peak": 3.0,
        "median_tier_peak_gap": 2.0,
        "median_peak_unit_tier": 3.0,  # median of [4, 2]
        "median_wonder_count_per_player": 1.0,  # median of [2, 0]
    }
    for key, value in expected.items():
        assert q[key] == value
|