278 lines
11 KiB
Python
278 lines
11 KiB
Python
"""p0-26 tactical port baseline harness.

Shells out to tools/autoplay-batch.sh on apricot (via SSH alias 'apricot',
user lilith) and verifies the p0-01 state-at-end quality sub-gates against
the post-port binary:

  - Median winner tier_peak ≥ 6
  - Median tier_peak_gap (winner − loser) ≤ 2
  - total_combats ≥ 50 in ≥7/10 games

Run directly:
    python3 -m pytest tools/tests/test_tactical_port_baseline.py -v

Or via ./run verify (step 17 — added by p0-26).

Environment variables:
    AUTOPLAY_HOST — SSH target (default: apricot). Set to "" to run locally
        (requires local flatpak + Godot).
    BASELINE_SEEDS — Comma-separated seed list (default: 1,2,3,4,5,6,7,8,9,10).
    BASELINE_TURN_LIMIT — Turn limit per game (default: 300 = T300 batch).
    SKIP_BASELINE — Set to "1" to skip the SSH batch entirely (CI machines
        without apricot access will set this).
"""
|
||
from __future__ import annotations
|
||
|
||
import importlib.util
|
||
import json
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import tempfile
|
||
from pathlib import Path
|
||
from statistics import median
|
||
from typing import Any
|
||
|
||
# This file sits two directories below the repository root, so parents[2] of
# its resolved path is the root; helper scripts are looked up under tools/.
REPO_ROOT = Path(__file__).resolve().parents[2]
TOOLS_DIR = REPO_ROOT / "tools"


def _parse_seed_list(raw: str) -> list[int]:
    """Turn a comma-separated seed string into a list of ints.

    Blank entries (a trailing comma, a doubled comma, stray whitespace) are
    skipped so that a slightly sloppy environment value does not abort the
    whole module at import time.

    Raises:
        ValueError: if a non-blank entry is not an integer, or if no seeds
            remain after blank entries are dropped.
    """
    seeds = [int(token) for token in raw.split(",") if token.strip()]
    if not seeds:
        raise ValueError(f"BASELINE_SEEDS must name at least one seed; got {raw!r}")
    return seeds


# Runtime configuration, read once from the environment at import time.
AUTOPLAY_HOST: str = os.environ.get("AUTOPLAY_HOST", "apricot")
BASELINE_SEEDS: list[int] = _parse_seed_list(
    os.environ.get("BASELINE_SEEDS", "1,2,3,4,5,6,7,8,9,10")
)
BASELINE_TURN_LIMIT: int = int(os.environ.get("BASELINE_TURN_LIMIT", "300"))
SKIP_BASELINE: bool = os.environ.get("SKIP_BASELINE", "0") == "1"

# p0-01 quality gates (post-reframe 2026-04-17)
GATE_MEDIAN_WINNER_TIER_PEAK_MIN = 6
GATE_MEDIAN_TIER_PEAK_GAP_MAX = 2
GATE_TOTAL_COMBATS_MIN = 50
GATE_COMBATS_GAMES_MIN = 7  # of 10
|
||
|
||
|
||
def _load_report_module() -> Any:
    """Load tools/autoplay-report.py from its file path and return the module.

    The script's file name contains a hyphen, so it cannot be brought in with
    a plain ``import`` statement; it is executed through importlib under the
    module name ``autoplay_report`` instead.
    """
    report_path = TOOLS_DIR / "autoplay-report.py"
    module_spec = importlib.util.spec_from_file_location("autoplay_report", report_path)
    assert module_spec is not None
    loader = module_spec.loader
    assert loader is not None
    report_module = importlib.util.module_from_spec(module_spec)
    loader.exec_module(report_module)
    return report_module
|
||
|
||
|
||
def _collect_turn_stats(results_dir: Path) -> list[dict[str, Any]]:
    """Return the final turn_stats.jsonl record of every game under results_dir.

    Each entry of ``results_dir`` is visited in sorted name order. Entries
    that do not contain a ``turn_stats.jsonl`` file, or whose file has no
    non-blank lines, are skipped. For the rest, the last non-blank line is
    parsed as JSON and appended to the result.

    Args:
        results_dir: Directory holding one sub-directory per game.

    Returns:
        The parsed last-line records, ordered by game directory name.

    Raises:
        json.JSONDecodeError: if a game's last non-blank line is not valid JSON.
    """
    records: list[dict[str, Any]] = []
    for game_dir in sorted(results_dir.iterdir()):
        stats_file = game_dir / "turn_stats.jsonl"
        # is_file() is also False for stray non-directory entries in results_dir.
        if not stats_file.is_file():
            continue
        # JSON is UTF-8; do not let the locale's default encoding decide.
        text = stats_file.read_text(encoding="utf-8")
        non_blank = [line for line in text.splitlines() if line.strip()]
        if not non_blank:
            continue
        records.append(json.loads(non_blank[-1]))
    return records
|
||
|
||
|
||
def _run_batch_on_apricot(results_dir: Path) -> None:
    """Invoke tools/autoplay-batch.sh for one game per entry in BASELINE_SEEDS.

    The script is called as ``autoplay-batch.sh <count> <turn_limit>
    <results_dir>`` with AUTOPLAY_HOST, PARALLEL and SEED_OFFSET added to a
    copy of the current environment. Its output is not captured, so it goes
    straight to the caller's stdout/stderr.

    Args:
        results_dir: Directory passed to the script as its results location.

    Raises:
        RuntimeError: if the script exits with a non-zero status.
    """
    game_count = len(BASELINE_SEEDS)
    # Seeds are 1-based, so a lowest seed of N corresponds to offset N - 1.
    # NOTE(review): only the count and the lowest seed are forwarded, so a
    # non-contiguous BASELINE_SEEDS list is presumably not honoured exactly;
    # confirm against autoplay-batch.sh.
    first_seed_offset = min(BASELINE_SEEDS) - 1

    batch_env = dict(os.environ)
    batch_env.update(
        {
            "AUTOPLAY_HOST": AUTOPLAY_HOST,
            "PARALLEL": "10",
            "SEED_OFFSET": str(first_seed_offset),
        }
    )

    script = TOOLS_DIR / "autoplay-batch.sh"
    argv = [str(part) for part in (script, game_count, BASELINE_TURN_LIMIT, results_dir)]

    completed = subprocess.run(argv, env=batch_env, capture_output=False, text=True)
    if completed.returncode == 0:
        return
    raise RuntimeError(
        f"autoplay-batch.sh exited {completed.returncode}. "
        f"Check {results_dir} for per-seed game.log files."
    )
|
||
|
||
|
||
# ── Unit tests for the harness plumbing itself ───────────────────────────────
|
||
|
||
def test_gate_constants_are_internally_consistent() -> None:
    """Sanity-check the p0-01 gate thresholds themselves; no games are run."""
    # Lower bounds: a tier peak of at least 1, a non-negative gap, and a
    # strictly positive combat count.
    assert 1 <= GATE_MEDIAN_WINNER_TIER_PEAK_MIN
    assert 0 <= GATE_MEDIAN_TIER_PEAK_GAP_MAX
    assert 0 < GATE_TOTAL_COMBATS_MIN
    # The "N of 10 games" gate must name a count between 1 and 10 inclusive.
    assert GATE_COMBATS_GAMES_MIN >= 1 and GATE_COMBATS_GAMES_MIN <= 10
|
||
|
||
|
||
def test_report_module_loads() -> None:
    """autoplay-report.py must load and expose the two entry points used here."""
    report = _load_report_module()
    # Attribute name -> failure message shown when it is absent.
    required_attrs = {
        "extract_row": "extract_row missing from autoplay-report",
        "build_quality_metrics": "build_quality_metrics missing",
    }
    for attr_name, failure_message in required_attrs.items():
        assert hasattr(report, attr_name), failure_message
|
||
|
||
|
||
def test_extract_row_handles_absent_tier_peak() -> None:
    """extract_row must cope with an old-format record that has no quality fields.

    The fixture below deliberately omits per-player keys such as ``tier_peak``;
    the only requirement checked is that extract_row still returns a row.
    """
    report = _load_report_module()

    # End-of-game stats for the winning player (index 0).
    winner_stats: dict[str, Any] = {
        "pop": 14,
        "pop_peak": 16,
        "mil": 5,
        "cities": 3,
        "cities_captured": 1,
        "cities_lost": 0,
        "gold": 300,
        "gold_peak": 350,
        "gold_per_turn": 8,
        "techs": 12,
        "tiles": 30,
        "buildings": 10,
        "luxuries": 2,
        "happiness": 6,
        "food_total": 18.0,
        "production_total": 14.0,
        "kills": 8,
        "units_lost": 3,
        "turn_first_pop_3": 8,
        "turn_first_pop_4": 18,
    }
    # End-of-game stats for the losing player (index 1).
    loser_stats: dict[str, Any] = {
        "pop": 10,
        "pop_peak": 12,
        "mil": 3,
        "cities": 2,
        "cities_captured": 0,
        "cities_lost": 1,
        "gold": 180,
        "gold_peak": 220,
        "gold_per_turn": 5,
        "techs": 9,
        "tiles": 22,
        "buildings": 7,
        "luxuries": 1,
        "happiness": 3,
        "food_total": 12.0,
        "production_total": 9.0,
        "kills": 4,
        "units_lost": 5,
        "turn_first_pop_3": 12,
        "turn_first_pop_4": 25,
    }
    # Whole-game aggregate counters.
    aggregate: dict[str, Any] = {
        "total_combats": 60,
        "total_cities_founded": 6,
        "total_cities_captured": 2,
        "turn_first_combat": 25,
        "turn_first_city_captured": 150,
    }

    line: dict[str, Any] = {
        "turn": 300,
        "outcome": "score",
        "winner_index": 0,
        "winner_personality": "ironhold",
        "victory_type": "score",
        "wall_clock_sec": 15.0,
        "aggregate": aggregate,
        "player_stats": {"0": winner_stats, "1": loser_stats},
        "invariant_violations": [],
    }

    row = report.extract_row(seed=1, data=line, event_counts={})
    assert row is not None
|
||
|
||
|
||
def test_build_quality_metrics_enforces_gates_on_synthetic_data() -> None:
    """A deliberately weak synthetic batch must land below the winner tier_peak gate.

    Ten identical rows are built (winner tier_peak 3, loser tier_peak 1,
    10 combats each) and fed to build_quality_metrics; the resulting median
    winner tier_peak must be present and under the gate minimum.
    """
    report = _load_report_module()

    from tools.tests.test_quality_metrics import _base_player_stats, _base_turn_line  # type: ignore[import]

    # A batch that clearly fails: low tiers, few combats.
    winner_tp, loser_tp, total_combats = 3, 1, 10

    rows = []
    for _ in range(10):
        turn_line = _base_turn_line(
            winner_index=0,
            p0_stats=_base_player_stats(
                tier_peak=winner_tp, peak_unit_tier=winner_tp - 1, wonder_count=1
            ),
            p1_stats=_base_player_stats(
                tier_peak=loser_tp, peak_unit_tier=loser_tp - 1, wonder_count=0
            ),
        )
        turn_line["aggregate"]["total_combats"] = total_combats
        rows.append(report.extract_row(seed=1, data=turn_line, event_counts={}))

    metrics = report.build_quality_metrics(rows)

    median_winner_tp = metrics["median_winner_tier_peak"]
    assert median_winner_tp is not None
    assert median_winner_tp < GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        "synthetic low-quality batch should fail the winner tier_peak gate"
    )
|
||
|
||
|
||
def test_build_quality_metrics_passes_on_healthy_synthetic_data() -> None:
    """A healthy synthetic batch must clear the tier_peak and tier_peak_gap gates.

    Ten rows are built with winner tier_peak 7, loser tier_peak 5 and
    total_combats 55..64, then summarised by build_quality_metrics. Both
    medians are first checked for presence so that a missing metric fails
    with a readable assertion instead of a TypeError from comparing None.
    """
    mod = _load_report_module()

    from tools.tests.test_quality_metrics import _base_player_stats, _base_turn_line  # type: ignore[import]

    rows = []
    for i in range(10):
        data = _base_turn_line(
            winner_index=0,
            p0_stats=_base_player_stats(tier_peak=7, peak_unit_tier=6, wonder_count=2),
            p1_stats=_base_player_stats(tier_peak=5, peak_unit_tier=4, wonder_count=1),
        )
        data["aggregate"]["total_combats"] = 55 + i
        rows.append(mod.extract_row(seed=i + 1, data=data, event_counts={}))

    q = mod.build_quality_metrics(rows)

    median_winner_tp = q["median_winner_tier_peak"]
    assert median_winner_tp is not None, (
        "median_winner_tier_peak missing from quality metrics"
    )
    assert median_winner_tp >= GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        f"healthy batch failed winner tier_peak gate: {median_winner_tp}"
    )

    median_gap = q["median_tier_peak_gap"]
    assert median_gap is not None, "median_tier_peak_gap missing from quality metrics"
    assert median_gap <= GATE_MEDIAN_TIER_PEAK_GAP_MAX, (
        f"healthy batch failed tier_peak_gap gate: {median_gap}"
    )
|
||
|
||
|
||
# ── Live baseline gate (SSH + apricot run) ───────────────────────────────────
|
||
|
||
def test_p001_quality_gates_hold_post_tactical_port() -> None:
    """Run a Normal-vs-Normal 10-seed T300 batch and assert p0-01 sub-gates.

    Skipped when:
      - SKIP_BASELINE=1 (CI without apricot access)
      - autoplay-batch.sh is not executable (dev machine missing runner)

    The batch writes into a temporary directory under ``<repo>/.local``, which
    is created on demand. The last turn_stats record of each game is then
    checked against three gates: median winner tier_peak, median
    tier_peak_gap, and the number of games reaching the combat minimum.

    This is the regression gate p0-26 acceptance bullet #6 requires.
    """
    import pytest

    if SKIP_BASELINE:
        pytest.skip("SKIP_BASELINE=1: skipping live apricot batch")

    batch_sh = TOOLS_DIR / "autoplay-batch.sh"
    if not batch_sh.exists() or not os.access(batch_sh, os.X_OK):
        pytest.skip(f"autoplay-batch.sh not executable at {batch_sh}")

    mod = _load_report_module()

    # TemporaryDirectory(dir=...) fails with FileNotFoundError when the parent
    # is absent, so make sure the scratch directory exists on fresh checkouts.
    scratch_root = REPO_ROOT / ".local"
    scratch_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the temporary directory is removed when this block exits,
    # including when the batch raises, so the game.log files mentioned in the
    # batch failure message are gone by the time the error is reported.
    with tempfile.TemporaryDirectory(
        dir=scratch_root,
        prefix="tactical_port_baseline_",
    ) as tmpdir:
        results_dir = Path(tmpdir) / "results"
        results_dir.mkdir()

        _run_batch_on_apricot(results_dir)

        # Records are parsed into memory here, so nothing below needs the files.
        records = _collect_turn_stats(results_dir)

    assert len(records) >= len(BASELINE_SEEDS) * 8 // 10, (
        f"Expected ≥80% of {len(BASELINE_SEEDS)} games to produce turn_stats.jsonl; "
        f"got {len(records)}"
    )

    # NOTE(review): the seed passed here is the record's position + 1, not
    # necessarily the seed the game ran with (games may be missing, and
    # BASELINE_SEEDS need not start at 1) — confirm whether the label matters.
    rows = [
        mod.extract_row(seed=i + 1, data=rec, event_counts={})
        for i, rec in enumerate(records)
    ]
    q = mod.build_quality_metrics(rows)

    # Gate 1: median winner tier_peak ≥ 6
    mwtp = q.get("median_winner_tier_peak")
    assert mwtp is not None, "median_winner_tier_peak missing from quality metrics"
    assert mwtp >= GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        f"p0-01 gate FAIL: median winner tier_peak {mwtp} < {GATE_MEDIAN_WINNER_TIER_PEAK_MIN}"
    )

    # Gate 2: median tier_peak_gap ≤ 2
    mtpg = q.get("median_tier_peak_gap")
    assert mtpg is not None, "median_tier_peak_gap missing from quality metrics"
    assert mtpg <= GATE_MEDIAN_TIER_PEAK_GAP_MAX, (
        f"p0-01 gate FAIL: median tier_peak_gap {mtpg} > {GATE_MEDIAN_TIER_PEAK_GAP_MAX}"
    )

    # Gate 3: total_combats ≥ 50 in ≥7/10 games
    combats_passing = sum(
        1 for rec in records
        if rec.get("aggregate", {}).get("total_combats", 0) >= GATE_TOTAL_COMBATS_MIN
    )
    assert combats_passing >= GATE_COMBATS_GAMES_MIN, (
        f"p0-01 gate FAIL: only {combats_passing}/{len(records)} games had "
        f"total_combats ≥ {GATE_TOTAL_COMBATS_MIN} (need ≥ {GATE_COMBATS_GAMES_MIN})"
    )
|