magicciv/tools/tests/test_tactical_port_baseline.py
Natalie 43674e5bcb feat(@projects/@magic-civilization): enhance empire economy & ai integration
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 16:56:13 -07:00

278 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""p0-26 tactical port baseline harness.
Shells out to tools/autoplay-batch.sh on apricot (via SSH alias 'apricot',
user lilith) and verifies the p0-01 state-at-end quality sub-gates against
the post-port binary:
- Median winner tier_peak ≥ 6
- Median tier_peak_gap (winner - loser) ≤ 2
- total_combats ≥ 50 in ≥7/10 games
Run directly:
python3 -m pytest tools/tests/test_tactical_port_baseline.py -v
Or via ./run verify (step 17 — added by p0-26).
Environment variables:
AUTOPLAY_HOST — SSH target (default: apricot). Set to "" to run locally
(requires local flatpak + Godot).
BASELINE_SEEDS — Comma-separated seed list (default: 1,2,3,4,5,6,7,8,9,10).
BASELINE_TURN_LIMIT — Turn limit per game (default: 300 = T300 batch).
SKIP_BASELINE — Set to "1" to skip the SSH batch entirely (CI machines
without apricot access will set this).
"""
from __future__ import annotations
import importlib.util
import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from statistics import median
from typing import Any
# Filesystem anchors: the repo root is parents[2] of this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parents[2]
TOOLS_DIR = REPO_ROOT.joinpath("tools")

# Runtime knobs; each one can be overridden through the environment variable
# of the same name and otherwise falls back to the default shown here.
AUTOPLAY_HOST: str = os.getenv("AUTOPLAY_HOST", "apricot")
BASELINE_SEEDS: list[int] = list(
    map(int, os.getenv("BASELINE_SEEDS", "1,2,3,4,5,6,7,8,9,10").split(","))
)
BASELINE_TURN_LIMIT: int = int(os.getenv("BASELINE_TURN_LIMIT", "300"))
SKIP_BASELINE: bool = os.getenv("SKIP_BASELINE", "0") == "1"

# p0-01 quality gates (post-reframe 2026-04-17)
GATE_MEDIAN_WINNER_TIER_PEAK_MIN = 6
GATE_MEDIAN_TIER_PEAK_GAP_MAX = 2
GATE_TOTAL_COMBATS_MIN = 50
GATE_COMBATS_GAMES_MIN = 7  # of 10
def _load_report_module() -> Any:
    """Load tools/autoplay-report.py from its file path and return the module.

    The file name contains a hyphen, which a plain ``import`` statement cannot
    express, so the module is built from an explicit spec and registered under
    the name ``autoplay_report`` for the duration of the call only (it is not
    inserted into ``sys.modules``).
    """
    script = TOOLS_DIR / "autoplay-report.py"
    spec = importlib.util.spec_from_file_location("autoplay_report", script)
    assert spec is not None
    loader = spec.loader
    assert loader is not None
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)
    return module
def _collect_turn_stats(results_dir: Path) -> list[dict[str, Any]]:
    """Return the last turn_stats.jsonl record of each game under results_dir.

    Every direct child of ``results_dir`` is treated as a candidate game
    directory and visited in sorted path order. Children without a regular
    ``turn_stats.jsonl`` file, or whose file holds only blank lines, are
    skipped.

    Args:
        results_dir: Directory whose children are per-game output directories.

    Returns:
        One decoded JSON object per game that produced at least one non-blank
        line, in the same sorted order as the directories.

    Raises:
        json.JSONDecodeError: If the last non-blank line of a stats file is
            not valid JSON (for example a truncated write).
    """
    records: list[dict[str, Any]] = []
    for game_dir in sorted(results_dir.iterdir()):
        stats_file = game_dir / "turn_stats.jsonl"
        if not stats_file.is_file():
            continue
        last_line: str | None = None
        # Decode as UTF-8 explicitly (the locale default may differ) and let
        # the file iterator split records: unlike str.splitlines() it only
        # breaks on real newlines, not on U+2028/U+2029/NEL, which may appear
        # unescaped inside JSON string values.
        with stats_file.open(encoding="utf-8") as handle:
            for line in handle:
                if line.strip():
                    last_line = line
        if last_line is None:
            continue
        records.append(json.loads(last_line))
    return records
def _run_batch_on_apricot(results_dir: Path) -> None:
    """Invoke tools/autoplay-batch.sh and raise if it exits non-zero.

    The script receives the game count, the turn limit and ``results_dir`` as
    positional arguments. AUTOPLAY_HOST, PARALLEL and SEED_OFFSET are passed
    through the child environment on top of the current process environment.
    Only ``len(BASELINE_SEEDS)`` and ``min(BASELINE_SEEDS)`` are forwarded, so
    the seed list is effectively treated as one contiguous range.

    Args:
        results_dir: Directory handed to the script as its output location.

    Raises:
        RuntimeError: If the script returns a non-zero exit status.
    """
    script = TOOLS_DIR / "autoplay-batch.sh"
    argv = [
        str(script),
        str(len(BASELINE_SEEDS)),
        str(BASELINE_TURN_LIMIT),
        str(results_dir),
    ]

    child_env = dict(os.environ)
    child_env.update(
        AUTOPLAY_HOST=AUTOPLAY_HOST,
        PARALLEL="10",
        # seeds are 1-based; offset shifts range
        SEED_OFFSET=str(min(BASELINE_SEEDS) - 1),
    )

    # Output is not captured, so the script's stdout/stderr go straight to
    # the caller's streams.
    result = subprocess.run(argv, env=child_env, capture_output=False, text=True)
    if result.returncode == 0:
        return
    raise RuntimeError(
        f"autoplay-batch.sh exited {result.returncode}. "
        f"Check {results_dir} for per-seed game.log files."
    )
# ── Unit tests for the harness plumbing itself ───────────────────────────────
def test_gate_constants_are_internally_consistent() -> None:
    """Sanity-check the gate thresholds themselves; no batch is run here.

    Guards against an accidental edit that would make a gate vacuous or
    impossible (e.g. a non-positive combat minimum or a games-passing count
    outside 1..10).
    """
    assert 1 <= GATE_MEDIAN_WINNER_TIER_PEAK_MIN
    assert 0 <= GATE_MEDIAN_TIER_PEAK_GAP_MAX
    assert 0 < GATE_TOTAL_COMBATS_MIN
    assert GATE_COMBATS_GAMES_MIN >= 1 and GATE_COMBATS_GAMES_MIN <= 10
def test_report_module_loads() -> None:
    """The report script must load by path and expose the two entry points
    this harness calls (catches path rot and renamed functions)."""
    mod = _load_report_module()
    required = {
        "extract_row": "extract_row missing from autoplay-report",
        "build_quality_metrics": "build_quality_metrics missing",
    }
    for attr_name, failure_message in required.items():
        assert hasattr(mod, attr_name), failure_message
def test_extract_row_handles_absent_tier_peak() -> None:
    """extract_row must accept an old-format record without quality fields.

    The fixture below deliberately carries no ``tier_peak``,
    ``peak_unit_tier`` or ``wonder_count`` keys in either player's stats; the
    only requirement is that extract_row returns a row instead of raising or
    returning None.
    """
    mod = _load_report_module()

    aggregate: dict[str, Any] = {
        "total_combats": 60,
        "total_cities_founded": 6,
        "total_cities_captured": 2,
        "turn_first_combat": 25,
        "turn_first_city_captured": 150,
    }
    winner_stats: dict[str, Any] = {
        "pop": 14,
        "pop_peak": 16,
        "mil": 5,
        "cities": 3,
        "cities_captured": 1,
        "cities_lost": 0,
        "gold": 300,
        "gold_peak": 350,
        "gold_per_turn": 8,
        "techs": 12,
        "tiles": 30,
        "buildings": 10,
        "luxuries": 2,
        "happiness": 6,
        "food_total": 18.0,
        "production_total": 14.0,
        "kills": 8,
        "units_lost": 3,
        "turn_first_pop_3": 8,
        "turn_first_pop_4": 18,
    }
    loser_stats: dict[str, Any] = {
        "pop": 10,
        "pop_peak": 12,
        "mil": 3,
        "cities": 2,
        "cities_captured": 0,
        "cities_lost": 1,
        "gold": 180,
        "gold_peak": 220,
        "gold_per_turn": 5,
        "techs": 9,
        "tiles": 22,
        "buildings": 7,
        "luxuries": 1,
        "happiness": 3,
        "food_total": 12.0,
        "production_total": 9.0,
        "kills": 4,
        "units_lost": 5,
        "turn_first_pop_3": 12,
        "turn_first_pop_4": 25,
    }
    legacy_line: dict[str, Any] = {
        "turn": 300,
        "outcome": "score",
        "winner_index": 0,
        "winner_personality": "ironhold",
        "victory_type": "score",
        "wall_clock_sec": 15.0,
        "aggregate": aggregate,
        "player_stats": {"0": winner_stats, "1": loser_stats},
        "invariant_violations": [],
    }

    row = mod.extract_row(seed=1, data=legacy_line, event_counts={})
    assert row is not None
def test_build_quality_metrics_enforces_gates_on_synthetic_data() -> None:
    """A deliberately weak synthetic batch must land below the tier_peak gate.

    Ten identical rows are generated (winner tier_peak 3, loser tier_peak 1,
    10 combats each) and fed to build_quality_metrics; the reported median
    winner tier_peak must exist and be under the gate minimum.
    """
    mod = _load_report_module()
    # Fixture builders are shared with the quality-metrics test module.
    from tools.tests.test_quality_metrics import _base_player_stats, _base_turn_line  # type: ignore[import]

    def _make_row(winner_tp: int, loser_tp: int, total_combats: int) -> dict[str, Any]:
        """Build one extracted row for a two-player game won by player 0."""
        winner = _base_player_stats(
            tier_peak=winner_tp, peak_unit_tier=winner_tp - 1, wonder_count=1
        )
        loser = _base_player_stats(
            tier_peak=loser_tp, peak_unit_tier=loser_tp - 1, wonder_count=0
        )
        data = _base_turn_line(winner_index=0, p0_stats=winner, p1_stats=loser)
        data["aggregate"]["total_combats"] = total_combats
        return mod.extract_row(seed=1, data=data, event_counts={})

    # Build a batch that clearly fails: low tiers, few combats
    rows = []
    for _ in range(10):
        rows.append(_make_row(winner_tp=3, loser_tp=1, total_combats=10))

    q = mod.build_quality_metrics(rows)
    observed = q["median_winner_tier_peak"]
    assert observed is not None
    assert observed < GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        "synthetic low-quality batch should fail the winner tier_peak gate"
    )
def test_build_quality_metrics_passes_on_healthy_synthetic_data() -> None:
    """A healthy synthetic batch must clear the tier_peak and gap gates.

    Ten rows are generated for seeds 1..10 with winner tier_peak 7, loser
    tier_peak 5 and 55..64 combats; the resulting medians are checked against
    the winner tier_peak minimum and the tier_peak gap maximum.
    """
    mod = _load_report_module()
    from tools.tests.test_quality_metrics import _base_player_stats, _base_turn_line  # type: ignore[import]

    def _healthy_row(seed: int) -> Any:
        """Build the extracted row for one healthy game with the given seed."""
        data = _base_turn_line(
            winner_index=0,
            p0_stats=_base_player_stats(tier_peak=7, peak_unit_tier=6, wonder_count=2),
            p1_stats=_base_player_stats(tier_peak=5, peak_unit_tier=4, wonder_count=1),
        )
        # Seed 1 gets 55 combats, seed 10 gets 64.
        data["aggregate"]["total_combats"] = 54 + seed
        return mod.extract_row(seed=seed, data=data, event_counts={})

    rows = [_healthy_row(seed) for seed in range(1, 11)]
    q = mod.build_quality_metrics(rows)

    assert q["median_winner_tier_peak"] >= GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        f"healthy batch failed winner tier_peak gate: {q['median_winner_tier_peak']}"
    )
    assert q["median_tier_peak_gap"] <= GATE_MEDIAN_TIER_PEAK_GAP_MAX, (
        f"healthy batch failed tier_peak_gap gate: {q['median_tier_peak_gap']}"
    )
# ── Live baseline gate (SSH + apricot run) ───────────────────────────────────
def test_p001_quality_gates_hold_post_tactical_port() -> None:
    """Run a Normal-vs-Normal 10-seed T300 batch and assert p0-01 sub-gates.

    Skipped when:
    - SKIP_BASELINE=1 (CI without apricot access)
    - autoplay-batch.sh is not executable (dev machine missing runner)

    Gates checked, in order:
    1. median winner tier_peak >= GATE_MEDIAN_WINNER_TIER_PEAK_MIN
    2. median tier_peak_gap <= GATE_MEDIAN_TIER_PEAK_GAP_MAX
    3. total_combats >= GATE_TOTAL_COMBATS_MIN in at least
       GATE_COMBATS_GAMES_MIN/10 of the planned games (7 for the default
       10 seeds; scaled and rounded up for other seed counts).

    This is the regression gate p0-26 acceptance bullet #6 requires.
    """
    import pytest

    if SKIP_BASELINE:
        pytest.skip("SKIP_BASELINE=1: skipping live apricot batch")
    batch_sh = TOOLS_DIR / "autoplay-batch.sh"
    if not batch_sh.exists() or not os.access(batch_sh, os.X_OK):
        pytest.skip(f"autoplay-batch.sh not executable at {batch_sh}")

    mod = _load_report_module()
    games_planned = len(BASELINE_SEEDS)

    # TemporaryDirectory does not create its parent; make sure the scratch
    # root exists so a fresh checkout does not fail with FileNotFoundError.
    scratch_root = REPO_ROOT / ".local"
    scratch_root.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(
        dir=scratch_root,
        prefix="tactical_port_baseline_",
    ) as tmpdir:
        results_dir = Path(tmpdir) / "results"
        results_dir.mkdir()
        _run_batch_on_apricot(results_dir)
        # Records are parsed into memory before the directory is removed.
        records = _collect_turn_stats(results_dir)

    assert len(records) >= games_planned * 8 // 10, (
        f"Expected ≥80% of {games_planned} games to produce turn_stats.jsonl; "
        f"got {len(records)}"
    )

    # NOTE(review): seed labels are positional (1, 2, ...) and do not account
    # for BASELINE_SEEDS starting elsewhere or for games that produced no
    # record — confirm whether extract_row's seed is more than a label.
    rows = [
        mod.extract_row(seed=i + 1, data=rec, event_counts={})
        for i, rec in enumerate(records)
    ]
    q = mod.build_quality_metrics(rows)

    # Gate 1: median winner tier_peak ≥ 6
    mwtp = q.get("median_winner_tier_peak")
    assert mwtp is not None, "median_winner_tier_peak missing from quality metrics"
    assert mwtp >= GATE_MEDIAN_WINNER_TIER_PEAK_MIN, (
        f"p0-01 gate FAIL: median winner tier_peak {mwtp} < {GATE_MEDIAN_WINNER_TIER_PEAK_MIN}"
    )

    # Gate 2: median tier_peak_gap ≤ 2
    mtpg = q.get("median_tier_peak_gap")
    assert mtpg is not None, "median_tier_peak_gap missing from quality metrics"
    assert mtpg <= GATE_MEDIAN_TIER_PEAK_GAP_MAX, (
        f"p0-01 gate FAIL: median tier_peak_gap {mtpg} > {GATE_MEDIAN_TIER_PEAK_GAP_MAX}"
    )

    # Gate 3: total_combats ≥ 50 in ≥7/10 games.
    # GATE_COMBATS_GAMES_MIN is expressed "of 10"; scale it to the number of
    # planned games (ceiling division) so a shorter or longer BASELINE_SEEDS
    # list keeps the same 70% requirement. For 10 seeds this is exactly 7.
    combats_required = (GATE_COMBATS_GAMES_MIN * games_planned + 9) // 10
    combats_passing = sum(
        1 for rec in records
        if rec.get("aggregate", {}).get("total_combats", 0) >= GATE_TOTAL_COMBATS_MIN
    )
    assert combats_passing >= combats_required, (
        f"p0-01 gate FAIL: only {combats_passing}/{len(records)} games had "
        f"total_combats ≥ {GATE_TOTAL_COMBATS_MIN} (need ≥ {combats_required})"
    )