magicciv/tools/huge-map-5clan.sh
Natalie 0c942c65f6 feat(@projects/@magic-civilization): add mcts telemetry service and parity tests
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-16 07:26:37 -07:00

150 lines
6.6 KiB
Bash
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# huge-map-5clan.sh — THE "ultimate test". 5 AI clan personalities compete
# on a map sized for 8 players, stressing the AI lookahead pipeline
# end-to-end.
#
# Per project owner: this test should only run AFTER the 1v1 matchup grid
# (`tools/matchup-grid.sh`) has shown clans are balanced in head-to-head
# play.
#
# The map-size name comes from MAP_SIZE (default "standard"; "huge" is a
# stretch goal) and matches an id in setup.json; dimensions and max_players
# are read from that data file. If the data file changes the capacity of
# that id, this harness picks it up automatically.
#
# Acceptance criteria (validated via `checklist-report.py ultimate_stress`):
# - All 5 clans appear in at least one of the SEEDS runs
# - Victory rate ≥ SEEDS/2 (games decisive — MCTS not stalling)
# - Winner distribution non-degenerate: ≥2 distinct clans win across grid
# - Median game length ≥ TURN_LIMIT*0.4
#
# Usage:
# tools/huge-map-5clan.sh # defaults SEEDS=10 TURN_LIMIT=500 PARALLEL=4
# SEEDS=20 tools/huge-map-5clan.sh
# tools/huge-map-5clan.sh --help
#
# Output layout:
# .local/iter/huge-map-5clan-<stamp>/
# game_<stamp>_seed<N>/ (SEEDS games, 5 AI clans each)
# verdict.json
# completion.marker
# Shell options: referencing an unset variable is an error, and a pipeline
# fails when any of its stages fails. `-e` is not enabled.
set -uo pipefail

# ANSI escape sequences for coloured output (expanded later by `echo -e`).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
DIM='\033[2m'
NC='\033[0m'

# Tunables. Each keeps a non-empty value inherited from the environment and
# otherwise falls back to the default shown.
SEEDS="${SEEDS:-10}"
TURN_LIMIT="${TURN_LIMIT:-500}"
PARALLEL="${PARALLEL:-4}"

# MAP_SIZE: Civ5 "Standard" = 80×52, max 8 players — the smallest map that
# fits the user's "huge map that 8 COULD play on" intent. Our own "huge"
# (128×80, 12-player) is stretch-goal; switch to MAP_SIZE=huge once POD's
# MAX_PLAYERS=4 limit is lifted and the game supports >8 AI slots.
MAP_SIZE="${MAP_SIZE:-standard}"
NUM_PLAYERS="${NUM_PLAYERS:-5}"

# p1-22: bound the per-decision wall-clock cost of MCTS — 2000 ms caps each
# AI decision. Can be overridden via the environment.
MCTS_DECISION_BUDGET_MS="${MCTS_DECISION_BUDGET_MS:-2000}"

# Per-game safety timeout (seconds) for the `timeout` guard in
# autoplay-batch.sh. Can be overridden via the environment.
# Measured in cycle 57 (2026-05-07): 5-player MCTS on a standard map runs at
# ~34s/turn wall-clock, so T=300 needs ~10,200s, or ≈ 12,750s with a 25%
# buffer. autoplay-batch.sh's default formula (TURN_LIMIT * 3 + 300 = 1200s
# for T=300) is calibrated for 2-player smoke runs; it is far too short here
# and killed all 10 cycle-57 games at T32-41 (exit code 124). Hence the
# formula TURN_LIMIT * 45 + 600 (14,100s for T=300, ~3.9h).
SAFETY_TIMEOUT_OVERRIDE="${SAFETY_TIMEOUT_OVERRIDE:-$(( TURN_LIMIT * 45 + 600 ))}"
export SAFETY_TIMEOUT_OVERRIDE
# print_help FILE — write FILE's top-level comment lines to stdout as help.
# Selects every line that is exactly "#" or begins with "# " (so the "#!"
# shebang line is not included) and strips the leading "#" plus at most one
# following space. The strip uses the POSIX BRE interval `\{0,1\}`; the `\?`
# operator is a GNU extension that POSIX leaves undefined, so it is avoided.
print_help() {
  grep -E '^#( |$)' -- "$1" | sed 's/^# \{0,1\}//'
}

# Argument handling: only --help / -h is recognised (prints this file's
# comments and exits 0); any other argument is a usage error (exit 2).
for arg in "$@"; do
  case "$arg" in
    --help | -h)
      print_help "$0"
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      exit 2
      ;;
  esac
done
# Repository root: the parent of the directory that contains this script.
# Abort if it cannot be resolved — every later path is built from it.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" || {
  echo "ERROR: cannot resolve repository root from ${BASH_SOURCE[0]}" >&2
  exit 1
}
# Local-time stamp (YYYYmmdd_HHMMSS) used to name the default output dir.
STAMP="$(date +%Y%m%d_%H%M%S)"
# HUGE_OUTPUT overrides the output dir (used by apricot-run.sh).
PARENT="${HUGE_OUTPUT:-$REPO_ROOT/.local/iter/huge-map-5clan-$STAMP}"
# Fail fast when the output directory cannot be created: the marker, batch
# log and verdict are all written beneath it, so continuing would only
# produce a misleading FAIL after the services were already started.
mkdir -p -- "$PARENT" || {
  echo "ERROR: cannot create output directory: $PARENT" >&2
  exit 1
}
# p1-27a — start the warm MCTS service ahead of the run so that per-AI-turn
# wall-clock benefits from GPU init + warm cache amortisation. `services:up`
# is idempotent, so calling it while the service already runs is safe.
# MCTS_SOCKET_PATH is exported so the in-process gdext bridge
# (api-gdext/src/ai.rs) prefers the warm socket over its in-process fallback.
# Telemetry goes to $PARENT/mcts-service.jsonl by default, keeping the run's
# per-AI-turn latency measurements next to the autoplay logs.
MCTS_SOCKET_PATH="${MCTS_SOCKET_PATH:-/tmp/mc-mcts.sock}"
MCTS_TELEMETRY_PATH="${MCTS_TELEMETRY_PATH:-$PARENT/mcts-service.jsonl}"
export MCTS_SOCKET_PATH MCTS_TELEMETRY_PATH

# SKIP_SERVICE_UP=1 bypasses the bring-up. A failed bring-up is only a
# warning: the run continues without the warm service.
if [[ "${SKIP_SERVICE_UP:-0}" != "1" ]]; then
  if ! bash "$REPO_ROOT/tools/run-services.sh" services:up; then
    echo -e "${YELLOW}WARN: services:up failed — continuing without warm MCTS service.${NC}" >&2
  fi
fi
# matchup_verdict_passes FILE — succeed only when FILE is a JSON object whose
# "pass" value prints as "True" in Python (e.g. JSON `true`).
# The path is handed to Python through argv instead of being spliced into the
# program text, so quotes or backslashes in the path cannot break or alter
# the snippet. Any Python failure (unreadable file, invalid JSON, top-level
# value that is not an object) counts as "not passing".
matchup_verdict_passes() {
  local verdict_file=$1
  local result
  result="$(python3 -c 'import json, sys
with open(sys.argv[1]) as fh:
    print(json.load(fh).get("pass", False))' "$verdict_file" 2>/dev/null)" || return 1
  [ "$result" = "True" ]
}

# Preflight: check for a passing matchup-grid within the last 30 days.
# Advisory only — every outcome prints a message and the run continues.
# The newest run directory is picked by modification time using a glob and
# `-nt` rather than by parsing `ls` output.
LATEST_MATCHUP_GRID=""
for grid_dir in "$REPO_ROOT"/.local/iter/matchup-grid-*/; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -d "$grid_dir" ] || continue
  if [ -z "$LATEST_MATCHUP_GRID" ] || [ "$grid_dir" -nt "$LATEST_MATCHUP_GRID" ]; then
    LATEST_MATCHUP_GRID="$grid_dir"
  fi
done

if [ -z "$LATEST_MATCHUP_GRID" ]; then
  echo -e "${YELLOW}WARN: no matchup-grid run found.${NC}"
  echo -e "${DIM}Per project owner, 1v1 matchup balance should pass before running the ultimate test.${NC}"
  echo -e "${DIM}Run: tools/matchup-grid.sh${NC}"
  echo ""
else
  # Staleness check: `find -mtime -30` prints the directory only when it was
  # modified less than 30 days ago, so empty output means the run is stale.
  if [ -z "$(find "$LATEST_MATCHUP_GRID" -prune -mtime -30 -print 2>/dev/null)" ]; then
    echo -e "${YELLOW}WARN: most recent matchup-grid run is older than 30 days.${NC}"
    echo -e "${DIM}$LATEST_MATCHUP_GRID${NC}"
  fi

  # The glob match ends in "/", so trim it before appending the file name.
  matchup_verdict="${LATEST_MATCHUP_GRID%/}/verdict.json"
  if [ ! -f "$matchup_verdict" ]; then
    echo -e "${YELLOW}WARN: matchup-grid run has no verdict.json — cannot confirm prereq.${NC}"
    echo -e "${DIM}$matchup_verdict${NC}"
  elif ! command -v python3 >/dev/null; then
    echo -e "${YELLOW}WARN: python3 not found — cannot read matchup-grid verdict.${NC}"
  elif matchup_verdict_passes "$matchup_verdict"; then
    echo -e "${GREEN}prereq: matchup-grid verdict PASS${NC} ($LATEST_MATCHUP_GRID)"
  else
    echo -e "${YELLOW}WARN: most recent matchup-grid verdict is NOT passing.${NC}"
    echo -e "${DIM}$matchup_verdict${NC}"
  fi
fi
# print_banner — announce the run configuration on stdout (two lines).
# Reads the globals SEEDS, TURN_LIMIT, NUM_PLAYERS, MAP_SIZE and PARENT plus
# the colour variables BLUE, DIM and NC. A space separates the coloured
# title from the seed count so the two no longer run together.
print_banner() {
  echo -e "${BLUE}huge-map-5clan (ultimate stress)${NC} ${SEEDS} seeds × T${TURN_LIMIT} × ${NUM_PLAYERS} AI on ${MAP_SIZE} map"
  echo -e "${DIM}parent: $PARENT${NC}"
}
print_banner
# Create (or truncate) the marker file up front; stage exit codes are
# appended to it as the run progresses.
MARKER="$PARENT/completion.marker"
printf '' > "$MARKER"

# Environment handed to autoplay-batch.sh.
# All 5 slots are pinned to the 5 canonical clans so meta.player_clans is
# fully populated for the ultimate_stress verdict. Without per-slot pinning,
# slot 0 (the human slot) gets an empty clan_id and its wins can't be
# attributed. Seed-driven map variation still drives strategic divergence.
batch_env=(
  MAP_SIZE="$MAP_SIZE"
  NUM_PLAYERS="$NUM_PLAYERS"
  PARALLEL="$PARALLEL"
  MCTS_DECISION_BUDGET_MS="${MCTS_DECISION_BUDGET_MS:-2000}"
  SAFETY_TIMEOUT_OVERRIDE="${SAFETY_TIMEOUT_OVERRIDE:-}"
  AI_USE_MCTS=true
  AI_PIN_PERSONALITY_P0=ironhold
  AI_PIN_PERSONALITY_P1=blackhammer
  AI_PIN_PERSONALITY_P2=goldvein
  AI_PIN_PERSONALITY_P3=deepforge
  AI_PIN_PERSONALITY_P4=runesmith
)

# Run the batch with stdout and stderr captured in batch.log, then record
# its exit status in the marker.
env "${batch_env[@]}" \
  bash "$REPO_ROOT/tools/autoplay-batch.sh" "$SEEDS" "$TURN_LIMIT" "$PARENT" \
  > "$PARENT/batch.log" 2>&1
batch_rc=$?
printf 'batch_exit=%d\n' "$batch_rc" >> "$MARKER"
# Compute the ultimate_stress verdict: checklist-report.py writes the verdict
# JSON to stdout and diagnostics to stderr, each captured in its own file.
echo -e "${BLUE}computing ultimate_stress verdict…${NC}"
verdict_path="$PARENT/verdict.json"
gate_err_path="$PARENT/gate.stderr"
python3 "$REPO_ROOT/tools/checklist-report.py" ultimate_stress "$PARENT" \
  > "$verdict_path" 2> "$gate_err_path"
gate_rc=$?

# Finish the marker: gate exit code, UTC finish time, and output directory.
printf 'gate_exit=%d\n' "$gate_rc" >> "$MARKER"
printf 'finished=%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$MARKER"
printf 'parent=%s\n' "$PARENT" >> "$MARKER"

# Report the outcome; the script's exit status is the gate's exit status.
if (( gate_rc != 0 )); then
  echo -e "${RED}ultimate_stress: FAIL${NC} (gate_exit=$gate_rc)"
  echo -e "${DIM}see: $PARENT/verdict.json${NC}"
else
  echo -e "${GREEN}ultimate_stress: PASS${NC}"
fi
exit "$gate_rc"