magicciv/tools/huge-map-5clan.sh

#!/usr/bin/env bash
# huge-map-5clan.sh — THE "ultimate test". 5 AI clan personalities compete
# on a map sized for 8 players, stressing the AI lookahead pipeline
# end-to-end.
#
# Per project owner: this test should only run AFTER the 1v1 matchup grid
# (`tools/matchup-grid.sh`) has shown clans are balanced in head-to-head
# play.
#
# The map-size id is taken from MAP_SIZE (default "standard"; "huge" is a
# stretch goal) and matches an id in setup.json; dimensions and max_players
# are read from that data file. If that id's capacity changes in the data
# file, this harness picks it up automatically.
#
# Acceptance criteria (validated via `checklist-report.py ultimate_stress`):
#   - All 5 clans appear in at least one of the SEEDS runs
#   - Victory rate ≥ SEEDS/2 (games decisive — MCTS not stalling)
#   - Winner distribution non-degenerate: ≥2 distinct clans win across grid
#   - Median game length ≥ TURN_LIMIT*0.4
#
# Usage:
#   tools/huge-map-5clan.sh          # defaults SEEDS=10 TURN_LIMIT=500 PARALLEL=4
#   SEEDS=20 tools/huge-map-5clan.sh
#   tools/huge-map-5clan.sh --help
#
# Output layout:
#   .local/iter/huge-map-5clan-<stamp>/
#     game_<stamp>_seed<N>/          (SEEDS games, 5 AI clans each)
#     verdict.json
#     completion.marker
# Unset variables and failed pipelines are errors, but `-e` is left off on
# purpose: the batch and gate exit codes are captured and written to the
# completion marker instead of aborting the script.
set -uo pipefail

# ANSI colour escapes, interpreted by `echo -e`.
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'
DIM='\033[2m'; NC='\033[0m'

# Run shape. Every value can be overridden from the environment.
: "${SEEDS:=10}"
: "${TURN_LIMIT:=500}"
: "${PARALLEL:=4}"
# Civ5 "Standard" = 80×52, max 8 players — the smallest map that fits the
# user's "huge map that 8 COULD play on" intent. Our own "huge" (128×80,
# 12-player) is stretch-goal; switch to MAP_SIZE=huge once POD's
# MAX_PLAYERS=4 limit is lifted and the game supports >8 AI slots.
: "${MAP_SIZE:=standard}"
: "${NUM_PLAYERS:=5}"

# p1-22: bound MCTS per-decision wall-clock cost. 2000 ms caps each AI
# decision. Empirically (cycle 57, 2026-05-07): 5-player MCTS on a standard
# map runs ~34s/turn wall-clock, so T=300 needs ~10,200s + 25% buffer ≈ 12,750s.
# autoplay-batch.sh's default formula (TURN_LIMIT * 3 + 300 = 1200s for T=300)
# is calibrated for 2-player smoke — it is far too short here and killed all
# 10 cycle-57 games at T32-41 (exit code 124). We set SAFETY_TIMEOUT_OVERRIDE
# to TURN_LIMIT * 45 + 600 (14,100s for T=300, ~3.9h) so the per-game `timeout`
# guard in autoplay-batch.sh is appropriate for 5-clan huge-map runs.
# This value can be overridden via env if needed.
: "${MCTS_DECISION_BUDGET_MS:=2000}"

# default_safety_timeout TURNS
# Prints the per-game safety timeout for autoplay-batch.sh, in seconds:
# TURNS * 45 + 600 (empirically derived — see the p1-22 note above).
# Returns 2, with a message on stderr, when TURNS is not a plain
# non-negative decimal integer.
default_safety_timeout() {
    local turns="${1:-}"
    case "$turns" in
        ''|*[!0-9]*)
            echo "TURN_LIMIT must be a non-negative integer (got: '$turns')" >&2
            return 2 ;;
    esac
    # 10# forces base 10 so a zero-padded value such as 0500 is not read as
    # octal, and the digits-only check above keeps arbitrary expressions out
    # of the arithmetic context.
    echo "$(( 10#$turns * 45 + 600 ))"
}

# Only derive the timeout when the caller did not supply one, so an explicit
# SAFETY_TIMEOUT_OVERRIDE is passed through untouched.
if [ -z "${SAFETY_TIMEOUT_OVERRIDE:-}" ]; then
    SAFETY_TIMEOUT_OVERRIDE="$(default_safety_timeout "$TURN_LIMIT")" || exit 2
fi
export SAFETY_TIMEOUT_OVERRIDE

# print_help_text [FILE]
# Prints every line of FILE (default: this script) that is a column-0 comment
# of the form "# text" or a bare "#", with the leading "#" and one optional
# space removed. The "#!" shebang line does not match and is skipped.
# A single awk call is used because the `\?` escape in a sed basic regex is
# a GNU extension; without it the "# " prefix would not be stripped on
# BSD/macOS sed.
print_help_text() {
    awk '/^#( |$)/ { sub(/^# ?/, ""); print }' "${1:-$0}"
}

# Argument handling: only --help/-h is accepted; anything else is a usage
# error (exit 2). All tuning happens through environment variables.
for arg in "$@"; do
    case "$arg" in
        --help|-h)
            print_help_text
            exit 0 ;;
        *)
            echo "Unknown argument: $arg" >&2
            exit 2 ;;
    esac
done

# Repository root = parent of the directory that holds this script.
# CDPATH is cleared and cd's stdout discarded so a user-level CDPATH cannot
# redirect the cd or leak an extra line into the captured path.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." >/dev/null && pwd)" || {
    echo "ERROR: cannot resolve repository root from ${BASH_SOURCE[0]}" >&2
    exit 1
}
# Local-time stamp that makes the default output directory unique per run.
STAMP="$(date +%Y%m%d_%H%M%S)"
# HUGE_OUTPUT overrides the output dir (used by apricot-run.sh).
PARENT="${HUGE_OUTPUT:-$REPO_ROOT/.local/iter/huge-map-5clan-$STAMP}"
# The script runs without `set -e`, so stop here if the directory cannot be
# created — every later step redirects its output into $PARENT.
if ! mkdir -p -- "$PARENT"; then
    echo "ERROR: cannot create output directory: $PARENT" >&2
    exit 1
fi

# p1-27a — start the warm MCTS service ahead of the run so each AI turn
# shares the cost of GPU init and a warm cache. `services:up` is idempotent,
# so calling it while the service is already running is harmless.
# MCTS_SOCKET_PATH is exported so the in-process gdext bridge
# (api-gdext/src/ai.rs) prefers the warm socket over its in-process fallback.
# Telemetry is written to $PARENT/mcts-service.jsonl, next to the autoplay
# logs, so per-AI-turn latency measurements stay with the run.
MCTS_SOCKET_PATH="${MCTS_SOCKET_PATH:-/tmp/mc-mcts.sock}"
MCTS_TELEMETRY_PATH="${MCTS_TELEMETRY_PATH:-$PARENT/mcts-service.jsonl}"
export MCTS_SOCKET_PATH MCTS_TELEMETRY_PATH

# SKIP_SERVICE_UP=1 bypasses the start-up call. A failed start is only a
# warning: the run continues without the warm service.
case "${SKIP_SERVICE_UP:-0}" in
    1) ;;
    *)
        if ! bash "$REPO_ROOT/tools/run-services.sh" services:up; then
            echo -e "${YELLOW}WARN: services:up failed — continuing without warm MCTS service.${NC}" >&2
        fi
        ;;
esac

# Preflight (advisory only — it never blocks the run): inspect the most
# recently modified matchup-grid run and report whether its verdict passed.
# Warnings are printed when there is no run, no verdict.json, no python3 to
# read it, a non-passing verdict, or a passing verdict older than 30 days.

# matchup_verdict_passes FILE
# Returns 0 only when FILE parses as JSON whose top-level "pass" key is true;
# any read or parse error counts as "not passing". The path is handed to
# Python through argv rather than spliced into the program text, so quotes
# or other special characters in the path cannot break the check.
matchup_verdict_passes() {
    python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    sys.exit(0 if json.load(fh).get("pass", False) is True else 1)
' "$1" 2>/dev/null
}

# Pick the newest run by directory mtime using the glob directly instead of
# parsing `ls` output. The trailing slash from the glob is kept.
LATEST_MATCHUP_GRID=""
for grid_dir in "$REPO_ROOT"/.local/iter/matchup-grid-*/; do
    [ -d "$grid_dir" ] || continue   # an unmatched glob stays literal
    if [ -z "$LATEST_MATCHUP_GRID" ] || [ "$grid_dir" -nt "$LATEST_MATCHUP_GRID" ]; then
        LATEST_MATCHUP_GRID="$grid_dir"
    fi
done

if [ -z "$LATEST_MATCHUP_GRID" ]; then
    echo -e "${YELLOW}WARN: no matchup-grid run found.${NC}"
    echo -e "${DIM}Per project owner, 1v1 matchup balance should pass before running the ultimate test.${NC}"
    echo -e "${DIM}Run: tools/matchup-grid.sh${NC}"
    echo ""
else
    matchup_verdict="$LATEST_MATCHUP_GRID/verdict.json"
    if [ ! -f "$matchup_verdict" ]; then
        echo -e "${YELLOW}WARN: most recent matchup-grid run has no verdict.json.${NC}"
        echo -e "${DIM}$LATEST_MATCHUP_GRID${NC}"
    elif ! command -v python3 >/dev/null; then
        echo -e "${YELLOW}WARN: python3 not found — cannot read matchup-grid verdict.${NC}"
        echo -e "${DIM}$matchup_verdict${NC}"
    elif matchup_verdict_passes "$matchup_verdict"; then
        echo -e "${GREEN}prereq: matchup-grid verdict PASS${NC} ($LATEST_MATCHUP_GRID)"
        # `find -mtime +30` prints the file only when it was last modified
        # more than 30 days ago.
        if [ -n "$(find "$matchup_verdict" -mtime +30 2>/dev/null)" ]; then
            echo -e "${YELLOW}WARN: that matchup-grid verdict is more than 30 days old.${NC}"
            echo -e "${DIM}Re-run: tools/matchup-grid.sh${NC}"
        fi
    else
        echo -e "${YELLOW}WARN: most recent matchup-grid verdict is NOT passing.${NC}"
        echo -e "${DIM}$matchup_verdict${NC}"
    fi
fi

# Banner: summarise the run shape and where its output goes.
echo -e "${BLUE}huge-map-5clan (ultimate stress)${NC} — ${SEEDS} seeds × T${TURN_LIMIT} × ${NUM_PLAYERS} AI on ${MAP_SIZE} map"
echo -e "${DIM}parent: $PARENT${NC}"

# Start with an empty marker file; exit codes and metadata are appended to
# it as each stage finishes.
MARKER="$PARENT/completion.marker"
: > "$MARKER"

# Environment handed to autoplay-batch.sh.
# All 5 slots are pinned to the 5 canonical clans so meta.player_clans is
# fully populated for the ultimate_stress verdict. Without per-slot pinning,
# slot 0 (the human slot) gets an empty clan_id and its wins cannot be
# attributed. Seed-driven map variation still drives strategic divergence.
batch_env=(
    MAP_SIZE="$MAP_SIZE"
    NUM_PLAYERS="$NUM_PLAYERS"
    PARALLEL="$PARALLEL"
    MCTS_DECISION_BUDGET_MS="${MCTS_DECISION_BUDGET_MS:-2000}"
    SAFETY_TIMEOUT_OVERRIDE="${SAFETY_TIMEOUT_OVERRIDE:-}"
    AI_USE_MCTS=true
    AI_PIN_PERSONALITY_P0=ironhold
    AI_PIN_PERSONALITY_P1=blackhammer
    AI_PIN_PERSONALITY_P2=goldvein
    AI_PIN_PERSONALITY_P3=deepforge
    AI_PIN_PERSONALITY_P4=runesmith
)

# Run the batch with stdout and stderr captured in batch.log, then record
# its exit status in the marker without aborting on failure.
env "${batch_env[@]}" \
    bash "$REPO_ROOT/tools/autoplay-batch.sh" "$SEEDS" "$TURN_LIMIT" "$PARENT" \
    > "$PARENT/batch.log" 2>&1
batch_rc=$?
echo "batch_exit=${batch_rc}" >> "$MARKER"

# Compute the ultimate_stress verdict: the report's stdout becomes
# verdict.json, its stderr goes to gate.stderr, and its exit status becomes
# this script's exit status.
echo -e "${BLUE}computing ultimate_stress verdict…${NC}"
gate_rc=0
python3 "$REPO_ROOT/tools/checklist-report.py" ultimate_stress "$PARENT" \
    > "$PARENT/verdict.json" 2> "$PARENT/gate.stderr" || gate_rc=$?

# Append the gate result, a UTC finish time and the output directory to the
# completion marker.
{
    printf 'gate_exit=%d\n' "$gate_rc"
    printf 'finished=%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    printf 'parent=%s\n' "$PARENT"
} >> "$MARKER"

# Report the outcome and exit with the gate's status.
if (( gate_rc != 0 )); then
    echo -e "${RED}ultimate_stress: FAIL${NC} (gate_exit=$gate_rc)"
    echo -e "${DIM}see: $PARENT/verdict.json${NC}"
    exit "$gate_rc"
fi
echo -e "${GREEN}ultimate_stress: PASS${NC}"
exit 0