#!/usr/bin/env bash
# huge-map-5clan.sh — THE "ultimate test". 5 AI clan personalities compete
# on a map sized for 8 players, stressing the AI lookahead pipeline
# end-to-end.
#
# Per project owner: this test should only run AFTER the 1v1 matchup grid
# (`tools/matchup-grid.sh`) has shown clans are balanced in head-to-head
# play.
#
# The map-size name (MAP_SIZE; default "standard", with "huge" as the
# stretch goal) matches an id in setup.json; dimensions and max_players are
# read from the data file. If that data file changes the capacity of the
# selected id, this harness picks it up automatically.
#
# Acceptance criteria (validated via `checklist-report.py ultimate_stress`):
#   - All 5 clans appear in at least one of the SEEDS runs
#   - Victory count ≥ SEEDS/2 (games decisive — MCTS not stalling)
#   - Winner distribution non-degenerate: ≥2 distinct clans win across grid
#   - Median game length ≥ TURN_LIMIT*0.4
#
# Usage:
#   tools/huge-map-5clan.sh          # defaults SEEDS=10 TURN_LIMIT=500 PARALLEL=4
#   SEEDS=20 tools/huge-map-5clan.sh
#   tools/huge-map-5clan.sh --help
#
# Output layout:
#   .local/iter/huge-map-5clan-<stamp>/
#     game_<stamp>_seed<N>/   (SEEDS games, 5 AI clans each)
#     verdict.json
#     completion.marker
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. `-e` is not enabled; the script captures the exit codes of its batch
# and gate steps via `$?` rather than aborting on the first failure.
set -uo pipefail

# ANSI colour escapes, stored unexpanded for use with `echo -e` / `printf '%b'`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
DIM='\033[2m'
NC='\033[0m'

#######################################
# Abort unless the named variable holds a positive decimal integer.
# Guards values that are later fed to $(( )), where arbitrary text would be
# evaluated as an arithmetic expression.
# Arguments: $1 - NAME of the variable to check (read indirectly)
# Outputs:   error message on stderr when the value is rejected
# Returns:   0 when valid; exits the script with status 2 otherwise
#######################################
require_positive_int() {
  local name=$1
  local value=${!name-}
  if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
    printf '%s must be a positive integer (got: %s)\n' "$name" "$value" >&2
    exit 2
  fi
}

# Run parameters; each keeps a value already present in the environment.
: "${SEEDS:=10}"
: "${TURN_LIMIT:=500}"
: "${PARALLEL:=4}"
# Civ5 "Standard" = 80×52, max 8 players — the smallest map that fits the
# user's "huge map that 8 COULD play on" intent. Our own "huge" (128×80,
# 12-player) is stretch-goal; switch to MAP_SIZE=huge once POD's
# MAX_PLAYERS=4 limit is lifted and the game supports >8 AI slots.
: "${MAP_SIZE:=standard}"
: "${NUM_PLAYERS:=5}"

# TURN_LIMIT feeds the arithmetic below, so reject anything non-numeric first.
require_positive_int TURN_LIMIT

# p1-22: bound MCTS per-decision wall-clock cost. 2000 ms caps each AI
# decision. Empirically (cycle 57, 2026-05-07): 5-player MCTS on a standard
# map runs ~34s/turn wall-clock, so T=300 needs ~10,200s + 25% buffer ≈ 12,750s.
# autoplay-batch.sh's default formula (TURN_LIMIT * 3 + 300 = 1200s for T=300)
# is calibrated for 2-player smoke — it is far too short here and killed all
# 10 cycle-57 games at T32-41 (exit code 124). We set SAFETY_TIMEOUT_OVERRIDE
# to TURN_LIMIT * 45 + 600 (14,100s for T=300, ~3.9h; 23,100s, ~6.4h, at the
# default T=500) so the per-game `timeout` guard in autoplay-batch.sh is
# appropriate for 5-clan huge-map runs.
# This value can be overridden via env if needed.
: "${MCTS_DECISION_BUDGET_MS:=2000}"

# Per-game safety timeout for autoplay-batch.sh (seconds).
# Formula: TURN_LIMIT * 45 + 600 (empirically derived — see comment above).
: "${SAFETY_TIMEOUT_OVERRIDE:=$((TURN_LIMIT * 45 + 600))}"
export SAFETY_TIMEOUT_OVERRIDE
#######################################
# Print the header comment block of a script as help text.
# Skips a leading shebang line, then emits each following `# text` / `#`
# line with the `# ` prefix removed, stopping at the first line that is not
# such a comment. Comments further down the file are not printed.
# Arguments: $1 - script path (optional; defaults to $0)
# Outputs:   help text on stdout
#######################################
print_usage() {
  local script=${1:-$0}
  awk '
    NR == 1 && /^#!/ { next }
    /^#( |$)/        { sub(/^# ?/, ""); print; next }
                     { exit }
  ' "$script"
}

# The only accepted arguments are --help / -h; all tuning is done through
# environment variables. Anything else is a usage error (exit 2).
for arg in "$@"; do
  case "$arg" in
    --help | -h)
      print_usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      exit 2
      ;;
  esac
done
# Resolve the repository root as the parent of this script's directory.
# CDPATH is cleared for the `cd` so it cannot echo a path into the captured
# output. The script does not run with `set -e`, so failures are checked
# explicitly: an empty REPO_ROOT would otherwise point the run at `/.local/...`.
REPO_ROOT="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" || {
  echo "huge-map-5clan: cannot resolve repository root from ${BASH_SOURCE[0]}" >&2
  exit 1
}
STAMP="$(date +%Y%m%d_%H%M%S)"

# HUGE_OUTPUT overrides the output dir (used by apricot-run.sh).
PARENT="${HUGE_OUTPUT:-$REPO_ROOT/.local/iter/huge-map-5clan-$STAMP}"
mkdir -p -- "$PARENT" || {
  echo "huge-map-5clan: cannot create output directory: $PARENT" >&2
  exit 1
}
# p1-27a — start the warm MCTS service ahead of the run so per-AI-turn
# wall-clock benefits from GPU init + warm cache amortisation. `services:up`
# is idempotent, so calling it while the service already runs is safe.
# MCTS_SOCKET_PATH is exported so the in-process gdext bridge
# (api-gdext/src/ai.rs) prefers the warm socket over its in-process fallback.
# Telemetry defaults to $PARENT/mcts-service.jsonl, next to the autoplay logs.
MCTS_SOCKET_PATH="${MCTS_SOCKET_PATH:-/tmp/mc-mcts.sock}"
MCTS_TELEMETRY_PATH="${MCTS_TELEMETRY_PATH:-$PARENT/mcts-service.jsonl}"
export MCTS_SOCKET_PATH MCTS_TELEMETRY_PATH

# SKIP_SERVICE_UP=1 bypasses the start-up; a failed start is only a warning
# and the run carries on without the warm service.
if [[ "${SKIP_SERVICE_UP:-0}" != "1" ]]; then
  if ! bash "$REPO_ROOT/tools/run-services.sh" services:up; then
    printf '%b\n' "${YELLOW}WARN: services:up failed — continuing without warm MCTS service.${NC}" >&2
  fi
fi
#######################################
# Advisory preflight: report whether the most recent matchup-grid run under
# ROOT/.local/iter/ has a passing verdict and is at most 30 days old.
# Never fails the run; it only prints status or warnings.
# Globals:   GREEN, YELLOW, DIM, NC (read)
# Arguments: $1 - repository root
# Outputs:   status / warning lines on stdout
# Returns:   0 always
#######################################
check_matchup_prereq() {
  local root=$1
  local latest="" candidate verdict pass

  # Pick the most recently modified matchup-grid-* directory. An unmatched
  # glob stays literal and is skipped by the -d test.
  for candidate in "$root"/.local/iter/matchup-grid-*/; do
    [[ -d "$candidate" ]] || continue
    candidate=${candidate%/}
    if [[ -z "$latest" || "$candidate" -nt "$latest" ]]; then
      latest=$candidate
    fi
  done

  if [[ -z "$latest" ]]; then
    echo -e "${YELLOW}WARN: no matchup-grid run found.${NC}"
    echo -e "${DIM}Per project owner, 1v1 matchup balance should pass before running the ultimate test.${NC}"
    echo -e "${DIM}Run: tools/matchup-grid.sh${NC}"
    echo ""
    return 0
  fi

  # `find -prune -mtime +30` prints the directory itself only when its
  # modification time is more than 30 days in the past.
  if [[ -n "$(find "$latest" -prune -mtime +30 2>/dev/null)" ]]; then
    echo -e "${YELLOW}WARN: most recent matchup-grid run is more than 30 days old.${NC}"
    printf '%b%s%b\n' "$DIM" "$latest" "$NC"
  fi

  verdict="$latest/verdict.json"
  if [[ ! -f "$verdict" ]] || ! command -v python3 >/dev/null; then
    echo -e "${YELLOW}WARN: cannot verify matchup-grid verdict (verdict.json or python3 missing).${NC}"
    printf '%b%s%b\n' "$DIM" "$verdict" "$NC"
    return 0
  fi

  # The path is passed as argv rather than spliced into the Python source, so
  # quotes or backslashes in it cannot break the snippet. Any read or parse
  # error counts as "not passing".
  pass=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("pass", False))' \
    "$verdict" 2>/dev/null) || pass=False

  if [[ "$pass" == "True" ]]; then
    printf '%b (%s)\n' "${GREEN}prereq: matchup-grid verdict PASS${NC}" "$latest"
  else
    echo -e "${YELLOW}WARN: most recent matchup-grid verdict is NOT passing.${NC}"
    printf '%b%s%b\n' "$DIM" "$verdict" "$NC"
  fi
  return 0
}

# Preflight: look for a passing matchup-grid run from the last 30 days.
check_matchup_prereq "${REPO_ROOT}"
# Banner: summarise the run configuration and where output will land.
echo -e "${BLUE}huge-map-5clan (ultimate stress)${NC} — ${SEEDS} seeds × T${TURN_LIMIT} × ${NUM_PLAYERS} AI on ${MAP_SIZE} map"
echo -e "${DIM}parent: $PARENT${NC}"

# Start the completion marker empty; key=value lines are appended as each
# stage finishes.
MARKER="$PARENT/completion.marker"
: > "$MARKER"

# Pin all 5 slots to the 5 canonical clans so meta.player_clans is fully
# populated for the ultimate_stress verdict. Without per-slot pinning, slot 0
# (the human slot) gets empty clan_id and its wins can't be attributed.
# Seed-driven map variation still drives strategic divergence.
# All batch output (stdout and stderr) is captured in $PARENT/batch.log.
MAP_SIZE="$MAP_SIZE" \
  NUM_PLAYERS="$NUM_PLAYERS" \
  PARALLEL="$PARALLEL" \
  MCTS_DECISION_BUDGET_MS="${MCTS_DECISION_BUDGET_MS:-2000}" \
  SAFETY_TIMEOUT_OVERRIDE="${SAFETY_TIMEOUT_OVERRIDE:-}" \
  AI_USE_MCTS=true \
  AI_PIN_PERSONALITY_P0=ironhold \
  AI_PIN_PERSONALITY_P1=blackhammer \
  AI_PIN_PERSONALITY_P2=goldvein \
  AI_PIN_PERSONALITY_P3=deepforge \
  AI_PIN_PERSONALITY_P4=runesmith \
  bash "$REPO_ROOT/tools/autoplay-batch.sh" "$SEEDS" "$TURN_LIMIT" "$PARENT" \
  > "$PARENT/batch.log" 2>&1
batch_rc=$?
printf 'batch_exit=%d\n' "$batch_rc" >> "$MARKER"

# A failed batch does not stop the script, but it is surfaced on the console
# as well as in the marker so the operator knows where to look.
if ((batch_rc != 0)); then
  echo -e "${YELLOW}WARN: autoplay-batch exited with status ${batch_rc}.${NC}" >&2
  echo -e "${DIM}see: $PARENT/batch.log${NC}" >&2
fi
# Run the ultimate_stress gate over the batch output. Its stdout becomes
# verdict.json and its stderr is kept in gate.stderr.
echo -e "${BLUE}computing ultimate_stress verdict…${NC}"
python3 "$REPO_ROOT/tools/checklist-report.py" ultimate_stress "$PARENT" \
  > "$PARENT/verdict.json" 2> "$PARENT/gate.stderr"
gate_rc=$?

# Finish the completion marker: gate status, UTC finish time, output dir.
{
  printf 'gate_exit=%d\n' "$gate_rc"
  printf 'finished=%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf 'parent=%s\n' "$PARENT"
} >> "$MARKER"

if ((gate_rc == 0)); then
  echo -e "${GREEN}ultimate_stress: PASS${NC}"
else
  echo -e "${RED}ultimate_stress: FAIL${NC} (gate_exit=$gate_rc)"
  echo -e "${DIM}see: $PARENT/verdict.json${NC}"
  # If the gate wrote anything to stderr (e.g. it crashed or python3 is
  # missing), point at that file too; verdict.json may be empty in that case.
  if [[ -s "$PARENT/gate.stderr" ]]; then
    echo -e "${DIM}see also: $PARENT/gate.stderr${NC}"
  fi
fi

# The script's exit status is the gate's exit status.
exit "$gate_rc"