magicciv/tools/determinism-audit.sh

199 lines
8.3 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# determinism-audit.sh — Audit AI determinism across the three canonical scenarios.
#
# TODO: re-run when RUN-host environment stabilizes. As of 2026-04-17, the RUN
# host has a `class_name Diplomacy` collision preventing TurnManager from
# compiling, so all autoplay games exit in_progress around turn 61. No
# determinism signal can be extracted until that's resolved. See task #5 /
# blocker thread.
#
# Scenarios (p0-20 / task T3):
# 1. CPU -> CPU: same seed twice with AI_GPU_ROLLOUT=false. Diff must be empty
# (modulo timing/metadata allowlist).
# 2. CPU -> GPU: same seed on AI_GPU_ROLLOUT=false and =true. Integer fields
# must match byte-for-byte; scalar floats within 1e-4.
# 3. Parallel batch: PARALLEL=SEED_COUNT (minimum 2) twice — per-seed dirs identical (modulo timing).
# (Process-restart determinism is OUT OF SCOPE here — owned by p1-09 autosave.
# Do NOT add to this audit. See .project/objectives/p1-09-*.md when that lands.)
#
# Runs ON the RUN host via SSH (requires AUTOPLAY_HOST / PROJECT_ROOT_REMOTE).
# Writes report to .local/iter/determinism-audit-<timestamp>/summary.md on EDIT host.
#
# Exit codes:
# 0 — all enabled scenarios passed
# 1 — one or more scenarios failed
# 2 — usage / env error
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Required environment. Checked explicitly instead of with `${VAR:?...}`
# because that expansion does not exit with the status 2 this script
# documents for usage / env errors, which would make a missing variable
# indistinguishable from a scenario failure (exit 1).
if [ -z "${AUTOPLAY_HOST:-}" ]; then
  echo "$0: AUTOPLAY_HOST must be set (e.g. lilith@apricot.lan)" >&2
  exit 2
fi
if [ -z "${PROJECT_ROOT_REMOTE:-}" ]; then
  echo "$0: PROJECT_ROOT_REMOTE must be set (repo path on RUN host)" >&2
  exit 2
fi

# Scenario toggles (env-overridable so we can run scenarios individually)
RUN_CPU_CPU="${RUN_CPU_CPU:-true}"
RUN_CPU_GPU="${RUN_CPU_GPU:-true}"
RUN_PARALLEL="${RUN_PARALLEL:-true}"
SEED_COUNT="${SEED_COUNT:-3}"
TURN_LIMIT="${TURN_LIMIT:-150}"
FLOAT_TOL="${FLOAT_TOL:-0.0001}"

# SEED_COUNT and TURN_LIMIT are spliced unquoted into the remote command line,
# and SEED_COUNT also feeds an integer comparison, so reject anything that is
# not a plain non-negative integer before any work is started.
for _numeric_var in SEED_COUNT TURN_LIMIT; do
  case "${!_numeric_var}" in
    '' | *[!0-9]*)
      echo "$0: $_numeric_var must be a non-negative integer (got '${!_numeric_var}')" >&2
      exit 2
      ;;
  esac
done
unset _numeric_var

# One timestamped audit directory per invocation, mirrored on both hosts.
STAMP="$(date +%Y%m%d_%H%M%S)"
AUDIT_DIR_LOCAL="$PROJECT_DIR/.local/iter/determinism-audit-$STAMP"
AUDIT_DIR_REMOTE="$PROJECT_ROOT_REMOTE/.local/iter/determinism-audit-$STAMP"
SUMMARY="$AUDIT_DIR_LOCAL/summary.md"
mkdir -p "$AUDIT_DIR_LOCAL"

# Names of failed scenarios, appended to by _record.
FAILURES=()

# Start the markdown report with the run parameters.
{
  echo "# Determinism Audit — $STAMP"
  echo ""
  echo "- Host: \`$AUTOPLAY_HOST\`"
  echo "- Seeds: $SEED_COUNT, turn_limit: $TURN_LIMIT, float tolerance: $FLOAT_TOL"
  echo ""
} > "$SUMMARY"
_batch_remote() {
  # Run one autoplay batch on the RUN host over SSH.
  #   $1  remote results directory (created if missing; batch.log lands here)
  #   $2  parallelism, passed to the batch script as PARALLEL
  #   $3  extra environment assignments prefixed to the batch command
  # Reads AUTOPLAY_HOST, PROJECT_ROOT_REMOTE, SEED_COUNT and TURN_LIMIT.
  # Returns the exit status of ssh.
  local results_dir="$1" jobs="$2" env_prefix="$3"
  local batch_script="$PROJECT_ROOT_REMOTE/tools/autoplay-batch.sh"
  local remote_cmd

  # Everything below is expanded locally; the remote shell only sees the
  # final text, with paths wrapped in single quotes.
  remote_cmd="
mkdir -p '$results_dir'
$env_prefix PARALLEL=$jobs bash '$batch_script' $SEED_COUNT $TURN_LIMIT '$results_dir' > '$results_dir/batch.log' 2>&1
"
  ssh "$AUTOPLAY_HOST" "$remote_cmd"
}
_diff_remote() {
  # Recursively diff two result directories on the RUN host.
  #   $1  first directory
  #   $2  second directory
  #   $3  remote path that receives the diff output (stdout and stderr)
  # game.log, batch.log, weston.log and meta.json are excluded: they carry
  # wall-clock/PID/timestamp data that legitimately varies between runs, while
  # turn_stats.jsonl and events.jsonl are the deterministic signals.
  local left="$1" right="$2" report="$3"
  local skip_flags="" name

  for name in game.log batch.log weston.log meta.json; do
    skip_flags+="--exclude='$name' "
  done

  # `|| true` on the remote side: diff's non-zero status only means the trees
  # differ; the verdict is taken from whether the output file is empty.
  ssh "$AUTOPLAY_HOST" "
diff -r ${skip_flags}'$left' '$right' > '$report' 2>&1 || true
"
}
_fetch_remote_file() {
  # Copy a single remote text file back to the EDIT host.
  #   $1  path of the file on the RUN host
  #   $2  destination path on the EDIT host
  # Always returns 0, so a transfer failure never aborts the audit under
  # `set -e`.
  #
  # If the copy fails, a one-line error marker is written to $2 instead of
  # leaving it absent. Callers judge diffs with `[ -s "$2" ]`; a missing file
  # would look like an empty diff and be recorded as a false PASS.
  local remote_path="$1" local_path="$2"

  if ! scp "$AUTOPLAY_HOST:$remote_path" "$local_path" >/dev/null 2>&1; then
    # Best effort: a failed marker write is ignored to keep the never-fail
    # contract callers rely on.
    printf 'ERROR: could not fetch %s from %s\n' \
      "$remote_path" "$AUTOPLAY_HOST" > "$local_path" || true
  fi
  return 0
}
_record() {
  # Append one scenario section to the markdown summary and track failures.
  #   $1  scenario name (used as the section heading)
  #   $2  PASS|FAIL
  #   $3  detail text (may span several lines)
  # Appends to the file named by SUMMARY; on FAIL the scenario name is also
  # appended to the FAILURES array.
  local scenario="$1" verdict="$2" detail="$3"

  # The name and verdict are separated explicitly; concatenating them directly
  # yields headings such as "...CPU → CPUFAIL".
  {
    printf '## %s — %s\n' "$scenario" "$verdict"
    printf '\n'
    printf '%s\n' "$detail"
    printf '\n'
  } >> "$SUMMARY"

  if [ "$verdict" = "FAIL" ]; then
    FAILURES+=("$scenario")
  fi
}
# ── Scenario 1: CPU → CPU (serial, PARALLEL=1) ──────────────────────────────
# Two serial CPU-only batches with identical settings are diffed against each
# other. Running with PARALLEL=1 keeps parallel dispatch out of the picture so
# only RNG determinism is exercised; Scenario 3 covers parallel dispatch.
if [ "$RUN_CPU_CPU" = "true" ]; then
  echo "Running Scenario 1: CPU → CPU (serial)..."

  for s1_run in run1 run2; do
    _batch_remote "$AUDIT_DIR_REMOTE/cpu-cpu-$s1_run" 1 "AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
  done

  _diff_remote \
    "$AUDIT_DIR_REMOTE/cpu-cpu-run1" \
    "$AUDIT_DIR_REMOTE/cpu-cpu-run2" \
    "$AUDIT_DIR_REMOTE/cpu-cpu.diff"
  _fetch_remote_file "$AUDIT_DIR_REMOTE/cpu-cpu.diff" "$AUDIT_DIR_LOCAL/cpu-cpu.diff"

  # An empty (or absent) local diff counts as a pass; any content is a failure.
  if [ ! -s "$AUDIT_DIR_LOCAL/cpu-cpu.diff" ]; then
    _record "Scenario 1 — CPU → CPU" "PASS" "Empty diff across $SEED_COUNT seed(s) (serial)."
  else
    _record "Scenario 1 — CPU → CPU" "FAIL" \
      "Non-empty diff — see \`cpu-cpu.diff\`. First lines:
\`\`\`
$(head -20 "$AUDIT_DIR_LOCAL/cpu-cpu.diff")
\`\`\`"
  fi
fi
# ── Scenario 2: CPU → GPU (integer-byte-equal + float tolerance) ────────────
# One batch runs with AI_GPU_ROLLOUT=false and one with =true. The comparison
# itself is delegated to determinism-compare.py on the RUN host, which is given
# both result directories and --float-tol; its exit status decides PASS/FAIL.
# Integer fields (pop, gold, winner_id, etc.) are expected to match
# byte-for-byte, scalar floats within FLOAT_TOL.
if [ "$RUN_CPU_GPU" = "true" ]; then
  echo "Running Scenario 2: CPU → GPU..."

  _batch_remote "$AUDIT_DIR_REMOTE/cpu-gpu-cpu" "$SEED_COUNT" "AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
  _batch_remote "$AUDIT_DIR_REMOTE/cpu-gpu-gpu" "$SEED_COUNT" "AI_USE_MCTS=true AI_GPU_ROLLOUT=true"

  # Capture the comparator's status without tripping errexit.
  cpu_gpu_status=0
  ssh "$AUTOPLAY_HOST" "python3 '$PROJECT_ROOT_REMOTE/tools/determinism-compare.py' '$AUDIT_DIR_REMOTE/cpu-gpu-cpu' '$AUDIT_DIR_REMOTE/cpu-gpu-gpu' --float-tol $FLOAT_TOL > '$AUDIT_DIR_REMOTE/cpu-gpu.report' 2>&1" \
    || cpu_gpu_status=$?

  _fetch_remote_file "$AUDIT_DIR_REMOTE/cpu-gpu.report" "$AUDIT_DIR_LOCAL/cpu-gpu.report"

  if [ "$cpu_gpu_status" -eq 0 ]; then
    _record "Scenario 2 — CPU → GPU" "PASS" \
      "Integer fields byte-equal; floats within $FLOAT_TOL across $SEED_COUNT seeds."
  else
    # Quote the top of the report when it was fetched; otherwise point at it.
    if [ -s "$AUDIT_DIR_LOCAL/cpu-gpu.report" ]; then
      report_snippet="$(head -10 "$AUDIT_DIR_LOCAL/cpu-gpu.report")"
    else
      report_snippet="(see cpu-gpu.report)"
    fi
    _record "Scenario 2 — CPU → GPU" "FAIL" \
      "Parity check failed (exit $cpu_gpu_status):
\`\`\`
$report_snippet
\`\`\`"
  fi
fi
# ── Scenario 3: Parallel batch determinism ──────────────────────────────────
# The same CPU-only batch is run twice with PARALLEL set to SEED_COUNT (raised
# to 2 when SEED_COUNT is smaller) and the two result trees are diffed. If
# batch output is order-sensitive, seed ranges or RNG state-leakage will
# produce divergent turn_stats.jsonl across the two runs.
if [ "$RUN_PARALLEL" = "true" ]; then
  par=$SEED_COUNT
  if [ "$par" -lt 2 ]; then
    par=2
  fi
  echo "Running Scenario 3: Parallel batch (PARALLEL=$par)..."

  for s3_run in run1 run2; do
    _batch_remote "$AUDIT_DIR_REMOTE/parallel-$s3_run" "$par" "AI_USE_MCTS=true AI_GPU_ROLLOUT=false"
  done

  _diff_remote \
    "$AUDIT_DIR_REMOTE/parallel-run1" \
    "$AUDIT_DIR_REMOTE/parallel-run2" \
    "$AUDIT_DIR_REMOTE/parallel.diff"
  _fetch_remote_file "$AUDIT_DIR_REMOTE/parallel.diff" "$AUDIT_DIR_LOCAL/parallel.diff"

  # An empty (or absent) local diff counts as a pass; any content is a failure.
  if [ ! -s "$AUDIT_DIR_LOCAL/parallel.diff" ]; then
    _record "Scenario 3 — Parallel (PARALLEL=$par)" "PASS" \
      "Parallel dispatch is deterministic across $SEED_COUNT seeds."
  else
    _record "Scenario 3 — Parallel (PARALLEL=$par)" "FAIL" \
      "Parallel batch diverged between runs — order-of-dispatch RNG leak? First lines:
\`\`\`
$(head -20 "$AUDIT_DIR_LOCAL/parallel.diff")
\`\`\`"
  fi
fi
# (Process-restart determinism lives in p1-09 / autosave — do not add here.)
# ── Summary ──────────────────────────────────────────────────────────────────
# Close the report with a horizontal rule and the overall verdict, then exit
# 0 when no scenario was recorded as FAIL and 1 otherwise.
{
  echo ""
  echo "---"
  echo ""
} >> "$SUMMARY"

if [ "${#FAILURES[@]}" -gt 0 ]; then
  echo "**Result: FAIL — ${#FAILURES[@]} scenario(s): ${FAILURES[*]}**" >> "$SUMMARY"
  echo "Determinism audit FAILED — report: $SUMMARY" >&2
  exit 1
fi

echo "**Result: ALL PASS**" >> "$SUMMARY"
echo "Determinism audit PASSED — report: $SUMMARY"
exit 0