296 lines
16 KiB
Bash
Executable file
296 lines
16 KiB
Bash
Executable file
#!/usr/bin/env bash
|
||
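# apricot-run.sh — Isolated build + batch pipeline on apricot.
#
# apricot is a multi-tenant RUN host. We do NOT touch ~/Code/@projects/@magic-civilization
# on apricot — that's another developer's workspace. Instead:
#
#   1. Rsync this EDIT-host source tree to $HOME/.cache/mc-src-<stamp>/ on apricot
#      (not /tmp: the flatpak sandbox cannot see /tmp).
#   2. Build (cargo) inside that scratch tree; build output stays there.
#   3. Run the batch with RESULTS_DIR under $HOME/.cache/mc-batches/<stamp>/
#      (persistent, XDG cache convention, flatpak-visible via --filesystem=home).
#   4. Fetch verdict JSON back to EDIT host for review.
#
# Usage:
#   scripts/apricot-run.sh smoke [seeds=10] [turns=300]
#   scripts/apricot-run.sh clan <clan_id> [seeds=10] [turns=300]
#   scripts/apricot-run.sh clan-priors <clan_id> [seeds=10] [turns=300]
#   scripts/apricot-run.sh difficulty <easy|normal|hard|insane> [seeds=10] [turns=300]
#   scripts/apricot-run.sh difficulty-asym <p0-tier> <p1-tier> [seeds=10] [turns=300]
#   scripts/apricot-run.sh gpu-walltime [seeds=10] [turns=300]
#   scripts/apricot-run.sh matchup-grid [seeds_per_pair=5] [turns=300]
#   scripts/apricot-run.sh huge-map-5clan [seeds=5] [turns=300]
#
# Environment:
#   APRICOT_SSH_ALIAS — ssh alias for the RUN host (default: apricot).
#   STAMP             — override the timestamp (for reproducing a specific run).
#   PARALLEL          — force the number of concurrent instances.
#   USE_MAX_CORES     — "true" → PARALLEL = min(seeds, nproc on the RUN host).
#   MIN_CORES         — PARALLEL when neither of the above applies (default: 4).
#   RAYON_NUM_THREADS — force the per-instance thread count.
#   AI_GPU_ROLLOUT    — opt in to GPU rollouts (default: false).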
|
||
|
||
set -euo pipefail

# This script lives one directory below the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# RUN-host ssh alias and run identifier; both can be overridden from the
# caller's environment.
APRICOT="${APRICOT_SSH_ALIAS:-apricot}"
STAMP="${STAMP:-$(date +%Y%m%d_%H%M%S)}"

# ── Pull in .env, then .env.local, with auto-export enabled so that
# USE_MAX_CORES / MIN_CORES / AI_GPU_ROLLOUT reach the resource policy
# further down. .env.local is sourced last and therefore wins.
for env_name in .env .env.local; do
  env_path="${PROJECT_DIR}/${env_name}"
  [[ -f "${env_path}" ]] || continue
  set -a
  source "${env_path}"
  set +a
done
|
||
|
||
# MODE + positional args are resolved early so the resource-policy code can
# peek at the seed count, whose position differs per mode (for `clan` it is
# $2 because $1 is the clan_id; for smoke/gpu-walltime it is $1).
MODE="${1:?usage: apricot-run.sh <smoke|clan|clan-priors|difficulty|difficulty-asym|gpu-walltime|matchup-grid|huge-map-5clan> [args]}"
shift

# Fail fast on an unsupported mode. Without this check a typo is only
# reported by the mode dispatch, i.e. after the rsync, the remote build and
# the editor pre-pass have already run. Message and exit code match the
# dispatch's own catch-all arm.
case "${MODE}" in
  smoke | clan | clan-priors | difficulty | difficulty-asym | gpu-walltime | matchup-grid | huge-map-5clan) ;;
  *)
    echo "ERROR: unknown mode '${MODE}'" >&2
    exit 2
    ;;
esac
|
||
|
||
# ── Resource policy for PARALLEL + RAYON_NUM_THREADS ─────────────────
# Rationale (from the original notes): every Godot instance starts its own
# rayon pool for MCTS rollouts, and rayon sizes that pool to nproc unless
# RAYON_NUM_THREADS is set. PARALLEL instances that each grab nproc threads
# end up as PARALLEL*nproc threads contending for nproc cores. The policy is
# therefore: one instance per seed, and nproc / PARALLEL threads for each.

# Where the seed count sits in the remaining positional args, per mode.
case "${MODE}" in
  clan | clan-priors | difficulty)
    # $1 is the clan_id / tier, $2 is seeds
    _seed_count_peek="${2:-10}"
    ;;
  difficulty-asym)
    # $1 p0 tier, $2 p1 tier, $3 seeds
    _seed_count_peek="${3:-10}"
    ;;
  matchup-grid | huge-map-5clan)
    # $1 is seeds (per pair for matchup-grid); default 5
    _seed_count_peek="${1:-5}"
    ;;
  *)
    # smoke, gpu-walltime
    _seed_count_peek="${1:-10}"
    ;;
esac

# Core count of the RUN host; falls back to 8 when the ssh probe fails.
NPROC="$(ssh "${APRICOT}" nproc 2>/dev/null || echo 8)"

# Start from the conservative default, then let the stronger sources win:
# an explicit PARALLEL first, USE_MAX_CORES=true second.
PARALLEL_EFFECTIVE="${MIN_CORES:-4}"
PARALLEL_SOURCE="MIN_CORES default"
if [[ -n "${PARALLEL:-}" ]]; then
  PARALLEL_EFFECTIVE="${PARALLEL}"
  PARALLEL_SOURCE="env override"
elif [[ "${USE_MAX_CORES:-false}" == "true" ]]; then
  # One instance per seed, capped at NPROC — anything beyond that would
  # only queue behind the running instances.
  PARALLEL_EFFECTIVE="$(( _seed_count_peek < NPROC ? _seed_count_peek : NPROC ))"
  PARALLEL_SOURCE="USE_MAX_CORES=true → min(seeds=${_seed_count_peek}, nproc=${NPROC})"
fi
export PARALLEL="${PARALLEL_EFFECTIVE}"
|
||
|
||
# RAYON_NUM_THREADS per instance = fair share of the RUN host's cores,
# unless the caller already pinned a value in the environment.
if [[ -z "${RAYON_NUM_THREADS:-}" ]]; then
  RAYON_NUM_THREADS=1
  if [[ "${PARALLEL_EFFECTIVE}" -gt 0 ]]; then
    # Integer division: may yield 0 when PARALLEL exceeds NPROC.
    RAYON_NUM_THREADS="$(( NPROC / PARALLEL_EFFECTIVE ))"
  fi
  # Never hand an instance fewer than one thread.
  if [[ "${RAYON_NUM_THREADS}" -lt 1 ]]; then
    RAYON_NUM_THREADS=1
  fi
  RAYON_SOURCE="nproc(${NPROC}) / PARALLEL(${PARALLEL_EFFECTIVE})"
else
  RAYON_SOURCE="env override"
fi
export RAYON_NUM_THREADS
|
||
# Source + build scratch lives under $HOME/.cache on the RUN host
# (flatpak-visible via --filesystem=home). Per the original notes, /tmp was
# tried first but flatpak's sandbox can't see /tmp, so Godot rejected the
# --path argument with "Invalid project path". $HOME/.cache still satisfies
# the apricot-isolation rule (not under ~/Code, not shared with other devs)
# and is convention-cleanable.
SCRATCH="\$HOME/.cache/mc-src-${STAMP}"     # literal $HOME; expands on apricot
RESULTS="\$HOME/.cache/mc-batches/${STAMP}" # literal $HOME; expands on apricot

# STAMP ends up inside remote shell commands and inside the target of an
# `rsync --delete`, so refuse anything that is not a plain path component.
if [[ ! "${STAMP}" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "ERROR: STAMP must match [A-Za-z0-9._-]+ (got: '${STAMP}')" >&2
  exit 1
fi

# Resolve $HOME on apricot once so SCRATCH_ABS / RESULTS_ABS are
# fully-qualified paths on that host. Reject output that is not a single
# absolute path (e.g. when a remote rc file prints to stdout) before it can
# become a destructive rsync target.
REMOTE_HOME="$(ssh "${APRICOT}" 'printf "%s\n" "$HOME"')"
if [[ "${REMOTE_HOME}" != /* || "${REMOTE_HOME}" == *$'\n'* ]]; then
  echo "ERROR: could not resolve \$HOME on ${APRICOT} (got: '${REMOTE_HOME}')" >&2
  exit 1
fi
SCRATCH_ABS="${REMOTE_HOME}/.cache/mc-src-${STAMP}"
RESULTS_ABS="${REMOTE_HOME}/.cache/mc-batches/${STAMP}"
|
||
|
||
# Run banner: echoes the resolved configuration before any heavy work starts.
echo "============================================================"
echo "apricot-run.sh mode=${MODE} stamp=${STAMP}"
echo " EDIT host: $(hostname)"
echo " RUN host: ${APRICOT}"
echo " SCRATCH: ${SCRATCH_ABS} (per-run source + build scratch)"
echo " RESULTS: ${RESULTS_ABS} (persistent batch output)"
echo " PARALLEL: ${PARALLEL_EFFECTIVE} (source: ${PARALLEL_SOURCE})"
echo " RAYON_NUM_THREADS/instance: ${RAYON_NUM_THREADS} (source: ${RAYON_SOURCE})"
echo " Total CPU saturation: ${PARALLEL_EFFECTIVE} × ${RAYON_NUM_THREADS} = $((PARALLEL_EFFECTIVE * RAYON_NUM_THREADS))/${NPROC} cores"
# Every mode that reads AI_GPU_ROLLOUT falls back to false when it is unset,
# so the banner must report "false" here, not "true".
echo " AI_GPU_ROLLOUT: ${AI_GPU_ROLLOUT:-false (default off; opt in via env, gpu-walltime runs both)}"
echo "============================================================"
|
||
|
||
# ── Step 1: rsync EDIT → SCRATCH ─────────────────────────────────────────────
# Mirror the working tree into the per-run scratch directory, leaving out
# VCS metadata, local build/iteration output, dependency caches and macOS
# binaries.
echo "[$(date +%H:%M:%S)] rsync EDIT source → ${SCRATCH_ABS}..."
RSYNC_EXCLUDES=(
  '.git'
  '.local/build'
  '.local/iter'
  '.local/batches'
  'node_modules'
  'target'
  '*.dylib'
)
RSYNC_ARGS=(-a --delete)
for excluded in "${RSYNC_EXCLUDES[@]}"; do
  RSYNC_ARGS+=("--exclude=${excluded}")
done
rsync "${RSYNC_ARGS[@]}" "${PROJECT_DIR}/" "${APRICOT}:${SCRATCH_ABS}/"
|
||
|
||
# ── Step 2: build + deploy via build-gdext.sh ────────────────────────────────
# Per the original notes, build-gdext.sh is the canonical build script: it
# runs `cargo build --release --target x86_64-unknown-linux-gnu` and copies
# .local/build/rust/$TARGET/release/libmagic_civ_physics_gdext.so to
# src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.x86_64.so,
# the name Godot's .gdextension file expects.
BUILD_TARGET="x86_64-unknown-linux-gnu"
REMOTE_ADDON_DIR="${SCRATCH_ABS}/src/game/engine/addons/magic_civ_physics"

echo "[$(date +%H:%M:%S)] build-gdext.sh ${BUILD_TARGET} (in SCRATCH)..."
# The build log is deliberately NOT piped through `| tail -N`: the original
# notes record that a tail-masked build failure let a stale April-16 .so run
# for 2 hours on 2026-04-17 with no visible warning. Let the full log through
# so ssh reports build-gdext.sh's own exit status.
ssh "${APRICOT}" "set -euo pipefail; cd '${SCRATCH_ABS}/src/simulator' && bash build-gdext.sh ${BUILD_TARGET}"

# Belt and braces: remove any macOS .dylib from the addon directory so
# flatpak Godot on Linux never sees a macOS binary, even though the
# .gdextension config only points at it for macos.* target strings.
ssh "${APRICOT}" "rm -f '${REMOTE_ADDON_DIR}/libmagic_civ_physics.dylib'"
|
||
|
||
# ── Step 3: populate .godot/global_script_class_cache.cfg via editor pre-pass ──
# A fresh scratch tree has an empty .godot/ cache. Per the original notes,
# Godot's class_name resolution (the `as Weather` / `as SplitPanelContainer`
# idiom) needs that cache to be written by an editor-mode scan; without it
# headless autoplay cascades through "Could not find type X" →
# "Compilation failed" → turn_manager.gd fails to load.
#
# `--editor --quit` opens, imports, scans class_names, writes the cache and
# exits. Afterwards we check that the cache file exists and is non-empty and
# print a ✓ / ⚠ line accordingly; a missing cache is only a warning.
REMOTE_CLASS_CACHE="${SCRATCH_ABS}/src/game/.godot/global_script_class_cache.cfg"

echo "[$(date +%H:%M:%S)] editor pre-pass to populate .godot/ class cache..."
ssh "${APRICOT}" "set -euo pipefail; \
  flatpak run --user --filesystem=home --command=godot \
    org.godotengine.Godot --headless --editor --quit \
    --path '${SCRATCH_ABS}/src/game' 2>&1 | tail -5; \
  if test -s '${REMOTE_CLASS_CACHE}'; then \
    echo ' ✓ class cache populated'; \
  else \
    echo ' ⚠ class cache missing — headless autoplay may still cascade'; \
  fi"
|
||
|
||
# ── Step 4: run the batch per MODE ───────────────────────────────────────────
ssh "${APRICOT}" "mkdir -p '${RESULTS_ABS}'"

# AI_GPU_ROLLOUT defaults to false for every autoplay-batch mode except
# gpu-walltime. Per the original notes, the GPU integration (p0-20 task #10)
# is parity-verified on isolated rollouts, but enabling it in a 2-player
# smoke produced a deterministic "P0 always wins at T11-T18, P1 never founds"
# regression on 2026-04-18. Opt in via the environment; gpu-walltime sets it
# to true and then false as its explicit comparison.
GPU_ENV="AI_GPU_ROLLOUT=${AI_GPU_ROLLOUT:-false}"

#######################################
# Run tools/autoplay-batch.sh inside the scratch tree on the RUN host.
# PARALLEL and RAYON_NUM_THREADS are passed inline on the remote command
# line because variables exported locally are not forwarded by ssh.
# Globals:   APRICOT, SCRATCH_ABS, PARALLEL, RAYON_NUM_THREADS (read)
# Arguments: $1  - number of trailing log lines to print
#            $2  - absolute results directory on the RUN host
#            $3  - seed count
#            $4  - turn limit
#            $5+ - extra NAME=value assignments, already quoted for the
#                  remote shell
# Outputs:   the last $1 lines of the batch log on stdout
# Returns:   exit status of ssh (non-zero if the remote pipeline failed)
#######################################
run_autoplay_batch() {
  local tail_lines="$1" out_dir="$2" seeds="$3" turns="$4"
  shift 4
  local extra_env="$*"
  ssh "${APRICOT}" "set -euo pipefail; cd '${SCRATCH_ABS}' && \
    AI_USE_MCTS=true ${extra_env} \
    PARALLEL=${PARALLEL} RAYON_NUM_THREADS=${RAYON_NUM_THREADS} \
    bash tools/autoplay-batch.sh ${seeds} ${turns} ${out_dir} 2>&1 | tail -n ${tail_lines}"
}

case "${MODE}" in
  smoke)
    SEEDS="${1:-10}"
    TURNS="${2:-300}"
    echo "[$(date +%H:%M:%S)] smoke batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} ${GPU_ENV}"
    run_autoplay_batch 30 "${RESULTS_ABS}/smoke" "${SEEDS}" "${TURNS}" \
      "${GPU_ENV}"
    ;;
  clan)
    CLAN="${1:?usage: apricot-run.sh clan <clan_id> [seeds] [turns]}"
    SEEDS="${2:-10}"
    TURNS="${3:-300}"
    echo "[$(date +%H:%M:%S)] clan=${CLAN} batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} ${GPU_ENV}"
    run_autoplay_batch 30 "${RESULTS_ABS}/clan-${CLAN}" "${SEEDS}" "${TURNS}" \
      "AI_PIN_PERSONALITY='${CLAN}'" "${GPU_ENV}"
    ;;
  difficulty)
    DIFF_TIER="${1:?usage: apricot-run.sh difficulty <easy|normal|hard|insane> [seeds] [turns]}"
    SEEDS="${2:-10}"
    TURNS="${3:-300}"
    echo "[$(date +%H:%M:%S)] difficulty=${DIFF_TIER} batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} ${GPU_ENV}"
    # AI_DIFFICULTY_P0 + AI_DIFFICULTY_P1 apply the modifier to BOTH players
    # (including the human-slot player 0 which is_human=true). This is
    # required for symmetric Easy-vs-Easy / Hard-vs-Hard tier_peak
    # differentiation.
    run_autoplay_batch 30 "${RESULTS_ABS}/difficulty-${DIFF_TIER}" "${SEEDS}" "${TURNS}" \
      "AI_DIFFICULTY='${DIFF_TIER}'" \
      "AI_DIFFICULTY_P0='${DIFF_TIER}'" "AI_DIFFICULTY_P1='${DIFF_TIER}'" \
      "${GPU_ENV}"
    ;;
  difficulty-asym)
    P0_TIER="${1:?usage: apricot-run.sh difficulty-asym <p0-tier> <p1-tier> [seeds] [turns]}"
    P1_TIER="${2:?usage: apricot-run.sh difficulty-asym <p0-tier> <p1-tier> [seeds] [turns]}"
    SEEDS="${3:-10}"
    TURNS="${4:-300}"
    echo "[$(date +%H:%M:%S)] difficulty-asym p0=${P0_TIER} p1=${P1_TIER}: ${SEEDS} seeds T${TURNS}"
    run_autoplay_batch 30 "${RESULTS_ABS}/difficulty-asym-${P0_TIER}-vs-${P1_TIER}" \
      "${SEEDS}" "${TURNS}" \
      "AI_DIFFICULTY_P0='${P0_TIER}'" "AI_DIFFICULTY_P1='${P1_TIER}'" \
      "${GPU_ENV}"
    ;;
  clan-priors)
    # Like `clan` but with AI_MCTS_PRIORS=true — used to verify p0-38
    # tree-shape divergence across 5 clan personalities.
    CLAN="${1:?usage: apricot-run.sh clan-priors <clan_id> [seeds] [turns]}"
    SEEDS="${2:-10}"
    TURNS="${3:-300}"
    echo "[$(date +%H:%M:%S)] clan-priors=${CLAN} batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} priors=true"
    run_autoplay_batch 30 "${RESULTS_ABS}/clan-priors-${CLAN}" "${SEEDS}" "${TURNS}" \
      "AI_PIN_PERSONALITY='${CLAN}'" "AI_MCTS_PRIORS=true" "${GPU_ENV}"
    ;;
  gpu-walltime)
    SEEDS="${1:-10}"
    TURNS="${2:-300}"
    echo "[$(date +%H:%M:%S)] GPU wall-time comparison: ${SEEDS} seeds T${TURNS}"
    # Same batch twice: GPU rollouts on, then off.
    for GPU in true false; do
      echo " --- AI_GPU_ROLLOUT=${GPU} ---"
      run_autoplay_batch 10 "${RESULTS_ABS}/gpu-${GPU}" "${SEEDS}" "${TURNS}" \
        "AI_GPU_ROLLOUT=${GPU}"
    done
    ;;
  matchup-grid)
    # Run all C(5,2)=10 clan-pair matchups serially (pairs run one at a time;
    # seeds within a pair use PARALLEL concurrency). Uses the scratch-resident
    # binary so we never touch ~/Code on the RUN host.
    SEEDS_PER_PAIR="${1:-5}"
    TURNS="${2:-300}"
    REMOTE_GRID="${RESULTS_ABS}/matchup-grid"
    echo "[$(date +%H:%M:%S)] matchup-grid: ${SEEDS_PER_PAIR} seeds/pair T${TURNS} PARALLEL=${PARALLEL}"
    ssh "${APRICOT}" "set -euo pipefail; mkdir -p '${REMOTE_GRID}'; cd '${SCRATCH_ABS}' && \
      AI_USE_MCTS=true PARALLEL=${PARALLEL} RAYON_NUM_THREADS=${RAYON_NUM_THREADS} \
      COUNT=${SEEDS_PER_PAIR} TURN_LIMIT=${TURNS} \
      MATCHUP_OUTPUT='${REMOTE_GRID}' \
      bash tools/matchup-grid.sh 2>&1 | tail -40"
    ;;
  huge-map-5clan)
    SEEDS="${1:-5}"
    TURNS="${2:-300}"
    REMOTE_HUGE="${RESULTS_ABS}/huge-map-5clan"
    echo "[$(date +%H:%M:%S)] huge-map-5clan: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL}"
    ssh "${APRICOT}" "set -euo pipefail; mkdir -p '${REMOTE_HUGE}'; cd '${SCRATCH_ABS}' && \
      AI_USE_MCTS=true PARALLEL=${PARALLEL} RAYON_NUM_THREADS=${RAYON_NUM_THREADS} \
      COUNT=${SEEDS} TURN_LIMIT=${TURNS} \
      HUGE_OUTPUT='${REMOTE_HUGE}' \
      bash tools/huge-map-5clan.sh 2>&1 | tail -40"
    ;;
  *)
    echo "ERROR: unknown mode '${MODE}'" >&2
    exit 2
    ;;
esac
|
||
|
||
# ── Step 5: fetch results summary back to EDIT ───────────────────────────────
# Best-effort copy of the remote results directory. A failed copy is only a
# warning because the results stay on the RUN host, but scp's own error text
# is left visible and the warning goes to stderr so the cause can be
# diagnosed.
LOCAL_RESULTS="${PROJECT_DIR}/.local/iter/apricot-${STAMP}"
mkdir -p "${LOCAL_RESULTS}"
echo "[$(date +%H:%M:%S)] fetch verdict/summary to ${LOCAL_RESULTS}..."
if ! scp -r "${APRICOT}:${RESULTS_ABS}/" "${LOCAL_RESULTS}/"; then
  echo "WARN: scp returned non-zero; check manually on ${APRICOT}:${RESULTS_ABS}" >&2
fi
|
||
|
||
# ── Step 6: prune old local copies — keep only the 3 most recent ─────────────
ITER_ROOT="${PROJECT_DIR}/.local/iter"
if [[ -d "${ITER_ROOT}" ]]; then
  # apricot-* directories sorted newest-first by mtime; everything from the
  # fourth entry onwards is stale.
  OLD_RUNS=()
  while IFS= read -r stale_run; do
    OLD_RUNS+=("${stale_run}")
  done < <(ls -1dt "${ITER_ROOT}"/apricot-* 2>/dev/null | tail -n +4)

  if [[ "${#OLD_RUNS[@]}" -gt 0 ]]; then
    echo "[$(date +%H:%M:%S)] pruning ${#OLD_RUNS[@]} old local run(s) (keeping 3 newest)..."
    for stale_run in "${OLD_RUNS[@]}"; do
      echo " rm -rf ${stale_run}"
      rm -rf "${stale_run}"
    done
  fi
fi
|
||
|
||
# Closing summary. The scratch tree lives under $HOME/.cache on the RUN host
# (not /tmp) and this script never removes it, so the message says so.
echo "============================================================"
echo "DONE. Scratch at ${APRICOT}:${SCRATCH_ABS} (per-run scratch, .cache; safe to delete)."
echo "Results at ${APRICOT}:${RESULTS_ABS} (persistent, .cache)."
echo "Local copy at ${LOCAL_RESULTS}"
echo "============================================================"
|