magicciv/scripts/apricot-run.sh
Natalie d245ea469b feat(@projects): add parallel execution config
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 23:35:00 -07:00

188 lines
9.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# apricot-run.sh — Isolated build + batch pipeline on apricot.
#
# apricot is a multi-tenant RUN host. We do NOT touch ~/Code/@projects/@magic-civilization
# on apricot — that's another developer's workspace. Instead:
#
# 1. Rsync this EDIT-host source tree to $HOME/.cache/mc-src-<stamp>/ on apricot
#    (not /tmp — flatpak's sandbox cannot see /tmp).
# 2. Build (cargo) in that scratch tree; the target dir stays there (per-run scratch).
# 3. Run the batch with RESULTS_DIR under $HOME/.cache/mc-batches/<stamp>/
# (persistent, XDG cache convention, flatpak-visible via --filesystem=home).
# 4. Fetch verdict JSON back to EDIT host for review.
#
# Usage:
# scripts/apricot-run.sh smoke [seeds=10] [turns=300]
# scripts/apricot-run.sh clan <clan_id> [seeds=10] [turns=300]
# scripts/apricot-run.sh gpu-walltime [seeds=10] [turns=300]
#
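# Environment:
# APRICOT_SSH_ALIAS — ssh alias for the RUN host (default: apricot).
# STAMP — override the timestamp (for reproducing a specific run).
# PARALLEL — number of games to run concurrently; overrides the policy below.
# USE_MAX_CORES — "true" → use `nproc` on the RUN host when PARALLEL is unset.
# MIN_CORES — concurrency used when neither of the above applies (default: 4).
# AI_GPU_ROLLOUT — forwarded to smoke/clan batches (default: true).
# Variables may also be set in .env / .env.local at the project root, which are
# sourced after APRICOT_SSH_ALIAS and STAMP have been read.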
# Strict mode: abort on errors, on unset variables, and on any failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Where this script lives, and the project root one level above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "${SCRIPT_DIR}")"

# RUN-host ssh alias; APRICOT_SSH_ALIAS wins when set and non-empty.
if [[ -n "${APRICOT_SSH_ALIAS:-}" ]]; then
  APRICOT="${APRICOT_SSH_ALIAS}"
else
  APRICOT="apricot"
fi

# Run identifier; a caller-supplied non-empty STAMP is kept as-is.
if [[ -z "${STAMP:-}" ]]; then
  STAMP="$(date +%Y%m%d_%H%M%S)"
fi

# ── Load .env / .env.local so USE_MAX_CORES + MIN_CORES + AI_GPU_ROLLOUT
#    propagate into the resource policy. Files are sourced in order with
#    allexport on, so .env.local (sourced last) wins over .env.
for dotenv in "${PROJECT_DIR}/.env" "${PROJECT_DIR}/.env.local"; do
  [[ -f "${dotenv}" ]] || continue
  set -o allexport
  # shellcheck disable=SC1090 — path is only known at runtime
  . "${dotenv}"
  set +o allexport
done
# ── Resource policy for PARALLEL ─────────────────────────────────────
# Precedence: explicit PARALLEL env > USE_MAX_CORES=true (nproc on RUN host)
#             > MIN_CORES from .env (default 4). Games are single-core each;
#             this controls how many run concurrently.
#
# The resolved value is interpolated into remote command lines later in this
# script, so it is validated here as a plain positive integer. Anything else
# (stray rc-file output from the nproc probe, a typo in .env) is rejected or
# replaced by the MIN_CORES fallback before it can reach the RUN host.

# _is_positive_int VALUE — succeed iff VALUE is a decimal integer >= 1.
_is_positive_int() {
  [[ "${1:-}" =~ ^[1-9][0-9]*$ ]]
}

# resolve_parallel — apply the precedence rules above.
#   Reads:   PARALLEL, USE_MAX_CORES, MIN_CORES, APRICOT
#   Writes:  PARALLEL_EFFECTIVE (the count), PARALLEL_SOURCE (label for the banner)
#   Returns: 0 on success; 2 when the resolved value is not a positive integer.
resolve_parallel() {
  local fallback="${MIN_CORES:-4}"
  local detected=""

  if [[ -n "${PARALLEL:-}" ]]; then
    PARALLEL_EFFECTIVE="${PARALLEL}"
    PARALLEL_SOURCE="env override"
  elif [[ "${USE_MAX_CORES:-false}" == "true" ]]; then
    # Best-effort probe: an ssh failure or non-numeric output falls back to
    # MIN_CORES instead of aborting the run.
    detected="$(ssh "${APRICOT}" nproc 2>/dev/null || true)"
    if _is_positive_int "${detected}"; then
      PARALLEL_EFFECTIVE="${detected}"
      PARALLEL_SOURCE="USE_MAX_CORES=true → nproc"
    else
      PARALLEL_EFFECTIVE="${fallback}"
      PARALLEL_SOURCE="USE_MAX_CORES=true → nproc unavailable, MIN_CORES fallback"
    fi
  else
    PARALLEL_EFFECTIVE="${fallback}"
    PARALLEL_SOURCE="MIN_CORES default"
  fi

  if ! _is_positive_int "${PARALLEL_EFFECTIVE}"; then
    echo "ERROR: PARALLEL must be a positive integer, got '${PARALLEL_EFFECTIVE}' (source: ${PARALLEL_SOURCE})" >&2
    return 2
  fi
}

resolve_parallel || exit 2
export PARALLEL="${PARALLEL_EFFECTIVE}"
# Source + build scratch lives under $HOME/.cache on the RUN host (flatpak-visible
# via --filesystem=home). /tmp was tried first but flatpak's sandbox can't see
# /tmp, so Godot rejected the --path argument with "Invalid project path".
# $HOME/.cache/ still satisfies the apricot-isolation rule (not under ~/Code,
# not shared with other devs) and is convention-cleanable.

# ── Parse and validate arguments BEFORE any remote work ─────────────────────
# A typo in the mode (or a missing clan id) must fail here, not after the
# rsync + build steps that follow in this script.
MODE="${1:?usage: apricot-run.sh <smoke|clan|gpu-walltime> [args]}"
shift
case "${MODE}" in
  smoke | gpu-walltime) ;;
  clan)
    : "${1:?usage: apricot-run.sh clan <clan_id> [seeds] [turns]}"
    ;;
  *)
    echo "ERROR: unknown mode '${MODE}'" >&2
    exit 2
    ;;
esac

# STAMP becomes a path component on both hosts and part of remote command
# lines, so restrict it to characters that need no quoting anywhere.
if [[ ! "${STAMP}" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "ERROR: STAMP may only contain letters, digits, '.', '_' and '-', got '${STAMP}'" >&2
  exit 2
fi

# Resolve $HOME on apricot once so SCRATCH_ABS / RESULTS_ABS are fully-qualified
# paths on that host. SCRATCH_ABS is later an `rsync --delete` destination, so
# refuse anything that is not a single-line absolute path (e.g. rc-file noise).
REMOTE_HOME="$(ssh "${APRICOT}" 'printf %s "$HOME"')"
if [[ "${REMOTE_HOME}" != /* || "${REMOTE_HOME}" == *$'\n'* ]]; then
  echo "ERROR: could not resolve an absolute \$HOME on ${APRICOT} (got '${REMOTE_HOME}')" >&2
  exit 1
fi
SCRATCH_ABS="${REMOTE_HOME}/.cache/mc-src-${STAMP}"
RESULTS_ABS="${REMOTE_HOME}/.cache/mc-batches/${STAMP}"

echo "============================================================"
echo "apricot-run.sh mode=${MODE} stamp=${STAMP}"
echo " EDIT host: $(hostname)"
echo " RUN host: ${APRICOT}"
echo " SCRATCH: ${SCRATCH_ABS} (per-run source + build scratch)"
echo " RESULTS: ${RESULTS_ABS} (persistent batch output)"
echo " PARALLEL: ${PARALLEL_EFFECTIVE} (source: ${PARALLEL_SOURCE})"
echo " AI_GPU_ROLLOUT: ${AI_GPU_ROLLOUT:-true (default on for smoke/clan)}"
echo "============================================================"
# ── Step 1: rsync EDIT → SCRATCH ─────────────────────────────────────────────
# Mirror the EDIT-host tree into the scratch directory on the RUN host.
# VCS metadata, local build/iteration/batch output, node_modules, cargo target
# dirs and macOS dylibs are not transferred.
echo "[$(date +%H:%M:%S)] rsync EDIT source → ${SCRATCH_ABS}..."
rsync_skip=(
  '.git'
  '.local/build'
  '.local/iter'
  '.local/batches'
  'node_modules'
  'target'
  '*.dylib'
)
rsync_argv=(-a --delete)
for skip_pattern in "${rsync_skip[@]}"; do
  rsync_argv+=("--exclude=${skip_pattern}")
done
rsync "${rsync_argv[@]}" "${PROJECT_DIR}/" "${APRICOT}:${SCRATCH_ABS}/"
# ── Step 2: build + deploy via build-gdext.sh ────────────────────────────────
# build-gdext.sh is the canonical build script: it runs
# `cargo build --release --target x86_64-unknown-linux-gnu` AND copies
# .local/build/rust/$TARGET/release/libmagic_civ_physics_gdext.so to
# src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.x86_64.so,
# the name Godot's .gdextension file expects.
gdext_target="x86_64-unknown-linux-gnu"
echo "[$(date +%H:%M:%S)] build-gdext.sh ${gdext_target} (in SCRATCH)..."

# Do NOT append `| tail -N` to the build: ssh would report the pipe's exit
# status (0 from tail) and hide a build-gdext.sh failure. That is how a stale
# April-16 .so ran for 2 hours on 2026-04-17 while bullets 3-5 of p0-32 stayed
# ✗ with no visible warning. The full build log is passed through on purpose.
remote_build="set -euo pipefail; cd '${SCRATCH_ABS}/src/simulator' && bash build-gdext.sh ${gdext_target}"
ssh "${APRICOT}" "${remote_build}"

# Remove any macOS .dylib sitting in the addon directory: flatpak Godot on Linux
# shouldn't try to load a macOS binary, even though the .gdextension config only
# points at it for macos.* target strings.
addon_dylib="${SCRATCH_ABS}/src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.dylib"
ssh "${APRICOT}" "rm -f '${addon_dylib}'"
# ── Step 3: populate .godot/global_script_class_cache.cfg via editor pre-pass ──
# Fresh scratch tree has an empty .godot/ cache. Godot's class_name resolution
# (the `as Weather` / `as SplitPanelContainer` idiom) requires this cache to be
# populated by an editor-mode scan. Without it, headless autoplay cascades through
# "Could not find type X" → "Compilation failed" → turn_manager.gd fails to load.
#
# `--editor --quit` opens, imports, scans class_names, writes cache, exits.
# Expected exit code: 0 — the remote shell runs with errexit + pipefail, so a
# non-zero Godot exit fails the ssh command even though the log is trimmed by
# `tail`. Afterwards we verify the cache file exists and is non-empty; a missing
# cache is reported as a warning only and does not stop the run.
echo "[$(date +%H:%M:%S)] editor pre-pass to populate .godot/ class cache..."
class_cache="${SCRATCH_ABS}/src/game/.godot/global_script_class_cache.cfg"
# A real if/else replaces `test && echo ok || echo warn`, so the warning can
# only ever be printed when the test itself fails.
ssh "${APRICOT}" "set -euo pipefail; \
  flatpak run --user --filesystem=home --command=godot \
    org.godotengine.Godot --headless --editor --quit \
    --path '${SCRATCH_ABS}/src/game' 2>&1 | tail -5; \
  if test -s '${class_cache}'; then \
    echo ' ✓ class cache populated'; \
  else \
    echo ' ⚠ class cache missing — headless autoplay may still cascade'; \
  fi"
# ── Step 4: run the batch per MODE ───────────────────────────────────────────
# Every value spliced into a remote command line (paths, seeds, turns, clan id,
# env values) is escaped with `printf %q`, so whitespace or shell metacharacters
# in user-supplied arguments reach tools/autoplay-batch.sh as literal data
# instead of being re-parsed by the shell on the RUN host.

# remote_q VALUE — print VALUE escaped so a remote shell reads it as one word.
remote_q() {
  printf '%q' "$1"
}

# run_batch OUT_SUBDIR TAIL_LINES [NAME=VALUE ...]
#   Runs tools/autoplay-batch.sh in the scratch tree on the RUN host.
#   OUT_SUBDIR  directory name under RESULTS_ABS passed as the output dir
#   TAIL_LINES  how many trailing lines of the combined log to display
#   NAME=VALUE  environment assignments placed in front of the batch command
#   Reads globals: APRICOT, SCRATCH_ABS, RESULTS_ABS, SEEDS, TURNS.
#   The remote shell uses errexit + pipefail, so a failing batch still fails
#   the ssh command despite the trailing `tail`.
run_batch() {
  local out_subdir="$1" tail_lines="$2"
  shift 2
  local env_prefix="" assignment
  for assignment in "$@"; do
    env_prefix+="${assignment%%=*}=$(remote_q "${assignment#*=}") "
  done
  ssh "${APRICOT}" "set -euo pipefail; cd $(remote_q "${SCRATCH_ABS}") && \
    ${env_prefix}bash tools/autoplay-batch.sh \
      $(remote_q "${SEEDS}") $(remote_q "${TURNS}") \
      $(remote_q "${RESULTS_ABS}/${out_subdir}") 2>&1 | tail -n $(remote_q "${tail_lines}")"
}

ssh "${APRICOT}" "mkdir -p $(remote_q "${RESULTS_ABS}")"
case "${MODE}" in
  smoke)
    SEEDS="${1:-10}"
    TURNS="${2:-300}"
    # Default: use the GPU when available (MCTS rollouts through WGSL kernel).
    # gpu-walltime mode overrides this explicitly to true/false per iteration.
    GPU_ENV="AI_GPU_ROLLOUT=${AI_GPU_ROLLOUT:-true}"
    echo "[$(date +%H:%M:%S)] smoke batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} ${GPU_ENV}"
    run_batch "smoke" 30 \
      "AI_USE_MCTS=true" "${GPU_ENV}" "PARALLEL=${PARALLEL}"
    ;;
  clan)
    CLAN="${1:?usage: apricot-run.sh clan <clan_id> [seeds] [turns]}"
    SEEDS="${2:-10}"
    TURNS="${3:-300}"
    GPU_ENV="AI_GPU_ROLLOUT=${AI_GPU_ROLLOUT:-true}"
    echo "[$(date +%H:%M:%S)] clan=${CLAN} batch: ${SEEDS} seeds T${TURNS} PARALLEL=${PARALLEL} ${GPU_ENV}"
    run_batch "clan-${CLAN}" 30 \
      "AI_USE_MCTS=true" "AI_PIN_PERSONALITY=${CLAN}" "${GPU_ENV}" "PARALLEL=${PARALLEL}"
    ;;
  gpu-walltime)
    SEEDS="${1:-10}"
    TURNS="${2:-300}"
    echo "[$(date +%H:%M:%S)] GPU wall-time comparison: ${SEEDS} seeds T${TURNS}"
    # Same batch twice: once with GPU rollouts enabled, once disabled.
    for GPU in true false; do
      echo " --- AI_GPU_ROLLOUT=${GPU} ---"
      run_batch "gpu-${GPU}" 10 \
        "AI_USE_MCTS=true" "AI_GPU_ROLLOUT=${GPU}" "PARALLEL=${PARALLEL}"
    done
    ;;
  *)
    echo "ERROR: unknown mode '${MODE}'" >&2
    exit 2
    ;;
esac
# ── Step 5: fetch results summary back to EDIT ───────────────────────────────
# Best-effort copy of the remote results directory into .local/iter/. A failed
# copy does not abort the run (the results still exist on the RUN host), but
# scp's own diagnostics are no longer discarded and the warning goes to stderr,
# so the reason for the failure is visible.
LOCAL_RESULTS="${PROJECT_DIR}/.local/iter/apricot-${STAMP}"
mkdir -p "${LOCAL_RESULTS}"
echo "[$(date +%H:%M:%S)] fetch verdict/summary to ${LOCAL_RESULTS}..."
if ! scp -r "${APRICOT}:${RESULTS_ABS}/" "${LOCAL_RESULTS}/"; then
  echo "WARN: scp returned non-zero; check manually on ${APRICOT}:${RESULTS_ABS}" >&2
fi
# ── Step 6: prune old local copies — keep only the most recent few ───────────

# prune_old_runs ROOT [KEEP]
#   Deletes every ROOT/apricot-* entry except the KEEP most recently modified.
#   ROOT  directory holding the fetched run copies; a missing ROOT is a no-op
#   KEEP  how many of the newest entries to retain (positive integer, default 3)
#   Returns 0 on success, 2 when KEEP is not a positive integer.
prune_old_runs() {
  local root="$1" keep="${2:-3}"
  local -a stale=()
  local victim

  [[ -d "${root}" ]] || return 0
  if [[ ! "${keep}" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: prune_old_runs: KEEP must be a positive integer, got '${keep}'" >&2
    return 2
  fi

  # `ls -t` lists newest-first by mtime; `tail -n +K+1` drops the first K lines
  # and leaves the entries to delete. No match → ls fails quietly → empty list.
  mapfile -t stale < <(ls -1dt "${root}"/apricot-* 2>/dev/null | tail -n "+$((keep + 1))")
  if (( ${#stale[@]} == 0 )); then
    return 0
  fi

  echo "[$(date +%H:%M:%S)] pruning ${#stale[@]} old local run(s) (keeping ${keep} newest)..."
  for victim in "${stale[@]}"; do
    echo " rm -rf ${victim}"
    # `--` stops option parsing; `:?` aborts rather than expand an empty path.
    rm -rf -- "${victim:?}"
  done
}

ITER_ROOT="${PROJECT_DIR}/.local/iter"
# APRICOT_KEEP_RUNS optionally overrides how many local copies survive.
prune_old_runs "${ITER_ROOT}" "${APRICOT_KEEP_RUNS:-3}"
# Final summary: where the scratch tree, the remote results and the local copy
# ended up. The scratch tree lives under $HOME/.cache on the RUN host (not /tmp).
echo "============================================================"
echo "DONE. Scratch at ${APRICOT}:${SCRATCH_ABS} (per-run scratch, .cache)."
echo "Results at ${APRICOT}:${RESULTS_ABS} (persistent, .cache)."
echo "Local copy at ${LOCAL_RESULTS}"
echo "============================================================"