diff --git a/scripts/apricot-async-smoke.sh b/scripts/apricot-async-smoke.sh
new file mode 100755
index 00000000..3d962353
--- /dev/null
+++ b/scripts/apricot-async-smoke.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# apricot-async-smoke.sh — End-to-end smoke for the p2-64 launch/status/fetch protocol.
+#
+# Exercises a tiny batch (smoke 1 50) through the full async loop:
+#   1. launch    → bare stamp on stdout
+#   2. status    → valid JSON immediately, state in {running, unreachable}
+#   3. wait loop → poll until state==complete (or fail)
+#   4. fetch     → rsync results to .local/iter/<stamp>/
+#   5. verify    → at least one game_*/turn_stats.jsonl present locally
+#
+# Skips gracefully (exit 0) if apricot is unreachable, so this can run on plum
+# without blocking when the RUN host is offline.
+#
+# Usage:
+#   bash scripts/apricot-async-smoke.sh                     # default smoke 1 50
+#   POLL_TIMEOUT_S=600 bash scripts/apricot-async-smoke.sh  # extend the wait
+#
+# Environment:
+#   APRICOT_SSH_ALIAS — ssh alias probed by the reachability gate (default: apricot).
+#   POLL_TIMEOUT_S    — seconds to wait for state==complete (default: 900).
+#   POLL_INTERVAL_S   — seconds between status polls (default: 15).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+APRICOT="${APRICOT_SSH_ALIAS:-apricot}"
+POLL_TIMEOUT_S="${POLL_TIMEOUT_S:-900}"
+POLL_INTERVAL_S="${POLL_INTERVAL_S:-15}"
+
+# Timestamped diagnostics; always on stderr.
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+# ── Reachability gate: skip if apricot can't be reached at all. ──────────────
+if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "$APRICOT" 'echo ok' >/dev/null 2>&1; then
+  log "apricot unreachable; skipping smoke (exit 0)"
+  exit 0
+fi
+
+# ── Step 1: launch ───────────────────────────────────────────────────────────
+log "launching smoke 1 50 …"
+STAMP="$(bash "$SCRIPT_DIR/apricot-run.sh" launch smoke 1 50)"
+if [[ -z "$STAMP" ]]; then
+  log "FAIL: launch returned empty stamp"
+  exit 1
+fi
+log "launched stamp=$STAMP"
+
+# ── Step 2: status (must be valid JSON, must mention the stamp) ──────────────
+STATUS_JSON="$(bash "$SCRIPT_DIR/apricot-run.sh" status "$STAMP")"
+log "initial status: $STATUS_JSON"
+case "$STATUS_JSON" in
+  *"\"stamp\":\"$STAMP\""*) ;;
+  *) log "FAIL: status JSON missing stamp field"; exit 1 ;;
+esac
+
+# ── Step 3: poll until complete | failed (with timeout) ──────────────────────
+log "polling every ${POLL_INTERVAL_S}s up to ${POLL_TIMEOUT_S}s …"
+DEADLINE=$(( $(date +%s) + POLL_TIMEOUT_S ))
+STATE="running"
+while (( $(date +%s) < DEADLINE )); do
+  # `|| true`: one failed status call must not abort the wait under `set -e`.
+  STATUS_JSON="$(bash "$SCRIPT_DIR/apricot-run.sh" status "$STAMP" || true)"
+  STATE="$(echo "$STATUS_JSON" | sed -n 's/.*"state":"\([^"]*\)".*/\1/p')"
+  log "state=$STATE ($STATUS_JSON)"
+  case "$STATE" in
+    complete) break ;;
+    failed) log "FAIL: batch failed; journalctl --user -u mc-batch-$STAMP on $APRICOT"; exit 1 ;;
+  esac
+  sleep "$POLL_INTERVAL_S"
+done
+
+if [[ "$STATE" != "complete" ]]; then
+  log "FAIL: did not reach complete within ${POLL_TIMEOUT_S}s (last state=$STATE)"
+  exit 1
+fi
+
+# ── Step 4: fetch ────────────────────────────────────────────────────────────
+LOCAL_DEST="$(bash "$SCRIPT_DIR/apricot-run.sh" fetch "$STAMP")"
+log "fetched to: $LOCAL_DEST"
+
+# ── Step 5: verify result presence ───────────────────────────────────────────
+# Capture find's output rather than piping into `grep -q`: under pipefail an
+# early grep exit can SIGPIPE find and turn a real match into a false FAIL.
+if [[ -z "$(find "$LOCAL_DEST" -path '*/game_*/turn_stats.jsonl' -type f -print)" ]]; then
+  log "FAIL: no turn_stats.jsonl found under $LOCAL_DEST"
+  exit 1
+fi
+
+log "OK — async protocol smoke passed for stamp=$STAMP"
diff --git a/scripts/apricot-run.sh b/scripts/apricot-run.sh
index 872fcd76..e8d00a59 100755
--- a/scripts/apricot-run.sh
+++ b/scripts/apricot-run.sh
@@ -15,11 +15,41 @@
 # 5. Fetch verdict JSON back to EDIT host for review.
 # 6. Remove the worktree (canonical + objects retained for next run).
 #
-# Usage:
+# ── Synchronous usage (block until done, fetch results inline) ──────────────
 # scripts/apricot-run.sh smoke [seeds=10] [turns=300]
 # scripts/apricot-run.sh clan <clan_id> [seeds=10] [turns=300]
 # scripts/apricot-run.sh gpu-walltime [seeds=10] [turns=300]
 #
+# ── Async protocol (p2-64): launch / status / fetch ─────────────────────────
+# Decouples job lifecycle from the orchestrating ssh. The systemd --user unit
+# on apricot owns build+batch and survives ssh disconnect, sleep/wake, network
+# blips. EDIT host polls via short ConnectTimeout=5 probes.
+#
+#   STAMP=$(scripts/apricot-run.sh launch smoke 1 50)   # bare stdout = stamp
+#   scripts/apricot-run.sh status "$STAMP"              # one-line JSON
+#   # → {"stamp":"...","state":"running|complete|failed|unreachable",
+#   #    "seeds_done":N,"seeds_total":M,"completion_marker":bool}
+#   scripts/apricot-run.sh fetch "$STAMP"               # rsync results
+#
+# Loop pattern:
+#   STAMP=$(scripts/apricot-run.sh launch smoke 10 300)
+#   while STATE=$(scripts/apricot-run.sh status "$STAMP" | jq -r .state); \
+#         [[ $STATE != complete ]]; do
+#     [[ $STATE == failed ]] && exit 1
+#     sleep 60
+#   done
+#   scripts/apricot-run.sh fetch "$STAMP"
+#
+# Implementation notes:
+#   • launch writes a per-stamp launcher.sh into ~/.cache/mc-batches/<stamp>/
+#     and starts it under `systemd-run --user --collect --unit=mc-batch-<stamp>`.
+#   • The launcher does git fetch, worktree add, build-gdext, autoplay-batch,
+#     then `touch <stamp>/<subdir>/completion.marker` ONLY on success.
+#   • status uses a single ssh ConnectTimeout=5 with three lightweight probes
+#     (systemctl is-active, marker count, turn_stats count) plus a one-line
+#     seeds_total read — no bulk reads of result files (we hit channel saturation with that historically).
+#   • fetch is rsync -a --partial; resumable across drops.
+#
 # Environment:
 #   APRICOT_SSH_ALIAS — ssh alias for the RUN host (default: apricot).
 #   STAMP — override the timestamp (for reproducing a specific run).
@@ -45,9 +75,264 @@ done
 # MODE + positional args resolved early so the resource-policy block can
 # peek at the seed count (which differs per mode — for `clan` it's $2
 # because $1 is the clan_id; for smoke/gpu-walltime it's $1).
-MODE="${1:?usage: apricot-run.sh [args]}"
+MODE="${1:?usage: apricot-run.sh [args]}"
 shift || true
 
+# ── p2-64 async protocol: launch / status / fetch ────────────────────────────
+# These three sub-modes decouple the batch lifecycle from the orchestrating ssh.
+# See the header comment for the full protocol shape and example loop.
+
+if [[ "${MODE}" == "launch" ]]; then
+  SUBMODE="${1:?usage: apricot-run.sh launch <submode> [args]}"
+  shift || true
+  LAUNCH_ARGS=("$@")
+
+  # Fail fast on sub-modes the launcher heredoc below has no case branch for;
+  # otherwise the unit would only exit 2 after the remote fetch + build.
+  case "${SUBMODE}" in
+    smoke|clan|difficulty) ;;
+    *)
+      echo "[launch] ERROR: submode '${SUBMODE}' is not wired into the async launcher (supported: smoke, clan, difficulty)" >&2
+      exit 2
+      ;;
+  esac
+
+  # Pre-resolve the seed count from the sub-mode args (mirrors the resource
+  # policy peek below). The launcher script on apricot will use this to write
+  # a seeds_total file before invoking autoplay-batch.
+  case "${SUBMODE}" in
+    clan|clan-priors|difficulty)  SEEDS_TOTAL_PEEK="${LAUNCH_ARGS[1]:-10}" ;;
+    difficulty-asym)              SEEDS_TOTAL_PEEK="${LAUNCH_ARGS[2]:-10}" ;;
+    matchup-grid|huge-map-5clan)  SEEDS_TOTAL_PEEK="${LAUNCH_ARGS[0]:-5}" ;;
+    ai-quality-baseline*)         SEEDS_TOTAL_PEEK="${LAUNCH_ARGS[0]:-50}" ;;
+    smoke|gpu-walltime|*)         SEEDS_TOTAL_PEEK="${LAUNCH_ARGS[0]:-10}" ;;
+  esac
+
+  # Map sub-mode → submode results dir (mirrors the case statement below).
+  case "${SUBMODE}" in
+    clan)                  SUBDIR="clan-${LAUNCH_ARGS[0]:-unknown}" ;;
+    clan-priors)           SUBDIR="clan-priors-${LAUNCH_ARGS[0]:-unknown}" ;;
+    difficulty)            SUBDIR="difficulty-${LAUNCH_ARGS[0]:-unknown}" ;;
+    difficulty-asym)       SUBDIR="difficulty-asym-${LAUNCH_ARGS[0]:-unknown}-vs-${LAUNCH_ARGS[1]:-unknown}" ;;
+    matchup-grid)          SUBDIR="matchup-grid" ;;
+    huge-map-5clan)        SUBDIR="huge-map-5clan" ;;
+    ai-quality-baseline*)  SUBDIR="baseline" ;; # tier subdirs underneath
+    gpu-walltime)          SUBDIR="gpu-walltime" ;;
+    smoke|*)               SUBDIR="smoke" ;;
+  esac
+
+  # Build a properly-quoted (%q) args string for embedding in remote command
+  # text. Iterate "$@" (same values as LAUNCH_ARGS): expanding an empty array
+  # trips `set -u` on bash < 4.4.
+  ARGS_QUOTED=""
+  for a in "$@"; do
+    ARGS_QUOTED+=" $(printf '%q' "$a")"
+  done
+
+  # All status/log output goes to stderr — stdout is reserved for the bare
+  # stamp value so callers can do STAMP=$(scripts/apricot-run.sh launch ...).
+  {
+    echo "[launch] stamp=${STAMP} submode=${SUBMODE} args=$*"
+    echo "[launch] writing launcher.sh to apricot:~/.cache/mc-batches/${STAMP}/"
+  } >&2
+
+  # Write a per-stamp launcher script. Heredoc keeps quoting sane; we splice
+  # in only the values we actually need ($STAMP, $SUBMODE, $ARGS_QUOTED, $SUBDIR,
+  # $BUILD_REF, $SEEDS_TOTAL_PEEK).
+  BUILD_REF_LAUNCH="${BUILD_REF:-origin/main}"
+  ssh "${APRICOT}" "mkdir -p \"\$HOME/.cache/mc-batches/${STAMP}/${SUBDIR}\" && cat > \"\$HOME/.cache/mc-batches/${STAMP}/launcher.sh\"" < "\${RESULTS_SUB}/seeds_total"
+
+# Launch-side log lives next to results so post-mortem doesn't need journalctl.
+LOG="\${RESULTS}/launcher.log"
+exec >>"\${LOG}" 2>&1
+echo "===== mc-batch-\${STAMP} launcher start \$(date -u +%FT%TZ) ====="
+echo "submode=\${SUBMODE} subdir=\${SUBDIR} build_ref=\${BUILD_REF} seeds_total=\${SEEDS_TOTAL}"
+
+# ── Resource policy (formerly EDIT-side; moved here so async re-launches from
+# any orchestrator host produce the same answer). ─────────────────────────────
+NPROC="\$(nproc 2>/dev/null || echo 8)"
+if [[ -n "\${PARALLEL:-}" ]]; then
+  PARALLEL_EFFECTIVE="\${PARALLEL}"
+elif [[ "\${USE_MAX_CORES:-true}" == "true" ]]; then
+  PARALLEL_EFFECTIVE="\$(( SEEDS_TOTAL < NPROC ? SEEDS_TOTAL : NPROC ))"
+else
+  PARALLEL_EFFECTIVE="\${MIN_CORES:-4}"
+fi
+[[ "\${PARALLEL_EFFECTIVE}" -lt 1 ]] && PARALLEL_EFFECTIVE=1
+export PARALLEL="\${PARALLEL_EFFECTIVE}"
+
+if [[ -z "\${RAYON_NUM_THREADS:-}" ]]; then
+  RAYON_NUM_THREADS="\$(( NPROC / PARALLEL_EFFECTIVE ))"
+  [[ "\${RAYON_NUM_THREADS}" -lt 1 ]] && RAYON_NUM_THREADS=1
+fi
+export RAYON_NUM_THREADS
+
+echo "PARALLEL=\${PARALLEL} RAYON_NUM_THREADS=\${RAYON_NUM_THREADS} NPROC=\${NPROC}"
+
+# ── Step 1: fetch + worktree ─────────────────────────────────────────────────
+test -d "\${CANONICAL}/.git" || {
+  echo "ERROR: canonical checkout missing at \${CANONICAL}" >&2
+  exit 1
+}
+git -C "\${CANONICAL}" fetch origin --quiet
+git -C "\${CANONICAL}" worktree add --detach "\${SCRATCH}" "\${BUILD_REF}"
+BUILT_SHA="\$(git -C "\${SCRATCH}" rev-parse --short HEAD)"
+echo "built_sha=\${BUILT_SHA}"
+
+# ── Step 2: build ────────────────────────────────────────────────────────────
+( cd "\${SCRATCH}/src/simulator" && bash build-gdext.sh x86_64-unknown-linux-gnu )
+rm -f "\${SCRATCH}/src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.dylib"
+
+# ── Step 3: editor pre-pass to populate .godot/ class cache ─────────────────
+flatpak run --user --filesystem=home --command=godot \\
+  org.godotengine.Godot --headless --editor --quit \\
+  --path "\${SCRATCH}/src/game" 2>&1 | tail -5 || true
+
+# ── Step 4: run the batch ────────────────────────────────────────────────────
+GPU_ENV_VAL="\${AI_GPU_ROLLOUT:-false}"
+cd "\${SCRATCH}"
+
+case "\${SUBMODE}" in
+  smoke)
+    AI_USE_MCTS=true AI_GPU_ROLLOUT="\${GPU_ENV_VAL}" PARALLEL="\${PARALLEL}" \\
+      bash tools/autoplay-batch.sh "\$@" "\${RESULTS_SUB}"
+    ;;
+  clan)
+    CLAN="\$1"; shift
+    AI_USE_MCTS=true AI_PIN_PERSONALITY="\${CLAN}" \\
+      AI_GPU_ROLLOUT="\${GPU_ENV_VAL}" PARALLEL="\${PARALLEL}" \\
+      bash tools/autoplay-batch.sh "\$@" "\${RESULTS_SUB}"
+    ;;
+  difficulty)
+    TIER="\$1"; shift
+    AI_USE_MCTS=true AI_DIFFICULTY="\${TIER}" \\
+      AI_DIFFICULTY_P0="\${TIER}" AI_DIFFICULTY_P1="\${TIER}" \\
+      AI_GPU_ROLLOUT="\${GPU_ENV_VAL}" PARALLEL="\${PARALLEL}" \\
+      bash tools/autoplay-batch.sh "\$@" "\${RESULTS_SUB}"
+    ;;
+  *)
+    echo "ERROR: launcher does not yet support submode '\${SUBMODE}'" >&2
+    exit 2
+    ;;
+esac
+
+# Only on success path: write completion.marker. Status's "failed" state =
+# unit inactive AND no marker.
+touch "\${RESULTS_SUB}/completion.marker"
+echo "===== mc-batch-\${STAMP} launcher OK \$(date -u +%FT%TZ) ====="
+
+# Cleanup worktree (canonical + objects retained for next run).
+git -C "\${CANONICAL}" worktree remove --force "\${SCRATCH}" 2>&1 || rm -rf "\${SCRATCH}"
+LAUNCHER
+
+  # The launcher reads args from "$@" inside its case branches; we pass
+  # them through the systemd-run invocation below.
+  ssh "${APRICOT}" "chmod +x \"\$HOME/.cache/mc-batches/${STAMP}/launcher.sh\""
+
+  # Start the unit. Pass the full LAUNCH_ARGS as positional args to the
+  # launcher script (they show up as $1, $2, ... inside the case branches).
+  SYSTEMD_CMD="systemd-run --user --collect --unit=mc-batch-${STAMP} \"\$HOME/.cache/mc-batches/${STAMP}/launcher.sh\"${ARGS_QUOTED}"
+  echo "[launch] starting systemd unit mc-batch-${STAMP}" >&2
+  if ! ssh "${APRICOT}" "${SYSTEMD_CMD}" >&2; then
+    echo "[launch] FAILED to start systemd unit; check ssh + systemd --user availability" >&2
+    exit 1
+  fi
+
+  echo "[launch] unit started; tail logs via: ssh ${APRICOT} 'journalctl --user -u mc-batch-${STAMP} -f'" >&2
+
+  # Bare stamp on stdout — this is the contract for callers.
+  echo "${STAMP}"
+  exit 0
+fi
+
+if [[ "${MODE}" == "status" ]]; then
+  QUERY_STAMP="${1:?usage: apricot-run.sh status <stamp>}"
+  UNIT="mc-batch-${QUERY_STAMP}"
+
+  # Single ssh probe with short ConnectTimeout.
Three lightweight queries:
+  #   1. systemctl --user is-active <unit>  (active|inactive|failed|unknown)
+  #   2. count of completion.marker files under <stamp>/*/
+  #   3. count of turn_stats.jsonl files under <stamp>/*/game_*/
+  # We also read seeds_total from the first submode dir if present.
+  PROBE='set +e
+    IS_ACTIVE=$(systemctl --user is-active '"${UNIT}"' 2>/dev/null | head -n 1)
+    MARKER_COUNT=$(ls "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/completion.marker 2>/dev/null | wc -l | tr -d " ")
+    STATS_COUNT=$(ls "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/game_*/turn_stats.jsonl 2>/dev/null | wc -l | tr -d " ")
+    SEEDS_TOTAL=$(cat "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/seeds_total 2>/dev/null | head -1)
+    SEEDS_TOTAL=${SEEDS_TOTAL:-0}
+    printf "%s|%s|%s|%s\n" "${IS_ACTIVE:-unknown}" "$MARKER_COUNT" "$STATS_COUNT" "$SEEDS_TOTAL"'
+  # is-active prints the state AND exits non-zero for inactive/failed units, so an `|| echo unknown` fallback would add a second reply line and hide the counts; default an empty answer instead.
+  PROBE_OUT="$(ssh -o ConnectTimeout=5 -o BatchMode=yes "${APRICOT}" "${PROBE}" 2>/dev/null)" || PROBE_OUT=""
+
+  if [[ -z "${PROBE_OUT}" ]]; then
+    printf '{"stamp":"%s","state":"unreachable","seeds_done":0,"seeds_total":0,"completion_marker":false}\n' "${QUERY_STAMP}"
+    exit 0
+  fi
+  # Parse only the last reply line (anything printed before it is ignored); non-numeric counts fall back to 0 so the JSON stays valid.
+  IFS='|' read -r IS_ACTIVE MARKER_COUNT STATS_COUNT SEEDS_TOTAL < <(tail -n 1 <<<"${PROBE_OUT}")
+  [[ "${MARKER_COUNT}" =~ ^[0-9]+$ ]] || MARKER_COUNT=0
+  [[ "${STATS_COUNT}" =~ ^[0-9]+$ ]] || STATS_COUNT=0
+  [[ "${SEEDS_TOTAL}" =~ ^[0-9]+$ ]] || SEEDS_TOTAL=0
+
+  if [[ "${MARKER_COUNT}" -gt 0 ]]; then
+    STATE="complete"
+    MARKER_BOOL="true"
+  elif [[ "${IS_ACTIVE}" == "active" || "${IS_ACTIVE}" == "activating" ]]; then
+    STATE="running"
+    MARKER_BOOL="false"
+  else
+    STATE="failed"
+    MARKER_BOOL="false"
+  fi
+
+  printf '{"stamp":"%s","state":"%s","seeds_done":%s,"seeds_total":%s,"completion_marker":%s}\n' \
+    "${QUERY_STAMP}" "${STATE}" "${STATS_COUNT}" "${SEEDS_TOTAL}" "${MARKER_BOOL}"
+  exit 0
+fi
+
+if [[ "${MODE}" == "fetch" ]]; then
+  QUERY_STAMP="${1:?usage: apricot-run.sh fetch <stamp>}"
+  LOCAL_DEST="${PROJECT_DIR}/.local/iter/${QUERY_STAMP}"
+
+  # Re-use status to gate the fetch — this is the same one-line probe.
+  STATUS_JSON="$("$0" status "${QUERY_STAMP}")"
+  STATE="$(sed -n 's/.*"state":"\([^"]*\)".*/\1/p' <<<"${STATUS_JSON}")"
+  # Anything other than `complete` (running, failed, unreachable) refuses the fetch.
+  if [[ "${STATE}" != "complete" ]]; then
+    echo "[fetch] batch not complete; status=${STATE}" >&2
+    echo "[fetch] full status: ${STATUS_JSON}" >&2
+    exit 1
+  fi
+
+  mkdir -p "${LOCAL_DEST}"
+  echo "[fetch] rsync ${APRICOT}:~/.cache/mc-batches/${QUERY_STAMP}/ → ${LOCAL_DEST}/" >&2
+  rsync -a --partial \
+    "${APRICOT}:.cache/mc-batches/${QUERY_STAMP}/" \
+    "${LOCAL_DEST}/" || { echo "[fetch] rsync failed; re-run fetch to resume" >&2; exit 1; }
+  # stdout carries only the destination path, and only after a clean rsync.
+  echo "${LOCAL_DEST}"
+  exit 0
+fi
+# ── end p2-64 async protocol ─────────────────────────────────────────────────
+
 # ── Resource policy for PARALLEL + RAYON_NUM_THREADS ─────────────────
 # Each Godot instance spawns its own rayon thread pool for MCTS rollouts;
 # rayon defaults to nproc unless RAYON_NUM_THREADS is set. If PARALLEL
diff --git a/tooling/claude/dot-claude/instructions/canonical-commands.md b/tooling/claude/dot-claude/instructions/canonical-commands.md
index eb986b90..881ac9a0 100644
--- a/tooling/claude/dot-claude/instructions/canonical-commands.md
+++ b/tooling/claude/dot-claude/instructions/canonical-commands.md
@@ -30,3 +30,22 @@ ssh apricot 'cd ~/.cache/mc-src- && timeout 60 flatpak run --user --files
 ```
 
 Subsequent runs (autoplay, GUT, batches) will then load extensions and class_name registrations correctly.
+
+## Async batch protocol on apricot (p2-64)
+
+When apricot connectivity is intermittent (sleep/wake, sshd channel saturation, network blips), use the launch / status / fetch loop instead of the synchronous `scripts/apricot-run.sh smoke …` flow. The systemd `--user` unit on apricot owns the build+batch lifecycle and survives ssh disconnects. Status probes use a single short-timeout ssh and never stream result files — only `is-active` / `ls | wc -l` style checks plus the one-line `seeds_total` file.
+
+```
+STAMP=$(scripts/apricot-run.sh launch smoke 10 300)   # bare stdout = stamp
+while STATE=$(scripts/apricot-run.sh status "$STAMP" | jq -r .state); \
+      [[ $STATE != complete ]]; do
+  [[ $STATE == failed ]] && { echo "batch failed; journalctl --user -u mc-batch-$STAMP" >&2; exit 1; }
+  [[ $STATE == unreachable ]] && sleep 30 && continue
+  sleep 60
+done
+LOCAL=$(scripts/apricot-run.sh fetch "$STAMP")         # rsync to .local/iter/<stamp>/
+```
+
+States: `running` (unit active), `complete` (`completion.marker` present), `failed` (unit inactive + no marker), `unreachable` (ssh probe timeout — retryable, no work lost).
+
+Submodes currently wired into the launcher: `smoke`, `clan`, `difficulty`. Other modes (`gpu-walltime`, `matchup-grid`, `huge-map-5clan`, `ai-quality-baseline*`) still run via the synchronous flow and can be added to the launcher case-branch as needed.