#!/usr/bin/env bash # Distributed test/train dispatch — fan the iteration loop across the DigitalOcean # test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the # cmd__ name-dispatch, so no edit to the top-level `run` is needed. # # ./run dist:up [size] [region] spin the fleet up # ./run dist:sim [turn_limit] [--destroy-after] fan a sim batch across it # ./run dist:train [--destroy-after] fan an RL sweep across it # ./run dist:down tear it down (zero cost) # # Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with # GNU coreutils (autoplay-batch.sh uses `realpath -m`). _DIST_TF_DIR_REL="infra/terraform/test-fleet" _dist_repo_root() { (cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd); } _dist_tf() { local root root="$(_dist_repo_root)" terraform -chdir="$root/$_DIST_TF_DIR_REL" "$@" } _dist_read_hosts() { # Echo one "@" per line from the inventory, skipping comments/blanks. local inv="$1" grep -vE '^\s*(#|$)' "$inv" 2>/dev/null || true } cmd_dist() { cat <<'EOF' Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first. ./run dist:check offline: fmt + validate + mocked test (no token/spend) ./run dist:up [size] [region] e.g. ./run dist:up 10 ./run dist:sim [turn_limit] [--destroy-after] ./run dist:train [--destroy-after] ./run dist:test cargo test --workspace on a worker ./run dist:build cargo build + wasm on a worker (wasm rsync'd back) ./run dist:sync [ref] git pull + rebuild gdext on live workers ./run dist:render render a proof scene (software weston, no GPU) → png ./run dist:down EOF } cmd_dist_check() { # Offline IaC verification — no DigitalOcean token, no API, no servers, no cost. # fmt (style) + validate (schema typecheck) + test (mocked-provider behaviour). local root root="$(_dist_repo_root)" local dir="$root/$_DIST_TF_DIR_REL" echo "== terraform fmt ==" terraform -chdir="$dir" fmt -check -recursive || { echo "fmt: run 'terraform -chdir=$dir fmt'" >&2; return 1; } echo "== terraform init (providers only) ==" terraform -chdir="$dir" init -backend=false -input=false >/dev/null || return 1 echo "== terraform validate (schema typecheck) ==" terraform -chdir="$dir" validate || return 1 echo "== terraform test (mocked digitalocean) ==" terraform -chdir="$dir" test || return 1 echo "dist:check OK — config is valid, no resources touched." } cmd_dist_up() { local n="${1:-}" [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up [size] [region]" >&2; return 1; } : "${TF_VAR_do_token:?export TF_VAR_do_token= first}" local args=(-auto-approve -var "workers=$n") [ -n "${2:-}" ] && args+=(-var "size=$2") [ -n "${3:-}" ] && args+=(-var "region=$3") _dist_tf init -input=false >/dev/null _dist_tf apply "${args[@]}" echo "fleet up: $n worker(s). inventory: $(_dist_repo_root)/.local/fleet/inventory" } cmd_dist_down() { : "${TF_VAR_do_token:?export TF_VAR_do_token= first}" _dist_tf apply -auto-approve -var "workers=0" echo "fleet down (workers=0): zero compute cost, snapshot only (~$0.40/mo)." } cmd_dist_sim() { local total="${1:-}" turn="${2:-300}" destroy=false local a for a in "$@"; do [ "$a" = "--destroy-after" ] && destroy=true; done [[ "$total" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:sim [turn_limit] [--destroy-after]" >&2; return 1; } local root inv root="$(_dist_repo_root)" inv="$root/.local/fleet/inventory" [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up first" >&2; return 1; } local hosts=() while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv") local n=${#hosts[@]} [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; } local stamp results shard stamp="$(date +%Y%m%d_%H%M%S)" results="$root/.local/iter/$stamp" mkdir -p "$results" shard=$(( (total + n - 1) / n )) # ceil(total / n) echo "distributing $total game(s) over $n worker(s): ~$shard each, turn_limit=$turn" echo "results → $results" local pids=() i=0 host offset cnt cores for host in "${hosts[@]}"; do offset=$(( i * shard )) cnt=$shard (( offset + cnt > total )) && cnt=$(( total - offset )) (( cnt <= 0 )) && break cores="$(ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" nproc 2>/dev/null || echo 8)" echo " [$host] seeds $(( offset + 1 ))..$(( offset + cnt )) PARALLEL=$cores" AUTOPLAY_HOST="$host" SEED_OFFSET="$offset" PARALLEL="$cores" \ bash "$root/tools/autoplay-batch.sh" "$cnt" "$turn" "$results" \ >"$results/dispatch_worker_${i}.log" 2>&1 & pids+=($!) i=$(( i + 1 )) done local fail=0 p for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done local produced produced="$(find "$results" -name turn_stats.jsonl -type f 2>/dev/null | wc -l | tr -d ' ')" echo "----------------------------------------------------------------" echo "distributed sim done: $produced game(s) produced turn_stats under $results" [ "$fail" -eq 0 ] || echo "WARNING: $fail worker batch(es) errored — see $results/dispatch_worker_*.log" >&2 $destroy && { echo "--destroy-after → tearing down"; cmd_dist_down; } [ "$fail" -eq 0 ] } cmd_dist_train() { # v1 blocking sweep: one training run per worker (distinct seed + run-name), # then pull the models back. Detached orchestration is the documented follow-up. local steps="${1:-1000000}" destroy=false local a for a in "$@"; do [ "$a" = "--destroy-after" ] && destroy=true; done [[ "$steps" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:train [--destroy-after]" >&2; return 1; } local root inv root="$(_dist_repo_root)" inv="$root/.local/fleet/inventory" [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up first" >&2; return 1; } local hosts=() while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv") local n=${#hosts[@]} [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; } local stamp results stamp="$(date +%Y%m%d_%H%M%S)" results="$root/.local/train/$stamp" mkdir -p "$results" echo "fanning $n training run(s) × $steps steps (CPU). results → $results" local repo_remote="Code/@projects/@magic-civilization" local pids=() i=0 host seed run for host in "${hosts[@]}"; do seed=$(( 42 + i )) run="dist-${stamp}-w${i}" echo " [$host] run=$run seed=$seed" ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" \ "cd ~/$repo_remote && python3 -m tooling.rl_self_play.train --run-name '$run' --seed $seed --total-steps $steps --device cpu" \ >"$results/train_worker_${i}.log" 2>&1 & pids+=($!) i=$(( i + 1 )) done local fail=0 p for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done # Pull each worker's model dir back. i=0 for host in "${hosts[@]}"; do run="dist-${stamp}-w${i}" rsync -az "$host:~/$repo_remote/tooling/rl_self_play/models/$run" "$results/" 2>/dev/null || \ echo " note: no model dir for $run on $host (check $results/train_worker_${i}.log)" i=$(( i + 1 )) done echo "----------------------------------------------------------------" echo "distributed train done under $results" [ "$fail" -eq 0 ] || echo "WARNING: $fail run(s) errored — see $results/train_worker_*.log" >&2 $destroy && { echo "--destroy-after → tearing down"; cmd_dist_down; } [ "$fail" -eq 0 ] } # ── compute offload (single worker) ────────────────────────────────────────── # Run heavy build/test compute on a DO worker instead of plum (M2 Air). Workers # already carry the toolchain (golden image) + repo (cloud-init git pull). _dist_first_host() { local inv inv="$(_dist_repo_root)/.local/fleet/inventory" [ -f "$inv" ] || return 1 _dist_read_hosts "$inv" | head -1 } cmd_dist_sync() { # Pull the given ref on every live worker + rebuild the GDExtension, so a # mid-session code change reaches the fleet without an image rebuild. local ref="${1:-main}" local root inv host root="$(_dist_repo_root)" inv="$root/.local/fleet/inventory" [ -f "$inv" ] || { echo "no fleet — run ./run dist:up first" >&2; return 1; } local pids=() p fail=0 while IFS= read -r host; do echo "[$host] sync → $ref" ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" " set -e cd ~/Code/@projects/@magic-civilization git fetch --depth=1 origin '$ref' && git reset --hard FETCH_HEAD cd src/simulator && . ~/.cargo/env && bash build-gdext.sh " & pids+=($!) done < <(_dist_read_hosts "$inv") for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done [ "$fail" -eq 0 ] && echo "synced all workers to $ref" || { echo "$fail worker(s) failed sync" >&2; return 1; } } cmd_dist_test() { # Offload the Rust test suite to one fast worker (slow on the M2 Air). local host repo host="$(_dist_first_host)" || { echo "no fleet — run ./run dist:up 1 c-8 first" >&2; return 1; } repo="Code/@projects/@magic-civilization" echo "running cargo tests on $host ..." ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" " set -e cd ~/$repo/src/simulator && . ~/.cargo/env if command -v cargo-nextest >/dev/null 2>&1; then cargo nextest run --workspace; else cargo test --workspace; fi " } cmd_dist_build() { # Offload the workspace build for fast compile feedback, and bring back the # platform-independent WASM artifact. The native .so is linux-only and stays # on the worker (plum builds its own macOS .dylib locally). local host root repo host="$(_dist_first_host)" || { echo "no fleet — run ./run dist:up 1 first" >&2; return 1; } root="$(_dist_repo_root)" repo="Code/@projects/@magic-civilization" echo "building workspace + wasm on $host ..." ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" " set -e cd ~/$repo/src/simulator && . ~/.cargo/env cargo build --workspace bash build-wasm.sh " echo "fetching wasm artifact → plum ..." mkdir -p "$root/.local/build/wasm" rsync -az "$host:~/$repo/.local/build/wasm/" "$root/.local/build/wasm/" 2>/dev/null \ && echo "wasm → .local/build/wasm/" || echo "note: no wasm at .local/build/wasm/ on worker" } cmd_dist_render() { # Render a proof scene on a worker (software weston + Mesa llvmpipe, no GPU) and # pull the PNG back to plum. Replaces the apricot SCREENSHOT_HOST flow. local scene="${1:-}" out="${2:-}" [ -n "$scene" ] && [ -n "$out" ] || { echo "usage: ./run dist:render [timeout_s]" >&2; return 1; } local host host="$(_dist_first_host)" || { echo "no fleet — run ./run dist:up 1 first" >&2; return 1; } local user="${host%@*}" AUTOPLAY_HOST="$host" \ PROJECT_ROOT_REMOTE="/home/${user}/Code/@projects/@magic-civilization" \ bash "$(_dist_repo_root)/tools/capture-proof.sh" "$scene" "$out" "${3:-180}" }