#!/usr/bin/env bash
# Distributed test/train dispatch — fan the iteration loop across the DigitalOcean
# test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the
# cmd_<verb>_<target> name-dispatch, so no edit to the top-level `run` is needed.
#
#   ./run dist:check                                       offline fmt + validate + mocked test
#   ./run dist:up <workers> [size] [region]                spin the fleet up
#   ./run dist:sim <games> [turn_limit] [--destroy-after]  fan a sim batch across it
#   ./run dist:train <total_steps> [--destroy-after]       fan an RL sweep across it
#   ./run dist:down                                        tear it down (zero cost)
#
# Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with
# GNU coreutils (autoplay-batch.sh uses `realpath -m`).

# Terraform configuration directory, relative to the repository root; it is
# appended to the output of _dist_repo_root wherever terraform is invoked.
_DIST_TF_DIR_REL="infra/terraform/test-fleet"
# Print the absolute path of the repository root, taken to be two directory
# levels above the directory that contains this file.
# Outputs: the resolved path on stdout.
# Returns: non-zero (and prints nothing) if that directory cannot be entered.
_dist_repo_root() {
  # CDPATH is cleared for the cd: with it set, a relative BASH_SOURCE path could
  # be resolved against the user's search path, and cd would also echo the
  # directory it picked onto stdout, corrupting the value callers capture.
  (CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)
}
# Run terraform against the fleet configuration directory.
# Globals:   _DIST_TF_DIR_REL (read)
# Arguments: "$@" - terraform subcommand and its arguments, passed through as-is.
# Returns:   terraform's exit status, or non-zero without invoking terraform
#            when the repository root cannot be resolved.
_dist_tf() {
  local root
  # Bail out on failure: an empty root would aim -chdir at "/infra/...".
  root="$(_dist_repo_root)" || return
  terraform -chdir="$root/$_DIST_TF_DIR_REL" "$@"
}
# Echo one "<user>@<ip>" entry per line from the inventory file.
# Arguments: $1 - path to the inventory file
# Outputs:   every entry on stdout, skipping blank lines and lines whose first
#            non-blank character is '#'. Surrounding whitespace and a trailing
#            carriage return are stripped so the entry can be handed to ssh as-is.
# Returns:   always 0; a missing or unreadable inventory yields no output.
_dist_read_hosts() {
  local inv="$1"
  [[ -r "$inv" ]] || return 0
  # [ \t] rather than \s or [[:space:]]: neither is understood by every awk/grep.
  awk '{ gsub(/^[ \t]+|[ \t\r]+$/, "") } $0 != "" && $0 !~ /^#/' "$inv" 2>/dev/null || true
}
# Print the summary of the dist:* subcommands on stdout.
cmd_dist() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
./run dist:check offline: fmt + validate + mocked test (no token/spend)
./run dist:up <workers> [size] [region] e.g. ./run dist:up 10
./run dist:sim <games> [turn_limit] [--destroy-after]
./run dist:train <total_steps> [--destroy-after]
./run dist:down
EOF
}
# Offline IaC verification — no DigitalOcean token, no API, no servers, no cost.
# Runs, in order: fmt (style) + validate (schema typecheck) + test
# (mocked-provider behaviour), stopping at the first step that fails.
# Globals: _DIST_TF_DIR_REL (read)
# Returns: 0 when every step passes, 1 otherwise.
cmd_dist_check() {
  local root dir
  root="$(_dist_repo_root)" || return 1
  dir="$root/$_DIST_TF_DIR_REL"

  # Fail with a clear message instead of four "command not found" lines.
  if ! command -v terraform >/dev/null 2>&1; then
    echo "dist:check: terraform not found on PATH" >&2
    return 1
  fi

  echo "== terraform fmt =="
  if ! terraform -chdir="$dir" fmt -check -recursive; then
    echo "fmt: run 'terraform -chdir=$dir fmt'" >&2
    return 1
  fi

  echo "== terraform init (providers only) =="
  # -backend=false: no state backend is configured or contacted by this init.
  terraform -chdir="$dir" init -backend=false -input=false >/dev/null || return 1

  echo "== terraform validate (schema typecheck) =="
  terraform -chdir="$dir" validate || return 1

  echo "== terraform test (mocked digitalocean) =="
  terraform -chdir="$dir" test || return 1

  echo "dist:check OK — config is valid, no resources touched."
}
# Bring the fleet up by applying the terraform config with the requested size.
# Arguments: $1 - worker count (digits only, required)
#            $2 - optional value for the terraform "size" variable
#            $3 - optional value for the terraform "region" variable
# Globals:   TF_VAR_do_token (must be set and non-empty)
# Returns:   0 once apply has succeeded; 1 on bad usage; otherwise the status
#            of the terraform step that failed.
cmd_dist_up() {
  local n="${1:-}"
  if ! [[ "$n" =~ ^[0-9]+$ ]]; then
    echo "usage: ./run dist:up <workers> [size] [region]" >&2
    return 1
  fi
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"

  local -a args=(-auto-approve -var "workers=$n")
  if [[ -n "${2:-}" ]]; then args+=(-var "size=$2"); fi
  if [[ -n "${3:-}" ]]; then args+=(-var "region=$3"); fi

  # Stop on the first terraform failure so a broken apply is never reported
  # as "fleet up" with a zero exit status.
  _dist_tf init -input=false >/dev/null || return
  _dist_tf apply "${args[@]}" || return
  echo "fleet up: $n worker(s). inventory: $(_dist_repo_root)/.local/fleet/inventory"
}
# Tear the fleet down by applying the terraform config with workers=0.
# Globals: TF_VAR_do_token (must be set and non-empty)
# Returns: 0 once apply has succeeded, otherwise terraform's failure status —
#          in which case nothing is printed, because workers may still exist.
cmd_dist_down() {
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"
  _dist_tf apply -auto-approve -var "workers=0" || return
  # The dollar sign is escaped: an unescaped "$0" would expand to the script name.
  echo "fleet down (workers=0): zero compute cost, snapshot only (~\$0.40/mo)."
}
# Fan a batch of simulated games across every worker listed in the inventory.
#
# Usage: cmd_dist_sim <total_games> [turn_limit] [--destroy-after]
#   total_games      number of games for the whole fleet (digits only)
#   turn_limit       forwarded to tools/autoplay-batch.sh as its 2nd argument
#                    (digits only, default 300)
#   --destroy-after  call cmd_dist_down once the batch is over; accepted in any
#                    position, so it is never mistaken for turn_limit
#
# Each worker receives a contiguous slice of ceil(total/workers) games, passed
# to tools/autoplay-batch.sh through SEED_OFFSET; the dispatcher output for
# worker <i> is captured in <results>/dispatch_worker_<i>.log.
# Returns: 0 when every worker batch exited 0, non-zero otherwise.
cmd_dist_sim() {
  local total turn destroy=false arg
  local -a positional=()
  for arg in "$@"; do
    if [[ "$arg" == "--destroy-after" ]]; then
      destroy=true
    else
      positional+=("$arg")
    fi
  done
  total="${positional[0]:-}"
  turn="${positional[1]:-300}"
  if ! [[ "$total" =~ ^[0-9]+$ && "$turn" =~ ^[0-9]+$ ]]; then
    echo "usage: ./run dist:sim <total_games> [turn_limit] [--destroy-after]" >&2
    return 1
  fi
  # Force base 10: a leading zero would otherwise be read as octal below.
  total=$(( 10#$total ))

  local root inv
  root="$(_dist_repo_root)" || return 1
  inv="$root/.local/fleet/inventory"
  if [[ ! -f "$inv" ]]; then
    echo "no inventory at $inv — run ./run dist:up <N> first" >&2
    return 1
  fi

  local -a hosts=()
  local line
  while IFS= read -r line; do
    hosts+=("$line")
  done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  if (( n == 0 )); then
    echo "inventory empty — fleet is down" >&2
    return 1
  fi

  local stamp results shard
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/iter/$stamp"
  mkdir -p "$results" || return 1
  shard=$(( (total + n - 1) / n )) # ceil(total / n)
  echo "distributing $total game(s) over $n worker(s): ~$shard each, turn_limit=$turn"
  echo "results → $results"

  local -a pids=()
  local i=0 host offset cnt cores
  for host in "${hosts[@]}"; do
    offset=$(( i * shard ))
    cnt=$shard
    # The last slice may be short; once the total is covered, stop dispatching.
    if (( offset + cnt > total )); then cnt=$(( total - offset )); fi
    if (( cnt <= 0 )); then break; fi

    # Probe the worker's core count; fall back to 8 when the probe fails or
    # returns anything other than a plain number. -n keeps ssh off our stdin.
    cores="$(ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" nproc 2>/dev/null)" || cores=""
    [[ "$cores" =~ ^[0-9]+$ ]] || cores=8

    echo " [$host] seeds $(( offset + 1 ))..$(( offset + cnt )) PARALLEL=$cores"
    AUTOPLAY_HOST="$host" SEED_OFFSET="$offset" PARALLEL="$cores" \
      bash "$root/tools/autoplay-batch.sh" "$cnt" "$turn" "$results" \
      >"$results/dispatch_worker_${i}.log" 2>&1 &
    pids+=("$!")
    i=$(( i + 1 ))
  done

  # Barrier: reap every dispatcher and count the ones that exited non-zero.
  local fail=0 p
  for p in "${pids[@]}"; do
    wait "$p" || fail=$(( fail + 1 ))
  done

  local produced
  produced="$(find "$results" -name turn_stats.jsonl -type f 2>/dev/null | wc -l | tr -d ' ')"
  echo "----------------------------------------------------------------"
  echo "distributed sim done: $produced game(s) produced turn_stats under $results"
  if (( fail != 0 )); then
    echo "WARNING: $fail worker batch(es) errored — see $results/dispatch_worker_*.log" >&2
  fi

  if [[ "$destroy" == true ]]; then
    echo "--destroy-after → tearing down"
    cmd_dist_down
  fi
  # The exit status reflects the batch outcome, not the teardown.
  [ "$fail" -eq 0 ]
}
# v1 blocking sweep: one training run per worker (distinct seed + run-name),
# then pull the models back. Detached orchestration is the documented follow-up.
#
# Usage: cmd_dist_train [total_steps] [--destroy-after]
#   total_steps      value passed to every worker's --total-steps (digits only,
#                    default 1000000)
#   --destroy-after  call cmd_dist_down once the sweep is over; accepted in any
#                    position, so it is never mistaken for total_steps
#
# Worker <i> trains with seed 42+i under run name dist-<stamp>-w<i>; its ssh
# output is captured in <results>/train_worker_<i>.log and its model directory
# is rsynced into <results>/.
# Returns: 0 when every remote training command exited 0, non-zero otherwise.
cmd_dist_train() {
  local steps destroy=false arg
  local -a positional=()
  for arg in "$@"; do
    if [[ "$arg" == "--destroy-after" ]]; then
      destroy=true
    else
      positional+=("$arg")
    fi
  done
  steps="${positional[0]:-1000000}"
  if ! [[ "$steps" =~ ^[0-9]+$ ]]; then
    echo "usage: ./run dist:train <total_steps> [--destroy-after]" >&2
    return 1
  fi

  local root inv
  root="$(_dist_repo_root)" || return 1
  inv="$root/.local/fleet/inventory"
  if [[ ! -f "$inv" ]]; then
    echo "no inventory at $inv — run ./run dist:up <N> first" >&2
    return 1
  fi

  local -a hosts=()
  local line
  while IFS= read -r line; do
    hosts+=("$line")
  done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  if (( n == 0 )); then
    echo "inventory empty — fleet is down" >&2
    return 1
  fi

  local stamp results
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/train/$stamp"
  mkdir -p "$results" || return 1
  echo "fanning $n training run(s) × $steps steps (CPU). results → $results"

  # Repository location on each worker, relative to the remote home directory.
  local repo_remote="Code/@projects/@magic-civilization"
  local -a pids=()
  local i=0 host seed run
  for host in "${hosts[@]}"; do
    seed=$(( 42 + i ))
    run="dist-${stamp}-w${i}"
    echo " [$host] run=$run seed=$seed"
    # -n keeps the backgrounded ssh from ever reading the caller's stdin.
    ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" \
      "cd ~/$repo_remote && python3 -m tooling.rl_self_play.train --run-name '$run' --seed $seed --total-steps $steps --device cpu" \
      >"$results/train_worker_${i}.log" 2>&1 &
    pids+=("$!")
    i=$(( i + 1 ))
  done

  # Barrier: reap every remote run and count the ones that exited non-zero.
  local fail=0 p
  for p in "${pids[@]}"; do
    wait "$p" || fail=$(( fail + 1 ))
  done

  # Pull each worker's model dir back. rsync gets the same non-interactive ssh
  # options as the training step, so an unreachable worker fails fast instead
  # of stalling the sweep on a password prompt.
  i=0
  for host in "${hosts[@]}"; do
    run="dist-${stamp}-w${i}"
    if ! rsync -az -e "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new" \
      "$host:~/$repo_remote/tooling/rl_self_play/models/$run" "$results/" 2>/dev/null; then
      echo " note: no model dir for $run on $host (check $results/train_worker_${i}.log)"
    fi
    i=$(( i + 1 ))
  done

  echo "----------------------------------------------------------------"
  echo "distributed train done under $results"
  if (( fail != 0 )); then
    echo "WARNING: $fail run(s) errored — see $results/train_worker_*.log" >&2
  fi

  if [[ "$destroy" == true ]]; then
    echo "--destroy-after → tearing down"
    cmd_dist_down
  fi
  # The exit status reflects the training outcome, not the teardown.
  [ "$fail" -eq 0 ]
}