diff --git a/infra/packer/provision.sh b/infra/packer/provision.sh
index 2109d2d5..c1a9e472 100755
--- a/infra/packer/provision.sh
+++ b/infra/packer/provision.sh
@@ -23,7 +23,7 @@ cloud-init status --wait >/dev/null 2>&1 || true
 apt-get -o DPkg::Lock::Timeout=600 update -y
 apt-get -o DPkg::Lock::Timeout=600 install -y --no-install-recommends \
   git curl ca-certificates build-essential pkg-config libssl-dev \
-  unzip sudo python3-pip flatpak rsync \
+  unzip sudo python3-pip flatpak rsync rclone \
   weston libgl1-mesa-dri libegl1 libgles2 libwayland-egl1 \
   mesa-vulkan-drivers vulkan-tools
 # So every worker can render proof scenes (opengl3/gl_compatibility) under a
diff --git a/scripts/run/dist.sh b/scripts/run/dist.sh
index 7a05a1ec..575f5610 100755
--- a/scripts/run/dist.sh
+++ b/scripts/run/dist.sh
@@ -58,7 +58,10 @@ Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
   ./run dist:train [--destroy-after]
   ./run dist:test  cargo test --workspace on a worker
   ./run dist:build  cargo build + wasm on a worker (wasm rsync'd back)
-  ./run dist:sync [ref]  git pull + rebuild gdext on live workers
+  ./run dist:publish  build once → upload .so/wasm to the artifact Space (keyed by sha)
+  ./run dist:fetch  download the prebuilt .so for HEAD's sha (skip recompile)
+  ./run dist:sync [ref]  git pull → fetch prebuilt .so if published, else build
+  ./run dist:models {push <dir> <name>|pull <name> <dir>|ls}  share RL models via the Space
   ./run dist:render  render a proof scene (software weston, no GPU) → png
   ./run dist:down
 EOF
@@ -257,29 +260,41 @@ cmd_dist_train() {
 # already carry the toolchain (golden image) + repo (cloud-init git pull).
 
 _dist_first_host() {
-  local inv
+  local inv h
   inv="$(_dist_repo_root)/.local/fleet/inventory"
   [ -f "$inv" ] || return 1
-  _dist_read_hosts "$inv" | head -1
+  h="$(_dist_read_hosts "$inv" | head -1)"
+  [ -n "$h" ] || return 1  # inventory present but no live host (e.g. "fleet is down")
+  printf '%s\n' "$h"
 }
 
 cmd_dist_sync() {
-  # Pull the given ref on every live worker + rebuild the GDExtension, so a
-  # mid-session code change reaches the fleet without an image rebuild.
+  # Pull the given ref on every live worker, then make the GDExtension current:
+  # fetch the prebuilt .so for that sha from the artifact Space if it exists
+  # (seconds), else build it; N workers share one build, no image rebuild.
+  # REF (%q-quoted) and the Space creds go over ssh stdin ahead of the script — never on argv.
   local ref="${1:-main}"
-  local root inv host
+  local root inv host senv
   root="$(_dist_repo_root)"
   inv="$root/.local/fleet/inventory"
   [ -f "$inv" ] || { echo "no fleet — run ./run dist:up first" >&2; return 1; }
+  senv="$(_dist_spaces_env 2>/dev/null || true)"  # empty → workers just build
   local pids=() p fail=0
   while IFS= read -r host; do
-    echo "[$host] sync → $ref"
-    ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" "
-      set -e
-      cd ~/Code/@projects/@magic-civilization
-      git fetch --depth=1 origin '$ref' && git reset --hard FETCH_HEAD
-      cd src/simulator && . ~/.cargo/env && bash build-gdext.sh
-    " &
+    echo "[$host] sync → $ref (fetch prebuilt .so, else build)"
+    { printf 'REF=%q\n' "$ref"; [ -z "$senv" ] || printf 'export %s\n' "$senv"; cat <<'REMOTE'; } | ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" \
+      "SPACE='$_DIST_SPACE' SO_PATH='$_DIST_SO_PATH' bash -s" &
+set -e
+cd ~/Code/@projects/@magic-civilization
+git fetch --depth=1 origin "$REF" && git reset --hard FETCH_HEAD
+SHA=$(git rev-parse HEAD)
+. ~/.cargo/env
+if [ -n "${RCLONE_S3_ACCESS_KEY_ID:-}" ] && rclone copyto ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so" "$SO_PATH" 2>/dev/null; then
+  echo "  [$SHA] fetched prebuilt .so (no rebuild)"
+else
+  ( cd src/simulator && bash build-gdext.sh ) && echo "  [$SHA] built .so (cache miss)"
+fi
+REMOTE
     pids+=($!)
   done < <(_dist_read_hosts "$inv")
   for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done
@@ -292,7 +307,7 @@ cmd_dist_test() {
   host="$(_dist_first_host)" || { echo "no fleet — run ./run dist:up 1 c-8 first" >&2; return 1; }
   repo="Code/@projects/@magic-civilization"
   echo "running cargo tests on $host ..."
-  ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" "
+  ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" "
     set -e
     cd ~/$repo/src/simulator && . ~/.cargo/env
     if command -v cargo-nextest >/dev/null 2>&1; then cargo nextest run --workspace; else cargo test --workspace; fi
@@ -308,7 +323,7 @@ cmd_dist_build() {
   root="$(_dist_repo_root)"
   repo="Code/@projects/@magic-civilization"
   echo "building workspace + wasm on $host ..."
-  ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" "
+  ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" "
     set -e
     cd ~/$repo/src/simulator && . ~/.cargo/env
     cargo build --workspace
@@ -332,3 +347,74 @@ cmd_dist_render() {
   PROJECT_ROOT_REMOTE="/home/${user}/Code/@projects/@magic-civilization" \
     bash "$(_dist_repo_root)/tools/capture-proof.sh" "$scene" "$out" "${3:-180}"
 }
+
+# ── build-artifact Space (magicciv-artifacts on DO Spaces) ───────────────────
+# Build once, publish the linux .so/wasm keyed by git sha; sim/test/AI runners
+# fetch the prebuilt artifact instead of recompiling. Creds: ~/.vault/do-spaces-uvlava.*
+_DIST_SPACE="magicciv-artifacts"
+_DIST_SO_PATH="src/game/engine/addons/magic_civ_physics/libmagic_civ_physics.x86_64.so"
+
+# Emit an `RCLONE_S3_*=...` assignment string (DO Spaces creds from the vault) for
+# handing to a remote shell. Empty (rc 1) if the keys are missing.
+_dist_spaces_env() {
+  local ak sk
+  ak="$(cat ~/.vault/do-spaces-uvlava.access 2>/dev/null)"
+  sk="$(cat ~/.vault/do-spaces-uvlava.secret 2>/dev/null)"
+  [ -n "$ak" ] && [ -n "$sk" ] || return 1
+  printf "RCLONE_S3_PROVIDER=DigitalOcean RCLONE_S3_ENDPOINT=nyc3.digitaloceanspaces.com RCLONE_S3_ACCESS_KEY_ID='%s' RCLONE_S3_SECRET_ACCESS_KEY='%s'" "$ak" "$sk"
+}
+
+cmd_dist_publish() {
+  # On a worker: build gdext + wasm, upload to magicciv-artifacts/builds/<sha>/. Creds go over ssh stdin as an `export` line, never on argv.
+  local host senv
+  host="$(_dist_first_host)" || { echo "no fleet — ./run dist:up 1 first" >&2; return 1; }
+  senv="$(_dist_spaces_env)" || { echo "no DO Spaces creds in ~/.vault/do-spaces-uvlava.*" >&2; return 1; }
+  echo "building + publishing artifacts on $host ..."
+  { printf 'export %s\n' "$senv"; cat <<'REMOTE'; } | ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" \
+    "SO_PATH='$_DIST_SO_PATH' SPACE='$_DIST_SPACE' bash -s"
+set -e
+cd ~/Code/@projects/@magic-civilization
+SHA=$(git rev-parse HEAD)
+. ~/.cargo/env
+( cd src/simulator && bash build-gdext.sh && bash build-wasm.sh )
+rclone copyto "$SO_PATH" ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so"
+WASM=""; if [ -d .local/build/wasm ]; then rclone copy .local/build/wasm ":s3:$SPACE/builds/$SHA/wasm/"; WASM=" + wasm"; fi
+printf 'sha=%s\nbuilt=%s\n' "$SHA" "$(date -u +%FT%TZ)" | rclone rcat ":s3:$SPACE/builds/$SHA/meta.txt"
+echo "published builds/$SHA/ (.so$WASM)"
+REMOTE
+}
+
+cmd_dist_fetch() {
+  # On a worker: fetch the prebuilt .so for the worker's HEAD sha into the addon
+  # path instead of recompiling. Nonzero (3) on a miss; rclone's stderr is kept so auth/network errors stay visible.
+  local host senv
+  host="$(_dist_first_host)" || { echo "no fleet — ./run dist:up 1 first" >&2; return 1; }
+  senv="$(_dist_spaces_env)" || { echo "no DO Spaces creds" >&2; return 1; }
+  { printf 'export %s\n' "$senv"; cat <<'REMOTE'; } | ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i "$HOME/.ssh/id_mc_fleet" "$host" \
+    "SO_PATH='$_DIST_SO_PATH' SPACE='$_DIST_SPACE' bash -s"
+set -e
+cd ~/Code/@projects/@magic-civilization
+SHA=$(git rev-parse HEAD)
+if rclone copyto ":s3:$SPACE/builds/$SHA/libmagic_civ_physics.x86_64.so" "$SO_PATH"; then
+  echo "FETCHED prebuilt .so for $SHA"
+else
+  echo "MISS: no prebuilt .so for $SHA — run ./run dist:publish"; exit 3
+fi
+REMOTE
+}
+
+cmd_dist_models() (
+  # Share RL model artifacts via the Space (runs on plum; models are platform-independent). Subshell body: the exported creds do not outlive the call.
+  # ./run dist:models push <dir> <name>   ./run dist:models pull <name> <dir>   ./run dist:models ls
+  local sub="${1:-}" ak sk
+  ak="$(cat ~/.vault/do-spaces-uvlava.access 2>/dev/null)"; sk="$(cat ~/.vault/do-spaces-uvlava.secret 2>/dev/null)"
+  [ -n "$ak" ] && [ -n "$sk" ] || { echo "no DO Spaces creds in ~/.vault/do-spaces-uvlava.*" >&2; return 1; }
+  export RCLONE_S3_PROVIDER=DigitalOcean RCLONE_S3_ENDPOINT=nyc3.digitaloceanspaces.com
+  export RCLONE_S3_ACCESS_KEY_ID="$ak" RCLONE_S3_SECRET_ACCESS_KEY="$sk"
+  case "$sub" in
+    push) [ -n "${2:-}" ] && [ -n "${3:-}" ] || { echo "usage: ./run dist:models push <dir> <name>" >&2; return 1; }; rclone copy "$2" ":s3:$_DIST_SPACE/models/$3/" -P ;;
+    pull) [ -n "${2:-}" ] && [ -n "${3:-}" ] || { echo "usage: ./run dist:models pull <name> <dir>" >&2; return 1; }; rclone copy ":s3:$_DIST_SPACE/models/$2/" "$3" -P ;;
+    ls) rclone ls ":s3:$_DIST_SPACE/models/" 2>/dev/null || echo "(empty)" ;;
+    *) echo "usage: ./run dist:models {push <dir> <name>|pull <name> <dir>|ls}" >&2; return 1 ;;
+  esac
+)
diff --git a/tooling/claude/dot-claude/instructions/cloud-dx-do.md b/tooling/claude/dot-claude/instructions/cloud-dx-do.md
index a0b9340b..78beaa32 100644
--- a/tooling/claude/dot-claude/instructions/cloud-dx-do.md
+++ 
b/tooling/claude/dot-claude/instructions/cloud-dx-do.md @@ -10,9 +10,12 @@ | `./run dist:up [size] [region]` | boot N workers from the golden image; **waits for cloud-init readiness** before returning | | `./run dist:test` | `cargo test --workspace` (nextest) on a worker | | `./run dist:build` | `cargo build` + WASM on a worker; rsync the WASM back (native `.so` is linux-only, stays on the worker) | +| `./run dist:publish` | **build once → upload the linux `.so` + wasm to the `magicciv-artifacts` Space, keyed by git sha**. The producer side of build-once-load-many. | +| `./run dist:fetch` | download the prebuilt `.so` for the worker's HEAD sha into the addon path — skip recompiling. Nonzero on cache miss. | | `./run dist:sim [turns] [--destroy-after]` | fan seeded sims across workers via `autoplay-batch.sh` `AUTOPLAY_HOST`+`SEED_OFFSET`; results merge in `.local/iter//` | | `./run dist:render ` | render a proof scene (software weston + Mesa, **no GPU**) and pull the PNG back — replaces the dead apricot `$SCREENSHOT_HOST` | -| `./run dist:sync [ref]` | `git pull` + rebuild gdext on **live** workers (mid-session code change, no image rebuild) | +| `./run dist:sync [ref]` | `git pull` on **live** workers, then **fetch the prebuilt `.so` from the Space if published for that sha, else build** — N workers share one build instead of N recompiles | +| `./run dist:models {push <dir> <name>\|pull <name> <dir>\|ls}` | share RL model artifacts (`.onnx`) via the Space; runs locally on plum (models are platform-independent) | | `./run dist:image [--cold]` | **(re)build the golden image — incremental by default** (layers on the last snapshot, ~8 min; provision.sh is idempotent so only the delta rebuilds). 
`--cold` = from stock Ubuntu (~20 min), reset cruft | | `./run dist:prune [keep=2]` | delete superseded golden snapshots (~$0.40/mo each); keeps the newest N | | `./run dist:down` | tear the fleet down → **$0** | @@ -24,7 +27,24 @@ - **Forge**: `mc-forge` droplet running Forgejo; repo `mcadmin/magicciv`; IP + admin creds in `~/.vault/mc_forge_creds`. - **Golden image**: Packer `infra/packer/`, auto-discovered by the fleet (snapshot name prefix `mc-golden`). Bakes: toolchain (via `scripts/dev-setup/linux.sh`) + prebuilt GDExtension `.so` + warm Godot import + **weston/Mesa render stack** + **mold + sccache** build accelerators + the fleet ssh key in `mc`'s `authorized_keys`. - **Fleet TF**: `infra/terraform/test-fleet/` — DO provider, golden-image data-source discovery, grouped under the `mc:dev` DO project, mocked-provider test suite. -- **Secrets**: `~/.vault/{do_pat_mc, mc_forge_creds}` (600). Key `~/.ssh/id_mc_fleet` (DO key `mc-fleet`). +- **Secrets**: `~/.vault/{do_pat_mc, mc_forge_creds, do-spaces-uvlava.access, do-spaces-uvlava.secret}` (600). Key `~/.ssh/id_mc_fleet` (DO key `mc-fleet`). +- **Artifact Space**: `magicciv-artifacts` (DO Spaces, nyc3) — `builds/<sha>/` holds the prebuilt linux `.so`+wasm; `models/<name>/` holds shared RL `.onnx`. Account already pays the Spaces subscription (for `lilith-quinn-media`), so this Space adds ~$0 base. Workers carry `rclone` (baked by `provision.sh`); the dispatch passes the Spaces creds as `RCLONE_S3_*` env over ssh (never stored on the worker, never on argv). + +## Build once, load many (the artifact Space) + +Fan-out used to mean N workers each recompiling the gdext. Now: **one** `dist:publish` builds + uploads the `.so` keyed by sha; every consumer (`dist:sync`, sim/test/render workers) **fetches** it. This *complements* sccache — sccache caches crate *compilation*, the Space caches the *final `.so`/wasm/models*. 
+ +``` +./run dist:up 3 # 3 workers from the golden image +./run dist:publish # worker 1 builds + uploads builds/<sha>/ (once) +./run dist:sync # all 3 workers fetch the prebuilt .so (no recompile) +./run dist:sim 300 200 # fan sims; teardown when done +./run dist:down +``` + +- Keyed by **git sha** — a different sha is a cache miss → `dist:sync` falls back to building. (Toolchain changes ride the golden-image rebuild, which re-publishes.) +- The `.so` is **linux x86_64 only** — this Space serves DO/linux runners; plum builds its own macOS `.dylib`. +- If the Spaces creds are absent, `dist:sync` silently degrades to build-on-each-worker (no breakage). ## Gotchas every agent must respect