perf(infra): incremental golden-image rebuilds (layer on the last snapshot)

Packer base image is now a var; ./run dist:image builds FROM the newest
mc-golden snapshot by default, so the idempotent provision.sh only redoes changed
work (~3-8 min vs ~20 cold). --cold rebuilds from stock Ubuntu to reset layer
cruft. Made the clone step idempotent (clone-or-fetch) so it works on a
pre-provisioned base. Directly addresses 'avoid unnecessary rebuilds'.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-27 14:41:01 -04:00
parent 68099051b8
commit d9588f8c80
3 changed files with 42 additions and 4 deletions

View file

@ -53,6 +53,14 @@ variable "remote_user" {
default = "mc"
}
variable "base_image" {
type = string
default = "ubuntu-24-04-x64"
# Image the build droplet is created from (consumed as `image = var.base_image`
# in the digitalocean source). Use stock Ubuntu for a COLD build, or a previous
# mc-golden snapshot ID for an INCREMENTAL rebuild — provision.sh is idempotent,
# so it only redoes changed work (~3-8 min vs ~20).
# `./run dist:image` picks this automatically.
}
variable "fleet_pubkey" {
type = string
default = ""
@ -68,7 +76,7 @@ source "digitalocean" "golden" {
api_token = var.do_token
region = var.region
size = var.build_size
image = "ubuntu-24-04-x64"
image = var.base_image
ssh_username = "root"
snapshot_name = "mc-golden-${local.ts}"
}

View file

@ -65,10 +65,11 @@ as_user() {
bash -lc "$1"
}
echo "=== [3/7] clone repo @ \$HOME/$REPO_PATH (ref $GIT_REF) ==="
echo "=== [3/7] clone-or-update repo @ \$HOME/$REPO_PATH (ref $GIT_REF) ==="
# Idempotent: clone on a cold build, fetch+reset on an incremental rebuild (the
# repo is already present when building FROM a previous golden snapshot).
as_user "mkdir -p ~/$(dirname "$REPO_PATH")"
as_user "git clone '$GIT_REMOTE' ~/$REPO_PATH"
as_user "cd ~/$REPO_PATH && git checkout -f '$GIT_REF'"
# Existing checkout (~/$REPO_PATH/.git present): re-point origin at $GIT_REMOTE,
# fetch only $GIT_REF at depth 1, then hard-reset to FETCH_HEAD (local changes in
# the checkout are discarded). Otherwise: fresh clone followed by a forced
# checkout of $GIT_REF.
# $GIT_REMOTE / $GIT_REF are expanded by THIS script before the string is handed
# to as_user; the single quotes only protect the values inside the inner shell,
# so a value containing a single quote would break the command.
as_user "if [ -d ~/$REPO_PATH/.git ]; then cd ~/$REPO_PATH && git remote set-url origin '$GIT_REMOTE' && git fetch --depth=1 origin '$GIT_REF' && git reset --hard FETCH_HEAD; else git clone '$GIT_REMOTE' ~/$REPO_PATH && cd ~/$REPO_PATH && git checkout -f '$GIT_REF'; fi"
echo "=== [4/7] toolchain via scripts/dev-setup/linux.sh ==="
# WITH_RUNNER must be defined: linux.sh references it unguarded under set -u and

View file

@ -51,6 +51,7 @@ cmd_dist() {
cat <<'EOF'
Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
./run dist:check offline: fmt + validate + mocked test (no token/spend)
./run dist:image [--cold] (re)build golden image — incremental by default (~3-8min vs ~20 cold)
./run dist:up <workers> [size] [region] e.g. ./run dist:up 10
./run dist:sim <games> [turn_limit] [--destroy-after]
./run dist:train <total_steps> [--destroy-after]
@ -79,6 +80,34 @@ cmd_dist_check() {
echo "dist:check OK — config is valid, no resources touched."
}
cmd_dist_image() {
  # (Re)build the golden image with Packer.
  #
  # INCREMENTAL by default: builds FROM the newest mc-golden* droplet snapshot,
  # so provision.sh (idempotent) only redoes changed work (~3-8 min). --cold
  # builds from stock Ubuntu (~20 min) — resets accumulated layer cruft; run
  # occasionally.
  #
  # Arguments: [--cold]   (any other argument is ignored)
  # Reads:     ~/.vault/do_pat_mc        DigitalOcean API token
  #            ~/.vault/mc_forge_creds   sourced; must provide ADMIN_USER,
  #                                      ADMIN_PASS and FORGE_IP
  #            ~/.ssh/id_mc_fleet.pub    fleet public key
  # Exports:   DIGITALOCEAN_TOKEN, PKR_VAR_git_remote, PKR_VAR_fleet_pubkey,
  #            PKR_VAR_base_image
  # Returns:   1 when the token or the forge credentials are missing; otherwise
  #            the status of the `packer init && packer build` chain.
  local cold=false arg
  for arg in "$@"; do
    if [ "$arg" = "--cold" ]; then cold=true; fi
  done

  local root pat
  root="$(_dist_repo_root)"
  # `|| pat=""` keeps a missing token file from aborting the caller under
  # `set -e` before the diagnostic below is printed.
  pat="$(cat ~/.vault/do_pat_mc 2>/dev/null)" || pat=""
  [ -n "$pat" ] || { echo "no ~/.vault/do_pat_mc" >&2; return 1; }
  export DIGITALOCEAN_TOKEN="$pat"

  # Fail early instead of exporting a git remote like "http://:@:3000/...".
  [ -r ~/.vault/mc_forge_creds ] || { echo "no ~/.vault/mc_forge_creds" >&2; return 1; }
  # shellcheck disable=SC1090
  . ~/.vault/mc_forge_creds
  local name
  for name in ADMIN_USER ADMIN_PASS FORGE_IP; do
    [ -n "${!name:-}" ] || { echo "$name is empty after sourcing ~/.vault/mc_forge_creds" >&2; return 1; }
  done
  export PKR_VAR_git_remote="http://${ADMIN_USER}:${ADMIN_PASS}@${FORGE_IP}:3000/mcadmin/magicciv.git"
  PKR_VAR_fleet_pubkey="$(cat ~/.ssh/id_mc_fleet.pub)"; export PKR_VAR_fleet_pubkey

  local base="ubuntu-24-04-x64" prev="" resp=""
  if [ "$cold" = true ]; then
    echo "COLD rebuild from $base"
  else
    # -f turns HTTP errors (bad token, rate limit, ...) into a non-zero exit so an
    # error body is never parsed as a snapshot list; -S still prints curl's own
    # error. Only the first page (per_page=200) of snapshots is inspected.
    # Both steps sit in the `if` condition so a failure is handled here rather
    # than tripping `set -e`/`pipefail` in the caller.
    if resp="$(curl -fsS -H "Authorization: Bearer $pat" "https://api.digitalocean.com/v2/snapshots?resource_type=droplet&per_page=200")" \
      && prev="$(printf '%s' "$resp" \
        | python3 -c "import sys,json;s=[x for x in json.load(sys.stdin)['snapshots'] if x['name'].startswith('mc-golden')];s.sort(key=lambda x:x['created_at']);print(s[-1]['id'] if s else '')" 2>/dev/null)"; then
      if [ -n "$prev" ]; then
        base="$prev"
        echo "INCREMENTAL rebuild from snapshot $base (pass --cold for a full rebuild)"
      else
        echo "no prior golden — cold build"
      fi
    else
      # Lookup failure is NOT the same as "no snapshot yet": say so, then keep the
      # original best-effort behaviour of building from stock Ubuntu.
      echo "snapshot lookup failed (DigitalOcean API request or response parsing) — falling back to a cold build from $base" >&2
    fi
  fi
  export PKR_VAR_base_image="$base"

  ( cd "$root/infra/packer" && packer init golden-image.pkr.hcl >/dev/null && packer build golden-image.pkr.hcl )
}
cmd_dist_up() {
local n="${1:-}"
[[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up <workers> [size] [region]" >&2; return 1; }