diff --git a/infra/launchd/com.uvlava.mc.cull-builders.plist b/infra/launchd/com.uvlava.mc.cull-builders.plist
new file mode 100644
index 00000000..66862de1
--- /dev/null
+++ b/infra/launchd/com.uvlava.mc.cull-builders.plist
@@ -0,0 +1,50 @@
+
+
+
+
+ Label
+ com.uvlava.mc.cull-builders
+
+ ProgramArguments
+
+ /bin/bash
+ scripts/cull-orphan-builders.sh
+ --min-age-min
+ 90
+
+
+
+ WorkingDirectory
+ /Users/natalie/Code/@mc/@applications/magicciv
+
+
+ StartInterval
+ 1800
+
+ RunAtLoad
+
+
+ StandardOutPath
+ /tmp/mc-cull-builders.log
+ StandardErrorPath
+ /tmp/mc-cull-builders.log
+
+
diff --git a/infra/packer/golden-image.pkr.hcl b/infra/packer/golden-image.pkr.hcl
index 7e35907f..a48b3b2f 100644
--- a/infra/packer/golden-image.pkr.hcl
+++ b/infra/packer/golden-image.pkr.hcl
@@ -79,6 +79,10 @@ source "digitalocean" "golden" {
   image         = var.base_image
   ssh_username  = "root"
   snapshot_name = "mc-golden-${local.ts}"
+  # Deterministic, MC-owned builder name so scripts/cull-orphan-builders.sh can
+  # reap a leftover build droplet by prefix if a run is killed before Packer's own
+  # teardown. (Default would be "packer-UUID"; the cull script matches both.)
+  droplet_name  = "mc-packer-${local.ts}"
 }
 
 build {
diff --git a/scripts/cloud-bringup.sh b/scripts/cloud-bringup.sh
index 1d571acc..14dd2c7f 100644
--- a/scripts/cloud-bringup.sh
+++ b/scripts/cloud-bringup.sh
@@ -33,6 +33,10 @@ echo "########## $(date) — DO cloud bring-up starting ##########"
 _teardown() {
   echo "########## teardown: ./run dist:down ##########"
   ./run dist:down 2>&1 | tail -3 || true
+  # Reap any Packer build droplet left alive by a failed/interrupted build. Packer
+  # tears its builder down on a clean finish; this catches the cases it can't.
+  echo "########## teardown: cull orphaned packer builders ##########"
+  bash scripts/cull-orphan-builders.sh 2>&1 | tail -5 || true
   echo "forge left UP for inspection — './run forge:down' to park it (~\$0.30/mo idle)."
 }
 trap _teardown EXIT
diff --git a/scripts/cull-orphan-builders.sh b/scripts/cull-orphan-builders.sh
new file mode 100755
index 00000000..706d2a5c
--- /dev/null
+++ b/scripts/cull-orphan-builders.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+# Cull orphaned Packer build droplets ("zombies") from the MC DigitalOcean account.
+#
+# Packer destroys its build droplet on a clean finish. An interrupted or failed run
+# (SIGKILL, laptop sleep, network drop) can leave the s-8vcpu-16gb-amd builder alive —
+# ~$192/mo bleeding silently. See .project/handoffs/20260629_packer-cross-account-leak.md.
+#
+# Two ways this runs:
+#   * Automatically — cloud-bringup.sh calls it in its EXIT trap after every build,
+#     so a failed/Ctrl-C'd run reaps its own builder.
+#   * Periodically  — from a launchd/cron timer, to catch hard-kill cases the trap
+#     can't (SIGKILL/power loss). Use --min-age-min so it never races a live build.
+#
+# Selector = droplet NAME prefix (never matches a real service droplet). The packer
+# source names its builder "mc-packer-TIMESTAMP"; we also match the legacy default
+# "packer-UUID" so pre-existing zombies are reaped. Size is a defense-in-depth guard.
+#
+# Portability: the launchd job runs this with /bin/bash, which is bash 3.2 on macOS,
+# so bash-4-only builtins such as mapfile are deliberately avoided.
+#
+# Usage:
+#   scripts/cull-orphan-builders.sh                    # reap every leftover builder now
+#   scripts/cull-orphan-builders.sh --min-age-min 90   # only reap builders >90 min old (cron-safe)
+#   scripts/cull-orphan-builders.sh --dry-run          # list what would be reaped, delete nothing
+set -euo pipefail
+
+# Print the header comment block of this file (shebang excluded, stops at the first
+# non-comment line) with the leading "# " removed.
+usage() {
+  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
+}
+
+MIN_AGE_MIN=0
+DRY_RUN=0
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --min-age-min) MIN_AGE_MIN="${2:?--min-age-min needs a value}"; shift 2 ;;
+    --dry-run)     DRY_RUN=1; shift ;;
+    -h|--help)     usage; exit 0 ;;
+    *) echo "cull-orphan-builders: unknown arg '$1'" >&2; exit 2 ;;
+  esac
+done
+
+# MIN_AGE_MIN is parsed by python's float() below; reject anything non-numeric
+# here so a typo is a usage error rather than a traceback inside the pipeline.
+number_rx='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
+if ! [[ "$MIN_AGE_MIN" =~ $number_rx ]]; then
+  echo "cull-orphan-builders: --min-age-min must be a non-negative number, got '$MIN_AGE_MIN'" >&2
+  exit 2
+fi
+
+# launchd starts jobs with a minimal PATH, so check the external tools up front
+# and fail with a clear message instead of a bare "command not found".
+for tool in doctl python3; do
+  command -v "$tool" >/dev/null 2>&1 \
+    || { echo "cull-orphan-builders: required tool '$tool' not found on PATH" >&2; exit 1; }
+done
+
+TOKEN_FILE="${MC_DO_TOKEN_FILE:-$HOME/.vault/do_pat_mc}"
+[[ -r "$TOKEN_FILE" ]] || { echo "!!! no DO token at $TOKEN_FILE" >&2; exit 1; }
+DIGITALOCEAN_ACCESS_TOKEN="$(cat "$TOKEN_FILE")"; export DIGITALOCEAN_ACCESS_TOKEN
+
+# Only droplets of the known builder size are reaped (override with MC_BUILD_SIZE).
+BUILD_SIZE="${MC_BUILD_SIZE:-s-8vcpu-16gb-amd}"
+
+# Emit one tab-separated "id name size age_min" row per qualifying builder. Age is
+# computed in python (portable RFC3339 parse; macOS `date` can't do it cleanly).
+builder_filter='
+import json, os, re, sys
+from datetime import datetime, timezone
+
+min_age = float(os.environ["MIN_AGE_MIN"])
+build_size = os.environ["BUILD_SIZE"]
+# Anchored name prefixes that identify an MC packer builder; never matches a
+# real service droplet (com.uvlava.*, ct-forge-*, etc.).
+rx = re.compile(r"^(mc-packer-|packer-)")
+now = datetime.now(timezone.utc)
+
+for d in json.load(sys.stdin) or []:
+    name = d.get("name") or ""
+    if not rx.match(name):
+        continue
+    size = d.get("size_slug", "?")
+    did = d.get("id", "?")
+    created = str(d.get("created_at") or "")
+    try:
+        ts = datetime.fromisoformat(created.replace("Z", "+00:00"))
+        age_min = (now - ts).total_seconds() / 60.0
+    except (ValueError, TypeError):
+        # Age unknown (missing, unparseable or zone-less timestamp). With an age
+        # floor in force we cannot show the droplet is past it, so leave it alone
+        # and report it rather than risk deleting a live build. With no floor,
+        # age is irrelevant and the droplet is reaped like any other.
+        if min_age > 0:
+            print(f"SKIP-AGE\t{did}\t{name}\t{size}\t{created}", file=sys.stderr)
+            continue
+        age_min = 0.0
+    if age_min < min_age:
+        continue
+    # Defense-in-depth: only reap the known builder size. A differently-sized
+    # "packer-*" droplet is unexpected; surface it instead of nuking it.
+    if size != build_size:
+        print(f"SKIP-SIZE\t{did}\t{name}\t{size}\t{age_min:.0f}", file=sys.stderr)
+        continue
+    print(f"{did}\t{name}\t{size}\t{age_min:.0f}")
+'
+
+# --access-token pins the MC token explicitly (the documented rule), not whatever
+# doctl's default context happens to hold.
+droplets_json="$(doctl compute droplet list -o json --access-token "$DIGITALOCEAN_ACCESS_TOKEN")"
+
+# Command substitution rather than process substitution: a crashing filter then
+# aborts the script (set -e + pipefail) instead of reading as "nothing to cull".
+victim_rows="$(
+  printf '%s' "$droplets_json" \
+    | MIN_AGE_MIN="$MIN_AGE_MIN" BUILD_SIZE="$BUILD_SIZE" python3 -c "$builder_filter"
+)"
+
+if [[ -z "$victim_rows" ]]; then
+  echo "cull-orphan-builders: no orphaned packer builders found (min-age ${MIN_AGE_MIN}m)."
+  exit 0
+fi
+
+ids=()
+while IFS=$'\t' read -r id name size age; do
+  echo " orphan: $id $name $size ~${age}m old"
+  ids+=("$id")
+done <<<"$victim_rows"
+
+if [[ $DRY_RUN -eq 1 ]]; then
+  echo "cull-orphan-builders: --dry-run, deleting nothing (${#ids[@]} would be culled)."
+  exit 0
+fi
+
+echo "cull-orphan-builders: deleting ${#ids[@]} orphaned builder(s) ..."
+doctl compute droplet delete "${ids[@]}" --force --access-token "$DIGITALOCEAN_ACCESS_TOKEN"
+echo "cull-orphan-builders: done."