feat(infra): auto-cull orphaned packer build droplets to prevent zombies
Some checks are pending
ci / regression gate (push) Waiting to run
Some checks are pending
ci / regression gate (push) Waiting to run
Packer destroys its build droplet on a clean finish, but a killed/slept/network-dropped run leaves the s-8vcpu-16gb-amd builder alive (~$192/mo). This happened once already (.project/handoffs/20260629_packer-cross-account-leak.md). Two defense layers: - scripts/cull-orphan-builders.sh reaps leftover builders by name prefix (mc-packer-* / legacy packer-*) with a size guard and an optional age guard; pins the MC token via --access-token. - cloud-bringup.sh calls it in its EXIT trap, so a failed/Ctrl-C'd build reaps its own builder. - infra/launchd/com.uvlava.mc.cull-builders.plist sweeps every 30m with --min-age-min 90 to catch SIGKILL/power-loss cases no trap can. golden-image.pkr.hcl names the builder mc-packer-<ts> for deterministic matching. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a0428fc950
commit
273a7c71f8
4 changed files with 160 additions and 0 deletions
50
infra/launchd/com.uvlava.mc.cull-builders.plist
Normal file
50
infra/launchd/com.uvlava.mc.cull-builders.plist
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
Periodic safety-net sweep for orphaned Packer build droplets ("zombies").
|
||||||
|
|
||||||
|
cloud-bringup.sh already culls in its EXIT trap, so a failed or Ctrl-C'd build
|
||||||
|
reaps its own builder. This timer catches the cases the trap CANNOT: SIGKILL,
|
||||||
|
laptop sleep mid-build, or power loss — where no trap ever runs.
|
||||||
|
|
||||||
|
The min-age guard (90 min, passed in ProgramArguments below) means it only reaps builders older than 90 min, so it never
|
||||||
|
races a legitimately in-flight golden build (those take ~20-40 min).
|
||||||
|
|
||||||
|
Install (run on plum, the host that launches builds):
|
||||||
|
cp infra/launchd/com.uvlava.mc.cull-builders.plist ~/Library/LaunchAgents/
|
||||||
|
# edit WorkingDirectory below to your real repo path first, then:
|
||||||
|
launchctl load -w ~/Library/LaunchAgents/com.uvlava.mc.cull-builders.plist
|
||||||
|
Uninstall:
|
||||||
|
launchctl unload -w ~/Library/LaunchAgents/com.uvlava.mc.cull-builders.plist
|
||||||
|
Run once now (test):
|
||||||
|
launchctl start com.uvlava.mc.cull-builders
|
||||||
|
-->
|
||||||
|
<plist version="1.0">
<dict>
  <key>Label</key>
  <string>com.uvlava.mc.cull-builders</string>

  <!-- Runs the cull script with a 90-minute minimum-age guard. The script path
       is relative and resolves against WorkingDirectory below. -->
  <key>ProgramArguments</key>
  <array>
    <string>/bin/bash</string>
    <string>scripts/cull-orphan-builders.sh</string>
    <string>--min-age-min</string>
    <string>90</string>
  </array>

  <!-- launchd starts jobs with a minimal PATH (/usr/bin:/bin:/usr/sbin:/sbin).
       The script calls doctl and python3; doctl is not shipped with macOS, so
       without this the job would fail with "command not found" on every run.
       The Homebrew prefixes for Apple Silicon and Intel are listed first.
       EDIT if doctl is installed somewhere else on the build host. -->
  <key>EnvironmentVariables</key>
  <dict>
    <key>PATH</key>
    <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
  </dict>

  <!-- EDIT to the absolute path of this repo on the build host. -->
  <key>WorkingDirectory</key>
  <string>/Users/natalie/Code/@mc/@applications/magicciv</string>

  <!-- Every 30 min. -->
  <key>StartInterval</key>
  <integer>1800</integer>

  <!-- Also sweep once as soon as the agent is loaded. -->
  <key>RunAtLoad</key>
  <true/>

  <!-- stdout and stderr share one log file. -->
  <key>StandardOutPath</key>
  <string>/tmp/mc-cull-builders.log</string>
  <key>StandardErrorPath</key>
  <string>/tmp/mc-cull-builders.log</string>
</dict>
</plist>
|
||||||
|
|
@ -79,6 +79,10 @@ source "digitalocean" "golden" {
|
||||||
image = var.base_image
|
image = var.base_image
|
||||||
ssh_username = "root"
|
ssh_username = "root"
|
||||||
snapshot_name = "mc-golden-${local.ts}"
|
snapshot_name = "mc-golden-${local.ts}"
|
||||||
|
# Deterministic, MC-owned builder name so scripts/cull-orphan-builders.sh can
|
||||||
|
# reap a leftover build droplet by prefix if a run is killed before Packer's own
|
||||||
|
# teardown. (Default would be "packer-<uuid>"; the cull script matches both.)
|
||||||
|
droplet_name = "mc-packer-${local.ts}"
|
||||||
}
|
}
|
||||||
|
|
||||||
build {
|
build {
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,10 @@ echo "########## $(date) — DO cloud bring-up starting ##########"
|
||||||
# EXIT-trap handler: best-effort cleanup after a bring-up run.
#   1. Bring the distributed stack down (last 3 lines of output only).
#   2. Reap any Packer build droplet a failed or interrupted build left alive;
#      Packer removes its own builder only on a clean finish.
#   3. Remind the operator that the forge is still up.
# Both cleanup steps are allowed to fail so the remaining steps still run.
_teardown() {
  printf '%s\n' "########## teardown: ./run dist:down ##########"
  { ./run dist:down 2>&1 | tail -3; } || :

  printf '%s\n' "########## teardown: cull orphaned packer builders ##########"
  { bash scripts/cull-orphan-builders.sh 2>&1 | tail -5; } || :

  printf '%s\n' "forge left UP for inspection — './run forge:down' to park it (~\$0.30/mo idle)."
}
|
||||||
trap _teardown EXIT
|
trap _teardown EXIT
|
||||||
|
|
|
||||||
102
scripts/cull-orphan-builders.sh
Executable file
102
scripts/cull-orphan-builders.sh
Executable file
|
|
@ -0,0 +1,102 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Cull orphaned Packer build droplets ("zombies") from the MC DigitalOcean account.
|
||||||
|
#
|
||||||
|
# Packer destroys its build droplet on a clean finish. An interrupted or failed run
|
||||||
|
# (SIGKILL, laptop sleep, network drop) can leave the s-8vcpu-16gb-amd builder alive —
|
||||||
|
# ~$192/mo bleeding silently. See .project/handoffs/20260629_packer-cross-account-leak.md.
|
||||||
|
#
|
||||||
|
# Two ways this runs:
|
||||||
|
# * Automatically — cloud-bringup.sh calls it in its EXIT trap after every build,
|
||||||
|
# so a failed/Ctrl-C'd run reaps its own builder.
|
||||||
|
# * Periodically — from a launchd/cron timer, to catch hard-kill cases the trap
|
||||||
|
# can't (SIGKILL/power loss). Use --min-age-min so it never races a live build.
|
||||||
|
#
|
||||||
|
# Selector = droplet NAME prefix (never matches a real service droplet). The packer
|
||||||
|
# source names its builder "mc-packer-<ts>"; we also match the legacy default
|
||||||
|
# "packer-<uuid>" so pre-existing zombies are reaped. Size is a defense-in-depth guard.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# scripts/cull-orphan-builders.sh # reap every leftover builder now
|
||||||
|
# scripts/cull-orphan-builders.sh --min-age-min 90 # only reap builders >90 min old (cron-safe)
|
||||||
|
# scripts/cull-orphan-builders.sh --dry-run # list what would be reaped, delete nothing
|
||||||
|
set -euo pipefail

# Defaults: no age guard, really delete.
MIN_AGE_MIN=0
DRY_RUN=0

#######################################
# Parse command-line flags into MIN_AGE_MIN and DRY_RUN.
# Globals:   MIN_AGE_MIN (written), DRY_RUN (written)
# Arguments: the script's argv
# Outputs:   usage text on -h/--help; diagnostics on stderr
# Returns:   exits 0 after --help, 2 on an unknown flag or a non-numeric
#            --min-age-min value, non-zero when --min-age-min has no value
#######################################
parse_args() {
  # Plain non-negative decimal. Checked here so a typo fails loudly instead of
  # surfacing later as a float() error inside the Python filter.
  local number_re='^[0-9]+([.][0-9]+)?$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --min-age-min)
        MIN_AGE_MIN="${2:?--min-age-min needs a value}"
        if [[ ! "$MIN_AGE_MIN" =~ $number_re ]]; then
          echo "cull-orphan-builders: --min-age-min must be a non-negative number of minutes, got '$MIN_AGE_MIN'" >&2
          exit 2
        fi
        shift 2
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h|--help)
        # The usage text is the script's own leading comment block.
        grep '^#' "$0" | sed 's/^#\{1,\} \{0,1\}//'
        exit 0
        ;;
      *)
        echo "cull-orphan-builders: unknown arg '$1'" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||||
|
|
||||||
|
# Load the MC account's DigitalOcean token from a file (override the location
# with MC_DO_TOKEN_FILE) and export it for doctl.
TOKEN_FILE="${MC_DO_TOKEN_FILE:-$HOME/.vault/do_pat_mc}"
[[ -r "$TOKEN_FILE" ]] || { echo "!!! no DO token at $TOKEN_FILE" >&2; exit 1; }
DIGITALOCEAN_ACCESS_TOKEN="$(<"$TOKEN_FILE")"
# Drop stray whitespace (trailing CR, spaces) that would corrupt the token.
DIGITALOCEAN_ACCESS_TOKEN="${DIGITALOCEAN_ACCESS_TOKEN//[[:space:]]/}"
# Refuse to continue with an empty token.
# NOTE(review): an empty --access-token presumably lets doctl fall back to its
# default context, i.e. possibly the wrong account — confirm against doctl.
[[ -n "$DIGITALOCEAN_ACCESS_TOKEN" ]] || { echo "!!! DO token file $TOKEN_FILE is empty" >&2; exit 1; }
export DIGITALOCEAN_ACCESS_TOKEN

# Expected builder size slug (override with MC_BUILD_SIZE). This is the
# defense-in-depth guard: a name match with a different size is reported, not
# deleted. The anchored name-prefix match itself lives in the Python filter and
# never matches a real service droplet (com.uvlava.*, ct-forge-*, etc.).
BUILD_SIZE="${MC_BUILD_SIZE:-s-8vcpu-16gb-amd}"
|
||||||
|
|
||||||
|
# Python filter: reads the `doctl ... -o json` droplet list on stdin and emits
# one "id<TAB>name<TAB>size<TAB>age_min" row per qualifying builder on stdout.
# Age is computed in python (portable RFC3339 parse; macOS `date` can't do it
# cleanly). Inputs come from the environment: MIN_AGE_MIN (minutes) and
# BUILD_SIZE (size slug).
#
# A row qualifies when the name starts with "mc-packer-" or "packer-", the
# droplet is at least MIN_AGE_MIN old, and its size equals BUILD_SIZE. A name
# match with a different size is reported on stderr as SKIP-SIZE and not emitted.
# A droplet whose created_at cannot be parsed is treated as old enough to reap
# and its age column is "?".
#
# --access-token (on the doctl call that feeds this filter) pins the MC token
# explicitly (the documented rule), not whatever doctl's default context
# happens to hold.
builder_filter='
import json, os, re, sys
from datetime import datetime, timezone

min_age = float(os.environ["MIN_AGE_MIN"])
build_size = os.environ["BUILD_SIZE"]
rx = re.compile(r"^(mc-packer-|packer-)")
now = datetime.now(timezone.utc)

for d in json.load(sys.stdin) or []:
    name = d.get("name") or ""
    if not rx.match(name):
        continue
    created = str(d.get("created_at") or "")
    try:
        ts = datetime.fromisoformat(created.replace("Z", "+00:00"))
        if ts.tzinfo is None:
            # Naive timestamp: assume UTC so the subtraction below is valid.
            ts = ts.replace(tzinfo=timezone.utc)
        age_min = (now - ts).total_seconds() / 60.0
        age_txt = f"{age_min:.0f}"
    except ValueError:
        # Unparseable timestamp -> treat as old enough to reap. Infinity passes
        # any min-age guard; a value of 0 would be skipped whenever one is set.
        age_min = float("inf")
        age_txt = "?"
    if age_min < min_age:
        continue
    size = d.get("size_slug", "?")
    did = d.get("id", "?")
    # Defense-in-depth: only reap the known builder size. A differently-sized
    # "packer-*" droplet is unexpected; surface it instead of nuking it.
    if size != build_size:
        print(f"SKIP-SIZE\t{did}\t{name}\t{size}\t{age_txt}", file=sys.stderr)
        continue
    print(f"{did}\t{name}\t{size}\t{age_txt}")
'
|
||||||
|
# List every droplet in the account, with the token pinned explicitly.
droplets_json="$(doctl compute droplet list -o json --access-token "$DIGITALOCEAN_ACCESS_TOKEN")"

# Run the filter through a command substitution rather than process
# substitution, so a python3 failure (missing interpreter, malformed JSON)
# aborts the script under `set -e -o pipefail` instead of being reported as
# "no orphans".
victim_rows="$(
  printf '%s' "$droplets_json" \
    | MIN_AGE_MIN="$MIN_AGE_MIN" BUILD_SIZE="$BUILD_SIZE" python3 -c "$builder_filter"
)"

# Split into one array element per row. A read loop is used instead of
# `mapfile`, which does not exist in bash 3.2 (macOS /bin/bash, the interpreter
# the launchd job invokes).
victims=()
while IFS= read -r row; do
  if [[ -n "$row" ]]; then
    victims+=("$row")
  fi
done <<<"$victim_rows"
|
||||||
|
|
||||||
|
# Nothing qualified: report and stop successfully.
if [[ ${#victims[@]} -eq 0 ]]; then
  echo "cull-orphan-builders: no orphaned packer builders found (min-age ${MIN_AGE_MIN}m)."
  exit 0
fi

# Each row is "id<TAB>name<TAB>size<TAB>age"; list it and collect the id.
ids=()
for row in "${victims[@]}"; do
  IFS=$'\t' read -r id name size age <<<"$row"
  # The filter emits "?" when a droplet has no id; never pass a placeholder to
  # the delete call.
  if [[ ! "$id" =~ ^[0-9]+$ ]]; then
    echo "cull-orphan-builders: skipping '$name' - non-numeric droplet id '$id'" >&2
    continue
  fi
  echo " orphan: $id $name $size ~${age}m old"
  ids+=("$id")
done

if [[ $DRY_RUN -eq 1 ]]; then
  echo "cull-orphan-builders: --dry-run, deleting nothing (${#ids[@]} would be culled)."
  exit 0
fi

# Every row was skipped above: there is nothing valid to delete.
if [[ ${#ids[@]} -eq 0 ]]; then
  echo "cull-orphan-builders: no deletable builders after id validation." >&2
  exit 1
fi

echo "cull-orphan-builders: deleting ${#ids[@]} orphaned builder(s) ..."
# Pin the MC token on the destructive call too, matching the list call, so the
# delete does not depend on doctl's default context.
doctl compute droplet delete "${ids[@]}" --force --access-token "$DIGITALOCEAN_ACCESS_TOKEN"
echo "cull-orphan-builders: done."
|
||||||
Loading…
Add table
Reference in a new issue