Ephemeral CPU Droplet fleet that horizontally scales the iteration loop:
- infra/terraform/test-fleet: cattle Droplets from a golden image (auto-discovered
by name via digitalocean_images), grouped under the mc:dev DO project, with a
mocked-provider test suite (no token/spend).
- infra/packer: golden-image builder reusing scripts/dev-setup/linux.sh.
- scripts/run/dist.sh: ./run dist:{check,up,sim,train,down} — shard sim/test
batches across workers via autoplay-batch AUTOPLAY_HOST+SEED_OFFSET.
GPU intentionally absent (workload is CPU-bound per docs/ai-production.md).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
95 lines
3.1 KiB
HCL
95 lines
3.1 KiB
HCL
# DigitalOcean API credential. There is no default, so a value must always be
# supplied (the description recommends the TF_VAR_do_token environment
# variable). `sensitive = true` makes Terraform redact it in plan/apply output.
variable "do_token" {
  type        = string
  sensitive   = true
  description = "DigitalOcean API token (Read/Write). Export as TF_VAR_do_token; never commit."
}
|
|
|
|
# Fleet size. Defaults to 0, i.e. no workers are requested unless the caller
# opts in. Two independent validations guard the value:
#   1. the original 0..50 range check (account-limit / sanity guard), and
#   2. a whole-number check, because `number` also admits fractions.
variable "workers" {
  description = <<-EOT
    Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
    Set to N to fan distributed sim/test work across N disposable Droplets, then
    back to 0 to tear the fleet down. Each worker is identical cattle; results
    are scp'd off by the dispatch layer before teardown, so there is no
    per-worker state to preserve.
  EOT
  type        = number
  default     = 0

  # Upper bound keeps a typo (e.g. 500) from requesting a huge fleet.
  validation {
    condition     = var.workers >= 0 && var.workers <= 50
    error_message = "Keep the fleet between 0 and 50 (account-limit / sanity guard)."
  }

  # A value such as 2.5 satisfies the range check above but is not a valid
  # number of Droplets; reject it here with a clear message instead of letting
  # it fail later wherever the value is consumed.
  validation {
    condition     = floor(var.workers) == var.workers
    error_message = "workers must be a whole number of Droplets (e.g. 0, 1, 8)."
  }
}
|
|
|
|
# Region slug for the fleet; falls back to "nyc3" when not overridden.
# No validation is applied here, so any string is accepted at this layer.
variable "region" {
  type        = string
  default     = "nyc3"
  description = "DigitalOcean region slug. NYC: nyc1, nyc3. Others: sfo3, ams3, fra1, sgp1."
}
|
|
|
|
# Droplet size slug applied to every worker. The default is the Basic shared
# "s-8vcpu-16gb" tier; the description lists the alternatives by workload.
# The value is a free-form string: no validation is applied at this layer.
variable "size" {
  type        = string
  default     = "s-8vcpu-16gb"
  description = <<-EOT
    Per-worker Droplet size slug. Distributed fan-out favours many small cheap
    boxes over one big one. Match tier to CPU profile:
    Bursty test/sim (minutes) -> Basic shared:
    s-4vcpu-8gb, s-8vcpu-16gb (default), s-8vcpu-16gb-amd
    Sustained RL training (hours @ 100%) -> CPU-Optimized (dedicated vCPU):
    c-8 = 8 vCPU / 16 GB, c-16 = 16 vCPU / 32 GB
  EOT
}
|
|
|
|
# Optional image override. The empty-string default means "no override"; per
# the description, workers then boot from the golden image looked up by name.
# NOTE(review): the lookup/fallback logic lives outside this file — confirm the
# consumer treats "" (and not null) as the "use golden image" signal.
variable "base_image" {
  type        = string
  default     = ""
  description = <<-EOT
    Bootstrap escape hatch. Leave empty ("") to boot workers from the latest
    Packer-built golden image (resolved by name-substring via the
    digitalocean_images data source). Set to a stock image slug (e.g.
    "ubuntu-24-04-x64") only for first-run `terraform plan` testing BEFORE any
    golden image exists.
  EOT
}
|
|
|
|
# Substring used to find the Packer-built golden image by name. Defaults to
# "mc-golden"; the description requires it to stay in sync with the
# snapshot_name prefix in infra/packer/golden-image.pkr.hcl.
variable "golden_name_match" {
  description = "Name substring identifying the Packer golden image. Must match snapshot_name prefix in infra/packer/golden-image.pkr.hcl."
  type        = string
  default     = "mc-golden"

  # An empty (or whitespace-only) substring is contained in every image name,
  # so the lookup could silently select an unrelated image. try() turns a null
  # value into a validation failure instead of a function-call error.
  validation {
    condition     = try(length(trimspace(var.golden_name_match)) > 0, false)
    error_message = "golden_name_match must be a non-empty substring; an empty value would match every image name."
  }
}
|
|
|
|
# Filesystem path to the SSH public key; defaults to the user's ed25519 key.
# NOTE(review): the default starts with "~", which Terraform's file() does not
# expand on its own — confirm the consumer wraps this value in pathexpand().
variable "ssh_public_key_path" {
  type        = string
  default     = "~/.ssh/id_ed25519.pub"
  description = "Public key authorised on every worker (and used by the dispatch scripts to ssh in)."
}
|
|
|
|
# Prefix for fleet resource names; defaults to "mc-test". Per the description
# each worker is named "<name>-<index>", so the prefix ends up inside a Droplet
# name.
variable "name" {
  description = "Resource name prefix; workers are named <name>-0, <name>-1, ..."
  type        = string
  default     = "mc-test"

  # Droplet names are restricted to hostname characters (a-z, A-Z, 0-9, "." and
  # "-"). Checking the prefix here surfaces a bad value at plan time instead of
  # as an API rejection part-way through an apply. can() also maps a null value
  # to a validation failure.
  validation {
    condition     = can(regex("^[A-Za-z0-9][A-Za-z0-9.-]*$", var.name))
    error_message = "name must be non-empty, start with a letter or digit, and contain only letters, digits, '.' or '-'."
  }
}
|
|
|
|
# Clone URL of the repository the workers pull from. Defaults to the empty
# string.
# NOTE(review): what cloud-init does when this is left empty is not visible
# from this file — confirm the boot script tolerates "".
variable "git_remote" {
  type        = string
  default     = ""
  description = "GitLab clone URL (origin) the golden image was built from; cloud-init pulls the latest ref from here on boot."
}
|
|
|
|
# Git ref checked out on each worker at boot; tracks "main" unless overridden.
# The description recommends pinning a commit SHA when runs must be
# reproducible.
variable "git_ref" {
  type        = string
  default     = "main"
  description = "Branch/tag/SHA the fleet checks out on boot. Pin to a SHA for reproducible distributed runs."
}
|
|
|
|
# Login account used for SSH access to the workers; defaults to "mc".
# Per the description this account is provisioned by the golden image, not by
# this configuration, so overriding it presumably requires a matching image.
variable "remote_user" {
  type        = string
  default     = "mc"
  description = "Unix user the dispatch layer ssh's in as (created in the golden image; flatpak runs --user as this account)."
}
|
|
|
|
# Name of the DigitalOcean project the fleet's resources are grouped under;
# defaults to "mc:dev".
# NOTE(review): whether the project is looked up or created is decided outside
# this file — confirm it exists before the first apply.
variable "do_project" {
  type        = string
  default     = "mc:dev"
  description = "DigitalOcean project name to group fleet resources under."
}
|