magicciv/infra/terraform/test-fleet/variables.tf
Natalie f5c5d1a410 feat(infra): distributed test/train fleet on DigitalOcean (Terraform + Packer + dispatch)
Ephemeral CPU Droplet fleet that horizontally scales the iteration loop:
- infra/terraform/test-fleet: cattle Droplets from a golden image (auto-discovered
  by name via digitalocean_images), grouped under the mc:dev DO project, with a
  mocked-provider test suite (no token/spend).
- infra/packer: golden-image builder reusing scripts/dev-setup/linux.sh.
- scripts/run/dist.sh: ./run dist:{check,up,sim,train,down} — shard sim/test
  batches across workers via autoplay-batch AUTOPLAY_HOST+SEED_OFFSET.
GPU intentionally absent (workload is CPU-bound per docs/ai-production.md).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 08:51:09 -04:00

95 lines
3.1 KiB
HCL

# DigitalOcean API token.
# Declared without a default, so a value must always be supplied by the caller;
# flagged sensitive so Terraform redacts it in CLI output.
variable "do_token" {
  type        = string
  sensitive   = true
  description = "DigitalOcean API token (Read/Write). Export as TF_VAR_do_token; never commit."
}
# Number of worker Droplets in the fleet. Defaults to 0 (nothing running).
variable "workers" {
  type    = number
  default = 0

  description = <<-EOT
  Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
  Set to N to fan distributed sim/test work across N disposable Droplets, then
  back to 0 to tear the fleet down. Each worker is identical cattle; results
  are scp'd off by the dispatch layer before teardown, so there is no
  per-worker state to preserve.
  EOT

  validation {
    # `number` also admits fractional values, so a range check alone would let
    # e.g. 2.5 through. Require a whole number here so the mistake is reported
    # against this variable rather than wherever the value is later consumed.
    condition = (
      var.workers >= 0 &&
      var.workers <= 50 &&
      floor(var.workers) == var.workers
    )
    error_message = "Keep the fleet to a whole number between 0 and 50 (account-limit / sanity guard)."
  }
}
# DigitalOcean region slug for the fleet; defaults to nyc3.
variable "region" {
  type        = string
  default     = "nyc3"
  description = "DigitalOcean region slug. NYC: nyc1, nyc3. Others: sfo3, ams3, fra1, sgp1."
}
# Droplet size slug applied to every worker; defaults to s-8vcpu-16gb.
variable "size" {
  type    = string
  default = "s-8vcpu-16gb"

  description = <<-EOT
  Per-worker Droplet size slug. Distributed fan-out favours many small cheap
  boxes over one big one. Match tier to CPU profile:
  Bursty test/sim (minutes) -> Basic shared:
  s-4vcpu-8gb, s-8vcpu-16gb (default), s-8vcpu-16gb-amd
  Sustained RL training (hours @ 100%) -> CPU-Optimized (dedicated vCPU):
  c-8 = 8 vCPU / 16 GB, c-16 = 16 vCPU / 32 GB
  EOT
}
# Optional stock-image override; the empty default means "use the golden image".
variable "base_image" {
  type    = string
  default = ""

  description = <<-EOT
  Bootstrap escape hatch. Leave empty ("") to boot workers from the latest
  Packer-built golden image (resolved by name-substring via the
  digitalocean_images data source). Set to a stock image slug (e.g.
  "ubuntu-24-04-x64") only for first-run `terraform plan` testing BEFORE any
  golden image exists.
  EOT
}
# Substring used to locate the Packer-built golden image by name.
variable "golden_name_match" {
  type        = string
  default     = "mc-golden"
  description = "Name substring identifying the Packer golden image. Must match snapshot_name prefix in infra/packer/golden-image.pkr.hcl."

  validation {
    # Every image name contains the empty string, so a blank value would turn
    # the substring lookup into "match any image". Reject it up front.
    condition     = length(trimspace(var.golden_name_match)) > 0
    error_message = "The golden_name_match value must be a non-empty substring; a blank value would match every image name."
  }
}
# Filesystem path to the SSH public key; defaults to the user's ed25519 key.
variable "ssh_public_key_path" {
  type        = string
  default     = "~/.ssh/id_ed25519.pub"
  description = "Public key authorised on every worker (and used by the dispatch scripts to ssh in)."
}
# Prefix for fleet resource names; per the description, workers are named
# <name>-0, <name>-1, ...
variable "name" {
  type        = string
  default     = "mc-test"
  description = "Resource name prefix; workers are named <name>-0, <name>-1, ..."

  validation {
    # The prefix ends up in Droplet names, and DigitalOcean only accepts
    # hostname characters there (letters, digits, "." and "-"). Checking here
    # reports a bad or empty prefix at plan time instead of as an API error
    # part-way through an apply.
    condition     = can(regex("^[A-Za-z0-9][A-Za-z0-9.-]*$", var.name))
    error_message = "The name prefix must be non-empty, start with a letter or digit, and contain only letters, digits, '.' or '-'."
  }
}
# Git clone URL that workers pull from on boot; empty by default.
variable "git_remote" {
  type        = string
  default     = ""
  description = "GitLab clone URL (origin) the golden image was built from; cloud-init pulls the latest ref from here on boot."
}
# Git ref (branch, tag or SHA) checked out on boot; defaults to main.
variable "git_ref" {
  type        = string
  default     = "main"
  description = "Branch/tag/SHA the fleet checks out on boot. Pin to a SHA for reproducible distributed runs."
}
# Unix account name used for SSH access to workers; defaults to mc.
variable "remote_user" {
  type        = string
  default     = "mc"
  description = "Unix user the dispatch layer ssh's in as (created in the golden image; flatpak runs --user as this account)."
}
# Name of the DigitalOcean project that groups fleet resources; defaults to mc:dev.
variable "do_project" {
  type        = string
  default     = "mc:dev"
  description = "DigitalOcean project name to group fleet resources under."
}