From f5c5d1a410400c03c6eb9d867e420b0b6881e406 Mon Sep 17 00:00:00 2001 From: Natalie Date: Sat, 27 Jun 2026 08:51:09 -0400 Subject: [PATCH] feat(infra): distributed test/train fleet on DigitalOcean (Terraform + Packer + dispatch) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ephemeral CPU Droplet fleet that horizontally scales the iteration loop: - infra/terraform/test-fleet: cattle Droplets from a golden image (auto-discovered by name via digitalocean_images), grouped under the mc:dev DO project, with a mocked-provider test suite (no token/spend). - infra/packer: golden-image builder reusing scripts/dev-setup/linux.sh. - scripts/run/dist.sh: ./run dist:{check,up,sim,train,down} — shard sim/test batches across workers via autoplay-batch AUTOPLAY_HOST+SEED_OFFSET. GPU intentionally absent (workload is CPU-bound per docs/ai-production.md). Co-Authored-By: Claude Opus 4.8 --- infra/packer/golden-image.pkr.hcl | 51 ++++++++-------- .../terraform/test-fleet/.terraform.lock.hcl | 45 +++++++------- infra/terraform/test-fleet/README.md | 58 +++++++++++-------- infra/terraform/test-fleet/main.tf | 54 ++++++++++------- infra/terraform/test-fleet/outputs.tf | 6 +- .../test-fleet/terraform.tfvars.example | 10 ++-- .../test-fleet/tests/fleet.tftest.hcl | 39 ++++++++----- infra/terraform/test-fleet/variables.tf | 49 +++++++++------- infra/terraform/test-fleet/versions.tf | 10 ++-- scripts/run/dist.sh | 26 ++++----- 10 files changed, 191 insertions(+), 157 deletions(-) diff --git a/infra/packer/golden-image.pkr.hcl b/infra/packer/golden-image.pkr.hcl index 11003ed1..73a5810e 100644 --- a/infra/packer/golden-image.pkr.hcl +++ b/infra/packer/golden-image.pkr.hcl @@ -1,41 +1,41 @@ -// Bakes a Hetzner snapshot with the full toolchain + a warm clone + a prebuilt -// GDExtension + a warm Godot import cache, so fleet workers boot build-ready in -// ~30s instead of running rustup/godot-install on every spin-up. 
+// Bakes a DigitalOcean custom image (snapshot) with the full toolchain + a warm +// clone + a prebuilt GDExtension + a warm Godot import cache, so fleet workers +// boot build-ready in ~30s instead of running rustup/godot-install per spin-up. // // Build once: -// export HCLOUD_TOKEN=... # or pass -var hcloud_token=... +// export DIGITALOCEAN_TOKEN=... // or pass -var do_token=... // packer init infra/packer/golden-image.pkr.hcl // packer build -var git_remote=https://gitlab.com//magic-civilization.git \ // infra/packer/golden-image.pkr.hcl // -// The snapshot is labelled type=golden,project=magic-civilization; the test-fleet -// Terraform module auto-discovers the newest one by that label. +// The image is named mc-golden-; the test-fleet Terraform module +// auto-discovers the newest one by the "mc-golden" name substring. packer { required_plugins { - hcloud = { - source = "github.com/hetznercloud/hcloud" - version = ">= 1.5.0" + digitalocean = { + source = "github.com/digitalocean/digitalocean" + version = ">= 1.4.0" } } } -variable "hcloud_token" { +variable "do_token" { type = string sensitive = true - default = env("HCLOUD_TOKEN") + default = env("DIGITALOCEAN_TOKEN") } -variable "location" { +variable "region" { type = string - default = "ash" + default = "nyc3" } -# A one-off dedicated box builds fast (cargo + godot import are CPU-heavy); +# A one-off CPU-Optimized box builds fast (cargo + godot import are CPU-heavy); # it only exists for the duration of the build. 
-variable "build_server_type" { +variable "build_size" { type = string - default = "ccx33" + default = "c-8" } variable "git_remote" { @@ -56,22 +56,17 @@ locals { ts = formatdate("YYYYMMDDhhmmss", timestamp()) } -source "hcloud" "golden" { - token = var.hcloud_token - image = "ubuntu-24.04" - location = var.location - server_type = var.build_server_type - ssh_username = "root" - +source "digitalocean" "golden" { + api_token = var.do_token + region = var.region + size = var.build_size + image = "ubuntu-24-04-x64" + ssh_username = "root" snapshot_name = "mc-golden-${local.ts}" - snapshot_labels = { - type = "golden" - project = "magic-civilization" - } } build { - sources = ["source.hcloud.golden"] + sources = ["source.digitalocean.golden"] provisioner "shell" { environment_vars = [ diff --git a/infra/terraform/test-fleet/.terraform.lock.hcl b/infra/terraform/test-fleet/.terraform.lock.hcl index bea2b44e..907f7e93 100644 --- a/infra/terraform/test-fleet/.terraform.lock.hcl +++ b/infra/terraform/test-fleet/.terraform.lock.hcl @@ -1,6 +1,30 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. 
+provider "registry.terraform.io/digitalocean/digitalocean" { + version = "2.92.0" + constraints = "~> 2.0" + hashes = [ + "h1:PDahQCnG9M3XAjihY7KzGVPuLQTB6gPKWn7Tp9TPaOY=", + "zh:13cefc6a94b74445713abeacfdf6422d1aecf820ec08fe69bae63c3ea6fbe24e", + "zh:20fc749afda0dfd10ec6815db78efb0bdf399033db536738580816ca341cd2c6", + "zh:2fac398f97fbec5d9c16ce3c58a9925ca0474c4931ead3352af56161fd7d6f1e", + "zh:3e0542d5200c1efb3113bd2ad3a5cc1ba32b9d1fe7017044ceeb0b7729a7a7f6", + "zh:583ddc43350dfb84a9a5689fe11964df9afe1ae03d099ab96c8f0fb7bc7a4cad", + "zh:6025ea83b0602b6ff01b3c5bbe025e73e8b47a217aae6c4270725feac01ebb2b", + "zh:6be3c78cb90752ce9357c33792f869382ff9dbd01333d985127116478bdcec21", + "zh:75c4c76c24bdc7e9c8626603d1c082d0894c798096ccfe8e2ceba68ad4570638", + "zh:7abc9714982dee251e6b9ce6d4910cd413a46cb92f76a4ed3a92a56e7cc1b4e7", + "zh:7c4808dd90886f33c5bd861b7b6be9b942ae2b32a188793f6f4e07be4e146b47", + "zh:7d13d3bec74e08444334e6b5c1c5f5380d40dd0bbb80d2d387d9084aaecbd3cf", + "zh:8a11b04c46865bdcd49f15622398e6e4911aa5be5d0b12d0b708cdda5c8ff734", + "zh:910cad53707e4743f6c277fb0007f6937a64be5b3a8ded3af1273628b9c141fe", + "zh:a67d98e6aceb5837064c6e811a557dbaaa61791b99a8b8d87b278aecb871910e", + "zh:bed15d16d4be506123fba16c3fa6db7cafa7d2ed53f07ff370cc2228e5f6d9ba", + "zh:f794ef952a8b2b5702ecfecf9bfe372dac392789b0762e5598764d10f24a8210", + ] +} + provider "registry.terraform.io/hashicorp/local" { version = "2.9.0" constraints = "~> 2.5" @@ -21,24 +45,3 @@ provider "registry.terraform.io/hashicorp/local" { "zh:fcafa360a5b0b96244f26f4e3a6d642b716a376557142c2442ff2fb12d11da18", ] } - -provider "registry.terraform.io/hetznercloud/hcloud" { - version = "1.66.0" - constraints = "~> 1.49" - hashes = [ - "h1:iVAGP8gRbZK0kJF7SiYJRt61wz0D5AF9q+WMsrAiBI0=", - "zh:1286cee6fb63dbcb18f53077bbb5e5d132a4e4d9f006af4e8d8edfc08d6bcdc8", - "zh:204460dacc044bda019a4a18b398e094289500c36913c7c9457f432adf31b8b2", - "zh:214175d50773481cbeaf9c9004e4121a3a1c9686c79424ebdc8ff189dd057d3e", - 
"zh:22b17bceff61cc13ad04a399ba87521356a3a134d4687273727473ae9eccf5f1", - "zh:368867dac5525c411de7e38f2e27de0a71854d1750867322ff2b9321128c88fb", - "zh:5289b75f8370bdbc4c6051d55cf33d0b1bd25dc6d71bfbd39b360249a37f1501", - "zh:81cb676aa50c5777df8fc80d4e69c9012330ae751f5e6f12bf6074bfd2e7c496", - "zh:ab08aead10643b21aa6b51af562b50492e12b9dd0ab7dca27a05aa63209b7d66", - "zh:af25c210d0570cf61ef767b2545bf9f3fb909178135f0e5e14bec0c1c9d07a63", - "zh:bcad66f4830c97118fa793723e53f8a4d27ddd34ea969ff259408842c2238331", - "zh:ce3ed323d75ae905d975925fa98c7054a7514c81276a485fc37da8232b53e39f", - "zh:d481bc0ef0c87ab1969c17777f526b2f59f823432d676145134c41a6d29bd98e", - "zh:ea7ef88df2c3ca154d86238920636d52a3c9066c7467543d3fa45f1e52ec2f7b", - ] -} diff --git a/infra/terraform/test-fleet/README.md b/infra/terraform/test-fleet/README.md index 6dbe1ae0..4e3097db 100644 --- a/infra/terraform/test-fleet/README.md +++ b/infra/terraform/test-fleet/README.md @@ -1,39 +1,47 @@ -# test-fleet — distributed test/train infra (Hetzner) +# test-fleet — distributed test/train infra (DigitalOcean) -Horizontally scales the iteration loop onto cheap ephemeral Hetzner cattle. One -local command fans seeded sim batches (or RL training) across N disposable +Horizontally scales the iteration loop onto cheap ephemeral DigitalOcean Droplets. +One local command fans seeded sim batches (or RL training) across N disposable workers, collects results locally, and tears the fleet down. **Idle cost ≈ €0** -(fleet defaults to 0 workers; only the golden snapshot bills, ~€0.10/mo). +(fleet defaults to 0 workers; only the golden image bills, ~$0.40/mo). 
## Layers | Layer | Where | What | |---|---|---| -| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → labelled snapshot | -| Fleet | here | `workers = N` cattle from the snapshot, auto-discovered by label | +| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → custom image | +| Fleet | here | `workers = N` Droplets from the image, auto-discovered by name | | Dispatch | `scripts/run/dist.sh` | shard → fan out over ssh → collect → merge → teardown | +## Offline verification (no token, no spend) + +```sh +./run dist:check # terraform fmt + validate (schema typecheck) + mocked-provider test +``` + +Run it anytime — before you even have a DO account. It uses a **mocked** provider. + ## One-time setup -1. **Hetzner**: create a Cloud account + project + payment method. Generate a - project-scoped **Read/Write API token**. (New accounts may need ID verification.) +1. **DigitalOcean**: in the Control Panel → **API → Tokens**, generate a personal + access token with **read+write** scope. 2. **GitLab**: push the repo; note the clone URL (the workers' `origin`). 3. **Build the golden image once** (see `../../packer/golden-image.pkr.hcl`): ```sh - export HCLOUD_TOKEN= + export DIGITALOCEAN_TOKEN= packer init ../../packer/golden-image.pkr.hcl packer build -var git_remote= ../../packer/golden-image.pkr.hcl ``` 4. **Auth env** for Terraform/dispatch: ```sh - export TF_VAR_hcloud_token= + export TF_VAR_do_token= cp terraform.tfvars.example terraform.tfvars # set git_remote ``` ## Daily use ```sh -./run dist:up 10 # 10 workers boot from the golden snapshot (~30s) +./run dist:up 10 # 10 Droplets boot from the golden image (~30s) ./run dist:sim 200 300 # 200 games / turn-limit 300, sharded 20/worker ./run dist:down # destroy the fleet → back to ~€0 # or fold teardown into the run: @@ -45,29 +53,29 @@ worker via `SEED_OFFSET`, so no collisions). 
RL sweeps: `./run dist:train 0`: +Pure pay-as-you-go, billed hourly only while `workers > 0` (⚠️ approximate — confirm in the DO console): -| | tier | rough cost | +| | size | rough cost | |---|---|---| -| `dist:sim` fan-out (bursty) | Shared **CPX** (`cpx41` default) | a 10×30-min run ≈ **cents** | -| `dist:train` (sustained, hours @100%) | Dedicated **CCX** (`-var server_type=ccx33`) | ~€0.50 / 3.5h generation | -| idle (fleet down) | snapshot only | **~€0.10/mo** | +| `dist:sim` fan-out (bursty) | Basic `s-8vcpu-16gb` | ~$0.12/hr; a 10×30-min run ≈ **~$0.60** | +| `dist:train` (sustained, hours @100%) | CPU-Optimized `c-8` (`./run dist:up N c-8`) | ~$0.25/hr | +| idle (fleet down) | image storage only | **~$0.40/mo** ($0.06/GB/mo) | -Shared tiers are throttled under sustained load — use a Dedicated `ccx*` for long -training runs (`./run dist:up N ccx33`), Shared `cpx*` for short test fan-out. +DigitalOcean runs ~2–3× Hetzner's per-core price, but the cattle model keeps each +run to cents-to-a-dollar since you only pay hourly while a fleet is up. Use a +CPU-Optimized `c-*` for long training runs, Basic `s-*` for short test fan-out. ## Design notes / caveats - **No persistent volume.** Workers are stateless; the golden image carries the warm clone + toolchain + prebuilt GDExtension. Results leave via `scp`/`rsync`. -- **Image auto-discovery.** `data.hcloud_image.golden` selects the newest snapshot - by label (`type=golden,project=magic-civilization`); rebuild with Packer and the - fleet picks it up — no ID edits. Set `-var base_image=ubuntu-24.04` only to test - `terraform plan` before any snapshot exists. -- **No placement group.** Hetzner caps spread groups at 10 servers; for short-lived - test cattle the HA spread isn't worth the `workers > 10` footgun. 
+- **Image auto-discovery.** `data.digitalocean_images.golden` selects the newest + custom image whose name contains `mc-golden` (filter `match_by = "substring"`, + sort `created desc`); rebuild with Packer and the fleet picks it up — no ID + edits. Set `-var base_image=ubuntu-24-04-x64` only to test `terraform plan` + before any image exists. - **Coordinator needs GNU coreutils.** `tools/autoplay-batch.sh` uses `realpath -m`; on macOS install `coreutils` or run the dispatch from a Linux host. - **State holds the token** — `*.tfstate` and `terraform.tfvars` are gitignored. - GPU is intentionally absent: the workload is CPU-bound (`docs/ai-production.md`); - rent spot GPU only if a profiler ever shows it saturated. + rent a DO GPU Droplet only if a profiler ever shows the GPU saturated. diff --git a/infra/terraform/test-fleet/main.tf b/infra/terraform/test-fleet/main.tf index 86e300a4..3dfc4a46 100644 --- a/infra/terraform/test-fleet/main.tf +++ b/infra/terraform/test-fleet/main.tf @@ -2,32 +2,39 @@ # No persistent volume: workers are stateless. The golden image carries the warm # clone + toolchain + prebuilt .so; results leave via the dispatch layer (scp). -resource "hcloud_ssh_key" "fleet" { +resource "digitalocean_ssh_key" "fleet" { name = "${var.name}-key" public_key = file(pathexpand(var.ssh_public_key_path)) } -# Resolve the newest golden snapshot by label. Skipped entirely when +# Resolve the newest golden image by name substring. Skipped entirely when # var.base_image is set (bootstrap path), so `terraform plan` works before any -# snapshot exists. -data "hcloud_image" "golden" { - count = var.base_image == "" ? 1 : 0 - with_selector = var.golden_selector - with_architecture = "x86" - most_recent = true +# golden image exists. +data "digitalocean_images" "golden" { + count = var.base_image == "" ? 
1 : 0 + + filter { + key = "name" + values = [var.golden_name_match] + match_by = "substring" + } + sort { + key = "created" + direction = "desc" + } } locals { - image = var.base_image != "" ? var.base_image : data.hcloud_image.golden[0].id + image = var.base_image != "" ? var.base_image : tostring(data.digitalocean_images.golden[0].images[0].id) } -resource "hcloud_server" "worker" { - count = var.workers - name = "${var.name}-${count.index}" - server_type = var.server_type - location = var.location - image = local.image - ssh_keys = [hcloud_ssh_key.fleet.id] +resource "digitalocean_droplet" "worker" { + count = var.workers + name = "${var.name}-${count.index}" + size = var.size + region = var.region + image = local.image + ssh_keys = [digitalocean_ssh_key.fleet.id] # Thin cloud-init: copy the injected key to the build user and fast-forward # the warm clone to the requested ref. The golden image already holds the @@ -37,9 +44,14 @@ resource "hcloud_server" "worker" { git_ref = var.git_ref remote_user = var.remote_user }) - - labels = { - project = "magic-civilization" - role = "test-fleet" - } +} + +# Group fleet workers under the DigitalOcean "mc" project. +data "digitalocean_project" "mc" { + name = var.do_project +} + +resource "digitalocean_project_resources" "fleet" { + project = data.digitalocean_project.mc.id + resources = [for d in digitalocean_droplet.worker : d.urn] } diff --git a/infra/terraform/test-fleet/outputs.tf b/infra/terraform/test-fleet/outputs.tf index 48515c5d..103a8a1a 100644 --- a/infra/terraform/test-fleet/outputs.tf +++ b/infra/terraform/test-fleet/outputs.tf @@ -3,17 +3,17 @@ locals { repo_root = abspath("${path.module}/../../..") inventory_path = "${local.repo_root}/.local/fleet/inventory" # One "@" line per worker — consumed by scripts/run/dist.sh. 
- inventory_body = join("\n", [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"]) + inventory_body = join("\n", [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"]) } output "worker_ips" { description = "Public IPv4 of each fleet worker." - value = [for s in hcloud_server.worker : s.ipv4_address] + value = [for d in digitalocean_droplet.worker : d.ipv4_address] } output "worker_hosts" { description = "ssh targets (@) the dispatch layer fans work across." - value = [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"] + value = [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"] } output "inventory_path" { diff --git a/infra/terraform/test-fleet/terraform.tfvars.example b/infra/terraform/test-fleet/terraform.tfvars.example index 7c118571..a2b5a679 100644 --- a/infra/terraform/test-fleet/terraform.tfvars.example +++ b/infra/terraform/test-fleet/terraform.tfvars.example @@ -1,12 +1,12 @@ # Copy to terraform.tfvars and fill in. terraform.tfvars is gitignored. -# The token is best passed via env instead: export TF_VAR_hcloud_token=... +# The token is best passed via env instead: export TF_VAR_do_token=... # Required: GitLab origin the golden image was built from. git_remote = "https://gitlab.com//magic-civilization.git" # Optional overrides (defaults shown). -# location = "ash" # Ashburn VA (~near NYC) -# server_type = "cpx41" # bursty test/sim; use ccx33 for sustained training +# region = "nyc3" # NYC (also: nyc1, sfo3, ams3, fra1) +# size = "s-8vcpu-16gb" # bursty test/sim; use c-8 for sustained training # git_ref = "main" # remote_user = "mc" # ssh_public_key_path = "~/.ssh/id_ed25519.pub" @@ -15,5 +15,5 @@ git_remote = "https://gitlab.com//magic-civilization.git" # (./run dist:up N -> -var workers=N), not pinned here. # workers = 0 -# Bootstrap only: set to test `terraform plan` before a golden snapshot exists. 
-# base_image = "ubuntu-24.04" +# Bootstrap only: set to test `terraform plan` before a golden image exists. +# base_image = "ubuntu-24-04-x64" diff --git a/infra/terraform/test-fleet/tests/fleet.tftest.hcl b/infra/terraform/test-fleet/tests/fleet.tftest.hcl index 29756412..74ff526c 100644 --- a/infra/terraform/test-fleet/tests/fleet.tftest.hcl +++ b/infra/terraform/test-fleet/tests/fleet.tftest.hcl @@ -1,12 +1,21 @@ # No-spend test harness for the fleet module. # terraform test (from the module dir) -# Uses a MOCKED hcloud provider — no API token, no API calls, no servers, no cost. -# Exercises count expansion, the golden-image branch toggle, and the workers guardrail. +# Uses a MOCKED digitalocean provider — no API token, no API calls, no Droplets, +# no cost. Exercises count expansion, the golden-image branch toggle, and the +# workers guardrail. -mock_provider "hcloud" {} +mock_provider "digitalocean" { + # The golden-image data source is computed; give it a non-empty result so the + # base_image == "" branch (images[0].id) resolves under mocking. + mock_data "digitalocean_images" { + defaults = { + images = [{ id = 123456789 }] + } + } +} variables { - hcloud_token = "mock-token-unused" + do_token = "mock-token-unused" git_remote = "https://example.com/magic-civilization.git" ssh_public_key_path = "./tests/fixtures/id_test.pub" } @@ -17,21 +26,21 @@ run "fleet_expands_and_skips_golden_when_base_image_set" { variables { workers = 3 - base_image = "ubuntu-24.04" + base_image = "ubuntu-24-04-x64" } assert { - condition = length(hcloud_server.worker) == 3 + condition = length(digitalocean_droplet.worker) == 3 error_message = "expected 3 workers when workers=3" } assert { - condition = length(data.hcloud_image.golden) == 0 + condition = length(data.digitalocean_images.golden) == 0 error_message = "golden data source must be skipped when base_image is set" } } -# base_image empty -> golden snapshot is resolved via the label selector. 
+# base_image empty -> golden image is resolved via the name-substring filter. run "golden_image_branch_active_when_base_image_empty" { command = plan @@ -41,28 +50,28 @@ run "golden_image_branch_active_when_base_image_empty" { } assert { - condition = length(data.hcloud_image.golden) == 1 + condition = length(data.digitalocean_images.golden) == 1 error_message = "golden data source must be queried when base_image is empty" } assert { - condition = length(hcloud_server.worker) == 2 + condition = length(digitalocean_droplet.worker) == 2 error_message = "expected 2 workers when workers=2" } } -# workers = 0 -> zero servers (idle / torn-down state). +# workers = 0 -> zero Droplets (idle / torn-down state). run "zero_workers_is_empty_fleet" { command = plan variables { workers = 0 - base_image = "ubuntu-24.04" + base_image = "ubuntu-24-04-x64" } assert { - condition = length(hcloud_server.worker) == 0 - error_message = "workers=0 must produce no servers" + condition = length(digitalocean_droplet.worker) == 0 + error_message = "workers=0 must produce no Droplets" } } @@ -72,7 +81,7 @@ run "rejects_oversize_fleet" { variables { workers = 99 - base_image = "ubuntu-24.04" + base_image = "ubuntu-24-04-x64" } expect_failures = [var.workers] diff --git a/infra/terraform/test-fleet/variables.tf b/infra/terraform/test-fleet/variables.tf index 5ab47104..f833c989 100644 --- a/infra/terraform/test-fleet/variables.tf +++ b/infra/terraform/test-fleet/variables.tf @@ -1,5 +1,5 @@ -variable "hcloud_token" { - description = "Hetzner Cloud API token (project-scoped, Read/Write). Export as TF_VAR_hcloud_token; never commit." +variable "do_token" { + description = "DigitalOcean API token (Read/Write). Export as TF_VAR_do_token; never commit." type = string sensitive = true } @@ -7,7 +7,7 @@ variable "hcloud_token" { variable "workers" { description = <<-EOT Fleet size — the iteration-speed lever. 0 = nothing running, zero cost. 
- Set to N to fan distributed sim/test work across N disposable workers, then + Set to N to fan distributed sim/test work across N disposable Droplets, then back to 0 to tear the fleet down. Each worker is identical cattle; results are scp'd off by the dispatch layer before teardown, so there is no per-worker state to preserve. @@ -17,44 +17,45 @@ variable "workers" { validation { condition = var.workers >= 0 && var.workers <= 50 - error_message = "Keep the fleet between 0 and 50 (project-quota / sanity guard)." + error_message = "Keep the fleet between 0 and 50 (account-limit / sanity guard)." } } -variable "location" { - description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1." +variable "region" { + description = "DigitalOcean region slug. NYC: nyc1, nyc3. Others: sfo3, ams3, fra1, sgp1." type = string - default = "ash" + default = "nyc3" } -variable "server_type" { +variable "size" { description = <<-EOT - Per-worker size. Distributed fan-out favours many small cheap boxes over one - big one (finer shard granularity per euro). Match tier to CPU profile: - Bursty test/sim (minutes) -> Shared "Regular" CPX: - cpx31 = 4 vCPU / 8 GB, cpx41 = 8 vCPU / 16 GB (default), cpx51 = 16/32 - Sustained RL training (hours @ 100%) -> Dedicated CCX (no shared-tier throttle): - ccx33 = 8 vCPU / 32 GB, ccx43 = 16 vCPU / 64 GB + Per-worker Droplet size slug. Distributed fan-out favours many small cheap + boxes over one big one. Match tier to CPU profile: + Bursty test/sim (minutes) -> Basic shared: + s-4vcpu-8gb, s-8vcpu-16gb (default), s-8vcpu-16gb-amd + Sustained RL training (hours @ 100%) -> CPU-Optimized (dedicated vCPU): + c-8 = 8 vCPU / 16 GB, c-16 = 16 vCPU / 32 GB EOT type = string - default = "cpx41" + default = "s-8vcpu-16gb" } variable "base_image" { description = <<-EOT Bootstrap escape hatch. 
Leave empty ("") to boot workers from the latest - Packer-built golden snapshot (resolved by label via the hcloud_image data - source). Set to a stock image name (e.g. "ubuntu-24.04") only for first-run - Terraform plan testing BEFORE any golden snapshot exists. + Packer-built golden image (resolved by name-substring via the + digitalocean_images data source). Set to a stock image slug (e.g. + "ubuntu-24-04-x64") only for first-run `terraform plan` testing BEFORE any + golden image exists. EOT type = string default = "" } -variable "golden_selector" { - description = "Label selector identifying the Packer golden snapshot. Must match the labels set in infra/packer/golden-image.pkr.hcl." +variable "golden_name_match" { + description = "Name substring identifying the Packer golden image. Must match snapshot_name prefix in infra/packer/golden-image.pkr.hcl." type = string - default = "type=golden,project=magic-civilization" + default = "mc-golden" } variable "ssh_public_key_path" { @@ -86,3 +87,9 @@ variable "remote_user" { type = string default = "mc" } + +variable "do_project" { + description = "DigitalOcean project name to group fleet resources under." 
+ type = string + default = "mc:dev" +} diff --git a/infra/terraform/test-fleet/versions.tf b/infra/terraform/test-fleet/versions.tf index b52bb988..a0d2784a 100644 --- a/infra/terraform/test-fleet/versions.tf +++ b/infra/terraform/test-fleet/versions.tf @@ -2,9 +2,9 @@ terraform { required_version = ">= 1.6" required_providers { - hcloud = { - source = "hetznercloud/hcloud" - version = "~> 1.49" + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" } local = { source = "hashicorp/local" @@ -13,6 +13,6 @@ terraform { } } -provider "hcloud" { - token = var.hcloud_token +provider "digitalocean" { + token = var.do_token } diff --git a/scripts/run/dist.sh b/scripts/run/dist.sh index 12e35b2f..eb4bc749 100755 --- a/scripts/run/dist.sh +++ b/scripts/run/dist.sh @@ -1,14 +1,14 @@ #!/usr/bin/env bash -# Distributed test/train dispatch — fan the iteration loop across the Hetzner +# Distributed test/train dispatch — fan the iteration loop across the DigitalOcean # test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the # cmd__ name-dispatch, so no edit to the top-level `run` is needed. # -# ./run dist:up [server_type] [location] spin the fleet up +# ./run dist:up [size] [region] spin the fleet up # ./run dist:sim [turn_limit] [--destroy-after] fan a sim batch across it # ./run dist:train [--destroy-after] fan an RL sweep across it # ./run dist:down tear it down (zero cost) # -# Requires: TF_VAR_hcloud_token in env, terraform on PATH, and a coordinator with +# Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with # GNU coreutils (autoplay-batch.sh uses `realpath -m`). _DIST_TF_DIR_REL="infra/terraform/test-fleet" @@ -29,9 +29,9 @@ _dist_read_hosts() { cmd_dist() { cat <<'EOF' -Distributed test/train fleet (Hetzner). Set TF_VAR_hcloud_token first. +Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first. 
./run dist:check offline: fmt + validate + mocked test (no token/spend) - ./run dist:up [server_type] [location] e.g. ./run dist:up 10 + ./run dist:up [size] [region] e.g. ./run dist:up 10 ./run dist:sim [turn_limit] [--destroy-after] ./run dist:train [--destroy-after] ./run dist:down @@ -39,7 +39,7 @@ EOF } cmd_dist_check() { - # Offline IaC verification — no Hetzner token, no API, no servers, no cost. + # Offline IaC verification — no DigitalOcean token, no API, no servers, no cost. # fmt (style) + validate (schema typecheck) + test (mocked-provider behaviour). local root root="$(_dist_repo_root)" @@ -50,27 +50,27 @@ cmd_dist_check() { terraform -chdir="$dir" init -backend=false -input=false >/dev/null || return 1 echo "== terraform validate (schema typecheck) ==" terraform -chdir="$dir" validate || return 1 - echo "== terraform test (mocked hcloud) ==" + echo "== terraform test (mocked digitalocean) ==" terraform -chdir="$dir" test || return 1 echo "dist:check OK — config is valid, no resources touched." } cmd_dist_up() { local n="${1:-}" - [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up [server_type] [location]" >&2; return 1; } - : "${TF_VAR_hcloud_token:?export TF_VAR_hcloud_token= first}" + [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up [size] [region]" >&2; return 1; } + : "${TF_VAR_do_token:?export TF_VAR_do_token= first}" local args=(-auto-approve -var "workers=$n") - [ -n "${2:-}" ] && args+=(-var "server_type=$2") - [ -n "${3:-}" ] && args+=(-var "location=$3") + [ -n "${2:-}" ] && args+=(-var "size=$2") + [ -n "${3:-}" ] && args+=(-var "region=$3") _dist_tf init -input=false >/dev/null _dist_tf apply "${args[@]}" echo "fleet up: $n worker(s). 
inventory: $(_dist_repo_root)/.local/fleet/inventory" } cmd_dist_down() { - : "${TF_VAR_hcloud_token:?export TF_VAR_hcloud_token= first}" + : "${TF_VAR_do_token:?export TF_VAR_do_token= first}" _dist_tf apply -auto-approve -var "workers=0" - echo "fleet down (workers=0): zero compute cost, snapshot only (~€0.10/mo)." + echo "fleet down (workers=0): zero compute cost, snapshot only (~\$0.40/mo)." } cmd_dist_sim() {