feat(infra): distributed test/train fleet on DigitalOcean (Terraform + Packer + dispatch)

Ephemeral CPU Droplet fleet that horizontally scales the iteration loop:
- infra/terraform/test-fleet: cattle Droplets from a golden image (auto-discovered
  by name via digitalocean_images), grouped under the mc:dev DO project, with a
  mocked-provider test suite (no token/spend).
- infra/packer: golden-image builder reusing scripts/dev-setup/linux.sh.
- scripts/run/dist.sh: ./run dist:{check,up,sim,train,down} — shard sim/test
  batches across workers via autoplay-batch AUTOPLAY_HOST+SEED_OFFSET.
GPU intentionally absent (workload is CPU-bound per docs/ai-production.md).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-27 08:51:09 -04:00
parent bd186b162a
commit f5c5d1a410
10 changed files with 191 additions and 157 deletions

View file

@ -1,41 +1,41 @@
// Bakes a Hetzner snapshot with the full toolchain + a warm clone + a prebuilt
// GDExtension + a warm Godot import cache, so fleet workers boot build-ready in
// ~30s instead of running rustup/godot-install on every spin-up.
// Bakes a DigitalOcean custom image (snapshot) with the full toolchain + a warm
// clone + a prebuilt GDExtension + a warm Godot import cache, so fleet workers
// boot build-ready in ~30s instead of running rustup/godot-install per spin-up.
//
// Build once:
// export HCLOUD_TOKEN=... # or pass -var hcloud_token=...
// export DIGITALOCEAN_TOKEN=... # or pass -var do_token=...
// packer init infra/packer/golden-image.pkr.hcl
// packer build -var git_remote=https://gitlab.com/<you>/magic-civilization.git \
// infra/packer/golden-image.pkr.hcl
//
// The snapshot is labelled type=golden,project=magic-civilization; the test-fleet
// Terraform module auto-discovers the newest one by that label.
// The image is named mc-golden-<timestamp>; the test-fleet Terraform module
// auto-discovers the newest one by the "mc-golden" name substring.
packer {
required_plugins {
hcloud = {
source = "github.com/hetznercloud/hcloud"
version = ">= 1.5.0"
digitalocean = {
source = "github.com/digitalocean/digitalocean"
version = ">= 1.4.0"
}
}
}
variable "hcloud_token" {
variable "do_token" {
type = string
sensitive = true
default = env("HCLOUD_TOKEN")
default = env("DIGITALOCEAN_TOKEN")
}
variable "location" {
variable "region" {
type = string
default = "ash"
default = "nyc3"
}
# A one-off dedicated box builds fast (cargo + godot import are CPU-heavy);
# A one-off CPU-Optimized box builds fast (cargo + godot import are CPU-heavy);
# it only exists for the duration of the build.
variable "build_server_type" {
variable "build_size" {
type = string
default = "ccx33"
default = "c-8"
}
variable "git_remote" {
@ -56,22 +56,17 @@ locals {
ts = formatdate("YYYYMMDDhhmmss", timestamp())
}
source "hcloud" "golden" {
token = var.hcloud_token
image = "ubuntu-24.04"
location = var.location
server_type = var.build_server_type
ssh_username = "root"
source "digitalocean" "golden" {
api_token = var.do_token
region = var.region
size = var.build_size
image = "ubuntu-24-04-x64"
ssh_username = "root"
snapshot_name = "mc-golden-${local.ts}"
snapshot_labels = {
type = "golden"
project = "magic-civilization"
}
}
build {
sources = ["source.hcloud.golden"]
sources = ["source.digitalocean.golden"]
provisioner "shell" {
environment_vars = [

View file

@ -1,6 +1,30 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/digitalocean/digitalocean" {
version = "2.92.0"
constraints = "~> 2.0"
hashes = [
"h1:PDahQCnG9M3XAjihY7KzGVPuLQTB6gPKWn7Tp9TPaOY=",
"zh:13cefc6a94b74445713abeacfdf6422d1aecf820ec08fe69bae63c3ea6fbe24e",
"zh:20fc749afda0dfd10ec6815db78efb0bdf399033db536738580816ca341cd2c6",
"zh:2fac398f97fbec5d9c16ce3c58a9925ca0474c4931ead3352af56161fd7d6f1e",
"zh:3e0542d5200c1efb3113bd2ad3a5cc1ba32b9d1fe7017044ceeb0b7729a7a7f6",
"zh:583ddc43350dfb84a9a5689fe11964df9afe1ae03d099ab96c8f0fb7bc7a4cad",
"zh:6025ea83b0602b6ff01b3c5bbe025e73e8b47a217aae6c4270725feac01ebb2b",
"zh:6be3c78cb90752ce9357c33792f869382ff9dbd01333d985127116478bdcec21",
"zh:75c4c76c24bdc7e9c8626603d1c082d0894c798096ccfe8e2ceba68ad4570638",
"zh:7abc9714982dee251e6b9ce6d4910cd413a46cb92f76a4ed3a92a56e7cc1b4e7",
"zh:7c4808dd90886f33c5bd861b7b6be9b942ae2b32a188793f6f4e07be4e146b47",
"zh:7d13d3bec74e08444334e6b5c1c5f5380d40dd0bbb80d2d387d9084aaecbd3cf",
"zh:8a11b04c46865bdcd49f15622398e6e4911aa5be5d0b12d0b708cdda5c8ff734",
"zh:910cad53707e4743f6c277fb0007f6937a64be5b3a8ded3af1273628b9c141fe",
"zh:a67d98e6aceb5837064c6e811a557dbaaa61791b99a8b8d87b278aecb871910e",
"zh:bed15d16d4be506123fba16c3fa6db7cafa7d2ed53f07ff370cc2228e5f6d9ba",
"zh:f794ef952a8b2b5702ecfecf9bfe372dac392789b0762e5598764d10f24a8210",
]
}
provider "registry.terraform.io/hashicorp/local" {
version = "2.9.0"
constraints = "~> 2.5"
@ -21,24 +45,3 @@ provider "registry.terraform.io/hashicorp/local" {
"zh:fcafa360a5b0b96244f26f4e3a6d642b716a376557142c2442ff2fb12d11da18",
]
}
provider "registry.terraform.io/hetznercloud/hcloud" {
version = "1.66.0"
constraints = "~> 1.49"
hashes = [
"h1:iVAGP8gRbZK0kJF7SiYJRt61wz0D5AF9q+WMsrAiBI0=",
"zh:1286cee6fb63dbcb18f53077bbb5e5d132a4e4d9f006af4e8d8edfc08d6bcdc8",
"zh:204460dacc044bda019a4a18b398e094289500c36913c7c9457f432adf31b8b2",
"zh:214175d50773481cbeaf9c9004e4121a3a1c9686c79424ebdc8ff189dd057d3e",
"zh:22b17bceff61cc13ad04a399ba87521356a3a134d4687273727473ae9eccf5f1",
"zh:368867dac5525c411de7e38f2e27de0a71854d1750867322ff2b9321128c88fb",
"zh:5289b75f8370bdbc4c6051d55cf33d0b1bd25dc6d71bfbd39b360249a37f1501",
"zh:81cb676aa50c5777df8fc80d4e69c9012330ae751f5e6f12bf6074bfd2e7c496",
"zh:ab08aead10643b21aa6b51af562b50492e12b9dd0ab7dca27a05aa63209b7d66",
"zh:af25c210d0570cf61ef767b2545bf9f3fb909178135f0e5e14bec0c1c9d07a63",
"zh:bcad66f4830c97118fa793723e53f8a4d27ddd34ea969ff259408842c2238331",
"zh:ce3ed323d75ae905d975925fa98c7054a7514c81276a485fc37da8232b53e39f",
"zh:d481bc0ef0c87ab1969c17777f526b2f59f823432d676145134c41a6d29bd98e",
"zh:ea7ef88df2c3ca154d86238920636d52a3c9066c7467543d3fa45f1e52ec2f7b",
]
}

View file

@ -1,39 +1,47 @@
# test-fleet — distributed test/train infra (Hetzner)
# test-fleet — distributed test/train infra (DigitalOcean)
Horizontally scales the iteration loop onto cheap ephemeral Hetzner cattle. One
local command fans seeded sim batches (or RL training) across N disposable
Horizontally scales the iteration loop onto cheap ephemeral DigitalOcean Droplets.
One local command fans seeded sim batches (or RL training) across N disposable
workers, collects results locally, and tears the fleet down. **Idle cost ≈ $0**
(fleet defaults to 0 workers; only the golden snapshot bills, ~€0.10/mo).
(fleet defaults to 0 workers; only the golden image bills, ~$0.40/mo).
## Layers
| Layer | Where | What |
|---|---|---|
| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → labelled snapshot |
| Fleet | here | `workers = N` cattle from the snapshot, auto-discovered by label |
| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → custom image |
| Fleet | here | `workers = N` Droplets from the image, auto-discovered by name |
| Dispatch | `scripts/run/dist.sh` | shard → fan out over ssh → collect → merge → teardown |
## Offline verification (no token, no spend)
```sh
./run dist:check # terraform fmt + validate (schema typecheck) + mocked-provider test
```
Run it anytime — before you even have a DO account. It uses a **mocked** provider.
## One-time setup
1. **Hetzner**: create a Cloud account + project + payment method. Generate a
project-scoped **Read/Write API token**. (New accounts may need ID verification.)
1. **DigitalOcean**: in the Control Panel → **API → Tokens**, generate a personal
access token with **read+write** scope.
2. **GitLab**: push the repo; note the clone URL (the workers' `origin`).
3. **Build the golden image once** (see `../../packer/golden-image.pkr.hcl`):
```sh
export HCLOUD_TOKEN=<token>
export DIGITALOCEAN_TOKEN=<token>
packer init ../../packer/golden-image.pkr.hcl
packer build -var git_remote=<gitlab-url> ../../packer/golden-image.pkr.hcl
```
4. **Auth env** for Terraform/dispatch:
```sh
export TF_VAR_hcloud_token=<token>
export TF_VAR_do_token=<token>
cp terraform.tfvars.example terraform.tfvars # set git_remote
```
## Daily use
```sh
./run dist:up 10 # 10 workers boot from the golden snapshot (~30s)
./run dist:up 10 # 10 Droplets boot from the golden image (~30s)
./run dist:sim 200 300 # 200 games / turn-limit 300, sharded 20/worker
./run dist:down # destroy the fleet → back to ~$0
# or fold teardown into the run:
@ -45,29 +53,29 @@ worker via `SEED_OFFSET`, so no collisions). RL sweeps: `./run dist:train <steps
## Cost
Pure pay-as-you-go, billed hourly only while `workers > 0`:
Pure pay-as-you-go, billed hourly only while `workers > 0` (⚠️ approximate — confirm in the DO console):
| | tier | rough cost |
| | size | rough cost |
|---|---|---|
| `dist:sim` fan-out (bursty) | Shared **CPX** (`cpx41` default) | a 10×30-min run ≈ **cents** |
| `dist:train` (sustained, hours @100%) | Dedicated **CCX** (`-var server_type=ccx33`) | ~€0.50 / 3.5h generation |
| idle (fleet down) | snapshot only | **~€0.10/mo** |
| `dist:sim` fan-out (bursty) | Basic `s-8vcpu-16gb` | ~$0.12/hr; a 10×30-min run ≈ **~$0.60** |
| `dist:train` (sustained, hours @100%) | CPU-Optimized `c-8` (`./run dist:up N c-8`) | ~$0.25/hr |
| idle (fleet down) | image storage only | **~$0.40/mo** ($0.06/GB/mo) |
Shared tiers are throttled under sustained load — use a Dedicated `ccx*` for long
training runs (`./run dist:up N ccx33`), Shared `cpx*` for short test fan-out.
DigitalOcean runs ~2–3× Hetzner's per-core price, but the cattle model keeps each
run to cents-to-a-dollar since you only pay hourly while a fleet is up. Use a
CPU-Optimized `c-*` for long training runs, Basic `s-*` for short test fan-out.
## Design notes / caveats
- **No persistent volume.** Workers are stateless; the golden image carries the
warm clone + toolchain + prebuilt GDExtension. Results leave via `scp`/`rsync`.
- **Image auto-discovery.** `data.hcloud_image.golden` selects the newest snapshot
by label (`type=golden,project=magic-civilization`); rebuild with Packer and the
fleet picks it up — no ID edits. Set `-var base_image=ubuntu-24.04` only to test
`terraform plan` before any snapshot exists.
- **No placement group.** Hetzner caps spread groups at 10 servers; for short-lived
test cattle the HA spread isn't worth the `workers > 10` footgun.
- **Image auto-discovery.** `data.digitalocean_images.golden` selects the newest
custom image whose name contains `mc-golden` (filter `match_by = "substring"`,
sort `created desc`); rebuild with Packer and the fleet picks it up — no ID
edits. Set `-var base_image=ubuntu-24-04-x64` only to test `terraform plan`
before any image exists.
- **Coordinator needs GNU coreutils.** `tools/autoplay-batch.sh` uses `realpath -m`;
on macOS install `coreutils` or run the dispatch from a Linux host.
- **State holds the token** — `*.tfstate` and `terraform.tfvars` are gitignored.
- GPU is intentionally absent: the workload is CPU-bound (`docs/ai-production.md`);
rent spot GPU only if a profiler ever shows it saturated.
rent a DO GPU Droplet only if a profiler ever shows the GPU saturated.

View file

@ -2,32 +2,39 @@
# No persistent volume: workers are stateless. The golden image carries the warm
# clone + toolchain + prebuilt .so; results leave via the dispatch layer (scp).
resource "hcloud_ssh_key" "fleet" {
resource "digitalocean_ssh_key" "fleet" {
name = "${var.name}-key"
public_key = file(pathexpand(var.ssh_public_key_path))
}
# Resolve the newest golden snapshot by label. Skipped entirely when
# Resolve the newest golden image by name substring. Skipped entirely when
# var.base_image is set (bootstrap path), so `terraform plan` works before any
# snapshot exists.
data "hcloud_image" "golden" {
count = var.base_image == "" ? 1 : 0
with_selector = var.golden_selector
with_architecture = "x86"
most_recent = true
# golden image exists.
data "digitalocean_images" "golden" {
count = var.base_image == "" ? 1 : 0
filter {
key = "name"
values = [var.golden_name_match]
match_by = "substring"
}
sort {
key = "created"
direction = "desc"
}
}
locals {
image = var.base_image != "" ? var.base_image : data.hcloud_image.golden[0].id
image = var.base_image != "" ? var.base_image : tostring(data.digitalocean_images.golden[0].images[0].id)
}
resource "hcloud_server" "worker" {
count = var.workers
name = "${var.name}-${count.index}"
server_type = var.server_type
location = var.location
image = local.image
ssh_keys = [hcloud_ssh_key.fleet.id]
resource "digitalocean_droplet" "worker" {
count = var.workers
name = "${var.name}-${count.index}"
size = var.size
region = var.region
image = local.image
ssh_keys = [digitalocean_ssh_key.fleet.id]
# Thin cloud-init: copy the injected key to the build user and fast-forward
# the warm clone to the requested ref. The golden image already holds the
@ -37,9 +44,14 @@ resource "hcloud_server" "worker" {
git_ref = var.git_ref
remote_user = var.remote_user
})
labels = {
project = "magic-civilization"
role = "test-fleet"
}
}
# Group fleet workers under the DigitalOcean project named by var.do_project
# (default "mc:dev").
data "digitalocean_project" "mc" {
name = var.do_project
}
resource "digitalocean_project_resources" "fleet" {
project = data.digitalocean_project.mc.id
resources = [for d in digitalocean_droplet.worker : d.urn]
}

View file

@ -3,17 +3,17 @@ locals {
repo_root = abspath("${path.module}/../../..")
inventory_path = "${local.repo_root}/.local/fleet/inventory"
# One "<user>@<ipv4>" line per worker consumed by scripts/run/dist.sh.
inventory_body = join("\n", [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"])
inventory_body = join("\n", [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"])
}
output "worker_ips" {
description = "Public IPv4 of each fleet worker."
value = [for s in hcloud_server.worker : s.ipv4_address]
value = [for d in digitalocean_droplet.worker : d.ipv4_address]
}
output "worker_hosts" {
description = "ssh targets (<user>@<ip>) the dispatch layer fans work across."
value = [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"]
value = [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"]
}
output "inventory_path" {

View file

@ -1,12 +1,12 @@
# Copy to terraform.tfvars and fill in. terraform.tfvars is gitignored.
# The token is best passed via env instead: export TF_VAR_hcloud_token=...
# The token is best passed via env instead: export TF_VAR_do_token=...
# Required: GitLab origin the golden image was built from.
git_remote = "https://gitlab.com/<you>/magic-civilization.git"
# Optional overrides (defaults shown).
# location = "ash" # Ashburn VA (~near NYC)
# server_type = "cpx41" # bursty test/sim; use ccx33 for sustained training
# region = "nyc3" # NYC (also: nyc1, sfo3, ams3, fra1)
# size = "s-8vcpu-16gb" # bursty test/sim; use c-8 for sustained training
# git_ref = "main"
# remote_user = "mc"
# ssh_public_key_path = "~/.ssh/id_ed25519.pub"
@ -15,5 +15,5 @@ git_remote = "https://gitlab.com/<you>/magic-civilization.git"
# (./run dist:up N -> -var workers=N), not pinned here.
# workers = 0
# Bootstrap only: set to test `terraform plan` before a golden snapshot exists.
# base_image = "ubuntu-24.04"
# Bootstrap only: set to test `terraform plan` before a golden image exists.
# base_image = "ubuntu-24-04-x64"

View file

@ -1,12 +1,21 @@
# No-spend test harness for the fleet module.
# terraform test (from the module dir)
# Uses a MOCKED hcloud provider no API token, no API calls, no servers, no cost.
# Exercises count expansion, the golden-image branch toggle, and the workers guardrail.
# Uses a MOCKED digitalocean provider — no API token, no API calls, no Droplets,
# no cost. Exercises count expansion, the golden-image branch toggle, and the
# workers guardrail.
mock_provider "hcloud" {}
mock_provider "digitalocean" {
# The golden-image data source is computed; give it a non-empty result so the
# base_image == "" branch (images[0].id) resolves under mocking.
mock_data "digitalocean_images" {
defaults = {
images = [{ id = 123456789 }]
}
}
}
variables {
hcloud_token = "mock-token-unused"
do_token = "mock-token-unused"
git_remote = "https://example.com/magic-civilization.git"
ssh_public_key_path = "./tests/fixtures/id_test.pub"
}
@ -17,21 +26,21 @@ run "fleet_expands_and_skips_golden_when_base_image_set" {
variables {
workers = 3
base_image = "ubuntu-24.04"
base_image = "ubuntu-24-04-x64"
}
assert {
condition = length(hcloud_server.worker) == 3
condition = length(digitalocean_droplet.worker) == 3
error_message = "expected 3 workers when workers=3"
}
assert {
condition = length(data.hcloud_image.golden) == 0
condition = length(data.digitalocean_images.golden) == 0
error_message = "golden data source must be skipped when base_image is set"
}
}
# base_image empty -> golden snapshot is resolved via the label selector.
# base_image empty -> golden image is resolved via the name-substring filter.
run "golden_image_branch_active_when_base_image_empty" {
command = plan
@ -41,28 +50,28 @@ run "golden_image_branch_active_when_base_image_empty" {
}
assert {
condition = length(data.hcloud_image.golden) == 1
condition = length(data.digitalocean_images.golden) == 1
error_message = "golden data source must be queried when base_image is empty"
}
assert {
condition = length(hcloud_server.worker) == 2
condition = length(digitalocean_droplet.worker) == 2
error_message = "expected 2 workers when workers=2"
}
}
# workers = 0 -> zero servers (idle / torn-down state).
# workers = 0 -> zero Droplets (idle / torn-down state).
run "zero_workers_is_empty_fleet" {
command = plan
variables {
workers = 0
base_image = "ubuntu-24.04"
base_image = "ubuntu-24-04-x64"
}
assert {
condition = length(hcloud_server.worker) == 0
error_message = "workers=0 must produce no servers"
condition = length(digitalocean_droplet.worker) == 0
error_message = "workers=0 must produce no Droplets"
}
}
@ -72,7 +81,7 @@ run "rejects_oversize_fleet" {
variables {
workers = 99
base_image = "ubuntu-24.04"
base_image = "ubuntu-24-04-x64"
}
expect_failures = [var.workers]

View file

@ -1,5 +1,5 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token (project-scoped, Read/Write). Export as TF_VAR_hcloud_token; never commit."
variable "do_token" {
description = "DigitalOcean API token (Read/Write). Export as TF_VAR_do_token; never commit."
type = string
sensitive = true
}
@ -7,7 +7,7 @@ variable "hcloud_token" {
variable "workers" {
description = <<-EOT
Fleet size the iteration-speed lever. 0 = nothing running, zero cost.
Set to N to fan distributed sim/test work across N disposable workers, then
Set to N to fan distributed sim/test work across N disposable Droplets, then
back to 0 to tear the fleet down. Each worker is identical cattle; results
are scp'd off by the dispatch layer before teardown, so there is no
per-worker state to preserve.
@ -17,44 +17,45 @@ variable "workers" {
validation {
condition = var.workers >= 0 && var.workers <= 50
error_message = "Keep the fleet between 0 and 50 (project-quota / sanity guard)."
error_message = "Keep the fleet between 0 and 50 (account-limit / sanity guard)."
}
}
variable "location" {
description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1."
variable "region" {
description = "DigitalOcean region slug. NYC: nyc1, nyc3. Others: sfo3, ams3, fra1, sgp1."
type = string
default = "ash"
default = "nyc3"
}
variable "server_type" {
variable "size" {
description = <<-EOT
Per-worker size. Distributed fan-out favours many small cheap boxes over one
big one (finer shard granularity per euro). Match tier to CPU profile:
Bursty test/sim (minutes) -> Shared "Regular" CPX:
cpx31 = 4 vCPU / 8 GB, cpx41 = 8 vCPU / 16 GB (default), cpx51 = 16/32
Sustained RL training (hours @ 100%) -> Dedicated CCX (no shared-tier throttle):
ccx33 = 8 vCPU / 32 GB, ccx43 = 16 vCPU / 64 GB
Per-worker Droplet size slug. Distributed fan-out favours many small cheap
boxes over one big one. Match tier to CPU profile:
Bursty test/sim (minutes) -> Basic shared:
s-4vcpu-8gb, s-8vcpu-16gb (default), s-8vcpu-16gb-amd
Sustained RL training (hours @ 100%) -> CPU-Optimized (dedicated vCPU):
c-8 = 8 vCPU / 16 GB, c-16 = 16 vCPU / 32 GB
EOT
type = string
default = "cpx41"
default = "s-8vcpu-16gb"
}
variable "base_image" {
description = <<-EOT
Bootstrap escape hatch. Leave empty ("") to boot workers from the latest
Packer-built golden snapshot (resolved by label via the hcloud_image data
source). Set to a stock image name (e.g. "ubuntu-24.04") only for first-run
Terraform plan testing BEFORE any golden snapshot exists.
Packer-built golden image (resolved by name-substring via the
digitalocean_images data source). Set to a stock image slug (e.g.
"ubuntu-24-04-x64") only for first-run `terraform plan` testing BEFORE any
golden image exists.
EOT
type = string
default = ""
}
variable "golden_selector" {
description = "Label selector identifying the Packer golden snapshot. Must match the labels set in infra/packer/golden-image.pkr.hcl."
variable "golden_name_match" {
description = "Name substring identifying the Packer golden image. Must match snapshot_name prefix in infra/packer/golden-image.pkr.hcl."
type = string
default = "type=golden,project=magic-civilization"
default = "mc-golden"
}
variable "ssh_public_key_path" {
@ -86,3 +87,9 @@ variable "remote_user" {
type = string
default = "mc"
}
variable "do_project" {
description = "DigitalOcean project name to group fleet resources under."
type = string
default = "mc:dev"
}

View file

@ -2,9 +2,9 @@ terraform {
required_version = ">= 1.6"
required_providers {
hcloud = {
source = "hetznercloud/hcloud"
version = "~> 1.49"
digitalocean = {
source = "digitalocean/digitalocean"
version = "~> 2.0"
}
local = {
source = "hashicorp/local"
@ -13,6 +13,6 @@ terraform {
}
}
provider "hcloud" {
token = var.hcloud_token
provider "digitalocean" {
token = var.do_token
}

View file

@ -1,14 +1,14 @@
#!/usr/bin/env bash
# Distributed test/train dispatch — fan the iteration loop across the Hetzner
# Distributed test/train dispatch — fan the iteration loop across the DigitalOcean
# test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the
# cmd_<verb>_<target> name-dispatch, so no edit to the top-level `run` is needed.
#
# ./run dist:up <workers> [server_type] [location] spin the fleet up
# ./run dist:up <workers> [size] [region] spin the fleet up
# ./run dist:sim <games> [turn_limit] [--destroy-after] fan a sim batch across it
# ./run dist:train <total_steps> [--destroy-after] fan an RL sweep across it
# ./run dist:down tear it down (zero cost)
#
# Requires: TF_VAR_hcloud_token in env, terraform on PATH, and a coordinator with
# Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with
# GNU coreutils (autoplay-batch.sh uses `realpath -m`).
_DIST_TF_DIR_REL="infra/terraform/test-fleet"
@ -29,9 +29,9 @@ _dist_read_hosts() {
cmd_dist() {
cat <<'EOF'
Distributed test/train fleet (Hetzner). Set TF_VAR_hcloud_token first.
Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
./run dist:check offline: fmt + validate + mocked test (no token/spend)
./run dist:up <workers> [server_type] [location] e.g. ./run dist:up 10
./run dist:up <workers> [size] [region] e.g. ./run dist:up 10
./run dist:sim <games> [turn_limit] [--destroy-after]
./run dist:train <total_steps> [--destroy-after]
./run dist:down
@ -39,7 +39,7 @@ EOF
}
cmd_dist_check() {
# Offline IaC verification — no Hetzner token, no API, no servers, no cost.
# Offline IaC verification — no DigitalOcean token, no API, no servers, no cost.
# fmt (style) + validate (schema typecheck) + test (mocked-provider behaviour).
local root
root="$(_dist_repo_root)"
@ -50,27 +50,27 @@ cmd_dist_check() {
terraform -chdir="$dir" init -backend=false -input=false >/dev/null || return 1
echo "== terraform validate (schema typecheck) =="
terraform -chdir="$dir" validate || return 1
echo "== terraform test (mocked hcloud) =="
echo "== terraform test (mocked digitalocean) =="
terraform -chdir="$dir" test || return 1
echo "dist:check OK — config is valid, no resources touched."
}
# Spin the DigitalOcean test fleet up to <workers> Droplets.
# Arguments:
#   $1 - worker count (non-negative integer, required)
#   $2 - optional Droplet size slug, forwarded as -var size=...
#   $3 - optional region slug, forwarded as -var region=...
# Environment: TF_VAR_do_token must be set and non-empty.
# Outputs: a one-line summary with the inventory path on success.
# Returns: 1 on bad usage or when terraform init/apply fails.
cmd_dist_up() {
  local n="${1:-}"
  if [[ ! "$n" =~ ^[0-9]+$ ]]; then
    echo "usage: ./run dist:up <workers> [size] [region]" >&2
    return 1
  fi
  # Abort with a hint before touching terraform when the token is missing.
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"

  # Only pass overrides that were actually given, so module defaults apply
  # otherwise. The variable names must match the DigitalOcean module
  # (size/region); the old Hetzner names (server_type/location) are undeclared.
  local args=(-auto-approve -var "workers=$n")
  if [[ -n "${2:-}" ]]; then
    args+=(-var "size=$2")
  fi
  if [[ -n "${3:-}" ]]; then
    args+=(-var "region=$3")
  fi

  # _dist_tf is defined elsewhere in this file (presumably terraform scoped to
  # the test-fleet dir — confirm). Stop on failure so we never claim the fleet
  # is up when init/apply did not succeed.
  _dist_tf init -input=false >/dev/null || return 1
  _dist_tf apply "${args[@]}" || return 1
  echo "fleet up: $n worker(s). inventory: $(_dist_repo_root)/.local/fleet/inventory"
}
# Tear the fleet down by applying workers=0.
# Environment: TF_VAR_do_token must be set and non-empty.
# Outputs: a one-line confirmation on success.
# Returns: 1 when the terraform apply fails (Droplets may still be billing).
cmd_dist_down() {
  # Abort with a hint before touching terraform when the token is missing.
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"
  # _dist_tf is defined elsewhere in this file. Do not report "fleet down" if
  # the apply failed.
  _dist_tf apply -auto-approve -var "workers=0" || return 1
  # The dollar sign is escaped: inside double quotes an unescaped $0 expands to
  # the script/shell name and would print e.g. "~bash.40/mo".
  echo "fleet down (workers=0): zero compute cost, snapshot only (~\$0.40/mo)."
}
cmd_dist_sim() {