feat(infra): distributed test/train fleet on DigitalOcean (Terraform + Packer + dispatch)
Ephemeral CPU Droplet fleet that horizontally scales the iteration loop:
- infra/terraform/test-fleet: cattle Droplets from a golden image (auto-discovered
by name via digitalocean_images), grouped under the mc:dev DO project, with a
mocked-provider test suite (no token/spend).
- infra/packer: golden-image builder reusing scripts/dev-setup/linux.sh.
- scripts/run/dist.sh: ./run dist:{check,up,sim,train,down} — shard sim/test
batches across workers via autoplay-batch AUTOPLAY_HOST+SEED_OFFSET.
GPU intentionally absent (workload is CPU-bound per docs/ai-production.md).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
bd186b162a
commit
f5c5d1a410
10 changed files with 191 additions and 157 deletions
|
|
@ -1,41 +1,41 @@
|
|||
// Bakes a Hetzner snapshot with the full toolchain + a warm clone + a prebuilt
|
||||
// GDExtension + a warm Godot import cache, so fleet workers boot build-ready in
|
||||
// ~30s instead of running rustup/godot-install on every spin-up.
|
||||
// Bakes a DigitalOcean custom image (snapshot) with the full toolchain + a warm
|
||||
// clone + a prebuilt GDExtension + a warm Godot import cache, so fleet workers
|
||||
// boot build-ready in ~30s instead of running rustup/godot-install per spin-up.
|
||||
//
|
||||
// Build once:
|
||||
// export HCLOUD_TOKEN=... # or pass -var hcloud_token=...
|
||||
// export DIGITALOCEAN_TOKEN=... // or pass -var do_token=...
|
||||
// packer init infra/packer/golden-image.pkr.hcl
|
||||
// packer build -var git_remote=https://gitlab.com/<you>/magic-civilization.git \
|
||||
// infra/packer/golden-image.pkr.hcl
|
||||
//
|
||||
// The snapshot is labelled type=golden,project=magic-civilization; the test-fleet
|
||||
// Terraform module auto-discovers the newest one by that label.
|
||||
// The image is named mc-golden-<timestamp>; the test-fleet Terraform module
|
||||
// auto-discovers the newest one by the "mc-golden" name substring.
|
||||
|
||||
packer {
|
||||
required_plugins {
|
||||
hcloud = {
|
||||
source = "github.com/hetznercloud/hcloud"
|
||||
version = ">= 1.5.0"
|
||||
digitalocean = {
|
||||
source = "github.com/digitalocean/digitalocean"
|
||||
version = ">= 1.4.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
variable "hcloud_token" {
|
||||
variable "do_token" {
|
||||
type = string
|
||||
sensitive = true
|
||||
default = env("HCLOUD_TOKEN")
|
||||
default = env("DIGITALOCEAN_TOKEN")
|
||||
}
|
||||
|
||||
variable "location" {
|
||||
variable "region" {
|
||||
type = string
|
||||
default = "ash"
|
||||
default = "nyc3"
|
||||
}
|
||||
|
||||
# A one-off dedicated box builds fast (cargo + godot import are CPU-heavy);
|
||||
# A one-off CPU-Optimized box builds fast (cargo + godot import are CPU-heavy);
|
||||
# it only exists for the duration of the build.
|
||||
variable "build_server_type" {
|
||||
variable "build_size" {
|
||||
type = string
|
||||
default = "ccx33"
|
||||
default = "c-8"
|
||||
}
|
||||
|
||||
variable "git_remote" {
|
||||
|
|
@ -56,22 +56,17 @@ locals {
|
|||
ts = formatdate("YYYYMMDDhhmmss", timestamp())
|
||||
}
|
||||
|
||||
source "hcloud" "golden" {
|
||||
token = var.hcloud_token
|
||||
image = "ubuntu-24.04"
|
||||
location = var.location
|
||||
server_type = var.build_server_type
|
||||
ssh_username = "root"
|
||||
|
||||
source "digitalocean" "golden" {
|
||||
api_token = var.do_token
|
||||
region = var.region
|
||||
size = var.build_size
|
||||
image = "ubuntu-24-04-x64"
|
||||
ssh_username = "root"
|
||||
snapshot_name = "mc-golden-${local.ts}"
|
||||
snapshot_labels = {
|
||||
type = "golden"
|
||||
project = "magic-civilization"
|
||||
}
|
||||
}
|
||||
|
||||
build {
|
||||
sources = ["source.hcloud.golden"]
|
||||
sources = ["source.digitalocean.golden"]
|
||||
|
||||
provisioner "shell" {
|
||||
environment_vars = [
|
||||
|
|
|
|||
45
infra/terraform/test-fleet/.terraform.lock.hcl
generated
45
infra/terraform/test-fleet/.terraform.lock.hcl
generated
|
|
@ -1,6 +1,30 @@
|
|||
# This file is maintained automatically by "terraform init".
|
||||
# Manual edits may be lost in future updates.
|
||||
|
||||
provider "registry.terraform.io/digitalocean/digitalocean" {
|
||||
version = "2.92.0"
|
||||
constraints = "~> 2.0"
|
||||
hashes = [
|
||||
"h1:PDahQCnG9M3XAjihY7KzGVPuLQTB6gPKWn7Tp9TPaOY=",
|
||||
"zh:13cefc6a94b74445713abeacfdf6422d1aecf820ec08fe69bae63c3ea6fbe24e",
|
||||
"zh:20fc749afda0dfd10ec6815db78efb0bdf399033db536738580816ca341cd2c6",
|
||||
"zh:2fac398f97fbec5d9c16ce3c58a9925ca0474c4931ead3352af56161fd7d6f1e",
|
||||
"zh:3e0542d5200c1efb3113bd2ad3a5cc1ba32b9d1fe7017044ceeb0b7729a7a7f6",
|
||||
"zh:583ddc43350dfb84a9a5689fe11964df9afe1ae03d099ab96c8f0fb7bc7a4cad",
|
||||
"zh:6025ea83b0602b6ff01b3c5bbe025e73e8b47a217aae6c4270725feac01ebb2b",
|
||||
"zh:6be3c78cb90752ce9357c33792f869382ff9dbd01333d985127116478bdcec21",
|
||||
"zh:75c4c76c24bdc7e9c8626603d1c082d0894c798096ccfe8e2ceba68ad4570638",
|
||||
"zh:7abc9714982dee251e6b9ce6d4910cd413a46cb92f76a4ed3a92a56e7cc1b4e7",
|
||||
"zh:7c4808dd90886f33c5bd861b7b6be9b942ae2b32a188793f6f4e07be4e146b47",
|
||||
"zh:7d13d3bec74e08444334e6b5c1c5f5380d40dd0bbb80d2d387d9084aaecbd3cf",
|
||||
"zh:8a11b04c46865bdcd49f15622398e6e4911aa5be5d0b12d0b708cdda5c8ff734",
|
||||
"zh:910cad53707e4743f6c277fb0007f6937a64be5b3a8ded3af1273628b9c141fe",
|
||||
"zh:a67d98e6aceb5837064c6e811a557dbaaa61791b99a8b8d87b278aecb871910e",
|
||||
"zh:bed15d16d4be506123fba16c3fa6db7cafa7d2ed53f07ff370cc2228e5f6d9ba",
|
||||
"zh:f794ef952a8b2b5702ecfecf9bfe372dac392789b0762e5598764d10f24a8210",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/local" {
|
||||
version = "2.9.0"
|
||||
constraints = "~> 2.5"
|
||||
|
|
@ -21,24 +45,3 @@ provider "registry.terraform.io/hashicorp/local" {
|
|||
"zh:fcafa360a5b0b96244f26f4e3a6d642b716a376557142c2442ff2fb12d11da18",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hetznercloud/hcloud" {
|
||||
version = "1.66.0"
|
||||
constraints = "~> 1.49"
|
||||
hashes = [
|
||||
"h1:iVAGP8gRbZK0kJF7SiYJRt61wz0D5AF9q+WMsrAiBI0=",
|
||||
"zh:1286cee6fb63dbcb18f53077bbb5e5d132a4e4d9f006af4e8d8edfc08d6bcdc8",
|
||||
"zh:204460dacc044bda019a4a18b398e094289500c36913c7c9457f432adf31b8b2",
|
||||
"zh:214175d50773481cbeaf9c9004e4121a3a1c9686c79424ebdc8ff189dd057d3e",
|
||||
"zh:22b17bceff61cc13ad04a399ba87521356a3a134d4687273727473ae9eccf5f1",
|
||||
"zh:368867dac5525c411de7e38f2e27de0a71854d1750867322ff2b9321128c88fb",
|
||||
"zh:5289b75f8370bdbc4c6051d55cf33d0b1bd25dc6d71bfbd39b360249a37f1501",
|
||||
"zh:81cb676aa50c5777df8fc80d4e69c9012330ae751f5e6f12bf6074bfd2e7c496",
|
||||
"zh:ab08aead10643b21aa6b51af562b50492e12b9dd0ab7dca27a05aa63209b7d66",
|
||||
"zh:af25c210d0570cf61ef767b2545bf9f3fb909178135f0e5e14bec0c1c9d07a63",
|
||||
"zh:bcad66f4830c97118fa793723e53f8a4d27ddd34ea969ff259408842c2238331",
|
||||
"zh:ce3ed323d75ae905d975925fa98c7054a7514c81276a485fc37da8232b53e39f",
|
||||
"zh:d481bc0ef0c87ab1969c17777f526b2f59f823432d676145134c41a6d29bd98e",
|
||||
"zh:ea7ef88df2c3ca154d86238920636d52a3c9066c7467543d3fa45f1e52ec2f7b",
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,39 +1,47 @@
|
|||
# test-fleet — distributed test/train infra (Hetzner)
|
||||
# test-fleet — distributed test/train infra (DigitalOcean)
|
||||
|
||||
Horizontally scales the iteration loop onto cheap ephemeral Hetzner cattle. One
|
||||
local command fans seeded sim batches (or RL training) across N disposable
|
||||
Horizontally scales the iteration loop onto cheap ephemeral DigitalOcean Droplets.
|
||||
One local command fans seeded sim batches (or RL training) across N disposable
|
||||
workers, collects results locally, and tears the fleet down. **Idle cost ≈ $0**
|
||||
(fleet defaults to 0 workers; only the golden snapshot bills, ~€0.10/mo).
|
||||
(fleet defaults to 0 workers; only the golden image bills, ~$0.40/mo).
|
||||
|
||||
## Layers
|
||||
|
||||
| Layer | Where | What |
|
||||
|---|---|---|
|
||||
| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → labelled snapshot |
|
||||
| Fleet | here | `workers = N` cattle from the snapshot, auto-discovered by label |
|
||||
| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → custom image |
|
||||
| Fleet | here | `workers = N` Droplets from the image, auto-discovered by name |
|
||||
| Dispatch | `scripts/run/dist.sh` | shard → fan out over ssh → collect → merge → teardown |
|
||||
|
||||
## Offline verification (no token, no spend)
|
||||
|
||||
```sh
|
||||
./run dist:check # terraform fmt + validate (schema typecheck) + mocked-provider test
|
||||
```
|
||||
|
||||
Run it anytime — before you even have a DO account. It uses a **mocked** provider.
|
||||
|
||||
## One-time setup
|
||||
|
||||
1. **Hetzner**: create a Cloud account + project + payment method. Generate a
|
||||
project-scoped **Read/Write API token**. (New accounts may need ID verification.)
|
||||
1. **DigitalOcean**: in the Control Panel → **API → Tokens**, generate a personal
|
||||
access token with **read+write** scope.
|
||||
2. **GitLab**: push the repo; note the clone URL (the workers' `origin`).
|
||||
3. **Build the golden image once** (see `../../packer/golden-image.pkr.hcl`):
|
||||
```sh
|
||||
export HCLOUD_TOKEN=<token>
|
||||
export DIGITALOCEAN_TOKEN=<token>
|
||||
packer init ../../packer/golden-image.pkr.hcl
|
||||
packer build -var git_remote=<gitlab-url> ../../packer/golden-image.pkr.hcl
|
||||
```
|
||||
4. **Auth env** for Terraform/dispatch:
|
||||
```sh
|
||||
export TF_VAR_hcloud_token=<token>
|
||||
export TF_VAR_do_token=<token>
|
||||
cp terraform.tfvars.example terraform.tfvars # set git_remote
|
||||
```
|
||||
|
||||
## Daily use
|
||||
|
||||
```sh
|
||||
./run dist:up 10 # 10 workers boot from the golden snapshot (~30s)
|
||||
./run dist:up 10 # 10 Droplets boot from the golden image (~30s)
|
||||
./run dist:sim 200 300 # 200 games / turn-limit 300, sharded 20/worker
|
||||
./run dist:down # destroy the fleet → back to ~$0
|
||||
# or fold teardown into the run:
|
||||
|
|
@ -45,29 +53,29 @@ worker via `SEED_OFFSET`, so no collisions). RL sweeps: `./run dist:train <steps
|
|||
|
||||
## Cost
|
||||
|
||||
Pure pay-as-you-go, billed hourly only while `workers > 0`:
|
||||
Pure pay-as-you-go, billed hourly only while `workers > 0` (⚠️ approximate — confirm in the DO console):
|
||||
|
||||
| | tier | rough cost |
|
||||
| | size | rough cost |
|
||||
|---|---|---|
|
||||
| `dist:sim` fan-out (bursty) | Shared **CPX** (`cpx41` default) | a 10×30-min run ≈ **cents** |
|
||||
| `dist:train` (sustained, hours @100%) | Dedicated **CCX** (`-var server_type=ccx33`) | ~€0.50 / 3.5h generation |
|
||||
| idle (fleet down) | snapshot only | **~€0.10/mo** |
|
||||
| `dist:sim` fan-out (bursty) | Basic `s-8vcpu-16gb` | ~$0.12/hr; a 10×30-min run ≈ **~$0.60** |
|
||||
| `dist:train` (sustained, hours @100%) | CPU-Optimized `c-8` (`./run dist:up N c-8`) | ~$0.25/hr |
|
||||
| idle (fleet down) | image storage only | **~$0.40/mo** ($0.06/GB/mo) |
|
||||
|
||||
Shared tiers are throttled under sustained load — use a Dedicated `ccx*` for long
|
||||
training runs (`./run dist:up N ccx33`), Shared `cpx*` for short test fan-out.
|
||||
DigitalOcean runs ~2–3× Hetzner's per-core price, but the cattle model keeps each
|
||||
run to cents-to-a-dollar since you only pay hourly while a fleet is up. Use a
|
||||
CPU-Optimized `c-*` for long training runs, Basic `s-*` for short test fan-out.
|
||||
|
||||
## Design notes / caveats
|
||||
|
||||
- **No persistent volume.** Workers are stateless; the golden image carries the
|
||||
warm clone + toolchain + prebuilt GDExtension. Results leave via `scp`/`rsync`.
|
||||
- **Image auto-discovery.** `data.hcloud_image.golden` selects the newest snapshot
|
||||
by label (`type=golden,project=magic-civilization`); rebuild with Packer and the
|
||||
fleet picks it up — no ID edits. Set `-var base_image=ubuntu-24.04` only to test
|
||||
`terraform plan` before any snapshot exists.
|
||||
- **No placement group.** Hetzner caps spread groups at 10 servers; for short-lived
|
||||
test cattle the HA spread isn't worth the `workers > 10` footgun.
|
||||
- **Image auto-discovery.** `data.digitalocean_images.golden` selects the newest
|
||||
custom image whose name contains `mc-golden` (filter `match_by = "substring"`,
|
||||
sort `created desc`); rebuild with Packer and the fleet picks it up — no ID
|
||||
edits. Set `-var base_image=ubuntu-24-04-x64` only to test `terraform plan`
|
||||
before any image exists.
|
||||
- **Coordinator needs GNU coreutils.** `tools/autoplay-batch.sh` uses `realpath -m`;
|
||||
on macOS install `coreutils` or run the dispatch from a Linux host.
|
||||
- **State holds the token** — `*.tfstate` and `terraform.tfvars` are gitignored.
|
||||
- GPU is intentionally absent: the workload is CPU-bound (`docs/ai-production.md`);
|
||||
rent spot GPU only if a profiler ever shows it saturated.
|
||||
rent a DO GPU Droplet only if a profiler ever shows the GPU saturated.
|
||||
|
|
|
|||
|
|
@ -2,32 +2,39 @@
|
|||
# No persistent volume: workers are stateless. The golden image carries the warm
|
||||
# clone + toolchain + prebuilt .so; results leave via the dispatch layer (scp).
|
||||
|
||||
resource "hcloud_ssh_key" "fleet" {
|
||||
resource "digitalocean_ssh_key" "fleet" {
|
||||
name = "${var.name}-key"
|
||||
public_key = file(pathexpand(var.ssh_public_key_path))
|
||||
}
|
||||
|
||||
# Resolve the newest golden snapshot by label. Skipped entirely when
|
||||
# Resolve the newest golden image by name substring. Skipped entirely when
|
||||
# var.base_image is set (bootstrap path), so `terraform plan` works before any
|
||||
# snapshot exists.
|
||||
data "hcloud_image" "golden" {
|
||||
count = var.base_image == "" ? 1 : 0
|
||||
with_selector = var.golden_selector
|
||||
with_architecture = "x86"
|
||||
most_recent = true
|
||||
# golden image exists.
|
||||
data "digitalocean_images" "golden" {
|
||||
count = var.base_image == "" ? 1 : 0
|
||||
|
||||
filter {
|
||||
key = "name"
|
||||
values = [var.golden_name_match]
|
||||
match_by = "substring"
|
||||
}
|
||||
sort {
|
||||
key = "created"
|
||||
direction = "desc"
|
||||
}
|
||||
}
|
||||
|
||||
locals {
|
||||
image = var.base_image != "" ? var.base_image : data.hcloud_image.golden[0].id
|
||||
image = var.base_image != "" ? var.base_image : tostring(data.digitalocean_images.golden[0].images[0].id)
|
||||
}
|
||||
|
||||
resource "hcloud_server" "worker" {
|
||||
count = var.workers
|
||||
name = "${var.name}-${count.index}"
|
||||
server_type = var.server_type
|
||||
location = var.location
|
||||
image = local.image
|
||||
ssh_keys = [hcloud_ssh_key.fleet.id]
|
||||
resource "digitalocean_droplet" "worker" {
|
||||
count = var.workers
|
||||
name = "${var.name}-${count.index}"
|
||||
size = var.size
|
||||
region = var.region
|
||||
image = local.image
|
||||
ssh_keys = [digitalocean_ssh_key.fleet.id]
|
||||
|
||||
# Thin cloud-init: copy the injected key to the build user and fast-forward
|
||||
# the warm clone to the requested ref. The golden image already holds the
|
||||
|
|
@ -37,9 +44,14 @@ resource "hcloud_server" "worker" {
|
|||
git_ref = var.git_ref
|
||||
remote_user = var.remote_user
|
||||
})
|
||||
|
||||
labels = {
|
||||
project = "magic-civilization"
|
||||
role = "test-fleet"
|
||||
}
|
||||
}
|
||||
|
||||
# Group fleet workers under the DigitalOcean project named by var.do_project (default "mc:dev").
|
||||
data "digitalocean_project" "mc" {
|
||||
name = var.do_project
|
||||
}
|
||||
|
||||
resource "digitalocean_project_resources" "fleet" {
|
||||
project = data.digitalocean_project.mc.id
|
||||
resources = [for d in digitalocean_droplet.worker : d.urn]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,17 +3,17 @@ locals {
|
|||
repo_root = abspath("${path.module}/../../..")
|
||||
inventory_path = "${local.repo_root}/.local/fleet/inventory"
|
||||
# One "<user>@<ipv4>" line per worker — consumed by scripts/run/dist.sh.
|
||||
inventory_body = join("\n", [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"])
|
||||
inventory_body = join("\n", [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"])
|
||||
}
|
||||
|
||||
output "worker_ips" {
|
||||
description = "Public IPv4 of each fleet worker."
|
||||
value = [for s in hcloud_server.worker : s.ipv4_address]
|
||||
value = [for d in digitalocean_droplet.worker : d.ipv4_address]
|
||||
}
|
||||
|
||||
output "worker_hosts" {
|
||||
description = "ssh targets (<user>@<ip>) the dispatch layer fans work across."
|
||||
value = [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"]
|
||||
value = [for d in digitalocean_droplet.worker : "${var.remote_user}@${d.ipv4_address}"]
|
||||
}
|
||||
|
||||
output "inventory_path" {
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
# Copy to terraform.tfvars and fill in. terraform.tfvars is gitignored.
|
||||
# The token is best passed via env instead: export TF_VAR_hcloud_token=...
|
||||
# The token is best passed via env instead: export TF_VAR_do_token=...
|
||||
|
||||
# Required: GitLab origin the golden image was built from.
|
||||
git_remote = "https://gitlab.com/<you>/magic-civilization.git"
|
||||
|
||||
# Optional overrides (defaults shown).
|
||||
# location = "ash" # Ashburn VA (~near NYC)
|
||||
# server_type = "cpx41" # bursty test/sim; use ccx33 for sustained training
|
||||
# region = "nyc3" # NYC (also: nyc1, sfo3, ams3, fra1)
|
||||
# size = "s-8vcpu-16gb" # bursty test/sim; use c-8 for sustained training
|
||||
# git_ref = "main"
|
||||
# remote_user = "mc"
|
||||
# ssh_public_key_path = "~/.ssh/id_ed25519.pub"
|
||||
|
|
@ -15,5 +15,5 @@ git_remote = "https://gitlab.com/<you>/magic-civilization.git"
|
|||
# (./run dist:up N -> -var workers=N), not pinned here.
|
||||
# workers = 0
|
||||
|
||||
# Bootstrap only: set to test `terraform plan` before a golden snapshot exists.
|
||||
# base_image = "ubuntu-24.04"
|
||||
# Bootstrap only: set to test `terraform plan` before a golden image exists.
|
||||
# base_image = "ubuntu-24-04-x64"
|
||||
|
|
|
|||
|
|
@ -1,12 +1,21 @@
|
|||
# No-spend test harness for the fleet module.
|
||||
# terraform test (from the module dir)
|
||||
# Uses a MOCKED hcloud provider — no API token, no API calls, no servers, no cost.
|
||||
# Exercises count expansion, the golden-image branch toggle, and the workers guardrail.
|
||||
# Uses a MOCKED digitalocean provider — no API token, no API calls, no Droplets,
|
||||
# no cost. Exercises count expansion, the golden-image branch toggle, and the
|
||||
# workers guardrail.
|
||||
|
||||
mock_provider "hcloud" {}
|
||||
mock_provider "digitalocean" {
|
||||
# The golden-image data source is computed; give it a non-empty result so the
|
||||
# base_image == "" branch (images[0].id) resolves under mocking.
|
||||
mock_data "digitalocean_images" {
|
||||
defaults = {
|
||||
images = [{ id = 123456789 }]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
variables {
|
||||
hcloud_token = "mock-token-unused"
|
||||
do_token = "mock-token-unused"
|
||||
git_remote = "https://example.com/magic-civilization.git"
|
||||
ssh_public_key_path = "./tests/fixtures/id_test.pub"
|
||||
}
|
||||
|
|
@ -17,21 +26,21 @@ run "fleet_expands_and_skips_golden_when_base_image_set" {
|
|||
|
||||
variables {
|
||||
workers = 3
|
||||
base_image = "ubuntu-24.04"
|
||||
base_image = "ubuntu-24-04-x64"
|
||||
}
|
||||
|
||||
assert {
|
||||
condition = length(hcloud_server.worker) == 3
|
||||
condition = length(digitalocean_droplet.worker) == 3
|
||||
error_message = "expected 3 workers when workers=3"
|
||||
}
|
||||
|
||||
assert {
|
||||
condition = length(data.hcloud_image.golden) == 0
|
||||
condition = length(data.digitalocean_images.golden) == 0
|
||||
error_message = "golden data source must be skipped when base_image is set"
|
||||
}
|
||||
}
|
||||
|
||||
# base_image empty -> golden snapshot is resolved via the label selector.
|
||||
# base_image empty -> golden image is resolved via the name-substring filter.
|
||||
run "golden_image_branch_active_when_base_image_empty" {
|
||||
command = plan
|
||||
|
||||
|
|
@ -41,28 +50,28 @@ run "golden_image_branch_active_when_base_image_empty" {
|
|||
}
|
||||
|
||||
assert {
|
||||
condition = length(data.hcloud_image.golden) == 1
|
||||
condition = length(data.digitalocean_images.golden) == 1
|
||||
error_message = "golden data source must be queried when base_image is empty"
|
||||
}
|
||||
|
||||
assert {
|
||||
condition = length(hcloud_server.worker) == 2
|
||||
condition = length(digitalocean_droplet.worker) == 2
|
||||
error_message = "expected 2 workers when workers=2"
|
||||
}
|
||||
}
|
||||
|
||||
# workers = 0 -> zero servers (idle / torn-down state).
|
||||
# workers = 0 -> zero Droplets (idle / torn-down state).
|
||||
run "zero_workers_is_empty_fleet" {
|
||||
command = plan
|
||||
|
||||
variables {
|
||||
workers = 0
|
||||
base_image = "ubuntu-24.04"
|
||||
base_image = "ubuntu-24-04-x64"
|
||||
}
|
||||
|
||||
assert {
|
||||
condition = length(hcloud_server.worker) == 0
|
||||
error_message = "workers=0 must produce no servers"
|
||||
condition = length(digitalocean_droplet.worker) == 0
|
||||
error_message = "workers=0 must produce no Droplets"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -72,7 +81,7 @@ run "rejects_oversize_fleet" {
|
|||
|
||||
variables {
|
||||
workers = 99
|
||||
base_image = "ubuntu-24.04"
|
||||
base_image = "ubuntu-24-04-x64"
|
||||
}
|
||||
|
||||
expect_failures = [var.workers]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
variable "hcloud_token" {
|
||||
description = "Hetzner Cloud API token (project-scoped, Read/Write). Export as TF_VAR_hcloud_token; never commit."
|
||||
variable "do_token" {
|
||||
description = "DigitalOcean API token (Read/Write). Export as TF_VAR_do_token; never commit."
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
|
|
@ -7,7 +7,7 @@ variable "hcloud_token" {
|
|||
variable "workers" {
|
||||
description = <<-EOT
|
||||
Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
|
||||
Set to N to fan distributed sim/test work across N disposable workers, then
|
||||
Set to N to fan distributed sim/test work across N disposable Droplets, then
|
||||
back to 0 to tear the fleet down. Each worker is identical cattle; results
|
||||
are scp'd off by the dispatch layer before teardown, so there is no
|
||||
per-worker state to preserve.
|
||||
|
|
@ -17,44 +17,45 @@ variable "workers" {
|
|||
|
||||
validation {
|
||||
condition = var.workers >= 0 && var.workers <= 50
|
||||
error_message = "Keep the fleet between 0 and 50 (project-quota / sanity guard)."
|
||||
error_message = "Keep the fleet between 0 and 50 (account-limit / sanity guard)."
|
||||
}
|
||||
}
|
||||
|
||||
variable "location" {
|
||||
description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1."
|
||||
variable "region" {
|
||||
description = "DigitalOcean region slug. NYC: nyc1, nyc3. Others: sfo3, ams3, fra1, sgp1."
|
||||
type = string
|
||||
default = "ash"
|
||||
default = "nyc3"
|
||||
}
|
||||
|
||||
variable "server_type" {
|
||||
variable "size" {
|
||||
description = <<-EOT
|
||||
Per-worker size. Distributed fan-out favours many small cheap boxes over one
|
||||
big one (finer shard granularity per euro). Match tier to CPU profile:
|
||||
Bursty test/sim (minutes) -> Shared "Regular" CPX:
|
||||
cpx31 = 4 vCPU / 8 GB, cpx41 = 8 vCPU / 16 GB (default), cpx51 = 16/32
|
||||
Sustained RL training (hours @ 100%) -> Dedicated CCX (no shared-tier throttle):
|
||||
ccx33 = 8 vCPU / 32 GB, ccx43 = 16 vCPU / 64 GB
|
||||
Per-worker Droplet size slug. Distributed fan-out favours many small cheap
|
||||
boxes over one big one. Match tier to CPU profile:
|
||||
Bursty test/sim (minutes) -> Basic shared:
|
||||
s-4vcpu-8gb, s-8vcpu-16gb (default), s-8vcpu-16gb-amd
|
||||
Sustained RL training (hours @ 100%) -> CPU-Optimized (dedicated vCPU):
|
||||
c-8 = 8 vCPU / 16 GB, c-16 = 16 vCPU / 32 GB
|
||||
EOT
|
||||
type = string
|
||||
default = "cpx41"
|
||||
default = "s-8vcpu-16gb"
|
||||
}
|
||||
|
||||
variable "base_image" {
|
||||
description = <<-EOT
|
||||
Bootstrap escape hatch. Leave empty ("") to boot workers from the latest
|
||||
Packer-built golden snapshot (resolved by label via the hcloud_image data
|
||||
source). Set to a stock image name (e.g. "ubuntu-24.04") only for first-run
|
||||
Terraform plan testing BEFORE any golden snapshot exists.
|
||||
Packer-built golden image (resolved by name-substring via the
|
||||
digitalocean_images data source). Set to a stock image slug (e.g.
|
||||
"ubuntu-24-04-x64") only for first-run `terraform plan` testing BEFORE any
|
||||
golden image exists.
|
||||
EOT
|
||||
type = string
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "golden_selector" {
|
||||
description = "Label selector identifying the Packer golden snapshot. Must match the labels set in infra/packer/golden-image.pkr.hcl."
|
||||
variable "golden_name_match" {
|
||||
description = "Name substring identifying the Packer golden image. Must match snapshot_name prefix in infra/packer/golden-image.pkr.hcl."
|
||||
type = string
|
||||
default = "type=golden,project=magic-civilization"
|
||||
default = "mc-golden"
|
||||
}
|
||||
|
||||
variable "ssh_public_key_path" {
|
||||
|
|
@ -86,3 +87,9 @@ variable "remote_user" {
|
|||
type = string
|
||||
default = "mc"
|
||||
}
|
||||
|
||||
variable "do_project" {
|
||||
description = "DigitalOcean project name to group fleet resources under."
|
||||
type = string
|
||||
default = "mc:dev"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@ terraform {
|
|||
required_version = ">= 1.6"
|
||||
|
||||
required_providers {
|
||||
hcloud = {
|
||||
source = "hetznercloud/hcloud"
|
||||
version = "~> 1.49"
|
||||
digitalocean = {
|
||||
source = "digitalocean/digitalocean"
|
||||
version = "~> 2.0"
|
||||
}
|
||||
local = {
|
||||
source = "hashicorp/local"
|
||||
|
|
@ -13,6 +13,6 @@ terraform {
|
|||
}
|
||||
}
|
||||
|
||||
provider "hcloud" {
|
||||
token = var.hcloud_token
|
||||
provider "digitalocean" {
|
||||
token = var.do_token
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
#!/usr/bin/env bash
|
||||
# Distributed test/train dispatch — fan the iteration loop across the Hetzner
|
||||
# Distributed test/train dispatch — fan the iteration loop across the DigitalOcean
|
||||
# test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the
|
||||
# cmd_<verb>_<target> name-dispatch, so no edit to the top-level `run` is needed.
|
||||
#
|
||||
# ./run dist:up <workers> [server_type] [location] spin the fleet up
|
||||
# ./run dist:up <workers> [size] [region] spin the fleet up
|
||||
# ./run dist:sim <games> [turn_limit] [--destroy-after] fan a sim batch across it
|
||||
# ./run dist:train <total_steps> [--destroy-after] fan an RL sweep across it
|
||||
# ./run dist:down tear it down (zero cost)
|
||||
#
|
||||
# Requires: TF_VAR_hcloud_token in env, terraform on PATH, and a coordinator with
|
||||
# Requires: TF_VAR_do_token in env, terraform on PATH, and a coordinator with
|
||||
# GNU coreutils (autoplay-batch.sh uses `realpath -m`).
|
||||
|
||||
_DIST_TF_DIR_REL="infra/terraform/test-fleet"
|
||||
|
|
@ -29,9 +29,9 @@ _dist_read_hosts() {
|
|||
|
||||
cmd_dist() {
|
||||
cat <<'EOF'
|
||||
Distributed test/train fleet (Hetzner). Set TF_VAR_hcloud_token first.
|
||||
Distributed test/train fleet (DigitalOcean). Set TF_VAR_do_token first.
|
||||
./run dist:check offline: fmt + validate + mocked test (no token/spend)
|
||||
./run dist:up <workers> [server_type] [location] e.g. ./run dist:up 10
|
||||
./run dist:up <workers> [size] [region] e.g. ./run dist:up 10
|
||||
./run dist:sim <games> [turn_limit] [--destroy-after]
|
||||
./run dist:train <total_steps> [--destroy-after]
|
||||
./run dist:down
|
||||
|
|
@ -39,7 +39,7 @@ EOF
|
|||
}
|
||||
|
||||
cmd_dist_check() {
|
||||
# Offline IaC verification — no Hetzner token, no API, no servers, no cost.
|
||||
# Offline IaC verification — no DigitalOcean token, no API, no servers, no cost.
|
||||
# fmt (style) + validate (schema typecheck) + test (mocked-provider behaviour).
|
||||
local root
|
||||
root="$(_dist_repo_root)"
|
||||
|
|
@ -50,27 +50,27 @@ cmd_dist_check() {
|
|||
terraform -chdir="$dir" init -backend=false -input=false >/dev/null || return 1
|
||||
echo "== terraform validate (schema typecheck) =="
|
||||
terraform -chdir="$dir" validate || return 1
|
||||
echo "== terraform test (mocked hcloud) =="
|
||||
echo "== terraform test (mocked digitalocean) =="
|
||||
terraform -chdir="$dir" test || return 1
|
||||
echo "dist:check OK — config is valid, no resources touched."
|
||||
}
|
||||
|
||||
#######################################
# Spin the test fleet up to N workers via `_dist_tf apply`.
# Globals:   TF_VAR_do_token (read; must be set and non-empty)
# Arguments: $1 - worker count (non-negative integer, required)
#            $2 - optional Droplet size slug, passed as -var size=...
#            $3 - optional region slug, passed as -var region=...
# Outputs:   usage text on stderr for a bad count; a "fleet up" summary on
#            stdout only after a successful apply
# Returns:   0 on success, 1 on bad usage or when init/apply fails
#######################################
cmd_dist_up() {
  local n="${1:-}"
  [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up <workers> [size] [region]" >&2; return 1; }
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"

  # Build the argv as an array so optional overrides are added as whole words.
  local args=(-auto-approve -var "workers=$n")
  [ -n "${2:-}" ] && args+=(-var "size=$2")
  [ -n "${3:-}" ] && args+=(-var "region=$3")

  # Stop at the first failing step (same convention as cmd_dist_check): a
  # failed init/apply must not be followed by a misleading "fleet up" line.
  _dist_tf init -input=false >/dev/null || return 1
  _dist_tf apply "${args[@]}" || return 1
  echo "fleet up: $n worker(s). inventory: $(_dist_repo_root)/.local/fleet/inventory"
}
|
||||
|
||||
#######################################
# Tear the fleet down by applying workers=0 via `_dist_tf`.
# Globals:   TF_VAR_do_token (read; must be set and non-empty)
# Arguments: none
# Outputs:   a "fleet down" summary on stdout only after a successful apply
# Returns:   0 on success, 1 when the apply fails
#######################################
cmd_dist_down() {
  : "${TF_VAR_do_token:?export TF_VAR_do_token=<DigitalOcean API token> first}"
  # Do not claim "zero compute cost" if the teardown did not actually succeed.
  _dist_tf apply -auto-approve -var "workers=0" || return 1
  # The dollar sign is escaped: inside double quotes an unescaped $0 expands to
  # the script name and would mangle the price.
  echo "fleet down (workers=0): zero compute cost, snapshot only (~\$0.40/mo)."
}
|
||||
|
||||
cmd_dist_sim() {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue