feat(@projects/@magic-civilization): 🩹 p3-29 T2 — Rust turn emits UnitHealed

The live GDScript turn emitted `unit_healed` inline; the headless healing
phase recovered HP silently. The healing phase runs in the end-of-turn
`fn(&mut GameState)` registry (no event sink), so follow the FloraSuccession
buffer pattern: stash `(player, unit_id, applied_amount, col, row)` into a new
transient `GameState.pending_heal_events`, drain it in `step()` into
`TurnEvent::UnitHealed`. The buffered amount is the CLAMPED delta actually
applied (not the nominal heal rate). No wire surface — dispatch drops it; the
live UI consumes it via the kind-tagged `event_to_dict` dict.

Verified headless: mc-replay 19/0 (unit_healed_serde), mc-turn 289/0
(healing_buffers_unit_heal_event_with_applied_amount +
healing_buffers_clamped_amount_near_full_hp + event_collector_wiring).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-27 06:12:07 -04:00
parent 236a5058e5
commit 158ef4d1bd
21 changed files with 780 additions and 126 deletions

View file

@ -0,0 +1,85 @@
// Bakes a Hetzner snapshot with the full toolchain + a warm clone + a prebuilt
// GDExtension + a warm Godot import cache, so fleet workers boot build-ready in
// ~30s instead of running rustup/godot-install on every spin-up.
//
// Build once:
// export HCLOUD_TOKEN=... # or pass -var hcloud_token=...
// packer init infra/packer/golden-image.pkr.hcl
// packer build -var git_remote=https://gitlab.com/<you>/magic-civilization.git \
// infra/packer/golden-image.pkr.hcl
//
// The snapshot is labelled type=golden,project=magic-civilization; the test-fleet
// Terraform module auto-discovers the newest one by that label.
packer {
required_plugins {
hcloud = {
source = "github.com/hetznercloud/hcloud"
version = ">= 1.5.0"
}
}
}
variable "hcloud_token" {
type = string
sensitive = true
default = env("HCLOUD_TOKEN")
}
variable "location" {
type = string
default = "ash"
}
# A one-off dedicated box builds fast (cargo + godot import are CPU-heavy);
# it only exists for the duration of the build.
variable "build_server_type" {
type = string
default = "ccx33"
}
# Clone URL of the repository baked into the golden image. Required (no
# default). provision.sh aborts on an empty GIT_REMOTE, but only after the
# build server has already been created — so reject an empty value here, at
# validate time, before any billable resource exists.
variable "git_remote" {
  type        = string
  description = "Clone URL of the repository to bake into the golden image (required)."

  validation {
    condition     = length(trimspace(var.git_remote)) > 0
    error_message = "The git_remote variable must be a non-empty clone URL."
  }
}
variable "git_ref" {
type = string
default = "main"
}
variable "remote_user" {
type = string
default = "mc"
}
locals {
ts = formatdate("YYYYMMDDhhmmss", timestamp())
}
# Builder definition: boots a server from the stock Ubuntu 24.04 image and
# saves the result as a labelled snapshot named "mc-golden-<timestamp>".
source "hcloud" "golden" {
token = var.hcloud_token
image = "ubuntu-24.04"
location = var.location
server_type = var.build_server_type
# provision.sh is written to run as root (it installs packages and creates
# the build user itself).
ssh_username = "root"
# local.ts is derived from timestamp(), so each build gets its own name.
snapshot_name = "mc-golden-${local.ts}"
# Discovery key: the test-fleet Terraform module selects the snapshot by
# these labels (its golden_selector default is
# "type=golden,project=magic-civilization"), so keep the two in sync.
snapshot_labels = {
type = "golden"
project = "magic-civilization"
}
}
build {
sources = ["source.hcloud.golden"]
provisioner "shell" {
environment_vars = [
"GIT_REMOTE=${var.git_remote}",
"GIT_REF=${var.git_ref}",
"BUILD_USER=${var.remote_user}",
]
execute_command = "chmod +x {{ .Path }}; env {{ .Vars }} bash {{ .Path }}"
script = "${path.root}/provision.sh"
}
}

69
infra/packer/provision.sh Executable file
View file

@ -0,0 +1,69 @@
#!/usr/bin/env bash
# Golden-image provisioner. Runs as root on a fresh Ubuntu 24.04 box during
# `packer build`. Produces a build-ready image for the distributed test fleet:
# - a build user (flatpak runs --user as this account; root flatpak is unsupported)
# - the repo cloned at $HOME/Code/@projects/@magic-civilization (run_ap3.sh hard-codes this path)
# - the full toolchain via the repo's own scripts/dev-setup/linux.sh (DRY)
# - a prebuilt GDExtension .so (remote autoplay-batch skips the rebuild)
# - a warm Godot import cache + warm cargo registry
#
# Env (injected by Packer): GIT_REMOTE (required), GIT_REF, BUILD_USER.
set -euo pipefail
GIT_REMOTE="${GIT_REMOTE:?GIT_REMOTE must be set}"
GIT_REF="${GIT_REF:-main}"
BUILD_USER="${BUILD_USER:-mc}"
REPO_PATH="Code/@projects/@magic-civilization" # relative to the build user's HOME
echo "=== [1/7] base packages ==="
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y --no-install-recommends \
git curl ca-certificates build-essential pkg-config libssl-dev \
unzip sudo python3-pip flatpak rsync
echo "=== [2/7] build user '$BUILD_USER' ==="
if ! id "$BUILD_USER" >/dev/null 2>&1; then
useradd --create-home --shell /bin/bash "$BUILD_USER"
fi
BUILD_UID="$(id -u "$BUILD_USER")"
# Enable lingering so /run/user/$UID (and the user D-Bus flatpak needs for
# headless --import) exists without an interactive login.
loginctl enable-linger "$BUILD_USER" || true
# Helper: run a command as the build user in a login shell with the user
# runtime dir wired up (matches autoplay-batch.sh's XDG_RUNTIME_DIR handling).
#
# Usage: as_user "<command string>"
#   $1  a single string handed to `bash -lc`; it is parsed by the build
#       user's shell, so anything interpolated into it must be quoted by the
#       caller. Only $1 is used — extra arguments are ignored.
# The exit status is whatever the sudo invocation returns.
as_user() {
# -H sets HOME to the build user's home, so `~` inside the command string
# resolves to that account rather than to root.
sudo -u "$BUILD_USER" -H \
env "XDG_RUNTIME_DIR=/run/user/${BUILD_UID}" \
bash -lc "$1"
}
echo "=== [3/7] clone repo @ \$HOME/$REPO_PATH (ref $GIT_REF) ==="
as_user "mkdir -p ~/$(dirname "$REPO_PATH")"
as_user "git clone '$GIT_REMOTE' ~/$REPO_PATH"
as_user "cd ~/$REPO_PATH && git checkout -f '$GIT_REF'"
echo "=== [4/7] toolchain via scripts/dev-setup/linux.sh ==="
# WITH_RUNNER must be defined: linux.sh references it unguarded under set -u and
# we use GitLab CI, not a forgejo runner, so keep it false.
as_user "cd ~/$REPO_PATH && WITH_RUNNER=false bash scripts/dev-setup/linux.sh"
echo "=== [5/7] python RL deps ==="
as_user "pip3 install --user --break-system-packages -r ~/$REPO_PATH/tooling/rl_self_play/requirements.txt || pip3 install --user -r ~/$REPO_PATH/tooling/rl_self_play/requirements.txt"
echo "=== [6/7] prebuild GDExtension + warm cargo registry ==="
# Remote autoplay-batch mode does NOT rebuild the .so (tools/autoplay-batch.sh:144),
# so the golden image must ship a fresh one.
as_user "cd ~/$REPO_PATH/src/simulator && source ~/.cargo/env && cargo fetch && bash build-gdext.sh"
echo "=== [7/7] place run_ap3.sh in ~/bin + warm Godot import cache ==="
# autoplay-batch.sh expects the runner at \$HOME/bin/run_ap3.sh (tools/autoplay-batch.sh:372).
as_user "mkdir -p ~/bin && cp ~/$REPO_PATH/scripts/autoplay/run_ap3.sh ~/bin/run_ap3.sh && chmod +x ~/bin/run_ap3.sh"
# First-import gotcha: a fresh checkout needs one --headless --import to build
# .godot/*.cfg or GDExtension classes resolve as 'not declared'. Non-fatal if it
# flakes here — the first real run rebuilds it — but baking it makes boots clean.
as_user "cd ~/$REPO_PATH && flatpak run --user org.godotengine.Godot --path src/game --headless --import" || \
echo "WARN: headless --import did not complete cleanly — validate in the live smoke"
echo "=== golden image provisioned OK ==="

View file

@ -1,51 +0,0 @@
locals {
server_count = var.enabled ? 1 : 0
}
resource "hcloud_ssh_key" "runner" {
name = "${var.name}-key"
public_key = file(pathexpand(var.ssh_public_key_path))
}
# Persistent data volume deliberately NOT gated on var.enabled, so it lives
# across server destroy/recreate. This is what makes the server ephemeral:
# the slow-to-rebuild state (cargo cache, target/, the clone, RL checkpoints)
# stays here, the compute is disposable.
resource "hcloud_volume" "data" {
name = "${var.name}-data"
size = var.volume_size
location = var.location
format = "ext4"
}
resource "hcloud_server" "runner" {
count = local.server_count
name = var.name
server_type = var.server_type
location = var.location
image = "ubuntu-24.04"
ssh_keys = [hcloud_ssh_key.runner.id]
user_data = templatefile("${path.module}/cloud-init.yaml", {
volume_id = hcloud_volume.data.id
git_remote = var.git_remote
})
labels = {
project = "magic-civilization"
role = "cpu-runner"
}
# Keep the box if it is briefly toggled; protects against an accidental apply
# nuking an in-flight training run. Remove if you want hard ephemerality.
lifecycle {
ignore_changes = [ssh_keys]
}
}
resource "hcloud_volume_attachment" "data" {
count = local.server_count
volume_id = hcloud_volume.data.id
server_id = hcloud_server.runner[0].id
automount = false # cloud-init mounts it deterministically by id
}

View file

@ -1,74 +0,0 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token (project-scoped). Export as TF_VAR_hcloud_token; never commit."
type = string
sensitive = true
}
variable "workers" {
description = <<-EOT
Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
Set to N to fan distributed sim/test work across N cattle, then back to 0
to tear the fleet down. Each worker is identical and disposable; results
are rsynced off before destroy, so there is no per-worker state to keep.
EOT
type = number
default = 0
validation {
condition = var.workers >= 0 && var.workers <= 50
error_message = "Keep the fleet between 0 and 50 (project-quota / sanity guard)."
}
}
variable "location" {
description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1."
type = string
default = "ash"
}
variable "server_type" {
description = <<-EOT
Per-worker size. Distributed fan-out favours many small cheap boxes over one
big one (finer shard granularity per euro). Shared-vCPU cpx line is cheapest:
cpx31 = 4 vCPU / 8 GB (fine granularity, cheapest unit)
cpx41 = 8 vCPU / 16 GB (default; PARALLEL=8 games/worker)
cpx51 = 16 vCPU / 32 GB (fewer, fatter workers; also for RL self-play envs)
EOT
type = string
default = "cpx41"
}
variable "image" {
description = <<-EOT
Boot image. Default is the stock Ubuntu base — workers then run the full
toolchain install via cloud-init on first boot (~3-4 min, parallel across the
fleet). After you bake a golden snapshot with the Packer template in
../../packer, set this to that snapshot's ID for ~30 s ready-to-work boots.
EOT
type = string
default = "ubuntu-24.04"
}
variable "ssh_public_key_path" {
description = "Public key authorised for SSH into every worker (also used by the dispatch script)."
type = string
default = "~/.ssh/id_ed25519.pub"
}
variable "name" {
description = "Resource name prefix; workers are named <name>-0, <name>-1, ..."
type = string
default = "mc-test"
}
variable "git_remote" {
description = "GitLab clone URL (origin) the workers pull source from. Required for cloud-init to fetch the repo."
type = string
default = ""
}
variable "git_ref" {
description = "Branch/tag/SHA the fleet checks out. Pin to a SHA for reproducible distributed runs."
type = string
default = "main"
}

7
infra/terraform/test-fleet/.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
# Terraform state holds the hcloud token in plaintext — never commit it.
*.tfstate
*.tfstate.*
.terraform/
terraform.tfvars
# NOTE: .terraform.lock.hcl is intentionally committed (pins provider hashes).
crash.log

View file

@ -0,0 +1,44 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/hashicorp/local" {
version = "2.9.0"
constraints = "~> 2.5"
hashes = [
"h1:m24fjcInWvTVZ1XSo2MaNuKPe+X/gfG8SIi09rA7a7M=",
"zh:0baa4566cf77f1ff52f4293d1c8536202dd23edc197c3196413a28343c3ac3a0",
"zh:16b5559c3c07088ddad11a9bb9e9c0799999363c2958e9a5be2bcbbf2cd9ca64",
"zh:197c79015a10d1cce904a8ea722cbc750c42aeae2da53f44a6a0751d9fd1aa90",
"zh:29d0b03e5343a80677ebfeb2e2c31cbe4b1f65e736e53417454a4277fec2544c",
"zh:4896bfa6cf1d2fd562b47ef2e87f47862ae92a04f8ad5d764380f0c6653473b8",
"zh:531f8529cbca49f681883e57761a05a8398afaef6d1ab0d205d26bf12f4428e8",
"zh:6aaf5011d83161c86d2bfb80c0923ec934e578288758da2f37acb7aec129004b",
"zh:7430275253d3d3c40aa6179e0ec0d63212874dbbc06c5a51b9d07ec590f9756c",
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
"zh:be17dc611e95e26cdf6cad79dfccf1064f0e32032a2efeb939a9bbe7fb1cbfe9",
"zh:f0e3b0aa644202e1d79d2000dca91f6019425da71e9800fa23f27e51c034f195",
"zh:f62bae4519e4ead49182ddc8afe8cf61e2a4c3ba3973b0fbba967736a2696aa3",
"zh:fcafa360a5b0b96244f26f4e3a6d642b716a376557142c2442ff2fb12d11da18",
]
}
provider "registry.terraform.io/hetznercloud/hcloud" {
version = "1.66.0"
constraints = "~> 1.49"
hashes = [
"h1:iVAGP8gRbZK0kJF7SiYJRt61wz0D5AF9q+WMsrAiBI0=",
"zh:1286cee6fb63dbcb18f53077bbb5e5d132a4e4d9f006af4e8d8edfc08d6bcdc8",
"zh:204460dacc044bda019a4a18b398e094289500c36913c7c9457f432adf31b8b2",
"zh:214175d50773481cbeaf9c9004e4121a3a1c9686c79424ebdc8ff189dd057d3e",
"zh:22b17bceff61cc13ad04a399ba87521356a3a134d4687273727473ae9eccf5f1",
"zh:368867dac5525c411de7e38f2e27de0a71854d1750867322ff2b9321128c88fb",
"zh:5289b75f8370bdbc4c6051d55cf33d0b1bd25dc6d71bfbd39b360249a37f1501",
"zh:81cb676aa50c5777df8fc80d4e69c9012330ae751f5e6f12bf6074bfd2e7c496",
"zh:ab08aead10643b21aa6b51af562b50492e12b9dd0ab7dca27a05aa63209b7d66",
"zh:af25c210d0570cf61ef767b2545bf9f3fb909178135f0e5e14bec0c1c9d07a63",
"zh:bcad66f4830c97118fa793723e53f8a4d27ddd34ea969ff259408842c2238331",
"zh:ce3ed323d75ae905d975925fa98c7054a7514c81276a485fc37da8232b53e39f",
"zh:d481bc0ef0c87ab1969c17777f526b2f59f823432d676145134c41a6d29bd98e",
"zh:ea7ef88df2c3ca154d86238920636d52a3c9066c7467543d3fa45f1e52ec2f7b",
]
}

View file

@ -0,0 +1,73 @@
# test-fleet — distributed test/train infra (Hetzner)
Horizontally scales the iteration loop onto cheap ephemeral Hetzner cattle. One
local command fans seeded sim batches (or RL training) across N disposable
workers, collects results locally, and tears the fleet down. **Idle cost ≈ €0**
(fleet defaults to 0 workers; only the golden snapshot bills, ~€0.10/mo).
## Layers
| Layer | Where | What |
|---|---|---|
| Golden image | `../../packer/` | Packer bakes toolchain + warm clone + prebuilt `.so` → labelled snapshot |
| Fleet | here | `workers = N` cattle from the snapshot, auto-discovered by label |
| Dispatch | `scripts/run/dist.sh` | shard → fan out over ssh → collect → merge → teardown |
## One-time setup
1. **Hetzner**: create a Cloud account + project + payment method. Generate a
project-scoped **Read/Write API token**. (New accounts may need ID verification.)
2. **GitLab**: push the repo; note the clone URL (the workers' `origin`).
3. **Build the golden image once** (see `../../packer/golden-image.pkr.hcl`):
```sh
export HCLOUD_TOKEN=<token>
packer init ../../packer/golden-image.pkr.hcl
packer build -var git_remote=<gitlab-url> ../../packer/golden-image.pkr.hcl
```
4. **Auth env** for Terraform/dispatch:
```sh
export TF_VAR_hcloud_token=<token>
cp terraform.tfvars.example terraform.tfvars # set git_remote
```
## Daily use
```sh
./run dist:up 10 # 10 workers boot from the golden snapshot (~30s)
./run dist:sim 200 300 # 200 games / turn-limit 300, sharded 20/worker
./run dist:down # destroy the fleet → back to ~€0
# or fold teardown into the run:
./run dist:sim 200 300 --destroy-after
```
Results land merged under `.local/iter/<stamp>/` (disjoint seed numbers per
worker via `SEED_OFFSET`, so no collisions). RL sweeps: `./run dist:train <steps>`.
## Cost
Pure pay-as-you-go, billed hourly only while `workers > 0`:
| | tier | rough cost |
|---|---|---|
| `dist:sim` fan-out (bursty) | Shared **CPX** (`cpx41` default) | a 10×30-min run ≈ **cents** |
| `dist:train` (sustained, hours @100%) | Dedicated **CCX** (`-var server_type=ccx33`) | ~€0.50 / 3.5h generation |
| idle (fleet down) | snapshot only | **~€0.10/mo** |
Shared tiers are throttled under sustained load — use a Dedicated `ccx*` for long
training runs (`./run dist:up N ccx33`), Shared `cpx*` for short test fan-out.
## Design notes / caveats
- **No persistent volume.** Workers are stateless; the golden image carries the
warm clone + toolchain + prebuilt GDExtension. Results leave via `scp`/`rsync`.
- **Image auto-discovery.** `data.hcloud_image.golden` selects the newest snapshot
by label (`type=golden,project=magic-civilization`); rebuild with Packer and the
fleet picks it up — no ID edits. Set `-var base_image=ubuntu-24.04` only to test
`terraform plan` before any snapshot exists.
- **No placement group.** Hetzner caps spread groups at 10 servers; for short-lived
test cattle the HA spread isn't worth the `workers > 10` footgun.
- **Coordinator needs GNU coreutils.** `tools/autoplay-batch.sh` uses `realpath -m`;
on macOS install `coreutils` or run the dispatch from a Linux host.
- **State holds the token** — `*.tfstate` and `terraform.tfvars` are gitignored.
- GPU is intentionally absent: the workload is CPU-bound (`docs/ai-production.md`);
rent spot GPU only if a profiler ever shows it saturated.

View file

@ -0,0 +1,9 @@
#cloud-config
# Thin boot script for golden-image workers. The image already carries the
# build user, the warm clone, the toolchain, and the prebuilt GDExtension — so
# all this does is (1) authorise the injected ssh key for the build user and
# (2) fetch the requested git ref and hard-reset the clone to it.
runcmd:
- install -d -m 700 -o ${remote_user} -g ${remote_user} /home/${remote_user}/.ssh
- install -m 600 -o ${remote_user} -g ${remote_user} /root/.ssh/authorized_keys /home/${remote_user}/.ssh/authorized_keys
- sudo -u ${remote_user} -H bash -lc 'cd ~/Code/@projects/@magic-civilization && (git remote set-url origin "${git_remote}" 2>/dev/null || true) && git fetch --depth=1 origin "${git_ref}" && git reset --hard FETCH_HEAD'

View file

@ -0,0 +1,45 @@
# Distributed test/train fleet — disposable cattle from the Packer golden image.
# No persistent volume: workers are stateless. The golden image carries the warm
# clone + toolchain + prebuilt .so; results leave via the dispatch layer (scp).
resource "hcloud_ssh_key" "fleet" {
name = "${var.name}-key"
public_key = file(pathexpand(var.ssh_public_key_path))
}
# Resolve the newest golden snapshot by label. Skipped entirely when
# var.base_image is set (bootstrap path), so `terraform plan` works before any
# snapshot exists.
data "hcloud_image" "golden" {
count = var.base_image == "" ? 1 : 0
with_selector = var.golden_selector
with_architecture = "x86"
most_recent = true
}
locals {
image = var.base_image != "" ? var.base_image : data.hcloud_image.golden[0].id
}
resource "hcloud_server" "worker" {
count = var.workers
name = "${var.name}-${count.index}"
server_type = var.server_type
location = var.location
image = local.image
ssh_keys = [hcloud_ssh_key.fleet.id]
# Thin cloud-init: copy the injected key to the build user, then fetch the
# requested ref and hard-reset the warm clone to it. The golden image already
# holds the toolchain + prebuilt GDExtension, so there is nothing heavy to
# install here.
user_data = templatefile("${path.module}/cloud-init.yaml", {
git_remote = var.git_remote
git_ref = var.git_ref
remote_user = var.remote_user
})
labels = {
project = "magic-civilization"
role = "test-fleet"
}
}

View file

@ -0,0 +1,29 @@
locals {
# Repo root, three levels up from infra/terraform/test-fleet.
repo_root = abspath("${path.module}/../../..")
inventory_path = "${local.repo_root}/.local/fleet/inventory"
# One "<user>@<ipv4>" line per worker — consumed by scripts/run/dist.sh.
inventory_body = join("\n", [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"])
}
output "worker_ips" {
description = "Public IPv4 of each fleet worker."
value = [for s in hcloud_server.worker : s.ipv4_address]
}
output "worker_hosts" {
description = "ssh targets (<user>@<ip>) the dispatch layer fans work across."
value = [for s in hcloud_server.worker : "${var.remote_user}@${s.ipv4_address}"]
}
output "inventory_path" {
description = "Path to the rendered ssh inventory file."
value = local.inventory_path
}
# Rendered whenever workers exist; emptied (header only) when workers = 0 so a
# stale fleet can't be addressed after teardown.
resource "local_file" "inventory" {
filename = local.inventory_path
# inventory_body is join()ed with "\n" and therefore has no trailing newline:
#   - workers present -> body + "\n"
#   - no workers      -> body is "", so only the "# fleet is down" comment line
#     is written; scripts/run/dist.sh skips comment lines and so sees no hosts.
content = "${local.inventory_body}${local.inventory_body == "" ? "# fleet is down (workers = 0)\n" : "\n"}"
}

View file

@ -0,0 +1,19 @@
# Copy to terraform.tfvars and fill in. terraform.tfvars is gitignored.
# The token is best passed via env instead: export TF_VAR_hcloud_token=...
# Required: GitLab origin the golden image was built from.
git_remote = "https://gitlab.com/<you>/magic-civilization.git"
# Optional overrides (defaults shown).
# location = "ash" # Ashburn VA (~near NYC)
# server_type = "cpx41" # bursty test/sim; use ccx33 for sustained training
# git_ref = "main"
# remote_user = "mc"
# ssh_public_key_path = "~/.ssh/id_ed25519.pub"
# workers is normally set on the CLI by the dispatch layer
# (./run dist:up N -> -var workers=N), not pinned here.
# workers = 0
# Bootstrap only: set to test `terraform plan` before a golden snapshot exists.
# base_image = "ubuntu-24.04"

View file

@ -0,0 +1,88 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token (project-scoped, Read/Write). Export as TF_VAR_hcloud_token; never commit."
type = string
sensitive = true
}
# Fleet size. Used directly as the `count` of hcloud_server.worker, so it must
# be a whole number; the validation rejects fractional values up front instead
# of letting them fail later at the resource's `count`.
variable "workers" {
  description = <<-EOT
    Fleet size — the iteration-speed lever. 0 = nothing running, zero cost.
    Set to N to fan distributed sim/test work across N disposable workers, then
    back to 0 to tear the fleet down. Each worker is identical cattle; results
    are scp'd off by the dispatch layer before teardown, so there is no
    per-worker state to preserve.
  EOT
  type        = number
  default     = 0

  validation {
    condition     = var.workers >= 0 && var.workers <= 50 && floor(var.workers) == var.workers
    error_message = "Keep the fleet a whole number between 0 and 50 (project-quota / sanity guard)."
  }
}
variable "location" {
description = "Hetzner location. US: ash (Ashburn VA, ~near NYC), hil (Hillsboro OR). EU: fsn1, nbg1, hel1."
type = string
default = "ash"
}
variable "server_type" {
description = <<-EOT
Per-worker size. Distributed fan-out favours many small cheap boxes over one
big one (finer shard granularity per euro). Match tier to CPU profile:
Bursty test/sim (minutes) -> Shared "Regular" CPX:
cpx31 = 4 vCPU / 8 GB, cpx41 = 8 vCPU / 16 GB (default), cpx51 = 16/32
Sustained RL training (hours @ 100%) -> Dedicated CCX (no shared-tier throttle):
ccx33 = 8 vCPU / 32 GB, ccx43 = 16 vCPU / 64 GB
EOT
type = string
default = "cpx41"
}
variable "base_image" {
description = <<-EOT
Bootstrap escape hatch. Leave empty ("") to boot workers from the latest
Packer-built golden snapshot (resolved by label via the hcloud_image data
source). Set to a stock image name (e.g. "ubuntu-24.04") only for first-run
Terraform plan testing BEFORE any golden snapshot exists.
EOT
type = string
default = ""
}
variable "golden_selector" {
description = "Label selector identifying the Packer golden snapshot. Must match the labels set in infra/packer/golden-image.pkr.hcl."
type = string
default = "type=golden,project=magic-civilization"
}
variable "ssh_public_key_path" {
description = "Public key authorised on every worker (and used by the dispatch scripts to ssh in)."
type = string
default = "~/.ssh/id_ed25519.pub"
}
variable "name" {
description = "Resource name prefix; workers are named <name>-0, <name>-1, ..."
type = string
default = "mc-test"
}
variable "git_remote" {
description = "GitLab clone URL (origin) the golden image was built from; cloud-init pulls the latest ref from here on boot."
type = string
default = ""
}
variable "git_ref" {
description = "Branch/tag/SHA the fleet checks out on boot. Pin to a SHA for reproducible distributed runs."
type = string
default = "main"
}
variable "remote_user" {
description = "Unix user the dispatch layer ssh's in as (created in the golden image; flatpak runs --user as this account)."
type = string
default = "mc"
}

View file

@ -6,6 +6,10 @@ terraform {
source = "hetznercloud/hcloud"
version = "~> 1.49"
}
local = {
source = "hashicorp/local"
version = "~> 2.5"
}
}
}

165
scripts/run/dist.sh Executable file
View file

@ -0,0 +1,165 @@
#!/usr/bin/env bash
# Distributed test/train dispatch — fan the iteration loop across the Hetzner
# test fleet. Sourced by ./run (defines cmd_dist_*). Auto-registered via the
# cmd_<verb>_<target> name-dispatch, so no edit to the top-level `run` is needed.
#
# ./run dist:up <workers> [server_type] [location] spin the fleet up
# ./run dist:sim <games> [turn_limit] [--destroy-after] fan a sim batch across it
# ./run dist:train <total_steps> [--destroy-after] fan an RL sweep across it
# ./run dist:down tear it down (zero cost)
#
# Requires: TF_VAR_hcloud_token in env, terraform on PATH, and a coordinator with
# GNU coreutils (autoplay-batch.sh uses `realpath -m`).
_DIST_TF_DIR_REL="infra/terraform/test-fleet"
# Print the absolute path of the repo root: two directories above the
# directory holding this script. Runs in a subshell so the caller's working
# directory is untouched.
_dist_repo_root() {
  local script_dir
  script_dir="$(dirname "${BASH_SOURCE[0]}")"
  (cd "$script_dir/../.." && pwd)
}
# Run terraform against the test-fleet module, whatever the caller's cwd.
# All arguments are forwarded verbatim; the exit status is terraform's.
_dist_tf() {
  local tf_dir
  tf_dir="$(_dist_repo_root)/$_DIST_TF_DIR_REL"
  terraform -chdir="$tf_dir" "$@"
}
# Print the "<user>@<ip>" entries of an inventory file, one per line.
#   $1  path to the inventory file
# Comment lines (optionally indented '#') and blank lines are dropped. Always
# returns 0: a missing/unreadable file or an all-comment file just yields no
# output.
_dist_read_hosts() {
  local inventory_file="$1"
  if ! grep -vE '^\s*(#|$)' "$inventory_file" 2>/dev/null; then
    return 0
  fi
}
# Bare `./run dist`: print the usage summary for the dist:* subcommands.
cmd_dist() {
  printf '%s\n' \
    'Distributed test/train fleet (Hetzner). Set TF_VAR_hcloud_token first.' \
    './run dist:up <workers> [server_type] [location] e.g. ./run dist:up 10' \
    './run dist:sim <games> [turn_limit] [--destroy-after]' \
    './run dist:train <total_steps> [--destroy-after]' \
    './run dist:down'
}
# Spin the fleet up (or resize it) to <workers> workers.
#
# Usage: cmd_dist_up <workers> [server_type] [location]
#   workers      non-negative integer, passed to terraform as -var workers=N
#   server_type  optional override, passed as -var server_type=...
#   location     optional override, passed as -var location=...
# Requires TF_VAR_hcloud_token in the environment.
# Returns non-zero — and does NOT report "fleet up" — if terraform init or
# apply fails, so callers and scripts never act on a fleet that was not
# actually created.
cmd_dist_up() {
  local n="${1:-}"
  [[ "$n" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:up <workers> [server_type] [location]" >&2; return 1; }
  : "${TF_VAR_hcloud_token:?export TF_VAR_hcloud_token=<hetzner API token> first}"

  local args=(-auto-approve -var "workers=$n")
  if [ -n "${2:-}" ]; then args+=(-var "server_type=$2"); fi
  if [ -n "${3:-}" ]; then args+=(-var "location=$3"); fi

  # init output is noise on the happy path; its errors still reach stderr.
  if ! _dist_tf init -input=false >/dev/null; then
    echo "dist:up: terraform init failed — fleet not changed" >&2
    return 1
  fi
  if ! _dist_tf apply "${args[@]}"; then
    echo "dist:up: terraform apply failed — fleet may be partially created; inspect it or run ./run dist:down" >&2
    return 1
  fi
  echo "fleet up: $n worker(s). inventory: $(_dist_repo_root)/.local/fleet/inventory"
}
# Tear the fleet down by applying workers=0.
#
# Requires TF_VAR_hcloud_token in the environment.
# Returns non-zero if terraform apply fails. The "zero compute cost" message
# is printed only after a successful apply: reporting the fleet as down while
# workers are still running would hide ongoing billing.
cmd_dist_down() {
  : "${TF_VAR_hcloud_token:?export TF_VAR_hcloud_token=<hetzner API token> first}"
  if ! _dist_tf apply -auto-approve -var "workers=0"; then
    echo "dist:down: terraform apply failed — workers may STILL be running (and billing); re-run ./run dist:down" >&2
    return 1
  fi
  echo "fleet down (workers=0): zero compute cost, snapshot only (~€0.10/mo)."
}
# Fan a seeded sim batch across every worker listed in the fleet inventory.
#
# Usage: cmd_dist_sim <total_games> [turn_limit] [--destroy-after]
#   total_games      total number of games; split into contiguous seed shards
#   turn_limit       per-game turn cap handed to autoplay-batch.sh (default 300)
#   --destroy-after  tear the fleet down once the batches finish; accepted in
#                    any argument position
# Results and per-worker dispatch logs land under .local/iter/<stamp>/.
# Returns 0 only if every worker batch (and the optional teardown) succeeded.
cmd_dist_sim() {
  # Split the flag from the positionals first. turn_limit is optional, so
  # `dist:sim 200 --destroy-after` must not bind the flag to turn_limit.
  local destroy=false a
  local positional=()
  for a in "$@"; do
    if [ "$a" = "--destroy-after" ]; then
      destroy=true
    else
      positional+=("$a")
    fi
  done
  local total="${positional[0]:-}" turn="${positional[1]:-300}"
  if ! [[ "$total" =~ ^[0-9]+$ && "$turn" =~ ^[0-9]+$ ]]; then
    echo "usage: ./run dist:sim <total_games> [turn_limit] [--destroy-after]" >&2
    return 1
  fi

  local root inv
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up <N> first" >&2; return 1; }

  local hosts=() line
  while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; }

  local stamp results shard
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/iter/$stamp"
  mkdir -p "$results"
  shard=$(( (total + n - 1) / n ))  # ceil(total / n)
  echo "distributing $total game(s) over $n worker(s): ~$shard each, turn_limit=$turn"
  echo "results → $results"

  local pids=() i=0 host offset cnt cores
  for host in "${hosts[@]}"; do
    offset=$(( i * shard ))
    cnt=$shard
    # The last shard may be short; once the games run out, the remaining
    # workers get nothing.
    if (( offset + cnt > total )); then cnt=$(( total - offset )); fi
    if (( cnt <= 0 )); then break; fi
    # One game per remote core; fall back to 8 if the probe fails.
    cores="$(ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" nproc 2>/dev/null || echo 8)"
    echo "  [$host] seeds $(( offset + 1 ))..$(( offset + cnt )) PARALLEL=$cores"
    # SEED_OFFSET keeps seed numbers disjoint across workers.
    AUTOPLAY_HOST="$host" SEED_OFFSET="$offset" PARALLEL="$cores" \
      bash "$root/tools/autoplay-batch.sh" "$cnt" "$turn" "$results" \
      >"$results/dispatch_worker_${i}.log" 2>&1 &
    pids+=($!)
    i=$(( i + 1 ))
  done

  local fail=0 p
  for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done

  local produced
  produced="$(find "$results" -name turn_stats.jsonl -type f 2>/dev/null | wc -l | tr -d ' ')"
  echo "----------------------------------------------------------------"
  echo "distributed sim done: $produced game(s) produced turn_stats under $results"
  [ "$fail" -eq 0 ] || echo "WARNING: $fail worker batch(es) errored — see $results/dispatch_worker_*.log" >&2

  if "$destroy"; then
    echo "--destroy-after → tearing down"
    cmd_dist_down || { echo "WARNING: teardown failed — fleet may still be running" >&2; fail=$(( fail + 1 )); }
  fi
  [ "$fail" -eq 0 ]
}
# v1 blocking sweep: one training run per worker (distinct seed + run-name),
# then pull the models back. Detached orchestration is the documented follow-up.
#
# Usage: cmd_dist_train [total_steps] [--destroy-after]
#   total_steps      steps per run (default 1000000)
#   --destroy-after  tear the fleet down after the models are pulled back;
#                    accepted in any argument position
# Logs and pulled model dirs land under .local/train/<stamp>/.
# Returns 0 only if every run (and the optional teardown) succeeded.
cmd_dist_train() {
  # Split the flag from the positionals first, so a leading --destroy-after is
  # not read as the step count and the default still applies.
  local destroy=false a
  local positional=()
  for a in "$@"; do
    if [ "$a" = "--destroy-after" ]; then
      destroy=true
    else
      positional+=("$a")
    fi
  done
  local steps="${positional[0]:-1000000}"
  [[ "$steps" =~ ^[0-9]+$ ]] || { echo "usage: ./run dist:train <total_steps> [--destroy-after]" >&2; return 1; }

  local root inv
  root="$(_dist_repo_root)"
  inv="$root/.local/fleet/inventory"
  [ -f "$inv" ] || { echo "no inventory at $inv — run ./run dist:up <N> first" >&2; return 1; }

  local hosts=() line
  while IFS= read -r line; do hosts+=("$line"); done < <(_dist_read_hosts "$inv")
  local n=${#hosts[@]}
  [ "$n" -gt 0 ] || { echo "inventory empty — fleet is down" >&2; return 1; }

  local stamp results
  stamp="$(date +%Y%m%d_%H%M%S)"
  results="$root/.local/train/$stamp"
  mkdir -p "$results"
  echo "fanning $n training run(s) × $steps steps (CPU). results → $results"

  # Path of the clone on each worker, relative to the remote user's HOME.
  local repo_remote="Code/@projects/@magic-civilization"
  local pids=() i=0 host seed run
  for host in "${hosts[@]}"; do
    seed=$(( 42 + i ))            # distinct seed per worker
    run="dist-${stamp}-w${i}"     # distinct run name per worker
    echo "  [$host] run=$run seed=$seed"
    ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" \
      "cd ~/$repo_remote && python3 -m tooling.rl_self_play.train --run-name '$run' --seed $seed --total-steps $steps --device cpu" \
      >"$results/train_worker_${i}.log" 2>&1 &
    pids+=($!)
    i=$(( i + 1 ))
  done

  local fail=0 p
  for p in "${pids[@]}"; do wait "$p" || fail=$(( fail + 1 )); done

  # Pull each worker's model dir back (best-effort: a failed run may have none).
  i=0
  for host in "${hosts[@]}"; do
    run="dist-${stamp}-w${i}"
    rsync -az "$host:~/$repo_remote/tooling/rl_self_play/models/$run" "$results/" 2>/dev/null || \
      echo "  note: no model dir for $run on $host (check $results/train_worker_${i}.log)"
    i=$(( i + 1 ))
  done

  echo "----------------------------------------------------------------"
  echo "distributed train done under $results"
  [ "$fail" -eq 0 ] || echo "WARNING: $fail run(s) errored — see $results/train_worker_*.log" >&2

  if "$destroy"; then
    echo "--destroy-after → tearing down"
    cmd_dist_down || { echo "WARNING: teardown failed — fleet may still be running" >&2; fail=$(( fail + 1 )); }
  fi
  [ "$fail" -eq 0 ]
}

View file

@ -153,6 +153,15 @@ pub(crate) fn event_to_dict(evt: &TurnEvent) -> Dictionary {
d.set("clan", clan.0 as i64);
d.set("tradition", GString::from(tradition.as_str()));
}
TurnEvent::UnitHealed { turn, clan, unit_id, amount, hex } => {
d.set("kind", GString::from("UnitHealed"));
d.set("turn", *turn as i64);
d.set("clan", clan.0 as i64);
d.set("unit_id", *unit_id as i64);
d.set("amount", *amount as i64);
d.set("col", hex.q as i64);
d.set("row", hex.r as i64);
}
TurnEvent::AmbientEncounterFired { turn, clan, hex, species, group_size } => {
d.set("kind", GString::from("AmbientEncounterFired"));
d.set("turn", *turn as i64);

View file

@ -992,7 +992,11 @@ fn translate_processor_events(events: &[mc_replay::TurnEvent]) -> Vec<Event> {
// expansion), not the wire protocol — drop here to keep it exhaustive.
| mc_replay::TurnEvent::CityGrew { .. }
| mc_replay::TurnEvent::CityBordersExpanded { .. }
| mc_replay::TurnEvent::FloraSuccession { .. } => {}
| mc_replay::TurnEvent::FloraSuccession { .. }
// p3-29 (T2): no wire `Event::UnitHealed` surface — consumed by the
// live UI via the kind-tagged `event_to_dict` dict, not this wire
// path. Drop here to keep the match exhaustive.
| mc_replay::TurnEvent::UnitHealed { .. } => {}
}
}
out

View file

@ -160,6 +160,21 @@ pub enum TurnEvent {
/// Tradition node now unlocked.
tradition: String,
},
/// p3-29 (T2): a unit recovered HP during the end-of-turn healing phase
/// (single-source replacement for the GDScript turn's inline `unit_healed`
/// signal). `amount` is the HP actually applied (capped at `max_hp`).
UnitHealed {
/// Turn the event fired on.
turn: u32,
/// Owning clan.
clan: ClanId,
/// Instance id of the healed unit.
unit_id: u32,
/// HP actually restored this turn.
amount: i32,
/// Hex the unit healed on.
hex: TileCoord,
},
/// A wonder finished construction.
WonderBuilt {
/// Turn the event fired on.
@ -556,6 +571,7 @@ impl TurnEvent {
| Self::CityBordersExpanded { turn, .. }
| Self::FloraSuccession { turn, .. }
| Self::CultureResearched { turn, .. }
| Self::UnitHealed { turn, .. }
| Self::WonderBuilt { turn, .. }
| Self::WarDeclared { turn, .. }
| Self::PeaceSigned { turn, .. }
@ -723,6 +739,30 @@ mod tests {
assert_eq!(decoded, ev);
}
/// p3-29 (T2): a `UnitHealed` event must come back unchanged from both a
/// JSON and a bincode (serde) round-trip, and `turn()` must report the turn
/// the event carries.
#[test]
fn unit_healed_serde() {
    let original = TurnEvent::UnitHealed {
        turn: 9,
        clan: ClanId(1),
        unit_id: 42,
        amount: 15,
        hex: TileCoord::new(3, 4),
    };
    assert_eq!(original.turn(), 9);

    // JSON leg.
    let text = serde_json::to_string(&original).expect("serialize");
    let from_json: TurnEvent = serde_json::from_str(&text).expect("deserialize");
    assert_eq!(from_json, original);

    // bincode leg, same configuration for encode and decode.
    let config = bincode::config::standard();
    let encoded = bincode::serde::encode_to_vec(&original, config).expect("encode");
    let (from_bincode, _consumed): (TurnEvent, usize) =
        bincode::serde::decode_from_slice(&encoded, config).expect("decode");
    assert_eq!(from_bincode, original);
}
/// p2-55: verify that `UnitCaptured`, `UnitRansomOffered`, and
/// `CivilianDestroyed` survive a JSON serde round-trip, and that
/// `turn()` returns the correct value for each.

View file

@ -439,6 +439,14 @@ pub struct GameState {
/// registry signature). `#[serde(skip)]` — cleared/drained every turn.
#[serde(skip)]
pub pending_flora_events: Vec<(i32, i32, u32, i32, i32)>,
/// p3-29 (T2): transient buffer of unit-heal events
/// `(player_index, unit_id, amount, col, row)` the healing phase produced
/// this turn. `step()` drains it into `TurnResult` as `UnitHealed` events
/// (the healing phase runs in the `fn(&mut GameState)` end-of-turn registry,
/// so it has no event sink — same buffer pattern as `pending_flora_events`).
/// `#[serde(skip)]` — cleared/drained every turn.
#[serde(skip)]
pub pending_heal_events: Vec<(usize, u32, i32, i32, i32)>,
/// p3-26 B3: improvement definitions (`id → {build_turns, food, production}`),
/// boot-loaded from `public/resources/improvements/*.json`. `#[serde(skip)]`
/// static content like the other catalogs; drives both the build-tick

View file

@ -77,6 +77,12 @@ pub fn process_healing_phase(state: &mut GameState) {
.flat_map(|(pi, p)| p.units.iter().map(move |u| (pi, u.col, u.row)))
.collect();
// p3-29 (T2): collect per-unit heal events here, then push into the
// transient `pending_heal_events` buffer after the player loop releases its
// `&mut state.players` borrow. `step()` drains the buffer into `UnitHealed`
// turn events (the registry phase signature carries no event sink).
let mut healed: Vec<(usize, u32, i32, i32, i32)> = Vec::new();
for (pi, player) in state.players.iter_mut().enumerate() {
// Snapshot city positions for garrison detection; `player` is borrowed
// mutably below for units, so we can't hold a reference to
@ -111,7 +117,12 @@ pub fn process_healing_phase(state: &mut GameState) {
}
let heal_amount = unit_heal_rate(unit.col, unit.row, &city_positions, unit.is_fortified);
let before = unit.hp;
unit.hp = (unit.hp + heal_amount).min(unit.max_hp);
let applied = unit.hp - before;
if applied > 0 {
healed.push((pi, unit.id, applied, unit.col, unit.row));
}
}
// ── City healing (siege-suppressed) ───────────────────────────────
@ -129,6 +140,10 @@ pub fn process_healing_phase(state: &mut GameState) {
}
}
}
// The `&mut state.players` borrow is released; stash the heals for `step()`
// to drain into `UnitHealed` events.
state.pending_heal_events.extend(healed);
}
/// Compute the healing rate (HP) for a unit at `(col, row)` in the headless bench.
@ -223,6 +238,55 @@ mod tests {
assert_eq!(state.players[0].units[0].hp, 40, "neutral territory = 10 HP");
}
/// p3-29 (T2): the healing phase buffers one `pending_heal_events` entry per
/// unit that actually gained HP — carrying that unit's id and the applied
/// amount — and buffers nothing for a unit that was already at full HP.
#[test]
fn healing_buffers_unit_heal_event_with_applied_amount() {
    let mut player = PlayerState {
        city_positions: vec![(3, 4)],
        ..PlayerState::default()
    };

    // Wounded unit standing on the city hex: garrison rate, +20 HP.
    let mut wounded = unit_at(3, 4, 50, 100);
    wounded.id = 7;
    // Same hex but already at max HP: nothing to apply.
    let mut topped_up = unit_at(3, 4, 100, 100);
    topped_up.id = 8;
    player.units.extend([wounded, topped_up]);

    let mut state = state_with_player(player);
    process_healing_phase(&mut state);

    let expected = vec![(0usize, 7u32, 20i32, 3i32, 4i32)];
    assert_eq!(
        state.pending_heal_events, expected,
        "exactly one heal event for unit 7 with applied=20; full unit 8 buffers nothing"
    );
}
/// p3-29 (T2): if the nominal heal would push a unit past `max_hp`, the
/// buffered amount is the delta that was really applied after clamping, not
/// the nominal heal rate.
#[test]
fn healing_buffers_clamped_amount_near_full_hp() {
    let mut player = PlayerState {
        city_positions: vec![(3, 4)],
        ..PlayerState::default()
    };

    // 95/100 HP on the city hex: garrison rate is +20, but only 5 HP fits.
    let mut almost_full = unit_at(3, 4, 95, 100);
    almost_full.id = 3;
    player.units.push(almost_full);

    let mut state = state_with_player(player);
    process_healing_phase(&mut state);

    // The unit ends the phase exactly at its cap.
    assert_eq!(state.players[0].units[0].hp, 100);

    let expected = vec![(0usize, 3u32, 5i32, 3i32, 4i32)];
    assert_eq!(
        state.pending_heal_events, expected,
        "buffered amount is the clamped delta (5), not the 20 HP garrison rate"
    );
}
/// A unit that moved this turn (`movement_remaining < base_moves` and not
/// fortified) does NOT heal.
#[test]

View file

@ -543,6 +543,21 @@ impl TurnProcessor {
}
}
// p3-29 (T2): the healing phase runs in a registry with no event sink, so it
// buffers its heals on the state; flush that buffer into UnitHealed turn
// events here (same pattern as the flora drain above).
if !state.pending_heal_events.is_empty() {
    // Copy the turn out first so the closure does not need to touch `state`
    // while its heal buffer is mutably borrowed by `drain`.
    let heal_turn = state.turn;
    let heal_events = state
        .pending_heal_events
        .drain(..)
        .map(|(pi, unit_id, amount, col, row)| mc_replay::TurnEvent::UnitHealed {
            turn: heal_turn,
            // The buffered player index is used directly as the clan id.
            clan: mc_replay::ClanId(pi as u32),
            unit_id,
            amount,
            hex: mc_replay::TileCoord::new(col, row),
        });
    result.events_emitted.extend(heal_events);
}
// Phase 5a-sentry: wake sentrying units that have enemies in vision range (2 hex).
// Runs after movement so positions are current; runs before PvP so the
// now-awoken unit's state is consistent when combat checks fire.

View file

@ -210,6 +210,7 @@ fn ten_turn_run_emits_each_wired_variant() {
TurnEvent::CityBordersExpanded { .. } => "CityBordersExpanded",
TurnEvent::FloraSuccession { .. } => "FloraSuccession",
TurnEvent::CultureResearched { .. } => "CultureResearched",
TurnEvent::UnitHealed { .. } => "UnitHealed",
TurnEvent::CityFounded { .. } => "CityFounded",
TurnEvent::WonderBuilt { .. } => "WonderBuilt",
TurnEvent::CityCaptured { .. } => "CityCaptured",
@ -330,6 +331,7 @@ fn events_emitted_appears_on_turn_result() {
TurnEvent::CityBordersExpanded { .. } => "CityBordersExpanded",
TurnEvent::FloraSuccession { .. } => "FloraSuccession",
TurnEvent::CultureResearched { .. } => "CultureResearched",
TurnEvent::UnitHealed { .. } => "UnitHealed",
TurnEvent::CityFounded { .. } => "CityFounded",
TurnEvent::WonderBuilt { .. } => "WonderBuilt",
TurnEvent::CityCaptured { .. } => "CityCaptured",