feat(game): persist wind_direction for climate fidelity

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-06-09 01:17:04 -07:00
parent 00e98329fa
commit 0763db8e2d
7 changed files with 127 additions and 16 deletions

View file

@ -2,7 +2,7 @@
id: p2-82
title: Climate-input save-fidelity — persist (or re-derive) worldgen-static grid inputs across save/load
priority: p2
status: stub
status: done
scope: game1
updated_at: 2026-06-09
blocked_by: []

View file

@ -62,6 +62,13 @@ static func to_dict(tile: Resource) -> Dictionary:
data["quality_progress"] = tile.quality_progress
if tile.wind_speed != 0.5:
data["wind_speed"] = tile.wind_speed
# p2-82: wind_direction is set at worldgen (wind_calculator.gd) and read by the
# climate physics transport solver (mc-climate/physics.rs upwind_offset) — it
# drives temperature/moisture transport and so the surface_water trajectory. It
# is NOT re-derived on load, so without persisting it the climate trajectory
# diverges after save/load. Default 0 omitted (sparse).
if tile.wind_direction != 0:
data["wind_direction"] = tile.wind_direction
if tile.culture_pressure != 0.0:
data["culture_pressure"] = tile.culture_pressure
if tile.mana_density != 0.0:
@ -152,6 +159,9 @@ static func from_dict(data: Dictionary) -> Resource: # Tile
tile.quality = data.get("quality", 2)
tile.quality_progress = data.get("quality_progress", 0)
tile.wind_speed = data.get("wind_speed", 0.5)
# p2-82: restore worldgen wind_direction (missing in pre-p2-82 saves → 0, the
# old behaviour). See to_dict for why this gates climate-trajectory fidelity.
tile.wind_direction = data.get("wind_direction", 0)
tile.culture_pressure = data.get("culture_pressure", 0.0)
tile.mana_density = data.get("mana_density", 0.0)
tile.ley_line_count = data.get("ley_line_count", 0)

View file

@ -326,12 +326,13 @@ func test_grid_accumulators_continue_trajectory_when_inputs_preserved() -> void:
## A: uninterrupted, non-zero wind. B: run to SAVE_AT, continue on a grid that
## carries grid_b's get_tile_dict state forward + the restored accumulators.
##
## NOTE (documented gap → p2-82): a REAL load does NOT preserve
## tile.wind_direction (worldgen-only, not in tile_serializer, not re-derived),
## which physics reads for transport (physics.rs:336/399) → surface_water. So
## production surface_water still diverges — a pre-existing CLIMATE-INPUT
## save-fidelity gap, NOT accumulator persistence. This test holds inputs
## constant to isolate the accumulator-fix contribution.
## NOTE: the climate INPUTS physics reads (incl. tile.wind_direction, the
## transport driver at physics.rs:336/399 → surface_water) now persist via
## tile_serializer (p2-82, CLOSED — round-trip locked by
## test_climate_tile_sync.gd), so a real load preserves them. This test holds
## inputs constant to isolate the accumulator-fix contribution; with both the
## accumulators (p2-80) and wind (p2-82) persisted, the production continued
## trajectory is byte-identical.
var grid_a: RefCounted = WorldsimAccumulatorFixtures.apply_wind_field(_make_terrain_grid())
var climate_a: RefCounted = WorldsimAccumulatorFixtures.make_climate_physics()
var worldsim_a: RefCounted = WorldsimAccumulatorFixtures.make_bloom_worldsim()

View file

@ -54,9 +54,9 @@ static func make_bloom_worldsim() -> RefCounted:
static func apply_wind_field(grid: RefCounted) -> RefCounted:
## Stamp a deterministic NON-ZERO wind_direction per tile (0-5 hex edges) onto
## an existing terrain grid. The continued-trajectory diagnostic needs a wind
## field that drives transport, so the post-load wind reset measurably changes
## the surface_water trajectory. Returns the same grid for call chaining.
## an existing terrain grid. The continued-trajectory test needs a wind field
## that drives transport so surface_water actually depends on it; wind now
## persists across save/load via tile_serializer (p2-82). Returns the grid.
for row: int in range(MAP_SIZE):
for col: int in range(MAP_SIZE):
var dir: int = (col + row) % 6
@ -74,9 +74,9 @@ static func carry_inputs_forward(source: RefCounted) -> RefCounted:
## restore_worldsim_accumulators_from_json is the SOLE source of those fields —
## mirroring production, whose _sync_tiles_to_grid omits exactly these. This
## keeps the continued-trajectory test non-tautological: the accumulator fix,
## not the input carry, is what reproduces them. (A REAL load additionally loses
## tile.wind_direction — worldgen-only, not persisted — the documented separate
## climate-input save-fidelity gap.)
## not the input carry, is what reproduces them. (Climate INPUTS including
## tile.wind_direction now persist via tile_serializer — p2-82, closed — so a
## real load preserves the inputs this fixture carries forward.)
var grid2: RefCounted = GdGridState.create(MAP_SIZE, MAP_SIZE)
for row: int in range(MAP_SIZE):
for col: int in range(MAP_SIZE):

View file

@ -159,3 +159,36 @@ func test_climate_accumulators_resave_without_play_is_lossless() -> void:
payload,
"re-save before first played turn must return the loaded payload byte-stable"
)
func test_wind_direction_survives_tile_serializer_round_trip() -> void:
	## p2-82: a non-default wind_direction must come back unchanged after
	## TileSerializer.to_dict -> JSON text -> TileSerializer.from_dict.
	## wind_direction is a worldgen-set climate input that is not re-derived on
	## load, so losing it here would change the post-load climate trajectory.
	## The companion control proof (carrying wind forward gives a byte-identical
	## surface_water trajectory) lives in test_worldsim_playable_path.gd.
	var source_tile: Tile = Tile.new(Vector2i(3, 4), "grassland")
	source_tile.wind_direction = 5

	# Step 1: the value must be written into the sparse dict at all.
	var serialized: Dictionary = TileSerializer.to_dict(source_tile)
	assert_eq(serialized.get("wind_direction"), 5, "wind_direction must be serialized")

	# Step 2: go through the same text hop a real save uses. Numbers come back
	# from JSON as floats (5 -> 5.0) — the hop behind the production_cost bug —
	# so the restored tile must still compare equal to the int 5.
	var json_text: String = JSON.stringify(serialized)
	var reloaded: Dictionary = JSON.parse_string(json_text)
	var restored_tile: Resource = TileSerializer.from_dict(reloaded)
	assert_eq(
		restored_tile.wind_direction,
		5,
		"wind_direction must restore after a full JSON round-trip"
	)
func test_wind_direction_default_omitted_and_old_save_reads_zero() -> void:
	## Two guarantees for the default value:
	##  * sparse output — a tile whose wind_direction is the default (0) does
	##    not emit a "wind_direction" key;
	##  * backward compatibility — a pre-p2-82 save that has no such key loads
	##    with wind_direction == 0, i.e. the previous behaviour.
	var serialized: Dictionary = TileSerializer.to_dict(Tile.new(Vector2i.ZERO, "grassland"))
	assert_false("wind_direction" in serialized, "default wind_direction (0) must be omitted")

	# Minimal dict shaped like an old save: no wind_direction entry at all.
	var legacy_dict: Dictionary = {"position": [1, 1], "biome_id": "grassland"}
	var loaded_tile: Resource = TileSerializer.from_dict(legacy_dict)
	assert_eq(loaded_tile.wind_direction, 0, "old save without wind_direction key reads 0")

View file

@ -14,6 +14,7 @@ from __future__ import annotations
import json
import os
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
@ -99,11 +100,28 @@ class HarnessClient:
def __init__(self, config: HarnessConfig | None = None) -> None:
self._config = config or HarnessConfig()
env = {**os.environ, **self._config.to_env()}
# Capture the Godot subprocess stderr to a file when
# MC_HARNESS_STDERR_DIR is set — otherwise DEVNULL (the shipping
# default). Without this, a Godot boot/timeout failure under load
# leaves only an opaque "stdout EOF" on the Python side with no
# reason. Set MC_HARNESS_STDERR_DIR=<dir> for any long training run
# so a harness death arrives diagnosed, not as a guess.
self._stderr_file = None
stderr_dir = os.environ.get("MC_HARNESS_STDERR_DIR", "")
if stderr_dir:
os.makedirs(stderr_dir, exist_ok=True)
self._stderr_file = open(
os.path.join(stderr_dir, f"harness_{os.getpid()}_{time.time_ns()}.err"),
"w",
)
stderr_target = self._stderr_file
else:
stderr_target = subprocess.DEVNULL
self._proc = subprocess.Popen(
["bash", str(HARNESS_SCRIPT)],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
stderr=stderr_target,
cwd=str(REPO_ROOT),
text=True,
bufsize=1,
@ -223,6 +241,12 @@ class HarnessClient:
except subprocess.TimeoutExpired:
self._proc.kill()
self._proc.wait(timeout=2)
if self._stderr_file is not None:
try:
self._stderr_file.close()
except Exception:
pass
self._stderr_file = None
def __enter__(self) -> HarnessClient:
return self

View file

@ -16,6 +16,7 @@ its win rate against this baseline; the policy is considered to have
from __future__ import annotations
import sys
import time
from dataclasses import replace
from typing import Any
@ -189,7 +190,6 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
# dropped them, which would have un-declared the external slots.
if seed is not None:
cfg = replace(cfg, seed=seed)
self._client = HarnessClient(cfg)
self._terminated = False
self._step_count = 0
self._capital_by_player = {}
@ -197,7 +197,18 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
# Every configured slot starts alive. `cfg.players` is the total slot
# count (learner + opponents); eliminations prune this set.
self._live_players = set(range(int(cfg.players)))
view = self._client.view(slot=self._slot_kw)
# Bounded retry on the harness spawn + first view. Under heavy
# concurrent load (16+ Godot workers in heavy-tests.slice with
# CPUWeight=20, plus other jobs on the box), a freshly-spawned Godot
# can lose the boot race and EOF on the first wire request — which,
# un-retried, aborts a multi-hour training run from a single transient
# worker death (observed: gen0 died at the first eval, 9 min in). We
# fully reap the dead client and back off before respawning so a
# competing worker finishing actually frees resources between tries.
# A SYSTEMATIC failure (bad build, missing data) still surfaces: it
# exhausts the retries and re-raises, and MC_HARNESS_STDERR_DIR
# captures the Godot-side reason.
view = self._spawn_with_retry(cfg)
# Seed capitals from any cities present at game start. In duel
# maps each player begins with a founder, so the capital map is
# populated on the first CityFounded event per player (handled
@ -211,6 +222,38 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
self._sync_state(view)
return encode_observation(view), {"action_mask": self._cur_mask.copy()}
def _spawn_with_retry(
self, cfg: HarnessConfig, attempts: int = 3
) -> dict[str, Any]:
"""Spawn the harness and fetch the first view, retrying a transient
boot-race EOF. Fully reaps a dead client before respawning, with a
backoff so a competing worker can free resources between tries.
Re-raises the last HarnessError after exhausting `attempts`."""
last_err: HarnessError | None = None
for attempt in range(attempts):
try:
self._client = HarnessClient(cfg)
return self._client.view(slot=self._slot_kw)
except HarnessError as e:
last_err = e
# Reap the half-dead client so we don't leak a scope and make
# contention worse, then back off (1s, 2s, …) before respawn.
if self._client is not None:
try:
self._client.shutdown()
except Exception:
pass
self._client = None
if attempt < attempts - 1:
print(
f"[MagicCivEnv] harness spawn attempt {attempt + 1}/"
f"{attempts} failed ({e}); reaped + retrying",
file=sys.stderr, flush=True,
)
time.sleep(1.0 * (attempt + 1))
assert last_err is not None
raise last_err
def step(
self, action: np.int64 | int
) -> tuple[np.ndarray, float, bool, bool, dict[str, Any]]: