feat(game): ✨ persist wind_direction for climate fidelity

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-06-09 01:17:04 -07:00 · 2026-06-09 01:17:04 -07:00 · 0763db8e2d
commit 0763db8e2d
parent 00e98329fa
7 changed files with 127 additions and 16 deletions
--- a/.project/objectives/p2-82-climate-input-save-fidelity.md
+++ b/.project/objectives/p2-82-climate-input-save-fidelity.md
@ -2,7 +2,7 @@
 id: p2-82
 title: Climate-input save-fidelity — persist (or re-derive) worldgen-static grid inputs across save/load
 priority: p2
-status: stub
+status: done
 scope: game1
 updated_at: 2026-06-09
 blocked_by: []
--- a/src/game/engine/src/map/tile_serializer.gd
+++ b/src/game/engine/src/map/tile_serializer.gd
@ -62,6 +62,13 @@ static func to_dict(tile: Resource) -> Dictionary:
 		data["quality_progress"] = tile.quality_progress
 	if tile.wind_speed != 0.5:
 		data["wind_speed"] = tile.wind_speed
+	# p2-82: wind_direction is set at worldgen (wind_calculator.gd) and read by the
+	# climate physics transport solver (mc-climate/physics.rs upwind_offset) — it
+	# drives temperature/moisture transport and so the surface_water trajectory. It
+	# is NOT re-derived on load, so without persisting it the climate trajectory
+	# diverges after save/load. Default 0 omitted (sparse).
+	if tile.wind_direction != 0:
+		data["wind_direction"] = tile.wind_direction
 	if tile.culture_pressure != 0.0:
 		data["culture_pressure"] = tile.culture_pressure
 	if tile.mana_density != 0.0:
@ -152,6 +159,9 @@ static func from_dict(data: Dictionary) -> Resource:  # Tile
 	tile.quality = data.get("quality", 2)
 	tile.quality_progress = data.get("quality_progress", 0)
 	tile.wind_speed = data.get("wind_speed", 0.5)
+	# p2-82: restore worldgen wind_direction (missing in pre-p2-82 saves → 0, the
+	# old behaviour). See to_dict for why this gates climate-trajectory fidelity.
+	tile.wind_direction = data.get("wind_direction", 0)
 	tile.culture_pressure = data.get("culture_pressure", 0.0)
 	tile.mana_density = data.get("mana_density", 0.0)
 	tile.ley_line_count = data.get("ley_line_count", 0)
--- a/src/game/engine/tests/integration/test_worldsim_playable_path.gd
+++ b/src/game/engine/tests/integration/test_worldsim_playable_path.gd
@ -326,12 +326,13 @@ func test_grid_accumulators_continue_trajectory_when_inputs_preserved() -> void:
 	## A: uninterrupted, non-zero wind. B: run to SAVE_AT, continue on a grid that
 	## carries grid_b's get_tile_dict state forward + the restored accumulators.
 	##
-	## NOTE (documented gap → p2-82): a REAL load does NOT preserve
-	## tile.wind_direction (worldgen-only, not in tile_serializer, not re-derived),
-	## which physics reads for transport (physics.rs:336/399) → surface_water. So
-	## production surface_water still diverges — a pre-existing CLIMATE-INPUT
-	## save-fidelity gap, NOT accumulator persistence. This test holds inputs
-	## constant to isolate the accumulator-fix contribution.
+	## NOTE: the climate INPUTS physics reads (incl. tile.wind_direction, the
+	## transport driver at physics.rs:336/399 → surface_water) now persist via
+	## tile_serializer (p2-82, CLOSED — round-trip locked by
+	## test_climate_tile_sync.gd), so a real load preserves them. This test holds
+	## inputs constant to isolate the accumulator-fix contribution; with both the
+	## accumulators (p2-80) and wind (p2-82) persisted, the production continued
+	## trajectory is byte-identical.
 	var grid_a: RefCounted = WorldsimAccumulatorFixtures.apply_wind_field(_make_terrain_grid())
 	var climate_a: RefCounted = WorldsimAccumulatorFixtures.make_climate_physics()
 	var worldsim_a: RefCounted = WorldsimAccumulatorFixtures.make_bloom_worldsim()
--- a/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd
+++ b/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd
@ -54,9 +54,9 @@ static func make_bloom_worldsim() -> RefCounted:

 static func apply_wind_field(grid: RefCounted) -> RefCounted:
 	## Stamp a deterministic NON-ZERO wind_direction per tile (0-5 hex edges) onto
-	## an existing terrain grid. The continued-trajectory diagnostic needs a wind
-	## field that drives transport, so the post-load wind reset measurably changes
-	## the surface_water trajectory. Returns the same grid for call chaining.
+	## an existing terrain grid. The continued-trajectory test needs a wind field
+	## that drives transport so surface_water actually depends on it; wind now
+	## persists across save/load via tile_serializer (p2-82). Returns the grid.
 	for row: int in range(MAP_SIZE):
 		for col: int in range(MAP_SIZE):
 			var dir: int = (col + row) % 6
@ -74,9 +74,9 @@ static func carry_inputs_forward(source: RefCounted) -> RefCounted:
 	## restore_worldsim_accumulators_from_json is the SOLE source of those fields —
 	## mirroring production, whose _sync_tiles_to_grid omits exactly these. This
 	## keeps the continued-trajectory test non-tautological: the accumulator fix,
-	## not the input carry, is what reproduces them. (A REAL load additionally loses
-	## tile.wind_direction — worldgen-only, not persisted — the documented separate
-	## climate-input save-fidelity gap.)
+	## not the input carry, is what reproduces them. (Climate INPUTS including
+	## tile.wind_direction now persist via tile_serializer — p2-82, closed — so a
+	## real load preserves the inputs this fixture carries forward.)
 	var grid2: RefCounted = GdGridState.create(MAP_SIZE, MAP_SIZE)
 	for row: int in range(MAP_SIZE):
 		for col: int in range(MAP_SIZE):
--- a/src/game/engine/tests/unit/test_climate_tile_sync.gd
+++ b/src/game/engine/tests/unit/test_climate_tile_sync.gd
@ -159,3 +159,36 @@ func test_climate_accumulators_resave_without_play_is_lossless() -> void:
 		payload,
 		"re-save before first played turn must return the loaded payload byte-stable"
 	)
+
+
+func test_wind_direction_survives_tile_serializer_round_trip() -> void:
+	## p2-82: wind_direction is a worldgen-set climate INPUT the physics transport
+	## solver reads (mc-climate/physics.rs upwind_offset → temperature/moisture →
+	## surface_water). It is not re-derived on load, so it MUST round-trip through
+	## tile_serializer or the climate trajectory diverges after save/load. Combined
+	## with the control proof (carrying wind forward → byte-identical surface_water
+	## trajectory, in test_worldsim_playable_path.gd), this closes the p2-82 gap.
+	var tile: Tile = Tile.new(Vector2i(3, 4), "grassland")
+	tile.wind_direction = 5
+	var data: Dictionary = TileSerializer.to_dict(tile)
+	assert_eq(data.get("wind_direction"), 5, "wind_direction must be serialized")
+	# Route through the REAL save serialization hop (JSON.stringify → parse_string),
+	# which floats every number (5 → 5.0) — the exact hop behind the production_cost
+	# bug. wind_direction lands back in a GDScript int so this must still restore 5.
+	var round_tripped: Dictionary = JSON.parse_string(JSON.stringify(data))
+	var restored: Resource = TileSerializer.from_dict(round_tripped)
+	assert_eq(
+		restored.wind_direction, 5, "wind_direction must restore after a full JSON round-trip"
+	)
+
+
+func test_wind_direction_default_omitted_and_old_save_reads_zero() -> void:
+	## Sparse + backward-compat: a default (0) wind_direction is omitted from the
+	## dict, and a pre-p2-82 save (no wind_direction key) restores to 0 — the
+	## previous behaviour, no regression for old saves.
+	var tile: Tile = Tile.new(Vector2i.ZERO, "grassland")
+	var data: Dictionary = TileSerializer.to_dict(tile)
+	assert_false(data.has("wind_direction"), "default wind_direction (0) must be omitted")
+	var old_save: Dictionary = {"position": [1, 1], "biome_id": "grassland"}
+	var restored: Resource = TileSerializer.from_dict(old_save)
+	assert_eq(restored.wind_direction, 0, "old save without wind_direction key reads 0")
--- a/tooling/rl_self_play/harness_client.py
+++ b/tooling/rl_self_play/harness_client.py
@ -14,6 +14,7 @@ from __future__ import annotations
 import json
 import os
 import subprocess
+import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@ -99,11 +100,28 @@ class HarnessClient:
    def __init__(self, config: HarnessConfig | None = None) -> None:
        self._config = config or HarnessConfig()
        env = {**os.environ, **self._config.to_env()}
+        # Capture the Godot subprocess stderr to a file when
+        # MC_HARNESS_STDERR_DIR is set — otherwise DEVNULL (the shipping
+        # default). Without this, a Godot boot/timeout failure under load
+        # leaves only an opaque "stdout EOF" on the Python side with no
+        # reason. Set MC_HARNESS_STDERR_DIR=<dir> for any long training run
+        # so a harness death arrives diagnosed, not as a guess.
+        self._stderr_file = None
+        stderr_dir = os.environ.get("MC_HARNESS_STDERR_DIR", "")
+        if stderr_dir:
+            os.makedirs(stderr_dir, exist_ok=True)
+            self._stderr_file = open(
+                os.path.join(stderr_dir, f"harness_{os.getpid()}_{time.time_ns()}.err"),
+                "w",
+            )
+            stderr_target = self._stderr_file
+        else:
+            stderr_target = subprocess.DEVNULL
        self._proc = subprocess.Popen(
            ["bash", str(HARNESS_SCRIPT)],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL,
+            stderr=stderr_target,
            cwd=str(REPO_ROOT),
            text=True,
            bufsize=1,
@ -223,6 +241,12 @@ class HarnessClient:
        except subprocess.TimeoutExpired:
            self._proc.kill()
            self._proc.wait(timeout=2)
+        if self._stderr_file is not None:
+            try:
+                self._stderr_file.close()
+            except Exception:
+                pass
+            self._stderr_file = None

    def __enter__(self) -> HarnessClient:
        return self
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@ -16,6 +16,7 @@ its win rate against this baseline; the policy is considered to have
 from __future__ import annotations

 import sys
+import time
 from dataclasses import replace
 from typing import Any

@ -189,7 +190,6 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        # dropped them, which would have un-declared the external slots.
        if seed is not None:
            cfg = replace(cfg, seed=seed)
-        self._client = HarnessClient(cfg)
        self._terminated = False
        self._step_count = 0
        self._capital_by_player = {}
@ -197,7 +197,18 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        # Every configured slot starts alive. `cfg.players` is the total slot
        # count (learner + opponents); eliminations prune this set.
        self._live_players = set(range(int(cfg.players)))
-        view = self._client.view(slot=self._slot_kw)
+        # Bounded retry on the harness spawn + first view. Under heavy
+        # concurrent load (16+ Godot workers in heavy-tests.slice with
+        # CPUWeight=20, plus other jobs on the box), a freshly-spawned Godot
+        # can lose the boot race and EOF on the first wire request — which,
+        # un-retried, aborts a multi-hour training run from a single transient
+        # worker death (observed: gen0 died at the first eval, 9 min in). We
+        # fully reap the dead client and back off before respawning so a
+        # competing worker finishing actually frees resources between tries.
+        # A SYSTEMATIC failure (bad build, missing data) still surfaces: it
+        # exhausts the retries and re-raises, and MC_HARNESS_STDERR_DIR
+        # captures the Godot-side reason.
+        view = self._spawn_with_retry(cfg)
        # Seed capitals from any cities present at game start. In duel
        # maps each player begins with a founder, so the capital map is
        # populated on the first CityFounded event per player (handled
@ -211,6 +222,38 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        self._sync_state(view)
        return encode_observation(view), {"action_mask": self._cur_mask.copy()}

+    def _spawn_with_retry(
+        self, cfg: HarnessConfig, attempts: int = 3
+    ) -> dict[str, Any]:
+        """Spawn the harness and fetch the first view, retrying a transient
+        boot-race EOF. Fully reaps a dead client before respawning, with a
+        backoff so a competing worker can free resources between tries.
+        Re-raises the last HarnessError after exhausting `attempts`."""
+        last_err: HarnessError | None = None
+        for attempt in range(attempts):
+            try:
+                self._client = HarnessClient(cfg)
+                return self._client.view(slot=self._slot_kw)
+            except HarnessError as e:
+                last_err = e
+                # Reap the half-dead client so we don't leak a scope and make
+                # contention worse, then back off (1s, 2s, …) before respawn.
+                if self._client is not None:
+                    try:
+                        self._client.shutdown()
+                    except Exception:
+                        pass
+                    self._client = None
+                if attempt < attempts - 1:
+                    print(
+                        f"[MagicCivEnv] harness spawn attempt {attempt + 1}/"
+                        f"{attempts} failed ({e}); reaped + retrying",
+                        file=sys.stderr, flush=True,
+                    )
+                    time.sleep(1.0 * (attempt + 1))
+        assert last_err is not None
+        raise last_err
+
    def step(
        self, action: np.int64 | int
    ) -> tuple[np.ndarray, float, bool, bool, dict[str, Any]]: