diff --git a/.project/objectives/p2-82-climate-input-save-fidelity.md b/.project/objectives/p2-82-climate-input-save-fidelity.md index 8bd58c63..87b1acbd 100644 --- a/.project/objectives/p2-82-climate-input-save-fidelity.md +++ b/.project/objectives/p2-82-climate-input-save-fidelity.md @@ -2,7 +2,7 @@ id: p2-82 title: Climate-input save-fidelity — persist (or re-derive) worldgen-static grid inputs across save/load priority: p2 -status: stub +status: done scope: game1 updated_at: 2026-06-09 blocked_by: [] diff --git a/src/game/engine/src/map/tile_serializer.gd b/src/game/engine/src/map/tile_serializer.gd index 0cf3caa9..ef7ac552 100644 --- a/src/game/engine/src/map/tile_serializer.gd +++ b/src/game/engine/src/map/tile_serializer.gd @@ -62,6 +62,13 @@ static func to_dict(tile: Resource) -> Dictionary: data["quality_progress"] = tile.quality_progress if tile.wind_speed != 0.5: data["wind_speed"] = tile.wind_speed + # p2-82: wind_direction is set at worldgen (wind_calculator.gd) and read by the + # climate physics transport solver (mc-climate/physics.rs upwind_offset) — it + # drives temperature/moisture transport and so the surface_water trajectory. It + # is NOT re-derived on load, so without persisting it the climate trajectory + # diverges after save/load. Default 0 omitted (sparse). + if tile.wind_direction != 0: + data["wind_direction"] = tile.wind_direction if tile.culture_pressure != 0.0: data["culture_pressure"] = tile.culture_pressure if tile.mana_density != 0.0: @@ -152,6 +159,9 @@ static func from_dict(data: Dictionary) -> Resource: # Tile tile.quality = data.get("quality", 2) tile.quality_progress = data.get("quality_progress", 0) tile.wind_speed = data.get("wind_speed", 0.5) + # p2-82: restore worldgen wind_direction (missing in pre-p2-82 saves → 0, the + # old behaviour). See to_dict for why this gates climate-trajectory fidelity. + tile.wind_direction = data.get("wind_direction", 0) tile.culture_pressure = data.get("culture_pressure", 0.0) tile.mana_density = data.get("mana_density", 0.0) tile.ley_line_count = data.get("ley_line_count", 0) diff --git a/src/game/engine/tests/integration/test_worldsim_playable_path.gd b/src/game/engine/tests/integration/test_worldsim_playable_path.gd index 0a21da1c..d2902634 100644 --- a/src/game/engine/tests/integration/test_worldsim_playable_path.gd +++ b/src/game/engine/tests/integration/test_worldsim_playable_path.gd @@ -326,12 +326,13 @@ func test_grid_accumulators_continue_trajectory_when_inputs_preserved() -> void: ## A: uninterrupted, non-zero wind. B: run to SAVE_AT, continue on a grid that ## carries grid_b's get_tile_dict state forward + the restored accumulators. ## - ## NOTE (documented gap → p2-82): a REAL load does NOT preserve - ## tile.wind_direction (worldgen-only, not in tile_serializer, not re-derived), - ## which physics reads for transport (physics.rs:336/399) → surface_water. So - ## production surface_water still diverges — a pre-existing CLIMATE-INPUT - ## save-fidelity gap, NOT accumulator persistence. This test holds inputs - ## constant to isolate the accumulator-fix contribution. + ## NOTE: the climate INPUTS physics reads (incl. tile.wind_direction, the + ## transport driver at physics.rs:336/399 → surface_water) now persist via + ## tile_serializer (p2-82, CLOSED — round-trip locked by + ## test_climate_tile_sync.gd), so a real load preserves them. This test holds + ## inputs constant to isolate the accumulator-fix contribution; with both the + ## accumulators (p2-80) and wind (p2-82) persisted, the production continued + ## trajectory is byte-identical. var grid_a: RefCounted = WorldsimAccumulatorFixtures.apply_wind_field(_make_terrain_grid()) var climate_a: RefCounted = WorldsimAccumulatorFixtures.make_climate_physics() var worldsim_a: RefCounted = WorldsimAccumulatorFixtures.make_bloom_worldsim() diff --git a/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd b/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd index 31a5248a..961abc1f 100644 --- a/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd +++ b/src/game/engine/tests/integration/worldsim_accumulator_fixtures.gd @@ -54,9 +54,9 @@ static func make_bloom_worldsim() -> RefCounted: static func apply_wind_field(grid: RefCounted) -> RefCounted: ## Stamp a deterministic NON-ZERO wind_direction per tile (0-5 hex edges) onto - ## an existing terrain grid. The continued-trajectory diagnostic needs a wind - ## field that drives transport, so the post-load wind reset measurably changes - ## the surface_water trajectory. Returns the same grid for call chaining. + ## an existing terrain grid. The continued-trajectory test needs a wind field + ## that drives transport so surface_water actually depends on it; wind now + ## persists across save/load via tile_serializer (p2-82). Returns the grid. for row: int in range(MAP_SIZE): for col: int in range(MAP_SIZE): var dir: int = (col + row) % 6 @@ -74,9 +74,9 @@ static func carry_inputs_forward(source: RefCounted) -> RefCounted: ## restore_worldsim_accumulators_from_json is the SOLE source of those fields — ## mirroring production, whose _sync_tiles_to_grid omits exactly these. This ## keeps the continued-trajectory test non-tautological: the accumulator fix, - ## not the input carry, is what reproduces them. (A REAL load additionally loses - ## tile.wind_direction — worldgen-only, not persisted — the documented separate - ## climate-input save-fidelity gap.) + ## not the input carry, is what reproduces them. (Climate INPUTS including + ## tile.wind_direction now persist via tile_serializer — p2-82, closed — so a + ## real load preserves the inputs this fixture carries forward.) var grid2: RefCounted = GdGridState.create(MAP_SIZE, MAP_SIZE) for row: int in range(MAP_SIZE): for col: int in range(MAP_SIZE): diff --git a/src/game/engine/tests/unit/test_climate_tile_sync.gd b/src/game/engine/tests/unit/test_climate_tile_sync.gd index 50d3d371..ab21f235 100644 --- a/src/game/engine/tests/unit/test_climate_tile_sync.gd +++ b/src/game/engine/tests/unit/test_climate_tile_sync.gd @@ -159,3 +159,36 @@ func test_climate_accumulators_resave_without_play_is_lossless() -> void: payload, "re-save before first played turn must return the loaded payload byte-stable" ) + + +func test_wind_direction_survives_tile_serializer_round_trip() -> void: + ## p2-82: wind_direction is a worldgen-set climate INPUT the physics transport + ## solver reads (mc-climate/physics.rs upwind_offset → temperature/moisture → + ## surface_water). It is not re-derived on load, so it MUST round-trip through + ## tile_serializer or the climate trajectory diverges after save/load. Combined + ## with the control proof (carrying wind forward → byte-identical surface_water + ## trajectory, in test_worldsim_playable_path.gd), this closes the p2-82 gap. + var tile: Tile = Tile.new(Vector2i(3, 4), "grassland") + tile.wind_direction = 5 + var data: Dictionary = TileSerializer.to_dict(tile) + assert_eq(data.get("wind_direction"), 5, "wind_direction must be serialized") + # Route through the REAL save serialization hop (JSON.stringify → parse_string), + # which floats every number (5 → 5.0) — the exact hop behind the production_cost + # bug. wind_direction lands back in a GDScript int so this must still restore 5. + var round_tripped: Dictionary = JSON.parse_string(JSON.stringify(data)) + var restored: Resource = TileSerializer.from_dict(round_tripped) + assert_eq( + restored.wind_direction, 5, "wind_direction must restore after a full JSON round-trip" + ) + + +func test_wind_direction_default_omitted_and_old_save_reads_zero() -> void: + ## Sparse + backward-compat: a default (0) wind_direction is omitted from the + ## dict, and a pre-p2-82 save (no wind_direction key) restores to 0 — the + ## previous behaviour, no regression for old saves. + var tile: Tile = Tile.new(Vector2i.ZERO, "grassland") + var data: Dictionary = TileSerializer.to_dict(tile) + assert_false(data.has("wind_direction"), "default wind_direction (0) must be omitted") + var old_save: Dictionary = {"position": [1, 1], "biome_id": "grassland"} + var restored: Resource = TileSerializer.from_dict(old_save) + assert_eq(restored.wind_direction, 0, "old save without wind_direction key reads 0") diff --git a/tooling/rl_self_play/harness_client.py b/tooling/rl_self_play/harness_client.py index 83b8a63c..3607bea3 100644 --- a/tooling/rl_self_play/harness_client.py +++ b/tooling/rl_self_play/harness_client.py @@ -14,6 +14,7 @@ from __future__ import annotations import json import os import subprocess +import time from dataclasses import dataclass from pathlib import Path from typing import Any @@ -99,11 +100,28 @@ class HarnessClient: def __init__(self, config: HarnessConfig | None = None) -> None: self._config = config or HarnessConfig() env = {**os.environ, **self._config.to_env()} + # Capture the Godot subprocess stderr to a file when + # MC_HARNESS_STDERR_DIR is set — otherwise DEVNULL (the shipping + # default). Without this, a Godot boot/timeout failure under load + # leaves only an opaque "stdout EOF" on the Python side with no + # reason. Set MC_HARNESS_STDERR_DIR= for any long training run + # so a harness death arrives diagnosed, not as a guess. + self._stderr_file = None + stderr_dir = os.environ.get("MC_HARNESS_STDERR_DIR", "") + if stderr_dir: + os.makedirs(stderr_dir, exist_ok=True) + self._stderr_file = open( + os.path.join(stderr_dir, f"harness_{os.getpid()}_{time.time_ns()}.err"), + "w", + ) + stderr_target = self._stderr_file + else: + stderr_target = subprocess.DEVNULL self._proc = subprocess.Popen( ["bash", str(HARNESS_SCRIPT)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, + stderr=stderr_target, cwd=str(REPO_ROOT), text=True, bufsize=1, @@ -223,6 +241,12 @@ class HarnessClient: except subprocess.TimeoutExpired: self._proc.kill() self._proc.wait(timeout=2) + if self._stderr_file is not None: + try: + self._stderr_file.close() + except Exception: + pass + self._stderr_file = None def __enter__(self) -> HarnessClient: return self diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py index 5b70c1ff..2ed03f0b 100644 --- a/tooling/rl_self_play/magic_civ_env.py +++ b/tooling/rl_self_play/magic_civ_env.py @@ -16,6 +16,7 @@ its win rate against this baseline; the policy is considered to have from __future__ import annotations import sys +import time from dataclasses import replace from typing import Any @@ -189,7 +190,6 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): # dropped them, which would have un-declared the external slots. if seed is not None: cfg = replace(cfg, seed=seed) - self._client = HarnessClient(cfg) self._terminated = False self._step_count = 0 self._capital_by_player = {} @@ -197,7 +197,18 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): # Every configured slot starts alive. `cfg.players` is the total slot # count (learner + opponents); eliminations prune this set. self._live_players = set(range(int(cfg.players))) - view = self._client.view(slot=self._slot_kw) + # Bounded retry on the harness spawn + first view. Under heavy + # concurrent load (16+ Godot workers in heavy-tests.slice with + # CPUWeight=20, plus other jobs on the box), a freshly-spawned Godot + # can lose the boot race and EOF on the first wire request — which, + # un-retried, aborts a multi-hour training run from a single transient + # worker death (observed: gen0 died at the first eval, 9 min in). We + # fully reap the dead client and back off before respawning so a + # competing worker finishing actually frees resources between tries. + # A SYSTEMATIC failure (bad build, missing data) still surfaces: it + # exhausts the retries and re-raises, and MC_HARNESS_STDERR_DIR + # captures the Godot-side reason. + view = self._spawn_with_retry(cfg) # Seed capitals from any cities present at game start. In duel # maps each player begins with a founder, so the capital map is # populated on the first CityFounded event per player (handled @@ -211,6 +222,38 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): self._sync_state(view) return encode_observation(view), {"action_mask": self._cur_mask.copy()} + def _spawn_with_retry( + self, cfg: HarnessConfig, attempts: int = 3 + ) -> dict[str, Any]: + """Spawn the harness and fetch the first view, retrying a transient + boot-race EOF. Fully reaps a dead client before respawning, with a + backoff so a competing worker can free resources between tries. + Re-raises the last HarnessError after exhausting `attempts`.""" + last_err: HarnessError | None = None + for attempt in range(attempts): + try: + self._client = HarnessClient(cfg) + return self._client.view(slot=self._slot_kw) + except HarnessError as e: + last_err = e + # Reap the half-dead client so we don't leak a scope and make + # contention worse, then back off (1s, 2s, …) before respawn. + if self._client is not None: + try: + self._client.shutdown() + except Exception: + pass + self._client = None + if attempt < attempts - 1: + print( + f"[MagicCivEnv] harness spawn attempt {attempt + 1}/" + f"{attempts} failed ({e}); reaped + retrying", + file=sys.stderr, flush=True, + ) + time.sleep(1.0 * (attempt + 1)) + assert last_err is not None + raise last_err + def step( self, action: np.int64 | int ) -> tuple[np.ndarray, float, bool, bool, dict[str, Any]]: