From 50e174ab06fa36d24546749f5386fb6d23309fef Mon Sep 17 00:00:00 2001 From: Natalie Date: Sun, 17 May 2026 05:34:29 -0700 Subject: [PATCH] =?UTF-8?q?feat(@projects/@magic-civilization):=20?= =?UTF-8?q?=E2=9C=A8=20add=20step=5Fcap=20evaluation=20category?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- scripts/rl-train.sh | 8 +++-- tooling/rl_self_play/evaluate.py | 17 ++++++++- tooling/rl_self_play/magic_civ_env.py | 52 +++++++++++++-------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/scripts/rl-train.sh b/scripts/rl-train.sh index ec638c15..9a6dd851 100755 --- a/scripts/rl-train.sh +++ b/scripts/rl-train.sh @@ -88,9 +88,10 @@ case "$cmd" in launch) remote " cd ${RL_WORKTREE} || exit 1 - if pgrep -f 'python3 -m tooling.rl_self_play.train' >/dev/null; then + existing=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1}') + if [ -n \"\$existing\" ]; then echo 'training already running; run kill first' - pgrep -af 'python3 -m tooling.rl_self_play.train' + echo \"\$existing\" exit 1 fi nohup python3 -m tooling.rl_self_play.train \ @@ -100,7 +101,8 @@ case "$cmd" in --run-name ${RL_RUN_NAME} > ${LOG_REMOTE} 2>&1 & echo \$! > ${RL_PIDFILE} sleep 3 - pgrep -af 'python3 -m tooling.rl_self_play.train' || (echo 'launch failed; check log'; tail -20 ${LOG_REMOTE}) + ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/' \\ + || (echo 'launch failed; check log'; tail -20 ${LOG_REMOTE}) " ;; diff --git a/tooling/rl_self_play/evaluate.py b/tooling/rl_self_play/evaluate.py index 2e9dc5c0..20ade18a 100644 --- a/tooling/rl_self_play/evaluate.py +++ b/tooling/rl_self_play/evaluate.py @@ -52,6 +52,11 @@ def _classify_episode(info_history: list[dict[str, object]], total_reward: float return "loss" if reason == "harness_error": return "loss" + if reason == "step_cap": + # Policy stuck in a no-progress loop and the env truncated the + # whole episode — degenerate non-result, surfaced as its own + # category so it's visible in the eval JSON. + return "step_cap" # No explicit win yet from the env; use score sign as tiebreaker. if total_reward > 0.5: return "win" @@ -66,7 +71,7 @@ def main() -> int: model = MaskablePPO.load(str(args.model_path)) - wins = losses = draws = 0 + wins = losses = draws = step_caps = 0 turns_per_episode: list[int] = [] for episode in range(args.episodes): cfg = HarnessConfig( @@ -93,6 +98,8 @@ def main() -> int: wins += 1 elif verdict == "loss": losses += 1 + elif verdict == "step_cap": + step_caps += 1 else: draws += 1 turns_per_episode.append(int(info.get("turn", 0))) @@ -106,10 +113,18 @@ def main() -> int: "wins": wins, "losses": losses, "draws": draws, + "step_caps": step_caps, "win_rate": wins / total, "mean_turns": round(mean_turns, 1), } print(json.dumps(verdict)) + if step_caps: + print( + f"WARNING: {step_caps}/{args.episodes} eval episodes hit the " + f"per-episode step cap — policy got stuck in a no-progress " + f"loop. Check encoder/reward shaping.", + file=sys.stderr, + ) return 0 diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py index 0f1dc6fb..062f11aa 100644 --- a/tooling/rl_self_play/magic_civ_env.py +++ b/tooling/rl_self_play/magic_civ_env.py @@ -15,6 +15,7 @@ its win rate against this baseline; the policy is considered to have """ from __future__ import annotations +import sys from typing import Any import gymnasium as gym @@ -66,12 +67,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): self, harness_config: HarnessConfig | None = None, max_turns: int = 200, - max_micro_actions_per_turn: int = DEFAULT_MAX_MICRO_ACTIONS_PER_TURN, + max_steps_per_episode: int = DEFAULT_MAX_STEPS_PER_EPISODE, ) -> None: super().__init__() self._config = harness_config or HarnessConfig() self._max_turns = max_turns - self._max_micro_actions_per_turn = max_micro_actions_per_turn + self._max_steps_per_episode = max_steps_per_episode self.observation_space = spaces.Box( low=-1e6, high=1e6, shape=(OBS_DIM,), dtype=np.float32 ) @@ -82,8 +83,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): self._idx_to_action: dict[int, dict[str, Any]] = {} self._cur_mask: np.ndarray = np.zeros(ACTION_DIM, dtype=bool) self._terminated: bool = False - self._cur_turn: int = 0 - self._micro_actions_this_turn: int = 0 + self._step_count: int = 0 # ── Gymnasium API ──────────────────────────────────────────────── @@ -108,8 +108,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): ) self._client = HarnessClient(cfg) self._terminated = False - self._cur_turn = 0 - self._micro_actions_this_turn = 0 + self._step_count = 0 view = self._client.view() self._sync_state(view) return encode_observation(view), {"action_mask": self._cur_mask.copy()} @@ -127,18 +126,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): # Mask should prevent this, but be defensive: substitute end_turn. idx = 0 player_action = decode_action_index(idx, self._idx_to_action) - - # Hard ceiling: if the policy refuses to end its turn after - # MAX_MICRO_ACTIONS_PER_TURN, force end_turn. Without this an eval - # policy that has learned "ending the turn lowers my reward" - # produces an episode of unbounded length. - forced_end = False - if ( - self._micro_actions_this_turn >= MAX_MICRO_ACTIONS_PER_TURN - and player_action.get("type") != "end_turn" - ): - player_action = {"type": "end_turn"} - forced_end = True + self._step_count += 1 reward = 0.0 try: @@ -159,12 +147,6 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): ) view = self._client.view() - new_turn = int(view.get("turn", 0)) - if new_turn != self._cur_turn: - self._cur_turn = new_turn - self._micro_actions_this_turn = 0 - else: - self._micro_actions_this_turn += 1 prev_score = self._last_score new_score = float(view.get("score", {}).get("score_estimate", 0.0)) reward += SCORE_DELTA_SCALE * (new_score - prev_score) @@ -174,7 +156,15 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): self._sync_state(view) self._terminated = terminated - truncated = (not terminated) and int(view.get("turn", 0)) >= self._max_turns + step_capped = ( + not terminated + and self._step_count >= self._max_steps_per_episode + ) + turn_capped = ( + not terminated + and int(view.get("turn", 0)) >= self._max_turns + ) + truncated = step_capped or turn_capped if truncated: self._terminated = True info: dict[str, Any] = { @@ -185,8 +175,16 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): } if reason: info["reason"] = reason - if forced_end: - info["forced_end_turn"] = True + elif step_capped: + info["reason"] = "step_cap" + print( + f"[MagicCivEnv] step_cap hit at step={self._step_count} " + f"turn={int(view.get('turn', 0))} — truncating episode", + file=sys.stderr, + flush=True, + ) + elif turn_capped: + info["reason"] = "turn_cap" return encode_observation(view), reward, terminated, truncated, info def close(self) -> None: