diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py index 062f11aa..79cc3509 100644 --- a/tooling/rl_self_play/magic_civ_env.py +++ b/tooling/rl_self_play/magic_civ_env.py @@ -43,6 +43,19 @@ SCORE_DELTA_SCALE = 1e-3 WIN_REWARD = 1.0 LOSS_REWARD = -1.0 DRAW_REWARD = 0.0 +# Per-step time penalty. Without this, score_estimate barely moves +# within a turn so the policy gets ~0 reward per micro-action and has +# no gradient toward end_turn. Empirical observation (32-env run, eval +# at step 20k): none of the 10 eval episodes advanced past turn 0 — +# the policy got stuck doing 50k no-op-equivalents because doing nothing +# costs nothing. 5e-4 per step makes a 1000-step episode lose 0.5 to +# time alone, which is meaningful against the ±1.0 terminal reward but doesn't +# dominate score-shaping when the policy is actually making progress. +STEP_PENALTY = 5e-4 +# Bonus for advancing the turn counter. Positive feedback for the one +# action that lets the game proceed (end_turn). 1e-2 per turn × 100 +# turns = +1.0, comparable to the terminal win bonus. +TURN_ADVANCE_BONUS = 1e-2 # Hard ceiling on env.step() calls per episode. A policy that learned # "ending the turn lowers my reward" would otherwise produce episodes @@ -128,7 +141,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): player_action = decode_action_index(idx, self._idx_to_action) self._step_count += 1 - reward = 0.0 + reward = -STEP_PENALTY try: if player_action.get("type") == "end_turn": self._client.end_turn() @@ -147,6 +160,13 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): ) view = self._client.view() + new_turn = int(view.get("turn", 0)) + # Read the previous turn from the last synced view so the advance + # bonus is granted exactly when the turn counter ticks up; the first + # step after reset therefore uses the turn-0 baseline. 
+ prev_turn = int(self._last_view.get("turn", 0)) + if new_turn > prev_turn: + reward += TURN_ADVANCE_BONUS * (new_turn - prev_turn) prev_score = self._last_score new_score = float(view.get("score", {}).get("score_estimate", 0.0)) reward += SCORE_DELTA_SCALE * (new_score - prev_score)