feat(rl-self-play): ✨ Introduce per-step time penalty and turn advancement bonus in RL environment

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-05-17 06:47:57 -07:00 · 2026-05-17 06:47:57 -07:00 · b82e4a8fbd
commit b82e4a8fbd
parent 7678f4668f
1 changed files with 21 additions and 1 deletions
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -43,6 +43,19 @@ SCORE_DELTA_SCALE = 1e-3
 WIN_REWARD = 1.0
 LOSS_REWARD = -1.0
 DRAW_REWARD = 0.0
+# Per-step time penalty. Without it, score_estimate barely moves
+# within a turn, so the policy gets ~0 reward per micro-action and
+# has no gradient toward end_turn. Empirical observation (32-env run,
+# eval at step 20k): none of the 10 eval episodes advanced past turn
+# 0 — the policy got stuck doing 50k no-op-equivalents because doing
+# nothing cost nothing. At 5e-4 per step, a 1000-step episode loses
+# 0.5 to time alone: meaningful against the ±1.0 terminal reward, but
+# not dominant over score-shaping when the policy is making progress.
+STEP_PENALTY = 5e-4
+# Bonus per increment of the turn counter: positive feedback for the
+# one action that lets the game proceed (end_turn). 1e-2 per turn ×
+# 100 turns = +1.0, comparable to the terminal WIN_REWARD.
+TURN_ADVANCE_BONUS = 1e-2

 # Hard ceiling on env.step() calls per episode. A policy that learned
 # "ending the turn lowers my reward" would otherwise produce episodes
@@ -128,7 +141,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        player_action = decode_action_index(idx, self._idx_to_action)
        self._step_count += 1

-        reward = 0.0
+        reward = -STEP_PENALTY
        try:
            if player_action.get("type") == "end_turn":
                self._client.end_turn()
@@ -147,6 +160,13 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
            )

        view = self._client.view()
+        new_turn = int(view.get("turn", 0))
+        # Read the previous turn from the last synced view (0 if it has
+        # no "turn" key — presumably the case right after reset) so the
+        # advance bonus is granted only when the turn counter ticks up.
+        prev_turn = int(self._last_view.get("turn", 0))
+        if new_turn > prev_turn:
+            reward += TURN_ADVANCE_BONUS * (new_turn - prev_turn)
        prev_score = self._last_score
        new_score = float(view.get("score", {}).get("score_estimate", 0.0))
        reward += SCORE_DELTA_SCALE * (new_score - prev_score)