diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index 062f11aa..79cc3509 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -43,6 +43,19 @@ SCORE_DELTA_SCALE = 1e-3
 WIN_REWARD = 1.0
 LOSS_REWARD = -1.0
 DRAW_REWARD = 0.0
+# Per-step time penalty. score_estimate barely moves within a turn, so
+# without this the policy gets ~0 reward per micro-action and has no
+# gradient toward end_turn. Empirical observation (32-env run, eval at
+# step 20k): none of the 10 eval episodes advanced past turn 0 — the
+# policy got stuck doing 50k no-op-equivalents because doing nothing
+# costs nothing. 5e-4 per step makes a 1000-step episode lose 0.5 to
+# time alone, which is meaningful against the ±1.0 terminal reward but
+# doesn't dominate score-shaping when the policy is making progress.
+STEP_PENALTY = 5e-4
+# Bonus for advancing the turn counter. Positive feedback for the one
+# action that lets the game proceed (end_turn). 1e-2 per turn × 100
+# turns = +1.0, comparable to the terminal win bonus.
+TURN_ADVANCE_BONUS = 1e-2
 
 # Hard ceiling on env.step() calls per episode. A policy that learned
 # "ending the turn lowers my reward" would otherwise produce episodes
@@ -128,7 +141,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
         player_action = decode_action_index(idx, self._idx_to_action)
         self._step_count += 1
 
-        reward = 0.0
+        reward = -STEP_PENALTY
         try:
             if player_action.get("type") == "end_turn":
                 self._client.end_turn()
@@ -147,6 +160,13 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
             )
 
         view = self._client.view()
+        new_turn = int(view.get("turn", 0))
+        # Read the prior turn from the last stored view so the advance
+        # bonus is granted only when the turn counter ticks up, scaled by
+        # the number of turns advanced; a missing "turn" key counts as 0.
+        prev_turn = int(self._last_view.get("turn", 0))
+        if new_turn > prev_turn:
+            reward += TURN_ADVANCE_BONUS * (new_turn - prev_turn)
         prev_score = self._last_score
         new_score = float(view.get("score", {}).get("score_estimate", 0.0))
         reward += SCORE_DELTA_SCALE * (new_score - prev_score)