feat(rl-self-play): Introduce no-op penalty and turn advancement bonus in RL environment

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-05-17 06:47:57 -07:00
parent 7678f4668f
commit b82e4a8fbd

View file

@ -43,6 +43,19 @@ SCORE_DELTA_SCALE = 1e-3
# Terminal outcome rewards.
WIN_REWARD = 1.0
LOSS_REWARD = -1.0
DRAW_REWARD = 0.0
# Per-step time penalty, subtracted from the reward on every env step.
# Without this, score_estimate barely moves within a turn, so the policy
# gets ~0 reward per micro-action and has no gradient toward end_turn.
# Empirical observation (32-env run, eval at step 20k): none of the 10
# eval episodes advanced past turn 0 — the policy got stuck doing 50k
# no-op-equivalents because doing nothing costs nothing. 5e-4 per step
# makes a 1000-step episode lose 0.5 to time alone, which is meaningful
# against the ±1.0 terminal reward but doesn't dominate score-shaping
# when the policy is actually making progress.
STEP_PENALTY = 5e-4
# Bonus for advancing the turn counter, granted once per turn advanced
# (scaled by the turn delta observed in a step). Positive feedback for
# the one action that lets the game proceed (end_turn). 1e-2 per turn ×
# 100 turns = +1.0, comparable to the terminal win bonus.
TURN_ADVANCE_BONUS = 1e-2
# Hard ceiling on env.step() calls per episode. A policy that learned
# "ending the turn lowers my reward" would otherwise produce episodes
@ -128,7 +141,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
player_action = decode_action_index(idx, self._idx_to_action)
self._step_count += 1
reward = 0.0
reward = -STEP_PENALTY
try:
if player_action.get("type") == "end_turn":
self._client.end_turn()
@ -147,6 +160,13 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
)
view = self._client.view()
new_turn = int(view.get("turn", 0))
# Track previous turn so we can grant the advance bonus exactly
# when the turn counter ticks up — initialized from the last
# synced view, so first step after reset uses turn 0 baseline.
prev_turn = int(self._last_view.get("turn", 0))
if new_turn > prev_turn:
reward += TURN_ADVANCE_BONUS * (new_turn - prev_turn)
prev_score = self._last_score
new_score = float(view.get("score", {}).get("score_estimate", 0.0))
reward += SCORE_DELTA_SCALE * (new_score - prev_score)