feat(rl-self-play): ✨ Introduce per-step time penalty and turn advancement bonus in RL environment
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
7678f4668f
commit
b82e4a8fbd
1 changed files with 21 additions and 1 deletions
|
|
@ -43,6 +43,19 @@ SCORE_DELTA_SCALE = 1e-3
|
|||
WIN_REWARD = 1.0
|
||||
LOSS_REWARD = -1.0
|
||||
DRAW_REWARD = 0.0
|
||||
# Per-step time penalty. Without this, score_estimate barely moves
|
||||
# within a turn so the policy gets ~0 reward per micro-action and has
|
||||
# no gradient toward end_turn. Empirical observation (32-env run, eval
|
||||
# at step 20k): none of the 10 eval episodes advanced past turn 0 —
|
||||
# the policy got stuck doing 50k no-op-equivalents because doing nothing
|
||||
# costs nothing. 5e-4 per step makes a 1000-step episode lose 0.5 to
|
||||
# time alone, which is meaningful against ±1.0 terminal but doesn't
|
||||
# dominate score-shaping when the policy is actually making progress.
|
||||
STEP_PENALTY = 5e-4
|
||||
# Bonus for advancing the turn counter. Positive feedback for the one
|
||||
# action that lets the game proceed (end_turn). 1e-2 per turn × 100
|
||||
# turns = +1.0, comparable to the terminal win bonus.
|
||||
TURN_ADVANCE_BONUS = 1e-2
|
||||
|
||||
# Hard ceiling on env.step() calls per episode. A policy that learned
|
||||
# "ending the turn lowers my reward" would otherwise produce episodes
|
||||
|
|
@ -128,7 +141,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
player_action = decode_action_index(idx, self._idx_to_action)
|
||||
self._step_count += 1
|
||||
|
||||
reward = 0.0
|
||||
reward = -STEP_PENALTY
|
||||
try:
|
||||
if player_action.get("type") == "end_turn":
|
||||
self._client.end_turn()
|
||||
|
|
@ -147,6 +160,13 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
)
|
||||
|
||||
view = self._client.view()
|
||||
new_turn = int(view.get("turn", 0))
|
||||
# Track previous turn so we can grant the advance bonus exactly
|
||||
# when the turn counter ticks up — initialized from the last
|
||||
# synced view, so the first step after reset uses a turn-0 baseline.
|
||||
prev_turn = int(self._last_view.get("turn", 0))
|
||||
if new_turn > prev_turn:
|
||||
reward += TURN_ADVANCE_BONUS * (new_turn - prev_turn)
|
||||
prev_score = self._last_score
|
||||
new_score = float(view.get("score", {}).get("score_estimate", 0.0))
|
||||
reward += SCORE_DELTA_SCALE * (new_score - prev_score)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue