From e1f3a66a6771aeb02f6bf6a17aeb5e68958aab22 Mon Sep 17 00:00:00 2001
From: Natalie
Date: Tue, 30 Jun 2026 20:40:48 -0400
Subject: [PATCH] tune(rl): drop SCORE_DELTA_SCALE 1e-3 -> 1e-4 for the unified raw score

score_estimate is now the unbounded unified score (~10-20x the old clamped
[0,1000] magnitude); scale the per-turn score-delta reward down to keep it in
range with the other reward terms. Empirical retune tracked for when the
self-play stable resumes.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 tooling/rl_self_play/magic_civ_env.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index b50478f1..d90c04cf 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -82,7 +82,13 @@ OPPONENT_ELIMINATED = 0.50
 # the dense intra-turn gradient. The slow-game ramp adds linearly-
 # growing per-step pressure after SLOW_PENALTY_START turns, reaching
 # SLOW_PENALTY_PEAK per step at turn SLOW_PENALTY_START + SLOW_PENALTY_SPAN.
-SCORE_DELTA_SCALE = 1e-3
+#
+# NOTE: score_estimate is now the UNIFIED raw score (mc-score ScoreController,
+# unbounded) — ~10-20x larger magnitude than the old clamped [0,1000] scale, so
+# SCORE_DELTA_SCALE was dropped from 1e-3 to 1e-4 to keep the per-turn score
+# reward in the same range as the other terms. Retune empirically once the
+# self-play stable resumes training on the unified objective.
+SCORE_DELTA_SCALE = 1e-4
 STEP_PENALTY_BASE = 5e-4
 SLOW_PENALTY_PEAK = 1e-3
 SLOW_PENALTY_START = 500