diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index b50478f1..d90c04cf 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -82,7 +82,13 @@ OPPONENT_ELIMINATED = 0.50
 # the dense intra-turn gradient. The slow-game ramp adds linearly-
 # growing per-step pressure after SLOW_PENALTY_START turns, reaching
 # SLOW_PENALTY_PEAK per step at turn SLOW_PENALTY_START + SLOW_PENALTY_SPAN.
-SCORE_DELTA_SCALE = 1e-3
+#
+# NOTE: score_estimate is now the UNIFIED raw score (mc-score ScoreController,
+# unbounded) — ~10-20x larger magnitude than the old clamped [0,1000] scale, so
+# SCORE_DELTA_SCALE was dropped from 1e-3 to 1e-4 to keep the per-turn score
+# reward in the same range as the other terms. Retune empirically once the
+# self-play stable resumes training on the unified objective.
+SCORE_DELTA_SCALE = 1e-4
 STEP_PENALTY_BASE = 5e-4
 SLOW_PENALTY_PEAK = 1e-3
 SLOW_PENALTY_START = 500