From e1f3a66a6771aeb02f6bf6a17aeb5e68958aab22 Mon Sep 17 00:00:00 2001
From: Natalie <natalie@lilithuwu.com>
Date: Tue, 30 Jun 2026 20:40:48 -0400
Subject: [PATCH] tune(rl): drop SCORE_DELTA_SCALE 1e-3 -> 1e-4 for the unified
 raw score

score_estimate is now the unbounded unified raw score, roughly 10-20x the
magnitude of the old clamped [0, 1000] score. Scale the per-turn score-delta
reward down by 10x (1e-3 -> 1e-4) to keep it in range with the other reward
terms. An empirical retune is tracked for when the self-play stable resumes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tooling/rl_self_play/magic_civ_env.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index b50478f1..d90c04cf 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -82,7 +82,13 @@ OPPONENT_ELIMINATED = 0.50
 # the dense intra-turn gradient. The slow-game ramp adds linearly-
 # growing per-step pressure after SLOW_PENALTY_START turns, reaching
 # SLOW_PENALTY_PEAK per step at turn SLOW_PENALTY_START + SLOW_PENALTY_SPAN.
-SCORE_DELTA_SCALE = 1e-3
+#
+# NOTE: score_estimate is now the UNIFIED raw score (mc-score ScoreController,
+# unbounded), ~10-20x the old clamped [0, 1000] magnitude, so the scale was
+# cut 10x (1e-3 -> 1e-4) to keep the per-turn score reward roughly in range
+# with the other terms. Retune empirically once the self-play stable resumes
+# training on the unified objective.
+SCORE_DELTA_SCALE = 1e-4
 STEP_PENALTY_BASE = 5e-4
 SLOW_PENALTY_PEAK = 1e-3
 SLOW_PENALTY_START = 500