From e5a2a37d0e2725370299c603fef7f829ff82047b Mon Sep 17 00:00:00 2001
From: autocommit <autocommit@ftw.codes>
Date: Sun, 17 May 2026 06:55:12 -0700
Subject: [PATCH] =?UTF-8?q?feat(rl-self-play):=20=E2=9C=A8=20Add=20stochas?=
 =?UTF-8?q?tic=20evaluation=20with=20masked=20softmax=20sampling=20to=20re?=
 =?UTF-8?q?place=20deterministic=20argmax=20in=20RL=20self-play=20training?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 tooling/rl_self_play/train.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py
index a6567e8c..a181352d 100644
--- a/tooling/rl_self_play/train.py
+++ b/tooling/rl_self_play/train.py
@@ -132,7 +132,13 @@ def main() -> int:
         log_path=str(run_dir / "eval"),
         eval_freq=max(args.eval_freq // args.num_envs, 1),
         n_eval_episodes=args.eval_episodes,
-        deterministic=True,
+        # Stochastic eval: a barely-trained net's argmax over the
+        # 322-dim action head almost never lands on end_turn (idx 0),
+        # so deterministic eval episodes never advance past turn 0 and
+        # every one hits step_cap with reward 0. Sampling from the
+        # masked softmax keeps end_turn reachable until the policy has
+        # consolidated enough mass on a real strategy.
+        deterministic=False,
         render=False,
     )