From e5a2a37d0e2725370299c603fef7f829ff82047b Mon Sep 17 00:00:00 2001 From: autocommit Date: Sun, 17 May 2026 06:55:12 -0700 Subject: [PATCH] =?UTF-8?q?feat(rl-self-play):=20=E2=9C=A8=20Add=20stochas?= =?UTF-8?q?tic=20evaluation=20with=20masked=20softmax=20sampling=20to=20re?= =?UTF-8?q?place=20deterministic=20argmax=20in=20RL=20self-play=20training?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- tooling/rl_self_play/train.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py index a6567e8c..a181352d 100644 --- a/tooling/rl_self_play/train.py +++ b/tooling/rl_self_play/train.py @@ -132,7 +132,13 @@ def main() -> int: log_path=str(run_dir / "eval"), eval_freq=max(args.eval_freq // args.num_envs, 1), n_eval_episodes=args.eval_episodes, - deterministic=True, + # Stochastic eval: a barely-trained net's argmax over the + # 322-dim action head has ~zero chance of being end_turn (idx 0), + # so deterministic eval episodes never advance past turn 0 and + # all 10 hit step_cap with reward 0. Sampling from the masked + # softmax keeps end_turn reachable until the policy has + # consolidated enough mass on a real strategy. + deterministic=False, render=False, )