diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py
index a6567e8c..a181352d 100644
--- a/tooling/rl_self_play/train.py
+++ b/tooling/rl_self_play/train.py
@@ -132,7 +132,13 @@ def main() -> int:
         log_path=str(run_dir / "eval"),
         eval_freq=max(args.eval_freq // args.num_envs, 1),
         n_eval_episodes=args.eval_episodes,
-        deterministic=True,
+        # Stochastic eval: a barely-trained net's argmax over the
+        # 322-dim action head has ~zero chance of being end_turn (idx 0),
+        # so deterministic eval episodes never advance past turn 0 and
+        # every one of them hits step_cap with reward 0. Sampling from
+        # the masked softmax keeps end_turn reachable until the policy
+        # has consolidated enough mass on a real strategy.
+        deterministic=False,
         render=False,
     )