diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py index a6567e8c..a181352d 100644 --- a/tooling/rl_self_play/train.py +++ b/tooling/rl_self_play/train.py @@ -132,7 +132,13 @@ def main() -> int: log_path=str(run_dir / "eval"), eval_freq=max(args.eval_freq // args.num_envs, 1), n_eval_episodes=args.eval_episodes, - deterministic=True, + # Stochastic eval: a barely-trained net's argmax over the + # 322-dim action head has ~zero chance of being end_turn (idx 0), + # so deterministic eval episodes never advance past turn 0 and + # every one hits step_cap with reward 0. Sampling from the masked + # softmax keeps end_turn reachable until the policy has + # consolidated enough mass on a real strategy. + deterministic=False, render=False, )