From dbeb3f408810186c7eb24a479212537f7beefa41 Mon Sep 17 00:00:00 2001 From: autocommit Date: Wed, 27 May 2026 20:26:00 -0700 Subject: [PATCH] =?UTF-8?q?test(rl-self-play):=20=E2=9C=85=20Add=20evaluat?= =?UTF-8?q?ion=20functions,=20opponent=20models,=20and=20smoke=20tests=20f?= =?UTF-8?q?or=20divergence=20mining=20in=20RL=20self-play=20tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- tooling/rl_self_play/evaluate.py | 11 ++++++++++- tooling/rl_self_play/magic_civ_env.py | 6 ++++++ tooling/rl_self_play/smoke_model_opponent.py | 8 ++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tooling/rl_self_play/evaluate.py b/tooling/rl_self_play/evaluate.py index 238745c7..2efd04d9 100644 --- a/tooling/rl_self_play/evaluate.py +++ b/tooling/rl_self_play/evaluate.py @@ -51,6 +51,13 @@ def _build_argparser() -> argparse.ArgumentParser: p.add_argument("--opponent-device", default="cpu") p.add_argument("--opponent-deterministic", action="store_true", help="Argmax opponent actions (default: stochastic sampling).") + p.add_argument("--learner-deterministic", action=argparse.BooleanOptionalAction, + default=True, + help=("Argmax the evaluated (slot-0) policy. Default True. " + "For a symmetric self-play sanity check (e.g. v4 vs " + "v4, expect ~50%) pass --no-learner-deterministic so " + "both sides sample from the masked softmax — matching " + "the stochastic training-eval regime.")) return p @@ -121,7 +128,9 @@ def main() -> int: info_history: list[dict[str, object]] = [] while not done: mask = env.action_masks() - action, _ = model.predict(obs, action_masks=mask, deterministic=True) + action, _ = model.predict( + obs, action_masks=mask, deterministic=args.learner_deterministic + ) obs, reward, terminated, truncated, info = env.step(int(action)) info_history.append(info) done = terminated or truncated diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py index e509da13..8a4e0d6a 100644 --- a/tooling/rl_self_play/magic_civ_env.py +++ b/tooling/rl_self_play/magic_civ_env.py @@ -293,6 +293,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): "score": new_score, "city_count": int(view.get("score", {}).get("city_count", 0)), } + if self._opponent is not None: + # Diagnostic: how many wire events the frozen opponent's turn + # produced this step. Zero across a whole episode means the + # opponent never actually acted (e.g. stale binary not skipping + # the external slot) — the smoke asserts this is >0. + info["opp_events"] = len(opp_events) if reason: info["reason"] = reason elif step_capped: diff --git a/tooling/rl_self_play/smoke_model_opponent.py b/tooling/rl_self_play/smoke_model_opponent.py index 6542e5cc..041a31ad 100644 --- a/tooling/rl_self_play/smoke_model_opponent.py +++ b/tooling/rl_self_play/smoke_model_opponent.py @@ -56,6 +56,7 @@ def main() -> int: "max_turn_seen": 0, "mask_violations": 0, "opp_turns_implied": 0, + "opp_events_total": 0, "terminal_reason": None, } @@ -89,6 +90,7 @@ def main() -> int: details["mask_violations"] += 1 obs, reward, terminated, truncated, info = env.step(action) mask = info.get("action_mask", np.zeros_like(mask)) + details["opp_events_total"] += int(info.get("opp_events", 0)) turn = int(info.get("turn", 0)) if turn > details["max_turn_seen"]: details["max_turn_seen"] = turn @@ -110,6 +112,12 @@ def main() -> int: reasons.append("turn counter never advanced — opponent/turn loop stuck") if details["mask_violations"] > 0: reasons.append(f"{details['mask_violations']} mask violations") + if details["opp_events_total"] < 1: + reasons.append( + "opponent produced zero wire events across the run — frozen " + "opponent never acted (likely a stale binary not skipping the " + "external slot, so the simulator AI drove it instead)" + ) passed = not reasons print(json.dumps({"passed": passed, "reasons": reasons, "details": details}))