From dbeb3f408810186c7eb24a479212537f7beefa41 Mon Sep 17 00:00:00 2001
From: autocommit <autocommit@ftw.codes>
Date: Wed, 27 May 2026 20:26:00 -0700
Subject: [PATCH] =?UTF-8?q?test(rl-self-play):=20=E2=9C=85=20Add=20evaluat?=
 =?UTF-8?q?ion=20functions,=20opponent=20models,=20and=20smoke=20tests=20f?=
 =?UTF-8?q?or=20divergence=20mining=20in=20RL=20self-play=20tools?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 tooling/rl_self_play/evaluate.py             | 11 ++++++++++-
 tooling/rl_self_play/magic_civ_env.py        |  6 ++++++
 tooling/rl_self_play/smoke_model_opponent.py |  8 ++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/tooling/rl_self_play/evaluate.py b/tooling/rl_self_play/evaluate.py
index 238745c7..2efd04d9 100644
--- a/tooling/rl_self_play/evaluate.py
+++ b/tooling/rl_self_play/evaluate.py
@@ -51,6 +51,13 @@ def _build_argparser() -> argparse.ArgumentParser:
     p.add_argument("--opponent-device", default="cpu")
     p.add_argument("--opponent-deterministic", action="store_true",
                    help="Argmax opponent actions (default: stochastic sampling).")
+    p.add_argument("--learner-deterministic", action=argparse.BooleanOptionalAction,
+                   default=True,
+                   help=("Argmax the evaluated (slot-0) policy. Default True. "
+                         "For a symmetric self-play sanity check (e.g. v4 vs "
+                         "v4, expect ~50%) pass --no-learner-deterministic so "
+                         "both sides sample from the masked softmax — matching "
+                         "the stochastic training-eval regime."))
     return p
 
 
@@ -121,7 +128,9 @@ def main() -> int:
             info_history: list[dict[str, object]] = []
             while not done:
                 mask = env.action_masks()
-                action, _ = model.predict(obs, action_masks=mask, deterministic=True)
+                action, _ = model.predict(
+                    obs, action_masks=mask, deterministic=args.learner_deterministic
+                )
                 obs, reward, terminated, truncated, info = env.step(int(action))
                 info_history.append(info)
                 done = terminated or truncated
diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index e509da13..8a4e0d6a 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -293,6 +293,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
             "score": new_score,
             "city_count": int(view.get("score", {}).get("city_count", 0)),
         }
+        if self._opponent is not None:
+            # Diagnostic: how many wire events the frozen opponent's turn
+            # produced this step. Zero across a whole episode means the
+            # opponent never actually acted (e.g. stale binary not skipping
+            # the external slot) — the smoke asserts this is >0.
+            info["opp_events"] = len(opp_events)
         if reason:
             info["reason"] = reason
         elif step_capped:
diff --git a/tooling/rl_self_play/smoke_model_opponent.py b/tooling/rl_self_play/smoke_model_opponent.py
index 6542e5cc..041a31ad 100644
--- a/tooling/rl_self_play/smoke_model_opponent.py
+++ b/tooling/rl_self_play/smoke_model_opponent.py
@@ -56,6 +56,7 @@ def main() -> int:
         "max_turn_seen": 0,
         "mask_violations": 0,
         "opp_turns_implied": 0,
+        "opp_events_total": 0,
         "terminal_reason": None,
     }
 
@@ -89,6 +90,7 @@ def main() -> int:
                 details["mask_violations"] += 1
             obs, reward, terminated, truncated, info = env.step(action)
             mask = info.get("action_mask", np.zeros_like(mask))
+            details["opp_events_total"] += int(info.get("opp_events", 0))
             turn = int(info.get("turn", 0))
             if turn > details["max_turn_seen"]:
                 details["max_turn_seen"] = turn
@@ -110,6 +112,12 @@ def main() -> int:
         reasons.append("turn counter never advanced — opponent/turn loop stuck")
     if details["mask_violations"] > 0:
         reasons.append(f"{details['mask_violations']} mask violations")
+    if details["opp_events_total"] < 1:
+        reasons.append(
+            "opponent produced zero wire events across the run — frozen "
+            "opponent never acted (likely a stale binary not skipping the "
+            "external slot, so the simulator AI drove it instead)"
+        )
 
     passed = not reasons
     print(json.dumps({"passed": passed, "reasons": reasons, "details": details}))