From 4a862b76fbd65f0c8f304d1803f470da05d29c1f Mon Sep 17 00:00:00 2001
From: Natalie <natalie@lilithuwu.com>
Date: Sun, 17 May 2026 05:28:24 -0700
Subject: [PATCH] =?UTF-8?q?fix(@projects/@magic-civilization):=20?=
 =?UTF-8?q?=F0=9F=90=9B=20improve=20pid=20detection=20in=20rl=20scripts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 scripts/rl-train.sh                   |  4 ++--
 tooling/rl_self_play/magic_civ_env.py | 17 +++++++++++++----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/scripts/rl-train.sh b/scripts/rl-train.sh
index 85908988..ec638c15 100755
--- a/scripts/rl-train.sh
+++ b/scripts/rl-train.sh
@@ -48,7 +48,7 @@ case "$cmd" in
     remote "
       set +e
       echo '---PYTHON PID---'
-      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
+      py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
       if [ -z \"\$py\" ]; then
         echo 'no training process'
         echo '---EVAL DIR (${EVAL_DIR_REMOTE})---'
@@ -67,7 +67,7 @@ case "$cmd" in
   logs)
     n="${1:-60}"
     remote "
-      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
+      py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
       if [ -z \"\$py\" ]; then echo 'no training process'; exit 1; fi
       tail -${n} \"\$(readlink /proc/\$py/fd/1)\"
     "
diff --git a/tooling/rl_self_play/magic_civ_env.py b/tooling/rl_self_play/magic_civ_env.py
index 2c51d577..0f1dc6fb 100644
--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@@ -43,10 +43,17 @@ WIN_REWARD = 1.0
 LOSS_REWARD = -1.0
 DRAW_REWARD = 0.0
 
-# Per-Gym-step ceiling on micro-actions before forcing end_turn. Without
-# this, a policy stuck in a loop (e.g. fortify→unfortify→fortify) would
-# hang the env forever. 64 is generous for a duel game's per-turn budget.
-MAX_MICRO_ACTIONS_PER_TURN = 64
+# Hard ceiling on env.step() calls per episode. A policy that learned
+# "ending the turn lowers my reward" would otherwise produce episodes
+# of unbounded length (observed: 1.3M harness round-trips in a single
+# eval episode). A total-episode budget catches that without biasing
+# intra-turn behavior — players in late game with hundreds of units
+# legitimately have hundreds of micro-actions per turn, so a per-turn
+# cap would interfere with normal play. 50k bounds eval wall-clock to
+# ~17 min at 50 fps while sitting well above the length of real PPO
+# eval games, which end far earlier than the theoretical worst case
+# (200 units * 200 turns * 5 acts/unit = 200k, which exceeds this cap).
+DEFAULT_MAX_STEPS_PER_EPISODE = 50_000
 
 
 class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
@@ -59,10 +66,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
         self,
         harness_config: HarnessConfig | None = None,
         max_turns: int = 200,
+        max_micro_actions_per_turn: int = DEFAULT_MAX_MICRO_ACTIONS_PER_TURN,
     ) -> None:
         super().__init__()
         self._config = harness_config or HarnessConfig()
         self._max_turns = max_turns
+        self._max_micro_actions_per_turn = max_micro_actions_per_turn
         self.observation_space = spaces.Box(
             low=-1e6, high=1e6, shape=(OBS_DIM,), dtype=np.float32
         )