fix(@projects/@magic-civilization): 🐛 improve pid detection in rl scripts

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-05-17 05:28:24 -07:00
parent a6f909a151
commit 4a862b76fb
2 changed files with 15 additions and 6 deletions

View file

@ -48,7 +48,7 @@ case "$cmd" in
remote "
set +e
echo '---PYTHON PID---'
py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
if [ -z \"\$py\" ]; then
echo 'no training process'
echo '---EVAL DIR (${EVAL_DIR_REMOTE})---'
@ -67,7 +67,7 @@ case "$cmd" in
logs)
n="${1:-60}"
remote "
py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
py=\$(ps -eo pid,comm,args | awk '\$2 ~ /^python/ && /rl_self_play.train/ {print \$1; exit}')
if [ -z \"\$py\" ]; then echo 'no training process'; exit 1; fi
tail -${n} \"\$(readlink /proc/\$py/fd/1)\"
"

View file

@ -43,10 +43,17 @@ WIN_REWARD = 1.0
LOSS_REWARD = -1.0
DRAW_REWARD = 0.0
# Per-Gym-step ceiling on micro-actions before forcing end_turn. Without
# this, a policy stuck in a loop (e.g. fortify→unfortify→fortify) would
# hang the env forever. 64 is generous for a duel game's per-turn budget.
MAX_MICRO_ACTIONS_PER_TURN = 64
# Hard ceiling on env.step() calls per episode. A policy that learned
# "ending the turn lowers my reward" would otherwise produce episodes
# of unbounded length (observed: 1.3M harness round-trips in a single
# eval episode). A total-episode budget catches that without biasing
# intra-turn behavior — players in late game with hundreds of units
# legitimately have hundreds of micro-actions per turn, so a per-turn
# cap would interfere with normal play. 50k bounds eval wall-clock to
# ~17 min at 50 fps (50_000 / 50 = 1_000 s) while sitting well above the
# length of real PPO eval games. Note the theoretical worst case
# (200 units * 200 turns * 5 acts/unit = 200k) exceeds this cap, so such
# an episode would be cut short — but real PPO eval games end far earlier.
DEFAULT_MAX_STEPS_PER_EPISODE = 50_000
class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
@ -59,10 +66,12 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
self,
harness_config: HarnessConfig | None = None,
max_turns: int = 200,
max_micro_actions_per_turn: int = DEFAULT_MAX_MICRO_ACTIONS_PER_TURN,
) -> None:
super().__init__()
self._config = harness_config or HarnessConfig()
self._max_turns = max_turns
self._max_micro_actions_per_turn = max_micro_actions_per_turn
self.observation_space = spaces.Box(
low=-1e6, high=1e6, shape=(OBS_DIM,), dtype=np.float32
)