#!/usr/bin/env bash
# rl-train.sh — manage the RL self-play training run on apricot.
#
# Subcommands:
#   status     Show PID, elapsed, fps, last train block, eval dir state
#   logs [N]   Tail N (default 60) lines of the active training log
#   gpu        Show nvidia-smi snapshot
#   procs      Count godot/python procs related to training
#   launch     Start training (env-driven; see ENV below)
#   kill       Stop training (SIGTERM, then SIGKILL after 5s); also SIGKILLs godot-bin procs
#   sync       scp local tooling/rl_self_play/*.py to the apricot worktree
#
# Env vars (with defaults):
#   RL_HOST=apricot                 SSH alias for the training box
#   RL_WORKTREE=/var/home/lilith/.cache/mc-rl-train-1779015795
#   RL_RUN_NAME=duel-v1b-cuda1
#   RL_DEVICE=cuda:1
#   RL_ENVS=4
#   RL_TOTAL_STEPS=200000
#   RL_EVAL_FREQ=20000
#   RL_EVAL_EPS=10
#   RL_MAX_TURNS=100
#   RL_PIDFILE=/tmp/rl-train.pid    (on the remote)

# Abort on command failure, on use of unset variables, and on failures
# anywhere inside a pipeline.
set -euo pipefail

# Connection and run parameters. Each keeps a value already present in the
# environment and otherwise falls back to the default shown here.
: "${RL_HOST:=apricot}"
: "${RL_WORKTREE:=/var/home/lilith/.cache/mc-rl-train-1779015795}"
: "${RL_RUN_NAME:=duel-v1b-cuda1}"
: "${RL_DEVICE:=cuda:1}"
: "${RL_ENVS:=4}"
: "${RL_TOTAL_STEPS:=200000}"
: "${RL_EVAL_FREQ:=20000}"
: "${RL_EVAL_EPS:=10}"
: "${RL_MAX_TURNS:=100}"
: "${RL_PIDFILE:=/tmp/rl-train.pid}"

# Remote paths derived from the worktree and the run name.
LOG_REMOTE="${RL_WORKTREE}/training-${RL_RUN_NAME}.log"
EVAL_DIR_REMOTE="${RL_WORKTREE}/tooling/rl_self_play/runs/${RL_RUN_NAME}/eval"

# The first argument selects the subcommand (default: status). Any remaining
# arguments stay in "$@" for the subcommand to consume.
cmd="${1:-status}"
if (( $# > 0 )); then
  shift
fi

#######################################
# Run a shell snippet on the training host.
# Globals:   RL_HOST (read)
# Arguments: $1 - script text to execute remotely
# Outputs:   whatever the remote script writes to stdout/stderr
# Returns:   exit status reported by ssh (the remote script's status)
#######################################
remote() {
  # The script is sent on stdin to `bash -s` rather than as the ssh command
  # argument. As an argument, the whole script text would sit on the remote
  # shell's command line, where `pgrep -f` / `pkill -f` patterns contained in
  # the script would match (and, for pkill, signal) that shell itself.
  ssh "${RL_HOST}" 'bash -s' <<<"$1"
}

# Subcommand dispatch. Most arms build a small script as a double-quoted
# string and hand it to remote(): unescaped ${...} expands locally before
# sending, while \$... is left for the remote shell to expand.
#
# Values that are expanded locally are shell-quoted first, so spaces or shell
# metacharacters in a path or run name cannot change the remote command.
# printf %q emits bash-style quoting; plain values pass through unchanged.
printf -v q_worktree '%q' "${RL_WORKTREE}"
printf -v q_run_name '%q' "${RL_RUN_NAME}"
printf -v q_device '%q' "${RL_DEVICE}"
printf -v q_envs '%q' "${RL_ENVS}"
printf -v q_total_steps '%q' "${RL_TOTAL_STEPS}"
printf -v q_eval_freq '%q' "${RL_EVAL_FREQ}"
printf -v q_eval_eps '%q' "${RL_EVAL_EPS}"
printf -v q_max_turns '%q' "${RL_MAX_TURNS}"
printf -v q_pidfile '%q' "${RL_PIDFILE}"
printf -v q_log '%q' "${LOG_REMOTE}"
printf -v q_eval_dir '%q' "${EVAL_DIR_REMOTE}"

case "$cmd" in
  status)
    # Report the first matching trainer process, the tail of the file its
    # stdout points at, and the contents of the eval directory.
    remote "
      set +e
      eval_dir=${q_eval_dir}
      show_eval_dir() {
        echo \"---EVAL DIR (\${eval_dir})---\"
        ls -la -- \"\${eval_dir}\" 2>/dev/null || echo 'missing'
      }
      echo '---PYTHON PID---'
      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
      if [ -z \"\$py\" ]; then
        echo 'no training process'
        show_eval_dir
        exit 0
      fi
      ps -p \"\$py\" -o pid,etime,pcpu,pmem,cmd
      log=\$(readlink \"/proc/\$py/fd/1\" 2>/dev/null)
      echo \"---LOG (\$log)---\"
      tail -n 40 \"\$log\" 2>/dev/null || echo 'log not readable'
      show_eval_dir
    "
    ;;

  logs)
    # N is spliced into the remote command line, so accept digits only.
    n="${1:-60}"
    if [[ ! "$n" =~ ^[0-9]+$ ]]; then
      printf 'logs: N must be a non-negative integer, got: %s\n' "$n" >&2
      exit 2
    fi
    remote "
      py=\$(pgrep -f 'python3 -m tooling.rl_self_play.train' | head -1)
      if [ -z \"\$py\" ]; then echo 'no training process'; exit 1; fi
      tail -n ${n} \"\$(readlink \"/proc/\$py/fd/1\")\"
    "
    ;;

  gpu)
    remote 'nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu --format=csv'
    ;;

  procs)
    remote "
      printf 'python train procs: '; pgrep -af 'python3 -m tooling.rl_self_play.train' | wc -l
      printf 'godot-bin procs: '; pgrep -af 'godot-bin' | wc -l
      uptime
    "
    ;;

  launch)
    # Refuse to start a second trainer; otherwise start one detached with its
    # output going to the run log, record its PID, and verify after 3 seconds
    # that it is still alive. A failed launch exits non-zero so callers and
    # `set -e` can see it.
    remote "
      cd ${q_worktree} || exit 1
      if pgrep -f 'python3 -m tooling.rl_self_play.train' >/dev/null; then
        echo 'training already running; run kill first'
        pgrep -af 'python3 -m tooling.rl_self_play.train'
        exit 1
      fi
      nohup python3 -m tooling.rl_self_play.train \
        --device ${q_device} --num-envs ${q_envs} \
        --total-steps ${q_total_steps} --eval-freq ${q_eval_freq} \
        --eval-episodes ${q_eval_eps} --max-turns ${q_max_turns} \
        --run-name ${q_run_name} > ${q_log} 2>&1 &
      echo \$! > ${q_pidfile}
      sleep 3
      if ! pgrep -af 'python3 -m tooling.rl_self_play.train'; then
        echo 'launch failed; check log' >&2
        tail -n 20 ${q_log} >&2
        exit 1
      fi
    "
    ;;

  kill)
    # SIGTERM first, give the trainer 5 seconds, then SIGKILL whatever is
    # left, including godot-bin processes started with --path. The pkill
    # calls are best-effort: "nothing matched" is not an error here.
    remote "
      pkill -f 'rl_self_play.train' 2>/dev/null || true
      sleep 5
      pkill -9 -f 'rl_self_play.train' 2>/dev/null || true
      pkill -9 -f 'godot-bin --path' 2>/dev/null || true
      # Drop the pidfile written by launch so it cannot go stale.
      rm -f -- ${q_pidfile}
      printf 'remaining: '
      pgrep -f 'rl_self_play.train|godot-bin' | wc -l
    "
    ;;

  sync)
    # Copy the local *.py sources (relative to this script's parent
    # directory) into the remote worktree.
    here="$(cd "$(dirname -- "$0")/.." && pwd)"
    scp -q "${here}/tooling/rl_self_play/"*.py \
      "${RL_HOST}:${RL_WORKTREE}/tooling/rl_self_play/"
    echo "synced tooling/rl_self_play/*.py to ${RL_HOST}:${RL_WORKTREE}"
    ;;

  *)
    # Unknown subcommand: print the leading comment header as usage. Stop at
    # the first non-comment line instead of a fixed line range, so no code
    # leaks into the usage text when the header changes length.
    awk 'NR > 1 { if ($0 !~ /^#/) exit; print }' "$0" >&2
    exit 2
    ;;
esac