From de5fbd42c436a89b2249ed0d7a7d8472dad4271a Mon Sep 17 00:00:00 2001
From: Natalie <natalie@lilithuwu.com>
Date: Sun, 17 May 2026 04:02:09 -0700
Subject: [PATCH] =?UTF-8?q?feat(tooling):=20=E2=9C=A8=20add=20apricot=20gp?=
 =?UTF-8?q?u=20device=20guidance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 tooling/rl_self_play/README.md | 20 ++++++++++++++++++++
 tooling/rl_self_play/train.py  | 27 +++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/tooling/rl_self_play/README.md b/tooling/rl_self_play/README.md
index ac66117e..86e8ea40 100644
--- a/tooling/rl_self_play/README.md
+++ b/tooling/rl_self_play/README.md
@@ -72,6 +72,26 @@ python -m tooling.rl_self_play.train --total-steps 1_000_000 --num-envs 4
 tensorboard --logdir tooling/rl_self_play/runs/
 ```
 
+### Apricot GPU layout
+
+Apricot has 2× NVIDIA RTX 3090 (24 GB each). The typical division:
+
+- `cuda:0` — model-boss inference / commit-message daemon (frequently busy).
+- `cuda:1` — free; use this for RL training to avoid contention.
+
+```bash
+ssh apricot
+cd ~/Code/project-buildspace/magic-civilization   # or wherever the canonical checkout lives
+pip install -r tooling/rl_self_play/requirements.txt   # one-time
+python -m tooling.rl_self_play.train --device cuda:1 --num-envs 8 --total-steps 5_000_000
+```
+
+`--device auto` is the safe default for a single-GPU box or a local Mac:
+it picks `cuda` if available, then `mps` (Apple Silicon), else `cpu`.
+The MlpPolicy this scaffold uses fits in well under 1 GB of VRAM, so the
+bottleneck is the harness CPU subprocesses rather than the GPU. Raise
+`--num-envs` (one harness per env) to keep the GPU fed.
+
 For evaluation only (no training):
 
 ```bash
diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py
index eb861e91..a6567e8c 100644
--- a/tooling/rl_self_play/train.py
+++ b/tooling/rl_self_play/train.py
@@ -68,6 +68,12 @@ def _build_argparser() -> argparse.ArgumentParser:
                    help="Subdirectory under runs/ + models/ (default: duel-v1).")
     p.add_argument("--seed", type=int, default=42,
                    help="Base RNG seed; per-env seeds offset from this (default: 42).")
+    p.add_argument("--device", default="auto",
+                   help=("Torch device for the policy net: 'auto' (default — "
+                         "picks cuda if available, else cpu), 'cuda', "
+                         "'cuda:1' (second GPU), 'mps' (Apple Silicon), or "
+                         "'cpu'. On apricot, prefer 'cuda:1' so cuda:0 stays "
+                         "free for model-boss / MCTS rollouts."))
     return p
 
 
@@ -130,12 +136,33 @@ def main() -> int:
         render=False,
     )
 
+    # Resolve `--device` ourselves (sb3 also accepts 'auto') so we can print
+    # and pass the exact device: 'auto' → cuda, else mps, else cpu. Lets a
+    # multi-GPU box (apricot has 2× RTX 3090) be confirmed at a glance.
+    import torch  # type: ignore[import-not-found]
+
+    if args.device == "auto":
+        if torch.cuda.is_available():
+            resolved_device = "cuda"
+        elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
+            resolved_device = "mps"
+        else:
+            resolved_device = "cpu"
+    else:
+        resolved_device = args.device
+    print(
+        f"policy device: {resolved_device}  "
+        f"(cuda_available={torch.cuda.is_available()}, "
+        f"cuda_devices={torch.cuda.device_count() if torch.cuda.is_available() else 0})"
+    )
+
     model = MaskablePPO(
         "MlpPolicy",
         train_env,
         verbose=1,
         tensorboard_log=str(run_dir),
         seed=args.seed,
+        device=resolved_device,
         n_steps=512,
         batch_size=128,
         learning_rate=3e-4,