From de5fbd42c436a89b2249ed0d7a7d8472dad4271a Mon Sep 17 00:00:00 2001 From: Natalie Date: Sun, 17 May 2026 04:02:09 -0700 Subject: [PATCH] =?UTF-8?q?feat(tooling):=20=E2=9C=A8=20add=20apricot=20gp?= =?UTF-8?q?u=20device=20guidance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- tooling/rl_self_play/README.md | 20 ++++++++++++++++++++ tooling/rl_self_play/train.py | 27 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/tooling/rl_self_play/README.md b/tooling/rl_self_play/README.md index ac66117e..86e8ea40 100644 --- a/tooling/rl_self_play/README.md +++ b/tooling/rl_self_play/README.md @@ -72,6 +72,26 @@ python -m tooling.rl_self_play.train --total-steps 1_000_000 --num-envs 4 tensorboard --logdir tooling/rl_self_play/runs/ ``` +### Apricot GPU layout + +Apricot has 2× NVIDIA RTX 3090 (24 GB each). The typical division of work: + +- `cuda:0` — model-boss inference / commit-message daemon (frequently busy). +- `cuda:1` — free; use this for RL training to avoid contention. + +```bash +ssh apricot +cd ~/Code/project-buildspace/magic-civilization # or wherever the canonical checkout lives +pip install -r tooling/rl_self_play/requirements.txt # one-time +python -m tooling.rl_self_play.train --device cuda:1 --num-envs 8 --total-steps 5_000_000 +``` + +`--device auto` is the safe default for a single-GPU box or a local Mac: it +resolves to `cuda` when available, then `mps` (Apple Silicon), else `cpu`. The MlpPolicy this scaffold uses fits in +well under 1 GB of VRAM, so the bottleneck is the harness CPU subprocesses +rather than the GPU. Raise `--num-envs` (one harness subprocess per env) to keep +the GPU fed.
+ For evaluation only (no training): ```bash diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py index eb861e91..a6567e8c 100644 --- a/tooling/rl_self_play/train.py +++ b/tooling/rl_self_play/train.py @@ -68,6 +68,12 @@ def _build_argparser() -> argparse.ArgumentParser: help="Subdirectory under runs/ + models/ (default: duel-v1).") p.add_argument("--seed", type=int, default=42, help="Base RNG seed; per-env seeds offset from this (default: 42).") + p.add_argument("--device", default="auto", + help=("Torch device for the policy net: 'auto' (default — " + "picks cuda if available, else cpu), 'cuda', " + "'cuda:1' (second GPU), 'mps' (Apple Silicon), or " + "'cpu'. On apricot, prefer 'cuda:1' so cuda:0 stays " + "free for model-boss / MCTS rollouts.")) return p @@ -130,12 +136,33 @@ def main() -> int: render=False, ) + # Resolve `--device` for logging clarity — sb3 accepts 'auto' but we + # want to print exactly which device the rollouts will land on so a + # multi-GPU box (apricot has 2× RTX 3090) can be confirmed at a glance. + import torch # type: ignore[import-not-found] + + if args.device == "auto": + if torch.cuda.is_available(): + resolved_device = "cuda" + elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available(): + resolved_device = "mps" + else: + resolved_device = "cpu" + else: + resolved_device = args.device + print( + f"policy device: {resolved_device} " + f"(cuda_available={torch.cuda.is_available()}, " + f"cuda_devices={torch.cuda.device_count() if torch.cuda.is_available() else 0})" + ) + model = MaskablePPO( "MlpPolicy", train_env, verbose=1, tensorboard_log=str(run_dir), seed=args.seed, + device=resolved_device, n_steps=512, batch_size=128, learning_rate=3e-4,