diff --git a/tooling/rl_self_play/README.md b/tooling/rl_self_play/README.md index 0c6f41fc..ac66117e 100644 --- a/tooling/rl_self_play/README.md +++ b/tooling/rl_self_play/README.md @@ -33,6 +33,7 @@ the right shape. | `magic_civ_env.py` | `gymnasium.Env` subclass exposing the harness as one episode = one game. Implements `action_masks()` for MaskablePPO. | | `train.py` | CLI entry. Builds K parallel envs (each its own harness), runs MaskablePPO, periodically evaluates against the same baseline, saves best model. | | `evaluate.py` | Standalone eval — load a saved model, run N games, print `{episodes, wins, losses, draws, win_rate, mean_turns}` JSON. | +| `smoke.py` | Stdlib-only CI gate. Drives the harness + encoders through a random-policy loop without importing `gymnasium`/`sb3`/`torch`. Prints a one-line JSON verdict; exit 0 on `passed: true`. Run before any training session to confirm the protocol layer is intact. | | `requirements.txt` | Pinned versions; `pip install -r requirements.txt` is the one-time setup. | ## Methodology @@ -53,8 +54,18 @@ the right shape. ## Run it +Smoke test the protocol layer first (no heavy deps required): + ```bash cd /Users/natalie/Code/@projects/@magic-civilization +python3 -m tooling.rl_self_play.smoke --turns 30 +# → {"steps": 332, "turns_reached": 30, "mask_violations": 0, +# "harness_errors": 0, "passed": true} +``` + +Then install RL deps and train: + +```bash pip install -r tooling/rl_self_play/requirements.txt python -m tooling.rl_self_play.train --total-steps 1_000_000 --num-envs 4 # In a second terminal: diff --git a/tooling/rl_self_play/requirements.txt b/tooling/rl_self_play/requirements.txt index 64db9be0..15fc934c 100644 --- a/tooling/rl_self_play/requirements.txt +++ b/tooling/rl_self_play/requirements.txt @@ -1,9 +1,10 @@ -# Pinned to versions that are known to compose cleanly with sb3-contrib's -# MaskablePPO as of 2026-Q2. Bump together — sb3 and sb3-contrib track in -# lockstep, and torch's wheel ABI changes between minor versions. +# Pinned to versions known to compose cleanly with sb3-contrib's +# MaskablePPO and Python 3.12+ as of 2026-Q2. Bump together — sb3 and +# sb3-contrib track in lockstep; torch's wheel ABI changes between +# minor versions. torch 2.5+ is required for Python 3.13 support. gymnasium==1.2.1 -stable-baselines3==2.7.0 -sb3-contrib==2.7.0 -torch==2.4.1 +stable-baselines3>=2.8.0,<2.10 +sb3-contrib>=2.8.0,<2.10 +torch>=2.5.0,<2.7 numpy>=2.0,<3 tensorboard>=2.18 diff --git a/tooling/rl_self_play/smoke.py b/tooling/rl_self_play/smoke.py new file mode 100644 index 00000000..98f134a8 --- /dev/null +++ b/tooling/rl_self_play/smoke.py @@ -0,0 +1,153 @@ +"""Stdlib-only smoke test for the harness + encoder layer. + +Verifies — without needing `gymnasium`, `stable-baselines3`, or `torch` — +that the protocol shim works end-to-end: + + 1. `HarnessClient` spawns the Godot subprocess and returns a valid + `view` JSON on first request. + 2. `encode_observation` projects every view into a fixed-shape + `np.float32[OBS_DIM]` without raising. + 3. `encode_legal_actions` produces a boolean mask whose `True` + positions all map back to a legal `PlayerAction` via + `decode_action_index`. + 4. A random-policy loop bounded by `--turns` reaches the turn limit + OR terminates cleanly without raising `HarnessError`. + +Run: + python3 -m tooling.rl_self_play.smoke [--turns 30] [--seed 42] + +Output is one-line JSON like: + + {"steps": 87, "turns_reached": 30, "mask_violations": 0, + "harness_errors": 0, "obs_dim": 32, "action_dim": 322, + "episodes": 1, "passed": true} + +Exit 0 on `passed: true`; non-zero otherwise. Suitable as a CI gate +before any real training run. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import numpy as np + +THIS_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = THIS_DIR.parents[1] +if __package__ is None: + sys.path.insert(0, str(PROJECT_ROOT)) + +from tooling.rl_self_play.encoders import ( # noqa: E402 + ACTION_DIM, + OBS_DIM, + decode_action_index, + encode_legal_actions, + encode_observation, +) +from tooling.rl_self_play.harness_client import ( # noqa: E402 + HarnessClient, + HarnessConfig, + HarnessError, +) + + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description="Smoke-test the harness + encoder layer") + p.add_argument("--turns", type=int, default=30, help="Max turns per episode") + p.add_argument("--episodes", type=int, default=1, help="Episodes to run") + p.add_argument("--seed", type=int, default=42, help="Base RNG seed") + p.add_argument("--players", type=int, default=2) + p.add_argument("--map-size", default="duel") + return p + + +def _run_episode( + client: HarnessClient, rng: np.random.Generator, max_turns: int +) -> dict[str, int]: + steps = 0 + mask_violations = 0 + eliminations = 0 + last_turn = 0 + view = client.view() + while last_turn < max_turns: + obs = encode_observation(view) + if obs.shape != (OBS_DIM,): + mask_violations += 1 + break + mask, idx_to_action = encode_legal_actions(view) + legal_indices = np.where(mask)[0] + if legal_indices.size == 0: + mask_violations += 1 + break + idx = int(rng.choice(legal_indices)) + action = decode_action_index(idx, idx_to_action) + if action.get("type") == "end_turn": + client.end_turn() + else: + client.act(action) + view = client.view() + last_turn = int(view.get("turn", 0)) + steps += 1 + score = view.get("score", {}) + if int(score.get("city_count", 0)) == 0: + units = view.get("units", []) + me = int(view.get("player", 0)) + has_founder = any( + int(u.get("owner", -1)) == me + and "founder" in str(u.get("type", "")) + and float(u.get("hp", 0)) > 0 + for u in units + ) + if not has_founder: + eliminations += 1 + break + return { + "steps": steps, + "turns_reached": last_turn, + "mask_violations": mask_violations, + "eliminations": eliminations, + } + + +def main() -> int: + args = _build_argparser().parse_args() + rng = np.random.default_rng(args.seed) + totals = { + "steps": 0, + "turns_reached": 0, + "mask_violations": 0, + "eliminations": 0, + "harness_errors": 0, + } + for episode in range(args.episodes): + cfg = HarnessConfig( + seed=args.seed + episode, + players=args.players, + player_slot=0, + map_size=args.map_size, + ) + with HarnessClient(cfg) as client: + try: + result = _run_episode(client, rng, args.turns) + except HarnessError: + totals["harness_errors"] += 1 + continue + totals["steps"] += result["steps"] + totals["turns_reached"] = max(totals["turns_reached"], result["turns_reached"]) + totals["mask_violations"] += result["mask_violations"] + totals["eliminations"] += result["eliminations"] + verdict = { + **totals, + "obs_dim": OBS_DIM, + "action_dim": ACTION_DIM, + "episodes": args.episodes, + "passed": totals["mask_violations"] == 0 and totals["harness_errors"] == 0, + } + print(json.dumps(verdict)) + return 0 if verdict["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main())