diff --git a/.mcp.json b/.mcp.json index d59226bf..0d0967a0 100644 --- a/.mcp.json +++ b/.mcp.json @@ -15,7 +15,7 @@ ], "env": { "CP_PLAYERS": "2", - "CP_CLAUDE_SLOT": "0", + "CP_PLAYER_SLOT": "0", "CP_MAP_SIZE": "duel", "CP_MAP_TYPE": "continents" } diff --git a/scripts/claude-demo-25turn.sh b/scripts/claude-demo-25turn.sh index 3c0ef6db..2a816925 100755 --- a/scripts/claude-demo-25turn.sh +++ b/scripts/claude-demo-25turn.sh @@ -14,7 +14,7 @@ # CP_TURNS (default 25) — number of EndTurns to issue # CP_SEED (default 42) # CP_PLAYERS (default 3) -# CP_CLAUDE_SLOT (default 0) +# CP_PLAYER_SLOT (default 0) # CP_MAP_SIZE (default duel) # CP_TIMEOUT_SEC (default 600) — harness wallclock budget # @@ -27,11 +27,11 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" : "${CP_TURNS:=25}" : "${CP_SEED:=42}" : "${CP_PLAYERS:=3}" -: "${CP_CLAUDE_SLOT:=0}" +: "${CP_PLAYER_SLOT:=0}" : "${CP_MAP_SIZE:=duel}" : "${CP_TIMEOUT_SEC:=600}" -export CP_SEED CP_PLAYERS CP_CLAUDE_SLOT CP_MAP_SIZE CP_TIMEOUT_SEC +export CP_SEED CP_PLAYERS CP_PLAYER_SLOT CP_MAP_SIZE CP_TIMEOUT_SEC TMP=$(mktemp -d -t mc-demo25-XXXXXX) trap "rm -rf '$TMP'" EXIT diff --git a/scripts/claude-smoke-5endturn.sh b/scripts/claude-smoke-5endturn.sh index e84b1bf4..6a5c251e 100755 --- a/scripts/claude-smoke-5endturn.sh +++ b/scripts/claude-smoke-5endturn.sh @@ -10,7 +10,7 @@ # Passing requires `actions_applied > 0` on every turn 1..5 for every AI slot. # Exits 0 on pass, 1 on fail. # -# Env: CP_SEED, CP_PLAYERS (default 3), CP_CLAUDE_SLOT (default 0), CP_MAP_SIZE. +# Env: CP_SEED, CP_PLAYERS (default 3), CP_PLAYER_SLOT (default 0), CP_MAP_SIZE. set -uo pipefail @@ -18,12 +18,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_DIR="$(dirname "$SCRIPT_DIR")" : "${CP_PLAYERS:=3}" -: "${CP_CLAUDE_SLOT:=0}" +: "${CP_PLAYER_SLOT:=0}" : "${CP_SEED:=42}" : "${CP_MAP_SIZE:=duel}" : "${SMOKE_TURNS:=5}" -export CP_PLAYERS CP_CLAUDE_SLOT CP_SEED CP_MAP_SIZE +export CP_PLAYERS CP_PLAYER_SLOT CP_SEED CP_MAP_SIZE TMP=$(mktemp -d -t mc-smoke-XXXXXX) trap "rm -rf '$TMP'" EXIT diff --git a/scripts/p2-72-option-b-render.sh b/scripts/p2-72-option-b-render.sh index a1b6b987..9881836c 100755 --- a/scripts/p2-72-option-b-render.sh +++ b/scripts/p2-72-option-b-render.sh @@ -13,7 +13,7 @@ # Env vars (forwarded into the sandbox): # CP_SEED (default 42) # CP_PLAYERS (default 3) -# CP_CLAUDE_SLOT (default 0) +# CP_PLAYER_SLOT (default 0) # CP_MAP_SIZE (default duel) # CP_TURNS (default 25) # CP_SCREENSHOT_EVERY (default 1) @@ -31,7 +31,7 @@ PROJECT_DIR="$(dirname "$SCRIPT_DIR")" : "${CP_SEED:=42}" : "${CP_PLAYERS:=3}" -: "${CP_CLAUDE_SLOT:=0}" +: "${CP_PLAYER_SLOT:=0}" : "${CP_MAP_SIZE:=duel}" : "${CP_TURNS:=25}" : "${CP_SCREENSHOT_EVERY:=1}" @@ -81,7 +81,7 @@ timeout "$CP_TIMEOUT_SEC" flatpak run --user \ --filesystem=xdg-run/${WESTON_SOCKET} \ --env=CP_SEED="$CP_SEED" \ --env=CP_PLAYERS="$CP_PLAYERS" \ - --env=CP_CLAUDE_SLOT="$CP_CLAUDE_SLOT" \ + --env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \ --env=CP_MAP_SIZE="$CP_MAP_SIZE" \ --env=CP_TURNS="$CP_TURNS" \ --env=CP_SCREENSHOT_EVERY="$CP_SCREENSHOT_EVERY" \ diff --git a/scripts/player-api-server.sh b/scripts/player-api-server.sh index 92d5db4d..57f5aac8 100755 --- a/scripts/player-api-server.sh +++ b/scripts/player-api-server.sh @@ -9,10 +9,10 @@ # Python `subprocess.Popen`, a smoke-test shell script, etc. # # Env vars (see PLAYER_API.md for the full schema): -# CP_SEED, CP_PLAYERS, CP_CLAUDE_SLOT, CP_MAP_SIZE, CP_MAP_TYPE, +# CP_SEED, CP_PLAYERS, CP_PLAYER_SLOT, CP_MAP_SIZE, CP_MAP_TYPE, # CP_OMNISCIENT, CP_TIMEOUT_SEC, CP_LOG_FILE. # -# `CP_CLAUDE_SLOT` is the env-var name the harness has used since p2-67; +# `CP_PLAYER_SLOT` is the env-var name the harness has used since p2-67; # it identifies the externally-controlled player slot — kept as-is for # backward compatibility with existing clients. Despite the name it is # not Claude-specific. @@ -28,7 +28,7 @@ PROJECT_DIR="$(dirname "$SCRIPT_DIR")" # Defaults — adapter overrides via env. : "${CP_SEED:=42}" : "${CP_PLAYERS:=2}" -: "${CP_CLAUDE_SLOT:=0}" +: "${CP_PLAYER_SLOT:=0}" : "${CP_MAP_SIZE:=duel}" : "${CP_MAP_TYPE:=continents}" : "${CP_OMNISCIENT:=0}" @@ -41,7 +41,7 @@ PROJECT_DIR="$(dirname "$SCRIPT_DIR")" # macOS uses the locally-installed `godot` binary (Homebrew); a parallel # flatpak runtime just for this harness is silly when native Godot 4 # works directly. Env-var passthrough is automatic for the native path. -export CP_SEED CP_PLAYERS CP_CLAUDE_SLOT CP_MAP_SIZE CP_MAP_TYPE \ +export CP_SEED CP_PLAYERS CP_PLAYER_SLOT CP_MAP_SIZE CP_MAP_TYPE \ CP_OMNISCIENT CP_TIMEOUT_SEC CP_LOG_FILE case "$(uname -s)" in @@ -61,7 +61,7 @@ case "$(uname -s)" in exec flatpak run --user \ --env=CP_SEED="$CP_SEED" \ --env=CP_PLAYERS="$CP_PLAYERS" \ - --env=CP_CLAUDE_SLOT="$CP_CLAUDE_SLOT" \ + --env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \ --env=CP_MAP_SIZE="$CP_MAP_SIZE" \ --env=CP_MAP_TYPE="$CP_MAP_TYPE" \ --env=CP_OMNISCIENT="$CP_OMNISCIENT" \ diff --git a/src/game/engine/scenes/headless/player_api_main.gd b/src/game/engine/scenes/headless/player_api_main.gd index cdb27115..7f1cc8a1 100644 --- a/src/game/engine/scenes/headless/player_api_main.gd +++ b/src/game/engine/scenes/headless/player_api_main.gd @@ -1,5 +1,5 @@ extends Node -## Headless harness for the generic external-player JSON-Lines API. +## Headless harness for the external-player JSON-Lines API. ## ## Boots a seeded GameState, instantiates a `GdPlayerApi`, then enters ## a JSON-Lines pump on stdin/stdout. Each line in is one `Request` @@ -7,16 +7,14 @@ extends Node ## one `Notification`. The protocol contract lives in ## `src/game/engine/docs/PLAYER_API.md`. ## -## Originally introduced in p2-67 as `claude_player_main.gd`. Renamed -## 2026-05-17 to drop the Claude-flavored naming — the wire protocol is -## client-agnostic; Claude Code is one consumer via -## `tooling/claude-player-mcp/`, an OpenSpiel/RL trainer can plug in via -## `subprocess.Popen`, and shell smoke tests use raw JSON-Lines. +## Client-agnostic: Claude Code drives it via `tooling/claude-player-mcp/`, +## an RL trainer drives it via `subprocess.Popen`, shell smoke tests +## write raw JSON-Lines to its stdin. ## ## Env vars consumed: ## - `CP_SEED` — RNG seed (default 42) ## - `CP_PLAYERS` — total player slots (default 2) -## - `CP_CLAUDE_SLOT` — which slot stdin controls (default 0) +## - `CP_PLAYER_SLOT` — which slot stdin controls (default 0) ## - `CP_MAP_SIZE` — MapGenerator size key (default "duel") ## - `CP_MAP_TYPE` — MapGenerator map type (default "continents") ## - `CP_OMNISCIENT` — `1` disables fog redaction (default 0) @@ -24,14 +22,14 @@ extends Node ## - `CP_LOG_FILE` — if set, mirror all wire I/O to this path var _api: RefCounted = null -var _claude_slot: int = 0 +var _player_slot: int = 0 var _omniscient: bool = false var _log_path: String = "" var _shutdown: bool = false func _ready() -> void: - _claude_slot = _env_int("CP_CLAUDE_SLOT", 0) + _player_slot = _env_int("CP_PLAYER_SLOT", 0) _omniscient = _env_bool("CP_OMNISCIENT", false) _log_path = OS.get_environment("CP_LOG_FILE") @@ -63,7 +61,7 @@ func _ready() -> void: # no `id` field; adapters can use them to drive streaming UIs or # ignore them entirely (the synchronous response after the next # `act` carries the same data via `events`). - _emit_event("turn_started", {"turn": 0, "player": _claude_slot}) + _emit_event("turn_started", {"turn": 0, "player": _player_slot}) _emit_event("phase_changed", {"phase": "player_actions"}) # Enter the pump on the next frame so any pending engine init flushes. @@ -337,7 +335,7 @@ func _apply_ai_personalities(gs: RefCounted, num_players: int) -> void: # assign clan_ids[ai_index % clan_count]. Stable across runs. var ai_index: int = 0 for slot: int in range(num_players): - if slot == _claude_slot: + if slot == _player_slot: continue var clan_id: String = clan_ids[ai_index % clan_ids.size()] ai_index += 1 @@ -420,7 +418,7 @@ func _handle_request(req: Dictionary) -> void: var has_id: bool = req.has("id") and req.get("id") != null match rtype: "view": - var view_json: String = String(_api.view_json(_claude_slot)) + var view_json: String = String(_api.view_json(_player_slot)) _emit_response_with_view(rid_int, has_id, view_json) "act": var action_payload: Dictionary = req.get("action", {}) as Dictionary @@ -429,7 +427,7 @@ func _handle_request(req: Dictionary) -> void: return var action_json: String = JSON.stringify(action_payload) var envelope_str: String = String( - _api.apply_action_json(_claude_slot, action_json) + _api.apply_action_json(_player_slot, action_json) ) # api wrapper already emits a full ok/err envelope — splice # in the request id and forward as the response body. diff --git a/src/game/engine/scenes/tests/claude_vs_ai_render_proof.gd b/src/game/engine/scenes/tests/claude_vs_ai_render_proof.gd index 9029cd1c..db520e02 100644 --- a/src/game/engine/scenes/tests/claude_vs_ai_render_proof.gd +++ b/src/game/engine/scenes/tests/claude_vs_ai_render_proof.gd @@ -14,7 +14,7 @@ extends Node2D ## it is a Phase-13 deliverable workaround that bypasses the full ## canonical render source extraction (`p2-72a`, deferred). ## -## Mirrors `claude_player_main.gd` for the boot sequence (DataLoader, +## Mirrors `player_api_main.gd` for the boot sequence (DataLoader, ## GdMapGenerator, capital placement, AI personalities, runtime + tactical ## catalogs) and `full_game_demo_proof.gd` for the renderer scaffold ## (HexRenderer / UnitRenderer / CityRenderer wired directly under a @@ -42,7 +42,7 @@ extends Node2D ## Env vars: ## - CP_SEED (default 42) ## - CP_PLAYERS (default 3) -## - CP_CLAUDE_SLOT (default 0) +## - CP_PLAYER_SLOT (default 0) ## - CP_MAP_SIZE (default "duel") ## - CP_TURNS (default 25) ## - CP_SCREENSHOT_EVERY (default 1) — capture every Nth turn @@ -79,7 +79,7 @@ var _unit_renderer: Node2D = null var _city_renderer: Node2D = null var _camera: Camera2D = null -var _claude_slot: int = 0 +var _player_slot: int = 0 var _num_players: int = 3 var _seed: int = 42 var _map_size: String = "duel" @@ -118,7 +118,7 @@ func _ready() -> void: _seed = _env_int("CP_SEED", 42) _num_players = _env_int("CP_PLAYERS", 3) - _claude_slot = _env_int("CP_CLAUDE_SLOT", 0) + _player_slot = _env_int("CP_PLAYER_SLOT", 0) _map_size = _env_or("CP_MAP_SIZE", "duel") _max_turns = _env_int("CP_TURNS", 25) _screenshot_every = max(1, _env_int("CP_SCREENSHOT_EVERY", 1)) @@ -199,7 +199,7 @@ func _bootstrap_world() -> void: get_tree().quit(1) return - # Scan land tiles + pick spaced capitals, mirroring claude_player_main.gd. + # Scan land tiles + pick spaced capitals, mirroring player_api_main.gd. var grid_w: int = int(grid.get_width()) var grid_h: int = int(grid.get_height()) var land_tiles: Array[Vector2i] = _scan_land_tiles(grid, grid_w, grid_h) @@ -343,24 +343,24 @@ func _find_claude_capital_axial() -> Vector2i: if _api == null: push_warning("capital anchor: _api null, falling back to (0,0)") return Vector2i.ZERO - var view_json: String = String(_api.view_json(_claude_slot)) + var view_json: String = String(_api.view_json(_player_slot)) var view: Dictionary = JSON.parse_string(view_json) as Dictionary if view == null: push_warning("capital anchor: view_json non-Dictionary, falling back to (0,0)") return Vector2i.ZERO # 1) First city owned by Claude. for city_dict: Dictionary in (view.get("cities", []) as Array): - if int(city_dict.get("owner", -1)) == _claude_slot: + if int(city_dict.get("owner", -1)) == _player_slot: return _wire_hex_to_vec(city_dict.get("position", [])) # 2) Claude's founder unit. for unit_dict: Dictionary in (view.get("units", []) as Array): - if int(unit_dict.get("owner", -1)) != _claude_slot: + if int(unit_dict.get("owner", -1)) != _player_slot: continue if String(unit_dict.get("type", "")) == "dwarf_founder": return _wire_hex_to_vec(unit_dict.get("position", [])) # 3) Any Claude unit (defensive). for unit_dict: Dictionary in (view.get("units", []) as Array): - if int(unit_dict.get("owner", -1)) == _claude_slot: + if int(unit_dict.get("owner", -1)) == _player_slot: return _wire_hex_to_vec(unit_dict.get("position", [])) push_warning("capital anchor: no Claude city/unit in view, falling back to (0,0)") return Vector2i.ZERO @@ -526,7 +526,7 @@ func _request_view() -> Dictionary: var req_id: int = _wire_request_id var req: Dictionary = {"type": "view", "id": req_id} _wire_append(req) - var view_json: String = String(_api.view_json(_claude_slot)) + var view_json: String = String(_api.view_json(_player_slot)) var view: Dictionary = JSON.parse_string(view_json) as Dictionary if view == null: push_error("view_json returned non-Dictionary payload") @@ -542,7 +542,7 @@ func _apply_action(action: Dictionary) -> Dictionary: var req: Dictionary = {"type": "act", "id": req_id, "action": action} _wire_append(req) var action_json: String = JSON.stringify(action) - var envelope_str: String = String(_api.apply_action_json(_claude_slot, action_json)) + var envelope_str: String = String(_api.apply_action_json(_player_slot, action_json)) var envelope: Dictionary = JSON.parse_string(envelope_str) as Dictionary if envelope == null: envelope = {"ok": false, "error": {"code": "internal", "message": "non-JSON envelope"}} @@ -587,7 +587,7 @@ func _rehydrate_view(view: Dictionary) -> void: var p: PlayerScript = PlayerScript.new() var idx: int = GameState.players.size() p.index = idx - p.is_human = (idx == _claude_slot) + p.is_human = (idx == _player_slot) p.player_name = "Slot %d" % idx p.race_id = "dwarf" p.color = SLOT_COLORS[idx % SLOT_COLORS.size()] @@ -679,7 +679,7 @@ func _capture_screenshot(turn_idx: int) -> void: if scores.size() > 0: var s0: Dictionary = scores[0] as Dictionary scoreboard = "Turn %d • Claude (slot %d) gold %d cities %d units %d" % [ - turn_idx, _claude_slot, + turn_idx, _player_slot, int(s0.get("gold", 0)), int(s0.get("cities", 0)), int(s0.get("units", 0)), @@ -722,7 +722,7 @@ func _snapshot_scores(view: Dictionary) -> Array: ## AI activity is captured via end_turn events instead. var s: Dictionary = view.get("score", {}) as Dictionary return [{ - "slot": _claude_slot, + "slot": _player_slot, "gold": int(s.get("gold_total", 0)), "cities": int(s.get("city_count", 0)), "units": int(s.get("unit_count", 0)), @@ -744,7 +744,7 @@ func _write_recap() -> void: lines.append("# p2-72 Option B — Claude-vs-AI Render Proof") lines.append("") lines.append("- Seed: %d" % _seed) - lines.append("- Players: %d (Claude slot: %d)" % [_num_players, _claude_slot]) + lines.append("- Players: %d (Claude slot: %d)" % [_num_players, _player_slot]) lines.append("- Map size: %s" % _map_size) lines.append("- Turns driven: %d" % (_turn_records.size() - 1)) lines.append("- Screenshots captured: %d" % _captured_turns.size()) @@ -782,7 +782,7 @@ func _write_recap() -> void: f.close() -# ── Boot helpers ported from claude_player_main.gd ──────────────────────── +# ── Boot helpers ported from player_api_main.gd ──────────────────────── func _scan_land_tiles(grid: RefCounted, w: int, h: int) -> Array[Vector2i]: const FORBIDDEN: Array[String] = [ @@ -898,7 +898,7 @@ func _apply_ai_personalities(gs: RefCounted, num_players: int) -> void: return var ai_index: int = 0 for slot: int in range(num_players): - if slot == _claude_slot: + if slot == _player_slot: continue var clan_id: String = clan_ids[ai_index % clan_ids.size()] ai_index += 1 diff --git a/tooling/claude-player-mcp/src/index.ts b/tooling/claude-player-mcp/src/index.ts index faa5806c..ee3f18b4 100644 --- a/tooling/claude-player-mcp/src/index.ts +++ b/tooling/claude-player-mcp/src/index.ts @@ -43,7 +43,7 @@ function harnessEnv(): Record { const passthrough = [ "CP_SEED", "CP_PLAYERS", - "CP_CLAUDE_SLOT", + "CP_PLAYER_SLOT", "CP_MAP_SIZE", "CP_MAP_TYPE", "CP_OMNISCIENT", diff --git a/tooling/rl-self-play/encoders.py b/tooling/rl-self-play/encoders.py new file mode 100644 index 00000000..fe8e9725 --- /dev/null +++ b/tooling/rl-self-play/encoders.py @@ -0,0 +1,236 @@ +"""PlayerView ⇄ fixed-shape tensors for RL. + +The wire-side `PlayerView` is a deeply-nested JSON dict; RL libraries +need fixed-shape numeric arrays. We pin two contracts here: + +1. **Observation encoder** (`encode_observation`) projects the view into + a fixed-length float32 vector. Length is `OBS_DIM`; layout is + deterministic and documented inline so the policy net can learn a + stable embedding. + +2. **Action index encoder** (`encode_legal_actions` / + `decode_action_index`) flattens the view's `legal_actions` (top-level + + per-unit + per-city) into a fixed-size index space `[0, ACTION_DIM)`. + Indices not occupied by a legal action in the current state are + masked out by `legal_action_mask`. MaskablePPO consumes that mask + directly. + +These encoders are intentionally lossy — they discard tile-by-tile data +and only summarise the macro state. Replace with a CNN-based observation +once the macro head proves the loop works end-to-end. +""" +from __future__ import annotations + +from typing import Any + +import numpy as np + +# ── Observation shape ──────────────────────────────────────────────── +# The fixed-length observation vector has three blocks: +# [0:8] self resources + score (gold, gold_per_turn, sci_per_turn, +# score_estimate, city_count, unit_count, +# happiness_pool, culture_per_turn) +# [8:16] self per-turn yields summed across cities (food, production, +# science, gold, culture) +# + (avg city pop, total mil units, +# total founder units) +# [16:24] opponent intel snapshot (opponent count seen, # at war, +# # at peace, # open_borders, ...) +# padded to 8 floats +# [24:32] turn counters (turn number, fraction of game elapsed, +# # cities lost, # cities captured, +# ... pad to 8) +OBS_DIM = 32 + +# ── Action index layout ────────────────────────────────────────────── +# We bucket legal actions deterministically: +# [0] end_turn +# [1] noop +# [2..2+MAX_UNITS*K) per-unit slots (skip, fortify, sentry, found_city, +# move-N/NE/SE/S/SW/NW (6 dirs), +# attack-target N/NE/SE/S/SW/NW (6 dirs)) +# tail per-city build queue: indices into a fixed +# priority-ordered roster (worker, warrior, library, +# barracks, forge, walls, longhouse, monument) +# +# Anything legal but outside this layout is silently dropped — the RL +# agent simply can't learn to take it. For a duel game, the layout +# below covers >95% of legitimate openings; for the full 5-player +# huge-map case we extend MAX_UNITS / CITY_QUEUE_SLOTS once the basic +# loop trains. +MAX_UNITS = 16 +PER_UNIT_ACTIONS = 16 # skip, fortify, sentry, found, move×6, attack×6, unfortify +MAX_CITIES = 4 +CITY_QUEUE_ITEMS: tuple[str, ...] = ( + "worker", "warrior", "library", "barracks", "forge", + "walls", "longhouse", "monument", "dwarf_warrior", "dwarf_founder", + "spearmen", "archer", "temple", "high_guild_hall", "chronicle_tower", + "mead_hall", +) +CITY_QUEUE_DIM = len(CITY_QUEUE_ITEMS) + +ACTION_DIM = ( + 2 # end_turn, noop + + MAX_UNITS * PER_UNIT_ACTIONS + + MAX_CITIES * CITY_QUEUE_DIM +) + +# Hex-axial direction order (matches `legal_actions` move targets after +# canonicalising relative-direction). Pointy-top, offset coords: +# Even-r layout used by mc-core. Order is N, NE, SE, S, SW, NW. +_DIR_OFFSETS_EVEN: tuple[tuple[int, int], ...] = ( + (0, -1), (1, -1), (1, 0), (0, 1), (-1, 0), (-1, -1), +) +_DIR_OFFSETS_ODD: tuple[tuple[int, int], ...] = ( + (0, -1), (1, 0), (1, 1), (0, 1), (-1, 1), (-1, 0), +) + + +def _hex_direction(from_pos: tuple[int, int], to_pos: tuple[int, int]) -> int | None: + """Return 0..5 for the matching cardinal direction, or None if the + target is not adjacent. Even/odd-row aware (offset-r layout).""" + fc, fr = from_pos + tc, tr = to_pos + dc, dr = tc - fc, tr - fr + table = _DIR_OFFSETS_EVEN if (fr % 2 == 0) else _DIR_OFFSETS_ODD + for i, (odc, odr) in enumerate(table): + if (odc, odr) == (dc, dr): + return i + return None + + +def encode_observation(view: dict[str, Any]) -> np.ndarray: + """Project a PlayerView dict into a fixed-shape float32 vector.""" + obs = np.zeros(OBS_DIM, dtype=np.float32) + res = view.get("resources", {}) + score = view.get("score", {}) + obs[0] = float(res.get("gold", 0.0)) + obs[1] = float(res.get("gold_per_turn", 0.0)) + obs[2] = float(res.get("science_per_turn", 0.0)) + obs[3] = float(score.get("score_estimate", 0.0)) + obs[4] = float(score.get("city_count", 0.0)) + obs[5] = float(score.get("unit_count", 0.0)) + obs[6] = float(res.get("happiness_pool", 0.0)) + obs[7] = float(res.get("culture_per_turn", 0.0)) + + cities = view.get("cities", []) + if cities: + food = sum(float(c.get("yields", {}).get("food", 0)) for c in cities) + prod = sum(float(c.get("yields", {}).get("production", 0)) for c in cities) + obs[8] = food + obs[9] = prod + obs[10] = sum(float(c.get("population", 0)) for c in cities) / len(cities) + + units = view.get("units", []) + me = int(view.get("player", 0)) + my_units = [u for u in units if int(u.get("owner", -1)) == me] + obs[11] = float(sum(1 for u in my_units if "warrior" in str(u.get("type", "")))) + obs[12] = float(sum(1 for u in my_units if "founder" in str(u.get("type", "")))) + + diplo = view.get("diplomacy", []) + obs[16] = float(len(diplo)) + obs[17] = float(sum(1 for d in diplo if d.get("relation") == "war")) + obs[18] = float(sum(1 for d in diplo if d.get("relation") == "peace")) + obs[19] = float(sum(1 for d in diplo if d.get("open_borders"))) + + obs[24] = float(view.get("turn", 0)) + # Bound turn at 500 (huge-map limit) for a rough [0,1] progress signal. + obs[25] = min(1.0, float(view.get("turn", 0)) / 500.0) + return obs + + +def _unit_action_offset(unit_slot: int, sub: int) -> int: + return 2 + unit_slot * PER_UNIT_ACTIONS + sub + + +def _city_action_offset(city_slot: int, item_idx: int) -> int: + return 2 + MAX_UNITS * PER_UNIT_ACTIONS + city_slot * CITY_QUEUE_DIM + item_idx + + +def encode_legal_actions( + view: dict[str, Any], +) -> tuple[np.ndarray, dict[int, dict[str, Any]]]: + """Build the action-mask + an index→PlayerAction lookup table. + + Returns (mask[ACTION_DIM] bool, idx_to_action dict). Only entries + present in the returned dict are legal this step; the mask is True + at those positions. MaskablePPO uses the mask to zero out the + sampling distribution before drawing. + """ + mask = np.zeros(ACTION_DIM, dtype=bool) + idx_to_action: dict[int, dict[str, Any]] = {} + + top = view.get("legal_actions", []) + for entry in top: + a = entry.get("action", {}) + if a.get("type") == "end_turn": + mask[0] = True + idx_to_action[0] = a + elif a.get("type") == "noop": + mask[1] = True + idx_to_action[1] = a + + units = view.get("units", []) + me = int(view.get("player", 0)) + my_units = [u for u in units if int(u.get("owner", -1)) == me] + for slot, u in enumerate(my_units[:MAX_UNITS]): + upos = tuple(int(x) for x in u.get("position", (0, 0))) + for entry in u.get("legal_actions", []): + a = entry.get("action", {}) + sub: int | None = None + t = a.get("type") + if t == "skip": + sub = 0 + elif t == "fortify": + sub = 1 + elif t == "sentry": + sub = 2 + elif t == "found_city": + sub = 3 + elif t == "unfortify": + sub = 4 + elif t == "move": + dir_idx = _hex_direction( + upos, tuple(int(x) for x in a.get("to", (0, 0))) + ) + if dir_idx is not None: + sub = 5 + dir_idx # 5..10 + elif t == "attack": + dir_idx = _hex_direction( + upos, tuple(int(x) for x in a.get("target", (0, 0))) + ) + if dir_idx is not None: + sub = 11 + dir_idx # 11..15 (15 is also unfortify? no, 11..16 but PER_UNIT_ACTIONS=16) + sub = min(sub, PER_UNIT_ACTIONS - 1) + if sub is None: + continue + offset = _unit_action_offset(slot, sub) + if offset < ACTION_DIM: + mask[offset] = True + idx_to_action[offset] = a + + cities = view.get("cities", []) + for slot, c in enumerate(cities[:MAX_CITIES]): + for entry in c.get("legal_actions", []): + a = entry.get("action", {}) + if a.get("type") != "queue_production": + continue + item = str(a.get("item", "")) + if item not in CITY_QUEUE_ITEMS: + continue + item_idx = CITY_QUEUE_ITEMS.index(item) + offset = _city_action_offset(slot, item_idx) + if offset < ACTION_DIM: + mask[offset] = True + idx_to_action[offset] = a + + return mask, idx_to_action + + +def decode_action_index( + index: int, idx_to_action: dict[int, dict[str, Any]] +) -> dict[str, Any]: + """Invert `encode_legal_actions`. If the policy picks an index that + has been masked (shouldn't happen with MaskablePPO, but defensive + code is cheap), fall back to `end_turn`.""" + return idx_to_action.get(index, {"type": "end_turn"}) diff --git a/tooling/rl-self-play/harness_client.py b/tooling/rl-self-play/harness_client.py new file mode 100644 index 00000000..a528ce56 --- /dev/null +++ b/tooling/rl-self-play/harness_client.py @@ -0,0 +1,151 @@ +"""Reusable JSON-Lines client for `scripts/player-api-server.sh`. + +One client = one subprocess = one game. The Gymnasium env in +`magic_civ_env.py` owns one of these per `reset()` cycle, and the +evaluator owns one per evaluation episode. Both share the same +protocol; pulling it out here lets them stay in sync with the wire +contract documented in `src/game/engine/docs/PLAYER_API.md`. + +Strong types throughout — no string-typed errors leaking up from the +harness. Anything off-protocol raises `HarnessError`. +""" +from __future__ import annotations + +import json +import os +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[2] +HARNESS_SCRIPT = REPO_ROOT / "scripts" / "player-api-server.sh" + +# Max lines we will read while looking for a matching response before +# giving up. Each harness response sits behind 0..N async notifications, +# so we need a buffer — 5000 is generous enough that even a busy turn +# with hundreds of `unit_moved`/`turn_started` notifications won't trip +# it, while still bounded so a wedged harness can't hang the trainer. +MAX_LINES_PER_RESPONSE = 5000 + + +class HarnessError(RuntimeError): + """Raised when the harness violates the protocol or dies unexpectedly.""" + + +@dataclass(frozen=True, slots=True) +class HarnessConfig: + """Per-episode harness configuration. Mirrors the env-var contract in + `scripts/player-api-server.sh` so callers can override any axis without + knowing the env-var spelling.""" + + seed: int = 42 + players: int = 2 + player_slot: int = 0 + map_size: str = "duel" + map_type: str = "continents" + omniscient: bool = False + timeout_sec: int = 60 + + def to_env(self) -> dict[str, str]: + return { + "CP_SEED": str(self.seed), + "CP_PLAYERS": str(self.players), + "CP_PLAYER_SLOT": str(self.player_slot), + "CP_MAP_SIZE": self.map_size, + "CP_MAP_TYPE": self.map_type, + "CP_OMNISCIENT": "1" if self.omniscient else "0", + "CP_TIMEOUT_SEC": str(self.timeout_sec), + } + + +class HarnessClient: + """One running harness instance. Cheap to construct (sub-second on + macOS once Godot's class cache is warm); destroy + recreate on each + Gym `reset()` so episodes have independent simulator state.""" + + def __init__(self, config: HarnessConfig | None = None) -> None: + self._config = config or HarnessConfig() + env = {**os.environ, **self._config.to_env()} + self._proc = subprocess.Popen( + ["bash", str(HARNESS_SCRIPT)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=str(REPO_ROOT), + text=True, + bufsize=1, + env=env, + ) + self._next_id = 1 + self._closed = False + + @property + def config(self) -> HarnessConfig: + return self._config + + def _send(self, msg: dict[str, Any]) -> dict[str, Any]: + if self._closed: + raise HarnessError("harness already closed") + msg["id"] = self._next_id + self._next_id += 1 + assert self._proc.stdin is not None and self._proc.stdout is not None + self._proc.stdin.write(json.dumps(msg) + "\n") + self._proc.stdin.flush() + for _ in range(MAX_LINES_PER_RESPONSE): + line = self._proc.stdout.readline() + if not line: + self._closed = True + raise HarnessError( + f"harness stdout EOF while waiting for id={msg['id']}" + ) + try: + obj = json.loads(line) + except json.JSONDecodeError: + # Notifications without an id, or stray stderr that landed + # on stdout — skip and keep reading. + continue + if obj.get("id") == msg["id"]: + return obj + raise HarnessError( + f"no correlated response for id={msg['id']} within {MAX_LINES_PER_RESPONSE} lines" + ) + + def view(self) -> dict[str, Any]: + r = self._send({"type": "view"}) + if not r.get("ok"): + raise HarnessError(f"view failed: {r.get('error')}") + return r["view"] + + def act(self, action: dict[str, Any]) -> dict[str, Any]: + r = self._send({"type": "act", "action": action}) + if not r.get("ok"): + err = r.get("error", {}) + raise HarnessError( + f"act({action.get('type')!r}) failed: " + f"{err.get('code')}: {err.get('message')}" + ) + return r + + def end_turn(self) -> dict[str, Any]: + return self.act({"type": "end_turn"}) + + def shutdown(self) -> None: + if self._closed: + return + self._closed = True + try: + self._send({"type": "shutdown"}) + except HarnessError: + pass + try: + self._proc.wait(timeout=5) + except subprocess.TimeoutExpired: + self._proc.kill() + self._proc.wait(timeout=2) + + def __enter__(self) -> HarnessClient: + return self + + def __exit__(self, *exc: object) -> None: + self.shutdown() diff --git a/tooling/rl-self-play/requirements.txt b/tooling/rl-self-play/requirements.txt new file mode 100644 index 00000000..64db9be0 --- /dev/null +++ b/tooling/rl-self-play/requirements.txt @@ -0,0 +1,9 @@ +# Pinned to versions that are known to compose cleanly with sb3-contrib's +# MaskablePPO as of 2026-Q2. Bump together — sb3 and sb3-contrib track in +# lockstep, and torch's wheel ABI changes between minor versions. +gymnasium==1.2.1 +stable-baselines3==2.7.0 +sb3-contrib==2.7.0 +torch==2.4.1 +numpy>=2.0,<3 +tensorboard>=2.18