diff --git a/public/games/age-of-dwarves/data/ai/reward_overlays.json b/public/games/age-of-dwarves/data/ai/reward_overlays.json new file mode 100644 index 00000000..f36d0820 --- /dev/null +++ b/public/games/age-of-dwarves/data/ai/reward_overlays.json @@ -0,0 +1,68 @@ +{ + "$comment": "Per-clan reward-shaping overlays for clan-conditioned RL (mc-ai-trained-not-scripted). Multipliers on the EVENT-reward groups only \u2014 terminal win/loss/decisive bonus stay universal so every clan equally wants to win, differing only in intermediate incentives. Derived from ai_personalities.json strategic_axes (axis/5 = neutral at 5), normalized per clan to mean 1.0 so total shaping magnitude is comparable across clans (no difficulty/fairness confound). Generated, do not hand-edit; regenerate from the axes.", + "groups": { + "combat": [ + "capital_captured_by_me", + "city_captured_by_me", + "enemy_unit_killed_by_me", + "opponent_eliminated" + ], + "expansion": [ + "city_founded_by_me" + ], + "production": [ + "wonder_built_by_me" + ], + "economy": [ + "score_delta" + ], + "tech": [ + "tech_researched_by_me", + "culture_researched_by_me" + ] + }, + "overlays": { + "ironhold": { + "combat": 1.0909, + "expansion": 0.7273, + "production": 1.6364, + "economy": 0.5455, + "tech": 1.0 + }, + "goldvein": { + "combat": 0.5455, + "expansion": 0.9091, + "production": 0.9091, + "economy": 1.6364, + "tech": 1.0 + }, + "blackhammer": { + "combat": 1.5, + "expansion": 1.0, + "production": 1.1667, + "economy": 0.3333, + "tech": 1.0 + }, + "deepforge": { + "combat": 0.8421, + "expansion": 0.4211, + "production": 1.6842, + "economy": 1.0526, + "tech": 1.0 + }, + "tinkersmith": { + "combat": 0.8421, + "expansion": 1.0526, + "production": 1.2632, + "economy": 0.8421, + "tech": 1.0 + }, + "runesmith": { + "combat": 0.9091, + "expansion": 1.0909, + "production": 0.9091, + "economy": 1.0909, + "tech": 1.0 + } + } +} \ No newline at end of file diff --git 
a/src/game/engine/scenes/headless/player_api_main.gd b/src/game/engine/scenes/headless/player_api_main.gd index d76a5a21..d0f45dbe 100644 --- a/src/game/engine/scenes/headless/player_api_main.gd +++ b/src/game/engine/scenes/headless/player_api_main.gd @@ -696,6 +696,29 @@ func _apply_ai_assignments(gs: RefCounted, num_players: int) -> void: else: _emit_event("ai_controller_assigned", {"slot": slot, "controller_id": controller_id}) + # Clan-condition the LEARNER slot (the Python-controlled slot). Stamp its + # clan id so `PlayerState.clan_id` projects into `PlayerView.clan_index`, + # which the learned-controller observation one-hots — this is how RL + # training conditions the policy on the clan it is playing. The scoring + # weights set alongside are inert (the learner's actions come from stdin, + # not the scripted AI). No-op if unset/invalid: learner stays the + # generalist (clan_index = -1). + var learner_clan: String = _env_or("CP_LEARNER_CLAN", "").strip_edges() + if not learner_clan.is_empty(): + if clan_ids.has(learner_clan): + var ok_learner: bool = bool( + gs.set_player_personality_json(_player_slot, learner_clan, json_text) + ) + if ok_learner: + _emit_event("learner_clan_assigned", {"slot": _player_slot, "clan_id": learner_clan}) + else: + _emit_protocol_error( + "set_player_personality_json failed for learner slot=%d clan=%s" + % [_player_slot, learner_clan] + ) + else: + _emit_protocol_error("CP_LEARNER_CLAN=%s not in ai_personalities.json" % learner_clan) + func _scan_land_tiles(grid: RefCounted, w: int, h: int) -> Array[Vector2i]: ## Walk the grid and collect every land hex. Mirrors diff --git a/tooling/rl_self_play/harness_client.py b/tooling/rl_self_play/harness_client.py index 76bb7d9b..a9007ed0 100644 --- a/tooling/rl_self_play/harness_client.py +++ b/tooling/rl_self_play/harness_client.py @@ -69,6 +69,12 @@ class HarnessConfig: # Set this to mix learned + scripted opponents in one game, e.g. 
def _load_reward_overlays() -> dict[str, dict[str, float]]:
    """Load the per-clan reward-shaping overlays (clan -> {group -> multiplier}).

    The file path is taken from the ``MC_REWARD_OVERLAYS`` environment variable
    when it is set and non-empty; otherwise it is ``_OVERLAYS_REL`` resolved
    under ``_REPO_ROOT``.

    Loading is best-effort and never raises:

    * A missing or unreadable file yields ``{}`` silently (no overlays, so
      every clan trains on the neutral catalog).
    * A file that exists but is malformed (invalid JSON, wrong encoding, a
      non-object top level, or a non-object ``"overlays"`` value) yields ``{}``
      and emits a ``RuntimeWarning``, so a broken overlay file cannot silently
      turn a clan-conditioned run into a neutral one.
    * Individual entries that are unusable (a clan whose value is not an
      object, or a multiplier that is not a finite number) are dropped with a
      single ``RuntimeWarning``; dropped groups are simply absent from the
      result.

    Returns:
        A mapping of clan id to ``{group: multiplier}`` in which every
        multiplier is a finite ``float``.
    """
    import math
    import warnings

    path = os.environ.get("MC_REWARD_OVERLAYS") or os.path.join(_REPO_ROOT, _OVERLAYS_REL)
    try:
        with open(path, encoding="utf-8") as fh:
            document = json.load(fh)
    except OSError:
        # Missing/unreadable file is the documented "no overlays" case.
        return {}
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses, so this covers invalid JSON and non-UTF-8 content.
        warnings.warn(
            f"ignoring malformed reward overlays file {path!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

    if not isinstance(document, dict):
        warnings.warn(
            f"ignoring reward overlays file {path!r}: top-level JSON value is not an object",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

    raw_overlays = document.get("overlays", {})
    if not isinstance(raw_overlays, dict):
        warnings.warn(
            f"ignoring reward overlays file {path!r}: 'overlays' is not an object",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

    overlays: dict[str, dict[str, float]] = {}
    rejected: list[str] = []
    for clan, groups in raw_overlays.items():
        if not isinstance(groups, dict):
            rejected.append(str(clan))
            continue
        multipliers: dict[str, float] = {}
        for group, value in groups.items():
            multiplier = None
            # bool is an int subclass; a JSON true/false is not a multiplier.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                try:
                    multiplier = float(value)
                except OverflowError:
                    # Integer literal too large to represent as a float.
                    multiplier = None
            # Python's json module accepts NaN/Infinity literals; a non-finite
            # multiplier would poison every shaped reward, so reject it.
            if multiplier is None or not math.isfinite(multiplier):
                rejected.append(f"{clan}.{group}")
                continue
            multipliers[group] = multiplier
        overlays[clan] = multipliers

    if rejected:
        warnings.warn(
            f"reward overlays file {path!r}: dropped invalid entries: {', '.join(rejected)}",
            RuntimeWarning,
            stacklevel=2,
        )
    return overlays
def _ov(self, group: str) -> float:
    """Return the shaping multiplier the current episode's overlay assigns to ``group``.

    The lookup is made against ``self._cur_overlay``. When that mapping has no
    entry for ``group`` (which includes the generalist case, where the overlay
    is empty) the neutral multiplier 1.0 is returned.
    """
    try:
        return self._cur_overlay[group]
    except KeyError:
        # No overlay entry for this group: leave the reward unscaled.
        return 1.0
+ if self._clan_list: + self._cur_clan = self._clan_rng.choice(self._clan_list) + cfg = replace(cfg, learner_clan=self._cur_clan) + self._cur_overlay = self._overlays.get(self._cur_clan, {}) + else: + self._cur_clan = "" + self._cur_overlay = {} self._terminated = False self._step_count = 0 self._capital_by_player = {} @@ -309,7 +352,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): prev_score = self._last_score new_score = float(view.get("score", {}).get("score_estimate", 0.0)) # Symmetric score-delta — gains and losses both count. - reward += SCORE_DELTA_SCALE * (new_score - prev_score) + reward += SCORE_DELTA_SCALE * (new_score - prev_score) * self._ov("economy") # Event-driven shaping (Phase 1 catalog). reward += self._apply_event_rewards(recent_events, me) @@ -454,7 +497,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): self._capital_by_player[owner] = cid if owner == me: if self._city_founded_rewards_issued < MAX_CITY_FOUNDED_REWARDS: - total += CITY_FOUNDED_BY_ME + total += CITY_FOUNDED_BY_ME * self._ov("expansion") self._city_founded_rewards_issued += 1 elif kind == "city_captured": old_owner = int(ev.get("old_owner", -1)) @@ -465,14 +508,16 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): and self._capital_by_player.get(old_owner) == cid ) if new_owner == me: - total += CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME + total += ( + CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME + ) * self._ov("combat") elif old_owner == me: total += CAPITAL_LOST_BY_ME if is_capital else CITY_LOST_BY_ME # When a capital changes hands, the *capturer's* first # city is still their own capital — don't reassign. elif kind == "wonder_built": if int(ev.get("player", -1)) == me: - total += WONDER_BUILT_BY_ME + total += WONDER_BUILT_BY_ME * self._ov("production") elif kind == "combat_resolved": # Attribution: the wire event carries unit ids, not owners. 
# We synthesise from defender_killed/attacker_killed plus @@ -494,17 +539,17 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]): # asymmetric ±0.04/+0.05 is net-positive on even trades). killer = ev.get("killer_unit_id") if killer is None or self._unit_owner_lookup(str(killer)) == me: - total += ENEMY_UNIT_KILLED_BY_ME + total += ENEMY_UNIT_KILLED_BY_ME * self._ov("combat") elif kind == "tech_researched": if int(ev.get("player", -1)) == me: - total += TECH_RESEARCHED_BY_ME + total += TECH_RESEARCHED_BY_ME * self._ov("tech") elif kind == "culture_researched": if int(ev.get("player", -1)) == me: - total += CULTURE_RESEARCHED_BY_ME + total += CULTURE_RESEARCHED_BY_ME * self._ov("tech") elif kind == "player_eliminated": p = int(ev.get("player", -1)) if p != me and p >= 0: - total += OPPONENT_ELIMINATED + total += OPPONENT_ELIMINATED * self._ov("combat") return total def _unit_owner_lookup(self, unit_id: str) -> int: diff --git a/tooling/rl_self_play/train.py b/tooling/rl_self_play/train.py index 653f55f4..6c29bcfe 100644 --- a/tooling/rl_self_play/train.py +++ b/tooling/rl_self_play/train.py @@ -66,6 +66,13 @@ def _build_argparser() -> argparse.ArgumentParser: help="Stop training once eval win-rate exceeds this (default: 0.55).") p.add_argument("--run-name", default="duel-v1", help="Subdirectory under runs/ + models/ (default: duel-v1).") + p.add_argument("--clan", default="", + help=("Clan-conditioned training. '' = generalist (no clan, " + "clan_index=-1). 'all' = sample every clan per episode " + "(one conditioned policy for all clans). Or a comma list " + "of ai_personalities.json ids, e.g. 'blackhammer'. 
The " + "sampled clan is stamped on the learner (obs one-hot) and " + "selects its reward overlay.")) p.add_argument("--seed", type=int, default=42, help="Base RNG seed; per-env seeds offset from this (default: 42).") p.add_argument("--init-from", default=None, @@ -111,6 +118,18 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int): int(s) for s in str(args.opponent_slots).split(",") if s.strip() ) + # Resolve the clan list for clan-conditioned training. '' = generalist; + # 'all' = every clan (from the obs schema's canonical clan_order); else a + # comma list of ai_personalities.json ids. + clan_arg = str(getattr(args, "clan", "") or "").strip() + if not clan_arg: + clan_list: tuple[str, ...] = () + elif clan_arg == "all": + from tooling.rl_self_play.obs_contract import load_schema # type: ignore[import-not-found] + clan_list = tuple(load_schema()["clan_order"]) + else: + clan_list = tuple(c.strip() for c in clan_arg.split(",") if c.strip()) + def _make() -> MagicCivEnv: cfg = HarnessConfig( seed=args.seed + env_idx, @@ -128,7 +147,10 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int): deterministic=args.opponent_deterministic, ) return MagicCivEnv( - harness_config=cfg, max_turns=args.max_turns, opponent=opponent + harness_config=cfg, + max_turns=args.max_turns, + opponent=opponent, + clan_list=clan_list, ) return _make