feat(ai): clan-conditioned training pipeline (harness + env + reward overlays)

The wiring for per-clan trained AI. Each training episode samples a clan, stamps it on the LEARNER slot so the obs one-hots it, and scales the SHAPING rewards by that clan's overlay (terminal win/loss stay universal):

- player_api_main.gd: CP_LEARNER_CLAN stamps the learner slot's clan via set_player_personality_json -> PlayerState.clan_id -> PlayerView.clan_index -> obs clan one-hot. (Previously only non-learner slots got a clan.)
- reward_overlays.json: per-clan group multipliers (combat/expansion/production/economy/tech) derived from ai_personalities.json strategic_axes, normalized per clan to mean 1.0 (no fairness confound). Archetypes emerge: blackhammer combat 1.5, goldvein economy 1.64, deepforge expansion 0.42.
- magic_civ_env.py: samples the clan per episode (seeded), passes CP_LEARNER_CLAN, scales the shaping reward terms by self._ov(group) (8 call sites covering the 9 events listed in the reward_overlays.json groups).
- harness_client.py: HarnessConfig.learner_clan -> CP_LEARNER_CLAN.
- train.py: --clan ('' generalist | 'all' samples from every clan, one per episode | comma list).

Local checks: py_compile clean; overlays cover all 6 clans.

Next: fleet smoke (clan_index in the learner view + a tiny training run) before scaling out.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 13:06:02 -04:00 · 2026-06-30 13:06:02 -04:00 · 57b326b670
commit 57b326b670
parent a6fb75a480
5 changed files with 175 additions and 9 deletions
--- a/public/games/age-of-dwarves/data/ai/reward_overlays.json
+++ b/public/games/age-of-dwarves/data/ai/reward_overlays.json
@ -0,0 +1,68 @@
+{
+  "$comment": "Per-clan reward-shaping overlays for clan-conditioned RL (mc-ai-trained-not-scripted). Multipliers on the EVENT-reward groups only \u2014 terminal win/loss/decisive bonus stay universal so every clan equally wants to win, differing only in intermediate incentives. Derived from ai_personalities.json strategic_axes (axis/5 = neutral at 5), normalized per clan to mean 1.0 so total shaping magnitude is comparable across clans (no difficulty/fairness confound). Generated, do not hand-edit; regenerate from the axes.",
+  "groups": {
+    "combat": [
+      "capital_captured_by_me",
+      "city_captured_by_me",
+      "enemy_unit_killed_by_me",
+      "opponent_eliminated"
+    ],
+    "expansion": [
+      "city_founded_by_me"
+    ],
+    "production": [
+      "wonder_built_by_me"
+    ],
+    "economy": [
+      "score_delta"
+    ],
+    "tech": [
+      "tech_researched_by_me",
+      "culture_researched_by_me"
+    ]
+  },
+  "overlays": {
+    "ironhold": {
+      "combat": 1.0909,
+      "expansion": 0.7273,
+      "production": 1.6364,
+      "economy": 0.5455,
+      "tech": 1.0
+    },
+    "goldvein": {
+      "combat": 0.5455,
+      "expansion": 0.9091,
+      "production": 0.9091,
+      "economy": 1.6364,
+      "tech": 1.0
+    },
+    "blackhammer": {
+      "combat": 1.5,
+      "expansion": 1.0,
+      "production": 1.1667,
+      "economy": 0.3333,
+      "tech": 1.0
+    },
+    "deepforge": {
+      "combat": 0.8421,
+      "expansion": 0.4211,
+      "production": 1.6842,
+      "economy": 1.0526,
+      "tech": 1.0
+    },
+    "tinkersmith": {
+      "combat": 0.8421,
+      "expansion": 1.0526,
+      "production": 1.2632,
+      "economy": 0.8421,
+      "tech": 1.0
+    },
+    "runesmith": {
+      "combat": 0.9091,
+      "expansion": 1.0909,
+      "production": 0.9091,
+      "economy": 1.0909,
+      "tech": 1.0
+    }
+  }
+}
--- a/src/game/engine/scenes/headless/player_api_main.gd
+++ b/src/game/engine/scenes/headless/player_api_main.gd
@ -696,6 +696,29 @@ func _apply_ai_assignments(gs: RefCounted, num_players: int) -> void:
 		else:
 			_emit_event("ai_controller_assigned", {"slot": slot, "controller_id": controller_id})

+	# Clan-condition the LEARNER slot (the Python-controlled slot). Stamp its
+	# clan id so `PlayerState.clan_id` projects into `PlayerView.clan_index`,
+	# which the learned-controller observation one-hots — this is how RL
+	# training conditions the policy on the clan it is playing. The scoring
+	# weights set alongside are inert (the learner's actions come from stdin,
+	# not the scripted AI). Unset = no-op; an unknown id emits a protocol error.
+	# In both cases the learner stays the generalist (clan_index = -1).
+	var learner_clan: String = _env_or("CP_LEARNER_CLAN", "").strip_edges()
+	if not learner_clan.is_empty():
+		if clan_ids.has(learner_clan):
+			var ok_learner: bool = bool(
+				gs.set_player_personality_json(_player_slot, learner_clan, json_text)
+			)
+			if ok_learner:
+				_emit_event("learner_clan_assigned", {"slot": _player_slot, "clan_id": learner_clan})
+			else:
+				_emit_protocol_error(
+					"set_player_personality_json failed for learner slot=%d clan=%s"
+					% [_player_slot, learner_clan]
+				)
+		else:
+			_emit_protocol_error("CP_LEARNER_CLAN=%s not in ai_personalities.json" % learner_clan)
+

 func _scan_land_tiles(grid: RefCounted, w: int, h: int) -> Array[Vector2i]:
 	## Walk the grid and collect every land hex. Mirrors
--- a/tooling/rl_self_play/harness_client.py
+++ b/tooling/rl_self_play/harness_client.py
@ -69,6 +69,12 @@ class HarnessConfig:
    # Set this to mix learned + scripted opponents in one game, e.g.
    # `("learned:duel-v1b", "", "")` puts learned on the first AI slot.
    player_controllers: tuple[str, ...] = ()
+    # Clan-conditioned RL: stamp the LEARNER slot's clan id (an
+    # ai_personalities.json key, e.g. "blackhammer") so PlayerState.clan_id
+    # projects into PlayerView.clan_index and the observation one-hots it.
+    # Empty = generalist (clan_index = -1). See player_api_main.gd
+    # CP_LEARNER_CLAN.
+    learner_clan: str = ""

    @property
    def effective_player_slots(self) -> tuple[int, ...]:
@ -90,6 +96,8 @@ class HarnessConfig:
        }
        if self.player_controllers:
            env["CP_PLAYER_CONTROLLERS"] = ",".join(self.player_controllers)
+        if self.learner_clan:
+            env["CP_LEARNER_CLAN"] = self.learner_clan
        return env


--- a/tooling/rl_self_play/magic_civ_env.py
+++ b/tooling/rl_self_play/magic_civ_env.py
@ -15,6 +15,9 @@ its win rate against this baseline; the policy is considered to have
 """
 from __future__ import annotations

+import json
+import os
+import random
 import sys
 import time
 from dataclasses import replace
@ -107,6 +110,20 @@ def _step_penalty(turn: int) -> float:
 DEFAULT_MAX_STEPS_PER_EPISODE = 250_000
 DEFAULT_MAX_TURNS = 1000

+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+_OVERLAYS_REL = "public/games/age-of-dwarves/data/ai/reward_overlays.json"
+
+
+def _load_reward_overlays() -> dict[str, dict[str, float]]:
+    """Per-clan reward-shaping overlays (clan -> {group -> multiplier}). Missing
+    file, unreadable file, or malformed JSON = no overlays (neutral catalog)."""
+    path = os.environ.get("MC_REWARD_OVERLAYS") or os.path.join(_REPO_ROOT, _OVERLAYS_REL)
+    try:
+        with open(path, encoding="utf-8") as fh:
+            return json.load(fh).get("overlays", {})
+    except (OSError, json.JSONDecodeError):
+        return {}
+

 class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
    """Single-learner Gym wrapper: our policy controls slot 0.
@ -130,6 +147,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        max_turns: int = DEFAULT_MAX_TURNS,
        max_steps_per_episode: int = DEFAULT_MAX_STEPS_PER_EPISODE,
        opponent: ModelOpponent | None = None,
+        clan_list: tuple[str, ...] = (),
    ) -> None:
        super().__init__()
        self._config = harness_config or HarnessConfig()
@ -168,6 +186,22 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        # opponent elimination (the old duel-only 1v1 shortcut). The
        # authoritative `game_over` event still takes priority when present.
        self._live_players: set[int] = set()
+        # Clan-conditioned RL. Each episode the env samples a clan from
+        # `clan_list`, stamps it on the learner slot (CP_LEARNER_CLAN → the obs
+        # clan one-hot) and scales the SHAPING rewards by that clan's overlay
+        # (group -> multiplier). Terminal win/loss/decisive stay universal so
+        # every clan equally wants to win. Empty list = generalist (no clan,
+        # neutral catalog). Seeded RNG → reproducible clan sequence per run.
+        self._clan_list: tuple[str, ...] = tuple(clan_list)
+        self._overlays: dict[str, dict[str, float]] = _load_reward_overlays()
+        self._clan_rng = random.Random(self._config.seed)
+        self._cur_clan: str = ""
+        self._cur_overlay: dict[str, float] = {}
+
+    def _ov(self, group: str) -> float:
+        """Reward-shaping multiplier for the current episode's clan (1.0 if
+        generalist, clan has no overlay entry, or unknown group)."""
+        return self._cur_overlay.get(group, 1.0)

    # ── Gymnasium API ────────────────────────────────────────────────

@ -190,6 +224,15 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        # dropped them, which would have un-declared the external slots.
        if seed is not None:
            cfg = replace(cfg, seed=seed)
+        # Clan-conditioned RL: sample this episode's clan, stamp it on the
+        # learner slot (CP_LEARNER_CLAN), and select its reward overlay.
+        if self._clan_list:
+            self._cur_clan = self._clan_rng.choice(self._clan_list)
+            cfg = replace(cfg, learner_clan=self._cur_clan)
+            self._cur_overlay = self._overlays.get(self._cur_clan, {})
+        else:
+            self._cur_clan = ""
+            self._cur_overlay = {}
        self._terminated = False
        self._step_count = 0
        self._capital_by_player = {}
@ -309,7 +352,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
        prev_score = self._last_score
        new_score = float(view.get("score", {}).get("score_estimate", 0.0))
        # Symmetric score-delta — gains and losses both count.
-        reward += SCORE_DELTA_SCALE * (new_score - prev_score)
+        reward += SCORE_DELTA_SCALE * (new_score - prev_score) * self._ov("economy")
        # Event-driven shaping (Phase 1 catalog).
        reward += self._apply_event_rewards(recent_events, me)

@ -454,7 +497,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
                    self._capital_by_player[owner] = cid
                if owner == me:
                    if self._city_founded_rewards_issued < MAX_CITY_FOUNDED_REWARDS:
-                        total += CITY_FOUNDED_BY_ME
+                        total += CITY_FOUNDED_BY_ME * self._ov("expansion")
                        self._city_founded_rewards_issued += 1
            elif kind == "city_captured":
                old_owner = int(ev.get("old_owner", -1))
@ -465,14 +508,16 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
                    and self._capital_by_player.get(old_owner) == cid
                )
                if new_owner == me:
-                    total += CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
+                    total += (
+                        CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
+                    ) * self._ov("combat")
                elif old_owner == me:
                    total += CAPITAL_LOST_BY_ME if is_capital else CITY_LOST_BY_ME
                # When a capital changes hands, the *capturer's* first
                # city is still their own capital — don't reassign.
            elif kind == "wonder_built":
                if int(ev.get("player", -1)) == me:
-                    total += WONDER_BUILT_BY_ME
+                    total += WONDER_BUILT_BY_ME * self._ov("production")
            elif kind == "combat_resolved":
                # Attribution: the wire event carries unit ids, not owners.
                # We synthesise from defender_killed/attacker_killed plus
@ -494,17 +539,17 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
                    # asymmetric ±0.04/+0.05 is net-positive on even trades).
                    killer = ev.get("killer_unit_id")
                    if killer is None or self._unit_owner_lookup(str(killer)) == me:
-                        total += ENEMY_UNIT_KILLED_BY_ME
+                        total += ENEMY_UNIT_KILLED_BY_ME * self._ov("combat")
            elif kind == "tech_researched":
                if int(ev.get("player", -1)) == me:
-                    total += TECH_RESEARCHED_BY_ME
+                    total += TECH_RESEARCHED_BY_ME * self._ov("tech")
            elif kind == "culture_researched":
                if int(ev.get("player", -1)) == me:
-                    total += CULTURE_RESEARCHED_BY_ME
+                    total += CULTURE_RESEARCHED_BY_ME * self._ov("tech")
            elif kind == "player_eliminated":
                p = int(ev.get("player", -1))
                if p != me and p >= 0:
-                    total += OPPONENT_ELIMINATED
+                    total += OPPONENT_ELIMINATED * self._ov("combat")
        return total

    def _unit_owner_lookup(self, unit_id: str) -> int:
--- a/tooling/rl_self_play/train.py
+++ b/tooling/rl_self_play/train.py
@ -66,6 +66,13 @@ def _build_argparser() -> argparse.ArgumentParser:
                   help="Stop training once eval win-rate exceeds this (default: 0.55).")
    p.add_argument("--run-name", default="duel-v1",
                   help="Subdirectory under runs/ + models/ (default: duel-v1).")
+    p.add_argument("--clan", default="",
+                   help=("Clan-conditioned training. '' = generalist (no clan, "
+                         "clan_index=-1). 'all' = sample every clan per episode "
+                         "(one conditioned policy for all clans). Or a comma list "
+                         "of ai_personalities.json ids, e.g. 'blackhammer'. The "
+                         "sampled clan is stamped on the learner (obs one-hot) and "
+                         "selects its reward overlay."))
    p.add_argument("--seed", type=int, default=42,
                   help="Base RNG seed; per-env seeds offset from this (default: 42).")
    p.add_argument("--init-from", default=None,
@ -111,6 +118,18 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
        int(s) for s in str(args.opponent_slots).split(",") if s.strip()
    )

+    # Resolve the clan list for clan-conditioned training. '' = generalist;
+    # 'all' = every clan (from the obs schema's canonical clan_order); else a
+    # comma list of ai_personalities.json ids.
+    clan_arg = str(getattr(args, "clan", "") or "").strip()
+    if not clan_arg:
+        clan_list: tuple[str, ...] = ()
+    elif clan_arg == "all":
+        from tooling.rl_self_play.obs_contract import load_schema  # type: ignore[import-not-found]
+        clan_list = tuple(load_schema()["clan_order"])
+    else:
+        clan_list = tuple(c.strip() for c in clan_arg.split(",") if c.strip())
+
    def _make() -> MagicCivEnv:
        cfg = HarnessConfig(
            seed=args.seed + env_idx,
@ -128,7 +147,10 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
                deterministic=args.opponent_deterministic,
            )
        return MagicCivEnv(
-            harness_config=cfg, max_turns=args.max_turns, opponent=opponent
+            harness_config=cfg,
+            max_turns=args.max_turns,
+            opponent=opponent,
+            clan_list=clan_list,
        )

    return _make