feat(ai): clan-conditioned training pipeline (harness + env + reward overlays)
Some checks failed
ci / regression gate (push) Waiting to run
deploy-next / deploy dev guide to mc.next.black.lan (push) Failing after 44s

This commit wires up per-clan trained AI. Each training episode samples a clan, stamps it
on the LEARNER slot so the observation one-hot-encodes it, and scales the SHAPING rewards by
that clan's overlay (the terminal win/loss/decisive rewards stay universal):

- player_api_main.gd: CP_LEARNER_CLAN stamps the learner slot's clan via
  set_player_personality_json -> PlayerState.clan_id -> PlayerView.clan_index ->
  obs clan one-hot. (Previously only non-learner slots got a clan.)
- reward_overlays.json: per-clan group multipliers (combat/expansion/production/
  economy/tech) derived from ai_personalities.json strategic_axes, normalized per
  clan to mean 1.0 (no fairness confound). Archetypes emerge: blackhammer combat 1.5,
  goldvein economy 1.64, deepforge expansion 0.42.
- magic_civ_env.py: samples the clan per episode (seeded), passes CP_LEARNER_CLAN,
  scales the 8 shaping reward terms by self._ov(group).
- harness_client.py: HarnessConfig.learner_clan -> CP_LEARNER_CLAN.
- train.py: --clan ('' generalist | 'all' samples every clan | comma list).

Local checks: py_compile clean; overlays cover all 6 clans. Next: a fleet smoke test
(verify clan_index appears in the learner view, plus a tiny training run) before scaling out.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-30 13:06:02 -04:00
parent a6fb75a480
commit 57b326b670
5 changed files with 175 additions and 9 deletions

View file

@ -0,0 +1,68 @@
{
"$comment": "Per-clan reward-shaping overlays for clan-conditioned RL (mc-ai-trained-not-scripted). Multipliers on the EVENT-reward groups only \u2014 terminal win/loss/decisive bonus stay universal so every clan equally wants to win, differing only in intermediate incentives. Derived from ai_personalities.json strategic_axes (axis/5 = neutral at 5), normalized per clan to mean 1.0 so total shaping magnitude is comparable across clans (no difficulty/fairness confound). Generated, do not hand-edit; regenerate from the axes.",
"groups": {
"combat": [
"capital_captured_by_me",
"city_captured_by_me",
"enemy_unit_killed_by_me",
"opponent_eliminated"
],
"expansion": [
"city_founded_by_me"
],
"production": [
"wonder_built_by_me"
],
"economy": [
"score_delta"
],
"tech": [
"tech_researched_by_me",
"culture_researched_by_me"
]
},
"overlays": {
"ironhold": {
"combat": 1.0909,
"expansion": 0.7273,
"production": 1.6364,
"economy": 0.5455,
"tech": 1.0
},
"goldvein": {
"combat": 0.5455,
"expansion": 0.9091,
"production": 0.9091,
"economy": 1.6364,
"tech": 1.0
},
"blackhammer": {
"combat": 1.5,
"expansion": 1.0,
"production": 1.1667,
"economy": 0.3333,
"tech": 1.0
},
"deepforge": {
"combat": 0.8421,
"expansion": 0.4211,
"production": 1.6842,
"economy": 1.0526,
"tech": 1.0
},
"tinkersmith": {
"combat": 0.8421,
"expansion": 1.0526,
"production": 1.2632,
"economy": 0.8421,
"tech": 1.0
},
"runesmith": {
"combat": 0.9091,
"expansion": 1.0909,
"production": 0.9091,
"economy": 1.0909,
"tech": 1.0
}
}
}

View file

@ -696,6 +696,29 @@ func _apply_ai_assignments(gs: RefCounted, num_players: int) -> void:
else:
_emit_event("ai_controller_assigned", {"slot": slot, "controller_id": controller_id})
# Clan-condition the LEARNER slot (the Python-controlled slot). Stamp its
# clan id so `PlayerState.clan_id` projects into `PlayerView.clan_index`,
# which the learned-controller observation one-hots — this is how RL
# training conditions the policy on the clan it is playing. The scoring
# weights set alongside are inert (the learner's actions come from stdin,
# not the scripted AI). No-op if unset/invalid: learner stays the
# generalist (clan_index = -1).
var learner_clan: String = _env_or("CP_LEARNER_CLAN", "").strip_edges()
if not learner_clan.is_empty():
if clan_ids.has(learner_clan):
var ok_learner: bool = bool(
gs.set_player_personality_json(_player_slot, learner_clan, json_text)
)
if ok_learner:
_emit_event("learner_clan_assigned", {"slot": _player_slot, "clan_id": learner_clan})
else:
_emit_protocol_error(
"set_player_personality_json failed for learner slot=%d clan=%s"
% [_player_slot, learner_clan]
)
else:
_emit_protocol_error("CP_LEARNER_CLAN=%s not in ai_personalities.json" % learner_clan)
func _scan_land_tiles(grid: RefCounted, w: int, h: int) -> Array[Vector2i]:
## Walk the grid and collect every land hex. Mirrors

View file

@ -69,6 +69,12 @@ class HarnessConfig:
# Set this to mix learned + scripted opponents in one game, e.g.
# `("learned:duel-v1b", "", "")` puts learned on the first AI slot.
player_controllers: tuple[str, ...] = ()
# Clan-conditioned RL: stamp the LEARNER slot's clan id (an
# ai_personalities.json key, e.g. "blackhammer") so PlayerState.clan_id
# projects into PlayerView.clan_index and the observation one-hots it.
# Empty = generalist (clan_index = -1). See player_api_main.gd
# CP_LEARNER_CLAN.
learner_clan: str = ""
@property
def effective_player_slots(self) -> tuple[int, ...]:
@ -90,6 +96,8 @@ class HarnessConfig:
}
if self.player_controllers:
env["CP_PLAYER_CONTROLLERS"] = ",".join(self.player_controllers)
if self.learner_clan:
env["CP_LEARNER_CLAN"] = self.learner_clan
return env

View file

@ -15,6 +15,9 @@ its win rate against this baseline; the policy is considered to have
"""
from __future__ import annotations
import json
import os
import random
import sys
import time
from dataclasses import replace
@ -107,6 +110,20 @@ def _step_penalty(turn: int) -> float:
DEFAULT_MAX_STEPS_PER_EPISODE = 250_000
DEFAULT_MAX_TURNS = 1000
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
_OVERLAYS_REL = "public/games/age-of-dwarves/data/ai/reward_overlays.json"
def _load_reward_overlays() -> dict[str, dict[str, float]]:
"""Per-clan reward-shaping overlays (clan -> {group -> multiplier}). Missing
file = no overlays (every clan trains on the neutral catalog)."""
path = os.environ.get("MC_REWARD_OVERLAYS") or os.path.join(_REPO_ROOT, _OVERLAYS_REL)
try:
with open(path, encoding="utf-8") as fh:
return json.load(fh).get("overlays", {})
except (OSError, json.JSONDecodeError):
return {}
class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
"""Single-learner Gym wrapper: our policy controls slot 0.
@ -130,6 +147,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
max_turns: int = DEFAULT_MAX_TURNS,
max_steps_per_episode: int = DEFAULT_MAX_STEPS_PER_EPISODE,
opponent: ModelOpponent | None = None,
clan_list: tuple[str, ...] = (),
) -> None:
super().__init__()
self._config = harness_config or HarnessConfig()
@ -168,6 +186,22 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
# opponent elimination (the old duel-only 1v1 shortcut). The
# authoritative `game_over` event still takes priority when present.
self._live_players: set[int] = set()
# Clan-conditioned RL. Each episode the env samples a clan from
# `clan_list`, stamps it on the learner slot (CP_LEARNER_CLAN → the obs
# clan one-hot) and scales the SHAPING rewards by that clan's overlay
# (group -> multiplier). Terminal win/loss/decisive stay universal so
# every clan equally wants to win. Empty list = generalist (no clan,
# neutral catalog). Seeded RNG → reproducible clan sequence per run.
self._clan_list: tuple[str, ...] = tuple(clan_list)
self._overlays: dict[str, dict[str, float]] = _load_reward_overlays()
self._clan_rng = random.Random(self._config.seed)
self._cur_clan: str = ""
self._cur_overlay: dict[str, float] = {}
def _ov(self, group: str) -> float:
"""Reward-shaping multiplier for the current episode's clan (1.0 if
generalist / unknown group)."""
return self._cur_overlay.get(group, 1.0)
# ── Gymnasium API ────────────────────────────────────────────────
@ -190,6 +224,15 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
# dropped them, which would have un-declared the external slots.
if seed is not None:
cfg = replace(cfg, seed=seed)
# Clan-conditioned RL: sample this episode's clan, stamp it on the
# learner slot (CP_LEARNER_CLAN), and select its reward overlay.
if self._clan_list:
self._cur_clan = self._clan_rng.choice(self._clan_list)
cfg = replace(cfg, learner_clan=self._cur_clan)
self._cur_overlay = self._overlays.get(self._cur_clan, {})
else:
self._cur_clan = ""
self._cur_overlay = {}
self._terminated = False
self._step_count = 0
self._capital_by_player = {}
@ -309,7 +352,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
prev_score = self._last_score
new_score = float(view.get("score", {}).get("score_estimate", 0.0))
# Symmetric score-delta — gains and losses both count.
reward += SCORE_DELTA_SCALE * (new_score - prev_score)
reward += SCORE_DELTA_SCALE * (new_score - prev_score) * self._ov("economy")
# Event-driven shaping (Phase 1 catalog).
reward += self._apply_event_rewards(recent_events, me)
@ -454,7 +497,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
self._capital_by_player[owner] = cid
if owner == me:
if self._city_founded_rewards_issued < MAX_CITY_FOUNDED_REWARDS:
total += CITY_FOUNDED_BY_ME
total += CITY_FOUNDED_BY_ME * self._ov("expansion")
self._city_founded_rewards_issued += 1
elif kind == "city_captured":
old_owner = int(ev.get("old_owner", -1))
@ -465,14 +508,16 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
and self._capital_by_player.get(old_owner) == cid
)
if new_owner == me:
total += CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
total += (
CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
) * self._ov("combat")
elif old_owner == me:
total += CAPITAL_LOST_BY_ME if is_capital else CITY_LOST_BY_ME
# When a capital changes hands, the *capturer's* first
# city is still their own capital — don't reassign.
elif kind == "wonder_built":
if int(ev.get("player", -1)) == me:
total += WONDER_BUILT_BY_ME
total += WONDER_BUILT_BY_ME * self._ov("production")
elif kind == "combat_resolved":
# Attribution: the wire event carries unit ids, not owners.
# We synthesise from defender_killed/attacker_killed plus
@ -494,17 +539,17 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
# asymmetric ±0.04/+0.05 is net-positive on even trades).
killer = ev.get("killer_unit_id")
if killer is None or self._unit_owner_lookup(str(killer)) == me:
total += ENEMY_UNIT_KILLED_BY_ME
total += ENEMY_UNIT_KILLED_BY_ME * self._ov("combat")
elif kind == "tech_researched":
if int(ev.get("player", -1)) == me:
total += TECH_RESEARCHED_BY_ME
total += TECH_RESEARCHED_BY_ME * self._ov("tech")
elif kind == "culture_researched":
if int(ev.get("player", -1)) == me:
total += CULTURE_RESEARCHED_BY_ME
total += CULTURE_RESEARCHED_BY_ME * self._ov("tech")
elif kind == "player_eliminated":
p = int(ev.get("player", -1))
if p != me and p >= 0:
total += OPPONENT_ELIMINATED
total += OPPONENT_ELIMINATED * self._ov("combat")
return total
def _unit_owner_lookup(self, unit_id: str) -> int:

View file

@ -66,6 +66,13 @@ def _build_argparser() -> argparse.ArgumentParser:
help="Stop training once eval win-rate exceeds this (default: 0.55).")
p.add_argument("--run-name", default="duel-v1",
help="Subdirectory under runs/ + models/ (default: duel-v1).")
p.add_argument("--clan", default="",
help=("Clan-conditioned training. '' = generalist (no clan, "
"clan_index=-1). 'all' = sample every clan per episode "
"(one conditioned policy for all clans). Or a comma list "
"of ai_personalities.json ids, e.g. 'blackhammer'. The "
"sampled clan is stamped on the learner (obs one-hot) and "
"selects its reward overlay."))
p.add_argument("--seed", type=int, default=42,
help="Base RNG seed; per-env seeds offset from this (default: 42).")
p.add_argument("--init-from", default=None,
@ -111,6 +118,18 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
int(s) for s in str(args.opponent_slots).split(",") if s.strip()
)
# Resolve the clan list for clan-conditioned training. '' = generalist;
# 'all' = every clan (from the obs schema's canonical clan_order); else a
# comma list of ai_personalities.json ids.
clan_arg = str(getattr(args, "clan", "") or "").strip()
if not clan_arg:
clan_list: tuple[str, ...] = ()
elif clan_arg == "all":
from tooling.rl_self_play.obs_contract import load_schema # type: ignore[import-not-found]
clan_list = tuple(load_schema()["clan_order"])
else:
clan_list = tuple(c.strip() for c in clan_arg.split(",") if c.strip())
def _make() -> MagicCivEnv:
cfg = HarnessConfig(
seed=args.seed + env_idx,
@ -128,7 +147,10 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
deterministic=args.opponent_deterministic,
)
return MagicCivEnv(
harness_config=cfg, max_turns=args.max_turns, opponent=opponent
harness_config=cfg,
max_turns=args.max_turns,
opponent=opponent,
clan_list=clan_list,
)
return _make