feat(ai): clan-conditioned training pipeline (harness + env + reward overlays)
The wiring for per-clan trained AI. Each training episode samples a clan, stamps it
on the LEARNER slot so the obs one-hots it, and scales the SHAPING rewards by that
clan's overlay (terminal win/loss stay universal):
- player_api_main.gd: CP_LEARNER_CLAN stamps the learner slot's clan via
set_player_personality_json -> PlayerState.clan_id -> PlayerView.clan_index ->
obs clan one-hot. (Previously only non-learner slots got a clan.)
- reward_overlays.json: per-clan group multipliers (combat/expansion/production/
economy/tech) derived from ai_personalities.json strategic_axes, normalized per
clan to mean 1.0 (no fairness confound). Archetypes emerge: blackhammer combat 1.5,
goldvein economy 1.64, deepforge expansion 0.42.
- magic_civ_env.py: samples the clan per episode (seeded), passes CP_LEARNER_CLAN,
scales the shaping rewards at 8 sites (9 grouped terms; capital/city capture share one) by self._ov(group).
- harness_client.py: HarnessConfig.learner_clan -> CP_LEARNER_CLAN.
- train.py: --clan ('' generalist | 'all' samples every clan | comma list).
Local checks: py_compile clean; overlays cover all 6 clans. Next: fleet smoke
(clan_index in the learner view + a tiny training run) before scaling out.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a6fb75a480
commit
57b326b670
5 changed files with 175 additions and 9 deletions
68
public/games/age-of-dwarves/data/ai/reward_overlays.json
Normal file
68
public/games/age-of-dwarves/data/ai/reward_overlays.json
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
{
|
||||
"$comment": "Per-clan reward-shaping overlays for clan-conditioned RL (mc-ai-trained-not-scripted). Multipliers on the EVENT-reward groups only \u2014 terminal win/loss/decisive bonus stay universal so every clan equally wants to win, differing only in intermediate incentives. Derived from ai_personalities.json strategic_axes (axis/5 = neutral at 5), normalized per clan to mean 1.0 so total shaping magnitude is comparable across clans (no difficulty/fairness confound). Generated, do not hand-edit; regenerate from the axes.",
|
||||
"groups": {
|
||||
"combat": [
|
||||
"capital_captured_by_me",
|
||||
"city_captured_by_me",
|
||||
"enemy_unit_killed_by_me",
|
||||
"opponent_eliminated"
|
||||
],
|
||||
"expansion": [
|
||||
"city_founded_by_me"
|
||||
],
|
||||
"production": [
|
||||
"wonder_built_by_me"
|
||||
],
|
||||
"economy": [
|
||||
"score_delta"
|
||||
],
|
||||
"tech": [
|
||||
"tech_researched_by_me",
|
||||
"culture_researched_by_me"
|
||||
]
|
||||
},
|
||||
"overlays": {
|
||||
"ironhold": {
|
||||
"combat": 1.0909,
|
||||
"expansion": 0.7273,
|
||||
"production": 1.6364,
|
||||
"economy": 0.5455,
|
||||
"tech": 1.0
|
||||
},
|
||||
"goldvein": {
|
||||
"combat": 0.5455,
|
||||
"expansion": 0.9091,
|
||||
"production": 0.9091,
|
||||
"economy": 1.6364,
|
||||
"tech": 1.0
|
||||
},
|
||||
"blackhammer": {
|
||||
"combat": 1.5,
|
||||
"expansion": 1.0,
|
||||
"production": 1.1667,
|
||||
"economy": 0.3333,
|
||||
"tech": 1.0
|
||||
},
|
||||
"deepforge": {
|
||||
"combat": 0.8421,
|
||||
"expansion": 0.4211,
|
||||
"production": 1.6842,
|
||||
"economy": 1.0526,
|
||||
"tech": 1.0
|
||||
},
|
||||
"tinkersmith": {
|
||||
"combat": 0.8421,
|
||||
"expansion": 1.0526,
|
||||
"production": 1.2632,
|
||||
"economy": 0.8421,
|
||||
"tech": 1.0
|
||||
},
|
||||
"runesmith": {
|
||||
"combat": 0.9091,
|
||||
"expansion": 1.0909,
|
||||
"production": 0.9091,
|
||||
"economy": 1.0909,
|
||||
"tech": 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -696,6 +696,29 @@ func _apply_ai_assignments(gs: RefCounted, num_players: int) -> void:
|
|||
else:
|
||||
_emit_event("ai_controller_assigned", {"slot": slot, "controller_id": controller_id})
|
||||
|
||||
# Clan-condition the LEARNER slot (the Python-controlled slot). Stamp its
|
||||
# clan id so `PlayerState.clan_id` projects into `PlayerView.clan_index`,
|
||||
# which the learned-controller observation one-hots — this is how RL
|
||||
# training conditions the policy on the clan it is playing. The scoring
|
||||
# weights set alongside are inert (the learner's actions come from stdin,
|
||||
# not the scripted AI). No-op if unset; an unknown id or a failed stamp emits a protocol error and the learner stays the
|
||||
# generalist (clan_index = -1).
|
||||
var learner_clan: String = _env_or("CP_LEARNER_CLAN", "").strip_edges()
|
||||
if not learner_clan.is_empty():
|
||||
if clan_ids.has(learner_clan):
|
||||
var ok_learner: bool = bool(
|
||||
gs.set_player_personality_json(_player_slot, learner_clan, json_text)
|
||||
)
|
||||
if ok_learner:
|
||||
_emit_event("learner_clan_assigned", {"slot": _player_slot, "clan_id": learner_clan})
|
||||
else:
|
||||
_emit_protocol_error(
|
||||
"set_player_personality_json failed for learner slot=%d clan=%s"
|
||||
% [_player_slot, learner_clan]
|
||||
)
|
||||
else:
|
||||
_emit_protocol_error("CP_LEARNER_CLAN=%s not in ai_personalities.json" % learner_clan)
|
||||
|
||||
|
||||
func _scan_land_tiles(grid: RefCounted, w: int, h: int) -> Array[Vector2i]:
|
||||
## Walk the grid and collect every land hex. Mirrors
|
||||
|
|
|
|||
|
|
@ -69,6 +69,12 @@ class HarnessConfig:
|
|||
# Set this to mix learned + scripted opponents in one game, e.g.
|
||||
# `("learned:duel-v1b", "", "")` puts learned on the first AI slot.
|
||||
player_controllers: tuple[str, ...] = ()
|
||||
# Clan-conditioned RL: stamp the LEARNER slot's clan id (an
|
||||
# ai_personalities.json key, e.g. "blackhammer") so PlayerState.clan_id
|
||||
# projects into PlayerView.clan_index and the observation one-hots it.
|
||||
# Empty = generalist (clan_index = -1). See player_api_main.gd
|
||||
# CP_LEARNER_CLAN.
|
||||
learner_clan: str = ""
|
||||
|
||||
@property
|
||||
def effective_player_slots(self) -> tuple[int, ...]:
|
||||
|
|
@ -90,6 +96,8 @@ class HarnessConfig:
|
|||
}
|
||||
if self.player_controllers:
|
||||
env["CP_PLAYER_CONTROLLERS"] = ",".join(self.player_controllers)
|
||||
if self.learner_clan:
|
||||
env["CP_LEARNER_CLAN"] = self.learner_clan
|
||||
return env
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,9 @@ its win rate against this baseline; the policy is considered to have
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import replace
|
||||
|
|
@ -107,6 +110,20 @@ def _step_penalty(turn: int) -> float:
|
|||
DEFAULT_MAX_STEPS_PER_EPISODE = 250_000
|
||||
DEFAULT_MAX_TURNS = 1000
|
||||
|
||||
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
_OVERLAYS_REL = "public/games/age-of-dwarves/data/ai/reward_overlays.json"
|
||||
|
||||
|
||||
def _load_reward_overlays() -> dict[str, dict[str, float]]:
    """Load the per-clan reward-shaping overlays (clan -> {group -> multiplier}).

    The file path is taken from the ``MC_REWARD_OVERLAYS`` environment
    variable when it is set and non-empty; otherwise it is ``_OVERLAYS_REL``
    resolved under ``_REPO_ROOT``.

    Loading is best-effort and never raises for a bad overlay file:

    * Missing file: returns ``{}`` silently (every clan trains on the
      neutral catalog).
    * Unreadable, undecodable or malformed file, or a document whose shape
      is not ``{"overlays": {clan: {group: multiplier}}}``: returns ``{}``
      (or drops the offending clan entries) and emits a ``RuntimeWarning``
      so a broken overlay file does not silently turn a clan-conditioned
      run into a generalist one.

    Returns:
        A new dict holding the ``"overlays"`` entries whose value is a
        mapping; empty when nothing usable could be loaded.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import warnings

    path = os.environ.get("MC_REWARD_OVERLAYS") or os.path.join(_REPO_ROOT, _OVERLAYS_REL)
    try:
        with open(path, encoding="utf-8") as fh:
            payload = json.load(fh)
    except FileNotFoundError:
        # Documented contract: no file simply means no overlays.
        return {}
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        warnings.warn(
            f"reward overlays at {path!r} could not be loaded ({exc}); "
            "falling back to neutral shaping rewards",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

    # Valid JSON is not necessarily an object; guard before calling .get().
    overlays = payload.get("overlays", {}) if isinstance(payload, dict) else None
    if not isinstance(overlays, dict):
        warnings.warn(
            f"reward overlays at {path!r} have an unexpected structure; "
            "falling back to neutral shaping rewards",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

    # Keep only clan entries that are mappings so later per-group lookups
    # cannot fail on a stray scalar or list.
    usable = {clan: groups for clan, groups in overlays.items() if isinstance(groups, dict)}
    if len(usable) != len(overlays):
        warnings.warn(
            f"reward overlays at {path!r} contain non-mapping clan entries; "
            "those clans will use neutral shaping rewards",
            RuntimeWarning,
            stacklevel=2,
        )
    return usable
|
||||
|
||||
|
||||
class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
||||
"""Single-learner Gym wrapper: our policy controls slot 0.
|
||||
|
|
@ -130,6 +147,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
max_turns: int = DEFAULT_MAX_TURNS,
|
||||
max_steps_per_episode: int = DEFAULT_MAX_STEPS_PER_EPISODE,
|
||||
opponent: ModelOpponent | None = None,
|
||||
clan_list: tuple[str, ...] = (),
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self._config = harness_config or HarnessConfig()
|
||||
|
|
@ -168,6 +186,22 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
# opponent elimination (the old duel-only 1v1 shortcut). The
|
||||
# authoritative `game_over` event still takes priority when present.
|
||||
self._live_players: set[int] = set()
|
||||
# Clan-conditioned RL. Each episode the env samples a clan from
|
||||
# `clan_list`, stamps it on the learner slot (CP_LEARNER_CLAN → the obs
|
||||
# clan one-hot) and scales the SHAPING rewards by that clan's overlay
|
||||
# (group -> multiplier). Terminal win/loss/decisive stay universal so
|
||||
# every clan equally wants to win. Empty list = generalist (no clan,
|
||||
# neutral catalog). Seeded RNG → reproducible clan sequence per run.
|
||||
self._clan_list: tuple[str, ...] = tuple(clan_list)
|
||||
self._overlays: dict[str, dict[str, float]] = _load_reward_overlays()
|
||||
self._clan_rng = random.Random(self._config.seed)
|
||||
self._cur_clan: str = ""
|
||||
self._cur_overlay: dict[str, float] = {}
|
||||
|
||||
def _ov(self, group: str) -> float:
    """Return the shaping multiplier of ``group`` for the current overlay.

    Looks ``group`` up in ``self._cur_overlay`` and falls back to the
    neutral multiplier ``1.0`` when the overlay has no entry for it (for
    example when the overlay is empty).
    """
    try:
        return self._cur_overlay[group]
    except KeyError:
        # No entry for this group: leave the reward unscaled.
        return 1.0
|
||||
|
||||
# ── Gymnasium API ────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -190,6 +224,15 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
# dropped them, which would have un-declared the external slots.
|
||||
if seed is not None:
|
||||
cfg = replace(cfg, seed=seed)
|
||||
# Clan-conditioned RL: sample this episode's clan, stamp it on the
|
||||
# learner slot (CP_LEARNER_CLAN), and select its reward overlay.
|
||||
if self._clan_list:
|
||||
self._cur_clan = self._clan_rng.choice(self._clan_list)
|
||||
cfg = replace(cfg, learner_clan=self._cur_clan)
|
||||
self._cur_overlay = self._overlays.get(self._cur_clan, {})
|
||||
else:
|
||||
self._cur_clan = ""
|
||||
self._cur_overlay = {}
|
||||
self._terminated = False
|
||||
self._step_count = 0
|
||||
self._capital_by_player = {}
|
||||
|
|
@ -309,7 +352,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
prev_score = self._last_score
|
||||
new_score = float(view.get("score", {}).get("score_estimate", 0.0))
|
||||
# Symmetric score-delta — gains and losses both count.
|
||||
reward += SCORE_DELTA_SCALE * (new_score - prev_score)
|
||||
reward += SCORE_DELTA_SCALE * (new_score - prev_score) * self._ov("economy")
|
||||
# Event-driven shaping (Phase 1 catalog).
|
||||
reward += self._apply_event_rewards(recent_events, me)
|
||||
|
||||
|
|
@ -454,7 +497,7 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
self._capital_by_player[owner] = cid
|
||||
if owner == me:
|
||||
if self._city_founded_rewards_issued < MAX_CITY_FOUNDED_REWARDS:
|
||||
total += CITY_FOUNDED_BY_ME
|
||||
total += CITY_FOUNDED_BY_ME * self._ov("expansion")
|
||||
self._city_founded_rewards_issued += 1
|
||||
elif kind == "city_captured":
|
||||
old_owner = int(ev.get("old_owner", -1))
|
||||
|
|
@ -465,14 +508,16 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
and self._capital_by_player.get(old_owner) == cid
|
||||
)
|
||||
if new_owner == me:
|
||||
total += CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
|
||||
total += (
|
||||
CAPITAL_CAPTURED_BY_ME if is_capital else CITY_CAPTURED_BY_ME
|
||||
) * self._ov("combat")
|
||||
elif old_owner == me:
|
||||
total += CAPITAL_LOST_BY_ME if is_capital else CITY_LOST_BY_ME
|
||||
# When a capital changes hands, the *capturer's* first
|
||||
# city is still their own capital — don't reassign.
|
||||
elif kind == "wonder_built":
|
||||
if int(ev.get("player", -1)) == me:
|
||||
total += WONDER_BUILT_BY_ME
|
||||
total += WONDER_BUILT_BY_ME * self._ov("production")
|
||||
elif kind == "combat_resolved":
|
||||
# Attribution: the wire event carries unit ids, not owners.
|
||||
# We synthesise from defender_killed/attacker_killed plus
|
||||
|
|
@ -494,17 +539,17 @@ class MagicCivEnv(gym.Env[np.ndarray, np.int64]):
|
|||
# asymmetric ±0.04/+0.05 is net-positive on even trades).
|
||||
killer = ev.get("killer_unit_id")
|
||||
if killer is None or self._unit_owner_lookup(str(killer)) == me:
|
||||
total += ENEMY_UNIT_KILLED_BY_ME
|
||||
total += ENEMY_UNIT_KILLED_BY_ME * self._ov("combat")
|
||||
elif kind == "tech_researched":
|
||||
if int(ev.get("player", -1)) == me:
|
||||
total += TECH_RESEARCHED_BY_ME
|
||||
total += TECH_RESEARCHED_BY_ME * self._ov("tech")
|
||||
elif kind == "culture_researched":
|
||||
if int(ev.get("player", -1)) == me:
|
||||
total += CULTURE_RESEARCHED_BY_ME
|
||||
total += CULTURE_RESEARCHED_BY_ME * self._ov("tech")
|
||||
elif kind == "player_eliminated":
|
||||
p = int(ev.get("player", -1))
|
||||
if p != me and p >= 0:
|
||||
total += OPPONENT_ELIMINATED
|
||||
total += OPPONENT_ELIMINATED * self._ov("combat")
|
||||
return total
|
||||
|
||||
def _unit_owner_lookup(self, unit_id: str) -> int:
|
||||
|
|
|
|||
|
|
@ -66,6 +66,13 @@ def _build_argparser() -> argparse.ArgumentParser:
|
|||
help="Stop training once eval win-rate exceeds this (default: 0.55).")
|
||||
p.add_argument("--run-name", default="duel-v1",
|
||||
help="Subdirectory under runs/ + models/ (default: duel-v1).")
|
||||
p.add_argument("--clan", default="",
|
||||
help=("Clan-conditioned training. '' = generalist (no clan, "
|
||||
"clan_index=-1). 'all' = sample every clan per episode "
|
||||
"(one conditioned policy for all clans). Or a comma list "
|
||||
"of ai_personalities.json ids, e.g. 'blackhammer'. The "
|
||||
"sampled clan is stamped on the learner (obs one-hot) and "
|
||||
"selects its reward overlay."))
|
||||
p.add_argument("--seed", type=int, default=42,
|
||||
help="Base RNG seed; per-env seeds offset from this (default: 42).")
|
||||
p.add_argument("--init-from", default=None,
|
||||
|
|
@ -111,6 +118,18 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
|
|||
int(s) for s in str(args.opponent_slots).split(",") if s.strip()
|
||||
)
|
||||
|
||||
# Resolve the clan list for clan-conditioned training. '' = generalist;
|
||||
# 'all' = every clan (from the obs schema's canonical clan_order); else a
|
||||
# comma list of ai_personalities.json ids.
|
||||
clan_arg = str(getattr(args, "clan", "") or "").strip()
|
||||
if not clan_arg:
|
||||
clan_list: tuple[str, ...] = ()
|
||||
elif clan_arg == "all":
|
||||
from tooling.rl_self_play.obs_contract import load_schema # type: ignore[import-not-found]
|
||||
clan_list = tuple(load_schema()["clan_order"])
|
||||
else:
|
||||
clan_list = tuple(c.strip() for c in clan_arg.split(",") if c.strip())
|
||||
|
||||
def _make() -> MagicCivEnv:
|
||||
cfg = HarnessConfig(
|
||||
seed=args.seed + env_idx,
|
||||
|
|
@ -128,7 +147,10 @@ def _make_env_factory(args: argparse.Namespace, env_idx: int):
|
|||
deterministic=args.opponent_deterministic,
|
||||
)
|
||||
return MagicCivEnv(
|
||||
harness_config=cfg, max_turns=args.max_turns, opponent=opponent
|
||||
harness_config=cfg,
|
||||
max_turns=args.max_turns,
|
||||
opponent=opponent,
|
||||
clan_list=clan_list,
|
||||
)
|
||||
|
||||
return _make
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue