diff --git a/public/games/age-of-dwarves/data/sim-scenarios/fullgame/clan_fairness_band.json b/public/games/age-of-dwarves/data/sim-scenarios/fullgame/clan_fairness_band.json index d3a47bc8..52cdf6a8 100644 --- a/public/games/age-of-dwarves/data/sim-scenarios/fullgame/clan_fairness_band.json +++ b/public/games/age-of-dwarves/data/sim-scenarios/fullgame/clan_fairness_band.json @@ -2,7 +2,8 @@ "id": "clan_fairness_band", "kind": "fullgame", "version": 1, - "description": "Balance gate: round-robin all 5+ clan personalities across many seeds; no single personality may exceed the win-rate ceiling. A statistical scenario meant for the DO fleet (needs many games). Threshold is calibrated, not aspirational.", + "gating": false, + "description": "NON-GATING (owner decision 2026-06-30). Measures SCRIPTED clan-personality balance: round-robin all 6 personalities across 50 seeds. Current reality — tech_rusher ~46%, turtle 40%, militarist 14%, and boom/expansionist/merchant 0% — a real imbalance in the SCRIPTED AI. The project's answer is the TRAINED/learned controllers (which perform much better), NOT scripted-personality rebalancing, so this does not gate the release baseline; it stays as a measurement. Fix path: train learned controllers toward the 6 clan types (docs/ai-roadmap.md). The 0.4 ceiling (a pre-calibration placeholder) is intentionally left untuned so the gap stays visible.", "map": { "size": 32, "evolution_ticks": 12000, "seed_base": 10000 }, "players": [ { "personality": "militarist" }, { "personality": "boom" },
diff --git a/scripts/green-pass.sh b/scripts/green-pass.sh
new file mode 100755
index 00000000..52a561a3
--- /dev/null
+++ b/scripts/green-pass.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Hardened-baseline gate: the full horizontal green pass for the simulator.
+#
+#   1. cargo nextest run --workspace  — every crate's unit + integration tests
+#   2. all sim scenarios              — combat set-pieces + fullgame invariants,
+#                                       run through the REAL sim_scenario resolver
+#
+# Gating: a scenario JSON with "gating": false is run and REPORTED but does not
+# fail the gate (e.g. clan_fairness_band measures scripted-personality balance,
+# which is intentionally superseded by trained/learned controllers — see
+# docs/ai-roadmap.md). Every other scenario, and any workspace test failure,
+# fails the gate. So does a failed sim_scenario build, or a run that finds no
+# scenario files at all — neither may be reported as green.
+#
+# Designed to run ON a fleet worker (no local Rust toolchain on plum):
+#   ./run dist:up 1 && rsync this repo state up && ssh worker 'bash scripts/green-pass.sh'
+# Exit 0 only when the baseline is fully green.
+set -uo pipefail
+
+REPO="${MC_REPO:-$HOME/Code/@projects/@magic-civilization}"
+cd "$REPO" || { echo "no repo at $REPO" >&2; exit 2; }
+. ~/.cargo/env 2>/dev/null || true
+SCN_DIR="public/games/age-of-dwarves/data/sim-scenarios"
+OUT="${MC_GREENPASS_OUT:-/tmp/green-pass}"
+rm -rf -- "${OUT:?}"; mkdir -p "$OUT"
+echo "HEAD: $(git log --oneline -1 2>/dev/null)"
+
+# ── [1/3] workspace tests ────────────────────────────────────────────────────
+echo "########## [1/3] cargo nextest run --workspace ##########"
+( cd "$REPO/src/simulator" && cargo nextest run --workspace --no-fail-fast ) > "$OUT/tests.log" 2>&1
+TESTS_RC=$?
+tail -4 "$OUT/tests.log"
+echo "tests rc=$TESTS_RC"
+
+# ── [2/3] build the scenario runner ──────────────────────────────────────────
+echo "########## [2/3] build sim_scenario ##########"
+( cd "$REPO/src/simulator" && cargo build --release -p mc-sim --bin sim_scenario ) > "$OUT/simbuild.log" 2>&1
+SIMBUILD_RC=$?
+echo "sim_scenario build rc=$SIMBUILD_RC"
+# Only trust a binary from a build that just succeeded: after a failed build,
+# anything find turns up is a stale artifact of an older tree.
+SIMBIN=""
+if [ "$SIMBUILD_RC" -eq 0 ]; then
+  SIMBIN="$(find "$REPO" -type f -name sim_scenario -path '*release*' 2>/dev/null | head -1)"
+fi
+echo "sim_scenario: ${SIMBIN:-NOT FOUND}"
+
+# ── [3/3] run scenarios (gating-aware) ───────────────────────────────────────
+echo "########## [3/3] scenarios ##########"
+gate_pass=0; gate_fail=0; nongate_fail=0; scn_total=0; failed_gating=""; failed_nongating=""
+if [ -x "$SIMBIN" ]; then
+  for f in "$SCN_DIR"/combat/*.json "$SCN_DIR"/fullgame/*.json; do
+    [ -e "$f" ] || continue
+    scn_total=$((scn_total+1))
+    name="$(basename "$f" .json)"
+    # The path goes in via argv, not spliced into the Python source, so a quote
+    # in a filename cannot break the lookup. Prints True/False; on any error
+    # $gating stays empty, which falls through to the gating (fail-safe) branch.
+    gating="$(python3 -c 'import json,sys;print(json.load(open(sys.argv[1])).get("gating",True))' "$f" 2>/dev/null)"
+    if "$SIMBIN" "$f" > "$OUT/scn_$name.json" 2>"$OUT/scn_$name.err"; then
+      echo "PASS $name"; gate_pass=$((gate_pass+1))
+    elif [ "$gating" = "False" ]; then
+      echo "KNOWN $name (non-gating — does not fail the baseline)"; nongate_fail=$((nongate_fail+1)); failed_nongating="$failed_nongating $name"
+    else
+      echo "FAIL $name"; gate_fail=$((gate_fail+1)); failed_gating="$failed_gating $name"
+    fi
+  done
+  # Zero scenarios means the globs matched nothing; that is not a green pass.
+  if [ "$scn_total" -eq 0 ]; then
+    echo "no scenario JSON found under $SCN_DIR — gate fails"; gate_fail=1
+  fi
+else
+  echo "sim_scenario not built — gate fails"; gate_fail=1
+fi
+
+echo "=================== SUMMARY ==================="
+echo "workspace tests rc=$TESTS_RC"
+echo "scenarios: $gate_pass gating-pass, $gate_fail gating-FAIL, $nongate_fail non-gating-known-fail"
+[ -n "$failed_gating" ] && echo "GATING_FAILURES:$failed_gating"
+[ -n "$failed_nongating" ] && echo "KNOWN_NONGATING:$failed_nongating"
+
+if [ "$TESTS_RC" -eq 0 ] && [ "$gate_fail" -eq 0 ]; then
+  echo "GREEN_BASELINE: OK"; exit 0
+else
+  echo "GREEN_BASELINE: FAILED"; exit 1
+fi