# magicciv/tools/sprite-generation/engine/ranker.py

"""AI-assisted sprite variant ranking via Claude vision.

Two-tier scoring system:
1. Boolean GATES — binary pass/fail checks (facing, subject count, background, etc.)
   Any single gate failure = instant reject (confidence 0).
2. Quality RANGES — 0-100 gradients for genuine quality dimensions.
   Only scored when all gates pass.

Uses claude-code-batch-sdk to send each variant to the scorer model.
"""
from __future__ import annotations

import asyncio
import json
import logging
from datetime import datetime, timezone
from pathlib import Path

from engine.registry import SpriteRegistry


def _now() -> str:
    return datetime.now(timezone.utc).isoformat()

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): presumably the minimum confidence (0-1 scale, as produced by
# _quality_confidence) for a variant to count as acceptable — confirm at the call site.
CONFIDENCE_THRESHOLD = 0.70
# NOTE(review): presumably the number of acceptable variants wanted per entity —
# usage is outside this section, confirm.
MIN_GOOD_VARIANTS = 3
# Compared against each 0-100 quality score by _quality_floor_check.
QUALITY_DIM_FLOOR = 45  # Any single quality dimension below this = auto-reject

# Concurrency limits per backend type
CONCURRENCY: dict[str, int] = {
    "model-boss": 4,
    "claude": 8,
}

# Per-category threshold values keyed by category name.
# NOTE(review): looks like a per-category override of CONFIDENCE_THRESHOLD — confirm.
CATEGORY_THRESHOLDS: dict[str, float] = {
    "resources": 0.55,
    "improvements": 0.55,
    "ui": 0.55,
}

# ---------------------------------------------------------------------------
# Boolean gates — binary pass/fail, no partial credit
# ---------------------------------------------------------------------------

# Gate names for the "units" category (returned by _get_category_config).
# The tuple order is the order in which _build_gate_instructions numbers them.
UNIT_GATES: tuple[str, ...] = (
    "facing_southwest",
    "single_character",
    "no_text_watermark",
    "no_base_or_ground",
    "full_body_visible",
    "correct_subject_type",
    "is_fantasy_dressed",
    "dwarf_proportions",
    "not_photorealistic",
    "no_anime_style",
    "no_pixel_art",
    "no_multiple_poses",
    "no_chroma_bleed",
    "correct_camera_elevation",
    "clean_background",
)

# Prompt wording for each unit gate, keyed by gate name. This text is sent to the
# scorer model verbatim, so edits here change scoring behavior.
# _contextualize_descriptions overwrites the "facing_southwest", "dwarf_proportions"
# and "correct_subject_type" entries in the copy it returns (the latter two only when
# the entity id yields a race / unit class).
_GATE_DESCRIPTIONS: dict[str, str] = {
    "facing_southwest": (
        "VERY STRICT directional check for isometric game sprites. The character must be "
        "oriented EXACTLY toward the BOTTOM-LEFT corner of the image — southwest on a compass. "
        "In Warcraft III / Civ5 isometric style, this means: their LEFT shoulder is closer to "
        "you, their RIGHT shoulder is farther away, their feet point toward the lower-left, "
        "and you see mostly their back with a slight angle toward the left side. "
        "Answer FALSE if: (1) you can see the character's face or front torso AT ALL, "
        "(2) the character is in pure side PROFILE — body facing purely left with no depth, "
        "(3) the character faces directly SOUTH (straight down) with no leftward lean, "
        "(4) the character faces SOUTHEAST (lower-right) instead of southwest (lower-left), "
        "(5) the character's feet/body appear to point straight down or to any direction "
        "other than the lower-left corner. "
        "Answer TRUE ONLY if the body is unambiguously angled toward the LOWER-LEFT — "
        "not south, not southeast, not profile-left. Bottom-left corner, like walking "
        "toward 7-8 o'clock on a clock face."
    ),
    "single_character": (
        "Is there exactly ONE character in the image? Answer false if there are multiple "
        "characters, a turnaround/reference sheet, or a collage layout."
    ),
    "no_text_watermark": (
        "Is the image free of text, watermarks, logos, or UI elements? "
        "Answer false if any text or watermark is visible."
    ),
    "no_base_or_ground": (
        "Is there NO solid physical surface beneath the character? "
        "Answer TRUE if the character appears to float in empty space against a plain background. "
        "Answer FALSE ONLY if you can see an actual PHYSICAL SURFACE directly under them: "
        "a stone pedestal, grass patch, floor tile, wooden plank, dirt ground, or any opaque "
        "solid surface with texture/color of its own. "
        "CRITICAL: A SHADOW cast on the background does NOT count as a ground surface — "
        "shadows are lighting artifacts, not physical objects. A character floating above their "
        "own shadow is still floating. Answer TRUE unless you see a clearly solid surface "
        "with its own distinct color/texture (not just a darkening of the background)."
    ),
    "full_body_visible": (
        "Can you see the character from head to feet, nothing cropped? "
        "Answer false if head or feet are cut off by the frame edge."
    ),
    "correct_subject_type": (
        "Is this the correct type of unit? An archer MUST have a bow or crossbow visible. "
        "A spearman MUST have a spear. Cavalry MUST be mounted on a horse. "
        "Answer FALSE if the signature weapon or equipment is completely absent — "
        "a plain walking figure with no visible weapons, armor, or role-identifying equipment fails."
    ),
    "is_fantasy_dressed": (
        "Is this character dressed in MEDIEVAL or FANTASY attire from a pre-industrial world? "
        "Acceptable: plate armor, chainmail, leather armor with metal studs, fantasy robes, "
        "tabards over mail, fur cloaks, rough-spun tunics, medieval blacksmith aprons. "
        "Answer FALSE if wearing MODERN or CONTEMPORARY clothing or gear: t-shirts, hoodies, "
        "jeans, sneakers, tracksuits, hard hats, hi-vis vests, safety goggles, cargo pants, "
        "modern tool belts with nylon pouches, utility belts with snap closures, "
        "modern leather holsters, construction worker gear, rubber boots, backpacks with zippers. "
        "Also FALSE for Fortnite/Team Fortress/Overwatch style characters — these look like "
        "modern cartoon characters in costumes, NOT medieval fantasy inhabitants. "
        "Also FALSE for featureless 3D mannequins with no clothing detail. "
        "The character MUST look like a medieval craftsman, warrior, or peasant — NOT a modern "
        "person wearing a costume or a video game character from a contemporary setting."
    ),
    "dwarf_proportions": (
        "Does this character have DWARF proportions? Dwarves are SHORT and STOCKY — roughly "
        "half human height with a barrel chest, thick limbs, wide stance, and large head relative "
        "to body. Answer FALSE if the character has normal human proportions (tall and lean), "
        "is clearly human-height, or looks like a regular person. "
        "Answer TRUE if the character is visibly short, wide, and stocky like a fantasy dwarf."
    ),
    "not_photorealistic": (
        "Is this game art — either painted/illustrated OR stylized 3D — NOT photorealistic? "
        "ACCEPTABLE styles: "
        "(1) Hand-painted / illustrated fantasy art (Magic: The Gathering, Warcraft III concept art), "
        "(2) Stylized 3D game art — Clash Royale, mobile strategy, cartoon 3D, toy-like renders — "
        "where surfaces are clean and simplified, NOT trying to look like real life. "
        "Answer FALSE ONLY for TRUE photorealism: studio-quality photography realism with visible "
        "skin pores, photographic depth of field, hair strands individually rendered, material "
        "textures indistinguishable from a real photograph, or a CGI render trying to look like "
        "a real human being (VFX/film quality). "
        "Clash Royale style, Fortnite style, Pixar style, and mobile strategy game 3D ALL pass. "
        "Only fail if it looks like a movie VFX render or professional CGI photo-double."
    ),
    "no_anime_style": (
        "Is the art style NOT anime/manga? Western fantasy game art is acceptable. "
        "Answer false if you see distinctly Japanese anime aesthetics: very large anime eyes, "
        "exaggerated manga face proportions, or cel-shaded flat-color anime style."
    ),
    "no_pixel_art": (
        "Is this NOT pixel art or retro-style low-resolution art? "
        "Answer false if you see a visible pixel grid or retro game aesthetic."
    ),
    "no_multiple_poses": (
        "Is there exactly ONE pose/view? Answer false if there are multiple views, "
        "a T-pose sheet, or front/side/back layout."
    ),
    "no_chroma_bleed": (
        "Is the character FREE of bright yellow or lime-green color contamination on their "
        "clothing or armor? When sprites are generated on a lime green background, the green "
        "can BLEED into adjacent colors, creating bright YELLOW artifacts — yellow hoods, "
        "yellow vests, yellow armor plates, or neon green patches on clothing that should be "
        "brown, grey, red, or metallic. "
        "Answer FALSE if you see any suspiciously bright YELLOW or NEON GREEN patches on the "
        "character's clothing/armor that look like color contamination from the background. "
        "Answer TRUE if all clothing colors look natural and intentional."
    ),
    "correct_camera_elevation": (
        "STRICT overhead angle check for isometric game sprites. The camera must be "
        "CLEARLY elevated above the character — you see the TOP of their head/helmet "
        "prominently, as if looking DOWN at them from a 45-60 degree angle. "
        "In correct Warcraft III / Civ5 style: the helmet/head top is large and clearly "
        "visible, the character appears short because you see them from above, you can "
        "see their shoulders from a top-down perspective, and the ground under them "
        "would be visible if they weren't floating. "
        "Answer FALSE if: (1) the camera appears to be at EYE LEVEL — you see the "
        "character head-on as if standing in front of them at the same height, "
        "(2) you see more of the CHARACTER'S SIDE than their top — a side view means "
        "eye level, NOT overhead, (3) the head appears the same size or smaller than "
        "in a normal standing portrait — it should look squashed/compressed from above, "
        "(4) the horizon is visible or implied behind the character. "
        "Answer TRUE ONLY if there is unambiguous overhead compression — the character "
        "looks viewed from ABOVE, not from the side."
    ),
    "clean_background": (
        "Is the background plain and simple — a flat color, gradient, or neutral surface — "
        "with NO complex scene elements? "
        "Answer TRUE for: solid color backgrounds, simple gradients, clean studio-style backdrops "
        "(any color: white, green, grey, blue, etc). "
        "Answer FALSE if the background contains: landscape elements (sky, trees, ground, "
        "buildings), other characters, furniture, objects, patterns, textures, or any visual "
        "content that would appear behind the character in a game scene. "
        "The background color itself does not matter — only whether it is plain vs. complex."
    ),
}

# ---------------------------------------------------------------------------
# Quality ranges — 0-100, scored only if all gates pass
# ---------------------------------------------------------------------------

# Quality dimension names for the "units" category (returned by _get_category_config).
UNIT_QUALITY_DIMS: tuple[str, ...] = (
    "direction_quality",
    "art_style",
    "equipment_detail",
    "background_cleanliness",
    "shadow_acceptability",
)

# These are scored for informational display in the review UI but NOT included in
# confidence calculation — rear-view sprites hide facial/body features that are needed
# for reliable race/gender assessment.
UNIT_DISPLAY_DIMS: tuple[str, ...] = (
    "race_accuracy",
    "gender_accuracy",
)

# Prompt wording for each unit quality/display dimension, keyed by dimension name.
# Covers both UNIT_QUALITY_DIMS and UNIT_DISPLAY_DIMS. _contextualize_descriptions may
# replace the "race_accuracy" and "gender_accuracy" entries with entity-specific text.
_QUALITY_DESCRIPTIONS: dict[str, str] = {
    "direction_quality": "How cleanly southwest is the character oriented? 90+ = textbook southwest angle (225 degrees). 70 = clearly left-ish lean. 50 = ambiguous direction.",
    "race_accuracy": "How well do proportions match the race? For dwarves: 90+ = unmistakably short and stocky. 70 = short but could be a human child. 50 = human proportions.",
    "gender_accuracy": "How clear are the gender cues? Male dwarves should have thick braided beards. Female dwarves should have NO beard, braided hair, sturdy feminine build. 90+ = unambiguous. 70 = mostly clear. 50 = ambiguous.",
    "art_style": "How well does it match fantasy game art? Acceptable: painted illustration, stylized 3D (Clash Royale, mobile strategy, Warcraft III 3D). 90+ = strong game-art aesthetic. 70 = serviceable stylized look. 50 = borderline (too photorealistic or too generic).",
    "equipment_detail": "How sharp and readable are weapons/armor? Would it read at 64x64 pixels? 90+ = iconic silhouette, sharp detail. 70 = recognizable gear. 50 = muddy/unclear details.",
    "background_cleanliness": "How clean and plain is the background? 90+ = perfectly flat color or simple gradient with no artifacts. 70 = slight variation but no distracting elements. 50 = some background noise or color bleed from character.",
    "shadow_acceptability": "How acceptable is any shadow on the green background for a game sprite? 90+ = no shadow or only extremely faint shadow that won't affect chroma keying. 70 = mild shadow that can be handled by tuning chroma key threshold. 50 = prominent shadow that will leave dark artifacts. 20 = heavy black shadow covering large area.",
}

# ---------------------------------------------------------------------------
# Non-unit category gates and quality dims
# ---------------------------------------------------------------------------

# Used by _get_category_config for the "terrain" and "biome_grid" categories.
TERRAIN_GATES: tuple[str, ...] = (
    "top_down_view", "seamless_tileable", "no_text_watermark",
    "no_horizon_or_sky", "no_distinct_objects",
)
TERRAIN_QUALITY: tuple[str, ...] = ("texture_quality", "color_richness", "tile_edge_blending")

# Used by _get_category_config for the "buildings" category.
BUILDING_GATES: tuple[str, ...] = (
    "isometric_view", "single_building", "no_text_watermark",
    "roof_visible", "no_front_facade",
)
BUILDING_QUALITY: tuple[str, ...] = ("architectural_detail", "style_consistency", "readability_at_small_size")

# Used by _get_category_config for the "resources" and "improvements" categories.
RESOURCE_GATES: tuple[str, ...] = (
    "icon_not_texture", "single_feature", "green_background",
    "no_text_watermark", "recognizable_as_named",
)
RESOURCE_QUALITY: tuple[str, ...] = ("icon_clarity", "style_match", "readability_at_64px")

# Used by _get_category_config for the "spells" category.
SPELL_GATES: tuple[str, ...] = (
    "magical_effect_visible", "no_text_watermark", "dark_background",
)
SPELL_QUALITY: tuple[str, ...] = ("drama_impact", "magic_type_clarity", "color_vibrancy")

# Prompt wording for every gate used by the terrain, building, resource and spell
# categories, keyed by gate name. One shared dict serves all non-unit categories, so
# "no_text_watermark" appears once here with shorter wording than the unit version.
_NON_UNIT_GATE_DESCRIPTIONS: dict[str, str] = {
    "top_down_view": "Is the view perfectly top-down, looking straight down at ground like a satellite photo? Answer false if there's any horizon, sky, or perspective vanishing point.",
    "seamless_tileable": "Does this look like a seamless, tileable ground texture? Answer false if it has distinct borders, framing, or non-repeating composition.",
    "no_text_watermark": "Is the image free of text, watermarks, logos, or UI elements?",
    "no_horizon_or_sky": "Is the image free of any horizon line or sky? Answer false if you see sky, clouds, or a horizon.",
    "no_distinct_objects": "Is this a pure ground texture without distinct objects like buildings, characters, or items?",
    "isometric_view": "Is this a 3/4 isometric view from above, showing roof and walls? Answer false for front-facing facades or eye-level views.",
    "single_building": "Is there exactly ONE building, centered? Answer false for multiple buildings, cityscapes, or villages.",
    "roof_visible": "Can you see the building's roof from above? Answer false if it's a front elevation without visible roof.",
    "no_front_facade": "Is this NOT a straight-on front view of a building face? Answer false for architectural elevation drawings.",
    "icon_not_texture": "Is this a small isolated icon/object, NOT a full-frame seamless texture? Answer false if it fills the entire frame as ground texture.",
    "single_feature": "Is there exactly ONE distinct feature/object? Answer false for multiple items or cluttered scenes.",
    "green_background": "Is the background predominantly green (chroma key)?",
    "recognizable_as_named": "Is the depicted object clearly recognizable as the named resource/improvement?",
    "magical_effect_visible": "Is there a visible magical energy effect? Answer false if it's just a dark void with no magic visible.",
    "dark_background": "Is the background dark/black? Answer false for bright or colorful backgrounds.",
}

# Prompt wording for every quality dimension used by the terrain, building, resource
# and spell categories, keyed by dimension name (0-100 scoring rubrics).
_NON_UNIT_QUALITY_DESCRIPTIONS: dict[str, str] = {
    "texture_quality": "How detailed and rich is the ground texture? 90+ = beautiful painterly texture. 50 = bland flat color.",
    "color_richness": "How vibrant and appropriate are the colors? 90+ = rich natural palette. 50 = washed out or wrong hue.",
    "tile_edge_blending": "How well would this tile with copies of itself? 90+ = perfectly seamless. 50 = obvious repeat edges.",
    "architectural_detail": "How detailed and well-crafted is the building? 90+ = impressive stonework and ornamentation. 50 = generic box shape.",
    "style_consistency": "Does the art style match fantasy game buildings? 90+ = perfect Civ5/AoE style. 50 = inconsistent or wrong style.",
    "readability_at_small_size": "Would this read clearly at its target display size? 90+ = bold clear silhouette. 50 = muddy at small size.",
    "icon_clarity": "How clear and readable is the icon? 90+ = instantly recognizable. 50 = vague blob.",
    "style_match": "Does it match the game's art direction? 90+ = perfect match. 50 = out of place.",
    "readability_at_64px": "Would this be readable at 64x64 pixels? 90+ = crisp and clear. 50 = loses all detail.",
    "drama_impact": "How dramatic and impactful is the spell effect? 90+ = awe-inspiring. 50 = weak and unimpressive.",
    "magic_type_clarity": "Can you tell what type of magic this is? 90+ = instantly clear (fire, ice, death, etc). 50 = generic energy blob.",
    "color_vibrancy": "How vivid and magical are the colors? 90+ = stunning magical palette. 50 = dull and flat.",
}

# ---------------------------------------------------------------------------
# Category → gates/quality mapping
# ---------------------------------------------------------------------------

def _get_category_config(category: str) -> tuple[tuple[str, ...], tuple[str, ...], dict[str, str], dict[str, str]]:
    """Look up the scoring configuration for a sprite category.

    Returns a 4-tuple ``(gates, quality_dims, gate_descriptions,
    quality_descriptions)``. Unknown categories get four empty containers,
    i.e. no gates and no quality dimensions.
    """
    # Units are the only category with their own description dictionaries.
    if category == "units":
        return UNIT_GATES, UNIT_QUALITY_DIMS, _GATE_DESCRIPTIONS, _QUALITY_DESCRIPTIONS

    # Every other known category shares the non-unit description dictionaries.
    non_unit_table = (
        (("terrain", "biome_grid"), TERRAIN_GATES, TERRAIN_QUALITY),
        (("buildings",), BUILDING_GATES, BUILDING_QUALITY),
        (("resources", "improvements"), RESOURCE_GATES, RESOURCE_QUALITY),
        (("spells",), SPELL_GATES, SPELL_QUALITY),
    )
    for names, gate_names, quality_names in non_unit_table:
        if category in names:
            return (
                gate_names,
                quality_names,
                _NON_UNIT_GATE_DESCRIPTIONS,
                _NON_UNIT_QUALITY_DESCRIPTIONS,
            )

    # Fallback: generic quality-only
    return (), (), {}, {}


# ---------------------------------------------------------------------------
# Prompt templates
# ---------------------------------------------------------------------------

# System prompt sent to the scorer model. It explains the two-part contract
# (boolean gates, then 0-100 quality scores) and shows one failing and one
# passing JSON example. Every gate named in the prose must be a real gate:
# the examples use "no_text_watermark" (present in every category's gate
# tuple) and "facing_southwest" (a unit gate). Shadows are deliberately NOT a
# gate — they are scored through the "shadow_acceptability" quality dimension.
SYSTEM_PROMPT = """\
You are a QA inspector for a commercial fantasy 4X strategy game. You evaluate AI-generated sprites with strict binary checks and quality scores.

Your job has TWO parts:
1. GATES: Answer each boolean gate as true or false. These are BINARY — not "kind of" or "mostly". If any text or watermark is visible, no_text_watermark is false. If the character faces right, facing_southwest is false. No hedging.
2. QUALITY: Score quality dimensions 0-100. Only answer these if ALL gates pass.

If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.

Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block.

Example FAIL (facing_southwest gate):
{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}

Example PASS (all gates true, quality scored):
{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""

# Single-pass prompt asking for gates and quality scores in one response.
# Placeholders: {filename}, {category}, {entity_id}, {prompt},
# {gate_instructions}, {quality_instructions}, {combined_template}.
# NOTE(review): presumably filled with str.format by a caller outside this section — confirm.
COMBINED_PROMPT_TEMPLATE = """Look at the image file {filename} in this directory.

This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt: {prompt}

## Gates (true or false each)

{gate_instructions}

## Quality (0-100, only if ALL gates are true)

{quality_instructions}

Respond with this exact JSON:
{combined_template}"""

# Gate-only prompt (first pass of a two-pass evaluation).
# Placeholders: {filename}, {category}, {entity_id}, {prompt},
# {gate_instructions}, {gate_template}.
GATE_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt: {prompt}

Evaluate each boolean gate (true or false):

{gate_instructions}

Respond with this exact JSON:
{gate_template}"""

# Quality-only prompt, worded for sprites whose gates have already passed.
# Placeholders: {filename}, {entity_id}, {quality_instructions}, {quality_template}.
QUALITY_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:

{quality_instructions}

Respond with this exact JSON:
{quality_template}"""


def _extract_json(text: str) -> dict | None:
    """Extract JSON object from VLM response text.

    Handles markdown code blocks, leading/trailing text, raw JSON, and
    Qwen3 <think>...</think> reasoning blocks.
    """
    import re
    # Strip Qwen3 thinking blocks before any parse attempt
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    # Try direct parse
    try:
        return json.loads(text)
    except (json.JSONDecodeError, TypeError):
        pass
    # Try extracting from markdown code block
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass
    # Try finding outermost { ... }
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group())
        except json.JSONDecodeError:
            pass
    return None


def _parse_entity_context(entity_id: str) -> dict[str, str]:
    """Extract race, gender, and unit class from entity_id like 'spearmen_dwarves_f'."""
    ctx: dict[str, str] = {}
    base = entity_id
    if base.endswith("_f"):
        ctx["gender"] = "female"
        base = base[:-2]
    elif base.endswith("_m"):
        ctx["gender"] = "male"
        base = base[:-2]
    for race in ("dwarves", "humans", "high_elves", "orcs"):
        if f"_{race}" in base:
            ctx["race"] = race
            base = base.replace(f"_{race}", "")
            break
    ctx["unit_class"] = base
    return ctx


def _contextualize_descriptions(
    descs: dict[str, str],
    entity_id: str,
) -> dict[str, str]:
    """Inject entity context into gate/quality descriptions so the scorer knows
    the SPECIFIC gender, race, and unit type it should evaluate against.

    Without this, the scorer sees "is gender correct?" but doesn't know WHICH
    gender is expected — and defaults to "looks fine" (always true).

    Args:
        descs: Mapping of gate/dimension name to prompt text. Not mutated.
        entity_id: Entity id parsed by ``_parse_entity_context``
            (e.g. ``'spearmen_dwarves_f'``).

    Returns:
        A shallow copy of *descs* in which the entries for ``gender_accuracy``,
        ``race_accuracy``, ``facing_southwest``, ``dwarf_proportions`` and
        ``correct_subject_type`` are replaced when they are present and the
        required context (gender / race / unit class) could be parsed.
        Keys absent from *descs* are never added.
    """
    ctx = _parse_entity_context(entity_id)
    gender = ctx.get("gender", "")
    race = ctx.get("race", "")
    unit_class = ctx.get("unit_class", "")

    # Human-readable labels: underscores in ids ("high_elves") must not leak
    # into the prompt text, and an unknown race must not leave a dangling gap.
    race_label = race.upper().replace("_", " ")
    race_noun = race.replace("_", " ") or "character"
    unit_label = unit_class.upper().replace("_", " ")

    result = dict(descs)

    # Contextualize the gender_accuracy dimension.
    # NOTE(review): the dwarf texts say "answer false" although gender_accuracy
    # is scored 0-100 — wording kept as-is, confirm whether that is intended.
    if "gender_accuracy" in result and gender:
        if gender == "female" and race == "dwarves":
            result["gender_accuracy"] = (
                "This MUST be a FEMALE dwarf. Female dwarves have NO beard, braided hair, "
                "and sturdy feminine build. If the character has a BEARD, answer false. "
                "90+ = clearly female with no beard. 50 = ambiguous. 0 = male with beard."
            )
        elif gender == "male" and race == "dwarves":
            result["gender_accuracy"] = (
                "This MUST be a MALE dwarf. Male dwarves have thick braided beards and "
                "burly masculine build. If the character has NO beard, answer false. "
                "90+ = clearly male with prominent beard. 50 = ambiguous."
            )
        elif gender == "female":
            result["gender_accuracy"] = (
                f"This MUST be a FEMALE {race_noun}. The character should have clearly feminine "
                "features and build. 90+ = clearly female. 50 = ambiguous."
            )
        elif gender == "male":
            result["gender_accuracy"] = (
                f"This MUST be a MALE {race_noun}. The character should have clearly masculine "
                "features and build. 90+ = clearly male. 50 = ambiguous."
            )

    # Contextualize the race_accuracy dimension.
    if "race_accuracy" in result and race:
        race_traits = {
            "dwarves": "SHORT and STOCKY — half human height, barrel chest, thick limbs, wide stance. NOT tall, NOT slender.",
            "high_elves": "TALL and SLENDER — pointed ears, pale skin, graceful build. NOT short, NOT stocky.",
            "humans": "MEDIUM build — average proportions, no extreme features.",
            "orcs": "TALL and MUSCULAR — dark olive skin, prominent tusks, fierce expression.",
        }
        if race in race_traits:
            result["race_accuracy"] = (
                f"This must be a {race_label}. "
                f"{race_traits[race]} "
                "90+ = unmistakable. 70 = mostly right. 50 = wrong proportions."
            )

    # Contextualize facing direction gate — stricter than base description
    if "facing_southwest" in result:
        result["facing_southwest"] = (
            "STRICT rear-view check. You must see the character's BACK — spine, shoulder blades, "
            "and the BACK of their head/helmet. The character walks AWAY toward BOTTOM-LEFT. "
            "FAIL (false) if: (1) you see the character's FACE or chest — they face camera, "
            "(2) you see a SIDE PROFILE — one arm in front, one behind, side of face/torso visible — "
            "a side view is NOT a rear view even if the character faces left, "
            "(3) character faces right or upward. "
            "PASS (true) ONLY if the character's back clearly faces you AND body angles toward "
            "bottom-left corner."
        )

    # Contextualize dwarf_proportions — only enforce for dwarf units.
    # When no race could be parsed, the caller's original text is left in place.
    if "dwarf_proportions" in result and race:
        if race == "dwarves":
            result["dwarf_proportions"] = (
                "This MUST be a DWARF — SHORT and STOCKY, roughly half human height, barrel chest, "
                "thick limbs, wide stance, large head relative to body. "
                "Answer FALSE if the character has normal human proportions (tall, lean, "
                "standard limb-to-torso ratio). A tall warrior is NOT a dwarf."
            )
        else:
            # Non-dwarf races: skip this gate (always pass)
            result["dwarf_proportions"] = (
                f"This is a {race_label} unit, not a dwarf. Answer TRUE — dwarf proportion "
                "check does not apply to this race."
            )

    # Contextualize subject type with specific unit class
    if "correct_subject_type" in result and unit_class:
        # NOTE: Characters face AWAY from camera (rear view, walking southwest).
        # Ranged units (archers, crossbowmen, longbowmen) — bow is typically hidden behind body
        # in rear view. Only check for equipment that's naturally visible from behind.
        _ranged_units = {"archers", "crossbowmen", "longbowmen", "bowmen"}
        _cavalry_units = {"cavalry", "heavy_cavalry"}
        _siege_units = {"catapult", "cannon", "ballista"}

        if unit_class in _ranged_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND. "
                "From a rear view, the bow is typically hidden behind the body — this is ACCEPTABLE. "
                "Answer TRUE as long as this is a single humanoid figure (dwarf, human, etc.) in fantasy armor walking away. "
                "Answer FALSE only if this is clearly the wrong subject (a horse, a building, multiple figures, or a non-character)."
            )
        elif unit_class in _cavalry_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND. "
                "The character MUST be MOUNTED on a visible horse/warhorse. "
                "Answer TRUE if there is a rider clearly seated on a horse. "
                "Answer FALSE if no horse is visible or the character is on foot."
            )
        elif unit_class in _siege_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} siege engine. "
                "There MUST be a siege machine visible (catapult frame, cannon barrel, etc.). "
                "Answer FALSE if this shows only characters with no visible siege machine."
            )
        else:
            weapon_map = {
                "spearmen": "a SPEAR (long pole) visible — carried at side, strapped to back, or in hand. A round SHIELD on back or arm.",
                "swordsmen": "a SWORD or SHIELD visible — sheathed at hip, on arm, or in hand from the rear.",
                "axemen": "a battle AXE visible — strapped to back or in hand.",
                "pikemen": "a very long PIKE (far longer than character height) visible, held upright.",
                "mithril_guard": "a gleaming HALBERD or polearm visible, elite armor.",
                "berserkers": "dual AXEs visible or clearly bare-chested berserk warrior.",
            }
            # Unknown unit classes fall back to a generic equipment check.
            expected = weapon_map.get(unit_class, f"equipment appropriate for a {unit_class}")
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND (rear view). "
                f"Verify: {expected}. "
                "Answer TRUE if the unit type is identifiable. "
                "Answer FALSE only if clearly the wrong type (no weapon visible when it should be prominent)."
            )

    return result


def _build_gate_instructions(gates: tuple[str, ...], descs: dict[str, str]) -> str:
    lines = []
    for i, g in enumerate(gates, 1):
        desc = descs.get(g, "")
        lines.append(f"{i}. **{g}** (true/false): {desc}")
    return "\n".join(lines)


def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) -> str:
    lines = []
    for i, d in enumerate(dims, 1):
        desc = descs.get(d, "")
        lines.append(f"{i}. **{d}** (0-100): {desc}")
    return "\n".join(lines)


def _build_gate_only_template(gates: tuple[str, ...]) -> str:
    return json.dumps({
        "gates": {g: True for g in gates},
        "failed_gate_reason": "null or string explaining which gate failed and why",
    }, indent=2)


def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
    return json.dumps({"quality": {d: 0 for d in quality_dims}}, indent=2)


def _parse_gates_only(
    raw: str,
    gates: tuple[str, ...],
) -> tuple[bool, dict[str, bool], str | None] | None:
    """Parse a gate-only response.

    Args:
        raw: Raw model response text.
        gates: Names of the gates that are expected in the response.

    Returns:
        ``(gate_passed, gates_dict, failed_reason)``, or ``None`` when the
        response contains no JSON object with a ``"gates"`` object.
        ``gates_dict`` has one entry per name in *gates*; a gate that is
        missing or not a bool/number counts as failed. ``failed_reason`` is
        only reported when at least one gate failed, and placeholder values
        ("null", "none", blank) are normalised to ``None``.
    """
    data = _extract_json(raw)
    # The extracted JSON may be any JSON value (e.g. a list); only an object
    # can carry a "gates" key, so anything else is a parse failure.
    if not isinstance(data, dict):
        return None
    gates_data = data.get("gates")
    if not isinstance(gates_data, dict):
        return None

    parsed: dict[str, bool] = {}
    for g in gates:
        val = gates_data.get(g)
        # Strings such as "true" are deliberately NOT accepted: fail closed.
        parsed[g] = bool(val) if isinstance(val, (bool, int, float)) else False

    gate_passed = all(parsed.values())
    failed_reason = None if gate_passed else data.get("failed_gate_reason")
    # The response template literally shows "null or string ...", so models
    # sometimes echo the word (possibly padded) instead of a real JSON null.
    if isinstance(failed_reason, str) and failed_reason.strip().lower() in ("null", "none", ""):
        failed_reason = None
    return (gate_passed, parsed, failed_reason)


def _parse_quality_only(
    raw: str,
    quality_dims: tuple[str, ...],
) -> dict[str, int] | None:
    """Parse a quality-only response.

    Returns ``{dim: score}`` with one entry per requested dimension, or None
    when no JSON could be extracted from ``raw`` or its ``"quality"`` entry is
    not an object. A missing ``"quality"`` key is treated as an empty object,
    so every dimension scores 0.

    Numeric scores are truncated to int and clamped to 0-100. Anything that is
    not a usable number — a missing dimension, a string, or a non-finite float
    such as NaN/Infinity — scores 0.
    """
    data = _extract_json(raw)
    if data is None:
        return None
    quality_data = data.get("quality", {})
    if not isinstance(quality_data, dict):
        return None

    result: dict[str, int] = {}
    for dim in quality_dims:
        val = quality_data.get(dim)
        score = 0
        if isinstance(val, (int, float)):
            try:
                score = max(0, min(100, int(val)))
            except (ValueError, OverflowError):
                # int() rejects NaN (ValueError) and +/-Infinity (OverflowError),
                # both of which Python's JSON decoder accepts. Treat them like
                # any other malformed score instead of aborting the whole parse.
                score = 0
        result[dim] = score
    return result


def _quality_confidence(quality: dict[str, int]) -> float:
    """Collapse per-dimension 0-100 scores into one 0-1 confidence value.

    The result is the sum of the scores divided by the maximum possible sum
    (100 per dimension), rounded to three decimals. An empty mapping yields 0.0.
    """
    if not quality:
        return 0.0
    best_possible = len(quality) * 100
    return round(sum(quality.values()) / best_possible, 3)


def _quality_floor_check(quality: dict[str, int]) -> str | None:
    """Report every dimension that scored below QUALITY_DIM_FLOOR.

    Returns a reason string listing the offending ``dim=score`` pairs, or None
    when all dimensions are at or above the floor.
    """
    breaches = []
    for dim, score in quality.items():
        if score < QUALITY_DIM_FLOOR:
            breaches.append(f"{dim}={score}")
    if not breaches:
        return None
    return f"quality floor breach: {', '.join(breaches)}"


def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
    """Average two quality score dicts dimension-by-dimension.

    The result has exactly the keys of ``a``; a dimension missing from ``b``
    counts as 0 there, and keys that appear only in ``b`` are ignored.
    Averages use floor division, so halves round down.
    """
    merged: dict[str, int] = {}
    for dim, first in a.items():
        merged[dim] = (first + b.get(dim, 0)) // 2
    return merged


# kept for backward-compat with confidence_from_notes (reads stored JSON)
def _parse_gated_scores_legacy(
    data: dict,
    gates: tuple[str, ...],
    quality_dims: tuple[str, ...],
) -> dict:
    """Normalise an already-decoded gate+quality payload into a result dict.

    Gates are read from ``data["gates"]`` (booleans/numbers by truthiness,
    anything else fails) and quality scores from ``data["quality"]`` (numbers
    truncated and clamped to 0-100, anything else 0). The quality floor is only
    checked when every gate passed; a floor breach turns the overall verdict
    into a failure and replaces the failure reason.

    Returns a dict with the keys ``gates``, ``quality``, ``gate_passed``,
    ``confidence``, ``failed_gate_reason`` and ``quality_floor_failed``.
    """
    raw_gates = data.get("gates", {})
    gate_flags: dict[str, bool] = {}
    for name in gates:
        answer = raw_gates.get(name)
        gate_flags[name] = isinstance(answer, (bool, int, float)) and bool(answer)
    all_gates_ok = all(gate_flags.values())
    gate_reason = None if all_gates_ok else data.get("failed_gate_reason")

    raw_quality = data.get("quality", {})
    scores: dict[str, int] = {}
    for dim in quality_dims:
        value = raw_quality.get(dim)
        if isinstance(value, (int, float)):
            scores[dim] = max(0, min(100, int(value)))
        else:
            scores[dim] = 0

    # The floor only matters once the gates are clear.
    floor_reason = _quality_floor_check(scores) if all_gates_ok else None
    floor_failed = bool(floor_reason)
    if all_gates_ok and not floor_failed:
        confidence = _quality_confidence(scores)
    else:
        confidence = 0.0

    return {
        "gates": gate_flags,
        "quality": scores,
        "gate_passed": all_gates_ok and not floor_failed,
        "confidence": confidence,
        "failed_gate_reason": floor_reason if floor_failed else gate_reason,
        "quality_floor_failed": floor_failed,
    }


def _build_combined_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
    """Return the example JSON reply for a combined gate+quality prompt.

    Gates are listed with the placeholder ``true``, quality dimensions with the
    placeholder 0, and ``failed_gate_reason`` carries a human-readable hint.
    """
    template = {
        "gates": dict.fromkeys(gates, True),
        "failed_gate_reason": "null or string explaining which gate failed and why",
        "quality": dict.fromkeys(quality_dims, 0),
    }
    return json.dumps(template, indent=2)


def _parse_combined(
    raw: str,
    gates: tuple[str, ...],
    quality_dims: tuple[str, ...],
) -> dict | None:
    """Parse a single combined gate+quality response into a result dict.

    Returns None when either the gate section or (for gate-passing replies)
    the quality section cannot be parsed. Otherwise returns a dict with the
    keys ``gates``, ``quality``, ``gate_passed``, ``confidence``,
    ``failed_gate_reason`` and ``quality_floor_failed``. Quality is left empty
    when a gate failed, and a quality-floor breach is reported as a failure
    with confidence 0.
    """
    gate_outcome = _parse_gates_only(raw, gates)
    if gate_outcome is None:
        return None
    all_passed, gate_flags, gate_reason = gate_outcome

    # Start from the "gate failed" shape and upgrade it step by step.
    verdict = {
        "gates": gate_flags,
        "quality": {},
        "gate_passed": False,
        "confidence": 0.0,
        "failed_gate_reason": gate_reason,
        "quality_floor_failed": False,
    }
    if not all_passed:
        return verdict

    scores = _parse_quality_only(raw, quality_dims)
    if scores is None:
        return None
    verdict["quality"] = scores

    breach = _quality_floor_check(scores)
    if breach:
        verdict["failed_gate_reason"] = breach
        verdict["quality_floor_failed"] = True
    else:
        verdict["gate_passed"] = True
        verdict["confidence"] = _quality_confidence(scores)
        verdict["failed_gate_reason"] = None
    return verdict

def confidence_from_notes(notes_json: str) -> tuple[float, bool]:
    """Extract confidence and gate_passed from stored notes JSON.

    Two layouts are understood:

    - Gated format (has a ``"gates"`` key): the stored ``confidence`` and
      ``gate_passed`` values are returned, defaulting to 0.0 and False.
    - Legacy flat format: the mean of all numeric values is returned as the
      confidence (0 when there are none) and the gates count as passed.

    Returns (confidence, gate_passed).
    """
    scores = json.loads(notes_json)
    if "gates" not in scores:
        # Legacy flat format — no gates were recorded, so assume they passed.
        numeric = [v for v in scores.values() if isinstance(v, (int, float))]
        average = sum(numeric) / len(numeric) if numeric else 0
        return (average, True)
    return (scores.get("confidence", 0.0), scores.get("gate_passed", False))


class Scorer:
    """Scores a variant image using configurable single-pass or two-pass rubric.

    single_pass=False (default — two-pass + optional tiebreaker):
      Pass 1 — gate-only prompt: binary pass/fail. Failures exit immediately.
      Pass 2 — quality-only prompt: 0-100 dimensions, only if all gates pass.
      Pass 3 — tiebreaker (optional): re-scores quality when confidence is within
                `tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.

    single_pass=True (one combined gate+quality call):
      For high-accuracy models where extra API calls cost more than accuracy gain.

    Two backend types:
    - "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
    - "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)

    Any other backend name builds a scorer without a client; its backend calls
    return None, so scoring yields None.
    """

    def __init__(
        self,
        name: str,
        backend: str,
        model: str,
        threshold: float,
        tiebreaker_range: float = 0.0,
        single_pass: bool = False,
    ):
        """Store the stage settings and build the backend client.

        Args:
            name: Stage label used in log messages and client ids.
            backend: "model-boss" or "claude"; selects the client to build.
            model: Model identifier passed to the backend.
            threshold: Confidence the tiebreaker range is centred on.
            tiebreaker_range: Half-width of the band around ``threshold`` in
                which quality is scored a second time; 0.0 disables it.
            single_pass: Use one combined prompt instead of separate passes.
        """
        self.name = name
        self.backend = backend
        self.model = model
        self.threshold = threshold
        self.tiebreaker_range = tiebreaker_range
        self.single_pass = single_pass
        # Per-scorer cap on in-flight score() calls; unknown backends get 4.
        self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))
        # Always define the attribute so an unrecognised backend leaves a
        # well-formed object instead of one that is missing ``_client``.
        self._client = None

        # Backend packages are imported lazily so only the one in use is needed.
        if backend == "model-boss":
            from model_boss import InferenceClient
            self._client = InferenceClient(
                client_id=f"sprite-ranker-{name}",
                default_priority="normal",
                timeout=120.0,
            )
        elif backend == "claude":
            _ensure_claude_sdk()
            from claude_code_batch_sdk import ClaudeClient
            self._client = ClaudeClient(model=model, max_concurrent=2, timeout=180.0)

    async def score(self, raw_path: str, sprite: dict) -> dict | None:
        """Score a single variant image, bounded by backend semaphore."""
        async with self._semaphore:
            return await self._score_inner(raw_path, sprite)

    async def score_stream(
        self, items: list[tuple[str, dict]],
    ):
        """Score multiple (image_path, sprite) pairs concurrently.

        Yields (index, result) pairs as each completes — callers receive and
        process results immediately without waiting for the whole batch.
        ``index`` is the position of the pair in ``items``. An item whose
        scoring raises is logged and yielded as ``(index, None)``, so a failure
        is always attributable to the item that caused it.
        """
        async def _task(idx: int, path: str, sprite: dict) -> tuple[int, dict | None]:
            # Catch here, not around as_completed(): once the exception has
            # left the task, the index of the failing item is no longer known.
            try:
                return (idx, await self.score(path, sprite))
            except Exception as exc:
                logger.warning("[%s] Batch item %d error: %s", self.name, idx, exc)
                return (idx, None)

        tasks = [_task(i, p, s) for i, (p, s) in enumerate(items)]
        for coro in asyncio.as_completed(tasks):
            yield await coro

    async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
        """Send a single prompt+image to the backend. Returns raw text or None.

        The "model-boss" branch sends the image inline as a base64 PNG data
        URL. The "claude" branch sends only the prompt text, with the image's
        directory as working directory and the Read tool allowed; ``img_b64``
        is not used there. Any backend exception is logged and turned into
        None, as is an unrecognised backend name.
        """
        try:
            if self.backend == "model-boss":
                return await self._client.chat(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                            ],
                        },
                    ],
                    max_tokens=2048,
                    temperature=0.1,
                    keep_alive=300,
                )
            elif self.backend == "claude":
                return await self._client.generate(
                    system=SYSTEM_PROMPT,
                    user=prompt,
                    cwd=str(Path(raw_path).parent),
                    allowed_tools=["Read"],
                )
            return None
        except Exception as exc:
            logger.warning("[%s] Backend call failed: %s", self.name, exc)
            return None

    async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
        """Shared setup, then branches to single-pass or two-pass scoring.

        Returns None when the image is missing, the category has no gates, a
        backend call fails, or a required reply cannot be parsed. Otherwise
        returns a dict with the keys ``gates``, ``quality``, ``gate_passed``,
        ``confidence``, ``failed_gate_reason`` and ``quality_floor_failed``.
        """
        import base64

        if not raw_path or not Path(raw_path).exists():
            return None

        category = sprite["category"]
        gates, quality_dims, gate_descs, quality_descs = _get_category_config(category)
        if not gates:
            return None

        entity_id = sprite.get("entity_id", "")
        ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
        ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
        filename = Path(raw_path).name
        # Only the first 300 characters of the generation prompt are quoted.
        prompt_excerpt = sprite["prompt"][:300]
        img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()

        gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
        quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)

        # ---------------------------------------------------------------
        # Single-pass: one combined gate+quality call
        # ---------------------------------------------------------------
        if self.single_pass:
            prompt = COMBINED_PROMPT_TEMPLATE.format(
                filename=filename,
                category=category,
                entity_id=entity_id,
                prompt=prompt_excerpt,
                gate_instructions=gate_instructions,
                quality_instructions=quality_instructions,
                combined_template=_build_combined_template(gates, quality_dims),
            )
            raw = await self._call_backend(img_b64, raw_path, prompt)
            if raw is None:
                return None
            return _parse_combined(raw, gates, quality_dims)

        # ---------------------------------------------------------------
        # Two-pass + optional tiebreaker
        # ---------------------------------------------------------------

        # --- Pass 1: gates only ---
        gate_prompt = GATE_PROMPT_TEMPLATE.format(
            filename=filename,
            category=category,
            entity_id=entity_id,
            prompt=prompt_excerpt,
            gate_instructions=gate_instructions,
            gate_template=_build_gate_only_template(gates),
        )
        gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
        if gate_raw is None:
            return None

        gate_result = _parse_gates_only(gate_raw, gates)
        if gate_result is None:
            return None

        gate_passed, parsed_gates, failed_reason = gate_result
        if not gate_passed:
            # A failed gate ends scoring here; no quality call is made.
            return {
                "gates": parsed_gates,
                "quality": {},
                "gate_passed": False,
                "confidence": 0.0,
                "failed_gate_reason": failed_reason,
                "quality_floor_failed": False,
            }

        # --- Pass 2: quality only ---
        quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
            filename=filename,
            entity_id=entity_id,
            quality_instructions=quality_instructions,
            quality_template=_build_quality_only_template(quality_dims),
        )
        quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
        if quality_raw is None:
            return None

        quality = _parse_quality_only(quality_raw, quality_dims)
        if quality is None:
            return None

        # --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
        if self.tiebreaker_range > 0.0:
            confidence = _quality_confidence(quality)
            if abs(confidence - self.threshold) <= self.tiebreaker_range:
                logger.debug(
                    "[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
                    self.name, confidence, self.threshold, self.tiebreaker_range,
                )
                # Same prompt again; a failed or unparsable second opinion is
                # simply ignored and the pass-2 scores stand.
                tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
                if tie_raw is not None:
                    tie_quality = _parse_quality_only(tie_raw, quality_dims)
                    if tie_quality is not None:
                        quality = _merge_quality(quality, tie_quality)

        # The floor is applied to the final (possibly averaged) scores.
        floor_reason = _quality_floor_check(quality)
        floor_failed = bool(floor_reason)
        return {
            "gates": parsed_gates,
            "quality": quality,
            "gate_passed": not floor_failed,
            "confidence": 0.0 if floor_failed else _quality_confidence(quality),
            "failed_gate_reason": floor_reason if floor_failed else None,
            "quality_floor_failed": floor_failed,
        }


def _ensure_claude_sdk():
    """Put the in-repo claude-code-batch-sdk sources on ``sys.path``.

    The SDK directory is looked up five directory levels above this file.
    It is prepended to ``sys.path`` only when it exists and is not already
    listed; otherwise nothing happens.
    """
    import sys

    base = Path(__file__).resolve()
    for _ in range(5):
        base = base.parent
    sdk_src = base / "@applications/@ml/@packages/@py/claude-code-batch-sdk/src"
    sdk_entry = str(sdk_src)
    if sdk_src.exists() and sdk_entry not in sys.path:
        sys.path.insert(0, sdk_entry)


def _load_pipeline_config() -> dict:
    """Load scoring pipeline stages from YAML.

    Reads ``prompts/scoring_pipeline.yaml`` next to this module and returns
    the parsed document.

    Raises:
        OSError: If the config file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    import yaml
    config_path = Path(__file__).parent / "prompts" / "scoring_pipeline.yaml"
    # Decode explicitly as UTF-8: without ``encoding`` the locale's codec is
    # used, which garbles non-ASCII text in the file on non-UTF-8 systems.
    with open(config_path, encoding="utf-8") as f:
        return yaml.safe_load(f)


class ScoringPipeline:
    """Multi-stage scoring pipeline: variants escalate through scorers.

    Each stage only sees variants that passed all previous stages.
    Rejections at any stage go back to generation.
    Only the deficit needed is escalated — no wasteful re-evaluation.

    Stages are configured in scoring_pipeline.yaml. Default:
        Qwen3-VL (free, 5s) → Haiku ($0.001, 5s) → Opus ($0.015, 30s) → User
    """

    def __init__(self, registry: SpriteRegistry, raw_dir: Path):
        """Build one Scorer per configured stage.

        Args:
            registry: Sprite registry; this class uses its ``get_sprite``,
                ``get_variants``, ``store_score`` and ``conn`` members.
            raw_dir: Stored on the instance; not read by the methods here.
        """
        self.registry = registry
        self.raw_dir = raw_dir
        config = _load_pipeline_config()
        self.target_approved = config.get("target_approved", MIN_GOOD_VARIANTS)
        self.stages: list[Scorer] = []
        for stage in config.get("stages", []):
            self.stages.append(Scorer(
                name=stage["name"],
                backend=stage["backend"],
                model=stage["model"],
                threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
                tiebreaker_range=stage.get("tiebreaker_range", 0.0),
                single_pass=stage.get("single_pass", False),
            ))

    async def score_variant_at_tier(
        self, variant: dict, sprite: dict, tier: int,
    ) -> dict | None:
        """Score a variant at a specific pipeline tier. Returns result or None.

        None is returned for a tier index outside the configured stages, or
        when the stage's scorer itself produces no result.
        """
        if tier < 0 or tier >= len(self.stages):
            return None
        return await self.stages[tier].score(variant.get("raw_path", ""), sprite)

    async def advance_sprite(self, sprite_id: str) -> dict:
        """Advance a sprite through the scoring pipeline.

        For each tier, find variants that passed the previous tier but haven't
        been evaluated at this tier yet. Score them. Track results. Because the
        tiers are processed in order, a variant promoted by one tier is picked
        up by the next tier within the same call.

        Returns {tier_results, ready_count, needs_regen, deficit}
        """
        sprite = self.registry.get_sprite(sprite_id)
        if not sprite:
            return {"tier_results": [], "ready_count": 0, "needs_regen": True, "deficit": self.target_approved}

        tier_results = []

        for tier_idx, scorer in enumerate(self.stages):
            # Find variants eligible for this tier:
            # - completed, has image
            # - review_tier == tier_idx (passed all previous tiers, not yet scored here)
            # - not rejected (rating != -1)
            variants = self.registry.conn.execute(
                "SELECT * FROM variants WHERE sprite_id=? AND job_status='completed' "
                "AND raw_path IS NOT NULL AND (rating IS NULL OR rating != -1) "
                "AND COALESCE(review_tier, 0) = ?",
                (sprite_id, tier_idx),
            ).fetchall()

            if not variants:
                tier_results.append({"tier": scorer.name, "scored": 0, "passed": 0, "failed": 0})
                continue

            # Provenance stamp; identical for every variant scored at this tier.
            scored_by = json.dumps({
                "model": scorer.model,
                "backend": scorer.backend,
                "stage": scorer.name,
                "tier": tier_idx,
                "scoring_version": "pipeline_v2",
            })

            passed = 0
            failed = 0
            for v in variants:
                result = await self.score_variant_at_tier(dict(v), dict(sprite), tier_idx)
                if result is None:
                    # No verdict: the row is left untouched, so it stays
                    # eligible for this tier on a later run.
                    continue

                gate_passed = result["gate_passed"]
                conf = result["confidence"]
                # Map 0-1 confidence onto a 1-5 rating; gate failures get 1.
                rating = max(1, min(5, round(conf * 5))) if gate_passed else 1
                # NOTE(review): pass/fail uses the stage's own threshold;
                # CATEGORY_THRESHOLDS is only applied in rank_and_filter.
                # Confirm that is intended.
                passed_tier = gate_passed and conf >= scorer.threshold

                # Store per-scorer scorecard (preserves all evaluations)
                self.registry.store_score(
                    variant_id=v["id"],
                    scorer_name=scorer.name,
                    scorer_model=scorer.model,
                    tier=tier_idx,
                    result=result,
                )

                if passed_tier:
                    # Advance to next tier — notes stores latest scorer's result
                    self.registry.conn.execute(
                        "UPDATE variants SET rating=?, notes=?, scored_by=?, scored_at=?, review_tier=? WHERE id=?",
                        (rating, json.dumps(result), scored_by, _now(), tier_idx + 1, v["id"]),
                    )
                    passed += 1
                else:
                    # Rejected at this tier
                    self.registry.conn.execute(
                        "UPDATE variants SET rating=-1, notes=?, scored_by=?, scored_at=?, review_tier=? WHERE id=?",
                        (json.dumps(result), scored_by, _now(), tier_idx, v["id"]),
                    )
                    failed += 1
                # Commit per variant so progress survives an interrupted run.
                self.registry.conn.commit()

            # NOTE(review): "scored" counts every eligible variant, including
            # ones skipped above because the scorer returned None.
            tier_results.append({"tier": scorer.name, "scored": len(variants), "passed": passed, "failed": failed})
            print(f"  [{scorer.name}] {passed}/{len(variants)} passed (tier {tier_idx})")

        # Count fully-approved variants (passed all tiers)
        final_tier = len(self.stages)
        ready = self.registry.conn.execute(
            "SELECT COUNT(*) FROM variants WHERE sprite_id=? AND COALESCE(review_tier, 0) >= ? AND rating != -1",
            (sprite_id, final_tier),
        ).fetchone()[0]

        deficit = max(0, self.target_approved - ready)
        return {
            "tier_results": tier_results,
            "ready_count": ready,
            "needs_regen": ready < self.target_approved,
            "deficit": deficit,
        }

    async def rank_and_filter(
        self,
        sprite_id: str,
        threshold: float | None = None,
        min_good: int | None = None,
    ) -> dict:
        """Score unscored variants through the pipeline, return status.

        Compatible with the old SpriteRanker.rank_and_filter() interface.

        Args:
            sprite_id: Sprite whose variants are scored and ranked.
            threshold: Minimum confidence for a variant to count as good;
                defaults to the sprite category's threshold.
            min_good: Accepted for interface compatibility; not used.
                ``needs_regen`` and ``deficit`` come from ``advance_sprite``,
                which works against ``target_approved``.

        Returns:
            {ranked, good_count, needs_regen, deficit, tier_results}, where
            ``ranked`` lists the non-rejected variants that have notes, best
            confidence first.
        """
        result = await self.advance_sprite(sprite_id)

        # Also build the ranked list for display
        sprite = self.registry.get_sprite(sprite_id)
        category = sprite["category"] if sprite else ""
        if threshold is None:
            threshold = CATEGORY_THRESHOLDS.get(category, CONFIDENCE_THRESHOLD)

        all_variants = self.registry.get_variants(sprite_id)
        ranked = []
        for v in all_variants:
            # Skip variants that were never scored or were rejected.
            if v["notes"] is None or v["rating"] == -1:
                continue
            conf, gate_passed = confidence_from_notes(v["notes"])
            ranked.append({
                "variant_id": v["id"],
                "seed": v["seed"],
                "raw_path": v["raw_path"],
                "scores": json.loads(v["notes"]),
                "confidence": round(conf, 3),
                "gate_passed": gate_passed,
                # Falls back to 0 when the row has no review_tier column.
                "review_tier": v["review_tier"] if "review_tier" in v.keys() else 0,
            })
        ranked.sort(key=lambda x: x["confidence"], reverse=True)

        good = [r for r in ranked if r["gate_passed"] and r["confidence"] >= threshold]
        return {
            "ranked": ranked,
            "good_count": len(good),
            "needs_regen": result["needs_regen"],
            "deficit": result["deficit"],
            "tier_results": result["tier_results"],
        }


# Backward-compatible alias
SpriteRanker = ScoringPipeline