1166 lines
53 KiB
Python
1166 lines
53 KiB
Python
"""AI-assisted sprite variant ranking via Claude vision.
|
|
|
|
Two-tier scoring system:
|
|
1. Boolean GATES — binary pass/fail checks (facing, framing, background, etc.)
|
|
Any single gate failure = instant reject (confidence 0).
|
|
2. Quality RANGES — 0-100 gradients for genuine quality dimensions.
|
|
Only scored when all gates pass.
|
|
|
|
Uses claude-code-batch-sdk to send each variant to the scorer model.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from engine.registry import SpriteRegistry
|
|
|
|
|
|
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (offset ``+00:00``)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): presumably the default minimum confidence (0.0-1.0) for a
# variant to count as "good"; the consumer is outside this chunk — confirm.
CONFIDENCE_THRESHOLD = 0.70
# NOTE(review): presumably the number of good variants wanted per entity —
# confirm against the caller.
MIN_GOOD_VARIANTS = 3
QUALITY_DIM_FLOOR = 45  # Any single quality dimension below this = auto-reject

# Concurrency limits per backend type
CONCURRENCY: dict[str, int] = {
    "model-boss": 4,
    "claude": 8,
}

# Per-category thresholds keyed by category name.
# NOTE(review): these look like overrides of CONFIDENCE_THRESHOLD for the
# listed categories — verify where they are read.
CATEGORY_THRESHOLDS: dict[str, float] = {
    "resources": 0.55,
    "improvements": 0.55,
    "ui": 0.55,
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Boolean gates — binary pass/fail, no partial credit
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Names of the boolean gates evaluated for the "units" category.
# Every name here has a matching entry in _GATE_DESCRIPTIONS. The tuple order
# is significant: _build_gate_instructions numbers gates in iteration order.
UNIT_GATES: tuple[str, ...] = (
    "facing_southwest",
    "single_character",
    "no_text_watermark",
    "no_base_or_ground",
    "full_body_visible",
    "correct_subject_type",
    "is_fantasy_dressed",
    # Listed for every unit; _contextualize_descriptions rewrites its text to
    # "answer TRUE" when the entity id names a non-dwarf race.
    "dwarf_proportions",
    "not_photorealistic",
    "no_anime_style",
    "no_pixel_art",
    "no_multiple_poses",
    "no_chroma_bleed",
    "correct_camera_elevation",
    "clean_background",
)
|
|
|
|
# Instruction text shown to the scorer for each unit gate, keyed by gate name
# (one entry per name in UNIT_GATES). The strings are prompt content: editing
# any wording changes what the scorer model is asked.
#
# _contextualize_descriptions replaces some of these per entity:
#   - "facing_southwest" is always replaced by a stricter rear-view wording,
#   - "dwarf_proportions" is rewritten depending on the parsed race,
#   - "correct_subject_type" is rewritten depending on the parsed unit class.
_GATE_DESCRIPTIONS: dict[str, str] = {
    # Base wording for the direction check.
    "facing_southwest": (
        "VERY STRICT directional check for isometric game sprites. The character must be "
        "oriented EXACTLY toward the BOTTOM-LEFT corner of the image — southwest on a compass. "
        "In Warcraft III / Civ5 isometric style, this means: their LEFT shoulder is closer to "
        "you, their RIGHT shoulder is farther away, their feet point toward the lower-left, "
        "and you see mostly their back with a slight angle toward the left side. "
        "Answer FALSE if: (1) you can see the character's face or front torso AT ALL, "
        "(2) the character is in pure side PROFILE — body facing purely left with no depth, "
        "(3) the character faces directly SOUTH (straight down) with no leftward lean, "
        "(4) the character faces SOUTHEAST (lower-right) instead of southwest (lower-left), "
        "(5) the character's feet/body appear to point straight down or to any direction "
        "other than the lower-left corner. "
        "Answer TRUE ONLY if the body is unambiguously angled toward the LOWER-LEFT — "
        "not south, not southeast, not profile-left. Bottom-left corner, like walking "
        "toward 7-8 o'clock on a clock face."
    ),
    "single_character": (
        "Is there exactly ONE character in the image? Answer false if there are multiple "
        "characters, a turnaround/reference sheet, or a collage layout."
    ),
    # A shorter wording of the same gate name lives in _NON_UNIT_GATE_DESCRIPTIONS.
    "no_text_watermark": (
        "Is the image free of text, watermarks, logos, or UI elements? "
        "Answer false if any text or watermark is visible."
    ),
    # Shadows are explicitly NOT a failure here; they are scored separately by
    # the "shadow_acceptability" quality dimension.
    "no_base_or_ground": (
        "Is there NO solid physical surface beneath the character? "
        "Answer TRUE if the character appears to float in empty space against a plain background. "
        "Answer FALSE ONLY if you can see an actual PHYSICAL SURFACE directly under them: "
        "a stone pedestal, grass patch, floor tile, wooden plank, dirt ground, or any opaque "
        "solid surface with texture/color of its own. "
        "CRITICAL: A SHADOW cast on the background does NOT count as a ground surface — "
        "shadows are lighting artifacts, not physical objects. A character floating above their "
        "own shadow is still floating. Answer TRUE unless you see a clearly solid surface "
        "with its own distinct color/texture (not just a darkening of the background)."
    ),
    "full_body_visible": (
        "Can you see the character from head to feet, nothing cropped? "
        "Answer false if head or feet are cut off by the frame edge."
    ),
    # Generic wording; unit-class-specific wording is substituted by
    # _contextualize_descriptions when a unit class can be parsed.
    "correct_subject_type": (
        "Is this the correct type of unit? An archer MUST have a bow or crossbow visible. "
        "A spearman MUST have a spear. Cavalry MUST be mounted on a horse. "
        "Answer FALSE if the signature weapon or equipment is completely absent — "
        "a plain walking figure with no visible weapons, armor, or role-identifying equipment fails."
    ),
    "is_fantasy_dressed": (
        "Is this character dressed in MEDIEVAL or FANTASY attire from a pre-industrial world? "
        "Acceptable: plate armor, chainmail, leather armor with metal studs, fantasy robes, "
        "tabards over mail, fur cloaks, rough-spun tunics, medieval blacksmith aprons. "
        "Answer FALSE if wearing MODERN or CONTEMPORARY clothing or gear: t-shirts, hoodies, "
        "jeans, sneakers, tracksuits, hard hats, hi-vis vests, safety goggles, cargo pants, "
        "modern tool belts with nylon pouches, utility belts with snap closures, "
        "modern leather holsters, construction worker gear, rubber boots, backpacks with zippers. "
        "Also FALSE for Fortnite/Team Fortress/Overwatch style characters — these look like "
        "modern cartoon characters in costumes, NOT medieval fantasy inhabitants. "
        "Also FALSE for featureless 3D mannequins with no clothing detail. "
        "The character MUST look like a medieval craftsman, warrior, or peasant — NOT a modern "
        "person wearing a costume or a video game character from a contemporary setting."
    ),
    # Base wording assumes a dwarf; see _contextualize_descriptions for the
    # per-race replacement.
    "dwarf_proportions": (
        "Does this character have DWARF proportions? Dwarves are SHORT and STOCKY — roughly "
        "half human height with a barrel chest, thick limbs, wide stance, and large head relative "
        "to body. Answer FALSE if the character has normal human proportions (tall and lean), "
        "is clearly human-height, or looks like a regular person. "
        "Answer TRUE if the character is visibly short, wide, and stocky like a fantasy dwarf."
    ),
    "not_photorealistic": (
        "Is this game art — either painted/illustrated OR stylized 3D — NOT photorealistic? "
        "ACCEPTABLE styles: "
        "(1) Hand-painted / illustrated fantasy art (Magic: The Gathering, Warcraft III concept art), "
        "(2) Stylized 3D game art — Clash Royale, mobile strategy, cartoon 3D, toy-like renders — "
        "where surfaces are clean and simplified, NOT trying to look like real life. "
        "Answer FALSE ONLY for TRUE photorealism: studio-quality photography realism with visible "
        "skin pores, photographic depth of field, hair strands individually rendered, material "
        "textures indistinguishable from a real photograph, or a CGI render trying to look like "
        "a real human being (VFX/film quality). "
        "Clash Royale style, Fortnite style, Pixar style, and mobile strategy game 3D ALL pass. "
        "Only fail if it looks like a movie VFX render or professional CGI photo-double."
    ),
    "no_anime_style": (
        "Is the art style NOT anime/manga? Western fantasy game art is acceptable. "
        "Answer false if you see distinctly Japanese anime aesthetics: very large anime eyes, "
        "exaggerated manga face proportions, or cel-shaded flat-color anime style."
    ),
    "no_pixel_art": (
        "Is this NOT pixel art or retro-style low-resolution art? "
        "Answer false if you see a visible pixel grid or retro game aesthetic."
    ),
    "no_multiple_poses": (
        "Is there exactly ONE pose/view? Answer false if there are multiple views, "
        "a T-pose sheet, or front/side/back layout."
    ),
    "no_chroma_bleed": (
        "Is the character FREE of bright yellow or lime-green color contamination on their "
        "clothing or armor? When sprites are generated on a lime green background, the green "
        "can BLEED into adjacent colors, creating bright YELLOW artifacts — yellow hoods, "
        "yellow vests, yellow armor plates, or neon green patches on clothing that should be "
        "brown, grey, red, or metallic. "
        "Answer FALSE if you see any suspiciously bright YELLOW or NEON GREEN patches on the "
        "character's clothing/armor that look like color contamination from the background. "
        "Answer TRUE if all clothing colors look natural and intentional."
    ),
    "correct_camera_elevation": (
        "STRICT overhead angle check for isometric game sprites. The camera must be "
        "CLEARLY elevated above the character — you see the TOP of their head/helmet "
        "prominently, as if looking DOWN at them from a 45-60 degree angle. "
        "In correct Warcraft III / Civ5 style: the helmet/head top is large and clearly "
        "visible, the character appears short because you see them from above, you can "
        "see their shoulders from a top-down perspective, and the ground under them "
        "would be visible if they weren't floating. "
        "Answer FALSE if: (1) the camera appears to be at EYE LEVEL — you see the "
        "character head-on as if standing in front of them at the same height, "
        "(2) you see more of the CHARACTER'S SIDE than their top — a side view means "
        "eye level, NOT overhead, (3) the head appears the same size or smaller than "
        "in a normal standing portrait — it should look squashed/compressed from above, "
        "(4) the horizon is visible or implied behind the character. "
        "Answer TRUE ONLY if there is unambiguous overhead compression — the character "
        "looks viewed from ABOVE, not from the side."
    ),
    "clean_background": (
        "Is the background plain and simple — a flat color, gradient, or neutral surface — "
        "with NO complex scene elements? "
        "Answer TRUE for: solid color backgrounds, simple gradients, clean studio-style backdrops "
        "(any color: white, green, grey, blue, etc). "
        "Answer FALSE if the background contains: landscape elements (sky, trees, ground, "
        "buildings), other characters, furniture, objects, patterns, textures, or any visual "
        "content that would appear behind the character in a game scene. "
        "The background color itself does not matter — only whether it is plain vs. complex."
    ),
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Quality ranges — 0-100, scored only if all gates pass
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Quality dimensions (0-100 each) returned for the "units" category by
# _get_category_config. Descriptions live in _QUALITY_DESCRIPTIONS.
UNIT_QUALITY_DIMS: tuple[str, ...] = (
    "direction_quality",
    "art_style",
    "equipment_detail",
    "background_cleanliness",
    "shadow_acceptability",
)

# These are scored for informational display in the review UI but NOT included in
# confidence calculation — rear-view sprites hide facial/body features that are needed
# for reliable race/gender assessment.
# They are deliberately kept out of UNIT_QUALITY_DIMS; their descriptions are
# still present in _QUALITY_DESCRIPTIONS.
UNIT_DISPLAY_DIMS: tuple[str, ...] = (
    "race_accuracy",
    "gender_accuracy",
)
|
|
|
|
# Scoring rubric text for unit quality dimensions, keyed by dimension name.
# Covers both UNIT_QUALITY_DIMS and UNIT_DISPLAY_DIMS. Each rubric anchors the
# 0-100 scale with example scores (90+/70/50). The strings are prompt content.
# "race_accuracy" and "gender_accuracy" are replaced by
# _contextualize_descriptions when the entity id names a race / gender.
_QUALITY_DESCRIPTIONS: dict[str, str] = {
    "direction_quality": "How cleanly southwest is the character oriented? 90+ = textbook southwest angle (225 degrees). 70 = clearly left-ish lean. 50 = ambiguous direction.",
    "race_accuracy": "How well do proportions match the race? For dwarves: 90+ = unmistakably short and stocky. 70 = short but could be a human child. 50 = human proportions.",
    "gender_accuracy": "How clear are the gender cues? Male dwarves should have thick braided beards. Female dwarves should have NO beard, braided hair, sturdy feminine build. 90+ = unambiguous. 70 = mostly clear. 50 = ambiguous.",
    "art_style": "How well does it match fantasy game art? Acceptable: painted illustration, stylized 3D (Clash Royale, mobile strategy, Warcraft III 3D). 90+ = strong game-art aesthetic. 70 = serviceable stylized look. 50 = borderline (too photorealistic or too generic).",
    "equipment_detail": "How sharp and readable are weapons/armor? Would it read at 64x64 pixels? 90+ = iconic silhouette, sharp detail. 70 = recognizable gear. 50 = muddy/unclear details.",
    "background_cleanliness": "How clean and plain is the background? 90+ = perfectly flat color or simple gradient with no artifacts. 70 = slight variation but no distracting elements. 50 = some background noise or color bleed from character.",
    # The only rubric with an anchor below 50 (20 = heavy shadow).
    "shadow_acceptability": "How acceptable is any shadow on the green background for a game sprite? 90+ = no shadow or only extremely faint shadow that won't affect chroma keying. 70 = mild shadow that can be handled by tuning chroma key threshold. 50 = prominent shadow that will leave dark artifacts. 20 = heavy black shadow covering large area.",
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Non-unit category gates and quality dims
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Gate and quality-dimension names for the non-unit categories.
# _get_category_config pairs each *_GATES / *_QUALITY tuple with
# _NON_UNIT_GATE_DESCRIPTIONS / _NON_UNIT_QUALITY_DESCRIPTIONS.
# "no_text_watermark" is the one gate shared by every category.

# Used for categories "terrain" and "biome_grid".
TERRAIN_GATES: tuple[str, ...] = (
    "top_down_view", "seamless_tileable", "no_text_watermark",
    "no_horizon_or_sky", "no_distinct_objects",
)
TERRAIN_QUALITY: tuple[str, ...] = ("texture_quality", "color_richness", "tile_edge_blending")

# Used for category "buildings".
BUILDING_GATES: tuple[str, ...] = (
    "isometric_view", "single_building", "no_text_watermark",
    "roof_visible", "no_front_facade",
)
BUILDING_QUALITY: tuple[str, ...] = ("architectural_detail", "style_consistency", "readability_at_small_size")

# Used for categories "resources" and "improvements".
RESOURCE_GATES: tuple[str, ...] = (
    "icon_not_texture", "single_feature", "green_background",
    "no_text_watermark", "recognizable_as_named",
)
RESOURCE_QUALITY: tuple[str, ...] = ("icon_clarity", "style_match", "readability_at_64px")

# Used for category "spells".
SPELL_GATES: tuple[str, ...] = (
    "magical_effect_visible", "no_text_watermark", "dark_background",
)
SPELL_QUALITY: tuple[str, ...] = ("drama_impact", "magic_type_clarity", "color_vibrancy")
|
|
|
|
# Gate instruction text for every non-unit category, in one flat dict keyed by
# gate name. Group comments show which gate tuple uses each key. The strings
# are prompt content.
_NON_UNIT_GATE_DESCRIPTIONS: dict[str, str] = {
    # --- TERRAIN_GATES ---
    "top_down_view": "Is the view perfectly top-down, looking straight down at ground like a satellite photo? Answer false if there's any horizon, sky, or perspective vanishing point.",
    "seamless_tileable": "Does this look like a seamless, tileable ground texture? Answer false if it has distinct borders, framing, or non-repeating composition.",
    # Shared by TERRAIN_GATES, BUILDING_GATES, RESOURCE_GATES and SPELL_GATES.
    "no_text_watermark": "Is the image free of text, watermarks, logos, or UI elements?",
    "no_horizon_or_sky": "Is the image free of any horizon line or sky? Answer false if you see sky, clouds, or a horizon.",
    "no_distinct_objects": "Is this a pure ground texture without distinct objects like buildings, characters, or items?",
    # --- BUILDING_GATES ---
    "isometric_view": "Is this a 3/4 isometric view from above, showing roof and walls? Answer false for front-facing facades or eye-level views.",
    "single_building": "Is there exactly ONE building, centered? Answer false for multiple buildings, cityscapes, or villages.",
    "roof_visible": "Can you see the building's roof from above? Answer false if it's a front elevation without visible roof.",
    "no_front_facade": "Is this NOT a straight-on front view of a building face? Answer false for architectural elevation drawings.",
    # --- RESOURCE_GATES ---
    "icon_not_texture": "Is this a small isolated icon/object, NOT a full-frame seamless texture? Answer false if it fills the entire frame as ground texture.",
    "single_feature": "Is there exactly ONE distinct feature/object? Answer false for multiple items or cluttered scenes.",
    "green_background": "Is the background predominantly green (chroma key)?",
    "recognizable_as_named": "Is the depicted object clearly recognizable as the named resource/improvement?",
    # --- SPELL_GATES ---
    "magical_effect_visible": "Is there a visible magical energy effect? Answer false if it's just a dark void with no magic visible.",
    "dark_background": "Is the background dark/black? Answer false for bright or colorful backgrounds.",
}
|
|
|
|
# Scoring rubric text for the non-unit quality dimensions, in one flat dict
# keyed by dimension name. Each rubric anchors the 0-100 scale at 90+ and 50.
# Group comments show which *_QUALITY tuple uses each key. The strings are
# prompt content.
_NON_UNIT_QUALITY_DESCRIPTIONS: dict[str, str] = {
    # --- TERRAIN_QUALITY ---
    "texture_quality": "How detailed and rich is the ground texture? 90+ = beautiful painterly texture. 50 = bland flat color.",
    "color_richness": "How vibrant and appropriate are the colors? 90+ = rich natural palette. 50 = washed out or wrong hue.",
    "tile_edge_blending": "How well would this tile with copies of itself? 90+ = perfectly seamless. 50 = obvious repeat edges.",
    # --- BUILDING_QUALITY ---
    "architectural_detail": "How detailed and well-crafted is the building? 90+ = impressive stonework and ornamentation. 50 = generic box shape.",
    "style_consistency": "Does the art style match fantasy game buildings? 90+ = perfect Civ5/AoE style. 50 = inconsistent or wrong style.",
    "readability_at_small_size": "Would this read clearly at its target display size? 90+ = bold clear silhouette. 50 = muddy at small size.",
    # --- RESOURCE_QUALITY ---
    "icon_clarity": "How clear and readable is the icon? 90+ = instantly recognizable. 50 = vague blob.",
    "style_match": "Does it match the game's art direction? 90+ = perfect match. 50 = out of place.",
    "readability_at_64px": "Would this be readable at 64x64 pixels? 90+ = crisp and clear. 50 = loses all detail.",
    # --- SPELL_QUALITY ---
    "drama_impact": "How dramatic and impactful is the spell effect? 90+ = awe-inspiring. 50 = weak and unimpressive.",
    "magic_type_clarity": "Can you tell what type of magic this is? 90+ = instantly clear (fire, ice, death, etc). 50 = generic energy blob.",
    "color_vibrancy": "How vivid and magical are the colors? 90+ = stunning magical palette. 50 = dull and flat.",
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Category → gates/quality mapping
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _get_category_config(category: str) -> tuple[tuple[str, ...], tuple[str, ...], dict[str, str], dict[str, str]]:
    """Look up the scoring configuration for a sprite category.

    Returns a 4-tuple ``(gates, quality_dims, gate_descriptions,
    quality_descriptions)``. "terrain" and "biome_grid" share one
    configuration, as do "resources" and "improvements". Any category that is
    not listed gets four empty containers (no gates, no quality dims).
    """
    # Every non-unit category reads its texts from the same two dicts.
    shared_texts = (_NON_UNIT_GATE_DESCRIPTIONS, _NON_UNIT_QUALITY_DESCRIPTIONS)
    terrain_cfg = (TERRAIN_GATES, TERRAIN_QUALITY, *shared_texts)
    resource_cfg = (RESOURCE_GATES, RESOURCE_QUALITY, *shared_texts)

    by_category = {
        "units": (UNIT_GATES, UNIT_QUALITY_DIMS, _GATE_DESCRIPTIONS, _QUALITY_DESCRIPTIONS),
        "terrain": terrain_cfg,
        "biome_grid": terrain_cfg,
        "buildings": (BUILDING_GATES, BUILDING_QUALITY, *shared_texts),
        "resources": resource_cfg,
        "improvements": resource_cfg,
        "spells": (SPELL_GATES, SPELL_QUALITY, *shared_texts),
    }
    # Unknown category: nothing to gate on and nothing to score.
    return by_category.get(category, ((), (), {}, {}))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Prompt templates
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# System prompt for the scorer: explains the two-part (gates, then quality)
# contract and shows one failing and one passing JSON reply.
# The gate names used as examples in part 1 must be real gates. The previous
# wording cited "no_shadow", which is not in any gate tuple (shadows are scored
# by the "shadow_acceptability" quality dimension and explicitly tolerated by
# "no_base_or_ground"). "no_text_watermark" is used instead because every
# category defines it.
# The text contains literal JSON braces, so it is not safe to pass through
# str.format.
SYSTEM_PROMPT = """\
You are a QA inspector for a commercial fantasy 4X strategy game. You evaluate AI-generated sprites with strict binary checks and quality scores.

Your job has TWO parts:
1. GATES: Answer each boolean gate as true or false. These are BINARY — not "kind of" or "mostly". If any text or watermark is visible, no_text_watermark is false. If the character faces right, facing_southwest is false. No hedging.
2. QUALITY: Score quality dimensions 0-100. Only answer these if ALL gates pass.

If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.

Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block.

Example FAIL (facing_southwest gate):
{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}

Example PASS (all gates true, quality scored):
{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""
|
|
|
|
# User-prompt templates. Each contains {name} placeholders only (no literal
# braces). NOTE(review): presumably filled with str.format by code outside this
# chunk — confirm.

# Single-request variant: gates and quality in one reply.
# Placeholders: filename, category, entity_id, prompt, gate_instructions,
# quality_instructions, combined_template.
COMBINED_PROMPT_TEMPLATE = """Look at the image file {filename} in this directory.

This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt: {prompt}

## Gates (true or false each)

{gate_instructions}

## Quality (0-100, only if ALL gates are true)

{quality_instructions}

Respond with this exact JSON:
{combined_template}"""

# Gate-only request.
# Placeholders: filename, category, entity_id, prompt, gate_instructions,
# gate_template.
GATE_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt: {prompt}

Evaluate each boolean gate (true or false):

{gate_instructions}

Respond with this exact JSON:
{gate_template}"""

# Quality-only request; its wording states that all gates already passed.
# Placeholders: filename, entity_id, quality_instructions, quality_template.
QUALITY_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:

{quality_instructions}

Respond with this exact JSON:
{quality_template}"""
|
|
|
|
|
|
def _extract_json(text: str) -> dict | None:
|
|
"""Extract JSON object from VLM response text.
|
|
|
|
Handles markdown code blocks, leading/trailing text, raw JSON, and
|
|
Qwen3 <think>...</think> reasoning blocks.
|
|
"""
|
|
import re
|
|
# Strip Qwen3 thinking blocks before any parse attempt
|
|
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
|
|
# Try direct parse
|
|
try:
|
|
return json.loads(text)
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
# Try extracting from markdown code block
|
|
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
if m:
|
|
try:
|
|
return json.loads(m.group(1))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
# Try finding outermost { ... }
|
|
m = re.search(r"\{.*\}", text, re.DOTALL)
|
|
if m:
|
|
try:
|
|
return json.loads(m.group())
|
|
except json.JSONDecodeError:
|
|
pass
|
|
return None
|
|
|
|
|
|
def _parse_entity_context(entity_id: str) -> dict[str, str]:
    """Split an entity id such as 'spearmen_dwarves_f' into its parts.

    The result always has "unit_class" (whatever is left after removing the
    recognised parts). "gender" is present only for a trailing ``_f`` / ``_m``,
    and "race" only when one of the known race names appears after an
    underscore.
    """
    context: dict[str, str] = {}
    remainder = entity_id

    # Trailing gender marker, checked in the order _f then _m.
    for suffix, label in (("_f", "female"), ("_m", "male")):
        if remainder.endswith(suffix):
            context["gender"] = label
            remainder = remainder[: -len(suffix)]
            break

    # First known race (in this fixed order) that occurs as "_<race>".
    known_races = ("dwarves", "humans", "high_elves", "orcs")
    matched = next((name for name in known_races if f"_{name}" in remainder), None)
    if matched is not None:
        context["race"] = matched
        remainder = remainder.replace(f"_{matched}", "")

    context["unit_class"] = remainder
    return context
|
|
|
|
|
|
def _contextualize_descriptions(
    descs: dict[str, str],
    entity_id: str,
) -> dict[str, str]:
    """Return a copy of ``descs`` with entity-specific wording substituted.

    The entity id is parsed for gender, race and unit class, and the entries
    below are rewritten so the scorer knows WHICH gender, race and unit type
    it should evaluate against. Without this, the scorer sees "is gender
    correct?" but doesn't know which gender is expected, and defaults to
    "looks fine".

    Only keys already present in ``descs`` are touched, so the function works
    for gate and quality description dicts alike:

    * ``gender_accuracy`` (quality dim) — when a gender was parsed
    * ``race_accuracy`` (quality dim) — when a known race was parsed
    * ``facing_southwest`` (gate) — always, with a stricter rear-view wording
    * ``dwarf_proportions`` (gate) — when a race was parsed
    * ``correct_subject_type`` (gate) — when a unit class was parsed

    Args:
        descs: Mapping of gate/dimension name to description; not modified.
        entity_id: Entity identifier such as ``"spearmen_dwarves_f"``.

    Returns:
        A new dict with the same keys as ``descs``.
    """
    ctx = _parse_entity_context(entity_id)
    gender = ctx.get("gender", "")
    race = ctx.get("race", "")
    unit_class = ctx.get("unit_class", "")

    # Human-readable labels, built once so every message formats them alike
    # (e.g. "HIGH ELVES", never "HIGH_ELVES").
    race_label = race.upper().replace("_", " ")
    unit_label = unit_class.upper().replace("_", " ")

    result = dict(descs)

    # Contextualize the gender_accuracy quality dimension.
    # NOTE(review): the dwarf wordings say "answer false" although this key is
    # scored 0-100 — left verbatim, confirm whether that phrasing is intended.
    if "gender_accuracy" in result and gender:
        # For the generic wording, fall back to "character" when no race was
        # parsed instead of emitting "a FEMALE ." with an empty noun.
        subject = race.replace("_", " ") if race else "character"
        if gender == "female" and race == "dwarves":
            result["gender_accuracy"] = (
                "This MUST be a FEMALE dwarf. Female dwarves have NO beard, braided hair, "
                "and sturdy feminine build. If the character has a BEARD, answer false. "
                "90+ = clearly female with no beard. 50 = ambiguous. 0 = male with beard."
            )
        elif gender == "male" and race == "dwarves":
            result["gender_accuracy"] = (
                "This MUST be a MALE dwarf. Male dwarves have thick braided beards and "
                "burly masculine build. If the character has NO beard, answer false. "
                "90+ = clearly male with prominent beard. 50 = ambiguous."
            )
        elif gender == "female":
            result["gender_accuracy"] = (
                f"This MUST be a FEMALE {subject}. The character should have clearly feminine "
                "features and build. 90+ = clearly female. 50 = ambiguous."
            )
        elif gender == "male":
            result["gender_accuracy"] = (
                f"This MUST be a MALE {subject}. The character should have clearly masculine "
                "features and build. 90+ = clearly male. 50 = ambiguous."
            )

    # Contextualize the race_accuracy quality dimension.
    if "race_accuracy" in result and race:
        race_traits = {
            "dwarves": "SHORT and STOCKY — half human height, barrel chest, thick limbs, wide stance. NOT tall, NOT slender.",
            "high_elves": "TALL and SLENDER — pointed ears, pale skin, graceful build. NOT short, NOT stocky.",
            "humans": "MEDIUM build — average proportions, no extreme features.",
            "orcs": "TALL and MUSCULAR — dark olive skin, prominent tusks, fierce expression.",
        }
        if race in race_traits:
            result["race_accuracy"] = (
                f"This must be a {race_label}. "
                f"{race_traits[race]} "
                "90+ = unmistakable. 70 = mostly right. 50 = wrong proportions."
            )

    # Contextualize facing direction gate — stricter than base description
    if "facing_southwest" in result:
        result["facing_southwest"] = (
            "STRICT rear-view check. You must see the character's BACK — spine, shoulder blades, "
            "and the BACK of their head/helmet. The character walks AWAY toward BOTTOM-LEFT. "
            "FAIL (false) if: (1) you see the character's FACE or chest — they face camera, "
            "(2) you see a SIDE PROFILE — one arm in front, one behind, side of face/torso visible — "
            "a side view is NOT a rear view even if the character faces left, "
            "(3) character faces right or upward. "
            "PASS (true) ONLY if the character's back clearly faces you AND body angles toward "
            "bottom-left corner."
        )

    # Contextualize dwarf_proportions — only enforce for dwarf units
    if "dwarf_proportions" in result and race:
        if race == "dwarves":
            result["dwarf_proportions"] = (
                "This MUST be a DWARF — SHORT and STOCKY, roughly half human height, barrel chest, "
                "thick limbs, wide stance, large head relative to body. "
                "Answer FALSE if the character has normal human proportions (tall, lean, "
                "standard limb-to-torso ratio). A tall warrior is NOT a dwarf."
            )
        else:
            # Non-dwarf races: the gate stays in the list but the scorer is
            # told to always pass it.
            result["dwarf_proportions"] = (
                f"This is a {race_label} unit, not a dwarf. Answer TRUE — dwarf proportion "
                "check does not apply to this race."
            )

    # Contextualize subject type with specific unit class
    if "correct_subject_type" in result and unit_class:
        # NOTE: Characters face AWAY from camera (rear view, walking southwest).
        # Ranged units (archers, crossbowmen, longbowmen) — bow is typically hidden behind body
        # in rear view. Only check for equipment that's naturally visible from behind.
        ranged_units = {"archers", "crossbowmen", "longbowmen", "bowmen"}
        cavalry_units = {"cavalry", "heavy_cavalry"}
        siege_units = {"catapult", "cannon", "ballista"}

        if unit_class in ranged_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND. "
                "From a rear view, the bow is typically hidden behind the body — this is ACCEPTABLE. "
                "Answer TRUE as long as this is a single humanoid figure (dwarf, human, etc.) in fantasy armor walking away. "
                "Answer FALSE only if this is clearly the wrong subject (a horse, a building, multiple figures, or a non-character)."
            )
        elif unit_class in cavalry_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND. "
                "The character MUST be MOUNTED on a visible horse/warhorse. "
                "Answer TRUE if there is a rider clearly seated on a horse. "
                "Answer FALSE if no horse is visible or the character is on foot."
            )
        elif unit_class in siege_units:
            result["correct_subject_type"] = (
                f"This is a {unit_label} siege engine. "
                "There MUST be a siege machine visible (catapult frame, cannon barrel, etc.). "
                "Answer FALSE if this shows only characters with no visible siege machine."
            )
        else:
            # Melee / other classes: name the signature equipment when known,
            # otherwise fall back to a generic expectation.
            weapon_map = {
                "spearmen": "a SPEAR (long pole) visible — carried at side, strapped to back, or in hand. A round SHIELD on back or arm.",
                "swordsmen": "a SWORD or SHIELD visible — sheathed at hip, on arm, or in hand from the rear.",
                "axemen": "a battle AXE visible — strapped to back or in hand.",
                "pikemen": "a very long PIKE (far longer than character height) visible, held upright.",
                "mithril_guard": "a gleaming HALBERD or polearm visible, elite armor.",
                "berserkers": "dual AXEs visible or clearly bare-chested berserk warrior.",
            }
            expected = weapon_map.get(unit_class, f"equipment appropriate for a {unit_class}")
            result["correct_subject_type"] = (
                f"This is a {unit_label} shown from BEHIND (rear view). "
                f"Verify: {expected}. "
                "Answer TRUE if the unit type is identifiable. "
                "Answer FALSE only if clearly the wrong type (no weapon visible when it should be prominent)."
            )

    return result
|
|
|
|
|
|
def _build_gate_instructions(gates: tuple[str, ...], descs: dict[str, str]) -> str:
    """Render the gates as a numbered list, one ``N. **name** (true/false): text`` per line.

    Numbering starts at 1 and follows the order of ``gates``. A gate with no
    entry in ``descs`` is listed with an empty description.
    """
    return "\n".join(
        f"{position}. **{gate}** (true/false): {descs.get(gate, '')}"
        for position, gate in enumerate(gates, start=1)
    )
|
|
|
|
|
|
def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) -> str:
    """Render the quality dimensions as a numbered list, one ``N. **name** (0-100): text`` per line.

    Numbering starts at 1 and follows the order of ``dims``. A dimension with
    no entry in ``descs`` is listed with an empty description.
    """
    return "\n".join(
        f"{position}. **{dim}** (0-100): {descs.get(dim, '')}"
        for position, dim in enumerate(dims, start=1)
    )
|
|
|
|
|
|
def _build_gate_only_template(gates: tuple[str, ...]) -> str:
    """Build the example JSON reply for a gate-only request.

    Every gate is shown as ``true`` and ``failed_gate_reason`` carries a
    placeholder sentence; the result is pretty-printed with a 2-space indent.
    """
    skeleton = {
        "gates": dict.fromkeys(gates, True),
        "failed_gate_reason": "null or string explaining which gate failed and why",
    }
    return json.dumps(skeleton, indent=2)
|
|
|
|
|
|
def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
    """Build the example JSON reply for a quality-only request (every score shown as 0)."""
    skeleton = {"quality": dict.fromkeys(quality_dims, 0)}
    return json.dumps(skeleton, indent=2)
|
|
|
|
|
|
def _parse_gates_only(
    raw: str,
    gates: tuple[str, ...],
) -> tuple[bool, dict[str, bool], str | None] | None:
    """Parse a gate-only response.

    Args:
        raw: Raw reply text from the scorer.
        gates: Gate names that were asked about.

    Returns:
        ``(gate_passed, gates_dict, failed_reason)``, or None when the reply
        holds no JSON object or its ``"gates"`` value is not an object.
        ``gates_dict`` has one entry per requested gate. ``failed_reason`` is
        only set when a gate failed and the reply gave a real reason.
    """
    data = _extract_json(raw)
    # Guard on the type, not just None: a reply such as "[]" or "42" is valid
    # JSON but has no .get().
    if not isinstance(data, dict):
        return None
    gates_data = data.get("gates")
    if not isinstance(gates_data, dict):
        return None

    # Booleans and numbers count by truthiness. Anything else (missing key,
    # null, strings) fails closed so an unanswered gate is never a pass.
    parsed: dict[str, bool] = {}
    for g in gates:
        val = gates_data.get(g)
        parsed[g] = bool(val) if isinstance(val, (bool, int, float)) else False

    gate_passed = all(parsed.values())

    failed_reason: str | None = None
    if not gate_passed:
        reason = data.get("failed_gate_reason")
        if isinstance(reason, str):
            # The template's placeholder can come back as the text "null".
            if reason.strip().lower() not in ("null", "none", ""):
                failed_reason = reason
        elif reason is not None:
            # Keep the information but honour the declared str | None type.
            failed_reason = str(reason)
    return (gate_passed, parsed, failed_reason)
|
|
|
|
|
|
def _parse_quality_only(
    raw: str,
    quality_dims: tuple[str, ...],
) -> dict[str, int] | None:
    """Parse a quality-only response.

    Args:
        raw: Raw reply text from the scorer.
        quality_dims: Dimension names that were asked about.

    Returns:
        ``{dim: score}`` with one entry per requested dimension, each clamped
        to 0-100, or None when the reply holds no JSON object or its
        ``"quality"`` value is not an object. A dimension that is missing,
        non-numeric, or not a finite number scores 0.
    """
    data = _extract_json(raw)
    # Guard on the type, not just None: non-object JSON has no .get().
    if not isinstance(data, dict):
        return None
    quality_data = data.get("quality", {})
    if not isinstance(quality_data, dict):
        return None

    result: dict[str, int] = {}
    for d in quality_dims:
        val = quality_data.get(d)
        score = 0
        if isinstance(val, (int, float)):
            try:
                score = int(val)
            except (ValueError, OverflowError):
                # json.loads accepts NaN / Infinity, which int() rejects.
                score = 0
        result[d] = max(0, min(100, score))
    return result
|
|
|
|
|
|
def _quality_confidence(quality: dict[str, int]) -> float:
    """Return the mean quality score scaled to 0-1 and rounded to 3 decimals.

    An empty mapping yields 0.0.
    """
    if not quality:
        return 0.0
    scores = list(quality.values())
    best_possible = len(scores) * 100
    return round(sum(scores) / best_possible, 3)
|
|
|
|
|
|
def _quality_floor_check(quality: dict[str, int]) -> str | None:
    """Report every dimension scoring below ``QUALITY_DIM_FLOOR``.

    Returns a ``"quality floor breach: ..."`` message listing the offending
    ``dim=score`` pairs, or ``None`` when no dimension is below the floor.
    """
    breaches = []
    for dim, score in quality.items():
        if score < QUALITY_DIM_FLOOR:
            breaches.append(f"{dim}={score}")
    if not breaches:
        return None
    return f"quality floor breach: {', '.join(breaches)}"
|
|
|
|
|
|
def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
|
|
"""Average two quality score dicts dimension-by-dimension."""
|
|
return {d: (a.get(d, 0) + b.get(d, 0)) // 2 for d in a}
|
|
|
|
|
|
# Kept for backward compatibility with confidence_from_notes (reads stored JSON).
def _parse_gated_scores_legacy(
    data: dict,
    gates: tuple[str, ...],
    quality_dims: tuple[str, ...],
) -> dict:
    """Build a full result dict from an already-decoded gate+quality payload.

    ``data`` is expected to carry ``"gates"``, ``"quality"`` and optionally
    ``"failed_gate_reason"``. A ``"gates"`` or ``"quality"`` entry that is
    missing or not a dict is treated as empty, so every gate fails / every
    dimension scores 0 rather than raising.

    Returns a dict with keys ``gates``, ``quality``, ``gate_passed``,
    ``confidence``, ``failed_gate_reason`` and ``quality_floor_failed``.
    ``gate_passed`` is False when any gate failed or any quality dimension is
    below the floor; ``confidence`` is 0.0 in both cases.
    """
    gates_data = data.get("gates")
    if not isinstance(gates_data, dict):
        gates_data = {}
    quality_data = data.get("quality")
    if not isinstance(quality_data, dict):
        quality_data = {}

    parsed_gates: dict[str, bool] = {}
    for g in gates:
        val = gates_data.get(g)
        parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False
    gate_passed = all(parsed_gates.values())
    failed_reason = data.get("failed_gate_reason") if not gate_passed else None

    parsed_quality: dict[str, int] = {}
    for d in quality_dims:
        val = quality_data.get(d)
        # `val != val` is true only for NaN, which int() cannot convert.
        if not isinstance(val, (int, float)) or val != val:
            parsed_quality[d] = 0
        else:
            # Clamp before converting so +/-inf never reaches int().
            parsed_quality[d] = int(max(0, min(100, val)))

    # The quality floor is only evaluated for variants that passed every gate.
    floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None
    floor_failed = bool(floor_reason)
    confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_failed else 0.0
    return {
        "gates": parsed_gates,
        "quality": parsed_quality,
        "gate_passed": gate_passed and not floor_failed,
        "confidence": confidence,
        "failed_gate_reason": floor_reason if floor_failed else failed_reason,
        "quality_floor_failed": floor_failed,
    }
|
|
|
|
|
|
|
|
def _build_combined_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
|
|
return json.dumps({
|
|
"gates": {g: True for g in gates},
|
|
"failed_gate_reason": "null or string explaining which gate failed and why",
|
|
"quality": {d: 0 for d in quality_dims},
|
|
}, indent=2)
|
|
|
|
|
|
def _parse_combined(
    raw: str,
    gates: tuple[str, ...],
    quality_dims: tuple[str, ...],
) -> dict | None:
    """Turn one combined gate+quality reply into a result dict.

    Returns ``None`` when either the gate part or the quality part cannot be
    parsed. Otherwise returns a dict with keys ``gates``, ``quality``,
    ``gate_passed``, ``confidence``, ``failed_gate_reason`` and
    ``quality_floor_failed``:

    * a failed gate gives an empty ``quality`` dict and confidence 0.0;
    * a quality-floor breach gives ``gate_passed=False``, confidence 0.0 and
      the breach message as ``failed_gate_reason``;
    * otherwise confidence comes from ``_quality_confidence``.
    """
    gate_outcome = _parse_gates_only(raw, gates)
    if gate_outcome is None:
        return None
    all_ok, verdicts, gate_reason = gate_outcome

    def _outcome(scores, passed, confidence, reason, floor_failed):
        # Single place that fixes the key set and order of the result dict.
        return {
            "gates": verdicts,
            "quality": scores,
            "gate_passed": passed,
            "confidence": confidence,
            "failed_gate_reason": reason,
            "quality_floor_failed": floor_failed,
        }

    if not all_ok:
        return _outcome({}, False, 0.0, gate_reason, False)

    scores = _parse_quality_only(raw, quality_dims)
    if scores is None:
        return None

    breach = _quality_floor_check(scores)
    if breach:
        return _outcome(scores, False, 0.0, breach, True)
    return _outcome(scores, True, _quality_confidence(scores), None, False)
|
|
|
|
def confidence_from_notes(notes_json: str) -> tuple[float, bool]:
    """Read ``(confidence, gate_passed)`` back out of a stored notes JSON string.

    Two stored shapes are understood:

    * gated format (has a ``"gates"`` key): the stored ``"confidence"``
      (default 0.0) and ``"gate_passed"`` (default False) are returned as-is;
    * legacy flat format: treated as gate-passed, with confidence being the
      plain average of all numeric values (0 when there are none).
    """
    stored = json.loads(notes_json)

    if "gates" not in stored:
        # Legacy flat format: no gate information was recorded.
        numeric = [v for v in stored.values() if isinstance(v, (int, float))]
        if numeric:
            return (sum(numeric) / len(numeric), True)
        return (0, True)

    return (stored.get("confidence", 0.0), stored.get("gate_passed", False))
|
|
|
|
|
|
class Scorer:
    """Scores a variant image using a configurable single-pass or two-pass rubric.

    single_pass=False (default — two-pass + optional tiebreaker):
        Pass 1 — gate-only prompt: binary pass/fail. Failures exit immediately.
        Pass 2 — quality-only prompt: 0-100 dimensions, only if all gates pass.
        Pass 3 — tiebreaker (optional): re-scores quality when confidence is within
                 `tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.

    single_pass=True (one combined gate+quality call):
        For high-accuracy models where extra API calls cost more than accuracy gain.

    Two backend types:
        - "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
        - "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)

    Scoring methods return ``None`` when a variant could not be scored (missing
    file, no gates configured for the category, backend failure, unparseable
    reply). Otherwise they return a dict with keys ``gates``, ``quality``,
    ``gate_passed``, ``confidence``, ``failed_gate_reason`` and
    ``quality_floor_failed``.
    """

    def __init__(
        self,
        name: str,
        backend: str,
        model: str,
        threshold: float,
        tiebreaker_range: float = 0.0,
        single_pass: bool = False,
    ):
        """Store the stage configuration and create the backend client.

        Args:
            name: Stage name; used in log messages and the model-boss client id.
            backend: ``"model-boss"`` or ``"claude"``. Any other value leaves the
                scorer without a client, and every backend call returns None.
            model: Model identifier passed to the backend.
            threshold: Confidence value the tiebreaker range is centred on.
            tiebreaker_range: Half-width of the band around ``threshold`` that
                triggers a second quality pass; 0.0 disables the tiebreaker.
            single_pass: Use one combined gate+quality call instead of two passes.
        """
        self.name = name
        self.backend = backend
        self.model = model
        self.threshold = threshold
        self.tiebreaker_range = tiebreaker_range
        self.single_pass = single_pass
        # Bounds concurrent score() calls; unknown backends default to 4.
        self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))
        # Always defined so an unknown backend never hits a missing attribute.
        self._client = None

        if backend == "model-boss":
            from model_boss import InferenceClient

            self._client = InferenceClient(
                client_id=f"sprite-ranker-{name}",
                default_priority="normal",
                timeout=120.0,
            )
        elif backend == "claude":
            _ensure_claude_sdk()
            from claude_code_batch_sdk import ClaudeClient

            self._client = ClaudeClient(model=model, max_concurrent=2, timeout=180.0)

    @staticmethod
    def _result(
        gates: dict,
        quality: dict,
        *,
        passed: bool,
        confidence: float,
        reason: str | None,
        floor_failed: bool,
    ) -> dict:
        """Assemble the result dict shape shared by every scoring outcome."""
        return {
            "gates": gates,
            "quality": quality,
            "gate_passed": passed,
            "confidence": confidence,
            "failed_gate_reason": reason,
            "quality_floor_failed": floor_failed,
        }

    async def score(self, raw_path: str, sprite: dict) -> dict | None:
        """Score a single variant image, bounded by the backend semaphore."""
        async with self._semaphore:
            return await self._score_inner(raw_path, sprite)

    async def score_stream(
        self, items: list[tuple[str, dict]],
    ):
        """Score multiple (image_path, sprite) pairs concurrently.

        Yields (index, result) pairs as each completes — callers receive and
        process results immediately without waiting for the whole batch.
        This keeps memory bounded and allows progress commits during long runs.

        ``index`` is always the item's position in ``items``. An item whose
        scoring raised is logged and yielded as ``(index, None)``, so the
        caller can still tell which item failed.
        """
        async def _task(idx: int, path: str, sprite: dict) -> tuple[int, dict | None]:
            # Catch here rather than around as_completed(): only this scope
            # still knows which index the failure belongs to.
            try:
                return (idx, await self.score(path, sprite))
            except Exception as exc:
                logger.warning("[%s] Batch item error: %s", self.name, exc)
                return (idx, None)

        tasks = [_task(i, p, s) for i, (p, s) in enumerate(items)]
        for coro in asyncio.as_completed(tasks):
            yield await coro

    async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
        """Send a single prompt+image to the backend. Returns raw text or None.

        For "model-boss" the image is sent inline as a base64 data URL. For
        "claude" the image bytes are not sent: the call runs with the image's
        directory as ``cwd`` and the "Read" tool allowed (presumably the prompt
        names the file for the model to open — confirm against the templates).
        Any exception from the backend is logged and turned into None.
        """
        try:
            if self.backend == "model-boss":
                return await self._client.chat(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                            ],
                        },
                    ],
                    max_tokens=2048,
                    temperature=0.1,
                    keep_alive=300,
                )
            elif self.backend == "claude":
                return await self._client.generate(
                    system=SYSTEM_PROMPT,
                    user=prompt,
                    cwd=str(Path(raw_path).parent),
                    allowed_tools=["Read"],
                )
            return None
        except Exception as exc:
            logger.warning("[%s] Backend call failed: %s", self.name, exc)
            return None

    async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
        """Shared setup, then branches to single-pass or two-pass scoring.

        ``sprite`` must provide ``"category"`` and ``"prompt"``; ``"entity_id"``
        is optional. Returns None when the image is missing, the category has
        no gates, a required backend call fails, or a reply cannot be parsed.
        """
        import base64

        if not raw_path or not Path(raw_path).exists():
            return None

        category = sprite["category"]
        gates, quality_dims, gate_descs, quality_descs = _get_category_config(category)
        if not gates:
            return None

        entity_id = sprite.get("entity_id", "")
        ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
        ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
        filename = Path(raw_path).name
        # Only the first 300 characters of the generation prompt are shown to the scorer.
        prompt_excerpt = sprite["prompt"][:300]
        img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()

        gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
        quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)

        # ---------------------------------------------------------------
        # Single-pass: one combined gate+quality call
        # ---------------------------------------------------------------
        if self.single_pass:
            prompt = COMBINED_PROMPT_TEMPLATE.format(
                filename=filename,
                category=category,
                entity_id=entity_id,
                prompt=prompt_excerpt,
                gate_instructions=gate_instructions,
                quality_instructions=quality_instructions,
                combined_template=_build_combined_template(gates, quality_dims),
            )
            raw = await self._call_backend(img_b64, raw_path, prompt)
            if raw is None:
                return None
            return _parse_combined(raw, gates, quality_dims)

        # ---------------------------------------------------------------
        # Two-pass + optional tiebreaker
        # ---------------------------------------------------------------

        # --- Pass 1: gates only ---
        gate_prompt = GATE_PROMPT_TEMPLATE.format(
            filename=filename,
            category=category,
            entity_id=entity_id,
            prompt=prompt_excerpt,
            gate_instructions=gate_instructions,
            gate_template=_build_gate_only_template(gates),
        )
        gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
        if gate_raw is None:
            return None

        gate_result = _parse_gates_only(gate_raw, gates)
        if gate_result is None:
            return None

        gate_passed, parsed_gates, failed_reason = gate_result
        if not gate_passed:
            # Gate failure short-circuits: no quality call is made.
            return self._result(
                parsed_gates, {},
                passed=False, confidence=0.0, reason=failed_reason, floor_failed=False,
            )

        # --- Pass 2: quality only ---
        quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
            filename=filename,
            entity_id=entity_id,
            quality_instructions=quality_instructions,
            quality_template=_build_quality_only_template(quality_dims),
        )
        quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
        if quality_raw is None:
            return None

        quality = _parse_quality_only(quality_raw, quality_dims)
        if quality is None:
            return None

        # --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
        if self.tiebreaker_range > 0.0:
            confidence = _quality_confidence(quality)
            if abs(confidence - self.threshold) <= self.tiebreaker_range:
                logger.debug(
                    "[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
                    self.name, confidence, self.threshold, self.tiebreaker_range,
                )
                # Same prompt again; a failed or unparseable tiebreaker is
                # ignored and the pass-2 scores stand.
                tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
                if tie_raw is not None:
                    tie_quality = _parse_quality_only(tie_raw, quality_dims)
                    if tie_quality is not None:
                        quality = _merge_quality(quality, tie_quality)

        floor_reason = _quality_floor_check(quality)
        if floor_reason:
            return self._result(
                parsed_gates, quality,
                passed=False, confidence=0.0, reason=floor_reason, floor_failed=True,
            )

        return self._result(
            parsed_gates, quality,
            passed=True, confidence=_quality_confidence(quality), reason=None, floor_failed=False,
        )
|
|
|
|
|
|
def _ensure_claude_sdk():
|
|
"""Add claude-code-batch-sdk to path if not installed."""
|
|
import sys
|
|
sdk_path = Path(__file__).resolve().parent.parent.parent.parent.parent / (
|
|
"@applications/@ml/@packages/@py/claude-code-batch-sdk/src"
|
|
)
|
|
if sdk_path.exists() and str(sdk_path) not in sys.path:
|
|
sys.path.insert(0, str(sdk_path))
|
|
|
|
|
|
def _load_pipeline_config() -> dict:
    """Load scoring pipeline stages from YAML.

    Reads ``prompts/scoring_pipeline.yaml`` next to this module. The file is
    decoded as UTF-8 explicitly so parsing does not depend on the platform's
    default encoding. An empty document (for which ``yaml.safe_load`` returns
    None) is returned as an empty dict so callers can use ``.get`` safely.

    Raises:
        OSError: if the config file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    import yaml

    config_path = Path(__file__).parent / "prompts" / "scoring_pipeline.yaml"
    with open(config_path, encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    return loaded if loaded is not None else {}
|
|
|
|
|
|
class ScoringPipeline:
    """Multi-stage scoring pipeline: variants escalate through scorers.

    Each stage only sees variants that passed all previous stages.
    Rejections at any stage go back to generation.
    Only the deficit needed is escalated — no wasteful re-evaluation.

    Stages are configured in scoring_pipeline.yaml. Default:
      Qwen3-VL (free, 5s) → Haiku ($0.001, 5s) → Opus ($0.015, 30s) → User
    """

    def __init__(self, registry: SpriteRegistry, raw_dir: Path):
        """Build one ``Scorer`` per configured stage.

        Args:
            registry: Sprite registry; used for sprite/variant lookups, direct
                SQL on ``registry.conn`` and ``store_score``.
            raw_dir: Stored on the instance; not read by the methods in this class.
        """
        self.registry = registry
        self.raw_dir = raw_dir
        config = _load_pipeline_config()
        self.target_approved = config.get("target_approved", MIN_GOOD_VARIANTS)
        self.stages: list[Scorer] = []
        for stage in config.get("stages", []):
            self.stages.append(Scorer(
                name=stage["name"],
                backend=stage["backend"],
                model=stage["model"],
                threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
                tiebreaker_range=stage.get("tiebreaker_range", 0.0),
                single_pass=stage.get("single_pass", False),
            ))

    async def score_variant_at_tier(
        self, variant: dict, sprite: dict, tier: int,
    ) -> dict | None:
        """Score a variant at a specific pipeline tier. Returns result or None.

        None is returned for an out-of-range ``tier`` as well as for any
        scoring failure reported by the stage's scorer.
        """
        if tier < 0 or tier >= len(self.stages):
            return None
        return await self.stages[tier].score(variant.get("raw_path", ""), sprite)

    async def advance_sprite(self, sprite_id: str) -> dict:
        """Advance a sprite through the scoring pipeline.

        For each tier, find variants that passed the previous tier but haven't
        been evaluated at this tier yet. Score them. Track results.

        A variant passes a tier when all gates pass and its confidence reaches
        that stage's own ``threshold``; it then moves to ``review_tier + 1``.
        Otherwise it is marked rejected (``rating = -1``). Variants whose
        scoring returned None are left untouched and stay eligible for a later
        run. Changes are committed once per tier.

        Returns {tier_results, ready_count, needs_regen, deficit}
        """
        sprite = self.registry.get_sprite(sprite_id)
        if not sprite:
            return {"tier_results": [], "ready_count": 0, "needs_regen": True, "deficit": self.target_approved}

        # NOTE(review): per-category thresholds (CATEGORY_THRESHOLDS) are not
        # applied here; each stage uses scorer.threshold — confirm that is intended.
        tier_results = []

        for tier_idx, scorer in enumerate(self.stages):
            # Find variants eligible for this tier:
            # - completed, has image
            # - review_tier == tier_idx (passed all previous tiers, not yet scored here)
            # - not rejected (rating != -1)
            variants = self.registry.conn.execute(
                "SELECT * FROM variants WHERE sprite_id=? AND job_status='completed' "
                "AND raw_path IS NOT NULL AND (rating IS NULL OR rating != -1) "
                "AND COALESCE(review_tier, 0) = ?",
                (sprite_id, tier_idx),
            ).fetchall()

            if not variants:
                tier_results.append({"tier": scorer.name, "scored": 0, "passed": 0, "failed": 0})
                continue

            passed = 0
            failed = 0
            for v in variants:
                result = await self.score_variant_at_tier(dict(v), dict(sprite), tier_idx)
                if result is None:
                    # Scoring failed (backend/parse error): leave the row as-is.
                    continue

                gate_passed = result["gate_passed"]
                conf = result["confidence"]
                # Map 0-1 confidence onto a 1-5 rating; gate failures get the minimum.
                rating = max(1, min(5, round(conf * 5))) if gate_passed else 1
                passed_tier = gate_passed and conf >= scorer.threshold

                scored_by = json.dumps({
                    "model": scorer.model,
                    "backend": scorer.backend,
                    "stage": scorer.name,
                    "tier": tier_idx,
                    "scoring_version": "pipeline_v2",
                })

                # Store per-scorer scorecard (preserves all evaluations)
                self.registry.store_score(
                    variant_id=v["id"],
                    scorer_name=scorer.name,
                    scorer_model=scorer.model,
                    tier=tier_idx,
                    result=result,
                )

                if passed_tier:
                    # Advance to next tier — notes stores latest scorer's result
                    self.registry.conn.execute(
                        "UPDATE variants SET rating=?, notes=?, scored_by=?, scored_at=?, review_tier=? WHERE id=?",
                        (rating, json.dumps(result), scored_by, _now(), tier_idx + 1, v["id"]),
                    )
                    passed += 1
                else:
                    # Rejected at this tier
                    self.registry.conn.execute(
                        "UPDATE variants SET rating=-1, notes=?, scored_by=?, scored_at=?, review_tier=? WHERE id=?",
                        (json.dumps(result), scored_by, _now(), tier_idx, v["id"]),
                    )
                    failed += 1
            self.registry.conn.commit()

            tier_results.append({"tier": scorer.name, "scored": len(variants), "passed": passed, "failed": failed})
            print(f" [{scorer.name}] {passed}/{len(variants)} passed (tier {tier_idx})")

        # Count fully-approved variants (passed all tiers)
        final_tier = len(self.stages)
        ready = self.registry.conn.execute(
            "SELECT COUNT(*) FROM variants WHERE sprite_id=? AND COALESCE(review_tier, 0) >= ? AND rating != -1",
            (sprite_id, final_tier),
        ).fetchone()[0]

        deficit = max(0, self.target_approved - ready)
        return {
            "tier_results": tier_results,
            "ready_count": ready,
            "needs_regen": ready < self.target_approved,
            "deficit": deficit,
        }

    async def rank_and_filter(
        self,
        sprite_id: str,
        threshold: float | None = None,
        min_good: int | None = None,
    ) -> dict:
        """Score unscored variants through the pipeline, return status.

        Compatible with the old SpriteRanker.rank_and_filter() interface.

        Args:
            sprite_id: Sprite whose variants are advanced and ranked.
            threshold: Confidence cut-off for ``good_count``; defaults to the
                sprite category's entry in ``CATEGORY_THRESHOLDS`` or, failing
                that, ``CONFIDENCE_THRESHOLD``.
            min_good: Accepted for interface compatibility; not used —
                ``needs_regen`` and ``deficit`` come from ``advance_sprite``.

        Returns:
            Dict with ``ranked`` (non-rejected variants that have notes, sorted
            by confidence descending), ``good_count``, ``needs_regen``,
            ``deficit`` and ``tier_results``.
        """
        result = await self.advance_sprite(sprite_id)

        # Also build the ranked list for display
        sprite = self.registry.get_sprite(sprite_id)
        category = sprite["category"] if sprite else ""
        if threshold is None:
            threshold = CATEGORY_THRESHOLDS.get(category, CONFIDENCE_THRESHOLD)

        all_variants = self.registry.get_variants(sprite_id)
        ranked = []
        for v in all_variants:
            if v["notes"] is None or v["rating"] == -1:
                continue
            conf, gate_passed = confidence_from_notes(v["notes"])
            ranked.append({
                "variant_id": v["id"],
                "seed": v["seed"],
                "raw_path": v["raw_path"],
                "scores": json.loads(v["notes"]),
                "confidence": round(conf, 3),
                "gate_passed": gate_passed,
                # Rows without a review_tier column are reported as tier 0.
                "review_tier": v["review_tier"] if "review_tier" in v.keys() else 0,
            })
        ranked.sort(key=lambda x: x["confidence"], reverse=True)

        good = [r for r in ranked if r["gate_passed"] and r["confidence"] >= threshold]
        return {
            "ranked": ranked,
            "good_count": len(good),
            "needs_regen": result["needs_regen"],
            "deficit": result["deficit"],
            "tier_results": result["tier_results"],
        }
|
|
|
|
|
|
# Backward-compatible alias: code written against the former SpriteRanker name
# (e.g. SpriteRanker.rank_and_filter) keeps working with ScoringPipeline.
SpriteRanker = ScoringPipeline
|