diff --git a/tools/sprite-generation/engine/ranker.py b/tools/sprite-generation/engine/ranker.py
index 20a1fd1b..71952a44 100644
--- a/tools/sprite-generation/engine/ranker.py
+++ b/tools/sprite-generation/engine/ranker.py
@@ -307,34 +307,49 @@ Your job has TWO parts:
If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.
-Always respond with valid JSON only — no other text."""
+Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block.
-RANKING_PROMPT_TEMPLATE = """\
+Example FAIL (facing_southwest gate):
+{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}
+
+Example PASS (all gates true, quality scored):
+{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""
+
+GATE_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.
-This image was generated as a game sprite:
+This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
-- Prompt used: {prompt}
+- Prompt: {prompt}
-## STEP 1: Boolean Gates (answer true or false for each)
+Evaluate each boolean gate (true or false):
{gate_instructions}
-## STEP 2: Quality Ranges (0-100, only if ALL gates are true)
+Respond with this exact JSON:
+{gate_template}"""
+
+QUALITY_PROMPT_TEMPLATE = """\
+Look at the image file {filename} in this directory.
+
+All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:
{quality_instructions}
-Respond with this exact JSON structure:
-{score_template}"""
+Respond with this exact JSON:
+{quality_template}"""
def _extract_json(text: str) -> dict | None:
"""Extract JSON object from VLM response text.
- Handles markdown code blocks, leading/trailing text, and raw JSON.
+ Handles markdown code blocks, leading/trailing text, raw JSON, and
+ Qwen3 <think>...</think> reasoning blocks.
"""
import re
+ # Strip Qwen3 thinking blocks before any parse attempt
+ text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
# Try direct parse
try:
return json.loads(text)
@@ -527,81 +542,105 @@ def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) ->
return "\n".join(lines)
-def _build_score_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
- template: dict = {
+def _build_gate_only_template(gates: tuple[str, ...]) -> str:
+ return json.dumps({
"gates": {g: True for g in gates},
"failed_gate_reason": "null or string explaining which gate failed and why",
- "quality": {d: 0 for d in quality_dims},
- }
- return json.dumps(template, indent=2)
+ }, indent=2)
-def _parse_gated_scores(
+def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
+    """Render the JSON skeleton the quality-only prompt asks the reviewer to fill in.
+
+    Every name in ``quality_dims`` appears under ``"quality"`` with a
+    placeholder score of 0; the result is pretty-printed with a 2-space indent.
+    """
+    placeholder_scores = dict.fromkeys(quality_dims, 0)
+    skeleton = {"quality": placeholder_scores}
+    return json.dumps(skeleton, indent=2)
+
+
+def _parse_gates_only(
raw: str,
gates: tuple[str, ...],
- quality_dims: tuple[str, ...],
-) -> dict | None:
- """Parse reviewer's JSON response into gate+quality structure.
-
- Returns dict with keys: gates, quality, gate_passed, confidence, failed_gate_reason
- Returns None if JSON is unparseable.
- """
+) -> tuple[bool, dict[str, bool], str | None] | None:
+ """Parse a gate-only response. Returns (gate_passed, gates_dict, failed_reason) or None."""
data = _extract_json(raw)
if data is None:
return None
-
gates_data = data.get("gates")
if not isinstance(gates_data, dict):
return None
-
- # Parse gates — must be boolean
- parsed_gates: dict[str, bool] = {}
+ parsed: dict[str, bool] = {}
for g in gates:
val = gates_data.get(g)
if isinstance(val, bool):
- parsed_gates[g] = val
+ parsed[g] = val
elif isinstance(val, (int, float)):
- parsed_gates[g] = bool(val)
+ parsed[g] = bool(val)
else:
- parsed_gates[g] = False # missing gate = fail
+ parsed[g] = False
+ gate_passed = all(parsed.values())
+ failed_reason = data.get("failed_gate_reason") if not gate_passed else None
+ if isinstance(failed_reason, str) and failed_reason.lower() in ("null", "none", ""):
+ failed_reason = None
+ return (gate_passed, parsed, failed_reason)
+
+def _parse_quality_only(
+    raw: str,
+    quality_dims: tuple[str, ...],
+) -> dict[str, int] | None:
+    """Parse a quality-only response into clamped integer scores.
+
+    Args:
+        raw: Raw reviewer text; the JSON payload is located by ``_extract_json``.
+        quality_dims: Dimension names to read from the ``"quality"`` object.
+
+    Returns:
+        ``{dim: score}`` with one entry per name in ``quality_dims``, each
+        clamped to 0-100. A dimension that is missing, non-numeric, NaN or
+        infinite scores 0. Returns None when no JSON object could be
+        extracted, or when ``"quality"`` is present but is not an object.
+    """
+    import math
+
+    data = _extract_json(raw)
+    # The direct json.loads path can yield a bare scalar or array, which
+    # has no ``.get``; treat anything that is not an object as unparseable.
+    if not isinstance(data, dict):
+        return None
+    quality_data = data.get("quality", {})
+    if not isinstance(quality_data, dict):
+        return None
+    result: dict[str, int] = {}
+    for d in quality_dims:
+        val = quality_data.get(d)
+        # json.loads accepts NaN/Infinity literals and int() raises on both,
+        # so non-finite floats are scored like a missing value.
+        usable = isinstance(val, int) or (isinstance(val, float) and math.isfinite(val))
+        result[d] = max(0, min(100, int(val))) if usable else 0
+    return result
+
+
+def _quality_confidence(quality: dict[str, int]) -> float:
+    """Return the mean quality score as a 0-1 fraction, rounded to 3 places.
+
+    An empty mapping yields 0.0.
+    """
+    if quality:
+        max_total = len(quality) * 100
+        return round(sum(quality.values()) / max_total, 3)
+    return 0.0
+
+
+def _quality_floor_check(quality: dict[str, int]) -> str | None:
+    """Report which dimensions score below ``QUALITY_DIM_FLOOR``.
+
+    Returns a ``"quality floor breach: ..."`` message listing each offending
+    ``dim=score`` pair in mapping order, or None when no score is below the floor.
+    """
+    below_floor = [(dim, score) for dim, score in quality.items() if score < QUALITY_DIM_FLOOR]
+    if not below_floor:
+        return None
+    listing = ", ".join(f"{dim}={score}" for dim, score in below_floor)
+    return f"quality floor breach: {listing}"
+
+
+def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
+    """Average two quality score dicts dimension-by-dimension.
+
+    Dimensions present in both inputs are averaged with floor division, so
+    the result stays an int. A dimension present in only one input keeps that
+    input's score rather than being averaged against an implicit 0 (which
+    would halve it) or dropped. Keys follow ``a``'s order, then any keys that
+    appear only in ``b``.
+    """
+    merged = {**a, **b}
+    # Only the shared dimensions need averaging; the rest already hold the
+    # single score that was supplied for them.
+    for dim in a.keys() & b.keys():
+        merged[dim] = (a[dim] + b[dim]) // 2
+    return merged
+
+
+# kept for backward-compat with confidence_from_notes (reads stored JSON)
+def _parse_gated_scores_legacy(
+ data: dict,
+ gates: tuple[str, ...],
+ quality_dims: tuple[str, ...],
+) -> dict:
+ parsed_gates: dict[str, bool] = {}
+ for g in gates:
+ val = data.get("gates", {}).get(g)
+ parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False
gate_passed = all(parsed_gates.values())
failed_reason = data.get("failed_gate_reason") if not gate_passed else None
-
- # Parse quality — only meaningful if gates passed
parsed_quality: dict[str, int] = {}
- quality_data = data.get("quality", {})
- if isinstance(quality_data, dict):
- for d in quality_dims:
- val = quality_data.get(d)
- if isinstance(val, (int, float)):
- parsed_quality[d] = max(0, min(100, int(val)))
- else:
- parsed_quality[d] = 0
-
- # Per-dimension quality floor: any single dim below threshold = auto-reject
- quality_floor_failed = False
- floor_failures: list[str] = []
- if gate_passed and parsed_quality:
- for dim_name, dim_val in parsed_quality.items():
- if dim_val < QUALITY_DIM_FLOOR:
- floor_failures.append(f"{dim_name}={dim_val}")
- quality_floor_failed = True
-
- if gate_passed and parsed_quality and not quality_floor_failed:
- confidence = sum(parsed_quality.values()) / (len(parsed_quality) * 100)
- else:
- confidence = 0.0
- if quality_floor_failed and not failed_reason:
- failed_reason = f"quality floor breach: {', '.join(floor_failures)}"
-
+ for d in quality_dims:
+ val = data.get("quality", {}).get(d)
+ parsed_quality[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
+ floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None
+ confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_reason else 0.0
+ floor_failed = bool(floor_reason)
return {
"gates": parsed_gates,
"quality": parsed_quality,
- "gate_passed": gate_passed and not quality_floor_failed,
- "confidence": round(confidence, 3),
- "failed_gate_reason": failed_reason,
- "quality_floor_failed": quality_floor_failed,
+ "gate_passed": gate_passed and not floor_failed,
+ "confidence": confidence,
+ "failed_gate_reason": floor_reason if floor_failed else failed_reason,
+ "quality_floor_failed": floor_failed,
}
@@ -623,18 +662,31 @@ def confidence_from_notes(notes_json: str) -> tuple[float, bool]:
class Scorer:
- """Scores a variant image using gated boolean + quality range rubric.
+ """Scores a variant image using a two-pass gate→quality rubric with optional tiebreaker.
+
+ Pass 1 — gate-only prompt: binary pass/fail checks. Failures exit immediately.
+ Pass 2 — quality-only prompt: 0-100 dimensions, only reached when all gates pass.
+ Pass 3 — tiebreaker (optional): re-scores quality when confidence is within
+ `tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.
Two backend types:
- "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
- "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)
"""
- def __init__(self, name: str, backend: str, model: str, threshold: float):
+ def __init__(
+ self,
+ name: str,
+ backend: str,
+ model: str,
+ threshold: float,
+ tiebreaker_range: float = 0.0,
+ ):
self.name = name
self.backend = backend
self.model = model
self.threshold = threshold
+ self.tiebreaker_range = tiebreaker_range
self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))
if backend == "model-boss":
@@ -676,8 +728,40 @@ class Scorer:
logger.warning("[%s] Batch item error: %s", self.name, exc)
return results
+    async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
+        """Dispatch one prompt (plus the image) to this scorer's backend.
+
+        ``model-boss`` receives the system prompt and a user message carrying
+        the prompt text followed by the image as a base64 PNG data URL.
+        ``claude`` receives the system prompt and the prompt text, with ``cwd``
+        set to the image's directory and the ``Read`` tool allowed.
+
+        Returns whatever the client call returns, or None when the backend
+        name is not recognised or the call raises (the exception is logged
+        as a warning).
+        """
+        try:
+            if self.backend == "claude":
+                return await self._client.generate(
+                    system=SYSTEM_PROMPT,
+                    user=prompt,
+                    cwd=str(Path(raw_path).parent),
+                    allowed_tools=["Read"],
+                )
+            if self.backend != "model-boss":
+                return None
+            data_url = f"data:image/png;base64,{img_b64}"
+            user_content = [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": data_url}},
+            ]
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_content},
+            ]
+            return await self._client.chat(
+                model=self.model,
+                messages=messages,
+                max_tokens=2048,
+                temperature=0.1,
+                keep_alive=300,
+            )
+        except Exception as exc:
+            logger.warning("[%s] Backend call failed: %s", self.name, exc)
+            return None
+
async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
- """Score a single variant image. Returns gated result dict or None."""
+ """Two-pass (+ optional tiebreaker) scoring. Returns gated result dict or None."""
import base64
if not raw_path or not Path(raw_path).exists():
@@ -691,56 +775,87 @@ class Scorer:
entity_id = sprite.get("entity_id", "")
ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
+ filename = Path(raw_path).name
+ prompt_excerpt = sprite["prompt"][:300]
- gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
- quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)
- score_template = _build_score_template(gates, quality_dims)
+ img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
- user_prompt = RANKING_PROMPT_TEMPLATE.format(
- filename=Path(raw_path).name,
+ # --- Pass 1: gates only ---
+ gate_prompt = GATE_PROMPT_TEMPLATE.format(
+ filename=filename,
category=category,
entity_id=entity_id,
- prompt=sprite["prompt"][:300],
- gate_instructions=gate_instructions,
- quality_instructions=quality_instructions,
- score_template=score_template,
+ prompt=prompt_excerpt,
+ gate_instructions=_build_gate_instructions(gates, ctx_gate_descs),
+ gate_template=_build_gate_only_template(gates),
)
-
- try:
- if self.backend == "model-boss":
- img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
- raw = await self._client.chat(
- model=self.model,
- messages=[
- {"role": "system", "content": SYSTEM_PROMPT},
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
- {"type": "text", "text": user_prompt},
- ],
- },
- ],
- max_tokens=500,
- keep_alive=300,
- )
- elif self.backend == "claude":
- raw = await self._client.generate(
- system=SYSTEM_PROMPT,
- user=user_prompt,
- cwd=str(Path(raw_path).parent),
- allowed_tools=["Read"],
- )
- else:
- return None
- except Exception as exc:
- logger.warning("[%s] Scoring failed: %s", self.name, exc)
+ gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
+ if gate_raw is None:
return None
- if raw is None:
+ gate_result = _parse_gates_only(gate_raw, gates)
+ if gate_result is None:
return None
- return _parse_gated_scores(raw, gates, quality_dims)
+ gate_passed, parsed_gates, failed_reason = gate_result
+ if not gate_passed:
+ return {
+ "gates": parsed_gates,
+ "quality": {},
+ "gate_passed": False,
+ "confidence": 0.0,
+ "failed_gate_reason": failed_reason,
+ "quality_floor_failed": False,
+ }
+
+ # --- Pass 2: quality only ---
+ quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
+ filename=filename,
+ entity_id=entity_id,
+ quality_instructions=_build_quality_instructions(quality_dims, ctx_quality_descs),
+ quality_template=_build_quality_only_template(quality_dims),
+ )
+ quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
+ if quality_raw is None:
+ return None
+
+ quality = _parse_quality_only(quality_raw, quality_dims)
+ if quality is None:
+ return None
+
+ # --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
+ if self.tiebreaker_range > 0.0:
+ confidence = _quality_confidence(quality)
+ if abs(confidence - self.threshold) <= self.tiebreaker_range:
+ logger.debug(
+ "[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
+ self.name, confidence, self.threshold, self.tiebreaker_range,
+ )
+ tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
+ if tie_raw is not None:
+ tie_quality = _parse_quality_only(tie_raw, quality_dims)
+ if tie_quality is not None:
+ quality = _merge_quality(quality, tie_quality)
+
+ floor_reason = _quality_floor_check(quality)
+ if floor_reason:
+ return {
+ "gates": parsed_gates,
+ "quality": quality,
+ "gate_passed": False,
+ "confidence": 0.0,
+ "failed_gate_reason": floor_reason,
+ "quality_floor_failed": True,
+ }
+
+ return {
+ "gates": parsed_gates,
+ "quality": quality,
+ "gate_passed": True,
+ "confidence": _quality_confidence(quality),
+ "failed_gate_reason": None,
+ "quality_floor_failed": False,
+ }
def _ensure_claude_sdk():
@@ -784,6 +899,7 @@ class ScoringPipeline:
backend=stage["backend"],
model=stage["model"],
threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
+ tiebreaker_range=stage.get("tiebreaker_range", 0.0),
))
async def score_variant_at_tier(
diff --git a/tools/sprite-generation/engine/registry.py b/tools/sprite-generation/engine/registry.py
index ee90de5b..024cefc1 100644
--- a/tools/sprite-generation/engine/registry.py
+++ b/tools/sprite-generation/engine/registry.py
@@ -894,6 +894,14 @@ class SpriteRegistry:
"avg_confidence": round(row["avg_conf"] or 0.0, 3),
}
+ unscored = self.conn.execute(
+ """SELECT COUNT(*) FROM variants
+ WHERE job_status = 'completed'
+ AND NOT EXISTS (
+ SELECT 1 FROM latest_scores WHERE variant_id = variants.id
+ )"""
+ ).fetchone()[0]
+
approved = self.conn.execute(
"SELECT COUNT(*) FROM variants WHERE is_approved = 1"
).fetchone()[0]
@@ -905,6 +913,7 @@ class SpriteRegistry:
funnel = {
"total_completed": total_completed,
"total_processed": total_processed,
+ "unscored": unscored,
"scoring": scoring,
"approved": approved,
"installed": installed,