refactor(sprite-generation): ♻️ Split sprite scoring into separate gate and quality passes with an optional tiebreaker, strip Qwen3 think blocks before JSON parsing, and add an unscored-variant count to the registry funnel

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-03-29 05:47:59 -07:00 · 2026-03-29 05:47:59 -07:00 · af0dd0372c
commit af0dd0372c
parent ae3cabaf54
2 changed files with 229 additions and 104 deletions
--- a/tools/sprite-generation/engine/ranker.py
+++ b/tools/sprite-generation/engine/ranker.py
@ -307,34 +307,49 @@ Your job has TWO parts:

 If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.

-Always respond with valid JSON only — no other text."""
+Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block.

-RANKING_PROMPT_TEMPLATE = """\
+Example FAIL (facing_southwest gate):
+{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}
+
+Example PASS (all gates true, quality scored):
+{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""
+
+GATE_PROMPT_TEMPLATE = """\
 Look at the image file {filename} in this directory.

-This image was generated as a game sprite:
+This sprite was generated as:
 - Category: {category}
 - Entity: {entity_id}
- Prompt used: {prompt}
+- Prompt: {prompt}

-## STEP 1: Boolean Gates (answer true or false for each)
+Evaluate each boolean gate (true or false):

 {gate_instructions}

-## STEP 2: Quality Ranges (0-100, only if ALL gates are true)
+Respond with this exact JSON:
+{gate_template}"""
+
+QUALITY_PROMPT_TEMPLATE = """\
+Look at the image file {filename} in this directory.
+
+All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:

 {quality_instructions}

-Respond with this exact JSON structure:
-{score_template}"""
+Respond with this exact JSON:
+{quality_template}"""


 def _extract_json(text: str) -> dict | None:
    """Extract JSON object from VLM response text.

-    Handles markdown code blocks, leading/trailing text, and raw JSON.
+    Handles markdown code blocks, leading/trailing text, raw JSON, and
+    Qwen3 <think>...</think> reasoning blocks.
    """
    import re
+    # Strip Qwen3 thinking blocks before any parse attempt
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    # Try direct parse
    try:
        return json.loads(text)
@ -527,81 +542,105 @@ def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) ->
    return "\n".join(lines)


-def _build_score_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
-    template: dict = {
+def _build_gate_only_template(gates: tuple[str, ...]) -> str:
+    return json.dumps({
        "gates": {g: True for g in gates},
        "failed_gate_reason": "null or string explaining which gate failed and why",
-        "quality": {d: 0 for d in quality_dims},
-    }
-    return json.dumps(template, indent=2)
+    }, indent=2)


-def _parse_gated_scores(
+def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
+    return json.dumps({"quality": {d: 0 for d in quality_dims}}, indent=2)
+
+
+def _parse_gates_only(
    raw: str,
    gates: tuple[str, ...],
-    quality_dims: tuple[str, ...],
-) -> dict | None:
-    """Parse reviewer's JSON response into gate+quality structure.
-
-    Returns dict with keys: gates, quality, gate_passed, confidence, failed_gate_reason
-    Returns None if JSON is unparseable.
-    """
+) -> tuple[bool, dict[str, bool], str | None] | None:
+    """Parse a gate-only response. Returns (gate_passed, gates_dict, failed_reason) or None."""
    data = _extract_json(raw)
    if data is None:
        return None
-
    gates_data = data.get("gates")
    if not isinstance(gates_data, dict):
        return None
-
-    # Parse gates — must be boolean
-    parsed_gates: dict[str, bool] = {}
+    parsed: dict[str, bool] = {}
    for g in gates:
        val = gates_data.get(g)
        if isinstance(val, bool):
-            parsed_gates[g] = val
+            parsed[g] = val
        elif isinstance(val, (int, float)):
-            parsed_gates[g] = bool(val)
+            parsed[g] = bool(val)
        else:
-            parsed_gates[g] = False  # missing gate = fail
+            parsed[g] = False
+    gate_passed = all(parsed.values())
+    failed_reason = data.get("failed_gate_reason") if not gate_passed else None
+    if isinstance(failed_reason, str) and failed_reason.lower() in ("null", "none", ""):
+        failed_reason = None
+    return (gate_passed, parsed, failed_reason)

+
+def _parse_quality_only(
+    raw: str,
+    quality_dims: tuple[str, ...],
+) -> dict[str, int] | None:
+    """Parse a quality-only response. Returns {dim: score} or None."""
+    data = _extract_json(raw)
+    if data is None:
+        return None
+    quality_data = data.get("quality", {})
+    if not isinstance(quality_data, dict):
+        return None
+    result: dict[str, int] = {}
+    for d in quality_dims:
+        val = quality_data.get(d)
+        result[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
+    return result
+
+
+def _quality_confidence(quality: dict[str, int]) -> float:
+    if not quality:
+        return 0.0
+    return round(sum(quality.values()) / (len(quality) * 100), 3)
+
+
+def _quality_floor_check(quality: dict[str, int]) -> str | None:
+    """Returns a failure reason string if any dim is below floor, else None."""
+    failures = [f"{d}={v}" for d, v in quality.items() if v < QUALITY_DIM_FLOOR]
+    return f"quality floor breach: {', '.join(failures)}" if failures else None
+
+
+def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
+    """Average two quality score dicts dimension-by-dimension."""
+    return {d: (a.get(d, 0) + b.get(d, 0)) // 2 for d in a}
+
+
+# kept for backward-compat with confidence_from_notes (reads stored JSON)
+def _parse_gated_scores_legacy(
+    data: dict,
+    gates: tuple[str, ...],
+    quality_dims: tuple[str, ...],
+) -> dict:
+    parsed_gates: dict[str, bool] = {}
+    for g in gates:
+        val = data.get("gates", {}).get(g)
+        parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False
    gate_passed = all(parsed_gates.values())
    failed_reason = data.get("failed_gate_reason") if not gate_passed else None
-
-    # Parse quality — only meaningful if gates passed
    parsed_quality: dict[str, int] = {}
-    quality_data = data.get("quality", {})
-    if isinstance(quality_data, dict):
-        for d in quality_dims:
-            val = quality_data.get(d)
-            if isinstance(val, (int, float)):
-                parsed_quality[d] = max(0, min(100, int(val)))
-            else:
-                parsed_quality[d] = 0
-
-    # Per-dimension quality floor: any single dim below threshold = auto-reject
-    quality_floor_failed = False
-    floor_failures: list[str] = []
-    if gate_passed and parsed_quality:
-        for dim_name, dim_val in parsed_quality.items():
-            if dim_val < QUALITY_DIM_FLOOR:
-                floor_failures.append(f"{dim_name}={dim_val}")
-                quality_floor_failed = True
-
-    if gate_passed and parsed_quality and not quality_floor_failed:
-        confidence = sum(parsed_quality.values()) / (len(parsed_quality) * 100)
-    else:
-        confidence = 0.0
-        if quality_floor_failed and not failed_reason:
-            failed_reason = f"quality floor breach: {', '.join(floor_failures)}"
-
+    for d in quality_dims:
+        val = data.get("quality", {}).get(d)
+        parsed_quality[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
+    floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None
+    confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_reason else 0.0
+    floor_failed = bool(floor_reason)
    return {
        "gates": parsed_gates,
        "quality": parsed_quality,
-        "gate_passed": gate_passed and not quality_floor_failed,
-        "confidence": round(confidence, 3),
-        "failed_gate_reason": failed_reason,
-        "quality_floor_failed": quality_floor_failed,
+        "gate_passed": gate_passed and not floor_failed,
+        "confidence": confidence,
+        "failed_gate_reason": floor_reason if floor_failed else failed_reason,
+        "quality_floor_failed": floor_failed,
    }


@ -623,18 +662,31 @@ def confidence_from_notes(notes_json: str) -> tuple[float, bool]:


 class Scorer:
-    """Scores a variant image using gated boolean + quality range rubric.
+    """Scores a variant image using a two-pass gate→quality rubric with optional tiebreaker.
+
+    Pass 1 — gate-only prompt: binary pass/fail checks. Failures exit immediately.
+    Pass 2 — quality-only prompt: 0-100 dimensions, only reached when all gates pass.
+    Pass 3 — tiebreaker (optional): re-scores quality when confidence is within
+              `tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.

    Two backend types:
    - "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
    - "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)
    """

-    def __init__(self, name: str, backend: str, model: str, threshold: float):
+    def __init__(
+        self,
+        name: str,
+        backend: str,
+        model: str,
+        threshold: float,
+        tiebreaker_range: float = 0.0,
+    ):
        self.name = name
        self.backend = backend
        self.model = model
        self.threshold = threshold
+        self.tiebreaker_range = tiebreaker_range
        self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))

        if backend == "model-boss":
@ -676,8 +728,40 @@ class Scorer:
                logger.warning("[%s] Batch item error: %s", self.name, exc)
        return results

+    async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
+        """Send a single prompt+image to the backend. Returns raw text or None."""
+        try:
+            if self.backend == "model-boss":
+                return await self._client.chat(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": prompt},
+                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
+                            ],
+                        },
+                    ],
+                    max_tokens=2048,
+                    temperature=0.1,
+                    keep_alive=300,
+                )
+            elif self.backend == "claude":
+                return await self._client.generate(
+                    system=SYSTEM_PROMPT,
+                    user=prompt,
+                    cwd=str(Path(raw_path).parent),
+                    allowed_tools=["Read"],
+                )
+            return None
+        except Exception as exc:
+            logger.warning("[%s] Backend call failed: %s", self.name, exc)
+            return None
+
    async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
-        """Score a single variant image. Returns gated result dict or None."""
+        """Two-pass (+ optional tiebreaker) scoring. Returns gated result dict or None."""
        import base64

        if not raw_path or not Path(raw_path).exists():
@ -691,56 +775,87 @@ class Scorer:
        entity_id = sprite.get("entity_id", "")
        ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
        ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
+        filename = Path(raw_path).name
+        prompt_excerpt = sprite["prompt"][:300]

-        gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
-        quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)
-        score_template = _build_score_template(gates, quality_dims)
+        img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()

-        user_prompt = RANKING_PROMPT_TEMPLATE.format(
-            filename=Path(raw_path).name,
+        # --- Pass 1: gates only ---
+        gate_prompt = GATE_PROMPT_TEMPLATE.format(
+            filename=filename,
            category=category,
            entity_id=entity_id,
-            prompt=sprite["prompt"][:300],
-            gate_instructions=gate_instructions,
-            quality_instructions=quality_instructions,
-            score_template=score_template,
+            prompt=prompt_excerpt,
+            gate_instructions=_build_gate_instructions(gates, ctx_gate_descs),
+            gate_template=_build_gate_only_template(gates),
        )
-
-        try:
-            if self.backend == "model-boss":
-                img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
-                raw = await self._client.chat(
-                    model=self.model,
-                    messages=[
-                        {"role": "system", "content": SYSTEM_PROMPT},
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
-                                {"type": "text", "text": user_prompt},
-                            ],
-                        },
-                    ],
-                    max_tokens=500,
-                    keep_alive=300,
-                )
-            elif self.backend == "claude":
-                raw = await self._client.generate(
-                    system=SYSTEM_PROMPT,
-                    user=user_prompt,
-                    cwd=str(Path(raw_path).parent),
-                    allowed_tools=["Read"],
-                )
-            else:
-                return None
-        except Exception as exc:
-            logger.warning("[%s] Scoring failed: %s", self.name, exc)
+        gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
+        if gate_raw is None:
            return None

-        if raw is None:
+        gate_result = _parse_gates_only(gate_raw, gates)
+        if gate_result is None:
            return None

-        return _parse_gated_scores(raw, gates, quality_dims)
+        gate_passed, parsed_gates, failed_reason = gate_result
+        if not gate_passed:
+            return {
+                "gates": parsed_gates,
+                "quality": {},
+                "gate_passed": False,
+                "confidence": 0.0,
+                "failed_gate_reason": failed_reason,
+                "quality_floor_failed": False,
+            }
+
+        # --- Pass 2: quality only ---
+        quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
+            filename=filename,
+            entity_id=entity_id,
+            quality_instructions=_build_quality_instructions(quality_dims, ctx_quality_descs),
+            quality_template=_build_quality_only_template(quality_dims),
+        )
+        quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
+        if quality_raw is None:
+            return None
+
+        quality = _parse_quality_only(quality_raw, quality_dims)
+        if quality is None:
+            return None
+
+        # --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
+        if self.tiebreaker_range > 0.0:
+            confidence = _quality_confidence(quality)
+            if abs(confidence - self.threshold) <= self.tiebreaker_range:
+                logger.debug(
+                    "[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
+                    self.name, confidence, self.threshold, self.tiebreaker_range,
+                )
+                tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
+                if tie_raw is not None:
+                    tie_quality = _parse_quality_only(tie_raw, quality_dims)
+                    if tie_quality is not None:
+                        quality = _merge_quality(quality, tie_quality)
+
+        floor_reason = _quality_floor_check(quality)
+        if floor_reason:
+            return {
+                "gates": parsed_gates,
+                "quality": quality,
+                "gate_passed": False,
+                "confidence": 0.0,
+                "failed_gate_reason": floor_reason,
+                "quality_floor_failed": True,
+            }
+
+        return {
+            "gates": parsed_gates,
+            "quality": quality,
+            "gate_passed": True,
+            "confidence": _quality_confidence(quality),
+            "failed_gate_reason": None,
+            "quality_floor_failed": False,
+        }


 def _ensure_claude_sdk():
@ -784,6 +899,7 @@ class ScoringPipeline:
                backend=stage["backend"],
                model=stage["model"],
                threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
+                tiebreaker_range=stage.get("tiebreaker_range", 0.0),
            ))

    async def score_variant_at_tier(
--- a/tools/sprite-generation/engine/registry.py
+++ b/tools/sprite-generation/engine/registry.py
@ -894,6 +894,14 @@ class SpriteRegistry:
                "avg_confidence": round(row["avg_conf"] or 0.0, 3),
            }

+        unscored = self.conn.execute(
+            """SELECT COUNT(*) FROM variants
+               WHERE job_status = 'completed'
+                 AND NOT EXISTS (
+                     SELECT 1 FROM latest_scores WHERE variant_id = variants.id
+                 )"""
+        ).fetchone()[0]
+
        approved = self.conn.execute(
            "SELECT COUNT(*) FROM variants WHERE is_approved = 1"
        ).fetchone()[0]
@ -905,6 +913,7 @@ class SpriteRegistry:
        funnel = {
            "total_completed": total_completed,
            "total_processed": total_processed,
+            "unscored": unscored,
            "scoring": scoring,
            "approved": approved,
            "installed": installed,