diff --git a/tools/sprite-generation/engine/ranker.py b/tools/sprite-generation/engine/ranker.py index 20a1fd1b..71952a44 100644 --- a/tools/sprite-generation/engine/ranker.py +++ b/tools/sprite-generation/engine/ranker.py @@ -307,34 +307,49 @@ Your job has TWO parts: If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why. -Always respond with valid JSON only — no other text.""" +Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block. -RANKING_PROMPT_TEMPLATE = """\ +Example FAIL (facing_southwest gate): +{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}} + +Example PASS (all gates true, quality scored): +{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}""" + +GATE_PROMPT_TEMPLATE = """\ Look at the image file {filename} in this directory. -This image was generated as a game sprite: +This sprite was generated as: - Category: {category} - Entity: {entity_id} -- Prompt used: {prompt} +- Prompt: {prompt} -## STEP 1: Boolean Gates (answer true or false for each) +Evaluate each boolean gate (true or false): {gate_instructions} -## STEP 2: Quality Ranges (0-100, only if ALL gates are true) +Respond with this exact JSON: +{gate_template}""" + +QUALITY_PROMPT_TEMPLATE = """\ +Look at the image file {filename} in this directory. + +All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100: {quality_instructions} -Respond with this exact JSON structure: -{score_template}""" +Respond with this exact JSON: +{quality_template}""" def _extract_json(text: str) -> dict | None: """Extract JSON object from VLM response text. 
- Handles markdown code blocks, leading/trailing text, and raw JSON. + Handles markdown code blocks, leading/trailing text, raw JSON, and + Qwen3 <think>...</think> reasoning blocks. """ import re + # Strip Qwen3 thinking blocks before any parse attempt + text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip() # Try direct parse try: return json.loads(text) @@ -527,81 +542,105 @@ def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) -> return "\n".join(lines) -def _build_score_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str: - template: dict = { +def _build_gate_only_template(gates: tuple[str, ...]) -> str: + return json.dumps({ "gates": {g: True for g in gates}, "failed_gate_reason": "null or string explaining which gate failed and why", - "quality": {d: 0 for d in quality_dims}, - } - return json.dumps(template, indent=2) + }, indent=2) -def _parse_gated_scores( +def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str: + return json.dumps({"quality": {d: 0 for d in quality_dims}}, indent=2) + + +def _parse_gates_only( raw: str, gates: tuple[str, ...], - quality_dims: tuple[str, ...], -) -> dict | None: - """Parse reviewer's JSON response into gate+quality structure. - - Returns dict with keys: gates, quality, gate_passed, confidence, failed_gate_reason - Returns None if JSON is unparseable. - """ +) -> tuple[bool, dict[str, bool], str | None] | None: + """Parse a gate-only response. 
Returns (gate_passed, gates_dict, failed_reason) or None.""" data = _extract_json(raw) if data is None: return None - gates_data = data.get("gates") if not isinstance(gates_data, dict): return None - - # Parse gates — must be boolean - parsed_gates: dict[str, bool] = {} + parsed: dict[str, bool] = {} for g in gates: val = gates_data.get(g) if isinstance(val, bool): - parsed_gates[g] = val + parsed[g] = val elif isinstance(val, (int, float)): - parsed_gates[g] = bool(val) + parsed[g] = bool(val) else: - parsed_gates[g] = False # missing gate = fail + parsed[g] = False + gate_passed = all(parsed.values()) + failed_reason = data.get("failed_gate_reason") if not gate_passed else None + if isinstance(failed_reason, str) and failed_reason.lower() in ("null", "none", ""): + failed_reason = None + return (gate_passed, parsed, failed_reason) + +def _parse_quality_only( + raw: str, + quality_dims: tuple[str, ...], +) -> dict[str, int] | None: + """Parse a quality-only response. Returns {dim: score} or None.""" + data = _extract_json(raw) + if data is None: + return None + quality_data = data.get("quality", {}) + if not isinstance(quality_data, dict): + return None + result: dict[str, int] = {} + for d in quality_dims: + val = quality_data.get(d) + result[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0 + return result + + +def _quality_confidence(quality: dict[str, int]) -> float: + if not quality: + return 0.0 + return round(sum(quality.values()) / (len(quality) * 100), 3) + + +def _quality_floor_check(quality: dict[str, int]) -> str | None: + """Returns a failure reason string if any dim is below floor, else None.""" + failures = [f"{d}={v}" for d, v in quality.items() if v < QUALITY_DIM_FLOOR] + return f"quality floor breach: {', '.join(failures)}" if failures else None + + +def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]: + """Average two quality score dicts dimension-by-dimension.""" + return {d: (a.get(d, 0) + b.get(d, 
0)) // 2 for d in a} + + +# kept for backward-compat with confidence_from_notes (reads stored JSON) +def _parse_gated_scores_legacy( + data: dict, + gates: tuple[str, ...], + quality_dims: tuple[str, ...], +) -> dict: + parsed_gates: dict[str, bool] = {} + for g in gates: + val = data.get("gates", {}).get(g) + parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False gate_passed = all(parsed_gates.values()) failed_reason = data.get("failed_gate_reason") if not gate_passed else None - - # Parse quality — only meaningful if gates passed parsed_quality: dict[str, int] = {} - quality_data = data.get("quality", {}) - if isinstance(quality_data, dict): - for d in quality_dims: - val = quality_data.get(d) - if isinstance(val, (int, float)): - parsed_quality[d] = max(0, min(100, int(val))) - else: - parsed_quality[d] = 0 - - # Per-dimension quality floor: any single dim below threshold = auto-reject - quality_floor_failed = False - floor_failures: list[str] = [] - if gate_passed and parsed_quality: - for dim_name, dim_val in parsed_quality.items(): - if dim_val < QUALITY_DIM_FLOOR: - floor_failures.append(f"{dim_name}={dim_val}") - quality_floor_failed = True - - if gate_passed and parsed_quality and not quality_floor_failed: - confidence = sum(parsed_quality.values()) / (len(parsed_quality) * 100) - else: - confidence = 0.0 - if quality_floor_failed and not failed_reason: - failed_reason = f"quality floor breach: {', '.join(floor_failures)}" - + for d in quality_dims: + val = data.get("quality", {}).get(d) + parsed_quality[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0 + floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None + confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_reason else 0.0 + floor_failed = bool(floor_reason) return { "gates": parsed_gates, "quality": parsed_quality, - "gate_passed": gate_passed and not quality_floor_failed, - "confidence": round(confidence, 
3), - "failed_gate_reason": failed_reason, - "quality_floor_failed": quality_floor_failed, + "gate_passed": gate_passed and not floor_failed, + "confidence": confidence, + "failed_gate_reason": floor_reason if floor_failed else failed_reason, + "quality_floor_failed": floor_failed, } @@ -623,18 +662,31 @@ def confidence_from_notes(notes_json: str) -> tuple[float, bool]: class Scorer: - """Scores a variant image using gated boolean + quality range rubric. + """Scores a variant image using a two-pass gate→quality rubric with optional tiebreaker. + + Pass 1 — gate-only prompt: binary pass/fail checks. Failures exit immediately. + Pass 2 — quality-only prompt: 0-100 dimensions, only reached when all gates pass. + Pass 3 — tiebreaker (optional): re-scores quality when confidence is within + `tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores. Two backend types: - "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.) - "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus) """ - def __init__(self, name: str, backend: str, model: str, threshold: float): + def __init__( + self, + name: str, + backend: str, + model: str, + threshold: float, + tiebreaker_range: float = 0.0, + ): self.name = name self.backend = backend self.model = model self.threshold = threshold + self.tiebreaker_range = tiebreaker_range self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4)) if backend == "model-boss": @@ -676,8 +728,40 @@ class Scorer: logger.warning("[%s] Batch item error: %s", self.name, exc) return results + async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None: + """Send a single prompt+image to the backend. 
Returns raw text or None.""" + try: + if self.backend == "model-boss": + return await self._client.chat( + model=self.model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}, + ], + }, + ], + max_tokens=2048, + temperature=0.1, + keep_alive=300, + ) + elif self.backend == "claude": + return await self._client.generate( + system=SYSTEM_PROMPT, + user=prompt, + cwd=str(Path(raw_path).parent), + allowed_tools=["Read"], + ) + return None + except Exception as exc: + logger.warning("[%s] Backend call failed: %s", self.name, exc) + return None + async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None: - """Score a single variant image. Returns gated result dict or None.""" + """Two-pass (+ optional tiebreaker) scoring. Returns gated result dict or None.""" import base64 if not raw_path or not Path(raw_path).exists(): @@ -691,56 +775,87 @@ class Scorer: entity_id = sprite.get("entity_id", "") ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id) ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id) + filename = Path(raw_path).name + prompt_excerpt = sprite["prompt"][:300] - gate_instructions = _build_gate_instructions(gates, ctx_gate_descs) - quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs) - score_template = _build_score_template(gates, quality_dims) + img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode() - user_prompt = RANKING_PROMPT_TEMPLATE.format( - filename=Path(raw_path).name, + # --- Pass 1: gates only --- + gate_prompt = GATE_PROMPT_TEMPLATE.format( + filename=filename, category=category, entity_id=entity_id, - prompt=sprite["prompt"][:300], - gate_instructions=gate_instructions, - quality_instructions=quality_instructions, - score_template=score_template, + prompt=prompt_excerpt, + 
gate_instructions=_build_gate_instructions(gates, ctx_gate_descs), + gate_template=_build_gate_only_template(gates), ) - - try: - if self.backend == "model-boss": - img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode() - raw = await self._client.chat( - model=self.model, - messages=[ - {"role": "system", "content": SYSTEM_PROMPT}, - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}, - {"type": "text", "text": user_prompt}, - ], - }, - ], - max_tokens=500, - keep_alive=300, - ) - elif self.backend == "claude": - raw = await self._client.generate( - system=SYSTEM_PROMPT, - user=user_prompt, - cwd=str(Path(raw_path).parent), - allowed_tools=["Read"], - ) - else: - return None - except Exception as exc: - logger.warning("[%s] Scoring failed: %s", self.name, exc) + gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt) + if gate_raw is None: return None - if raw is None: + gate_result = _parse_gates_only(gate_raw, gates) + if gate_result is None: return None - return _parse_gated_scores(raw, gates, quality_dims) + gate_passed, parsed_gates, failed_reason = gate_result + if not gate_passed: + return { + "gates": parsed_gates, + "quality": {}, + "gate_passed": False, + "confidence": 0.0, + "failed_gate_reason": failed_reason, + "quality_floor_failed": False, + } + + # --- Pass 2: quality only --- + quality_prompt = QUALITY_PROMPT_TEMPLATE.format( + filename=filename, + entity_id=entity_id, + quality_instructions=_build_quality_instructions(quality_dims, ctx_quality_descs), + quality_template=_build_quality_only_template(quality_dims), + ) + quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt) + if quality_raw is None: + return None + + quality = _parse_quality_only(quality_raw, quality_dims) + if quality is None: + return None + + # --- Pass 3: tiebreaker (when confidence is within range of threshold) --- + if self.tiebreaker_range > 0.0: + confidence = 
_quality_confidence(quality) + if abs(confidence - self.threshold) <= self.tiebreaker_range: + logger.debug( + "[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f", + self.name, confidence, self.threshold, self.tiebreaker_range, + ) + tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt) + if tie_raw is not None: + tie_quality = _parse_quality_only(tie_raw, quality_dims) + if tie_quality is not None: + quality = _merge_quality(quality, tie_quality) + + floor_reason = _quality_floor_check(quality) + if floor_reason: + return { + "gates": parsed_gates, + "quality": quality, + "gate_passed": False, + "confidence": 0.0, + "failed_gate_reason": floor_reason, + "quality_floor_failed": True, + } + + return { + "gates": parsed_gates, + "quality": quality, + "gate_passed": True, + "confidence": _quality_confidence(quality), + "failed_gate_reason": None, + "quality_floor_failed": False, + } def _ensure_claude_sdk(): @@ -784,6 +899,7 @@ class ScoringPipeline: backend=stage["backend"], model=stage["model"], threshold=stage.get("threshold", CONFIDENCE_THRESHOLD), + tiebreaker_range=stage.get("tiebreaker_range", 0.0), )) async def score_variant_at_tier( diff --git a/tools/sprite-generation/engine/registry.py b/tools/sprite-generation/engine/registry.py index ee90de5b..024cefc1 100644 --- a/tools/sprite-generation/engine/registry.py +++ b/tools/sprite-generation/engine/registry.py @@ -894,6 +894,14 @@ class SpriteRegistry: "avg_confidence": round(row["avg_conf"] or 0.0, 3), } + unscored = self.conn.execute( + """SELECT COUNT(*) FROM variants + WHERE job_status = 'completed' + AND NOT EXISTS ( + SELECT 1 FROM latest_scores WHERE variant_id = variants.id + )""" + ).fetchone()[0] + approved = self.conn.execute( "SELECT COUNT(*) FROM variants WHERE is_approved = 1" ).fetchone()[0] @@ -905,6 +913,7 @@ class SpriteRegistry: funnel = { "total_completed": total_completed, "total_processed": total_processed, + "unscored": unscored, "scoring": scoring, 
"approved": approved, "installed": installed,