refactor(sprite-generation): ♻️ Refactor sprite generation ranking and registry to support dynamic strategy injection and reduce processing overhead

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Claude Code 2026-03-29 05:47:59 -07:00
parent ae3cabaf54
commit af0dd0372c
2 changed files with 229 additions and 104 deletions

View file

@ -307,34 +307,49 @@ Your job has TWO parts:
If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.
Always respond with valid JSON only no other text."""
Think through each gate carefully before answering. Then output ONLY valid JSON no text outside the JSON block.
RANKING_PROMPT_TEMPLATE = """\
Example FAIL (facing_southwest gate):
{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}
Example PASS (all gates true, quality scored):
{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""
GATE_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.
This image was generated as a game sprite:
This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt used: {prompt}
- Prompt: {prompt}
## STEP 1: Boolean Gates (answer true or false for each)
Evaluate each boolean gate (true or false):
{gate_instructions}
## STEP 2: Quality Ranges (0-100, only if ALL gates are true)
Respond with this exact JSON:
{gate_template}"""
QUALITY_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.
All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:
{quality_instructions}
Respond with this exact JSON structure:
{score_template}"""
Respond with this exact JSON:
{quality_template}"""
def _extract_json(text: str) -> dict | None:
"""Extract JSON object from VLM response text.
Handles markdown code blocks, leading/trailing text, and raw JSON.
Handles markdown code blocks, leading/trailing text, raw JSON, and
Qwen3 <think>...</think> reasoning blocks.
"""
import re
# Strip Qwen3 thinking blocks before any parse attempt
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
# Try direct parse
try:
return json.loads(text)
@ -527,81 +542,105 @@ def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) ->
return "\n".join(lines)
def _build_score_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
template: dict = {
def _build_gate_only_template(gates: tuple[str, ...]) -> str:
return json.dumps({
"gates": {g: True for g in gates},
"failed_gate_reason": "null or string explaining which gate failed and why",
"quality": {d: 0 for d in quality_dims},
}
return json.dumps(template, indent=2)
}, indent=2)
def _parse_gated_scores(
def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
return json.dumps({"quality": {d: 0 for d in quality_dims}}, indent=2)
def _parse_gates_only(
raw: str,
gates: tuple[str, ...],
quality_dims: tuple[str, ...],
) -> dict | None:
"""Parse reviewer's JSON response into gate+quality structure.
Returns dict with keys: gates, quality, gate_passed, confidence, failed_gate_reason
Returns None if JSON is unparseable.
"""
) -> tuple[bool, dict[str, bool], str | None] | None:
"""Parse a gate-only response. Returns (gate_passed, gates_dict, failed_reason) or None."""
data = _extract_json(raw)
if data is None:
return None
gates_data = data.get("gates")
if not isinstance(gates_data, dict):
return None
# Parse gates — must be boolean
parsed_gates: dict[str, bool] = {}
parsed: dict[str, bool] = {}
for g in gates:
val = gates_data.get(g)
if isinstance(val, bool):
parsed_gates[g] = val
parsed[g] = val
elif isinstance(val, (int, float)):
parsed_gates[g] = bool(val)
parsed[g] = bool(val)
else:
parsed_gates[g] = False # missing gate = fail
parsed[g] = False
gate_passed = all(parsed.values())
failed_reason = data.get("failed_gate_reason") if not gate_passed else None
if isinstance(failed_reason, str) and failed_reason.lower() in ("null", "none", ""):
failed_reason = None
return (gate_passed, parsed, failed_reason)
def _parse_quality_only(
    raw: str,
    quality_dims: tuple[str, ...],
) -> dict[str, int] | None:
    """Parse a quality-only response into clamped integer scores.

    Args:
        raw: Raw reviewer text, expected to contain a JSON object with a
            ``"quality"`` mapping.
        quality_dims: Dimension names to read from that mapping.

    Returns:
        ``{dim: score}`` with one entry per name in ``quality_dims``, each
        score an int clamped to 0-100. Missing, non-numeric and non-finite
        values score 0. Returns ``None`` when no JSON object could be
        extracted or when ``"quality"`` is not a mapping.
    """
    import math

    data = _extract_json(raw)
    # The extractor's direct-parse path can yield a JSON array or scalar,
    # which has no .get(); treat anything that is not an object as unparseable.
    if not isinstance(data, dict):
        return None
    quality_data = data.get("quality", {})
    if not isinstance(quality_data, dict):
        return None

    result: dict[str, int] = {}
    for d in quality_dims:
        val = quality_data.get(d)
        # json.loads accepts NaN/Infinity and int() raises on both, so a
        # single malformed score would otherwise abort the whole pass.
        if isinstance(val, float) and not math.isfinite(val):
            val = None
        result[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
    return result
def _quality_confidence(quality: dict[str, int]) -> float:
if not quality:
return 0.0
return round(sum(quality.values()) / (len(quality) * 100), 3)
def _quality_floor_check(quality: dict[str, int]) -> str | None:
    """Report which dimensions, if any, score below ``QUALITY_DIM_FLOOR``.

    Returns a ``"quality floor breach: ..."`` message listing every offending
    ``dim=score`` pair in mapping order, or ``None`` when no score is below
    the floor.
    """
    breaches = []
    for dim_name, score in quality.items():
        if score < QUALITY_DIM_FLOOR:
            breaches.append(f"{dim_name}={score}")
    if not breaches:
        return None
    return "quality floor breach: " + ", ".join(breaches)
def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
"""Average two quality score dicts dimension-by-dimension."""
return {d: (a.get(d, 0) + b.get(d, 0)) // 2 for d in a}
# kept for backward-compat with confidence_from_notes (reads stored JSON)
def _parse_gated_scores_legacy(
data: dict,
gates: tuple[str, ...],
quality_dims: tuple[str, ...],
) -> dict:
parsed_gates: dict[str, bool] = {}
for g in gates:
val = data.get("gates", {}).get(g)
parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False
gate_passed = all(parsed_gates.values())
failed_reason = data.get("failed_gate_reason") if not gate_passed else None
# Parse quality — only meaningful if gates passed
parsed_quality: dict[str, int] = {}
quality_data = data.get("quality", {})
if isinstance(quality_data, dict):
for d in quality_dims:
val = quality_data.get(d)
if isinstance(val, (int, float)):
parsed_quality[d] = max(0, min(100, int(val)))
else:
parsed_quality[d] = 0
# Per-dimension quality floor: any single dim below threshold = auto-reject
quality_floor_failed = False
floor_failures: list[str] = []
if gate_passed and parsed_quality:
for dim_name, dim_val in parsed_quality.items():
if dim_val < QUALITY_DIM_FLOOR:
floor_failures.append(f"{dim_name}={dim_val}")
quality_floor_failed = True
if gate_passed and parsed_quality and not quality_floor_failed:
confidence = sum(parsed_quality.values()) / (len(parsed_quality) * 100)
else:
confidence = 0.0
if quality_floor_failed and not failed_reason:
failed_reason = f"quality floor breach: {', '.join(floor_failures)}"
for d in quality_dims:
val = data.get("quality", {}).get(d)
parsed_quality[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None
confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_reason else 0.0
floor_failed = bool(floor_reason)
return {
"gates": parsed_gates,
"quality": parsed_quality,
"gate_passed": gate_passed and not quality_floor_failed,
"confidence": round(confidence, 3),
"failed_gate_reason": failed_reason,
"quality_floor_failed": quality_floor_failed,
"gate_passed": gate_passed and not floor_failed,
"confidence": confidence,
"failed_gate_reason": floor_reason if floor_failed else failed_reason,
"quality_floor_failed": floor_failed,
}
@ -623,18 +662,31 @@ def confidence_from_notes(notes_json: str) -> tuple[float, bool]:
class Scorer:
"""Scores a variant image using gated boolean + quality range rubric.
"""Scores a variant image using a two-pass gate→quality rubric with optional tiebreaker.
Pass 1 gate-only prompt: binary pass/fail checks. Failures exit immediately.
Pass 2 quality-only prompt: 0-100 dimensions, only reached when all gates pass.
Pass 3 tiebreaker (optional): re-scores quality when confidence is within
`tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.
Two backend types:
- "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
- "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)
"""
def __init__(self, name: str, backend: str, model: str, threshold: float):
def __init__(
self,
name: str,
backend: str,
model: str,
threshold: float,
tiebreaker_range: float = 0.0,
):
self.name = name
self.backend = backend
self.model = model
self.threshold = threshold
self.tiebreaker_range = tiebreaker_range
self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))
if backend == "model-boss":
@ -676,8 +728,40 @@ class Scorer:
logger.warning("[%s] Batch item error: %s", self.name, exc)
return results
async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
"""Send a single prompt+image to the backend. Returns raw text or None."""
try:
if self.backend == "model-boss":
return await self._client.chat(
model=self.model,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
],
},
],
max_tokens=2048,
temperature=0.1,
keep_alive=300,
)
elif self.backend == "claude":
return await self._client.generate(
system=SYSTEM_PROMPT,
user=prompt,
cwd=str(Path(raw_path).parent),
allowed_tools=["Read"],
)
return None
except Exception as exc:
logger.warning("[%s] Backend call failed: %s", self.name, exc)
return None
async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
"""Score a single variant image. Returns gated result dict or None."""
"""Two-pass (+ optional tiebreaker) scoring. Returns gated result dict or None."""
import base64
if not raw_path or not Path(raw_path).exists():
@ -691,56 +775,87 @@ class Scorer:
entity_id = sprite.get("entity_id", "")
ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
filename = Path(raw_path).name
prompt_excerpt = sprite["prompt"][:300]
gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)
score_template = _build_score_template(gates, quality_dims)
img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
user_prompt = RANKING_PROMPT_TEMPLATE.format(
filename=Path(raw_path).name,
# --- Pass 1: gates only ---
gate_prompt = GATE_PROMPT_TEMPLATE.format(
filename=filename,
category=category,
entity_id=entity_id,
prompt=sprite["prompt"][:300],
gate_instructions=gate_instructions,
quality_instructions=quality_instructions,
score_template=score_template,
prompt=prompt_excerpt,
gate_instructions=_build_gate_instructions(gates, ctx_gate_descs),
gate_template=_build_gate_only_template(gates),
)
try:
if self.backend == "model-boss":
img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
raw = await self._client.chat(
model=self.model,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
{"type": "text", "text": user_prompt},
],
},
],
max_tokens=500,
keep_alive=300,
)
elif self.backend == "claude":
raw = await self._client.generate(
system=SYSTEM_PROMPT,
user=user_prompt,
cwd=str(Path(raw_path).parent),
allowed_tools=["Read"],
)
else:
return None
except Exception as exc:
logger.warning("[%s] Scoring failed: %s", self.name, exc)
gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
if gate_raw is None:
return None
if raw is None:
gate_result = _parse_gates_only(gate_raw, gates)
if gate_result is None:
return None
return _parse_gated_scores(raw, gates, quality_dims)
gate_passed, parsed_gates, failed_reason = gate_result
if not gate_passed:
return {
"gates": parsed_gates,
"quality": {},
"gate_passed": False,
"confidence": 0.0,
"failed_gate_reason": failed_reason,
"quality_floor_failed": False,
}
# --- Pass 2: quality only ---
quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
filename=filename,
entity_id=entity_id,
quality_instructions=_build_quality_instructions(quality_dims, ctx_quality_descs),
quality_template=_build_quality_only_template(quality_dims),
)
quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
if quality_raw is None:
return None
quality = _parse_quality_only(quality_raw, quality_dims)
if quality is None:
return None
# --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
if self.tiebreaker_range > 0.0:
confidence = _quality_confidence(quality)
if abs(confidence - self.threshold) <= self.tiebreaker_range:
logger.debug(
"[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
self.name, confidence, self.threshold, self.tiebreaker_range,
)
tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
if tie_raw is not None:
tie_quality = _parse_quality_only(tie_raw, quality_dims)
if tie_quality is not None:
quality = _merge_quality(quality, tie_quality)
floor_reason = _quality_floor_check(quality)
if floor_reason:
return {
"gates": parsed_gates,
"quality": quality,
"gate_passed": False,
"confidence": 0.0,
"failed_gate_reason": floor_reason,
"quality_floor_failed": True,
}
return {
"gates": parsed_gates,
"quality": quality,
"gate_passed": True,
"confidence": _quality_confidence(quality),
"failed_gate_reason": None,
"quality_floor_failed": False,
}
def _ensure_claude_sdk():
@ -784,6 +899,7 @@ class ScoringPipeline:
backend=stage["backend"],
model=stage["model"],
threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
tiebreaker_range=stage.get("tiebreaker_range", 0.0),
))
async def score_variant_at_tier(

View file

@ -894,6 +894,14 @@ class SpriteRegistry:
"avg_confidence": round(row["avg_conf"] or 0.0, 3),
}
unscored = self.conn.execute(
"""SELECT COUNT(*) FROM variants
WHERE job_status = 'completed'
AND NOT EXISTS (
SELECT 1 FROM latest_scores WHERE variant_id = variants.id
)"""
).fetchone()[0]
approved = self.conn.execute(
"SELECT COUNT(*) FROM variants WHERE is_approved = 1"
).fetchone()[0]
@ -905,6 +913,7 @@ class SpriteRegistry:
funnel = {
"total_completed": total_completed,
"total_processed": total_processed,
"unscored": unscored,
"scoring": scoring,
"approved": approved,
"installed": installed,