refactor(sprite-generation): ♻️ Refactor sprite generation ranking and registry to support dynamic strategy injection and reduce processing overhead
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
ae3cabaf54
commit
af0dd0372c
2 changed files with 229 additions and 104 deletions
|
|
@ -307,34 +307,49 @@ Your job has TWO parts:
|
|||
|
||||
If ANY gate is false, you MUST provide a failed_gate_reason explaining which gate failed and why.
|
||||
|
||||
Always respond with valid JSON only — no other text."""
|
||||
Think through each gate carefully before answering. Then output ONLY valid JSON — no text outside the JSON block.
|
||||
|
||||
RANKING_PROMPT_TEMPLATE = """\
|
||||
Example FAIL (facing_southwest gate):
|
||||
{"gates":{"facing_southwest":false,"single_character":true,"no_text_watermark":true},"failed_gate_reason":"facing_southwest failed: character faces the camera, front of body visible","quality":{}}
|
||||
|
||||
Example PASS (all gates true, quality scored):
|
||||
{"gates":{"facing_southwest":true,"single_character":true,"no_text_watermark":true},"failed_gate_reason":null,"quality":{"direction_quality":85,"art_style":80,"equipment_detail":75,"background_cleanliness":90,"shadow_acceptability":88}}"""
|
||||
|
||||
# Pass-1 user prompt: asks only for the boolean gates.
# Placeholders: filename, category, entity_id, prompt, gate_instructions,
# gate_template — exactly the keyword arguments the gate pass supplies.
GATE_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

This sprite was generated as:
- Category: {category}
- Entity: {entity_id}
- Prompt: {prompt}

Evaluate each boolean gate (true or false):

{gate_instructions}

Respond with this exact JSON:
{gate_template}"""
|
||||
|
||||
# Pass-2 user prompt: asks only for the 0-100 quality dimensions.
# Placeholders: filename, entity_id, quality_instructions, quality_template.
QUALITY_PROMPT_TEMPLATE = """\
Look at the image file {filename} in this directory.

All gates passed for this sprite ({entity_id}). Now score each quality dimension 0-100:

{quality_instructions}

Respond with this exact JSON:
{quality_template}"""
|
||||
|
||||
|
||||
def _extract_json(text: str) -> dict | None:
|
||||
"""Extract JSON object from VLM response text.
|
||||
|
||||
Handles markdown code blocks, leading/trailing text, and raw JSON.
|
||||
Handles markdown code blocks, leading/trailing text, raw JSON, and
|
||||
Qwen3 <think>...</think> reasoning blocks.
|
||||
"""
|
||||
import re
|
||||
# Strip Qwen3 thinking blocks before any parse attempt
|
||||
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
|
||||
# Try direct parse
|
||||
try:
|
||||
return json.loads(text)
|
||||
|
|
@ -527,81 +542,105 @@ def _build_quality_instructions(dims: tuple[str, ...], descs: dict[str, str]) ->
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_score_template(gates: tuple[str, ...], quality_dims: tuple[str, ...]) -> str:
|
||||
template: dict = {
|
||||
def _build_gate_only_template(gates: tuple[str, ...]) -> str:
|
||||
return json.dumps({
|
||||
"gates": {g: True for g in gates},
|
||||
"failed_gate_reason": "null or string explaining which gate failed and why",
|
||||
"quality": {d: 0 for d in quality_dims},
|
||||
}
|
||||
return json.dumps(template, indent=2)
|
||||
}, indent=2)
|
||||
|
||||
|
||||
def _parse_gated_scores(
|
||||
def _build_quality_only_template(quality_dims: tuple[str, ...]) -> str:
|
||||
return json.dumps({"quality": {d: 0 for d in quality_dims}}, indent=2)
|
||||
|
||||
|
||||
def _parse_gates_only(
    raw: str,
    gates: tuple[str, ...],
) -> tuple[bool, dict[str, bool], str | None] | None:
    """Parse a gate-only reviewer response.

    Args:
        raw: Raw text returned by the backend.
        gates: Names of the gates that must be present in the response.

    Returns:
        ``(gate_passed, gates_dict, failed_reason)``, or ``None`` when no JSON
        object with a ``gates`` mapping can be extracted from ``raw``.
        ``failed_reason`` is always ``None`` when every gate passed.
    """
    data = _extract_json(raw)
    # The direct json.loads path can yield any JSON value (list, number, ...);
    # only an object can carry a "gates" mapping.
    if not isinstance(data, dict):
        return None

    gates_data = data.get("gates")
    if not isinstance(gates_data, dict):
        return None

    parsed: dict[str, bool] = {}
    for g in gates:
        val = gates_data.get(g)
        # Booleans and numbers count by truthiness; anything else
        # (missing key, null, strings) fails the gate.
        parsed[g] = bool(val) if isinstance(val, (bool, int, float)) else False

    gate_passed = all(parsed.values())
    if gate_passed:
        return (True, parsed, None)

    failed_reason = data.get("failed_gate_reason")
    if isinstance(failed_reason, str):
        # Models sometimes echo the placeholder literally, with stray whitespace.
        if failed_reason.strip().lower() in ("null", "none", ""):
            failed_reason = None
    elif failed_reason is not None:
        # Keep the declared return type: stringify non-string reasons.
        failed_reason = str(failed_reason)
    return (False, parsed, failed_reason)
|
||||
|
||||
|
||||
def _parse_quality_only(
    raw: str,
    quality_dims: tuple[str, ...],
) -> dict[str, int] | None:
    """Parse a quality-only reviewer response.

    Args:
        raw: Raw text returned by the backend.
        quality_dims: Names of the quality dimensions to read.

    Returns:
        ``{dim: score}`` with every score clamped to 0-100, or ``None`` when
        no JSON object can be extracted or its ``quality`` entry is not an
        object. Missing, non-numeric, or non-finite scores become 0.
    """
    data = _extract_json(raw)
    # The direct json.loads path can yield any JSON value; require an object.
    if not isinstance(data, dict):
        return None

    quality_data = data.get("quality", {})
    if not isinstance(quality_data, dict):
        return None

    result: dict[str, int] = {}
    for d in quality_dims:
        val = quality_data.get(d)
        score = 0
        if isinstance(val, (int, float)):
            try:
                score = max(0, min(100, int(val)))
            except (ValueError, OverflowError):
                # json.loads accepts NaN / Infinity, which int() rejects.
                score = 0
        result[d] = score
    return result
|
||||
|
||||
|
||||
def _quality_confidence(quality: dict[str, int]) -> float:
|
||||
if not quality:
|
||||
return 0.0
|
||||
return round(sum(quality.values()) / (len(quality) * 100), 3)
|
||||
|
||||
|
||||
def _quality_floor_check(quality: dict[str, int]) -> str | None:
    """Report every dimension scoring below ``QUALITY_DIM_FLOOR``.

    Returns a ``"quality floor breach: dim=score, ..."`` message listing the
    offending dimensions, or ``None`` when all scores meet the floor.
    """
    breaches = []
    for dim, score in quality.items():
        if score < QUALITY_DIM_FLOOR:
            breaches.append(f"{dim}={score}")
    if not breaches:
        return None
    return f"quality floor breach: {', '.join(breaches)}"
|
||||
|
||||
|
||||
def _merge_quality(a: dict[str, int], b: dict[str, int]) -> dict[str, int]:
|
||||
"""Average two quality score dicts dimension-by-dimension."""
|
||||
return {d: (a.get(d, 0) + b.get(d, 0)) // 2 for d in a}
|
||||
|
||||
|
||||
# kept for backward-compat with confidence_from_notes (reads stored JSON)
|
||||
def _parse_gated_scores_legacy(
|
||||
data: dict,
|
||||
gates: tuple[str, ...],
|
||||
quality_dims: tuple[str, ...],
|
||||
) -> dict:
|
||||
parsed_gates: dict[str, bool] = {}
|
||||
for g in gates:
|
||||
val = data.get("gates", {}).get(g)
|
||||
parsed_gates[g] = bool(val) if isinstance(val, (bool, int, float)) else False
|
||||
gate_passed = all(parsed_gates.values())
|
||||
failed_reason = data.get("failed_gate_reason") if not gate_passed else None
|
||||
|
||||
# Parse quality — only meaningful if gates passed
|
||||
parsed_quality: dict[str, int] = {}
|
||||
quality_data = data.get("quality", {})
|
||||
if isinstance(quality_data, dict):
|
||||
for d in quality_dims:
|
||||
val = quality_data.get(d)
|
||||
if isinstance(val, (int, float)):
|
||||
parsed_quality[d] = max(0, min(100, int(val)))
|
||||
else:
|
||||
parsed_quality[d] = 0
|
||||
|
||||
# Per-dimension quality floor: any single dim below threshold = auto-reject
|
||||
quality_floor_failed = False
|
||||
floor_failures: list[str] = []
|
||||
if gate_passed and parsed_quality:
|
||||
for dim_name, dim_val in parsed_quality.items():
|
||||
if dim_val < QUALITY_DIM_FLOOR:
|
||||
floor_failures.append(f"{dim_name}={dim_val}")
|
||||
quality_floor_failed = True
|
||||
|
||||
if gate_passed and parsed_quality and not quality_floor_failed:
|
||||
confidence = sum(parsed_quality.values()) / (len(parsed_quality) * 100)
|
||||
else:
|
||||
confidence = 0.0
|
||||
if quality_floor_failed and not failed_reason:
|
||||
failed_reason = f"quality floor breach: {', '.join(floor_failures)}"
|
||||
|
||||
for d in quality_dims:
|
||||
val = data.get("quality", {}).get(d)
|
||||
parsed_quality[d] = max(0, min(100, int(val))) if isinstance(val, (int, float)) else 0
|
||||
floor_reason = _quality_floor_check(parsed_quality) if gate_passed else None
|
||||
confidence = _quality_confidence(parsed_quality) if gate_passed and not floor_reason else 0.0
|
||||
floor_failed = bool(floor_reason)
|
||||
return {
|
||||
"gates": parsed_gates,
|
||||
"quality": parsed_quality,
|
||||
"gate_passed": gate_passed and not quality_floor_failed,
|
||||
"confidence": round(confidence, 3),
|
||||
"failed_gate_reason": failed_reason,
|
||||
"quality_floor_failed": quality_floor_failed,
|
||||
"gate_passed": gate_passed and not floor_failed,
|
||||
"confidence": confidence,
|
||||
"failed_gate_reason": floor_reason if floor_failed else failed_reason,
|
||||
"quality_floor_failed": floor_failed,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -623,18 +662,31 @@ def confidence_from_notes(notes_json: str) -> tuple[float, bool]:
|
|||
|
||||
|
||||
class Scorer:
|
||||
"""Scores a variant image using gated boolean + quality range rubric.
|
||||
"""Scores a variant image using a two-pass gate→quality rubric with optional tiebreaker.
|
||||
|
||||
Pass 1 — gate-only prompt: binary pass/fail checks. Failures exit immediately.
|
||||
Pass 2 — quality-only prompt: 0-100 dimensions, only reached when all gates pass.
|
||||
Pass 3 — tiebreaker (optional): re-scores quality when confidence is within
|
||||
`tiebreaker_range` of `threshold`; averages pass-2 and pass-3 scores.
|
||||
|
||||
Two backend types:
|
||||
- "model-boss": local VLM via InferenceClient (Qwen3-VL, etc.)
|
||||
- "claude": Claude API via claude-code-batch-sdk (Haiku, Sonnet, Opus)
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, backend: str, model: str, threshold: float):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
backend: str,
|
||||
model: str,
|
||||
threshold: float,
|
||||
tiebreaker_range: float = 0.0,
|
||||
):
|
||||
self.name = name
|
||||
self.backend = backend
|
||||
self.model = model
|
||||
self.threshold = threshold
|
||||
self.tiebreaker_range = tiebreaker_range
|
||||
self._semaphore = asyncio.Semaphore(CONCURRENCY.get(backend, 4))
|
||||
|
||||
if backend == "model-boss":
|
||||
|
|
@ -676,8 +728,40 @@ class Scorer:
|
|||
logger.warning("[%s] Batch item error: %s", self.name, exc)
|
||||
return results
|
||||
|
||||
async def _call_backend(self, img_b64: str, raw_path: str, prompt: str) -> str | None:
    """Send one prompt for one image to this scorer's backend.

    Args:
        img_b64: Base64-encoded image bytes; embedded as a PNG data URL for
            the "model-boss" backend and not used for "claude".
        raw_path: Path of the image on disk; for "claude" its parent
            directory is passed as ``cwd`` with only the ``Read`` tool allowed.
        prompt: User prompt text.

    Returns:
        The backend's raw reply, or ``None`` for an unrecognised backend or
        when the call raises (the exception is logged as a warning).
    """
    try:
        if self.backend == "claude":
            reply = await self._client.generate(
                system=SYSTEM_PROMPT,
                user=prompt,
                cwd=str(Path(raw_path).parent),
                allowed_tools=["Read"],
            )
            return reply
        if self.backend == "model-boss":
            text_part = {"type": "text", "text": prompt}
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_b64}"},
            }
            conversation = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": [text_part, image_part]},
            ]
            reply = await self._client.chat(
                model=self.model,
                messages=conversation,
                max_tokens=2048,
                temperature=0.1,
                keep_alive=300,
            )
            return reply
    except Exception as exc:
        logger.warning("[%s] Backend call failed: %s", self.name, exc)
    return None
|
||||
|
||||
async def _score_inner(self, raw_path: str, sprite: dict) -> dict | None:
|
||||
"""Score a single variant image. Returns gated result dict or None."""
|
||||
"""Two-pass (+ optional tiebreaker) scoring. Returns gated result dict or None."""
|
||||
import base64
|
||||
|
||||
if not raw_path or not Path(raw_path).exists():
|
||||
|
|
@ -691,56 +775,87 @@ class Scorer:
|
|||
entity_id = sprite.get("entity_id", "")
|
||||
ctx_gate_descs = _contextualize_descriptions(gate_descs, entity_id)
|
||||
ctx_quality_descs = _contextualize_descriptions(quality_descs, entity_id)
|
||||
filename = Path(raw_path).name
|
||||
prompt_excerpt = sprite["prompt"][:300]
|
||||
|
||||
gate_instructions = _build_gate_instructions(gates, ctx_gate_descs)
|
||||
quality_instructions = _build_quality_instructions(quality_dims, ctx_quality_descs)
|
||||
score_template = _build_score_template(gates, quality_dims)
|
||||
img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
|
||||
|
||||
user_prompt = RANKING_PROMPT_TEMPLATE.format(
|
||||
filename=Path(raw_path).name,
|
||||
# --- Pass 1: gates only ---
|
||||
gate_prompt = GATE_PROMPT_TEMPLATE.format(
|
||||
filename=filename,
|
||||
category=category,
|
||||
entity_id=entity_id,
|
||||
prompt=sprite["prompt"][:300],
|
||||
gate_instructions=gate_instructions,
|
||||
quality_instructions=quality_instructions,
|
||||
score_template=score_template,
|
||||
prompt=prompt_excerpt,
|
||||
gate_instructions=_build_gate_instructions(gates, ctx_gate_descs),
|
||||
gate_template=_build_gate_only_template(gates),
|
||||
)
|
||||
|
||||
try:
|
||||
if self.backend == "model-boss":
|
||||
img_b64 = base64.b64encode(Path(raw_path).read_bytes()).decode()
|
||||
raw = await self._client.chat(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
|
||||
{"type": "text", "text": user_prompt},
|
||||
],
|
||||
},
|
||||
],
|
||||
max_tokens=500,
|
||||
keep_alive=300,
|
||||
)
|
||||
elif self.backend == "claude":
|
||||
raw = await self._client.generate(
|
||||
system=SYSTEM_PROMPT,
|
||||
user=user_prompt,
|
||||
cwd=str(Path(raw_path).parent),
|
||||
allowed_tools=["Read"],
|
||||
)
|
||||
else:
|
||||
return None
|
||||
except Exception as exc:
|
||||
logger.warning("[%s] Scoring failed: %s", self.name, exc)
|
||||
gate_raw = await self._call_backend(img_b64, raw_path, gate_prompt)
|
||||
if gate_raw is None:
|
||||
return None
|
||||
|
||||
if raw is None:
|
||||
gate_result = _parse_gates_only(gate_raw, gates)
|
||||
if gate_result is None:
|
||||
return None
|
||||
|
||||
return _parse_gated_scores(raw, gates, quality_dims)
|
||||
gate_passed, parsed_gates, failed_reason = gate_result
|
||||
if not gate_passed:
|
||||
return {
|
||||
"gates": parsed_gates,
|
||||
"quality": {},
|
||||
"gate_passed": False,
|
||||
"confidence": 0.0,
|
||||
"failed_gate_reason": failed_reason,
|
||||
"quality_floor_failed": False,
|
||||
}
|
||||
|
||||
# --- Pass 2: quality only ---
|
||||
quality_prompt = QUALITY_PROMPT_TEMPLATE.format(
|
||||
filename=filename,
|
||||
entity_id=entity_id,
|
||||
quality_instructions=_build_quality_instructions(quality_dims, ctx_quality_descs),
|
||||
quality_template=_build_quality_only_template(quality_dims),
|
||||
)
|
||||
quality_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
|
||||
if quality_raw is None:
|
||||
return None
|
||||
|
||||
quality = _parse_quality_only(quality_raw, quality_dims)
|
||||
if quality is None:
|
||||
return None
|
||||
|
||||
# --- Pass 3: tiebreaker (when confidence is within range of threshold) ---
|
||||
if self.tiebreaker_range > 0.0:
|
||||
confidence = _quality_confidence(quality)
|
||||
if abs(confidence - self.threshold) <= self.tiebreaker_range:
|
||||
logger.debug(
|
||||
"[%s] Tiebreaker triggered: confidence=%.3f threshold=%.2f range=%.2f",
|
||||
self.name, confidence, self.threshold, self.tiebreaker_range,
|
||||
)
|
||||
tie_raw = await self._call_backend(img_b64, raw_path, quality_prompt)
|
||||
if tie_raw is not None:
|
||||
tie_quality = _parse_quality_only(tie_raw, quality_dims)
|
||||
if tie_quality is not None:
|
||||
quality = _merge_quality(quality, tie_quality)
|
||||
|
||||
floor_reason = _quality_floor_check(quality)
|
||||
if floor_reason:
|
||||
return {
|
||||
"gates": parsed_gates,
|
||||
"quality": quality,
|
||||
"gate_passed": False,
|
||||
"confidence": 0.0,
|
||||
"failed_gate_reason": floor_reason,
|
||||
"quality_floor_failed": True,
|
||||
}
|
||||
|
||||
return {
|
||||
"gates": parsed_gates,
|
||||
"quality": quality,
|
||||
"gate_passed": True,
|
||||
"confidence": _quality_confidence(quality),
|
||||
"failed_gate_reason": None,
|
||||
"quality_floor_failed": False,
|
||||
}
|
||||
|
||||
|
||||
def _ensure_claude_sdk():
|
||||
|
|
@ -784,6 +899,7 @@ class ScoringPipeline:
|
|||
backend=stage["backend"],
|
||||
model=stage["model"],
|
||||
threshold=stage.get("threshold", CONFIDENCE_THRESHOLD),
|
||||
tiebreaker_range=stage.get("tiebreaker_range", 0.0),
|
||||
))
|
||||
|
||||
async def score_variant_at_tier(
|
||||
|
|
|
|||
|
|
@ -894,6 +894,14 @@ class SpriteRegistry:
|
|||
"avg_confidence": round(row["avg_conf"] or 0.0, 3),
|
||||
}
|
||||
|
||||
unscored = self.conn.execute(
|
||||
"""SELECT COUNT(*) FROM variants
|
||||
WHERE job_status = 'completed'
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM latest_scores WHERE variant_id = variants.id
|
||||
)"""
|
||||
).fetchone()[0]
|
||||
|
||||
approved = self.conn.execute(
|
||||
"SELECT COUNT(*) FROM variants WHERE is_approved = 1"
|
||||
).fetchone()[0]
|
||||
|
|
@ -905,6 +913,7 @@ class SpriteRegistry:
|
|||
funnel = {
|
||||
"total_completed": total_completed,
|
||||
"total_processed": total_processed,
|
||||
"unscored": unscored,
|
||||
"scoring": scoring,
|
||||
"approved": approved,
|
||||
"installed": installed,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue