chore(sprite-generation): 🔧 Add tiebreaker_range, single_pass, and sonnet stage to scoring pipeline YAML config

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Claude Code 2026-03-29 06:07:11 -07:00
parent 7f2c0e731d
commit 90f71f8c51

View file

@ -5,11 +5,15 @@
# Rejections at any stage go back to generation.
#
# Fields per stage:
# name: Human-readable name (also used in DB scored_by)
# backend: "model-boss" (local VLM) or "claude" (API)
# model: Model ID for the backend
# threshold: Minimum confidence (0-1) for quality scores to pass
# purpose: What this stage is checking for
# name: Human-readable name (also used in DB scored_by)
# backend: "model-boss" (local VLM) or "claude" (API)
# model: Model ID for the backend
# threshold: Minimum confidence (0-1) for quality scores to pass
# tiebreaker_range: If set, re-scores quality when confidence is within ±tiebreaker_range of threshold,
# then averages the pass-2 (quality) and pass-3 (re-score) scores. Omit or set to 0.0 to disable.
# single_pass: If true, uses one combined gate+quality call instead of separate gate and quality passes.
# Best for expensive high-accuracy models. Default: false.
# purpose: What this stage is checking for
#
# The pipeline stops escalating when `target_approved` variants have passed
# ALL stages. Only the deficit is sent to the next stage.
@ -28,16 +32,26 @@ stages:
backend: model-boss
model: qwen3-vl-8b-instruct
threshold: 0.40
purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars."
tiebreaker_range: 0.12
purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars. Tiebreaker re-scores quality when confidence is borderline (±0.12 of threshold)."
- name: haiku
backend: claude
model: haiku
threshold: 0.50
purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates."
tiebreaker_range: 0.08
purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates. Tiebreaker re-scores quality when confidence is borderline (±0.08 of threshold)."
- name: sonnet
backend: claude
model: sonnet
threshold: 0.58
tiebreaker_range: 0.08
purpose: "Mid-tier filter — stronger than Haiku, cheaper than Opus. Two-pass with tiebreaker on borderline quality scores."
- name: opus
backend: claude
model: opus
threshold: 0.65
purpose: "Final quality approval — highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced."
single_pass: true
purpose: "Final quality approval — single combined call (expensive). Highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced."