chore(sprite-generation): 🔧 Add tiebreaker_range and single_pass options plus a sonnet stage to the scoring pipeline YAML config

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-03-29 06:07:11 -07:00 · 2026-03-29 06:07:11 -07:00 · 90f71f8c51
commit 90f71f8c51
parent 7f2c0e731d
1 changed files with 22 additions and 8 deletions
--- a/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml
+++ b/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml
@@ -5,11 +5,15 @@
 # Rejections at any stage go back to generation.
 #
 # Fields per stage:
-#   name:       Human-readable name (also used in DB scored_by)
-#   backend:    "model-boss" (local VLM) or "claude" (API)
-#   model:      Model ID for the backend
-#   threshold:  Minimum confidence (0-1) for quality scores to pass
-#   purpose:    What this stage is checking for
+#   name:             Human-readable name (also used in DB scored_by)
+#   backend:          "model-boss" (local VLM) or "claude" (API)
+#   model:            Model ID for the backend
+#   threshold:        Minimum confidence (0-1) for quality scores to pass
+#   tiebreaker_range: If set, re-scores quality when confidence is within ±range of threshold,
+#                     then averages the pass-2 and pass-3 scores. Omit or set to 0.0 to disable.
+#   single_pass:      If true, uses one combined gate+quality call instead of the default two-pass flow.
+#                     Best for expensive high-accuracy models. Default: false.
+#   purpose:          What this stage is checking for
 #
 # The pipeline stops escalating when `target_approved` variants have passed
 # ALL stages. Only the deficit is sent to the next stage.
@@ -28,16 +32,26 @@ stages:
    backend: model-boss
    model: qwen3-vl-8b-instruct
    threshold: 0.40
-    purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars."
+    tiebreaker_range: 0.12
+    purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars. Tiebreaker re-scores quality when confidence is borderline (±0.12 of threshold)."

  - name: haiku
    backend: claude
    model: haiku
    threshold: 0.50
-    purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates."
+    tiebreaker_range: 0.08
+    purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates. Tiebreaker re-scores quality when confidence is borderline (±0.08 of threshold)."
+
+  - name: sonnet
+    backend: claude
+    model: sonnet
+    threshold: 0.58
+    tiebreaker_range: 0.08
+    purpose: "Mid-tier filter — stronger than Haiku, cheaper than Opus. Two-pass with tiebreaker on borderline quality scores."

  - name: opus
    backend: claude
    model: opus
    threshold: 0.65
-    purpose: "Final quality approval — highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced."
+    single_pass: true
+    purpose: "Final quality approval — single combined call (expensive). Highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced."