diff --git a/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml b/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml index 8d3017d6..9d5ed926 100644 --- a/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml +++ b/tools/sprite-generation/engine/prompts/scoring_pipeline.yaml @@ -5,11 +5,15 @@ # Rejections at any stage go back to generation. # # Fields per stage: -# name: Human-readable name (also used in DB scored_by) -# backend: "model-boss" (local VLM) or "claude" (API) -# model: Model ID for the backend -# threshold: Minimum confidence (0-1) for quality scores to pass -# purpose: What this stage is checking for +# name: Human-readable name (also used in DB scored_by) +# backend: "model-boss" (local VLM) or "claude" (API) +# model: Model ID for the backend +# threshold: Minimum confidence (0-1) for quality scores to pass +# tiebreaker_range: If set, re-scores quality when confidence is within ±range of threshold, +# then averages pass-2 and pass-3 scores. Omit or 0.0 to disable. +# single_pass: If true, uses one combined gate+quality call instead of two-pass. +# Best for expensive high-accuracy models. Default: false. +# purpose: What this stage is checking for # # The pipeline stops escalating when `target_approved` variants have passed # ALL stages. Only the deficit is sent to the next stage. @@ -28,16 +32,26 @@ stages: backend: model-boss model: qwen3-vl-8b-instruct threshold: 0.40 - purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars." + tiebreaker_range: 0.12 + purpose: "Free local bulk filter — catches obvious gate failures (wrong direction, modern clothes, wrong background) before spending API dollars. Tiebreaker re-scores quality when confidence is borderline (±0.12 of threshold)." - name: haiku backend: claude model: haiku threshold: 0.50 - purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates." + tiebreaker_range: 0.08 + purpose: "Cheap API filter — re-evaluates independently with stricter threshold. Only sees VLM-approved candidates. Tiebreaker re-scores quality when confidence is borderline (±0.08 of threshold)." + + - name: sonnet + backend: claude + model: sonnet + threshold: 0.58 + tiebreaker_range: 0.08 + purpose: "Mid-tier filter — stronger than Haiku, cheaper than Opus. Two-pass with tiebreaker on borderline quality scores." - name: opus backend: claude model: opus threshold: 0.65 - purpose: "Final quality approval — highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced." + single_pass: true + purpose: "Final quality approval — single combined call (expensive). Highest accuracy, only sees pre-filtered candidates. Per-dimension floor (45) also enforced."