#!/usr/bin/env python3
# magicciv/tools/validate-schemas.py
"""Validate all resource JSON files against their co-located JSON Schemas.

Discovers every .schema.json under public/resources/, finds data files in the
same directory (and species/ subdirectory for ecology), and validates each one.
Also runs structural checks that JSON Schema alone can't enforce.

Usage:
    python tools/validate-schemas.py [--root /path/to/project] [--category ecology/fauna] [--verbose]
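    python tools/validate-schemas.py --check schema    # validate data files against their JSON Schemas only
    python tools/validate-schemas.py --check fields    # validate required lineage / tier fields
    python tools/validate-schemas.py --check traits    # validate trait tag completeness
    python tools/validate-schemas.py --check refs      # validate cross-references (prey, evolved_from, flora_dependencies)
    python tools/validate-schemas.py --check trophic   # validate trophic pyramid balance
    python tools/validate-schemas.py --check all       # run everything (default)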

Exit code 0 = all pass, 1 = failures found.
"""

import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path

try:
    from jsonschema import Draft202012Validator, RefResolver, ValidationError
    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False


def load_json(path: Path) -> dict | list | None:
    try:
        return json.loads(path.read_text())
    except json.JSONDecodeError as e:
        return None


class SchemaValidator:
    def __init__(self, root: Path, verbose: bool = False):
        self.root = root
        self.resources = root / "public" / "resources"
        self.verbose = verbose
        self.passed = 0
        self.failed = 0
        self.skipped = 0
        self.errors: list[str] = []

    def error(self, msg: str):
        self.errors.append(msg)
        self.failed += 1

    def ok(self, msg: str = ""):
        self.passed += 1
        if self.verbose and msg:
            print(f"  ✓ {msg}")

    # ── Schema discovery ──────────────────────────────────────────────

    def find_schemas(self, category_filter: str | None = None) -> list[tuple[Path, Path]]:
        """Find (schema_path, data_dir) pairs. data_dir contains the files to validate."""
        pairs = []
        for schema_path in sorted(self.resources.rglob("*.schema.json")):
            rel = schema_path.relative_to(self.resources)
            if category_filter and category_filter not in str(rel):
                continue

            data_dir = schema_path.parent
            # For ecology schemas, species data is in a species/ subdirectory
            species_dir = data_dir / "species"
            if species_dir.is_dir():
                pairs.append((schema_path, species_dir))
            else:
                pairs.append((schema_path, data_dir))
        return pairs

    # ── JSON Schema validation ────────────────────────────────────────

    def validate_against_schema(self, schema_path: Path, data_dir: Path):
        """Validate every data file in *data_dir* against the schema at *schema_path*.

        Each file that validates is recorded with ``self.ok``; unparseable files
        and schema violations are recorded with ``self.error`` (at most three
        violations are reported per file).

        If the schema's ``oneOf`` lists a ``$ref`` containing "species", files
        that carry both ``id`` and ``name`` are validated against that
        definition directly; when the definition cannot be resolved or
        evaluated, the file is validated against the root schema instead.

        When ``jsonschema`` is not installed a warning is printed, the call is
        counted in ``self.skipped`` and nothing is validated.
        """
        if not HAS_JSONSCHEMA:
            print("⚠ jsonschema not installed — skipping schema validation (pip install jsonschema)")
            self.skipped += 1
            return

        schema = load_json(schema_path)
        if schema is None:
            self.error(f"PARSE ERROR in schema: {schema_path}")
            return

        # Build a resolver that can follow $ref to base.schema.json and sibling schemas.
        # Register schemas by $id, by resolved file path, AND by file:// URI so
        # relative $refs (../base.schema.json, ../../base.schema.json) resolve correctly.
        # The set of schemas is the same for every call, so it is parsed once per
        # validator instance and then reused.
        store = getattr(self, "_ref_store", None)
        if store is None:
            store = {}
            for ref_schema_path in self.resources.rglob("*.schema.json"):
                ref_schema = load_json(ref_schema_path)
                # Only non-empty object schemas are registered; boolean or
                # array documents cannot be probed for "$id".
                if not isinstance(ref_schema, dict) or not ref_schema:
                    continue
                resolved_path = str(ref_schema_path.resolve())
                if "$id" in ref_schema:
                    store[ref_schema["$id"]] = ref_schema
                store[resolved_path] = ref_schema
                store[f"file://{resolved_path}"] = ref_schema
            self._ref_store = store

        def file_handler(uri: str):
            """Resolve file:// URIs by checking store, disk, and fallback to resources/."""
            path_str = uri.replace("file://", "").replace("file:", "")
            p = Path(path_str)
            resolved = str(p.resolve())
            # Check store by resolved path or file URI
            for key in (resolved, f"file://{resolved}", uri):
                if key in store:
                    return store[key]
            # Check disk (regular files only; a directory cannot be read as JSON)
            if p.is_file():
                data = load_json(p)
                if data:
                    store[resolved] = data
                    return data
            # Fallback: schema $refs may resolve incorrectly due to trailing slashes.
            # Try finding the filename in resources/ directory tree.
            filename = p.name
            if filename:  # an empty pattern is rejected by rglob
                for candidate in self.resources.rglob(filename):
                    data = load_json(candidate)
                    if data:
                        store[resolved] = data
                        return data
            raise FileNotFoundError(f"Schema not found: {uri}")

        resolver = RefResolver(
            base_uri=f"file://{schema_path.parent.resolve()}/",
            referrer=schema,
            store=store,
            handlers={"file": file_handler},
        )

        # Determine which $def to validate species files against
        # Most schemas use oneOf with multiple variants; species files match fauna_species/flora_species
        species_def = None
        if isinstance(schema, dict) and "oneOf" in schema:
            for ref in schema["oneOf"]:
                ref_path = ref.get("$ref", "") if isinstance(ref, dict) else ""
                if "species" in ref_path:
                    species_def = ref_path
                    break

        data_files = [
            f for f in sorted(data_dir.glob("*.json"))
            if not f.name.endswith(".schema.json")
        ]
        if not data_files:
            if self.verbose:
                print(f"  (no data files in {data_dir.relative_to(self.root)})")
            return

        # Resolve the species definition once instead of once per data file.
        species_schema = None
        if species_def:
            try:
                species_schema = resolver.resolve(species_def)[1]
            except Exception:
                # Any resolution failure means "use the root schema"; the
                # library's specific exception type is not imported by this file.
                species_schema = None

        for data_file in data_files:
            data = load_json(data_file)
            if data is None:
                self.error(f"PARSE ERROR: {data_file.relative_to(self.root)}")
                continue

            errs = None
            with_path = False

            # If we identified a species $def and the file has "id" + "name" (species format),
            # validate against that specific $def instead of the oneOf root
            if (species_schema is not None and isinstance(data, dict)
                    and "id" in data and "name" in data):
                try:
                    validator = Draft202012Validator(species_schema, resolver=resolver)
                    errs = list(validator.iter_errors(data))
                    with_path = True
                except Exception:
                    # Fall back to root schema validation
                    errs = None

            if errs is None:
                try:
                    validator = Draft202012Validator(schema, resolver=resolver)
                    errs = list(validator.iter_errors(data))
                except Exception as exc:
                    self.error(f"VALIDATOR ERROR: {data_file.name}: {exc}")
                    continue

            if not errs:
                self.ok(data_file.name)
                continue

            # Cap the noise: report at most three violations per file.
            for err in errs[:3]:
                if with_path:
                    path = ".".join(str(p) for p in err.absolute_path)
                    self.error(f"SCHEMA: {data_file.name}: {path}: {err.message}")
                else:
                    self.error(f"SCHEMA: {data_file.name}: {err.message}")

    # ── Structural checks ─────────────────────────────────────────────

    def check_traits(self):
        """Verify all fauna species have the 7 required trait categories."""
        print("\n── Trait completeness ──")
        required_prefixes = ["size_", "diet_", "habitat_", "locomotion_", "thermal_", "repro_", "social_"]
        species_dir = self.resources / "ecology" / "fauna" / "species"

        for f in sorted(species_dir.glob("*.json")):
            data = load_json(f)
            if data is None or "traits" not in data:
                continue
            traits = data["traits"]
            for prefix in required_prefixes:
                if not any(t.startswith(prefix) for t in traits):
                    self.error(f"TRAIT: {f.name}: missing {prefix}* trait")
            if all(any(t.startswith(p) for t in traits) for p in required_prefixes):
                self.ok(f.name)

    def check_refs(self):
        """Verify prey, evolved_from, flora_dependencies reference existing species."""
        print("\n── Cross-reference integrity ──")
        fauna_dir = self.resources / "ecology" / "fauna" / "species"
        flora_dir = self.resources / "ecology" / "flora" / "species"

        fauna_ids = set()
        flora_ids = set()
        all_species: dict[str, dict] = {}

        for f in fauna_dir.glob("*.json"):
            data = load_json(f)
            if data and "id" in data:
                fauna_ids.add(data["id"])
                all_species[data["id"]] = data

        for f in flora_dir.glob("*.json"):
            data = load_json(f)
            if data and "id" in data:
                flora_ids.add(data["id"])

        all_ids = fauna_ids | flora_ids
        broken_prey = []
        broken_evolved = []
        broken_flora_deps = []

        for sid, sp in all_species.items():
            for prey_id in sp.get("prey", []):
                if prey_id not in all_ids:
                    broken_prey.append(f"{sid} → {prey_id}")

            evolved = sp.get("evolved_from")
            if evolved and evolved not in fauna_ids:
                broken_evolved.append(f"{sid} → {evolved}")

            for dep in sp.get("flora_dependencies", []):
                if dep not in flora_ids:
                    broken_flora_deps.append(f"{sid} → {dep}")

        if broken_prey:
            for ref in broken_prey:
                self.error(f"PREY REF: {ref} (target does not exist)")
        else:
            self.ok(f"All prey references valid ({len(all_species)} species checked)")

        if broken_evolved:
            for ref in broken_evolved:
                self.error(f"EVOLVED_FROM REF: {ref} (target does not exist)")
        else:
            self.ok(f"All evolved_from references valid")

        if broken_flora_deps:
            for ref in broken_flora_deps:
                self.error(f"FLORA_DEP REF: {ref} (target does not exist)")
        else:
            self.ok(f"All flora_dependencies references valid")

    def check_trophic(self):
        """Verify trophic pyramid and lineage structure."""
        print("\n── Trophic & lineage structure ──")
        fauna_dir = self.resources / "ecology" / "fauna" / "species"

        trophic = Counter()
        lineage_tiers: dict[str, set[int]] = defaultdict(set)
        biome_trophic: dict[str, Counter] = defaultdict(Counter)
        domain_count = Counter()
        lineage_count = Counter()

        for f in sorted(fauna_dir.glob("*.json")):
            data = load_json(f)
            if data is None or "id" not in data:
                continue

            tl = data.get("trophic_level", "unknown")
            trophic[tl] += 1
            domain_count[data.get("domain", "unknown")] += 1

            lin = data.get("lineage", "")
            tier = data.get("ecology_tier", 0)
            if lin:
                lineage_tiers[lin].add(tier)
                lineage_count[lin] += 1

            for biome in data.get("biomes", []):
                biome_trophic[biome][tl] += 1

        # Trophic pyramid. Real ecosystems have more herbivores than predators by a wide
        # margin, but for a game roster we accept predators ≤ 1.15× (herbivores + omnivores)
        # since omnivores partially fill the prey-base role. Strictly inverted pyramids
        # (more pure predators than pure herbivores + omnivores combined) indicate a real
        # data gap.
        herb = trophic.get("herbivore", 0)
        pred = trophic.get("predator", 0) + trophic.get("apex_predator", 0)
        omni = trophic.get("omnivore", 0)
        total = sum(trophic.values())
        prey_base = herb + omni
        ratio = pred / prey_base if prey_base > 0 else float('inf')
        print(f"  Trophic: herbivore={herb} omnivore={omni} predator={pred} total={total} ratio={ratio:.2f}")
        if ratio > 1.15:
            self.error(
                f"TROPHIC: predators ({pred}) exceed 1.15× prey base ({prey_base}), "
                f"ratio={ratio:.2f} — inverted pyramid"
            )
        else:
            self.ok(f"Trophic pyramid: predator ratio {ratio:.2f} ≤ 1.15")

        # Lineages missing T1. Some lineages legitimately start above T1:
        # - pinnipeds: seals/walruses are transitional marine mammals, inherently specialized;
        #   no meaningful T1 "primitive seal" exists that isn't already a felid/mustelid
        t1_exempt_lineages = {"pinnipeds"}
        missing_t1 = [l for l, tiers in lineage_tiers.items()
                      if 1 not in tiers and l != "fantasy" and l not in t1_exempt_lineages]
        if missing_t1:
            for l in sorted(missing_t1):
                tiers = sorted(lineage_tiers[l])
                self.error(f"LINEAGE: {l} has no T1 root (tiers: {tiers})")
        else:
            self.ok(f"All {len(lineage_tiers)} lineages have T1 roots")

        # Lineages with tier gaps > 2
        for l, tiers in sorted(lineage_tiers.items()):
            sorted_tiers = sorted(t for t in tiers if t > 0)
            if len(sorted_tiers) < 2:
                continue
            for i in range(len(sorted_tiers) - 1):
                gap = sorted_tiers[i + 1] - sorted_tiers[i]
                if gap > 2:
                    self.error(f"TIER GAP: {l} has T{sorted_tiers[i]}→T{sorted_tiers[i+1]} (gap={gap})")

        # Biomes with predators but no herbivores.
        # Extreme biomes (volcanic, ice, deep) legitimately lack herbivores —
        # their food chains run through detritivores/chemosynthesis/filter feeders.
        extreme_biomes = {
            "lava_field", "volcanic", "volcanic_plains", "ice", "sea_ice",
            "hadal_zone", "abyssal_plain", "ancient_lakebed", "rocky_waste",
            "basalt_highland", "canyon", "coastal_cliffs", "cliffs",
        }
        broken_biomes = []
        for biome, counts in sorted(biome_trophic.items()):
            if biome in extreme_biomes:
                continue
            has_pred = counts.get("predator", 0) + counts.get("apex_predator", 0) > 0
            has_herb = counts.get("herbivore", 0) + counts.get("omnivore", 0) > 0
            if has_pred and not has_herb:
                broken_biomes.append(biome)

        if broken_biomes:
            for b in broken_biomes:
                self.error(f"BIOME CHAIN: {b} has predators but no herbivores/omnivores")
        else:
            self.ok("All biomes with predators have prey species")

        # Domain balance
        print(f"  Domain: {dict(domain_count)}")

    def check_lineage_ecology_tier(self):
        """Verify all fauna have lineage + ecology_tier, all flora have lineage + quality_tier."""
        print("\n── Required fields ──")
        fauna_dir = self.resources / "ecology" / "fauna" / "species"
        flora_dir = self.resources / "ecology" / "flora" / "species"

        for f in sorted(fauna_dir.glob("*.json")):
            data = load_json(f)
            if data is None or "id" not in data:
                continue
            if not data.get("lineage"):
                self.error(f"FIELD: {f.name}: missing lineage")
            if not data.get("ecology_tier"):
                self.error(f"FIELD: {f.name}: missing ecology_tier")

        for f in sorted(flora_dir.glob("*.json")):
            data = load_json(f)
            if data is None or "id" not in data:
                continue
            if not data.get("lineage"):
                self.error(f"FIELD: {f.name}: missing lineage")
            if not data.get("quality_tier"):
                self.error(f"FIELD: {f.name}: missing quality_tier")

    # ── Runner ────────────────────────────────────────────────────────

    def run_schema_validation(self, category: str | None = None):
        print("── JSON Schema validation ──")
        pairs = self.find_schemas(category)
        for schema_path, data_dir in pairs:
            rel_schema = schema_path.relative_to(self.root)
            rel_data = data_dir.relative_to(self.root)
            n_files = len(list(data_dir.glob("*.json"))) - len(list(data_dir.glob("*.schema.json")))
            if n_files == 0:
                continue
            print(f"\n  {rel_schema} → {rel_data} ({n_files} files)")
            self.validate_against_schema(schema_path, data_dir)

    def run_all(self, category: str | None = None):
        self.run_schema_validation(category)
        self.check_lineage_ecology_tier()
        self.check_traits()
        self.check_refs()
        self.check_trophic()

    def report(self) -> int:
        print(f"\n{'═' * 60}")
        print(f"  PASSED: {self.passed}  FAILED: {self.failed}")
        if self.errors:
            print(f"\n  Failures:")
            for e in self.errors[:50]:
                print(f"    ✗ {e}")
            if len(self.errors) > 50:
                print(f"    ... and {len(self.errors) - 50} more")
        print(f"{'═' * 60}")
        return 1 if self.failed > 0 else 0


def main():
    """Parse command-line options, run the selected checks and exit.

    The process exits with the code returned by ``SchemaValidator.report``
    (0 when nothing failed, 1 otherwise).  If ``<root>/public/resources`` is
    not a directory the run is rejected as a usage error, because every check
    would otherwise find no files and validate nothing.
    """
    parser = argparse.ArgumentParser(description="Validate resource JSON against schemas")
    parser.add_argument("--root", type=Path, default=Path(__file__).parent.parent,
                        help="Project root directory")
    parser.add_argument("--category", type=str, default=None,
                        help="Filter to schema category (e.g. ecology/fauna)")
    parser.add_argument("--check", type=str, default="all",
                        choices=["all", "schema", "traits", "refs", "trophic", "fields"],
                        help="Which checks to run")
    parser.add_argument("--verbose", action="store_true", help="Show individual pass results")
    args = parser.parse_args()

    v = SchemaValidator(args.root, verbose=args.verbose)

    # A wrong --root would make every glob come back empty and the run would
    # look like a success; fail loudly instead.
    if not v.resources.is_dir():
        parser.error(f"resources directory not found: {v.resources}")

    if args.check == "all":
        v.run_all(args.category)
    elif args.check == "schema":
        v.run_schema_validation(args.category)
    else:
        # The remaining choices are argument-free structural checks.
        structural_checks = {
            "traits": v.check_traits,
            "refs": v.check_refs,
            "trophic": v.check_trophic,
            "fields": v.check_lineage_ecology_tier,
        }
        structural_checks[args.check]()

    sys.exit(v.report())


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()