magicciv/tools/validate-schemas.py

461 lines
19 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""Validate all resource JSON files against their co-located JSON Schemas.
Discovers every .schema.json under public/resources/, finds the data files that
belong to it (the schema's species/ subdirectory when one exists, as for ecology;
otherwise the schema's own directory), and validates each one.
Also runs structural checks that JSON Schema alone can't enforce.
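Usage:
python tools/validate-schemas.py [--root /path/to/project] [--category ecology/fauna] [--verbose]
python tools/validate-schemas.py --check schema # validate data files against their JSON Schemas only
python tools/validate-schemas.py --check fields # validate required lineage / tier fields
python tools/validate-schemas.py --check traits # validate trait tag completeness
python tools/validate-schemas.py --check refs # validate cross-references (prey, evolved_from, flora_dependencies)
python tools/validate-schemas.py --check trophic # validate trophic pyramid balance
python tools/validate-schemas.py --check all # run everything (the default)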
Exit code 0 = all pass, 1 = failures found.
"""
import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
try:
from jsonschema import Draft202012Validator, RefResolver, ValidationError
HAS_JSONSCHEMA = True
except ImportError:
HAS_JSONSCHEMA = False
def load_json(path: Path) -> dict | list | None:
try:
return json.loads(path.read_text())
except json.JSONDecodeError as e:
return None
class SchemaValidator:
def __init__(self, root: Path, verbose: bool = False):
self.root = root
self.resources = root / "public" / "resources"
self.verbose = verbose
self.passed = 0
self.failed = 0
self.skipped = 0
self.errors: list[str] = []
def error(self, msg: str):
self.errors.append(msg)
self.failed += 1
def ok(self, msg: str = ""):
self.passed += 1
if self.verbose and msg:
print(f"{msg}")
# ── Schema discovery ──────────────────────────────────────────────
def find_schemas(self, category_filter: str | None = None) -> list[tuple[Path, Path]]:
"""Find (schema_path, data_dir) pairs. data_dir contains the files to validate."""
pairs = []
for schema_path in sorted(self.resources.rglob("*.schema.json")):
rel = schema_path.relative_to(self.resources)
if category_filter and category_filter not in str(rel):
continue
data_dir = schema_path.parent
# For ecology schemas, species data is in a species/ subdirectory
species_dir = data_dir / "species"
if species_dir.is_dir():
pairs.append((schema_path, species_dir))
else:
pairs.append((schema_path, data_dir))
return pairs
# ── JSON Schema validation ────────────────────────────────────────
def validate_against_schema(self, schema_path: Path, data_dir: Path):
if not HAS_JSONSCHEMA:
print("⚠ jsonschema not installed — skipping schema validation (pip install jsonschema)")
return
schema = load_json(schema_path)
if schema is None:
self.error(f"PARSE ERROR in schema: {schema_path}")
return
# Build a resolver that can follow $ref to base.schema.json and sibling schemas.
# Register schemas by $id, by resolved file path, AND by file:// URI so
# relative $refs (../base.schema.json, ../../base.schema.json) resolve correctly.
store = {}
for ref_schema_path in self.resources.rglob("*.schema.json"):
ref_schema = load_json(ref_schema_path)
if not ref_schema:
continue
resolved_path = str(ref_schema_path.resolve())
if "$id" in ref_schema:
store[ref_schema["$id"]] = ref_schema
store[resolved_path] = ref_schema
store[f"file://{resolved_path}"] = ref_schema
def file_handler(uri: str):
"""Resolve file:// URIs by checking store, disk, and fallback to resources/."""
path_str = uri.replace("file://", "").replace("file:", "")
p = Path(path_str)
resolved = str(p.resolve())
# Check store by resolved path or file URI
for key in [resolved, f"file://{resolved}", uri]:
if key in store:
return store[key]
# Check disk
if p.exists():
data = load_json(p)
if data:
store[resolved] = data
return data
# Fallback: schema $refs may resolve incorrectly due to trailing slashes.
# Try finding the filename in resources/ directory tree.
filename = p.name
for candidate in self.resources.rglob(filename):
data = load_json(candidate)
if data:
store[resolved] = data
return data
raise FileNotFoundError(f"Schema not found: {uri}")
resolver = RefResolver(
base_uri=f"file://{schema_path.parent.resolve()}/",
referrer=schema,
store=store,
handlers={"file": file_handler},
)
# Determine which $def to validate species files against
# Most schemas use oneOf with multiple variants; species files match fauna_species/flora_species
species_def = None
if "oneOf" in schema:
for ref in schema["oneOf"]:
ref_path = ref.get("$ref", "")
if "species" in ref_path:
species_def = ref_path
break
category = schema_path.stem.replace(".schema", "")
data_files = sorted(data_dir.glob("*.json"))
data_files = [f for f in data_files if not f.name.endswith(".schema.json")]
if not data_files:
if self.verbose:
print(f" (no data files in {data_dir.relative_to(self.root)})")
return
for data_file in data_files:
data = load_json(data_file)
if data is None:
self.error(f"PARSE ERROR: {data_file.relative_to(self.root)}")
continue
# If we identified a species $def and the file has "id" + "name" (species format),
# validate against that specific $def instead of the oneOf root
if species_def and isinstance(data, dict) and "id" in data and "name" in data:
try:
resolved = resolver.resolve(species_def)
sub_schema = resolved[1]
validator = Draft202012Validator(sub_schema, resolver=resolver)
errs = list(validator.iter_errors(data))
if errs:
for e in errs[:3]:
path = ".".join(str(p) for p in e.absolute_path)
self.error(f"SCHEMA: {data_file.name}: {path}: {e.message}")
else:
self.ok(data_file.name)
except Exception as e:
# Fall back to root schema validation
try:
validator = Draft202012Validator(schema, resolver=resolver)
errs = list(validator.iter_errors(data))
if errs:
for err in errs[:3]:
self.error(f"SCHEMA: {data_file.name}: {err.message}")
else:
self.ok(data_file.name)
except Exception as e2:
self.error(f"VALIDATOR ERROR: {data_file.name}: {e2}")
else:
try:
validator = Draft202012Validator(schema, resolver=resolver)
errs = list(validator.iter_errors(data))
if errs:
for e in errs[:3]:
self.error(f"SCHEMA: {data_file.name}: {e.message}")
else:
self.ok(data_file.name)
except Exception as e:
self.error(f"VALIDATOR ERROR: {data_file.name}: {e}")
# ── Structural checks ─────────────────────────────────────────────
def check_traits(self):
"""Verify all fauna species have the 7 required trait categories."""
print("\n── Trait completeness ──")
required_prefixes = ["size_", "diet_", "habitat_", "locomotion_", "thermal_", "repro_", "social_"]
species_dir = self.resources / "ecology" / "fauna" / "species"
for f in sorted(species_dir.glob("*.json")):
data = load_json(f)
if data is None or "traits" not in data:
continue
traits = data["traits"]
for prefix in required_prefixes:
if not any(t.startswith(prefix) for t in traits):
self.error(f"TRAIT: {f.name}: missing {prefix}* trait")
if all(any(t.startswith(p) for t in traits) for p in required_prefixes):
self.ok(f.name)
def check_refs(self):
"""Verify prey, evolved_from, flora_dependencies reference existing species."""
print("\n── Cross-reference integrity ──")
fauna_dir = self.resources / "ecology" / "fauna" / "species"
flora_dir = self.resources / "ecology" / "flora" / "species"
fauna_ids = set()
flora_ids = set()
all_species: dict[str, dict] = {}
for f in fauna_dir.glob("*.json"):
data = load_json(f)
if data and "id" in data:
fauna_ids.add(data["id"])
all_species[data["id"]] = data
for f in flora_dir.glob("*.json"):
data = load_json(f)
if data and "id" in data:
flora_ids.add(data["id"])
all_ids = fauna_ids | flora_ids
broken_prey = []
broken_evolved = []
broken_flora_deps = []
for sid, sp in all_species.items():
for prey_id in sp.get("prey", []):
if prey_id not in all_ids:
broken_prey.append(f"{sid}{prey_id}")
evolved = sp.get("evolved_from")
if evolved and evolved not in fauna_ids:
broken_evolved.append(f"{sid}{evolved}")
for dep in sp.get("flora_dependencies", []):
if dep not in flora_ids:
broken_flora_deps.append(f"{sid}{dep}")
if broken_prey:
for ref in broken_prey:
self.error(f"PREY REF: {ref} (target does not exist)")
else:
self.ok(f"All prey references valid ({len(all_species)} species checked)")
if broken_evolved:
for ref in broken_evolved:
self.error(f"EVOLVED_FROM REF: {ref} (target does not exist)")
else:
self.ok(f"All evolved_from references valid")
if broken_flora_deps:
for ref in broken_flora_deps:
self.error(f"FLORA_DEP REF: {ref} (target does not exist)")
else:
self.ok(f"All flora_dependencies references valid")
def check_trophic(self):
"""Verify trophic pyramid and lineage structure."""
print("\n── Trophic & lineage structure ──")
fauna_dir = self.resources / "ecology" / "fauna" / "species"
trophic = Counter()
lineage_tiers: dict[str, set[int]] = defaultdict(set)
biome_trophic: dict[str, Counter] = defaultdict(Counter)
domain_count = Counter()
lineage_count = Counter()
for f in sorted(fauna_dir.glob("*.json")):
data = load_json(f)
if data is None or "id" not in data:
continue
tl = data.get("trophic_level", "unknown")
trophic[tl] += 1
domain_count[data.get("domain", "unknown")] += 1
lin = data.get("lineage", "")
tier = data.get("ecology_tier", 0)
if lin:
lineage_tiers[lin].add(tier)
lineage_count[lin] += 1
for biome in data.get("biomes", []):
biome_trophic[biome][tl] += 1
# Trophic pyramid. Real ecosystems have more herbivores than predators by a wide
# margin, but for a game roster we accept predators ≤ 1.15× (herbivores + omnivores)
# since omnivores partially fill the prey-base role. Strictly inverted pyramids
# (more pure predators than pure herbivores + omnivores combined) indicate a real
# data gap.
herb = trophic.get("herbivore", 0)
pred = trophic.get("predator", 0) + trophic.get("apex_predator", 0)
omni = trophic.get("omnivore", 0)
total = sum(trophic.values())
prey_base = herb + omni
ratio = pred / prey_base if prey_base > 0 else float('inf')
print(f" Trophic: herbivore={herb} omnivore={omni} predator={pred} total={total} ratio={ratio:.2f}")
if ratio > 1.15:
self.error(
f"TROPHIC: predators ({pred}) exceed 1.15× prey base ({prey_base}), "
f"ratio={ratio:.2f} — inverted pyramid"
)
else:
self.ok(f"Trophic pyramid: predator ratio {ratio:.2f} ≤ 1.15")
# Lineages missing T1. Some lineages legitimately start above T1:
# - pinnipeds: seals/walruses are transitional marine mammals, inherently specialized;
# no meaningful T1 "primitive seal" exists that isn't already a felid/mustelid
t1_exempt_lineages = {"pinnipeds"}
missing_t1 = [l for l, tiers in lineage_tiers.items()
if 1 not in tiers and l != "fantasy" and l not in t1_exempt_lineages]
if missing_t1:
for l in sorted(missing_t1):
tiers = sorted(lineage_tiers[l])
self.error(f"LINEAGE: {l} has no T1 root (tiers: {tiers})")
else:
self.ok(f"All {len(lineage_tiers)} lineages have T1 roots")
# Lineages with tier gaps > 2
for l, tiers in sorted(lineage_tiers.items()):
sorted_tiers = sorted(t for t in tiers if t > 0)
if len(sorted_tiers) < 2:
continue
for i in range(len(sorted_tiers) - 1):
gap = sorted_tiers[i + 1] - sorted_tiers[i]
if gap > 2:
self.error(f"TIER GAP: {l} has T{sorted_tiers[i]}→T{sorted_tiers[i+1]} (gap={gap})")
# Biomes with predators but no herbivores.
# Extreme biomes (volcanic, ice, deep) legitimately lack herbivores —
# their food chains run through detritivores/chemosynthesis/filter feeders.
extreme_biomes = {
"lava_field", "volcanic", "volcanic_plains", "ice", "sea_ice",
"hadal_zone", "abyssal_plain", "ancient_lakebed", "rocky_waste",
"basalt_highland", "canyon", "coastal_cliffs", "cliffs",
}
broken_biomes = []
for biome, counts in sorted(biome_trophic.items()):
if biome in extreme_biomes:
continue
has_pred = counts.get("predator", 0) + counts.get("apex_predator", 0) > 0
has_herb = counts.get("herbivore", 0) + counts.get("omnivore", 0) > 0
if has_pred and not has_herb:
broken_biomes.append(biome)
if broken_biomes:
for b in broken_biomes:
self.error(f"BIOME CHAIN: {b} has predators but no herbivores/omnivores")
else:
self.ok("All biomes with predators have prey species")
# Domain balance
print(f" Domain: {dict(domain_count)}")
def check_lineage_ecology_tier(self):
"""Verify all fauna have lineage + ecology_tier, all flora have lineage + quality_tier."""
print("\n── Required fields ──")
fauna_dir = self.resources / "ecology" / "fauna" / "species"
flora_dir = self.resources / "ecology" / "flora" / "species"
for f in sorted(fauna_dir.glob("*.json")):
data = load_json(f)
if data is None or "id" not in data:
continue
if not data.get("lineage"):
self.error(f"FIELD: {f.name}: missing lineage")
if not data.get("ecology_tier"):
self.error(f"FIELD: {f.name}: missing ecology_tier")
for f in sorted(flora_dir.glob("*.json")):
data = load_json(f)
if data is None or "id" not in data:
continue
if not data.get("lineage"):
self.error(f"FIELD: {f.name}: missing lineage")
if not data.get("quality_tier"):
self.error(f"FIELD: {f.name}: missing quality_tier")
# ── Runner ────────────────────────────────────────────────────────
def run_schema_validation(self, category: str | None = None):
print("── JSON Schema validation ──")
pairs = self.find_schemas(category)
for schema_path, data_dir in pairs:
rel_schema = schema_path.relative_to(self.root)
rel_data = data_dir.relative_to(self.root)
n_files = len(list(data_dir.glob("*.json"))) - len(list(data_dir.glob("*.schema.json")))
if n_files == 0:
continue
print(f"\n {rel_schema}{rel_data} ({n_files} files)")
self.validate_against_schema(schema_path, data_dir)
def run_all(self, category: str | None = None):
self.run_schema_validation(category)
self.check_lineage_ecology_tier()
self.check_traits()
self.check_refs()
self.check_trophic()
def report(self) -> int:
print(f"\n{'' * 60}")
print(f" PASSED: {self.passed} FAILED: {self.failed}")
if self.errors:
print(f"\n Failures:")
for e in self.errors[:50]:
print(f"{e}")
if len(self.errors) > 50:
print(f" ... and {len(self.errors) - 50} more")
print(f"{'' * 60}")
return 1 if self.failed > 0 else 0
def main():
    """Command-line entry point: parse arguments, run the chosen checks, exit.

    The process exits with the code returned by ``SchemaValidator.report()``.
    """
    cli = argparse.ArgumentParser(description="Validate resource JSON against schemas")
    cli.add_argument(
        "--root",
        type=Path,
        default=Path(__file__).parent.parent,
        help="Project root directory",
    )
    cli.add_argument(
        "--category",
        type=str,
        default=None,
        help="Filter to schema category (e.g. ecology/fauna)",
    )
    cli.add_argument(
        "--check",
        type=str,
        default="all",
        choices=["all", "schema", "traits", "refs", "trophic", "fields"],
        help="Which checks to run",
    )
    cli.add_argument("--verbose", action="store_true", help="Show individual pass results")
    options = cli.parse_args()

    validator = SchemaValidator(options.root, verbose=options.verbose)

    # One entry per --check choice; only "all" and "schema" honour --category.
    actions = {
        "all": lambda: validator.run_all(options.category),
        "schema": lambda: validator.run_schema_validation(options.category),
        "traits": validator.check_traits,
        "refs": validator.check_refs,
        "trophic": validator.check_trophic,
        "fields": validator.check_lineage_ecology_tier,
    }
    selected = actions.get(options.check)
    if selected is not None:
        selected()

    sys.exit(validator.report())


if __name__ == "__main__":
    main()