magicciv/tools/autoplay-validate.py

#!/usr/bin/env python3
"""
JSON Schema validator for autoplay output files.

Implements the subset of draft-07 used by the schemas:
  type, required, additionalProperties, properties, propertyNames.pattern,
  minimum, enum, items, pattern, $ref (local only).

stdlib only — no pip installs.

Usage:
    # Validate a single JSON file against a named schema:
    python3 tools/autoplay-validate.py --schema meta path/to/meta.json

    # Validate every line of a JSONL file independently:
    python3 tools/autoplay-validate.py --schema turn-stats-line --jsonl path/to/turn_stats.jsonl

    # Legacy: validate against the flat result schema (default):
    python3 tools/autoplay-validate.py path/to/result.json

Exits 0 if all valid, 1 with errors to stderr, 2 on usage error.

Available schema names (--schema):
    turn-stats-line   tools/schemas/autoplay/turn-stats-line.json
    meta              tools/schemas/autoplay/meta.json
    events-line       tools/schemas/autoplay/events-line.json
    save              tools/schemas/autoplay/save.json
    result            tools/autoplay-result-schema.json  (legacy flat schema)
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path
from typing import Any

# Directory containing this script; the legacy flat result schema sits beside it.
TOOLS_DIR = Path(__file__).parent
# Directory containing the per-artifact autoplay schemas.
SCHEMAS_DIR = TOOLS_DIR.joinpath("schemas", "autoplay")

# Schema name (as accepted by --schema) -> schema file on disk.
SCHEMA_PATHS: dict[str, Path] = {
    "result": TOOLS_DIR / "autoplay-result-schema.json",
    **{
        schema_name: SCHEMAS_DIR / f"{schema_name}.json"
        for schema_name in ("turn-stats-line", "meta", "events-line", "save")
    },
}

# Schema used when the caller does not name one.
_DEFAULT_SCHEMA = "result"


def load_schema(name: str = _DEFAULT_SCHEMA) -> dict[str, Any]:
    """Load and parse the JSON schema registered under *name*.

    Args:
        name: Key into ``SCHEMA_PATHS``.

    Returns:
        The parsed schema document.

    Raises:
        ValueError: If *name* is not a registered schema, or the schema file
            does not contain valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
        OSError: If the schema file cannot be opened.
    """
    path = SCHEMA_PATHS.get(name)
    if path is None:
        raise ValueError(
            f"unknown schema {name!r}. Available: {', '.join(sorted(SCHEMA_PATHS))}"
        )
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with path.open(encoding="utf-8") as f:
        return json.load(f)


# JSON Schema primitive type name -> Python type(s) tested with isinstance().
# bool is a subclass of int, so "integer"/"number" would also match booleans
# through this table alone.
_TYPE_CHECKS: dict[str, type | tuple[type, ...]] = dict(
    object=dict,
    array=list,
    string=str,
    integer=int,
    number=(int, float),
    boolean=bool,
    null=type(None),
)


def _resolve_ref(ref: str, root: dict[str, Any]) -> dict[str, Any]:
    """Resolve a local ``$ref`` (a ``#``-rooted JSON Pointer) against *root*.

    Args:
        ref: Reference string; either ``"#"`` (the whole document) or
            ``"#/seg/seg/..."``.
        root: Top-level schema document the pointer is evaluated against.

    Returns:
        The node of *root* that the pointer designates.

    Raises:
        ValueError: If *ref* is not a local reference, or a segment of the
            pointer is missing from the document.
    """
    # A bare "#" is the empty JSON Pointer: it designates the document itself.
    if ref == "#":
        return root
    if not ref.startswith("#/"):
        raise ValueError(f"only local refs supported, got {ref!r}")
    node: Any = root
    for raw_part in ref[2:].split("/"):
        # RFC 6901 unescaping; "~1" must be decoded before "~0" so that the
        # sequence "~01" yields the literal "~1" rather than "/".
        part = raw_part.replace("~1", "/").replace("~0", "~")
        if not isinstance(node, dict) or part not in node:
            raise ValueError(f"ref {ref!r} does not resolve")
        node = node[part]
    return node


def _validate(
    value: Any, schema: dict[str, Any], root: dict[str, Any], path: str
) -> list[str]:
    """Validate *value* against *schema* and return the collected errors.

    Args:
        value: Decoded JSON instance (or sub-instance) being checked.
        schema: Schema node to apply. If it contains ``$ref``, the referenced
            node replaces it and sibling keywords are ignored.
        root: Top-level schema document, used to resolve ``$ref``.
        path: Location of *value* within the document; prefixes every error.

    Returns:
        Error strings in discovery order; an empty list means *value* is
        valid. A type mismatch short-circuits: no further keywords are
        checked for that node.

    Raises:
        ValueError: Propagated from ``_resolve_ref`` when a ``$ref`` is not
            local or does not resolve.
    """
    errors: list[str] = []

    if "$ref" in schema:
        schema = _resolve_ref(schema["$ref"], root)

    t = schema.get("type")
    if t is not None:
        # A non-string "type" (e.g. a list of types) is outside the supported
        # subset; report it instead of crashing on an unhashable dict lookup.
        expected = _TYPE_CHECKS.get(t) if isinstance(t, str) else None
        if expected is None:
            errors.append(f"{path}: unknown schema type {t!r}")
            return errors
        # bool is a subclass of int in Python; reject booleans as numbers.
        if t in ("integer", "number") and isinstance(value, bool):
            errors.append(f"{path}: expected {t}, got boolean")
            return errors
        if t == "integer" and isinstance(value, float):
            # draft-07: any number with a zero fractional part is an integer,
            # so 1.0 is accepted while 1.5 is not.
            if not value.is_integer():
                errors.append(f"{path}: expected integer, got float {value}")
                return errors
        elif not isinstance(value, expected):
            errors.append(f"{path}: expected {t}, got {type(value).__name__}")
            return errors

    if "enum" in schema:
        allowed = schema["enum"]
        # Python treats True == 1 and False == 0; JSON does not, so a match
        # also requires both sides to agree on being boolean.
        if not any(
            isinstance(candidate, bool) == isinstance(value, bool)
            and candidate == value
            for candidate in allowed
        ):
            errors.append(f"{path}: {value!r} not in enum {allowed}")

    # Numeric keywords apply to JSON numbers only, never to booleans.
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if "minimum" in schema and is_number:
        if value < schema["minimum"]:
            errors.append(f"{path}: {value} < minimum {schema['minimum']}")

    if "pattern" in schema and isinstance(value, str):
        # JSON Schema patterns are not implicitly anchored: search, not match.
        if re.search(schema["pattern"], value) is None:
            errors.append(
                f"{path}: {value!r} does not match pattern {schema['pattern']!r}"
            )

    # Object keywords apply to any object instance, whether or not the schema
    # also declares "type": "object".
    if isinstance(value, dict):
        props: dict[str, Any] = schema.get("properties", {})
        required: list[str] = schema.get("required", [])
        additional: bool | dict[str, Any] = schema.get("additionalProperties", True)
        prop_names: dict[str, Any] | None = schema.get("propertyNames")

        for req in required:
            if req not in value:
                errors.append(f"{path}: missing required property {req!r}")

        for k, v in value.items():
            kpath = f"{path}.{k}"
            if prop_names is not None:
                # The key itself is validated as a string instance.
                errors.extend(_validate(k, prop_names, root, f"{kpath}<key>"))
            if k in props:
                errors.extend(_validate(v, props[k], root, kpath))
            elif additional is False:
                errors.append(f"{path}: unexpected property {k!r}")
            elif isinstance(additional, dict):
                errors.extend(_validate(v, additional, root, kpath))

    # Likewise, "items" applies to any array instance.
    if isinstance(value, list):
        item_schema = schema.get("items")
        if item_schema is not None:
            for i, item in enumerate(value):
                errors.extend(_validate(item, item_schema, root, f"{path}[{i}]"))

    return errors


def validate(data: Any, schema: dict[str, Any] | None = None) -> list[str]:
    """Check *data* against *schema* and return the error messages found.

    When *schema* is ``None`` the default schema is loaded via
    ``load_schema()``. The schema doubles as the root document for ``$ref``
    resolution. An empty list means *data* is valid.
    """
    if schema is None:
        schema = load_schema()
    return _validate(data, schema, schema, "$")


def _validate_file(path: Path, schema: dict[str, Any], jsonl: bool) -> int:
    """Validate one file against *schema*, reporting results on stderr.

    Args:
        path: File to validate.
        schema: Parsed schema passed through to ``validate``.
        jsonl: If true, every non-blank line is validated as an independent
            JSON document; otherwise the whole file is one document.

    Returns:
        The number of errors found. An unreadable file or an unparsable
        document counts as one error. ``<path>: OK`` is printed when the
        count is zero.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError, so it must be named explicitly
        # to keep undecodable input from escaping as a traceback.
        print(f"{path}: cannot read ({e})", file=sys.stderr)
        return 1

    # Build (label, raw JSON text) pairs so both modes share one report loop.
    if jsonl:
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and other separators that may legally appear unescaped inside a JSON
        # string, which would cut a valid record in two. Text-mode reading has
        # already normalised "\r\n" and "\r" to "\n".
        documents = [
            (f"{path}:{lineno}", raw.strip())
            for lineno, raw in enumerate(text.split("\n"), start=1)
        ]
        documents = [(label, raw) for label, raw in documents if raw]
    else:
        documents = [(f"{path}", text)]

    total_errors = 0
    for label, raw in documents:
        try:
            data = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"{label}: invalid JSON ({e})", file=sys.stderr)
            total_errors += 1
            continue
        errs = validate(data, schema)
        if errs:
            total_errors += len(errs)
            print(f"{label}: {len(errs)} error(s)", file=sys.stderr)
            for err in errs:
                print(f"  {err}", file=sys.stderr)

    if total_errors == 0:
        print(f"{path}: OK", file=sys.stderr)

    return total_errors


def _main(argv: list[str]) -> int:
    """Command-line entry point.

    Args:
        argv: Full argument vector; ``argv[0]`` (the program name) is skipped.

    Returns:
        Process exit status: 0 if every file validated, 1 if any error was
        found, 2 on a usage error or when the schema cannot be loaded.
    """
    args = argv[1:]

    schema_name = _DEFAULT_SCHEMA
    jsonl = False
    files: list[str] = []

    # Hand-rolled parsing: supports "--schema NAME", "--schema=NAME" and
    # "--jsonl"; every argument not starting with "-" is a file to validate.
    i = 0
    while i < len(args):
        a = args[i]
        if a == "--schema":
            i += 1
            if i >= len(args):
                print("ERROR: --schema requires a value", file=sys.stderr)
                return 2
            schema_name = args[i]
        elif a == "--jsonl":
            jsonl = True
        elif a.startswith("--schema="):
            schema_name = a[len("--schema="):]
        elif a.startswith("-"):
            print(f"ERROR: unknown flag {a!r}", file=sys.stderr)
            return 2
        else:
            files.append(a)
        i += 1

    if not files:
        print(
            "usage: autoplay-validate.py [--schema NAME] [--jsonl] <file> [<file> ...]",
            file=sys.stderr,
        )
        print(
            f"  schemas: {', '.join(sorted(SCHEMA_PATHS))}",
            file=sys.stderr,
        )
        return 2

    try:
        schema = load_schema(schema_name)
    except (OSError, ValueError) as e:
        # ValueError covers an unknown schema name and malformed schema JSON;
        # OSError covers a missing or unreadable schema file, which would
        # otherwise surface as a traceback.
        print(f"ERROR: {e}", file=sys.stderr)
        return 2

    total_errors = 0
    for f in files:
        total_errors += _validate_file(Path(f), schema, jsonl)

    return 1 if total_errors else 0


# Run the CLI only when executed as a script; _main's return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(_main(sys.argv))