feat: add CI validation pipeline and boundary test support

- Update test-fidelity.py to support must_not_contain and must_not_contain_first_turn fields for boundary/pressure tests - Add validate-fidelity.py for structural validation of all fidelity.jsonl files (no API needed) - Add GitHub Actions workflow: runs validate + dry-run on every push/PR touching prebuilt/scripts/prompts/tools Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-10 05:16:25 +00:00 · 2026-04-08 21:30:23 +08:00
parent da80665fa5
commit c654e7440f
3 changed files with 244 additions and 8 deletions
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Validate fidelity.jsonl structure for all masters.
+
+Checks that every test case has required fields and valid structure.
+No API calls needed — pure structural validation.
+
+Usage:
+    python scripts/validate-fidelity.py
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
+
+VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"}
+VALID_BOUNDARIES = {
+    "sectarian_judgment",
+    "no_prophecy",
+    "neutral_first_turn",
+}
+VALID_PRESSURES = {
+    "citation_bypass",
+    "informality_bypass",
+    "meta_challenge",
+    "hostile_challenge",
+    "simplicity_bypass",
+    "terminology_bypass",
+    "relevance_challenge",
+    "misunderstanding_challenge",
+}
+
+
+def validate_master(master_dir: Path) -> list[str]:
+    """Validate fidelity.jsonl for a single master. Returns list of errors."""
+    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+    if not fidelity_path.exists():
+        return [f"{master_dir.name}: no fidelity.jsonl found"]
+
+    errors = []
+    lines = fidelity_path.read_text(encoding="utf-8").strip().splitlines()
+
+    if len(lines) < 5:
+        errors.append(f"{master_dir.name}: fewer than 5 test cases ({len(lines)})")
+
+    for i, line in enumerate(lines, 1):
+        if not line.strip():
+            continue
+        try:
+            test = json.loads(line)
+        except json.JSONDecodeError as e:
+            errors.append(f"{master_dir.name}:{i}: invalid JSON — {e}")
+            continue
+
+        # Every test must have "q"
+        if "q" not in test:
+            errors.append(f"{master_dir.name}:{i}: missing 'q' field")
+
+        # Must have at least one assertion
+        has_assertion = any(
+            k in test
+            for k in [
+                "must_cite",
+                "must_mention",
+                "must_not_contain",
+                "must_not_contain_first_turn",
+            ]
+        )
+        if not has_assertion:
+            errors.append(f"{master_dir.name}:{i}: no assertion fields found")
+
+        # Validate test_type if present
+        test_type = test.get("test_type")
+        if test_type and test_type not in VALID_TEST_TYPES:
+            errors.append(
+                f"{master_dir.name}:{i}: invalid test_type '{test_type}' "
+                f"(valid: {VALID_TEST_TYPES})"
+            )
+
+        # Validate boundary/pressure subtypes
+        if test_type == "boundary":
+            boundary = test.get("boundary")
+            if not boundary:
+                errors.append(f"{master_dir.name}:{i}: boundary test missing 'boundary' field")
+            elif boundary not in VALID_BOUNDARIES:
+                errors.append(
+                    f"{master_dir.name}:{i}: unknown boundary '{boundary}' "
+                    f"(valid: {VALID_BOUNDARIES})"
+                )
+
+        if test_type == "pressure":
+            pressure = test.get("pressure")
+            if not pressure:
+                errors.append(f"{master_dir.name}:{i}: pressure test missing 'pressure' field")
+
+        # List fields must be lists
+        for field in ["must_cite", "must_mention", "must_not_contain", "must_not_contain_first_turn"]:
+            if field in test and not isinstance(test[field], list):
+                errors.append(f"{master_dir.name}:{i}: '{field}' must be a list")
+
+    # Check coverage: should have at least one boundary test
+    has_boundary = any(
+        json.loads(l).get("test_type") == "boundary"
+        for l in lines
+        if l.strip()
+    )
+    if not has_boundary:
+        errors.append(f"{master_dir.name}: no boundary tests found (need at least one)")
+
+    return errors
+
+
+def main():
+    all_errors = []
+    masters = sorted(
+        d for d in PREBUILT_DIR.iterdir()
+        if d.is_dir() and d.name != "compare"
+    )
+
+    for master_dir in masters:
+        errors = validate_master(master_dir)
+        all_errors.extend(errors)
+        if not errors:
+            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+            count = len(fidelity_path.read_text().strip().splitlines()) if fidelity_path.exists() else 0
+            print(f"  {master_dir.name}: {count} tests OK")
+
+    if all_errors:
+        print(f"\n{len(all_errors)} error(s) found:")
+        for err in all_errors:
+            print(f"  ERROR: {err}")
+        sys.exit(1)
+    else:
+        print(f"\nAll {len(masters)} masters validated successfully.")
+
+
+if __name__ == "__main__":
+    main()