feat: add CI validation pipeline and boundary test support

- Update test-fidelity.py to support the must_not_contain and must_not_contain_first_turn fields for boundary/pressure tests
- Add validate-fidelity.py for structural validation of all fidelity.jsonl files (no API needed)
- Add GitHub Actions workflow: runs validate + dry-run on every push/PR touching prebuilt/scripts/prompts/tools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-10 05:16:25 +00:00 · 2026-04-08 21:30:23 +08:00
parent da80665fa5
commit c654e7440f
3 changed files with 244 additions and 8 deletions
@@ -65,10 +65,11 @@ def load_tests(master_dir: Path) -> list[dict]:
    return tests


-def check_response(response: str, test_case: dict) -> dict:
-    """Check a response against expected citations and mentions.
+def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
+    """Check a response against expected citations, mentions, and boundaries.

-    Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
+    Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
+             forbidden_found: [...], boundary_violations: [...]}.
    """
    missing_cites = []
    for cite in test_case.get("must_cite", []):
@@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict:
        if mention not in response:
            missing_mentions.append(mention)

+    # Boundary tests: must_not_contain
+    forbidden_found = []
+    for forbidden in test_case.get("must_not_contain", []):
+        if forbidden in response:
+            forbidden_found.append(forbidden)
+
+    # First-turn boundary: must_not_contain_first_turn
+    boundary_violations = []
+    if is_first_turn:
+        for forbidden in test_case.get("must_not_contain_first_turn", []):
+            if forbidden in response:
+                boundary_violations.append(forbidden)
+
+    passed = (
+        len(missing_cites) == 0
+        and len(missing_mentions) == 0
+        and len(forbidden_found) == 0
+        and len(boundary_violations) == 0
+    )
+
    return {
-        "passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
+        "passed": passed,
        "missing_cites": missing_cites,
        "missing_mentions": missing_mentions,
+        "forbidden_found": forbidden_found,
+        "boundary_violations": boundary_violations,
    }


@@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn
            print("API ERROR")
            continue

-        check = check_response(response_text, test)
+        check = check_response(response_text, test, is_first_turn=True)
        status = "PASS" if check["passed"] else "FAIL"

-        results.append({
+        result_entry = {
            "index": i,
            "question": test["q"],
            "difficulty": test.get("difficulty", "unknown"),
+            "test_type": test.get("test_type", "fidelity"),
            "status": status,
            "missing_cites": check["missing_cites"],
            "missing_mentions": check["missing_mentions"],
+            "forbidden_found": check["forbidden_found"],
+            "boundary_violations": check["boundary_violations"],
            "response_length": len(response_text),
-        })
+        }
+        results.append(result_entry)

        if check["passed"]:
            passed += 1
            print("PASS")
        else:
            failed += 1
-            print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
+            failures = (check["missing_cites"] + check["missing_mentions"]
+                        + check["forbidden_found"] + check["boundary_violations"])
+            print(f"FAIL ({failures})")

    return {
        "master": master_name,
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Validate fidelity.jsonl structure for all masters.
+
+Checks that every test case has required fields and valid structure.
+No API calls needed — pure structural validation.
+
+Usage:
+    python scripts/validate-fidelity.py
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+# The "prebuilt" directory that sits next to this script's parent directory
+# (resolved from this file's own location, not the current working directory).
+# main() treats each of its sub-directories as one master.
+PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
+
+# Accepted values for a test case's optional "test_type" field.
+VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"}
+# Accepted values for the "boundary" field of test cases whose test_type is
+# "boundary".
+VALID_BOUNDARIES = {
+    "sectarian_judgment",
+    "no_prophecy",
+    "neutral_first_turn",
+}
+# Known values for the "pressure" field of pressure tests.
+# NOTE(review): nothing in this file appears to check against this set —
+# confirm whether it is meant to be enforced.
+VALID_PRESSURES = {
+    "citation_bypass",
+    "informality_bypass",
+    "meta_challenge",
+    "hostile_challenge",
+    "simplicity_bypass",
+    "terminology_bypass",
+    "relevance_challenge",
+    "misunderstanding_challenge",
+}
+
+
+def validate_master(master_dir: Path) -> list[str]:
+    """Validate ``tests/fidelity.jsonl`` for a single master.
+
+    Every non-blank line of the file is parsed as one JSON test case and
+    checked for:
+
+    * being a JSON object with a ``q`` field;
+    * at least one assertion field (``must_cite``, ``must_mention``,
+      ``must_not_contain``, ``must_not_contain_first_turn``);
+    * a recognised ``test_type`` when one is given;
+    * a recognised ``boundary`` on boundary tests and a non-empty
+      ``pressure`` on pressure tests;
+    * assertion fields being JSON lists.
+
+    The file as a whole must contain at least five test cases and at least
+    one boundary test.
+
+    Args:
+        master_dir: Directory of one master; the file is read from
+            ``master_dir / "tests" / "fidelity.jsonl"``.
+
+    Returns:
+        Human-readable error strings, each prefixed with the master's
+        directory name (plus the 1-based file line number for per-line
+        problems). An empty list means every check passed.
+    """
+    assertion_fields = (
+        "must_cite",
+        "must_mention",
+        "must_not_contain",
+        "must_not_contain_first_turn",
+    )
+
+    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+    if not fidelity_path.exists():
+        return [f"{master_dir.name}: no fidelity.jsonl found"]
+
+    errors: list[str] = []
+    # Keep the raw lines (no whole-file strip) so the reported line numbers
+    # match the file even when it begins with blank lines.
+    lines = fidelity_path.read_text(encoding="utf-8").splitlines()
+
+    # Blank lines are skipped below, so they must not count as test cases.
+    test_count = sum(1 for line in lines if line.strip())
+    if test_count < 5:
+        errors.append(f"{master_dir.name}: fewer than 5 test cases ({test_count})")
+
+    # Tracked while parsing so malformed lines are never parsed a second time
+    # (an unguarded re-parse would crash on the very lines reported above).
+    has_boundary = False
+
+    for i, line in enumerate(lines, 1):
+        if not line.strip():
+            continue
+        try:
+            test = json.loads(line)
+        except json.JSONDecodeError as e:
+            errors.append(f"{master_dir.name}:{i}: invalid JSON — {e}")
+            continue
+
+        # Valid JSON that is not an object (list, string, number, ...) has no
+        # fields to inspect; report it rather than failing on .get() below.
+        if not isinstance(test, dict):
+            errors.append(f"{master_dir.name}:{i}: test case must be a JSON object")
+            continue
+
+        # Every test must have "q"
+        if "q" not in test:
+            errors.append(f"{master_dir.name}:{i}: missing 'q' field")
+
+        # Must have at least one assertion
+        if not any(field in test for field in assertion_fields):
+            errors.append(f"{master_dir.name}:{i}: no assertion fields found")
+
+        # Validate test_type if present. The isinstance guard keeps an
+        # unhashable value (e.g. a list) from raising on the set lookup.
+        test_type = test.get("test_type")
+        if test_type and (
+            not isinstance(test_type, str) or test_type not in VALID_TEST_TYPES
+        ):
+            errors.append(
+                f"{master_dir.name}:{i}: invalid test_type '{test_type}' "
+                f"(valid: {VALID_TEST_TYPES})"
+            )
+
+        # Validate boundary/pressure subtypes
+        if test_type == "boundary":
+            has_boundary = True
+            boundary = test.get("boundary")
+            if not boundary:
+                errors.append(f"{master_dir.name}:{i}: boundary test missing 'boundary' field")
+            elif not isinstance(boundary, str) or boundary not in VALID_BOUNDARIES:
+                errors.append(
+                    f"{master_dir.name}:{i}: unknown boundary '{boundary}' "
+                    f"(valid: {VALID_BOUNDARIES})"
+                )
+
+        if test_type == "pressure":
+            # NOTE(review): the value is only required to be present; unlike
+            # 'boundary' it is not compared against VALID_PRESSURES — confirm
+            # whether that is intentional.
+            pressure = test.get("pressure")
+            if not pressure:
+                errors.append(f"{master_dir.name}:{i}: pressure test missing 'pressure' field")
+
+        # List fields must be lists
+        for field in assertion_fields:
+            if field in test and not isinstance(test[field], list):
+                errors.append(f"{master_dir.name}:{i}: '{field}' must be a list")
+
+    # Check coverage: should have at least one boundary test
+    if not has_boundary:
+        errors.append(f"{master_dir.name}: no boundary tests found (need at least one)")
+
+    return errors
+
+
+def main() -> None:
+    """Validate every master under ``PREBUILT_DIR`` and print a report.
+
+    Each sub-directory of ``PREBUILT_DIR`` except ``compare`` is passed to
+    ``validate_master``. Masters without errors are listed with their test
+    count; all collected errors are printed at the end.
+
+    Exits with status 1 when ``PREBUILT_DIR`` is missing or any error was
+    found; otherwise returns normally after printing a success summary.
+    """
+    # Fail with a readable message instead of a traceback from iterdir().
+    if not PREBUILT_DIR.is_dir():
+        print(f"ERROR: prebuilt directory not found: {PREBUILT_DIR}")
+        sys.exit(1)
+
+    all_errors = []
+    masters = sorted(
+        d for d in PREBUILT_DIR.iterdir()
+        if d.is_dir() and d.name != "compare"
+    )
+
+    for master_dir in masters:
+        errors = validate_master(master_dir)
+        all_errors.extend(errors)
+        if not errors:
+            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+            count = 0
+            if fidelity_path.exists():
+                # Read as UTF-8 explicitly (the locale default may not decode
+                # the file) and count only non-blank lines, since blank lines
+                # are not test cases.
+                text = fidelity_path.read_text(encoding="utf-8")
+                count = sum(1 for line in text.splitlines() if line.strip())
+            print(f"  {master_dir.name}: {count} tests OK")
+
+    if all_errors:
+        print(f"\n{len(all_errors)} error(s) found:")
+        for err in all_errors:
+            print(f"  ERROR: {err}")
+        sys.exit(1)
+    else:
+        print(f"\nAll {len(masters)} masters validated successfully.")
+
+
+if __name__ == "__main__":
+    main()