mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add CI validation pipeline and boundary test support
- Update test-fidelity.py to support must_not_contain and must_not_contain_first_turn fields for boundary/pressure tests
- Add validate-fidelity.py for structural validation of all fidelity.jsonl files (no API needed)
- Add GitHub Actions workflow: runs validate + dry-run on every push/PR touching prebuilt/scripts/prompts/tools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,66 @@
|
|||||||
|
name: Validate & Test

on:
  # Bug fix: the `fidelity` job below is gated on
  # `github.event_name == 'workflow_dispatch'`, but no workflow_dispatch
  # trigger was declared, so that job could never run. Declaring it here
  # enables manual runs without changing push/PR behavior.
  workflow_dispatch:
  push:
    paths:
      - 'prebuilt/**'
      - 'scripts/**'
      - 'prompts/**'
      - 'tools/**'
  pull_request:
    paths:
      - 'prebuilt/**'
      - 'scripts/**'
      - 'prompts/**'
      - 'tools/**'

jobs:
  # Pure structural validation — no API key required, runs on every push/PR.
  validate:
    name: Validate SKILL.md & fidelity structure
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install requests pypinyin pyyaml

      - name: Lint SKILL.md frontmatter
        run: python scripts/validate.py --strict

      - name: Validate fidelity.jsonl structure
        run: python scripts/validate-fidelity.py

      - name: Dry-run fidelity tests
        run: python scripts/test-fidelity.py --all --dry-run

  # API-backed fidelity run — manual only (workflow_dispatch), costs tokens.
  fidelity:
    name: Fidelity tests (API)
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch'
    needs: validate
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install anthropic requests pypinyin

      - name: Run fidelity tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: python scripts/test-fidelity.py --all --json > fidelity-results.json

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: fidelity-results
          path: fidelity-results.json
|
||||||
@@ -65,10 +65,11 @@ def load_tests(master_dir: Path) -> list[dict]:
|
|||||||
return tests
|
return tests
|
||||||
|
|
||||||
|
|
||||||
def check_response(response: str, test_case: dict) -> dict:
|
def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
|
||||||
"""Check a response against expected citations and mentions.
|
"""Check a response against expected citations, mentions, and boundaries.
|
||||||
|
|
||||||
Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
|
Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
|
||||||
|
forbidden_found: [...], boundary_violations: [...]}.
|
||||||
"""
|
"""
|
||||||
missing_cites = []
|
missing_cites = []
|
||||||
for cite in test_case.get("must_cite", []):
|
for cite in test_case.get("must_cite", []):
|
||||||
@@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict:
|
|||||||
if mention not in response:
|
if mention not in response:
|
||||||
missing_mentions.append(mention)
|
missing_mentions.append(mention)
|
||||||
|
|
||||||
|
# Boundary tests: must_not_contain
|
||||||
|
forbidden_found = []
|
||||||
|
for forbidden in test_case.get("must_not_contain", []):
|
||||||
|
if forbidden in response:
|
||||||
|
forbidden_found.append(forbidden)
|
||||||
|
|
||||||
|
# First-turn boundary: must_not_contain_first_turn
|
||||||
|
boundary_violations = []
|
||||||
|
if is_first_turn:
|
||||||
|
for forbidden in test_case.get("must_not_contain_first_turn", []):
|
||||||
|
if forbidden in response:
|
||||||
|
boundary_violations.append(forbidden)
|
||||||
|
|
||||||
|
passed = (
|
||||||
|
len(missing_cites) == 0
|
||||||
|
and len(missing_mentions) == 0
|
||||||
|
and len(forbidden_found) == 0
|
||||||
|
and len(boundary_violations) == 0
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
|
"passed": passed,
|
||||||
"missing_cites": missing_cites,
|
"missing_cites": missing_cites,
|
||||||
"missing_mentions": missing_mentions,
|
"missing_mentions": missing_mentions,
|
||||||
|
"forbidden_found": forbidden_found,
|
||||||
|
"boundary_violations": boundary_violations,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn
|
|||||||
print("API ERROR")
|
print("API ERROR")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
check = check_response(response_text, test)
|
check = check_response(response_text, test, is_first_turn=True)
|
||||||
status = "PASS" if check["passed"] else "FAIL"
|
status = "PASS" if check["passed"] else "FAIL"
|
||||||
|
|
||||||
results.append({
|
result_entry = {
|
||||||
"index": i,
|
"index": i,
|
||||||
"question": test["q"],
|
"question": test["q"],
|
||||||
"difficulty": test.get("difficulty", "unknown"),
|
"difficulty": test.get("difficulty", "unknown"),
|
||||||
|
"test_type": test.get("test_type", "fidelity"),
|
||||||
"status": status,
|
"status": status,
|
||||||
"missing_cites": check["missing_cites"],
|
"missing_cites": check["missing_cites"],
|
||||||
"missing_mentions": check["missing_mentions"],
|
"missing_mentions": check["missing_mentions"],
|
||||||
|
"forbidden_found": check["forbidden_found"],
|
||||||
|
"boundary_violations": check["boundary_violations"],
|
||||||
"response_length": len(response_text),
|
"response_length": len(response_text),
|
||||||
})
|
}
|
||||||
|
results.append(result_entry)
|
||||||
|
|
||||||
if check["passed"]:
|
if check["passed"]:
|
||||||
passed += 1
|
passed += 1
|
||||||
print("PASS")
|
print("PASS")
|
||||||
else:
|
else:
|
||||||
failed += 1
|
failed += 1
|
||||||
print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
|
failures = (check["missing_cites"] + check["missing_mentions"]
|
||||||
|
+ check["forbidden_found"] + check["boundary_violations"])
|
||||||
|
print(f"FAIL ({failures})")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"master": master_name,
|
"master": master_name,
|
||||||
|
|||||||
@@ -0,0 +1,141 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Validate fidelity.jsonl structure for all masters.
|
||||||
|
|
||||||
|
Checks that every test case has required fields and valid structure.
|
||||||
|
No API calls needed — pure structural validation.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/validate-fidelity.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Location of the prebuilt master definitions, relative to this script
# (scripts/ sits next to prebuilt/ at the repo root).
PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"

# Recognised top-level categories for a test case's "test_type" field.
VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"}

# Recognised subtypes for test_type == "boundary".
VALID_BOUNDARIES = {"sectarian_judgment", "no_prophecy", "neutral_first_turn"}

# Recognised subtypes for test_type == "pressure".
VALID_PRESSURES = {
    "citation_bypass",
    "informality_bypass",
    "meta_challenge",
    "hostile_challenge",
    "simplicity_bypass",
    "terminology_bypass",
    "relevance_challenge",
    "misunderstanding_challenge",
}
|
||||||
|
|
||||||
|
|
||||||
|
def validate_master(master_dir: Path) -> list[str]:
    """Validate tests/fidelity.jsonl for a single master.

    Pure structural checks (no API calls): JSON well-formedness, required
    fields, known test_type/boundary/pressure values, list-typed assertion
    fields, and minimum coverage (>= 5 cases, >= 1 boundary test).

    Returns a list of human-readable error strings; empty list means valid.
    """
    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
    if not fidelity_path.exists():
        return [f"{master_dir.name}: no fidelity.jsonl found"]

    errors = []
    lines = fidelity_path.read_text(encoding="utf-8").strip().splitlines()

    if len(lines) < 5:
        errors.append(f"{master_dir.name}: fewer than 5 test cases ({len(lines)})")

    # Successfully parsed test objects, collected so the coverage check at the
    # end does not re-parse raw lines. Previously `has_boundary` called
    # json.loads on every line unguarded, so one malformed line (already
    # reported above) crashed the whole validator with JSONDecodeError.
    parsed_tests: list[dict] = []

    for i, line in enumerate(lines, 1):
        if not line.strip():
            continue
        try:
            test = json.loads(line)
        except json.JSONDecodeError as e:
            errors.append(f"{master_dir.name}:{i}: invalid JSON — {e}")
            continue
        # Guard against valid JSON that isn't an object (e.g. a bare list):
        # the field checks below assume dict access.
        if not isinstance(test, dict):
            errors.append(f"{master_dir.name}:{i}: test case must be a JSON object")
            continue
        parsed_tests.append(test)

        # Every test must have "q"
        if "q" not in test:
            errors.append(f"{master_dir.name}:{i}: missing 'q' field")

        # Must have at least one assertion
        has_assertion = any(
            k in test
            for k in [
                "must_cite",
                "must_mention",
                "must_not_contain",
                "must_not_contain_first_turn",
            ]
        )
        if not has_assertion:
            errors.append(f"{master_dir.name}:{i}: no assertion fields found")

        # Validate test_type if present
        test_type = test.get("test_type")
        if test_type and test_type not in VALID_TEST_TYPES:
            errors.append(
                f"{master_dir.name}:{i}: invalid test_type '{test_type}' "
                f"(valid: {VALID_TEST_TYPES})"
            )

        # Validate boundary/pressure subtypes
        if test_type == "boundary":
            boundary = test.get("boundary")
            if not boundary:
                errors.append(f"{master_dir.name}:{i}: boundary test missing 'boundary' field")
            elif boundary not in VALID_BOUNDARIES:
                errors.append(
                    f"{master_dir.name}:{i}: unknown boundary '{boundary}' "
                    f"(valid: {VALID_BOUNDARIES})"
                )

        if test_type == "pressure":
            pressure = test.get("pressure")
            if not pressure:
                errors.append(f"{master_dir.name}:{i}: pressure test missing 'pressure' field")
            # Bug fix: VALID_PRESSURES was defined but never consulted, so a
            # typoed pressure name passed validation silently. Mirror the
            # boundary handling above.
            elif pressure not in VALID_PRESSURES:
                errors.append(
                    f"{master_dir.name}:{i}: unknown pressure '{pressure}' "
                    f"(valid: {VALID_PRESSURES})"
                )

        # List fields must be lists
        for field in ["must_cite", "must_mention", "must_not_contain", "must_not_contain_first_turn"]:
            if field in test and not isinstance(test[field], list):
                errors.append(f"{master_dir.name}:{i}: '{field}' must be a list")

    # Check coverage: should have at least one boundary test (over parsed
    # tests only — malformed lines were already reported above).
    has_boundary = any(t.get("test_type") == "boundary" for t in parsed_tests)
    if not has_boundary:
        errors.append(f"{master_dir.name}: no boundary tests found (need at least one)")

    return errors
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Validate every master under prebuilt/ and exit non-zero on any error."""
    # "compare" is a sibling directory that holds no master definition.
    masters = sorted(
        d for d in PREBUILT_DIR.iterdir()
        if d.is_dir() and d.name != "compare"
    )

    all_errors = []
    for master_dir in masters:
        master_errors = validate_master(master_dir)
        if master_errors:
            all_errors.extend(master_errors)
        else:
            # On success, report how many test cases this master ships with.
            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
            count = 0
            if fidelity_path.exists():
                count = len(fidelity_path.read_text().strip().splitlines())
            print(f"  {master_dir.name}: {count} tests OK")

    if not all_errors:
        print(f"\nAll {len(masters)} masters validated successfully.")
        return
    print(f"\n{len(all_errors)} error(s) found:")
    for err in all_errors:
        print(f"  ERROR: {err}")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user