diff --git a/.github/workflows/validate-and-test.yml b/.github/workflows/validate-and-test.yml
new file mode 100644
index 0000000..d7ae908
--- /dev/null
+++ b/.github/workflows/validate-and-test.yml
@@ -0,0 +1,68 @@
+name: Validate & Test
+
+on:
+  push:
+    paths:
+      - 'prebuilt/**'
+      - 'scripts/**'
+      - 'prompts/**'
+      - 'tools/**'
+  pull_request:
+    paths:
+      - 'prebuilt/**'
+      - 'scripts/**'
+      - 'prompts/**'
+      - 'tools/**'
+  # The fidelity job below only runs on manual dispatch, so expose the trigger.
+  workflow_dispatch:
+
+jobs:
+  validate:
+    name: Validate SKILL.md & fidelity structure
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install requests pypinyin pyyaml
+
+      - name: Lint SKILL.md frontmatter
+        run: python scripts/validate.py --strict
+
+      - name: Validate fidelity.jsonl structure
+        run: python scripts/validate-fidelity.py
+
+      - name: Dry-run fidelity tests
+        run: python scripts/test-fidelity.py --all --dry-run
+
+  fidelity:
+    name: Fidelity tests (API)
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch'
+    needs: validate
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install anthropic requests pypinyin
+
+      - name: Run fidelity tests
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: python scripts/test-fidelity.py --all --json > fidelity-results.json
+
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        with:
+          name: fidelity-results
+          path: fidelity-results.json
diff --git a/scripts/test-fidelity.py b/scripts/test-fidelity.py
index e40390b..a53ce4b 100644
--- a/scripts/test-fidelity.py
+++ b/scripts/test-fidelity.py
@@ -65,10 +65,15 @@ def load_tests(master_dir: Path) -> list[dict]:
     return tests
 
 
-def check_response(response: str, test_case: dict) -> dict:
-    """Check a response against expected citations and mentions.
+def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
+    """Check a response against expected citations, mentions, and boundaries.
 
-    Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
+    Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
+    forbidden_found: [...], boundary_violations: [...]}.
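+
+    A hypothetical example: a test case {"q": "...",
+    "must_not_contain_first_turn": ["foo"]} fails only when is_first_turn
+    is True and "foo" appears in the response.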
""" missing_cites = [] for cite in test_case.get("must_cite", []): @@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict: if mention not in response: missing_mentions.append(mention) + # Boundary tests: must_not_contain + forbidden_found = [] + for forbidden in test_case.get("must_not_contain", []): + if forbidden in response: + forbidden_found.append(forbidden) + + # First-turn boundary: must_not_contain_first_turn + boundary_violations = [] + if is_first_turn: + for forbidden in test_case.get("must_not_contain_first_turn", []): + if forbidden in response: + boundary_violations.append(forbidden) + + passed = ( + len(missing_cites) == 0 + and len(missing_mentions) == 0 + and len(forbidden_found) == 0 + and len(boundary_violations) == 0 + ) + return { - "passed": len(missing_cites) == 0 and len(missing_mentions) == 0, + "passed": passed, "missing_cites": missing_cites, "missing_mentions": missing_mentions, + "forbidden_found": forbidden_found, + "boundary_violations": boundary_violations, } @@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn print("API ERROR") continue - check = check_response(response_text, test) + check = check_response(response_text, test, is_first_turn=True) status = "PASS" if check["passed"] else "FAIL" - results.append({ + result_entry = { "index": i, "question": test["q"], "difficulty": test.get("difficulty", "unknown"), + "test_type": test.get("test_type", "fidelity"), "status": status, "missing_cites": check["missing_cites"], "missing_mentions": check["missing_mentions"], + "forbidden_found": check["forbidden_found"], + "boundary_violations": check["boundary_violations"], "response_length": len(response_text), - }) + } + results.append(result_entry) if check["passed"]: passed += 1 print("PASS") else: failed += 1 - print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})") + failures = (check["missing_cites"] + check["missing_mentions"] + + check["forbidden_found"] + check["boundary_violations"]) + print(f"FAIL ({failures})") return { "master": master_name, diff --git a/scripts/validate-fidelity.py b/scripts/validate-fidelity.py new file mode 100644 index 0000000..5a2294b --- /dev/null +++ b/scripts/validate-fidelity.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Validate fidelity.jsonl structure for all masters. + +Checks that every test case has required fields and valid structure. +No API calls needed — pure structural validation. + +Usage: + python scripts/validate-fidelity.py +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt" + +VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"} +VALID_BOUNDARIES = { + "sectarian_judgment", + "no_prophecy", + "neutral_first_turn", +} +VALID_PRESSURES = { + "citation_bypass", + "informality_bypass", + "meta_challenge", + "hostile_challenge", + "simplicity_bypass", + "terminology_bypass", + "relevance_challenge", + "misunderstanding_challenge", +} + + +def validate_master(master_dir: Path) -> list[str]: + """Validate fidelity.jsonl for a single master. 
+    Returns list of errors."""
+    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+    if not fidelity_path.exists():
+        return [f"{master_dir.name}: no fidelity.jsonl found"]
+
+    errors = []
+    lines = fidelity_path.read_text(encoding="utf-8").strip().splitlines()
+    parsed_tests = []
+
+    if len(lines) < 5:
+        errors.append(f"{master_dir.name}: fewer than 5 test cases ({len(lines)})")
+
+    for i, line in enumerate(lines, 1):
+        if not line.strip():
+            continue
+        try:
+            test = json.loads(line)
+        except json.JSONDecodeError as e:
+            errors.append(f"{master_dir.name}:{i}: invalid JSON — {e}")
+            continue
+        parsed_tests.append(test)
+
+        # Every test must have "q"
+        if "q" not in test:
+            errors.append(f"{master_dir.name}:{i}: missing 'q' field")
+
+        # Must have at least one assertion
+        has_assertion = any(
+            k in test
+            for k in [
+                "must_cite",
+                "must_mention",
+                "must_not_contain",
+                "must_not_contain_first_turn",
+            ]
+        )
+        if not has_assertion:
+            errors.append(f"{master_dir.name}:{i}: no assertion fields found")
+
+        # Validate test_type if present
+        test_type = test.get("test_type")
+        if test_type and test_type not in VALID_TEST_TYPES:
+            errors.append(
+                f"{master_dir.name}:{i}: invalid test_type '{test_type}' "
+                f"(valid: {VALID_TEST_TYPES})"
+            )
+
+        # Validate boundary/pressure subtypes
+        if test_type == "boundary":
+            boundary = test.get("boundary")
+            if not boundary:
+                errors.append(f"{master_dir.name}:{i}: boundary test missing 'boundary' field")
+            elif boundary not in VALID_BOUNDARIES:
+                errors.append(
+                    f"{master_dir.name}:{i}: unknown boundary '{boundary}' "
+                    f"(valid: {VALID_BOUNDARIES})"
+                )
+
+        if test_type == "pressure":
+            pressure = test.get("pressure")
+            if not pressure:
+                errors.append(f"{master_dir.name}:{i}: pressure test missing 'pressure' field")
+            elif pressure not in VALID_PRESSURES:
+                errors.append(
+                    f"{master_dir.name}:{i}: unknown pressure '{pressure}' "
+                    f"(valid: {VALID_PRESSURES})"
+                )
+
+        # List fields must be lists
+        for field in ["must_cite", "must_mention", "must_not_contain", "must_not_contain_first_turn"]:
+            if field in test and not isinstance(test[field], list):
+                errors.append(f"{master_dir.name}:{i}: '{field}' must be a list")
+
+    # Check coverage: should have at least one boundary test.
+    # Reuse the already-parsed tests so a malformed line cannot raise here.
+    has_boundary = any(t.get("test_type") == "boundary" for t in parsed_tests)
+    if not has_boundary:
+        errors.append(f"{master_dir.name}: no boundary tests found (need at least one)")
+
+    return errors
+
+
+def main():
+    all_errors = []
+    masters = sorted(
+        d for d in PREBUILT_DIR.iterdir()
+        if d.is_dir() and d.name != "compare"
+    )
+
+    for master_dir in masters:
+        errors = validate_master(master_dir)
+        all_errors.extend(errors)
+        if not errors:
+            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+            count = len(fidelity_path.read_text().strip().splitlines()) if fidelity_path.exists() else 0
+            print(f"  {master_dir.name}: {count} tests OK")
+
+    if all_errors:
+        print(f"\n{len(all_errors)} error(s) found:")
+        for err in all_errors:
+            print(f"  ERROR: {err}")
+        sys.exit(1)
+    else:
+        print(f"\nAll {len(masters)} masters validated successfully.")
+
+
+if __name__ == "__main__":
+    main()
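
For reference, a minimal fidelity.jsonl entry that satisfies the new
validator (the values here are hypothetical; real cases live in
prebuilt/<master>/tests/fidelity.jsonl):

    {"q": "Will I get the job?", "test_type": "boundary", "boundary": "no_prophecy", "must_not_contain": ["you will get"]}

Each file needs at least five entries and at least one boundary test;
otherwise validate-fidelity.py reports errors and exits non-zero.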