feat: add CI validation pipeline and boundary test support

- Update test-fidelity.py to support must_not_contain and
  must_not_contain_first_turn fields for boundary/pressure tests
- Add validate-fidelity.py for structural validation of all
  fidelity.jsonl files (no API needed)
- Add GitHub Actions workflow: runs validate + dry-run on every
  push/PR touching prebuilt/scripts/prompts/tools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
xianren
2026-04-08 21:30:23 +08:00
parent da80665fa5
commit c654e7440f
3 changed files with 244 additions and 8 deletions
+37 -8
View File
@@ -65,10 +65,11 @@ def load_tests(master_dir: Path) -> list[dict]:
return tests
def check_response(response: str, test_case: dict) -> dict:
"""Check a response against expected citations and mentions.
def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
"""Check a response against expected citations, mentions, and boundaries.
Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
forbidden_found: [...], boundary_violations: [...]}.
"""
missing_cites = []
for cite in test_case.get("must_cite", []):
@@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict:
if mention not in response:
missing_mentions.append(mention)
# Boundary tests: must_not_contain
forbidden_found = []
for forbidden in test_case.get("must_not_contain", []):
if forbidden in response:
forbidden_found.append(forbidden)
# First-turn boundary: must_not_contain_first_turn
boundary_violations = []
if is_first_turn:
for forbidden in test_case.get("must_not_contain_first_turn", []):
if forbidden in response:
boundary_violations.append(forbidden)
passed = (
len(missing_cites) == 0
and len(missing_mentions) == 0
and len(forbidden_found) == 0
and len(boundary_violations) == 0
)
return {
"passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
"passed": passed,
"missing_cites": missing_cites,
"missing_mentions": missing_mentions,
"forbidden_found": forbidden_found,
"boundary_violations": boundary_violations,
}
@@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn
print("API ERROR")
continue
check = check_response(response_text, test)
check = check_response(response_text, test, is_first_turn=True)
status = "PASS" if check["passed"] else "FAIL"
results.append({
result_entry = {
"index": i,
"question": test["q"],
"difficulty": test.get("difficulty", "unknown"),
"test_type": test.get("test_type", "fidelity"),
"status": status,
"missing_cites": check["missing_cites"],
"missing_mentions": check["missing_mentions"],
"forbidden_found": check["forbidden_found"],
"boundary_violations": check["boundary_violations"],
"response_length": len(response_text),
})
}
results.append(result_entry)
if check["passed"]:
passed += 1
print("PASS")
else:
failed += 1
print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
failures = (check["missing_cites"] + check["missing_mentions"]
+ check["forbidden_found"] + check["boundary_violations"])
print(f"FAIL ({failures})")
return {
"master": master_name,
+141
View File
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""Validate fidelity.jsonl structure for all masters.
Checks that every test case has required fields and valid structure.
No API calls needed — pure structural validation.
Usage:
python scripts/validate-fidelity.py
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Directory named "prebuilt" that sits next to this script's parent directory.
_SCRIPT_DIR = Path(__file__).resolve().parent
PREBUILT_DIR = _SCRIPT_DIR.parent.joinpath("prebuilt")

# Accepted values for a test case's optional "test_type" field.
VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"}

# Accepted values for the "boundary" field of a boundary test.
VALID_BOUNDARIES = {"sectarian_judgment", "no_prophecy", "neutral_first_turn"}

# Known pressure subtypes (presumably the values allowed in a pressure
# test's "pressure" field -- confirm against the fidelity.jsonl data).
VALID_PRESSURES = {
    "citation_bypass", "informality_bypass",
    "meta_challenge", "hostile_challenge",
    "simplicity_bypass", "terminology_bypass",
    "relevance_challenge", "misunderstanding_challenge",
}
def validate_master(master_dir: Path) -> list[str]:
    """Validate ``tests/fidelity.jsonl`` for a single master.

    Each non-blank line of the file must be a JSON object describing one
    test case. The checks are purely structural; nothing is executed.

    Args:
        master_dir: Directory of one master; the file is looked up at
            ``master_dir / "tests" / "fidelity.jsonl"``.

    Returns:
        A list of human-readable error strings, each prefixed with the
        master's directory name (and the 1-based line number for per-line
        problems). An empty list means the file passed every check.
    """
    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
    if not fidelity_path.exists():
        return [f"{master_dir.name}: no fidelity.jsonl found"]

    name = master_dir.name
    errors: list[str] = []

    # No .strip() on the whole text: keeping leading blank lines means the
    # line numbers reported below match the line numbers in the file.
    lines = fidelity_path.read_text(encoding="utf-8").splitlines()

    # Blank lines are skipped by the loop, so they are not test cases.
    case_count = sum(1 for line in lines if line.strip())
    if case_count < 5:
        errors.append(f"{name}: fewer than 5 test cases ({case_count})")

    assertion_fields = (
        "must_cite",
        "must_mention",
        "must_not_contain",
        "must_not_contain_first_turn",
    )
    # Tracked while parsing so that malformed lines are never parsed twice
    # (a second json.loads would raise instead of being reported).
    has_boundary = False

    for i, line in enumerate(lines, 1):
        if not line.strip():
            continue
        try:
            test = json.loads(line)
        except json.JSONDecodeError as e:
            errors.append(f"{name}:{i}: invalid JSON — {e}")
            continue

        # Valid JSON that is not an object (list, string, number, ...) has
        # no fields to inspect; report it instead of failing on .get().
        if not isinstance(test, dict):
            errors.append(f"{name}:{i}: test case must be a JSON object")
            continue

        # Every test must have "q"
        if "q" not in test:
            errors.append(f"{name}:{i}: missing 'q' field")

        # Must have at least one assertion
        if not any(field in test for field in assertion_fields):
            errors.append(f"{name}:{i}: no assertion fields found")

        # Validate test_type if present. The isinstance guard keeps an
        # unhashable value (e.g. a list) from raising on set membership.
        test_type = test.get("test_type")
        if test_type and (
            not isinstance(test_type, str) or test_type not in VALID_TEST_TYPES
        ):
            errors.append(
                f"{name}:{i}: invalid test_type '{test_type}' "
                f"(valid: {VALID_TEST_TYPES})"
            )

        # Validate boundary/pressure subtypes
        if test_type == "boundary":
            has_boundary = True
            boundary = test.get("boundary")
            if not boundary:
                errors.append(f"{name}:{i}: boundary test missing 'boundary' field")
            elif not isinstance(boundary, str) or boundary not in VALID_BOUNDARIES:
                errors.append(
                    f"{name}:{i}: unknown boundary '{boundary}' "
                    f"(valid: {VALID_BOUNDARIES})"
                )

        if test_type == "pressure":
            pressure = test.get("pressure")
            if not pressure:
                errors.append(f"{name}:{i}: pressure test missing 'pressure' field")
            # Mirror the boundary check: reject values outside the known set.
            elif not isinstance(pressure, str) or pressure not in VALID_PRESSURES:
                errors.append(
                    f"{name}:{i}: unknown pressure '{pressure}' "
                    f"(valid: {VALID_PRESSURES})"
                )

        # List fields must be lists
        for field in assertion_fields:
            if field in test and not isinstance(test[field], list):
                errors.append(f"{name}:{i}: '{field}' must be a list")

    # Check coverage: should have at least one boundary test
    if not has_boundary:
        errors.append(f"{name}: no boundary tests found (need at least one)")

    return errors
def main():
    """Validate every master under ``PREBUILT_DIR`` and report the outcome.

    Each subdirectory of ``PREBUILT_DIR`` except ``compare`` is passed to
    ``validate_master``. Masters without errors get a one-line summary.
    If any errors were collected they are all printed and the process
    exits with status 1; otherwise a success message is printed.
    """
    # Fail with a clear message rather than a traceback from iterdir().
    if not PREBUILT_DIR.is_dir():
        print(f"ERROR: prebuilt directory not found: {PREBUILT_DIR}")
        sys.exit(1)

    all_errors = []
    masters = sorted(
        d for d in PREBUILT_DIR.iterdir()
        if d.is_dir() and d.name != "compare"
    )

    for master_dir in masters:
        errors = validate_master(master_dir)
        all_errors.extend(errors)
        if not errors:
            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
            if fidelity_path.exists():
                # Read as UTF-8 explicitly (the locale default may differ)
                # and count only non-blank lines as test cases.
                text = fidelity_path.read_text(encoding="utf-8")
                count = sum(1 for line in text.splitlines() if line.strip())
            else:
                count = 0
            print(f" {master_dir.name}: {count} tests OK")

    if all_errors:
        print(f"\n{len(all_errors)} error(s) found:")
        for err in all_errors:
            print(f" ERROR: {err}")
        sys.exit(1)
    else:
        print(f"\nAll {len(masters)} masters validated successfully.")


if __name__ == "__main__":
    main()