feat: add CI validation pipeline and boundary test support

- Update test-fidelity.py to support must_not_contain and
  must_not_contain_first_turn fields for boundary/pressure tests
- Add validate-fidelity.py for structural validation of all
  fidelity.jsonl files (no API needed)
- Add GitHub Actions workflow: runs validate + dry-run on every
  push/PR touching prebuilt/scripts/prompts/tools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
xianren
2026-04-08 21:30:23 +08:00
parent da80665fa5
commit c654e7440f
3 changed files with 244 additions and 8 deletions
+37 -8
View File
@@ -65,10 +65,11 @@ def load_tests(master_dir: Path) -> list[dict]:
return tests
def check_response(response: str, test_case: dict) -> dict:
"""Check a response against expected citations and mentions.
def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
"""Check a response against expected citations, mentions, and boundaries.
Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
forbidden_found: [...], boundary_violations: [...]}.
"""
missing_cites = []
for cite in test_case.get("must_cite", []):
@@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict:
if mention not in response:
missing_mentions.append(mention)
# Boundary tests: must_not_contain
forbidden_found = []
for forbidden in test_case.get("must_not_contain", []):
if forbidden in response:
forbidden_found.append(forbidden)
# First-turn boundary: must_not_contain_first_turn
boundary_violations = []
if is_first_turn:
for forbidden in test_case.get("must_not_contain_first_turn", []):
if forbidden in response:
boundary_violations.append(forbidden)
passed = (
len(missing_cites) == 0
and len(missing_mentions) == 0
and len(forbidden_found) == 0
and len(boundary_violations) == 0
)
return {
"passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
"passed": passed,
"missing_cites": missing_cites,
"missing_mentions": missing_mentions,
"forbidden_found": forbidden_found,
"boundary_violations": boundary_violations,
}
@@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn
print("API ERROR")
continue
check = check_response(response_text, test)
check = check_response(response_text, test, is_first_turn=True)
status = "PASS" if check["passed"] else "FAIL"
results.append({
result_entry = {
"index": i,
"question": test["q"],
"difficulty": test.get("difficulty", "unknown"),
"test_type": test.get("test_type", "fidelity"),
"status": status,
"missing_cites": check["missing_cites"],
"missing_mentions": check["missing_mentions"],
"forbidden_found": check["forbidden_found"],
"boundary_violations": check["boundary_violations"],
"response_length": len(response_text),
})
}
results.append(result_entry)
if check["passed"]:
passed += 1
print("PASS")
else:
failed += 1
print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
failures = (check["missing_cites"] + check["missing_mentions"]
+ check["forbidden_found"] + check["boundary_violations"])
print(f"FAIL ({failures})")
return {
"master": master_name,