mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add CI validation pipeline and boundary test support
- Update test-fidelity.py to support must_not_contain and must_not_contain_first_turn fields for boundary/pressure tests
- Add validate-fidelity.py for structural validation of all fidelity.jsonl files (no API needed)
- Add GitHub Actions workflow: runs validate + dry-run on every push/PR touching prebuilt/scripts/prompts/tools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,66 @@
|
|||||||
|
name: Validate & Test

on:
  # Bug fix: the `fidelity` job below is gated on
  # `github.event_name == 'workflow_dispatch'`, but no workflow_dispatch
  # trigger was declared, so that job could never run. Declaring it here
  # enables manual runs without changing push/PR behavior.
  workflow_dispatch:
  push:
    paths:
      - 'prebuilt/**'
      - 'scripts/**'
      - 'prompts/**'
      - 'tools/**'
  pull_request:
    paths:
      - 'prebuilt/**'
      - 'scripts/**'
      - 'prompts/**'
      - 'tools/**'

jobs:
  # Pure structural validation — no API key required, runs on every push/PR.
  validate:
    name: Validate SKILL.md & fidelity structure
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install requests pypinyin pyyaml

      - name: Lint SKILL.md frontmatter
        run: python scripts/validate.py --strict

      - name: Validate fidelity.jsonl structure
        run: python scripts/validate-fidelity.py

      - name: Dry-run fidelity tests
        run: python scripts/test-fidelity.py --all --dry-run

  # API-backed fidelity run — manual only (workflow_dispatch), costs tokens.
  fidelity:
    name: Fidelity tests (API)
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch'
    needs: validate
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install anthropic requests pypinyin

      - name: Run fidelity tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: python scripts/test-fidelity.py --all --json > fidelity-results.json

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: fidelity-results
          path: fidelity-results.json
|
||||||
@@ -65,10 +65,11 @@ def load_tests(master_dir: Path) -> list[dict]:
|
|||||||
return tests
|
return tests
|
||||||
|
|
||||||
|
|
||||||
def check_response(response: str, test_case: dict) -> dict:
|
def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
|
||||||
"""Check a response against expected citations and mentions.
|
"""Check a response against expected citations, mentions, and boundaries.
|
||||||
|
|
||||||
Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
|
Returns {passed: bool, missing_cites: [...], missing_mentions: [...],
|
||||||
|
forbidden_found: [...], boundary_violations: [...]}.
|
||||||
"""
|
"""
|
||||||
missing_cites = []
|
missing_cites = []
|
||||||
for cite in test_case.get("must_cite", []):
|
for cite in test_case.get("must_cite", []):
|
||||||
@@ -80,10 +81,32 @@ def check_response(response: str, test_case: dict) -> dict:
|
|||||||
if mention not in response:
|
if mention not in response:
|
||||||
missing_mentions.append(mention)
|
missing_mentions.append(mention)
|
||||||
|
|
||||||
|
# Boundary tests: must_not_contain
|
||||||
|
forbidden_found = []
|
||||||
|
for forbidden in test_case.get("must_not_contain", []):
|
||||||
|
if forbidden in response:
|
||||||
|
forbidden_found.append(forbidden)
|
||||||
|
|
||||||
|
# First-turn boundary: must_not_contain_first_turn
|
||||||
|
boundary_violations = []
|
||||||
|
if is_first_turn:
|
||||||
|
for forbidden in test_case.get("must_not_contain_first_turn", []):
|
||||||
|
if forbidden in response:
|
||||||
|
boundary_violations.append(forbidden)
|
||||||
|
|
||||||
|
passed = (
|
||||||
|
len(missing_cites) == 0
|
||||||
|
and len(missing_mentions) == 0
|
||||||
|
and len(forbidden_found) == 0
|
||||||
|
and len(boundary_violations) == 0
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
|
"passed": passed,
|
||||||
"missing_cites": missing_cites,
|
"missing_cites": missing_cites,
|
||||||
"missing_mentions": missing_mentions,
|
"missing_mentions": missing_mentions,
|
||||||
|
"forbidden_found": forbidden_found,
|
||||||
|
"boundary_violations": boundary_violations,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -151,25 +174,31 @@ def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonn
|
|||||||
print("API ERROR")
|
print("API ERROR")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
check = check_response(response_text, test)
|
check = check_response(response_text, test, is_first_turn=True)
|
||||||
status = "PASS" if check["passed"] else "FAIL"
|
status = "PASS" if check["passed"] else "FAIL"
|
||||||
|
|
||||||
results.append({
|
result_entry = {
|
||||||
"index": i,
|
"index": i,
|
||||||
"question": test["q"],
|
"question": test["q"],
|
||||||
"difficulty": test.get("difficulty", "unknown"),
|
"difficulty": test.get("difficulty", "unknown"),
|
||||||
|
"test_type": test.get("test_type", "fidelity"),
|
||||||
"status": status,
|
"status": status,
|
||||||
"missing_cites": check["missing_cites"],
|
"missing_cites": check["missing_cites"],
|
||||||
"missing_mentions": check["missing_mentions"],
|
"missing_mentions": check["missing_mentions"],
|
||||||
|
"forbidden_found": check["forbidden_found"],
|
||||||
|
"boundary_violations": check["boundary_violations"],
|
||||||
"response_length": len(response_text),
|
"response_length": len(response_text),
|
||||||
})
|
}
|
||||||
|
results.append(result_entry)
|
||||||
|
|
||||||
if check["passed"]:
|
if check["passed"]:
|
||||||
passed += 1
|
passed += 1
|
||||||
print("PASS")
|
print("PASS")
|
||||||
else:
|
else:
|
||||||
failed += 1
|
failed += 1
|
||||||
print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
|
failures = (check["missing_cites"] + check["missing_mentions"]
|
||||||
|
+ check["forbidden_found"] + check["boundary_violations"])
|
||||||
|
print(f"FAIL ({failures})")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"master": master_name,
|
"master": master_name,
|
||||||
|
|||||||
@@ -0,0 +1,141 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Validate fidelity.jsonl structure for all masters.
|
||||||
|
|
||||||
|
Checks that every test case has required fields and valid structure.
|
||||||
|
No API calls needed — pure structural validation.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/validate-fidelity.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Location of the prebuilt master definitions, relative to this script
# (scripts/ sits next to prebuilt/ at the repo root).
PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"

# Recognised top-level categories for a test case's "test_type" field.
VALID_TEST_TYPES = {"fidelity", "boundary", "pressure"}

# Recognised subtypes for test_type == "boundary".
VALID_BOUNDARIES = {"sectarian_judgment", "no_prophecy", "neutral_first_turn"}

# Recognised subtypes for test_type == "pressure".
VALID_PRESSURES = {
    "citation_bypass",
    "informality_bypass",
    "meta_challenge",
    "hostile_challenge",
    "simplicity_bypass",
    "terminology_bypass",
    "relevance_challenge",
    "misunderstanding_challenge",
}
|
||||||
|
|
||||||
|
|
||||||
|
def validate_master(master_dir: Path) -> list[str]:
    """Validate tests/fidelity.jsonl for a single master.

    Pure structural checks (no API calls): JSON well-formedness, required
    fields, known test_type/boundary/pressure values, list-typed assertion
    fields, and minimum coverage (>= 5 cases, >= 1 boundary test).

    Returns a list of human-readable error strings; empty list means valid.
    """
    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
    if not fidelity_path.exists():
        return [f"{master_dir.name}: no fidelity.jsonl found"]

    errors = []
    lines = fidelity_path.read_text(encoding="utf-8").strip().splitlines()

    if len(lines) < 5:
        errors.append(f"{master_dir.name}: fewer than 5 test cases ({len(lines)})")

    # Successfully parsed test objects, collected so the coverage check at the
    # end does not re-parse raw lines. Previously `has_boundary` called
    # json.loads on every line unguarded, so one malformed line (already
    # reported above) crashed the whole validator with JSONDecodeError.
    parsed_tests: list[dict] = []

    for i, line in enumerate(lines, 1):
        if not line.strip():
            continue
        try:
            test = json.loads(line)
        except json.JSONDecodeError as e:
            errors.append(f"{master_dir.name}:{i}: invalid JSON — {e}")
            continue
        # Guard against valid JSON that isn't an object (e.g. a bare list):
        # the field checks below assume dict access.
        if not isinstance(test, dict):
            errors.append(f"{master_dir.name}:{i}: test case must be a JSON object")
            continue
        parsed_tests.append(test)

        # Every test must have "q"
        if "q" not in test:
            errors.append(f"{master_dir.name}:{i}: missing 'q' field")

        # Must have at least one assertion
        has_assertion = any(
            k in test
            for k in [
                "must_cite",
                "must_mention",
                "must_not_contain",
                "must_not_contain_first_turn",
            ]
        )
        if not has_assertion:
            errors.append(f"{master_dir.name}:{i}: no assertion fields found")

        # Validate test_type if present
        test_type = test.get("test_type")
        if test_type and test_type not in VALID_TEST_TYPES:
            errors.append(
                f"{master_dir.name}:{i}: invalid test_type '{test_type}' "
                f"(valid: {VALID_TEST_TYPES})"
            )

        # Validate boundary/pressure subtypes
        if test_type == "boundary":
            boundary = test.get("boundary")
            if not boundary:
                errors.append(f"{master_dir.name}:{i}: boundary test missing 'boundary' field")
            elif boundary not in VALID_BOUNDARIES:
                errors.append(
                    f"{master_dir.name}:{i}: unknown boundary '{boundary}' "
                    f"(valid: {VALID_BOUNDARIES})"
                )

        if test_type == "pressure":
            pressure = test.get("pressure")
            if not pressure:
                errors.append(f"{master_dir.name}:{i}: pressure test missing 'pressure' field")
            # Bug fix: VALID_PRESSURES was defined but never consulted, so a
            # typoed pressure name passed validation silently. Mirror the
            # boundary handling above.
            elif pressure not in VALID_PRESSURES:
                errors.append(
                    f"{master_dir.name}:{i}: unknown pressure '{pressure}' "
                    f"(valid: {VALID_PRESSURES})"
                )

        # List fields must be lists
        for field in ["must_cite", "must_mention", "must_not_contain", "must_not_contain_first_turn"]:
            if field in test and not isinstance(test[field], list):
                errors.append(f"{master_dir.name}:{i}: '{field}' must be a list")

    # Check coverage: should have at least one boundary test (over parsed
    # tests only — malformed lines were already reported above).
    has_boundary = any(t.get("test_type") == "boundary" for t in parsed_tests)
    if not has_boundary:
        errors.append(f"{master_dir.name}: no boundary tests found (need at least one)")

    return errors
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Validate every master under prebuilt/ and exit non-zero on any error."""
    # "compare" is a sibling directory that holds no master definition.
    masters = sorted(
        d for d in PREBUILT_DIR.iterdir()
        if d.is_dir() and d.name != "compare"
    )

    all_errors = []
    for master_dir in masters:
        master_errors = validate_master(master_dir)
        if master_errors:
            all_errors.extend(master_errors)
        else:
            # On success, report how many test cases this master ships with.
            fidelity_path = master_dir / "tests" / "fidelity.jsonl"
            count = 0
            if fidelity_path.exists():
                count = len(fidelity_path.read_text().strip().splitlines())
            print(f"  {master_dir.name}: {count} tests OK")

    if not all_errors:
        print(f"\nAll {len(masters)} masters validated successfully.")
        return
    print(f"\n{len(all_errors)} error(s) found:")
    for err in all_errors:
        print(f"  ERROR: {err}")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user