Files
Master-skill/scripts/test-fidelity.py
T
xianren bd04ede817 chore(governance): add ETHICS.md, CHANGELOG.md, and PR fidelity smoke
Three governance-tier additions to convert Master-skill from a demo into a
defensible long-lived project:

**ETHICS.md** — mandatory governance document:
- AI transparency: outputs are AI-synthesized, not masters' own words
- Copyright tiers A (public domain, current 8), B (in-copyright, needs
  license), C (never admit: living masters, Buddhas/bodhisattvas, apocryphal
  figures), D (case-by-case)
- Religious boundary: AI must refuse precept transmission, awakening
  certification, karmic diagnosis, spirit-medium framing, etc.
- Dual-track content license: code MIT, master content CC BY-NC-SA 4.0,
  prompts CC BY 4.0
- Takedown + appeal channel with 48h / 7d SLAs

**CHANGELOG.md** — Keep a Changelog format:
- [Unreleased] captures the current governance + community + npm work
- [0.3.0] retroactively documents the architectural rebuild (provenance,
  fidelity, NPX, multi-platform, HARD-GATE, two-stage review)
- [0.2.0], [0.1.0] historical sections

**CI fidelity smoke** — make HARD-GATE a real gate, not just documentation:
- New `fidelity-smoke` job runs one basic-difficulty fixture against one
  master per PR; picks the master touched by the diff, else rotates by
  day-of-year for uniform coverage
- Cost cap ≈ $0.05/PR (~10k-token system prompt × 1 request × Sonnet 4.6
  pricing). Forks without ANTHROPIC_API_KEY get an advisory pass so
  external PRs can still land
- `scripts/test-fidelity.py` gains `--max-tests N` flag; when capping, it
  sorts by difficulty (basic → intermediate → advanced) so smoke runs hit
  the reliable floor, not stress cases
- Old `fidelity` job renamed to `fidelity-full` (still workflow_dispatch)

README 声明 section now links to ETHICS.md so every reader sees the AI
disclosure and boundary rules before copying master content.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 13:45:56 +08:00

283 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""Master-skill fidelity test runner.
Loads fidelity.jsonl for a master, sends each question through the Claude API
with the master's SKILL.md loaded as system prompt, and checks responses for
expected citations and keywords.
Usage:
python scripts/test-fidelity.py --master zhiyi # test one master
python scripts/test-fidelity.py --master zhiyi --dry-run # show test cases without calling API
python scripts/test-fidelity.py --all # test all masters
python scripts/test-fidelity.py --master zhiyi --model claude-sonnet-4-6 # specific model
Requires:
- ANTHROPIC_API_KEY environment variable
- pip install anthropic
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from pathlib import Path
# Repository root / "prebuilt": one sub-directory per prebuilt master
# (each holding SKILL.md, references/, sources/, tests/fidelity.jsonl).
PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
def load_skill_context(master_dir: Path) -> str:
    """Assemble a master's system prompt: SKILL.md, references, source excerpts.

    Missing pieces are simply skipped, so an incomplete master directory
    still yields a usable (possibly empty) prompt.
    """
    sections: list[str] = []

    skill_file = master_dir / "SKILL.md"
    if skill_file.exists():
        sections.append(skill_file.read_text(encoding="utf-8"))

    # Reference material (voice.md, teaching.md, ...) in stable name order.
    references = master_dir / "references"
    if references.exists():
        sections.extend(
            f"\n\n---\n# {ref.stem}\n\n{ref.read_text(encoding='utf-8')}"
            for ref in sorted(references.glob("*.md"))
        )

    # Primary-source excerpts; INDEX.md is a table of contents, not content.
    sources = master_dir / "sources"
    if sources.exists():
        sections.extend(
            f"\n\n---\n# Source: {src.stem}\n\n{src.read_text(encoding='utf-8')}"
            for src in sorted(sources.glob("*.md"))
            if src.name != "INDEX.md"
        )

    return "\n".join(sections)
def load_tests(master_dir: Path) -> list[dict]:
    """Parse tests/fidelity.jsonl into a list of test-case dicts.

    Returns an empty list when the fixture file does not exist; blank
    lines in the JSONL file are ignored.
    """
    jsonl_path = master_dir / "tests" / "fidelity.jsonl"
    if not jsonl_path.exists():
        return []
    raw = jsonl_path.read_text(encoding="utf-8")
    return [json.loads(line) for line in raw.strip().splitlines() if line.strip()]
def check_response(response: str, test_case: dict, is_first_turn: bool = True) -> dict:
    """Grade one model response against a fidelity test case.

    Substring-matches *response* against four expectation lists in
    *test_case*: required citations (``must_cite``), required keywords
    (``must_mention``), always-forbidden strings (``must_not_contain``),
    and first-turn-only forbidden strings (``must_not_contain_first_turn``,
    checked only when *is_first_turn* is true).

    Returns ``{"passed": bool, "missing_cites": [...], "missing_mentions":
    [...], "forbidden_found": [...], "boundary_violations": [...]}``.
    """
    missing_cites = [c for c in test_case.get("must_cite", []) if c not in response]
    missing_mentions = [m for m in test_case.get("must_mention", []) if m not in response]
    forbidden_found = [s for s in test_case.get("must_not_contain", []) if s in response]

    # First-turn boundary rules only apply to the opening exchange.
    boundary_violations: list = []
    if is_first_turn:
        boundary_violations = [
            s for s in test_case.get("must_not_contain_first_turn", []) if s in response
        ]

    ok = not (missing_cites or missing_mentions or forbidden_found or boundary_violations)
    return {
        "passed": ok,
        "missing_cites": missing_cites,
        "missing_mentions": missing_mentions,
        "forbidden_found": forbidden_found,
        "boundary_violations": boundary_violations,
    }
def run_tests(
    master_name: str,
    dry_run: bool = False,
    model: str = "claude-sonnet-4-6",
    max_tests: int | None = None,
) -> dict:
    """Run the fidelity suite for one master and return a summary dict.

    Returns ``{"error": ...}`` when the master directory or its fixtures
    are missing, when the ``anthropic`` package is not installed, or when
    ANTHROPIC_API_KEY is unset. With *dry_run*, lists the cases without
    any API calls. *max_tests* caps the number of fixtures, preferring
    basic-difficulty cases.
    """
    master_dir = PREBUILT_DIR / master_name
    if not master_dir.exists():
        return {"error": f"Master '{master_name}' not found"}

    tests = load_tests(master_dir)
    if not tests:
        return {"error": f"No fidelity.jsonl found for '{master_name}'"}

    if max_tests is not None and max_tests > 0:
        # Prefer easier/basic tests when capping — smoke suite should hit
        # the reliable floor, not the advanced stress cases.
        rank = {"basic": 0, "intermediate": 1, "advanced": 2}
        ordered = sorted(
            tests, key=lambda t: rank.get(t.get("difficulty", "intermediate"), 1)
        )
        tests = ordered[:max_tests]

    results: list[dict] = []

    if dry_run:
        # No API traffic: echo what would be asked and checked.
        for idx, case in enumerate(tests):
            results.append({
                "index": idx,
                "question": case["q"],
                "must_cite": case.get("must_cite", []),
                "must_mention": case.get("must_mention", []),
                "difficulty": case.get("difficulty", "unknown"),
                "status": "dry_run",
            })
        return {"master": master_name, "total": len(tests), "results": results}

    system_prompt = load_skill_context(master_dir)

    # Deferred import so --dry-run works without the SDK installed.
    try:
        import anthropic
    except ImportError:
        return {"error": "anthropic package not installed. Run: pip install anthropic"}

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return {"error": "ANTHROPIC_API_KEY environment variable not set"}

    client = anthropic.Anthropic(api_key=api_key)
    passed = 0
    failed = 0

    for idx, case in enumerate(tests):
        print(f" [{idx+1}/{len(tests)}] {case['q'][:50]}...", end=" ", flush=True)
        try:
            message = client.messages.create(
                model=model,
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": case["q"]}],
            )
            response_text = message.content[0].text
        except Exception as exc:
            # API failures count as test failures but do not abort the run.
            results.append({
                "index": idx,
                "question": case["q"],
                "status": "api_error",
                "error": str(exc),
            })
            failed += 1
            print("API ERROR")
            continue

        verdict = check_response(response_text, case, is_first_turn=True)
        results.append({
            "index": idx,
            "question": case["q"],
            "difficulty": case.get("difficulty", "unknown"),
            "test_type": case.get("test_type", "fidelity"),
            "status": "PASS" if verdict["passed"] else "FAIL",
            "missing_cites": verdict["missing_cites"],
            "missing_mentions": verdict["missing_mentions"],
            "forbidden_found": verdict["forbidden_found"],
            "boundary_violations": verdict["boundary_violations"],
            "response_length": len(response_text),
        })

        if verdict["passed"]:
            passed += 1
            print("PASS")
        else:
            failed += 1
            problems = (verdict["missing_cites"] + verdict["missing_mentions"]
                        + verdict["forbidden_found"] + verdict["boundary_violations"])
            print(f"FAIL ({problems})")

    return {
        "master": master_name,
        "model": model,
        "total": len(tests),
        "passed": passed,
        "failed": failed,
        "pass_rate": f"{passed / len(tests) * 100:.0f}%" if tests else "N/A",
        "results": results,
    }
def main():
    """CLI entry point: parse args, run per-master suites, print summaries.

    Exits with status 1 when any master reports an error or any fidelity
    test fails, so the script can act as a real CI gate; previously it
    always exited 0 and errors were silently dropped in non-JSON mode.
    """
    parser = argparse.ArgumentParser(description="Master-skill fidelity test runner")
    parser.add_argument("--master", type=str, help="Test a specific master")
    parser.add_argument("--all", action="store_true", help="Test all masters with fidelity.jsonl")
    parser.add_argument("--dry-run", action="store_true", help="Show test cases without calling API")
    parser.add_argument("--model", type=str, default="claude-sonnet-4-6", help="Claude model to use")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument(
        "--max-tests",
        type=int,
        default=None,
        help="Cap the number of fixtures per master (smoke runs in CI use 1)",
    )
    args = parser.parse_args()

    if not args.master and not args.all:
        parser.error("Specify --master <name> or --all")

    if args.all:
        # Every prebuilt master that ships a fidelity fixture file.
        masters = sorted(
            d.name for d in PREBUILT_DIR.iterdir()
            if d.is_dir() and (d / "tests" / "fidelity.jsonl").exists()
        )
    else:
        masters = [args.master]

    all_results = []
    for master in masters:
        print(f"\n{'='*50}")
        print(f"Testing: {master}")
        print(f"{'='*50}")
        result = run_tests(
            master, dry_run=args.dry_run, model=args.model, max_tests=args.max_tests
        )
        all_results.append(result)
        if not args.json:
            if "error" in result:
                # Fix: errors were previously invisible in non-JSON mode.
                print(f"\nError: {result['error']}")
            else:
                print(f"\nResult: {result.get('passed', 0)}/{result['total']} passed "
                      f"({result.get('pass_rate', 'N/A')})")

    if args.json:
        print(json.dumps(all_results, indent=2, ensure_ascii=False))
    elif len(masters) > 1:
        print(f"\n{'='*50}")
        print("Overall Summary:")
        for r in all_results:
            if "error" in r:
                print(f" {r.get('master', '?')}: {r['error']}")
            else:
                print(f" {r['master']}: {r.get('passed', 0)}/{r['total']} ({r.get('pass_rate', 'N/A')})")

    # Non-zero exit makes failures visible to CI (the HARD-GATE smoke job).
    # Dry-run summaries carry no "failed" key and never trip this.
    if any("error" in r or r.get("failed", 0) > 0 for r in all_results):
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()