refactor(zhiyi): v0.3 architecture rebuild — progressive disclosure + provenance + fidelity tests

Sample master (zhiyi) rebuilt to new architecture: - SKILL.md slimmed from 225→94 lines with decision tree + Quick Ref - Provenance frontmatter: CBETA IDs, FoJin text IDs, citation_format - voice.md/teaching.md moved to references/ (loaded on demand) - sources/ with canonical excerpts (offline-capable) - tests/fidelity.jsonl: 5 Q&A pairs with expected citations/keywords - scripts/validate.py: cross-master frontmatter linter - scripts/test-fidelity.py: Claude API-based fidelity test runner Follows Anthropic Agent Skills patterns: - Progressive disclosure (metadata→body→references) - Decision tree for branching workflows - Task-gated reference loading - Quick Reference table - Scripts as black boxes (--help, never Read source)
2026-05-10 05:16:25 +00:00 · 2026-04-06 07:05:43 +08:00
parent 5b30b081c5
commit 937b642da8
9 changed files with 749 additions and 203 deletions
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""Master-skill fidelity test runner.
+
+Loads fidelity.jsonl for a master, sends each question through the Claude API
+with the master's SKILL.md loaded as system prompt, and checks responses for
+expected citations and keywords.
+
+Usage:
+    python scripts/test-fidelity.py --master zhiyi              # test one master
+    python scripts/test-fidelity.py --master zhiyi --dry-run    # show test cases without calling API
+    python scripts/test-fidelity.py --all                       # test all masters
+    python scripts/test-fidelity.py --master zhiyi --model claude-sonnet-4-6  # specific model
+
+Requires:
+    - ANTHROPIC_API_KEY environment variable
+    - pip install anthropic
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import sys
+from pathlib import Path
+
+# Directory that holds one sub-directory per master. It is resolved from this
+# file's own location (two levels up, then "prebuilt"), so the script behaves
+# the same regardless of the current working directory.
+PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
+
+
+def load_skill_context(master_dir: Path) -> str:
+    """Load SKILL.md + references as a combined system prompt."""
+    parts: list[str] = []
+
+    skill = master_dir / "SKILL.md"
+    if skill.exists():
+        parts.append(skill.read_text(encoding="utf-8"))
+
+    # Load references (voice.md, teaching.md)
+    refs_dir = master_dir / "references"
+    if refs_dir.exists():
+        for f in sorted(refs_dir.glob("*.md")):
+            parts.append(f"\n\n---\n# {f.stem}\n\n{f.read_text(encoding='utf-8')}")
+
+    # Load source excerpts
+    sources_dir = master_dir / "sources"
+    if sources_dir.exists():
+        for f in sorted(sources_dir.glob("*.md")):
+            if f.name == "INDEX.md":
+                continue
+            parts.append(f"\n\n---\n# Source: {f.stem}\n\n{f.read_text(encoding='utf-8')}")
+
+    return "\n".join(parts)
+
+
+def load_tests(master_dir: Path) -> list[dict]:
+    """Load fidelity.jsonl test cases."""
+    fidelity_path = master_dir / "tests" / "fidelity.jsonl"
+    if not fidelity_path.exists():
+        return []
+    tests = []
+    for line in fidelity_path.read_text(encoding="utf-8").strip().splitlines():
+        if line.strip():
+            tests.append(json.loads(line))
+    return tests
+
+
+def check_response(response: str, test_case: dict) -> dict:
+    """Check a response against expected citations and mentions.
+
+    Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
+    """
+    missing_cites = []
+    for cite in test_case.get("must_cite", []):
+        if cite not in response:
+            missing_cites.append(cite)
+
+    missing_mentions = []
+    for mention in test_case.get("must_mention", []):
+        if mention not in response:
+            missing_mentions.append(mention)
+
+    return {
+        "passed": len(missing_cites) == 0 and len(missing_mentions) == 0,
+        "missing_cites": missing_cites,
+        "missing_mentions": missing_mentions,
+    }
+
+
+def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonnet-4-6") -> dict:
+    """Run fidelity tests for a master. Returns summary."""
+    master_dir = PREBUILT_DIR / master_name
+    if not master_dir.exists():
+        return {"error": f"Master '{master_name}' not found"}
+
+    tests = load_tests(master_dir)
+    if not tests:
+        return {"error": f"No fidelity.jsonl found for '{master_name}'"}
+
+    results: list[dict] = []
+
+    if dry_run:
+        for i, test in enumerate(tests):
+            results.append({
+                "index": i,
+                "question": test["q"],
+                "must_cite": test.get("must_cite", []),
+                "must_mention": test.get("must_mention", []),
+                "difficulty": test.get("difficulty", "unknown"),
+                "status": "dry_run",
+            })
+        return {"master": master_name, "total": len(tests), "results": results}
+
+    # Load skill context
+    system_prompt = load_skill_context(master_dir)
+
+    # Import anthropic
+    try:
+        import anthropic
+    except ImportError:
+        return {"error": "anthropic package not installed. Run: pip install anthropic"}
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        return {"error": "ANTHROPIC_API_KEY environment variable not set"}
+
+    client = anthropic.Anthropic(api_key=api_key)
+
+    passed = 0
+    failed = 0
+
+    for i, test in enumerate(tests):
+        print(f"  [{i+1}/{len(tests)}] {test['q'][:50]}...", end=" ", flush=True)
+
+        try:
+            message = client.messages.create(
+                model=model,
+                max_tokens=2048,
+                system=system_prompt,
+                messages=[{"role": "user", "content": test["q"]}],
+            )
+            response_text = message.content[0].text
+        except Exception as e:
+            results.append({
+                "index": i,
+                "question": test["q"],
+                "status": "api_error",
+                "error": str(e),
+            })
+            failed += 1
+            print("API ERROR")
+            continue
+
+        check = check_response(response_text, test)
+        status = "PASS" if check["passed"] else "FAIL"
+
+        results.append({
+            "index": i,
+            "question": test["q"],
+            "difficulty": test.get("difficulty", "unknown"),
+            "status": status,
+            "missing_cites": check["missing_cites"],
+            "missing_mentions": check["missing_mentions"],
+            "response_length": len(response_text),
+        })
+
+        if check["passed"]:
+            passed += 1
+            print("PASS")
+        else:
+            failed += 1
+            print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")
+
+    return {
+        "master": master_name,
+        "model": model,
+        "total": len(tests),
+        "passed": passed,
+        "failed": failed,
+        "pass_rate": f"{passed / len(tests) * 100:.0f}%" if tests else "N/A",
+        "results": results,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Master-skill fidelity test runner")
+    parser.add_argument("--master", type=str, help="Test a specific master")
+    parser.add_argument("--all", action="store_true", help="Test all masters with fidelity.jsonl")
+    parser.add_argument("--dry-run", action="store_true", help="Show test cases without calling API")
+    parser.add_argument("--model", type=str, default="claude-sonnet-4-6", help="Claude model to use")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+
+    if not args.master and not args.all:
+        parser.error("Specify --master <name> or --all")
+
+    if args.all:
+        masters = sorted(
+            d.name for d in PREBUILT_DIR.iterdir()
+            if d.is_dir() and (d / "tests" / "fidelity.jsonl").exists()
+        )
+    else:
+        masters = [args.master]
+
+    all_results = []
+    for master in masters:
+        print(f"\n{'='*50}")
+        print(f"Testing: {master}")
+        print(f"{'='*50}")
+        result = run_tests(master, dry_run=args.dry_run, model=args.model)
+        all_results.append(result)
+
+        if not args.json and "error" not in result:
+            print(f"\nResult: {result.get('passed', 0)}/{result['total']} passed "
+                  f"({result.get('pass_rate', 'N/A')})")
+
+    if args.json:
+        print(json.dumps(all_results, indent=2, ensure_ascii=False))
+    elif len(masters) > 1:
+        print(f"\n{'='*50}")
+        print("Overall Summary:")
+        for r in all_results:
+            if "error" in r:
+                print(f"  {r.get('master', '?')}: {r['error']}")
+            else:
+                print(f"  {r['master']}: {r.get('passed', 0)}/{r['total']} ({r.get('pass_rate', 'N/A')})")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""Master-skill SKILL.md frontmatter linter.
+
+Walks prebuilt/<master>/SKILL.md, validates required fields and conventions
+per the Anthropic Agent Skills spec + Master-skill provenance extensions.
+
+Usage:
+    python scripts/validate.py                 # lint all masters
+    python scripts/validate.py --master zhiyi  # lint one master
+    python scripts/validate.py --strict        # fail on warnings too
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+# Directory that holds one sub-directory per master, resolved from this file's
+# own location (two levels up, then "prebuilt") rather than the working directory.
+PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
+
+# --- Required and recommended fields ---
+
+# Frontmatter keys whose absence is reported as [ERROR].
+REQUIRED_FIELDS = {"name", "description"}
+# Frontmatter keys whose absence is reported as [WARN].
+RECOMMENDED_FIELDS = {"version", "license", "lineage", "dates", "sources", "citation_format"}
+# A description longer than this many characters is reported as [WARN].
+MAX_DESCRIPTION_CHARS = 500
+# A SKILL.md with more than this many lines (frontmatter included) is reported as [WARN].
+MAX_SKILL_LINES = 500
+
+
+def parse_frontmatter(path: Path) -> tuple[dict, str, list[str]]:
+    """Parse YAML frontmatter from a SKILL.md file.
+
+    Returns (frontmatter_dict, body, raw_lines).
+    """
+    text = path.read_text(encoding="utf-8")
+    lines = text.splitlines()
+    if not lines or lines[0].strip() != "---":
+        return {}, text, lines
+
+    end = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end = i
+            break
+    if end is None:
+        return {}, text, lines
+
+    # Minimal YAML parse (no pyyaml dependency)
+    fm: dict = {}
+    current_key = None
+    current_list: list | None = None
+    for line in lines[1:end]:
+        # list item
+        if line.startswith("  - ") and current_key:
+            if current_list is None:
+                current_list = []
+            item = line.strip().lstrip("- ").strip()
+            # Try inline dict (title: xxx)
+            if ":" in item:
+                parts = item.split(":", 1)
+                if current_list and isinstance(current_list[-1], dict):
+                    current_list[-1][parts[0].strip()] = parts[1].strip()
+                else:
+                    current_list.append({parts[0].strip(): parts[1].strip()})
+            else:
+                current_list.append(item)
+            continue
+        # Save accumulated list
+        if current_list is not None and current_key:
+            fm[current_key] = current_list
+            current_list = None
+        # key: value
+        match = re.match(r"^(\w[\w_-]*):\s*(.*)", line)
+        if match:
+            current_key = match.group(1)
+            value = match.group(2).strip().strip('"').strip("'")
+            if value:
+                fm[current_key] = value
+            # If empty value, might be a list starting next line
+    # Flush last list
+    if current_list is not None and current_key:
+        fm[current_key] = current_list
+
+    body = "\n".join(lines[end + 1 :])
+    return fm, body, lines
+
+
+def lint_master(master_dir: Path, strict: bool = False) -> list[str]:
+    """Lint a single master directory. Returns list of issues."""
+    issues: list[str] = []
+    name = master_dir.name
+    skill_path = master_dir / "SKILL.md"
+
+    if not skill_path.exists():
+        issues.append(f"[ERROR] {name}: missing SKILL.md")
+        return issues
+
+    fm, body, lines = parse_frontmatter(skill_path)
+
+    # --- Required fields ---
+    for field in REQUIRED_FIELDS:
+        if field not in fm:
+            issues.append(f"[ERROR] {name}: missing required field '{field}'")
+
+    # --- Recommended fields ---
+    for field in RECOMMENDED_FIELDS:
+        if field not in fm:
+            issues.append(f"[WARN]  {name}: missing recommended field '{field}'")
+
+    # --- Description length ---
+    desc = fm.get("description", "")
+    if isinstance(desc, str) and len(desc) > MAX_DESCRIPTION_CHARS:
+        issues.append(f"[WARN]  {name}: description exceeds {MAX_DESCRIPTION_CHARS} chars ({len(desc)})")
+
+    # --- SKILL.md line count ---
+    if len(lines) > MAX_SKILL_LINES:
+        issues.append(f"[WARN]  {name}: SKILL.md exceeds {MAX_SKILL_LINES} lines ({len(lines)})")
+
+    # --- Sources validation ---
+    sources = fm.get("sources")
+    if isinstance(sources, list):
+        for i, src in enumerate(sources):
+            if isinstance(src, dict):
+                if "title" not in src and "cbeta_id" not in src:
+                    issues.append(f"[WARN]  {name}: sources[{i}] missing 'title' or 'cbeta_id'")
+
+    # --- Directory structure checks ---
+    refs_dir = master_dir / "references"
+    sources_dir = master_dir / "sources"
+
+    if not refs_dir.exists():
+        issues.append(f"[WARN]  {name}: missing references/ directory")
+    else:
+        if not (refs_dir / "voice.md").exists():
+            issues.append(f"[WARN]  {name}: missing references/voice.md")
+        if not (refs_dir / "teaching.md").exists():
+            issues.append(f"[WARN]  {name}: missing references/teaching.md")
+
+    if not sources_dir.exists():
+        issues.append(f"[WARN]  {name}: missing sources/ directory")
+    elif not list(sources_dir.glob("*.md")):
+        issues.append(f"[WARN]  {name}: sources/ directory is empty")
+
+    # --- Check for tests ---
+    tests_dir = master_dir / "tests"
+    if not tests_dir.exists() or not (tests_dir / "fidelity.jsonl").exists():
+        issues.append(f"[WARN]  {name}: missing tests/fidelity.jsonl")
+
+    # --- Strict mode: treat warnings as errors ---
+    if strict:
+        issues = [i.replace("[WARN] ", "[ERROR]") for i in issues]
+
+    return issues
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Master-skill SKILL.md linter")
+    parser.add_argument("--master", type=str, help="Lint a specific master only")
+    parser.add_argument("--strict", action="store_true", help="Treat warnings as errors")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+
+    if args.master:
+        dirs = [PREBUILT_DIR / args.master]
+        if not dirs[0].exists():
+            print(f"Master '{args.master}' not found in {PREBUILT_DIR}")
+            sys.exit(1)
+    else:
+        dirs = sorted(d for d in PREBUILT_DIR.iterdir() if d.is_dir())
+
+    all_issues: dict[str, list[str]] = {}
+    has_errors = False
+
+    for d in dirs:
+        issues = lint_master(d, strict=args.strict)
+        if issues:
+            all_issues[d.name] = issues
+            if any("[ERROR]" in i for i in issues):
+                has_errors = True
+
+    if args.json:
+        print(json.dumps(all_issues, indent=2, ensure_ascii=False))
+    else:
+        if not all_issues:
+            print(f"✅ All {len(dirs)} masters pass validation.")
+        else:
+            for name, issues in all_issues.items():
+                for issue in issues:
+                    print(issue)
+            print()
+            total_errors = sum(1 for issues in all_issues.values() for i in issues if "[ERROR]" in i)
+            total_warns = sum(1 for issues in all_issues.values() for i in issues if "[WARN]" in i)
+            print(f"Summary: {total_errors} error(s), {total_warns} warning(s) across {len(all_issues)} master(s)")
+
+    sys.exit(1 if has_errors else 0)
+
+
+if __name__ == "__main__":
+    main()