mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
refactor(zhiyi): v0.3 architecture rebuild — progressive disclosure + provenance + fidelity tests
Sample master (zhiyi) rebuilt to new architecture: - SKILL.md slimmed from 225→94 lines with decision tree + Quick Ref - Provenance frontmatter: CBETA IDs, FoJin text IDs, citation_format - voice.md/teaching.md moved to references/ (loaded on demand) - sources/ with canonical excerpts (offline-capable) - tests/fidelity.jsonl: 5 Q&A pairs with expected citations/keywords - scripts/validate.py: cross-master frontmatter linter - scripts/test-fidelity.py: Claude API-based fidelity test runner Follows Anthropic Agent Skills patterns: - Progressive disclosure (metadata→body→references) - Decision tree for branching workflows - Task-gated reference loading - Quick Reference table - Scripts as black boxes (--help, never Read source)
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Master-skill fidelity test runner.
|
||||
|
||||
Loads fidelity.jsonl for a master, sends each question through the Claude API
|
||||
with the master's SKILL.md loaded as system prompt, and checks responses for
|
||||
expected citations and keywords.
|
||||
|
||||
Usage:
|
||||
python scripts/test-fidelity.py --master zhiyi # test one master
|
||||
python scripts/test-fidelity.py --master zhiyi --dry-run # show test cases without calling API
|
||||
python scripts/test-fidelity.py --all # test all masters
|
||||
python scripts/test-fidelity.py --master zhiyi --model claude-sonnet-4-6 # specific model
|
||||
|
||||
Requires:
|
||||
- ANTHROPIC_API_KEY environment variable
|
||||
- pip install anthropic
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Repo layout: <repo>/scripts/test-fidelity.py → masters live in <repo>/prebuilt/<name>/.
PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"
|
||||
|
||||
|
||||
def load_skill_context(master_dir: Path) -> str:
    """Load SKILL.md + references as a combined system prompt.

    Concatenates, in order: SKILL.md, every references/*.md (each under
    its own "# <stem>" heading), and every sources/*.md except INDEX.md
    (each under a "# Source: <stem>" heading). Missing files/directories
    are simply skipped; an empty master yields "".
    """
    sections: list[str] = []

    skill_file = master_dir / "SKILL.md"
    if skill_file.exists():
        sections.append(skill_file.read_text(encoding="utf-8"))

    # References (voice.md, teaching.md, ...) — loaded in sorted order.
    ref_root = master_dir / "references"
    if ref_root.exists():
        sections.extend(
            f"\n\n---\n# {md.stem}\n\n{md.read_text(encoding='utf-8')}"
            for md in sorted(ref_root.glob("*.md"))
        )

    # Canonical source excerpts; INDEX.md is a table of contents, not content.
    src_root = master_dir / "sources"
    if src_root.exists():
        sections.extend(
            f"\n\n---\n# Source: {md.stem}\n\n{md.read_text(encoding='utf-8')}"
            for md in sorted(src_root.glob("*.md"))
            if md.name != "INDEX.md"
        )

    return "\n".join(sections)
|
||||
|
||||
|
||||
def load_tests(master_dir: Path) -> list[dict]:
    """Load fidelity.jsonl test cases.

    Returns one dict per non-blank line of tests/fidelity.jsonl, or an
    empty list when the file does not exist.
    """
    jsonl_file = master_dir / "tests" / "fidelity.jsonl"
    if not jsonl_file.exists():
        return []
    raw = jsonl_file.read_text(encoding="utf-8")
    # One JSON object per non-blank line (JSON Lines format).
    return [json.loads(row) for row in raw.strip().splitlines() if row.strip()]
|
||||
|
||||
|
||||
def check_response(response: str, test_case: dict) -> dict:
    """Check a response against expected citations and mentions.

    Both checks are plain substring matches against the raw response.

    Returns {passed: bool, missing_cites: [...], missing_mentions: [...]}.
    """
    absent_cites = [c for c in test_case.get("must_cite", []) if c not in response]
    absent_mentions = [m for m in test_case.get("must_mention", []) if m not in response]
    return {
        "passed": not absent_cites and not absent_mentions,
        "missing_cites": absent_cites,
        "missing_mentions": absent_mentions,
    }
|
||||
|
||||
|
||||
def run_tests(master_name: str, dry_run: bool = False, model: str = "claude-sonnet-4-6") -> dict:
    """Run fidelity tests for a master. Returns summary.

    Looks up the master under PREBUILT_DIR, loads its fidelity.jsonl, and
    (unless dry_run) sends each question to the Claude API with the full
    skill context as system prompt, checking each response via
    check_response().

    Returns a summary dict; on any precondition failure (unknown master,
    no tests, missing SDK or API key) returns {"error": <message>} instead.
    Prints per-test progress to stdout as a side effect.
    """
    master_dir = PREBUILT_DIR / master_name
    if not master_dir.exists():
        return {"error": f"Master '{master_name}' not found"}

    tests = load_tests(master_dir)
    if not tests:
        return {"error": f"No fidelity.jsonl found for '{master_name}'"}

    results: list[dict] = []

    # Dry run: echo the test cases without touching the API. Note the
    # returned dict has no "passed"/"pass_rate" keys — callers use .get().
    if dry_run:
        for i, test in enumerate(tests):
            results.append({
                "index": i,
                "question": test["q"],
                "must_cite": test.get("must_cite", []),
                "must_mention": test.get("must_mention", []),
                "difficulty": test.get("difficulty", "unknown"),
                "status": "dry_run",
            })
        return {"master": master_name, "total": len(tests), "results": results}

    # Load skill context (SKILL.md + references/ + sources/) as system prompt.
    system_prompt = load_skill_context(master_dir)

    # Import anthropic lazily so --dry-run works without the SDK installed.
    try:
        import anthropic
    except ImportError:
        return {"error": "anthropic package not installed. Run: pip install anthropic"}

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return {"error": "ANTHROPIC_API_KEY environment variable not set"}

    client = anthropic.Anthropic(api_key=api_key)

    passed = 0
    failed = 0

    for i, test in enumerate(tests):
        # Progress line; flushed so it shows before the (slow) API call returns.
        print(f" [{i+1}/{len(tests)}] {test['q'][:50]}...", end=" ", flush=True)

        try:
            message = client.messages.create(
                model=model,
                max_tokens=2048,
                system=system_prompt,
                messages=[{"role": "user", "content": test["q"]}],
            )
            # Assumes the first content block is a text block — TODO confirm
            # this holds for the models used (e.g. no leading non-text blocks).
            response_text = message.content[0].text
        except Exception as e:
            # API/network failures count as test failures but don't abort the run.
            results.append({
                "index": i,
                "question": test["q"],
                "status": "api_error",
                "error": str(e),
            })
            failed += 1
            print("API ERROR")
            continue

        check = check_response(response_text, test)
        status = "PASS" if check["passed"] else "FAIL"

        results.append({
            "index": i,
            "question": test["q"],
            "difficulty": test.get("difficulty", "unknown"),
            "status": status,
            "missing_cites": check["missing_cites"],
            "missing_mentions": check["missing_mentions"],
            "response_length": len(response_text),
        })

        if check["passed"]:
            passed += 1
            print("PASS")
        else:
            failed += 1
            print(f"FAIL (missing: {check['missing_cites'] + check['missing_mentions']})")

    return {
        "master": master_name,
        "model": model,
        "total": len(tests),
        "passed": passed,
        "failed": failed,
        # tests is non-empty here (early return above), so the guard is defensive.
        "pass_rate": f"{passed / len(tests) * 100:.0f}%" if tests else "N/A",
        "results": results,
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse args, run fidelity tests, report results.

    Exits with status 1 when any master reports an error or a failed test,
    so CI can gate on the result; exits 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Master-skill fidelity test runner")
    parser.add_argument("--master", type=str, help="Test a specific master")
    parser.add_argument("--all", action="store_true", help="Test all masters with fidelity.jsonl")
    parser.add_argument("--dry-run", action="store_true", help="Show test cases without calling API")
    parser.add_argument("--model", type=str, default="claude-sonnet-4-6", help="Claude model to use")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if not args.master and not args.all:
        parser.error("Specify --master <name> or --all")

    if args.all:
        # Only masters that actually ship fidelity tests.
        masters = sorted(
            d.name for d in PREBUILT_DIR.iterdir()
            if d.is_dir() and (d / "tests" / "fidelity.jsonl").exists()
        )
    else:
        masters = [args.master]

    all_results = []
    for master in masters:
        print(f"\n{'='*50}")
        print(f"Testing: {master}")
        print(f"{'='*50}")
        result = run_tests(master, dry_run=args.dry_run, model=args.model)
        all_results.append(result)

        if not args.json:
            if "error" in result:
                # BUG FIX: error results were previously dropped silently in
                # human-readable mode (the old condition skipped them entirely).
                print(f"\nError: {result['error']}")
            else:
                print(f"\nResult: {result.get('passed', 0)}/{result['total']} passed "
                      f"({result.get('pass_rate', 'N/A')})")

    if args.json:
        print(json.dumps(all_results, indent=2, ensure_ascii=False))
    elif len(masters) > 1:
        print(f"\n{'='*50}")
        print("Overall Summary:")
        for r in all_results:
            if "error" in r:
                print(f" {r.get('master', '?')}: {r['error']}")
            else:
                print(f" {r['master']}: {r.get('passed', 0)}/{r['total']} ({r.get('pass_rate', 'N/A')})")

    # BUG FIX: signal failure to callers (e.g. CI) instead of always exiting 0.
    if any("error" in r or r.get("failed", 0) > 0 for r in all_results):
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point; not executed when imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Master-skill SKILL.md frontmatter linter.
|
||||
|
||||
Walks prebuilt/<master>/SKILL.md, validates required fields and conventions
|
||||
per the Anthropic Agent Skills spec + Master-skill provenance extensions.
|
||||
|
||||
Usage:
|
||||
python scripts/validate.py # lint all masters
|
||||
python scripts/validate.py --master zhiyi # lint one master
|
||||
python scripts/validate.py --strict # fail on warnings too
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Repo layout: <repo>/scripts/validate.py → masters live in <repo>/prebuilt/<name>/.
PREBUILT_DIR = Path(__file__).resolve().parent.parent / "prebuilt"

# --- Required and recommended fields ---

# Frontmatter fields whose absence is an [ERROR].
REQUIRED_FIELDS = {"name", "description"}
# Provenance-extension fields whose absence is only a [WARN].
RECOMMENDED_FIELDS = {"version", "license", "lineage", "dates", "sources", "citation_format"}
# Soft limits, reported as warnings when exceeded.
MAX_DESCRIPTION_CHARS = 500
MAX_SKILL_LINES = 500
|
||||
|
||||
|
||||
def parse_frontmatter(path: Path) -> tuple[dict, str, list[str]]:
    """Parse YAML frontmatter from a SKILL.md file.

    Minimal hand-rolled parser (no pyyaml dependency) supporting scalar
    values, lists of scalars, and lists of flat dicts such as::

        sources:
          - title: Mohe Zhiguan
            cbeta_id: T1911
          - title: Xiao Zhiguan

    Returns (frontmatter_dict, body, raw_lines). When the file has no
    '---'-delimited frontmatter, returns ({}, full_text, lines).
    """
    text = path.read_text(encoding="utf-8")
    lines = text.splitlines()
    if not lines or lines[0].strip() != "---":
        return {}, text, lines

    # Locate the closing '---' fence.
    end = None
    for i, line in enumerate(lines[1:], start=1):
        if line.strip() == "---":
            end = i
            break
    if end is None:
        return {}, text, lines

    fm: dict = {}
    current_key = None
    current_list: list | None = None

    def flush() -> None:
        # Commit any list being accumulated to its owning key.
        nonlocal current_list
        if current_list is not None and current_key:
            fm[current_key] = current_list
        current_list = None

    for line in lines[1:end]:
        stripped = line.strip()
        indented = line[:1] in (" ", "\t")

        # List item: "  - value" or "  - key: value" (starts a NEW entry).
        if indented and stripped.startswith("- ") and current_key:
            if current_list is None:
                current_list = []
            item = stripped[2:].strip()
            if ":" in item:
                k, _, v = item.partition(":")
                # BUG FIX: each "- key: value" line appends a fresh dict.
                # Previously consecutive entries were merged into (and
                # overwrote keys of) the preceding dict item.
                current_list.append({k.strip(): v.strip()})
            else:
                current_list.append(item)
            continue

        # Continuation line inside a dict list item: "    key: value".
        # BUG FIX: these lines used to terminate the list early and were
        # dropped; now they merge into the most recent dict entry.
        if indented and current_list and isinstance(current_list[-1], dict):
            k, sep, v = stripped.partition(":")
            if sep:
                current_list[-1][k.strip()] = v.strip()
            continue

        # Any other line ends a pending list, then may start "key: value".
        flush()
        match = re.match(r"^(\w[\w_-]*):\s*(.*)", line)
        if match:
            current_key = match.group(1)
            value = match.group(2).strip().strip('"').strip("'")
            if value:
                fm[current_key] = value
            # Empty value: a list may start on the following lines.

    flush()

    body = "\n".join(lines[end + 1 :])
    return fm, body, lines
|
||||
|
||||
|
||||
def lint_master(master_dir: Path, strict: bool = False) -> list[str]:
    """Lint a single master directory. Returns a list of issue strings.

    Issues are prefixed "[ERROR] " (spec violation) or "[WARN] " (missing
    recommended field or convention). With strict=True, all warnings are
    promoted to errors.
    """
    issues: list[str] = []
    name = master_dir.name
    skill_path = master_dir / "SKILL.md"

    if not skill_path.exists():
        issues.append(f"[ERROR] {name}: missing SKILL.md")
        return issues  # Nothing else can be checked without SKILL.md.

    fm, _body, lines = parse_frontmatter(skill_path)

    # --- Required fields ---
    for field in REQUIRED_FIELDS:
        if field not in fm:
            issues.append(f"[ERROR] {name}: missing required field '{field}'")

    # --- Recommended fields (provenance extensions) ---
    for field in RECOMMENDED_FIELDS:
        if field not in fm:
            issues.append(f"[WARN] {name}: missing recommended field '{field}'")

    # --- Description length ---
    desc = fm.get("description", "")
    if isinstance(desc, str) and len(desc) > MAX_DESCRIPTION_CHARS:
        issues.append(f"[WARN] {name}: description exceeds {MAX_DESCRIPTION_CHARS} chars ({len(desc)})")

    # --- SKILL.md line count (keep the body slim for progressive disclosure) ---
    if len(lines) > MAX_SKILL_LINES:
        issues.append(f"[WARN] {name}: SKILL.md exceeds {MAX_SKILL_LINES} lines ({len(lines)})")

    # --- Sources validation: each dict entry needs some provenance handle ---
    sources = fm.get("sources")
    if isinstance(sources, list):
        for i, src in enumerate(sources):
            if isinstance(src, dict):
                if "title" not in src and "cbeta_id" not in src:
                    issues.append(f"[WARN] {name}: sources[{i}] missing 'title' or 'cbeta_id'")

    # --- Directory structure checks ---
    refs_dir = master_dir / "references"
    sources_dir = master_dir / "sources"

    if not refs_dir.exists():
        issues.append(f"[WARN] {name}: missing references/ directory")
    else:
        if not (refs_dir / "voice.md").exists():
            issues.append(f"[WARN] {name}: missing references/voice.md")
        if not (refs_dir / "teaching.md").exists():
            issues.append(f"[WARN] {name}: missing references/teaching.md")

    if not sources_dir.exists():
        issues.append(f"[WARN] {name}: missing sources/ directory")
    elif not list(sources_dir.glob("*.md")):
        issues.append(f"[WARN] {name}: sources/ directory is empty")

    # --- Check for tests ---
    tests_dir = master_dir / "tests"
    if not tests_dir.exists() or not (tests_dir / "fidelity.jsonl").exists():
        issues.append(f"[WARN] {name}: missing tests/fidelity.jsonl")

    # --- Strict mode: treat warnings as errors ---
    if strict:
        # BUG FIX: the replacement previously dropped the trailing space
        # ("[ERROR]missing ..."); keep the separator intact.
        issues = [i.replace("[WARN] ", "[ERROR] ") for i in issues]

    return issues
|
||||
|
||||
|
||||
def main():
    """CLI entry point: lint one or all masters; exit 1 if any [ERROR]s."""
    arg_parser = argparse.ArgumentParser(description="Master-skill SKILL.md linter")
    arg_parser.add_argument("--master", type=str, help="Lint a specific master only")
    arg_parser.add_argument("--strict", action="store_true", help="Treat warnings as errors")
    arg_parser.add_argument("--json", action="store_true", help="Output as JSON")
    opts = arg_parser.parse_args()

    # Resolve the set of master directories to lint.
    if opts.master:
        target = PREBUILT_DIR / opts.master
        if not target.exists():
            print(f"Master '{opts.master}' not found in {PREBUILT_DIR}")
            sys.exit(1)
        targets = [target]
    else:
        targets = sorted(p for p in PREBUILT_DIR.iterdir() if p.is_dir())

    # Collect issues per master; only masters with findings are recorded.
    findings: dict[str, list[str]] = {}
    for target in targets:
        problems = lint_master(target, strict=opts.strict)
        if problems:
            findings[target.name] = problems

    any_errors = any(
        "[ERROR]" in problem
        for problems in findings.values()
        for problem in problems
    )

    if opts.json:
        print(json.dumps(findings, indent=2, ensure_ascii=False))
    elif not findings:
        print(f"✅ All {len(targets)} masters pass validation.")
    else:
        error_count = 0
        warn_count = 0
        for problems in findings.values():
            for problem in problems:
                print(problem)
                error_count += "[ERROR]" in problem
                warn_count += "[WARN]" in problem
            print()
        print(f"Summary: {error_count} error(s), {warn_count} warning(s) across {len(findings)} master(s)")

    sys.exit(1 if any_errors else 0)
|
||||
|
||||
|
||||
# Script entry point; not executed when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user