mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add source verification tool to validate FoJin links
Scans all prebuilt teachers for CBETA IDs and fojin.app URLs, verifies against FoJin API, and can fix URLs to use internal text_ids (--fix flag). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,361 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Verify and fix FoJin source links in all prebuilt teacher skills.
|
||||||
|
|
||||||
|
Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown
|
||||||
|
files, then verifies each against FoJin's API and maps to internal text_ids.
|
||||||
|
|
||||||
|
Key insight: meta.json and URLs use the full CBETA catalog format (e.g.
|
||||||
|
T08n0235) while FoJin internally uses a shorter cbeta_id (e.g. T0235).
|
||||||
|
This script handles the conversion.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 tools/verify_sources.py # Dry run - report only
|
||||||
|
python3 tools/verify_sources.py --fix # Actually update files
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Allow importing fojin_bridge from tools/
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
from fojin_bridge import create_bridge
|
||||||
|
|
||||||
|
|
||||||
|
# Repository root, resolved relative to this script (tools/ -> ..).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# One subdirectory per prebuilt teacher skill, each with a meta.json.
PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt")

# Matches fojin.app/texts/<ID> in URLs — ID can be CBETA-style or numeric.
# Group 1 is the URL prefix (kept on rewrite), group 2 the ID to replace.
FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)")

# Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348
# Groups: (collection letter)(volume number)n(text number + optional suffix).
FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$")
|
||||||
|
|
||||||
|
|
||||||
|
def full_to_short_cbeta(full_id: str) -> str | None:
    """Convert a full CBETA catalog ID (e.g. T08n0235) to FoJin's short form (T0235).

    FoJin's internal cbeta_id keeps the collection prefix and the text
    number but drops the volume number between them:
        T08n0235 -> T0235
        X62n1182 -> X1182
        J36n0348 -> J0348
        T34n1718 -> T1718

    Returns None when *full_id* does not match the full CBETA pattern.
    """
    match = FULL_CBETA_RE.match(full_id)
    if match is None:
        return None
    collection, text_number = match.group(1), match.group(3)
    return collection + text_number
|
||||||
|
|
||||||
|
|
||||||
|
def collect_cbeta_ids() -> dict[str, list[str]]:
    """Scan every prebuilt teacher's meta.json for CBETA sources.

    Returns {full_cbeta_id: [teacher_slugs]} mapping each cited CBETA ID
    to the teachers that reference it. Directories without a meta.json
    are skipped; teachers are visited in sorted order so the slug lists
    come out deterministic.
    """
    mapping: dict[str, list[str]] = {}
    for slug in sorted(os.listdir(PREBUILT_DIR)):
        path = os.path.join(PREBUILT_DIR, slug, "meta.json")
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as fh:
            sources = json.load(fh).get("sources", [])
        for source in sources:
            if source.get("type") != "cbeta":
                continue
            full_id = source.get("id")
            if full_id:
                mapping.setdefault(full_id, []).append(slug)
    return mapping
|
||||||
|
|
||||||
|
|
||||||
|
def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]:
    """Scan .md/.py files under prebuilt/ and prompts/ for fojin.app links.

    Returns {id_in_url: [(filepath, line_num), ...]} — every location
    where a fojin.app/texts/<ID> URL occurs, keyed by the <ID> part
    (which may be a CBETA-style ID or a numeric text_id).
    """
    locations: dict[str, list[tuple[str, int]]] = {}
    wanted_ext = {".md", ".py"}

    for base in (PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")):
        # Either directory may be absent in a partial checkout.
        if not os.path.isdir(base):
            continue
        for root, _dirs, names in os.walk(base):
            for name in names:
                if os.path.splitext(name)[1] not in wanted_ext:
                    continue
                path = os.path.join(root, name)
                with open(path, encoding="utf-8") as fh:
                    for lineno, text in enumerate(fh, 1):
                        for hit in FOJIN_URL_RE.finditer(text):
                            locations.setdefault(hit.group(2), []).append(
                                (path, lineno)
                            )
    return locations
|
||||||
|
|
||||||
|
|
||||||
|
def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None:
|
||||||
|
"""Search FoJin for a text by title, return the matching result with cbeta_id match."""
|
||||||
|
try:
|
||||||
|
resp = bridge.search_texts(title, sources="cbeta", page=1, size=5)
|
||||||
|
for r in resp.get("results", []):
|
||||||
|
if r.get("cbeta_id") == short_cbeta_id:
|
||||||
|
return r
|
||||||
|
# Also try without source filter
|
||||||
|
resp = bridge.search_texts(title, page=1, size=5)
|
||||||
|
for r in resp.get("results", []):
|
||||||
|
if r.get("cbeta_id") == short_cbeta_id:
|
||||||
|
return r
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def verify_via_lookup(bridge, short_ids: list[str]) -> dict:
    """Resolve short CBETA IDs via FoJin's batch lookup-cbeta endpoint.

    Args:
        bridge: FoJin API bridge exposing lookup_cbeta_ids(ids_csv).
        short_ids: Short CBETA IDs (e.g. "T0235") to resolve.

    Returns:
        {short_cbeta_id: internal_text_id} for every ID the endpoint
        resolved. IDs the endpoint doesn't know — or whose entry carries
        no usable id — are simply absent, so the caller falls through to
        the title-search path for them.
    """
    result = {}
    try:
        resp = bridge.lookup_cbeta_ids(",".join(short_ids))
        if isinstance(resp, dict):
            # Response shape varies; the mapping may be nested under
            # "results"/"data" or be the top-level object itself.
            mapping = resp.get("results") or resp.get("data") or resp
            for sid in short_ids:
                entry = mapping.get(sid)
                if isinstance(entry, dict):
                    text_id = entry.get("text_id") or entry.get("id")
                    # Only record a real id: storing None here would make
                    # the caller believe the lookup succeeded and skip its
                    # title-search fallback for this sid.
                    if text_id:
                        result[sid] = text_id
                elif isinstance(entry, int) and entry:
                    result[sid] = entry
    except Exception:
        pass  # Endpoint may not be implemented; fall back to search
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]:
    """Verify all CBETA IDs against FoJin and map them to internal text_ids.

    Resolution order per ID: the batch lookup endpoint first (one round
    trip for everything), then a rate-limited title search, then give up.

    Args:
        bridge: FoJin API bridge.
        cbeta_map: {full_cbeta_id: [teacher_slugs]}
        titles: {full_cbeta_id: title_from_meta} for the search fallback.

    Returns:
        {full_cbeta_id: {"text_id": ..., "short_cbeta_id": str | None,
                         "method": "lookup" | "search" | "not_found", ...}}
        where text_id is None for every ID that could not be resolved.
    """
    results: dict[str, dict] = {}

    # Build full -> short mapping. IDs that don't parse as full CBETA IDs
    # are recorded as unresolved instead of being silently dropped, so
    # they show up in the "not found" report.
    full_to_short = {}
    for full_id in cbeta_map:
        short = full_to_short_cbeta(full_id)
        if short:
            full_to_short[full_id] = short
        else:
            results[full_id] = {
                "text_id": None,
                "short_cbeta_id": None,
                "method": "not_found",
            }

    # Try batch lookup first
    lookup_result = verify_via_lookup(bridge, list(full_to_short.values()))

    for full_id, short_id in sorted(full_to_short.items()):
        if short_id in lookup_result:
            results[full_id] = {
                "text_id": lookup_result[short_id],
                "short_cbeta_id": short_id,
                "method": "lookup",
            }
            continue

        # Fallback: search by the title recorded in meta.json, if any.
        title = titles.get(full_id, "")
        if title:
            time.sleep(0.2)  # Rate limit between search calls
            match = verify_via_search(bridge, title, short_id)
            if match:
                results[full_id] = {
                    "text_id": match["id"],
                    "short_cbeta_id": short_id,
                    "title_zh": match.get("title_zh", ""),
                    "method": "search",
                }
                continue

        # Last resort: not found
        results[full_id] = {
            "text_id": None,
            "short_cbeta_id": short_id,
            "method": "not_found",
        }

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def collect_titles_from_meta() -> dict[str, str]:
    """Gather {full_cbeta_id: title} from every prebuilt meta.json.

    The titles feed the title-search fallback during verification.
    Sources missing either an id or a title are ignored.
    """
    titles: dict[str, str] = {}
    for slug in os.listdir(PREBUILT_DIR):
        path = os.path.join(PREBUILT_DIR, slug, "meta.json")
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as fh:
            meta = json.load(fh)
        for source in meta.get("sources", []):
            if source.get("type") != "cbeta":
                continue
            if source.get("id") and source.get("title"):
                titles[source["id"]] = source["title"]
    return titles
|
||||||
|
|
||||||
|
|
||||||
|
def fix_urls_in_file(
    filepath: str, id_map: dict[str, str], dry_run: bool
) -> list[str]:
    """Rewrite fojin.app URLs in one file, mapping CBETA IDs to text_ids.

    Args:
        filepath: File to scan (and, unless dry_run, rewrite in place).
        id_map: {cbeta_id_in_url: internal_text_id_as_string}.
        dry_run: When True, report what would change without writing.

    Returns:
        Human-readable change lines, one per replaced URL.
    """
    changes: list[str] = []
    with open(filepath, encoding="utf-8") as fh:
        original = fh.read()

    def swap_id(match):
        # Keep the URL prefix (group 1); replace only the ID (group 2).
        url_prefix, current_id = match.group(1), match.group(2)
        replacement = id_map.get(current_id)
        if replacement is None:
            return match.group(0)  # Not an ID we can map — leave untouched
        rel = os.path.relpath(filepath, PROJECT_ROOT)
        changes.append(f"  {rel}: {current_id} -> {replacement}")
        return url_prefix + replacement

    updated = FOJIN_URL_RE.sub(swap_id, original)

    # Only touch the file when something actually changed.
    if not dry_run and updated != original:
        with open(filepath, "w", encoding="utf-8") as fh:
            fh.write(updated)

    return changes
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the four-step verification: collect IDs, collect URLs, verify
    against the FoJin API, then update (or dry-run report) URL rewrites."""
    parser = argparse.ArgumentParser(description="Verify and fix FoJin source links")
    parser.add_argument(
        "--fix", action="store_true", help="Actually write changes to files"
    )
    args = parser.parse_args()

    # Default is a read-only report; --fix enables in-place rewriting.
    dry_run = not args.fix

    print("=" * 60)
    print("FoJin Source Verification Report")
    print("=" * 60)
    if dry_run:
        print("Mode: DRY RUN (use --fix to apply changes)\n")
    else:
        print("Mode: FIX (writing changes to files)\n")

    # Step 1: Collect CBETA IDs from meta.json
    cbeta_map = collect_cbeta_ids()
    all_cbeta_ids = sorted(cbeta_map.keys())
    teacher_count = len(set(t for ts in cbeta_map.values() for t in ts))
    print(f"[1/4] Found {len(all_cbeta_ids)} unique CBETA IDs across {teacher_count} teachers")
    for cid in all_cbeta_ids:
        short = full_to_short_cbeta(cid) or "?"
        print(f"  {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}")

    # Step 2: Collect all fojin.app URLs from files
    url_map = collect_all_fojin_urls()
    all_url_ids = sorted(url_map.keys())
    total_urls = sum(len(v) for v in url_map.values())
    print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs")

    # IDs in URLs but not in meta.json sources
    extra_url_ids = set(all_url_ids) - set(all_cbeta_ids)
    if extra_url_ids:
        print(f"  Extra IDs in URLs (not in meta.json sources):")
        for eid in sorted(extra_url_ids):
            locs = url_map[eid]
            files = set(os.path.relpath(f, PROJECT_ROOT) for f, _ in locs)
            print(f"    {eid} in {', '.join(sorted(files))}")

    # Combine: all unique CBETA-style IDs from both meta.json and URLs
    all_ids = set(all_cbeta_ids)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid):
            all_ids.add(uid)

    # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123")
    non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)]
    if non_cbeta_url_ids:
        print(f"  Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}")

    # Step 3: Verify with FoJin API
    print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...")
    bridge = create_bridge()

    # Bail out early if the API is unreachable — nothing else can run.
    if not bridge.test_connection():
        print("  [ERROR] Cannot connect to FoJin API.")
        print("  Set FOJIN_URL environment variable if using a custom instance.")
        sys.exit(1)
    print("  API connection OK")

    # Build combined cbeta_map (include URL-only IDs)
    combined_map = dict(cbeta_map)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid) and uid not in combined_map:
            combined_map[uid] = ["(URL only)"]

    titles = collect_titles_from_meta()
    verified = verify_ids(bridge, combined_map, titles)

    # Partition by whether FoJin resolved the ID to an internal text_id.
    found = {k: v for k, v in verified.items() if v["text_id"] is not None}
    not_found = {k: v for k, v in verified.items() if v["text_id"] is None}

    print(f"\n  Verified: {len(found)}/{len(verified)}")
    for cid in sorted(found):
        info = found[cid]
        title = info.get("title_zh", titles.get(cid, ""))
        print(f"  [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})")

    if not_found:
        print(f"\n  Not found in FoJin ({len(not_found)}):")
        for cid in sorted(not_found):
            teachers = combined_map.get(cid, ["?"])
            print(f"  [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}")

    # Step 4: Update URLs
    # Build replacement map: full_cbeta_id -> str(internal_text_id)
    id_replacement_map: dict[str, str] = {}
    for full_id, info in found.items():
        id_replacement_map[full_id] = str(info["text_id"])

    action = "Would update" if dry_run else "Updating"
    print(f"\n[4/4] {action} URLs...")
    all_changes = []

    # Deduplicate file paths — a file may hold several URL occurrences.
    files_to_fix = set()
    for locations in url_map.values():
        for fpath, _ in locations:
            files_to_fix.add(fpath)

    for fpath in sorted(files_to_fix):
        changes = fix_urls_in_file(fpath, id_replacement_map, dry_run)
        all_changes.extend(changes)

    if all_changes:
        for c in all_changes:
            print(c)
        verb = "would be made" if dry_run else "applied"
        print(f"\n  Total: {len(all_changes)} URL replacements {verb}")
    else:
        print("  No URL replacements needed")

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"  CBETA IDs in meta.json: {len(all_cbeta_ids)}")
    print(f"  CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}")
    print(f"  Total unique CBETA IDs: {len(all_ids)}")
    print(f"  Verified in FoJin: {len(found)}")
    print(f"  Not found in FoJin: {len(not_found)}")
    print(f"  URL replacements: {len(all_changes)}")
    if dry_run and all_changes:
        print("\n  Run with --fix to apply changes.")
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user