mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add source verification tool to validate FoJin links
Scans all prebuilt teachers for CBETA IDs and fojin.app URLs, verifies each against the FoJin API, and can rewrite URLs to use internal text_ids (--fix flag). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -0,0 +1,361 @@
#!/usr/bin/env python3
"""
Verify and fix FoJin source links in all prebuilt teacher skills.

Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown
files, then verifies each against FoJin's API and maps them to internal
text_ids.

Key insight: meta.json and URLs use the full CBETA catalog format (e.g.
T08n0235), while FoJin internally uses a shorter cbeta_id (e.g. T0235).
This script handles the conversion.

Usage:
    python3 tools/verify_sources.py        # Dry run - report only
    python3 tools/verify_sources.py --fix  # Actually update files
"""

import argparse
import json
import os
import re
import sys
import time

# Allow importing fojin_bridge from tools/
sys.path.insert(0, os.path.dirname(__file__))

from fojin_bridge import create_bridge


PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt")

# Matches fojin.app/texts/<ID> in URLs — ID can be CBETA-style or numeric
FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)")

# Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348
FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$")
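
# Illustrative matches for the two patterns above:
#   FOJIN_URL_RE:  "https://fojin.app/texts/T08n0235" and "https://fojin.app/texts/123"
#   FULL_CBETA_RE: "T08n0235" matches; the short form "T0235" does not (no "n")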


def full_to_short_cbeta(full_id: str) -> str | None:
    """Convert full CBETA ID (T08n0235) to FoJin short format (T0235).

    FoJin stores cbeta_id as the collection prefix + text number,
    dropping the volume number. E.g.:
        T08n0235 -> T0235
        X62n1182 -> X1182
        J36n0348 -> J0348
        T34n1718 -> T1718
    """
    m = FULL_CBETA_RE.match(full_id)
    if not m:
        return None
    prefix = m.group(1)  # T, X, J, etc.
    text_num = m.group(3)  # 0235, 1182, etc.
    return f"{prefix}{text_num}"
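
# Illustrative results for the helper above (first two pairs are from its
# docstring; the third shows the None case):
#   full_to_short_cbeta("T08n0235") == "T0235"
#   full_to_short_cbeta("X62n1182") == "X1182"
#   full_to_short_cbeta("T0235") is None  # already short: no "n" separator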


def collect_cbeta_ids() -> dict[str, list[str]]:
    """Scan all meta.json and return {full_cbeta_id: [teacher_slugs]}."""
    cbeta_map: dict[str, list[str]] = {}
    for teacher in sorted(os.listdir(PREBUILT_DIR)):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id"):
                cbeta_id = src["id"]
                cbeta_map.setdefault(cbeta_id, []).append(teacher)
    return cbeta_map
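
# Example return shape (teacher slugs are hypothetical):
#   {"T08n0235": ["teacher-a"], "X62n1182": ["teacher-b", "teacher-c"]}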


def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]:
    """Scan all md/py files and return {id_in_url: [(filepath, line_num)]}."""
    url_map: dict[str, list[tuple[str, int]]] = {}

    scan_dirs = [PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")]
    extensions = {".md", ".py"}

    for scan_dir in scan_dirs:
        if not os.path.isdir(scan_dir):
            continue
        for root, _dirs, files in os.walk(scan_dir):
            for fname in files:
                if os.path.splitext(fname)[1] not in extensions:
                    continue
                fpath = os.path.join(root, fname)
                with open(fpath, encoding="utf-8") as f:
                    for line_num, line in enumerate(f, 1):
                        for m in FOJIN_URL_RE.finditer(line):
                            text_id_in_url = m.group(2)
                            url_map.setdefault(text_id_in_url, []).append(
                                (fpath, line_num)
                            )
    return url_map
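
# Example return shape (entries illustrative; stored paths are absolute):
#   {"T08n0235": [("<abs>/prebuilt/teacher-a/SKILL.md", 12)],
#    "123": [("<abs>/prompts/base.md", 7)]}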


def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None:
    """Search FoJin for a text by title; return the result whose cbeta_id matches."""
    try:
        resp = bridge.search_texts(title, sources="cbeta", page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
        # Also try without the source filter
        resp = bridge.search_texts(title, page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
    except Exception:
        pass
    return None
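
# Note: the title search is fuzzy, so a result only counts as a match when
# its cbeta_id equals the expected short ID; near-miss titles are ignored.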


def verify_via_lookup(bridge, short_ids: list[str]) -> dict:
    """Try the batch lookup-cbeta endpoint. Returns {short_cbeta_id: internal_id}."""
    result = {}
    try:
        ids_str = ",".join(short_ids)
        resp = bridge.lookup_cbeta_ids(ids_str)
        if isinstance(resp, dict):
            mapping = resp.get("results") or resp.get("data") or resp
            for sid in short_ids:
                entry = mapping.get(sid)
                if entry and isinstance(entry, dict):
                    result[sid] = entry.get("text_id") or entry.get("id")
                elif entry and isinstance(entry, int):
                    result[sid] = entry
    except Exception:
        pass  # Endpoint may not be implemented; fall back to search
    return result
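
# The lookup above tolerates two plausible response shapes, since the
# endpoint's exact contract is not pinned down here:
#   {"results": {"T0235": {"text_id": 42}}}   or   {"T0235": 42}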


def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]:
    """Verify all CBETA IDs and return {full_cbeta_id: {text_id, short_id, title, ...}}.

    Args:
        cbeta_map: {full_cbeta_id: [teacher_slugs]}
        titles: {full_cbeta_id: title_from_meta} for search fallback
    """
    results: dict[str, dict] = {}

    # Build full -> short mapping
    full_to_short = {}
    short_to_full = {}
    for full_id in cbeta_map:
        short = full_to_short_cbeta(full_id)
        if short:
            full_to_short[full_id] = short
            short_to_full[short] = full_id

    # Try batch lookup first
    short_ids = list(full_to_short.values())
    lookup_result = verify_via_lookup(bridge, short_ids)

    for full_id, short_id in sorted(full_to_short.items()):
        if short_id in lookup_result:
            results[full_id] = {
                "text_id": lookup_result[short_id],
                "short_cbeta_id": short_id,
                "method": "lookup",
            }
            continue

        # Fallback: search by title
        title = titles.get(full_id, "")
        if title:
            time.sleep(0.2)  # Rate limit
            match = verify_via_search(bridge, title, short_id)
            if match:
                results[full_id] = {
                    "text_id": match["id"],
                    "short_cbeta_id": short_id,
                    "title_zh": match.get("title_zh", ""),
                    "method": "search",
                }
                continue

        # Last resort: not found
        results[full_id] = {
            "text_id": None,
            "short_cbeta_id": short_id,
            "method": "not_found",
        }

    return results
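
# Example entries in the returned dict (values illustrative):
#   "T08n0235": {"text_id": 42, "short_cbeta_id": "T0235", "method": "lookup"}
#   "T99n9999": {"text_id": None, "short_cbeta_id": "T9999", "method": "not_found"}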


def collect_titles_from_meta() -> dict[str, str]:
    """Collect {full_cbeta_id: title} from meta.json sources."""
    titles = {}
    for teacher in os.listdir(PREBUILT_DIR):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id") and src.get("title"):
                titles[src["id"]] = src["title"]
    return titles


def fix_urls_in_file(
    filepath: str, id_map: dict[str, str], dry_run: bool
) -> list[str]:
    """Replace CBETA IDs with internal text_ids in URLs. Returns list of changes."""
    changes = []
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    def replacer(m):
        prefix = m.group(1)
        old_id = m.group(2)
        if old_id in id_map:
            new_id = id_map[old_id]
            rel = os.path.relpath(filepath, PROJECT_ROOT)
            changes.append(f"  {rel}: {old_id} -> {new_id}")
            return prefix + new_id
        return m.group(0)

    new_content = FOJIN_URL_RE.sub(replacer, content)

    if not dry_run and new_content != content:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(new_content)

    return changes
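
# Note: `changes` is collected in both modes; the file is only rewritten when
# dry_run is False and at least one URL actually changed.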


def main():
    parser = argparse.ArgumentParser(description="Verify and fix FoJin source links")
    parser.add_argument(
        "--fix", action="store_true", help="Actually write changes to files"
    )
    args = parser.parse_args()

    dry_run = not args.fix

    print("=" * 60)
    print("FoJin Source Verification Report")
    print("=" * 60)
    if dry_run:
        print("Mode: DRY RUN (use --fix to apply changes)\n")
    else:
        print("Mode: FIX (writing changes to files)\n")

    # Step 1: Collect CBETA IDs from meta.json
    cbeta_map = collect_cbeta_ids()
    all_cbeta_ids = sorted(cbeta_map.keys())
    teacher_count = len({t for ts in cbeta_map.values() for t in ts})
    print(f"[1/4] Found {len(all_cbeta_ids)} unique CBETA IDs across {teacher_count} teachers")
    for cid in all_cbeta_ids:
        short = full_to_short_cbeta(cid) or "?"
        print(f"  {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}")

    # Step 2: Collect all fojin.app URLs from files
    url_map = collect_all_fojin_urls()
    all_url_ids = sorted(url_map.keys())
    total_urls = sum(len(v) for v in url_map.values())
    print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs")

    # IDs in URLs but not in meta.json sources
    extra_url_ids = set(all_url_ids) - set(all_cbeta_ids)
    if extra_url_ids:
        print("  Extra IDs in URLs (not in meta.json sources):")
        for eid in sorted(extra_url_ids):
            locs = url_map[eid]
            files = {os.path.relpath(f, PROJECT_ROOT) for f, _ in locs}
            print(f"    {eid} in {', '.join(sorted(files))}")

    # Combine: all unique CBETA-style IDs from both meta.json and URLs
    all_ids = set(all_cbeta_ids)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid):
            all_ids.add(uid)

    # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123")
    non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)]
    if non_cbeta_url_ids:
        print(f"  Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}")

    # Step 3: Verify with FoJin API
    print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...")
    bridge = create_bridge()

    if not bridge.test_connection():
        print("  [ERROR] Cannot connect to FoJin API.")
        print("  Set FOJIN_URL environment variable if using a custom instance.")
        sys.exit(1)
    print("  API connection OK")

    # Build combined cbeta_map (include URL-only IDs)
    combined_map = dict(cbeta_map)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid) and uid not in combined_map:
            combined_map[uid] = ["(URL only)"]

    titles = collect_titles_from_meta()
    verified = verify_ids(bridge, combined_map, titles)

    found = {k: v for k, v in verified.items() if v["text_id"] is not None}
    not_found = {k: v for k, v in verified.items() if v["text_id"] is None}

    print(f"\n  Verified: {len(found)}/{len(verified)}")
    for cid in sorted(found):
        info = found[cid]
        title = info.get("title_zh", titles.get(cid, ""))
        print(f"  [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})")

    if not_found:
        print(f"\n  Not found in FoJin ({len(not_found)}):")
        for cid in sorted(not_found):
            teachers = combined_map.get(cid, ["?"])
            print(f"  [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}")

    # Step 4: Update URLs
    # Build replacement map: full_cbeta_id -> str(internal_text_id)
    id_replacement_map: dict[str, str] = {}
    for full_id, info in found.items():
        id_replacement_map[full_id] = str(info["text_id"])

    action = "Would update" if dry_run else "Updating"
    print(f"\n[4/4] {action} URLs...")
    all_changes = []

    files_to_fix = set()
    for locations in url_map.values():
        for fpath, _ in locations:
            files_to_fix.add(fpath)

    for fpath in sorted(files_to_fix):
        changes = fix_urls_in_file(fpath, id_replacement_map, dry_run)
        all_changes.extend(changes)

    if all_changes:
        for c in all_changes:
            print(c)
        verb = "would be made" if dry_run else "applied"
        print(f"\n  Total: {len(all_changes)} URL replacements {verb}")
    else:
        print("  No URL replacements needed")

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"  CBETA IDs in meta.json: {len(all_cbeta_ids)}")
    print(f"  CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}")
    print(f"  Total unique CBETA IDs: {len(all_ids)}")
    print(f"  Verified in FoJin: {len(found)}")
    print(f"  Not found in FoJin: {len(not_found)}")
    print(f"  URL replacements: {len(all_changes)}")
    if dry_run and all_changes:
        print("\n  Run with --fix to apply changes.")


if __name__ == "__main__":
    main()