diff --git a/tools/verify_sources.py b/tools/verify_sources.py
new file mode 100644
index 0000000..858f39e
--- /dev/null
+++ b/tools/verify_sources.py
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+"""
+Verify and fix FoJin source links in all prebuilt teacher skills.
+
+Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown
+files, then verifies each against FoJin's API and maps them to internal
+text_ids.
+
+Key insight: meta.json and URLs use the full CBETA catalog format (e.g.
+T08n0235), while FoJin internally uses a shorter cbeta_id (e.g. T0235).
+This script handles the conversion.
+
+Usage:
+    python3 tools/verify_sources.py        # Dry run: report only
+    python3 tools/verify_sources.py --fix  # Actually update files
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+
+# Allow importing fojin_bridge from tools/
+sys.path.insert(0, os.path.dirname(__file__))
+
+from fojin_bridge import create_bridge
+
+
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt")
+
+# Matches fojin.app/texts/<id> URLs; the ID can be CBETA-style or numeric
+FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)")
+
+# Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348
+FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$")
+
+
+def full_to_short_cbeta(full_id: str) -> str | None:
+    """Convert a full CBETA ID (T08n0235) to FoJin's short format (T0235).
+
+    FoJin stores cbeta_id as the collection prefix plus the text number,
+    dropping the volume number. E.g.:
+        T08n0235 -> T0235
+        X62n1182 -> X1182
+        J36n0348 -> J0348
+        T34n1718 -> T1718
+    """
+    m = FULL_CBETA_RE.match(full_id)
+    if not m:
+        return None
+    prefix = m.group(1)  # T, X, J, etc.
+    text_num = m.group(3)  # 0235, 1182, etc.
+    return f"{prefix}{text_num}"
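+
+
+# Illustration of the conversion above (the first ID comes from the
+# docstring; "123" is the placeholder ID mentioned in main(), shown here
+# to make the None path concrete):
+#   full_to_short_cbeta("T08n0235")  # -> "T0235"
+#   full_to_short_cbeta("123")       # -> None (not a full catalog ID)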
+ return f"{prefix}{text_num}" + + +def collect_cbeta_ids() -> dict[str, list[str]]: + """Scan all meta.json and return {full_cbeta_id: [teacher_slugs]}.""" + cbeta_map: dict[str, list[str]] = {} + for teacher in sorted(os.listdir(PREBUILT_DIR)): + meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json") + if not os.path.isfile(meta_path): + continue + with open(meta_path, encoding="utf-8") as f: + meta = json.load(f) + for src in meta.get("sources", []): + if src.get("type") == "cbeta" and src.get("id"): + cbeta_id = src["id"] + cbeta_map.setdefault(cbeta_id, []).append(teacher) + return cbeta_map + + +def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]: + """Scan all md/py files and return {id_in_url: [(filepath, line_num)]}.""" + url_map: dict[str, list[tuple[str, int]]] = {} + + scan_dirs = [PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")] + extensions = {".md", ".py"} + + for scan_dir in scan_dirs: + if not os.path.isdir(scan_dir): + continue + for root, _dirs, files in os.walk(scan_dir): + for fname in files: + if os.path.splitext(fname)[1] not in extensions: + continue + fpath = os.path.join(root, fname) + with open(fpath, encoding="utf-8") as f: + for line_num, line in enumerate(f, 1): + for m in FOJIN_URL_RE.finditer(line): + text_id_in_url = m.group(2) + url_map.setdefault(text_id_in_url, []).append( + (fpath, line_num) + ) + return url_map + + +def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None: + """Search FoJin for a text by title, return the matching result with cbeta_id match.""" + try: + resp = bridge.search_texts(title, sources="cbeta", page=1, size=5) + for r in resp.get("results", []): + if r.get("cbeta_id") == short_cbeta_id: + return r + # Also try without source filter + resp = bridge.search_texts(title, page=1, size=5) + for r in resp.get("results", []): + if r.get("cbeta_id") == short_cbeta_id: + return r + except Exception: + pass + return None + + +def verify_via_lookup(bridge, short_ids: list[str]) -> dict: + """Try the batch lookup-cbeta endpoint. Returns {short_cbeta_id: internal_id}.""" + result = {} + try: + ids_str = ",".join(short_ids) + resp = bridge.lookup_cbeta_ids(ids_str) + if isinstance(resp, dict): + mapping = resp.get("results") or resp.get("data") or resp + for sid in short_ids: + entry = mapping.get(sid) + if entry and isinstance(entry, dict): + result[sid] = entry.get("text_id") or entry.get("id") + elif entry and isinstance(entry, int): + result[sid] = entry + except Exception: + pass # Endpoint may not be implemented; fall back to search + return result + + +def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]: + """Verify all CBETA IDs and return {full_cbeta_id: {text_id, short_id, title, ...}}. 
+
+
+def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]:
+    """Verify all CBETA IDs; return {full_cbeta_id: {text_id, short_cbeta_id, method, ...}}.
+
+    Args:
+        cbeta_map: {full_cbeta_id: [teacher_slugs]}
+        titles: {full_cbeta_id: title_from_meta} for the search fallback
+    """
+    results: dict[str, dict] = {}
+
+    # Build the full -> short mapping
+    full_to_short = {}
+    for full_id in cbeta_map:
+        short = full_to_short_cbeta(full_id)
+        if short:
+            full_to_short[full_id] = short
+
+    # Try the batch lookup first
+    short_ids = list(full_to_short.values())
+    lookup_result = verify_via_lookup(bridge, short_ids)
+
+    for full_id, short_id in sorted(full_to_short.items()):
+        if short_id in lookup_result:
+            results[full_id] = {
+                "text_id": lookup_result[short_id],
+                "short_cbeta_id": short_id,
+                "method": "lookup",
+            }
+            continue
+
+        # Fallback: search by title
+        title = titles.get(full_id, "")
+        if title:
+            time.sleep(0.2)  # Rate limit
+            match = verify_via_search(bridge, title, short_id)
+            if match:
+                results[full_id] = {
+                    "text_id": match["id"],
+                    "short_cbeta_id": short_id,
+                    "title_zh": match.get("title_zh", ""),
+                    "method": "search",
+                }
+                continue
+
+        # Last resort: not found
+        results[full_id] = {
+            "text_id": None,
+            "short_cbeta_id": short_id,
+            "method": "not_found",
+        }
+
+    return results
+
+
+def collect_titles_from_meta() -> dict[str, str]:
+    """Collect {full_cbeta_id: title} from meta.json sources."""
+    titles = {}
+    for teacher in os.listdir(PREBUILT_DIR):
+        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
+        if not os.path.isfile(meta_path):
+            continue
+        with open(meta_path, encoding="utf-8") as f:
+            meta = json.load(f)
+        for src in meta.get("sources", []):
+            if src.get("type") == "cbeta" and src.get("id") and src.get("title"):
+                titles[src["id"]] = src["title"]
+    return titles
+
+
+def fix_urls_in_file(
+    filepath: str, id_map: dict[str, str], dry_run: bool
+) -> list[str]:
+    """Replace CBETA IDs with internal text_ids in URLs. Returns a list of changes."""
+    changes = []
+    with open(filepath, encoding="utf-8") as f:
+        content = f.read()
+
+    def replacer(m):
+        prefix = m.group(1)
+        old_id = m.group(2)
+        if old_id in id_map:
+            new_id = id_map[old_id]
+            rel = os.path.relpath(filepath, PROJECT_ROOT)
+            changes.append(f"  {rel}: {old_id} -> {new_id}")
+            return prefix + new_id
+        return m.group(0)
+
+    new_content = FOJIN_URL_RE.sub(replacer, content)
+
+    if not dry_run and new_content != content:
+        with open(filepath, "w", encoding="utf-8") as f:
+            f.write(new_content)
+
+    return changes
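+
+
+# Example of the rewrite fix_urls_in_file() performs (IDs illustrative;
+# "42" stands for whatever internal text_id verification returned):
+#   https://fojin.app/texts/T08n0235 -> https://fojin.app/texts/42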
+ print(f" {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}") + + # Step 2: Collect all fojin.app URLs from files + url_map = collect_all_fojin_urls() + all_url_ids = sorted(url_map.keys()) + total_urls = sum(len(v) for v in url_map.values()) + print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs") + + # IDs in URLs but not in meta.json sources + extra_url_ids = set(all_url_ids) - set(all_cbeta_ids) + if extra_url_ids: + print(f" Extra IDs in URLs (not in meta.json sources):") + for eid in sorted(extra_url_ids): + locs = url_map[eid] + files = set(os.path.relpath(f, PROJECT_ROOT) for f, _ in locs) + print(f" {eid} in {', '.join(sorted(files))}") + + # Combine: all unique CBETA-style IDs from both meta.json and URLs + all_ids = set(all_cbeta_ids) + for uid in all_url_ids: + if FULL_CBETA_RE.match(uid): + all_ids.add(uid) + + # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123") + non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)] + if non_cbeta_url_ids: + print(f" Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}") + + # Step 3: Verify with FoJin API + print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...") + bridge = create_bridge() + + if not bridge.test_connection(): + print(" [ERROR] Cannot connect to FoJin API.") + print(" Set FOJIN_URL environment variable if using a custom instance.") + sys.exit(1) + print(" API connection OK") + + # Build combined cbeta_map (include URL-only IDs) + combined_map = dict(cbeta_map) + for uid in all_url_ids: + if FULL_CBETA_RE.match(uid) and uid not in combined_map: + combined_map[uid] = ["(URL only)"] + + titles = collect_titles_from_meta() + verified = verify_ids(bridge, combined_map, titles) + + found = {k: v for k, v in verified.items() if v["text_id"] is not None} + not_found = {k: v for k, v in verified.items() if v["text_id"] is None} + + print(f"\n Verified: {len(found)}/{len(verified)}") + for cid in sorted(found): + info = found[cid] + title = info.get("title_zh", titles.get(cid, "")) + print(f" [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})") + + if not_found: + print(f"\n Not found in FoJin ({len(not_found)}):") + for cid in sorted(not_found): + teachers = combined_map.get(cid, ["?"]) + print(f" [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}") + + # Step 4: Update URLs + # Build replacement map: full_cbeta_id -> str(internal_text_id) + id_replacement_map: dict[str, str] = {} + for full_id, info in found.items(): + id_replacement_map[full_id] = str(info["text_id"]) + + action = "Would update" if dry_run else "Updating" + print(f"\n[4/4] {action} URLs...") + all_changes = [] + + files_to_fix = set() + for locations in url_map.values(): + for fpath, _ in locations: + files_to_fix.add(fpath) + + for fpath in sorted(files_to_fix): + changes = fix_urls_in_file(fpath, id_replacement_map, dry_run) + all_changes.extend(changes) + + if all_changes: + for c in all_changes: + print(c) + verb = "would be made" if dry_run else "applied" + print(f"\n Total: {len(all_changes)} URL replacements {verb}") + else: + print(" No URL replacements needed") + + # Summary + print("\n" + "=" * 60) + print("Summary") + print("=" * 60) + print(f" CBETA IDs in meta.json: {len(all_cbeta_ids)}") + print(f" CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}") + print(f" Total unique CBETA IDs: {len(all_ids)}") + print(f" Verified in FoJin: {len(found)}") + print(f" Not found in FoJin: 
{len(not_found)}") + print(f" URL replacements: {len(all_changes)}") + if dry_run and all_changes: + print("\n Run with --fix to apply changes.") + + +if __name__ == "__main__": + main()