#!/usr/bin/env python3 """ Verify and fix FoJin source links in all prebuilt teacher skills. Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown files, then verifies each against FoJin's API and maps to internal text_ids. Key insight: meta.json and URLs use the full CBETA catalog format (e.g. T08n0235) while FoJin internally uses a shorter cbeta_id (e.g. T0235). This script handles the conversion. Usage: python3 tools/verify_sources.py # Dry run - report only python3 tools/verify_sources.py --fix # Actually update files """ import argparse import json import os import re import sys import time # Allow importing fojin_bridge from tools/ sys.path.insert(0, os.path.join(os.path.dirname(__file__))) from fojin_bridge import create_bridge PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt") # Matches fojin.app/texts/ in URLs — ID can be CBETA-style or numeric FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)") # Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348 FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$") def full_to_short_cbeta(full_id: str) -> str | None: """Convert full CBETA ID (T08n0235) to FoJin short format (T0235). FoJin stores cbeta_id as the collection prefix + text number, dropping the volume number. E.g.: T08n0235 -> T0235 X62n1182 -> X1182 J36n0348 -> J0348 T34n1718 -> T1718 """ m = FULL_CBETA_RE.match(full_id) if not m: return None prefix = m.group(1) # T, X, J, etc. text_num = m.group(3) # 0235, 1182, etc. return f"{prefix}{text_num}" def collect_cbeta_ids() -> dict[str, list[str]]: """Scan all meta.json and return {full_cbeta_id: [teacher_slugs]}.""" cbeta_map: dict[str, list[str]] = {} for teacher in sorted(os.listdir(PREBUILT_DIR)): meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json") if not os.path.isfile(meta_path): continue with open(meta_path, encoding="utf-8") as f: meta = json.load(f) for src in meta.get("sources", []): if src.get("type") == "cbeta" and src.get("id"): cbeta_id = src["id"] cbeta_map.setdefault(cbeta_id, []).append(teacher) return cbeta_map def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]: """Scan all md/py files and return {id_in_url: [(filepath, line_num)]}.""" url_map: dict[str, list[tuple[str, int]]] = {} scan_dirs = [PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")] extensions = {".md", ".py"} for scan_dir in scan_dirs: if not os.path.isdir(scan_dir): continue for root, _dirs, files in os.walk(scan_dir): for fname in files: if os.path.splitext(fname)[1] not in extensions: continue fpath = os.path.join(root, fname) with open(fpath, encoding="utf-8") as f: for line_num, line in enumerate(f, 1): for m in FOJIN_URL_RE.finditer(line): text_id_in_url = m.group(2) url_map.setdefault(text_id_in_url, []).append( (fpath, line_num) ) return url_map def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None: """Search FoJin for a text by title, return the matching result with cbeta_id match.""" try: resp = bridge.search_texts(title, sources="cbeta", page=1, size=5) for r in resp.get("results", []): if r.get("cbeta_id") == short_cbeta_id: return r # Also try without source filter resp = bridge.search_texts(title, page=1, size=5) for r in resp.get("results", []): if r.get("cbeta_id") == short_cbeta_id: return r except Exception: pass return None def verify_via_lookup(bridge, short_ids: list[str]) -> dict: """Try the batch lookup-cbeta endpoint. Returns {short_cbeta_id: internal_id}.""" result = {} try: ids_str = ",".join(short_ids) resp = bridge.lookup_cbeta_ids(ids_str) if isinstance(resp, dict): mapping = resp.get("results") or resp.get("data") or resp for sid in short_ids: entry = mapping.get(sid) if entry and isinstance(entry, dict): result[sid] = entry.get("text_id") or entry.get("id") elif entry and isinstance(entry, int): result[sid] = entry except Exception: pass # Endpoint may not be implemented; fall back to search return result def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]: """Verify all CBETA IDs and return {full_cbeta_id: {text_id, short_id, title, ...}}. Args: cbeta_map: {full_cbeta_id: [teacher_slugs]} titles: {full_cbeta_id: title_from_meta} for search fallback """ results: dict[str, dict] = {} # Build full -> short mapping full_to_short = {} short_to_full = {} for full_id in cbeta_map: short = full_to_short_cbeta(full_id) if short: full_to_short[full_id] = short short_to_full[short] = full_id # Try batch lookup first short_ids = list(full_to_short.values()) lookup_result = verify_via_lookup(bridge, short_ids) for full_id, short_id in sorted(full_to_short.items()): if short_id in lookup_result: results[full_id] = { "text_id": lookup_result[short_id], "short_cbeta_id": short_id, "method": "lookup", } continue # Fallback: search by title title = titles.get(full_id, "") if title: time.sleep(0.2) # Rate limit match = verify_via_search(bridge, title, short_id) if match: results[full_id] = { "text_id": match["id"], "short_cbeta_id": short_id, "title_zh": match.get("title_zh", ""), "method": "search", } continue # Try direct get with the internal ID if it's numeric-ish # Last resort: not found results[full_id] = { "text_id": None, "short_cbeta_id": short_id, "method": "not_found", } return results def collect_titles_from_meta() -> dict[str, str]: """Collect {full_cbeta_id: title} from meta.json sources.""" titles = {} for teacher in os.listdir(PREBUILT_DIR): meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json") if not os.path.isfile(meta_path): continue with open(meta_path, encoding="utf-8") as f: meta = json.load(f) for src in meta.get("sources", []): if src.get("type") == "cbeta" and src.get("id") and src.get("title"): titles[src["id"]] = src["title"] return titles def fix_urls_in_file( filepath: str, id_map: dict[str, str], dry_run: bool ) -> list[str]: """Replace CBETA IDs with internal text_ids in URLs. Returns list of changes.""" changes = [] with open(filepath, encoding="utf-8") as f: content = f.read() def replacer(m): prefix = m.group(1) old_id = m.group(2) if old_id in id_map: new_id = id_map[old_id] rel = os.path.relpath(filepath, PROJECT_ROOT) changes.append(f" {rel}: {old_id} -> {new_id}") return prefix + new_id return m.group(0) new_content = FOJIN_URL_RE.sub(replacer, content) if not dry_run and new_content != content: with open(filepath, "w", encoding="utf-8") as f: f.write(new_content) return changes def main(): parser = argparse.ArgumentParser(description="Verify and fix FoJin source links") parser.add_argument( "--fix", action="store_true", help="Actually write changes to files" ) args = parser.parse_args() dry_run = not args.fix print("=" * 60) print("FoJin Source Verification Report") print("=" * 60) if dry_run: print("Mode: DRY RUN (use --fix to apply changes)\n") else: print("Mode: FIX (writing changes to files)\n") # Step 1: Collect CBETA IDs from meta.json cbeta_map = collect_cbeta_ids() all_cbeta_ids = sorted(cbeta_map.keys()) teacher_count = len(set(t for ts in cbeta_map.values() for t in ts)) print(f"[1/4] Found {len(all_cbeta_ids)} unique CBETA IDs across {teacher_count} teachers") for cid in all_cbeta_ids: short = full_to_short_cbeta(cid) or "?" print(f" {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}") # Step 2: Collect all fojin.app URLs from files url_map = collect_all_fojin_urls() all_url_ids = sorted(url_map.keys()) total_urls = sum(len(v) for v in url_map.values()) print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs") # IDs in URLs but not in meta.json sources extra_url_ids = set(all_url_ids) - set(all_cbeta_ids) if extra_url_ids: print(f" Extra IDs in URLs (not in meta.json sources):") for eid in sorted(extra_url_ids): locs = url_map[eid] files = set(os.path.relpath(f, PROJECT_ROOT) for f, _ in locs) print(f" {eid} in {', '.join(sorted(files))}") # Combine: all unique CBETA-style IDs from both meta.json and URLs all_ids = set(all_cbeta_ids) for uid in all_url_ids: if FULL_CBETA_RE.match(uid): all_ids.add(uid) # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123") non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)] if non_cbeta_url_ids: print(f" Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}") # Step 3: Verify with FoJin API print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...") bridge = create_bridge() if not bridge.test_connection(): print(" [ERROR] Cannot connect to FoJin API.") print(" Set FOJIN_URL environment variable if using a custom instance.") sys.exit(1) print(" API connection OK") # Build combined cbeta_map (include URL-only IDs) combined_map = dict(cbeta_map) for uid in all_url_ids: if FULL_CBETA_RE.match(uid) and uid not in combined_map: combined_map[uid] = ["(URL only)"] titles = collect_titles_from_meta() verified = verify_ids(bridge, combined_map, titles) found = {k: v for k, v in verified.items() if v["text_id"] is not None} not_found = {k: v for k, v in verified.items() if v["text_id"] is None} print(f"\n Verified: {len(found)}/{len(verified)}") for cid in sorted(found): info = found[cid] title = info.get("title_zh", titles.get(cid, "")) print(f" [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})") if not_found: print(f"\n Not found in FoJin ({len(not_found)}):") for cid in sorted(not_found): teachers = combined_map.get(cid, ["?"]) print(f" [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}") # Step 4: Update URLs # Build replacement map: full_cbeta_id -> str(internal_text_id) id_replacement_map: dict[str, str] = {} for full_id, info in found.items(): id_replacement_map[full_id] = str(info["text_id"]) action = "Would update" if dry_run else "Updating" print(f"\n[4/4] {action} URLs...") all_changes = [] files_to_fix = set() for locations in url_map.values(): for fpath, _ in locations: files_to_fix.add(fpath) for fpath in sorted(files_to_fix): changes = fix_urls_in_file(fpath, id_replacement_map, dry_run) all_changes.extend(changes) if all_changes: for c in all_changes: print(c) verb = "would be made" if dry_run else "applied" print(f"\n Total: {len(all_changes)} URL replacements {verb}") else: print(" No URL replacements needed") # Summary print("\n" + "=" * 60) print("Summary") print("=" * 60) print(f" CBETA IDs in meta.json: {len(all_cbeta_ids)}") print(f" CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}") print(f" Total unique CBETA IDs: {len(all_ids)}") print(f" Verified in FoJin: {len(found)}") print(f" Not found in FoJin: {len(not_found)}") print(f" URL replacements: {len(all_changes)}") if dry_run and all_changes: print("\n Run with --fix to apply changes.") if __name__ == "__main__": main()