commit d0709af973
Author: xianren
Date:   2026-04-04 19:57:48 +08:00

    feat: add source verification tool to validate FoJin links

    Scans all prebuilt teachers for CBETA IDs and fojin.app URLs, verifies them
    against the FoJin API, and can rewrite URLs to use internal text_ids (--fix flag).

    Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

#!/usr/bin/env python3
"""
Verify and fix FoJin source links in all prebuilt teacher skills.

Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown and
Python files, then verifies each against FoJin's API and maps it to an
internal text_id.

Key insight: meta.json and URLs use the full CBETA catalog format (e.g.
T08n0235) while FoJin internally uses a shorter cbeta_id (e.g. T0235).
This script handles the conversion.

Usage:
    python3 tools/verify_sources.py        # Dry run - report only
    python3 tools/verify_sources.py --fix  # Actually update files
"""

import argparse
import json
import os
import re
import sys
import time

# Allow importing fojin_bridge from tools/
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))

from fojin_bridge import create_bridge

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt")

# Matches fojin.app/texts/<ID> in URLs; the ID can be CBETA-style or numeric
FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)")

# Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348
FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$")
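
# Illustrative matches for the two patterns above:
#   FOJIN_URL_RE on "https://fojin.app/texts/T08n0235"
#     -> group(1) = "https://fojin.app/texts/", group(2) = "T08n0235"
#   FULL_CBETA_RE on "T08n0235"
#     -> group(1) = "T" (collection), group(2) = "08" (volume), group(3) = "0235" (text number)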


def full_to_short_cbeta(full_id: str) -> str | None:
    """Convert full CBETA ID (T08n0235) to FoJin short format (T0235).

    FoJin stores cbeta_id as the collection prefix + text number,
    dropping the volume number. E.g.:
        T08n0235 -> T0235
        X62n1182 -> X1182
        J36n0348 -> J0348
        T34n1718 -> T1718
    """
    m = FULL_CBETA_RE.match(full_id)
    if not m:
        return None
    prefix = m.group(1)    # T, X, J, etc.
    text_num = m.group(3)  # 0235, 1182, etc.
    return f"{prefix}{text_num}"


def collect_cbeta_ids() -> dict[str, list[str]]:
    """Scan all meta.json files and return {full_cbeta_id: [teacher_slugs]}."""
    cbeta_map: dict[str, list[str]] = {}
    for teacher in sorted(os.listdir(PREBUILT_DIR)):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id"):
                cbeta_id = src["id"]
                cbeta_map.setdefault(cbeta_id, []).append(teacher)
    return cbeta_map
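
# Shape of the meta.json entries this assumes (illustrative; only "type",
# "id", and "title" are read, here and in collect_titles_from_meta below):
# {
#   "sources": [
#     {"type": "cbeta", "id": "T08n0235", "title": "金剛般若波羅蜜經"}
#   ]
# }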


def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]:
    """Scan all md/py files and return {id_in_url: [(filepath, line_num)]}."""
    url_map: dict[str, list[tuple[str, int]]] = {}
    scan_dirs = [PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")]
    extensions = {".md", ".py"}
    for scan_dir in scan_dirs:
        if not os.path.isdir(scan_dir):
            continue
        for root, _dirs, files in os.walk(scan_dir):
            for fname in files:
                if os.path.splitext(fname)[1] not in extensions:
                    continue
                fpath = os.path.join(root, fname)
                with open(fpath, encoding="utf-8") as f:
                    for line_num, line in enumerate(f, 1):
                        for m in FOJIN_URL_RE.finditer(line):
                            text_id_in_url = m.group(2)
                            url_map.setdefault(text_id_in_url, []).append(
                                (fpath, line_num)
                            )
    return url_map
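
# Example return value (paths and line numbers illustrative; paths are
# absolute as produced by os.walk over the scan dirs):
#   {"T08n0235": [("/.../prebuilt/example/SKILL.md", 12)],
#    "123": [("/.../prompts/guide.md", 3)]}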


def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None:
    """Search FoJin for a text by title; return the result whose cbeta_id matches."""
    try:
        resp = bridge.search_texts(title, sources="cbeta", page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
        # Also try without the source filter
        resp = bridge.search_texts(title, page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
    except Exception:
        pass
    return None
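
# Assumed search response shape (only these fields are read, here and in
# verify_ids below):
#   {"results": [{"id": 123, "cbeta_id": "T0235", "title_zh": "..."}]}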


def verify_via_lookup(bridge, short_ids: list[str]) -> dict:
    """Try the batch lookup-cbeta endpoint. Returns {short_cbeta_id: internal_id}."""
    result = {}
    try:
        ids_str = ",".join(short_ids)
        resp = bridge.lookup_cbeta_ids(ids_str)
        if isinstance(resp, dict):
            mapping = resp.get("results") or resp.get("data") or resp
            for sid in short_ids:
                entry = mapping.get(sid)
                if entry and isinstance(entry, dict):
                    result[sid] = entry.get("text_id") or entry.get("id")
                elif entry and isinstance(entry, int):
                    result[sid] = entry
    except Exception:
        pass  # Endpoint may not be implemented; fall back to search
    return result
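
# Lookup response shapes verify_via_lookup tolerates (assumed, since the
# endpoint's exact contract isn't pinned down; any of these would work):
#   {"results": {"T0235": {"text_id": 123}}}
#   {"data": {"T0235": {"id": 123}}}
#   {"T0235": 123}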


def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]:
    """Verify all CBETA IDs; return {full_cbeta_id: {text_id, short_cbeta_id, method, ...}}.

    Args:
        cbeta_map: {full_cbeta_id: [teacher_slugs]}
        titles: {full_cbeta_id: title_from_meta} for the search fallback
    """
    results: dict[str, dict] = {}

    # Build full <-> short mappings
    full_to_short = {}
    short_to_full = {}
    for full_id in cbeta_map:
        short = full_to_short_cbeta(full_id)
        if short:
            full_to_short[full_id] = short
            short_to_full[short] = full_id

    # Try the batch lookup first
    short_ids = list(full_to_short.values())
    lookup_result = verify_via_lookup(bridge, short_ids)

    for full_id, short_id in sorted(full_to_short.items()):
        if short_id in lookup_result:
            results[full_id] = {
                "text_id": lookup_result[short_id],
                "short_cbeta_id": short_id,
                "method": "lookup",
            }
            continue
        # Fallback: search by title
        title = titles.get(full_id, "")
        if title:
            time.sleep(0.2)  # Rate limit
            match = verify_via_search(bridge, title, short_id)
            if match:
                results[full_id] = {
                    "text_id": match["id"],
                    "short_cbeta_id": short_id,
                    "title_zh": match.get("title_zh", ""),
                    "method": "search",
                }
                continue
        # TODO: could try a direct get when the URL ID is already numeric
        # Last resort: not found
        results[full_id] = {
            "text_id": None,
            "short_cbeta_id": short_id,
            "method": "not_found",
        }
    return results
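
# Example of the returned dict (values illustrative):
# {
#   "T08n0235": {"text_id": 123, "short_cbeta_id": "T0235", "method": "lookup"},
#   "X62n1182": {"text_id": None, "short_cbeta_id": "X1182", "method": "not_found"},
# }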


def collect_titles_from_meta() -> dict[str, str]:
    """Collect {full_cbeta_id: title} from meta.json sources."""
    titles = {}
    for teacher in os.listdir(PREBUILT_DIR):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id") and src.get("title"):
                titles[src["id"]] = src["title"]
    return titles


def fix_urls_in_file(
    filepath: str, id_map: dict[str, str], dry_run: bool
) -> list[str]:
    """Replace CBETA IDs with internal text_ids in URLs. Returns list of changes."""
    changes = []
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    def replacer(m):
        prefix = m.group(1)
        old_id = m.group(2)
        if old_id in id_map:
            new_id = id_map[old_id]
            rel = os.path.relpath(filepath, PROJECT_ROOT)
            changes.append(f" {rel}: {old_id} -> {new_id}")
            return prefix + new_id
        return m.group(0)

    new_content = FOJIN_URL_RE.sub(replacer, content)
    if not dry_run and new_content != content:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(new_content)
    return changes
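
# Illustrative call (hypothetical path and mapping): report what would change
# in one file without writing anything back:
#   fix_urls_in_file(os.path.join(PREBUILT_DIR, "example", "SKILL.md"),
#                    {"T08n0235": "123"}, dry_run=True)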


def main():
    parser = argparse.ArgumentParser(description="Verify and fix FoJin source links")
    parser.add_argument(
        "--fix", action="store_true", help="Actually write changes to files"
    )
    args = parser.parse_args()
    dry_run = not args.fix

    print("=" * 60)
    print("FoJin Source Verification Report")
    print("=" * 60)
    if dry_run:
        print("Mode: DRY RUN (use --fix to apply changes)\n")
    else:
        print("Mode: FIX (writing changes to files)\n")

    # Step 1: Collect CBETA IDs from meta.json
    cbeta_map = collect_cbeta_ids()
    all_cbeta_ids = sorted(cbeta_map.keys())
    teacher_count = len(set(t for ts in cbeta_map.values() for t in ts))
    print(f"[1/4] Found {len(all_cbeta_ids)} unique CBETA IDs across {teacher_count} teachers")
    for cid in all_cbeta_ids:
        short = full_to_short_cbeta(cid) or "?"
        print(f" {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}")

    # Step 2: Collect all fojin.app URLs from files
    url_map = collect_all_fojin_urls()
    all_url_ids = sorted(url_map.keys())
    total_urls = sum(len(v) for v in url_map.values())
    print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs")

    # IDs in URLs but not in meta.json sources
    extra_url_ids = set(all_url_ids) - set(all_cbeta_ids)
    if extra_url_ids:
        print(" Extra IDs in URLs (not in meta.json sources):")
        for eid in sorted(extra_url_ids):
            locs = url_map[eid]
            files = set(os.path.relpath(f, PROJECT_ROOT) for f, _ in locs)
            print(f" {eid} in {', '.join(sorted(files))}")

    # Combine: all unique CBETA-style IDs from both meta.json and URLs
    all_ids = set(all_cbeta_ids)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid):
            all_ids.add(uid)

    # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123")
    non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)]
    if non_cbeta_url_ids:
        print(f" Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}")

    # Step 3: Verify with the FoJin API
    print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...")
    bridge = create_bridge()
    if not bridge.test_connection():
        print(" [ERROR] Cannot connect to FoJin API.")
        print(" Set FOJIN_URL environment variable if using a custom instance.")
        sys.exit(1)
    print(" API connection OK")

    # Build the combined cbeta_map (include URL-only IDs)
    combined_map = dict(cbeta_map)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid) and uid not in combined_map:
            combined_map[uid] = ["(URL only)"]

    titles = collect_titles_from_meta()
    verified = verify_ids(bridge, combined_map, titles)
    found = {k: v for k, v in verified.items() if v["text_id"] is not None}
    not_found = {k: v for k, v in verified.items() if v["text_id"] is None}

    print(f"\n Verified: {len(found)}/{len(verified)}")
    for cid in sorted(found):
        info = found[cid]
        title = info.get("title_zh", titles.get(cid, ""))
        print(f" [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})")
    if not_found:
        print(f"\n Not found in FoJin ({len(not_found)}):")
        for cid in sorted(not_found):
            teachers = combined_map.get(cid, ["?"])
            print(f" [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}")

    # Step 4: Update URLs
    # Build the replacement map: full_cbeta_id -> str(internal_text_id)
    id_replacement_map: dict[str, str] = {}
    for full_id, info in found.items():
        id_replacement_map[full_id] = str(info["text_id"])

    action = "Would update" if dry_run else "Updating"
    print(f"\n[4/4] {action} URLs...")
    all_changes = []
    files_to_fix = set()
    for locations in url_map.values():
        for fpath, _ in locations:
            files_to_fix.add(fpath)
    for fpath in sorted(files_to_fix):
        changes = fix_urls_in_file(fpath, id_replacement_map, dry_run)
        all_changes.extend(changes)

    if all_changes:
        for c in all_changes:
            print(c)
        verb = "would be made" if dry_run else "applied"
        print(f"\n Total: {len(all_changes)} URL replacements {verb}")
    else:
        print(" No URL replacements needed")

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" CBETA IDs in meta.json: {len(all_cbeta_ids)}")
    print(f" CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}")
    print(f" Total unique CBETA IDs: {len(all_ids)}")
    print(f" Verified in FoJin: {len(found)}")
    print(f" Not found in FoJin: {len(not_found)}")
    print(f" URL replacements: {len(all_changes)}")
    if dry_run and all_changes:
        print("\n Run with --fix to apply changes.")


if __name__ == "__main__":
    main()