mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add source verification tool to validate FoJin links
Scans all prebuilt teachers for CBETA IDs and fojin.app URLs, verifies each against the FoJin API, and can rewrite URLs to use internal text_ids (--fix flag). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -0,0 +1,361 @@
#!/usr/bin/env python3
"""
Verify and fix FoJin source links in all prebuilt teacher skills.

Discovers CBETA IDs from meta.json sources and fojin.app URLs in markdown
files, then verifies each against FoJin's API and maps them to internal
text_ids.

Key insight: meta.json and URLs use the full CBETA catalog format (e.g.
T08n0235), while FoJin internally uses a shorter cbeta_id (e.g. T0235).
This script handles the conversion.

Usage:
    python3 tools/verify_sources.py        # Dry run - report only
    python3 tools/verify_sources.py --fix  # Actually update files
"""

import argparse
import json
import os
import re
import sys
import time

# Allow importing fojin_bridge from tools/
sys.path.insert(0, os.path.dirname(__file__))

from fojin_bridge import create_bridge


PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
PREBUILT_DIR = os.path.join(PROJECT_ROOT, "prebuilt")

# Matches fojin.app/texts/<ID> in URLs — ID can be CBETA-style or numeric
FOJIN_URL_RE = re.compile(r"(https?://fojin\.app/texts/)([A-Za-z0-9n]+)")

# Full CBETA catalog ID pattern: T08n0235, X62n1182, J36n0348
FULL_CBETA_RE = re.compile(r"^([A-Z])(\d+)n(\d+[a-z]?)$")
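
# Illustrative matches for the two patterns above:
#   FOJIN_URL_RE:  "https://fojin.app/texts/T08n0235" and "https://fojin.app/texts/123"
#   FULL_CBETA_RE: "T08n0235" matches; the short form "T0235" does not (no "n")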


def full_to_short_cbeta(full_id: str) -> str | None:
    """Convert full CBETA ID (T08n0235) to FoJin short format (T0235).

    FoJin stores cbeta_id as the collection prefix + text number,
    dropping the volume number. E.g.:
        T08n0235 -> T0235
        X62n1182 -> X1182
        J36n0348 -> J0348
        T34n1718 -> T1718
    """
    m = FULL_CBETA_RE.match(full_id)
    if not m:
        return None
    prefix = m.group(1)  # T, X, J, etc.
    text_num = m.group(3)  # 0235, 1182, etc.
    return f"{prefix}{text_num}"
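
# Illustrative results for the helper above (first two pairs are from its
# docstring; the third shows the None case):
#   full_to_short_cbeta("T08n0235") == "T0235"
#   full_to_short_cbeta("X62n1182") == "X1182"
#   full_to_short_cbeta("T0235") is None  # already short: no "n" separator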


def collect_cbeta_ids() -> dict[str, list[str]]:
    """Scan all meta.json and return {full_cbeta_id: [teacher_slugs]}."""
    cbeta_map: dict[str, list[str]] = {}
    for teacher in sorted(os.listdir(PREBUILT_DIR)):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id"):
                cbeta_id = src["id"]
                cbeta_map.setdefault(cbeta_id, []).append(teacher)
    return cbeta_map
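
# Example return shape (teacher slugs are hypothetical):
#   {"T08n0235": ["teacher-a"], "X62n1182": ["teacher-b", "teacher-c"]}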


def collect_all_fojin_urls() -> dict[str, list[tuple[str, int]]]:
    """Scan all md/py files and return {id_in_url: [(filepath, line_num)]}."""
    url_map: dict[str, list[tuple[str, int]]] = {}

    scan_dirs = [PREBUILT_DIR, os.path.join(PROJECT_ROOT, "prompts")]
    extensions = {".md", ".py"}

    for scan_dir in scan_dirs:
        if not os.path.isdir(scan_dir):
            continue
        for root, _dirs, files in os.walk(scan_dir):
            for fname in files:
                if os.path.splitext(fname)[1] not in extensions:
                    continue
                fpath = os.path.join(root, fname)
                with open(fpath, encoding="utf-8") as f:
                    for line_num, line in enumerate(f, 1):
                        for m in FOJIN_URL_RE.finditer(line):
                            text_id_in_url = m.group(2)
                            url_map.setdefault(text_id_in_url, []).append(
                                (fpath, line_num)
                            )
    return url_map
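
# Example return shape (entries illustrative; stored paths are absolute):
#   {"T08n0235": [("<abs>/prebuilt/teacher-a/SKILL.md", 12)],
#    "123": [("<abs>/prompts/base.md", 7)]}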


def verify_via_search(bridge, title: str, short_cbeta_id: str) -> dict | None:
    """Search FoJin for a text by title; return the result whose cbeta_id matches."""
    try:
        resp = bridge.search_texts(title, sources="cbeta", page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
        # Also try without the source filter
        resp = bridge.search_texts(title, page=1, size=5)
        for r in resp.get("results", []):
            if r.get("cbeta_id") == short_cbeta_id:
                return r
    except Exception:
        pass
    return None
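
# Note: the title search is fuzzy, so a result only counts as a match when
# its cbeta_id equals the expected short ID; near-miss titles are ignored.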


def verify_via_lookup(bridge, short_ids: list[str]) -> dict:
    """Try the batch lookup-cbeta endpoint. Returns {short_cbeta_id: internal_id}."""
    result = {}
    try:
        ids_str = ",".join(short_ids)
        resp = bridge.lookup_cbeta_ids(ids_str)
        if isinstance(resp, dict):
            mapping = resp.get("results") or resp.get("data") or resp
            for sid in short_ids:
                entry = mapping.get(sid)
                if entry and isinstance(entry, dict):
                    result[sid] = entry.get("text_id") or entry.get("id")
                elif entry and isinstance(entry, int):
                    result[sid] = entry
    except Exception:
        pass  # Endpoint may not be implemented; fall back to search
    return result
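
# The lookup above tolerates two plausible response shapes, since the
# endpoint's exact contract is not pinned down here:
#   {"results": {"T0235": {"text_id": 42}}}   or   {"T0235": 42}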


def verify_ids(bridge, cbeta_map: dict[str, list[str]], titles: dict[str, str]) -> dict[str, dict]:
    """Verify all CBETA IDs and return {full_cbeta_id: {text_id, short_id, title, ...}}.

    Args:
        cbeta_map: {full_cbeta_id: [teacher_slugs]}
        titles: {full_cbeta_id: title_from_meta} for search fallback
    """
    results: dict[str, dict] = {}

    # Build full -> short mapping
    full_to_short = {}
    short_to_full = {}
    for full_id in cbeta_map:
        short = full_to_short_cbeta(full_id)
        if short:
            full_to_short[full_id] = short
            short_to_full[short] = full_id

    # Try batch lookup first
    short_ids = list(full_to_short.values())
    lookup_result = verify_via_lookup(bridge, short_ids)

    for full_id, short_id in sorted(full_to_short.items()):
        if short_id in lookup_result:
            results[full_id] = {
                "text_id": lookup_result[short_id],
                "short_cbeta_id": short_id,
                "method": "lookup",
            }
            continue

        # Fallback: search by title
        title = titles.get(full_id, "")
        if title:
            time.sleep(0.2)  # Rate limit
            match = verify_via_search(bridge, title, short_id)
            if match:
                results[full_id] = {
                    "text_id": match["id"],
                    "short_cbeta_id": short_id,
                    "title_zh": match.get("title_zh", ""),
                    "method": "search",
                }
                continue

        # Last resort: not found
        results[full_id] = {
            "text_id": None,
            "short_cbeta_id": short_id,
            "method": "not_found",
        }

    return results
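
# Example entries in the returned dict (values illustrative):
#   "T08n0235": {"text_id": 42, "short_cbeta_id": "T0235", "method": "lookup"}
#   "T99n9999": {"text_id": None, "short_cbeta_id": "T9999", "method": "not_found"}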


def collect_titles_from_meta() -> dict[str, str]:
    """Collect {full_cbeta_id: title} from meta.json sources."""
    titles = {}
    for teacher in os.listdir(PREBUILT_DIR):
        meta_path = os.path.join(PREBUILT_DIR, teacher, "meta.json")
        if not os.path.isfile(meta_path):
            continue
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)
        for src in meta.get("sources", []):
            if src.get("type") == "cbeta" and src.get("id") and src.get("title"):
                titles[src["id"]] = src["title"]
    return titles


def fix_urls_in_file(
    filepath: str, id_map: dict[str, str], dry_run: bool
) -> list[str]:
    """Replace CBETA IDs with internal text_ids in URLs. Returns list of changes."""
    changes = []
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    def replacer(m):
        prefix = m.group(1)
        old_id = m.group(2)
        if old_id in id_map:
            new_id = id_map[old_id]
            rel = os.path.relpath(filepath, PROJECT_ROOT)
            changes.append(f"  {rel}: {old_id} -> {new_id}")
            return prefix + new_id
        return m.group(0)

    new_content = FOJIN_URL_RE.sub(replacer, content)

    if not dry_run and new_content != content:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(new_content)

    return changes
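
# Note: `changes` is collected in both modes; the file is only rewritten when
# dry_run is False and at least one URL actually changed.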


def main():
    parser = argparse.ArgumentParser(description="Verify and fix FoJin source links")
    parser.add_argument(
        "--fix", action="store_true", help="Actually write changes to files"
    )
    args = parser.parse_args()

    dry_run = not args.fix

    print("=" * 60)
    print("FoJin Source Verification Report")
    print("=" * 60)
    if dry_run:
        print("Mode: DRY RUN (use --fix to apply changes)\n")
    else:
        print("Mode: FIX (writing changes to files)\n")

    # Step 1: Collect CBETA IDs from meta.json
    cbeta_map = collect_cbeta_ids()
    all_cbeta_ids = sorted(cbeta_map.keys())
    teacher_count = len({t for ts in cbeta_map.values() for t in ts})
    print(f"[1/4] Found {len(all_cbeta_ids)} unique CBETA IDs across {teacher_count} teachers")
    for cid in all_cbeta_ids:
        short = full_to_short_cbeta(cid) or "?"
        print(f"  {cid} (-> {short}) <- {', '.join(cbeta_map[cid])}")

    # Step 2: Collect all fojin.app URLs from files
    url_map = collect_all_fojin_urls()
    all_url_ids = sorted(url_map.keys())
    total_urls = sum(len(v) for v in url_map.values())
    print(f"\n[2/4] Found {total_urls} fojin.app URLs using {len(all_url_ids)} unique IDs")

    # IDs in URLs but not in meta.json sources
    extra_url_ids = set(all_url_ids) - set(all_cbeta_ids)
    if extra_url_ids:
        print("  Extra IDs in URLs (not in meta.json sources):")
        for eid in sorted(extra_url_ids):
            locs = url_map[eid]
            files = {os.path.relpath(f, PROJECT_ROOT) for f, _ in locs}
            print(f"    {eid} in {', '.join(sorted(files))}")

    # Combine: all unique CBETA-style IDs from both meta.json and URLs
    all_ids = set(all_cbeta_ids)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid):
            all_ids.add(uid)

    # Non-CBETA IDs in URLs (e.g. suttacentral IDs, placeholder "123")
    non_cbeta_url_ids = [uid for uid in all_url_ids if not FULL_CBETA_RE.match(uid)]
    if non_cbeta_url_ids:
        print(f"  Non-CBETA IDs in URLs (skipped): {', '.join(non_cbeta_url_ids)}")

    # Step 3: Verify with FoJin API
    print(f"\n[3/4] Verifying {len(all_ids)} CBETA IDs against FoJin API...")
    bridge = create_bridge()

    if not bridge.test_connection():
        print("  [ERROR] Cannot connect to FoJin API.")
        print("  Set FOJIN_URL environment variable if using a custom instance.")
        sys.exit(1)
    print("  API connection OK")

    # Build combined cbeta_map (include URL-only IDs)
    combined_map = dict(cbeta_map)
    for uid in all_url_ids:
        if FULL_CBETA_RE.match(uid) and uid not in combined_map:
            combined_map[uid] = ["(URL only)"]

    titles = collect_titles_from_meta()
    verified = verify_ids(bridge, combined_map, titles)

    found = {k: v for k, v in verified.items() if v["text_id"] is not None}
    not_found = {k: v for k, v in verified.items() if v["text_id"] is None}

    print(f"\n  Verified: {len(found)}/{len(verified)}")
    for cid in sorted(found):
        info = found[cid]
        title = info.get("title_zh", titles.get(cid, ""))
        print(f"  [OK] {cid} -> text_id={info['text_id']} {title} ({info['method']})")

    if not_found:
        print(f"\n  Not found in FoJin ({len(not_found)}):")
        for cid in sorted(not_found):
            teachers = combined_map.get(cid, ["?"])
            print(f"  [MISS] {cid} (-> {not_found[cid]['short_cbeta_id']}) used by: {', '.join(teachers)}")

    # Step 4: Update URLs
    # Build replacement map: full_cbeta_id -> str(internal_text_id)
    id_replacement_map: dict[str, str] = {}
    for full_id, info in found.items():
        id_replacement_map[full_id] = str(info["text_id"])

    action = "Would update" if dry_run else "Updating"
    print(f"\n[4/4] {action} URLs...")
    all_changes = []

    files_to_fix = set()
    for locations in url_map.values():
        for fpath, _ in locations:
            files_to_fix.add(fpath)

    for fpath in sorted(files_to_fix):
        changes = fix_urls_in_file(fpath, id_replacement_map, dry_run)
        all_changes.extend(changes)

    if all_changes:
        for c in all_changes:
            print(c)
        verb = "would be made" if dry_run else "applied"
        print(f"\n  Total: {len(all_changes)} URL replacements {verb}")
    else:
        print("  No URL replacements needed")

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"  CBETA IDs in meta.json: {len(all_cbeta_ids)}")
    print(f"  CBETA IDs in URLs: {len([u for u in all_url_ids if FULL_CBETA_RE.match(u)])}")
    print(f"  Total unique CBETA IDs: {len(all_ids)}")
    print(f"  Verified in FoJin: {len(found)}")
    print(f"  Not found in FoJin: {len(not_found)}")
    print(f"  URL replacements: {len(all_changes)}")
    if dry_run and all_changes:
        print("\n  Run with --fix to apply changes.")


if __name__ == "__main__":
    main()