# Master-skill/tools/sutra_collector.py
"""
Sutra Collector — gathers Buddhist texts and metadata for a specific teacher.
Orchestrates FoJin Bridge calls to collect:
1. Teacher's KG entity and relations (lineage, school, texts)
2. Core texts associated with the teacher
3. Text content (selected juans)
4. Dictionary terms related to the teacher's tradition
"""
import json
import os
from typing import Optional
from fojin_bridge import FojinBridge, create_bridge
def collect_teacher_data(
    teacher_name: str,
    tradition: Optional[str] = None,
    bridge: Optional["FojinBridge"] = None,
) -> dict:
    """Collect all available data about a teacher from FoJin.

    Args:
        teacher_name: Name used to query the knowledge graph and text index.
        tradition: Optional tradition key ("汉传", "南传", "藏传") selecting
            which dictionary terms to collect; an unrecognized key collects
            no terms.
        bridge: Pre-configured bridge; a default one is created when None.

    Returns:
        dict with keys "entity", "lineage", "texts", "content_samples",
        and "terms".
    """
    if bridge is None:
        bridge = create_bridge()

    result: dict = {
        "entity": None,
        "lineage": [],
        "texts": [],
        "content_samples": [],
        "terms": [],
    }

    # Step 1: find the teacher in the KG, then fetch the detailed record.
    kg_results = bridge.search_kg_entities(teacher_name, entity_type="person")
    if kg_results.get("results"):
        entity = kg_results["results"][0]
        # The detail record supersedes the bare search hit, so only the
        # detail is stored (the original code assigned the hit first, then
        # immediately overwrote it — dead store removed).
        detail = bridge.get_kg_entity(entity["id"])
        result["entity"] = detail
        lineage_predicates = (
            "teacher_of", "student_of", "lineage_holder",
            "transmitted_to", "received_from",
        )
        for rel in detail.get("relations", []):
            if rel["predicate"] in lineage_predicates:
                result["lineage"].append(rel)

    # Step 2: texts associated with the teacher.
    text_results = bridge.search_texts(teacher_name, size=50)
    if text_results.get("results"):
        result["texts"] = text_results["results"]

    # Step 3: sample the first juan of up to 5 top texts.  Best effort:
    # a failed fetch skips that text instead of aborting the collection.
    for text in result["texts"][:5]:
        text_id = text.get("id")
        if not text_id:
            continue
        try:
            content = bridge.get_text_content(text_id, juan_num=1)
        except Exception:
            continue
        result["content_samples"].append({
            "text_id": text_id,
            "title": text.get("title_zh", ""),
            "content": content.get("content", "")[:3000],
        })

    # Step 4: tradition-specific dictionary terms.
    if tradition:
        tradition_terms = {
            "汉传": ["净土", "", "般若", "菩提", "念佛"],
            "南传": ["vipassana", "satipatthana", "anicca", "dukkha", "anatta"],
            "藏传": ["菩提道次第", "空性", "菩提心", "止观", "三主要道"],
        }
        for term in tradition_terms.get(tradition, []):
            # Guard against blank entries (the 汉传 list contains one);
            # querying the dictionary with an empty string is meaningless.
            if not term:
                continue
            dict_results = bridge.search_dictionary(term, size=5)
            if dict_results.get("results"):
                result["terms"].extend(dict_results["results"])

    return result
def collect_specific_texts(
    cbeta_ids: list,
    bridge: Optional[FojinBridge] = None,
) -> list:
    """Collect full content for specific texts by CBETA ID.

    Resolves each CBETA ID to an internal text id in one batched lookup,
    then gathers the text's metadata plus the content of its first 10
    juans at most.
    """
    if bridge is None:
        bridge = create_bridge()

    collected = []
    # Single batched resolution: CBETA id -> internal text id.
    resolved = bridge.lookup_cbeta_ids(",".join(cbeta_ids))
    for cbeta_id, text_id in resolved.items():
        if not text_id:
            # Unresolvable CBETA id — skip it.
            continue
        meta = bridge.get_text(text_id)
        juan_info = bridge.get_text_juans(text_id)
        collected.append({
            "cbeta_id": cbeta_id,
            "text_id": text_id,
            "title": meta.get("title_zh", ""),
            "juans": [
                {
                    "juan_num": juan["juan_num"],
                    "content": bridge.get_text_content(
                        text_id, juan["juan_num"]
                    ).get("content", ""),
                }
                for juan in juan_info.get("juans", [])[:10]
            ],
        })
    return collected
def save_collected_data(data: dict, output_path: str) -> str:
    """Save collected data to a UTF-8 JSON file.

    Args:
        data: JSON-serializable payload.
        output_path: Destination path; missing parent directories are
            created.

    Returns:
        Absolute path of the written file.
    """
    parent = os.path.dirname(output_path)
    # dirname("") happens for a bare filename; makedirs("") would raise
    # FileNotFoundError, so only create directories when there are any.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return os.path.abspath(output_path)