"""
Sutra Collector — gathers Buddhist texts and metadata for a specific teacher.

Orchestrates FoJin Bridge calls to collect:
1. Teacher's KG entity and relations (lineage, school, texts)
2. Core texts associated with the teacher
3. Text content (selected juans)
4. Dictionary terms related to the teacher's tradition
"""

import json
import logging
import os
from typing import Optional

from fojin_bridge import FojinBridge, create_bridge

logger = logging.getLogger(__name__)

# KG relation predicates treated as lineage links (teacher/student/transmission).
_LINEAGE_PREDICATES = (
    "teacher_of", "student_of", "lineage_holder",
    "transmitted_to", "received_from",
)

# Seed dictionary terms per tradition. Keys are Chinese tradition names:
# 汉传 = Chinese (Han) transmission, 南传 = Theravada, 藏传 = Tibetan.
_TRADITION_TERMS = {
    "汉传": ["净土", "禅", "般若", "菩提", "念佛"],
    "南传": ["vipassana", "satipatthana", "anicca", "dukkha", "anatta"],
    "藏传": ["菩提道次第", "空性", "菩提心", "止观", "三主要道"],
}


def collect_teacher_data(
    teacher_name: str,
    tradition: Optional[str] = None,
    bridge: Optional[FojinBridge] = None,
) -> dict:
    """Collect all available data about a teacher from FoJin.

    Args:
        teacher_name: Name used to query the knowledge graph and text index.
        tradition: Optional tradition key (one of ``_TRADITION_TERMS``) used
            to gather related dictionary entries.
        bridge: Optional pre-built bridge; a default one is created otherwise.

    Returns:
        Dict with keys ``entity``, ``lineage``, ``texts``,
        ``content_samples``, ``terms``. Missing data leaves the
        corresponding entry as ``None`` or an empty list.
    """
    if bridge is None:
        bridge = create_bridge()

    result = {
        "entity": None,
        "lineage": [],
        "texts": [],
        "content_samples": [],
        "terms": [],
    }

    # Step 1: Find teacher in KG and pull lineage relations from the detail view.
    kg_results = bridge.search_kg_entities(teacher_name, entity_type="person")
    if kg_results.get("results"):
        entity = kg_results["results"][0]
        # Only the detailed record is kept; the search hit is just used for its id.
        detail = bridge.get_kg_entity(entity["id"])
        result["entity"] = detail
        for rel in detail.get("relations", []):
            if rel["predicate"] in _LINEAGE_PREDICATES:
                result["lineage"].append(rel)

    # Step 2: Search for associated texts.
    text_results = bridge.search_texts(teacher_name, size=50)
    if text_results.get("results"):
        result["texts"] = text_results["results"]

    # Step 3: Collect content samples (first juan, truncated) from top texts.
    # Best-effort: a failing text is logged and skipped rather than aborting
    # the whole collection.
    for text in result["texts"][:5]:
        text_id = text.get("id")
        if not text_id:
            continue
        try:
            content = bridge.get_text_content(text_id, juan_num=1)
        except Exception:
            logger.warning("Failed to fetch content for text %s", text_id,
                           exc_info=True)
            continue
        result["content_samples"].append({
            "text_id": text_id,
            "title": text.get("title_zh", ""),
            "content": content.get("content", "")[:3000],
        })

    # Step 4: Collect tradition-specific dictionary terms.
    if tradition:
        for term in _TRADITION_TERMS.get(tradition, []):
            dict_results = bridge.search_dictionary(term, size=5)
            if dict_results.get("results"):
                result["terms"].extend(dict_results["results"])

    return result


def collect_specific_texts(
    cbeta_ids: list,
    bridge: Optional[FojinBridge] = None,
) -> list:
    """Collect full content for specific texts by CBETA ID.

    Args:
        cbeta_ids: CBETA identifiers (strings) to resolve and fetch.
        bridge: Optional pre-built bridge; a default one is created otherwise.

    Returns:
        List of dicts with ``cbeta_id``, ``text_id``, ``title`` and up to the
        first 10 ``juans`` (each with ``juan_num`` and ``content``).
        Unresolvable CBETA IDs are skipped.
    """
    if bridge is None:
        bridge = create_bridge()

    texts = []
    # lookup_cbeta_ids takes a comma-separated string and returns
    # {cbeta_id: text_id_or_None}.
    id_map = bridge.lookup_cbeta_ids(",".join(cbeta_ids))

    for cbeta_id, text_id in id_map.items():
        if not text_id:
            continue
        text_meta = bridge.get_text(text_id)
        juan_list = bridge.get_text_juans(text_id)

        text_data = {
            "cbeta_id": cbeta_id,
            "text_id": text_id,
            "title": text_meta.get("title_zh", ""),
            "juans": [],
        }

        # Cap at 10 juans per text to bound payload size.
        for juan in juan_list.get("juans", [])[:10]:
            content = bridge.get_text_content(text_id, juan["juan_num"])
            text_data["juans"].append({
                "juan_num": juan["juan_num"],
                "content": content.get("content", ""),
            })

        texts.append(text_data)

    return texts


def save_collected_data(data: dict, output_path: str) -> str:
    """Save collected data to a JSON file and return its absolute path.

    Creates parent directories as needed. UTF-8 output with CJK characters
    kept unescaped (``ensure_ascii=False``).
    """
    parent = os.path.dirname(output_path)
    # Guard: os.makedirs("") raises FileNotFoundError when output_path is a
    # bare filename with no directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return os.path.abspath(output_path)