mirror of
https://github.com/xr843/Master-skill.git
synced 2026-05-10 05:16:25 +00:00
feat: add sutra collector for gathering teacher data from FoJin
This commit is contained in:
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Sutra Collector — gathers Buddhist texts and metadata for a specific teacher.
|
||||
|
||||
Orchestrates FoJin Bridge calls to collect:
|
||||
1. Teacher's KG entity and relations (lineage, school, texts)
|
||||
2. Core texts associated with the teacher
|
||||
3. Text content (selected juans)
|
||||
4. Dictionary terms related to the teacher's tradition
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from fojin_bridge import FojinBridge, create_bridge
|
||||
|
||||
|
||||
def collect_teacher_data(
|
||||
teacher_name: str,
|
||||
tradition: Optional[str] = None,
|
||||
bridge: Optional[FojinBridge] = None,
|
||||
) -> dict:
|
||||
"""Collect all available data about a teacher from FoJin."""
|
||||
if bridge is None:
|
||||
bridge = create_bridge()
|
||||
|
||||
result = {
|
||||
"entity": None,
|
||||
"lineage": [],
|
||||
"texts": [],
|
||||
"content_samples": [],
|
||||
"terms": [],
|
||||
}
|
||||
|
||||
# Step 1: Find teacher in KG
|
||||
kg_results = bridge.search_kg_entities(teacher_name, entity_type="person")
|
||||
if kg_results.get("results"):
|
||||
entity = kg_results["results"][0]
|
||||
result["entity"] = entity
|
||||
detail = bridge.get_kg_entity(entity["id"])
|
||||
result["entity"] = detail
|
||||
for rel in detail.get("relations", []):
|
||||
if rel["predicate"] in (
|
||||
"teacher_of", "student_of", "lineage_holder",
|
||||
"transmitted_to", "received_from",
|
||||
):
|
||||
result["lineage"].append(rel)
|
||||
|
||||
# Step 2: Search for associated texts
|
||||
text_results = bridge.search_texts(teacher_name, size=50)
|
||||
if text_results.get("results"):
|
||||
result["texts"] = text_results["results"]
|
||||
|
||||
# Step 3: Collect content samples from top texts
|
||||
for text in result["texts"][:5]:
|
||||
text_id = text.get("id")
|
||||
if not text_id:
|
||||
continue
|
||||
try:
|
||||
content = bridge.get_text_content(text_id, juan_num=1)
|
||||
result["content_samples"].append({
|
||||
"text_id": text_id,
|
||||
"title": text.get("title_zh", ""),
|
||||
"content": content.get("content", "")[:3000],
|
||||
})
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Step 4: Collect tradition-specific terms
|
||||
if tradition:
|
||||
tradition_terms = {
|
||||
"汉传": ["净土", "禅", "般若", "菩提", "念佛"],
|
||||
"南传": ["vipassana", "satipatthana", "anicca", "dukkha", "anatta"],
|
||||
"藏传": ["菩提道次第", "空性", "菩提心", "止观", "三主要道"],
|
||||
}
|
||||
for term in tradition_terms.get(tradition, []):
|
||||
dict_results = bridge.search_dictionary(term, size=5)
|
||||
if dict_results.get("results"):
|
||||
result["terms"].extend(dict_results["results"])
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def collect_specific_texts(
|
||||
cbeta_ids: list,
|
||||
bridge: Optional[FojinBridge] = None,
|
||||
) -> list:
|
||||
"""Collect full content for specific texts by CBETA ID."""
|
||||
if bridge is None:
|
||||
bridge = create_bridge()
|
||||
|
||||
texts = []
|
||||
id_map = bridge.lookup_cbeta_ids(",".join(cbeta_ids))
|
||||
|
||||
for cbeta_id, text_id in id_map.items():
|
||||
if not text_id:
|
||||
continue
|
||||
text_meta = bridge.get_text(text_id)
|
||||
juan_list = bridge.get_text_juans(text_id)
|
||||
|
||||
text_data = {
|
||||
"cbeta_id": cbeta_id,
|
||||
"text_id": text_id,
|
||||
"title": text_meta.get("title_zh", ""),
|
||||
"juans": [],
|
||||
}
|
||||
|
||||
for juan in juan_list.get("juans", [])[:10]:
|
||||
content = bridge.get_text_content(text_id, juan["juan_num"])
|
||||
text_data["juans"].append({
|
||||
"juan_num": juan["juan_num"],
|
||||
"content": content.get("content", ""),
|
||||
})
|
||||
|
||||
texts.append(text_data)
|
||||
|
||||
return texts
|
||||
|
||||
|
||||
def save_collected_data(data: dict, output_path: str) -> str:
|
||||
"""Save collected data to JSON file."""
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
return os.path.abspath(output_path)
|
||||
Reference in New Issue
Block a user