# Master-skill/tools/sutra_collector.py
"""
Sutra Collector — gathers Buddhist texts and metadata for a specific teacher.
Orchestrates FoJin Bridge calls to collect:
1. Teacher's KG entity and relations (lineage, school, texts)
2. Core texts associated with the teacher
3. Text content (selected juans)
4. Dictionary terms related to the teacher's tradition
"""
import json
import os
from typing import Optional
from fojin_bridge import FojinBridge, create_bridge
def collect_teacher_data(
    teacher_name: str,
    tradition: Optional[str] = None,
    bridge: Optional["FojinBridge"] = None,
) -> dict:
    """Collect all available data about a teacher from FoJin.

    Args:
        teacher_name: Name used to query the knowledge graph and text index.
        tradition: Optional tradition key ("汉传", "南传", "藏传") selecting
            which dictionary terms to collect; an unrecognized key collects
            no terms.
        bridge: Pre-configured bridge; a default one is created when None.

    Returns:
        dict with keys "entity", "lineage", "texts", "content_samples",
        and "terms".
    """
    if bridge is None:
        bridge = create_bridge()

    result: dict = {
        "entity": None,
        "lineage": [],
        "texts": [],
        "content_samples": [],
        "terms": [],
    }

    # Step 1: find the teacher in the KG, then fetch the detailed record.
    kg_results = bridge.search_kg_entities(teacher_name, entity_type="person")
    if kg_results.get("results"):
        entity = kg_results["results"][0]
        # The detail record supersedes the bare search hit, so only the
        # detail is stored (the original code assigned the hit first, then
        # immediately overwrote it — dead store removed).
        detail = bridge.get_kg_entity(entity["id"])
        result["entity"] = detail
        lineage_predicates = (
            "teacher_of", "student_of", "lineage_holder",
            "transmitted_to", "received_from",
        )
        for rel in detail.get("relations", []):
            if rel["predicate"] in lineage_predicates:
                result["lineage"].append(rel)

    # Step 2: texts associated with the teacher.
    text_results = bridge.search_texts(teacher_name, size=50)
    if text_results.get("results"):
        result["texts"] = text_results["results"]

    # Step 3: sample the first juan of up to 5 top texts.  Best effort:
    # a failed fetch skips that text instead of aborting the collection.
    for text in result["texts"][:5]:
        text_id = text.get("id")
        if not text_id:
            continue
        try:
            content = bridge.get_text_content(text_id, juan_num=1)
        except Exception:
            continue
        result["content_samples"].append({
            "text_id": text_id,
            "title": text.get("title_zh", ""),
            "content": content.get("content", "")[:3000],
        })

    # Step 4: tradition-specific dictionary terms.
    if tradition:
        tradition_terms = {
            "汉传": ["净土", "", "般若", "菩提", "念佛"],
            "南传": ["vipassana", "satipatthana", "anicca", "dukkha", "anatta"],
            "藏传": ["菩提道次第", "空性", "菩提心", "止观", "三主要道"],
        }
        for term in tradition_terms.get(tradition, []):
            # Guard against blank entries (the 汉传 list contains one);
            # querying the dictionary with an empty string is meaningless.
            if not term:
                continue
            dict_results = bridge.search_dictionary(term, size=5)
            if dict_results.get("results"):
                result["terms"].extend(dict_results["results"])

    return result
def collect_specific_texts(
    cbeta_ids: list,
    bridge: Optional[FojinBridge] = None,
) -> list:
    """Collect full content for specific texts by CBETA ID.

    Resolves each CBETA ID to an internal text id in one batched lookup,
    then gathers the text's metadata plus the content of its first 10
    juans at most.
    """
    if bridge is None:
        bridge = create_bridge()

    collected = []
    # Single batched resolution: CBETA id -> internal text id.
    resolved = bridge.lookup_cbeta_ids(",".join(cbeta_ids))
    for cbeta_id, text_id in resolved.items():
        if not text_id:
            # Unresolvable CBETA id — skip it.
            continue
        meta = bridge.get_text(text_id)
        juan_info = bridge.get_text_juans(text_id)
        collected.append({
            "cbeta_id": cbeta_id,
            "text_id": text_id,
            "title": meta.get("title_zh", ""),
            "juans": [
                {
                    "juan_num": juan["juan_num"],
                    "content": bridge.get_text_content(
                        text_id, juan["juan_num"]
                    ).get("content", ""),
                }
                for juan in juan_info.get("juans", [])[:10]
            ],
        })
    return collected
def save_collected_data(data: dict, output_path: str) -> str:
    """Save collected data to a UTF-8 JSON file.

    Args:
        data: JSON-serializable payload.
        output_path: Destination path; missing parent directories are
            created.

    Returns:
        Absolute path of the written file.
    """
    parent = os.path.dirname(output_path)
    # dirname("") happens for a bare filename; makedirs("") would raise
    # FileNotFoundError, so only create directories when there are any.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return os.path.abspath(output_path)