Skip to content

Commit c003f7a

Browse files
SonAIengine and claude committed
feat(eval): add KRRA parse + ingest scripts
Add evaluation harness scripts for the KRRA (마사회) document corpus: - `eval/scripts/parse_krra.py` — walks 마사회/ directory, extracts text + chunks from HWP/XLSX files using xgen-doc2chunk, writes standardized JSONL to eval/data/parsed/krra/. Skips PDF/ subdirectories (duplicates of HWP files). Extracts category from folder name and title from filename conventions. - `eval/scripts/ingest_krra.py` — reads parsed JSONL and builds a Kuzu graph with Category, Document, and Chunk nodes connected by PART_OF, CONTAINS, and NEXT_CHUNK edges. Uses SynapticGraph without ontology constraints to allow general-purpose data. Results on 마사회 corpus: - Input: 1,111 files (1,071 HWP + 38 XLSX + 2 XLS) - Parsed: 1,110 documents, 18,600 chunks (253 HWP encoding errors) - Graph: ~19,720 nodes, ~19,710 edges, 240 MB Kuzu DB - Parse time: ~12 min, ingest time: ~6 min Note: this is structural ingestion only (filesystem → graph). Entity extraction, relation detection, and ontology construction are the next step — the extensions (PhraseExtractor, EntityExtractor, RelationDetector) exist but are not yet wired into this pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f6d2199 commit c003f7a

File tree

3 files changed

+373
-1
lines changed

3 files changed

+373
-1
lines changed

eval/scripts/ingest_krra.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""Ingest parsed KRRA chunks into a SynapticGraph (Kuzu backend).
2+
3+
Reads eval/data/parsed/krra/{documents,chunks}.jsonl and builds a graph
4+
with Document nodes, Chunk nodes, Category nodes, and edges.
5+
6+
Usage:
7+
uv run python eval/scripts/ingest_krra.py
8+
"""
9+
10+
from __future__ import annotations

import asyncio
import json
import sys
import time
from pathlib import Path

# This script lives at eval/scripts/, two levels below the repo root.
# The root must be on sys.path BEFORE the local `synaptic` package imports.
REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO_ROOT))

# E402 suppressed deliberately: these imports require the sys.path edit above.
from synaptic.backends.kuzu import KuzuBackend  # noqa: E402
from synaptic.graph import SynapticGraph  # noqa: E402
from synaptic.models import EdgeKind, NodeKind  # noqa: E402

# Input JSONL (produced by parse_krra.py) and output Kuzu database directory.
PARSED_DIR = REPO_ROOT / "eval" / "data" / "parsed" / "krra"
GRAPH_DIR = REPO_ROOT / "eval" / "data" / "krra_graph.kuzu"
27+
28+
29+
def _load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file (one JSON object per line), skipping blank lines."""
    records: list[dict] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records


async def main() -> int:
    """Ingest parsed KRRA documents and chunks into a fresh Kuzu graph.

    Builds Category (CONCEPT), Document (ENTITY) and Chunk (CHUNK) nodes,
    linked by PART_OF (document -> category), CONTAINS (document -> chunk)
    and NEXT_CHUNK (chunk -> following chunk) edges, then runs a small
    search smoke test.

    Returns:
        Process exit code: 0 on success, 1 when the parsed input is missing.
    """
    docs_path = PARSED_DIR / "documents.jsonl"
    chunks_path = PARSED_DIR / "chunks.jsonl"

    if not docs_path.exists():
        print(f"ERROR: {docs_path} not found. Run parse_krra.py first.")
        return 1

    # Load parsed data (the two identical read loops are now _load_jsonl).
    docs = _load_jsonl(docs_path)
    chunks = _load_jsonl(chunks_path)
    print(f"Loaded {len(docs)} documents, {len(chunks)} chunks")

    # Build chunk lookup: doc_id → [chunk_dicts]
    doc_chunks: dict[str, list[dict]] = {}
    for c in chunks:
        doc_chunks.setdefault(c["doc_id"], []).append(c)

    # Always rebuild from scratch: re-running against an existing database
    # would duplicate every node.
    import shutil

    if GRAPH_DIR.exists():
        shutil.rmtree(GRAPH_DIR)

    backend = KuzuBackend(str(GRAPH_DIR))
    await backend.connect()
    graph = SynapticGraph(backend)  # no ontology constraints → accepts general-purpose data

    start = time.time()

    # Phase 1: Category nodes
    categories = sorted({d["category"] for d in docs})
    cat_ids: dict[str, str] = {}
    for cat_name in categories:
        node = await graph.add(
            title=cat_name,
            content=f"마사회 문서 카테고리: {cat_name}",
            kind=NodeKind.CONCEPT,
            tags=["category", "krra"],
        )
        cat_ids[cat_name] = node.id
    print(f" Created {len(categories)} category nodes")

    # Phase 2: Document + Chunk nodes
    total_doc_nodes = 0
    total_chunk_nodes = 0

    for i, doc in enumerate(docs):
        if (i + 1) % 100 == 0:
            elapsed = time.time() - start
            print(f" [{i+1}/{len(docs)}] {elapsed:.0f}s — docs={total_doc_nodes} chunks={total_chunk_nodes}")

        # Document node. FIX: the original always appended
        # doc.get("doc_type", "") to the tag list, so a document with no
        # doc_type got an empty-string tag; filter falsy tags out.
        doc_tags = [t for t in ("document", "krra", doc.get("doc_type", "")) if t]
        doc_node = await graph.add(
            title=doc["title"],
            content="",  # content is in chunks
            kind=NodeKind.ENTITY,
            tags=doc_tags,
            source=doc["source_path"],
            properties={
                # Property values are stringified for uniform storage.
                "doc_id": doc["doc_id"],
                "doc_type": doc.get("doc_type", ""),
                "year": str(doc["year"]) if doc.get("year") else "",
                "category": doc.get("category", ""),
                "original_filename": doc.get("metadata", {}).get("original_filename", ""),
                "chunk_count": str(doc.get("chunk_count", 0)),
            },
        )
        total_doc_nodes += 1

        # Link to category
        cat_id = cat_ids.get(doc.get("category", ""))
        if cat_id:
            await graph.link(doc_node.id, cat_id, kind=EdgeKind.PART_OF)

        # Chunk nodes, in original document order
        doc_chunk_list = doc_chunks.get(doc["doc_id"], [])
        prev_chunk_id: str | None = None

        for chunk_data in sorted(doc_chunk_list, key=lambda x: x["index"]):
            chunk_node = await graph.add(
                title=f"{doc['title']} #{chunk_data['index']}",
                content=chunk_data["text"],
                kind=NodeKind.CHUNK,
                tags=["chunk", "krra"],
                source=doc["source_path"],
                properties={
                    "doc_id": doc["doc_id"],
                    "chunk_index": str(chunk_data["index"]),
                    "page_number": str(chunk_data.get("page_number") or ""),
                },
            )
            total_chunk_nodes += 1

            # Document → Chunk
            await graph.link(doc_node.id, chunk_node.id, kind=EdgeKind.CONTAINS)

            # Sequential chunk linking
            if prev_chunk_id:
                await graph.link(prev_chunk_id, chunk_node.id, kind=EdgeKind.NEXT_CHUNK)
            prev_chunk_id = chunk_node.id

    elapsed = time.time() - start
    print(f"\n{'='*60}")
    print(f"KRRA Ingest Complete — {elapsed:.1f}s")
    print(f" Categories: {len(categories)}")
    print(f" Documents: {total_doc_nodes}")
    print(f" Chunks: {total_chunk_nodes}")
    print(f" Graph path: {GRAPH_DIR.relative_to(REPO_ROOT)}")
    print(f"{'='*60}")

    # Quick test: search
    print("\n[Quick search test]")
    for q in ["경마 운영계획", "인권경영", "정보기술 시스템"]:
        result = await graph.search(q, limit=3)
        hits = len(result.nodes)
        top = result.nodes[0].node.title if result.nodes else "-"
        print(f" '{q}' → {hits} hits, top: {top}")

    await backend.close()
    return 0
160+
161+
162+
if __name__ == "__main__":
    # Propagate main()'s return value to the shell as the exit status.
    raise SystemExit(asyncio.run(main()))

eval/scripts/parse_krra.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""Parse KRRA (마사회) documents using xgen-doc2chunk.
2+
3+
Walks the raw document directory, extracts text + chunks from each file,
4+
and writes standardized JSONL to eval/data/parsed/krra/.
5+
6+
Usage:
7+
uv run python eval/scripts/parse_krra.py
8+
"""
9+
10+
from __future__ import annotations

import hashlib
import json
import re
import sys
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any

# Repo root is two levels above this script (eval/scripts/).
REPO_ROOT = Path(__file__).resolve().parents[2]
RAW_DIR = REPO_ROOT / "마사회"  # raw KRRA document corpus (input)
OUT_DIR = REPO_ROOT / "eval" / "data" / "parsed" / "krra"  # JSONL output dir

# File extensions handed to xgen-doc2chunk; everything else is skipped.
SUPPORTED_EXTS = {
    ".pdf", ".txt", ".md", ".docx", ".doc", ".rtf",
    ".hwp", ".hwpx",
    ".xlsx", ".xls", ".csv", ".tsv",
    ".pptx", ".odp",
    ".png", ".jpg", ".jpeg",
}

# Chunking parameters passed through to DocumentProcessor.extract_chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
35+
36+
37+
@dataclass(slots=True)
class ParsedChunk:
    """One text chunk extracted from a source document.

    Serialized via ``dataclasses.asdict`` as one JSON line in chunks.jsonl.
    """

    chunk_id: str  # "<doc_id>_c<index zero-padded to 4 digits>"
    doc_id: str  # id of the owning document (see _doc_id)
    text: str  # chunk text, stripped of surrounding whitespace
    index: int  # 0-based position of the chunk within its document
    page_number: int | None = None  # source page, if the parser reported one
    line_start: int | None = None  # first source line of the chunk, if known
    line_end: int | None = None  # last source line of the chunk, if known
46+
47+
48+
@dataclass(slots=True)
class ParsedDocument:
    """Metadata for one parsed source file.

    Serialized as one JSON line in documents.jsonl; the chunk text itself
    lives separately in chunks.jsonl, keyed by ``doc_id``.
    """

    doc_id: str  # md5-derived id of the relative source path (see _doc_id)
    source_path: str  # path relative to RAW_DIR
    title: str  # human-readable title derived from the filename
    doc_type: str  # lowercased file extension without the dot (e.g. "hwp")
    category: str  # top-level folder name under RAW_DIR
    year: int | None = None  # leading 4-digit year from the filename, if any
    metadata: dict[str, Any] = field(default_factory=dict)  # e.g. original_filename
    chunk_count: int = 0  # number of chunks extracted from this document
58+
59+
60+
def _doc_id(path: str) -> str:
61+
return hashlib.md5(path.encode()).hexdigest()[:16]
62+
63+
64+
def _extract_year(filename: str) -> int | None:
65+
m = re.match(r"(\d{4})년도", filename)
66+
return int(m.group(1)) if m else None
67+
68+
69+
def _extract_category(rel_path: Path) -> str:
70+
parts = rel_path.parts
71+
return parts[0] if parts else "unknown"
72+
73+
74+
def _extract_title(filename: str) -> str:
75+
# Remove year prefix and extension
76+
name = re.sub(r"^\d{4}년도_", "", filename)
77+
name = Path(name).stem
78+
# Remove common prefixes
79+
name = re.sub(r"^\(본문\)\s*", "", name)
80+
name = re.sub(r"^\(붙임[#\d]*\)\s*", "", name)
81+
name = re.sub(r"^붙임\d*\s*", "", name)
82+
name = re.sub(r"^\[붙임\d*\]\s*", "", name)
83+
name = re.sub(r"^\(별첨\d*\)\s*", "", name)
84+
return name.strip() or filename
85+
86+
87+
def parse_all() -> None:
    """Walk RAW_DIR, parse every supported file, and write JSONL outputs.

    Writes three files under OUT_DIR:
      - documents.jsonl: one ParsedDocument per successfully parsed file
      - chunks.jsonl:    one ParsedChunk per non-empty extracted chunk
      - errors.jsonl:    one record per file that failed or yielded no text

    Files inside any directory component named "PDF" are skipped: those
    folders duplicate the HWP originals.
    """
    if not RAW_DIR.exists():
        print(f"ERROR: {RAW_DIR} not found")
        sys.exit(1)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    docs_path = OUT_DIR / "documents.jsonl"
    chunks_path = OUT_DIR / "chunks.jsonl"
    errors_path = OUT_DIR / "errors.jsonl"

    # Lazy import keeps the module importable without the parsing dependency.
    from xgen_doc2chunk import DocumentProcessor

    processor = DocumentProcessor()

    # FIX: the original excluded PDF folders with substring tests on the
    # string form of the path ('"/PDF/" not in str(f)'), which breaks on
    # Windows ("\\" separators), and '"/PDF" not in str(f.parent.name)' was
    # a no-op (a single path component never contains "/"). Testing path
    # *components* is exact and separator-independent.
    files = [
        f for f in sorted(RAW_DIR.rglob("*"))
        if f.is_file()
        and f.suffix.lower() in SUPPORTED_EXTS
        and not f.name.startswith(".")
        and "PDF" not in f.relative_to(RAW_DIR).parts  # 한글(HWP)과 중복 — PDF 폴더 제외
    ]

    print(f"Found {len(files)} parseable files in {RAW_DIR}")

    total_docs = 0
    total_chunks = 0
    total_errors = 0
    start = time.time()

    with (
        open(docs_path, "w", encoding="utf-8") as docs_f,
        open(chunks_path, "w", encoding="utf-8") as chunks_f,
        open(errors_path, "w", encoding="utf-8") as errors_f,
    ):
        def record_error(doc_id: str, rel: Path, message: str) -> None:
            # One JSON line per failure so a re-run can target just these.
            errors_f.write(json.dumps({
                "doc_id": doc_id,
                "path": str(rel),
                "error": message,
            }, ensure_ascii=False) + "\n")

        for i, fpath in enumerate(files):
            rel = fpath.relative_to(RAW_DIR)
            doc_id = _doc_id(str(rel))
            category = _extract_category(rel)
            year = _extract_year(fpath.name)
            title = _extract_title(fpath.name)
            doc_type = fpath.suffix.lower().lstrip(".")

            # Periodic progress line (and one at the very start).
            if (i + 1) % 50 == 0 or i == 0:
                elapsed = time.time() - start
                print(
                    f" [{i+1}/{len(files)}] {elapsed:.0f}s "
                    f"docs={total_docs} chunks={total_chunks} errors={total_errors}"
                )

            try:
                result = processor.extract_chunks(
                    str(fpath),
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP,
                    include_position_metadata=True,
                )

                chunks_with_meta = list(result.chunks_with_metadata)
                if not chunks_with_meta:
                    # Fallback: try plain text extraction
                    text = processor.extract_text(str(fpath))
                    if text and text.strip():
                        chunks_with_meta = [{"text": text, "page_number": None}]

                if not chunks_with_meta:
                    record_error(doc_id, rel, "empty extraction")
                    total_errors += 1
                    continue

                doc = ParsedDocument(
                    doc_id=doc_id,
                    source_path=str(rel),
                    title=title,
                    doc_type=doc_type,
                    category=category,
                    year=year,
                    metadata={"original_filename": fpath.name},
                    chunk_count=len(chunks_with_meta),
                )
                docs_f.write(json.dumps(asdict(doc), ensure_ascii=False) + "\n")
                total_docs += 1

                for idx, chunk_data in enumerate(chunks_with_meta):
                    # chunk_data may be a dict (with position metadata) or a
                    # bare string, depending on the extractor output.
                    is_dict = isinstance(chunk_data, dict)
                    text = chunk_data.get("text", "") if is_dict else str(chunk_data)
                    if not text.strip():
                        continue  # drop whitespace-only chunks (index is kept sparse)
                    chunk = ParsedChunk(
                        chunk_id=f"{doc_id}_c{idx:04d}",
                        doc_id=doc_id,
                        text=text.strip(),
                        index=idx,
                        page_number=chunk_data.get("page_number") if is_dict else None,
                        line_start=chunk_data.get("line_start") if is_dict else None,
                        line_end=chunk_data.get("line_end") if is_dict else None,
                    )
                    chunks_f.write(json.dumps(asdict(chunk), ensure_ascii=False) + "\n")
                    total_chunks += 1

            except Exception as exc:
                # Broad catch is deliberate: one bad file (HWP encoding
                # errors are common in this corpus) must not abort the batch.
                record_error(doc_id, rel, str(exc)[:500])
                total_errors += 1

    elapsed = time.time() - start
    print(f"\n{'='*60}")
    print(f"KRRA Parse Complete — {elapsed:.1f}s")
    print(f" Documents: {total_docs}")
    print(f" Chunks: {total_chunks}")
    print(f" Errors: {total_errors}")
    print(f" Output: {OUT_DIR.relative_to(REPO_ROOT)}/")
    print(f"{'='*60}")
206+
207+
208+
if __name__ == "__main__":
    # Script entry point: parse the whole corpus in one pass.
    parse_all()

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)