Skip to content

Commit c003f7a

Browse files
SonAIengine and claude committed
feat(eval): add KRRA parse + ingest scripts
Add evaluation harness scripts for the KRRA (마사회) document corpus: - `eval/scripts/parse_krra.py` — walks 마사회/ directory, extracts text + chunks from HWP/XLSX files using xgen-doc2chunk, writes standardized JSONL to eval/data/parsed/krra/. Skips PDF/ subdirectories (duplicates of HWP files). Extracts category from folder name and title from filename conventions. - `eval/scripts/ingest_krra.py` — reads parsed JSONL and builds a Kuzu graph with Category, Document, and Chunk nodes connected by PART_OF, CONTAINS, and NEXT_CHUNK edges. Uses SynapticGraph without ontology constraints to allow general-purpose data. Results on 마사회 corpus: - Input: 1,111 files (1,071 HWP + 38 XLSX + 2 XLS) - Parsed: 1,110 documents, 18,600 chunks (253 HWP encoding errors) - Graph: ~19,720 nodes, ~19,710 edges, 240 MB Kuzu DB - Parse time: ~12 min, ingest time: ~6 min Note: this is structural ingestion only (filesystem → graph). Entity extraction, relation detection, and ontology construction are the next step — the extensions (PhraseExtractor, EntityExtractor, RelationDetector) exist but are not yet wired into this pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f6d2199 commit c003f7a

File tree

3 files changed

+373
-1
lines changed

3 files changed

+373
-1
lines changed

eval/scripts/ingest_krra.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""Ingest parsed KRRA chunks into a SynapticGraph (Kuzu backend).
2+
3+
Reads eval/data/parsed/krra/{documents,chunks}.jsonl and builds a graph
4+
with Document nodes, Chunk nodes, Category nodes, and edges.
5+
6+
Usage:
7+
uv run python eval/scripts/ingest_krra.py
8+
"""
9+
10+
from __future__ import annotations

import asyncio
import json
import sys
import time
from pathlib import Path

# This script lives at eval/scripts/, two levels below the repo root.
# The root must be on sys.path BEFORE the local `synaptic` package imports.
REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO_ROOT))

# E402 suppressed deliberately: these imports require the sys.path edit above.
from synaptic.backends.kuzu import KuzuBackend  # noqa: E402
from synaptic.graph import SynapticGraph  # noqa: E402
from synaptic.models import EdgeKind, NodeKind  # noqa: E402

# Input JSONL (produced by parse_krra.py) and output Kuzu database directory.
PARSED_DIR = REPO_ROOT / "eval" / "data" / "parsed" / "krra"
GRAPH_DIR = REPO_ROOT / "eval" / "data" / "krra_graph.kuzu"
27+
28+
29+
def _load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file (one JSON object per line), skipping blank lines."""
    records: list[dict] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records


async def main() -> int:
    """Ingest parsed KRRA documents and chunks into a fresh Kuzu graph.

    Builds Category (CONCEPT), Document (ENTITY) and Chunk (CHUNK) nodes,
    linked by PART_OF (document -> category), CONTAINS (document -> chunk)
    and NEXT_CHUNK (chunk -> following chunk) edges, then runs a small
    search smoke test.

    Returns:
        Process exit code: 0 on success, 1 when the parsed input is missing.
    """
    docs_path = PARSED_DIR / "documents.jsonl"
    chunks_path = PARSED_DIR / "chunks.jsonl"

    if not docs_path.exists():
        print(f"ERROR: {docs_path} not found. Run parse_krra.py first.")
        return 1

    # Load parsed data (the two identical read loops are now _load_jsonl).
    docs = _load_jsonl(docs_path)
    chunks = _load_jsonl(chunks_path)
    print(f"Loaded {len(docs)} documents, {len(chunks)} chunks")

    # Build chunk lookup: doc_id → [chunk_dicts]
    doc_chunks: dict[str, list[dict]] = {}
    for c in chunks:
        doc_chunks.setdefault(c["doc_id"], []).append(c)

    # Always rebuild from scratch: re-running against an existing database
    # would duplicate every node.
    import shutil

    if GRAPH_DIR.exists():
        shutil.rmtree(GRAPH_DIR)

    backend = KuzuBackend(str(GRAPH_DIR))
    await backend.connect()
    graph = SynapticGraph(backend)  # no ontology constraints → accepts general-purpose data

    start = time.time()

    # Phase 1: Category nodes
    categories = sorted({d["category"] for d in docs})
    cat_ids: dict[str, str] = {}
    for cat_name in categories:
        node = await graph.add(
            title=cat_name,
            content=f"마사회 문서 카테고리: {cat_name}",
            kind=NodeKind.CONCEPT,
            tags=["category", "krra"],
        )
        cat_ids[cat_name] = node.id
    print(f" Created {len(categories)} category nodes")

    # Phase 2: Document + Chunk nodes
    total_doc_nodes = 0
    total_chunk_nodes = 0

    for i, doc in enumerate(docs):
        if (i + 1) % 100 == 0:
            elapsed = time.time() - start
            print(f" [{i+1}/{len(docs)}] {elapsed:.0f}s — docs={total_doc_nodes} chunks={total_chunk_nodes}")

        # Document node. FIX: the original always appended
        # doc.get("doc_type", "") to the tag list, so a document with no
        # doc_type got an empty-string tag; filter falsy tags out.
        doc_tags = [t for t in ("document", "krra", doc.get("doc_type", "")) if t]
        doc_node = await graph.add(
            title=doc["title"],
            content="",  # content is in chunks
            kind=NodeKind.ENTITY,
            tags=doc_tags,
            source=doc["source_path"],
            properties={
                # Property values are stringified for uniform storage.
                "doc_id": doc["doc_id"],
                "doc_type": doc.get("doc_type", ""),
                "year": str(doc["year"]) if doc.get("year") else "",
                "category": doc.get("category", ""),
                "original_filename": doc.get("metadata", {}).get("original_filename", ""),
                "chunk_count": str(doc.get("chunk_count", 0)),
            },
        )
        total_doc_nodes += 1

        # Link to category
        cat_id = cat_ids.get(doc.get("category", ""))
        if cat_id:
            await graph.link(doc_node.id, cat_id, kind=EdgeKind.PART_OF)

        # Chunk nodes, in original document order
        doc_chunk_list = doc_chunks.get(doc["doc_id"], [])
        prev_chunk_id: str | None = None

        for chunk_data in sorted(doc_chunk_list, key=lambda x: x["index"]):
            chunk_node = await graph.add(
                title=f"{doc['title']} #{chunk_data['index']}",
                content=chunk_data["text"],
                kind=NodeKind.CHUNK,
                tags=["chunk", "krra"],
                source=doc["source_path"],
                properties={
                    "doc_id": doc["doc_id"],
                    "chunk_index": str(chunk_data["index"]),
                    "page_number": str(chunk_data.get("page_number") or ""),
                },
            )
            total_chunk_nodes += 1

            # Document → Chunk
            await graph.link(doc_node.id, chunk_node.id, kind=EdgeKind.CONTAINS)

            # Sequential chunk linking
            if prev_chunk_id:
                await graph.link(prev_chunk_id, chunk_node.id, kind=EdgeKind.NEXT_CHUNK)
            prev_chunk_id = chunk_node.id

    elapsed = time.time() - start
    print(f"\n{'='*60}")
    print(f"KRRA Ingest Complete — {elapsed:.1f}s")
    print(f" Categories: {len(categories)}")
    print(f" Documents: {total_doc_nodes}")
    print(f" Chunks: {total_chunk_nodes}")
    print(f" Graph path: {GRAPH_DIR.relative_to(REPO_ROOT)}")
    print(f"{'='*60}")

    # Quick test: search
    print("\n[Quick search test]")
    for q in ["경마 운영계획", "인권경영", "정보기술 시스템"]:
        result = await graph.search(q, limit=3)
        hits = len(result.nodes)
        top = result.nodes[0].node.title if result.nodes else "-"
        print(f" '{q}' → {hits} hits, top: {top}")

    await backend.close()
    return 0
160+
161+
162+
if __name__ == "__main__":
    # Propagate main()'s return value to the shell as the exit status.
    raise SystemExit(asyncio.run(main()))

eval/scripts/parse_krra.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""Parse KRRA (마사회) documents using xgen-doc2chunk.
2+
3+
Walks the raw document directory, extracts text + chunks from each file,
4+
and writes standardized JSONL to eval/data/parsed/krra/.
5+
6+
Usage:
7+
uv run python eval/scripts/parse_krra.py
8+
"""
9+
10+
from __future__ import annotations

import hashlib
import json
import re
import sys
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any

# Repo root is two levels above this script (eval/scripts/).
REPO_ROOT = Path(__file__).resolve().parents[2]
RAW_DIR = REPO_ROOT / "마사회"  # raw KRRA document corpus (input)
OUT_DIR = REPO_ROOT / "eval" / "data" / "parsed" / "krra"  # JSONL output dir

# File extensions handed to xgen-doc2chunk; everything else is skipped.
SUPPORTED_EXTS = {
    ".pdf", ".txt", ".md", ".docx", ".doc", ".rtf",
    ".hwp", ".hwpx",
    ".xlsx", ".xls", ".csv", ".tsv",
    ".pptx", ".odp",
    ".png", ".jpg", ".jpeg",
}

# Chunking parameters passed through to DocumentProcessor.extract_chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
35+
36+
37+
@dataclass(slots=True)
class ParsedChunk:
    """One text chunk extracted from a source document.

    Serialized via ``dataclasses.asdict`` as one JSON line in chunks.jsonl.
    """

    chunk_id: str  # "<doc_id>_c<index zero-padded to 4 digits>"
    doc_id: str  # id of the owning document (see _doc_id)
    text: str  # chunk text, stripped of surrounding whitespace
    index: int  # 0-based position of the chunk within its document
    page_number: int | None = None  # source page, if the parser reported one
    line_start: int | None = None  # first source line of the chunk, if known
    line_end: int | None = None  # last source line of the chunk, if known
46+
47+
48+
@dataclass(slots=True)
class ParsedDocument:
    """Metadata for one parsed source file.

    Serialized as one JSON line in documents.jsonl; the chunk text itself
    lives separately in chunks.jsonl, keyed by ``doc_id``.
    """

    doc_id: str  # md5-derived id of the relative source path (see _doc_id)
    source_path: str  # path relative to RAW_DIR
    title: str  # human-readable title derived from the filename
    doc_type: str  # lowercased file extension without the dot (e.g. "hwp")
    category: str  # top-level folder name under RAW_DIR
    year: int | None = None  # leading 4-digit year from the filename, if any
    metadata: dict[str, Any] = field(default_factory=dict)  # e.g. original_filename
    chunk_count: int = 0  # number of chunks extracted from this document
58+
59+
60+
def _doc_id(path: str) -> str:
61+
return hashlib.md5(path.encode()).hexdigest()[:16]
62+
63+
64+
def _extract_year(filename: str) -> int | None:
65+
m = re.match(r"(\d{4})년도", filename)
66+
return int(m.group(1)) if m else None
67+
68+
69+
def _extract_category(rel_path: Path) -> str:
70+
parts = rel_path.parts
71+
return parts[0] if parts else "unknown"
72+
73+
74+
def _extract_title(filename: str) -> str:
75+
# Remove year prefix and extension
76+
name = re.sub(r"^\d{4}년도_", "", filename)
77+
name = Path(name).stem
78+
# Remove common prefixes
79+
name = re.sub(r"^\(본문\)\s*", "", name)
80+
name = re.sub(r"^\(붙임[#\d]*\)\s*", "", name)
81+
name = re.sub(r"^붙임\d*\s*", "", name)
82+
name = re.sub(r"^\[붙임\d*\]\s*", "", name)
83+
name = re.sub(r"^\(별첨\d*\)\s*", "", name)
84+
return name.strip() or filename
85+
86+
87+
def parse_all() -> None:
    """Walk RAW_DIR, parse every supported file, and write JSONL outputs.

    Writes three files under OUT_DIR:
      - documents.jsonl: one ParsedDocument per successfully parsed file
      - chunks.jsonl:    one ParsedChunk per non-empty extracted chunk
      - errors.jsonl:    one record per file that failed or yielded no text

    Files inside any directory component named "PDF" are skipped: those
    folders duplicate the HWP originals.
    """
    if not RAW_DIR.exists():
        print(f"ERROR: {RAW_DIR} not found")
        sys.exit(1)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    docs_path = OUT_DIR / "documents.jsonl"
    chunks_path = OUT_DIR / "chunks.jsonl"
    errors_path = OUT_DIR / "errors.jsonl"

    # Lazy import keeps the module importable without the parsing dependency.
    from xgen_doc2chunk import DocumentProcessor

    processor = DocumentProcessor()

    # FIX: the original excluded PDF folders with substring tests on the
    # string form of the path ('"/PDF/" not in str(f)'), which breaks on
    # Windows ("\\" separators), and '"/PDF" not in str(f.parent.name)' was
    # a no-op (a single path component never contains "/"). Testing path
    # *components* is exact and separator-independent.
    files = [
        f for f in sorted(RAW_DIR.rglob("*"))
        if f.is_file()
        and f.suffix.lower() in SUPPORTED_EXTS
        and not f.name.startswith(".")
        and "PDF" not in f.relative_to(RAW_DIR).parts  # 한글(HWP)과 중복 — PDF 폴더 제외
    ]

    print(f"Found {len(files)} parseable files in {RAW_DIR}")

    total_docs = 0
    total_chunks = 0
    total_errors = 0
    start = time.time()

    with (
        open(docs_path, "w", encoding="utf-8") as docs_f,
        open(chunks_path, "w", encoding="utf-8") as chunks_f,
        open(errors_path, "w", encoding="utf-8") as errors_f,
    ):
        def record_error(doc_id: str, rel: Path, message: str) -> None:
            # One JSON line per failure so a re-run can target just these.
            errors_f.write(json.dumps({
                "doc_id": doc_id,
                "path": str(rel),
                "error": message,
            }, ensure_ascii=False) + "\n")

        for i, fpath in enumerate(files):
            rel = fpath.relative_to(RAW_DIR)
            doc_id = _doc_id(str(rel))
            category = _extract_category(rel)
            year = _extract_year(fpath.name)
            title = _extract_title(fpath.name)
            doc_type = fpath.suffix.lower().lstrip(".")

            # Periodic progress line (and one at the very start).
            if (i + 1) % 50 == 0 or i == 0:
                elapsed = time.time() - start
                print(
                    f" [{i+1}/{len(files)}] {elapsed:.0f}s "
                    f"docs={total_docs} chunks={total_chunks} errors={total_errors}"
                )

            try:
                result = processor.extract_chunks(
                    str(fpath),
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP,
                    include_position_metadata=True,
                )

                chunks_with_meta = list(result.chunks_with_metadata)
                if not chunks_with_meta:
                    # Fallback: try plain text extraction
                    text = processor.extract_text(str(fpath))
                    if text and text.strip():
                        chunks_with_meta = [{"text": text, "page_number": None}]

                if not chunks_with_meta:
                    record_error(doc_id, rel, "empty extraction")
                    total_errors += 1
                    continue

                doc = ParsedDocument(
                    doc_id=doc_id,
                    source_path=str(rel),
                    title=title,
                    doc_type=doc_type,
                    category=category,
                    year=year,
                    metadata={"original_filename": fpath.name},
                    chunk_count=len(chunks_with_meta),
                )
                docs_f.write(json.dumps(asdict(doc), ensure_ascii=False) + "\n")
                total_docs += 1

                for idx, chunk_data in enumerate(chunks_with_meta):
                    # chunk_data may be a dict (with position metadata) or a
                    # bare string, depending on the extractor output.
                    is_dict = isinstance(chunk_data, dict)
                    text = chunk_data.get("text", "") if is_dict else str(chunk_data)
                    if not text.strip():
                        continue  # drop whitespace-only chunks (index is kept sparse)
                    chunk = ParsedChunk(
                        chunk_id=f"{doc_id}_c{idx:04d}",
                        doc_id=doc_id,
                        text=text.strip(),
                        index=idx,
                        page_number=chunk_data.get("page_number") if is_dict else None,
                        line_start=chunk_data.get("line_start") if is_dict else None,
                        line_end=chunk_data.get("line_end") if is_dict else None,
                    )
                    chunks_f.write(json.dumps(asdict(chunk), ensure_ascii=False) + "\n")
                    total_chunks += 1

            except Exception as exc:
                # Broad catch is deliberate: one bad file (HWP encoding
                # errors are common in this corpus) must not abort the batch.
                record_error(doc_id, rel, str(exc)[:500])
                total_errors += 1

    elapsed = time.time() - start
    print(f"\n{'='*60}")
    print(f"KRRA Parse Complete — {elapsed:.1f}s")
    print(f" Documents: {total_docs}")
    print(f" Chunks: {total_chunks}")
    print(f" Errors: {total_errors}")
    print(f" Output: {OUT_DIR.relative_to(REPO_ROOT)}/")
    print(f"{'='*60}")
206+
207+
208+
if __name__ == "__main__":
    # Script entry point: parse the whole corpus in one pass.
    parse_all()

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)