Skip to content

Commit b71ac85

Browse files
authored
Merge pull request #9 from CocoRoF/main
feat: Empower Chunking Method
2 parents caf79de + 53f4185 commit b71ac85

10 files changed

Lines changed: 1607 additions & 643 deletions

contextifier/chunking/chunking.py

Lines changed: 191 additions & 43 deletions
Large diffs are not rendered by default.

contextifier/chunking/constants.py

Lines changed: 67 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
# contextifier/chunking/constants.py
22
"""
3-
Chunking Module Constants - 청킹 관련 상수, 패턴, 데이터클래스 정의
3+
Chunking Module Constants - Definition of constants, patterns, and dataclasses for chunking
44
5-
이 모듈은 청킹 시스템 전반에서 사용되는 모든 상수와 데이터 구조를 정의합니다.
5+
This module defines all constants and data structures used throughout the chunking system.
66
"""
77
import logging
88
from dataclasses import dataclass
@@ -13,7 +13,7 @@
1313

1414

1515
# ============================================================================
16-
# 코드 언어 맵핑
16+
# Code Language Mapping
1717
# ============================================================================
1818

1919
LANGCHAIN_CODE_LANGUAGE_MAP = {
@@ -27,71 +27,108 @@
2727

2828

2929
# ============================================================================
30-
# 보호 영역 패턴 (청킹 시 분할되지 않아야 하는 블록)
30+
# Protected Region Patterns (Blocks that should not be split during chunking)
3131
# ============================================================================
3232

33-
# HTML 테이블 - 모든 <table> 태그 보호 (속성에 관계없이)
33+
# HTML table - Protect all <table> tags (regardless of attributes)
3434
HTML_TABLE_PATTERN = r'<table[^>]*>.*?</table>'
3535

36-
# 차트 블록 - 항상 보호됨 (어떤 조건에서도 chunking 불가)
36+
# Chart block - Always protected (cannot be chunked under any condition)
37+
# Default format: [chart]...[/chart] - can be customized via ChartProcessor
3738
CHART_BLOCK_PATTERN = r'\[chart\].*?\[/chart\]'
3839

39-
# 텍스트박스 블록 - 항상 보호됨 (어떤 조건에서도 chunking 불가)
40+
# Textbox block - Always protected (cannot be chunked under any condition)
4041
TEXTBOX_BLOCK_PATTERN = r'\[textbox\].*?\[/textbox\]'
4142

42-
# 이미지 태그 - 항상 보호됨 (어떤 조건에서도 chunking 불가)
43-
# 형식: [image:path], [Image: {path}], [image : path] 등 (대소문자 무관, 띄어쓰기 허용, {} 감싸기 허용)
43+
# Image tag - Always protected (cannot be chunked under any condition)
44+
# Format: [image:path], [Image: {path}], [image : path] etc. (case-insensitive, whitespace allowed, {} wrapping allowed)
4445
IMAGE_TAG_PATTERN = r'\[(?i:image)\s*:\s*\{?[^\]\}]+\}?\]'
4546

46-
# Markdown 테이블 (| 로 시작하는 연속된 행들, 헤더 구분선 |---|---| 포함)
47+
# Page/Slide/Sheet tag patterns - Always protected (NEVER overlap)
48+
# Default formats from PageTagProcessor
49+
PAGE_TAG_PATTERN = r'\[Page Number:\s*\d+\]'
50+
SLIDE_TAG_PATTERN = r'\[Slide Number:\s*\d+\]'
51+
SHEET_TAG_PATTERN = r'\[Sheet:\s*[^\]]+\]'
52+
53+
# OCR variants of page/slide tags
54+
PAGE_TAG_OCR_PATTERN = r'\[Page Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]'
55+
SLIDE_TAG_OCR_PATTERN = r'\[Slide Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]'
56+
57+
# Document metadata block - Always protected (NEVER overlap)
58+
# Default format: <Document-Metadata>...</Document-Metadata> - can be customized via MetadataFormatter
59+
METADATA_BLOCK_PATTERN = r'<Document-Metadata>.*?</Document-Metadata>'
60+
61+
# Data analysis block - Always protected
62+
DATA_ANALYSIS_PATTERN = r'\[(?:Data Analysis|데이터 분석)\].*?\[/(?:Data Analysis|데이터 분석)\]'
63+
64+
# Markdown table patterns
65+
# Complete Markdown table pattern (rows starting with |, including header separator |---|---|)
4766
MARKDOWN_TABLE_PATTERN = r'(?:^|\n)(\|[^\n]+\|\n\|[-:|\s]+\|\n(?:\|[^\n]+\|(?:\n|$))+)'
4867

49-
# Markdown 테이블 개별 행 패턴 (row 단위 보호용)
68+
# Markdown table individual row pattern (for row-level protection)
5069
MARKDOWN_TABLE_ROW_PATTERN = r'\|[^\n]+\|'
5170

71+
# Markdown table header separator pattern (|---|---| or |:---:|---| etc.)
72+
MARKDOWN_TABLE_SEPARATOR_PATTERN = r'^\|[\s\-:]+\|[\s\-:|]*$'
73+
74+
# Markdown table header detection (first row followed by separator)
75+
MARKDOWN_TABLE_HEADER_PATTERN = r'^(\|[^\n]+\|\n)(\|[-:|\s]+\|)'
76+
5277

5378
# ============================================================================
54-
# 테이블 청킹 관련 상수
79+
# Table Chunking Related Constants
5580
# ============================================================================
5681

57-
# 테이블 래핑 오버헤드 (테이블 태그, 줄바꿈 등)
82+
# Table wrapping overhead (table tags, line breaks, etc.)
5883
TABLE_WRAPPER_OVERHEAD = 30 # <table border='1'>\n</table>
5984

60-
# 행당 최소 오버헤드 (<tr>\n</tr>)
85+
# Minimum overhead per row (<tr>\n</tr>)
6186
ROW_OVERHEAD = 12
6287

63-
# 셀당 오버헤드 (<td></td> 또는 <th></th>)
88+
# Overhead per cell (<td></td> or <th></th>)
6489
CELL_OVERHEAD = 10
6590

66-
# 청크 인덱스 메타데이터 오버헤드
67-
CHUNK_INDEX_OVERHEAD = 30 # [테이블 청크 1/10]\n
91+
# Chunk index metadata overhead
92+
CHUNK_INDEX_OVERHEAD = 30 # [Table chunk 1/10]\n
6893

69-
# 테이블이 이 크기보다 크면 분할 대상
70-
TABLE_SIZE_THRESHOLD_MULTIPLIER = 1.2 # chunk_size의 1.2배
94+
# Tables larger than this are subject to splitting
95+
TABLE_SIZE_THRESHOLD_MULTIPLIER = 1.2 # 1.2x of chunk_size
7196

72-
# 테이블 기반 파일 타입 (CSV, Excel)
97+
# Table-based file types (CSV, TSV, Excel)
7398
TABLE_BASED_FILE_TYPES = {'csv', 'tsv', 'xlsx', 'xls'}
7499

75100

76101
# ============================================================================
77-
# 데이터클래스
102+
# Dataclasses
78103
# ============================================================================
79104

80105
@dataclass
81106
class TableRow:
82-
"""테이블 행 데이터"""
83-
html: str
107+
"""Table row data (HTML or Markdown)"""
108+
html: str # Raw content (HTML or Markdown)
84109
is_header: bool
85110
cell_count: int
86111
char_length: int
87112

88113

89114
@dataclass
90115
class ParsedTable:
91-
"""파싱된 테이블 정보"""
92-
header_rows: List[TableRow] # 헤더 행들
93-
data_rows: List[TableRow] # 데이터 행들
94-
total_cols: int # 총 열 수
95-
original_html: str # 원본 HTML
96-
header_html: str # 헤더 HTML (재사용용)
97-
header_size: int # 헤더 크기 (문자 수)
116+
"""Parsed table information (HTML)"""
117+
header_rows: List[TableRow] # Header rows
118+
data_rows: List[TableRow] # Data rows
119+
total_cols: int # Total columns
120+
original_html: str # Original HTML
121+
header_html: str # Header HTML (for reuse)
122+
header_size: int # Header size (characters)
123+
124+
125+
@dataclass
126+
class ParsedMarkdownTable:
127+
"""Parsed Markdown table information"""
128+
header_row: str # Header row (first row with column names)
129+
separator_row: str # Separator row (|---|---|)
130+
data_rows: List[str] # Data rows
131+
total_cols: int # Total columns
132+
original_text: str # Original Markdown text
133+
header_text: str # Header + separator for reuse
134+
header_size: int # Header size (characters)

0 commit comments

Comments
 (0)