Skip to content

Commit f5104f1

Browse files
authored
Merge pull request #7 from CocoRoF/main
feat: refactoring
2 parents 39ff5a2 + 590eef8 commit f5104f1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+4274
-2093
lines changed

contextifier/chunking/__init__.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,7 @@
2121

2222
# === Main Chunking Functions (chunking.py) ===
2323
from contextifier.chunking.chunking import (
24-
# Primary API
2524
create_chunks,
26-
# Backward compatibility (deprecated)
27-
split_text_preserving_html_blocks,
28-
split_table_based_content,
29-
is_table_based_file_type,
3025
)
3126

3227
# constants
@@ -113,10 +108,6 @@
113108
__all__ = [
114109
# === Primary API ===
115110
"create_chunks",
116-
# === Backward Compatibility (deprecated) ===
117-
"split_text_preserving_html_blocks",
118-
"split_table_based_content",
119-
"is_table_based_file_type",
120111
# constants
121112
"LANGCHAIN_CODE_LANGUAGE_MAP",
122113
"HTML_TABLE_PATTERN",

contextifier/chunking/chunking.py

Lines changed: 41 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -26,71 +26,37 @@
2626

2727
# Import from individual modules
2828
from contextifier.chunking.constants import (
29-
LANGCHAIN_CODE_LANGUAGE_MAP,
30-
HTML_TABLE_PATTERN,
31-
CHART_BLOCK_PATTERN,
32-
TEXTBOX_BLOCK_PATTERN,
33-
IMAGE_TAG_PATTERN,
34-
MARKDOWN_TABLE_PATTERN,
35-
TABLE_WRAPPER_OVERHEAD,
36-
CHUNK_INDEX_OVERHEAD,
3729
TABLE_SIZE_THRESHOLD_MULTIPLIER,
3830
TABLE_BASED_FILE_TYPES,
39-
TableRow,
40-
ParsedTable,
41-
)
42-
43-
from contextifier.chunking.table_parser import (
44-
parse_html_table as _parse_html_table,
45-
extract_cell_spans as _extract_cell_spans,
46-
extract_cell_spans_with_positions as _extract_cell_spans_with_positions,
47-
has_complex_spans as _has_complex_spans,
31+
HTML_TABLE_PATTERN,
4832
)
49-
5033
from contextifier.chunking.table_chunker import (
51-
calculate_available_space as _calculate_available_space,
52-
adjust_rowspan_in_chunk as _adjust_rowspan_in_chunk,
53-
build_table_chunk as _build_table_chunk,
54-
update_chunk_metadata as _update_chunk_metadata,
55-
split_table_into_chunks as _split_table_into_chunks,
56-
split_table_preserving_rowspan as _split_table_preserving_rowspan,
5734
chunk_large_table as _chunk_large_table,
5835
)
5936

6037
from contextifier.chunking.protected_regions import (
6138
find_protected_regions as _find_protected_regions,
6239
get_protected_region_positions as _get_protected_region_positions,
63-
ensure_protected_region_integrity as _ensure_protected_region_integrity,
6440
split_with_protected_regions as _split_with_protected_regions,
65-
split_large_chunk_with_protected_regions as _split_large_chunk_with_protected_regions,
66-
ensure_table_integrity as _ensure_table_integrity,
67-
split_large_chunk_with_table_protection as _split_large_chunk_with_table_protection,
6841
)
6942

7043
from contextifier.chunking.page_chunker import (
71-
split_into_pages as _split_into_pages,
72-
merge_pages as _merge_pages,
73-
get_overlap_content as _get_overlap_content,
7444
chunk_by_pages as _chunk_by_pages,
7545
)
7646

7747
from contextifier.chunking.text_chunker import (
7848
chunk_plain_text as _chunk_plain_text,
7949
chunk_text_without_tables,
8050
chunk_with_row_protection,
81-
chunk_with_row_protection_simple,
8251
clean_chunks as _clean_chunks,
83-
chunk_code_text,
8452
reconstruct_text_from_chunks,
8553
find_overlap_length,
86-
estimate_chunks_count,
8754
)
8855

8956
from contextifier.chunking.sheet_processor import (
9057
extract_document_metadata as _extract_document_metadata,
9158
prepend_metadata_to_chunks as _prepend_metadata_to_chunks,
9259
extract_sheet_sections as _extract_sheet_sections,
93-
extract_content_segments as _extract_content_segments,
9460
chunk_multi_sheet_content,
9561
chunk_single_table_content,
9662
)
@@ -148,6 +114,24 @@ def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
148114
return r'\[Sheet:\s*([^\]]+)\]'
149115

150116

117+
def _get_image_tag_pattern(image_processor: Optional[Any] = None) -> str:
118+
"""
119+
Get image tag regex pattern from ImageProcessor or use default.
120+
121+
Args:
122+
image_processor: ImageProcessor instance (optional)
123+
124+
Returns:
125+
Regex pattern for image tags
126+
"""
127+
if image_processor is not None:
128+
return image_processor.get_pattern_string()
129+
else:
130+
# Default pattern: [Image:...] or [image:...] with optional spaces and braces
131+
from contextifier.chunking.constants import IMAGE_TAG_PATTERN
132+
return IMAGE_TAG_PATTERN
133+
134+
151135
# ============================================================================
152136
# Public API - Single entry point for external use
153137
# ============================================================================
@@ -161,6 +145,7 @@ def create_chunks(
161145
include_position_metadata: bool = True,
162146
chunking_strategy: str = "recursive",
163147
page_tag_processor: Optional[Any] = None,
148+
image_processor: Optional[Any] = None,
164149
stride: Optional[int] = None,
165150
parent_chunk_size: Optional[int] = None,
166151
child_chunk_size: Optional[int] = None,
@@ -182,6 +167,9 @@ def create_chunks(
182167
page_tag_processor: PageTagProcessor instance for custom tag patterns
183168
- If None, uses default patterns [Page Number: n], [Slide Number: n], [Sheet: name]
184169
- If provided, uses the processor's configured patterns
170+
image_processor: ImageProcessor instance for custom image tag patterns
171+
- If None, uses default pattern [Image:...]
172+
- If provided, uses the processor's configured patterns for protected regions
185173
stride: Stride for sliding window strategy - future implementation
186174
parent_chunk_size: Parent chunk size for hierarchical strategy - future implementation
187175
child_chunk_size: Child chunk size for hierarchical strategy - future implementation
@@ -204,7 +192,8 @@ def create_chunks(
204192
text, chunk_size, chunk_overlap,
205193
file_extension=file_extension,
206194
force_chunking=force_chunking,
207-
page_tag_processor=page_tag_processor
195+
page_tag_processor=page_tag_processor,
196+
image_processor=image_processor
208197
)
209198

210199
# Return chunks without metadata
@@ -317,7 +306,8 @@ def _split_text(
317306
chunk_overlap: int,
318307
file_extension: Optional[str] = None,
319308
force_chunking: Optional[bool] = False,
320-
page_tag_processor: Optional[Any] = None
309+
page_tag_processor: Optional[Any] = None,
310+
image_processor: Optional[Any] = None
321311
) -> List[str]:
322312
"""
323313
Split text into chunks. (Internal use)
@@ -338,6 +328,7 @@ def _split_text(
338328
file_extension: File extension (csv, xlsx, pdf, etc.) - used for table-based processing
339329
force_chunking: Force chunking (disable table protection except for table-based files)
340330
page_tag_processor: PageTagProcessor instance for custom tag patterns
331+
image_processor: ImageProcessor instance for custom image tag patterns
341332
342333
Returns:
343334
List of chunks
@@ -355,8 +346,7 @@ def _split_text(
355346

356347
if is_table_based:
357348
# Check for large tables
358-
table_pattern = r'<table\s+border=["\']1["\']>.*?</table>'
359-
table_matches = list(re.finditer(table_pattern, text, re.DOTALL | re.IGNORECASE))
349+
table_matches = list(re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE))
360350

361351
# Need to split if table is larger than chunk_size
362352
has_large_table = any(
@@ -377,14 +367,17 @@ def _split_text(
377367
page_marker_patterns = _get_page_marker_patterns(page_tag_processor)
378368
has_page_markers = any(re.search(pattern, text) for pattern in page_marker_patterns)
379369

370+
# Get image tag pattern from ImageProcessor or use default
371+
image_pattern = _get_image_tag_pattern(image_processor)
372+
380373
if has_page_markers:
381374
# Page-based chunking
382375
logger.debug("Page markers found, using page-based chunking")
383-
chunks = _chunk_by_pages(text, chunk_size, chunk_overlap, is_table_based, force_chunking, page_tag_processor)
376+
chunks = _chunk_by_pages(text, chunk_size, chunk_overlap, is_table_based, force_chunking, page_tag_processor, image_pattern)
384377
else:
385378
# Find protected regions (HTML tables, chart blocks, Markdown tables)
386379
# Disable table protection on force_chunking (charts are always protected)
387-
protected_regions = _find_protected_regions(text, is_table_based, force_chunking)
380+
protected_regions = _find_protected_regions(text, is_table_based, force_chunking, image_pattern)
388381
protected_positions = _get_protected_region_positions(protected_regions)
389382

390383
if protected_positions:
@@ -400,10 +393,10 @@ def _split_text(
400393
chunks = _chunk_with_row_protection(text, chunk_size, chunk_overlap, force_chunking)
401394
else:
402395
logger.debug("No protected blocks found, using standard chunking")
403-
return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block)
396+
return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block, page_tag_processor)
404397

405398
# Clean chunks
406-
cleaned_chunks = _clean_chunks(chunks)
399+
cleaned_chunks = _clean_chunks(chunks, page_tag_processor)
407400

408401
# Add metadata
409402
cleaned_chunks = _prepend_metadata_to_chunks(cleaned_chunks, metadata_block)
@@ -412,22 +405,6 @@ def _split_text(
412405

413406
return cleaned_chunks
414407

415-
416-
def _is_table_based_file_type(file_extension: Optional[str]) -> bool:
417-
"""
418-
Check if the file extension is a table-based file type. (Internal use)
419-
420-
Args:
421-
file_extension: File extension
422-
423-
Returns:
424-
True if table-based file type
425-
"""
426-
if not file_extension:
427-
return False
428-
return file_extension.lower() in TABLE_BASED_FILE_TYPES
429-
430-
431408
# ============================================================================
432409
# Internal Wrapper Functions
433410
# ============================================================================
@@ -436,15 +413,17 @@ def _chunk_text_without_tables(
436413
text: str,
437414
chunk_size: int,
438415
chunk_overlap: int,
439-
metadata: Optional[str]
416+
metadata: Optional[str],
417+
page_tag_processor: Optional[Any] = None
440418
) -> List[str]:
441419
"""
442420
Chunking logic for text without tables.
443421
Wrapper function for chunk_text_without_tables.
444422
"""
445423
return chunk_text_without_tables(
446424
text, chunk_size, chunk_overlap, metadata,
447-
_prepend_metadata_to_chunks
425+
_prepend_metadata_to_chunks,
426+
page_tag_processor
448427
)
449428

450429

@@ -656,84 +635,4 @@ def _page_for_pos(p: int) -> int:
656635
return table
657636

658637
except Exception:
659-
return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]
660-
661-
662-
# ============================================================================
663-
# Backward Compatibility - Deprecated public functions
664-
# ============================================================================
665-
666-
def split_table_based_content(
667-
text: str,
668-
chunk_size: int,
669-
chunk_overlap: int
670-
) -> List[str]:
671-
"""
672-
Chunk table-based content (CSV/Excel).
673-
674-
.. deprecated::
675-
Use `create_chunks()` instead. This function is kept for backward compatibility.
676-
677-
Args:
678-
text: Full text (metadata + table)
679-
chunk_size: Maximum chunk size
680-
chunk_overlap: Overlap size between chunks
681-
682-
Returns:
683-
List of chunks
684-
"""
685-
logger.warning(
686-
"split_table_based_content() is deprecated. "
687-
"Use create_chunks() with appropriate file_extension instead."
688-
)
689-
return _split_table_based_content(text, chunk_size, chunk_overlap)
690-
691-
692-
def split_text_preserving_html_blocks(
693-
text: str,
694-
chunk_size: int,
695-
chunk_overlap: int,
696-
file_extension: Optional[str] = None,
697-
force_chunking: Optional[bool] = False
698-
) -> List[str]:
699-
"""
700-
Chunk text while preserving HTML tables and considering page boundaries.
701-
702-
.. deprecated::
703-
Use `create_chunks()` instead. This function is kept for backward compatibility.
704-
705-
Args:
706-
text: Original text
707-
chunk_size: Maximum chunk size
708-
chunk_overlap: Overlap size between chunks
709-
file_extension: File extension (csv, xlsx, pdf, etc.)
710-
force_chunking: Force chunking (disable table protection)
711-
712-
Returns:
713-
List of chunks
714-
"""
715-
logger.warning(
716-
"split_text_preserving_html_blocks() is deprecated. "
717-
"Use create_chunks() instead."
718-
)
719-
return _split_text(
720-
text, chunk_size, chunk_overlap,
721-
file_extension=file_extension,
722-
force_chunking=force_chunking
723-
)
724-
725-
726-
def is_table_based_file_type(file_extension: Optional[str]) -> bool:
727-
"""
728-
Check if the file extension is a table-based file type.
729-
730-
.. deprecated::
731-
Use `_is_table_based_file_type()` instead (internal use only).
732-
733-
Args:
734-
file_extension: File extension
735-
736-
Returns:
737-
True if table-based file type
738-
"""
739-
return _is_table_based_file_type(file_extension)
638+
return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]

contextifier/chunking/page_chunker.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"""
1010
import logging
1111
import re
12-
from typing import List, Tuple
12+
from typing import List, Optional, Tuple
1313

1414
from contextifier.chunking.protected_regions import (
1515
find_protected_regions, get_protected_region_positions,
@@ -95,7 +95,8 @@ def chunk_by_pages(
9595
chunk_overlap: int,
9696
is_table_based: bool = False,
9797
force_chunking: bool = False,
98-
page_tag_processor = None
98+
page_tag_processor = None,
99+
image_pattern: Optional[str] = None
99100
) -> List[str]:
100101
"""
101102
페이지 단위로 텍스트를 청킹합니다.
@@ -117,6 +118,7 @@ def chunk_by_pages(
117118
is_table_based: Whether the file is table-based
118119
force_chunking: Force chunking (disable table protection)
119120
page_tag_processor: PageTagProcessor instance for custom patterns
121+
image_pattern: Custom regex pattern for image tags
120122
"""
121123
# Build page marker patterns from PageTagProcessor or use defaults
122124
if page_tag_processor is not None:
@@ -147,9 +149,9 @@ def chunk_by_pages(
147149

148150
logger.debug(f"Split into {len(pages)} pages")
149151

150-
# 보호 영역 위치 식별 (HTML 테이블, 차트 블록, Markdown 테이블)
152+
# 보호 영역 위치 식별 (HTML 테이블, 차트 블록, Markdown 테이블, 이미지 태그)
151153
# force_chunking 시 테이블 보호 해제 (차트는 항상 보호)
152-
protected_regions = find_protected_regions(text, is_table_based, force_chunking)
154+
protected_regions = find_protected_regions(text, is_table_based, force_chunking, image_pattern)
153155
protected_positions = get_protected_region_positions(protected_regions)
154156

155157
# 페이지 병합하여 청크 생성
@@ -219,7 +221,7 @@ def chunk_by_pages(
219221
for chunk in chunks:
220222
if len(chunk) > max_size * 1.5:
221223
# 매우 큰 청크: 보호 영역 보호하면서 분할
222-
sub_chunks = split_large_chunk_with_protected_regions(chunk, chunk_size, chunk_overlap, is_table_based, force_chunking)
224+
sub_chunks = split_large_chunk_with_protected_regions(chunk, chunk_size, chunk_overlap, is_table_based, force_chunking, image_pattern)
223225
final_chunks.extend(sub_chunks)
224226
else:
225227
final_chunks.append(chunk)

0 commit comments

Comments
 (0)