2626
2727# Import from individual modules
2828from contextifier .chunking .constants import (
29- LANGCHAIN_CODE_LANGUAGE_MAP ,
30- HTML_TABLE_PATTERN ,
31- CHART_BLOCK_PATTERN ,
32- TEXTBOX_BLOCK_PATTERN ,
33- IMAGE_TAG_PATTERN ,
34- MARKDOWN_TABLE_PATTERN ,
35- TABLE_WRAPPER_OVERHEAD ,
36- CHUNK_INDEX_OVERHEAD ,
3729 TABLE_SIZE_THRESHOLD_MULTIPLIER ,
3830 TABLE_BASED_FILE_TYPES ,
39- TableRow ,
40- ParsedTable ,
41- )
42-
43- from contextifier .chunking .table_parser import (
44- parse_html_table as _parse_html_table ,
45- extract_cell_spans as _extract_cell_spans ,
46- extract_cell_spans_with_positions as _extract_cell_spans_with_positions ,
47- has_complex_spans as _has_complex_spans ,
31+ HTML_TABLE_PATTERN ,
4832)
49-
5033from contextifier .chunking .table_chunker import (
51- calculate_available_space as _calculate_available_space ,
52- adjust_rowspan_in_chunk as _adjust_rowspan_in_chunk ,
53- build_table_chunk as _build_table_chunk ,
54- update_chunk_metadata as _update_chunk_metadata ,
55- split_table_into_chunks as _split_table_into_chunks ,
56- split_table_preserving_rowspan as _split_table_preserving_rowspan ,
5734 chunk_large_table as _chunk_large_table ,
5835)
5936
6037from contextifier .chunking .protected_regions import (
6138 find_protected_regions as _find_protected_regions ,
6239 get_protected_region_positions as _get_protected_region_positions ,
63- ensure_protected_region_integrity as _ensure_protected_region_integrity ,
6440 split_with_protected_regions as _split_with_protected_regions ,
65- split_large_chunk_with_protected_regions as _split_large_chunk_with_protected_regions ,
66- ensure_table_integrity as _ensure_table_integrity ,
67- split_large_chunk_with_table_protection as _split_large_chunk_with_table_protection ,
6841)
6942
7043from contextifier .chunking .page_chunker import (
71- split_into_pages as _split_into_pages ,
72- merge_pages as _merge_pages ,
73- get_overlap_content as _get_overlap_content ,
7444 chunk_by_pages as _chunk_by_pages ,
7545)
7646
7747from contextifier .chunking .text_chunker import (
7848 chunk_plain_text as _chunk_plain_text ,
7949 chunk_text_without_tables ,
8050 chunk_with_row_protection ,
81- chunk_with_row_protection_simple ,
8251 clean_chunks as _clean_chunks ,
83- chunk_code_text ,
8452 reconstruct_text_from_chunks ,
8553 find_overlap_length ,
86- estimate_chunks_count ,
8754)
8855
8956from contextifier .chunking .sheet_processor import (
9057 extract_document_metadata as _extract_document_metadata ,
9158 prepend_metadata_to_chunks as _prepend_metadata_to_chunks ,
9259 extract_sheet_sections as _extract_sheet_sections ,
93- extract_content_segments as _extract_content_segments ,
9460 chunk_multi_sheet_content ,
9561 chunk_single_table_content ,
9662)
@@ -148,6 +114,24 @@ def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
148114 return r'\[Sheet:\s*([^\]]+)\]'
149115
150116
def _get_image_tag_pattern(image_processor: Optional[Any] = None) -> str:
    """
    Get the image tag regex pattern from an ImageProcessor, or the default.

    Args:
        image_processor: ImageProcessor instance (optional). When provided,
            the string returned by its ``get_pattern_string()`` method is
            passed through unchanged.

    Returns:
        Regex pattern string for image tags
    """
    # A configured processor always wins over the package default.
    if image_processor is not None:
        return image_processor.get_pattern_string()

    # No processor supplied: fall back to the package-wide default pattern.
    # The constant is imported at the point of use, as in the original code.
    from contextifier.chunking.constants import IMAGE_TAG_PATTERN
    return IMAGE_TAG_PATTERN
133+
134+
151135# ============================================================================
152136# Public API - Single entry point for external use
153137# ============================================================================
@@ -161,6 +145,7 @@ def create_chunks(
161145 include_position_metadata : bool = True ,
162146 chunking_strategy : str = "recursive" ,
163147 page_tag_processor : Optional [Any ] = None ,
148+ image_processor : Optional [Any ] = None ,
164149 stride : Optional [int ] = None ,
165150 parent_chunk_size : Optional [int ] = None ,
166151 child_chunk_size : Optional [int ] = None ,
@@ -182,6 +167,9 @@ def create_chunks(
182167 page_tag_processor: PageTagProcessor instance for custom tag patterns
183168 - If None, uses default patterns [Page Number: n], [Slide Number: n], [Sheet: name]
184169 - If provided, uses the processor's configured patterns
170+ image_processor: ImageProcessor instance for custom image tag patterns
171+ - If None, uses default pattern [Image:...]
172+ - If provided, uses the processor's configured patterns for protected regions
185173 stride: Stride for sliding window strategy - future implementation
186174 parent_chunk_size: Parent chunk size for hierarchical strategy - future implementation
187175 child_chunk_size: Child chunk size for hierarchical strategy - future implementation
@@ -204,7 +192,8 @@ def create_chunks(
204192 text , chunk_size , chunk_overlap ,
205193 file_extension = file_extension ,
206194 force_chunking = force_chunking ,
207- page_tag_processor = page_tag_processor
195+ page_tag_processor = page_tag_processor ,
196+ image_processor = image_processor
208197 )
209198
210199 # Return chunks without metadata
@@ -317,7 +306,8 @@ def _split_text(
317306 chunk_overlap : int ,
318307 file_extension : Optional [str ] = None ,
319308 force_chunking : Optional [bool ] = False ,
320- page_tag_processor : Optional [Any ] = None
309+ page_tag_processor : Optional [Any ] = None ,
310+ image_processor : Optional [Any ] = None
321311) -> List [str ]:
322312 """
323313 Split text into chunks. (Internal use)
@@ -338,6 +328,7 @@ def _split_text(
338328 file_extension: File extension (csv, xlsx, pdf, etc.) - used for table-based processing
339329 force_chunking: Force chunking (disable table protection except for table-based files)
340330 page_tag_processor: PageTagProcessor instance for custom tag patterns
331+ image_processor: ImageProcessor instance for custom image tag patterns
341332
342333 Returns:
343334 List of chunks
@@ -355,8 +346,7 @@ def _split_text(
355346
356347 if is_table_based :
357348 # Check for large tables
358- table_pattern = r'<table\s+border=["\']1["\']>.*?</table>'
359- table_matches = list (re .finditer (table_pattern , text , re .DOTALL | re .IGNORECASE ))
349+ table_matches = list (re .finditer (HTML_TABLE_PATTERN , text , re .DOTALL | re .IGNORECASE ))
360350
361351 # Need to split if table is larger than chunk_size
362352 has_large_table = any (
@@ -377,14 +367,17 @@ def _split_text(
377367 page_marker_patterns = _get_page_marker_patterns (page_tag_processor )
378368 has_page_markers = any (re .search (pattern , text ) for pattern in page_marker_patterns )
379369
370+ # Get image tag pattern from ImageProcessor or use default
371+ image_pattern = _get_image_tag_pattern (image_processor )
372+
380373 if has_page_markers :
381374 # Page-based chunking
382375 logger .debug ("Page markers found, using page-based chunking" )
383- chunks = _chunk_by_pages (text , chunk_size , chunk_overlap , is_table_based , force_chunking , page_tag_processor )
376+ chunks = _chunk_by_pages (text , chunk_size , chunk_overlap , is_table_based , force_chunking , page_tag_processor , image_pattern )
384377 else :
385378 # Find protected regions (HTML tables, chart blocks, Markdown tables)
386379 # Disable table protection on force_chunking (charts are always protected)
387- protected_regions = _find_protected_regions (text , is_table_based , force_chunking )
380+ protected_regions = _find_protected_regions (text , is_table_based , force_chunking , image_pattern )
388381 protected_positions = _get_protected_region_positions (protected_regions )
389382
390383 if protected_positions :
@@ -400,10 +393,10 @@ def _split_text(
400393 chunks = _chunk_with_row_protection (text , chunk_size , chunk_overlap , force_chunking )
401394 else :
402395 logger .debug ("No protected blocks found, using standard chunking" )
403- return _chunk_text_without_tables (text , chunk_size , chunk_overlap , metadata_block )
396+ return _chunk_text_without_tables (text , chunk_size , chunk_overlap , metadata_block , page_tag_processor )
404397
405398 # Clean chunks
406- cleaned_chunks = _clean_chunks (chunks )
399+ cleaned_chunks = _clean_chunks (chunks , page_tag_processor )
407400
408401 # Add metadata
409402 cleaned_chunks = _prepend_metadata_to_chunks (cleaned_chunks , metadata_block )
@@ -412,22 +405,6 @@ def _split_text(
412405
413406 return cleaned_chunks
414407
415-
416- def _is_table_based_file_type (file_extension : Optional [str ]) -> bool :
417- """
418- Check if the file extension is a table-based file type. (Internal use)
419-
420- Args:
421- file_extension: File extension
422-
423- Returns:
424- True if table-based file type
425- """
426- if not file_extension :
427- return False
428- return file_extension .lower () in TABLE_BASED_FILE_TYPES
429-
430-
431408# ============================================================================
432409# Internal Wrapper Functions
433410# ============================================================================
def _chunk_text_without_tables(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    metadata: Optional[str],
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Chunking logic for text without tables.
    Wrapper function for chunk_text_without_tables.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks
        metadata: Metadata block forwarded to chunk_text_without_tables (may be None)
        page_tag_processor: PageTagProcessor instance for custom tag patterns (optional)

    Returns:
        List of chunks as returned by chunk_text_without_tables
    """
    # Arguments are forwarded positionally in the order the callee expects;
    # _prepend_metadata_to_chunks is handed over as a callable, not invoked here.
    return chunk_text_without_tables(
        text, chunk_size, chunk_overlap, metadata,
        _prepend_metadata_to_chunks,
        page_tag_processor
    )
449428
450429
@@ -656,84 +635,4 @@ def _page_for_pos(p: int) -> int:
656635 return table
657636
658637 except Exception :
659- return [{"line_num" : 1 , "start" : 0 , "end" : len (text ), "page" : 1 }]
660-
661-
662- # ============================================================================
663- # Backward Compatibility - Deprecated public functions
664- # ============================================================================
665-
666- def split_table_based_content (
667- text : str ,
668- chunk_size : int ,
669- chunk_overlap : int
670- ) -> List [str ]:
671- """
672- Chunk table-based content (CSV/Excel).
673-
674- .. deprecated::
675- Use `create_chunks()` instead. This function is kept for backward compatibility.
676-
677- Args:
678- text: Full text (metadata + table)
679- chunk_size: Maximum chunk size
680- chunk_overlap: Overlap size between chunks
681-
682- Returns:
683- List of chunks
684- """
685- logger .warning (
686- "split_table_based_content() is deprecated. "
687- "Use create_chunks() with appropriate file_extension instead."
688- )
689- return _split_table_based_content (text , chunk_size , chunk_overlap )
690-
691-
692- def split_text_preserving_html_blocks (
693- text : str ,
694- chunk_size : int ,
695- chunk_overlap : int ,
696- file_extension : Optional [str ] = None ,
697- force_chunking : Optional [bool ] = False
698- ) -> List [str ]:
699- """
700- Chunk text while preserving HTML tables and considering page boundaries.
701-
702- .. deprecated::
703- Use `create_chunks()` instead. This function is kept for backward compatibility.
704-
705- Args:
706- text: Original text
707- chunk_size: Maximum chunk size
708- chunk_overlap: Overlap size between chunks
709- file_extension: File extension (csv, xlsx, pdf, etc.)
710- force_chunking: Force chunking (disable table protection)
711-
712- Returns:
713- List of chunks
714- """
715- logger .warning (
716- "split_text_preserving_html_blocks() is deprecated. "
717- "Use create_chunks() instead."
718- )
719- return _split_text (
720- text , chunk_size , chunk_overlap ,
721- file_extension = file_extension ,
722- force_chunking = force_chunking
723- )
724-
725-
726- def is_table_based_file_type (file_extension : Optional [str ]) -> bool :
727- """
728- Check if the file extension is a table-based file type.
729-
730- .. deprecated::
731- Use `_is_table_based_file_type()` instead (internal use only).
732-
733- Args:
734- file_extension: File extension
735-
736- Returns:
737- True if table-based file type
738- """
739- return _is_table_based_file_type (file_extension )
638+ return [{"line_num" : 1 , "start" : 0 , "end" : len (text ), "page" : 1 }]
0 commit comments