CocoRoF
diff --git a/contextifier/chunking/__init__.py b/contextifier/chunking/__init__.py
Lines changed: 0 additions & 9 deletions
diff --git a/contextifier/chunking/chunking.py b/contextifier/chunking/chunking.py
Lines changed: 41 additions & 142 deletions
diff --git a/contextifier/chunking/page_chunker.py b/contextifier/chunking/page_chunker.py
Lines changed: 7 additions & 5 deletions
@@ -21,12 +21,7 @@
 
 # === Main Chunking Functions (chunking.py) ===
 from contextifier.chunking.chunking import (
-    # Primary API
     create_chunks,
-    # Backward compatibility (deprecated)
-    split_text_preserving_html_blocks,
-    split_table_based_content,
-    is_table_based_file_type,
 )
 
 # constants
@@ -113,10 +108,6 @@
 __all__ = [
     # === Primary API ===
     "create_chunks",
-    # === Backward Compatibility (deprecated) ===
-    "split_text_preserving_html_blocks",
-    "split_table_based_content",
-    "is_table_based_file_type",
     # constants
     "LANGCHAIN_CODE_LANGUAGE_MAP",
     "HTML_TABLE_PATTERN",
 
@@ -26,71 +26,37 @@
 
 # Import from individual modules
 from contextifier.chunking.constants import (
-    LANGCHAIN_CODE_LANGUAGE_MAP,
-    HTML_TABLE_PATTERN,
-    CHART_BLOCK_PATTERN,
-    TEXTBOX_BLOCK_PATTERN,
-    IMAGE_TAG_PATTERN,
-    MARKDOWN_TABLE_PATTERN,
-    TABLE_WRAPPER_OVERHEAD,
-    CHUNK_INDEX_OVERHEAD,
     TABLE_SIZE_THRESHOLD_MULTIPLIER,
     TABLE_BASED_FILE_TYPES,
-    TableRow,
-    ParsedTable,
-)
-
-from contextifier.chunking.table_parser import (
-    parse_html_table as _parse_html_table,
-    extract_cell_spans as _extract_cell_spans,
-    extract_cell_spans_with_positions as _extract_cell_spans_with_positions,
-    has_complex_spans as _has_complex_spans,
+    HTML_TABLE_PATTERN,
 )
-
 from contextifier.chunking.table_chunker import (
-    calculate_available_space as _calculate_available_space,
-    adjust_rowspan_in_chunk as _adjust_rowspan_in_chunk,
-    build_table_chunk as _build_table_chunk,
-    update_chunk_metadata as _update_chunk_metadata,
-    split_table_into_chunks as _split_table_into_chunks,
-    split_table_preserving_rowspan as _split_table_preserving_rowspan,
     chunk_large_table as _chunk_large_table,
 )
 
 from contextifier.chunking.protected_regions import (
     find_protected_regions as _find_protected_regions,
     get_protected_region_positions as _get_protected_region_positions,
-    ensure_protected_region_integrity as _ensure_protected_region_integrity,
     split_with_protected_regions as _split_with_protected_regions,
-    split_large_chunk_with_protected_regions as _split_large_chunk_with_protected_regions,
-    ensure_table_integrity as _ensure_table_integrity,
-    split_large_chunk_with_table_protection as _split_large_chunk_with_table_protection,
 )
 
 from contextifier.chunking.page_chunker import (
-    split_into_pages as _split_into_pages,
-    merge_pages as _merge_pages,
-    get_overlap_content as _get_overlap_content,
     chunk_by_pages as _chunk_by_pages,
 )
 
 from contextifier.chunking.text_chunker import (
     chunk_plain_text as _chunk_plain_text,
     chunk_text_without_tables,
     chunk_with_row_protection,
-    chunk_with_row_protection_simple,
     clean_chunks as _clean_chunks,
-    chunk_code_text,
     reconstruct_text_from_chunks,
     find_overlap_length,
-    estimate_chunks_count,
 )
 
 from contextifier.chunking.sheet_processor import (
     extract_document_metadata as _extract_document_metadata,
     prepend_metadata_to_chunks as _prepend_metadata_to_chunks,
     extract_sheet_sections as _extract_sheet_sections,
-    extract_content_segments as _extract_content_segments,
     chunk_multi_sheet_content,
     chunk_single_table_content,
 )
@@ -148,6 +114,24 @@ def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
         return r'\[Sheet:\s*([^\]]+)\]'
 
 
+def _get_image_tag_pattern(image_processor: Optional[Any] = None) -> str:
+    """
+    Resolve the regex pattern string used to detect image tags.
+
+    Args:
+        image_processor: Object exposing get_pattern_string(), or None.
+
+    Returns:
+        The processor's pattern string, or IMAGE_TAG_PATTERN when no
+        processor is supplied.
+    """
+    if image_processor is None:
+        # No processor given: fall back to the package-level default pattern.
+        from contextifier.chunking.constants import IMAGE_TAG_PATTERN
+        return IMAGE_TAG_PATTERN
+    return image_processor.get_pattern_string()
+
+
 # ============================================================================
 # Public API - Single entry point for external use
 # ============================================================================
@@ -161,6 +145,7 @@ def create_chunks(
     include_position_metadata: bool = True,
     chunking_strategy: str = "recursive",
     page_tag_processor: Optional[Any] = None,
+    image_processor: Optional[Any] = None,
     stride: Optional[int] = None,
     parent_chunk_size: Optional[int] = None,
     child_chunk_size: Optional[int] = None,
@@ -182,6 +167,9 @@ def create_chunks(
         page_tag_processor: PageTagProcessor instance for custom tag patterns
             - If None, uses default patterns [Page Number: n], [Slide Number: n], [Sheet: name]
             - If provided, uses the processor's configured patterns
+        image_processor: ImageProcessor instance for custom image tag patterns
+            - If None, uses default pattern [Image:...]
+            - If provided, uses the processor's configured patterns for protected regions
         stride: Stride for sliding window strategy - future implementation
         parent_chunk_size: Parent chunk size for hierarchical strategy - future implementation
         child_chunk_size: Child chunk size for hierarchical strategy - future implementation
@@ -204,7 +192,8 @@ def create_chunks(
         text, chunk_size, chunk_overlap,
         file_extension=file_extension,
         force_chunking=force_chunking,
-        page_tag_processor=page_tag_processor
+        page_tag_processor=page_tag_processor,
+        image_processor=image_processor
     )
 
     # Return chunks without metadata
@@ -317,7 +306,8 @@ def _split_text(
     chunk_overlap: int,
     file_extension: Optional[str] = None,
     force_chunking: Optional[bool] = False,
-    page_tag_processor: Optional[Any] = None
+    page_tag_processor: Optional[Any] = None,
+    image_processor: Optional[Any] = None
 ) -> List[str]:
     """
     Split text into chunks. (Internal use)
@@ -338,6 +328,7 @@ def _split_text(
         file_extension: File extension (csv, xlsx, pdf, etc.) - used for table-based processing
         force_chunking: Force chunking (disable table protection except for table-based files)
         page_tag_processor: PageTagProcessor instance for custom tag patterns
+        image_processor: ImageProcessor instance for custom image tag patterns
 
     Returns:
         List of chunks
@@ -355,8 +346,7 @@ def _split_text(
 
     if is_table_based:
         # Check for large tables
-        table_pattern = r'<table\s+border=["\']1["\']>.*?</table>'
-        table_matches = list(re.finditer(table_pattern, text, re.DOTALL | re.IGNORECASE))
+        table_matches = list(re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE))
 
         # Need to split if table is larger than chunk_size
         has_large_table = any(
@@ -377,14 +367,17 @@ def _split_text(
     page_marker_patterns = _get_page_marker_patterns(page_tag_processor)
     has_page_markers = any(re.search(pattern, text) for pattern in page_marker_patterns)
 
+    # Get image tag pattern from ImageProcessor or use default
+    image_pattern = _get_image_tag_pattern(image_processor)
+
     if has_page_markers:
         # Page-based chunking
         logger.debug("Page markers found, using page-based chunking")
-        chunks = _chunk_by_pages(text, chunk_size, chunk_overlap, is_table_based, force_chunking, page_tag_processor)
+        chunks = _chunk_by_pages(text, chunk_size, chunk_overlap, is_table_based, force_chunking, page_tag_processor, image_pattern)
     else:
         # Find protected regions (HTML tables, chart blocks, Markdown tables)
         # Disable table protection on force_chunking (charts are always protected)
-        protected_regions = _find_protected_regions(text, is_table_based, force_chunking)
+        protected_regions = _find_protected_regions(text, is_table_based, force_chunking, image_pattern)
         protected_positions = _get_protected_region_positions(protected_regions)
 
         if protected_positions:
@@ -400,10 +393,10 @@ def _split_text(
                 chunks = _chunk_with_row_protection(text, chunk_size, chunk_overlap, force_chunking)
             else:
                 logger.debug("No protected blocks found, using standard chunking")
-                return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block)
+                return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block, page_tag_processor)
 
     # Clean chunks
-    cleaned_chunks = _clean_chunks(chunks)
+    cleaned_chunks = _clean_chunks(chunks, page_tag_processor)
 
     # Add metadata
     cleaned_chunks = _prepend_metadata_to_chunks(cleaned_chunks, metadata_block)
@@ -412,22 +405,6 @@ def _split_text(
 
     return cleaned_chunks
 
-
-def _is_table_based_file_type(file_extension: Optional[str]) -> bool:
-    """
-    Check if the file extension is a table-based file type. (Internal use)
-
-    Args:
-        file_extension: File extension
-
-    Returns:
-        True if table-based file type
-    """
-    if not file_extension:
-        return False
-    return file_extension.lower() in TABLE_BASED_FILE_TYPES
-
-
 # ============================================================================
 # Internal Wrapper Functions
 # ============================================================================
@@ -436,15 +413,17 @@ def _chunk_text_without_tables(
     text: str,
     chunk_size: int,
     chunk_overlap: int,
-    metadata: Optional[str]
+    metadata: Optional[str],
+    page_tag_processor: Optional[Any] = None
 ) -> List[str]:
     """
     Chunking logic for text without tables.
     Wrapper function for chunk_text_without_tables.
     """
     return chunk_text_without_tables(
         text, chunk_size, chunk_overlap, metadata,
-        _prepend_metadata_to_chunks
+        _prepend_metadata_to_chunks,
+        page_tag_processor
     )
 
 
@@ -656,84 +635,4 @@ def _page_for_pos(p: int) -> int:
         return table
 
     except Exception:
-        return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]
-
-
-# ============================================================================
-# Backward Compatibility - Deprecated public functions
-# ============================================================================
-
-def split_table_based_content(
-    text: str,
-    chunk_size: int,
-    chunk_overlap: int
-) -> List[str]:
-    """
-    Chunk table-based content (CSV/Excel).
-
-    .. deprecated::
-        Use `create_chunks()` instead. This function is kept for backward compatibility.
-
-    Args:
-        text: Full text (metadata + table)
-        chunk_size: Maximum chunk size
-        chunk_overlap: Overlap size between chunks
-
-    Returns:
-        List of chunks
-    """
-    logger.warning(
-        "split_table_based_content() is deprecated. "
-        "Use create_chunks() with appropriate file_extension instead."
-    )
-    return _split_table_based_content(text, chunk_size, chunk_overlap)
-
-
-def split_text_preserving_html_blocks(
-    text: str,
-    chunk_size: int,
-    chunk_overlap: int,
-    file_extension: Optional[str] = None,
-    force_chunking: Optional[bool] = False
-) -> List[str]:
-    """
-    Chunk text while preserving HTML tables and considering page boundaries.
-
-    .. deprecated::
-        Use `create_chunks()` instead. This function is kept for backward compatibility.
-
-    Args:
-        text: Original text
-        chunk_size: Maximum chunk size
-        chunk_overlap: Overlap size between chunks
-        file_extension: File extension (csv, xlsx, pdf, etc.)
-        force_chunking: Force chunking (disable table protection)
-
-    Returns:
-        List of chunks
-    """
-    logger.warning(
-        "split_text_preserving_html_blocks() is deprecated. "
-        "Use create_chunks() instead."
-    )
-    return _split_text(
-        text, chunk_size, chunk_overlap,
-        file_extension=file_extension,
-        force_chunking=force_chunking
-    )
-
-
-def is_table_based_file_type(file_extension: Optional[str]) -> bool:
-    """
-    Check if the file extension is a table-based file type.
-
-    .. deprecated::
-        Use `_is_table_based_file_type()` instead (internal use only).
-
-    Args:
-        file_extension: File extension
-
-    Returns:
-        True if table-based file type
-    """
-    return _is_table_based_file_type(file_extension)
+        return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]
@@ -9,7 +9,7 @@
 """
 import logging
 import re
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 from contextifier.chunking.protected_regions import (
     find_protected_regions, get_protected_region_positions,
@@ -95,7 +95,8 @@ def chunk_by_pages(
     chunk_overlap: int,
     is_table_based: bool = False,
     force_chunking: bool = False,
-    page_tag_processor = None
+    page_tag_processor = None,
+    image_pattern: Optional[str] = None
 ) -> List[str]:
     """
     페이지 단위로 텍스트를 청킹합니다.
@@ -117,6 +118,7 @@ def chunk_by_pages(
         is_table_based: Whether the file is table-based
         force_chunking: Force chunking (disable table protection)
         page_tag_processor: PageTagProcessor instance for custom patterns
+        image_pattern: Custom regex pattern for image tags
     """
     # Build page marker patterns from PageTagProcessor or use defaults
     if page_tag_processor is not None:
@@ -147,9 +149,9 @@ def chunk_by_pages(
 
     logger.debug(f"Split into {len(pages)} pages")
 
-    # 보호 영역 위치 식별 (HTML 테이블, 차트 블록, Markdown 테이블)
+    # 보호 영역 위치 식별 (HTML 테이블, 차트 블록, Markdown 테이블, 이미지 태그)
     # force_chunking 시 테이블 보호 해제 (차트는 항상 보호)
-    protected_regions = find_protected_regions(text, is_table_based, force_chunking)
+    protected_regions = find_protected_regions(text, is_table_based, force_chunking, image_pattern)
     protected_positions = get_protected_region_positions(protected_regions)
 
     # 페이지 병합하여 청크 생성
@@ -219,7 +221,7 @@ def chunk_by_pages(
     for chunk in chunks:
         if len(chunk) > max_size * 1.5:
             # 매우 큰 청크: 보호 영역 보호하면서 분할
-            sub_chunks = split_large_chunk_with_protected_regions(chunk, chunk_size, chunk_overlap, is_table_based, force_chunking)
+            sub_chunks = split_large_chunk_with_protected_regions(chunk, chunk_size, chunk_overlap, is_table_based, force_chunking, image_pattern)
             final_chunks.extend(sub_chunks)
         else:
             final_chunks.append(chunk)