Skip to content

Commit caf79de

Browse files
authored
Merge pull request #8 from CocoRoF/main
refactor
2 parents f5104f1 + 52af099 commit caf79de

File tree

104 files changed

+10613
-5609
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

104 files changed

+10613
-5609
lines changed

Process Logic.md

Lines changed: 558 additions & 0 deletions
Large diffs are not rendered by default.

contextifier/core/document_processor.py

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ class DocumentProcessor:
263263
"""
264264

265265
# === Supported File Type Classifications ===
266-
DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'pptx', 'ppt', 'hwp', 'hwpx'])
267-
TEXT_TYPES = frozenset(['txt', 'md', 'markdown', 'rtf'])
266+
DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'rtf', 'pptx', 'ppt', 'hwp', 'hwpx'])
267+
TEXT_TYPES = frozenset(['txt', 'md', 'markdown'])
268268
CODE_TYPES = frozenset([
269269
'py', 'js', 'ts', 'java', 'cpp', 'c', 'h', 'cs', 'go', 'rs',
270270
'php', 'rb', 'swift', 'kt', 'scala', 'dart', 'r', 'sql',
@@ -291,6 +291,8 @@ def __init__(
291291
slide_tag_suffix: Optional[str] = None,
292292
chart_tag_prefix: Optional[str] = None,
293293
chart_tag_suffix: Optional[str] = None,
294+
metadata_tag_prefix: Optional[str] = None,
295+
metadata_tag_suffix: Optional[str] = None,
294296
**kwargs
295297
):
296298
"""
@@ -328,6 +330,12 @@ def __init__(
328330
chart_tag_suffix: Suffix for chart tags in extracted text
329331
- Default: "[/chart]"
330332
- Example: "</chart>" for XML format
333+
metadata_tag_prefix: Opening tag for metadata section
334+
- Default: "<Document-Metadata>"
335+
- Example: "<metadata>" for custom format
336+
metadata_tag_suffix: Closing tag for metadata section
337+
- Default: "</Document-Metadata>"
338+
- Example: "</metadata>" for custom format
331339
**kwargs: Additional configuration options
332340
333341
Example:
@@ -342,7 +350,9 @@ def __init__(
342350
... page_tag_prefix="<page>",
343351
... page_tag_suffix="</page>",
344352
... chart_tag_prefix="<chart>",
345-
... chart_tag_suffix="</chart>"
353+
... chart_tag_suffix="</chart>",
354+
... metadata_tag_prefix="<meta>",
355+
... metadata_tag_suffix="</meta>"
346356
... )
347357
348358
>>> # Markdown format
@@ -359,6 +369,10 @@ def __init__(
359369
self._ocr_engine = ocr_engine
360370
self._kwargs = kwargs
361371
self._supported_extensions: Optional[List[str]] = None
372+
373+
# Store metadata tag settings
374+
self._metadata_tag_prefix = metadata_tag_prefix
375+
self._metadata_tag_suffix = metadata_tag_suffix
362376

363377
# Logger setup
364378
self._logger = logging.getLogger("contextify.processor")
@@ -389,12 +403,19 @@ def __init__(
389403
chart_tag_prefix=chart_tag_prefix,
390404
chart_tag_suffix=chart_tag_suffix
391405
)
406+
407+
# Create instance-specific MetadataFormatter
408+
self._metadata_formatter = self._create_metadata_formatter(
409+
metadata_tag_prefix=metadata_tag_prefix,
410+
metadata_tag_suffix=metadata_tag_suffix
411+
)
392412

393413
# Add processors to config for handlers to access
394414
if isinstance(self._config, dict):
395415
self._config["image_processor"] = self._image_processor
396416
self._config["page_tag_processor"] = self._page_tag_processor
397417
self._config["chart_processor"] = self._chart_processor
418+
self._config["metadata_formatter"] = self._metadata_formatter
398419

399420
# =========================================================================
400421
# Public Properties
@@ -484,6 +505,26 @@ def chart_processor(self) -> Any:
484505
"""Current ChartProcessor instance for this DocumentProcessor."""
485506
return self._chart_processor
486507

508+
@property
509+
def metadata_tag_config(self) -> Dict[str, Any]:
510+
"""
511+
Current metadata formatter configuration.
512+
513+
Returns:
514+
Dictionary containing:
515+
- metadata_tag_prefix: Opening tag for metadata section
516+
- metadata_tag_suffix: Closing tag for metadata section
517+
"""
518+
return {
519+
"metadata_tag_prefix": self._metadata_formatter.metadata_tag_prefix,
520+
"metadata_tag_suffix": self._metadata_formatter.metadata_tag_suffix,
521+
}
522+
523+
@property
524+
def metadata_formatter(self) -> Any:
525+
"""Current MetadataFormatter instance for this DocumentProcessor."""
526+
return self._metadata_formatter
527+
487528
@property
488529
def ocr_engine(self) -> Optional[Any]:
489530
"""Current OCR engine instance."""
@@ -875,6 +916,34 @@ def _create_chart_processor(
875916
tag_suffix=chart_tag_suffix
876917
)
877918

919+
def _create_metadata_formatter(
920+
self,
921+
metadata_tag_prefix: Optional[str] = None,
922+
metadata_tag_suffix: Optional[str] = None
923+
) -> Any:
924+
"""
925+
Create a MetadataFormatter instance for this DocumentProcessor.
926+
927+
This creates an instance-specific MetadataFormatter that will be
928+
passed to handlers via config.
929+
930+
Args:
931+
metadata_tag_prefix: Opening tag (default: "<Document-Metadata>")
932+
metadata_tag_suffix: Closing tag (default: "</Document-Metadata>")
933+
934+
Returns:
935+
MetadataFormatter instance
936+
"""
937+
from contextifier.core.functions.metadata_extractor import MetadataFormatter
938+
939+
kwargs = {}
940+
if metadata_tag_prefix is not None:
941+
kwargs["metadata_tag_prefix"] = metadata_tag_prefix
942+
if metadata_tag_suffix is not None:
943+
kwargs["metadata_tag_suffix"] = metadata_tag_suffix
944+
945+
return MetadataFormatter(**kwargs)
946+
878947
def _build_supported_extensions(self) -> List[str]:
879948
"""Build list of supported extensions."""
880949
extensions = list(
@@ -940,6 +1009,19 @@ def _get_handler_registry(self) -> Dict[str, Callable]:
9401009
except ImportError as e:
9411010
self._logger.warning(f"DOC handler not available: {e}")
9421011

1012+
# RTF handler
1013+
try:
1014+
from contextifier.core.processor.rtf_handler import RTFHandler
1015+
rtf_handler = RTFHandler(
1016+
config=self._config,
1017+
image_processor=self._image_processor,
1018+
page_tag_processor=self._page_tag_processor,
1019+
chart_processor=self._chart_processor
1020+
)
1021+
self._handler_registry['rtf'] = rtf_handler.extract_text
1022+
except ImportError as e:
1023+
self._logger.warning(f"RTF handler not available: {e}")
1024+
9431025
# PPT/PPTX handler
9441026
try:
9451027
from contextifier.core.processor.ppt_handler import PPTHandler
@@ -997,7 +1079,7 @@ def _get_handler_registry(self) -> Dict[str, Callable]:
9971079

9981080
# HWPX handler
9991081
try:
1000-
from contextifier.core.processor.hwps_handler import HWPXHandler
1082+
from contextifier.core.processor.hwpx_handler import HWPXHandler
10011083
hwpx_handler = HWPXHandler(
10021084
config=self._config,
10031085
image_processor=self._image_processor,
contextifier/core/functions/__init__.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
# libs/core/functions/__init__.py
22
"""
3-
Functions - 공통 유틸리티 함수 모듈
3+
Functions - Common Utility Functions Module
44
5-
문서 처리에 사용되는 공통 유틸리티 함수들을 제공합니다.
5+
Provides common utility functions used in document processing.
66
7-
모듈 구성:
8-
- utils: 텍스트 정리, 코드 정리, JSON 정리 등 유틸리티 함수
9-
- img_processor: 이미지 처리 및 저장 (ImageProcessor 클래스)
10-
- ppt2pdf: PPT를 PDF로 변환하는 함수
7+
Module Components:
8+
- utils: Text cleaning, code cleaning, JSON sanitization utilities
9+
- img_processor: Image processing and storage (ImageProcessor class)
10+
- storage_backend: Storage backend implementations (Local, MinIO, S3)
11+
- metadata_extractor: Document metadata extraction interface
1112
12-
사용 예시:
13+
Usage Example:
1314
from contextifier.core.functions import clean_text, clean_code_text
1415
from contextifier.core.functions import ImageProcessor, save_image_to_file
16+
from contextifier.core.functions.storage_backend import LocalStorageBackend
1517
from contextifier.core.functions.utils import sanitize_text_for_json
1618
"""
1719

@@ -21,26 +23,62 @@
2123
sanitize_text_for_json,
2224
)
2325

24-
# 이미지 처리 모듈
26+
# Storage backend module
27+
from contextifier.core.functions.storage_backend import (
28+
StorageType,
29+
BaseStorageBackend,
30+
LocalStorageBackend,
31+
MinIOStorageBackend,
32+
S3StorageBackend,
33+
create_storage_backend,
34+
get_default_backend,
35+
)
36+
37+
# Image processor module
2538
from contextifier.core.functions.img_processor import (
2639
ImageProcessor,
2740
ImageProcessorConfig,
2841
ImageFormat,
2942
NamingStrategy,
3043
save_image_to_file,
3144
create_image_processor,
45+
DEFAULT_IMAGE_CONFIG,
46+
)
47+
48+
# Metadata extraction module
49+
from contextifier.core.functions.metadata_extractor import (
50+
MetadataField,
51+
DocumentMetadata,
52+
MetadataFormatter,
53+
BaseMetadataExtractor,
54+
format_metadata,
3255
)
3356

3457
__all__ = [
35-
# 텍스트 유틸리티
58+
# Text utilities
3659
"clean_text",
3760
"clean_code_text",
3861
"sanitize_text_for_json",
39-
# 이미지 처리
62+
# Storage backends
63+
"StorageType",
64+
"BaseStorageBackend",
65+
"LocalStorageBackend",
66+
"MinIOStorageBackend",
67+
"S3StorageBackend",
68+
"create_storage_backend",
69+
"get_default_backend",
70+
# Image processor (base class for all format-specific processors)
4071
"ImageProcessor",
4172
"ImageProcessorConfig",
4273
"ImageFormat",
4374
"NamingStrategy",
4475
"save_image_to_file",
4576
"create_image_processor",
77+
"DEFAULT_IMAGE_CONFIG",
78+
# Metadata extraction
79+
"MetadataField",
80+
"DocumentMetadata",
81+
"MetadataFormatter",
82+
"BaseMetadataExtractor",
83+
"format_metadata",
4684
]

0 commit comments

Comments
 (0)