@@ -263,8 +263,8 @@ class DocumentProcessor:
263263 """
264264
265265 # === Supported File Type Classifications ===
266- DOCUMENT_TYPES = frozenset (['pdf' , 'docx' , 'doc' , 'pptx' , 'ppt' , 'hwp' , 'hwpx' ])
267- TEXT_TYPES = frozenset (['txt' , 'md' , 'markdown' , 'rtf' ])
266+ DOCUMENT_TYPES = frozenset (['pdf' , 'docx' , 'doc' , 'rtf' , 'pptx' , 'ppt' , 'hwp' , 'hwpx' ])
267+ TEXT_TYPES = frozenset (['txt' , 'md' , 'markdown' ])
268268 CODE_TYPES = frozenset ([
269269 'py' , 'js' , 'ts' , 'java' , 'cpp' , 'c' , 'h' , 'cs' , 'go' , 'rs' ,
270270 'php' , 'rb' , 'swift' , 'kt' , 'scala' , 'dart' , 'r' , 'sql' ,
@@ -291,6 +291,8 @@ def __init__(
291291 slide_tag_suffix : Optional [str ] = None ,
292292 chart_tag_prefix : Optional [str ] = None ,
293293 chart_tag_suffix : Optional [str ] = None ,
294+ metadata_tag_prefix : Optional [str ] = None ,
295+ metadata_tag_suffix : Optional [str ] = None ,
294296 ** kwargs
295297 ):
296298 """
@@ -328,6 +330,12 @@ def __init__(
328330 chart_tag_suffix: Suffix for chart tags in extracted text
329331 - Default: "[/chart]"
330332 - Example: "</chart>" for XML format
333+ metadata_tag_prefix: Opening tag for metadata section
334+ - Default: "<Document-Metadata>"
335+ - Example: "<metadata>" for custom format
336+ metadata_tag_suffix: Closing tag for metadata section
337+ - Default: "</Document-Metadata>"
338+ - Example: "</metadata>" for custom format
331339 **kwargs: Additional configuration options
332340
333341 Example:
@@ -342,7 +350,9 @@ def __init__(
342350 ... page_tag_prefix="<page>",
343351 ... page_tag_suffix="</page>",
344352 ... chart_tag_prefix="<chart>",
345- ... chart_tag_suffix="</chart>"
353+ ... chart_tag_suffix="</chart>",
354+ ... metadata_tag_prefix="<meta>",
355+ ... metadata_tag_suffix="</meta>"
346356 ... )
347357
348358 >>> # Markdown format
@@ -359,6 +369,10 @@ def __init__(
359369 self ._ocr_engine = ocr_engine
360370 self ._kwargs = kwargs
361371 self ._supported_extensions : Optional [List [str ]] = None
372+
373+ # Store metadata tag settings
374+ self ._metadata_tag_prefix = metadata_tag_prefix
375+ self ._metadata_tag_suffix = metadata_tag_suffix
362376
363377 # Logger setup
364378 self ._logger = logging .getLogger ("contextify.processor" )
@@ -389,12 +403,19 @@ def __init__(
389403 chart_tag_prefix = chart_tag_prefix ,
390404 chart_tag_suffix = chart_tag_suffix
391405 )
406+
407+ # Create instance-specific MetadataFormatter
408+ self ._metadata_formatter = self ._create_metadata_formatter (
409+ metadata_tag_prefix = metadata_tag_prefix ,
410+ metadata_tag_suffix = metadata_tag_suffix
411+ )
392412
393413 # Add processors to config for handlers to access
394414 if isinstance (self ._config , dict ):
395415 self ._config ["image_processor" ] = self ._image_processor
396416 self ._config ["page_tag_processor" ] = self ._page_tag_processor
397417 self ._config ["chart_processor" ] = self ._chart_processor
418+ self ._config ["metadata_formatter" ] = self ._metadata_formatter
398419
399420 # =========================================================================
400421 # Public Properties
@@ -484,6 +505,26 @@ def chart_processor(self) -> Any:
484505 """Current ChartProcessor instance for this DocumentProcessor."""
485506 return self ._chart_processor
486507
508+ @property
509+ def metadata_tag_config (self ) -> Dict [str , Any ]:
510+ """
511+ Current metadata formatter configuration.
512+
513+ Returns:
514+ Dictionary containing:
515+ - metadata_tag_prefix: Opening tag for metadata section
516+ - metadata_tag_suffix: Closing tag for metadata section
517+ """
518+ return {
519+ "metadata_tag_prefix" : self ._metadata_formatter .metadata_tag_prefix ,
520+ "metadata_tag_suffix" : self ._metadata_formatter .metadata_tag_suffix ,
521+ }
522+
523+ @property
524+ def metadata_formatter (self ) -> Any :
525+ """Current MetadataFormatter instance for this DocumentProcessor."""
526+ return self ._metadata_formatter
527+
487528 @property
488529 def ocr_engine (self ) -> Optional [Any ]:
489530 """Current OCR engine instance."""
@@ -875,6 +916,34 @@ def _create_chart_processor(
875916 tag_suffix = chart_tag_suffix
876917 )
877918
919+ def _create_metadata_formatter (
920+ self ,
921+ metadata_tag_prefix : Optional [str ] = None ,
922+ metadata_tag_suffix : Optional [str ] = None
923+ ) -> Any :
924+ """
925+ Create a MetadataFormatter instance for this DocumentProcessor.
926+
927+ This creates an instance-specific MetadataFormatter that will be
928+ passed to handlers via config.
929+
930+ Args:
931+ metadata_tag_prefix: Opening tag (default: "<Document-Metadata>")
932+ metadata_tag_suffix: Closing tag (default: "</Document-Metadata>")
933+
934+ Returns:
935+ MetadataFormatter instance
936+ """
937+ from contextifier .core .functions .metadata_extractor import MetadataFormatter
938+
939+ kwargs = {}
940+ if metadata_tag_prefix is not None :
941+ kwargs ["metadata_tag_prefix" ] = metadata_tag_prefix
942+ if metadata_tag_suffix is not None :
943+ kwargs ["metadata_tag_suffix" ] = metadata_tag_suffix
944+
945+ return MetadataFormatter (** kwargs )
946+
878947 def _build_supported_extensions (self ) -> List [str ]:
879948 """Build list of supported extensions."""
880949 extensions = list (
@@ -940,6 +1009,19 @@ def _get_handler_registry(self) -> Dict[str, Callable]:
9401009 except ImportError as e :
9411010 self ._logger .warning (f"DOC handler not available: { e } " )
9421011
1012+ # RTF handler
1013+ try :
1014+ from contextifier .core .processor .rtf_handler import RTFHandler
1015+ rtf_handler = RTFHandler (
1016+ config = self ._config ,
1017+ image_processor = self ._image_processor ,
1018+ page_tag_processor = self ._page_tag_processor ,
1019+ chart_processor = self ._chart_processor
1020+ )
1021+ self ._handler_registry ['rtf' ] = rtf_handler .extract_text
1022+ except ImportError as e :
1023+ self ._logger .warning (f"RTF handler not available: { e } " )
1024+
9431025 # PPT/PPTX handler
9441026 try :
9451027 from contextifier .core .processor .ppt_handler import PPTHandler
@@ -997,7 +1079,7 @@ def _get_handler_registry(self) -> Dict[str, Callable]:
9971079
9981080 # HWPX handler
9991081 try :
1000- from contextifier .core .processor .hwps_handler import HWPXHandler
1082+ from contextifier .core .processor .hwpx_handler import HWPXHandler
10011083 hwpx_handler = HWPXHandler (
10021084 config = self ._config ,
10031085 image_processor = self ._image_processor ,
0 commit comments