Minimal attachment text #32

omasoud · 2025-10-04T22:25:35Z

omasoud
Oct 4, 2025

It would be nice if there were an option to get just the attachment text, without headers, metadata, etc.
These are the things that get produced; most of them cannot be turned off:

# PDF Document: <file>
# Document: <file>
[page <k>]
*Total pages processed: <k>*
## Document Metadata <and then a few lines>
*Document processed: <k> paragraphs*
## File Info
- **Docx Images Error**: ....

I had to go with this workaround:

minimal_presenters.py

"""
Minimal presenters for the 'attachments' library (PDF + DOCX + PPTX).

Register by importing this module once:
    import minimal_presenters

Usage:
    from attachments import attach, load, present
    import minimal_presenters   # ensures registration

    pdf_att  = attach("file.pdf")  | load.pdf_to_pdfplumber  | present.minimal
    docx_att = attach("file.docx") | load.docx_to_python_docx | present.minimal
    pptx_att = attach("file.pptx") | load.pptx_to_python_pptx | present.minimal

    print(pdf_att.text)
    print(docx_att.text)
    print(pptx_att.text)
    print(pdf_att.metadata["text_extraction_quality"])  # preserved
"""

from attachments.core import presenter, Attachment

@presenter
def minimal(att: Attachment, pdf: "pdfplumber.PDF") -> Attachment:
    """
    Produce bare PDF text: the stripped text of each page, joined by blank lines.

    No title header, page markers or summary lines are emitted. Pages come
    from att.metadata["selected_pages"] when that key exists, otherwise every
    page of the document is used (1-based numbering). Page numbers outside
    the document are ignored for text extraction but still count toward the
    page total used by the statistics.

    The scan-quality keys ("is_likely_scanned", "pages_with_text",
    "total_pages", "avg_text_per_page", "text_extraction_quality") are written
    to att.metadata. Any exception is appended to att.metadata["errors"]
    rather than raised.
    """
    try:
        page_numbers = (
            att.metadata["selected_pages"]
            if "selected_pages" in att.metadata
            else range(1, len(pdf.pages) + 1)
        )

        page_texts = []
        for number in page_numbers:
            if not (1 <= number <= len(pdf.pages)):
                continue
            extracted = pdf.pages[number - 1].extract_text() or ""
            extracted = extracted.strip()
            if not extracted:
                # Empty (likely scanned) page: emit nothing, not a placeholder.
                continue
            page_texts.append(extracted)

        text_page_count = len(page_texts)
        char_count = sum(len(piece) for piece in page_texts)

        if page_numbers:
            average = char_count / len(page_numbers)
        else:
            average = 0

        # Scanned if no page yielded text, the average is tiny, or fewer
        # than 30% of the requested pages yielded text.
        likely_scanned = (
            text_page_count == 0
            or average < 50
            or (text_page_count / len(page_numbers)) < 0.3
        )

        if not likely_scanned:
            quality = "good"
        elif average < 20:
            quality = "poor"
        else:
            quality = "limited"

        att.metadata.update({
            "is_likely_scanned": likely_scanned,
            "pages_with_text": text_page_count,
            "total_pages": len(page_numbers),
            "avg_text_per_page": average,
            "text_extraction_quality": quality,
        })

        att.text = "\n\n".join(page_texts)
    except Exception as e:
        att.metadata.setdefault("errors", []).append(f"minimal_pdf_error: {e}")
    return att


@presenter
def minimal(att: Attachment, doc: "docx.Document") -> Attachment:
    """
    Produce bare DOCX text: each non-blank paragraph, stripped, joined by blank lines.

    No document header, paragraph count or style markup is emitted. Any
    exception is appended to att.metadata["errors"] rather than raised.
    """
    try:
        kept = []
        for paragraph in doc.paragraphs:
            raw = paragraph.text
            if not raw:
                continue
            cleaned = raw.strip()
            if cleaned:
                kept.append(cleaned)
        att.text = "\n\n".join(kept)
    except Exception as e:
        att.metadata.setdefault("errors", []).append(f"minimal_docx_error: {e}")
    return att

@presenter
def minimal(att: Attachment, pres: "pptx.Presentation") -> Attachment:
    """
    Extract plain PPTX slide text only (no headers, counts, or style formatting).

    The stripped text of the shapes on one slide is joined with single
    newlines; slides are separated by a blank line. Shapes without a text
    attribute, shapes whose text is empty or whitespace, and slides that end
    up with no text are skipped.

    Failures are appended to att.metadata["errors"] instead of being raised,
    the same way the PDF and DOCX presenters in this module report errors.
    On failure att.text is left untouched.
    """
    try:
        slides_text = []
        for slide in pres.slides:
            lines = []
            for shape in slide.shapes:
                # Not every shape exposes text; treat a missing or None value as empty.
                shape_text = getattr(shape, "text", None)
                if not shape_text:
                    continue
                shape_text = shape_text.strip()
                if shape_text:
                    lines.append(shape_text)
            if lines:
                slides_text.append("\n".join(lines))
        att.text = "\n\n".join(slides_text)
    except Exception as e:
        att.metadata.setdefault("errors", []).append(f"minimal_pptx_error: {e}")
    return att

minimal_universal.py

from attachments import attach, load, present, modify

# Ensure presenters are registered
from . import minimal_presenters  


def minimal_universal(path: str, include_images: bool = False):
    """
    Process a single file/URL into a clean Attachment using present.minimal
    (PDF, DOCX and PPTX supported; others fall back to raw text).

    The file type is taken from the attachment's path suffix. If the suffix
    is not recognised, a type-detection morph is attempted and the suffix is
    checked again on the morphed attachment.

    Parameters
    ----------
    path : str
        Local filesystem path or URL.
    include_images : bool
        If True and the loader supports images (e.g. PDF pages), also attach images.

    Returns
    -------
    Attachment
        An Attachment whose .text (and therefore str(att)) is the minimal clean text.
        Errors are recorded under metadata["errors"] rather than raised.
    """
    att = attach(path)

    # Try morphing for URLs so the real content type can be detected.
    if path.startswith(("http://", "https://")):
        try:
            att = att | load.url_to_response | modify.morph_to_detected_type
        except Exception as e:
            # Graceful fallback: keep the plain attachment, but leave a trace
            # instead of discarding the failure silently.
            att.metadata.setdefault("errors", []).append(
                f"minimal_universal_url_error: {e}"
            )

    def _minimal_pipeline(name):
        """Return the loader | present.minimal pipeline for *name*, or None."""
        if name.endswith(".pdf"):
            return load.pdf_to_pdfplumber | present.minimal
        if name.endswith((".docx", ".doc")):
            return load.docx_to_python_docx | present.minimal
        if name.endswith(".pptx"):
            return load.pptx_to_python_pptx | present.minimal
        return None

    def _finalize(source, pipeline):
        """Run *pipeline* on *source*, optionally followed by present.images."""
        if include_images:
            # Chain images AFTER minimal (so minimal doesn't pick up decorative text)
            return source | pipeline | present.images
        return source | pipeline

    lower = att.path.lower() if att.path else path.lower()

    try:
        pipeline = _minimal_pipeline(lower)
        if pipeline is not None:
            return _finalize(att, pipeline)

        # Attempt a late morph if not already identified (best effort).
        try:
            morphed = att | modify.morph_to_detected_type
        except Exception:
            morphed = att

        lower_morphed = morphed.path.lower() if morphed.path else lower
        pipeline = _minimal_pipeline(lower_morphed)
        if pipeline is not None:
            # Use the morphed attachment: it carries the detected type that
            # selected this pipeline in the first place.
            return _finalize(morphed, pipeline)

        # Fallback to generic text (no minimal variant needed)
        return morphed | load.text_to_string
    except Exception as e:
        att.metadata.setdefault("errors", []).append(f"minimal_universal_error: {e}")
        return att

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Minimal attachment text #32

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Uh oh!

Minimal attachment text #32

Uh oh!

omasoud Oct 4, 2025

minimal_presenters.py

minimal_universal.py

Replies: 0 comments

omasoud
Oct 4, 2025