You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
It would be nice if there were an option to get just the attachment text, without headers, metadata, etc.
Things that get produced, most of which cannot be turned off:
# PDF Document: <file>
# Document: <file>
[page <k>]
*Total pages processed: <k>*
## Document Metadata <and then a few lines>
*Document processed: <k> paragraphs*
## File Info
- **Docx Images Error**: ....
I had to go with this workaround:
minimal_presenters.py
"""Minimal presenters for the 'attachments' library (PDF + DOCX).Register by importing this module once: import minimal_presentersUsage: from attachments import attach, load, present import minimal_presenters # ensures registration pdf_att = attach("file.pdf") | load.pdf_to_pdfplumber | present.minimal docx_att = attach("file.docx") | load.docx_to_python_docx | present.minimal pptx_att = attach("file.pptx") | load.pptx_to_python_pptx | present.minimal print(pdf_att.text) print(docx_att.text) print(pptx_att.text) print(pdf_att.metadata["text_extraction_quality"]) # preserved"""fromattachments.coreimportpresenter, Attachment@presenterdefminimal(att: Attachment, pdf: "pdfplumber.PDF") ->Attachment:
""" Extract raw PDF text (no headers, page labels, analysis blocks). Preserve scan-quality metadata used by other logic. """try:
if"selected_pages"inatt.metadata:
pages=att.metadata["selected_pages"]
else:
pages=range(1, len(pdf.pages) +1)
total_chars=0pages_with_text=0chunks= []
forpage_numinpages:
if1<=page_num<=len(pdf.pages):
page=pdf.pages[page_num-1]
txt= (page.extract_text() or"").strip()
iftxt:
pages_with_text+=1total_chars+=len(txt)
chunks.append(txt)
# Skip placeholders for empty (likely scanned) pagesavg= (total_chars/len(pages)) ifpageselse0is_scanned= (
pages_with_text==0oravg<50or (pages_with_text/len(pages)) <0.3
)
att.metadata.update({
"is_likely_scanned": is_scanned,
"pages_with_text": pages_with_text,
"total_pages": len(pages),
"avg_text_per_page": avg,
"text_extraction_quality": (
"poor"ifavg<20andis_scannedelse
("limited"ifis_scannedelse"good")
),
})
att.text="\n\n".join(chunks)
exceptExceptionase:
att.metadata.setdefault("errors", []).append(f"minimal_pdf_error: {e}")
returnatt@presenterdefminimal(att: Attachment, doc: "docx.Document") ->Attachment:
""" Extract plain DOCX paragraph text only (no headers, counts, or style formatting). """try:
paras= [
p.text.strip()
forpindoc.paragraphsifp.textandp.text.strip()
]
att.text="\n\n".join(paras)
exceptExceptionase:
att.metadata.setdefault("errors", []).append(f"minimal_docx_error: {e}")
returnatt@presenterdefminimal(att: Attachment, pres: "pptx.Presentation") ->Attachment:
""" Extract plain PPTX slide text only (no headers, counts, or style formatting). """texts= []
forslideinpres.slides:
buf= []
forshapeinslide.shapes:
ifhasattr(shape, "text") andshape.text.strip():
buf.append(shape.text.strip())
ifbuf:
texts.append("\n".join(buf))
att.text="\n\n".join(texts)
returnatt
minimal_universal.py
from attachments import attach, load, present, modify

# Ensure presenters are registered
from . import minimal_presenters  # noqa: F401  (imported for its side effect)


def _minimal_pipeline_for(name: str):
    """Return the ``loader | present.minimal`` pipeline for *name*, or None.

    *name* must already be lower-cased; matching is by file extension.
    """
    if name.endswith(".pdf"):
        return load.pdf_to_pdfplumber | present.minimal
    if name.endswith((".docx", ".doc")):
        return load.docx_to_python_docx | present.minimal
    if name.endswith(".pptx"):
        return load.pptx_to_python_pptx | present.minimal
    return None


def minimal_universal(path: str, include_images: bool = False):
    """Process a single file/URL into a clean Attachment using present.minimal.

    PDF, DOCX and PPTX are supported; others fall back to raw text.

    Parameters
    ----------
    path : str
        Local filesystem path or URL.
    include_images : bool
        If True and the loader supports images (e.g. PDF pages), also
        attach images.

    Returns
    -------
    Attachment
        An Attachment whose .text (and therefore str(att)) is the minimal
        clean text.  If processing fails, the error is appended to
        ``metadata["errors"]`` and the attachment is returned as-is.
    """
    att = attach(path)

    # Try morphing for URLs or unknown endings
    if path.startswith(("http://", "https://")):
        try:
            att = att | load.url_to_response | modify.morph_to_detected_type
        except Exception:
            # Deliberate best-effort: keep the un-morphed attachment and let
            # the extension checks below decide.
            pass

    def _finalize(source, pipeline):
        result = source | pipeline
        if include_images:
            # Chain images AFTER minimal (so minimal doesn't pick up
            # decorative text)
            return result | present.images
        return result

    lower = att.path.lower() if att.path else path.lower()
    try:
        pipeline = _minimal_pipeline_for(lower)
        if pipeline is not None:
            return _finalize(att, pipeline)

        # Attempt a late morph if not already identified
        try:
            att2 = att | modify.morph_to_detected_type
        except Exception:
            att2 = att
        lower2 = att2.path.lower() if att2.path else lower

        pipeline = _minimal_pipeline_for(lower2)
        if pipeline is not None:
            # Pipe the *morphed* attachment; using `att` here would throw
            # away the type detection that was just performed.
            return _finalize(att2, pipeline)

        # Fallback to generic text (no minimal variant needed)
        return att2 | load.text_to_string
    except Exception as e:
        att.metadata.setdefault("errors", []).append(f"minimal_universal_error: {e}")
        return att
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
Uh oh!
There was an error while loading. Please reload this page.
-
It would be nice if there were an option to get just the attachment text, without headers, metadata, etc.
Things that get produced, most of which cannot be turned off:
I had to go with this workaround:
minimal_presenters.py
minimal_universal.py
Beta Was this translation helpful? Give feedback.
All reactions