Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions frappe_pdf_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Frappe PDF Extractor package.

The public names listed in ``__all__`` are resolved lazily (see
``__getattr__`` below) so that importing the package stays cheap and does not
pull in the extraction module until one of them is actually used.
"""

__all__ = [
    "extract_pdf_to_json",
    "process_one",
    "process_dir",
]

__version__ = "0.1.0"


def __getattr__(name):
    """Lazily resolve the names advertised in ``__all__`` (PEP 562).

    Without this hook none of the exported names is bound in this module, so
    ``from frappe_pdf_extractor import *`` (which looks up every entry of
    ``__all__``) fails with ``AttributeError``.

    ``process_one`` and ``process_dir`` are imported from the sibling
    ``pdf_to_json`` module elsewhere in this package; ``extract_pdf_to_json``
    is assumed to live there too -- TODO confirm.

    Raises:
        AttributeError: if ``name`` is not exported, or the backing module
            does not define it.
    """
    if name in __all__:
        import importlib

        backend = importlib.import_module(__name__ + ".pdf_to_json")
        try:
            return getattr(backend, name)
        except AttributeError:
            # Fall through to the standard "no attribute" error below.
            pass
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
88 changes: 88 additions & 0 deletions frappe_pdf_extractor/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from __future__ import annotations

from typing import Dict, List, Optional

try:
    import frappe  # type: ignore
    from frappe.utils import get_site_path
    from frappe.utils.file_manager import get_file_path  # type: ignore
except Exception:  # pragma: no cover - allows import outside Frappe
    # Frappe could not be imported: expose ``frappe = None`` as the marker the
    # rest of the module checks, plus minimal stand-ins for the two helpers.
    frappe = None  # type: ignore

    def get_file_path(file_url: str):  # type: ignore
        """Stand-in for the Frappe helper: hand the value back unchanged."""
        return file_url

    def get_site_path(*parts):  # type: ignore
        """Stand-in for the Frappe helper.

        Joins ``parts`` with "/" and makes the result absolute relative to
        the current working directory.
        """
        from os.path import abspath

        joined = "/".join(parts)
        return abspath(joined)

from .pdf_to_json import process_dir, process_one


def _resolve_path_maybe_frappe(path_or_ref: str) -> str:
"""Resolve a path that may be a filesystem path, a /files URL, or a File docname."""
import os

candidate = path_or_ref
# Absolute/relative path on filesystem
if os.path.exists(candidate):
return os.path.abspath(candidate)

# Site files or private files
if candidate.startswith("/files/") or candidate.startswith("/private/files/"):
return get_site_path(candidate.lstrip("/"))

# Try Frappe File doctype lookup
if frappe is not None:
try:
file_doc = frappe.get_doc("File", candidate)
file_url = getattr(file_doc, "file_url", None) or getattr(file_doc, "file_name", None)
if file_url:
return get_file_path(file_url)
except Exception:
pass

# As a last resort, return as-is
return candidate


def _coerce_keep_txt(value) -> bool:
    """Turn the ``keep_txt`` argument into a bool.

    Values may arrive as strings, so besides real ints/bools this accepts
    numeric strings ("1", "0") and the usual textual spellings of a flag.

    Raises:
        ValueError: for a string that is neither a known flag word nor an
            integer literal.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ("", "false", "no", "off", "none", "null"):
            return False
        if text in ("true", "yes", "on"):
            return True
        # Numeric strings: any non-zero integer counts as true.
        return bool(int(text))
    return bool(value)


def _run_extraction(pdf, directory, outdir, keep_txt):
    """Shared body of both ``extract_pdf`` variants.

    Callers must already have checked that ``pdf`` or ``directory`` is set.
    ``pdf`` wins when both are given.
    """
    keep = _coerce_keep_txt(keep_txt)

    if pdf:
        pdf_path = _resolve_path_maybe_frappe(pdf)
        json_path = process_one(pdf_path, outdir=outdir, keep_txt=keep)
        return {"json_path": json_path}

    dir_path = _resolve_path_maybe_frappe(directory)  # type: ignore[arg-type]
    outputs = process_dir(dir_path, outdir=outdir, keep_txt=keep)
    return {"json_paths": outputs}


# Whitelisted API for Frappe/ERPNext
if frappe is not None:
    # NOTE(review): pdf/dir/outdir are taken as given and the path resolver
    # accepts any existing filesystem path -- consider restricting them to
    # the site's file directories.
    @frappe.whitelist(allow_guest=False)
    def extract_pdf(pdf: Optional[str] = None, dir: Optional[str] = None, outdir: Optional[str] = None, keep_txt: int | bool = 0):
        """Extract PDF(s) to JSON.

        Args:
            pdf: Path, /files URL, or File docname for a single PDF
            dir: Path or /files directory containing PDFs (recursive)
            outdir: Output directory (defaults to the same as the PDF or directory)
            keep_txt: 1/0 (or true/false) to retain intermediate pdftotext output
        Returns:
            ``{"json_path": ...}`` when ``pdf`` is given, otherwise
            ``{"json_paths": ...}`` for ``dir``. ``pdf`` takes precedence
            when both are supplied.
        """
        if not pdf and not dir:
            frappe.throw("Provide either 'pdf' or 'dir'")

        return _run_extraction(pdf, dir, outdir, keep_txt)

else:
    # Non-Frappe shim for testing
    def extract_pdf(pdf: Optional[str] = None, dir: Optional[str] = None, outdir: Optional[str] = None, keep_txt: int | bool = 0):
        """Extract PDF(s) to JSON (variant used when Frappe is not importable).

        Takes the same arguments and returns the same dict as the whitelisted
        variant.

        Raises:
            ValueError: if neither ``pdf`` nor ``dir`` is provided.
        """
        if not pdf and not dir:
            raise ValueError("Provide either 'pdf' or 'dir'")

        return _run_extraction(pdf, dir, outdir, keep_txt)
8 changes: 8 additions & 0 deletions frappe_pdf_extractor/hooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# App metadata for the Frappe app defined by this package.
# NOTE(review): publisher and email look like template placeholders --
# confirm the real values before release.
app_name = "frappe_pdf_extractor"
app_title = "Frappe PDF Extractor"
app_publisher = "Your Company"
app_description = "Universal PDF to JSON extraction via pdftotext"
app_email = "dev@example.com"
# Same version string as ``__version__`` in the package ``__init__``; the two
# are maintained by hand and must be bumped together.
app_version = "0.1.0"

# Whitelist methods are declared in api.py with @frappe.whitelist
1 change: 1 addition & 0 deletions frappe_pdf_extractor/modules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
frappe_pdf_extractor
Loading