From b06d0872cfa08603ce5cbfbbb95943d2fbcc64cd Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Thu, 9 Oct 2025 02:06:58 +0000
Subject: [PATCH] feat: Add frappe_pdf_extractor app

This commit introduces the frappe_pdf_extractor app, enabling PDF to JSON
conversion.

Co-authored-by: calledasfiverr
---
 frappe_pdf_extractor/__init__.py    |  11 +
 frappe_pdf_extractor/api.py         | 110 ++++++++
 frappe_pdf_extractor/hooks.py       |   8 +
 frappe_pdf_extractor/modules.txt    |   1 +
 frappe_pdf_extractor/pdf_to_json.py | 412 ++++++++++++++++++++++++++++
 setup.py                            |  11 +
 6 files changed, 553 insertions(+)
 create mode 100644 frappe_pdf_extractor/__init__.py
 create mode 100644 frappe_pdf_extractor/api.py
 create mode 100644 frappe_pdf_extractor/hooks.py
 create mode 100644 frappe_pdf_extractor/modules.txt
 create mode 100644 frappe_pdf_extractor/pdf_to_json.py
 create mode 100644 setup.py

diff --git a/frappe_pdf_extractor/__init__.py b/frappe_pdf_extractor/__init__.py
new file mode 100644
index 0000000..1f01325
--- /dev/null
+++ b/frappe_pdf_extractor/__init__.py
@@ -0,0 +1,11 @@
+"""Frappe app that converts PDFs to JSON via ``pdftotext``."""
+
+from .pdf_to_json import extract_pdf_to_json, process_dir, process_one
+
+__all__ = [
+    "extract_pdf_to_json",
+    "process_one",
+    "process_dir",
+]
+
+__version__ = "0.1.0"
diff --git a/frappe_pdf_extractor/api.py b/frappe_pdf_extractor/api.py
new file mode 100644
index 0000000..c6173e8
--- /dev/null
+++ b/frappe_pdf_extractor/api.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import os
+from typing import Dict, List, Optional
+
+try:
+    import frappe  # type: ignore
+    from frappe.utils import get_site_path
+    from frappe.utils.file_manager import get_file_path  # type: ignore
+except Exception:  # pragma: no cover - allows import outside Frappe
+    frappe = None  # type: ignore
+
+    def get_site_path(*parts):  # type: ignore
+        return os.path.abspath("/".join(parts))
+
+    def get_file_path(file_url: str):  # type: ignore
+        return file_url
+
+from .pdf_to_json import process_dir, process_one
+
+
+def _resolve_path_maybe_frappe(path_or_ref: str) -> str:
+    """Resolve a filesystem path, a /files URL, or a File docname to a path.
+
+    Tried in order: an existing filesystem path, a ``/private/files/`` or
+    ``/files/`` URL mapped into the site directory, then a Frappe ``File``
+    document lookup. If nothing matches, the input is returned unchanged.
+    """
+    candidate = path_or_ref
+    # Absolute/relative path on filesystem
+    if os.path.exists(candidate):
+        return os.path.abspath(candidate)
+
+    # Private files are stored under <site>/private/files
+    if candidate.startswith("/private/files/"):
+        return get_site_path(*candidate.strip("/").split("/"))
+
+    # Public /files/ URLs are stored under <site>/public/files
+    if candidate.startswith("/files/"):
+        return get_site_path("public", *candidate.strip("/").split("/"))
+
+    # Try Frappe File doctype lookup
+    if frappe is not None:
+        try:
+            file_doc = frappe.get_doc("File", candidate)
+            file_url = getattr(file_doc, "file_url", None) or getattr(file_doc, "file_name", None)
+            if file_url:
+                return get_file_path(file_url)
+        except Exception:
+            # Best-effort lookup; fall through and return the raw value
+            pass
+
+    # As a last resort, return as-is
+    return candidate
+
+
+def _as_bool(flag: object) -> bool:
+    """Coerce request values such as 1, "0", "true" or "" to a bool."""
+    if isinstance(flag, str):
+        text = flag.strip().lower()
+        try:
+            return bool(int(text))
+        except ValueError:
+            return text in {"true", "yes", "on", "y"}
+    return bool(flag)
+
+
+def _run_extraction(
+    pdf: Optional[str],
+    directory: Optional[str],
+    outdir: Optional[str],
+    keep_txt: object,
+) -> Dict[str, object]:
+    """Shared implementation behind both ``extract_pdf`` variants."""
+    keep = _as_bool(keep_txt)
+    if pdf:
+        pdf_path = _resolve_path_maybe_frappe(pdf)
+        return {"json_path": process_one(pdf_path, outdir=outdir, keep_txt=keep)}
+    dir_path = _resolve_path_maybe_frappe(directory)  # type: ignore[arg-type]
+    return {"json_paths": process_dir(dir_path, outdir=outdir, keep_txt=keep)}
+
+
+# Whitelisted API for Frappe/ERPNext
+if frappe is not None:
+    # SECURITY NOTE(review): any logged-in user can pass arbitrary filesystem
+    # paths for pdf/dir/outdir. Consider restricting them to the site's files
+    # directories or limiting this endpoint to trusted roles.
+    @frappe.whitelist(allow_guest=False)
+    def extract_pdf(pdf: Optional[str] = None, dir: Optional[str] = None, outdir: Optional[str] = None, keep_txt: int | bool = 0):
+        """Extract PDF(s) to JSON.
+
+        Args:
+            pdf: Path, /files URL, or File docname for a single PDF
+            dir: Path or /files directory containing PDFs (recursive)
+            outdir: Output directory (defaults to the same as the PDF or directory)
+            keep_txt: 1/0 to retain intermediate pdftotext output
+        Returns:
+            Dict with paths of generated JSON files
+        """
+        if not pdf and not dir:
+            frappe.throw("Provide either 'pdf' or 'dir'")
+        return _run_extraction(pdf, dir, outdir, keep_txt)
+
+else:
+    # Non-Frappe shim for testing
+    def extract_pdf(pdf: Optional[str] = None, dir: Optional[str] = None, outdir: Optional[str] = None, keep_txt: int | bool = 0):
+        """Extract PDF(s) to JSON when Frappe is unavailable (same arguments)."""
+        if not pdf and not dir:
+            raise ValueError("Provide either 'pdf' or 'dir'")
+        return _run_extraction(pdf, dir, outdir, keep_txt)
diff --git a/frappe_pdf_extractor/hooks.py b/frappe_pdf_extractor/hooks.py
new file mode 100644
index 0000000..4e22b88
--- /dev/null
+++ b/frappe_pdf_extractor/hooks.py
@@ -0,0 +1,8 @@
+app_name = "frappe_pdf_extractor"
+app_title = "Frappe PDF Extractor"
+app_publisher = "Your Company"
+app_description = "Universal PDF to JSON extraction via pdftotext"
+app_email = "dev@example.com"
+app_version = "0.1.0"
+
+# Whitelist methods are declared in api.py with @frappe.whitelist
diff --git a/frappe_pdf_extractor/modules.txt b/frappe_pdf_extractor/modules.txt
new file mode 100644
index 0000000..fb25ffd
--- /dev/null
+++ b/frappe_pdf_extractor/modules.txt
@@ -0,0 +1 @@
+frappe_pdf_extractor
diff --git a/frappe_pdf_extractor/pdf_to_json.py b/frappe_pdf_extractor/pdf_to_json.py
new file mode 100644
index 0000000..5c9f6d1
--- /dev/null
+++ b/frappe_pdf_extractor/pdf_to_json.py
@@ -0,0 +1,412 @@
+"""
+Universal PDF → JSON extractor using `pdftotext -layout`.
+
+Features
+- Extracts:
+  • KEY: VALUE pairs on one line
+  • Label on one line, value on next line
+  • Two-column lists (e.g., CRITERIA | COMMENT tables)
+  • Emails, phone numbers, addresses (best-effort)
+- Normalizes:
+  • Keys → snake_case
+  • Dates DD-MM-YYYY / DD/MM/YYYY → YYYY-MM-DD
+  • Empty / "N/A" → None
+- Batch mode (process a folder of PDFs)
+
+Requirements
+- Python 3.9+
+- Poppler utils installed and `pdftotext` available on PATH
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+
+# Upper-case "KEY: VALUE"; the key itself may contain hyphens
+_KEY_COLON_VALUE_RE = re.compile(r"^([A-Z0-9][A-Z0-9/\-\s&\.,#]+?)\s*:\s*(.+)$")
+# Fallback: the first ':' or '-' after an upper-case key is the separator
+_KEY_SEP_VALUE_RE = re.compile(r"^([A-Z0-9][A-Z0-9/\-\s&\.,#]+?)\s*[:\-]\s*(.+)$")
+# Header row of a two-column table: two upper-case titles split by 2+ spaces
+_TWO_COLUMN_HEADER_RE = re.compile(
+    r"^\s*[A-Z][A-Z\s/&\-\.#]+?\s{2,}[A-Z][A-Z\s/&\-\.#]+?\s*$"
+)
+
+
+# ---------- Utilities ----------
+
+def run_pdftotext(pdf_path: str, txt_path: str) -> None:
+    """Run ``pdftotext -layout`` on pdf_path and write the text to txt_path.
+
+    Raises:
+        RuntimeError: if pdftotext is missing from PATH, exits non-zero,
+            or does not produce txt_path.
+    """
+    if shutil.which("pdftotext") is None:
+        raise RuntimeError("pdftotext not found on PATH. Install poppler-utils.")
+
+    result = subprocess.run(
+        ["pdftotext", "-layout", pdf_path, txt_path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0 or not Path(txt_path).is_file():
+        raise RuntimeError(f"pdftotext failed for {pdf_path}: {result.stderr.strip()}")
+
+
+def snake(text: str) -> str:
+    """Normalize text to snake_case with ASCII letters/digits/underscores."""
+    # Replace any non-alphanumeric with spaces, collapse, then underscores
+    s = re.sub(r"[^A-Za-z0-9]+", " ", text.strip())
+    s = re.sub(r"\s+", "_", s.strip())
+    return s.lower()
+
+
+def normalize_date(value: str) -> str:
+    """Convert DD-MM-YYYY or DD/MM/YYYY to YYYY-MM-DD; otherwise return value stripped."""
+    v = value.strip()
+    m = re.match(r"^(\d{2})[\/\-](\d{2})[\/\-](\d{4})$", v)
+    if m:
+        day, month, year = m.groups()
+        return f"{year}-{month}-{day}"
+    return v
+
+
+def normalize_value(value: Optional[str]) -> Optional[str]:
+    """Collapse whitespace and normalize dates; map None, "" and "N/A" to None."""
+    if value is None:
+        return None
+    v = value.strip()
+    if v == "" or v.upper() == "N/A":
+        return None
+    v = re.sub(r"\s+", " ", v)
+    return normalize_date(v)
+
+
+def looks_like_label(line: str) -> bool:
+    """Return True for lines made only of upper-case letters, digits and label punctuation."""
+    # Lines that appear to be section/field labels (mostly uppercase / titlecaps)
+    return bool(re.match(r"^[A-Z0-9][A-Z0-9/\-\s&\.,#]+$", line.strip()))
+
+
+def is_probably_value(line: str) -> bool:
+    """Heuristically decide whether a line holds a value rather than a label."""
+    t = line.strip()
+    if t == "":
+        return False
+    if re.search(r"[a-z]", t):
+        return True
+    if re.search(r"\d", t) and not looks_like_label(t):
+        return True
+    if re.match(r"^(GOOD|YES|NO|POOR|NA|N/N|N/ A|N/\s*A)$", t):
+        return True
+    return not looks_like_label(t)
+
+
+# ---------- Extraction passes ----------
+
+def extract_key_value_same_line(lines: Iterable[str]) -> Dict[str, str]:
+    """Collect upper-case ``KEY: VALUE`` / ``KEY - VALUE`` pairs found on single lines."""
+    pairs: Dict[str, str] = {}
+    for line in lines:
+        t = line.strip()
+        if t == "":
+            continue
+        # Prefer ':' so hyphenated keys such as "SERIAL-NO: 1" stay intact
+        m = _KEY_COLON_VALUE_RE.match(t) or _KEY_SEP_VALUE_RE.match(t)
+        if not m:
+            continue
+        key = snake(m.group(1))
+        val = normalize_value(m.group(2))
+        if key and val is not None:
+            pairs[key] = val
+    return pairs
+
+
+def extract_key_value_stacked(lines: List[str]) -> Dict[str, str]:
+    """Collect pairs where an upper-case label line is followed by its value line."""
+    pairs: Dict[str, str] = {}
+    n = len(lines)
+    i = 0
+    while i < n:
+        kline = lines[i].strip()
+        if kline and ":" not in kline and looks_like_label(kline) and len(kline) >= 3:
+            # Skip blank lines between the label and its candidate value
+            j = i + 1
+            while j < n and lines[j].strip() == "":
+                j += 1
+            if j < n:
+                vline = lines[j].strip()
+                if vline and not looks_like_label(vline):
+                    key = snake(kline)
+                    val = normalize_value(vline)
+                    if val is not None and key not in pairs:
+                        pairs[key] = val
+                    i = j
+        i += 1
+    return pairs
+
+
+def _find_two_column_headers(lines: List[str]) -> List[int]:
+    """Return indices of lines that look like two-column table headers."""
+    header_indices: List[int] = []
+    for idx, line in enumerate(lines):
+        t = line.strip()
+        if t == "" or len(t) < 5:
+            continue
+        if _TWO_COLUMN_HEADER_RE.match(line):
+            header_indices.append(idx)
+    return header_indices
+
+
+def extract_two_column_lists(lines: List[str]) -> Dict[str, List[Dict[str, Optional[str]]]]:
+    """Collect rows under each two-column header, keyed ``<left>_and_<right>_list``."""
+    lists: Dict[str, List[Dict[str, Optional[str]]]] = {}
+    headers = _find_two_column_headers(lines)
+    for hidx in headers:
+        header = lines[hidx].strip()
+        parts = re.split(r"\s{2,}", header)
+        if len(parts) != 2:
+            continue
+        left_header, right_header = snake(parts[0]), snake(parts[1])
+        section: List[Dict[str, Optional[str]]] = []
+        i = hidx + 1
+        while i < len(lines):
+            row = lines[i].strip()
+            if row == "":
+                i += 1
+                continue
+            if _TWO_COLUMN_HEADER_RE.match(row):
+                break
+            # Split the stripped row: layout output is often indented, and a
+            # leading run of spaces would otherwise yield an empty first cell
+            cells = re.split(r"\s{2,}", row)
+            if len(cells) >= 2:
+                left = normalize_value(cells[0].strip())
+                right = normalize_value(cells[1].strip())
+                if left is not None or right is not None:
+                    section.append({left_header: left, right_header: right})
+            i += 1
+        if section:
+            section_key = f"{left_header}_and_{right_header}_list"
+            lists.setdefault(section_key, []).extend(section)
+    return lists
+
+
+# Extract emails, phones, basic addresses (best-effort)
+
+def extract_contacts_and_addresses(text: str) -> Dict[str, List[str]]:
+    """Return sorted, de-duplicated emails, phone numbers and address-like lines."""
+    emails: List[str] = []
+    m = re.findall(r"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}", text, flags=re.IGNORECASE)
+    if m:
+        emails = sorted({e.lower() for e in m})
+
+    phones: List[str] = []
+    m2 = re.findall(r"(?:\+?\d[\d\-\s]{7,}\d)", text)
+    if m2:
+        for p in m2:
+            pp = re.sub(r"[^\d+]", "", p)
+            if len(re.sub(r"\D", "", pp)) >= 9:
+                phones.append(pp)
+        phones = sorted(set(phones))
+
+    # str.splitlines() is used because Python's re module has no \R escape
+    lines = text.splitlines()
+    addr_candidates: List[str] = []
+    for line in lines:
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if re.search(
+            r"\d+.*(Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Industrial|Park|Unit|Building|Benoni|Bellville|Gauteng|Western Cape)",
+            stripped,
+            flags=re.IGNORECASE,
+        ):
+            addr_candidates.append(stripped)
+    addresses = sorted(set(addr_candidates))
+
+    return {"emails": emails, "phones": phones, "addresses": addresses}
+
+
+# Map common aliases to stable keys
+
+def apply_aliases(data: Dict[str, object]) -> Dict[str, object]:
+    """Copy known alias keys to canonical names and normalize date fields."""
+    aliases = {
+        "test_certificate_number": [
+            "test_certificate_number",
+            "test_certificate",
+            "test_certificate_-_esl07969",
+            "certificate_no",
+            "certificate_number",
+        ],
+        "primary_client": ["primary_client", "client_primary"],
+        "secondary_client": ["secondary_client", "client_secondary"],
+        "test_date": ["test_date", "inspection_date", "date_of_test"],
+        "repair_date": ["repair_date", "date_of_repair"],
+        "model": ["model", "type", "model_type"],
+        "serial_number": ["serial_number", "serial_no", "serial_#"],
+        "rated_capacity": ["rated_capacity", "capacity_rated"],
+        "actual_capacity": ["actual_capacity", "capacity_actual"],
+        "hours": ["hours", "operating_hours"],
+        "technician": ["technician", "inspector", "engineer"],
+        "ecsa_accreditation": ["ecsa_accreditation", "ecsa_no", "ecsa_registration"],
+        "compliance_to_standards": [
+            "compliance_to_standards",
+            "compliance_to_standards_yes",
+            "compliance",
+        ],
+        "remarks": ["remarks", "notes"],
+        "comments": ["comments", "summary"],
+    }
+
+    final: Dict[str, object] = {}
+    for canon, cands in aliases.items():
+        for c in cands:
+            if c in data:
+                final[canon] = data[c]
+                break
+
+    for k, v in data.items():
+        if k not in final:
+            final[k] = v
+
+    for dk in ("test_date", "repair_date"):
+        if dk in final and isinstance(final[dk], str):
+            final[dk] = normalize_value(final[dk])
+
+    return final
+
+
+# ---------- Orchestration ----------
+
+def process_one(pdf_path: str, outdir: Optional[str] = None, keep_txt: bool = False) -> str:
+    """Extract one PDF to ``<outdir>/<stem>.json`` and return the JSON path.
+
+    Args:
+        pdf_path: PDF file to convert.
+        outdir: Output directory; defaults to the PDF's own directory.
+        keep_txt: Also copy the intermediate pdftotext output next to the JSON.
+
+    Raises:
+        FileNotFoundError: if pdf_path is not an existing file.
+        RuntimeError: if pdftotext is unavailable or fails.
+    """
+    pdf_path = os.path.abspath(pdf_path)
+    if not os.path.isfile(pdf_path):
+        raise FileNotFoundError(f"Not a file: {pdf_path}")
+
+    base = Path(pdf_path).stem
+    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{base}.txt") as tf:
+        txt_path = tf.name
+
+    try:
+        run_pdftotext(pdf_path, txt_path)
+        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
+            text = f.read()
+        # str.splitlines() is used because Python's re module has no \R escape
+        lines = text.splitlines()
+
+        kv1 = extract_key_value_same_line(lines)
+        kv2 = extract_key_value_stacked(lines)
+        list2 = extract_two_column_lists(lines)
+        meta = extract_contacts_and_addresses(text)
+
+        merged: Dict[str, object] = {}
+        merged.update(kv1)
+        merged.update(kv2)
+        merged.update(list2)
+        if meta.get("emails"):
+            merged["emails"] = meta["emails"]
+        if meta.get("phones"):
+            merged["phones"] = meta["phones"]
+        if meta.get("addresses"):
+            merged["addresses"] = meta["addresses"]
+
+        final = apply_aliases(merged)
+
+        out_dir_final = outdir if outdir else os.path.dirname(pdf_path)
+        os.makedirs(out_dir_final, exist_ok=True)
+        json_path = os.path.join(out_dir_final, f"{base}.json")
+        with open(json_path, "w", encoding="utf-8") as jf:
+            json.dump(final, jf, ensure_ascii=False, indent=2)
+
+        if keep_txt:
+            keep_path = os.path.join(out_dir_final, f"{base}.txt")
+            try:
+                shutil.copyfile(txt_path, keep_path)
+            except OSError:
+                # Keeping the text is optional; the JSON is already written
+                pass
+        return json_path
+    finally:
+        # Always remove the temporary pdftotext output
+        try:
+            if os.path.exists(txt_path):
+                os.unlink(txt_path)
+        except OSError:
+            pass
+
+
+def process_dir(dir_path: str, outdir: Optional[str] = None, keep_txt: bool = False) -> List[str]:
+    """Recursively extract every ``*.pdf`` under dir_path; return the JSON paths.
+
+    Raises:
+        NotADirectoryError: if dir_path is not an existing directory.
+    """
+    dir_path = os.path.abspath(dir_path)
+    if not os.path.isdir(dir_path):
+        raise NotADirectoryError(f"Not a directory: {dir_path}")
+
+    json_paths: List[str] = []
+    for root, _dirs, files in os.walk(dir_path):
+        for fn in files:
+            if fn.lower().endswith(".pdf"):
+                pdf = os.path.join(root, fn)
+                json_paths.append(process_one(pdf, outdir=outdir, keep_txt=keep_txt))
+    return json_paths
+
+
+def extract_pdf_to_json(pdf: str, outdir: Optional[str] = None, keep_txt: bool = False) -> str:
+    """Convenience wrapper for single-file extraction."""
+    return process_one(pdf, outdir=outdir, keep_txt=keep_txt)
+
+
+# ---------- CLI ----------
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the extractor."""
+    p = argparse.ArgumentParser(description="PDF to JSON extractor (pdftotext -layout)")
+    p.add_argument("--pdf", help="Path to a single PDF")
+    p.add_argument("--dir", help="Path to a directory of PDFs")
+    p.add_argument("--outdir", help="Optional output directory")
+    p.add_argument("--keep-txt", dest="keep_txt", action="store_true", help="Keep intermediate .txt files")
+    return p.parse_args()
+
+
+def main() -> None:
+    """CLI entry point: convert one PDF or a directory of PDFs."""
+    args = _parse_args()
+    if not args.pdf and not args.dir:
+        raise SystemExit("Usage: --pdf=/path/file.pdf OR --dir=/path/folder [--outdir=...] [--keep-txt]")
+    if args.pdf:
+        json_path = process_one(args.pdf, outdir=args.outdir, keep_txt=args.keep_txt)
+        print(f"[OK] {args.pdf} -> {json_path}")
+    else:
+        outputs = process_dir(args.dir, outdir=args.outdir, keep_txt=args.keep_txt)
+        for op in outputs:
+            print(f"[OK] -> {op}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ff4a88b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="frappe_pdf_extractor",
+    version="0.1.0",
+    description="Universal PDF to JSON extractor for Frappe/ERPNext (pdftotext)",
+    author="Your Company",
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=[],
+)