From b06d0872cfa08603ce5cbfbbb95943d2fbcc64cd Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 9 Oct 2025 02:06:58 +0000
Subject: [PATCH] feat: Add frappe_pdf_extractor app

This commit introduces the frappe_pdf_extractor app, enabling PDF to JSON conversion.

Co-authored-by: calledasfiverr <calledasfiverr@gmail.com>
---
 frappe_pdf_extractor/__init__.py    |   7 +
 frappe_pdf_extractor/api.py         |  88 +++++++
 frappe_pdf_extractor/hooks.py       |   8 +
 frappe_pdf_extractor/modules.txt    |   1 +
 frappe_pdf_extractor/pdf_to_json.py | 367 ++++++++++++++++++++++++++++
 setup.py                            |  11 +
 6 files changed, 482 insertions(+)
 create mode 100644 frappe_pdf_extractor/__init__.py
 create mode 100644 frappe_pdf_extractor/api.py
 create mode 100644 frappe_pdf_extractor/hooks.py
 create mode 100644 frappe_pdf_extractor/modules.txt
 create mode 100644 frappe_pdf_extractor/pdf_to_json.py
 create mode 100644 setup.py

diff --git a/frappe_pdf_extractor/__init__.py b/frappe_pdf_extractor/__init__.py
new file mode 100644
index 0000000..1f01325
--- /dev/null
+++ b/frappe_pdf_extractor/__init__.py
@@ -0,0 +1,7 @@
+"""Frappe PDF Extractor: convert PDFs to JSON via ``pdftotext -layout``."""
+from .pdf_to_json import extract_pdf_to_json, process_dir, process_one
+
+# Re-exported above so every name in __all__ actually exists on the package.
+__all__ = ["extract_pdf_to_json", "process_one", "process_dir"]
+
+__version__ = "0.1.0"
diff --git a/frappe_pdf_extractor/api.py b/frappe_pdf_extractor/api.py
new file mode 100644
index 0000000..c6173e8
--- /dev/null
+++ b/frappe_pdf_extractor/api.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from typing import Dict, List, Optional
+
+try:
+    import frappe  # type: ignore
+    from frappe.utils import get_site_path
+    from frappe.utils.file_manager import get_file_path  # type: ignore
+except Exception:  # pragma: no cover - allows import outside Frappe
+    frappe = None  # type: ignore  # sentinel: later code checks `frappe is not None`
+    def get_site_path(*parts):  # type: ignore  # shim: join parts with "/" and make absolute
+        import os
+        return os.path.abspath("/".join(parts))
+    def get_file_path(file_url: str):  # type: ignore  # shim: return the reference unchanged
+        return file_url
+
+from .pdf_to_json import process_dir, process_one
+
+
+def _resolve_path_maybe_frappe(path_or_ref: str) -> str:
+    """Resolve a filesystem path, a /files or /private/files URL, or a File docname.
+
+    Tried in that order; an unresolvable reference is returned unchanged.
+    """
+    import os
+
+    if os.path.exists(path_or_ref):
+        return os.path.abspath(path_or_ref)
+    # Frappe keeps /files/* under <site>/public/files; /private/files/* mirrors its site-relative location.
+    if path_or_ref.startswith("/private/files/"):
+        return get_site_path(path_or_ref.lstrip("/"))
+    if path_or_ref.startswith("/files/"):
+        return get_site_path("public", path_or_ref.lstrip("/"))
+
+    if frappe is not None:
+        try:
+            file_doc = frappe.get_doc("File", path_or_ref)
+            file_url = getattr(file_doc, "file_url", None) or getattr(file_doc, "file_name", None)
+            if file_url:
+                return get_file_path(file_url)
+        except Exception:  # best-effort lookup: an unknown docname falls through
+            pass
+
+    return path_or_ref
+
+
+# Shared implementation; registered at the bottom as a whitelisted Frappe/ERPNext API.
+def _as_flag(value) -> bool:
+    """Interpret 1/0, booleans, and strings such as "1", "0", "true" or "no" as a bool."""
+    if isinstance(value, str):
+        text = value.strip().lower()
+        try:
+            return bool(int(text))
+        except ValueError:  # words such as "true"/"false" used to raise here
+            return text in ("true", "yes", "on")
+    return bool(value)
+
+
+def extract_pdf(pdf: Optional[str] = None, dir: Optional[str] = None, outdir: Optional[str] = None, keep_txt: int | bool = 0):
+    """Extract PDF(s) to JSON.
+
+    Args:
+        pdf: Path, /files URL, or File docname for a single PDF
+        dir: Path or /files directory containing PDFs (recursive)
+        outdir: Output directory (defaults to the same as the PDF or directory)
+        keep_txt: 1/0 (or "true"/"false") to retain intermediate pdftotext output
+    Returns:
+        {"json_path": str} when ``pdf`` is given, otherwise {"json_paths": [str, ...]}
+    Raises:
+        ValueError outside Frappe (frappe.throw's error inside it) if neither pdf nor dir is given
+
+    NOTE(review): paths come from the logged-in caller and are used as-is for
+    reading and writing; consider restricting them to the site's files directory.
+    """
+    if not pdf and not dir:
+        message = "Provide either 'pdf' or 'dir'"
+        if frappe is not None:
+            frappe.throw(message)
+        raise ValueError(message)
+    keep = _as_flag(keep_txt)
+    if pdf:
+        return {"json_path": process_one(_resolve_path_maybe_frappe(pdf), outdir=outdir, keep_txt=keep)}
+    dir_path = _resolve_path_maybe_frappe(dir)  # type: ignore[arg-type]
+    return {"json_paths": process_dir(dir_path, outdir=outdir, keep_txt=keep)}
+
+
+if frappe is not None:
+    extract_pdf = frappe.whitelist(allow_guest=False)(extract_pdf)  # same as the decorator form
diff --git a/frappe_pdf_extractor/hooks.py b/frappe_pdf_extractor/hooks.py
new file mode 100644
index 0000000..4e22b88
--- /dev/null
+++ b/frappe_pdf_extractor/hooks.py
@@ -0,0 +1,8 @@
+app_name = "frappe_pdf_extractor"
+app_title = "Frappe PDF Extractor"
+app_publisher = "Your Company"  # NOTE(review): looks like a placeholder; confirm before release
+app_description = "Universal PDF to JSON extraction via pdftotext"
+app_email = "dev@example.com"  # NOTE(review): looks like a placeholder; confirm before release
+app_version = "0.1.0"  # same value as __version__ in __init__.py
+
+# No hooks are defined here; the whitelisted method extract_pdf is registered in api.py via frappe.whitelist.
diff --git a/frappe_pdf_extractor/modules.txt b/frappe_pdf_extractor/modules.txt
new file mode 100644
index 0000000..fb25ffd
--- /dev/null
+++ b/frappe_pdf_extractor/modules.txt
@@ -0,0 +1 @@
+frappe_pdf_extractor
diff --git a/frappe_pdf_extractor/pdf_to_json.py b/frappe_pdf_extractor/pdf_to_json.py
new file mode 100644
index 0000000..5c9f6d1
--- /dev/null
+++ b/frappe_pdf_extractor/pdf_to_json.py
@@ -0,0 +1,367 @@
+"""
+Universal PDF → JSON extractor using `pdftotext -layout`.
+
+Features
+- Extracts:
+  • KEY: VALUE pairs on one line
+  • Label on one line, value on next line
+  • Two-column lists (e.g., CRITERIA | COMMENT tables)
+  • Emails, phone numbers, addresses (best-effort)
+- Normalizes:
+  • Keys → snake_case
+  • Dates DD-MM-YYYY / DD/MM/YYYY → YYYY-MM-DD
+  • Empty / "N/A" → None
+- Batch mode (process a folder of PDFs)
+
+Requirements
+- Python 3.9+
+- Poppler utils installed and `pdftotext` available on PATH
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+
+
+# ---------- Utilities ----------
+
+def run_pdftotext(pdf_path: str, txt_path: str) -> None:
+    """Run ``pdftotext -layout`` on pdf_path, writing the text to txt_path.
+
+    Raises RuntimeError if the tool is missing, exits non-zero, or leaves no output file.
+    """
+    if not shutil.which("pdftotext"):
+        raise RuntimeError("pdftotext not found on PATH. Install poppler-utils.")
+
+    command = ["pdftotext", "-layout", pdf_path, txt_path]
+    # Both streams are captured as text so stderr can be quoted in the error message.
+    proc = subprocess.run(command, capture_output=True, text=True, check=False)
+    conversion_ok = proc.returncode == 0 and Path(txt_path).is_file()
+    if not conversion_ok:
+        raise RuntimeError(f"pdftotext failed for {pdf_path}: {proc.stderr.strip()}")
+
+
+def snake(text: str) -> str:
+    """Lower-case text and join its ASCII alphanumeric runs with underscores."""
+    # Every non-alphanumeric run is a separator; empty pieces at the edges are dropped.
+    words = [piece for piece in re.split(r"[^A-Za-z0-9]+", text) if piece]
+    joined = "_".join(words)
+    return joined.lower()
+
+
+def normalize_date(value: str) -> str:
+    """Rewrite DD-MM-YYYY or DD/MM/YYYY as YYYY-MM-DD; return any other text stripped."""
+    from datetime import date
+    v = value.strip()
+    m = re.match(r"^(\d{2})[\/\-](\d{2})[\/\-](\d{4})$", v)
+    if m:
+        try:  # an impossible date (e.g. US-style 12/31/2024) is left untouched
+            return date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
+        except ValueError:
+            return v
+    return v
+
+
+def normalize_value(value: Optional[str]) -> Optional[str]:
+    """Clean an extracted value: blank or "N/A" becomes None, whitespace is collapsed."""
+    stripped = "" if value is None else value.strip()
+    # Missing input, whitespace-only text, and any casing of N/A all mean "no value".
+    if not stripped or stripped.upper() == "N/A":
+        return None
+    single_spaced = re.sub(r"\s+", " ", stripped)
+    return normalize_date(single_spaced)
+
+
+def looks_like_label(line: str) -> bool:
+    """Return True when the stripped line is 2+ characters drawn only from the upper-case label set."""
+    return re.match(r"^[A-Z0-9][A-Z0-9/\-\s&\.,#]+$", line.strip()) is not None
+
+
+def is_probably_value(line: str) -> bool:
+    """Heuristically decide whether a line is a field value rather than a label."""
+    candidate = line.strip()
+    if not candidate:
+        return False
+    # Lower-case text or one of the known status words always counts as a value.
+    has_lowercase = re.search(r"[a-z]", candidate) is not None
+    if has_lowercase or re.match(r"^(GOOD|YES|NO|POOR|NA|N/N|N/ A|N/\s*A)$", candidate):
+        return True
+    # Everything else (digits included) is a value only when it is not label-shaped.
+    return not looks_like_label(candidate)
+
+
+# ---------- Extraction passes ----------
+
+def extract_key_value_same_line(lines: Iterable[str]) -> Dict[str, str]:
+    """Collect ``LABEL: value`` / ``LABEL - value`` pairs found on a single line."""
+    # Key: upper-case label characters (lazy); separator: ":" or "-"; value: rest of the line.
+    pattern = re.compile(r"^([A-Z0-9][A-Z0-9/\-\s&\.,#]+?)\s*[:\-]\s*(.+)$")
+    found: Dict[str, str] = {}
+    for raw in lines:
+        hit = pattern.match(raw.strip())
+        if hit is None:
+            continue
+        label, cleaned = snake(hit.group(1)), normalize_value(hit.group(2))
+        # Later duplicates overwrite earlier ones; empty keys and None values are skipped.
+        if label and cleaned is not None:
+            found[label] = cleaned
+    return found
+
+
+def extract_key_value_stacked(lines: List[str]) -> Dict[str, str]:
+    """Pair each label line with the next non-blank line when that line is not label-shaped.
+
+    The first value seen for a key wins; a consumed value line is never re-read as a label.
+    """
+    pairs: Dict[str, str] = {}
+    resume_at = 0
+    for idx, raw in enumerate(lines):
+        label = raw.strip()
+        if idx < resume_at or len(label) < 3 or ":" in label or not looks_like_label(label):
+            continue
+        # Index of the next non-blank line, or len(lines) when there is none.
+        nxt = next((k for k in range(idx + 1, len(lines)) if lines[k].strip() != ""), len(lines))
+        if nxt == len(lines) or looks_like_label(lines[nxt].strip()):
+            continue
+        key, val = snake(label), normalize_value(lines[nxt].strip())
+        if val is not None and key not in pairs:
+            pairs[key] = val
+        resume_at = nxt + 1
+    return pairs
+
+
+def _find_two_column_headers(lines: List[str]) -> List[int]:
+    """Return indices of lines shaped like two upper-case headings split by 2+ spaces."""
+    header_re = re.compile(r"^\s*[A-Z][A-Z\s/&\-\.#]+?\s{2,}[A-Z][A-Z\s/&\-\.#]+?\s*$")
+    # Lines with fewer than five visible characters are too short to hold two headings.
+    return [
+        idx
+        for idx, line in enumerate(lines)
+        if len(line.strip()) >= 5 and header_re.match(line)
+    ]
+
+
+def extract_two_column_lists(lines: List[str]) -> Dict[str, List[Dict[str, Optional[str]]]]:
+    """Collect the rows beneath each two-column header (e.g. CRITERIA | COMMENT).
+
+    Returns ``{"<left>_and_<right>_list": [{left: cell, right: cell}, ...]}`` with
+    snake_cased header names; a section ends at the next header-shaped line.
+    """
+    header_re = re.compile(r"^\s*[A-Z][A-Z\s/&\-\.#]+?\s{2,}[A-Z][A-Z\s/&\-\.#]+?\s*$")
+    lists: Dict[str, List[Dict[str, Optional[str]]]] = {}
+    for hidx in _find_two_column_headers(lines):
+        parts = re.split(r"\s{2,}", lines[hidx].strip())
+        if len(parts) != 2:
+            continue
+        left_header, right_header = snake(parts[0]), snake(parts[1])
+        section: List[Dict[str, Optional[str]]] = []
+        for raw in lines[hidx + 1:]:
+            row = raw.strip()
+            if row == "":
+                continue
+            if header_re.match(row):
+                break
+            # Split the stripped row: pdftotext -layout indents table rows, and
+            # splitting the raw line would yield an empty first cell and shift columns.
+            cells = re.split(r"\s{2,}", row)
+            if len(cells) >= 2:
+                left, right = normalize_value(cells[0]), normalize_value(cells[1])
+                if left is not None or right is not None:
+                    section.append({left_header: left, right_header: right})
+        if section:
+            lists.setdefault(f"{left_header}_and_{right_header}_list", []).extend(section)
+    return lists
+
+
+# Extract emails, phones, basic addresses (best-effort)
+
+def extract_contacts_and_addresses(text: str) -> Dict[str, List[str]]:
+    """Best-effort scan of text for e-mail addresses, phone numbers and address lines.
+
+    Returns {"emails", "phones", "addresses"}, each a sorted, de-duplicated list:
+    e-mails lower-cased, phones reduced to digits and "+" (9+ digits required),
+    addresses as stripped lines containing a number followed by a street/place keyword.
+    """
+    # E-mails are matched case-insensitively and reported lower-cased.
+    found = re.findall(r"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}", text, flags=re.IGNORECASE)
+    emails = sorted({e.lower() for e in found})
+
+    # Only spaces/tabs may separate digit groups, so a number never spans two lines.
+    phones = sorted(
+        {
+            compact
+            for compact in (re.sub(r"[^\d+]", "", p) for p in re.findall(r"\+?\d[\d\- \t]{7,}\d", text))
+            if len(re.sub(r"\D", "", compact)) >= 9
+        }
+    )
+
+    address_re = re.compile(
+        r"\d+.*(Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Industrial|Park|Unit|Building|Benoni|Bellville|Gauteng|Western Cape)",
+        flags=re.IGNORECASE,
+    )
+    # str.splitlines replaces re.split(r"\R", ...): Python's re has no \R escape
+    # and raises re.error ("bad escape") for it.
+    stripped = (line.strip() for line in text.splitlines())
+    addresses = sorted({line for line in stripped if line and address_re.search(line)})
+
+    return {"emails": emails, "phones": phones, "addresses": addresses}
+
+
+# Map common aliases to stable keys
+
+def apply_aliases(data: Dict[str, object]) -> Dict[str, object]:
+    """Return a new dict with canonical keys filled from the first matching alias.
+
+    Canonical keys come first (in table order); every original key not already
+    present follows in its original order, so alias source keys are kept too.
+    The two date fields are re-normalized when they hold strings.
+    """
+    alias_table = {
+        "test_certificate_number": [
+            "test_certificate_number",
+            "test_certificate",
+            "test_certificate_-_esl07969",
+            "certificate_no",
+            "certificate_number",
+        ],
+        "primary_client": ["primary_client", "client_primary"],
+        "secondary_client": ["secondary_client", "client_secondary"],
+        "test_date": ["test_date", "inspection_date", "date_of_test"],
+        "repair_date": ["repair_date", "date_of_repair"],
+        "model": ["model", "type", "model_type"],
+        "serial_number": ["serial_number", "serial_no", "serial_#"],
+        "rated_capacity": ["rated_capacity", "capacity_rated"],
+        "actual_capacity": ["actual_capacity", "capacity_actual"],
+        "hours": ["hours", "operating_hours"],
+        "technician": ["technician", "inspector", "engineer"],
+        "ecsa_accreditation": ["ecsa_accreditation", "ecsa_no", "ecsa_registration"],
+        "compliance_to_standards": ["compliance_to_standards", "compliance_to_standards_yes", "compliance"],
+        "remarks": ["remarks", "notes"],
+        "comments": ["comments", "summary"],
+    }
+
+    result: Dict[str, object] = {}
+    # The first alias present in data wins for each canonical key.
+    for canon, names in alias_table.items():
+        match = next((name for name in names if name in data), None)
+        if match is not None:
+            result[canon] = data[match]
+    # Append every original key that is not already present in the result.
+    for key, value in data.items():
+        result.setdefault(key, value)
+    for date_key in ("test_date", "repair_date"):
+        current = result.get(date_key)
+        if isinstance(current, str):
+            result[date_key] = normalize_value(current)
+    return result
+
+
+# ---------- Orchestration ----------
+
+def process_one(pdf_path: str, outdir: Optional[str] = None, keep_txt: bool = False) -> str:
+    """Extract one PDF to ``<outdir>/<pdf stem>.json`` and return that path.
+
+    Args:
+        pdf_path: PDF file to convert.
+        outdir: Output directory, created if missing; defaults to the PDF's directory.
+        keep_txt: Also copy the intermediate pdftotext output to ``<stem>.txt`` (best-effort).
+    Raises:
+        FileNotFoundError: pdf_path is not a file.
+        RuntimeError: pdftotext is unavailable or failed.
+    """
+    pdf_path = os.path.abspath(pdf_path)
+    if not os.path.isfile(pdf_path):
+        raise FileNotFoundError(f"Not a file: {pdf_path}")
+
+    base = Path(pdf_path).stem
+    # Reserve a temp file name for pdftotext to write to; it is removed in ``finally``.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{base}.txt") as tf:
+        txt_path = tf.name
+
+    try:
+        run_pdftotext(pdf_path, txt_path)
+        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
+            text = f.read()
+        # str.splitlines, not re.split(r"\R", ...): Python's re rejects \R as a bad escape.
+        lines = text.splitlines()
+
+        # Later passes override earlier ones on key clashes.
+        merged: Dict[str, object] = {}
+        merged.update(extract_key_value_same_line(lines))
+        merged.update(extract_key_value_stacked(lines))
+        merged.update(extract_two_column_lists(lines))
+        # Contact lists are added only when non-empty.
+        merged.update({k: v for k, v in extract_contacts_and_addresses(text).items() if v})
+        final = apply_aliases(merged)
+
+        out_dir_final = outdir if outdir else os.path.dirname(pdf_path)
+        os.makedirs(out_dir_final, exist_ok=True)
+        json_path = os.path.join(out_dir_final, f"{base}.json")
+        with open(json_path, "w", encoding="utf-8") as jf:
+            json.dump(final, jf, ensure_ascii=False, indent=2)
+        if keep_txt:
+            try:
+                shutil.copyfile(txt_path, os.path.join(out_dir_final, f"{base}.txt"))
+            except OSError:
+                pass  # keeping the text is optional; the JSON is already written
+        return json_path
+    finally:
+        try:
+            os.unlink(txt_path)
+        except OSError:
+            pass  # temp file already gone or not removable; nothing more to clean up
+
+
+def process_dir(dir_path: str, outdir: Optional[str] = None, keep_txt: bool = False) -> List[str]:
+    """Convert every *.pdf (any case) found recursively under dir_path; return the JSON paths."""
+    dir_path = os.path.abspath(dir_path)
+    if not os.path.isdir(dir_path):
+        raise NotADirectoryError(f"Not a directory: {dir_path}")
+    # os.walk descends into every subdirectory; results keep its traversal order.
+    return [
+        process_one(os.path.join(folder, name), outdir=outdir, keep_txt=keep_txt)
+        for folder, _subdirs, names in os.walk(dir_path)
+        for name in names
+        if name.lower().endswith(".pdf")
+    ]
+
+
+def extract_pdf_to_json(pdf: str, outdir: Optional[str] = None, keep_txt: bool = False) -> str:
+    """Extract a single PDF by delegating to process_one; returns whatever process_one returns."""
+    return process_one(pdf, outdir=outdir, keep_txt=keep_txt)
+
+
+# ---------- CLI ----------
+
+def _parse_args() -> argparse.Namespace:
+    """Parse --pdf / --dir / --outdir / --keep-txt from the command line."""
+    parser = argparse.ArgumentParser(description="PDF to JSON extractor (pdftotext -layout)")
+    for flag, text in (("--pdf", "Path to a single PDF"), ("--dir", "Path to a directory of PDFs"), ("--outdir", "Optional output directory")):
+        parser.add_argument(flag, help=text)
+    parser.add_argument("--keep-txt", dest="keep_txt", action="store_true", help="Keep intermediate .txt files")
+    return parser.parse_args()
+
+
+def main() -> None:
+    """CLI entry point: convert one PDF (--pdf) or a whole folder (--dir) and print each result."""
+    args = _parse_args()
+    if args.pdf:
+        json_path = process_one(args.pdf, outdir=args.outdir, keep_txt=args.keep_txt)
+        print(f"[OK] {args.pdf} -> {json_path}")
+        return
+    if not args.dir:
+        raise SystemExit("Usage: --pdf=/path/file.pdf OR --dir=/path/folder [--outdir=...] [--keep-txt]")
+    for op in process_dir(args.dir, outdir=args.outdir, keep_txt=args.keep_txt):
+        print(f"[OK] -> {op}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ff4a88b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="frappe_pdf_extractor",
+    version="0.1.0",  # same value as __version__ in frappe_pdf_extractor/__init__.py
+    description="Universal PDF to JSON extractor for Frappe/ERPNext (pdftotext)",
+    author="Your Company",  # NOTE(review): looks like a placeholder; confirm before release
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=[],  # no Python dependencies declared; pdftotext is an external program
+)