cargo/process_input.py at master · codedmachine111/cargo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import tempfile
import uuid

import pytesseract
import streamlit as st
from unstructured.documents.elements import CompositeElement, Table
from unstructured.partition.pdf import partition_pdf

# Configure Pytesseract
# Resolution order for the tesseract executable:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the standard Windows install location, when that file actually exists;
#   3. otherwise pytesseract's own default is left untouched, so the binary is
#      looked up on PATH (the usual setup on Linux/macOS).
# Assigning the Windows path unconditionally would break OCR on every machine
# where that file is absent.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if not _tesseract_cmd and os.path.isfile(_WINDOWS_TESSERACT_CMD):
    _tesseract_cmd = _WINDOWS_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

def extract_all_data(pdf_path):
    '''
        Extracts text and table chunks from a PDF file and asks the partitioner
        to save the PDF's images into a directory created for this call.

        A fresh UUID names the image directory ('figures/<uuid>', relative to
        the current working directory), so separate calls do not share a
        folder.

        Parameters:
        pdf_path (str): Path to the PDF file.

        Returns:
        tuple: (text_elements, table_elements, unique_id) where the first two
        are lists of strings (str() of each CompositeElement / Table element
        returned by partition_pdf) and unique_id is the name of the image
        sub-folder under 'figures'.
    '''
    unique_id = str(uuid.uuid4())
    image_dir = os.path.join('figures', unique_id)
    os.makedirs(image_dir, exist_ok=True)

    # NOTE(review): new_after_n_chars (1500) is larger than max_characters
    # (1000); confirm against the unstructured chunking docs that this is the
    # intended soft/hard limit combination.
    raw_elements = partition_pdf(
        filename=pdf_path,
        chunking_strategy="by_title",
        extract_images_in_pdf=True,
        infer_table_structure=True,
        max_characters=1000,
        new_after_n_chars=1500,
        combine_text_under_n_chars=250,
        strategy="hi_res",
        extract_image_block_output_dir=image_dir
    )

    text_elements = []
    table_elements = []
    for element in raw_elements:
        # isinstance instead of matching on str(type(...)): it does not depend
        # on the textual form of the class and also covers subclasses.
        if isinstance(element, CompositeElement):
            text_elements.append(str(element))
        elif isinstance(element, Table):
            table_elements.append(str(element))

    return text_elements, table_elements, unique_id

def processInput(file):
    '''
        Processes and extracts data from an uploaded PDF file.

        The upload's bytes are copied to a temporary ".pdf" file so that
        extract_all_data can be given a filesystem path. The temporary file is
        removed once extraction finishes, whether it succeeded or raised.

        Parameters:
        file: The uploaded file object. It must expose a `type` attribute
              (compared against "application/pdf") and a `read()` method
              returning the file's bytes.

        Returns:
        tuple: Extracted text, tables, and folder ID containing images.
        Returns an empty list if the file is None or not a PDF (an error
        message is shown through Streamlit in both cases).
    '''
    if file is None:
        st.error("Could not upload files! Try again!")
        return []

    if file.type != "application/pdf":
        st.error("Please upload only PDF files")
        # Return here: falling through would reference result names that were
        # never assigned and raise UnboundLocalError.
        return []

    # delete=False keeps the file on disk after the handle is closed. Closing
    # it before extraction flushes every byte and lets the extractor reopen
    # the path on platforms that refuse a second open of a still-open file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(file.read())
        temp_path = temp_pdf.name

    try:
        text, tables, folder_id = extract_all_data(temp_path)
    finally:
        # Best-effort cleanup: a failure to delete the scratch file must not
        # hide the extraction result or the original exception.
        try:
            os.remove(temp_path)
        except OSError:
            pass

    return text, tables, folder_id