forked from MarkPDFdown/markpdfdown
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
153 lines (135 loc) · 5.44 KB
/
Copy pathmain.py
File metadata and controls
153 lines (135 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import sys
import time
import shutil
import logging
from core import LLMClient
from core.FileWorker import create_worker
from core.Util import *
# Route every log record to stderr so that stdout stays reserved for the
# generated Markdown that the script prints at the end.
stderr_handlers = [logging.StreamHandler(sys.stderr)]
logging.basicConfig(
    handlers=stderr_handlers,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
del stderr_handlers  # keep the module namespace unchanged

# Module-level logger shared by the functions in this file
logger = logging.getLogger(__name__)
def completion(message, model="", system_prompt="", image_paths=None, temperature=0.5, max_tokens=8192, retry_times=3):
    """
    Call OpenAI's completion interface for text generation, retrying on failure

    Connection settings are read from environment variables:
    OPENAI_API_KEY (required), OPENAI_API_BASE (optional) and
    OPENAI_DEFAULT_MODEL (optional, used only when `model` is empty).

    Args:
        message (str): User input message
        model (str, optional): Model name; when empty, OPENAI_DEFAULT_MODEL is used, then "gpt-4o"
        system_prompt (str, optional): System prompt, defaults to empty string
        image_paths (List[str], optional): List of image paths, defaults to None
        temperature (float, optional): Temperature for text generation, defaults to 0.5
        max_tokens (int, optional): Maximum number of tokens for generated text, defaults to 8192
        retry_times (int, optional): Maximum number of attempts, defaults to 3

    Returns:
        str: Generated text content, or an empty string if every attempt failed

    Raises:
        SystemExit: With exit code 1 when OPENAI_API_KEY is not set
    """
    # Get API key and API base URL from environment variables
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        logger.error("Please set the OPENAI_API_KEY environment variables")
        # sys.exit instead of the bare exit(): the latter is only injected by
        # the site module and is not guaranteed to exist in every interpreter
        sys.exit(1)

    base_url = os.getenv("OPENAI_API_BASE") or "https://api.openai.com/v1/"

    # If no model is specified, use the default model
    model = model or os.getenv("OPENAI_DEFAULT_MODEL") or "gpt-4o"

    # Initialize LLMClient
    client = LLMClient.LLMClient(base_url=base_url, api_key=api_key, model=model)

    # Call completion method with retry mechanism
    for attempt in range(1, retry_times + 1):
        try:
            return client.completion(
                user_message=message,
                system_prompt=system_prompt,
                image_paths=image_paths,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        except Exception as e:
            # Deliberately broad: any failure of the call is treated as
            # retryable, and the caller receives "" if all attempts fail
            logger.error("LLM call failed: %s", e)
            # Wait before the next attempt; skip the pause after the final
            # failure, where it would only delay the return
            if attempt < retry_times:
                time.sleep(0.5)
    return ""
def convert_image_to_markdown(image_path):
    """
    Transcribe the content of a single image into Markdown using the LLM

    Args:
        image_path (str): Path to the image

    Returns:
        str: Markdown text returned by the model after post-processing
    """
    # Leading and trailing newlines are part of the prompt sent to the model
    prompt_text = (
        "\n"
        "Please read the content in the image and transcribe it into Markdown, "
        "paying special attention to maintaining the format of headings, text, "
        "formulas, and table rows and columns. "
        "Only output the Markdown, no additional explanation is needed."
        "\n"
    )
    # An empty model name lets completion() pick its configured default
    raw_markdown = completion(
        message=prompt_text,
        model="",
        image_paths=[image_path],
        temperature=0.3,
        max_tokens=8192,
    )
    # NOTE(review): remove_markdown_warp is not defined in this file
    # (presumably provided by the core.Util star import); it looks like it
    # strips a surrounding "markdown" code fence - confirm against its source
    return remove_markdown_warp(raw_markdown, "markdown")
# Leading-byte signatures used to recognise the input when no usable file
# extension is available: (signature, extension, human-readable label).
# JPEG is matched on the three bytes FF D8 FF only, because the fourth byte
# differs between JPEG variants (for example E0 for JFIF, E1 for Exif and DB
# for a stream that starts directly with a quantization table).
_MAGIC_SIGNATURES = (
    (b'%PDF-', '.pdf', 'PDF'),
    (b'\xFF\xD8\xFF', '.jpg', 'JPEG'),
    (b'\x89\x50\x4E\x47', '.png', 'PNG'),
    (b'\x42\x4D', '.bmp', 'BMP'),
)

_USAGE = "Usage: python main.py [start_page] [end_page] < path_to_input.pdf"


def detect_file_type(data):
    """
    Identify a supported file type from the leading bytes of its content

    Args:
        data (bytes): Raw file content

    Returns:
        tuple | None: (extension, label) such as ('.pdf', 'PDF'), or None
        when the content matches no supported signature
    """
    for signature, extension, label in _MAGIC_SIGNATURES:
        if data.startswith(signature):
            return extension, label
    return None


def main():
    """
    Convert the document received on standard input to Markdown on stdout

    Command line: `python main.py [start_page] [end_page] < input`. With one
    argument it is taken as end_page; with two, as start_page and end_page.

    Returns:
        int: Process exit code, 0 on success and 1 on any reported error
    """
    start_page = 1
    # NOTE(review): 0 is forwarded unchanged to create_worker; presumably it
    # means "through the last page" - confirm against core.FileWorker
    end_page = 0
    try:
        if len(sys.argv) > 2:
            start_page = int(sys.argv[1])
            end_page = int(sys.argv[2])
        elif len(sys.argv) > 1:
            end_page = int(sys.argv[1])
    except ValueError:
        # Report bad page arguments as a usage error instead of a traceback
        logger.error("Page numbers must be integers")
        logger.error(_USAGE)
        return 1

    # Read binary data from standard input
    input_data = sys.stdin.buffer.read()
    if not input_data:
        logger.error("No input data received")
        logger.error(_USAGE)
        return 1

    # Create output directory
    output_dir = f"output/{time.strftime('%Y%m%d%H%M%S')}"
    os.makedirs(output_dir, exist_ok=True)

    # Everything below runs under try/finally so the working directory is
    # removed on error paths too, not only after a successful conversion
    try:
        # Try to get extension from file name
        input_filename = os.path.basename(sys.stdin.buffer.name)
        input_ext = os.path.splitext(input_filename)[1]

        # If there is no extension or the file comes from standard input,
        # try to determine the type by file content
        if not input_ext or input_filename == '<stdin>':
            detected = detect_file_type(input_data)
            if detected is None:
                logger.error("Unsupported file type")
                return 1
            input_ext, type_label = detected
            logger.info(f"Recognized as {type_label} file by file content")

        input_path = os.path.join(output_dir, f"input{input_ext}")
        with open(input_path, "wb") as f:
            f.write(input_data)

        # create file worker
        try:
            worker = create_worker(input_path, start_page, end_page)
        except ValueError as e:
            logger.error(str(e))
            return 1

        # convert to images
        img_paths = worker.convert_to_images()
        logger.info("Image conversion completed")

        # convert to markdown
        # NOTE(review): sorted() orders paths lexicographically; this matches
        # page order only if the worker zero-pads page numbers - confirm
        pages = []
        for img_path in sorted(img_paths):
            img_path = img_path.replace("\\", "/")
            logger.info("Converting image %s to Markdown", img_path)
            pages.append(convert_image_to_markdown(img_path))
        # Each page is followed by a blank line, as a single join instead of
        # repeated string concatenation
        markdown = "".join(page + "\n\n" for page in pages)
        logger.info("Image conversion to Markdown completed")

        # Output Markdown
        print(markdown)
        return 0
    finally:
        # Remove output path
        shutil.rmtree(output_dir, ignore_errors=True)


if __name__ == "__main__":
    sys.exit(main())