forked from VectifyAI/PageIndex
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbatch_process.py
More file actions
96 lines (75 loc) · 3.51 KB
/
batch_process.py
File metadata and controls
96 lines (75 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
"""Batch process all PDFs in a directory and save page indexes."""
import argparse
import os
import json
import glob
import time
from pageindex import config, page_index_main
def process_all_pdfs(input_dir: str, output_dir: str, model: str = 'gpt-4o-mini', delay: int = 0, force: bool = False):
    """Process all PDFs in input_dir (recursively) and save to output_dir.

    The relative folder layout under ``input_dir`` is mirrored under
    ``output_dir``; each ``foo.pdf`` produces ``foo.pdf.pageindex.json``.

    Args:
        input_dir: Root directory scanned recursively for ``*.pdf`` files.
        output_dir: Root directory for the generated index files.
        model: Model name forwarded to the pageindex ``config``.
        delay: Seconds to sleep between files (to respect API rate limits).
        force: When True, reprocess files whose output already exists.
    """
    # Find all PDFs recursively
    pdf_pattern = os.path.join(input_dir, '**', '*.pdf')
    pdf_files = glob.glob(pdf_pattern, recursive=True)
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return
    total = len(pdf_files)  # hoisted: used in every progress message
    print(f"Found {total} PDF files to process")
    os.makedirs(output_dir, exist_ok=True)

    # Options shared by every file in the batch.
    opt = config(
        model=model,
        toc_check_page_num=20,
        max_page_num_each_node=10,
        max_token_num_each_node=20000,
        if_add_node_id='yes',
        if_add_node_summary='yes',
        if_add_doc_description='no',
        if_add_node_text='yes'
    )

    processed = 0
    failed = []  # list of (pdf_path, error_message) for the final report
    for i, pdf_path in enumerate(pdf_files, 1):
        # Preserve folder structure from input_dir
        relative_dir = os.path.dirname(os.path.relpath(pdf_path, input_dir))
        output_subdir = os.path.join(output_dir, relative_dir)
        os.makedirs(output_subdir, exist_ok=True)

        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]  # stem, for log lines only
        output_file = os.path.join(output_subdir, f'{pdf_filename}.pageindex.json')

        # Skip if already processed (unless force is set). Skipped files do
        # not trigger the delay below: no API call was made.
        if os.path.exists(output_file) and not force:
            print(f"[{i}/{total}] Skipping (already exists): {pdf_name}")
            continue

        print(f"\n[{i}/{total}] Processing: {pdf_name}")
        print(f" Source: {pdf_path}")
        try:
            toc_with_page_number = page_index_main(pdf_path, opt)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
            print(f" Saved: {output_file}")
            processed += 1
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            print(f" ERROR: {e}")
            failed.append((pdf_path, str(e)))

        # BUGFIX: the delay previously ran only on success, so a rate-limit
        # failure was followed immediately by the next API call. Sleep after
        # every *attempted* file instead (but not after the last one).
        if delay > 0 and i < total:
            print(f" Waiting {delay}s before next file...")
            time.sleep(delay)

    print(f"\n{'='*50}")
    print(f"Completed: {processed} files processed")
    if failed:
        print(f"Failed: {len(failed)} files")
        for path, error in failed:
            print(f" - {os.path.basename(path)}: {error}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Batch process PDFs to generate page indexes')
    parser.add_argument('--input_dir', type=str, required=True, help='Directory containing PDFs')
    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save indexes')
    parser.add_argument('--model', type=str, default='gpt-4o-mini', help='Model to use')
    parser.add_argument('--delay', type=int, default=0, help='Delay in seconds between files (for rate limits)')
    parser.add_argument('--force', action='store_true', help='Reprocess files even if output already exists')
    args = parser.parse_args()

    # IMPROVED: report a missing input directory as a CLI usage error
    # (usage text + exit code 2) instead of an unhandled ValueError traceback.
    if not os.path.isdir(args.input_dir):
        parser.error(f"Input directory not found: {args.input_dir}")

    process_all_pdfs(args.input_dir, args.output_dir, args.model, args.delay, args.force)