forked from VectifyAI/PageIndex
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbatch_process.py
More file actions
96 lines (75 loc) · 3.51 KB
/
batch_process.py
File metadata and controls
96 lines (75 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
"""Batch process all PDFs in a directory and save page indexes."""
import argparse
import os
import json
import glob
import time
from pageindex import config, page_index_main
def process_all_pdfs(input_dir: str, output_dir: str, model: str = 'gpt-4o-mini', delay: int = 0, force: bool = False):
    """Process all PDFs in input_dir (recursively) and save to output_dir.

    The relative folder layout under ``input_dir`` is mirrored under
    ``output_dir``; each ``foo.pdf`` produces ``foo.pdf.pageindex.json``.

    Args:
        input_dir: Root directory scanned recursively for ``*.pdf`` files.
        output_dir: Root directory for the generated index files.
        model: Model name forwarded to the pageindex ``config``.
        delay: Seconds to sleep between files (to respect API rate limits).
        force: When True, reprocess files whose output already exists.
    """
    # Find all PDFs recursively
    pdf_pattern = os.path.join(input_dir, '**', '*.pdf')
    pdf_files = glob.glob(pdf_pattern, recursive=True)
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return
    total = len(pdf_files)  # hoisted: used in every progress message
    print(f"Found {total} PDF files to process")
    os.makedirs(output_dir, exist_ok=True)

    # Options shared by every file in the batch.
    opt = config(
        model=model,
        toc_check_page_num=20,
        max_page_num_each_node=10,
        max_token_num_each_node=20000,
        if_add_node_id='yes',
        if_add_node_summary='yes',
        if_add_doc_description='no',
        if_add_node_text='yes'
    )

    processed = 0
    failed = []  # list of (pdf_path, error_message) for the final report
    for i, pdf_path in enumerate(pdf_files, 1):
        # Preserve folder structure from input_dir
        relative_dir = os.path.dirname(os.path.relpath(pdf_path, input_dir))
        output_subdir = os.path.join(output_dir, relative_dir)
        os.makedirs(output_subdir, exist_ok=True)

        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]  # stem, for log lines only
        output_file = os.path.join(output_subdir, f'{pdf_filename}.pageindex.json')

        # Skip if already processed (unless force is set). Skipped files do
        # not trigger the delay below: no API call was made.
        if os.path.exists(output_file) and not force:
            print(f"[{i}/{total}] Skipping (already exists): {pdf_name}")
            continue

        print(f"\n[{i}/{total}] Processing: {pdf_name}")
        print(f" Source: {pdf_path}")
        try:
            toc_with_page_number = page_index_main(pdf_path, opt)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
            print(f" Saved: {output_file}")
            processed += 1
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            print(f" ERROR: {e}")
            failed.append((pdf_path, str(e)))

        # BUGFIX: the delay previously ran only on success, so a rate-limit
        # failure was followed immediately by the next API call. Sleep after
        # every *attempted* file instead (but not after the last one).
        if delay > 0 and i < total:
            print(f" Waiting {delay}s before next file...")
            time.sleep(delay)

    print(f"\n{'='*50}")
    print(f"Completed: {processed} files processed")
    if failed:
        print(f"Failed: {len(failed)} files")
        for path, error in failed:
            print(f" - {os.path.basename(path)}: {error}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Batch process PDFs to generate page indexes')
    parser.add_argument('--input_dir', type=str, required=True, help='Directory containing PDFs')
    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save indexes')
    parser.add_argument('--model', type=str, default='gpt-4o-mini', help='Model to use')
    parser.add_argument('--delay', type=int, default=0, help='Delay in seconds between files (for rate limits)')
    parser.add_argument('--force', action='store_true', help='Reprocess files even if output already exists')
    args = parser.parse_args()

    # IMPROVED: report a missing input directory as a CLI usage error
    # (usage text + exit code 2) instead of an unhandled ValueError traceback.
    if not os.path.isdir(args.input_dir):
        parser.error(f"Input directory not found: {args.input_dir}")

    process_all_pdfs(args.input_dir, args.output_dir, args.model, args.delay, args.force)