-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunking.py
More file actions
38 lines (32 loc) · 1.1 KB
/
chunking.py
File metadata and controls
38 lines (32 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from openai import OpenAI
import json
from typing import List
from tqdm import tqdm
import PyPDF2
import sys
import subprocess
def install_requirements():
    """Install the packages listed in ``requirements.txt`` with pip.

    pip is invoked through the interpreter currently running this script
    (``sys.executable -m pip``). ``requirements.txt`` is a relative path, so
    it is resolved against the current working directory.

    A failing pip run is reported on stdout and otherwise ignored; the
    function returns ``None`` either way.
    """
    pip_command = [
        sys.executable,
        "-m",
        "pip",
        "install",
        "-r",
        "requirements.txt",
    ]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        # Best effort: report the non-zero pip exit status and carry on.
        print(f"failed to install packages: {e}")
# Install dependencies from requirements.txt every time this module is executed.
# NOTE(review): the third-party imports at the top of the file (openai, tqdm,
# PyPDF2) run before this call, so a missing package raises ImportError before
# the install gets a chance to run — confirm whether that ordering is intended.
install_requirements()
def process_text(text: str, chunk_size: int = 40) -> List[str]:
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        The chunks in order. Every chunk has exactly ``chunk_size`` characters
        except possibly the last, which holds the remainder. An empty *text*
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step silently produces no chunks; reject both up front.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    print(f"created {len(text_chunks)} chunks")
    # The caller stores this result, so the chunks must actually be returned.
    return text_chunks
def extract_text_from_pdf(file_path) -> str:
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read; the file is opened in binary mode.

    Returns:
        The extracted text of all pages, joined in page order with no
        separator between pages.
    """
    # The context manager closes the file even if parsing or extraction raises.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # ``or ''`` guards against a page yielding None instead of a string
        # (presumably possible on some PyPDF2 versions; harmless otherwise).
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
# Script body: the PDF path is expected as the first command-line argument.
if not sys.argv[1:]:
    print("please provide the pdf file name as a commandline argument.")
    sys.exit(1)

pdf_name = sys.argv[1]

# Extract the whole document, then show the first 1000 characters as a preview.
text = extract_text_from_pdf(pdf_name)
print(text[:1000])

# Chunk the extracted text and keep the result under the "responses" key.
responses = dict(responses=process_text(text))