Skip to content

Commit a8384bf

Browse files
committed
feat: ocr script
1 parent 4b51a27 commit a8384bf

1 file changed

Lines changed: 269 additions & 0 deletions

File tree

src/ocr/ocr.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
#!/usr/bin/env python3
2+
"""
3+
OCR PDF files in a path. A path should be of the following format:
4+
5+
<basename>/
6+
|_ <path>/
7+
| |_ abc_1.pdf
8+
| |_ abc_2.pdf
9+
10+
The output will be
11+
12+
<altopath>/
13+
|_ <path>/
14+
| |_ abc_1/
15+
| |_ abc_1_0001.xml
16+
| |_ abc_1_0002.xml
17+
| |_ abc_2/
18+
| |_ abc_2_0001.xml
19+
| |_ abc_2_0002.xml
20+
"""
21+
from glob import glob
22+
from pypdf import PdfWriter, PdfReader
23+
from pyriksdagen.utils import get_data_location
24+
from tqdm import tqdm
25+
from trainerlog import get_logger
26+
import argparse, cv2
27+
import numpy as np
28+
import os, pytesseract, subprocess
29+
import shutil
30+
31+
32+
33+
logger = get_logger("OCR")

# Environment-variable prefix per document type; main() appends "_PDF" or
# "_ALTO" to it when looking up the data locations.
envs = dict(prot="RECORDS", mot="MOTIONS")

# Directory-name component per document type, used by main() to build the
# fallback locations "riksdagen-<component>-pdf/data" and
# "riksdagen-<component>-alto/data".
dflt_paths = dict(prot="records", mot="motions")
42+
43+
44+
45+
46+
def write(img, outpath):
    """
    Save an image to disk.

    Args:
        img: image array handed to ``cv2.imwrite``.
        outpath: destination file path; the extension selects the format.
    """
    cv2.imwrite(outpath, img)
48+
49+
50+
def deskew(image, ipath, ibase):
    """
    Rotate ``image`` by an angle estimated from its non-zero pixels.

    This is unpredictable (turns images 90º) & not called from anywhere.

    Args:
        image: image array; every pixel > 0 contributes to the estimate.
        ipath: directory the intermediate image is written to.
        ibase: base file name (without extension) of the page image.

    Returns:
        Tuple ``(rotated_image, ipath, ibase)``.
    """
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this correction presumably assumes minAreaRect reports
    # angles in [-90, 0); if the installed OpenCV uses another range that
    # may be the cause of the 90º turns -- confirm before reviving this.
    if angle < -45:
        angle = -(90 + angle)
    height, width = image.shape[:2]
    center = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    # Join with a path separator: plain string concatenation dropped the
    # file next to (not inside) ``ipath`` when it had no trailing slash.
    write(rotated, os.path.join(ipath, f"{ibase}_3_deskew.jpg"))
    return rotated, ipath, ibase
66+
67+
68+
def get_grayscale(image, ipath, ibase):
    """
    Convert an image to grayscale and save a copy for inspection.

    Args:
        image: image array in the channel order ``cv2.COLOR_BGR2GRAY``
            expects.
        ipath: directory the intermediate image is written to.
        ibase: base file name (without extension) of the page image.

    Returns:
        Tuple ``(grayscale_image, ipath, ibase)`` so the result can be
        unpacked straight into the next preprocessing step.
    """
    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Join with a path separator: plain string concatenation dropped the
    # file next to (not inside) ``ipath`` when it had no trailing slash.
    write(img, os.path.join(ipath, f"{ibase}_1_grayscale.jpg"))
    return img, ipath, ibase
72+
73+
74+
def thresholding(image, ipath, ibase):
    """
    Binarise an image with Otsu's threshold and save a copy for inspection.

    Args:
        image: single-channel image array (``main`` feeds it the output
            of ``get_grayscale``).
        ipath: directory the intermediate image is written to.
        ibase: base file name (without extension) of the page image.

    Returns:
        Tuple ``(binary_image, ipath, ibase)``.
    """
    # cv2.threshold returns (threshold_used, image); only the image is kept.
    img = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # Join with a path separator: plain string concatenation dropped the
    # file next to (not inside) ``ipath`` when it had no trailing slash.
    write(img, os.path.join(ipath, f"{ibase}_2_thresholding.jpg"))
    return img, ipath, ibase
78+
79+
80+
def preprocess_img(img, ipath, ibase):
    """
    Run the preprocessing chain: grayscale conversion, then thresholding.

    Args:
        img: image array to preprocess.
        ipath: passed through to both steps as the intermediate-image path.
        ibase: passed through to both steps as the image base name.

    Returns:
        Whatever ``thresholding`` returns: ``(image, ipath, ibase)``.
    """
    gray, gray_path, gray_base = get_grayscale(img, ipath, ibase)
    return thresholding(gray, gray_path, gray_base)
82+
83+
84+
def extract_imgs(pdf, to):
    """
    Extract the images of every PDF found in ``to`` as PNG files.

    Existing ``*.png`` files in ``to`` are deleted first, then
    ``pdfimages -png`` is run once per ``*.pdf`` file in the directory,
    using the PDF's name without extension as the output root.

    Args:
        pdf: unused; kept so the signature matches ``extract_pages``.
        to: directory holding the PDFs; the PNGs are written here too.

    Returns:
        ``to``, unchanged.

    Raises:
        subprocess.CalledProcessError: if ``pdfimages`` exits non-zero.
    """
    # Remove PNGs from an earlier run so they are not picked up again.
    for stale in glob(f"{to}/*.png"):
        os.remove(stale)

    pages = [p for p in os.listdir(f"{to}") if p.endswith(".pdf")]
    for page in pages:
        try:
            # check=True is what makes a failing pdfimages raise; without
            # it the except clause below could never be reached.
            subprocess.run(
                ["pdfimages", "-png", f"{to}/{page}", f"{to}/{page[:-4]}"],
                stderr=subprocess.STDOUT,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            logger.critical(f"extract imgs error: {e}")
            raise
    return to
100+
101+
102+
def extract_pages(pdf, to):
    """
    Split ``pdf`` into one-page PDFs with Ghostscript, then extract images.

    Ghostscript renders each page at 300 dpi with the ``pdfimage24``
    device into ``<to>/<pdf name without extension>_NNNN.pdf``; the
    result is then handed to ``extract_imgs``.

    Args:
        pdf: path of the PDF to split.
        to: existing directory that receives the per-page files.

    Returns:
        The return value of ``extract_imgs(pdf, to)``.

    Raises:
        subprocess.CalledProcessError: if ``gs`` exits non-zero.
    """
    page_pattern = f'-sOutputFile={to}/{os.path.basename(pdf)[:-4]}_%04d.pdf'
    try:
        # check=True is what makes a failing gs raise; without it the
        # except clause below could never be reached.
        # For debugging, -dDEBUG, -dVERBOSE or -dPDFSTOPONERROR can be added.
        ok_code = subprocess.run(
            [
                "gs",
                "-dNOPAUSE",
                "-dNOPROMPT",
                "-dBATCH",
                "-dSAFER",
                "-sDEVICE=pdfimage24",
                "-r300",
                page_pattern,
                pdf,
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logger.critical(f"Extract pages error: {e}")
        raise
    else:
        logger.info(f"Pdf is paginated: {ok_code}")
    return extract_imgs(pdf, to)
125+
126+
127+
def ocr(img, outpath):
    """
    OCR ``img`` with the Swedish Tesseract model and save the ALTO XML.

    Args:
        img: image passed to ``pytesseract.image_to_alto_xml``.
        outpath: file that receives the result; it is opened in binary
            mode and truncated before the OCR runs.
    """
    with open(outpath, 'wb+') as alto_file:
        alto_xml = pytesseract.image_to_alto_xml(img, lang='swe')
        alto_file.write(alto_xml)
130+
131+
132+
133+
134+
def _default_root(doc_type, kind):
    """Return the env-var or built-in data root for ``doc_type``; ``kind`` is "pdf" or "alto"."""
    try:
        env_prefix = envs[doc_type]
        dir_part = dflt_paths[doc_type]
    except KeyError:
        raise KeyError(
            f"unknown or missing --doc-type {doc_type!r}; "
            f"expected one of {sorted(envs)}"
        ) from None
    return os.environ.get(
        f"{env_prefix}_{kind.upper()}",
        f"riksdagen-{dir_part}-{kind}/data")


def main(args):
    """
    Process every selected PDF: optionally repair, split and OCR it.

    For each PDF under ``<basename>/<args.path>`` the function creates a
    working directory named after the PDF next to it, and then, depending
    on the flags, repairs the PDF in place with ``mutool clean -gg``,
    splits it into page images, and writes one ALTO XML file per PNG to
    ``<altopath>/<year>/<doc_base>/``, where ``year`` is the name of the
    directory containing the PDF.

    Args:
        args: parsed command-line namespace. Reads ``basename``,
            ``altopath``, ``doc_type``, ``path``, ``docs``,
            ``clobber_dest``, ``repair_pdf``, ``split_pdf``,
            ``no_preprocessing`` and ``ocr``.

    Raises:
        KeyError: a default location is needed but ``doc_type`` is not a
            key of ``envs``.
        FileNotFoundError: an entry of ``args.docs`` matches no file.
    """
    # The original tested the module-level *function* ``ocr``, which is
    # always truthy, so the --ocr flag was ignored. A namespace without
    # the attribute keeps the old always-on behaviour.
    run_ocr = getattr(args, "ocr", True)

    basename = args.basename or _default_root(args.doc_type, "pdf")
    altopath = None
    if run_ocr:
        altopath = args.altopath or _default_root(args.doc_type, "alto")

    if args.docs:
        pdfs = []
        for doc in args.docs:
            matches = glob(f"{basename}/{args.path}/{doc}")
            if not matches:
                raise FileNotFoundError(
                    f"No file matches {basename}/{args.path}/{doc}")
            pdfs.append(matches[0])
    else:
        pdfs = glob(f"{basename}/{args.path}/*.pdf")

    for pidx, pdf in enumerate(pdfs, start=1):
        print(pidx, "of", len(pdfs), "::", pdf)
        year = os.path.basename(os.path.dirname(pdf))
        doc_base = os.path.basename(pdf)[:-4]

        # Working directory for page PDFs and images. The same path is
        # used below as the image directory, so the directory that gets
        # created is always the one that is read.
        page_dir = f"{os.path.dirname(pdf)}/{doc_base}"
        os.makedirs(page_dir, exist_ok=True)

        if run_ocr:
            alto_dir = f"{altopath}/{year}/{doc_base}"
            logger.info("Checking alto path...")
            os.makedirs(alto_dir, exist_ok=True)
            logger.info("... OK")

            if args.clobber_dest:
                logger.info("Clobbering alto path...")
                for entry in glob(f"{alto_dir}/*"):
                    try:
                        if os.path.isfile(entry):
                            os.remove(entry)
                        elif os.path.isdir(entry):
                            shutil.rmtree(entry)
                    except OSError as e:
                        # Best effort: a leftover file must not stop the run.
                        logger.warning(f"Failed to delete {entry}:\n{e}")
                logger.info("... OK")

        if args.repair_pdf:
            logger.info("repairing pdf")
            # Rewrites the PDF in place (input and output are the same file).
            ok_code = subprocess.run(["mutool", "clean", "-gg", pdf, pdf])
            logger.info(f"... done {ok_code}")

        if args.split_pdf:
            logger.info(f"Extracting pages from pdf {pidx} of {len(pdfs)} -- {pdf}")
            img_dir = extract_pages(pdf, page_dir)
            logger.info(" --> OK")
        else:
            img_dir = page_dir

        if not run_ocr:
            continue

        img_dir_fs = glob(f"{img_dir}/*.png")
        # Log the number of images, not the length of the directory string.
        logger.info(f"Iterating through {len(img_dir_fs)} images")
        for img_path in img_dir_fs:
            img_base = os.path.basename(img_path)[:-4]
            # Label used only in the log line; fall back to the file name
            # when the path contains no hyphen to split on.
            parts = img_path.split('-')
            iidx = parts[-2] if len(parts) > 1 else img_base
            logger.info(f" --> {img_path} -- pdf {pidx} / {len(pdfs)} : img {iidx} of {len(img_dir_fs)}")
            img = cv2.imread(f"{img_path}")
            if args.no_preprocessing:
                logger.info(" skipping preprocessing")
            else:
                logger.info(" preprocessing")
                img, _ipath, _ibase = preprocess_img(img, img_dir, img_base)
                logger.info(" |-> Done")
            alto_path = f"{altopath}/{year}/{doc_base}/{img_base}.xml"
            logger.info(" --> starting OCR")
            ocr(img, alto_path)
            logger.info(" |-> done.")
221+
222+
223+
224+
225+
if __name__ == '__main__':
    def _str2bool(v):
        """Parse a command-line boolean such as "yes", "0" or "false"."""
        if isinstance(v, bool):
            return v
        if v.lower() in ('true', '1', 't', 'y', 'yes'):
            return True
        elif v.lower() in ('false', '0', 'f', 'n', 'no'):
            return False
        else:
            raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-p', '--path',
                        required=True,
                        type=str,
                        help="Path of directory containing PDF files to OCR")
    parser.add_argument('-d', '--docs',
                        type=str,
                        nargs="+",
                        default=None,
                        help="List of specific docs, otherwise, all that match --doc-type in the path will be OCRed")
    parser.add_argument('-t', '--doc-type',
                        type=str,
                        help="Type of doc to OCR, e.g. mot, prop, prot.")
    parser.add_argument('-b', '--basename',
                        type=str,
                        help="Base path to pdf repo. If unset, looks for environment variable based on the doc-type, otherwise defualts to, e.g. riksdagen-records-pdf/data")
    parser.add_argument("-a", "--altopath",
                        type=str,
                        help="Path to alto files repo. If unset, looks for environment variable based on the doc-type, otherwise defualts to, e.g. riksdagen-records-alto/data")
    parser.add_argument("-s", "--split-pdf",
                        action='store_true',
                        help="Split up multipage pdfs --> one page per file.")
    # type=bool would turn every non-empty string (even "false") into True,
    # so the flag could never be switched off; parse it like the other
    # boolean options. The default stays True: preprocessing is skipped
    # unless "-P false" is given.
    parser.add_argument('-P', '--no-preprocessing',
                        type=_str2bool,
                        default=True,
                        help="skip preprocessing steps")
    parser.add_argument("-c", "--clobber-dest",
                        action='store_true',
                        help="remove everything from the destination directory before ocr")
    parser.add_argument("-r", "--repair-pdf", type=_str2bool, default=True)
    parser.add_argument("--ocr", type=_str2bool, default=True)
    args = parser.parse_args()
    main(args)

0 commit comments

Comments
 (0)