Skip to content

Commit 0e949e0

Browse files
committed
feat: add script
1 parent d3970f0 commit 0e949e0

1 file changed

Lines changed: 82 additions & 0 deletions

File tree

src/cur-prot/src-info-to-header.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/usr/bin/env python3
2+
from lxml import etree
3+
from pyriksdagen.args import (
4+
fetch_parser,
5+
impute_args,
6+
)
7+
from pyriksdagen.io import (
8+
parse_tei,
9+
write_tei
10+
)
11+
from pyriksdagen.utils import (
12+
elem_iter,
13+
get_formatted_uuid,
14+
)
15+
from tqdm import tqdm
16+
from trainerlog import get_logger
17+
import os
18+
19+
20+
21+
22+
# Module-wide logger; verbosity is taken from the LOGLEVEL environment
# variable and is None when that variable is unset.
logger = get_logger(
    name="Trainer Log",
    level=os.environ.get("LOGLEVEL"),
)
23+
24+
25+
def get_source(py):
    """
    Classify a record's origin from its year label.

    Args:
        py: year label as a string (presumably the parliamentary-year
            directory name, e.g. "199495" -- confirm against callers).

    Returns:
        "OCR" when ``py`` compares less than or equal to "199495",
        otherwise "digital-origin".
    """
    # Plain string comparison, i.e. lexicographic, not numeric.
    last_ocr_year = "199495"
    return "OCR" if py <= last_ocr_year else "digital-origin"
30+
31+
32+
def add_source(root, source, ns):
    """
    Record the origin of a document in its TEI header.

    A ``textClass`` element containing a ``classCode`` element whose text is
    ``source`` is appended to the document's ``profileDesc``. When no
    ``profileDesc`` exists, one is created as a child of ``teiHeader``.

    Args:
        root: root element of the parsed document.
        source: text stored in the new ``classCode`` element.
        ns: mapping whose ``"tei_ns"`` value is prepended to tag names for
            lookups (presumably of the form ``"{namespace-uri}"``).

    Returns:
        The same ``root`` object, modified in place.

    Raises:
        TypeError: from ``etree.SubElement`` when neither a ``profileDesc``
            nor a ``teiHeader`` is found, as there is no parent to attach to.
    """
    tei_ns = ns["tei_ns"]
    tei_header = root.find(f"{tei_ns}teiHeader")

    # Explicit None checks instead of assert/bare-except: asserts are removed
    # under ``python -O`` and a bare except hides unrelated errors.
    profile_desc = root.find(f"{tei_ns}profileDesc")
    if profile_desc is None and tei_header is not None:
        # New profileDesc elements are created under teiHeader, so look there
        # as well; otherwise an existing one is never reused and a duplicate
        # gets appended.
        profile_desc = tei_header.find(f"{tei_ns}profileDesc")

    if profile_desc is None:
        logger.debug("Creating profileDesc elem")
        profile_desc = etree.SubElement(tei_header, "profileDesc")
    else:
        logger.debug("profileDesc elem found")

    # New elements are created with plain (unqualified) tag names.
    text_class = etree.SubElement(profile_desc, "textClass")
    class_code = etree.SubElement(text_class, "classCode")
    class_code.text = source
    return root
45+
46+
47+
def add_url(root, url, ns):
    """
    Attach a source URL to the document's bibliographic description.

    An ``idno`` element with ``type="URI"`` and ``subtype="PDF"`` and text
    ``url`` is appended to the first ``sourceDesc/bibl`` element found
    anywhere below ``root``. When no such element exists, a critical message
    is logged and the tree is left untouched.

    Args:
        root: root element of the parsed document.
        url: text stored in the new ``idno`` element.
        ns: mapping whose ``"tei_ns"`` value is prepended to tag names for
            lookups (presumably of the form ``"{namespace-uri}"``).

    Returns:
        The same ``root`` object, modified in place when ``bibl`` was found.
    """
    tei_ns = ns["tei_ns"]
    bibl = root.find(f".//{tei_ns}sourceDesc/{tei_ns}bibl")

    # Explicit None check instead of assert/bare-except: under ``python -O``
    # the assert disappears and the insertion would run with bibl=None.
    if bibl is None:
        logger.critical("No sourceDesc/bibl elem found.")
        return root

    idno = etree.SubElement(bibl, "idno")
    idno.set("type", "URI")
    idno.set("subtype", "PDF")
    idno.text = url
    return root
59+
60+
61+
62+
63+
def main(args):
    """
    Add origin and PDF-URL metadata to every record listed in ``args.records``.

    For each record path the origin label and URL are derived from the last
    two '/'-separated path components, both are inserted into the parsed
    document, and the document is written back to the same path.

    Args:
        args: object with a ``records`` attribute holding an iterable of
            record file paths using '/' as separator.
    """
    url_base = "https://swerik-project.github.io/riksdagen-records-pdf"
    for record_path in tqdm(args.records):
        logger.debug(record_path)

        # Second-to-last component is the year directory, last is the file.
        path_parts = record_path.split('/')
        py, file_name = path_parts[-2], path_parts[-1]
        # Drop the final four characters (presumably the ".xml" extension).
        record_base = file_name[:-4]

        source = get_source(py)
        url = f"{url_base}/{py}/{record_base}"

        root, ns = parse_tei(record_path)
        root = add_source(root, source, ns)
        root = add_url(root, url, ns)
        write_tei(root, record_path)
76+
77+
78+
79+
80+
if __name__ == "__main__":
    # Build the CLI parser for the "records" target, parse the command line,
    # let impute_args post-process the parsed arguments, then run the script.
    # NOTE(review): no module docstring is visible in this file, so __doc__
    # is presumably None here -- confirm whether a description is wanted.
    parser = fetch_parser("records", docstring=__doc__)
    parsed_args = parser.parse_args()
    main(impute_args(parsed_args))

0 commit comments

Comments
 (0)