|
| 1 | +#!/usr/bin/env python3 |
| 2 | +from lxml import etree |
| 3 | +from pyriksdagen.args import ( |
| 4 | + fetch_parser, |
| 5 | + impute_args, |
| 6 | +) |
| 7 | +from pyriksdagen.io import ( |
| 8 | + parse_tei, |
| 9 | + write_tei |
| 10 | +) |
| 11 | +from pyriksdagen.utils import ( |
| 12 | + elem_iter, |
| 13 | + get_formatted_uuid, |
| 14 | +) |
| 15 | +from tqdm import tqdm |
| 16 | +from trainerlog import get_logger |
| 17 | +import os |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
# Module-wide logger. The level is read from the LOGLEVEL environment
# variable; when it is unset, None is passed to get_logger.
logger = get_logger(
    name="Trainer Log",
    level=os.environ.get("LOGLEVEL"),
)
| 23 | + |
| 24 | + |
def get_source(py):
    """
    Return the source label for a year identifier.

    Args:
        py: year identifier string (in `main`, the name of the record's
            parent directory). It is compared as a string, i.e.
            lexicographically, against the cut-off "199495".

    Returns:
        "OCR" when `py` sorts at or before "199495", otherwise
        "digital-origin".
    """
    last_ocr_year = "199495"
    return "OCR" if py <= last_ocr_year else "digital-origin"
| 30 | + |
| 31 | + |
def add_source(root, source, ns):
    """
    Record the source label of a document in its TEI header.

    Appends a new textClass element containing a classCode element whose
    text is `source` to the document's profileDesc. When the document has
    no profileDesc, one is created as a child of teiHeader first.

    Args:
        root: root element of the parsed document (lxml element).
        source: text stored in the new classCode element.
        ns: namespace mapping; ns["tei_ns"] is prepended to tag names when
            searching the tree.

    Returns:
        The same `root`, modified in place. If neither a profileDesc nor a
        teiHeader can be found, a critical message is logged and `root` is
        returned unchanged.
    """
    tei = ns["tei_ns"]

    # Search descendants rather than direct children only: profileDesc is
    # created below teiHeader, so a child-only lookup on root would miss an
    # existing one and append a duplicate on every run.
    profileDesc = root.find(f".//{tei}profileDesc")
    if profileDesc is not None:
        logger.debug("profileDesc elem found")
    else:
        teiHeader = root.find(f"{tei}teiHeader")
        if teiHeader is None:
            # Explicit check instead of letting SubElement(None, ...) fail
            # with an opaque TypeError.
            logger.critical("No teiHeader elem found.")
            return root
        logger.debug("Creating profileDesc elem")
        profileDesc = etree.SubElement(teiHeader, "profileDesc")

    textClass = etree.SubElement(profileDesc, "textClass")
    classCode = etree.SubElement(textClass, "classCode")
    classCode.text = source
    return root
| 45 | + |
| 46 | + |
def add_url(root, url, ns):
    """
    Attach a PDF URL to the document's source description.

    Looks up the first bibl element that is a child of a sourceDesc element
    anywhere below `root` and appends an idno element with type="URI" and
    subtype="PDF" whose text is `url`.

    Args:
        root: root element of the parsed document (lxml element).
        url: text stored in the new idno element.
        ns: namespace mapping; ns["tei_ns"] is prepended to tag names when
            searching the tree.

    Returns:
        The same `root`, modified in place. If no sourceDesc/bibl element
        exists, a critical message is logged and `root` is returned
        unchanged.
    """
    tei = ns["tei_ns"]
    bibl = root.find(f".//{tei}sourceDesc/{tei}bibl")

    # Plain None check instead of assert/bare-except: asserts are removed
    # under `python -O`, which would let a missing bibl reach SubElement.
    if bibl is None:
        logger.critical("No sourceDesc/bibl elem found.")
        return root

    idno = etree.SubElement(bibl, "idno")
    idno.set("type", "URI")
    idno.set("subtype", "PDF")
    idno.text = url
    return root
| 59 | + |
| 60 | + |
| 61 | + |
| 62 | + |
def main(args):
    """
    Add a source label and a PDF URL to every record and rewrite it in place.

    For each path in `args.records` the parent directory name and the file
    name are taken from the '/'-separated path. The last four characters of
    the file name are dropped (presumably the ".xml" extension -- confirm
    against the caller). The document is then parsed, updated by
    `add_source` and `add_url`, and written back to the same path.

    Args:
        args: object with a `records` attribute holding an iterable of
            record file paths containing at least one '/'.
    """
    url_base = "https://swerik-project.github.io/riksdagen-records-pdf"
    for record in tqdm(args.records):
        logger.debug(record)

        # Only the last two path components are needed.
        path_parts = record.rsplit('/', 2)
        py = path_parts[-2]
        filename = path_parts[-1]
        record_base = filename[:-4]

        url = "/".join((url_base, py, record_base))
        source = get_source(py)

        root, ns = parse_tei(record)
        root = add_source(root, source, ns)
        root = add_url(root, url, ns)
        write_tei(root, record)
| 76 | + |
| 77 | + |
| 78 | + |
| 79 | + |
if __name__ == "__main__":
    # Build the "records" argument parser, parse the command line, let
    # impute_args post-process the parsed arguments, then run the script.
    # NOTE(review): no module docstring is visible in this file, so __doc__
    # appears to be None here -- confirm whether a description is wanted.
    cli = fetch_parser("records", docstring=__doc__)
    parsed = cli.parse_args()
    main(impute_args(parsed))
0 commit comments