diff --git a/src/merge_utterances.py b/src/merge_utterances.py
index 85c3674..f461c62 100644
--- a/src/merge_utterances.py
+++ b/src/merge_utterances.py
@@ -3,9 +3,13 @@
 """
 from lxml import etree
 import argparse
-from pyriksdagen.utils import protocol_iterators
+from pyriksdagen.utils import corpus_iterator
 from tqdm import tqdm
 from pyriksdagen.utils import TEI_NS
+from pyriksdagen.io import (
+    parse_tei,
+    write_tei
+)
 
 def merge_us(root):
     for body in root.findall(f".//{TEI_NS}body"):
@@ -23,18 +27,47 @@ def merge_us(root):
                     previous_u = None
     return root
 
+def remove_dead_nextprev_links(root):
+    """
+    Remove dead `next`/`prev` links from utterances.
+
+    A link is dead when its value is not the xml:id of any element in
+    the document. Only `u` elements that are direct children of a
+    `div` inside `body` are inspected; root is modified in place.
+
+    Args:
+        root: root element of the parsed TEI document
+
+    Returns:
+        the same root element
+    """
+    xml_ns = {"xml": "http://www.w3.org/XML/1998/namespace"}
+    all_ids = set(root.xpath('//@xml:id', namespaces=xml_ns))
+    for body in root.findall(f".//{TEI_NS}body"):
+        for div in body.findall(f".//{TEI_NS}div"):
+            for elem in div:
+                # Comments and processing instructions have a non-string
+                # tag in lxml; skip them instead of crashing on .split()
+                if not isinstance(elem.tag, str):
+                    continue
+                if elem.tag.split("}")[-1] != "u":
+                    continue
+                for link in ("next", "prev"):
+                    if link in elem.attrib and elem.attrib[link] not in all_ids:
+                        del elem.attrib[link]
+    return root
+
 def main(args):
-    protocols = sorted(list(protocol_iterators(args.records_folder, start=args.start, end=args.end)))
-    parser = etree.XMLParser(remove_blank_text=True)
+    """
+    Run merge_us and remove_dead_nextprev_links on every protocol that
+    corpus_iterator yields, writing each result back to the same path.
+    """
+    protocols = sorted(corpus_iterator("prot", args.records_folder, start=args.start, end=args.end))
     for p in tqdm(protocols):
-        root = etree.parse(p, parser).getroot()
+        root, _ns = parse_tei(p)  # second value returned by parse_tei is unused here
         root = merge_us(root)
-
-        b = etree.tostring(
-            root, pretty_print=True, encoding="utf-8", xml_declaration=True
-        )
-        with open(p, "wb") as f:
-            f.write(b)
+        root = remove_dead_nextprev_links(root)
+        write_tei(root, p)
 
 
 if __name__ == "__main__":