From 8cd87d0dc15b063ecc013b4779e1d2faac1728dd Mon Sep 17 00:00:00 2001 From: ninpnin Date: Tue, 10 Mar 2026 15:13:11 +0100 Subject: [PATCH 1/2] fix: remove dead next/prev links when merging utterances --- src/merge_utterances.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/merge_utterances.py b/src/merge_utterances.py index 85c3674..f5ed4fd 100644 --- a/src/merge_utterances.py +++ b/src/merge_utterances.py @@ -3,9 +3,13 @@ """ from lxml import etree import argparse -from pyriksdagen.utils import protocol_iterators +from pyriksdagen.utils import corpus_iterator from tqdm import tqdm from pyriksdagen.utils import TEI_NS +from pyriksdagen.io import ( + parse_tei, + write_tei +) def merge_us(root): for body in root.findall(f".//{TEI_NS}body"): @@ -23,18 +27,26 @@ def merge_us(root): previous_u = None return root +def remove_dead_nextprev_links(root): + all_ids = set(root.xpath('//@xml:id', namespaces={"xml": "http://www.w3.org/XML/1998/namespace"})) + for body in root.findall(f".//{TEI_NS}body"): + for div in body.findall(f".//{TEI_NS}div"): + for elem in div: + if elem.tag.split("}")[-1] == "u": + if elem.attrib.get("next") not in all_ids and "next" in elem.attrib: + del elem.attrib["next"] + if elem.attrib.get("prev") not in all_ids and "prev" in elem.attrib: + del elem.attrib["prev"] + return root + def main(args): - protocols = sorted(list(protocol_iterators(args.records_folder, start=args.start, end=args.end))) + protocols = sorted(list(corpus_iterator("prot", args.records_folder, start=args.start, end=args.end))) parser = etree.XMLParser(remove_blank_text=True) for p in tqdm(protocols): - root = etree.parse(p, parser).getroot() + root, ns = parse_tei(p) root = merge_us(root) - - b = etree.tostring( - root, pretty_print=True, encoding="utf-8", xml_declaration=True - ) - with open(p, "wb") as f: - f.write(b) + root = remove_dead_nextprev_links(root) + write_tei(root, p) if __name__ == "__main__": From ae8d2ff5c57eb5a3a15a64a093a880b151763ba1 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Tue, 10 Mar 2026 15:14:25 +0100 Subject: [PATCH 2/2] refactor: remove bloat --- src/merge_utterances.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/merge_utterances.py b/src/merge_utterances.py index f5ed4fd..f461c62 100644 --- a/src/merge_utterances.py +++ b/src/merge_utterances.py @@ -41,7 +41,6 @@ def remove_dead_nextprev_links(root): def main(args): protocols = sorted(list(corpus_iterator("prot", args.records_folder, start=args.start, end=args.end))) - parser = etree.XMLParser(remove_blank_text=True) for p in tqdm(protocols): root, ns = parse_tei(p) root = merge_us(root)