Skip to content

Commit 95a4699

Browse files
committed
fix: add next/prev links when merging utterances
1 parent ae8d2ff commit 95a4699

1 file changed

Lines changed: 19 additions & 1 deletion

File tree

src/merge_utterances.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import argparse
66
from pyriksdagen.utils import corpus_iterator
77
from tqdm import tqdm
8-
from pyriksdagen.utils import TEI_NS
8+
from pyriksdagen.utils import TEI_NS, XML_NS
99
from pyriksdagen.io import (
1010
parse_tei,
1111
write_tei
@@ -39,12 +39,30 @@ def remove_dead_nextprev_links(root):
3939
del elem.attrib["prev"]
4040
return root
4141

42+
def add_new_nextprev(root):
    """
    Chain the utterances of each speech together with ``next``/``prev`` links.

    Walks the direct children of every ``div`` found under every ``body``
    element. Each ``u`` element is linked to the ``u`` that immediately
    precedes it in the same chain: the earlier element gets ``next`` set to
    the later element's ``xml:id`` and the later element gets ``prev`` set to
    the earlier element's ``xml:id``. A ``note`` element with
    ``type="speaker"`` ends the current chain, so utterances on either side
    of it are never linked to each other. Other elements are ignored.

    Args:
        root: root element of a parsed document; modified in place.

    Returns:
        The same ``root`` element, with the link attributes added.

    Raises:
        KeyError: if a ``u`` element that has to be linked has no ``xml:id``.
    """
    id_key = f"{XML_NS}id"
    for body in root.findall(f".//{TEI_NS}body"):
        for div in body.findall(f".//{TEI_NS}div"):
            previous_u = None
            for elem in div:
                # Comments and processing instructions expose a callable as
                # ``tag``; they carry no links and must not break the chain.
                if not isinstance(elem.tag, str):
                    continue
                local_name = elem.tag.split("}")[-1]
                if local_name == "u":
                    if previous_u is not None:
                        previous_u.attrib["next"] = elem.attrib[id_key]
                        elem.attrib["prev"] = previous_u.attrib[id_key]
                    # Always advance, otherwise a third utterance would be
                    # linked back to the first one instead of the second.
                    previous_u = elem
                elif local_name == "note" and elem.attrib.get("type") == "speaker":
                    # A speaker introduction starts a new speech.
                    previous_u = None
    return root
58+
4259
def main(args):
    """
    Merge utterances and rebuild next/prev links in every selected protocol.

    Collects the "prot" paths yielded by ``corpus_iterator`` for
    ``args.records_folder`` between ``args.start`` and ``args.end``, and
    processes them in sorted order. Each file is parsed, passed through
    ``merge_us``, ``remove_dead_nextprev_links`` and ``add_new_nextprev`` in
    that order, and handed to ``write_tei`` with the path it was read from.
    """
    protocol_paths = sorted(
        corpus_iterator(
            "prot",
            args.records_folder,
            start=args.start,
            end=args.end,
        )
    )
    for protocol_path in tqdm(protocol_paths):
        root, _ns = parse_tei(protocol_path)
        # Order matters: merge first, drop links to vanished ids, then relink.
        for transform in (merge_us, remove_dead_nextprev_links, add_new_nextprev):
            root = transform(root)
        write_tei(root, protocol_path)
4967

5068

0 commit comments

Comments
 (0)