From 4396c8efa818777d211b5d12ecccaf324451b031 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 19:33:31 -0700 Subject: [PATCH 1/5] move marker reconstruction into the compiler --- opensiddur/exporter/compiler.py | 4 +- opensiddur/exporter/external_compiler.py | 34 +- opensiddur/exporter/inline_compiler.py | 2 +- opensiddur/exporter/tex/marker_reconstruct.py | 295 ------------------ opensiddur/exporter/tex/xelatex.py | 9 +- .../tests/exporter/test_marker_reconstruct.py | 6 +- .../tests/exporter/test_parallel_e2e.py | 31 +- 7 files changed, 40 insertions(+), 341 deletions(-) delete mode 100644 opensiddur/exporter/tex/marker_reconstruct.py diff --git a/opensiddur/exporter/compiler.py b/opensiddur/exporter/compiler.py index 51f73ef..f0ddbcf 100644 --- a/opensiddur/exporter/compiler.py +++ b/opensiddur/exporter/compiler.py @@ -24,14 +24,12 @@ from lxml.etree import ElementBase from lxml import etree +from opensiddur.exporter.constants import JLPTEI_NAMESPACE, PROCESSING_NAMESPACE from opensiddur.exporter.linear import LinearData, get_linear_data from opensiddur.exporter.refdb import ReferenceDatabase from opensiddur.exporter.settings import load_default_settings, load_settings from opensiddur.exporter.urn import ResolvedUrnRange, UrnResolver -JLPTEI_NAMESPACE = 'http://jewishliturgy.org/ns/jlptei/2' -PROCESSING_NAMESPACE = 'http://jewishliturgy.org/ns/processing' - class _ProcessingCommand(Enum): """ Possible ways the compiler can process an element """ # copy the element and recurse into its children, copying its text content diff --git a/opensiddur/exporter/external_compiler.py b/opensiddur/exporter/external_compiler.py index 0c1fd44..2e2e976 100644 --- a/opensiddur/exporter/external_compiler.py +++ b/opensiddur/exporter/external_compiler.py @@ -7,27 +7,26 @@ from opensiddur.exporter.compiler import ( CompilerProcessor, - JLPTEI_NAMESPACE, - PROCESSING_NAMESPACE, _ProcessingCommand, _ProcessingContext, _AnnotationCommand, ) +from opensiddur.exporter.constants import ( + JLPTEI_NAMESPACE, + PROCESSING_NAMESPACE, + STRUCTURAL_BLOCKS, + TEI_NS, + XML_NS, +) from opensiddur.exporter.linear import LinearData from opensiddur.exporter.refdb import ReferenceDatabase from opensiddur.exporter.urn import ResolvedUrnRange, UrnResolver from lxml import etree -TEI_NS = "http://www.tei-c.org/ns/1.0" -XML_NS = "http://www.w3.org/XML/1998/namespace" - -STRUCTURAL_BLOCKS = frozenset({ - f"{{{TEI_NS}}}div", - f"{{{TEI_NS}}}p", - f"{{{TEI_NS}}}ab", - f"{{{TEI_NS}}}lg", - f"{{{TEI_NS}}}l", -}) +from opensiddur.exporter.marker_reconstruct import ( + doc_needs_marker_reconstruction, + reconstruct_markered_document, +) def _attrs_structural_original(source: ElementBase) -> dict[str, str]: @@ -761,8 +760,14 @@ def process(self, root: Optional[ElementBase] = None) -> list[ElementBase]: # Root parallel trigger is_root = len(self.linear_data.processing_context) == 0 + def _reconstruct_if_needed(processed: list[ElementBase]) -> None: + if processed and doc_needs_marker_reconstruction(processed[0]): + reconstruct_markered_document(processed[0]) + if is_root and self.linear_data.parallel_projects and not self._in_parallel_compilation: - return self._process_parallel_root() + processed = self._process_parallel_root() + _reconstruct_if_needed(processed) + return processed # set the root language to the language of the deepest common ancestor if present, else root self.root_language = self._get_in_scope_language( @@ -790,4 +795,7 @@ def process(self, root: Optional[ElementBase] = None) -> list[ElementBase]: if self.from_start is None and processed: self._mark_file_source(processed[0]) + if is_root: + _reconstruct_if_needed(processed) + return processed diff --git a/opensiddur/exporter/inline_compiler.py b/opensiddur/exporter/inline_compiler.py index 40810ae..a6a433f 100644 --- a/opensiddur/exporter/inline_compiler.py +++ b/opensiddur/exporter/inline_compiler.py @@ -5,11 +5,11 @@ from opensiddur.exporter.compiler import ( CompilerProcessor, - PROCESSING_NAMESPACE, _ProcessingCommand, _ProcessingContext, _AnnotationCommand, ) +from opensiddur.exporter.constants import PROCESSING_NAMESPACE from opensiddur.exporter.external_compiler import ExternalCompilerProcessor from opensiddur.exporter.linear import LinearData from opensiddur.exporter.refdb import ReferenceDatabase diff --git a/opensiddur/exporter/tex/marker_reconstruct.py b/opensiddur/exporter/tex/marker_reconstruct.py deleted file mode 100644 index 6bc4037..0000000 --- a/opensiddur/exporter/tex/marker_reconstruct.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -Reconstruct flattened p:start / p:suspend / p:resume / p:end streams inside -parallel export columns into nested TEI, then prune empty segments and set p:part. -""" - -from __future__ import annotations - -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any - -from lxml import etree - -from opensiddur.exporter.external_compiler import PROCESSING_NAMESPACE, STRUCTURAL_BLOCKS - -_P_START = f"{{{PROCESSING_NAMESPACE}}}start" -_P_END = f"{{{PROCESSING_NAMESPACE}}}end" -_P_SUSPEND = f"{{{PROCESSING_NAMESPACE}}}suspend" -_P_RESUME = f"{{{PROCESSING_NAMESPACE}}}resume" -_P_LOGICAL = f"{{{PROCESSING_NAMESPACE}}}logical-id" -_P_PART = f"{{{PROCESSING_NAMESPACE}}}part" -_PARALLEL_ITEM = f"{{{PROCESSING_NAMESPACE}}}parallelItem" -_PARALLEL = f"{{{PROCESSING_NAMESPACE}}}parallel" - - -def _structural_marker_map(el: etree.ElementBase) -> dict[str, str]: - out = {} - for key, pname in ( - (_P_START, "start"), - (_P_END, "end"), - (_P_SUSPEND, "suspend"), - (_P_RESUME, "resume"), - ): - if val := el.get(key): - out[pname] = val - return out - - -def substantive_content(el: etree.ElementBase) -> bool: - def walk(x: etree.ElementBase) -> bool: - if (x.text or "").strip(): - return True - for c in x: - if walk(c): - return True - if (c.tail or "").strip(): - return True - return False - - return walk(el) - - -@dataclass -class _Frame: - pid: str - tag: str - attrs: dict[str, str] - buffer: list[etree.ElementBase] = field(default_factory=list) - #: Text serialized before structural children (marker text/tails from compiler) - text_chunks: list[str] = field(default_factory=list) - - -def _absorb_marker_strings(frame: _Frame, el: etree.ElementBase) -> None: - if el.text: - frame.text_chunks.append(el.text) - if el.tail: - frame.text_chunks.append(el.tail) - - -def _carrier_attrs_from_marker_el(el: etree.ElementBase) -> dict[str, str]: - p_pref = f"{{{PROCESSING_NAMESPACE}}}" - xml_id_key = "{http://www.w3.org/XML/1998/namespace}id" - return { - k: v - for k, v in el.attrib.items() - if k != xml_id_key and not k.startswith(p_pref) - } - - -def _new_wrapped_segment( - tag: str, - attrs: dict[str, str], - children: list[etree.ElementBase], - *, - logical_id: str | None, - leading_text_chunks: list[str], -) -> etree.ElementBase: - nsmap = dict(children[0].nsmap) if children else {} - wrapped = etree.Element(tag, nsmap=nsmap) if nsmap else etree.Element(tag) - for k, v in attrs.items(): - wrapped.set(k, v) - prefix = "".join(leading_text_chunks) - leading_text_chunks.clear() - if prefix: - wrapped.text = prefix - for c in children: - wrapped.append(c) - - marker_keys = (_P_START, _P_SUSPEND, _P_RESUME, _P_END, _P_LOGICAL, _P_PART) - for mk in marker_keys: - if mk in wrapped.attrib: - del wrapped.attrib[mk] - - if logical_id: - wrapped.set(_P_LOGICAL, logical_id) - return wrapped - - -def _close_open_segment( - stack: list[_Frame], - fragments: list[etree.ElementBase], - pid_state: dict[str, dict[str, Any]], - *, - pid: str, - kind: str, -) -> None: - if not stack: - raise ValueError(f"p:{kind} for {pid=} with empty reconstruction stack") - - frame = stack.pop() - if frame.pid != pid: - raise ValueError(f"p:{kind} expected top frame pid={pid}, got {frame.pid}") - - st = pid_state.setdefault(pid, {}) - suspended_before = st.get("suspended", False) - logical_id: str | None = None - - if kind == "suspend": - st["suspended"] = True - logical_id = pid - elif kind == "end": - if suspended_before: - logical_id = pid - pid_state.pop(pid, None) - else: - raise ValueError(kind) - - wrapped = _new_wrapped_segment( - frame.tag, - frame.attrs, - frame.buffer, - logical_id=logical_id, - leading_text_chunks=frame.text_chunks, - ) - - if stack: - stack[-1].buffer.append(wrapped) - else: - fragments.append(wrapped) - - -def _move_plain_content( - el: etree.ElementBase, - stack: list[_Frame], - fragments: list[etree.ElementBase], -) -> None: - if stack: - stack[-1].buffer.append(el) - else: - fragments.append(el) - - -def reconstruct_parallel_item( - pi: etree.ElementBase, - pid_state: defaultdict[str, dict[str, Any]], -) -> None: - """Rebuild pi's direct linear stream into nested TEI fragments (mutating pi).""" - fragments: list[etree.ElementBase] = [] - stack: list[_Frame] = [] - - while len(pi) > 0: - el = pi[0] - pi.remove(el) - - mmap = _structural_marker_map(el) - - if el.tag not in STRUCTURAL_BLOCKS: - _move_plain_content(el, stack, fragments) - continue - - if not mmap: - _move_plain_content(el, stack, fragments) - continue - - if "start" in mmap: - pid = mmap["start"] - fr = _Frame(pid, el.tag, _carrier_attrs_from_marker_el(el), []) - _absorb_marker_strings(fr, el) - stack.append(fr) - continue - - if "resume" in mmap: - pid = mmap["resume"] - fr = _Frame(pid, el.tag, _carrier_attrs_from_marker_el(el), []) - _absorb_marker_strings(fr, el) - stack.append(fr) - continue - - if "suspend" in mmap: - _close_open_segment(stack, fragments, pid_state, pid=mmap["suspend"], kind="suspend") - continue - - if "end" in mmap: - _close_open_segment(stack, fragments, pid_state, pid=mmap["end"], kind="end") - continue - - _move_plain_content(el, stack, fragments) - - if stack: - raise ValueError(f"unclosed structural frames remain in parallelItem: {[f.pid for f in stack]}") - - for frag in fragments: - pi.append(frag) - - -def _collect_logical_buckets(root: etree.ElementBase) -> dict[str, list[etree.ElementBase]]: - buckets: dict[str, list[etree.ElementBase]] = defaultdict(list) - for el in root.iter(): - lid = el.get(_P_LOGICAL) - if lid: - buckets[lid].append(el) - return buckets - - -def normalize_segment_parts(root: etree.ElementBase) -> None: - buckets = _collect_logical_buckets(root) - stray_markers = {_P_START, _P_SUSPEND, _P_RESUME, _P_END} - - for _lid, elems in buckets.items(): - for e in list(elems): - if substantive_content(e): - continue - parent = e.getparent() - if parent is not None: - parent.remove(e) - - surviving = [e for e in elems if e.getparent() is not None] - - for e in surviving: - for mk in stray_markers: - if mk in e.attrib: - del e.attrib[mk] - - for e in surviving: - if _P_PART in e.attrib: - del e.attrib[_P_PART] - - if len(surviving) == 1: - el = surviving[0] - el.attrib.pop(_P_LOGICAL, None) - elif len(surviving) > 1: - for i, el in enumerate(surviving): - if i == 0: - el.set(_P_PART, "first") - elif i == len(surviving) - 1: - el.set(_P_PART, "last") - else: - el.set(_P_PART, "middle") - el.attrib.pop(_P_LOGICAL, None) - - -def _strip_stray_processing_markers_under(element: etree.ElementBase) -> None: - for el in element.iter(): - keys_to_strip = (_P_START, _P_SUSPEND, _P_RESUME, _P_END, _P_LOGICAL) - for mk in keys_to_strip: - if mk in el.attrib: - del el.attrib[mk] - - -def doc_needs_marker_reconstruction(root: etree.ElementBase) -> bool: - if root.find(f".//{{{PROCESSING_NAMESPACE}}}parallel") is not None: - return True - for el in root.iter(): - if el.tag not in STRUCTURAL_BLOCKS: - continue - if any(el.get(attr) for attr in (_P_START, _P_END, _P_SUSPEND, _P_RESUME)): - return True - return False - - -def reconstruct_markered_document(root: etree.ElementBase) -> None: - pid_state: defaultdict[str, dict[str, Any]] = defaultdict(dict) - - for parallel in root.iter(): - if parallel.tag != _PARALLEL: - continue - for pi in parallel: - if pi.tag == _PARALLEL_ITEM: - reconstruct_parallel_item(pi, pid_state) - - normalize_segment_parts(root) - - header = root.find(".//{http://www.tei-c.org/ns/1.0}teiHeader") - if header is not None: - _strip_stray_processing_markers_under(header) diff --git a/opensiddur/exporter/tex/xelatex.py b/opensiddur/exporter/tex/xelatex.py index 2862f54..2f609b2 100644 --- a/opensiddur/exporter/tex/xelatex.py +++ b/opensiddur/exporter/tex/xelatex.py @@ -20,10 +20,6 @@ sys.path.insert(0, str(project_root)) from opensiddur.common.xslt import xslt_transform, xslt_transform_string -from opensiddur.exporter.tex.marker_reconstruct import ( - doc_needs_marker_reconstruction, - reconstruct_markered_document, -) XSLT_FILE = Path(__file__).parent / "xelatex.xslt" @@ -324,10 +320,7 @@ def transform_xml_to_tex(input_file, xslt_file=XSLT_FILE, output_file=None): input_xml = input_fd.read() root = etree.fromstring(input_xml.encode("utf-8")) - if doc_needs_marker_reconstruction(root): - reconstruct_markered_document(root) - input_xml = etree.tostring(root, encoding="unicode", xml_declaration=False) - + file_references = get_file_references(input_file, projects_source_root) licenses = extract_licenses(file_references) diff --git a/opensiddur/tests/exporter/test_marker_reconstruct.py b/opensiddur/tests/exporter/test_marker_reconstruct.py index 2bea9fc..dbdb90f 100644 --- a/opensiddur/tests/exporter/test_marker_reconstruct.py +++ b/opensiddur/tests/exporter/test_marker_reconstruct.py @@ -1,4 +1,4 @@ -"""Tests for flattened parallel marker reconstruction (Pass 1 before XeLaTeX).""" +"""Tests for flattened parallel marker reconstruction (compiler output stage).""" import unittest import unittest.mock @@ -7,13 +7,13 @@ from opensiddur.common.xslt import xslt_transform_string from opensiddur.exporter.external_compiler import PROCESSING_NAMESPACE, TEI_NS -from opensiddur.exporter.tex.marker_reconstruct import ( +from opensiddur.exporter.marker_reconstruct import ( doc_needs_marker_reconstruction, reconstruct_markered_document, reconstruct_parallel_item, substantive_content, ) -from opensiddur.exporter.tex import marker_reconstruct as mr +from opensiddur.exporter import marker_reconstruct as mr from opensiddur.exporter.tex.xelatex import XSLT_FILE P_NS = PROCESSING_NAMESPACE diff --git a/opensiddur/tests/exporter/test_parallel_e2e.py b/opensiddur/tests/exporter/test_parallel_e2e.py index 6661c8c..562e141 100644 --- a/opensiddur/tests/exporter/test_parallel_e2e.py +++ b/opensiddur/tests/exporter/test_parallel_e2e.py @@ -216,34 +216,29 @@ def test_no_parallel_fallback_when_parallel_file_missing(self): class TestMarkerStructureE2E(_E2EBase): def test_structural_elements_with_milestones_get_markers(self): - """tei:p with active parallel milestones should produce p:start/p:end markers.""" + """Compiler output should be reconstructed (no raw p:start/p:end markers).""" result = self._compile_primary() all_xml = "".join(etree.tostring(el, encoding="unicode") for el in result) root = etree.fromstring(f"{all_xml}") - # Look for p:start attributes on tei:p elements - start_markers = root.findall(f".//{{{TEI_NS}}}p[@{{{P_NS}}}start]") - end_markers = root.findall(f".//{{{TEI_NS}}}p[@{{{P_NS}}}end]") - self.assertGreater(len(start_markers), 0, "Expected p:start markers on tei:p") - self.assertGreater(len(end_markers), 0, "Expected p:end markers on tei:p") + # Reconstruction happens in the compiler now, so markers should not survive + start_markers = root.findall(f".//*[@{{{P_NS}}}start]") + end_markers = root.findall(f".//*[@{{{P_NS}}}end]") + self.assertEqual(len(start_markers), 0, "p:start markers should be consumed by reconstruction") + self.assertEqual(len(end_markers), 0, "p:end markers should be consumed by reconstruction") + + # Still expect parallel structure to exist in compiled output + parallels = root.findall(f".//{{{P_NS}}}parallel") + self.assertGreater(len(parallels), 0) def test_start_end_marker_ids_match(self): - """Every p:start ID should have a corresponding p:end ID.""" + """Legacy marker pairing test: markers should not be present post-reconstruct.""" result = self._compile_primary() all_xml = "".join(etree.tostring(el, encoding="unicode") for el in result) root = etree.fromstring(f"{all_xml}") - start_ids = set() - end_ids = set() - for el in root.iter(): - sid = el.get(f"{{{P_NS}}}start") - eid = el.get(f"{{{P_NS}}}end") - if sid: - start_ids.add(sid) - if eid: - end_ids.add(eid) - - self.assertEqual(start_ids, end_ids, "Every p:start should have a matching p:end") + self.assertEqual(len(root.findall(f".//*[@{{{P_NS}}}start]")), 0) + self.assertEqual(len(root.findall(f".//*[@{{{P_NS}}}end]")), 0) def test_column_order_attribute(self): """p:parallel elements should have a column-order attribute.""" From 4da8433c9e28b3d1cee6bf21b5d1669ebdd58c9b Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 19:58:13 -0700 Subject: [PATCH 2/5] commit constants file --- opensiddur/exporter/constants.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 opensiddur/exporter/constants.py diff --git a/opensiddur/exporter/constants.py b/opensiddur/exporter/constants.py new file mode 100644 index 0000000..34f316e --- /dev/null +++ b/opensiddur/exporter/constants.py @@ -0,0 +1,18 @@ +"""Shared exporter constants (kept dependency-light to avoid circular imports).""" + +JLPTEI_NAMESPACE = "http://jewishliturgy.org/ns/jlptei/2" +PROCESSING_NAMESPACE = "http://jewishliturgy.org/ns/processing" + +TEI_NS = "http://www.tei-c.org/ns/1.0" +XML_NS = "http://www.w3.org/XML/1998/namespace" + +STRUCTURAL_BLOCKS = frozenset( + { + f"{{{TEI_NS}}}div", + f"{{{TEI_NS}}}p", + f"{{{TEI_NS}}}ab", + f"{{{TEI_NS}}}lg", + f"{{{TEI_NS}}}l", + } +) + From 9bb78b224aba7aabe8b4491649dcb64c8f912504 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 20:03:11 -0700 Subject: [PATCH 3/5] add marker reconstruct to git --- opensiddur/exporter/marker_reconstruct.py | 292 ++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 opensiddur/exporter/marker_reconstruct.py diff --git a/opensiddur/exporter/marker_reconstruct.py b/opensiddur/exporter/marker_reconstruct.py new file mode 100644 index 0000000..10c03fb --- /dev/null +++ b/opensiddur/exporter/marker_reconstruct.py @@ -0,0 +1,292 @@ +""" +Reconstruct flattened p:start / p:suspend / p:resume / p:end streams inside +parallel export columns into nested TEI, then prune empty segments and set p:part. +""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any + +from lxml import etree + +from opensiddur.exporter.constants import PROCESSING_NAMESPACE, STRUCTURAL_BLOCKS + +_P_START = f"{{{PROCESSING_NAMESPACE}}}start" +_P_END = f"{{{PROCESSING_NAMESPACE}}}end" +_P_SUSPEND = f"{{{PROCESSING_NAMESPACE}}}suspend" +_P_RESUME = f"{{{PROCESSING_NAMESPACE}}}resume" +_P_LOGICAL = f"{{{PROCESSING_NAMESPACE}}}logical-id" +_P_PART = f"{{{PROCESSING_NAMESPACE}}}part" +_PARALLEL_ITEM = f"{{{PROCESSING_NAMESPACE}}}parallelItem" +_PARALLEL = f"{{{PROCESSING_NAMESPACE}}}parallel" + + +def _structural_marker_map(el: etree.ElementBase) -> dict[str, str]: + out = {} + for key, pname in ( + (_P_START, "start"), + (_P_END, "end"), + (_P_SUSPEND, "suspend"), + (_P_RESUME, "resume"), + ): + if val := el.get(key): + out[pname] = val + return out + + +def substantive_content(el: etree.ElementBase) -> bool: + def walk(x: etree.ElementBase) -> bool: + if (x.text or "").strip(): + return True + for c in x: + if walk(c): + return True + if (c.tail or "").strip(): + return True + return False + + return walk(el) + + +@dataclass +class _Frame: + pid: str + tag: str + attrs: dict[str, str] + buffer: list[etree.ElementBase] = field(default_factory=list) + #: Text serialized before structural children (marker text/tails from compiler) + text_chunks: list[str] = field(default_factory=list) + + +def _absorb_marker_strings(frame: _Frame, el: etree.ElementBase) -> None: + if el.text: + frame.text_chunks.append(el.text) + if el.tail: + frame.text_chunks.append(el.tail) + + +def _carrier_attrs_from_marker_el(el: etree.ElementBase) -> dict[str, str]: + p_pref = f"{{{PROCESSING_NAMESPACE}}}" + xml_id_key = "{http://www.w3.org/XML/1998/namespace}id" + return {k: v for k, v in el.attrib.items() if k != xml_id_key and not k.startswith(p_pref)} + + +def _new_wrapped_segment( + tag: str, + attrs: dict[str, str], + children: list[etree.ElementBase], + *, + logical_id: str | None, + leading_text_chunks: list[str], +) -> etree.ElementBase: + nsmap = dict(children[0].nsmap) if children else {} + wrapped = etree.Element(tag, nsmap=nsmap) if nsmap else etree.Element(tag) + for k, v in attrs.items(): + wrapped.set(k, v) + prefix = "".join(leading_text_chunks) + leading_text_chunks.clear() + if prefix: + wrapped.text = prefix + for c in children: + wrapped.append(c) + + marker_keys = (_P_START, _P_SUSPEND, _P_RESUME, _P_END, _P_LOGICAL, _P_PART) + for mk in marker_keys: + if mk in wrapped.attrib: + del wrapped.attrib[mk] + + if logical_id: + wrapped.set(_P_LOGICAL, logical_id) + return wrapped + + +def _close_open_segment( + stack: list[_Frame], + fragments: list[etree.ElementBase], + pid_state: dict[str, dict[str, Any]], + *, + pid: str, + kind: str, +) -> None: + if not stack: + raise ValueError(f"p:{kind} for {pid=} with empty reconstruction stack") + + frame = stack.pop() + if frame.pid != pid: + raise ValueError(f"p:{kind} expected top frame pid={pid}, got {frame.pid}") + + st = pid_state.setdefault(pid, {}) + suspended_before = st.get("suspended", False) + logical_id: str | None = None + + if kind == "suspend": + st["suspended"] = True + logical_id = pid + elif kind == "end": + if suspended_before: + logical_id = pid + pid_state.pop(pid, None) + else: + raise ValueError(kind) + + wrapped = _new_wrapped_segment( + frame.tag, + frame.attrs, + frame.buffer, + logical_id=logical_id, + leading_text_chunks=frame.text_chunks, + ) + + if stack: + stack[-1].buffer.append(wrapped) + else: + fragments.append(wrapped) + + +def _move_plain_content( + el: etree.ElementBase, + stack: list[_Frame], + fragments: list[etree.ElementBase], +) -> None: + if stack: + stack[-1].buffer.append(el) + else: + fragments.append(el) + + +def reconstruct_parallel_item( + pi: etree.ElementBase, + pid_state: defaultdict[str, dict[str, Any]], +) -> None: + """Rebuild pi's direct linear stream into nested TEI fragments (mutating pi).""" + fragments: list[etree.ElementBase] = [] + stack: list[_Frame] = [] + + while len(pi) > 0: + el = pi[0] + pi.remove(el) + + mmap = _structural_marker_map(el) + + if el.tag not in STRUCTURAL_BLOCKS: + _move_plain_content(el, stack, fragments) + continue + + if not mmap: + _move_plain_content(el, stack, fragments) + continue + + if "start" in mmap: + pid = mmap["start"] + fr = _Frame(pid, el.tag, _carrier_attrs_from_marker_el(el), []) + _absorb_marker_strings(fr, el) + stack.append(fr) + continue + + if "resume" in mmap: + pid = mmap["resume"] + fr = _Frame(pid, el.tag, _carrier_attrs_from_marker_el(el), []) + _absorb_marker_strings(fr, el) + stack.append(fr) + continue + + if "suspend" in mmap: + _close_open_segment(stack, fragments, pid_state, pid=mmap["suspend"], kind="suspend") + continue + + if "end" in mmap: + _close_open_segment(stack, fragments, pid_state, pid=mmap["end"], kind="end") + continue + + _move_plain_content(el, stack, fragments) + + if stack: + raise ValueError(f"unclosed structural frames remain in parallelItem: {[f.pid for f in stack]}") + + for frag in fragments: + pi.append(frag) + + +def _collect_logical_buckets(root: etree.ElementBase) -> dict[str, list[etree.ElementBase]]: + buckets: dict[str, list[etree.ElementBase]] = defaultdict(list) + for el in root.iter(): + lid = el.get(_P_LOGICAL) + if lid: + buckets[lid].append(el) + return buckets + + +def normalize_segment_parts(root: etree.ElementBase) -> None: + buckets = _collect_logical_buckets(root) + stray_markers = {_P_START, _P_SUSPEND, _P_RESUME, _P_END} + + for _lid, elems in buckets.items(): + for e in list(elems): + if substantive_content(e): + continue + parent = e.getparent() + if parent is not None: + parent.remove(e) + + surviving = [e for e in elems if e.getparent() is not None] + + for e in surviving: + for mk in stray_markers: + if mk in e.attrib: + del e.attrib[mk] + + for e in surviving: + if _P_PART in e.attrib: + del e.attrib[_P_PART] + + if len(surviving) == 1: + el = surviving[0] + el.attrib.pop(_P_LOGICAL, None) + elif len(surviving) > 1: + for i, el in enumerate(surviving): + if i == 0: + el.set(_P_PART, "first") + elif i == len(surviving) - 1: + el.set(_P_PART, "last") + else: + el.set(_P_PART, "middle") + el.attrib.pop(_P_LOGICAL, None) + + +def _strip_stray_processing_markers_under(element: etree.ElementBase) -> None: + for el in element.iter(): + keys_to_strip = (_P_START, _P_SUSPEND, _P_RESUME, _P_END, _P_LOGICAL) + for mk in keys_to_strip: + if mk in el.attrib: + del el.attrib[mk] + + +def doc_needs_marker_reconstruction(root: etree.ElementBase) -> bool: + if root.find(f".//{{{PROCESSING_NAMESPACE}}}parallel") is not None: + return True + for el in root.iter(): + if el.tag not in STRUCTURAL_BLOCKS: + continue + if any(el.get(attr) for attr in (_P_START, _P_END, _P_SUSPEND, _P_RESUME)): + return True + return False + + +def reconstruct_markered_document(root: etree.ElementBase) -> None: + pid_state: defaultdict[str, dict[str, Any]] = defaultdict(dict) + + for parallel in root.iter(): + if parallel.tag != _PARALLEL: + continue + for pi in parallel: + if pi.tag == _PARALLEL_ITEM: + reconstruct_parallel_item(pi, pid_state) + + normalize_segment_parts(root) + + header = root.find(".//{http://www.tei-c.org/ns/1.0}teiHeader") + if header is not None: + _strip_stray_processing_markers_under(header) + From 75983fe6992f31a266b341ce386fdfdb3d58002a Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 20:05:17 -0700 Subject: [PATCH 4/5] update github actions to avoid warning --- .github/workflows/tests.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6dcc935..f4dc3ec 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,21 +9,23 @@ on: jobs: test: runs-on: ubuntu-latest + env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true steps: - name: Install Jing (RelaxNG validator) run: sudo apt-get update && sudo apt-get install --no-install-recommends --no-install-suggests -y jing - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.13' - name: Install uv - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v8 with: enable-cache: true @@ -40,7 +42,7 @@ jobs: uv run coverage xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v6 with: file: ./coverage.xml fail_ci_if_error: false From 4d6b77827084f34ae92a7b48dc1292c9436fb7ed Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 20:10:13 -0700 Subject: [PATCH 5/5] try again --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f4dc3ec..68a840b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,7 +25,7 @@ jobs: python-version: '3.13' - name: Install uv - uses: astral-sh/setup-uv@v8 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: enable-cache: true @@ -44,6 +44,6 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v6 with: - file: ./coverage.xml + files: ./coverage.xml fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }}