diff --git a/README.md b/README.md index b1b8132..c997a3a 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,13 @@ uv run python -m opensiddur.importer.jps1917.convert_wikisource \ --project-dir ~/src/opensiddur-repos/opensiddur-projects/project/jps1917 ``` +Example: download Miqra al pi ha-Masorah from Google Sheets into sourcetexts: + +```bash +uv run python -m opensiddur.importer.miqra_al_pi_hamasorah.download \ + --sourcetexts-root ~/src/opensiddur-repos/sourcetexts/sources +``` + ## JLPTEI sources JLPTEI sources are compiled into the `project` directory. diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..3b065ab --- /dev/null +++ b/codecov.yml @@ -0,0 +1,11 @@ +# https://docs.codecov.com/docs/commit-status +coverage: + status: + project: + default: + target: 85% + informational: false + patch: + default: + target: 85% + informational: false diff --git a/opensiddur/exporter/pdf/pdf.py b/opensiddur/exporter/pdf/pdf.py index df1801d..cd8cbba 100755 --- a/opensiddur/exporter/pdf/pdf.py +++ b/opensiddur/exporter/pdf/pdf.py @@ -392,5 +392,5 @@ def main(): # pragma: no cover sys.exit(1) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/exporter/tex/bibtex.xslt b/opensiddur/exporter/tex/bibtex.xslt index 6b4c83f..d9cbe8d 100644 --- a/opensiddur/exporter/tex/bibtex.xslt +++ b/opensiddur/exporter/tex/bibtex.xslt @@ -3,12 +3,34 @@ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:j="http://jewishliturgy.org/ns/jlptei/2" + xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="tei j"> + + + + + + + + + + + + + + + + + + + + + @@ -215,7 +237,7 @@ = { - + }, diff --git a/opensiddur/exporter/tex/latex.py b/opensiddur/exporter/tex/latex.py index 6b7e4a8..5629d86 100644 --- a/opensiddur/exporter/tex/latex.py +++ b/opensiddur/exporter/tex/latex.py @@ -464,5 +464,5 @@ def main(): # pragma: no cover ) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index 82d7980..c466758 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -539,7 +539,7 @@ \pend \eledchapter{ - + } @@ -550,7 +550,7 @@ \pend \eledsubsection{ - + } @@ -659,8 +659,10 @@ + - + + @@ -668,8 +670,10 @@ + - + + @@ -725,7 +729,9 @@ - \leavevmode\\ + + \leavevmode\\{} @@ -893,6 +899,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + urn:x-opensiddur:text:bible: + + / + + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [ + + ] + + + + + + + ( + + ) + + + + + + [ + + ] + + + + + + + + indent + + + + + + + + + + + + + + + + + + + + + + + + + ͏ึด + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py new file mode 100644 index 0000000..bf604c7 --- /dev/null +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py @@ -0,0 +1,680 @@ +""" +Convert Miqra al pi ha-Masorah wikitext (per templates.tsv) to intermediate XML. + +All templates documented in sources/miqra_al_pi_hamasorah/sheets/templates.tsv are +handled here, including when nested inside verse text (e.g. {{ื ื•ืกื—|โ€ฆ}}). +""" + +from __future__ import annotations + +import re +from typing import Callable, Optional +from urllib.parse import quote + +import mwparserfromhell + +from opensiddur.importer.util.mediawiki_processor import ( + ConversionResult, + MediaWikiProcessor, +) + +MIQRA_NS = "urn:x-opensiddur:miqra:intermediate" +MW_NS = "urn:x-opensiddur:mw:intermediate" + +_STRIP_TEMPLATES = frozenset( + { + "ืž:ืคืกื•ืง", + "ืž:ืคืกื•ืง-ืฉื™ืจื”", + "ืž:ืฉื•ืœื™ื™ื", + "ืž:ืฉื•ืœื™ื™ื-ืกื•ืฃ", + "ืž:ื˜ืขืžื™ ื”ืžืงืจื", + "ืž:ื˜ืขืžื™ ื”ืžืงืจื-ืกื•ืฃ", + "ื˜ืขืžื™ ื”ืžืงืจื ื‘ืื™ื ื˜ืจื ื˜", + "ืชื‘ื ื™ืช:ื˜ืขืžื™ ื”ืžืงืจื ื‘ืื™ื ื˜ืจื ื˜", + "ืž:ืกืคืจ ื—ื“ืฉ", + "ืž:ืจื•ื•ื— ื‘ืชืจื™ ืขืฉืจ", + "ืจื•ื•ื— ื‘ืชืจื™ ืขืฉืจ", + "ืž:ืจื•ื•ื— ื‘ืชืจื™ ืขืฉืจ ื‘ืคืกื•ืง ื”ืจืืฉื•ืŸ", + "ืž:ืจื•ื•ื— ืœืกืคืจ ื‘ืชื”ืœื™ื", + "ืจื•ื•ื— ืœืกืคืจ ื‘ืชื”ืœื™ื", + "ืž:ืจื•ื•ื— ืœืกืคืจ ื‘ืชื”ืœื™ื ื‘ืคืกื•ืง ื”ืจืืฉื•ืŸ", + "ื ื™ื•ื•ื˜ ื˜ืขืžื™ื", + "ืฉื ื”ื“ืฃ ื”ืžืœื", + "ืž:ืื™ืŸ ืคืจืฉื” ื‘ืชื—ื™ืœืช ืคืจืง", + 'ืž:ืื™ืŸ ืคืจืฉื” ื‘ืชื—ื™ืœืช ืคืจืง ื‘ืกืคืจื™ ืืž"ืช', + "ืž:ืื™ืŸ ืจื•ื•ื— ืฉืœ ืคืจืฉื” ื‘ืชื—ื™ืœืช ืคืจืฉืช ื”ืฉื‘ื•ืข", + "ืž:ื™ื™ืฉื•ืจ-ื‘ืฉื ื™-ื”ืฆื“ื“ื™ื", + "ืž:ื™ื™ืฉื•ืจ-ื‘ืฉื ื™-ื”ืฆื“ื“ื™ื-ืกื•ืฃ", + "ื‘ืกื™ืก-ืžืฉืชืžืฉ", + 'ืฆื•ืจื•ืช ื›ืชื™ื‘ื” ื‘ืกืคืจื™ ืืž"ืช', + "documentation", + "name", + "template", + "ืชื‘ื ื™ืช", + } +) + +_ANY_HI_RE = re.compile(r"'''''(.*?)'''''|'''(.*?)'''|''(.*?)''") +_TAG_OPEN_RE = re.compile(r"<(miqra|mw):([a-zA-Z0-9-]+)([^>]*?)(/?)>") +_KETEG_START_RE = re.compile(r"<ืงื˜ืข\s+ื”ืชื—ืœื”=([^/>]+)\s*/>", re.IGNORECASE) +_KETEG_END_RE = re.compile(r"<ืงื˜ืข\s+ืกื•ืฃ=([^/>]+)\s*/>", re.IGNORECASE) + + +def normalize_template_name(name: str) -> str: + n = str(name).strip() + if n.lower().startswith("ืชื‘ื ื™ืช:"): + n = n.split(":", 1)[1].strip() + n = n.replace("''", '"').replace("ืด", '"').replace("ืณ", "'") + return n.strip() + + +def link_target_to_uri(target: str) -> str: + """Turn a URL or Hebrew Wikisource page title into a valid URI for tei:ref/@target.""" + t = (target or "").strip() + if not t: + return "" + if re.match(r"^https?://", t, re.I): + return t + if t.startswith("//"): + return "https:" + t + page, sep, frag = t.partition("#") + page = page.replace(" ", "_").strip() + if page: + uri = "https://he.wikisource.org/wiki/" + quote(page, safe="/:%") + else: + uri = "https://he.wikisource.org/wiki/" + if sep: + uri += "#" + quote(frag, safe=":/%.-_") + return uri + + +def _xml_escape(text: str) -> str: + return ( + (text or "") + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + + +def _wikitext_basic_markup_to_xml(text: str) -> str: + s = text or "" + out: list[str] = [] + pos = 0 + for m in _ANY_HI_RE.finditer(s): + out.append(_xml_escape(s[pos : m.start()])) + if m.group(1) is not None: + rend, inner = "bold-italic", m.group(1) + elif m.group(2) is not None: + rend, inner = "bold", m.group(2) + else: + rend, inner = "italic", m.group(3) or "" + out.append(f'{_xml_escape(inner)}') + pos = m.end() + out.append(_xml_escape(s[pos:])) + return "".join(out) + + +def _escape_outside_tags(fragment: str) -> str: + """Escape text nodes while preserving nested miqra:/mw: XML elements.""" + + out: list[str] = [] + pos = 0 + while pos < len(fragment): + m = _TAG_OPEN_RE.search(fragment, pos) + if not m: + out.append(_wikitext_basic_markup_to_xml(fragment[pos:])) + break + out.append(_wikitext_basic_markup_to_xml(fragment[pos : m.start()])) + ns, local, _attrs, self_close = m.group(1), m.group(2), m.group(3), m.group(4) + if self_close == "/": + out.append(m.group(0)) + pos = m.end() + continue + close = f"" + depth = 1 + search = m.end() + closed_at: Optional[int] = None + while depth > 0 and search <= len(fragment): + next_close = fragment.find(close, search) + if next_close == -1: + break + inner_open = _TAG_OPEN_RE.search(fragment, search, next_close) + if inner_open and inner_open.start() < next_close and inner_open.group(4) != "/": + inner_local = inner_open.group(2) + if inner_open.group(1) == ns and inner_local == local: + depth += 1 + search = inner_open.end() + else: + depth -= 1 + if depth == 0: + closed_at = next_close + else: + search = next_close + len(close) + if closed_at is None: + out.append(_wikitext_basic_markup_to_xml(fragment[m.start() :])) + break + inner = fragment[m.end() : closed_at] + out.append(m.group(0)) + out.append(_escape_outside_tags(inner)) + out.append(close) + pos = closed_at + len(close) + return "".join(out) + + +def _preprocess_column_c(wikitext: str) -> str: + """Column C markers from templates.tsv (not templates).""" + s = wikitext or "" + s = s.replace("__", " ") + s = re.sub(r"(?", s) + return s + + +def _preprocess_miqra_tags(wikitext: str) -> str: + s = wikitext or "" + s = _KETEG_START_RE.sub( + r'', s + ) + s = _KETEG_END_RE.sub(r'', s) + return s + + +class MiqraWikiTextProcessor(MediaWikiProcessor): + """MediaWiki processor with handlers for all Miqra templates.""" + + def __init__(self) -> None: + self._note_seq = 0 + super().__init__() + + def _initialize_handlers(self) -> None: + self.template_handlers = {} + self.tag_handlers = {} + self.preprocessors = [_preprocess_miqra_tags] + self.postprocessors = [] + self._register_template_handlers() + self._register_tag_handlers() + + def process_wikitext(self, wikitext: str) -> ConversionResult: + """Miqra uses recursive nested processing, not the JPS top-level loop.""" + warnings: list[str] = [] + errors: list[str] = [] + metadata: dict = {} + + text = wikitext or "" + for pre in self.preprocessors: + try: + text = pre(text) + except Exception as e: + errors.append(str(e)) + + try: + xml_content = self._process_nested_content(text) + except Exception as e: + xml_content = text + errors.append(str(e)) + + return ConversionResult( + xml_content=xml_content, + metadata=metadata, + warnings=warnings, + errors=errors, + wikilinks=self.wikilinks.copy(), + ) + + def _register_tag_handlers(self) -> None: + self.tag_handlers["noinclude"] = self._handle_strip_tag + + def _handle_strip_tag(self, tag) -> str: + return "" + + def _register_template_handlers(self) -> None: + h = self.add_template_handler + for name in _STRIP_TEMPLATES: + h(name, self._handle_strip) + + h("ื ื•ืกื—", self._handle_nosach) + h("ืฉ", self._handle_footnote_mark) + h("ืฉื", self._handle_strip) + + h("ืคืค", self._handle_parashah_open) + h("ืคืคืค", self._handle_parashah_open_line) + h("ืจื•ื•ื— ื‘ืกื•ืฃ ืฉื•ืจื”", self._handle_strip) + h("ืกืก", self._handle_parashah_close) + h("ืกืกืก", self._handle_parashah_close_inline) + h("ืกืก2", self._handle_parashah_close_narrow) + h("ืž:ืฉืฉืฉ", self._handle_shirah_break) + + h("ืจ0", self._handle_poetic_space) + h("ืจ1", self._handle_poetic_indent1) + h("ืจ2", self._handle_poetic_indent2) + h("ืจ3", self._handle_poetic_line) + h("ืจ4", self._handle_poetic_verse) + h("ืคืจืฉื”-ืžืจื›ื–", self._handle_centered_title) + + h("ื›ืชื™ื‘ ื•ืœื ืงืจื™", self._handle_ketiv_only) + h("ืงืจื™ ื•ืœื ื›ืชื™ื‘", self._handle_qeri_only) + h('ืž:ืงื•"ื›-ืื-2', self._handle_qok_if_matres) + h('ืž:ืงื•"ื› ืงืจื™ ืฉื•ื ื” ืžื”ื›ืชื™ื‘ ื‘ืฉืชื™ ืžื™ืœื™ื', self._handle_qok_two_qeri_words) + + h("ืž:ืื•ืช-ื’", self._handle_large_letter) + h("ืž:ืื•ืช-ืง", self._handle_small_letter) + h("ืž:ืื•ืช ืชืœื•ื™ื”", self._handle_raised_letter) + h("ืž:ืื•ืช ืžื ื•ืงื“ืช", self._handle_dotted_letter) + h('ืž:ื ื•"ืŸ ื”ืคื•ื›ื”', self._handle_inverted_nun) + h("ืž:ื™ืจื•ืฉืœื", self._handle_yerushalem) + h("ืž:ื™ืจื•ืฉืœืžื”", self._handle_yerushalema) + h("ื™ืจื— ื‘ืŸ ื™ื•ืžื•", self._handle_accent_yerah) + h("ื™ืจื— ื‘ืŸ ื™ื•ืžื•-2", self._handle_accent_with_word) + h("ื’ืœื’ืœ", self._handle_accent_galgal) + h("ื’ืœื’ืœ-2", self._handle_accent_with_word) + h("ืืชื ื— ื”ืคื•ืš", self._handle_accent_etnah) + h("ืž:ืงืžืฅ", self._handle_qamats) + h("ืž:ื˜ืขื ื•ืžืชื’ ื‘ืื•ืช ืื—ืช", self._handle_taam_meteg) + h("ืฉื ื™ ื˜ืขืžื™ื ื‘ืื•ืช ืื—ืช", self._handle_two_taamim) + h( + "ืฉื ื™ ื˜ืขืžื™ื ื‘ืื•ืช ืื—ืช ืงืžืฅ-ืชื—ืชื•ืŸ-ืคืชื—-ืขืœื™ื•ืŸ", + self._handle_two_taamim_qupo, + ) + h("ืž:ื˜ืขื", self._handle_taam_dummy) + h("ืชื‘ื ื™ืช:ืž:ื˜ืขื", self._handle_taam_dummy) + h("ืž:ื’ืจืฉ ื•ืชืœื™ืฉื ื’ื“ื•ืœื”", self._handle_geresh_telisha) + h("ืž:ื’ืจืฉื™ื™ื ื•ืชืœื™ืฉื ื’ื“ื•ืœื”", self._handle_gershayim_telisha) + h("ืž:ื›ืœ ืงืžืฅ ืงื˜ืŸ ืžืจื›ื", self._handle_kol_qamats) + h("ืž:ืœื’ืจืžื™ื”-2", self._handle_legarmeh) + h("ืž:ืคืกืง", self._handle_paseq) + h("ืž:ืžืงืฃ ืืคื•ืจ", self._handle_grey_maqaf) + h("ืž:ื“ื—ื™", self._handle_dechi) + h("ืž:ืฆื™ื ื•ืจ", self._handle_tzinor) + + h("ืž:ื”ืขืจื”", self._handle_mam_note) + h("ืขื•ื’ืŸ ื‘ืฉื•ืจื”", self._handle_line_anchor) + h("ืž:ืกื™ื•ื ื‘ื˜ื•ื‘", self._handle_good_ending) + h("ืงืง", self._handle_dual_trope_link) + h("ืž:ื›ืคื•ืœ", self._handle_dual_accent) + + h("ืž:ืงื™ืฉื•ืจ ื‘ื”ืขืจื”", self._handle_note_link) + h("ืž:ืงื™ืฉื•ืจ ืคื ื™ืžื™ ื‘ื”ืขืจื”", self._handle_note_link) + h("ืžื•ื“ื’ืฉ", self._handle_emphasis) + + def _lookup_handler(self, name: str) -> Optional[Callable]: + n = normalize_template_name(name) + if n in self.template_handlers: + return self.template_handlers[n] + if n.startswith('ืž:ื›ื•"ืง') or n.startswith('ื›ื•"ืง') or n.startswith("ื›ื•''ืง"): + return self._handle_ketiv_qeri + if n.startswith('ืž:ืงื•"ื›') or n.startswith('ืงื•"ื›') or n.startswith("ืงื•''ื›"): + return self._handle_qeri_ketiv + return None + + def _process_nested_content(self, content: str, depth: int = 0) -> str: + if depth > 12: + return content + + parsed = mwparserfromhell.parse(content) + nodes_to_replace = [] + + for node in parsed.nodes: + if hasattr(node, "name"): + template_name = str(node.name).strip() + handler = self._lookup_handler(template_name) + if handler is None: + n = normalize_template_name(template_name) + if n in _STRIP_TEMPLATES: + handler = self._handle_strip + else: + processed = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed)) + continue + try: + processed_node = self._process_template_with_nesting(node, depth + 1) + replacement = handler(processed_node) + except Exception: + replacement = handler(node) + nodes_to_replace.append((node, replacement)) + elif hasattr(node, "tag"): + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + processed_node = self._process_tag_with_nesting(node, depth + 1) + replacement = self.tag_handlers[tag_name](processed_node) + except Exception: + replacement = self.tag_handlers[tag_name](node) + nodes_to_replace.append((node, replacement)) + else: + processed = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed)) + elif "Wikilink" in str(node.__class__): + nodes_to_replace.append((node, self._handle_wikilink_miqra(node))) + + elif node.__class__.__name__ == "Heading": + # Note text uses "=source=reading" notation; mwparser treats it as wikitext headings. + title = self._process_nested_content(str(node.title), depth + 1) + nodes_to_replace.append((node, "=" + title + "=")) + + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + return str(parsed) + + def _handle_wikilink_miqra(self, node) -> str: + raw_title = str(getattr(node, "title", "")).strip() + target = _xml_escape(link_target_to_uri(raw_title)) + text = str(getattr(node, "text", "")).strip() if getattr(node, "text", None) else "" + if text: + return f'{_xml_escape(text)}' + return f'' + + def _p(self, content: str) -> str: + return self._process_nested_content(content or "") + + def _param_value(self, template, key: str | int) -> str: + """Read a template parameter by name or 1-based index. + + mwparserfromhell's ``template.get(1)`` returns ``'1=value'`` when the + wikitext uses explicit ``1=value`` syntax; iterating ``params`` is reliable. + """ + key_s = str(key).strip() + for p in template.params: + pname = str(p.name).strip() + if pname == key_s: + return str(p.value).strip() + if pname.isdigit() and key_s.isdigit() and int(pname) == int(key_s): + return str(p.value).strip() + return "" + + def _param(self, template, index: int) -> str: + return self._param_value(template, index) + + def _named_param(self, template, name: str) -> str: + return self._param_value(template, name) + + def _note_params(self, template) -> str: + parts: list[str] = [] + for p in template.params: + pname = str(p.name).strip() + if pname.isdigit() and int(pname) >= 2: + parts.append(self._p(str(p.value))) + elif pname in ("2", "ื”ืขืจื•ืช", "ื”ืขืจื”", "notes"): + parts.append(self._p(str(p.value))) + return "".join(parts) + + def _mid_verse_attr(self, template) -> str: + for p in template.params: + if "ืคืกืงื ื‘ืืžืฆืข ืคืกื•ืง" in str(p.value): + return ' midVerse="true"' + return "" + + def _next_note_id(self) -> str: + self._note_seq += 1 + return f"miqra-note-{self._note_seq}" + + # --- handlers --- + + def _handle_strip(self, template) -> str: + return "" + + def _handle_nosach(self, template) -> str: + display = self._p(self._param(template, 1)) + notes = self._note_params(template) + if not notes: + return display + note_id = self._next_note_id() + return ( + f'' + f"{display}" + f"" + f'{notes}' + ) + + def _handle_footnote_mark(self, template) -> str: + return "" + + def _handle_ketiv_qeri(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + qeri = self._p(self._param(template, 2)) + return ( + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qeri_ketiv(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + qeri = self._p(self._param(template, 2)) + return ( + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qok_if_matres(self, template) -> str: + display = self._p(self._param(template, 1)) + ketiv = self._p(self._param(template, 2)) + qeri = self._p(self._param(template, 3)) + return ( + f"{display}" + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qok_two_qeri_words(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + q1 = self._p(self._param(template, 2)) + q2 = self._p(self._param(template, 3)) + return ( + f'' + f"{q1}" + f"{q2}" + f"{ketiv}" + f"" + ) + + def _handle_ketiv_only(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + return f'({ketiv})' + + def _handle_qeri_only(self, template) -> str: + qeri = self._p(self._param(template, 1)) + return f"[{qeri}]" + + def _handle_parashah_open(self, template) -> str: + return f'' + + def _handle_parashah_open_line(self, template) -> str: + return f'' + + def _handle_parashah_close(self, template) -> str: + return f'' + + def _handle_parashah_close_inline(self, template) -> str: + return f'' + + def _handle_parashah_close_narrow(self, template) -> str: + return f'' + + def _handle_shirah_break(self, template) -> str: + return '' + + def _handle_poetic_space(self, template) -> str: + return '' + + def _handle_poetic_indent1(self, template) -> str: + return '' + + def _handle_poetic_indent2(self, template) -> str: + return '' + + def _handle_poetic_line(self, template) -> str: + return '' + + def _handle_poetic_verse(self, template) -> str: + return '' + + def _handle_centered_title(self, template) -> str: + title = self._p(self._param(template, 1)) + return f"{title}" + + def _handle_large_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_small_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_raised_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_dotted_letter(self, template) -> str: + word = self._p(self._param(template, 1)) + return f"{word}" + + def _handle_inverted_nun(self, template) -> str: + sym = self._p(self._param(template, 1)) + return f"{sym}" + + def _handle_yerushalem(self, template) -> str: + p1 = _xml_escape(self._param(template, 1)) + p2 = _xml_escape(self._param(template, 2)) + return f'' + + def _handle_yerushalema(self, template) -> str: + p1 = _xml_escape(self._param(template, 1)) + p2 = _xml_escape(self._param(template, 2)) + return f'' + + def _handle_accent_yerah(self, template) -> str: + return '' + + def _handle_accent_galgal(self, template) -> str: + return '' + + def _handle_accent_with_word(self, template) -> str: + # Word param already includes the accent (galgal / yerah ben yomo). + return self._p(self._param(template, 1)) + + def _handle_accent_etnah(self, template) -> str: + return '' + + def _handle_qamats(self, template) -> str: + d = self._named_param(template, "ื“") + s = self._named_param(template, "ืก") + text = d or s or self._param(template, 1) + return self._p(text) + + def _handle_taam_meteg(self, template) -> str: + return self._p(self._param(template, 1)) + + def _handle_two_taamim(self, template) -> str: + return '' + + def _handle_two_taamim_qupo(self, template) -> str: + above = self._p(self._named_param(template, "ืขืœื™ื•") or self._param(template, 1)) + return f'' + + def _handle_taam_dummy(self, template) -> str: + raw = self._param(template, 1) + return self._p(raw[1:] if raw else "") + + def _handle_geresh_telisha(self, template) -> str: + return '' + + def _handle_gershayim_telisha(self, template) -> str: + return '' + + def _handle_kol_qamats(self, template) -> str: + return self._p(self._param(template, 1)) or "ื›ึผึธืœ" + + def _handle_legarmeh(self, template) -> str: + return 'ื€' + + def _handle_paseq(self, template) -> str: + return 'ื€' + + def _handle_grey_maqaf(self, template) -> str: + return 'ึพ' + + def _handle_dechi(self, template) -> str: + # Wikisource shows param 1; param 2 marks the dechi (offset accent) form. + return self._p(self._param(template, 1)) + + def _handle_tzinor(self, template) -> str: + # Wikisource shows param 1; param 2 marks the tzinor accent placement. + return self._p(self._param(template, 1)) + + def _handle_mam_note(self, template) -> str: + body = self._p(self._param(template, 1)) + note_id = self._next_note_id() + return ( + f'' + f'{body}' + ) + + def _handle_line_anchor(self, template) -> str: + label = _xml_escape(self._param(template, 1)) + return f'' + + def _handle_good_ending(self, template) -> str: + text = self._p(self._param(template, 1)) + return f"{text}" + + def _handle_dual_trope_link(self, template) -> str: + target = self._p(self._param(template, 1)) + return f"{target}" + + def _handle_dual_accent(self, template) -> str: + dual = self._p(self._named_param(template, "ื›ืคื•ืœ")) + a = self._p(self._named_param(template, "ื")) + b = self._p(self._named_param(template, "ื‘")) + return ( + f'' + f"{a}" + f"{b}" + f"" + ) + + def _handle_note_link(self, template) -> str: + raw_target = self._named_param(template, "1") or self._param(template, 1) + label = self._named_param(template, "2") or self._param(template, 2) + if not label: + label = raw_target + target = _xml_escape(link_target_to_uri(raw_target)) + return f'{self._p(label)}' + + def _handle_emphasis(self, template) -> str: + text = self._p(self._param(template, 1)) + return f'{text}' + + +_processor: Optional[MiqraWikiTextProcessor] = None + + +def _get_processor() -> MiqraWikiTextProcessor: + global _processor + if _processor is None: + _processor = MiqraWikiTextProcessor() + return _processor + + +def wikitext_to_intermediate_xml( + wikitext: str, *, column_c: bool = False +) -> str: + """Convert wikitext to an escaped intermediate XML fragment.""" + text = wikitext or "" + if column_c: + text = _preprocess_column_c(text) + result = _get_processor().process_wikitext(text) + return _escape_outside_tags(result.xml_content) + + +def reset_processor() -> None: + """Reset the shared processor (for tests).""" + global _processor + _processor = None diff --git a/opensiddur/importer/util/mediawiki_processor.py b/opensiddur/importer/util/mediawiki_processor.py new file mode 100644 index 0000000..208dfbf --- /dev/null +++ b/opensiddur/importer/util/mediawiki_processor.py @@ -0,0 +1,826 @@ +""" +MediaWiki/Wikitext to intermediate XML processor. + +Reusable framework originally built for the JPS1917 importer. Other importers +(e.g. Miqra al pi ha-Masorah) subclass ``MediaWikiProcessor`` and register their +own template/tag handlers. +""" + +from __future__ import annotations + +import re +import mwparserfromhell +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List + + +class ProcessingStage(Enum): + """Stages of MediaWiki processing""" + PREPROCESS = "preprocess" + TEMPLATES = "templates" + TAGS = "tags" + POSTPROCESS = "postprocess" + + +@dataclass +class ConversionResult: + """Result of a conversion operation""" + xml_content: str + metadata: Dict[str, Any] + warnings: List[str] + errors: List[str] + wikilinks: List[Dict[str, Any]] + + +class MediaWikiProcessor: + """ + Modular MediaWiki to XML processor for JPS1917 content. + + This processor handles the conversion of MediaWiki syntax to XML, + with separate modules for different types of templates and tags. + """ + + def __init__(self): + self.template_handlers = {} + self.tag_handlers = {} + self.preprocessors = [] + self.postprocessors = [] + self.wikilinks = [] # Store captured wikilinks + self._initialize_handlers() + + def _initialize_handlers(self): + """Initialize all template and tag handlers""" + self._initialize_template_handlers() + self._initialize_tag_handlers() + self._initialize_preprocessors() + self._initialize_postprocessors() + self._initialize_wikilink_handlers() + + def _initialize_template_handlers(self): + """Initialize handlers for MediaWiki templates""" + + # Text Formatting Templates + self.template_handlers['sc'] = self._handle_small_caps + self.template_handlers['larger'] = self._handle_larger_text + self.template_handlers['x-larger'] = self._handle_x_larger_text + self.template_handlers['xx-larger'] = self._handle_xx_larger_text + self.template_handlers['xxx-larger'] = self._handle_xxx_larger_text + self.template_handlers['smaller'] = self._handle_smaller_text + + # Layout Templates + self.template_handlers['c'] = self._handle_center + self.template_handlers['right'] = self._handle_right_align + self.template_handlers['rule'] = self._handle_horizontal_rule + self.template_handlers['nop'] = self._handle_no_paragraph + + # Biblical Content Templates + self.template_handlers['verse'] = self._handle_verse + self.template_handlers['rh'] = self._handle_right_header + self.template_handlers['dropinitial'] = self._handle_drop_initial + self.template_handlers['dhr'] = self._handle_double_horizontal_rule + + # Navigation Templates + self.template_handlers['anchor'] = self._handle_anchor + self.template_handlers['anchor+'] = self._handle_anchor_plus + + # Language Templates + self.template_handlers['lang'] = self._handle_language + + # Reference Templates + self.template_handlers['smallrefs'] = self._handle_small_refs + + # Special Templates + self.template_handlers['hws'] = self._handle_hws + self.template_handlers['hwe'] = self._handle_hwe + self.template_handlers['***'] = self._handle_asterisks + self.template_handlers['reconstruct'] = self._handle_reconstruct + self.template_handlers['SIC'] = self._handle_sic + self.template_handlers['sic'] = self._handle_sic + self.template_handlers['sup'] = self._handle_superscript + self.template_handlers['bar'] = self._handle_bar + self.template_handlers['gap'] = self._handle_gap + self.template_handlers['overfloat left'] = self._handle_overfloat_left + self.template_handlers['float right'] = self._handle_float_right + self.template_handlers['smaller block/s'] = self._handle_smaller_block_start + self.template_handlers['smaller block/e'] = self._handle_smaller_block_end + + def _initialize_tag_handlers(self): + """Initialize handlers for HTML/XML tags""" + + # Structural Tags + self.tag_handlers['section'] = self._handle_section + self.tag_handlers['table'] = self._handle_table + self.tag_handlers['tr'] = self._handle_table_row + self.tag_handlers['td'] = self._handle_table_cell + + # Text Formatting Tags + self.tag_handlers['i'] = self._handle_italic + self.tag_handlers['br'] = self._handle_line_break + self.tag_handlers['span'] = self._handle_span + + # Content Tags + self.tag_handlers['dd'] = self._handle_definition_description + self.tag_handlers['ref'] = self._handle_reference + + # MediaWiki Specific Tags + self.tag_handlers['noinclude'] = self._handle_noinclude + self.tag_handlers['pagequality'] = self._handle_pagequality + + def _initialize_preprocessors(self): + """Initialize preprocessing functions""" + self.preprocessors = [ + self._fix_noinclude_line_breaks, + self._convert_paragraph_breaks, + self._normalize_whitespace, + self._handle_special_characters, # Enable special character processing + self._extract_metadata + ] + + def _initialize_postprocessors(self): + """Initialize postprocessing functions""" + self.postprocessors = [ + self._validate_xml_structure, + self._finalize_metadata + ] + + def _initialize_wikilink_handlers(self): + """Initialize wikilink processing""" + # Wikilinks are processed during the main parsing loop + pass + + def _process_nested_content(self, content: str, depth: int = 0) -> str: + """Recursively process nested templates and other elements""" + # Prevent infinite recursion + if depth > 10: + return content + + # Parse the content to handle nested elements + parsed = mwparserfromhell.parse(content) + nodes_to_replace = [] + + # Process nodes recursively + for node in parsed.nodes: + if hasattr(node, 'name'): # Template + template_name = str(node.name).strip() + if template_name in self.template_handlers: + try: + # Process nested content within the template + processed_node = self._process_template_with_nesting(node, depth + 1) + replacement = self.template_handlers[template_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + # If nested processing fails, try without nesting + replacement = self.template_handlers[template_name](node) + nodes_to_replace.append((node, replacement)) + else: + # Unknown template - process its content for nested elements + processed_content = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed_content)) + + elif hasattr(node, 'tag'): # Tag + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + # Process nested content within the tag + processed_node = self._process_tag_with_nesting(node, depth + 1) + replacement = self.tag_handlers[tag_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + # If nested processing fails, try without nesting + replacement = self.tag_handlers[tag_name](node) + nodes_to_replace.append((node, replacement)) + else: + # Unknown tag - process its content for nested elements + processed_content = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed_content)) + + elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink + try: + replacement = self._handle_wikilink(node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + # If wikilink processing fails, keep original + nodes_to_replace.append((node, str(node))) + + # Replace all nodes + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + return str(parsed) + + def _process_template_with_nesting(self, template, depth: int = 0) -> object: + """Process a template and its nested content""" + # Create a copy of the template to avoid modifying the original + import copy + processed_template = copy.deepcopy(template) + + # Process each parameter of the template + for param in processed_template.params: + if hasattr(param, 'value'): + # Process nested content in parameter values + processed_value = self._process_nested_content(str(param.value), depth + 1) + param.value = processed_value + + return processed_template + + def _process_tag_with_nesting(self, tag, depth: int = 0) -> object: + """Process a tag and its nested content""" + # Create a copy of the tag to avoid modifying the original + import copy + processed_tag = copy.deepcopy(tag) + + # Process nested content within the tag + if hasattr(processed_tag, 'contents') and processed_tag.contents: + processed_contents = self._process_nested_content(str(processed_tag.contents), depth + 1) + processed_tag.contents = processed_contents + + return processed_tag + + # ============================================================================ + # TEMPLATE HANDLERS + # ============================================================================ + + def _handle_small_caps(self, template) -> str: + """Convert {{sc|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_larger_text(self, template) -> str: + """Convert {{larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_x_larger_text(self, template) -> str: + """Convert {{x-larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_xx_larger_text(self, template) -> str: + """Convert {{xx-larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_xxx_larger_text(self, template) -> str: + """Convert {{xxx-larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_smaller_text(self, template) -> str: + """Convert {{smaller|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_center(self, template) -> str: + """Convert {{c|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_right_align(self, template) -> str: + """Convert {{right|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_horizontal_rule(self, template) -> str: + """Convert {{rule}} to """ + return '' + + def _handle_no_paragraph(self, template) -> str: + """Convert {{nop}} to """ + return '' + + def _handle_verse(self, template) -> str: + """Convert {{verse|chapter|verse|text}} to text""" + chapter = str(template.get('chapter', template.get(1, ''))).replace("chapter=", "") + verse = str(template.get('verse', template.get(2, ''))).replace("verse=", "") + text = str(template.get(3, template.get('text', ''))) + chapter_attr = f' chapter="{chapter}"' if chapter else '' + verse_attr = f' verse="{verse}"' if verse else '' + if not chapter or not verse: + print(f"Invalid verse template: {template} {template.get(1, '')=} {template.get(2, '')=} {template.get(3, '')=}") + + return f'{text}' + + def _handle_right_header(self, template) -> str: + """Convert {{rh|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_drop_initial(self, template) -> str: + """Convert {{dropinitial|letter}} to letter""" + letter = str(template.get(1, '')) + return f'{letter}' + + def _handle_double_horizontal_rule(self, template) -> str: + """Convert {{dhr}} to """ + value = str(template.get(1, '')) + if value: + value=f' value="{value}"' + else: + value="" + return f'' + + def _handle_anchor(self, template) -> str: + """Convert {{anchor|name}} to """ + name = str(template.get(1, '')) + return f'' + + def _handle_anchor_plus(self, template) -> str: + """Convert {{anchor+|name|text}} to text""" + name = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{text}' + + def _handle_language(self, template) -> str: + """Convert {{lang|code|text}} to text""" + code = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{text}' + + def _handle_small_refs(self, template) -> str: + """Convert {{smallrefs}} to """ + return '' + + def _handle_hws(self, template) -> str: + """Convert {{hws|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_hwe(self, template) -> str: + """Convert {{hwe|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_asterisks(self, template) -> str: + """Convert {{***}} to ***""" + n = str(template.get(1, '3')) + return f'***' + + def _handle_reconstruct(self, template) -> str: + """Convert {{reconstruct|content|text}} to text""" + content = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{content}{text}' + + def _handle_sic(self, template) -> str: + """Convert {{SIC|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_superscript(self, template) -> str: + """Convert {{sup|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + + def _handle_bar(self, template) -> str: + """Convert {{bar|length}} to """ + length = str(template.get(1, '6')) + return f'' + + def _handle_gap(self, template) -> str: + """Convert {{gap|length}} to """ + length = str(template.get(1, '')) + if length: + return f'' + else: + return '' + + def _handle_overfloat_left(self, template) -> str: + """Convert {{overfloat left|align|padding|text}} to text""" + # Get parameters - can be positional or named + align = str(template.get('align', template.get(1, ''))) + padding = str(template.get('padding', template.get(2, ''))) + text = str(template.get('text', template.get(3, ''))) + + # Clean up named parameters (remove parameter name prefixes) + align = align.replace('align=', '') if align.startswith('align=') else align + padding = padding.replace('padding=', '') if padding.startswith('padding=') else padding + text = text.replace('text=', '') if text.startswith('text=') else text + + # Build attributes + attributes = [] + if align: + attributes.append(f'align="{align}"') + if padding: + attributes.append(f'padding="{padding}"') + + attr_str = ' ' + ' '.join(attributes) if attributes else '' + + return f'{text}' + + def _handle_float_right(self, template) -> str: + """Convert {{float right|text}} to text""" + text = str(template.get(1, '')) + return f'{text}' + + def _handle_smaller_block_start(self, template) -> str: + """Convert {{smaller block/s}} to """ + return '' + + def _handle_smaller_block_end(self, template) -> str: + """Convert {{smaller block/e}} to """ + return '' + + # ============================================================================ + # WIKILINK HANDLERS + # ============================================================================ + + def _handle_wikilink(self, wikilink) -> str: + """Process and capture wikilinks""" + # Extract wikilink information + title = str(wikilink.title) if hasattr(wikilink, 'title') and wikilink.title else '' + text = str(wikilink.text) if hasattr(wikilink, 'text') and wikilink.text else title + + # Process templates within the wikilink text + processed_text = self._process_nested_content(text) + + # Store wikilink information + wikilink_info = { + 'title': title, + 'text': processed_text, + 'namespace': str(wikilink.namespace) if hasattr(wikilink, 'namespace') and wikilink.namespace else None, + 'section': str(wikilink.section) if hasattr(wikilink, 'section') and wikilink.section else None, + 'fragment': str(wikilink.fragment) if hasattr(wikilink, 'fragment') and wikilink.fragment else None + } + self.wikilinks.append(wikilink_info) + + # Convert to XML - use __link__ tag with attributes + attributes = [] + if title: + attributes.append(f'title="{title}"') + if wikilink_info['namespace']: + attributes.append(f'namespace="{wikilink_info["namespace"]}"') + if wikilink_info['section']: + attributes.append(f'section="{wikilink_info["section"]}"') + if wikilink_info['fragment']: + attributes.append(f'fragment="{wikilink_info["fragment"]}"') + + attr_str = ' ' + ' '.join(attributes) if attributes else '' + return f'<__link__{attr_str}>{processed_text}' + + # ============================================================================ + # TAG HANDLERS + # ============================================================================ + + def _handle_section(self, tag) -> str: + """Convert
to
with begin and end attributes""" + content = str(tag.contents) if tag.contents else '' + + # Extract begin and end attributes + attributes = [] + if hasattr(tag, 'attributes') and tag.attributes: + for attr in tag.attributes: + if hasattr(attr, 'name') and hasattr(attr, 'value'): + attr_name = str(attr.name) + attr_value = str(attr.value) + if attr_name in ['begin', 'end']: + attributes.append(f'{attr_name}="{attr_value}"') + + # Add begin and end attributes if they exist + attr_str = ' ' + ' '.join(attributes) if attributes else '' + + return f'{content}
' + + def _handle_table(self, tag) -> str: + """Convert to
""" + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}
' + + def _handle_table_row(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_table_cell(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_italic(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_line_break(self, tag) -> str: + """Convert
to
""" + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'' + + def _handle_span(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_definition_description(self, tag) -> str: + """Convert
to
""" + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}
' + + def _handle_reference(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_noinclude(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _handle_pagequality(self, tag) -> str: + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _extract_tag_attributes(self, tag) -> Dict[str, str]: + """Extract all attributes from a tag""" + attributes = {} + if hasattr(tag, 'attributes') and tag.attributes: + for attr in tag.attributes: + if hasattr(attr, 'name') and hasattr(attr, 'value'): + attributes[str(attr.name)] = str(attr.value) + return attributes + + # ============================================================================ + # PREPROCESSORS + # ============================================================================ + + def _fix_noinclude_line_breaks(self, content: str) -> str: + """Insert a blank line after tags when followed by non-whitespace content""" + # Pattern to match followed by optional whitespace and any non-whitespace character + # This handles cases like: :text, text, {{template}}, etc. + pattern = r'()\s*(\S)' + + def replace_noinclude_content(match): + noinclude_tag = match.group(1) + following_content = match.group(2) + # Insert a newline after and before the following content + return f'{noinclude_tag}\n{following_content}' + + # Apply the replacement + content = re.sub(pattern, replace_noinclude_content, content) + + return content + + def _normalize_whitespace(self, content: str) -> str: + """Normalize whitespace in content""" + # Normalize multiple spaces to single space + content = re.sub(r' +', ' ', content) + # Normalize line breaks, but preserve paragraph markers + content = re.sub(r'\n+', '\n', content) + return content.strip() + + def _convert_paragraph_breaks(self, content: str) -> str: + """Convert double newlines to paragraph indicators, but skip if {{nop}} is directly adjacent""" + + # First, protect {{nop}} markers and their immediate context + # Replace {{nop}} with a temporary marker + content = content.replace('{{nop}}', '___NOP_MARKER___') + + # Convert \n\n to

\n paragraph indicators, but not if they're adjacent to ___NOP_MARKER___ + # This regex matches \n\n that are NOT preceded or followed by ___NOP_MARKER___ + content = re.sub(r'(?\n', content) + + # Restore {{nop}} markers + content = content.replace('___NOP_MARKER___', '{{nop}}') + + return content + + def _handle_special_characters(self, content: str) -> str: + """Handle special characters and entities - escape ampersands not in XML/HTML entities""" + # More comprehensive regex to match XML/HTML entities + # This includes named entities like &, <, >, ", ' + # and numeric entities like { and  + entity_pattern = r'&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);' + + # Split content by entities to preserve them + parts = re.split(f'({entity_pattern})', content) + + # Process each part + result_parts = [] + for part in parts: + if re.match(entity_pattern, part): + # This is an entity, keep it as-is + result_parts.append(part) + else: + # This is not an entity, escape standalone ampersands + escaped_part = part.replace('&', '&') + result_parts.append(escaped_part) + + return ''.join(result_parts) + + def _extract_metadata(self, content: str) -> Dict[str, Any]: + """Extract metadata from content""" + metadata = {} + # Extract page quality information + # Extract language information + # Extract structural information + return metadata + + # ============================================================================ + # POSTPROCESSORS + # ============================================================================ + + def _validate_xml_structure(self, content: str) -> str: + """Validate and fix XML structure""" + # Ensure proper nesting + # Validate against schema + # Fix common issues + return content + + def _cleanup_empty_elements(self, content: str) -> str: + """Remove or fix empty elements""" + # Remove empty elements + content = re.sub(r'<(\w+)[^>]*>', '', content) + return content + + def _finalize_metadata(self, content: str) -> str: + """Finalize metadata and add to content""" + # Add final metadata + # Ensure proper document structure + return content + + # ============================================================================ + # MAIN PROCESSING METHODS + # ============================================================================ + + def process_wikitext(self, wikitext: str) -> ConversionResult: + """ + Main method to process MediaWiki wikitext to XML. + + Args: + wikitext: The MediaWiki content to convert + + Returns: + ConversionResult with XML content and metadata + """ + warnings = [] + errors = [] + metadata = {} + + try: + # Preprocessing + content = wikitext + for preprocessor in self.preprocessors: + if preprocessor == self._extract_metadata: + metadata.update(preprocessor(content)) + else: + content = preprocessor(content) + + # Parse MediaWiki content + parsed = mwparserfromhell.parse(content) + + # Process all nodes with nested content support + nodes_to_replace = [] + + # Process nodes in the order they appear in the document + for node in parsed.nodes: + if hasattr(node, 'name'): # Template + template_name = str(node.name).strip() + if template_name in self.template_handlers: + try: + # Process nested content within the template + processed_node = self._process_template_with_nesting(node) + replacement = self.template_handlers[template_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing template {template_name}: {str(e)}") + else: + warnings.append(f"Unknown template: {template_name}") + + elif hasattr(node, 'tag'): # Tag + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + # Process nested content within the tag + processed_node = self._process_tag_with_nesting(node) + replacement = self.tag_handlers[tag_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing tag {tag_name}: {str(e)}") + else: + warnings.append(f"Unknown tag: {tag_name}") + + elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink + try: + replacement = self._handle_wikilink(node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing wikilink: {str(e)}") + + # Replace all nodes in order + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + # Get processed content + xml_content = str(parsed) + + # Postprocessing + for postprocessor in self.postprocessors: + xml_content = postprocessor(xml_content) + + # Wrap in mediawiki tag + xml_content = f'{xml_content}' + + return ConversionResult( + xml_content=xml_content, + metadata=metadata, + warnings=warnings, + errors=errors, + wikilinks=self.wikilinks.copy() + ) + + except Exception as e: + errors.append(f"Fatal error in processing: {str(e)}") + return ConversionResult( + xml_content="", + metadata={}, + warnings=warnings, + errors=errors, + wikilinks=[] + ) + + def add_template_handler(self, template_name: str, handler_func): + """Add a custom template handler""" + self.template_handlers[template_name] = handler_func + + def add_tag_handler(self, tag_name: str, handler_func): + """Add a custom tag handler""" + self.tag_handlers[tag_name] = handler_func + + def add_preprocessor(self, preprocessor_func): + """Add a custom preprocessor""" + self.preprocessors.append(preprocessor_func) + + def add_postprocessor(self, postprocessor_func): + """Add a custom postprocessor""" + self.postprocessors.append(postprocessor_func) + + def get_wikilinks(self) -> List[Dict[str, Any]]: + """Get all captured wikilinks""" + return self.wikilinks.copy() + + def clear_wikilinks(self): + """Clear all captured wikilinks""" + self.wikilinks.clear() + + +# ============================================================================ +# CONVENIENCE FUNCTIONS +# ============================================================================ + +def create_processor() -> MediaWikiProcessor: + """Create a new MediaWiki processor instance""" + return MediaWikiProcessor() + + +def process_page(page_content: str) -> ConversionResult: + """Process a single page of MediaWiki content""" + processor = create_processor() + return processor.process_wikitext(page_content) + + +def _demo_main() -> None: # pragma: no cover + processor = create_processor() + sample_wikitext = """ + {{verse|1|1|In the beginning God created the heaven and the earth.}} + + {{verse|1|2|And the earth was without form, and void; and darkness was upon the face of the deep.}} + + {{sc|Genesis}} {{c|Chapter 1}} + {{larger|The Creation}} + This is a reference + + See also [[Genesis]] and [[Creation myth]] for more information. + + Nested example: {{sc|{{larger|Bold Large Text}}}} + Complex nested: {{verse|1|3|{{sc|God}} said, {{larger|Let there be light}}}} + """ + result = processor.process_wikitext(sample_wikitext) + print("XML Output:") + print(result.xml_content) + print("\nWarnings:", result.warnings) + print("Errors:", result.errors) + print("Wikilinks:", result.wikilinks) + + +if __name__ == "__main__": # pragma: no cover + _demo_main() diff --git a/opensiddur/importer/util/pages.py b/opensiddur/importer/util/pages.py index 117d25a..6f33d4e 100644 --- a/opensiddur/importer/util/pages.py +++ b/opensiddur/importer/util/pages.py @@ -29,6 +29,21 @@ def jps1917_credits_directory(sourcetexts_root: Path | None = None) -> Path: return jps1917_data_directory(sourcetexts_root) / "credits" +def miqra_al_pi_hamasorah_data_directory(sourcetexts_root: Path | None = None) -> Path: + """Miqra al pi ha-Masorah raw dumps: /miqra_al_pi_hamasorah.""" + root = ( + sourcetexts_root.resolve() + if sourcetexts_root is not None + else default_sourcetexts_root() + ) + return root / "miqra_al_pi_hamasorah" + + +def miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root: Path | None = None) -> Path: + """Per-tab TSV files from the Google Sheet export.""" + return miqra_al_pi_hamasorah_data_directory(sourcetexts_root) / "sheets" + + def get_page(page_number: str | int, sourcetexts_root: Path | None = None) -> Optional[Page]: """Return the wikitext of the given Page, or None if it does not exist.""" page_num = int(page_number) diff --git a/opensiddur/importer/wlc/download_tanach.py b/opensiddur/importer/wlc/download_tanach.py index f3f74e6..1790156 100644 --- a/opensiddur/importer/wlc/download_tanach.py +++ b/opensiddur/importer/wlc/download_tanach.py @@ -72,9 +72,13 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +def _run_cli() -> None: # pragma: no cover try: sys.exit(main()) except Exception as e: logger.error("Error downloading/unzipping Tanach: %s", e) raise + + +if __name__ == "__main__": # pragma: no cover + _run_cli() diff --git a/opensiddur/importer/wlc/wlc.py b/opensiddur/importer/wlc/wlc.py index bc69f1b..48f8019 100644 --- a/opensiddur/importer/wlc/wlc.py +++ b/opensiddur/importer/wlc/wlc.py @@ -93,5 +93,5 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover sys.exit(main()) diff --git a/opensiddur/tests/exporter/test_latex.py b/opensiddur/tests/exporter/test_latex.py index aa9f537..ffd14b0 100644 --- a/opensiddur/tests/exporter/test_latex.py +++ b/opensiddur/tests/exporter/test_latex.py @@ -242,6 +242,21 @@ def test_dedupes_when_multiple_files_share_index(self): preamble, _ = extract_sources([f1, f2]) self.assertEqual(preamble.count("@"), 1) + def test_bibtex_wraps_hebrew_fields_in_texthebrew(self): + index = """ + + + + ืžืงืจื ืขืœ ืคื™ ื”ืžืกื•ืจื” + Avi Kadish + + + """.encode("utf-8") + doc = self._create("p", "doc.xml", b"") + self._create("p", "index.xml", index) + preamble, _ = extract_sources([doc]) + self.assertIn(r"title = {\texthebrew{ืžืงืจื ืขืœ ืคื™ ื”ืžืกื•ืจื”}}", preamble) + class TestGetFileReferences(unittest.TestCase): diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index e67d5fe..3aca2e8 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -533,7 +533,7 @@ def test_lb_emits_leavevmode_linebreak(self): """ out = _transform(xml) - self.assertIn(r"\leavevmode\\", out) + self.assertIn(r"\leavevmode\\{}", out) class TestStructuralElements(unittest.TestCase): @@ -565,8 +565,44 @@ def test_div_head_emits_sectioning(self): """ out = _transform(xml) - # Top-level body div with head โ†’ \eledchapter - self.assertIn(r"\eledchapter{Genesis}", out) + # Top-level body div with head โ†’ \eledchapter (LTR wrapper when not Hebrew) + self.assertIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}", + out, + ) + + def test_english_head_in_hebrew_document_uses_ltr_wrapper(self): + xml = """ + + + + Genesis + ื‘ึฐึผืจึตืืฉึดืื™ืช + + + """ + out = _transform(xml) + self.assertIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}", + out, + ) + + def test_hebrew_head_in_hebrew_document_has_no_ltr_wrapper(self): + xml = """ + + + + ื‘ืจืืฉื™ืช + ื‘ึฐึผืจึตืืฉึดืื™ืช + + + """ + out = _transform(xml) + self.assertIn(r"\eledchapter{ื‘ืจืืฉื™ืช}", out) + self.assertNotIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}ื‘ืจืืฉื™ืช}}", + out, + ) if __name__ == "__main__": diff --git a/opensiddur/tests/fixtures/miqra_minimal.xlsx b/opensiddur/tests/fixtures/miqra_minimal.xlsx new file mode 100644 index 0000000..fbb6dd8 Binary files /dev/null and b/opensiddur/tests/fixtures/miqra_minimal.xlsx differ diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py new file mode 100644 index 0000000..a2be3e5 --- /dev/null +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py @@ -0,0 +1,148 @@ +import unittest +from pathlib import Path +from unittest.mock import patch +import tempfile + + +from opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv import ( + _extract_chapter_verse_numbers, + main, +) + + +class TestMiqraConvertTsv(unittest.TestCase): + @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate") + def test_only_book_writes_output(self, mock_validate): + mock_validate.return_value = (True, []) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + sourcetexts_root = tmp_path / "sources" + sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets" + sheets_dir.mkdir(parents=True, exist_ok=True) + + # Minimal README (front matter) + (sheets_dir / "readme.tsv").write_text( + "License\tCC-BY-SA 4.0\nAttribution\tHebrew Wikisource\n", + encoding="utf-8", + ) + + # Torah TSV: parashah in nav + two verses in one paragraph + (sheets_dir / "torah.tsv").write_text( + "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"]) + + "\n" + + "\t".join( + [ + "ืกืคืจ ื‘ืจืืฉื™ืช/ื", + "ื", + "//{{ืคืค}}//", + "{{ืž:ืคืกื•ืง|ื‘ืจืืฉื™ืช|1|1}}", + '{{ื ื•ืกื—|{{ืž:ืื•ืช-ื’|ื‘ึผึฐ}}ืจึตืืฉืึดึ–ื™ืช|2=test note}}', + ] + ) + + "\n" + + "\t".join( + [ + "ืกืคืจ ื‘ืจืืฉื™ืช/ื", + "ื‘", + "", + "{{ืž:ืคืกื•ืง|ื‘ืจืืฉื™ืช|1|2}}", + "ื•ึฐื”ึธืึธึ—ืจึถืฅ ื”ึธื™ึฐืชึธึฅื” ืชึนึ™ื”ื•ึผึ™ ื•ึธื‘ึนึ”ื”ื•ึผ", + ] + ) + + "\n", + encoding="utf-8", + ) + + project_dir = tmp_path / "project" + rc = main( + [ + "--sourcetexts-root", + str(sourcetexts_root), + "--project-dir", + str(project_dir), + "--only-book", + "genesis", + ] + ) + self.assertEqual(rc, 0) + + genesis_xml = project_dir / "genesis.xml" + self.assertTrue(genesis_xml.exists()) + xml = genesis_xml.read_text(encoding="utf-8") + self.assertIn("", xml) + self.assertIn('', xml) + self.assertIn("ื•ึฐื”ึธืึธึ—ืจึถืฅ", xml) + self.assertIn('', xml) + self.assertIn("Genesis", xml) + self.assertIn('rend="large"', xml) + self.assertIn("ื‘ึผึฐ", xml) + self.assertIn("tei:standOff", xml) + self.assertIn("test note", xml) + # Standoff notes must link to the in-text marker for annotation resolution + self.assertIn('target="#miqra-note-1-ref', xml) + + def test_special_tsv_row_does_not_produce_invalid_urn_segments(self): + # special.tsv uses a 2-column schema; must not be merged into book output. + ch, v = _extract_chapter_verse_numbers( + "ืกืคืจ ืฉืžื•ืช/ื˜ื• ืชืชืช", + "{{#ืงื˜ืข:ืฉื™ืจืช ื”ื™ื/ืฆื•ืจืช ื”ืฉื™ืจ|ืฆื•ืจืช-ื”ืฉื™ืจ}}{{ืž:ื˜ืขืžื™", + "", + ) + self.assertEqual(ch, "") + self.assertEqual(v, "") + + @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate") + def test_special_tsv_not_merged_into_book(self, mock_validate): + mock_validate.return_value = (True, []) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + sourcetexts_root = tmp_path / "sources" + sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets" + sheets_dir.mkdir(parents=True, exist_ok=True) + + (sheets_dir / "torah.tsv").write_text( + "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"]) + + "\n" + + "\t".join( + [ + "ืกืคืจ ืฉืžื•ืช/ื˜ื•", + "ื", + "", + "{{ืž:ืคืกื•ืง|ืฉืžื•ืช|15|1}}", + "ืฉื™ืจื”", + ] + ) + + "\n", + encoding="utf-8", + ) + (sheets_dir / "special.tsv").write_text( + "ืกืคืจ ืฉืžื•ืช/ื˜ื• ืชืชืช\t{{#ืงื˜ืข:ืฉื™ืจืช ื”ื™ื/ืฆื•ืจืช ื”ืฉื™ืจ|ืฆื•ืจืช-ื”ืฉื™ืจ}}{{ืž:ื˜ืขืžื™\n", + encoding="utf-8", + ) + + project_dir = tmp_path / "project" + main( + [ + "--sourcetexts-root", + str(sourcetexts_root), + "--project-dir", + str(project_dir), + "--only-book", + "exodus", + ] + ) + xml = (project_dir / "exodus.xml").read_text(encoding="utf-8") + self.assertIn("urn:x-opensiddur:text:bible:exodus/15/1", xml) + self.assertNotIn("ืฆื•ืจืช-ื”ืฉื™ืจ", xml) + self.assertNotIn("ื”ืฉื™ืจ|", xml) + + +if __name__ == "__main__": + unittest.main() + diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py new file mode 100644 index 0000000..6e4c498 --- /dev/null +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py @@ -0,0 +1,106 @@ +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from opensiddur.importer.miqra_al_pi_hamasorah import download +from opensiddur.importer.util.pages import ( + miqra_al_pi_hamasorah_data_directory, + miqra_al_pi_hamasorah_sheets_directory, +) + +FIXTURE_XLSX = ( + Path(__file__).resolve().parents[2] / "fixtures" / "miqra_minimal.xlsx" +) + + +class TestDownloadMiqra(unittest.TestCase): + def setUp(self) -> None: + self.tmp = tempfile.TemporaryDirectory() + self.sourcetexts_root = Path(self.tmp.name) + + def tearDown(self) -> None: + self.tmp.cleanup() + + def _mock_response(self) -> MagicMock: + response = MagicMock() + response.raise_for_status = MagicMock() + response.content = FIXTURE_XLSX.read_bytes() + return response + + @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get") + def test_download_writes_tsv_and_manifest(self, mock_get: MagicMock) -> None: + mock_get.return_value = self._mock_response() + + download.download_miqra(self.sourcetexts_root) + + data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root) + sheets_dir = miqra_al_pi_hamasorah_sheets_directory(self.sourcetexts_root) + + torah_tsv = sheets_dir / "torah.tsv" + readme_tsv = sheets_dir / "readme.tsv" + self.assertTrue(torah_tsv.is_file()) + self.assertTrue(readme_tsv.is_file()) + self.assertFalse((sheets_dir / "unknowntab.tsv").exists()) + + torah_lines = torah_tsv.read_text(encoding="utf-8").splitlines() + self.assertEqual(len(torah_lines), 2) + self.assertIn("ื‘ึฐึผืจึตืืฉึดืื™ืช", torah_lines[1]) + + manifest_path = data_dir / "manifest.json" + self.assertTrue(manifest_path.is_file()) + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + self.assertEqual(manifest["spreadsheet_id"], download.SPREADSHEET_ID) + slugs = {s["slug"] for s in manifest["sheets"]} + self.assertIn("torah", slugs) + self.assertIn("readme", slugs) + for entry in manifest["sheets"]: + self.assertIn("sha256", entry) + self.assertEqual(len(entry["sha256"]), 64) + + xlsx_files = list(data_dir.glob("*.xlsx")) + self.assertEqual(xlsx_files, []) + + mock_get.assert_called_once() + call_kwargs = mock_get.call_args + self.assertEqual(call_kwargs[0][0], download.EXPORT_XLSX_URL) + self.assertIn("User-Agent", call_kwargs[1]["headers"]) + + @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get") + def test_dry_run_writes_nothing(self, mock_get: MagicMock) -> None: + download.download_miqra(self.sourcetexts_root, dry_run=True) + + data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root) + self.assertFalse(data_dir.exists()) + mock_get.assert_not_called() + + @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.logger") + @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get") + def test_unknown_sheet_logs_warning( + self, mock_get: MagicMock, mock_logger: MagicMock + ) -> None: + mock_get.return_value = self._mock_response() + download.download_miqra(self.sourcetexts_root) + + warning_calls = [ + c + for c in mock_logger.warning.call_args_list + if "UnknownTab" in str(c) + ] + self.assertEqual(len(warning_calls), 1) + + def test_main_dry_run_exit_code(self) -> None: + with patch( + "opensiddur.importer.miqra_al_pi_hamasorah.download.download_miqra" + ) as mock_download: + code = download.main( + ["--dry-run", "--sourcetexts-root", str(self.sourcetexts_root)] + ) + self.assertEqual(code, 0) + mock_download.assert_called_once() + self.assertTrue(mock_download.call_args.kwargs["dry_run"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py new file mode 100644 index 0000000..8fcc69a --- /dev/null +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py @@ -0,0 +1,389 @@ +import unittest + +from opensiddur.importer.miqra_al_pi_hamasorah.miqra_wikitext import ( + _escape_outside_tags, + _preprocess_column_c, + _preprocess_miqra_tags, + _wikitext_basic_markup_to_xml, + _xml_escape, + link_target_to_uri, + normalize_template_name, + reset_processor, + wikitext_to_intermediate_xml, +) + + +class TestNormalizeTemplateName(unittest.TestCase): + def test_strips_whitespace(self): + self.assertEqual(normalize_template_name(" ืคืค "), "ืคืค") + + def test_strips_tevnit_prefix(self): + self.assertEqual(normalize_template_name("ืชื‘ื ื™ืช:ืž:ื˜ืขื"), "ืž:ื˜ืขื") + self.assertEqual(normalize_template_name("ืชื‘ื ื™ืช:ื ื•ืกื—"), "ื ื•ืกื—") + + def test_normalizes_quotes(self): + self.assertEqual(normalize_template_name("ืž:ื›ื•''ืง"), 'ืž:ื›ื•"ืง') + self.assertEqual( + normalize_template_name("ืž:ืงื•ืดื›"), + 'ืž:ืงื•"ื›', + ) + + +class TestLinkTargetToUri(unittest.TestCase): + def test_empty_target(self): + self.assertEqual(link_target_to_uri(""), "") + self.assertEqual(link_target_to_uri(" "), "") + + def test_protocol_relative_url(self): + self.assertEqual( + link_target_to_uri("//cdn.example.com/x.pdf"), + "https://cdn.example.com/x.pdf", + ) + + def test_fragment_preserved(self): + uri = link_target_to_uri("ื“ืฃ#ืคืจืง") + self.assertIn("#", uri) + self.assertTrue(uri.startswith("https://he.wikisource.org/wiki/")) + + +class TestPreprocessors(unittest.TestCase): + def test_column_c_double_underscore(self): + self.assertEqual(_preprocess_column_c("a__b"), "a b") + + def test_column_c_line_break(self): + self.assertEqual( + _preprocess_column_c("http://host/path"), + "http://host/path", + ) + self.assertEqual( + _preprocess_column_c("https://host/path"), + "https://host/path", + ) + self.assertIn("", _preprocess_column_c("ืฉื•ืจื”//ื”ืžืฉืš")) + + def test_miqra_keteg_tags(self): + s = "<ืงื˜ืข ื”ืชื—ืœื”=foo/>text<ืงื˜ืข ืกื•ืฃ=foo/>" + out = _preprocess_miqra_tags(s) + self.assertIn('', out) + self.assertIn('', out) + + +class TestMarkupAndEscape(unittest.TestCase): + def test_xml_escape(self): + self.assertEqual( + _xml_escape('a & b "d" \'e\''), + "a & b <c> "d" 'e'", + ) + + def test_wikitext_bold_italic(self): + self.assertEqual( + _wikitext_basic_markup_to_xml("plain '''bold''' ''italic''"), + 'plain bold italic', + ) + + def test_wikitext_bold_italic_combined(self): + self.assertIn( + 'rend="bold-italic"', + _wikitext_basic_markup_to_xml("'''''both'''''"), + ) + + def test_escape_outside_tags_preserves_miqra_elements(self): + inner = _escape_outside_tags( + "plain ื '''bold'''" + ) + self.assertIn("", inner) + self.assertIn("ื", inner) + self.assertIn('rend="bold"', inner) + + def test_wikitext_markup_in_verse_via_integration(self): + frag = wikitext_to_intermediate_xml("'''ื“ื‘ืจ'''") + self.assertIn('', frag) + self.assertIn("ื“ื‘ืจ", frag) + + +class TestMiqraWikitext(unittest.TestCase): + def setUp(self): + reset_processor() + + def test_nosach_nested_large_letter(self): + frag = wikitext_to_intermediate_xml( + '{{ื ื•ืกื—|{{ืž:ืื•ืช-ื’|ื‘ึผึฐ}}ืจึตืืฉืึดึ–ื™ืช|2=note text}}' + ) + self.assertIn("', frag) + self.assertIn("ื‘ึผึฐ", frag) + self.assertIn("', frag) + self.assertIn("ื›ืชื™ื‘", frag) + self.assertIn("ืงึฐืจึดื™", frag) + + def test_qeri_ketiv(self): + frag = wikitext_to_intermediate_xml('{{ืงื•"ื›|ื›ืชื™ื‘|ืงึฐืจึดื™}}') + self.assertIn('order="qeri-first"', frag) + + def test_parashah_open(self): + frag = wikitext_to_intermediate_xml("{{ืคืค}}") + self.assertIn('(ื›ืชื™ื‘)", k) + self.assertIn("[ืงึฐืจึดื™]", q) + + def test_qok_if_matres(self): + frag = wikitext_to_intermediate_xml( + '{{ืž:ืงื•"ื›-ืื-2|display|ื›ืชื™ื‘|ืงึฐืจึดื™}}' + ) + self.assertIn("display", frag) + self.assertIn("", frag) + self.assertIn("ื›ืชื™ื‘", frag) + self.assertIn("ืงึฐืจึดื™", frag) + + def test_qok_two_qeri_words(self): + frag = wikitext_to_intermediate_xml( + '{{ืž:ืงื•"ื› ืงืจื™ ืฉื•ื ื” ืžื”ื›ืชื™ื‘ ื‘ืฉืชื™ ืžื™ืœื™ื|ื›ืชื™ื‘|ืง1|ืง2}}' + ) + self.assertIn('type="split-qeri"', frag) + self.assertIn("ืง1", frag) + self.assertIn("ืง2", frag) + self.assertIn("ื›ืชื™ื‘", frag) + + def test_parashah_variants(self): + cases = [ + ("{{ืคืคืค}}", 'type="open-line"'), + ("{{ืกืก}}", 'type="close"'), + ("{{ืกืกืก}}", 'type="close-inline"'), + ("{{ืกืก2}}", 'type="close-narrow"'), + ("{{ืž:ืฉืฉืฉ}}", 'type="shirah"'), + ] + for wikitext, expected in cases: + with self.subTest(wikitext=wikitext): + self.assertIn(expected, wikitext_to_intermediate_xml(wikitext)) + + def test_parashah_mid_verse_attribute(self): + frag = wikitext_to_intermediate_xml("{{ืคืค|ืคืกืงื ื‘ืืžืฆืข ืคืกื•ืง}}") + self.assertIn('midVerse="true"', frag) + + def test_poetic_levels(self): + for level, template in enumerate(("ืจ0", "ืจ1", "ืจ2", "ืจ3", "ืจ4")): + frag = wikitext_to_intermediate_xml(f"{{{{{template}}}}}") + self.assertIn(f'', frag) + + def test_centered_title(self): + frag = wikitext_to_intermediate_xml("{{ืคืจืฉื”-ืžืจื›ื–|ื›ื•ืชืจืช}}") + self.assertIn("ื›ื•ืชืจืช", frag) + + def test_letter_formatting(self): + frag = wikitext_to_intermediate_xml( + "{{ืž:ืื•ืช-ืง|ืง}}{{ืž:ืื•ืช ืชืœื•ื™ื”|ืช}}{{ืž:ืื•ืช ืžื ื•ืงื“ืช|ืž}}{{ืž:ื ื•\"ืŸ ื”ืคื•ื›ื”|ืŸ}}" + ) + self.assertIn('rend="small"', frag) + self.assertIn('rend="raised"', frag) + self.assertIn("", frag) + self.assertIn("", frag) + + def test_yerushalem_variants(self): + y = wikitext_to_intermediate_xml("{{ืž:ื™ืจื•ืฉืœื|v|a}}") + ya = wikitext_to_intermediate_xml("{{ืž:ื™ืจื•ืฉืœืžื”|v|a}}") + self.assertIn('', y) + self.assertIn('', ya) + + def test_standalone_accents(self): + frag = wikitext_to_intermediate_xml( + "{{ื™ืจื— ื‘ืŸ ื™ื•ืžื•}}{{ื’ืœื’ืœ}}{{ืืชื ื— ื”ืคื•ืš}}" + ) + self.assertIn('type="yerah-ben-yomo"', frag) + self.assertIn('type="galgal"', frag) + self.assertIn('type="etnah-hafukh"', frag) + + def test_taam_handlers(self): + frag = wikitext_to_intermediate_xml( + "{{ืž:ื˜ืขื ื•ืžืชื’ ื‘ืื•ืช ืื—ืช|ื}}" + "{{ืฉื ื™ ื˜ืขืžื™ื ื‘ืื•ืช ืื—ืช}}" + "{{ืž:ื’ืจืฉ ื•ืชืœื™ืฉื ื’ื“ื•ืœื”}}" + "{{ืž:ื’ืจืฉื™ื™ื ื•ืชืœื™ืฉื ื’ื“ื•ืœื”}}" + ) + self.assertIn("ื", frag) + self.assertIn('type="geresh-telisha-gedola"', frag) + self.assertIn('type="gershayim-telisha-gedola"', frag) + + def test_qamats_named_params(self): + frag = wikitext_to_intermediate_xml("{{ืž:ืงืžืฅ|ื“=ื“ึธึผ}}") + self.assertIn("ื“ึธึผ", frag) + + def test_taam_dummy_strips_leading_marker(self): + frag = wikitext_to_intermediate_xml("{{ืž:ื˜ืขื|Xืื•ืช}}") + self.assertIn("ืื•ืช", frag) + self.assertNotIn("Xืื•ืช", frag) + + def test_qupo_accent(self): + frag = wikitext_to_intermediate_xml( + "{{ืฉื ื™ ื˜ืขืžื™ื ื‘ืื•ืช ืื—ืช ืงืžืฅ-ืชื—ืชื•ืŸ-ืคืชื—-ืขืœื™ื•ืŸ|ืขืœื™ื•=ื}}" + ) + self.assertIn('', frag) + + def test_punctuation_and_maqaf(self): + frag = wikitext_to_intermediate_xml( + "{{ืž:ืœื’ืจืžื™ื”-2}}{{ืž:ืคืกืง}}{{ืž:ืžืงืฃ ืืคื•ืจ}}" + ) + self.assertIn('type="legarmeh"', frag) + self.assertIn('type="paseq"', frag) + self.assertIn('rend="grey"', frag) + + def test_kol_qamats_default(self): + self.assertIn("ื›ึผึธืœ", wikitext_to_intermediate_xml("{{ืž:ื›ืœ ืงืžืฅ ืงื˜ืŸ ืžืจื›ื}}")) + + def test_notes_and_anchors(self): + frag = wikitext_to_intermediate_xml( + "{{ืž:ื”ืขืจื”|ื’ื•ืฃ ื”ืขืจื”}}{{ืขื•ื’ืŸ ื‘ืฉื•ืจื”|label}}" + "{{ืž:ืกื™ื•ื ื‘ื˜ื•ื‘|ืกื•ืฃ ื˜ื•ื‘}}" + ) + self.assertIn("', frag) + self.assertIn("ืกื•ืฃ ื˜ื•ื‘", frag) + + def test_dual_trope_and_accent(self): + frag = wikitext_to_intermediate_xml( + "{{ืงืง|target}}" + "{{ืž:ื›ืคื•ืœ|ื›ืคื•ืœ=ื“|ื=ื|ื‘=ื‘}}" + ) + self.assertIn("target", frag) + self.assertIn('', frag) + self.assertIn('role="ื"', frag) + self.assertIn('role="ื‘"', frag) + + def test_emphasis_and_footnote_mark(self): + frag = wikitext_to_intermediate_xml("{{ืžื•ื“ื’ืฉ|ื—ืฉื•ื‘}}{{ืฉ}}") + self.assertIn('ื—ืฉื•ื‘', frag) + self.assertIn("", frag) + + def test_wikilink(self): + frag = wikitext_to_intermediate_xml("[[ื“ืฃ]] and [[ื“ืฃ|ืชื•ื•ื™ืช]]") + self.assertIn('hiddenstill" + ) + self.assertIn("visible", frag) + self.assertIn("still", frag) + self.assertNotIn("hidden", frag) + + def test_keteg_segments_in_wikitext(self): + frag = wikitext_to_intermediate_xml("<ืงื˜ืข ื”ืชื—ืœื”=seg/>") + self.assertIn('', frag) + + def test_column_c_line_break_integration(self): + frag = wikitext_to_intermediate_xml("ื//ื‘", column_c=True) + self.assertIn("", frag) + + def test_nosach_without_note(self): + frag = wikitext_to_intermediate_xml("{{ื ื•ืกื—|ื˜ืงืกื˜}}") + self.assertEqual(frag, "ื˜ืงืกื˜") + self.assertNotIn("=2.0.0", "diff-match-patch>=20241021", "pydantic>=2.11.7", + "openpyxl>=3.1.5", ] [project.urls] diff --git a/uv.lock b/uv.lock index 8f7acaa..60f0375 100644 --- a/uv.lock +++ b/uv.lock @@ -605,6 +605,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/5e/4b5aaaabddfacfe36ba7768817bd1f71a7a810a43705e531f3ae4c690767/emoji-2.15.0-py3-none-any.whl", hash = "sha256:205296793d66a89d88af4688fa57fd6496732eb48917a87175a023c8138995eb", size = 608433, upload-time = "2025-09-21T12:13:01.197Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "executing" version = "2.2.1" @@ -1840,6 +1849,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/32/37734d769bc8b42e4938785313cc05aade6cb0fa72479d3220a0d61a4e78/openai-2.33.0-py3-none-any.whl", hash = "sha256:03ac37d70e8c9e3a8124214e3afa785e2cbc12e627fbd98177a086ef2fd87ad5", size = 1162695, upload-time = "2026-04-28T14:04:40.482Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "opensiddur-ai" version = "0.1.0" @@ -1856,6 +1877,7 @@ dependencies = [ { name = "markdown" }, { name = "mwparserfromhell" }, { name = "openai" }, + { name = "openpyxl" }, { name = "pydantic" }, { name = "pyppeteer" }, { name = "requests" }, @@ -1885,6 +1907,7 @@ requires-dist = [ { name = "markdown", specifier = ">=3.9" }, { name = "mwparserfromhell", specifier = ">=0.7.2" }, { name = "openai", specifier = ">=1.101.0" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "pyppeteer", specifier = ">=2.0.0" }, { name = "requests", specifier = ">=2.32.4" },