From 29d1cd921514c3b498b45998cfb08ea747d9a094 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 27 May 2026 00:00:06 -0700 Subject: [PATCH 01/10] miqra al pi hamasorah download --- README.md | 7 + .../importer/miqra_al_pi_hamasorah/README.md | 65 +++++ .../miqra_al_pi_hamasorah/__init__.py | 0 .../miqra_al_pi_hamasorah/download.py | 223 ++++++++++++++++++ opensiddur/importer/util/pages.py | 15 ++ opensiddur/tests/fixtures/miqra_minimal.xlsx | Bin 0 -> 6007 bytes .../miqra_al_pi_hamasorah/__init__.py | 0 .../miqra_al_pi_hamasorah/test_download.py | 106 +++++++++ pyproject.toml | 1 + uv.lock | 23 ++ 10 files changed, 440 insertions(+) create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/README.md create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/__init__.py create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/download.py create mode 100644 opensiddur/tests/fixtures/miqra_minimal.xlsx create mode 100644 opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py create mode 100644 opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py diff --git a/README.md b/README.md index b1b8132..c997a3a 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,13 @@ uv run python -m opensiddur.importer.jps1917.convert_wikisource \ --project-dir ~/src/opensiddur-repos/opensiddur-projects/project/jps1917 ``` +Example: download Miqra al pi ha-Masorah from Google Sheets into sourcetexts: + +```bash +uv run python -m opensiddur.importer.miqra_al_pi_hamasorah.download \ + --sourcetexts-root ~/src/opensiddur-repos/sourcetexts/sources +``` + ## JLPTEI sources JLPTEI sources are compiled into the `project` directory. 
diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/README.md b/opensiddur/importer/miqra_al_pi_hamasorah/README.md new file mode 100644 index 0000000..19b3cfe --- /dev/null +++ b/opensiddur/importer/miqra_al_pi_hamasorah/README.md @@ -0,0 +1,65 @@ +# Miqra al pi ha-Masorah importer (download) + +Scripts to download [*Miqra according to the Masorah*](https://docs.google.com/spreadsheets/d/1mkQyj6by1AtBUabpbaxaZq9Z2X3pX8ZpwG91ZCSOEYs/edit) from its public Google Sheet and prepare per-tab TSV files for a future JLPTEI importer. + +## License + +The README tab of the source spreadsheet states that the text is prepared by Sefer Avi Kadish, based on Hebrew Wikisource material, and is licensed **CC-BY-SA 4.0 International**, with attribution to Hebrew Wikisource. See the downloaded `sheets/readme.tsv` for the full Hebrew and English wording. + +## Download + +Prerequisites: clone [opensiddur/sourcetexts](https://github.com/opensiddur/sourcetexts) (or use `/sources`). + +```bash +uv run python -m opensiddur.importer.miqra_al_pi_hamasorah.download \ + --sourcetexts-root ~/src/opensiddur-repos/sourcetexts/sources +``` + +Use `--dry-run` to print paths without downloading. + +Output layout (relative to the directory passed as `--sourcetexts-root`): + +``` +SOURCETEXTS_ROOT/miqra_al_pi_hamasorah/ + manifest.json + sheets/ + torah.tsv + neviim_rishonim.tsv + … +``` + +The script downloads the workbook once as XLSX, splits each known tab to UTF-8 TSV, writes `manifest.json` (SHA-256 checksums plus row and column counts), and deletes the temporary workbook. 
+ +## Worksheet → file mapping + +| Tab | Output file | +|-----|-------------| +| שינויים changes | `changes.tsv` | +| README | `readme.tsv` | +| כתובים אחרונים | `ketuvim_aharonim.tsv` | +| חמש מגילות | `chamisha_megillot.tsv` | +| ספרי אמ"ת | `sifrei_emet.tsv` | +| נביאים אחרונים | `neviim_acharonim.tsv` | +| נביאים ראשונים | `neviim_rishonim.tsv` | +| תורה | `torah.tsv` | +| תבניות templates | `templates.tsv` | +| מיוחד special | `special.tsv` | +| AutoEdits | `auto_edits.tsv` | + +## Biblical text columns + +On the six biblical-book tabs (Torah, Nevi'im, Ketuvim, etc.), each data row uses: + +| Column | Role | +|--------|------| +| A | Page key (e.g. `ספר בראשית/א`) | +| B | Row id (`0` = section header; Hebrew letters = verses) | +| C | Navigation / header wikitext | +| D | Verse scaffolding (`{{מ:פסוק|…}}`) | +| E | Pointed Hebrew text and `{{נוסח|…}}` templates | + +Content is Hebrew Wikisource-style wikitext, related to the [JPS 1917](../jps1917/) importer pipeline. + +## Importer status + +Only the download step is implemented. A JLPTEI converter will read `sheets/*.tsv` in a later change. 
diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/__init__.py b/opensiddur/importer/miqra_al_pi_hamasorah/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/download.py b/opensiddur/importer/miqra_al_pi_hamasorah/download.py
new file mode 100644
index 0000000..342ba83
--- /dev/null
+++ b/opensiddur/importer/miqra_al_pi_hamasorah/download.py
@@ -0,0 +1,256 @@
+"""Download Miqra al pi ha-Masorah from Google Sheets into per-tab TSV files."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import json
+import logging
+import sys
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import requests
+from openpyxl import load_workbook
+
+from opensiddur.importer.util.pages import (
+    default_sourcetexts_root,
+    miqra_al_pi_hamasorah_data_directory,
+    miqra_al_pi_hamasorah_sheets_directory,
+)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SPREADSHEET_ID = "1mkQyj6by1AtBUabpbaxaZq9Z2X3pX8ZpwG91ZCSOEYs"
+SOURCE_URL = (
+    f"https://docs.google.com/spreadsheets/d/{SPREADSHEET_ID}/edit"
+)
+EXPORT_XLSX_URL = (
+    f"https://docs.google.com/spreadsheets/d/{SPREADSHEET_ID}/export?format=xlsx"
+)
+USER_AGENT = (
+    "OpenSiddur-AI/1.0 (https://github.com/opensiddur/opensiddur-ai; "
+    "opensiddur@example.com)"
+)
+
+# Exact worksheet titles from the workbook → output slug (without .tsv).
+SHEET_SLUGS: dict[str, str] = {
+    "שינויים changes": "changes",
+    "README": "readme",
+    "כתובים אחרונים": "ketuvim_aharonim",
+    "חמש מגילות": "chamisha_megillot",
+    "ספרי אמ\"ת": "sifrei_emet",
+    "נביאים אחרונים": "neviim_acharonim",
+    "נביאים ראשונים": "neviim_rishonim",
+    "תורה": "torah",
+    "תבניות templates": "templates",
+    "מיוחד special": "special",
+    "AutoEdits": "auto_edits",
+}
+
+
+def _cell_value(value: object) -> str:
+    """Return *value* as text, mapping an empty (``None``) cell to ``""``."""
+    if value is None:
+        return ""
+    return str(value)
+
+
+def _worksheet_rows(worksheet: Any) -> tuple[list[list[str]], int, int]:
+    """Return ``(rows, row_count, max_columns)`` for a worksheet.
+
+    Trailing empty cells are trimmed from each row and rows with no content
+    are dropped; the remaining rows are right-padded with ``""`` so that
+    every row has ``max_columns`` cells.
+    """
+    rows: list[list[str]] = []
+    for row in worksheet.iter_rows(values_only=True):
+        cells = [_cell_value(c) for c in row]
+        while cells and cells[-1] == "":
+            cells.pop()
+        # After trimming, a non-empty list always ends in a non-empty cell,
+        # so an empty list is exactly the blank-row case.
+        if cells:
+            rows.append(cells)
+    if not rows:
+        return [], 0, 0
+    max_col = max(len(cells) for cells in rows)
+    padded = [cells + [""] * (max_col - len(cells)) for cells in rows]
+    return padded, len(padded), max_col
+
+
+def _write_tsv(path: Path, rows: list[list[str]]) -> None:
+    """Write *rows* to *path* as UTF-8, tab-delimited, LF-terminated text.
+
+    Parent directories are created as needed. Fields containing a tab,
+    newline or double quote are quoted per ``csv.QUOTE_MINIMAL``, so readers
+    should parse the output with a CSV reader rather than splitting on tabs.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8", newline="") as f:
+        writer = csv.writer(
+            f,
+            delimiter="\t",
+            lineterminator="\n",
+            quoting=csv.QUOTE_MINIMAL,
+        )
+        writer.writerows(rows)
+
+
+def _sha256_file(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
+    digest = hashlib.sha256()
+    with path.open("rb") as f:
+        for chunk in iter(lambda: f.read(65536), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _split_workbook(xlsx_path: Path, sheets_dir: Path) -> list[dict[str, Any]]:
+    """Write one TSV per known worksheet and return their manifest entries.
+
+    Worksheets whose title is not a key of ``SHEET_SLUGS`` are skipped with a
+    warning. Each returned entry records the sheet name, slug, path relative
+    to the data directory, and row/column counts.
+    """
+    sheet_entries: list[dict[str, Any]] = []
+    workbook = load_workbook(xlsx_path, read_only=True, data_only=True)
+    try:
+        for worksheet in workbook.worksheets:
+            title = worksheet.title
+            slug = SHEET_SLUGS.get(title)
+            if slug is None:
+                logger.warning("Skipping unknown worksheet: %r", title)
+                continue
+            rows, row_count, col_count = _worksheet_rows(worksheet)
+            out_path = sheets_dir / f"{slug}.tsv"
+            _write_tsv(out_path, rows)
+            rel_path = f"sheets/{slug}.tsv"
+            sheet_entries.append(
+                {
+                    "name": title,
+                    "slug": slug,
+                    "path": rel_path,
+                    "rows": row_count,
+                    "columns": col_count,
+                }
+            )
+            logger.info("Wrote %s (%d rows, %d columns)", out_path, row_count, col_count)
+    finally:
+        workbook.close()
+    return sheet_entries
+
+
+def download_miqra(
+    sourcetexts_root: Path | None = None,
+    *,
+    dry_run: bool = False,
+) -> None:
+    """Download the spreadsheet and write per-tab TSV files plus manifest.json.
+
+    Args:
+        sourcetexts_root: Root of the sourcetexts tree; ``None`` selects the
+            default used by ``miqra_al_pi_hamasorah_data_directory``.
+        dry_run: When true, only log what would be done; nothing is
+            downloaded or written.
+
+    Raises:
+        requests.HTTPError: If the export request returns an error status.
+    """
+    data_dir = miqra_al_pi_hamasorah_data_directory(sourcetexts_root)
+    sheets_dir = miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root)
+    manifest_path = data_dir / "manifest.json"
+
+    if dry_run:
+        logger.info("Would download %s", EXPORT_XLSX_URL)
+        logger.info("Would write TSV files under %s", sheets_dir)
+        logger.info("Would write manifest to %s", manifest_path)
+        return
+
+    data_dir.mkdir(parents=True, exist_ok=True)
+    sheets_dir.mkdir(parents=True, exist_ok=True)
+
+    headers = {"User-Agent": USER_AGENT}
+    logger.info("Downloading %s ...", EXPORT_XLSX_URL)
+    response = requests.get(EXPORT_XLSX_URL, headers=headers, timeout=300)
+    response.raise_for_status()
+
+    tmp_path: Path | None = None
+    try:
+        with tempfile.NamedTemporaryFile(
+            suffix=".xlsx",
+            delete=False,
+            dir=data_dir,
+        ) as tmp:
+            # Record the path before writing so the ``finally`` clause can
+            # still remove the file if the write itself fails.
+            tmp_path = Path(tmp.name)
+            tmp.write(response.content)
+
+        logger.info("Splitting workbook into TSV files ...")
+        sheet_entries = _split_workbook(tmp_path, sheets_dir)
+
+        for entry in sheet_entries:
+            tsv_path = data_dir / entry["path"]
+            entry["sha256"] = _sha256_file(tsv_path)
+
+        manifest = {
+            "spreadsheet_id": SPREADSHEET_ID,
+            "source_url": SOURCE_URL,
+            "export_url": EXPORT_XLSX_URL,
+            "downloaded_at": datetime.now(timezone.utc).isoformat(),
+            "sheets": sheet_entries,
+        }
+        manifest_path.write_text(
+            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+        logger.info("Wrote manifest to %s", manifest_path)
+    finally:
+        if tmp_path is not None and tmp_path.exists():
+            tmp_path.unlink()
+            logger.info("Removed temporary workbook %s", tmp_path)
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for the download script."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Download Miqra al pi ha-Masorah from Google Sheets into per-tab TSV "
+            "files under SOURCETEXTS_ROOT/miqra_al_pi_hamasorah."
+        )
+    )
+    parser.add_argument(
+        "--sourcetexts-root",
+        type=Path,
+        default=default_sourcetexts_root(),
+        help=(
+            "Root of the sourcetexts tree; output is written under "
+            "SOURCETEXTS_ROOT/miqra_al_pi_hamasorah (default: %(default)s)."
+        ),
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Log actions without downloading or writing files.",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the downloader from command-line arguments and return exit code 0."""
+    args = _build_arg_parser().parse_args(argv)
+    download_miqra(args.sourcetexts_root, dry_run=args.dry_run)
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except Exception as e:
+        logger.error("Error downloading Miqra al pi ha-Masorah: %s", e)
+        raise
diff --git a/opensiddur/importer/util/pages.py b/opensiddur/importer/util/pages.py
index 117d25a..6f33d4e 100644
--- a/opensiddur/importer/util/pages.py
+++ b/opensiddur/importer/util/pages.py
@@ -29,6 +29,21 @@ def jps1917_credits_directory(sourcetexts_root: Path | None = None) -> Path:
     return jps1917_data_directory(sourcetexts_root) / "credits"
 
 
+def miqra_al_pi_hamasorah_data_directory(sourcetexts_root: Path | None = None) -> Path:
+    """Directory for Miqra al pi ha-Masorah raw dumps under the sourcetexts root."""
+    root = (
+        sourcetexts_root.resolve()
+        if sourcetexts_root is not None
+        else default_sourcetexts_root()
+    )
+    return root / "miqra_al_pi_hamasorah"
+
+
+def miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root: Path | None = None) -> Path:
+    """Per-tab TSV files from the Google Sheet export."""
+    return miqra_al_pi_hamasorah_data_directory(sourcetexts_root) / "sheets"
+
+
 def get_page(page_number: str
| int, sourcetexts_root: Path | None = None) -> Optional[Page]: """Return the wikitext of the given Page, or None if it does not exist.""" page_num = int(page_number) diff --git a/opensiddur/tests/fixtures/miqra_minimal.xlsx b/opensiddur/tests/fixtures/miqra_minimal.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fbb6dd8bb4d6aebf56d3a13c391993062cb20d9d GIT binary patch literal 6007 zcmZ`-1yqxL-ySi#5tLC%Bcr=PV3g7&ARsVcbPEU)(kUGhBBcV-t&@;Z!hq2sNGl*I z4Zg$YdtV>Z045b~OB_~0FE@HVc<`bE|-u&qNrN?jiXM zvj%>^ZN9GchY22o$jmMD(YY`I0P??^VCCWty|ke}sm;EFk0{(6dX&qdiXyuEtq-BW zO=K`J(LZ7DK+TRAQ#m>CcqamNB&{qcn#+?3HePN7Ny(hH<+DHi;#~cJvC*B@o5`K- z<||5Ln%#^wt35s+E>4+q@@zfE-AFAeB`j)HdsPY>ixp}n%5no!-xQ%ELbFofmP|mp zg)xS=T~!GHYKBR&}1apG{*Ek14ThPqr!s zeQXb$2oWN|^|h29Fz@Nw%+O5@#xQcDI7eUW#L&PG$|7WaX~dgJ_nMzeVS~S(4k#(F zu8H~Xedn=U3y)0M^|6@bogQmx;Qc&rD8NE4&qq^IvE52gbl0rT7zXGN3+SPw=$HwK z!Bh9S);DH4Zvx_F9{W~1J!=pObIiY?Y;6%>2DXzM)YCS0F~3>6%qVz7)u$H+Ex@2O zlxJ&-tld3!>^z(LECWS;m7o_0cIKlsx*0g&QnI9B9Qj4B{bQ@Wt*n7V*U+#^P)JWe z3b0xApa4aMCqOJIkz+LA?SJrbGcUGx%2UV*G{3mxBUaxx9OFudJ+Ix8+n%%U_@Pi9 zzqSB2XgzT-^)|8iO`NVK>M&q#{kdH>L7a&u%Hh#0Dn1HPKIlUX-_eZdcpHT%9rRp> zv9X`ObtwEwZFdK1i% zDCn0HATi*O5(q2BDA{azj!ZuDSX;^~Ngls9e?D( zL1GcB0CMY?U|mDXmeC-s@&OXLlpE{L+%kJ$6mX!a?94p=y0wOBJO#p{l-JGOyhJWm zr%Eh9DT|jAo?)*=hgpcvpOznfREsDhpQj9Hfa7(~#YG;Lx@1Wk<{PO_g&vVXKnvL( zD;`6Rr?^Fsp#%MQGX?2JR2b1Kp|o@>^j+O`(bXLH{8Bz#oR5gdQ0JaS!aiIlBG z_N#ypZb8<4{P&>-_IYn2gk)YZ2oL~wniyo3{d@>S(-7~p-iAEJ_&GEs+Fp7T(kdRv z6M!t*YTB31A8rt34zO)Q#h84CIGcPteak;JF2uuOW$51D-KAHN%sC;R*Sc|TCzHFg zc{0foT9I|1o0i=Bu^+gn|kU?&{y_Q{Z)MVru(UpIkK<`iLZLrIP6DAHAR#K0SEg zy~JF^r<<b_19@@3ygQGGT!WgBLd7cOda>HeJsMs3Dwpa*WDMlwvz21#g3nVgPlV41guA|_bNoS zNej9wOz5%hy`5qs44Yoi>u&+U*(}sM{G4;R*?&%(jY!vCmxk;WWX?0xM0h;+Yiw64 z`)M@Ba4aw)dCZzES^Ib=_~!GovFXBt*?A)W zky7I#Yc6tGbzFbOBpj<)lYJrcglm@N#Z~2qDSjf~Afr8^d>gU32&F8L;Al*lcpQO? 
z{HWZr?DPn+-2t%>T_VtoKeJw>U`c~OYUW~#xU&6Y{Flv`4;X(AkTWueI1~l|0LKOZ zZu~Vs9=W(Xc-TRqo*ulv9>1nYr{1)4HXqqJ)smWTY%q8uEM~+}r;aq$AZkhmVs@i7 zx^3jZ#)S3waB77~M}DM1rawD#CtDoL+ka(2+W^petX~6Rd!_Sc_93{2z@CZMmoRSN zn(2rlj6)4I+j#q5Ytzi^0)DIEq?#aK&Ui|&&Sa9Dj z#I2R~r3#4fiFfEwCL+^~sNrov+$8cng@lH1q8hJ^PcW_Gm}A+_S@)24-6YB2OBrA2 zkoRo9e(0%nY%29bzlE~ZcV-Kow$X<%SOQeS*&;NO%dXn?a=CYjKH$SQBkRuGa4#CM ze#&ml(+`9(pB6Ebk_wKBs*fU(1__LtY|Rp>rJMSq&DY_L&ARg&cEUa~(`;HS-WM}2J@W

mHITQ`H`>aGmj^m=)vZafPViBg zFYonD!@ z|1eRU3g?=$yPgp&5p7))0kpjkSOJoGcO-8l;%#cx??Btw??_dU!?0oMs1G`4n18w* zckPu&Fxu_=|L%6dOScE;OrzaSbe?)!MJlPjthK(LVvz?UDR#bJmKdIW98lT`MA*{h(Aw3uml`G2@8}8V(c`Y}wM9Y&xofky}!3)P}j;lw*=`)T*OK z9G6^-57wiROkk_Y*gXxsj-NBVhm~aw`$QYuB4Bk39}@_qc)GoU#Usbbv@7trA=}YT zHSF{v{=7MF7Fm6bgeTT7m353p(n=mb)riYqV3yl)RTU>V9aAaS7zIpQ`Gg9g`?k4f zt#Eq(#Sgcyv=5*7kg zOeqaZ=hMO?MpasJRnK=NP@(VI6bzc|A7CdjTV~W3vJZEkbHEz}nl2n44eO6!HhOKWC!>F0P$aK z)5Fu(5$bUnt678T=;_N|`GxdVa)z**>aLc8MLsuS3u%D9r8xQ8Lho+U{<BJo<}zylDm3-XBOyPPH95)pzgC7T4L-gKO-93QP$PdPhY{o04$ZJMO8)Me&zeo8tp#sM-Ry2_hBUTF6u=s^U;dDD-StG2G(3oZacJ;#P~}N= zcm$nutTv!QIe$bjeoguD>gjOx#CjLb^ry;^=Xm&WmY$SU`#(z>)=u+gt2N-pKcPYP zcLxJW#$wpnjeYS?iry#4hLJ?gDCLzArCU7@=*rQpA@sgcxH}v>bzdZoZ_6I*FqhHd z_10O(F{IXC>jH||FbNK%jHB}u&hcxnM(Ea@QQ0e0$)h9x|(p-BQnaou!&h+#!kbK4&3`C>4AE>v=Up zV|^vKgz*7=?9SIaH88JkdqcO6slNsX;J!x&B;)>R_gF`6@9fsP@?DD zGj08^FkQE2^7n$L=wtKZpGu~#Tmx#P_Qv@=g%!Txz<$L4z5uG~S@Ka*n(-SqXY?C2 z=Ex#J4g!L~;W(fLP+eww>O z9X)t?F7G)>P0r{pAs5&=n9GMdcq>v6x3FoST!u$T87+Q>~xL+OeCR9zpz!6>F@UW$lGu*e0BH1;T&y@a7mi` z+|udZRL$)ncOQ_<*AcX6kC4_WGDA8L4mKU$@(dIS)<)nkVEe%NL7%|H%MRAt`&(77 z;8c-v1@du^RjkOPT9<2Em}4Fbn7vv3n50v+HWCLkO6XxcZVL5JR5#=_<5^9Tw;W2C z;8R%l_bS*;|I`SCn;Oz;ioTMXsM>dRfa$)tpDH4yGyy8;%@u_-&&Z9SRUSB%r@S@>lx(Jutsgu`^kmc| z977*86-j8s2ezzQ&g5`CCF5w-i580NJ+V4IH;gS{_fDb})|$0*N5E-0sD*3bSXyRQ zygmA$h~r|t;&drx-QYA|F0RnlJ!gvpqC~~V#mVg;K*=j#gfcyO7B{VB-H z*^DqR`sQUX5dIOPyaix55quplt_9LB+xdCzGeg5HTQHabCBZ{}gOVeT7$D9C8JZNz^l9gN;_`NdR6tj5 zz_f%ckzU4* zg#`eDFgpH@{_lqHD*Wow>^J-i`tSc?g?5$T>PyrA2!fgLei8iRE7Vn%tBU(K3m3sZ zvi#EDS6QwK$lok4(Zd04%+<5}l9N|iuIicpv0&Us%NsPyZ*_AOdUYKAhK{0}_TPrn zRp8b3{u>BK@2dar^d|Do8m)Pd-I>*eGn O1oWaOaQbCZ1O5YC8*J48 literal 0 HcmV?d00001 diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py 
b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py
new file mode 100644
index 0000000..6e4c498
--- /dev/null
+++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py
@@ -0,0 +1,111 @@
+import json
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from opensiddur.importer.miqra_al_pi_hamasorah import download
+from opensiddur.importer.util.pages import (
+    miqra_al_pi_hamasorah_data_directory,
+    miqra_al_pi_hamasorah_sheets_directory,
+)
+
+FIXTURE_XLSX = (
+    Path(__file__).resolve().parents[2] / "fixtures" / "miqra_minimal.xlsx"
+)
+
+
+class TestDownloadMiqra(unittest.TestCase):
+    """Run the downloader against a small fixture workbook with HTTP mocked."""
+
+    def setUp(self) -> None:
+        self.tmp = tempfile.TemporaryDirectory()
+        self.sourcetexts_root = Path(self.tmp.name)
+
+    def tearDown(self) -> None:
+        self.tmp.cleanup()
+
+    def _mock_response(self) -> MagicMock:
+        """Return a stand-in HTTP response whose body is the fixture workbook."""
+        fake = MagicMock()
+        fake.raise_for_status = MagicMock()
+        fake.content = FIXTURE_XLSX.read_bytes()
+        return fake
+
+    @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+    def test_download_writes_tsv_and_manifest(self, mock_get: MagicMock) -> None:
+        """A full run writes TSVs and a manifest and leaves no XLSX behind."""
+        mock_get.return_value = self._mock_response()
+
+        download.download_miqra(self.sourcetexts_root)
+
+        data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root)
+        sheets_dir = miqra_al_pi_hamasorah_sheets_directory(self.sourcetexts_root)
+
+        torah_tsv = sheets_dir / "torah.tsv"
+        for expected in (torah_tsv, sheets_dir / "readme.tsv"):
+            self.assertTrue(expected.is_file())
+        self.assertFalse((sheets_dir / "unknowntab.tsv").exists())
+
+        torah_lines = torah_tsv.read_text(encoding="utf-8").splitlines()
+        self.assertEqual(len(torah_lines), 2)
+        self.assertIn("בְּרֵאשִׁית", torah_lines[1])
+
+        manifest_path = data_dir / "manifest.json"
+        self.assertTrue(manifest_path.is_file())
+        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+        self.assertEqual(manifest["spreadsheet_id"], download.SPREADSHEET_ID)
+        sheets = manifest["sheets"]
+        slugs = {sheet["slug"] for sheet in sheets}
+        for wanted in ("torah", "readme"):
+            self.assertIn(wanted, slugs)
+        for sheet in sheets:
+            self.assertIn("sha256", sheet)
+            self.assertEqual(len(sheet["sha256"]), 64)
+
+        self.assertEqual(list(data_dir.glob("*.xlsx")), [])
+
+        mock_get.assert_called_once()
+        get_call = mock_get.call_args
+        self.assertEqual(get_call.args[0], download.EXPORT_XLSX_URL)
+        self.assertIn("User-Agent", get_call.kwargs["headers"])
+
+    @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+    def test_dry_run_writes_nothing(self, mock_get: MagicMock) -> None:
+        """A dry run neither creates the data directory nor issues a request."""
+        download.download_miqra(self.sourcetexts_root, dry_run=True)
+
+        data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root)
+        self.assertFalse(data_dir.exists())
+        mock_get.assert_not_called()
+
+    @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.logger")
+    @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+    def test_unknown_sheet_logs_warning(
+        self, mock_get: MagicMock, mock_logger: MagicMock
+    ) -> None:
+        """Exactly one logged warning mentions 'UnknownTab' for the fixture."""
+        mock_get.return_value = self._mock_response()
+        download.download_miqra(self.sourcetexts_root)
+
+        unknown_tab_warnings = [
+            logged
+            for logged in mock_logger.warning.call_args_list
+            if "UnknownTab" in str(logged)
+        ]
+        self.assertEqual(len(unknown_tab_warnings), 1)
+
+    def test_main_dry_run_exit_code(self) -> None:
+        """main() returns 0 and forwards --dry-run to download_miqra."""
+        cli_args = ["--dry-run", "--sourcetexts-root", str(self.sourcetexts_root)]
+        with patch(
+            "opensiddur.importer.miqra_al_pi_hamasorah.download.download_miqra"
+        ) as mock_download:
+            code = download.main(cli_args)
+        self.assertEqual(code, 0)
+        mock_download.assert_called_once()
+        self.assertTrue(mock_download.call_args.kwargs["dry_run"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pyproject.toml b/pyproject.toml
index 07982ea..8fdfecb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
"pyppeteer>=2.0.0", "diff-match-patch>=20241021", "pydantic>=2.11.7", + "openpyxl>=3.1.5", ] [project.urls] diff --git a/uv.lock b/uv.lock index 8f7acaa..60f0375 100644 --- a/uv.lock +++ b/uv.lock @@ -605,6 +605,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/5e/4b5aaaabddfacfe36ba7768817bd1f71a7a810a43705e531f3ae4c690767/emoji-2.15.0-py3-none-any.whl", hash = "sha256:205296793d66a89d88af4688fa57fd6496732eb48917a87175a023c8138995eb", size = 608433, upload-time = "2025-09-21T12:13:01.197Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "executing" version = "2.2.1" @@ -1840,6 +1849,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/32/37734d769bc8b42e4938785313cc05aade6cb0fa72479d3220a0d61a4e78/openai-2.33.0-py3-none-any.whl", hash = "sha256:03ac37d70e8c9e3a8124214e3afa785e2cbc12e627fbd98177a086ef2fd87ad5", size = 1162695, upload-time = "2026-04-28T14:04:40.482Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, 
upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "opensiddur-ai" version = "0.1.0" @@ -1856,6 +1877,7 @@ dependencies = [ { name = "markdown" }, { name = "mwparserfromhell" }, { name = "openai" }, + { name = "openpyxl" }, { name = "pydantic" }, { name = "pyppeteer" }, { name = "requests" }, @@ -1885,6 +1907,7 @@ requires-dist = [ { name = "markdown", specifier = ">=3.9" }, { name = "mwparserfromhell", specifier = ">=0.7.2" }, { name = "openai", specifier = ">=1.101.0" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "pyppeteer", specifier = ">=2.0.0" }, { name = "requests", specifier = ">=2.32.4" }, From 623dc293134361702435af5e7ff37268e006debc Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 27 May 2026 21:54:52 -0700 Subject: [PATCH 02/10] wip: miqra al pi hamasorah converter --- .../importer/jps1917/mediawiki_processor.py | 832 +----------------- .../importer/miqra_al_pi_hamasorah/README.md | 2 +- .../miqra_al_pi_hamasorah/convert_tsv.py | 601 +++++++++++++ .../miqra_al_pi_hamasorah/miqra_to_tei.xslt | 243 +++++ .../miqra_al_pi_hamasorah/miqra_wikitext.py | 633 +++++++++++++ .../importer/util/mediawiki_processor.py | 515 +++++++++++ .../miqra_al_pi_hamasorah/test_convert_tsv.py | 134 +++ .../test_miqra_wikitext.py | 106 +++ 8 files changed, 2242 insertions(+), 824 deletions(-) create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/miqra_to_tei.xslt create mode 100644 opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py create mode 100644 opensiddur/importer/util/mediawiki_processor.py 
create mode 100644 opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py create mode 100644 opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py diff --git a/opensiddur/importer/jps1917/mediawiki_processor.py b/opensiddur/importer/jps1917/mediawiki_processor.py index 3d62bac..9894597 100644 --- a/opensiddur/importer/jps1917/mediawiki_processor.py +++ b/opensiddur/importer/jps1917/mediawiki_processor.py @@ -1,828 +1,14 @@ """ -MediaWiki to XML Processor for JPS1917 Converter +JPS1917 MediaWiki processor (compatibility wrapper). -This module provides a modular framework for converting MediaWiki syntax to XML. -Based on analysis of 1917 JPS Wikisource content, it handles templates and tags -found in the source material. - -Analysis Results Summary: -- Templates: 30+ types, 50,000+ instances (verse, sc, c, larger, etc.) -- Tags: 11 types, 25,000+ instances (noinclude, dd, ref, table, etc.) +The implementation lives in `opensiddur.importer.util.mediawiki_processor` so it +can be reused by other importers. """ -import re -import mwparserfromhell -from typing import Dict, List, Any -from dataclasses import dataclass -from enum import Enum - - -class ProcessingStage(Enum): - """Stages of MediaWiki processing""" - PREPROCESS = "preprocess" - TEMPLATES = "templates" - TAGS = "tags" - POSTPROCESS = "postprocess" - - -@dataclass -class ConversionResult: - """Result of a conversion operation""" - xml_content: str - metadata: Dict[str, Any] - warnings: List[str] - errors: List[str] - wikilinks: List[Dict[str, Any]] - - -class MediaWikiProcessor: - """ - Modular MediaWiki to XML processor for JPS1917 content. - - This processor handles the conversion of MediaWiki syntax to XML, - with separate modules for different types of templates and tags. 
- """ - - def __init__(self): - self.template_handlers = {} - self.tag_handlers = {} - self.preprocessors = [] - self.postprocessors = [] - self.wikilinks = [] # Store captured wikilinks - self._initialize_handlers() - - def _initialize_handlers(self): - """Initialize all template and tag handlers""" - self._initialize_template_handlers() - self._initialize_tag_handlers() - self._initialize_preprocessors() - self._initialize_postprocessors() - self._initialize_wikilink_handlers() - - def _initialize_template_handlers(self): - """Initialize handlers for MediaWiki templates""" - - # Text Formatting Templates - self.template_handlers['sc'] = self._handle_small_caps - self.template_handlers['larger'] = self._handle_larger_text - self.template_handlers['x-larger'] = self._handle_x_larger_text - self.template_handlers['xx-larger'] = self._handle_xx_larger_text - self.template_handlers['xxx-larger'] = self._handle_xxx_larger_text - self.template_handlers['smaller'] = self._handle_smaller_text - - # Layout Templates - self.template_handlers['c'] = self._handle_center - self.template_handlers['right'] = self._handle_right_align - self.template_handlers['rule'] = self._handle_horizontal_rule - self.template_handlers['nop'] = self._handle_no_paragraph - - # Biblical Content Templates - self.template_handlers['verse'] = self._handle_verse - self.template_handlers['rh'] = self._handle_right_header - self.template_handlers['dropinitial'] = self._handle_drop_initial - self.template_handlers['dhr'] = self._handle_double_horizontal_rule - - # Navigation Templates - self.template_handlers['anchor'] = self._handle_anchor - self.template_handlers['anchor+'] = self._handle_anchor_plus - - # Language Templates - self.template_handlers['lang'] = self._handle_language - - # Reference Templates - self.template_handlers['smallrefs'] = self._handle_small_refs - - # Special Templates - self.template_handlers['hws'] = self._handle_hws - self.template_handlers['hwe'] = self._handle_hwe - 
self.template_handlers['***'] = self._handle_asterisks - self.template_handlers['reconstruct'] = self._handle_reconstruct - self.template_handlers['SIC'] = self._handle_sic - self.template_handlers['sic'] = self._handle_sic - self.template_handlers['sup'] = self._handle_superscript - self.template_handlers['bar'] = self._handle_bar - self.template_handlers['gap'] = self._handle_gap - self.template_handlers['overfloat left'] = self._handle_overfloat_left - self.template_handlers['float right'] = self._handle_float_right - self.template_handlers['smaller block/s'] = self._handle_smaller_block_start - self.template_handlers['smaller block/e'] = self._handle_smaller_block_end - - def _initialize_tag_handlers(self): - """Initialize handlers for HTML/XML tags""" - - # Structural Tags - self.tag_handlers['section'] = self._handle_section - self.tag_handlers['table'] = self._handle_table - self.tag_handlers['tr'] = self._handle_table_row - self.tag_handlers['td'] = self._handle_table_cell - - # Text Formatting Tags - self.tag_handlers['i'] = self._handle_italic - self.tag_handlers['br'] = self._handle_line_break - self.tag_handlers['span'] = self._handle_span - - # Content Tags - self.tag_handlers['dd'] = self._handle_definition_description - self.tag_handlers['ref'] = self._handle_reference - - # MediaWiki Specific Tags - self.tag_handlers['noinclude'] = self._handle_noinclude - self.tag_handlers['pagequality'] = self._handle_pagequality - - def _initialize_preprocessors(self): - """Initialize preprocessing functions""" - self.preprocessors = [ - self._fix_noinclude_line_breaks, - self._convert_paragraph_breaks, - self._normalize_whitespace, - self._handle_special_characters, # Enable special character processing - self._extract_metadata - ] - - def _initialize_postprocessors(self): - """Initialize postprocessing functions""" - self.postprocessors = [ - self._validate_xml_structure, - self._finalize_metadata - ] - - def _initialize_wikilink_handlers(self): - """Initialize 
wikilink processing""" - # Wikilinks are processed during the main parsing loop - pass - - def _process_nested_content(self, content: str, depth: int = 0) -> str: - """Recursively process nested templates and other elements""" - # Prevent infinite recursion - if depth > 10: - return content - - # Parse the content to handle nested elements - parsed = mwparserfromhell.parse(content) - nodes_to_replace = [] - - # Process nodes recursively - for node in parsed.nodes: - if hasattr(node, 'name'): # Template - template_name = str(node.name).strip() - if template_name in self.template_handlers: - try: - # Process nested content within the template - processed_node = self._process_template_with_nesting(node, depth + 1) - replacement = self.template_handlers[template_name](processed_node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - # If nested processing fails, try without nesting - replacement = self.template_handlers[template_name](node) - nodes_to_replace.append((node, replacement)) - else: - # Unknown template - process its content for nested elements - processed_content = self._process_nested_content(str(node), depth + 1) - nodes_to_replace.append((node, processed_content)) - - elif hasattr(node, 'tag'): # Tag - tag_name = str(node.tag).strip().lower() - if tag_name in self.tag_handlers: - try: - # Process nested content within the tag - processed_node = self._process_tag_with_nesting(node, depth + 1) - replacement = self.tag_handlers[tag_name](processed_node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - # If nested processing fails, try without nesting - replacement = self.tag_handlers[tag_name](node) - nodes_to_replace.append((node, replacement)) - else: - # Unknown tag - process its content for nested elements - processed_content = self._process_nested_content(str(node), depth + 1) - nodes_to_replace.append((node, processed_content)) - - elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): 
# Wikilink - try: - replacement = self._handle_wikilink(node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - # If wikilink processing fails, keep original - nodes_to_replace.append((node, str(node))) - - # Replace all nodes - for node, replacement in nodes_to_replace: - parsed.replace(node, replacement) - - return str(parsed) - - def _process_template_with_nesting(self, template, depth: int = 0) -> object: - """Process a template and its nested content""" - # Create a copy of the template to avoid modifying the original - import copy - processed_template = copy.deepcopy(template) - - # Process each parameter of the template - for param in processed_template.params: - if hasattr(param, 'value'): - # Process nested content in parameter values - processed_value = self._process_nested_content(str(param.value), depth + 1) - param.value = processed_value - - return processed_template - - def _process_tag_with_nesting(self, tag, depth: int = 0) -> object: - """Process a tag and its nested content""" - # Create a copy of the tag to avoid modifying the original - import copy - processed_tag = copy.deepcopy(tag) - - # Process nested content within the tag - if hasattr(processed_tag, 'contents') and processed_tag.contents: - processed_contents = self._process_nested_content(str(processed_tag.contents), depth + 1) - processed_tag.contents = processed_contents - - return processed_tag - - # ============================================================================ - # TEMPLATE HANDLERS - # ============================================================================ - - def _handle_small_caps(self, template) -> str: - """Convert {{sc|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_larger_text(self, template) -> str: - """Convert {{larger|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_x_larger_text(self, template) -> str: - """Convert {{x-larger|text}} to 
text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_xx_larger_text(self, template) -> str: - """Convert {{xx-larger|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_xxx_larger_text(self, template) -> str: - """Convert {{xxx-larger|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_smaller_text(self, template) -> str: - """Convert {{smaller|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_center(self, template) -> str: - """Convert {{c|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_right_align(self, template) -> str: - """Convert {{right|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_horizontal_rule(self, template) -> str: - """Convert {{rule}} to """ - return '' - - def _handle_no_paragraph(self, template) -> str: - """Convert {{nop}} to """ - return '' - - def _handle_verse(self, template) -> str: - """Convert {{verse|chapter|verse|text}} to text""" - chapter = str(template.get('chapter', template.get(1, ''))).replace("chapter=", "") - verse = str(template.get('verse', template.get(2, ''))).replace("verse=", "") - text = str(template.get(3, template.get('text', ''))) - chapter_attr = f' chapter="{chapter}"' if chapter else '' - verse_attr = f' verse="{verse}"' if verse else '' - if not chapter or not verse: - print(f"Invalid verse template: {template} {template.get(1, '')=} {template.get(2, '')=} {template.get(3, '')=}") - - return f'{text}' - - def _handle_right_header(self, template) -> str: - """Convert {{rh|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_drop_initial(self, template) -> str: - """Convert {{dropinitial|letter}} to letter""" - letter = str(template.get(1, '')) - return f'{letter}' - - def _handle_double_horizontal_rule(self, template) -> str: - 
"""Convert {{dhr}} to """ - value = str(template.get(1, '')) - if value: - value=f' value="{value}"' - else: - value="" - return f'' - - def _handle_anchor(self, template) -> str: - """Convert {{anchor|name}} to """ - name = str(template.get(1, '')) - return f'' - - def _handle_anchor_plus(self, template) -> str: - """Convert {{anchor+|name|text}} to text""" - name = str(template.get(1, '')) - text = str(template.get(2, '')) - return f'{text}' - - def _handle_language(self, template) -> str: - """Convert {{lang|code|text}} to text""" - code = str(template.get(1, '')) - text = str(template.get(2, '')) - return f'{text}' - - def _handle_small_refs(self, template) -> str: - """Convert {{smallrefs}} to """ - return '' - - def _handle_hws(self, template) -> str: - """Convert {{hws|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_hwe(self, template) -> str: - """Convert {{hwe|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_asterisks(self, template) -> str: - """Convert {{***}} to ***""" - n = str(template.get(1, '3')) - return f'***' - - def _handle_reconstruct(self, template) -> str: - """Convert {{reconstruct|content|text}} to text""" - content = str(template.get(1, '')) - text = str(template.get(2, '')) - return f'{content}{text}' - - def _handle_sic(self, template) -> str: - """Convert {{SIC|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_superscript(self, template) -> str: - """Convert {{sup|text}} to text""" - content = str(template.get(1, '')) - return f'{content}' - - def _handle_bar(self, template) -> str: - """Convert {{bar|length}} to """ - length = str(template.get(1, '6')) - return f'' - - def _handle_gap(self, template) -> str: - """Convert {{gap|length}} to """ - length = str(template.get(1, '')) - if length: - return f'' - else: - return '' - - def _handle_overfloat_left(self, template) -> str: - """Convert {{overfloat 
left|align|padding|text}} to text""" - # Get parameters - can be positional or named - align = str(template.get('align', template.get(1, ''))) - padding = str(template.get('padding', template.get(2, ''))) - text = str(template.get('text', template.get(3, ''))) - - # Clean up named parameters (remove parameter name prefixes) - align = align.replace('align=', '') if align.startswith('align=') else align - padding = padding.replace('padding=', '') if padding.startswith('padding=') else padding - text = text.replace('text=', '') if text.startswith('text=') else text - - # Build attributes - attributes = [] - if align: - attributes.append(f'align="{align}"') - if padding: - attributes.append(f'padding="{padding}"') - - attr_str = ' ' + ' '.join(attributes) if attributes else '' - - return f'{text}' - - def _handle_float_right(self, template) -> str: - """Convert {{float right|text}} to text""" - text = str(template.get(1, '')) - return f'{text}' - - def _handle_smaller_block_start(self, template) -> str: - """Convert {{smaller block/s}} to """ - return '' - - def _handle_smaller_block_end(self, template) -> str: - """Convert {{smaller block/e}} to """ - return '' - - # ============================================================================ - # WIKILINK HANDLERS - # ============================================================================ - - def _handle_wikilink(self, wikilink) -> str: - """Process and capture wikilinks""" - # Extract wikilink information - title = str(wikilink.title) if hasattr(wikilink, 'title') and wikilink.title else '' - text = str(wikilink.text) if hasattr(wikilink, 'text') and wikilink.text else title - - # Process templates within the wikilink text - processed_text = self._process_nested_content(text) - - # Store wikilink information - wikilink_info = { - 'title': title, - 'text': processed_text, - 'namespace': str(wikilink.namespace) if hasattr(wikilink, 'namespace') and wikilink.namespace else None, - 'section': str(wikilink.section) 
if hasattr(wikilink, 'section') and wikilink.section else None, - 'fragment': str(wikilink.fragment) if hasattr(wikilink, 'fragment') and wikilink.fragment else None - } - self.wikilinks.append(wikilink_info) - - # Convert to XML - use __link__ tag with attributes - attributes = [] - if title: - attributes.append(f'title="{title}"') - if wikilink_info['namespace']: - attributes.append(f'namespace="{wikilink_info["namespace"]}"') - if wikilink_info['section']: - attributes.append(f'section="{wikilink_info["section"]}"') - if wikilink_info['fragment']: - attributes.append(f'fragment="{wikilink_info["fragment"]}"') - - attr_str = ' ' + ' '.join(attributes) if attributes else '' - return f'<__link__{attr_str}>{processed_text}' - - # ============================================================================ - # TAG HANDLERS - # ============================================================================ - - def _handle_section(self, tag) -> str: - """Convert

to
with begin and end attributes""" - content = str(tag.contents) if tag.contents else '' - - # Extract begin and end attributes - attributes = [] - if hasattr(tag, 'attributes') and tag.attributes: - for attr in tag.attributes: - if hasattr(attr, 'name') and hasattr(attr, 'value'): - attr_name = str(attr.name) - attr_value = str(attr.value) - if attr_name in ['begin', 'end']: - attributes.append(f'{attr_name}="{attr_value}"') - - # Add begin and end attributes if they exist - attr_str = ' ' + ' '.join(attributes) if attributes else '' - - return f'{content}
' - - def _handle_table(self, tag) -> str: - """Convert to
""" - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}
' - - def _handle_table_row(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_table_cell(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_italic(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_line_break(self, tag) -> str: - """Convert
to
""" - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'' - - def _handle_span(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_definition_description(self, tag) -> str: - """Convert
to
""" - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}
' - - def _handle_reference(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_noinclude(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _handle_pagequality(self, tag) -> str: - """Convert to """ - content = str(tag.contents) if tag.contents else '' - attributes = self._extract_tag_attributes(tag) - attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' - return f'{content}' - - def _extract_tag_attributes(self, tag) -> Dict[str, str]: - """Extract all attributes from a tag""" - attributes = {} - if hasattr(tag, 'attributes') and tag.attributes: - for attr in tag.attributes: - if hasattr(attr, 'name') and hasattr(attr, 'value'): - attributes[str(attr.name)] = str(attr.value) - return attributes - - # ============================================================================ - # PREPROCESSORS - # ============================================================================ - - def _fix_noinclude_line_breaks(self, content: str) -> str: - """Insert a blank line after tags when followed by non-whitespace content""" - # Pattern to match followed by optional whitespace and any non-whitespace character - # This handles cases like: :text, text, {{template}}, etc. 
- pattern = r'()\s*(\S)' - - def replace_noinclude_content(match): - noinclude_tag = match.group(1) - following_content = match.group(2) - # Insert a newline after and before the following content - return f'{noinclude_tag}\n{following_content}' - - # Apply the replacement - content = re.sub(pattern, replace_noinclude_content, content) - - return content - - def _normalize_whitespace(self, content: str) -> str: - """Normalize whitespace in content""" - # Normalize multiple spaces to single space - content = re.sub(r' +', ' ', content) - # Normalize line breaks, but preserve paragraph markers - content = re.sub(r'\n+', '\n', content) - return content.strip() - - def _convert_paragraph_breaks(self, content: str) -> str: - """Convert double newlines to paragraph indicators, but skip if {{nop}} is directly adjacent""" - - # First, protect {{nop}} markers and their immediate context - # Replace {{nop}} with a temporary marker - content = content.replace('{{nop}}', '___NOP_MARKER___') - - # Convert \n\n to

\n paragraph indicators, but not if they're adjacent to ___NOP_MARKER___ - # This regex matches \n\n that are NOT preceded or followed by ___NOP_MARKER___ - content = re.sub(r'(?\n', content) - - # Restore {{nop}} markers - content = content.replace('___NOP_MARKER___', '{{nop}}') - - return content - - def _handle_special_characters(self, content: str) -> str: - """Handle special characters and entities - escape ampersands not in XML/HTML entities""" - # More comprehensive regex to match XML/HTML entities - # This includes named entities like &, <, >, ", ' - # and numeric entities like { and  - entity_pattern = r'&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);' - - # Split content by entities to preserve them - parts = re.split(f'({entity_pattern})', content) - - # Process each part - result_parts = [] - for part in parts: - if re.match(entity_pattern, part): - # This is an entity, keep it as-is - result_parts.append(part) - else: - # This is not an entity, escape standalone ampersands - escaped_part = part.replace('&', '&') - result_parts.append(escaped_part) - - return ''.join(result_parts) - - def _extract_metadata(self, content: str) -> Dict[str, Any]: - """Extract metadata from content""" - metadata = {} - # Extract page quality information - # Extract language information - # Extract structural information - return metadata - - # ============================================================================ - # POSTPROCESSORS - # ============================================================================ - - def _validate_xml_structure(self, content: str) -> str: - """Validate and fix XML structure""" - # Ensure proper nesting - # Validate against schema - # Fix common issues - return content - - def _cleanup_empty_elements(self, content: str) -> str: - """Remove or fix empty elements""" - # Remove empty elements - content = re.sub(r'<(\w+)[^>]*>', '', content) - return content - - def _finalize_metadata(self, content: str) -> str: - """Finalize metadata and 
add to content""" - # Add final metadata - # Ensure proper document structure - return content - - # ============================================================================ - # MAIN PROCESSING METHODS - # ============================================================================ - - def process_wikitext(self, wikitext: str) -> ConversionResult: - """ - Main method to process MediaWiki wikitext to XML. - - Args: - wikitext: The MediaWiki content to convert - - Returns: - ConversionResult with XML content and metadata - """ - warnings = [] - errors = [] - metadata = {} - - try: - # Preprocessing - content = wikitext - for preprocessor in self.preprocessors: - if preprocessor == self._extract_metadata: - metadata.update(preprocessor(content)) - else: - content = preprocessor(content) - - # Parse MediaWiki content - parsed = mwparserfromhell.parse(content) - - # Process all nodes with nested content support - nodes_to_replace = [] - - # Process nodes in the order they appear in the document - for node in parsed.nodes: - if hasattr(node, 'name'): # Template - template_name = str(node.name).strip() - if template_name in self.template_handlers: - try: - # Process nested content within the template - processed_node = self._process_template_with_nesting(node) - replacement = self.template_handlers[template_name](processed_node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - errors.append(f"Error processing template {template_name}: {str(e)}") - else: - warnings.append(f"Unknown template: {template_name}") - - elif hasattr(node, 'tag'): # Tag - tag_name = str(node.tag).strip().lower() - if tag_name in self.tag_handlers: - try: - # Process nested content within the tag - processed_node = self._process_tag_with_nesting(node) - replacement = self.tag_handlers[tag_name](processed_node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - errors.append(f"Error processing tag {tag_name}: {str(e)}") - else: - 
warnings.append(f"Unknown tag: {tag_name}") - - elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink - try: - replacement = self._handle_wikilink(node) - nodes_to_replace.append((node, replacement)) - except Exception as e: - errors.append(f"Error processing wikilink: {str(e)}") - - # Replace all nodes in order - for node, replacement in nodes_to_replace: - parsed.replace(node, replacement) - - # Get processed content - xml_content = str(parsed) - - # Postprocessing - for postprocessor in self.postprocessors: - xml_content = postprocessor(xml_content) - - # Wrap in mediawiki tag - xml_content = f'{xml_content}' - - return ConversionResult( - xml_content=xml_content, - metadata=metadata, - warnings=warnings, - errors=errors, - wikilinks=self.wikilinks.copy() - ) - - except Exception as e: - errors.append(f"Fatal error in processing: {str(e)}") - return ConversionResult( - xml_content="", - metadata={}, - warnings=warnings, - errors=errors, - wikilinks=[] - ) - - def add_template_handler(self, template_name: str, handler_func): - """Add a custom template handler""" - self.template_handlers[template_name] = handler_func - - def add_tag_handler(self, tag_name: str, handler_func): - """Add a custom tag handler""" - self.tag_handlers[tag_name] = handler_func - - def add_preprocessor(self, preprocessor_func): - """Add a custom preprocessor""" - self.preprocessors.append(preprocessor_func) - - def add_postprocessor(self, postprocessor_func): - """Add a custom postprocessor""" - self.postprocessors.append(postprocessor_func) - - def get_wikilinks(self) -> List[Dict[str, Any]]: - """Get all captured wikilinks""" - return self.wikilinks.copy() - - def clear_wikilinks(self): - """Clear all captured wikilinks""" - self.wikilinks.clear() - - -# ============================================================================ -# CONVENIENCE FUNCTIONS -# ============================================================================ - -def create_processor() 
-> MediaWikiProcessor: - """Create a new MediaWiki processor instance""" - return MediaWikiProcessor() - - -def process_page(page_content: str) -> ConversionResult: - """Process a single page of MediaWiki content""" - processor = create_processor() - return processor.process_wikitext(page_content) - - -if __name__ == "__main__": - # Example usage - processor = create_processor() - - # Example MediaWiki content with nested templates - sample_wikitext = """ - {{verse|1|1|In the beginning God created the heaven and the earth.}} - - {{verse|1|2|And the earth was without form, and void; and darkness was upon the face of the deep.}} +from opensiddur.importer.util.mediawiki_processor import ( # noqa: F401 + ConversionResult, + MediaWikiProcessor, + create_processor, + process_page, +) - {{sc|Genesis}} {{c|Chapter 1}} - {{larger|The Creation}} - This is a reference - - See also [[Genesis]] and [[Creation myth]] for more information. - - Nested example: {{sc|{{larger|Bold Large Text}}}} - Complex nested: {{verse|1|3|{{sc|God}} said, {{larger|Let there be light}}}} - """ - - result = processor.process_wikitext(sample_wikitext) - print("XML Output:") - print(result.xml_content) - print("\nWarnings:", result.warnings) - print("Errors:", result.errors) - print("Wikilinks:", result.wikilinks) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/README.md b/opensiddur/importer/miqra_al_pi_hamasorah/README.md index 19b3cfe..827ef3e 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/README.md +++ b/opensiddur/importer/miqra_al_pi_hamasorah/README.md @@ -4,7 +4,7 @@ Scripts to download [*Miqra according to the Masorah*](https://docs.google.com/s ## License -The README tab of the source spreadsheet states that the text is prepared by Sefer Avi Kadish, based on Hebrew Wikisource material, and is licensed **CC-BY-SA 4.0 International**, with attribution to Hebrew Wikisource. See the downloaded `sheets/readme.tsv` for the full Hebrew and English wording. 
+The README tab of the source spreadsheet states that the text is prepared by Avi Kadish, based on Hebrew Wikisource material, and is licensed **CC-BY-SA 4.0 International**, with attribution to Hebrew Wikisource. See the downloaded `sheets/readme.tsv` for the full Hebrew and English wording. ## Download diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py new file mode 100644 index 0000000..43a3217 --- /dev/null +++ b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py @@ -0,0 +1,601 @@ +from __future__ import annotations + +import argparse +import csv +import logging +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Optional + +import mwparserfromhell + +from opensiddur.common.constants import PROJECT_DIRECTORY +from opensiddur.common.xslt import xslt_transform_string +from opensiddur.importer.util.pages import ( + default_sourcetexts_root, + miqra_al_pi_hamasorah_data_directory, + miqra_al_pi_hamasorah_sheets_directory, +) +from opensiddur.importer.util.prettify import prettify_xml +from opensiddur.importer.util.validation import validate +from opensiddur.importer.miqra_al_pi_hamasorah.miqra_wikitext import ( + wikitext_to_intermediate_xml, +) + +logger = logging.getLogger(__name__) + +MIQRA_TO_TEI_XSLT = Path(__file__).parent / "miqra_to_tei.xslt" + +# Biblical-book tabs only (5-column A–E schema). Do not ingest special/auto_edits/etc. 
+BIBLICAL_TSV_SLUGS = frozenset( + { + "torah", + "neviim_rishonim", + "neviim_acharonim", + "sifrei_emet", + "chamisha_megillot", + "ketuvim_aharonim", + } +) + +_NON_VERSE_ROW_IDS = frozenset({"0", "תתת"}) + + +def make_project_directory(project_dir: Path | None = None) -> Path: + directory = ( + project_dir.resolve() + if project_dir is not None + else PROJECT_DIRECTORY / "miqra_al_pi_hamasorah" + ) + directory.mkdir(parents=True, exist_ok=True) + return directory + + +def _default_project_directory() -> Path: + return PROJECT_DIRECTORY / "miqra_al_pi_hamasorah" + + +@dataclass(frozen=True) +class Book: + book_name_he: str + book_name_en: str + file_name: str + + +@dataclass(frozen=True) +class Index: + index_title_en: str + index_title_he: Optional[str] + index_sub_en: Optional[str] + index_sub_he: Optional[str] + file_name: str + transclusions: list[Book | "Index"] + + +TANAKH_INDEX: list[Index] = [ + Index( + index_title_en="Miqra al pi ha-Masorah", + index_title_he="מקרא על פי המסורה", + index_sub_en=None, + index_sub_he=None, + file_name="index", + transclusions=[ + Index( + index_title_en="The Law", + index_title_he="תורה", + index_sub_en=None, + index_sub_he=None, + file_name="the_law", + transclusions=[ + Book("בראשית", "Genesis", "genesis"), + Book("שמות", "Exodus", "exodus"), + Book("ויקרא", "Leviticus", "leviticus"), + Book("במדבר", "Numbers", "numbers"), + Book("דברים", "Deuteronomy", "deuteronomy"), + ], + ), + Index( + index_title_en="The Prophets", + index_title_he="נביאים", + index_sub_en=None, + index_sub_he=None, + file_name="the_prophets", + transclusions=[ + Book("יהושע", "Joshua", "joshua"), + Book("שפטים", "Judges", "judges"), + Book("שמואל א", "I Samuel", "samuel_1"), + Book("שמואל ב", "II Samuel", "samuel_2"), + Book("מלכים א", "I Kings", "kings_1"), + Book("מלכים ב", "II Kings", "kings_2"), + Book("ישעיה", "Isaiah", "isaiah"), + Book("ירמיה", "Jeremiah", "jeremiah"), + Book("יחזקאל", "Ezekiel", "ezekiel"), + Index( + index_title_en="The 
Twelve", + index_title_he=None, + index_sub_en=None, + index_sub_he=None, + file_name="the_twelve", + transclusions=[ + Book("הושע", "Hosea", "hosea"), + Book("יואל", "Joel", "joel"), + Book("עמוס", "Amos", "amos"), + Book("עובדיה", "Obadiah", "obadiah"), + Book("יונה", "Jonah", "jonah"), + Book("מיכה", "Micah", "micah"), + Book("נחום", "Nahum", "nahum"), + Book("חבקוק", "Habakkuk", "habakkuk"), + Book("צפניה", "Zephaniah", "zephaniah"), + Book("חגי", "Haggai", "haggai"), + Book("זכריה", "Zechariah", "zechariah"), + Book("מלאכי", "Malachi", "malachi"), + ], + ), + ], + ), + Index( + index_title_en="The Writings", + index_title_he="כתובים", + index_sub_en=None, + index_sub_he=None, + file_name="the_writings", + transclusions=[ + Book("תהלים", "Psalms", "psalms"), + Book("משלי", "Proverbs", "proverbs"), + Book("איוב", "Job", "job"), + Book("שיר השירים", "Song of Songs", "song_of_songs"), + Book("רות", "Ruth", "ruth"), + Book("איכה", "Lamentations", "lamentations"), + Book("קהלת", "Ecclesiastes", "ecclesiastes"), + Book("אסתר", "Esther", "esther"), + Book("דניאל", "Daniel", "daniel"), + Book("עזרא", "Ezra", "ezra"), + Book("נחמיה", "Nehemiah", "nehemiah"), + Book("דברי הימים א", "I Chronicles", "chronicles_1"), + Book("דברי הימים ב", "II Chronicles", "chronicles_2"), + ], + ), + ], + ) +] + + +def _flatten_books(indices: Iterable[Index]) -> list[Book]: + books: list[Book] = [] + for idx in indices: + for t in idx.transclusions: + if isinstance(t, Book): + books.append(t) + else: + books.extend(_flatten_books([t])) + return books + + +def header( + title_he: Optional[str], + title_en: str, + *, + project_id: str = "miqra_al_pi_hamasorah", + namespace: str = "bible", + entrypoint: str = "tanakh", + qualifier: str = "", + license_url: str = "https://creativecommons.org/licenses/by-sa/4.0/", + license_name: str = "Creative Commons Attribution-ShareAlike 4.0 International", +) -> str: + title_he_xml = ( + f"""{title_he}""" if title_he else "" + ) + return f""" + + + 
{title_en} + {title_he_xml} + + + + Open Siddur Project + + urn:x-opensiddur:text:{namespace}:{entrypoint}{qualifier}@{project_id} + + {license_name} + + + + + מקרא על פי המסורה + Avi Kadish + + Hebrew Wikisource + + https://he.wikisource.org/wiki/%D7%9E%D7%A7%D7%A8%D7%90_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%94%D7%9E%D7%A1%D7%95%D7%A8%D7%94#%D7%A8%D7%90%D7%A9 + Prepared by Avi Kadish, based on Hebrew Wikisource material; distributed via a public Google Sheet. + + + + +""" + + +def tei_file( + header_xml: str, + *, + default_lang: str = "he", + front: str = "", + body: str = "", + back: str = "", + stand_off: str = "", +) -> str: + return f""" +{header_xml} + +{front} +{body} +{back} + +{stand_off} + +""" + + +def validate_and_write_tei_file(tei_content: str, file_name: str, project_dir: Path | None) -> Path: + directory = project_dir.resolve() if project_dir is not None else _default_project_directory() + out_path = directory / f"{file_name}.xml" + pretty_xml = prettify_xml(tei_content, remove_xml_declaration=True) + is_valid, errors = validate(pretty_xml) + if not is_valid: + raise Exception(f"Errors in {file_name}: {errors}") + out_path.write_text(pretty_xml, encoding="utf-8") + return out_path + + +def _xml_escape(text: str) -> str: + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + + +_PAGE_KEY_RE = re.compile(r"^\s*(?:ספר\s+)?(?P[^/]+)\s*/\s*(?P[^/\s]+)\s*$") + + +def _book_key_from_page_key(page_key: str) -> Optional[str]: + m = _PAGE_KEY_RE.match(page_key or "") + if not m: + return None + return m.group("book").strip() + + +_HEBREW_NUM_RE = re.compile(r"^[\u05d0-\u05ea\"׳״\s]+$") + + +def _hebrew_numeral_to_int(value: str) -> Optional[int]: + """ + Very small Hebrew-numeral parser for verse/chapter labels. + + Handles single-letter verse labels (א,ב,ג,...) and common gershayim/geresh marks. + For anything more complex, we return None and fall back to original. 
+ """ + s = (value or "").strip() + if not s: + return None + s = s.replace("״", "").replace("׳", "").replace("'", "").replace('"', "").strip() + if not s: + return None + if not _HEBREW_NUM_RE.match(s): + return None + + # Simple gematria for Hebrew letters + mapping = { + "א": 1, + "ב": 2, + "ג": 3, + "ד": 4, + "ה": 5, + "ו": 6, + "ז": 7, + "ח": 8, + "ט": 9, + "י": 10, + "כ": 20, + "ך": 20, + "ל": 30, + "מ": 40, + "ם": 40, + "נ": 50, + "ן": 50, + "ס": 60, + "ע": 70, + "פ": 80, + "ף": 80, + "צ": 90, + "ץ": 90, + "ק": 100, + "ר": 200, + "ש": 300, + "ת": 400, + } + + total = 0 + for ch in s: + if ch.isspace(): + continue + v = mapping.get(ch) + if v is None: + return None + total += v + return total if total > 0 else None + + +def _normalize_to_arabic_numerals(value: str) -> str: + s = (value or "").strip() + if not s: + return "" + if s.isdigit(): + return s + n = _hebrew_numeral_to_int(s) + if n is not None: + return str(n) + return "" + + +def _valid_urn_segment(value: str) -> str: + """Return an Arabic numeral string suitable for URN path segments, or empty.""" + normalized = _normalize_to_arabic_numerals(value) + return normalized if normalized.isdigit() else "" + + +def _chapter_from_page_key(page_key: str) -> str: + m = _PAGE_KEY_RE.match(page_key or "") + if not m: + return "" + return _normalize_to_arabic_numerals(m.group("chapter").strip()) + + +def _extract_m_pasuk(scaffold_wikitext: str) -> tuple[str, str]: + """ + Extract (chapter, verse) from {{מ:פסוק|...}} when present. + Expected: {{מ:פסוק|||}}. + """ + parsed = mwparserfromhell.parse(scaffold_wikitext or "") + # Top-level only: avoid nested {{מ:פסוק|...}} inside verse text in other columns. 
+ for t in parsed.filter_templates(recursive=False): + if str(t.name).strip() != "מ:פסוק": + continue + ch_raw = str(t.get(2).value).strip() if t.has(2) else "" + v_raw = str(t.get(3).value).strip() if t.has(3) else "" + ch = _valid_urn_segment(ch_raw) + v = _valid_urn_segment(v_raw) + if ch and v: + return ch, v + return "", "" + + +def _extract_chapter_verse_numbers(page_key: str, row_id: str, scaffold_wikitext: str) -> tuple[str, str]: + row_id = (row_id or "").strip() + if row_id in _NON_VERSE_ROW_IDS or len(row_id) > 8: + return "", "" + + ch2, v2 = _extract_m_pasuk(scaffold_wikitext) + if ch2 and v2: + return ch2, v2 + + chapter = _valid_urn_segment(_chapter_from_page_key(page_key)) + verse = _valid_urn_segment(row_id) + if chapter and verse: + return chapter, verse + return "", "" + + +def _build_book_name_map() -> dict[str, Book]: + # Map Hebrew book title → Book + books = _flatten_books(TANAKH_INDEX) + return {b.book_name_he: b for b in books} + + +def _iter_tsv_rows(tsv_path: Path) -> Iterable[list[str]]: + with tsv_path.open("r", encoding="utf-8", newline="") as f: + reader = csv.reader(f, delimiter="\t") + for row in reader: + yield row + + +def _looks_like_header_row(row: list[str]) -> bool: + # Conservative heuristic: TSV export may include a header row with obvious labels. + joined = "\t".join(row).lower() + return any(k in joined for k in ("page", "row", "navigation", "scaffold", "text", "עמוד", "שורה")) + + +def miqra_rows_to_intermediate(book: Book, sheets_dir: Path) -> str: + """ + Build an intermediate XML document for a single book. + + We scan all TSVs under sheets_dir and select rows whose page key identifies + the requested book. 
+ """ + he_to_book = _build_book_name_map() + target_he = book.book_name_he + + rows_xml: list[str] = [] + for tsv_path in sorted(sheets_dir.glob("*.tsv")): + slug = tsv_path.stem + if slug not in BIBLICAL_TSV_SLUGS: + continue + + first = True + for row in _iter_tsv_rows(tsv_path): + if first and _looks_like_header_row(row): + first = False + continue + first = False + + # Biblical tabs: require the 5-column A–E schema. + if len(row) < 5: + continue + + page_key = row[0] + row_id = row[1] + nav = row[2] + scaffold = row[3] + text = row[4] + + book_he = _book_key_from_page_key(page_key) or "" + resolved = he_to_book.get(book_he) + if resolved is None or resolved.book_name_he != target_he: + continue + + chapter_n, verse_n = _extract_chapter_verse_numbers(page_key, row_id, scaffold) + if not chapter_n or not verse_n: + continue + + rows_xml.append( + f""" + {wikitext_to_intermediate_xml(nav, column_c=True)} + {wikitext_to_intermediate_xml(scaffold)} + {wikitext_to_intermediate_xml(text)} +""" + ) + + rows_joined = "\n".join(rows_xml) + return f""" +{rows_joined} + +""" + + +def intermediate_to_tei(intermediate_xml: str, *, xslt_params: Optional[dict[str, Any]] = None) -> dict[str, str]: + outputs = xslt_transform_string( + MIQRA_TO_TEI_XSLT, + intermediate_xml, + multiple_results=True, + xslt_params=xslt_params, + ) + return { + "front": outputs.get("front", ""), + "body": outputs.get("body", outputs.get("", "")), + "stand_off": outputs.get("standoff", ""), + } + + +def book_file(book: Book, *, sourcetexts_root: Path | None, project_dir: Path | None) -> None: + sheets_dir = miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root) + if not sheets_dir.exists(): + raise FileNotFoundError(f"Missing Miqra sheets directory: {sheets_dir} (run download first)") + + intermediate = miqra_rows_to_intermediate(book, sheets_dir) + xml_dict = intermediate_to_tei(intermediate) + header_xml = header(book.book_name_he, book.book_name_en, qualifier=f":{book.file_name}") + tei_content 
= tei_file(header_xml, **xml_dict) + make_project_directory(project_dir) + validate_and_write_tei_file(tei_content, book.file_name, project_dir) + + +def _readme_front_matter(sourcetexts_root: Path | None) -> str: + sheets_dir = miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root) + readme = sheets_dir / "readme.tsv" + if not readme.exists(): + return "" + lines: list[str] = [] + for row in _iter_tsv_rows(readme): + # Preserve all cells; this is human prose. + line = " ".join(c for c in row if c).strip() + if line: + lines.append(line) + paras = "\n".join([f"{_xml_escape(l)}" for l in lines]) + return f"{paras}" + + +def index_file(idx: Index, *, sourcetexts_root: Path | None, project_dir: Path | None) -> None: + transclusion_str = "\n".join( + [ + f"""""" + for t in idx.transclusions + ] + ) + index_body = f""" + + {_xml_escape(idx.index_title_en)} + {transclusion_str} + + +""" + front = _readme_front_matter(sourcetexts_root) if idx.file_name == "index" else "" + header_xml = header(idx.index_title_he, idx.index_title_en, qualifier=f":{idx.file_name}") + tei_content = tei_file(header_xml, front=front, body=index_body) + make_project_directory(project_dir) + validate_and_write_tei_file(tei_content, idx.file_name, project_dir) + + for t in idx.transclusions: + if isinstance(t, Index): + index_file(t, sourcetexts_root=sourcetexts_root, project_dir=project_dir) + else: + book_file(t, sourcetexts_root=sourcetexts_root, project_dir=project_dir) + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Convert Miqra al pi ha-Masorah TSV sheets to JLPTEI." + ) + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=default_sourcetexts_root(), + help="Root of sourcetexts tree (default: /sources).", + ) + parser.add_argument( + "--project-dir", + type=Path, + default=None, + help=( + "Output project directory (default: /project/miqra_al_pi_hamasorah)." 
+ ), + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print planned actions without writing files.", + ) + parser.add_argument( + "--only-book", + type=str, + default=None, + help="Only generate a single book by file slug (e.g. genesis).", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + logging.basicConfig(level=logging.INFO) + args = _build_arg_parser().parse_args(argv) + + data_dir = miqra_al_pi_hamasorah_data_directory(args.sourcetexts_root) + sheets_dir = miqra_al_pi_hamasorah_sheets_directory(args.sourcetexts_root) + out_dir = args.project_dir if args.project_dir is not None else _default_project_directory() + + if args.dry_run: + logger.info("Would read Miqra TSVs from %s", sheets_dir) + logger.info("Would write project files to %s", out_dir) + if args.only_book: + logger.info("Would generate only book: %s", args.only_book) + return 0 + + if args.only_book: + all_books = {b.file_name: b for b in _flatten_books(TANAKH_INDEX)} + book = all_books.get(args.only_book) + if book is None: + raise ValueError(f"Unknown book slug: {args.only_book}") + book_file(book, sourcetexts_root=args.sourcetexts_root, project_dir=args.project_dir) + return 0 + + # Generate index + all transclusions recursively + index_file(TANAKH_INDEX[0], sourcetexts_root=args.sourcetexts_root, project_dir=args.project_dir) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_to_tei.xslt b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_to_tei.xslt new file mode 100644 index 0000000..f080fd9 --- /dev/null +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_to_tei.xslt @@ -0,0 +1,243 @@ + + + + + + + + + + + urn:x-opensiddur:text:bible: + + + + + + + + + + + + + + + + + + + + + + + + + + + urn:x-opensiddur:text:bible: + + / + + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [ + + ] + + + + + + + ( + + ) + + 
+ + + + [ + + ] + + + + + + + + + + + + + + + + + + + + + indent + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ͏ִ + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py new file mode 100644 index 0000000..6ca5e25 --- /dev/null +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py @@ -0,0 +1,633 @@ +""" +Convert Miqra al pi ha-Masorah wikitext (per templates.tsv) to intermediate XML. + +All templates documented in sources/miqra_al_pi_hamasorah/sheets/templates.tsv are +handled here, including when nested inside verse text (e.g. {{נוסח|…}}). +""" + +from __future__ import annotations + +import re +from typing import Callable, Optional +from urllib.parse import quote + +import mwparserfromhell + +from opensiddur.importer.util.mediawiki_processor import MediaWikiProcessor + +MIQRA_NS = "urn:x-opensiddur:miqra:intermediate" +MW_NS = "urn:x-opensiddur:mw:intermediate" + +_STRIP_TEMPLATES = frozenset( + { + "מ:פסוק", + "מ:פסוק-שירה", + "מ:שוליים", + "מ:שוליים-סוף", + "מ:טעמי המקרא", + "מ:טעמי המקרא-סוף", + "טעמי המקרא באינטרנט", + "תבנית:טעמי המקרא באינטרנט", + "מ:ספר חדש", + "מ:רווח בתרי עשר", + "רווח בתרי עשר", + "מ:רווח לספר בתהלים", + "רווח לספר בתהלים", + "מ:אין פרשה בתחילת פרק", + 'מ:אין פרשה בתחילת פרק בספרי אמ"ת', + "מ:אין רווח של פרשה בתחילת פרשת השבוע", + "מ:יישור-בשני-הצדדים", + "מ:יישור-בשני-הצדדים-סוף", + "בסיס-משתמש", + 'צורות כתיבה בספרי אמ"ת', + "documentation", + "name", + "template", + "תבנית", + } +) + +_BOLD_ITALIC_RE = re.compile(r"'''''(.*?)'''''") +_BOLD_RE = re.compile(r"'''(.*?)'''") +_ITALIC_RE = re.compile(r"''(.*?)''") +_ANY_HI_RE = re.compile(r"'''''(.*?)'''''|'''(.*?)'''|''(.*?)''") +_TAG_OPEN_RE = re.compile(r"<(miqra|mw):([a-zA-Z0-9-]+)([^>]*?)(/?)>") +_KETEG_START_RE = 
re.compile(r"<קטע\s+התחלה=([^/>]+)\s*/>", re.IGNORECASE) +_KETEG_END_RE = re.compile(r"<קטע\s+סוף=([^/>]+)\s*/>", re.IGNORECASE) + + +def normalize_template_name(name: str) -> str: + n = str(name).strip() + if n.lower().startswith("תבנית:"): + n = n.split(":", 1)[1].strip() + n = n.replace("''", '"').replace("״", '"').replace("׳", "'") + return n.strip() + + +def link_target_to_uri(target: str) -> str: + """Turn a URL or Hebrew Wikisource page title into a valid URI for tei:ref/@target.""" + t = (target or "").strip() + if not t: + return "" + if re.match(r"^https?://", t, re.I): + return t + if t.startswith("//"): + return "https:" + t + page, sep, frag = t.partition("#") + page = page.replace(" ", "_").strip() + if page: + uri = "https://he.wikisource.org/wiki/" + quote(page, safe="/:%") + else: + uri = "https://he.wikisource.org/wiki/" + if sep: + uri += "#" + quote(frag, safe=":/%.-_") + return uri + + +def _xml_escape(text: str) -> str: + return ( + (text or "") + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + + +def _wikitext_basic_markup_to_xml(text: str) -> str: + s = text or "" + out: list[str] = [] + pos = 0 + for m in _ANY_HI_RE.finditer(s): + out.append(_xml_escape(s[pos : m.start()])) + if m.group(1) is not None: + rend, inner = "bold-italic", m.group(1) + elif m.group(2) is not None: + rend, inner = "bold", m.group(2) + else: + rend, inner = "italic", m.group(3) or "" + out.append(f'{_xml_escape(inner)}') + pos = m.end() + out.append(_xml_escape(s[pos:])) + return "".join(out) + + +def _escape_outside_tags(fragment: str) -> str: + """Escape text nodes while preserving nested miqra:/mw: XML elements.""" + + out: list[str] = [] + pos = 0 + while pos < len(fragment): + m = _TAG_OPEN_RE.search(fragment, pos) + if not m: + out.append(_wikitext_basic_markup_to_xml(fragment[pos:])) + break + out.append(_wikitext_basic_markup_to_xml(fragment[pos : m.start()])) + ns, local, _attrs, self_close = 
m.group(1), m.group(2), m.group(3), m.group(4) + if self_close == "/": + out.append(m.group(0)) + pos = m.end() + continue + close = f"" + depth = 1 + search = m.end() + closed_at: Optional[int] = None + while depth > 0 and search <= len(fragment): + next_close = fragment.find(close, search) + if next_close == -1: + break + inner_open = _TAG_OPEN_RE.search(fragment, search, next_close) + if inner_open and inner_open.start() < next_close and inner_open.group(4) != "/": + inner_local = inner_open.group(2) + if inner_open.group(1) == ns and inner_local == local: + depth += 1 + search = inner_open.end() + else: + depth -= 1 + if depth == 0: + closed_at = next_close + else: + search = next_close + len(close) + if closed_at is None: + out.append(_wikitext_basic_markup_to_xml(fragment[m.start() :])) + break + inner = fragment[m.end() : closed_at] + out.append(m.group(0)) + out.append(_escape_outside_tags(inner)) + out.append(close) + pos = closed_at + len(close) + return "".join(out) + + +def _preprocess_column_c(wikitext: str) -> str: + """Column C markers from templates.tsv (not templates).""" + s = wikitext or "" + s = s.replace("__", " ") + s = re.sub(r"(?", s) + return s + + +def _preprocess_miqra_tags(wikitext: str) -> str: + s = wikitext or "" + s = _KETEG_START_RE.sub( + r'', s + ) + s = _KETEG_END_RE.sub(r'', s) + return s + + +class MiqraWikiTextProcessor(MediaWikiProcessor): + """MediaWiki processor with handlers for all Miqra templates.""" + + def __init__(self) -> None: + self._note_seq = 0 + super().__init__() + + def _initialize_handlers(self) -> None: + self.template_handlers = {} + self.tag_handlers = {} + self.preprocessors = [_preprocess_miqra_tags] + self.postprocessors = [] + self._register_template_handlers() + self._register_tag_handlers() + + def _register_tag_handlers(self) -> None: + self.tag_handlers["noinclude"] = self._handle_strip_tag + + def _handle_strip_tag(self, tag) -> str: + return "" + + def _register_template_handlers(self) -> None: + 
h = self.add_template_handler + for name in _STRIP_TEMPLATES: + h(name, self._handle_strip) + + h("נוסח", self._handle_nosach) + h("ש", self._handle_footnote_mark) + h("שם", self._handle_strip) + + h("פפ", self._handle_parashah_open) + h("פפפ", self._handle_parashah_open_line) + h("רווח בסוף שורה", self._handle_strip) + h("סס", self._handle_parashah_close) + h("ססס", self._handle_parashah_close_inline) + h("סס2", self._handle_parashah_close_narrow) + h("מ:ששש", self._handle_shirah_break) + + h("ר0", self._handle_poetic_space) + h("ר1", self._handle_poetic_indent1) + h("ר2", self._handle_poetic_indent2) + h("ר3", self._handle_poetic_line) + h("ר4", self._handle_poetic_verse) + h("פרשה-מרכז", self._handle_centered_title) + + h("כתיב ולא קרי", self._handle_ketiv_only) + h("קרי ולא כתיב", self._handle_qeri_only) + h('מ:קו"כ-אם-2', self._handle_qok_if_matres) + h('מ:קו"כ קרי שונה מהכתיב בשתי מילים', self._handle_qok_two_qeri_words) + + h("מ:אות-ג", self._handle_large_letter) + h("מ:אות-ק", self._handle_small_letter) + h("מ:אות תלויה", self._handle_raised_letter) + h("מ:אות מנוקדת", self._handle_dotted_letter) + h('מ:נו"ן הפוכה', self._handle_inverted_nun) + h("מ:ירושלם", self._handle_yerushalem) + h("מ:ירושלמה", self._handle_yerushalema) + h("ירח בן יומו", self._handle_accent_yerah) + h("גלגל", self._handle_accent_galgal) + h("אתנח הפוך", self._handle_accent_etnah) + h("מ:קמץ", self._handle_qamats) + h("מ:טעם ומתג באות אחת", self._handle_taam_meteg) + h("שני טעמים באות אחת", self._handle_two_taamim) + h( + "שני טעמים באות אחת קמץ-תחתון-פתח-עליון", + self._handle_two_taamim_qupo, + ) + h("מ:טעם", self._handle_taam_dummy) + h("תבנית:מ:טעם", self._handle_taam_dummy) + h("מ:גרש ותלישא גדולה", self._handle_geresh_telisha) + h("מ:גרשיים ותלישא גדולה", self._handle_gershayim_telisha) + h("מ:כל קמץ קטן מרכא", self._handle_kol_qamats) + h("מ:לגרמיה-2", self._handle_legarmeh) + h("מ:פסק", self._handle_paseq) + h("מ:מקף אפור", self._handle_grey_maqaf) + + h("מ:הערה", 
self._handle_mam_note) + h("עוגן בשורה", self._handle_line_anchor) + h("מ:סיום בטוב", self._handle_good_ending) + h("קק", self._handle_dual_trope_link) + h("מ:כפול", self._handle_dual_accent) + + h("מ:קישור בהערה", self._handle_note_link) + h("מ:קישור פנימי בהערה", self._handle_note_link) + h("מודגש", self._handle_emphasis) + + def _lookup_handler(self, name: str) -> Optional[Callable]: + n = normalize_template_name(name) + if n in self.template_handlers: + return self.template_handlers[n] + if n.startswith('מ:כו"ק') or n.startswith('כו"ק') or n.startswith("כו''ק"): + return self._handle_ketiv_qeri + if n.startswith('מ:קו"כ') or n.startswith('קו"כ') or n.startswith("קו''כ"): + return self._handle_qeri_ketiv + return None + + def _process_nested_content(self, content: str, depth: int = 0) -> str: + if depth > 12: + return content + + parsed = mwparserfromhell.parse(content) + nodes_to_replace = [] + + for node in parsed.nodes: + if hasattr(node, "name"): + template_name = str(node.name).strip() + handler = self._lookup_handler(template_name) + if handler is None: + n = normalize_template_name(template_name) + if n in _STRIP_TEMPLATES: + handler = self._handle_strip + else: + processed = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed)) + continue + try: + processed_node = self._process_template_with_nesting(node, depth + 1) + replacement = handler(processed_node) + except Exception: + replacement = handler(node) + nodes_to_replace.append((node, replacement)) + elif hasattr(node, "tag"): + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + processed_node = self._process_tag_with_nesting(node, depth + 1) + replacement = self.tag_handlers[tag_name](processed_node) + except Exception: + replacement = self.tag_handlers[tag_name](node) + nodes_to_replace.append((node, replacement)) + else: + processed = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, 
processed)) + elif "Wikilink" in str(node.__class__): + nodes_to_replace.append((node, self._handle_wikilink_miqra(node))) + + elif node.__class__.__name__ == "Heading": + # Note text uses "=source=reading" notation; mwparser treats it as wikitext headings. + title = self._process_nested_content(str(node.title), depth + 1) + nodes_to_replace.append((node, "=" + title + "=")) + + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + return str(parsed) + + def _handle_wikilink_miqra(self, node) -> str: + raw_title = str(getattr(node, "title", "")).strip() + target = _xml_escape(link_target_to_uri(raw_title)) + text = str(getattr(node, "text", "")).strip() if getattr(node, "text", None) else "" + if text: + return f'{_xml_escape(text)}' + return f'' + + def _p(self, content: str) -> str: + return self._process_nested_content(content or "") + + def _param_value(self, template, key: str | int) -> str: + """Read a template parameter by name or 1-based index. + + mwparserfromhell's ``template.get(1)`` returns ``'1=value'`` when the + wikitext uses explicit ``1=value`` syntax; iterating ``params`` is reliable. 
+ """ + key_s = str(key).strip() + for p in template.params: + pname = str(p.name).strip() + if pname == key_s: + return str(p.value).strip() + if pname.isdigit() and key_s.isdigit() and int(pname) == int(key_s): + return str(p.value).strip() + return "" + + def _param(self, template, index: int) -> str: + return self._param_value(template, index) + + def _named_param(self, template, name: str) -> str: + return self._param_value(template, name) + + def _note_params(self, template) -> str: + parts: list[str] = [] + for p in template.params: + pname = str(p.name).strip() + if pname.isdigit() and int(pname) >= 2: + parts.append(self._p(str(p.value))) + elif pname in ("2", "הערות", "הערה", "notes"): + parts.append(self._p(str(p.value))) + return "".join(parts) + + def _mid_verse_attr(self, template) -> str: + for p in template.params: + if "פסקא באמצע פסוק" in str(p.value): + return ' midVerse="true"' + return "" + + def _next_note_id(self) -> str: + self._note_seq += 1 + return f"miqra-note-{self._note_seq}" + + # --- handlers --- + + def _handle_strip(self, template) -> str: + return "" + + def _handle_nosach(self, template) -> str: + display = self._p(self._param(template, 1)) + notes = self._note_params(template) + if not notes: + return display + note_id = self._next_note_id() + return ( + f'' + f"{display}" + f"" + f'{notes}' + ) + + def _handle_footnote_mark(self, template) -> str: + return "" + + def _handle_ketiv_qeri(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + qeri = self._p(self._param(template, 2)) + return ( + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qeri_ketiv(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + qeri = self._p(self._param(template, 2)) + return ( + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qok_if_matres(self, template) -> str: + display = self._p(self._param(template, 1)) + ketiv = self._p(self._param(template, 2)) + qeri = self._p(self._param(template, 3)) + return ( + 
f"{display}" + f'' + f"{ketiv}" + f"{qeri}" + f"" + ) + + def _handle_qok_two_qeri_words(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + q1 = self._p(self._param(template, 2)) + q2 = self._p(self._param(template, 3)) + return ( + f'' + f"{q1}" + f"{q2}" + f"{ketiv}" + f"" + ) + + def _handle_ketiv_only(self, template) -> str: + ketiv = self._p(self._param(template, 1)) + return f'({ketiv})' + + def _handle_qeri_only(self, template) -> str: + qeri = self._p(self._param(template, 1)) + return f"[{qeri}]" + + def _handle_parashah_open(self, template) -> str: + return f'' + + def _handle_parashah_open_line(self, template) -> str: + return f'' + + def _handle_parashah_close(self, template) -> str: + return f'' + + def _handle_parashah_close_inline(self, template) -> str: + return f'' + + def _handle_parashah_close_narrow(self, template) -> str: + return f'' + + def _handle_shirah_break(self, template) -> str: + return '' + + def _handle_poetic_space(self, template) -> str: + return '' + + def _handle_poetic_indent1(self, template) -> str: + return '' + + def _handle_poetic_indent2(self, template) -> str: + return '' + + def _handle_poetic_line(self, template) -> str: + return '' + + def _handle_poetic_verse(self, template) -> str: + return '' + + def _handle_centered_title(self, template) -> str: + title = self._p(self._param(template, 1)) + return f"{title}" + + def _handle_large_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_small_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_raised_letter(self, template) -> str: + letter = self._p(self._param(template, 1)) + return f'{letter}' + + def _handle_dotted_letter(self, template) -> str: + word = self._p(self._param(template, 1)) + return f"{word}" + + def _handle_inverted_nun(self, template) -> str: + sym = self._p(self._param(template, 1)) + return f"{sym}" + + def 
_handle_yerushalem(self, template) -> str: + p1 = _xml_escape(self._param(template, 1)) + p2 = _xml_escape(self._param(template, 2)) + return f'' + + def _handle_yerushalema(self, template) -> str: + p1 = _xml_escape(self._param(template, 1)) + p2 = _xml_escape(self._param(template, 2)) + return f'' + + def _handle_accent_yerah(self, template) -> str: + return '' + + def _handle_accent_galgal(self, template) -> str: + return '' + + def _handle_accent_etnah(self, template) -> str: + return '' + + def _handle_qamats(self, template) -> str: + d = self._named_param(template, "ד") + s = self._named_param(template, "ס") + text = d or s or self._param(template, 1) + return self._p(text) + + def _handle_taam_meteg(self, template) -> str: + return self._p(self._param(template, 1)) + + def _handle_two_taamim(self, template) -> str: + return '' + + def _handle_two_taamim_qupo(self, template) -> str: + above = self._p(self._named_param(template, "עליו") or self._param(template, 1)) + return f'' + + def _handle_taam_dummy(self, template) -> str: + raw = self._param(template, 1) + return self._p(raw[1:] if raw else "") + + def _handle_geresh_telisha(self, template) -> str: + return '' + + def _handle_gershayim_telisha(self, template) -> str: + return '' + + def _handle_kol_qamats(self, template) -> str: + return self._p(self._param(template, 1)) or "כָּל" + + def _handle_legarmeh(self, template) -> str: + return '׀' + + def _handle_paseq(self, template) -> str: + return '׀' + + def _handle_grey_maqaf(self, template) -> str: + return '־' + + def _handle_mam_note(self, template) -> str: + body = self._p(self._param(template, 1)) + note_id = self._next_note_id() + return ( + f'' + f'{body}' + ) + + def _handle_line_anchor(self, template) -> str: + label = _xml_escape(self._param(template, 1)) + return f'' + + def _handle_good_ending(self, template) -> str: + text = self._p(self._param(template, 1)) + return f"{text}" + + def _handle_dual_trope_link(self, template) -> str: + target 
= self._p(self._param(template, 1)) + return f"{target}" + + def _handle_dual_accent(self, template) -> str: + dual = self._p(self._named_param(template, "כפול")) + a = self._p(self._named_param(template, "א")) + b = self._p(self._named_param(template, "ב")) + return ( + f'' + f"{a}" + f"{b}" + f"" + ) + + def _handle_note_link(self, template) -> str: + raw_target = self._named_param(template, "1") or self._param(template, 1) + label = self._named_param(template, "2") or self._param(template, 2) + if not label: + label = raw_target + target = _xml_escape(link_target_to_uri(raw_target)) + return f'{self._p(label)}' + + def _handle_emphasis(self, template) -> str: + text = self._p(self._param(template, 1)) + return f'{text}' + + +_processor: Optional[MiqraWikiTextProcessor] = None + + +def _get_processor() -> MiqraWikiTextProcessor: + global _processor + if _processor is None: + _processor = MiqraWikiTextProcessor() + return _processor + + +def wikitext_to_intermediate_xml( + wikitext: str, *, column_c: bool = False +) -> str: + """Convert wikitext to an escaped intermediate XML fragment.""" + text = wikitext or "" + if column_c: + text = _preprocess_column_c(text) + result = _get_processor().process_wikitext(text) + return _escape_outside_tags(result.xml_content) + + +def reset_processor() -> None: + """Reset the shared processor (for tests).""" + global _processor + _processor = None diff --git a/opensiddur/importer/util/mediawiki_processor.py b/opensiddur/importer/util/mediawiki_processor.py new file mode 100644 index 0000000..76120b3 --- /dev/null +++ b/opensiddur/importer/util/mediawiki_processor.py @@ -0,0 +1,515 @@ +""" +MediaWiki/Wikitext to intermediate XML processor. + +This module contains the reusable MediaWiki processing framework originally built +for the JPS1917 importer. Other importers (e.g. Miqra al pi ha‑Masorah) can reuse +it by adding/overriding template and tag handlers. 
+""" + +# NOTE: The initial implementation is intentionally a direct move of the existing +# processor to provide a stable API surface (`MediaWikiProcessor`, `create_processor`) +# for multiple importers. Importer-specific specializations should be layered on +# top by registering handlers. + +from __future__ import annotations + +import re +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List + +import mwparserfromhell + + +class ProcessingStage(Enum): + """Stages of MediaWiki processing""" + + PREPROCESS = "preprocess" + TEMPLATES = "templates" + TAGS = "tags" + POSTPROCESS = "postprocess" + + +@dataclass +class ConversionResult: + """Result of a conversion operation""" + + xml_content: str + metadata: Dict[str, Any] + warnings: List[str] + errors: List[str] + wikilinks: List[Dict[str, Any]] + + +class MediaWikiProcessor: + """ + Modular MediaWiki to XML processor. + + Provides a modular framework for converting MediaWiki syntax to an + intermediate XML that can be transformed to TEI via XSLT. + """ + + def __init__(self): + self.template_handlers = {} + self.tag_handlers = {} + self.preprocessors = [] + self.postprocessors = [] + self.wikilinks = [] # Store captured wikilinks + self._initialize_handlers() + + def _initialize_handlers(self): + """Initialize all template and tag handlers""" + self._initialize_template_handlers() + self._initialize_tag_handlers() + self._initialize_preprocessors() + self._initialize_postprocessors() + self._initialize_wikilink_handlers() + + # ------------------------------------------------------------------------- + # Default handler initialization + # + # These defaults match the original JPS1917 processor behavior. Other + # importers can clear/override and register their own handlers as needed. 
+ # ------------------------------------------------------------------------- + + def _initialize_template_handlers(self): + """Initialize handlers for MediaWiki templates""" + + # Text Formatting Templates + self.template_handlers["sc"] = self._handle_small_caps + self.template_handlers["larger"] = self._handle_larger_text + self.template_handlers["x-larger"] = self._handle_x_larger_text + self.template_handlers["xx-larger"] = self._handle_xx_larger_text + self.template_handlers["xxx-larger"] = self._handle_xxx_larger_text + self.template_handlers["smaller"] = self._handle_smaller_text + + # Layout Templates + self.template_handlers["c"] = self._handle_center + self.template_handlers["right"] = self._handle_right_align + self.template_handlers["rule"] = self._handle_horizontal_rule + self.template_handlers["nop"] = self._handle_no_paragraph + + # Biblical Content Templates + self.template_handlers["verse"] = self._handle_verse + self.template_handlers["rh"] = self._handle_right_header + self.template_handlers["dropinitial"] = self._handle_drop_initial + self.template_handlers["dhr"] = self._handle_double_horizontal_rule + + # Navigation Templates + self.template_handlers["anchor"] = self._handle_anchor + self.template_handlers["anchor+"] = self._handle_anchor_plus + + # Language Templates + self.template_handlers["lang"] = self._handle_language + + # Reference Templates + self.template_handlers["smallrefs"] = self._handle_small_refs + + # Special Templates + self.template_handlers["hws"] = self._handle_hws + self.template_handlers["hwe"] = self._handle_hwe + self.template_handlers["***"] = self._handle_asterisks + self.template_handlers["reconstruct"] = self._handle_reconstruct + self.template_handlers["SIC"] = self._handle_sic + self.template_handlers["sic"] = self._handle_sic + self.template_handlers["sup"] = self._handle_superscript + self.template_handlers["bar"] = self._handle_bar + self.template_handlers["gap"] = self._handle_gap + 
self.template_handlers["overfloat left"] = self._handle_overfloat_left + self.template_handlers["float right"] = self._handle_float_right + self.template_handlers["smaller block/s"] = self._handle_smaller_block_start + self.template_handlers["smaller block/e"] = self._handle_smaller_block_end + + def _initialize_tag_handlers(self): + """Initialize handlers for HTML/XML tags""" + + # Structural Tags + self.tag_handlers["section"] = self._handle_section + self.tag_handlers["table"] = self._handle_table + self.tag_handlers["tr"] = self._handle_table_row + self.tag_handlers["td"] = self._handle_table_cell + + # Text Formatting Tags + self.tag_handlers["i"] = self._handle_italic + self.tag_handlers["br"] = self._handle_line_break + self.tag_handlers["span"] = self._handle_span + + # Content Tags + self.tag_handlers["dd"] = self._handle_definition_description + self.tag_handlers["ref"] = self._handle_reference + + # MediaWiki Specific Tags + self.tag_handlers["noinclude"] = self._handle_noinclude + self.tag_handlers["pagequality"] = self._handle_pagequality + + def _initialize_preprocessors(self): + """Initialize preprocessing functions""" + self.preprocessors = [ + self._fix_noinclude_line_breaks, + self._convert_paragraph_breaks, + self._normalize_whitespace, + self._handle_special_characters, + self._extract_metadata, + ] + + def _initialize_postprocessors(self): + """Initialize postprocessing functions""" + self.postprocessors = [ + self._validate_xml_structure, + self._finalize_metadata, + ] + + def _initialize_wikilink_handlers(self): + """Initialize wikilink processing""" + pass + + # ------------------------------------------------------------------------- + # Core processing + # ------------------------------------------------------------------------- + + def _process_nested_content(self, content: str, depth: int = 0) -> str: + """Recursively process nested templates and other elements""" + if depth > 10: + return content + + parsed = 
mwparserfromhell.parse(content) + nodes_to_replace = [] + + for node in parsed.nodes: + if hasattr(node, "name"): # Template + template_name = str(node.name).strip() + if template_name in self.template_handlers: + try: + processed_node = self._process_template_with_nesting(node, depth + 1) + replacement = self.template_handlers[template_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception: + replacement = self.template_handlers[template_name](node) + nodes_to_replace.append((node, replacement)) + else: + processed_content = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed_content)) + + elif hasattr(node, "tag"): # Tag + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + processed_node = self._process_tag_with_nesting(node, depth + 1) + replacement = self.tag_handlers[tag_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception: + replacement = self.tag_handlers[tag_name](node) + nodes_to_replace.append((node, replacement)) + else: + processed_content = self._process_nested_content(str(node), depth + 1) + nodes_to_replace.append((node, processed_content)) + + elif hasattr(node, "__class__") and "Wikilink" in str(node.__class__): + try: + replacement = self._handle_wikilink(node) + nodes_to_replace.append((node, replacement)) + except Exception: + nodes_to_replace.append((node, str(node))) + + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + return str(parsed) + + def _process_template_with_nesting(self, template, depth: int = 0) -> object: + import copy + + processed_template = copy.deepcopy(template) + for param in processed_template.params: + if hasattr(param, "value"): + processed_value = self._process_nested_content(str(param.value), depth + 1) + param.value = processed_value + return processed_template + + def _process_tag_with_nesting(self, tag, depth: int = 0) -> object: + import 
copy + + processed_tag = copy.deepcopy(tag) + if hasattr(processed_tag, "contents") and processed_tag.contents: + processed_contents = self._process_nested_content( + str(processed_tag.contents), depth + 1 + ) + processed_tag.contents = processed_contents + return processed_tag + + # ------------------------------------------------------------------------- + # Template handlers (JPS1917 defaults) + # ------------------------------------------------------------------------- + + def _handle_small_caps(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_larger_text(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_x_larger_text(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_xx_larger_text(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_xxx_larger_text(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_smaller_text(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_center(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_right_align(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_horizontal_rule(self, template) -> str: + return "" + + def _handle_double_horizontal_rule(self, template) -> str: + return "" + + def _handle_no_paragraph(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_verse(self, template) -> str: + chapter = str(template.get(1, "")).strip() + verse = str(template.get(2, "")).strip() + content = str(template.get(3, "")) + return f'{content}' + + def _handle_right_header(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_drop_initial(self, template) -> str: + content = 
str(template.get(1, "")) + return f"{content}" + + def _handle_anchor(self, template) -> str: + name = str(template.get(1, "")).strip() + return f'' + + def _handle_anchor_plus(self, template) -> str: + name = str(template.get(1, "")).strip() + return f'' + + def _handle_language(self, template) -> str: + code = str(template.get(1, "")).strip() + content = str(template.get(2, "")) + return f'{content}' + + def _handle_small_refs(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_hws(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_hwe(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_asterisks(self, template) -> str: + return "" + + def _handle_reconstruct(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_sic(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_superscript(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_bar(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_gap(self, template) -> str: + return "" + + def _handle_overfloat_left(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_float_right(self, template) -> str: + content = str(template.get(1, "")) + return f"{content}" + + def _handle_smaller_block_start(self, template) -> str: + return "" + + def _handle_smaller_block_end(self, template) -> str: + return "" + + # ------------------------------------------------------------------------- + # Tag handlers (JPS1917 defaults) + # ------------------------------------------------------------------------- + + def _handle_section(self, tag) -> str: + begin = getattr(tag, "attributes", {}).get("begin", "") + return f'

' + + def _handle_table(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}
" + + def _handle_table_row(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + def _handle_table_cell(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + def _handle_italic(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + def _handle_line_break(self, tag) -> str: + return "
" + + def _handle_span(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + def _handle_definition_description(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"
{contents}
" + + def _handle_reference(self, tag) -> str: + name = getattr(tag, "attributes", {}).get("name", "") + contents = getattr(tag, "contents", "") or "" + return f'{contents}' + + def _handle_noinclude(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + def _handle_pagequality(self, tag) -> str: + contents = getattr(tag, "contents", "") or "" + return f"{contents}" + + # ------------------------------------------------------------------------- + # Pre/post processing (JPS1917 defaults) + # ------------------------------------------------------------------------- + + def _fix_noinclude_line_breaks(self, text: str, metadata: Dict[str, Any]) -> str: + return re.sub(r"\n", "", text) + + def _convert_paragraph_breaks(self, text: str, metadata: Dict[str, Any]) -> str: + return text.replace("\n\n", "

") + + def _normalize_whitespace(self, text: str, metadata: Dict[str, Any]) -> str: + return re.sub(r"[ \t]+", " ", text) + + def _handle_special_characters(self, text: str, metadata: Dict[str, Any]) -> str: + # Preserve only minimal escaping at this stage. + return text + + def _extract_metadata(self, text: str, metadata: Dict[str, Any]) -> str: + metadata.setdefault("length", len(text)) + return text + + def _validate_xml_structure(self, xml_content: str, metadata: Dict[str, Any]) -> str: + # Lightweight sanity check; TEI validation happens later. + return xml_content + + def _finalize_metadata(self, xml_content: str, metadata: Dict[str, Any]) -> str: + metadata["processed"] = True + return xml_content + + # ------------------------------------------------------------------------- + # Wikilinks + # ------------------------------------------------------------------------- + + def _handle_wikilink(self, node) -> str: + try: + title = str(getattr(node, "title", "")).strip() + text = str(getattr(node, "text", "")).strip() if getattr(node, "text", None) else "" + self.wikilinks.append({"title": title, "text": text}) + if text: + return f'<__link__ title="{title}">{text}' + return f'<__link__ title="{title}"/>' + except Exception: + return str(node) + + # ------------------------------------------------------------------------- + # Public API + # ------------------------------------------------------------------------- + + def process_wikitext(self, wikitext: str) -> ConversionResult: + warnings: List[str] = [] + errors: List[str] = [] + metadata: Dict[str, Any] = {} + + text = wikitext or "" + for pre in self.preprocessors: + try: + text = pre(text, metadata) + except Exception as e: + errors.append(str(e)) + + try: + xml_content = self._process_nested_content(text) + except Exception as e: + xml_content = text + errors.append(str(e)) + + for post in self.postprocessors: + try: + xml_content = post(xml_content, metadata) + except Exception as e: + 
errors.append(str(e)) + + return ConversionResult( + xml_content=xml_content, + metadata=metadata, + warnings=warnings, + errors=errors, + wikilinks=self.wikilinks.copy(), + ) + + def add_template_handler(self, template_name: str, handler_func): + self.template_handlers[template_name] = handler_func + + def add_tag_handler(self, tag_name: str, handler_func): + self.tag_handlers[tag_name] = handler_func + + def add_preprocessor(self, preprocessor_func): + self.preprocessors.append(preprocessor_func) + + def add_postprocessor(self, postprocessor_func): + self.postprocessors.append(postprocessor_func) + + def get_wikilinks(self) -> List[Dict[str, Any]]: + return self.wikilinks.copy() + + def clear_wikilinks(self): + self.wikilinks.clear() + + +def create_processor() -> MediaWikiProcessor: + return MediaWikiProcessor() + + +def process_page(page_content: str) -> ConversionResult: + processor = create_processor() + return processor.process_wikitext(page_content) + diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py new file mode 100644 index 0000000..3c440fd --- /dev/null +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py @@ -0,0 +1,134 @@ +import unittest +from pathlib import Path +from unittest.mock import patch +import tempfile + + +from opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv import ( + _extract_chapter_verse_numbers, + main, +) + + +class TestMiqraConvertTsv(unittest.TestCase): + @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate") + def test_only_book_writes_output(self, mock_validate): + mock_validate.return_value = (True, []) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + sourcetexts_root = tmp_path / "sources" + sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets" + sheets_dir.mkdir(parents=True, exist_ok=True) + + # Minimal README (front matter) + (sheets_dir / 
"readme.tsv").write_text( + "License\tCC-BY-SA 4.0\nAttribution\tHebrew Wikisource\n", + encoding="utf-8", + ) + + # Minimal Torah TSV: header + one data row for Genesis 1 + (sheets_dir / "torah.tsv").write_text( + "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"]) + + "\n" + + "\t".join( + [ + "ספר בראשית/א", + "א", + "", + "{{מ:פסוק|בראשית|1|1}}", + '{{נוסח|{{מ:אות-ג|בְּ}}רֵאשִׁ֖ית|2=test note}}', + ] + ) + + "\n", + encoding="utf-8", + ) + + project_dir = tmp_path / "project" + rc = main( + [ + "--sourcetexts-root", + str(sourcetexts_root), + "--project-dir", + str(project_dir), + "--only-book", + "genesis", + ] + ) + self.assertEqual(rc, 0) + + genesis_xml = project_dir / "genesis.xml" + self.assertTrue(genesis_xml.exists()) + xml = genesis_xml.read_text(encoding="utf-8") + self.assertIn("", xml) + self.assertIn('', xml) + self.assertIn("Genesis", xml) + self.assertIn('rend="large"', xml) + self.assertIn("בְּ", xml) + self.assertIn("tei:standOff", xml) + self.assertIn("test note", xml) + + def test_special_tsv_row_does_not_produce_invalid_urn_segments(self): + # special.tsv uses a 2-column schema; must not be merged into book output. 
+ ch, v = _extract_chapter_verse_numbers( + "ספר שמות/טו תתת", + "{{#קטע:שירת הים/צורת השיר|צורת-השיר}}{{מ:טעמי", + "", + ) + self.assertEqual(ch, "") + self.assertEqual(v, "") + + @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate") + def test_special_tsv_not_merged_into_book(self, mock_validate): + mock_validate.return_value = (True, []) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + sourcetexts_root = tmp_path / "sources" + sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets" + sheets_dir.mkdir(parents=True, exist_ok=True) + + (sheets_dir / "torah.tsv").write_text( + "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"]) + + "\n" + + "\t".join( + [ + "ספר שמות/טו", + "א", + "", + "{{מ:פסוק|שמות|15|1}}", + "שירה", + ] + ) + + "\n", + encoding="utf-8", + ) + (sheets_dir / "special.tsv").write_text( + "ספר שמות/טו תתת\t{{#קטע:שירת הים/צורת השיר|צורת-השיר}}{{מ:טעמי\n", + encoding="utf-8", + ) + + project_dir = tmp_path / "project" + main( + [ + "--sourcetexts-root", + str(sourcetexts_root), + "--project-dir", + str(project_dir), + "--only-book", + "exodus", + ] + ) + xml = (project_dir / "exodus.xml").read_text(encoding="utf-8") + self.assertIn("urn:x-opensiddur:text:bible:exodus/15/1", xml) + self.assertNotIn("צורת-השיר", xml) + self.assertNotIn("השיר|", xml) + + +if __name__ == "__main__": + unittest.main() + diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py new file mode 100644 index 0000000..3a44834 --- /dev/null +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py @@ -0,0 +1,106 @@ +import unittest + +from opensiddur.importer.miqra_al_pi_hamasorah.miqra_wikitext import ( + link_target_to_uri, + normalize_template_name, + reset_processor, + wikitext_to_intermediate_xml, +) + + +class TestMiqraWikitext(unittest.TestCase): + def setUp(self): + reset_processor() + + def 
test_nosach_nested_large_letter(self): + frag = wikitext_to_intermediate_xml( + '{{נוסח|{{מ:אות-ג|בְּ}}רֵאשִׁ֖ית|2=note text}}' + ) + self.assertIn("', frag) + self.assertIn("בְּ", frag) + self.assertIn("', frag) + self.assertIn("כתיב", frag) + self.assertIn("קְרִי", frag) + + def test_qeri_ketiv(self): + frag = wikitext_to_intermediate_xml('{{קו"כ|כתיב|קְרִי}}') + self.assertIn('order="qeri-first"', frag) + + def test_parashah_open(self): + frag = wikitext_to_intermediate_xml("{{פפ}}") + self.assertIn(' Date: Wed, 27 May 2026 22:33:51 -0700 Subject: [PATCH 03/10] wip: miqra with notes, still some issues on rendering --- opensiddur/exporter/tex/reledmac.xslt | 38 ++- .../miqra_al_pi_hamasorah/miqra_to_tei.xslt | 233 ++++++++++++------ .../tests/exporter/test_reledmac_xslt.py | 40 ++- .../miqra_al_pi_hamasorah/test_convert_tsv.py | 20 +- 4 files changed, 241 insertions(+), 90 deletions(-) diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index 82d7980..a06bcd2 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -539,7 +539,7 @@ \pend \eledchapter{ - + } @@ -550,7 +550,7 @@ \pend \eledsubsection{ - + } @@ -659,8 +659,10 @@ + - + + @@ -668,8 +670,10 @@ + - + + @@ -893,6 +897,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + urn:x-opensiddur:text:bible: - + / / @@ -51,96 +128,95 @@ - - - + - - + + + + + + - + - + + - + - + + + + + - + + + + - + - + - + - + - + - + [ - + ] - + - + ( - + ) - + [ - + ] - - - - - - - - - - - - - - - + + indent @@ -148,78 +224,69 @@ - - - - - + - + - - + - + - - - - - - + + - + ͏ִ - + - + - + - - - - - + * - - + + + - + - + + + + - + - + - + - + @@ -228,15 +295,19 @@ - - + + + + + + - + - + diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index e67d5fe..18c7d2d 100644 --- 
a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -565,8 +565,44 @@ def test_div_head_emits_sectioning(self): """ out = _transform(xml) - # Top-level body div with head → \eledchapter - self.assertIn(r"\eledchapter{Genesis}", out) + # Top-level body div with head → \eledchapter (LTR wrapper when not Hebrew) + self.assertIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}", + out, + ) + + def test_english_head_in_hebrew_document_uses_ltr_wrapper(self): + xml = """ + + + + Genesis + בְּרֵאשִׁית + + + """ + out = _transform(xml) + self.assertIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}", + out, + ) + + def test_hebrew_head_in_hebrew_document_has_no_ltr_wrapper(self): + xml = """ + + + + בראשית + בְּרֵאשִׁית + + + """ + out = _transform(xml) + self.assertIn(r"\eledchapter{בראשית}", out) + self.assertNotIn( + r"\eledchapter{{\textdir TLT\selectlanguage{english}בראשית}}", + out, + ) if __name__ == "__main__": diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py index 3c440fd..a2be3e5 100644 --- a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py @@ -27,7 +27,7 @@ def test_only_book_writes_output(self, mock_validate): encoding="utf-8", ) - # Minimal Torah TSV: header + one data row for Genesis 1 + # Torah TSV: parashah in nav + two verses in one paragraph (sheets_dir / "torah.tsv").write_text( "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"]) + "\n" @@ -35,11 +35,21 @@ def test_only_book_writes_output(self, mock_validate): [ "ספר בראשית/א", "א", - "", + "//{{פפ}}//", "{{מ:פסוק|בראשית|1|1}}", '{{נוסח|{{מ:אות-ג|בְּ}}רֵאשִׁ֖ית|2=test note}}', ] ) + + "\n" + + "\t".join( + [ + "ספר בראשית/א", + "ב", + "", + "{{מ:פסוק|בראשית|1|2}}", + "וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ", + ] + ) + 
"\n", encoding="utf-8", ) @@ -64,13 +74,17 @@ def test_only_book_writes_output(self, mock_validate): self.assertIn('unit="verse"', xml) self.assertIn('n="1"', xml) self.assertIn("urn:x-opensiddur:text:bible:genesis/1/1", xml) - self.assertIn("", xml) + self.assertNotIn("", xml) + self.assertIn('', xml) + self.assertIn("וְהָאָ֗רֶץ", xml) self.assertIn('', xml) self.assertIn("Genesis", xml) self.assertIn('rend="large"', xml) self.assertIn("בְּ", xml) self.assertIn("tei:standOff", xml) self.assertIn("test note", xml) + # Standoff notes must link to the in-text marker for annotation resolution + self.assertIn('target="#miqra-note-1-ref', xml) def test_special_tsv_row_does_not_produce_invalid_urn_segments(self): # special.tsv uses a 2-column schema; must not be merged into book output. From 643d6439eedf9ebf8d5b43a96de0b39a1c8d03bc Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 27 May 2026 22:50:09 -0700 Subject: [PATCH 04/10] wip: fix some rtl/ltr bugs in the tex renderer --- opensiddur/exporter/tex/bibtex.xslt | 24 +++++++++++++++++++++++- opensiddur/tests/exporter/test_latex.py | 15 +++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/opensiddur/exporter/tex/bibtex.xslt b/opensiddur/exporter/tex/bibtex.xslt index 6b4c83f..d9cbe8d 100644 --- a/opensiddur/exporter/tex/bibtex.xslt +++ b/opensiddur/exporter/tex/bibtex.xslt @@ -3,12 +3,34 @@ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:j="http://jewishliturgy.org/ns/jlptei/2" + xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="tei j"> + + + + + + + + + + + + + + + + + + + + + @@ -215,7 +237,7 @@ = { - + }, diff --git a/opensiddur/tests/exporter/test_latex.py b/opensiddur/tests/exporter/test_latex.py index aa9f537..ffd14b0 100644 --- a/opensiddur/tests/exporter/test_latex.py +++ b/opensiddur/tests/exporter/test_latex.py @@ -242,6 +242,21 @@ def test_dedupes_when_multiple_files_share_index(self): preamble, _ = 
extract_sources([f1, f2]) self.assertEqual(preamble.count("@"), 1) + def test_bibtex_wraps_hebrew_fields_in_texthebrew(self): + index = """ + + + + מקרא על פי המסורה + Avi Kadish + + + """.encode("utf-8") + doc = self._create("p", "doc.xml", b"") + self._create("p", "index.xml", index) + preamble, _ = extract_sources([doc]) + self.assertIn(r"title = {\texthebrew{מקרא על פי המסורה}}", preamble) + class TestGetFileReferences(unittest.TestCase): From 62e647c482bfeff1e1435e8bf7f0af1ceef6ba0c Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 28 May 2026 20:36:24 -0700 Subject: [PATCH 05/10] wip: make sure all templates are covered --- opensiddur/exporter/tex/reledmac.xslt | 4 ++- .../miqra_al_pi_hamasorah/miqra_wikitext.py | 20 +++++++++++++ .../tests/exporter/test_reledmac_xslt.py | 2 +- .../test_miqra_wikitext.py | 30 +++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index a06bcd2..c466758 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -729,7 +729,9 @@ - \leavevmode\\ + + \leavevmode\\{} diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py index 6ca5e25..b220304 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py @@ -31,8 +31,12 @@ "מ:ספר חדש", "מ:רווח בתרי עשר", "רווח בתרי עשר", + "מ:רווח בתרי עשר בפסוק הראשון", "מ:רווח לספר בתהלים", "רווח לספר בתהלים", + "מ:רווח לספר בתהלים בפסוק הראשון", + "ניווט טעמים", + "שם הדף המלא", "מ:אין פרשה בתחילת פרק", 'מ:אין פרשה בתחילת פרק בספרי אמ"ת', "מ:אין רווח של פרשה בתחילת פרשת השבוע", @@ -235,7 +239,9 @@ def _register_template_handlers(self) -> None: h("מ:ירושלם", self._handle_yerushalem) h("מ:ירושלמה", self._handle_yerushalema) h("ירח בן יומו", self._handle_accent_yerah) + h("ירח בן יומו-2", 
self._handle_accent_with_word) h("גלגל", self._handle_accent_galgal) + h("גלגל-2", self._handle_accent_with_word) h("אתנח הפוך", self._handle_accent_etnah) h("מ:קמץ", self._handle_qamats) h("מ:טעם ומתג באות אחת", self._handle_taam_meteg) @@ -252,6 +258,8 @@ def _register_template_handlers(self) -> None: h("מ:לגרמיה-2", self._handle_legarmeh) h("מ:פסק", self._handle_paseq) h("מ:מקף אפור", self._handle_grey_maqaf) + h("מ:דחי", self._handle_dechi) + h("מ:צינור", self._handle_tzinor) h("מ:הערה", self._handle_mam_note) h("עוגן בשורה", self._handle_line_anchor) @@ -521,6 +529,10 @@ def _handle_accent_yerah(self, template) -> str: def _handle_accent_galgal(self, template) -> str: return '' + def _handle_accent_with_word(self, template) -> str: + # Word param already includes the accent (galgal / yerah ben yomo). + return self._p(self._param(template, 1)) + def _handle_accent_etnah(self, template) -> str: return '' @@ -562,6 +574,14 @@ def _handle_paseq(self, template) -> str: def _handle_grey_maqaf(self, template) -> str: return '־' + def _handle_dechi(self, template) -> str: + # Wikisource shows param 1; param 2 marks the dechi (offset accent) form. + return self._p(self._param(template, 1)) + + def _handle_tzinor(self, template) -> str: + # Wikisource shows param 1; param 2 marks the tzinor accent placement. 
+ return self._p(self._param(template, 1)) + def _handle_mam_note(self, template) -> str: body = self._p(self._param(template, 1)) note_id = self._next_note_id() diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index 18c7d2d..3aca2e8 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -533,7 +533,7 @@ def test_lb_emits_leavevmode_linebreak(self): """ out = _transform(xml) - self.assertIn(r"\leavevmode\\", out) + self.assertIn(r"\leavevmode\\{}", out) class TestStructuralElements(unittest.TestCase): diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py index 3a44834..273bcf3 100644 --- a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py @@ -68,6 +68,36 @@ def test_column_c_double_underscore(self): frag = wikitext_to_intermediate_xml("word__word", column_c=True) self.assertIn("word word", frag) + def test_dechi_shows_first_parameter_only(self): + frag = wikitext_to_intermediate_xml( + "{{מ:דחי|חַ֭טָּאִים|חַ֭טָּאִ֭ים}}" + ) + self.assertIn("חַ֭טָּאִים", frag) + self.assertNotIn("חַ֭טָּאִ֭ים", frag) + self.assertNotIn("{{מ:דחי", frag) + + def test_tzinor_shows_first_parameter_only(self): + frag = wikitext_to_intermediate_xml( + "{{מ:צינור|בָטַחְתִּי֮|בָטַ֮חְתִּי֮}}" + ) + self.assertIn("בָטַחְתִּי֮", frag) + self.assertNotIn("בָטַ֮חְתִּי֮", frag) + self.assertNotIn("{{מ:צינור", frag) + + def test_galgal2_shows_first_parameter_only(self): + frag = wikitext_to_intermediate_xml("{{גלגל-2|אֵ֪ין|אֵ֪֪ין}}") + self.assertIn("אֵ֪ין", frag) + self.assertNotIn("אֵ֪֪ין", frag) + self.assertNotIn("{{גלגל-2", frag) + + def test_yerah_ben_yomo2_shows_first_parameter_only(self): + frag = wikitext_to_intermediate_xml( + "{{ירח בן 
יומו-2|אַלְפַּ֪יִם|אַלְפַּ֪֪יִם}}" + ) + self.assertIn("אַלְפַּ֪יִם", frag) + self.assertNotIn("אַלְפַּ֪֪יִם", frag) + self.assertNotIn("{{ירח בן יומו-2", frag) + def test_all_templates_from_doc_have_handlers(self): """Every template name in templates.tsv examples is recognized.""" from pathlib import Path From e1713e522f3642943a319cdf55d6699520ba9f4f Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 28 May 2026 22:21:47 -0700 Subject: [PATCH 06/10] restore 1917 jps processor so it wont fail --- .../miqra_al_pi_hamasorah/miqra_wikitext.py | 32 +- .../importer/util/mediawiki_processor.py | 957 ++++++++++++------ 2 files changed, 665 insertions(+), 324 deletions(-) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py index b220304..1c2ab77 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py @@ -13,7 +13,10 @@ import mwparserfromhell -from opensiddur.importer.util.mediawiki_processor import MediaWikiProcessor +from opensiddur.importer.util.mediawiki_processor import ( + ConversionResult, + MediaWikiProcessor, +) MIQRA_NS = "urn:x-opensiddur:miqra:intermediate" MW_NS = "urn:x-opensiddur:mw:intermediate" @@ -196,6 +199,33 @@ def _initialize_handlers(self) -> None: self._register_template_handlers() self._register_tag_handlers() + def process_wikitext(self, wikitext: str) -> ConversionResult: + """Miqra uses recursive nested processing, not the JPS top-level loop.""" + warnings: list[str] = [] + errors: list[str] = [] + metadata: dict = {} + + text = wikitext or "" + for pre in self.preprocessors: + try: + text = pre(text) + except Exception as e: + errors.append(str(e)) + + try: + xml_content = self._process_nested_content(text) + except Exception as e: + xml_content = text + errors.append(str(e)) + + return ConversionResult( + xml_content=xml_content, + metadata=metadata, + warnings=warnings, + 
errors=errors, + wikilinks=self.wikilinks.copy(), + ) + def _register_tag_handlers(self) -> None: self.tag_handlers["noinclude"] = self._handle_strip_tag diff --git a/opensiddur/importer/util/mediawiki_processor.py b/opensiddur/importer/util/mediawiki_processor.py index 76120b3..faeccff 100644 --- a/opensiddur/importer/util/mediawiki_processor.py +++ b/opensiddur/importer/util/mediawiki_processor.py @@ -1,29 +1,22 @@ """ MediaWiki/Wikitext to intermediate XML processor. -This module contains the reusable MediaWiki processing framework originally built -for the JPS1917 importer. Other importers (e.g. Miqra al pi ha‑Masorah) can reuse -it by adding/overriding template and tag handlers. +Reusable framework originally built for the JPS1917 importer. Other importers +(e.g. Miqra al pi ha-Masorah) subclass ``MediaWikiProcessor`` and register their +own template/tag handlers. """ -# NOTE: The initial implementation is intentionally a direct move of the existing -# processor to provide a stable API surface (`MediaWikiProcessor`, `create_processor`) -# for multiple importers. Importer-specific specializations should be layered on -# top by registering handlers. - from __future__ import annotations import re +import mwparserfromhell from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List -import mwparserfromhell - class ProcessingStage(Enum): """Stages of MediaWiki processing""" - PREPROCESS = "preprocess" TEMPLATES = "templates" TAGS = "tags" @@ -33,7 +26,6 @@ class ProcessingStage(Enum): @dataclass class ConversionResult: """Result of a conversion operation""" - xml_content: str metadata: Dict[str, Any] warnings: List[str] @@ -43,12 +35,12 @@ class ConversionResult: class MediaWikiProcessor: """ - Modular MediaWiki to XML processor. - - Provides a modular framework for converting MediaWiki syntax to an - intermediate XML that can be transformed to TEI via XSLT. + Modular MediaWiki to XML processor for JPS1917 content. 
+ + This processor handles the conversion of MediaWiki syntax to XML, + with separate modules for different types of templates and tags. """ - + def __init__(self): self.template_handlers = {} self.tag_handlers = {} @@ -56,7 +48,7 @@ def __init__(self): self.postprocessors = [] self.wikilinks = [] # Store captured wikilinks self._initialize_handlers() - + def _initialize_handlers(self): """Initialize all template and tag handlers""" self._initialize_template_handlers() @@ -64,452 +56,771 @@ def _initialize_handlers(self): self._initialize_preprocessors() self._initialize_postprocessors() self._initialize_wikilink_handlers() - - # ------------------------------------------------------------------------- - # Default handler initialization - # - # These defaults match the original JPS1917 processor behavior. Other - # importers can clear/override and register their own handlers as needed. - # ------------------------------------------------------------------------- - + def _initialize_template_handlers(self): """Initialize handlers for MediaWiki templates""" - + # Text Formatting Templates - self.template_handlers["sc"] = self._handle_small_caps - self.template_handlers["larger"] = self._handle_larger_text - self.template_handlers["x-larger"] = self._handle_x_larger_text - self.template_handlers["xx-larger"] = self._handle_xx_larger_text - self.template_handlers["xxx-larger"] = self._handle_xxx_larger_text - self.template_handlers["smaller"] = self._handle_smaller_text - + self.template_handlers['sc'] = self._handle_small_caps + self.template_handlers['larger'] = self._handle_larger_text + self.template_handlers['x-larger'] = self._handle_x_larger_text + self.template_handlers['xx-larger'] = self._handle_xx_larger_text + self.template_handlers['xxx-larger'] = self._handle_xxx_larger_text + self.template_handlers['smaller'] = self._handle_smaller_text + # Layout Templates - self.template_handlers["c"] = self._handle_center - self.template_handlers["right"] = 
self._handle_right_align - self.template_handlers["rule"] = self._handle_horizontal_rule - self.template_handlers["nop"] = self._handle_no_paragraph - + self.template_handlers['c'] = self._handle_center + self.template_handlers['right'] = self._handle_right_align + self.template_handlers['rule'] = self._handle_horizontal_rule + self.template_handlers['nop'] = self._handle_no_paragraph + # Biblical Content Templates - self.template_handlers["verse"] = self._handle_verse - self.template_handlers["rh"] = self._handle_right_header - self.template_handlers["dropinitial"] = self._handle_drop_initial - self.template_handlers["dhr"] = self._handle_double_horizontal_rule - + self.template_handlers['verse'] = self._handle_verse + self.template_handlers['rh'] = self._handle_right_header + self.template_handlers['dropinitial'] = self._handle_drop_initial + self.template_handlers['dhr'] = self._handle_double_horizontal_rule + # Navigation Templates - self.template_handlers["anchor"] = self._handle_anchor - self.template_handlers["anchor+"] = self._handle_anchor_plus - + self.template_handlers['anchor'] = self._handle_anchor + self.template_handlers['anchor+'] = self._handle_anchor_plus + # Language Templates - self.template_handlers["lang"] = self._handle_language - + self.template_handlers['lang'] = self._handle_language + # Reference Templates - self.template_handlers["smallrefs"] = self._handle_small_refs - + self.template_handlers['smallrefs'] = self._handle_small_refs + # Special Templates - self.template_handlers["hws"] = self._handle_hws - self.template_handlers["hwe"] = self._handle_hwe - self.template_handlers["***"] = self._handle_asterisks - self.template_handlers["reconstruct"] = self._handle_reconstruct - self.template_handlers["SIC"] = self._handle_sic - self.template_handlers["sic"] = self._handle_sic - self.template_handlers["sup"] = self._handle_superscript - self.template_handlers["bar"] = self._handle_bar - self.template_handlers["gap"] = self._handle_gap - 
self.template_handlers["overfloat left"] = self._handle_overfloat_left - self.template_handlers["float right"] = self._handle_float_right - self.template_handlers["smaller block/s"] = self._handle_smaller_block_start - self.template_handlers["smaller block/e"] = self._handle_smaller_block_end - + self.template_handlers['hws'] = self._handle_hws + self.template_handlers['hwe'] = self._handle_hwe + self.template_handlers['***'] = self._handle_asterisks + self.template_handlers['reconstruct'] = self._handle_reconstruct + self.template_handlers['SIC'] = self._handle_sic + self.template_handlers['sic'] = self._handle_sic + self.template_handlers['sup'] = self._handle_superscript + self.template_handlers['bar'] = self._handle_bar + self.template_handlers['gap'] = self._handle_gap + self.template_handlers['overfloat left'] = self._handle_overfloat_left + self.template_handlers['float right'] = self._handle_float_right + self.template_handlers['smaller block/s'] = self._handle_smaller_block_start + self.template_handlers['smaller block/e'] = self._handle_smaller_block_end + def _initialize_tag_handlers(self): """Initialize handlers for HTML/XML tags""" - + # Structural Tags - self.tag_handlers["section"] = self._handle_section - self.tag_handlers["table"] = self._handle_table - self.tag_handlers["tr"] = self._handle_table_row - self.tag_handlers["td"] = self._handle_table_cell - + self.tag_handlers['section'] = self._handle_section + self.tag_handlers['table'] = self._handle_table + self.tag_handlers['tr'] = self._handle_table_row + self.tag_handlers['td'] = self._handle_table_cell + # Text Formatting Tags - self.tag_handlers["i"] = self._handle_italic - self.tag_handlers["br"] = self._handle_line_break - self.tag_handlers["span"] = self._handle_span - + self.tag_handlers['i'] = self._handle_italic + self.tag_handlers['br'] = self._handle_line_break + self.tag_handlers['span'] = self._handle_span + # Content Tags - self.tag_handlers["dd"] = 
self._handle_definition_description - self.tag_handlers["ref"] = self._handle_reference - + self.tag_handlers['dd'] = self._handle_definition_description + self.tag_handlers['ref'] = self._handle_reference + # MediaWiki Specific Tags - self.tag_handlers["noinclude"] = self._handle_noinclude - self.tag_handlers["pagequality"] = self._handle_pagequality - + self.tag_handlers['noinclude'] = self._handle_noinclude + self.tag_handlers['pagequality'] = self._handle_pagequality + def _initialize_preprocessors(self): """Initialize preprocessing functions""" self.preprocessors = [ self._fix_noinclude_line_breaks, self._convert_paragraph_breaks, self._normalize_whitespace, - self._handle_special_characters, - self._extract_metadata, + self._handle_special_characters, # Enable special character processing + self._extract_metadata ] - + def _initialize_postprocessors(self): """Initialize postprocessing functions""" self.postprocessors = [ self._validate_xml_structure, - self._finalize_metadata, + self._finalize_metadata ] - + def _initialize_wikilink_handlers(self): """Initialize wikilink processing""" + # Wikilinks are processed during the main parsing loop pass - - # ------------------------------------------------------------------------- - # Core processing - # ------------------------------------------------------------------------- - + def _process_nested_content(self, content: str, depth: int = 0) -> str: """Recursively process nested templates and other elements""" + # Prevent infinite recursion if depth > 10: return content - + + # Parse the content to handle nested elements parsed = mwparserfromhell.parse(content) nodes_to_replace = [] - + + # Process nodes recursively for node in parsed.nodes: - if hasattr(node, "name"): # Template + if hasattr(node, 'name'): # Template template_name = str(node.name).strip() if template_name in self.template_handlers: try: + # Process nested content within the template processed_node = self._process_template_with_nesting(node, depth 
+ 1) replacement = self.template_handlers[template_name](processed_node) nodes_to_replace.append((node, replacement)) - except Exception: + except Exception as e: + # If nested processing fails, try without nesting replacement = self.template_handlers[template_name](node) nodes_to_replace.append((node, replacement)) else: + # Unknown template - process its content for nested elements processed_content = self._process_nested_content(str(node), depth + 1) nodes_to_replace.append((node, processed_content)) - - elif hasattr(node, "tag"): # Tag + + elif hasattr(node, 'tag'): # Tag tag_name = str(node.tag).strip().lower() if tag_name in self.tag_handlers: try: + # Process nested content within the tag processed_node = self._process_tag_with_nesting(node, depth + 1) replacement = self.tag_handlers[tag_name](processed_node) nodes_to_replace.append((node, replacement)) - except Exception: + except Exception as e: + # If nested processing fails, try without nesting replacement = self.tag_handlers[tag_name](node) nodes_to_replace.append((node, replacement)) else: + # Unknown tag - process its content for nested elements processed_content = self._process_nested_content(str(node), depth + 1) nodes_to_replace.append((node, processed_content)) - - elif hasattr(node, "__class__") and "Wikilink" in str(node.__class__): + + elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink try: replacement = self._handle_wikilink(node) nodes_to_replace.append((node, replacement)) - except Exception: + except Exception as e: + # If wikilink processing fails, keep original nodes_to_replace.append((node, str(node))) - + + # Replace all nodes for node, replacement in nodes_to_replace: parsed.replace(node, replacement) - + return str(parsed) - + def _process_template_with_nesting(self, template, depth: int = 0) -> object: + """Process a template and its nested content""" + # Create a copy of the template to avoid modifying the original import copy - processed_template = 
copy.deepcopy(template) + + # Process each parameter of the template for param in processed_template.params: - if hasattr(param, "value"): + if hasattr(param, 'value'): + # Process nested content in parameter values processed_value = self._process_nested_content(str(param.value), depth + 1) param.value = processed_value + return processed_template - + def _process_tag_with_nesting(self, tag, depth: int = 0) -> object: + """Process a tag and its nested content""" + # Create a copy of the tag to avoid modifying the original import copy - processed_tag = copy.deepcopy(tag) - if hasattr(processed_tag, "contents") and processed_tag.contents: - processed_contents = self._process_nested_content( - str(processed_tag.contents), depth + 1 - ) + + # Process nested content within the tag + if hasattr(processed_tag, 'contents') and processed_tag.contents: + processed_contents = self._process_nested_content(str(processed_tag.contents), depth + 1) processed_tag.contents = processed_contents + return processed_tag - - # ------------------------------------------------------------------------- - # Template handlers (JPS1917 defaults) - # ------------------------------------------------------------------------- - + + # ============================================================================ + # TEMPLATE HANDLERS + # ============================================================================ + def _handle_small_caps(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{sc|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_larger_text(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_x_larger_text(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{x-larger|text}} to text""" + content = 
str(template.get(1, '')) + return f'{content}' + def _handle_xx_larger_text(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{xx-larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_xxx_larger_text(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{xxx-larger|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_smaller_text(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{smaller|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_center(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{c|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_right_align(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{right|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_horizontal_rule(self, template) -> str: - return "" - - def _handle_double_horizontal_rule(self, template) -> str: - return "" - + """Convert {{rule}} to """ + return '' + def _handle_no_paragraph(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{nop}} to """ + return '' + def _handle_verse(self, template) -> str: - chapter = str(template.get(1, "")).strip() - verse = str(template.get(2, "")).strip() - content = str(template.get(3, "")) - return f'{content}' - + """Convert {{verse|chapter|verse|text}} to text""" + chapter = str(template.get('chapter', template.get(1, ''))).replace("chapter=", "") + verse = str(template.get('verse', template.get(2, ''))).replace("verse=", "") + text = str(template.get(3, template.get('text', ''))) + chapter_attr = f' chapter="{chapter}"' if chapter else '' + verse_attr = f' 
verse="{verse}"' if verse else '' + if not chapter or not verse: + print(f"Invalid verse template: {template} {template.get(1, '')=} {template.get(2, '')=} {template.get(3, '')=}") + + return f'{text}' + def _handle_right_header(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{rh|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_drop_initial(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{dropinitial|letter}} to letter""" + letter = str(template.get(1, '')) + return f'{letter}' + + def _handle_double_horizontal_rule(self, template) -> str: + """Convert {{dhr}} to """ + value = str(template.get(1, '')) + if value: + value=f' value="{value}"' + else: + value="" + return f'' + def _handle_anchor(self, template) -> str: - name = str(template.get(1, "")).strip() + """Convert {{anchor|name}} to """ + name = str(template.get(1, '')) return f'' - + def _handle_anchor_plus(self, template) -> str: - name = str(template.get(1, "")).strip() - return f'' - + """Convert {{anchor+|name|text}} to text""" + name = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{text}' + def _handle_language(self, template) -> str: - code = str(template.get(1, "")).strip() - content = str(template.get(2, "")) - return f'{content}' - + """Convert {{lang|code|text}} to text""" + code = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{text}' + def _handle_small_refs(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{smallrefs}} to """ + return '' + def _handle_hws(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{hws|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_hwe(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{hwe|text}} 
to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_asterisks(self, template) -> str: - return "" - + """Convert {{***}} to ***""" + n = str(template.get(1, '3')) + return f'***' + def _handle_reconstruct(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{reconstruct|content|text}} to text""" + content = str(template.get(1, '')) + text = str(template.get(2, '')) + return f'{content}{text}' + def _handle_sic(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{SIC|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_superscript(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{sup|text}} to text""" + content = str(template.get(1, '')) + return f'{content}' + def _handle_bar(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{bar|length}} to """ + length = str(template.get(1, '6')) + return f'' + def _handle_gap(self, template) -> str: - return "" - + """Convert {{gap|length}} to """ + length = str(template.get(1, '')) + if length: + return f'' + else: + return '' + def _handle_overfloat_left(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{overfloat left|align|padding|text}} to text""" + # Get parameters - can be positional or named + align = str(template.get('align', template.get(1, ''))) + padding = str(template.get('padding', template.get(2, ''))) + text = str(template.get('text', template.get(3, ''))) + + # Clean up named parameters (remove parameter name prefixes) + align = align.replace('align=', '') if align.startswith('align=') else align + padding = padding.replace('padding=', '') if padding.startswith('padding=') else padding + text = text.replace('text=', '') if text.startswith('text=') else text + + # Build attributes + attributes = [] + if 
align: + attributes.append(f'align="{align}"') + if padding: + attributes.append(f'padding="{padding}"') + + attr_str = ' ' + ' '.join(attributes) if attributes else '' + + return f'{text}' + def _handle_float_right(self, template) -> str: - content = str(template.get(1, "")) - return f"{content}" - + """Convert {{float right|text}} to text""" + text = str(template.get(1, '')) + return f'{text}' + def _handle_smaller_block_start(self, template) -> str: - return "" - + """Convert {{smaller block/s}} to """ + return '' + def _handle_smaller_block_end(self, template) -> str: - return "" - - # ------------------------------------------------------------------------- - # Tag handlers (JPS1917 defaults) - # ------------------------------------------------------------------------- - + """Convert {{smaller block/e}} to """ + return '' + + # ============================================================================ + # WIKILINK HANDLERS + # ============================================================================ + + def _handle_wikilink(self, wikilink) -> str: + """Process and capture wikilinks""" + # Extract wikilink information + title = str(wikilink.title) if hasattr(wikilink, 'title') and wikilink.title else '' + text = str(wikilink.text) if hasattr(wikilink, 'text') and wikilink.text else title + + # Process templates within the wikilink text + processed_text = self._process_nested_content(text) + + # Store wikilink information + wikilink_info = { + 'title': title, + 'text': processed_text, + 'namespace': str(wikilink.namespace) if hasattr(wikilink, 'namespace') and wikilink.namespace else None, + 'section': str(wikilink.section) if hasattr(wikilink, 'section') and wikilink.section else None, + 'fragment': str(wikilink.fragment) if hasattr(wikilink, 'fragment') and wikilink.fragment else None + } + self.wikilinks.append(wikilink_info) + + # Convert to XML - use __link__ tag with attributes + attributes = [] + if title: + attributes.append(f'title="{title}"') + if 
wikilink_info['namespace']: + attributes.append(f'namespace="{wikilink_info["namespace"]}"') + if wikilink_info['section']: + attributes.append(f'section="{wikilink_info["section"]}"') + if wikilink_info['fragment']: + attributes.append(f'fragment="{wikilink_info["fragment"]}"') + + attr_str = ' ' + ' '.join(attributes) if attributes else '' + return f'<__link__{attr_str}>{processed_text}' + + # ============================================================================ + # TAG HANDLERS + # ============================================================================ + def _handle_section(self, tag) -> str: - begin = getattr(tag, "attributes", {}).get("begin", "") - return f'

' - + """Convert
to
with begin and end attributes""" + content = str(tag.contents) if tag.contents else '' + + # Extract begin and end attributes + attributes = [] + if hasattr(tag, 'attributes') and tag.attributes: + for attr in tag.attributes: + if hasattr(attr, 'name') and hasattr(attr, 'value'): + attr_name = str(attr.name) + attr_value = str(attr.value) + if attr_name in ['begin', 'end']: + attributes.append(f'{attr_name}="{attr_value}"') + + # Add begin and end attributes if they exist + attr_str = ' ' + ' '.join(attributes) if attributes else '' + + return f'{content}
' + def _handle_table(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}
" - + """Convert to
""" + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}
' + def _handle_table_row(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_table_cell(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_italic(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_line_break(self, tag) -> str: - return "
" - + """Convert
to
""" + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'' + def _handle_span(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_definition_description(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"
{contents}
" - + """Convert
to
""" + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}
' + def _handle_reference(self, tag) -> str: - name = getattr(tag, "attributes", {}).get("name", "") - contents = getattr(tag, "contents", "") or "" - return f'{contents}' - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_noinclude(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + def _handle_pagequality(self, tag) -> str: - contents = getattr(tag, "contents", "") or "" - return f"{contents}" - - # ------------------------------------------------------------------------- - # Pre/post processing (JPS1917 defaults) - # ------------------------------------------------------------------------- - - def _fix_noinclude_line_breaks(self, text: str, metadata: Dict[str, Any]) -> str: - return re.sub(r"\n", "", text) - - def _convert_paragraph_breaks(self, text: str, metadata: Dict[str, Any]) -> str: - return text.replace("\n\n", "

") - - def _normalize_whitespace(self, text: str, metadata: Dict[str, Any]) -> str: - return re.sub(r"[ \t]+", " ", text) - - def _handle_special_characters(self, text: str, metadata: Dict[str, Any]) -> str: - # Preserve only minimal escaping at this stage. - return text - - def _extract_metadata(self, text: str, metadata: Dict[str, Any]) -> str: - metadata.setdefault("length", len(text)) - return text - - def _validate_xml_structure(self, xml_content: str, metadata: Dict[str, Any]) -> str: - # Lightweight sanity check; TEI validation happens later. - return xml_content - - def _finalize_metadata(self, xml_content: str, metadata: Dict[str, Any]) -> str: - metadata["processed"] = True - return xml_content - - # ------------------------------------------------------------------------- - # Wikilinks - # ------------------------------------------------------------------------- - - def _handle_wikilink(self, node) -> str: - try: - title = str(getattr(node, "title", "")).strip() - text = str(getattr(node, "text", "")).strip() if getattr(node, "text", None) else "" - self.wikilinks.append({"title": title, "text": text}) - if text: - return f'<__link__ title="{title}">{text}' - return f'<__link__ title="{title}"/>' - except Exception: - return str(node) - - # ------------------------------------------------------------------------- - # Public API - # ------------------------------------------------------------------------- - + """Convert to """ + content = str(tag.contents) if tag.contents else '' + attributes = self._extract_tag_attributes(tag) + attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else '' + return f'{content}' + + def _extract_tag_attributes(self, tag) -> Dict[str, str]: + """Extract all attributes from a tag""" + attributes = {} + if hasattr(tag, 'attributes') and tag.attributes: + for attr in tag.attributes: + if hasattr(attr, 'name') and hasattr(attr, 'value'): + attributes[str(attr.name)] = str(attr.value) + return 
attributes + + # ============================================================================ + # PREPROCESSORS + # ============================================================================ + + def _fix_noinclude_line_breaks(self, content: str) -> str: + """Insert a blank line after tags when followed by non-whitespace content""" + # Pattern to match followed by optional whitespace and any non-whitespace character + # This handles cases like: :text, text, {{template}}, etc. + pattern = r'()\s*(\S)' + + def replace_noinclude_content(match): + noinclude_tag = match.group(1) + following_content = match.group(2) + # Insert a newline after and before the following content + return f'{noinclude_tag}\n{following_content}' + + # Apply the replacement + content = re.sub(pattern, replace_noinclude_content, content) + + return content + + def _normalize_whitespace(self, content: str) -> str: + """Normalize whitespace in content""" + # Normalize multiple spaces to single space + content = re.sub(r' +', ' ', content) + # Normalize line breaks, but preserve paragraph markers + content = re.sub(r'\n+', '\n', content) + return content.strip() + + def _convert_paragraph_breaks(self, content: str) -> str: + """Convert double newlines to paragraph indicators, but skip if {{nop}} is directly adjacent""" + + # First, protect {{nop}} markers and their immediate context + # Replace {{nop}} with a temporary marker + content = content.replace('{{nop}}', '___NOP_MARKER___') + + # Convert \n\n to

\n paragraph indicators, but not if they're adjacent to ___NOP_MARKER___ + # This regex matches \n\n that are NOT preceded or followed by ___NOP_MARKER___ + content = re.sub(r'(?\n', content) + + # Restore {{nop}} markers + content = content.replace('___NOP_MARKER___', '{{nop}}') + + return content + + def _handle_special_characters(self, content: str) -> str: + """Handle special characters and entities - escape ampersands not in XML/HTML entities""" + # More comprehensive regex to match XML/HTML entities + # This includes named entities like &, <, >, ", ' + # and numeric entities like { and  + entity_pattern = r'&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);' + + # Split content by entities to preserve them + parts = re.split(f'({entity_pattern})', content) + + # Process each part + result_parts = [] + for part in parts: + if re.match(entity_pattern, part): + # This is an entity, keep it as-is + result_parts.append(part) + else: + # This is not an entity, escape standalone ampersands + escaped_part = part.replace('&', '&') + result_parts.append(escaped_part) + + return ''.join(result_parts) + + def _extract_metadata(self, content: str) -> Dict[str, Any]: + """Extract metadata from content""" + metadata = {} + # Extract page quality information + # Extract language information + # Extract structural information + return metadata + + # ============================================================================ + # POSTPROCESSORS + # ============================================================================ + + def _validate_xml_structure(self, content: str) -> str: + """Validate and fix XML structure""" + # Ensure proper nesting + # Validate against schema + # Fix common issues + return content + + def _cleanup_empty_elements(self, content: str) -> str: + """Remove or fix empty elements""" + # Remove empty elements + content = re.sub(r'<(\w+)[^>]*>', '', content) + return content + + def _finalize_metadata(self, content: str) -> str: + """Finalize metadata and 
add to content""" + # Add final metadata + # Ensure proper document structure + return content + + # ============================================================================ + # MAIN PROCESSING METHODS + # ============================================================================ + def process_wikitext(self, wikitext: str) -> ConversionResult: - warnings: List[str] = [] - errors: List[str] = [] - metadata: Dict[str, Any] = {} - - text = wikitext or "" - for pre in self.preprocessors: - try: - text = pre(text, metadata) - except Exception as e: - errors.append(str(e)) - + """ + Main method to process MediaWiki wikitext to XML. + + Args: + wikitext: The MediaWiki content to convert + + Returns: + ConversionResult with XML content and metadata + """ + warnings = [] + errors = [] + metadata = {} + try: - xml_content = self._process_nested_content(text) + # Preprocessing + content = wikitext + for preprocessor in self.preprocessors: + if preprocessor == self._extract_metadata: + metadata.update(preprocessor(content)) + else: + content = preprocessor(content) + + # Parse MediaWiki content + parsed = mwparserfromhell.parse(content) + + # Process all nodes with nested content support + nodes_to_replace = [] + + # Process nodes in the order they appear in the document + for node in parsed.nodes: + if hasattr(node, 'name'): # Template + template_name = str(node.name).strip() + if template_name in self.template_handlers: + try: + # Process nested content within the template + processed_node = self._process_template_with_nesting(node) + replacement = self.template_handlers[template_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing template {template_name}: {str(e)}") + else: + warnings.append(f"Unknown template: {template_name}") + + elif hasattr(node, 'tag'): # Tag + tag_name = str(node.tag).strip().lower() + if tag_name in self.tag_handlers: + try: + # Process nested content within the tag + 
processed_node = self._process_tag_with_nesting(node) + replacement = self.tag_handlers[tag_name](processed_node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing tag {tag_name}: {str(e)}") + else: + warnings.append(f"Unknown tag: {tag_name}") + + elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink + try: + replacement = self._handle_wikilink(node) + nodes_to_replace.append((node, replacement)) + except Exception as e: + errors.append(f"Error processing wikilink: {str(e)}") + + # Replace all nodes in order + for node, replacement in nodes_to_replace: + parsed.replace(node, replacement) + + # Get processed content + xml_content = str(parsed) + + # Postprocessing + for postprocessor in self.postprocessors: + xml_content = postprocessor(xml_content) + + # Wrap in mediawiki tag + xml_content = f'{xml_content}' + + return ConversionResult( + xml_content=xml_content, + metadata=metadata, + warnings=warnings, + errors=errors, + wikilinks=self.wikilinks.copy() + ) + except Exception as e: - xml_content = text - errors.append(str(e)) - - for post in self.postprocessors: - try: - xml_content = post(xml_content, metadata) - except Exception as e: - errors.append(str(e)) - - return ConversionResult( - xml_content=xml_content, - metadata=metadata, - warnings=warnings, - errors=errors, - wikilinks=self.wikilinks.copy(), - ) - + errors.append(f"Fatal error in processing: {str(e)}") + return ConversionResult( + xml_content="", + metadata={}, + warnings=warnings, + errors=errors, + wikilinks=[] + ) + def add_template_handler(self, template_name: str, handler_func): + """Add a custom template handler""" self.template_handlers[template_name] = handler_func - + def add_tag_handler(self, tag_name: str, handler_func): + """Add a custom tag handler""" self.tag_handlers[tag_name] = handler_func - + def add_preprocessor(self, preprocessor_func): + """Add a custom preprocessor""" 
self.preprocessors.append(preprocessor_func) - + def add_postprocessor(self, postprocessor_func): + """Add a custom postprocessor""" self.postprocessors.append(postprocessor_func) - + def get_wikilinks(self) -> List[Dict[str, Any]]: + """Get all captured wikilinks""" return self.wikilinks.copy() - + def clear_wikilinks(self): + """Clear all captured wikilinks""" self.wikilinks.clear() +# ============================================================================ +# CONVENIENCE FUNCTIONS +# ============================================================================ + def create_processor() -> MediaWikiProcessor: + """Create a new MediaWiki processor instance""" return MediaWikiProcessor() def process_page(page_content: str) -> ConversionResult: + """Process a single page of MediaWiki content""" processor = create_processor() return processor.process_wikitext(page_content) + +if __name__ == "__main__": + # Example usage + processor = create_processor() + + # Example MediaWiki content with nested templates + sample_wikitext = """ + {{verse|1|1|In the beginning God created the heaven and the earth.}} + + {{verse|1|2|And the earth was without form, and void; and darkness was upon the face of the deep.}} + + {{sc|Genesis}} {{c|Chapter 1}} + {{larger|The Creation}} + This is a reference + + See also [[Genesis]] and [[Creation myth]] for more information. 
+ + Nested example: {{sc|{{larger|Bold Large Text}}}} + Complex nested: {{verse|1|3|{{sc|God}} said, {{larger|Let there be light}}}} + """ + + result = processor.process_wikitext(sample_wikitext) + print("XML Output:") + print(result.xml_content) + print("\nWarnings:", result.warnings) + print("Errors:", result.errors) + print("Wikilinks:", result.wikilinks) From 0ae7ab2868a2b060f6828137f32efbc7967098d8 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 28 May 2026 22:26:22 -0700 Subject: [PATCH 07/10] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py | 1 - opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py index 43a3217..b3b2c31 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py @@ -572,7 +572,6 @@ def main(argv: list[str] | None = None) -> int: logging.basicConfig(level=logging.INFO) args = _build_arg_parser().parse_args(argv) - data_dir = miqra_al_pi_hamasorah_data_directory(args.sourcetexts_root) sheets_dir = miqra_al_pi_hamasorah_sheets_directory(args.sourcetexts_root) out_dir = args.project_dir if args.project_dir is not None else _default_project_directory() diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py index 1c2ab77..bf604c7 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/miqra_wikitext.py @@ -54,9 +54,6 @@ } ) -_BOLD_ITALIC_RE = re.compile(r"'''''(.*?)'''''") -_BOLD_RE = re.compile(r"'''(.*?)'''") -_ITALIC_RE = re.compile(r"''(.*?)''") _ANY_HI_RE = 
re.compile(r"'''''(.*?)'''''|'''(.*?)'''|''(.*?)''") _TAG_OPEN_RE = re.compile(r"<(miqra|mw):([a-zA-Z0-9-]+)([^>]*?)(/?)>") _KETEG_START_RE = re.compile(r"<קטע\s+התחלה=([^/>]+)\s*/>", re.IGNORECASE) From 9471772b0ff6d7d513dcc7029cbd1705dd247553 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 28 May 2026 22:31:52 -0700 Subject: [PATCH 08/10] remove unused import --- opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py index b3b2c31..aeba957 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py @@ -15,7 +15,6 @@ from opensiddur.common.xslt import xslt_transform_string from opensiddur.importer.util.pages import ( default_sourcetexts_root, - miqra_al_pi_hamasorah_data_directory, miqra_al_pi_hamasorah_sheets_directory, ) from opensiddur.importer.util.prettify import prettify_xml From 94df6984d81af8a54fc6e00dc5ce8f88f8b469a9 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 28 May 2026 22:41:08 -0700 Subject: [PATCH 09/10] chore: try to increase code coverage --- opensiddur/exporter/pdf/pdf.py | 2 +- opensiddur/exporter/tex/latex.py | 2 +- .../exporter/validate_urn_references.py | 2 +- .../importer/agent/checkpoint_example.py | 9 +- opensiddur/importer/agent/example_usage.py | 2 +- .../importer/jps1917/template_finder.py | 27 +- opensiddur/importer/jps1917/test_processor.py | 4 +- opensiddur/importer/jps1917/wikisource.py | 2 +- .../miqra_al_pi_hamasorah/convert_tsv.py | 2 +- .../miqra_al_pi_hamasorah/download.py | 6 +- .../importer/util/mediawiki_processor.py | 14 +- opensiddur/importer/wlc/download_tanach.py | 6 +- opensiddur/importer/wlc/wlc.py | 2 +- .../test_miqra_wikitext.py | 253 ++++++++++++++++++ 14 files changed, 298 insertions(+), 35 deletions(-) diff --git a/opensiddur/exporter/pdf/pdf.py 
b/opensiddur/exporter/pdf/pdf.py index df1801d..cd8cbba 100755 --- a/opensiddur/exporter/pdf/pdf.py +++ b/opensiddur/exporter/pdf/pdf.py @@ -392,5 +392,5 @@ def main(): # pragma: no cover sys.exit(1) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/exporter/tex/latex.py b/opensiddur/exporter/tex/latex.py index 6b7e4a8..5629d86 100644 --- a/opensiddur/exporter/tex/latex.py +++ b/opensiddur/exporter/tex/latex.py @@ -464,5 +464,5 @@ def main(): # pragma: no cover ) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/exporter/validate_urn_references.py b/opensiddur/exporter/validate_urn_references.py index 274acca..2a70d3d 100644 --- a/opensiddur/exporter/validate_urn_references.py +++ b/opensiddur/exporter/validate_urn_references.py @@ -178,6 +178,6 @@ def main(argv: Optional[list[str]] = None) -> int: return 0 -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover raise SystemExit(main()) diff --git a/opensiddur/importer/agent/checkpoint_example.py b/opensiddur/importer/agent/checkpoint_example.py index 6274622..55379eb 100644 --- a/opensiddur/importer/agent/checkpoint_example.py +++ b/opensiddur/importer/agent/checkpoint_example.py @@ -171,9 +171,10 @@ def demo_interrupted_session(): print(f"\n🧹 Cleaned up {checkpoint_file}") -if __name__ == "__main__": - # Run the checkpointing demo +def _run_cli() -> None: # pragma: no cover demo_checkpointing() - - # Run the interrupted session demo demo_interrupted_session() + + +if __name__ == "__main__": # pragma: no cover + _run_cli() diff --git a/opensiddur/importer/agent/example_usage.py b/opensiddur/importer/agent/example_usage.py index 7b472f2..a82367b 100644 --- a/opensiddur/importer/agent/example_usage.py +++ b/opensiddur/importer/agent/example_usage.py @@ -63,5 +63,5 @@ def main(): print(f" [{role}]: {message[:100]}{'...' 
if len(message) > 100 else ''}") -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/importer/jps1917/template_finder.py b/opensiddur/importer/jps1917/template_finder.py index da3af2e..2800641 100644 --- a/opensiddur/importer/jps1917/template_finder.py +++ b/opensiddur/importer/jps1917/template_finder.py @@ -414,7 +414,7 @@ def save_template_analysis(template_data: Dict, output_file: str = "template_ana print(f"Template analysis saved to {output_file}") -if __name__ == "__main__": # pragma: no cover +def _run_cli() -> None: # pragma: no cover import argparse parser = argparse.ArgumentParser(description="Analyze JPS 1917 Wikisource MediaWiki templates and tags.") @@ -427,27 +427,28 @@ def save_template_analysis(template_data: Dict, output_file: str = "template_ana args = parser.parse_args() root = args.sourcetexts_root - # Example usage print("Starting MediaWiki template and tag analysis...") - - # Find all templates - print("\n" + "="*50) + + print("\n" + "=" * 50) print("ANALYZING TEMPLATES") - print("="*50) + print("=" * 50) template_data = find_all_templates(sourcetexts_root=root) print_template_summary(template_data) save_template_analysis(template_data, "jps1917_template_analysis.json") - - # Find all tags - print("\n" + "="*50) + + print("\n" + "=" * 50) print("ANALYZING TAGS") - print("="*50) + print("=" * 50) tag_data = find_all_tags(sourcetexts_root=root) print_tag_summary(tag_data) save_tag_analysis(tag_data, "jps1917_tag_analysis.json") - - print("\n" + "="*50) + + print("\n" + "=" * 50) print("ANALYSIS COMPLETE!") - print("="*50) + print("=" * 50) print("Template analysis saved to: jps1917_template_analysis.json") print("Tag analysis saved to: jps1917_tag_analysis.json") + + +if __name__ == "__main__": # pragma: no cover + _run_cli() diff --git a/opensiddur/importer/jps1917/test_processor.py b/opensiddur/importer/jps1917/test_processor.py index 0d0480d..75c0b75 100644 --- 
a/opensiddur/importer/jps1917/test_processor.py +++ b/opensiddur/importer/jps1917/test_processor.py @@ -122,7 +122,7 @@ def handle_custom_tag(tag): print(f"\nErrors: {result.errors}") -def main(): +def main(): # pragma: no cover """Run all tests""" print("MediaWiki to XML Processor Test Suite") print("Based on 1917 JPS Wikisource Analysis") @@ -143,5 +143,5 @@ def main(): traceback.print_exc() -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main() diff --git a/opensiddur/importer/jps1917/wikisource.py b/opensiddur/importer/jps1917/wikisource.py index 49a5825..c977389 100644 --- a/opensiddur/importer/jps1917/wikisource.py +++ b/opensiddur/importer/jps1917/wikisource.py @@ -131,5 +131,5 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover sys.exit(main()) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py index aeba957..d3c128e 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/convert_tsv.py @@ -594,6 +594,6 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover sys.exit(main()) diff --git a/opensiddur/importer/miqra_al_pi_hamasorah/download.py b/opensiddur/importer/miqra_al_pi_hamasorah/download.py index 342ba83..1b1bc85 100644 --- a/opensiddur/importer/miqra_al_pi_hamasorah/download.py +++ b/opensiddur/importer/miqra_al_pi_hamasorah/download.py @@ -215,9 +215,13 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +def _run_cli() -> None: # pragma: no cover try: sys.exit(main()) except Exception as e: logger.error("Error downloading Miqra al pi ha-Masorah: %s", e) raise + + +if __name__ == "__main__": # pragma: no cover + _run_cli() diff --git a/opensiddur/importer/util/mediawiki_processor.py 
b/opensiddur/importer/util/mediawiki_processor.py index faeccff..208dfbf 100644 --- a/opensiddur/importer/util/mediawiki_processor.py +++ b/opensiddur/importer/util/mediawiki_processor.py @@ -798,11 +798,8 @@ def process_page(page_content: str) -> ConversionResult: return processor.process_wikitext(page_content) -if __name__ == "__main__": - # Example usage +def _demo_main() -> None: # pragma: no cover processor = create_processor() - - # Example MediaWiki content with nested templates sample_wikitext = """ {{verse|1|1|In the beginning God created the heaven and the earth.}} @@ -811,16 +808,19 @@ def process_page(page_content: str) -> ConversionResult: {{sc|Genesis}} {{c|Chapter 1}} {{larger|The Creation}} This is a reference - + See also [[Genesis]] and [[Creation myth]] for more information. - + Nested example: {{sc|{{larger|Bold Large Text}}}} Complex nested: {{verse|1|3|{{sc|God}} said, {{larger|Let there be light}}}} """ - result = processor.process_wikitext(sample_wikitext) print("XML Output:") print(result.xml_content) print("\nWarnings:", result.warnings) print("Errors:", result.errors) print("Wikilinks:", result.wikilinks) + + +if __name__ == "__main__": # pragma: no cover + _demo_main() diff --git a/opensiddur/importer/wlc/download_tanach.py b/opensiddur/importer/wlc/download_tanach.py index f3f74e6..1790156 100644 --- a/opensiddur/importer/wlc/download_tanach.py +++ b/opensiddur/importer/wlc/download_tanach.py @@ -72,9 +72,13 @@ def main(argv: list[str] | None = None) -> int: return 0 -if __name__ == "__main__": +def _run_cli() -> None: # pragma: no cover try: sys.exit(main()) except Exception as e: logger.error("Error downloading/unzipping Tanach: %s", e) raise + + +if __name__ == "__main__": # pragma: no cover + _run_cli() diff --git a/opensiddur/importer/wlc/wlc.py b/opensiddur/importer/wlc/wlc.py index bc69f1b..48f8019 100644 --- a/opensiddur/importer/wlc/wlc.py +++ b/opensiddur/importer/wlc/wlc.py @@ -93,5 +93,5 @@ def main(argv: list[str] | None = 
None) -> int: return 0 -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover sys.exit(main()) diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py index 273bcf3..8fcc69a 100644 --- a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py +++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py @@ -1,6 +1,11 @@ import unittest from opensiddur.importer.miqra_al_pi_hamasorah.miqra_wikitext import ( + _escape_outside_tags, + _preprocess_column_c, + _preprocess_miqra_tags, + _wikitext_basic_markup_to_xml, + _xml_escape, link_target_to_uri, normalize_template_name, reset_processor, @@ -8,6 +13,94 @@ ) +class TestNormalizeTemplateName(unittest.TestCase): + def test_strips_whitespace(self): + self.assertEqual(normalize_template_name(" פפ "), "פפ") + + def test_strips_tevnit_prefix(self): + self.assertEqual(normalize_template_name("תבנית:מ:טעם"), "מ:טעם") + self.assertEqual(normalize_template_name("תבנית:נוסח"), "נוסח") + + def test_normalizes_quotes(self): + self.assertEqual(normalize_template_name("מ:כו''ק"), 'מ:כו"ק') + self.assertEqual( + normalize_template_name("מ:קו״כ"), + 'מ:קו"כ', + ) + + +class TestLinkTargetToUri(unittest.TestCase): + def test_empty_target(self): + self.assertEqual(link_target_to_uri(""), "") + self.assertEqual(link_target_to_uri(" "), "") + + def test_protocol_relative_url(self): + self.assertEqual( + link_target_to_uri("//cdn.example.com/x.pdf"), + "https://cdn.example.com/x.pdf", + ) + + def test_fragment_preserved(self): + uri = link_target_to_uri("דף#פרק") + self.assertIn("#", uri) + self.assertTrue(uri.startswith("https://he.wikisource.org/wiki/")) + + +class TestPreprocessors(unittest.TestCase): + def test_column_c_double_underscore(self): + self.assertEqual(_preprocess_column_c("a__b"), "a b") + + def test_column_c_line_break(self): + self.assertEqual( + 
_preprocess_column_c("http://host/path"), + "http://host/path", + ) + self.assertEqual( + _preprocess_column_c("https://host/path"), + "https://host/path", + ) + self.assertIn("", _preprocess_column_c("שורה//המשך")) + + def test_miqra_keteg_tags(self): + s = "<קטע התחלה=foo/>text<קטע סוף=foo/>" + out = _preprocess_miqra_tags(s) + self.assertIn('', out) + self.assertIn('', out) + + +class TestMarkupAndEscape(unittest.TestCase): + def test_xml_escape(self): + self.assertEqual( + _xml_escape('a & b "d" \'e\''), + "a & b <c> "d" 'e'", + ) + + def test_wikitext_bold_italic(self): + self.assertEqual( + _wikitext_basic_markup_to_xml("plain '''bold''' ''italic''"), + 'plain bold italic', + ) + + def test_wikitext_bold_italic_combined(self): + self.assertIn( + 'rend="bold-italic"', + _wikitext_basic_markup_to_xml("'''''both'''''"), + ) + + def test_escape_outside_tags_preserves_miqra_elements(self): + inner = _escape_outside_tags( + "plain א '''bold'''" + ) + self.assertIn("", inner) + self.assertIn("א", inner) + self.assertIn('rend="bold"', inner) + + def test_wikitext_markup_in_verse_via_integration(self): + frag = wikitext_to_intermediate_xml("'''דבר'''") + self.assertIn('', frag) + self.assertIn("דבר", frag) + + class TestMiqraWikitext(unittest.TestCase): def setUp(self): reset_processor() @@ -98,6 +191,166 @@ def test_yerah_ben_yomo2_shows_first_parameter_only(self): self.assertNotIn("אַלְפַּ֪֪יִם", frag) self.assertNotIn("{{ירח בן יומו-2", frag) + def test_ketiv_only_and_qeri_only(self): + k = wikitext_to_intermediate_xml("{{כתיב ולא קרי|כתיב}}") + q = wikitext_to_intermediate_xml("{{קרי ולא כתיב|קְרִי}}") + self.assertIn("(כתיב)", k) + self.assertIn("[קְרִי]", q) + + def test_qok_if_matres(self): + frag = wikitext_to_intermediate_xml( + '{{מ:קו"כ-אם-2|display|כתיב|קְרִי}}' + ) + self.assertIn("display", frag) + self.assertIn("", frag) + self.assertIn("כתיב", frag) + self.assertIn("קְרִי", frag) + + def test_qok_two_qeri_words(self): + frag = 
wikitext_to_intermediate_xml( + '{{מ:קו"כ קרי שונה מהכתיב בשתי מילים|כתיב|ק1|ק2}}' + ) + self.assertIn('type="split-qeri"', frag) + self.assertIn("ק1", frag) + self.assertIn("ק2", frag) + self.assertIn("כתיב", frag) + + def test_parashah_variants(self): + cases = [ + ("{{פפפ}}", 'type="open-line"'), + ("{{סס}}", 'type="close"'), + ("{{ססס}}", 'type="close-inline"'), + ("{{סס2}}", 'type="close-narrow"'), + ("{{מ:ששש}}", 'type="shirah"'), + ] + for wikitext, expected in cases: + with self.subTest(wikitext=wikitext): + self.assertIn(expected, wikitext_to_intermediate_xml(wikitext)) + + def test_parashah_mid_verse_attribute(self): + frag = wikitext_to_intermediate_xml("{{פפ|פסקא באמצע פסוק}}") + self.assertIn('midVerse="true"', frag) + + def test_poetic_levels(self): + for level, template in enumerate(("ר0", "ר1", "ר2", "ר3", "ר4")): + frag = wikitext_to_intermediate_xml(f"{{{{{template}}}}}") + self.assertIn(f'', frag) + + def test_centered_title(self): + frag = wikitext_to_intermediate_xml("{{פרשה-מרכז|כותרת}}") + self.assertIn("כותרת", frag) + + def test_letter_formatting(self): + frag = wikitext_to_intermediate_xml( + "{{מ:אות-ק|ק}}{{מ:אות תלויה|ת}}{{מ:אות מנוקדת|מ}}{{מ:נו\"ן הפוכה|ן}}" + ) + self.assertIn('rend="small"', frag) + self.assertIn('rend="raised"', frag) + self.assertIn("", frag) + self.assertIn("", frag) + + def test_yerushalem_variants(self): + y = wikitext_to_intermediate_xml("{{מ:ירושלם|v|a}}") + ya = wikitext_to_intermediate_xml("{{מ:ירושלמה|v|a}}") + self.assertIn('', y) + self.assertIn('', ya) + + def test_standalone_accents(self): + frag = wikitext_to_intermediate_xml( + "{{ירח בן יומו}}{{גלגל}}{{אתנח הפוך}}" + ) + self.assertIn('type="yerah-ben-yomo"', frag) + self.assertIn('type="galgal"', frag) + self.assertIn('type="etnah-hafukh"', frag) + + def test_taam_handlers(self): + frag = wikitext_to_intermediate_xml( + "{{מ:טעם ומתג באות אחת|א}}" + "{{שני טעמים באות אחת}}" + "{{מ:גרש ותלישא גדולה}}" + "{{מ:גרשיים ותלישא גדולה}}" + ) + 
self.assertIn("א", frag) + self.assertIn('type="geresh-telisha-gedola"', frag) + self.assertIn('type="gershayim-telisha-gedola"', frag) + + def test_qamats_named_params(self): + frag = wikitext_to_intermediate_xml("{{מ:קמץ|ד=דָּ}}") + self.assertIn("דָּ", frag) + + def test_taam_dummy_strips_leading_marker(self): + frag = wikitext_to_intermediate_xml("{{מ:טעם|Xאות}}") + self.assertIn("אות", frag) + self.assertNotIn("Xאות", frag) + + def test_qupo_accent(self): + frag = wikitext_to_intermediate_xml( + "{{שני טעמים באות אחת קמץ-תחתון-פתח-עליון|עליו=א}}" + ) + self.assertIn('', frag) + + def test_punctuation_and_maqaf(self): + frag = wikitext_to_intermediate_xml( + "{{מ:לגרמיה-2}}{{מ:פסק}}{{מ:מקף אפור}}" + ) + self.assertIn('type="legarmeh"', frag) + self.assertIn('type="paseq"', frag) + self.assertIn('rend="grey"', frag) + + def test_kol_qamats_default(self): + self.assertIn("כָּל", wikitext_to_intermediate_xml("{{מ:כל קמץ קטן מרכא}}")) + + def test_notes_and_anchors(self): + frag = wikitext_to_intermediate_xml( + "{{מ:הערה|גוף הערה}}{{עוגן בשורה|label}}" + "{{מ:סיום בטוב|סוף טוב}}" + ) + self.assertIn("', frag) + self.assertIn("סוף טוב", frag) + + def test_dual_trope_and_accent(self): + frag = wikitext_to_intermediate_xml( + "{{קק|target}}" + "{{מ:כפול|כפול=ד|א=א|ב=ב}}" + ) + self.assertIn("target", frag) + self.assertIn('', frag) + self.assertIn('role="א"', frag) + self.assertIn('role="ב"', frag) + + def test_emphasis_and_footnote_mark(self): + frag = wikitext_to_intermediate_xml("{{מודגש|חשוב}}{{ש}}") + self.assertIn('חשוב', frag) + self.assertIn("", frag) + + def test_wikilink(self): + frag = wikitext_to_intermediate_xml("[[דף]] and [[דף|תווית]]") + self.assertIn('hiddenstill" + ) + self.assertIn("visible", frag) + self.assertIn("still", frag) + self.assertNotIn("hidden", frag) + + def test_keteg_segments_in_wikitext(self): + frag = wikitext_to_intermediate_xml("<קטע התחלה=seg/>") + self.assertIn('', frag) + + def test_column_c_line_break_integration(self): + 
frag = wikitext_to_intermediate_xml("א//ב", column_c=True) + self.assertIn("", frag) + + def test_nosach_without_note(self): + frag = wikitext_to_intermediate_xml("{{נוסח|טקסט}}") + self.assertEqual(frag, "טקסט") + self.assertNotIn(" Date: Thu, 28 May 2026 22:46:57 -0700 Subject: [PATCH 10/10] chore: set codecov to 85% --- codecov.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..3b065ab --- /dev/null +++ b/codecov.yml @@ -0,0 +1,11 @@ +# https://docs.codecov.com/docs/commit-status +coverage: + status: + project: + default: + target: 85% + informational: false + patch: + default: + target: 85% + informational: false