"""
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f''
+
+ def _handle_table_row(self, tag) -> str:
+ """Convert to
"""
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'
{content}
'
+
+ def _handle_table_cell(self, tag) -> str:
+ """Convert to | """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f' | {content} | '
+
+ def _handle_italic(self, tag) -> str:
+ """Convert to """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'{content}'
+
+ def _handle_line_break(self, tag) -> str:
+ """Convert
to
"""
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'
'
+
+ def _handle_span(self, tag) -> str:
+ """Convert to """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'{content}'
+
+ def _handle_definition_description(self, tag) -> str:
+ """Convert to """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'{content}'
+
+ def _handle_reference(self, tag) -> str:
+ """Convert [ to ]["""
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'][{content}]'
+
+ def _handle_noinclude(self, tag) -> str:
+ """Convert to """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'{content}'
+
+ def _handle_pagequality(self, tag) -> str:
+ """Convert to """
+ content = str(tag.contents) if tag.contents else ''
+ attributes = self._extract_tag_attributes(tag)
+ attr_str = ' ' + ' '.join([f'{k}="{v}"' for k, v in attributes.items()]) if attributes else ''
+ return f'{content}'
+
+ def _extract_tag_attributes(self, tag) -> Dict[str, str]:
+ """Extract all attributes from a tag"""
+ attributes = {}
+ if hasattr(tag, 'attributes') and tag.attributes:
+ for attr in tag.attributes:
+ if hasattr(attr, 'name') and hasattr(attr, 'value'):
+ attributes[str(attr.name)] = str(attr.value)
+ return attributes
+
+ # ============================================================================
+ # PREPROCESSORS
+ # ============================================================================
+
+ def _fix_noinclude_line_breaks(self, content: str) -> str:
+ """Insert a blank line after tags when followed by non-whitespace content"""
+ # Pattern to match followed by optional whitespace and any non-whitespace character
+ # This handles cases like: :text, text, {{template}}, etc.
+ pattern = r'()\s*(\S)'
+
+ def replace_noinclude_content(match):
+ noinclude_tag = match.group(1)
+ following_content = match.group(2)
+ # Insert a newline after and before the following content
+ return f'{noinclude_tag}\n{following_content}'
+
+ # Apply the replacement
+ content = re.sub(pattern, replace_noinclude_content, content)
+
+ return content
+
+ def _normalize_whitespace(self, content: str) -> str:
+ """Normalize whitespace in content"""
+ # Normalize multiple spaces to single space
+ content = re.sub(r' +', ' ', content)
+ # Normalize line breaks, but preserve paragraph markers
+ content = re.sub(r'\n+', '\n', content)
+ return content.strip()
+
+ def _convert_paragraph_breaks(self, content: str) -> str:
+ """Convert double newlines to paragraph indicators, but skip if {{nop}} is directly adjacent"""
+
+ # First, protect {{nop}} markers and their immediate context
+ # Replace {{nop}} with a temporary marker
+ content = content.replace('{{nop}}', '___NOP_MARKER___')
+
+ # Convert \n\n to \n paragraph indicators, but not if they're adjacent to ___NOP_MARKER___
+ # This regex matches \n\n that are NOT preceded or followed by ___NOP_MARKER___
+ content = re.sub(r'(?\n', content)
+
+ # Restore {{nop}} markers
+ content = content.replace('___NOP_MARKER___', '{{nop}}')
+
+ return content
+
+ def _handle_special_characters(self, content: str) -> str:
+ """Handle special characters and entities - escape ampersands not in XML/HTML entities"""
+ # More comprehensive regex to match XML/HTML entities
+ # This includes named entities like &, <, >, ", '
+ # and numeric entities like { and
+ entity_pattern = r'&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);'
+
+ # Split content by entities to preserve them
+ parts = re.split(f'({entity_pattern})', content)
+
+ # Process each part
+ result_parts = []
+ for part in parts:
+ if re.match(entity_pattern, part):
+ # This is an entity, keep it as-is
+ result_parts.append(part)
+ else:
+ # This is not an entity, escape standalone ampersands
+ escaped_part = part.replace('&', '&')
+ result_parts.append(escaped_part)
+
+ return ''.join(result_parts)
+
+ def _extract_metadata(self, content: str) -> Dict[str, Any]:
+ """Extract metadata from content.
+
+ Placeholder: nothing is extracted yet, so ``content`` is not inspected and an empty dict is always returned.
+ """
+ metadata = {}
+ # TODO: extract page quality information
+ # TODO: extract language information
+ # TODO: extract structural information
+ return metadata
+
+ # ============================================================================
+ # POSTPROCESSORS
+ # ============================================================================
+
+ def _validate_xml_structure(self, content: str) -> str:
+ """Validate and fix XML structure.
+
+ Placeholder: no validation or repair is implemented yet; ``content`` is returned unchanged.
+ """
+ # TODO: ensure proper nesting
+ # TODO: validate against schema
+ # TODO: fix common issues
+ return content
+
+ def _cleanup_empty_elements(self, content: str) -> str:
+ """Remove or fix empty elements"""
+ # Remove empty elements
+ content = re.sub(r'<(\w+)[^>]*>\1>', '', content)
+ return content
+
+ def _finalize_metadata(self, content: str) -> str:
+ """Finalize metadata and add to content.
+
+ Placeholder: nothing is added yet; ``content`` is returned unchanged.
+ """
+ # TODO: add final metadata
+ # TODO: ensure proper document structure
+ return content
+
+ # ============================================================================
+ # MAIN PROCESSING METHODS
+ # ============================================================================
+
+ def process_wikitext(self, wikitext: str) -> ConversionResult:
+ """
+ Main method to process MediaWiki wikitext to XML.
+
+ Args:
+ wikitext: The MediaWiki content to convert
+
+ Returns:
+ ConversionResult with XML content and metadata
+ """
+ warnings = []
+ errors = []
+ metadata = {}
+
+ try:
+ # Preprocessing
+ content = wikitext
+ for preprocessor in self.preprocessors:
+ if preprocessor == self._extract_metadata:
+ metadata.update(preprocessor(content))
+ else:
+ content = preprocessor(content)
+
+ # Parse MediaWiki content
+ parsed = mwparserfromhell.parse(content)
+
+ # Process all nodes with nested content support
+ nodes_to_replace = []
+
+ # Process nodes in the order they appear in the document
+ for node in parsed.nodes:
+ if hasattr(node, 'name'): # Template
+ template_name = str(node.name).strip()
+ if template_name in self.template_handlers:
+ try:
+ # Process nested content within the template
+ processed_node = self._process_template_with_nesting(node)
+ replacement = self.template_handlers[template_name](processed_node)
+ nodes_to_replace.append((node, replacement))
+ except Exception as e:
+ errors.append(f"Error processing template {template_name}: {str(e)}")
+ else:
+ warnings.append(f"Unknown template: {template_name}")
+
+ elif hasattr(node, 'tag'): # Tag
+ tag_name = str(node.tag).strip().lower()
+ if tag_name in self.tag_handlers:
+ try:
+ # Process nested content within the tag
+ processed_node = self._process_tag_with_nesting(node)
+ replacement = self.tag_handlers[tag_name](processed_node)
+ nodes_to_replace.append((node, replacement))
+ except Exception as e:
+ errors.append(f"Error processing tag {tag_name}: {str(e)}")
+ else:
+ warnings.append(f"Unknown tag: {tag_name}")
+
+ elif hasattr(node, '__class__') and 'Wikilink' in str(node.__class__): # Wikilink
+ try:
+ replacement = self._handle_wikilink(node)
+ nodes_to_replace.append((node, replacement))
+ except Exception as e:
+ errors.append(f"Error processing wikilink: {str(e)}")
+
+ # Replace all nodes in order
+ for node, replacement in nodes_to_replace:
+ parsed.replace(node, replacement)
+
+ # Get processed content
+ xml_content = str(parsed)
+
+ # Postprocessing
+ for postprocessor in self.postprocessors:
+ xml_content = postprocessor(xml_content)
+
+ # Wrap in mediawiki tag
+ xml_content = f'{xml_content}'
+
+ return ConversionResult(
+ xml_content=xml_content,
+ metadata=metadata,
+ warnings=warnings,
+ errors=errors,
+ wikilinks=self.wikilinks.copy()
+ )
+
+ except Exception as e:
+ errors.append(f"Fatal error in processing: {str(e)}")
+ return ConversionResult(
+ xml_content="",
+ metadata={},
+ warnings=warnings,
+ errors=errors,
+ wikilinks=[]
+ )
+
+ def add_template_handler(self, template_name: str, handler_func):
+ """Register ``handler_func`` for ``template_name``, replacing any handler already stored under that name.
+
+ ``process_wikitext`` looks handlers up by the template's name after ``strip()``, without changing its case, and uses the handler's return value as the replacement for the node.
+ """
+ self.template_handlers[template_name] = handler_func
+
+ def add_tag_handler(self, tag_name: str, handler_func):
+ """Register ``handler_func`` for ``tag_name``, replacing any handler already stored under that name.
+
+ ``process_wikitext`` strips and lowercases a tag's name before the lookup, so ``tag_name`` has to be lowercase to be matched.
+ """
+ self.tag_handlers[tag_name] = handler_func
+
+ def add_preprocessor(self, preprocessor_func):
+ """Append a custom preprocessor to the end of the preprocessor list.
+
+ ``process_wikitext`` runs the list in order before parsing; each preprocessor is called with the current text and its return value becomes the new text.
+ """
+ self.preprocessors.append(preprocessor_func)
+
+ def add_postprocessor(self, postprocessor_func):
+ """Append a custom postprocessor to the end of the postprocessor list.
+
+ ``process_wikitext`` runs the list in order on the serialized output; each postprocessor is called with the current text and its return value becomes the new text.
+ """
+ self.postprocessors.append(postprocessor_func)
+
+ def get_wikilinks(self) -> List[Dict[str, Any]]:
+ """Return a shallow copy of the captured wikilinks, so changing the returned container leaves ``self.wikilinks`` untouched."""
+ return self.wikilinks.copy()
+
+ def clear_wikilinks(self):
+ """Empty ``self.wikilinks`` in place."""
+ self.wikilinks.clear()
+
+
+# ============================================================================
+# CONVENIENCE FUNCTIONS
+# ============================================================================
+
+def create_processor() -> MediaWikiProcessor:
+ """Return a new ``MediaWikiProcessor`` constructed with no arguments."""
+ return MediaWikiProcessor()
+
+
+def process_page(page_content: str) -> ConversionResult:
+ """Process a single page of MediaWiki content"""
+ processor = create_processor()
+ return processor.process_wikitext(page_content)
+
+
+def _demo_main() -> None: # pragma: no cover
+ """Run a new processor over a built-in wikitext sample and print the XML, warnings, errors and wikilinks."""
+ processor = create_processor()
+ # Sample mixes flat templates, wikilinks and templates nested inside templates.
+ sample_wikitext = """
+ {{verse|1|1|In the beginning God created the heaven and the earth.}}
+
+ {{verse|1|2|And the earth was without form, and void; and darkness was upon the face of the deep.}}
+
+ {{sc|Genesis}} {{c|Chapter 1}}
+ {{larger|The Creation}}
+ [This is a reference]
+
+ See also [[Genesis]] and [[Creation myth]] for more information.
+
+ Nested example: {{sc|{{larger|Bold Large Text}}}}
+ Complex nested: {{verse|1|3|{{sc|God}} said, {{larger|Let there be light}}}}
+ """
+ result = processor.process_wikitext(sample_wikitext)
+ print("XML Output:")
+ print(result.xml_content)
+ print("\nWarnings:", result.warnings)
+ print("Errors:", result.errors)
+ print("Wikilinks:", result.wikilinks)
+
+
+if __name__ == "__main__": # pragma: no cover
+ _demo_main()
diff --git a/opensiddur/importer/util/pages.py b/opensiddur/importer/util/pages.py
index 117d25a..6f33d4e 100644
--- a/opensiddur/importer/util/pages.py
+++ b/opensiddur/importer/util/pages.py
@@ -29,6 +29,21 @@ def jps1917_credits_directory(sourcetexts_root: Path | None = None) -> Path:
return jps1917_data_directory(sourcetexts_root) / "credits"
+def miqra_al_pi_hamasorah_data_directory(sourcetexts_root: Path | None = None) -> Path:
+ """Miqra al pi ha-Masorah raw dumps: /miqra_al_pi_hamasorah."""
+ root = (
+ sourcetexts_root.resolve()
+ if sourcetexts_root is not None
+ else default_sourcetexts_root()
+ )
+ return root / "miqra_al_pi_hamasorah"
+
+
+def miqra_al_pi_hamasorah_sheets_directory(sourcetexts_root: Path | None = None) -> Path:
+ """Per-tab TSV files from the Google Sheet export."""
+ return miqra_al_pi_hamasorah_data_directory(sourcetexts_root) / "sheets"
+
+
def get_page(page_number: str | int, sourcetexts_root: Path | None = None) -> Optional[Page]:
"""Return the wikitext of the given Page, or None if it does not exist."""
page_num = int(page_number)
diff --git a/opensiddur/importer/wlc/download_tanach.py b/opensiddur/importer/wlc/download_tanach.py
index f3f74e6..1790156 100644
--- a/opensiddur/importer/wlc/download_tanach.py
+++ b/opensiddur/importer/wlc/download_tanach.py
@@ -72,9 +72,13 @@ def main(argv: list[str] | None = None) -> int:
return 0
-if __name__ == "__main__":
+def _run_cli() -> None: # pragma: no cover
try:
sys.exit(main())
except Exception as e:
logger.error("Error downloading/unzipping Tanach: %s", e)
raise
+
+
+if __name__ == "__main__": # pragma: no cover
+ _run_cli()
diff --git a/opensiddur/importer/wlc/wlc.py b/opensiddur/importer/wlc/wlc.py
index bc69f1b..48f8019 100644
--- a/opensiddur/importer/wlc/wlc.py
+++ b/opensiddur/importer/wlc/wlc.py
@@ -93,5 +93,5 @@ def main(argv: list[str] | None = None) -> int:
return 0
-if __name__ == "__main__":
+if __name__ == "__main__": # pragma: no cover
sys.exit(main())
diff --git a/opensiddur/tests/exporter/test_latex.py b/opensiddur/tests/exporter/test_latex.py
index aa9f537..ffd14b0 100644
--- a/opensiddur/tests/exporter/test_latex.py
+++ b/opensiddur/tests/exporter/test_latex.py
@@ -242,6 +242,21 @@ def test_dedupes_when_multiple_files_share_index(self):
preamble, _ = extract_sources([f1, f2])
self.assertEqual(preamble.count("@"), 1)
+ def test_bibtex_wraps_hebrew_fields_in_texthebrew(self):
+ index = """
+
+
+
+ ืืงืจื ืขื ืคื ืืืกืืจื
+ Avi Kadish
+
+
+ """.encode("utf-8")
+ doc = self._create("p", "doc.xml", b"")
+ self._create("p", "index.xml", index)
+ preamble, _ = extract_sources([doc])
+ self.assertIn(r"title = {\texthebrew{ืืงืจื ืขื ืคื ืืืกืืจื}}", preamble)
+
class TestGetFileReferences(unittest.TestCase):
diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py
index e67d5fe..3aca2e8 100644
--- a/opensiddur/tests/exporter/test_reledmac_xslt.py
+++ b/opensiddur/tests/exporter/test_reledmac_xslt.py
@@ -533,7 +533,7 @@ def test_lb_emits_leavevmode_linebreak(self):
"""
out = _transform(xml)
- self.assertIn(r"\leavevmode\\", out)
+ self.assertIn(r"\leavevmode\\{}", out)
class TestStructuralElements(unittest.TestCase):
@@ -565,8 +565,44 @@ def test_div_head_emits_sectioning(self):
"""
out = _transform(xml)
- # Top-level body div with head โ \eledchapter
- self.assertIn(r"\eledchapter{Genesis}", out)
+ # Top-level body div with head โ \eledchapter (LTR wrapper when not Hebrew)
+ self.assertIn(
+ r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}",
+ out,
+ )
+
+ def test_english_head_in_hebrew_document_uses_ltr_wrapper(self):
+ xml = """
+
+
+
+ Genesis
+ ืึฐึผืจึตืืฉึดืืืช
+
+
+ """
+ out = _transform(xml)
+ self.assertIn(
+ r"\eledchapter{{\textdir TLT\selectlanguage{english}Genesis}}",
+ out,
+ )
+
+ def test_hebrew_head_in_hebrew_document_has_no_ltr_wrapper(self):
+ xml = """
+
+
+
+ ืืจืืฉืืช
+ ืึฐึผืจึตืืฉึดืืืช
+
+
+ """
+ out = _transform(xml)
+ self.assertIn(r"\eledchapter{ืืจืืฉืืช}", out)
+ self.assertNotIn(
+ r"\eledchapter{{\textdir TLT\selectlanguage{english}ืืจืืฉืืช}}",
+ out,
+ )
if __name__ == "__main__":
diff --git a/opensiddur/tests/fixtures/miqra_minimal.xlsx b/opensiddur/tests/fixtures/miqra_minimal.xlsx
new file mode 100644
index 0000000..fbb6dd8
Binary files /dev/null and b/opensiddur/tests/fixtures/miqra_minimal.xlsx differ
diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py
new file mode 100644
index 0000000..a2be3e5
--- /dev/null
+++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_convert_tsv.py
@@ -0,0 +1,148 @@
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+import tempfile
+
+
+from opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv import (
+ _extract_chapter_verse_numbers,
+ main,
+)
+
+
+class TestMiqraConvertTsv(unittest.TestCase):
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate")
+ def test_only_book_writes_output(self, mock_validate):
+ mock_validate.return_value = (True, [])
+
+ with tempfile.TemporaryDirectory() as tmp:
+ tmp_path = Path(tmp)
+ sourcetexts_root = tmp_path / "sources"
+ sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets"
+ sheets_dir.mkdir(parents=True, exist_ok=True)
+
+ # Minimal README (front matter)
+ (sheets_dir / "readme.tsv").write_text(
+ "License\tCC-BY-SA 4.0\nAttribution\tHebrew Wikisource\n",
+ encoding="utf-8",
+ )
+
+ # Torah TSV: parashah in nav + two verses in one paragraph
+ (sheets_dir / "torah.tsv").write_text(
+ "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"])
+ + "\n"
+ + "\t".join(
+ [
+ "ืกืคืจ ืืจืืฉืืช/ื",
+ "ื",
+ "//{{ืคืค}}//",
+ "{{ื:ืคืกืืง|ืืจืืฉืืช|1|1}}",
+ '{{ื ืืกื|{{ื:ืืืช-ื|ืึผึฐ}}ืจึตืืฉืึดึืืช|2=test note}}',
+ ]
+ )
+ + "\n"
+ + "\t".join(
+ [
+ "ืกืคืจ ืืจืืฉืืช/ื",
+ "ื",
+ "",
+ "{{ื:ืคืกืืง|ืืจืืฉืืช|1|2}}",
+ "ืึฐืึธืึธึืจึถืฅ ืึธืึฐืชึธึฅื ืชึนึืืึผึ ืึธืึนึืืึผ",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+
+ project_dir = tmp_path / "project"
+ rc = main(
+ [
+ "--sourcetexts-root",
+ str(sourcetexts_root),
+ "--project-dir",
+ str(project_dir),
+ "--only-book",
+ "genesis",
+ ]
+ )
+ self.assertEqual(rc, 0)
+
+ genesis_xml = project_dir / "genesis.xml"
+ self.assertTrue(genesis_xml.exists())
+ xml = genesis_xml.read_text(encoding="utf-8")
+ self.assertIn("", xml)
+ self.assertIn('', xml)
+ self.assertIn("ืึฐืึธืึธึืจึถืฅ", xml)
+ self.assertIn('', xml)
+ self.assertIn("Genesis", xml)
+ self.assertIn('rend="large"', xml)
+ self.assertIn("ืึผึฐ", xml)
+ self.assertIn("tei:standOff", xml)
+ self.assertIn("test note", xml)
+ # Standoff notes must link to the in-text marker for annotation resolution
+ self.assertIn('target="#miqra-note-1-ref', xml)
+
+ def test_special_tsv_row_does_not_produce_invalid_urn_segments(self):
+ # special.tsv uses a 2-column schema; must not be merged into book output.
+ ch, v = _extract_chapter_verse_numbers(
+ "ืกืคืจ ืฉืืืช/ืื ืชืชืช",
+ "{{#ืงืืข:ืฉืืจืช ืืื/ืฆืืจืช ืืฉืืจ|ืฆืืจืช-ืืฉืืจ}}{{ื:ืืขืื",
+ "",
+ )
+ self.assertEqual(ch, "")
+ self.assertEqual(v, "")
+
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.convert_tsv.validate")
+ def test_special_tsv_not_merged_into_book(self, mock_validate):
+ mock_validate.return_value = (True, [])
+
+ with tempfile.TemporaryDirectory() as tmp:
+ tmp_path = Path(tmp)
+ sourcetexts_root = tmp_path / "sources"
+ sheets_dir = sourcetexts_root / "miqra_al_pi_hamasorah" / "sheets"
+ sheets_dir.mkdir(parents=True, exist_ok=True)
+
+ (sheets_dir / "torah.tsv").write_text(
+ "\t".join(["Page key", "Row id", "Nav", "Scaffold", "Text"])
+ + "\n"
+ + "\t".join(
+ [
+ "ืกืคืจ ืฉืืืช/ืื",
+ "ื",
+ "",
+ "{{ื:ืคืกืืง|ืฉืืืช|15|1}}",
+ "ืฉืืจื",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ (sheets_dir / "special.tsv").write_text(
+ "ืกืคืจ ืฉืืืช/ืื ืชืชืช\t{{#ืงืืข:ืฉืืจืช ืืื/ืฆืืจืช ืืฉืืจ|ืฆืืจืช-ืืฉืืจ}}{{ื:ืืขืื\n",
+ encoding="utf-8",
+ )
+
+ project_dir = tmp_path / "project"
+ main(
+ [
+ "--sourcetexts-root",
+ str(sourcetexts_root),
+ "--project-dir",
+ str(project_dir),
+ "--only-book",
+ "exodus",
+ ]
+ )
+ xml = (project_dir / "exodus.xml").read_text(encoding="utf-8")
+ self.assertIn("urn:x-opensiddur:text:bible:exodus/15/1", xml)
+ self.assertNotIn("ืฆืืจืช-ืืฉืืจ", xml)
+ self.assertNotIn("ืืฉืืจ|", xml)
+
+
+if __name__ == "__main__":
+ unittest.main()
+
diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py
new file mode 100644
index 0000000..6e4c498
--- /dev/null
+++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_download.py
@@ -0,0 +1,106 @@
+import json
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from opensiddur.importer.miqra_al_pi_hamasorah import download
+from opensiddur.importer.util.pages import (
+ miqra_al_pi_hamasorah_data_directory,
+ miqra_al_pi_hamasorah_sheets_directory,
+)
+
+FIXTURE_XLSX = (
+ Path(__file__).resolve().parents[2] / "fixtures" / "miqra_minimal.xlsx"
+)
+
+
+class TestDownloadMiqra(unittest.TestCase):
+ def setUp(self) -> None:
+ self.tmp = tempfile.TemporaryDirectory()
+ self.sourcetexts_root = Path(self.tmp.name)
+
+ def tearDown(self) -> None:
+ self.tmp.cleanup()
+
+ def _mock_response(self) -> MagicMock:
+ response = MagicMock()
+ response.raise_for_status = MagicMock()
+ response.content = FIXTURE_XLSX.read_bytes()
+ return response
+
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+ def test_download_writes_tsv_and_manifest(self, mock_get: MagicMock) -> None:
+ mock_get.return_value = self._mock_response()
+
+ download.download_miqra(self.sourcetexts_root)
+
+ data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root)
+ sheets_dir = miqra_al_pi_hamasorah_sheets_directory(self.sourcetexts_root)
+
+ torah_tsv = sheets_dir / "torah.tsv"
+ readme_tsv = sheets_dir / "readme.tsv"
+ self.assertTrue(torah_tsv.is_file())
+ self.assertTrue(readme_tsv.is_file())
+ self.assertFalse((sheets_dir / "unknowntab.tsv").exists())
+
+ torah_lines = torah_tsv.read_text(encoding="utf-8").splitlines()
+ self.assertEqual(len(torah_lines), 2)
+ self.assertIn("ืึฐึผืจึตืืฉึดืืืช", torah_lines[1])
+
+ manifest_path = data_dir / "manifest.json"
+ self.assertTrue(manifest_path.is_file())
+ manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+ self.assertEqual(manifest["spreadsheet_id"], download.SPREADSHEET_ID)
+ slugs = {s["slug"] for s in manifest["sheets"]}
+ self.assertIn("torah", slugs)
+ self.assertIn("readme", slugs)
+ for entry in manifest["sheets"]:
+ self.assertIn("sha256", entry)
+ self.assertEqual(len(entry["sha256"]), 64)
+
+ xlsx_files = list(data_dir.glob("*.xlsx"))
+ self.assertEqual(xlsx_files, [])
+
+ mock_get.assert_called_once()
+ call_kwargs = mock_get.call_args
+ self.assertEqual(call_kwargs[0][0], download.EXPORT_XLSX_URL)
+ self.assertIn("User-Agent", call_kwargs[1]["headers"])
+
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+ def test_dry_run_writes_nothing(self, mock_get: MagicMock) -> None:
+ download.download_miqra(self.sourcetexts_root, dry_run=True)
+
+ data_dir = miqra_al_pi_hamasorah_data_directory(self.sourcetexts_root)
+ self.assertFalse(data_dir.exists())
+ mock_get.assert_not_called()
+
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.logger")
+ @patch("opensiddur.importer.miqra_al_pi_hamasorah.download.requests.get")
+ def test_unknown_sheet_logs_warning(
+ self, mock_get: MagicMock, mock_logger: MagicMock
+ ) -> None:
+ mock_get.return_value = self._mock_response()
+ download.download_miqra(self.sourcetexts_root)
+
+ warning_calls = [
+ c
+ for c in mock_logger.warning.call_args_list
+ if "UnknownTab" in str(c)
+ ]
+ self.assertEqual(len(warning_calls), 1)
+
+ def test_main_dry_run_exit_code(self) -> None:
+ with patch(
+ "opensiddur.importer.miqra_al_pi_hamasorah.download.download_miqra"
+ ) as mock_download:
+ code = download.main(
+ ["--dry-run", "--sourcetexts-root", str(self.sourcetexts_root)]
+ )
+ self.assertEqual(code, 0)
+ mock_download.assert_called_once()
+ self.assertTrue(mock_download.call_args.kwargs["dry_run"])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py
new file mode 100644
index 0000000..8fcc69a
--- /dev/null
+++ b/opensiddur/tests/importer/miqra_al_pi_hamasorah/test_miqra_wikitext.py
@@ -0,0 +1,389 @@
+import unittest
+
+from opensiddur.importer.miqra_al_pi_hamasorah.miqra_wikitext import (
+ _escape_outside_tags,
+ _preprocess_column_c,
+ _preprocess_miqra_tags,
+ _wikitext_basic_markup_to_xml,
+ _xml_escape,
+ link_target_to_uri,
+ normalize_template_name,
+ reset_processor,
+ wikitext_to_intermediate_xml,
+)
+
+
+class TestNormalizeTemplateName(unittest.TestCase):
+ def test_strips_whitespace(self):
+ self.assertEqual(normalize_template_name(" ืคืค "), "ืคืค")
+
+ def test_strips_tevnit_prefix(self):
+ self.assertEqual(normalize_template_name("ืชืื ืืช:ื:ืืขื"), "ื:ืืขื")
+ self.assertEqual(normalize_template_name("ืชืื ืืช:ื ืืกื"), "ื ืืกื")
+
+ def test_normalizes_quotes(self):
+ self.assertEqual(normalize_template_name("ื:ืื''ืง"), 'ื:ืื"ืง')
+ self.assertEqual(
+ normalize_template_name("ื:ืงืืดื"),
+ 'ื:ืงื"ื',
+ )
+
+
+class TestLinkTargetToUri(unittest.TestCase):
+ def test_empty_target(self):
+ self.assertEqual(link_target_to_uri(""), "")
+ self.assertEqual(link_target_to_uri(" "), "")
+
+ def test_protocol_relative_url(self):
+ self.assertEqual(
+ link_target_to_uri("//cdn.example.com/x.pdf"),
+ "https://cdn.example.com/x.pdf",
+ )
+
+ def test_fragment_preserved(self):
+ uri = link_target_to_uri("ืืฃ#ืคืจืง")
+ self.assertIn("#", uri)
+ self.assertTrue(uri.startswith("https://he.wikisource.org/wiki/"))
+
+
+class TestPreprocessors(unittest.TestCase):
+ def test_column_c_double_underscore(self):
+ self.assertEqual(_preprocess_column_c("a__b"), "a b")
+
+ def test_column_c_line_break(self):
+ self.assertEqual(
+ _preprocess_column_c("http://host/path"),
+ "http://host/path",
+ )
+ self.assertEqual(
+ _preprocess_column_c("https://host/path"),
+ "https://host/path",
+ )
+ self.assertIn("", _preprocess_column_c("ืฉืืจื//ืืืฉื"))
+
+ def test_miqra_keteg_tags(self):
+ s = "<ืงืืข ืืชืืื=foo/>text<ืงืืข ืกืืฃ=foo/>"
+ out = _preprocess_miqra_tags(s)
+ self.assertIn('', out)
+ self.assertIn('', out)
+
+
+class TestMarkupAndEscape(unittest.TestCase):
+ def test_xml_escape(self):
+ self.assertEqual(
+ _xml_escape('a & b "d" \'e\''),
+ "a & b <c> "d" 'e'",
+ )
+
+ def test_wikitext_bold_italic(self):
+ self.assertEqual(
+ _wikitext_basic_markup_to_xml("plain '''bold''' ''italic''"),
+ 'plain bold italic',
+ )
+
+ def test_wikitext_bold_italic_combined(self):
+ self.assertIn(
+ 'rend="bold-italic"',
+ _wikitext_basic_markup_to_xml("'''''both'''''"),
+ )
+
+ def test_escape_outside_tags_preserves_miqra_elements(self):
+ inner = _escape_outside_tags(
+ "plain ื '''bold'''"
+ )
+ self.assertIn("", inner)
+ self.assertIn("ื", inner)
+ self.assertIn('rend="bold"', inner)
+
+ def test_wikitext_markup_in_verse_via_integration(self):
+ frag = wikitext_to_intermediate_xml("'''ืืืจ'''")
+ self.assertIn('', frag)
+ self.assertIn("ืืืจ", frag)
+
+
+class TestMiqraWikitext(unittest.TestCase):
+ def setUp(self):
+ reset_processor()
+
+ def test_nosach_nested_large_letter(self):
+ frag = wikitext_to_intermediate_xml(
+ '{{ื ืืกื|{{ื:ืืืช-ื|ืึผึฐ}}ืจึตืืฉืึดึืืช|2=note text}}'
+ )
+ self.assertIn("', frag)
+ self.assertIn("ืึผึฐ", frag)
+ self.assertIn("', frag)
+ self.assertIn("ืืชืื", frag)
+ self.assertIn("ืงึฐืจึดื", frag)
+
+ def test_qeri_ketiv(self):
+ frag = wikitext_to_intermediate_xml('{{ืงื"ื|ืืชืื|ืงึฐืจึดื}}')
+ self.assertIn('order="qeri-first"', frag)
+
+ def test_parashah_open(self):
+ frag = wikitext_to_intermediate_xml("{{ืคืค}}")
+ self.assertIn('(ืืชืื)", k)
+ self.assertIn("[ืงึฐืจึดื]", q)
+
+ def test_qok_if_matres(self):
+ frag = wikitext_to_intermediate_xml(
+ '{{ื:ืงื"ื-ืื-2|display|ืืชืื|ืงึฐืจึดื}}'
+ )
+ self.assertIn("display", frag)
+ self.assertIn("", frag)
+ self.assertIn("ืืชืื", frag)
+ self.assertIn("ืงึฐืจึดื", frag)
+
+ def test_qok_two_qeri_words(self):
+ frag = wikitext_to_intermediate_xml(
+ '{{ื:ืงื"ื ืงืจื ืฉืื ื ืืืืชืื ืืฉืชื ืืืืื|ืืชืื|ืง1|ืง2}}'
+ )
+ self.assertIn('type="split-qeri"', frag)
+ self.assertIn("ืง1", frag)
+ self.assertIn("ืง2", frag)
+ self.assertIn("ืืชืื", frag)
+
+ def test_parashah_variants(self):
+ cases = [
+ ("{{ืคืคืค}}", 'type="open-line"'),
+ ("{{ืกืก}}", 'type="close"'),
+ ("{{ืกืกืก}}", 'type="close-inline"'),
+ ("{{ืกืก2}}", 'type="close-narrow"'),
+ ("{{ื:ืฉืฉืฉ}}", 'type="shirah"'),
+ ]
+ for wikitext, expected in cases:
+ with self.subTest(wikitext=wikitext):
+ self.assertIn(expected, wikitext_to_intermediate_xml(wikitext))
+
+ def test_parashah_mid_verse_attribute(self):
+ frag = wikitext_to_intermediate_xml("{{ืคืค|ืคืกืงื ืืืืฆืข ืคืกืืง}}")
+ self.assertIn('midVerse="true"', frag)
+
+ def test_poetic_levels(self):
+ for level, template in enumerate(("ืจ0", "ืจ1", "ืจ2", "ืจ3", "ืจ4")):
+ frag = wikitext_to_intermediate_xml(f"{{{{{template}}}}}")
+ self.assertIn(f'', frag)
+
+ def test_centered_title(self):
+ frag = wikitext_to_intermediate_xml("{{ืคืจืฉื-ืืจืื|ืืืชืจืช}}")
+ self.assertIn("ืืืชืจืช", frag)
+
+ def test_letter_formatting(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ื:ืืืช-ืง|ืง}}{{ื:ืืืช ืชืืืื|ืช}}{{ื:ืืืช ืื ืืงืืช|ื}}{{ื:ื ื\"ื ืืคืืื|ื}}"
+ )
+ self.assertIn('rend="small"', frag)
+ self.assertIn('rend="raised"', frag)
+ self.assertIn("", frag)
+ self.assertIn("", frag)
+
+ def test_yerushalem_variants(self):
+ y = wikitext_to_intermediate_xml("{{ื:ืืจืืฉืื|v|a}}")
+ ya = wikitext_to_intermediate_xml("{{ื:ืืจืืฉืืื|v|a}}")
+ self.assertIn('', y)
+ self.assertIn('', ya)
+
+ def test_standalone_accents(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ืืจื ืื ืืืื}}{{ืืืื}}{{ืืชื ื ืืคืื}}"
+ )
+ self.assertIn('type="yerah-ben-yomo"', frag)
+ self.assertIn('type="galgal"', frag)
+ self.assertIn('type="etnah-hafukh"', frag)
+
+ def test_taam_handlers(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ื:ืืขื ืืืชื ืืืืช ืืืช|ื}}"
+ "{{ืฉื ื ืืขืืื ืืืืช ืืืช}}"
+ "{{ื:ืืจืฉ ืืชืืืฉื ืืืืื}}"
+ "{{ื:ืืจืฉืืื ืืชืืืฉื ืืืืื}}"
+ )
+ self.assertIn("ื", frag)
+ self.assertIn('type="geresh-telisha-gedola"', frag)
+ self.assertIn('type="gershayim-telisha-gedola"', frag)
+
+ def test_qamats_named_params(self):
+ frag = wikitext_to_intermediate_xml("{{ื:ืงืืฅ|ื=ืึธึผ}}")
+ self.assertIn("ืึธึผ", frag)
+
+ def test_taam_dummy_strips_leading_marker(self):
+ frag = wikitext_to_intermediate_xml("{{ื:ืืขื|Xืืืช}}")
+ self.assertIn("ืืืช", frag)
+ self.assertNotIn("Xืืืช", frag)
+
+ def test_qupo_accent(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ืฉื ื ืืขืืื ืืืืช ืืืช ืงืืฅ-ืชืืชืื-ืคืชื-ืขืืืื|ืขืืื=ื}}"
+ )
+ self.assertIn('', frag)
+
+ def test_punctuation_and_maqaf(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ื:ืืืจืืื-2}}{{ื:ืคืกืง}}{{ื:ืืงืฃ ืืคืืจ}}"
+ )
+ self.assertIn('type="legarmeh"', frag)
+ self.assertIn('type="paseq"', frag)
+ self.assertIn('rend="grey"', frag)
+
+ def test_kol_qamats_default(self):
+ self.assertIn("ืึผึธื", wikitext_to_intermediate_xml("{{ื:ืื ืงืืฅ ืงืื ืืจืื}}"))
+
+ def test_notes_and_anchors(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ื:ืืขืจื|ืืืฃ ืืขืจื}}{{ืขืืื ืืฉืืจื|label}}"
+ "{{ื:ืกืืื ืืืื|ืกืืฃ ืืื}}"
+ )
+ self.assertIn("', frag)
+ self.assertIn("ืกืืฃ ืืื", frag)
+
+ def test_dual_trope_and_accent(self):
+ frag = wikitext_to_intermediate_xml(
+ "{{ืงืง|target}}"
+ "{{ื:ืืคืื|ืืคืื=ื|ื=ื|ื=ื}}"
+ )
+ self.assertIn("target", frag)
+ self.assertIn('', frag)
+ self.assertIn('role="ื"', frag)
+ self.assertIn('role="ื"', frag)
+
+ def test_emphasis_and_footnote_mark(self):
+ frag = wikitext_to_intermediate_xml("{{ืืืืืฉ|ืืฉืื}}{{ืฉ}}")
+ self.assertIn('ืืฉืื', frag)
+ self.assertIn("", frag)
+
+ def test_wikilink(self):
+ frag = wikitext_to_intermediate_xml("[[ืืฃ]] and [[ืืฃ|ืชืืืืช]]")
+ self.assertIn('hiddenstill"
+ )
+ self.assertIn("visible", frag)
+ self.assertIn("still", frag)
+ self.assertNotIn("hidden", frag)
+
+ def test_keteg_segments_in_wikitext(self):
+ frag = wikitext_to_intermediate_xml("<ืงืืข ืืชืืื=seg/>")
+ self.assertIn('', frag)
+
+ def test_column_c_line_break_integration(self):
+ frag = wikitext_to_intermediate_xml("ื//ื", column_c=True)
+ self.assertIn("", frag)
+
+ def test_nosach_without_note(self):
+ frag = wikitext_to_intermediate_xml("{{ื ืืกื|ืืงืกื}}")
+ self.assertEqual(frag, "ืืงืกื")
+ self.assertNotIn("=2.0.0",
"diff-match-patch>=20241021",
"pydantic>=2.11.7",
+ "openpyxl>=3.1.5",
]
[project.urls]
diff --git a/uv.lock b/uv.lock
index 8f7acaa..60f0375 100644
--- a/uv.lock
+++ b/uv.lock
@@ -605,6 +605,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e1/5e/4b5aaaabddfacfe36ba7768817bd1f71a7a810a43705e531f3ae4c690767/emoji-2.15.0-py3-none-any.whl", hash = "sha256:205296793d66a89d88af4688fa57fd6496732eb48917a87175a023c8138995eb", size = 608433, upload-time = "2025-09-21T12:13:01.197Z" },
]
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
[[package]]
name = "executing"
version = "2.2.1"
@@ -1840,6 +1849,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7d/32/37734d769bc8b42e4938785313cc05aade6cb0fa72479d3220a0d61a4e78/openai-2.33.0-py3-none-any.whl", hash = "sha256:03ac37d70e8c9e3a8124214e3afa785e2cbc12e627fbd98177a086ef2fd87ad5", size = 1162695, upload-time = "2026-04-28T14:04:40.482Z" },
]
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "et-xmlfile" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
+]
+
[[package]]
name = "opensiddur-ai"
version = "0.1.0"
@@ -1856,6 +1877,7 @@ dependencies = [
{ name = "markdown" },
{ name = "mwparserfromhell" },
{ name = "openai" },
+ { name = "openpyxl" },
{ name = "pydantic" },
{ name = "pyppeteer" },
{ name = "requests" },
@@ -1885,6 +1907,7 @@ requires-dist = [
{ name = "markdown", specifier = ">=3.9" },
{ name = "mwparserfromhell", specifier = ">=0.7.2" },
{ name = "openai", specifier = ">=1.101.0" },
+ { name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "pyppeteer", specifier = ">=2.0.0" },
{ name = "requests", specifier = ">=2.32.4" },