From bb0123681dd08a984246910c81ff42fc41f41815 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 6 May 2026 23:55:00 -0700 Subject: [PATCH 01/13] first pass at reledmac --- opensiddur/exporter/README.md | 60 +- opensiddur/exporter/pdf/pdf.py | 438 +++-- opensiddur/exporter/settings.py | 41 + opensiddur/exporter/tex/install-tex.sh | 26 +- opensiddur/exporter/tex/latex.py | 442 +++++ opensiddur/exporter/tex/reledmac.xslt | 804 ++++++++ opensiddur/exporter/tex/xelatex.py | 405 ---- opensiddur/exporter/tex/xelatex.xslt | 417 ---- opensiddur/tests/exporter/test_latex.py | 452 +++++ .../tests/exporter/test_marker_reconstruct.py | 21 - opensiddur/tests/exporter/test_pdf.py | 812 +++----- .../tests/exporter/test_reledmac_xslt.py | 479 +++++ opensiddur/tests/exporter/test_xelatex.py | 1713 ----------------- scripts/tei-to-pdf.sh | 67 +- 14 files changed, 2869 insertions(+), 3308 deletions(-) create mode 100644 opensiddur/exporter/tex/latex.py create mode 100644 opensiddur/exporter/tex/reledmac.xslt delete mode 100644 opensiddur/exporter/tex/xelatex.py delete mode 100644 opensiddur/exporter/tex/xelatex.xslt create mode 100644 opensiddur/tests/exporter/test_latex.py create mode 100644 opensiddur/tests/exporter/test_reledmac_xslt.py delete mode 100644 opensiddur/tests/exporter/test_xelatex.py diff --git a/opensiddur/exporter/README.md b/opensiddur/exporter/README.md index e1a1021..c7636a0 100644 --- a/opensiddur/exporter/README.md +++ b/opensiddur/exporter/README.md @@ -1,13 +1,13 @@ # Open Siddur Exporter -The exporter takes data in JLPTEI files and converts it into directly +The exporter takes data in JLPTEI files and converts it into directly consumable formats, like PDF and HTML. The exporter operates in two stages: -1. **Compilation**: Given a starting file and a settings file, generate a compiled pseudo-TEI file that includes all of the data needed to convert into a final format in a linear form. The compilation step is common to all output formats. +1. 
**Compilation**: Given a starting file and a settings file, generate a compiled pseudo-TEI file that includes all of the data needed to convert into a final format in a linear form. The compilation step is common to all output formats. 2. **Output format**: Given the compiled file, output to the consumable format. The current output formats are: - 1. TeX typesetting system (XeLaTeX) - 2. PDF, via XeLaTeX + 1. TeX typesetting system (LuaLaTeX, via [`reledmac`](https://ctan.org/pkg/reledmac) + [`reledpar`](https://ctan.org/pkg/reledpar) for critical-edition apparatus and parallel-text alignment) + 2. PDF, via the same LuaLaTeX pipeline ## Run the compiler @@ -16,10 +16,17 @@ See ## Export to PDF -For TeX and PDF export, you will need an installation of XeLaTeX. See `install-tex.sh`. +For TeX and PDF export, you'll need a TeX Live install with the LuaLaTeX +pipeline (`lualatex`, `latexmk`, `biber`, `reledmac`, `reledpar`, `polyglossia`, +`biblatex`). On Debian/Ubuntu the installer script `install-tex.sh` covers it: -For round-trip command examples, -see `tei-to-pdf.sh`. +```bash +sudo bash opensiddur/exporter/tex/install-tex.sh +``` + +For round-trip command examples, see `scripts/tei-to-pdf.sh` — the same +`-s <settings.yaml>` flag drives both the compiler and the PDF stage, so any +typography settings in the YAML are forwarded to the LuaLaTeX preamble. ## Settings file @@ -58,8 +65,43 @@ annotations: - ... ``` -From which projects should notes (such as editorial notes or commentary) be derived? +From which projects should notes (such as editorial notes or commentary) be derived? Unlike instructions and transclusions, annotations are not in prioritized order; the annotations from all listed projects will be included when available.
+### Parallel texts + +```yaml +parallel: + projects: + - jps1917 + column_order: primary_first # or primary_last +``` + +When the compiler builds a document, it also looks up matching content in +each of the listed `parallel` projects (by `corresp` URN) and emits +`p:parallel`/`p:parallelItem` blocks. The PDF stage feeds those blocks into +`reledpar` so the verses on each side stay aligned across page breaks. + +`column_order: primary_first` puts the primary stream on the left page (or +left column for a `pairs` layout); `primary_last` swaps them. + +### Typography (PDF/TeX stage only) + +```yaml +typography: + hebrew_font: "Frank Ruehl CLM" # any installed OpenType font with Hebrew coverage + latin_font: "Linux Libertine O" # any installed OpenType font for the Latin stream + layout: pairs # "pages" → facing pages; "pairs" → two columns/page + paper: a4paper # a4paper | letterpaper | legalpaper | a5paper | b5paper | executivepaper + fontsize: 11pt # 10pt | 11pt | 12pt +``` + +The `typography` section is read by the PDF/TeX stage only; the linear-XML +compiler ignores it. Every key is optional — when the section (or any single +key) is omitted, the defaults shown above are used. Fonts that aren't found +on the system fall back to a sensible default automatically (`Ezra SIL` → +`SBL Hebrew` → `FreeSerif` for Hebrew). + ## Settings file versioning -Note that this file is likely to change slightly in format when parallel texts are introduced. +Note that this file is likely to change slightly in format as more output +formats are introduced. diff --git a/opensiddur/exporter/pdf/pdf.py b/opensiddur/exporter/pdf/pdf.py index ccfefe6..0a3482d 100755 --- a/opensiddur/exporter/pdf/pdf.py +++ b/opensiddur/exporter/pdf/pdf.py @@ -1,38 +1,60 @@ #!/usr/bin/env python3 """ -JLPTEI to PDF Exporter +JLPTEI to PDF Exporter (LuaLaTeX + reledmac/reledpar pipeline). -This script converts JLPTEI XML files to PDF format by first generating XeLaTeX, -then compiling it to PDF using XeLaTeX.
+Compilation strategy +==================== +The reledmac/reledpar packages produce auxiliary files (``..aux`` files, +``.lineenum``, etc.) that need multiple LaTeX passes to converge. ``latexmk`` +already understands these patterns and is the canonical tool for getting the +right number of passes in the right order, so we use it whenever it's +available:: + + latexmk -lualatex -interaction=nonstopmode .tex + +When ``latexmk`` is not installed we fall back to a manual loop that runs +``lualatex`` up to ``max_runs`` times, invoking ``bibtex`` once between passes +when the ``.aux`` indicates a bibliography is needed. """ -import sys import argparse +import shutil import subprocess +import sys import tempfile from pathlib import Path +from typing import Optional # Add the project root to the Python path project_root = Path(__file__).resolve().parent.parent.parent.parent sys.path.insert(0, str(project_root)) -from opensiddur.exporter.tex.xelatex import transform_xml_to_tex +from opensiddur.exporter.tex.latex import transform_xml_to_tex # noqa: E402 -def generate_tex(input_file, temp_tex_file): - """ - Generate TeX file from JLPTEI XML using the existing XeLaTeX exporter. - +def generate_tex( + input_file: Path, + temp_tex_file: Path, + settings_file: Optional[Path] = None, +) -> bool: + """Generate a LuaLaTeX file from compiled JLPTEI XML. + Args: - input_file (Path): Path to the input TEI XML file - temp_tex_file (Path): Path to the temporary TeX file to create - + input_file: Path to the compiled JLPTEI XML file. + temp_tex_file: Path to the temporary .tex file to create. + settings_file: Optional settings.yaml whose ``typography`` section + drives the LuaLaTeX preamble. + Returns: - bool: True if successful, False otherwise + True on success, False otherwise. 
""" try: - print(f"Generating TeX from {input_file}...", file=sys.stderr) - transform_xml_to_tex(str(input_file), output_file=str(temp_tex_file)) + print(f"Generating LuaLaTeX from {input_file}...", file=sys.stderr) + transform_xml_to_tex( + str(input_file), + output_file=str(temp_tex_file), + settings_file=settings_file, + ) print(f"TeX file generated: {temp_tex_file}", file=sys.stderr) return True except Exception as e: @@ -40,199 +62,301 @@ def generate_tex(input_file, temp_tex_file): return False -def compile_tex_to_pdf(tex_file, output_pdf, max_runs=7): +def _have_command(name: str) -> bool: + """Return True iff ``name`` resolves on $PATH.""" + return shutil.which(name) is not None + + +def _run_latexmk(tex_file: Path, output_dir: Path) -> bool: + """Drive the LaTeX build with latexmk -lualatex. + + latexmk reruns lualatex/biber as many times as needed for reledmac's + ``..aux`` files and the bibliography to converge. We disable + interaction so a malformed source can't hang the build. """ - Compile TeX file to PDF using XeLaTeX with bibliography support. - Runs xelatex -> bibtex -> xelatex until no more reruns are needed. - - Args: - tex_file (Path): Path to the TeX file - output_pdf (Path): Path to the output PDF file - max_runs (int): Maximum number of xelatex runs to prevent infinite loops - - Returns: - bool: True if successful, False otherwise + cmd = [ + "latexmk", + "-lualatex", + "-bibtex", + "-nobiber", + "-interaction=nonstopmode", + "-halt-on-error", + f"-output-directory={output_dir}", + str(tex_file), + ] + print(f"Running: {' '.join(cmd)}", file=sys.stderr) + result = subprocess.run(cmd, capture_output=True, text=True, cwd=tex_file.parent) + if result.returncode != 0: + print("latexmk reported errors:", file=sys.stderr) + print(result.stdout, file=sys.stderr) + print(result.stderr, file=sys.stderr) + # Don't bail unconditionally: we still want to keep going if the PDF + # was produced (over-strict reledpar warnings can mask successful runs). 
+ return False + return True + + +def _run_lualatex(tex_file: Path, output_dir: Path) -> tuple[bool, str, bool]: + """Run a single ``lualatex`` pass. + + Returns ``(succeeded, output, needs_rerun)``. ``succeeded`` reflects exit + code; ``needs_rerun`` is True when the log contains rerun indicators + (which reledmac/reledpar emit on every non-final pass). + """ + cmd = [ + "lualatex", + "-interaction=nonstopmode", + "-halt-on-error", + f"-output-directory={output_dir}", + str(tex_file), + ] + log_path = output_dir / f"{tex_file.stem}.log" + print(f"(LuaLaTeX log: {log_path})", file=sys.stderr) + + # Stream output live so long passes are observable. We later parse the .log + # file for rerun markers (more reliable than stdout). + result = subprocess.run(cmd, text=True, cwd=tex_file.parent) + output = "" + if log_path.exists(): + try: + output = log_path.read_text(encoding="utf-8", errors="ignore") + except Exception: + output = "" + needs_rerun = any( + marker in output + for marker in ( + "Rerun to get cross-references right", + "Rerun to get outlines right", + "There were undefined references", + "Label(s) may have changed", + "Rerun to get citations correct", + # reledmac/reledpar specific + "Reledmac will work only after", + "reledpar may not have created", + ) + ) + return result.returncode == 0, output, needs_rerun + + +def _run_bibtex(tex_stem: str, output_dir: Path) -> bool: + """Run bibtex if the .aux indicates a bibliography is needed.""" + aux = output_dir / f"{tex_stem}.aux" + if not aux.exists(): + return True + aux_content = aux.read_text(encoding="utf-8", errors="ignore") + if "\\bibdata" not in aux_content and "\\citation" not in aux_content: + return True + + cmd = ["bibtex", tex_stem] + result = subprocess.run(cmd, capture_output=True, text=True, cwd=output_dir) + if "error message" in (result.stdout or "").lower(): + print(f"BibTeX errors: {result.stdout}", file=sys.stderr) + return False + return True + + +def _run_manual_loop(tex_file: Path, 
output_dir: Path, max_runs: int) -> bool: + """Fallback build loop when latexmk isn't available. + + Runs lualatex once, then biber if a ``.bcf`` shows up, then keeps running + lualatex until the rerun markers stop appearing or ``max_runs`` is hit. + """ + tex_stem = tex_file.stem + + print("Running lualatex (pass 1)...", file=sys.stderr) + success, output, needs_rerun = _run_lualatex(tex_file, output_dir) + if not success: + print("lualatex reported errors (pass 1):", file=sys.stderr) + print(output, file=sys.stderr) + + # Run bibtex once after the first pass when needed; force a rerun afterwards + # because bibtex updates the .bbl that lualatex needs to read. + aux = output_dir / f"{tex_stem}.aux" + if aux.exists(): + aux_content = aux.read_text(encoding="utf-8", errors="ignore") + if "\\bibdata" in aux_content or "\\citation" in aux_content: + print("Running bibtex...", file=sys.stderr) + _run_bibtex(tex_stem, output_dir) + needs_rerun = True + + run_count = 1 + while needs_rerun and run_count < max_runs: + run_count += 1 + print(f"Running lualatex (pass {run_count})...", file=sys.stderr) + success, output, needs_rerun = _run_lualatex(tex_file, output_dir) + if not success: + print(f"lualatex reported errors (pass {run_count}):", file=sys.stderr) + print(output, file=sys.stderr) + break + + if run_count >= max_runs: + print( + f"Warning: reached max_runs ({max_runs}); reledmac/reledpar may not be settled", + file=sys.stderr, + ) + + print(f"Manual loop completed in {run_count} lualatex pass(es)", file=sys.stderr) + return True + + +def compile_tex_to_pdf( + tex_file: Path, + output_pdf: Path, + max_runs: int = 6, +) -> bool: + """Compile a LuaLaTeX .tex file to PDF. + + Uses ``latexmk -lualatex`` when available (recommended), otherwise falls + back to a manual lualatex/biber loop. Either way, the output PDF is + copied from a temp build directory to ``output_pdf``. + + ``max_runs`` only applies to the manual fallback; latexmk handles its + own loop. 
""" try: + if not _have_command("lualatex"): + print( + "Error: lualatex not found. Install texlive-luatex.", + file=sys.stderr, + ) + return False + print(f"Compiling {tex_file} to PDF...", file=sys.stderr) - - # Change to the directory containing the TeX file - tex_dir = tex_file.parent - tex_name = tex_file.stem - def run_xelatex(temp_dir): - """Run XeLaTeX and return (success, output, needs_rerun)""" - cmd = [ - 'xelatex', - '-interaction=nonstopmode', - '-output-directory', str(temp_dir), - str(tex_file) - ] - - result = subprocess.run(cmd, capture_output=True, text=True, cwd=tex_dir) - - if result.returncode != 0: - return False, result.stdout + result.stderr, False - - # Check if we need to rerun - output = result.stdout + result.stderr - # Look for common LaTeX rerun indicators - # Avoid false positives from biblatex's "Please rerun LaTeX" which appears even on final run - needs_rerun = any(pattern in output for pattern in [ - 'Rerun to get cross-references right', - 'Rerun to get outlines right', - 'There were undefined references', - 'Label(s) may have changed', - 'Rerun to get citations correct' - ]) - - return True, output, needs_rerun - - def run_bibtex(temp_dir): - """Run BibTeX and return success status""" - cmd = ['bibtex', str(tex_name)] - # Run in the xelatex output directory where the .aux file was written - result = subprocess.run(cmd, capture_output=True, text=True, cwd=temp_dir) - - # BibTeX may return non-zero even on success with warnings - # Check for actual errors in output - if 'error message' in result.stdout.lower(): - print(f"BibTeX errors: {result.stdout}", file=sys.stderr) + tex_stem = tex_file.stem + + with tempfile.TemporaryDirectory() as temp_dir_str: + temp_dir = Path(temp_dir_str) + # latexmk can attempt to invoke biber based on .bcf detection even when + # biblatex is configured for BibTeX. Since biber is frequently broken or + # unavailable on minimal systems, prefer a deterministic manual loop. 
+ if not _have_command("bibtex"): + print( + "Error: bibtex not found. Install texlive-bibtex-extra.", + file=sys.stderr, + ) return False - - return True - - with tempfile.TemporaryDirectory() as temp_dir: - # First XeLaTeX run - print("Running XeLaTeX (pass 1)...", file=sys.stderr) - success, output, needs_rerun = run_xelatex(temp_dir) - - if not success: - print(f"XeLaTeX reported errors (pass 1):", file=sys.stderr) - print(output, file=sys.stderr) - # Don't abort yet — check if a PDF was produced despite errors - - # Check if bibliography exists and run BibTeX - aux_file = Path(temp_dir) / f"{tex_name}.aux" - if aux_file.exists(): - aux_content = aux_file.read_text() - if '\\bibdata' in aux_content or '\\citation' in aux_content: - print("Running BibTeX...", file=sys.stderr) - if not run_bibtex(temp_dir): - print("Warning: BibTeX encountered errors", file=sys.stderr) - # After BibTeX, we definitely need to rerun XeLaTeX - needs_rerun = True - - # Continue running XeLaTeX until no more reruns are needed - run_count = 1 - while needs_rerun and run_count < max_runs: - run_count += 1 - print(f"Running XeLaTeX (pass {run_count})...", file=sys.stderr) - success, output, needs_rerun = run_xelatex(temp_dir) - - if not success: - print(f"XeLaTeX reported errors (pass {run_count}):", file=sys.stderr) - print(output, file=sys.stderr) - # Don't abort — check if a PDF was produced despite errors - break - - if run_count >= max_runs: - print(f"Warning: Reached maximum number of runs ({max_runs})", file=sys.stderr) - - # The PDF should be in the same directory as the TeX file - generated_pdf = Path(temp_dir) / f"{tex_name}.pdf" - + _run_manual_loop(tex_file, temp_dir, max_runs) + + generated_pdf = temp_dir / f"{tex_stem}.pdf" if not generated_pdf.exists(): print(f"PDF file not found: {generated_pdf}", file=sys.stderr) return False - - # Copy the generated PDF to the desired output location + if generated_pdf != output_pdf: - import shutil shutil.copy2(generated_pdf, 
output_pdf) print(f"PDF copied to: {output_pdf}", file=sys.stderr) else: print(f"PDF generated: {output_pdf}", file=sys.stderr) - - print(f"Compilation completed in {run_count} XeLaTeX pass(es)", file=sys.stderr) + return True - + except FileNotFoundError as e: - if 'xelatex' in str(e): - print("Error: XeLaTeX not found. Please install XeLaTeX.", file=sys.stderr) - elif 'bibtex' in str(e): - print("Error: BibTeX not found. Please install BibTeX.", file=sys.stderr) - else: - print(f"Error: Command not found: {e}", file=sys.stderr) + # Either lualatex/latexmk/biber went missing mid-build. + print(f"Error: command not found: {e}", file=sys.stderr) return False except Exception as e: print(f"Error compiling TeX to PDF: {e}", file=sys.stderr) return False -def export_to_pdf(input_file, output_pdf): - """ - Convert JLPTEI XML file to PDF. - +def export_to_pdf( + input_file: Path, + output_pdf: Path, + settings_file: Optional[Path] = None, + tex_output: Optional[Path] = None, +) -> bool: + """Convert a compiled JLPTEI XML file to PDF. + Args: - input_file (Path): Path to the input TEI XML file - output_pdf (Path): Path to the output PDF file - + input_file: Path to the input compiled JLPTEI XML file. + output_pdf: Path to the output PDF file. + settings_file: Optional settings.yaml whose ``typography`` section + drives the LuaLaTeX preamble. + Returns: - bool: True if successful, False otherwise + True on success, False otherwise. 
""" - # Validate input file exists if not input_file.exists(): print(f"Error: Input file '{input_file}' does not exist", file=sys.stderr) return False - - # Create temporary directory for intermediate files + with tempfile.TemporaryDirectory() as temp_dir: - temp_tex_file = Path(temp_dir) / f"output.tex" - - # Step 1: Generate TeX file - if not generate_tex(input_file, temp_tex_file): + temp_tex_file = tex_output or (Path(temp_dir) / "output.tex") + if tex_output is not None: + tex_output.parent.mkdir(parents=True, exist_ok=True) + + if not generate_tex(input_file, temp_tex_file, settings_file=settings_file): return False - - # Step 2: Compile TeX to PDF + if not compile_tex_to_pdf(temp_tex_file, output_pdf): return False - + print(f"Successfully generated PDF: {output_pdf}", file=sys.stderr) + if tex_output is not None: + print(f"Intermediate TeX saved to: {tex_output}", file=sys.stderr) return True def main(): # pragma: no cover - """Main function to handle command line arguments and run the PDF generation.""" + """Command-line entry point.""" parser = argparse.ArgumentParser( - description="Convert JLPTEI XML files to PDF format", + description="Convert compiled JLPTEI XML files to PDF (LuaLaTeX + reledmac)", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s input.xml output.pdf - """ + %(prog)s -s settings.yaml input.xml output.pdf + """, ) - + + parser.add_argument("input_file", type=Path, help="Path to the input compiled JLPTEI XML file") + parser.add_argument("output_pdf", type=Path, help="Path to the output PDF file") parser.add_argument( - 'input_file', + "-s", + "--settings", + dest="settings_file", type=Path, - help='Path to the input TEI XML file' + default=None, + help=( + "Path to a settings.yaml whose `typography` section drives " + "fonts, layout, paper, and font size. Defaults are used when omitted." 
+ ), + ) + parser.add_argument( + "--keep-tex", + action="store_true", + help=( + "Save the intermediate TeX file next to the output PDF " + "as .tex." + ), ) - parser.add_argument( - 'output_pdf', + "--tex-output", type=Path, - help='Path to the output PDF file' + default=None, + help="Path to write the intermediate TeX file (implies --keep-tex).", ) - args = parser.parse_args() - - # Run the PDF generation - success = export_to_pdf(args.input_file, args.output_pdf) - - if not success: + + if args.keep_tex and args.tex_output is not None: + print("Error: --keep-tex and --tex-output are mutually exclusive.", file=sys.stderr) + sys.exit(2) + + tex_output: Optional[Path] = None + if args.tex_output is not None: + tex_output = args.tex_output + elif args.keep_tex: + tex_output = args.output_pdf.with_suffix(".tex") + + if not export_to_pdf( + args.input_file, + args.output_pdf, + settings_file=args.settings_file, + tex_output=tex_output, + ): sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/opensiddur/exporter/settings.py b/opensiddur/exporter/settings.py index e68d140..bee652c 100644 --- a/opensiddur/exporter/settings.py +++ b/opensiddur/exporter/settings.py @@ -1,5 +1,6 @@ """ Exporter settings management utilities. """ +from enum import StrEnum from pathlib import Path from typing import Optional from pydantic import BaseModel, Field, field_validator @@ -38,10 +39,50 @@ class ParallelConfig(BaseModel): def validate_projects(cls, v: list[str]) -> list[str]: return _validate_project_list(v) + +class ParallelLayout(StrEnum): + """ Parallel-text page layout for the TeX/PDF stage. + + pages: facing pages (reledpar \\Pages) — best for full critical editions. + pairs: two columns on the same page (reledpar \\Columns) — best for short docs. + """ + PAGES = "pages" + PAIRS = "pairs" + + +class PaperType(StrEnum): + """LaTeX \\documentclass paper options. + + Keep this intentionally small and conventional; add more as needed. 
+ """ + + A4PAPER = "a4paper" + LETTERPAPER = "letterpaper" + LEGALPAPER = "legalpaper" + A5PAPER = "a5paper" + B5PAPER = "b5paper" + EXECUTIVEPAPER = "executivepaper" + + +class TypographyConfig(BaseModel): + """ Output-format settings consumed by the TeX/PDF stage only. + + These don't affect the linear-XML compiler; they're forwarded as XSLT + parameters to ``reledmac.xslt``. Defaults match what the in-house LuaLaTeX + setup expects on a typical Linux TeXLive install. + """ + hebrew_font: str = "Frank Ruehl CLM" + latin_font: str = "Linux Libertine O" + layout: ParallelLayout = ParallelLayout.PAIRS + paper: PaperType = PaperType.A4PAPER + fontsize: str = "11pt" + + class SettingsYaml(BaseModel): priority: Prioritizations annotations: list[str] = Field(default_factory=list) parallel: Optional[ParallelConfig] = None + typography: TypographyConfig = Field(default_factory=TypographyConfig) @field_validator("annotations") def validate_annotations(cls, v: list[str]) -> list[str]: diff --git a/opensiddur/exporter/tex/install-tex.sh b/opensiddur/exporter/tex/install-tex.sh index e27b81a..7878b4d 100644 --- a/opensiddur/exporter/tex/install-tex.sh +++ b/opensiddur/exporter/tex/install-tex.sh @@ -1 +1,25 @@ -apt-get install -y texlive-xetex texlive-lang-arabic +# LuaLaTeX-based PDF pipeline (reledmac/reledpar critical-edition typesetting). 
+# +# Required packages: +# - texlive-luatex: lualatex engine + lua* libraries +# - texlive-latex-extra: reledmac, reledpar, polyglossia helpers +# - texlive-bibtex-extra: biblatex with the biber backend +# - texlive-fonts-extra: fallback font shapes used by polyglossia +# - texlive-lang-other: Hebrew (and other RTL) language support +# - texlive-lang-european: Latin-script babel support +# - latexmk: drives multi-pass lualatex/biber loop +# - biber: biblatex's bibliography backend +apt-get update -y +apt-get install -y \ + texlive-luatex \ + texlive-latex-extra \ + texlive-bibtex-extra \ + texlive-fonts-extra \ + texlive-humanities \ + texlive-lang-other \ + texlive-lang-european \ + latexmk \ + biber + +# Refresh TeX filename database (usually handled by postinst, but cheap/safe). +mktexlsr diff --git a/opensiddur/exporter/tex/latex.py b/opensiddur/exporter/tex/latex.py new file mode 100644 index 0000000..2addc2a --- /dev/null +++ b/opensiddur/exporter/tex/latex.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python3 +""" +JLPTEI to LuaLaTeX exporter (reledmac/reledpar pipeline). + +This module is the Python driver for the reledmac.xslt stylesheet. It collects +license, credit, and source bibliographic metadata from all referenced source +files, then drives the XSLT transformation that produces a LuaLaTeX document +ready for ``latexmk -lualatex``. + +Typography settings (font, paper, layout, fontsize) are pulled from the same +``settings.yaml`` the compiler uses; only the ``typography`` section is read +here. When no settings file is supplied, sensible defaults from +``TypographyConfig`` are used. 
+""" + +import argparse +import os +import sys +from pathlib import Path +from typing import Optional + +from lxml import etree +from pydantic import BaseModel + +# Add the project root to the Python path +project_root = Path(__file__).resolve().parent.parent.parent.parent +projects_source_root = project_root / "project" +sys.path.insert(0, str(project_root)) + +from opensiddur.common.xslt import xslt_transform_string # noqa: E402 +from opensiddur.exporter.settings import TypographyConfig # noqa: E402 + +XSLT_FILE = Path(__file__).parent / "reledmac.xslt" + + +class LicenseRecord(BaseModel): + """Record of the license for a given file.""" + url: str # License URL is required + name: str + + +class CreditRecord(BaseModel): + """Record of the credit for a given file.""" + role: str # Role is required (e.g., "aut", "edt") + resp_text: str + ref: str # Reference URI is required + name_text: str + namespace: str # where the contributor did their work + contributor: str # contributor name at the source + + +def extract_licenses(xml_file_paths: list[Path]) -> dict[Path, LicenseRecord]: + """Extract license URLs and names from a list of JLPTEI XML files.""" + ns = {"tei": "http://www.tei-c.org/ns/1.0"} + + results: dict[Path, LicenseRecord] = {} + + for file_path in xml_file_paths: + try: + try: + relative_path = file_path.absolute().relative_to(projects_source_root) + except ValueError: + print( + f"Warning: {file_path} is not a subdirectory of {projects_source_root}", + file=sys.stderr, + ) + continue + tree = etree.parse(file_path) + root = tree.getroot() + for licence in root.findall(".//tei:licence", ns): + url = licence.attrib.get("target") + name = (licence.text or "").strip() + if url: + results[relative_path] = LicenseRecord(url=url, name=name) + else: + print( + f"Error: No license URL found for {relative_path}", + file=sys.stderr, + ) + except Exception as e: + print(f"Error: {file_path}: {e}", file=sys.stderr) + + return results + + +def 
group_licenses(licenses: dict[Path, LicenseRecord]) -> list[LicenseRecord]: + """Group licenses by URL (deduplicated).""" + seen: set[str] = set() + grouped: list[LicenseRecord] = [] + for license_record in licenses.values(): + if license_record.url not in seen: + seen.add(license_record.url) + grouped.append(license_record) + return grouped + + +def licenses_to_tex(licenses: list[LicenseRecord]) -> str: + """Convert a list of LicenseRecord objects into a LaTeX section.""" + items = "\n".join( + f"\\item {license.name} (\\url{{{license.url}}})" for license in licenses + ) + return ( + "\\chapter{Legal}\n" + "This document includes copyrighted texts licensed under the following licenses.\n" + "The full text of the licenses can be found at the given URLs:\n\n" + "\\begin{itemize}\n" + f"{items}\n" + "\\end{itemize}\n" + ) + + +def extract_credits(xml_file_paths: list[Path]) -> dict[Path, list[CreditRecord]]: + """Extract credits (respStmt entries) from a list of JLPTEI XML files.""" + ns = {"tei": "http://www.tei-c.org/ns/1.0"} + results: dict[Path, list[CreditRecord]] = {} + + for file_path in xml_file_paths: + credits: list[CreditRecord] = [] + try: + tree = etree.parse(file_path) + root = tree.getroot() + for resp_stmt in root.findall(".//tei:respStmt", ns): + resp = resp_stmt.find("tei:resp", ns) + name = resp_stmt.find("tei:name", ns) + + if resp is None or name is None: + continue + + role = resp.attrib.get("key") + ref = name.attrib.get("ref") + + if not role or not ref: + continue + + # Parse namespace and contributor from ref (urn:x-opensiddur:NAMESPACE/CONTRIBUTOR) + tail = ref.split(":")[-1] + if "/" not in tail: + continue + namespace, contributor = tail.split("/", 1) + + credits.append( + CreditRecord( + role=role, + resp_text=(resp.text or "").strip(), + ref=ref, + name_text=(name.text or "").strip(), + namespace=namespace, + contributor=contributor, + ) + ) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + results[file_path] = credits 
+ + return results + + +def group_credits( + credits: dict[Path, list[CreditRecord]], +) -> dict[str, dict[str, list[CreditRecord]]]: + """Group credits by role -> namespace -> [CreditRecord], deduplicated by (role, ref).""" + seen: set[tuple[str, str]] = set() + grouped: dict[str, dict[str, list[CreditRecord]]] = {} + for credit_list in credits.values(): + for credit in credit_list: + key = (credit.role, credit.ref) + if key in seen: + continue + seen.add(key) + grouped.setdefault(credit.role, {}).setdefault(credit.namespace, []).append(credit) + return grouped + + +contributor_keys_to_roles = { + "ann": "Annotator", + "aut": "Author", + "edt": "Editor", + "fac": "Facsimilist", + "fnd": "Funder", + "mrk": "Markup editor", + "pfr": "Proofreader", + "spn": "Sponsor", + "trl": "Translator", + "trc": "Transcriptionist", +} + + +def credits_to_tex(credits: dict[str, dict[str, list[CreditRecord]]]) -> str: + """Convert grouped credits into a LaTeX appendix section.""" + if not credits: + return "" + tex = "\\chapter{Contributor credits}\n" + for role, namespace_dict in credits.items(): + total = sum(len(c) for c in namespace_dict.values()) + role_name = contributor_keys_to_roles.get(role, role) + ("s" if total > 1 else "") + tex += f"\\section{{{role_name}}}\n" + for namespace, namespace_credits in namespace_dict.items(): + sorted_credits = sorted(namespace_credits, key=lambda x: x.contributor) + tex += f"\\subsection{{From {namespace}}}\n" + tex += "\\begin{itemize}\n" + for credit in sorted_credits: + tex += f"\\item {credit.name_text}\n" + tex += "\\end{itemize}\n" + return tex + + +def get_project_index(file_path: Path) -> Path: + """Get the project index file for a given file path.""" + return file_path.parent / "index.xml" + + +def extract_sources(xml_file_paths: list[Path]) -> tuple[str, str]: + """Extract bibliographic sources from index.xml files. + + Returns a (preamble_tex, postamble_tex) tuple. 
The preamble carries the + embedded ``filecontents*`` block + ``\\addbibresource``, the postamble + carries ``\\printbibliography``. Both are empty when there is no + ``listBibl`` content. + """ + index_files = set(get_project_index(fp) for fp in xml_file_paths) + bibtex_records: list[str] = [] + seen: set[str] = set() + for index_xml in index_files: + try: + index_xml_text = index_xml.read_text(encoding="utf-8") + bib_xslt_path = Path(__file__).parent / "bibtex.xslt" + bibtex_str = xslt_transform_string(bib_xslt_path, index_xml_text).strip() + if bibtex_str and bibtex_str not in seen: + seen.add(bibtex_str) + bibtex_records.append(bibtex_str) + except Exception as e: + print(f"Could not extract bibtex from {index_xml}: {e}", file=sys.stderr) + continue + + bibtex_blob = "\n\n".join(bibtex_records) + if not bibtex_blob: + return "", "" + + preamble_tex = ( + "\\begin{filecontents*}{job.bib}\n" + f"{bibtex_blob}\n" + "\\end{filecontents*}\n" + "\\addbibresource{job.bib}\n" + ) + postamble_tex = ( + "\n\\begingroup\n" + "\\renewcommand{\\refname}{Sources}\n" + "\\nocite{*}\n" + "\\printbibliography\n" + "\\endgroup\n" + ) + return preamble_tex, postamble_tex + + +def get_file_references( + input_file: Path, project_directory: Path = projects_source_root +) -> list[Path]: + """Get all source file references from a compiled JLPTEI XML file. + + Includes the file itself, all transcluded files, and the ``index.xml`` + of every referenced project. 
+ """ + ns = { + "tei": "http://www.tei-c.org/ns/1.0", + "p": "http://jewishliturgy.org/ns/processing", + } + tree = etree.parse(input_file) + root = tree.getroot() + elements_with_references = root.xpath( + "(self::*|.//*) [@p:project and @p:file_name]", namespaces=ns + ) + + p_project = "{http://jewishliturgy.org/ns/processing}project" + p_file_name = "{http://jewishliturgy.org/ns/processing}file_name" + + return list( + set( + [ + project_directory / element.attrib[p_project] / element.attrib[p_file_name] + for element in elements_with_references + ] + + [ + project_directory / element.attrib[p_project] / "index.xml" + for element in elements_with_references + ] + ) + ) + + +def load_typography(settings_file: Optional[Path]) -> TypographyConfig: + """Load only the ``typography`` section of a settings.yaml. + + Returns sensible defaults when the file is missing or has no typography + section. We deliberately validate only the typography section and not + the full SettingsYaml — the compiler stage already does that — so that + the PDF stage can run even when the settings file references projects + not present in this checkout. + """ + if settings_file is None: + return TypographyConfig() + try: + import yaml + + with open(settings_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + return TypographyConfig.model_validate(data.get("typography", {}) or {}) + except Exception as e: + print( + f"Warning: could not load typography from {settings_file}: {e}; " + "using defaults", + file=sys.stderr, + ) + return TypographyConfig() + + +def transform_xml_to_tex( + input_file, + xslt_file: Path = XSLT_FILE, + output_file: Optional[str] = None, + settings_file: Optional[Path] = None, + typography: Optional[TypographyConfig] = None, +) -> str: + """Transform a compiled JLPTEI XML file into a LuaLaTeX document. + + Args: + input_file: Path to the compiled JLPTEI XML file. + xslt_file: Path to ``reledmac.xslt`` (overridable for tests). 
+ output_file: If given, write to this path; otherwise return the string. + settings_file: Optional path to a settings.yaml to read typography from. + typography: Pre-loaded TypographyConfig (takes precedence over settings_file). + + Returns: + The transformed LaTeX content as a string. + """ + try: + with open(input_file, "r", encoding="utf-8") as input_fd: + input_xml = input_fd.read() + + file_references = get_file_references(input_file, projects_source_root) + + licenses = extract_licenses(file_references) + licenses_tex = licenses_to_tex(group_licenses(licenses)) + credits = extract_credits(file_references) + credits_tex = credits_to_tex(group_credits(credits)) + sources_preamble_tex, sources_postamble_tex = extract_sources(file_references) + + if typography is None: + typography = load_typography(settings_file) + + result = xslt_transform_string( + Path(xslt_file), + input_xml, + xslt_params={ + "additional-preamble": sources_preamble_tex, + "additional-postamble": ( + "\\part{Metadata}\n" + + licenses_tex + + "\n" + + credits_tex + + "\n" + + sources_postamble_tex + ), + "hebrew-font": typography.hebrew_font, + "latin-font": typography.latin_font, + "layout": typography.layout.value, + "paper": typography.paper.value, + "fontsize": typography.fontsize, + }, + ) + + if output_file: + with open(output_file, "w", encoding="utf-8") as output_fd: + output_fd.write(result) + print(f"LuaLaTeX output written to: {output_file}", file=sys.stderr) + else: + sys.stdout.write(result) + + return result + + except Exception as e: + print(f"Transformation error: {e}", file=sys.stderr) + sys.exit(1) + + +def main(): # pragma: no cover + """Command-line entry point.""" + parser = argparse.ArgumentParser( + description="Convert compiled JLPTEI XML files to LuaLaTeX (reledmac/reledpar)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s input.xml + %(prog)s input.xml -o output.tex + %(prog)s input.xml -s settings.yaml -o output.tex + 
""", + ) + + parser.add_argument("input_file", help="Path to the input compiled JLPTEI XML file") + parser.add_argument( + "-o", + "--output", + dest="output_file", + help="Path to the output .tex file (default: output to stdout)", + ) + parser.add_argument( + "-s", + "--settings", + dest="settings_file", + type=Path, + default=None, + help=( + "Path to a settings.yaml whose `typography` section drives " + "fonts, layout, paper, and font size. Defaults are used when omitted." + ), + ) + parser.add_argument( + "--xslt", + dest="xslt_file", + default=str(XSLT_FILE), + help="Path to the XSLT file (default: reledmac.xslt next to this script)", + ) + + args = parser.parse_args() + + if not os.path.exists(args.input_file): + print(f"Error: Input file '{args.input_file}' does not exist", file=sys.stderr) + sys.exit(1) + + if not os.path.exists(args.xslt_file): + print(f"Error: XSLT file '{args.xslt_file}' does not exist", file=sys.stderr) + sys.exit(1) + + transform_xml_to_tex( + args.input_file, + xslt_file=Path(args.xslt_file), + output_file=args.output_file, + settings_file=args.settings_file, + ) + + +if __name__ == "__main__": + main() diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt new file mode 100644 index 0000000..5041100 --- /dev/null +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -0,0 +1,804 @@ + + + + + + + + + + + + + + + Frank Ruehl CLM + Linux Libertine O + pages + a4paper + 11pt + + + + + + + + \documentclass[ + + , + + ]{book} + + \usepackage{geometry} + \usepackage{fontspec} + \usepackage{polyglossia} + \setdefaultlanguage{english} + \setotherlanguage{hebrew} + + + \IfFontExistsTF{}{ + \setmainfont{} + }{} + + + \IfFontExistsTF{}{ + \newfontfamily\hebrewfont[Renderer=HarfBuzz,Script=Hebrew]{ + } + }{ + \IfFontExistsTF{Ezra SIL}{ + \newfontfamily\hebrewfont[Renderer=HarfBuzz,Script=Hebrew]{Ezra SIL} + }{ + \IfFontExistsTF{SBL Hebrew}{ + \newfontfamily\hebrewfont[Renderer=HarfBuzz,Script=Hebrew]{SBL Hebrew} + }{ + 
\newfontfamily\hebrewfont[Script=Hebrew]{FreeSerif} + } + } + } + + + \usepackage{reledmac} + + \usepackage{reledpar} + + + \usepackage[backend=bibtex]{biblatex} + \usepackage{hyperref} + + + \newcommand{\vno}[1]{\textsuperscript{\begingroup\textdir TLT\selectlanguage{english}#1\endgroup}\,} + + + \newcommand{\instructionnote}[1]{\begingroup\bfseries #1\endgroup} + \newcommand{\notenote}[1]{\begingroup\bfseries #1\endgroup} + + + \renewcommand*{\linenumberstyle}[1]{\begingroup\textdir TLT\selectlanguage{english}#1\endgroup} + \makeatletter + \renewcommand*{\linenumrepR}[1]{\begingroup\textdir TLT\selectlanguage{english}\@arabic{#1}\endgroup} + \renewcommand*{\sublinenumrepR}[1]{\begingroup\textdir TLT\selectlanguage{english}\@arabic{#1}\endgroup} + \setRlineflag{\begingroup\textdir TLT R\endgroup} + \makeatother + + \setlength{\parindent}{0pt} + \setlength{\parskip}{0.5em} + + + + + \begin{document} + + + + + + + + + \end{document} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \begin{} + + \begin{Leftside} + + + + + + + \end{Leftside} + + \begin{Rightside} + + + + + + + \end{Rightside} + + \end{} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \begin{hebrew} + + + \beginnumbering + + + \pstart + + + + + + + \pend + + + + + + + + + + + + + + + \pstart + + \par \eledsection{ + + + + \begingroup\textdir TLT\selectlanguage{english} + + \endgroup + + + + + + }\par + + + + + + + \pend + + \eledsection{ + + + + \begingroup\textdir TLT\selectlanguage{english} + + \endgroup + + + + + + } + + + + + + + + + + + \pend + + \pstart \vno{ + + } + + + + + + + + \pstart + + \vno{ + + } + + + + + + + + + + \edtext{}{\Bfootnote{Parsha: + + }} + + + + + + + + \pend + + \begin{center}* * * *\end{center} + + + + + + + \pend + + \eledchapter{ + + } + + + + + + + \pend + + \eledsubsection{ + + } + + + + + + + + + 
+ \par + + + + + + + \par + + + + + + + \pend + + + + + + + + + + + + + + + + + + + \pend + + \pstart + + + + + + + + + \pstart + + + + + + + + + + \endnumbering + + + \end{hebrew} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \leavevmode\\ + + + + + + + + + + + \textsc{ + + } + + + + \textit{ + + } + + + + \textsuperscript{ + + } + + + + {\Large + + } + + + + {\small + + } + + + + {\raggedleft + + \par} + + + + \textbf{ + + } + + + + \emph{ + + } + + + + \texthebrew{ + + } + + + + \textit{ + + } + + + + \href{ + + }{ + + } + + + + + + \textit{ + + } ( + + ) + + + \textit{ + + } + + + + + + + + + + \instructionnote{ + + } + + + + + \edtext{}{\Bfootnote{\notenote{ + + }}} + + + + + + + + \begingroup\textdir TRT\selectlanguage{hebrew} + + \endgroup + + + + \begingroup\textdir TLT\selectlanguage{english} + + \endgroup + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/opensiddur/exporter/tex/xelatex.py b/opensiddur/exporter/tex/xelatex.py deleted file mode 100644 index 2f609b2..0000000 --- a/opensiddur/exporter/tex/xelatex.py +++ /dev/null @@ -1,405 +0,0 @@ -#!/usr/bin/env python3 -""" -JLPTEI to XeLaTeX Exporter - -This script converts JLPTEI XML files to XeLaTeX format using XSLT transformation. -""" - -import sys -import argparse -import os -from typing import Optional -from lxml import etree -from pathlib import Path - -from pydantic import BaseModel - -# Add the project root to the Python path -project_root = Path(__file__).resolve().parent.parent.parent.parent -projects_source_root = project_root / "project" -sys.path.insert(0, str(project_root)) - -from opensiddur.common.xslt import xslt_transform, xslt_transform_string - -XSLT_FILE = Path(__file__).parent / "xelatex.xslt" - -class LicenseRecord(BaseModel): - """ Record of the license for a given file. 
""" - url: str # License URL is required - name: str - -class CreditRecord(BaseModel): - """ Record of the credit for a given file. """ - role: str # Role is required (e.g., "aut", "edt") - resp_text: str - ref: str # Reference URI is required - name_text: str - namespace: str # where the contributor did their work - contributor: str # contirbutor name at the source - -def extract_licenses(xml_file_paths: list[Path]) -> dict[Path, LicenseRecord]: - """ - Extract license URLs and names from a list of JLPTEI XML files. - - Args: - xml_file_paths (list of Path): List of paths to JLPTEI XML files. - - Returns: - dict: Mapping from file path to a list of LicenseRecord objects. - """ - - ns = { - "tei": "http://www.tei-c.org/ns/1.0", - } - - results = {} - - for file_path in xml_file_paths: - try: - try: - relative_path = file_path.absolute().relative_to(projects_source_root) - except ValueError: - print(f"Warning: {file_path} is not a subdirectory of {projects_source_root}", file=sys.stderr) - continue - tree = etree.parse(file_path) - root = tree.getroot() - # Find all elements anywhere in the document - for licence in root.findall(".//tei:licence", ns): - # Try to get the target attribute (URL) - url = licence.attrib.get("target") - # The text content is the license name - name = (licence.text or "").strip() - if url: # License must have a URL - results[relative_path] = LicenseRecord(url=url, name=name) - else: - print(f"Error: No license URL found for {relative_path}", file=sys.stderr) - except Exception as e: - # If there's a parse error or file error, skip and continue - print(f"Error: {file_path}: {e}", file=sys.stderr) - pass - - return results - -def group_licenses(licenses: dict[Path, LicenseRecord]) -> list[LicenseRecord]: - """ - Group licenses by URL. 
- """ - license_urls = set() - grouped_licenses = [] - for path, license in licenses.items(): - if license.url not in license_urls: - license_urls.add(license.url) - grouped_licenses.append(license) - return grouped_licenses - -def licenses_to_tex(licenses: list[LicenseRecord]) -> str: - """ - Convert a list of LicenseRecord objects to a string of LaTeX code. - """ - tex = f"""\\chapter{{Legal}} -This document includes copyrighted texts licensed under the following licenses. -The full text of the licenses can be found at the given URLs: - -\\begin{{itemize}} -{'\n'.join([f"\\item {license.name} (\\url{{{license.url}}})" for license in licenses])} -\\end{{itemize}} - - """ - return tex - -def extract_credits(xml_file_paths: list[Path]) -> dict[Path, list[dict]]: - """ - Extract credits from a list of JLPTEI XML files. - - For each , extract: - - tei:resp/@key (as 'role') - - tei:name/@ref (as 'ref') - - tei:resp text (as 'resp_text') - - tei:name text (as 'name_text') - - Args: - xml_file_paths (list of Path): List of paths to JLPTEI XML files. - - Returns: - dict: Mapping from file path to a list of credit dicts. 
- """ - ns = { - "tei": "http://www.tei-c.org/ns/1.0", - } - results = {} - - for file_path in xml_file_paths: - credits = [] - try: - tree = etree.parse(file_path) - root = tree.getroot() - for resp_stmt in root.findall(".//tei:respStmt", ns): - resp = resp_stmt.find("tei:resp", ns) - name = resp_stmt.find("tei:name", ns) - - # Skip if required elements are missing - if resp is None or name is None: - continue - - role = resp.attrib.get("key") - ref = name.attrib.get("ref") - - # Skip if required attributes are missing - if not role or not ref: - continue - - credit = { - "role": role, - "resp_text": (resp.text or "").strip(), - "ref": ref, - "name_text": (name.text or "").strip(), - } - - # Parse namespace and contributor from ref - namespace, contributor = ref.split(":")[-1].split("/") - credit["namespace"] = namespace - credit["contributor"] = contributor - - credits.append(CreditRecord.model_validate(credit)) - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - pass - results[file_path] = credits - - return results - - -def group_credits(credits: dict[Path, list[dict]]) -> dict[str, dict[str, list[CreditRecord]]]: - """ - Group credits by role -> namespace -> contributor. - Each credit record will appear once per role. 
- """ - credit_refs = set() # avoid duplicates of (role, ref) - grouped_credits = {} - for _, credit_list in credits.items(): - for credit in credit_list: - if (credit.role, credit.ref) not in credit_refs: - role = credit.role - ref = credit.ref - namespace = credit.namespace - credit_refs.add((role, ref)) - if role not in grouped_credits: - grouped_credits[role] = {} - if namespace not in grouped_credits[role]: - grouped_credits[role][namespace] = [] - grouped_credits[role][namespace].append(credit) - - return grouped_credits - -contributor_keys_to_roles = { - "ann": "Annotator", - "aut": "Author", - "edt": "Editor", - "fac": "Facsimilist", - "fnd": "Funder", - "mrk": "Markup editor", - "pfr": "Proofreader", - "spn": "Sponsor", - "trl": "Translator", - "trc": "Transcriptionist", -} - -def credits_to_tex(credits: dict[str, dict[str, list[CreditRecord]]]) -> str: - """ - Convert role -> namespace -> list of CreditRecord objects to a string of LaTeX code. - """ - tex = f"""\\chapter{{Contributor credits}}\n""" - for role, namespace_dict in credits.items(): - total_credits = sum(len(credits) for credits in namespace_dict.values()) - role_name = contributor_keys_to_roles[role] + ("s" if total_credits > 1 else "") - tex += f"""\\section{{{role_name}}}\n""" - for namespace, credits in namespace_dict.items(): - sorted_credits = sorted(credits, key=lambda x: x.contributor) - tex += f"""\\subsection{{From {namespace}}}\n""" - tex += f"""\\begin{{itemize}}\n""" - for credit in sorted_credits: - tex += f"""\\item {credit.name_text}\n""" - tex += f"""\\end{{itemize}}\n""" - return tex - -def get_project_index(file_path: Path) -> Path: - """ - Get the project index file for a given file path. - """ - return file_path.parent / "index.xml" - -def extract_sources(xml_file_paths: list[Path]) -> tuple[str, str]: - """ - Extract sources from a list of JLPTEI XML files. 
- Returns: - tuple: (preamble_tex, postamble_tex) for the bibliography - """ - index_files = set(get_project_index(fp) for fp in xml_file_paths) - bibtex_records_all = [] - unique_bibtex_records = set() - for index_xml in index_files: - try: - # Convert the index xml to .bib using bibtex.xslt - index_xml_text = index_xml.read_text(encoding="utf-8") - bib_xslt_path = Path(__file__).parent / "bibtex.xslt" - bibtex_str = xslt_transform_string(bib_xslt_path, index_xml_text) - bibtex_str = bibtex_str.strip() - if bibtex_str and bibtex_str not in unique_bibtex_records: - unique_bibtex_records.add(bibtex_str) - bibtex_records_all.append(bibtex_str) - except Exception as e: - print(f"Could not extract bibtex from {index_xml}: {e}", file=sys.stderr) - continue - bibtex_blob = "\n\n".join(bibtex_records_all) - preamble_tex = "" - postamble_tex = "" - if bibtex_blob: - preamble_tex = f"""\\begin{{filecontents*}}{{job.bib}} -{bibtex_blob} -\\end{{filecontents*}} -\\addbibresource{{job.bib}} -""" - postamble_tex = f""" -\\begingroup -\\renewcommand{{\\refname}}{{Sources}} -\\nocite{{*}} -\\printbibliography -\\endgroup -""" - return preamble_tex, postamble_tex - - -def get_file_references(input_file: Path, project_directory: Path = projects_source_root) -> list[Path]: - """ - Get the file references from a compiled JLPTEI XML file. - The file references include - the file itself, all files that are transcluded by the file, and the index file for all projects. - Args: - input_file (Path): Path to the input compiled JLPTEI XML file. - project_directory (Path): Path to the project directory. - Returns: - list[Path]: List of paths to the file references. 
- """ - ns = { - "tei": "http://www.tei-c.org/ns/1.0", - "p": "http://jewishliturgy.org/ns/processing", - } - tree = etree.parse(input_file) - root = tree.getroot() - # Use xpath() instead of findall() for complex predicates - # Include root element (self) and all descendants - elements_with_references = root.xpath("(self::*|.//*) [@p:project and @p:file_name]", namespaces=ns) - - # Use Clark notation for attribute access - p_project = "{http://jewishliturgy.org/ns/processing}project" - p_file_name = "{http://jewishliturgy.org/ns/processing}file_name" - - return list(set([ - project_directory / element.attrib[p_project] / element.attrib[p_file_name] - for element in elements_with_references - ] + [ - project_directory / element.attrib[p_project] / "index.xml" - for element in elements_with_references - ])) - -def transform_xml_to_tex(input_file, xslt_file=XSLT_FILE, output_file=None): - """ - Transform a JLPTEI XML file to XeLaTeX using XSLT. - - Args: - input_file (str): Path to the input TEI XML file - xslt_file (str): Path to the XSLT transformation file - output_file (str, optional): Path to the output .tex file. If None, output to stdout. 
- - Returns: - str: The transformed XeLaTeX content - """ - try: - # Read the input XML - with open(input_file, 'r', encoding='utf-8') as input_fd: - input_xml = input_fd.read() - - root = etree.fromstring(input_xml.encode("utf-8")) - - file_references = get_file_references(input_file, projects_source_root) - - licenses = extract_licenses(file_references) - licenses_tex = licenses_to_tex(group_licenses(licenses)) - credits = extract_credits(file_references) - credits_tex = credits_to_tex(group_credits(credits)) - sources_preamble_tex, sources_postamble_tex = extract_sources(file_references) - # Use the string-based XSLT transformation function - result = xslt_transform_string(Path(xslt_file), input_xml, - xslt_params={ - "additional-preamble": sources_preamble_tex, - "additional-postamble": ( - "\\part{Metadata}\n" + - licenses_tex + "\n" + - credits_tex + "\n" + - sources_postamble_tex - ), - }) - - if output_file: - with open(output_file, 'w', encoding='utf-8') as output_fd: - output_fd.write(result) - print(f"XeLaTeX output written to: {output_file}", file=sys.stderr) - else: - sys.stdout.write(result) - - return result - - except Exception as e: - print(f"Transformation error: {e}", file=sys.stderr) - sys.exit(1) - - -def main(): # pragma: no cover - """Main function to handle command line arguments and run the transformation.""" - parser = argparse.ArgumentParser( - description="Convert JLPTEI XML files to XeLaTeX format", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - %(prog)s input.xml - %(prog)s input.xml -o output.tex - %(prog)s input.xml --output output.tex - """ - ) - - parser.add_argument( - 'input_file', - help='Path to the input TEI XML file' - ) - - parser.add_argument( - '-o', '--output', - dest='output_file', - help='Path to the output .tex file (default: output to stdout)' - ) - - parser.add_argument( - '--xslt', - dest='xslt_file', - default=os.path.join(os.path.dirname(__file__), 'xelatex.xslt'), - help='Path 
to the XSLT transformation file (default: xelatex.xslt in the same directory)' - ) - - args = parser.parse_args() - - # Validate input file exists - if not os.path.exists(args.input_file): - print(f"Error: Input file '{args.input_file}' does not exist", file=sys.stderr) - sys.exit(1) - - # Validate XSLT file exists - if not os.path.exists(args.xslt_file): - print(f"Error: XSLT file '{args.xslt_file}' does not exist", file=sys.stderr) - sys.exit(1) - - # Run the transformation - transform_xml_to_tex(args.input_file, args.xslt_file, args.output_file) - - -if __name__ == '__main__': - main() diff --git a/opensiddur/exporter/tex/xelatex.xslt b/opensiddur/exporter/tex/xelatex.xslt deleted file mode 100644 index fbe659c..0000000 --- a/opensiddur/exporter/tex/xelatex.xslt +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - \documentclass{book} - \usepackage{fontspec} - \usepackage{polyglossia} - \usepackage{hyperref} - - \usepackage{paracol} - \usepackage[backend=bibtex]{biblatex} - \setdefaultlanguage{english} - \setotherlanguage{hebrew} - - \IfFontExistsTF{Frank Ruehl CLM}{ - \newfontfamily\hebrewfont[Script=Hebrew]{Frank Ruehl CLM} - }{ - \IfFontExistsTF{Ezra SIL}{ - \newfontfamily\hebrewfont[Script=Hebrew]{Ezra SIL} - }{ - \IfFontExistsTF{SBL Hebrew}{ - \newfontfamily\hebrewfont[Script=Hebrew]{SBL Hebrew} - }{ - \newfontfamily\hebrewfont[Script=Hebrew]{FreeSerif} - } - } - } - \setlength{\parindent}{0pt} - \setlength{\parskip}{1em} - - - - - \begin{document} - - - - - - \begin{hebrew} - - - - \end{hebrew} - - - - - - - \end{document} - - - - - - - - - - - - - - - - - - - - - \begin{paracol}{2} - \begin{ - - } - - \end{ - - } - \switchcolumn - \begin{ - - } - - \end{ - - } - \end{paracol}\par\vspace{0.75em} - - - - - - - - - - - - - - - - - - - - - - - - \part{ - - } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \begin{verse} - - - - - - - - - - \end{verse} - - - - - \begin{verse} - - \end{verse} - - - - - - - - - - - \leavevmode\\[0pt] 
- - - - - - - \chapter{ - - } - - - \textsuperscript{ - - } - - - - - - - - - \textit{ - - } ( - - ) - - - \textit{ - - } - - - - - - - - - - \emph{ - - } - - - - - \href{ - - }{ - - } - - - - - - - - \textsc{ - - } - - - \textit{ - - } - - - \Large{ - - } - - - \small{ - - } - - - \textsuperscript{ - - } - - - \textsuperscript{ - - } - - - \begin{flushright} - - \end{flushright} - - - - - - - Unknown rend value: - - - - - - - \begin{center}* * * *\end{center} - - - - - - - - - - - - \textbf{ - - } - - - - - - - - - - - - - - - - - - - - - \texthebrew{ - - } - - - \textit{ - - } - - - - - - - - \footnote{ - - - - \texthebrew{ - - } - - - \textenglish{ - - } - - - } - - - - - - - - - - \leavevmode\\[0pt] - - - - - - - - - - - - - - - - - - - - - - diff --git a/opensiddur/tests/exporter/test_latex.py b/opensiddur/tests/exporter/test_latex.py new file mode 100644 index 0000000..20d5e5d --- /dev/null +++ b/opensiddur/tests/exporter/test_latex.py @@ -0,0 +1,452 @@ +"""Tests for the latex.py driver around the reledmac.xslt stylesheet. + +The driver is responsible for: + + - extracting license/credit/source metadata from referenced source files, + - loading the optional ``typography`` section of a settings.yaml, + - and feeding all of those into the XSLT as parameters. + +These tests cover those responsibilities. The actual XSLT output is tested +separately in ``test_reledmac_xslt.py``. 
+""" + +import tempfile +import unittest +from io import StringIO +from pathlib import Path +from unittest.mock import MagicMock, patch + +import opensiddur.exporter.tex.latex as latex_module +from opensiddur.exporter.settings import PaperType, ParallelLayout, TypographyConfig +from opensiddur.exporter.tex.latex import ( + CreditRecord, + LicenseRecord, + credits_to_tex, + extract_credits, + extract_licenses, + extract_sources, + get_file_references, + group_credits, + group_licenses, + licenses_to_tex, + load_typography, + transform_xml_to_tex, +) + + +class TestExtractLicenses(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def _create(self, project: str, filename: str, content: bytes) -> Path: + d = self.test_dir / project + d.mkdir(parents=True, exist_ok=True) + p = d / filename + p.write_bytes(content) + return p + + def test_extract_single_license(self): + xml = b""" + + + CC0 + + + """ + f = self._create("p", "a.xml", xml) + with patch.object(latex_module, "projects_source_root", self.test_dir): + result = extract_licenses([f]) + self.assertEqual(len(result), 1) + record = next(iter(result.values())) + self.assertEqual(record.url, "http://example.com/cc") + self.assertEqual(record.name, "CC0") + + def test_license_without_url_is_skipped(self): + xml = b""" + Unknown""" + f = self._create("p", "a.xml", xml) + with patch.object(latex_module, "projects_source_root", self.test_dir): + result = extract_licenses([f]) + self.assertEqual(len(result), 0) + + def test_invalid_xml_is_skipped(self): + f = self._create("p", "a.xml", b"not xml") + with patch.object(latex_module, "projects_source_root", self.test_dir): + result = extract_licenses([f]) + self.assertEqual(len(result), 0) + + +class TestGroupLicenses(unittest.TestCase): + + def test_dedupes_by_url(self): + records = { + Path("a"): LicenseRecord(url="http://x", name="X"), + 
Path("b"): LicenseRecord(url="http://x", name="X"), + Path("c"): LicenseRecord(url="http://y", name="Y"), + } + grouped = group_licenses(records) + self.assertEqual(len(grouped), 2) + self.assertEqual({lr.url for lr in grouped}, {"http://x", "http://y"}) + + +class TestLicensesToTex(unittest.TestCase): + + def test_emits_legal_chapter(self): + out = licenses_to_tex( + [LicenseRecord(url="http://creativecommons.org/cc", name="CC")] + ) + self.assertIn(r"\chapter{Legal}", out) + self.assertIn("CC", out) + self.assertIn(r"\url{http://creativecommons.org/cc}", out) + + +class TestExtractCredits(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def _create(self, project: str, filename: str, content: bytes) -> Path: + d = self.test_dir / project + d.mkdir(parents=True, exist_ok=True) + p = d / filename + p.write_bytes(content) + return p + + def test_extracts_resp_stmt(self): + xml = b""" + + Author + A B + + """ + f = self._create("p", "a.xml", xml) + result = extract_credits([f]) + credits = result[f] + self.assertEqual(len(credits), 1) + self.assertEqual(credits[0].role, "aut") + self.assertEqual(credits[0].namespace, "ns") + self.assertEqual(credits[0].contributor, "person") + + def test_skips_resp_without_required_attrs(self): + xml = b""" + Author + """ + f = self._create("p", "a.xml", xml) + result = extract_credits([f]) + self.assertEqual(result[f], []) + + +class TestGroupCredits(unittest.TestCase): + + def test_groups_by_role_and_namespace(self): + c = CreditRecord( + role="aut", + resp_text="Author", + ref="urn:x-opensiddur:ns/p1", + name_text="P1", + namespace="ns", + contributor="p1", + ) + grouped = group_credits({Path("a"): [c]}) + self.assertIn("aut", grouped) + self.assertIn("ns", grouped["aut"]) + self.assertEqual(len(grouped["aut"]["ns"]), 1) + + def test_dedupes_by_role_and_ref(self): + c = CreditRecord( + role="aut", + 
resp_text="Author", + ref="urn:x-opensiddur:ns/p1", + name_text="P1", + namespace="ns", + contributor="p1", + ) + grouped = group_credits({Path("a"): [c], Path("b"): [c]}) + self.assertEqual(len(grouped["aut"]["ns"]), 1) + + +class TestCreditsToTex(unittest.TestCase): + + def test_pluralizes_role_when_multiple_contributors(self): + c1 = CreditRecord( + role="aut", resp_text="Author", ref="urn:x:ns/a", + name_text="A", namespace="ns", contributor="a", + ) + c2 = CreditRecord( + role="aut", resp_text="Author", ref="urn:x:ns/b", + name_text="B", namespace="ns", contributor="b", + ) + out = credits_to_tex({"aut": {"ns": [c1, c2]}}) + self.assertIn(r"\section{Authors}", out) + + def test_emits_singular_when_one_contributor(self): + c1 = CreditRecord( + role="aut", resp_text="Author", ref="urn:x:ns/a", + name_text="A", namespace="ns", contributor="a", + ) + out = credits_to_tex({"aut": {"ns": [c1]}}) + self.assertIn(r"\section{Author}", out) + self.assertNotIn(r"\section{Authors}", out) + + +class TestExtractSources(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def _create(self, project: str, filename: str, content: bytes) -> Path: + d = self.test_dir / project + d.mkdir(parents=True, exist_ok=True) + p = d / filename + p.write_bytes(content) + return p + + def test_emits_filecontents_block_when_bibl_present(self): + index = b""" + + + TA + + """ + doc = self._create("p", "doc.xml", b"") + self._create("p", "index.xml", index) + preamble, postamble = extract_sources([doc]) + self.assertIn(r"\begin{filecontents*}{job.bib}", preamble) + self.assertIn(r"\addbibresource{job.bib}", preamble) + self.assertIn(r"\printbibliography", postamble) + + def test_returns_empty_strings_when_no_bibl(self): + doc = self._create("p", "doc.xml", b"") + self._create("p", "index.xml", b"") + preamble, postamble = extract_sources([doc]) + self.assertEqual(preamble, 
"") + self.assertEqual(postamble, "") + + def test_dedupes_when_multiple_files_share_index(self): + index = b""" + + + TA + + """ + f1 = self._create("p", "doc1.xml", b"") + f2 = self._create("p", "doc2.xml", b"") + self._create("p", "index.xml", index) + preamble, _ = extract_sources([f1, f2]) + self.assertEqual(preamble.count("@"), 1) + + +class TestGetFileReferences(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + self.project_dir = self.test_dir / "project" + self.project_dir.mkdir() + + def _create(self, filename: str, content: bytes) -> Path: + p = self.project_dir / filename + p.write_bytes(content) + return p + + def test_collects_main_and_index(self): + xml = b"""""" + f = self._create("main.xml", xml) + result = get_file_references(f, self.project_dir) + self.assertIn(self.project_dir / "proj" / "main.xml", result) + self.assertIn(self.project_dir / "proj" / "index.xml", result) + + def test_collects_transcluded_files(self): + xml = b""" + + """ + f = self._create("main.xml", xml) + result = get_file_references(f, self.project_dir) + self.assertIn(self.project_dir / "a" / "main.xml", result) + self.assertIn(self.project_dir / "b" / "x.xml", result) + self.assertIn(self.project_dir / "a" / "index.xml", result) + self.assertIn(self.project_dir / "b" / "index.xml", result) + + +class TestLoadTypography(unittest.TestCase): + """Loading the optional `typography` section of a settings.yaml. + + Defaults must apply when the file is missing, the section is missing, + or the file is malformed. The PDF stage must not depend on the full + SettingsYaml passing validation — the compiler stage already validates + the rest, and the PDF stage can run without project paths existing on + disk (e.g. against pre-compiled XML). 
+ """ + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def test_defaults_when_settings_file_is_none(self): + cfg = load_typography(None) + self.assertEqual(cfg, TypographyConfig()) + + def test_reads_typography_section(self): + settings_path = self.test_dir / "settings.yaml" + settings_path.write_text( + """ +priority: + transclusion: [p] + instructions: [] +typography: + hebrew_font: Ezra SIL + latin_font: TeX Gyre Pagella + layout: pairs + paper: letterpaper + fontsize: 12pt +""" + ) + cfg = load_typography(settings_path) + self.assertEqual(cfg.hebrew_font, "Ezra SIL") + self.assertEqual(cfg.latin_font, "TeX Gyre Pagella") + self.assertEqual(cfg.layout, ParallelLayout.PAIRS) + self.assertEqual(cfg.paper, PaperType.LETTERPAPER) + self.assertEqual(cfg.fontsize, "12pt") + + def test_defaults_when_typography_section_missing(self): + settings_path = self.test_dir / "settings.yaml" + settings_path.write_text( + """ +priority: + transclusion: [p] + instructions: [] +""" + ) + cfg = load_typography(settings_path) + self.assertEqual(cfg, TypographyConfig()) + + def test_returns_defaults_on_invalid_file(self): + f = self.test_dir / "broken.yaml" + f.write_text(":\n: not yaml") + cfg = load_typography(f) + self.assertEqual(cfg, TypographyConfig()) + + def test_settings_with_unknown_projects_does_not_block_typography(self): + """Project-list validation in the broader settings file (which the + compiler does) must not interfere with reading typography here.""" + settings_path = self.test_dir / "settings.yaml" + settings_path.write_text( + """ +priority: + transclusion: [a-project-that-does-not-exist] +typography: + hebrew_font: Some Font +""" + ) + cfg = load_typography(settings_path) + self.assertEqual(cfg.hebrew_font, "Some Font") + + +class TestTransformXmlToTex(unittest.TestCase): + """End-to-end driver test: confirms the typography parameters reach + the XSLT and 
that integration with license/credit/source extraction + works.""" + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def _create(self, project: str, filename: str, content: bytes) -> Path: + d = self.test_dir / project + d.mkdir(parents=True, exist_ok=True) + p = d / filename + p.write_bytes(content) + return p + + def test_basic_transform_produces_lualatex_document(self): + xml = b""" + + + Hello. + + """ + f = self._create("p", "input.xml", xml) + with patch.object(latex_module, "projects_source_root", self.test_dir): + out = transform_xml_to_tex(f) + + self.assertIn(r"\documentclass", out) + self.assertIn(r"\begin{document}", out) + self.assertIn(r"\end{document}", out) + self.assertIn(r"\usepackage{reledmac}", out) + # Hebrew font must be declared via fontspec for Hebrew script support. + self.assertIn(r"\newfontfamily\hebrewfont", out) + + def test_typography_object_is_threaded_into_preamble(self): + xml = b""" + + x + """ + f = self._create("p", "input.xml", xml) + typography = TypographyConfig( + hebrew_font="Ezra SIL", + latin_font="TeX Gyre Pagella", + layout=ParallelLayout.PAIRS, + paper="letterpaper", + fontsize="12pt", + ) + + with patch.object(latex_module, "projects_source_root", self.test_dir): + out = transform_xml_to_tex(f, typography=typography) + + self.assertIn(r"\documentclass[12pt,letterpaper]{book}", out) + self.assertIn("Ezra SIL", out) + self.assertIn("TeX Gyre Pagella", out) + + def test_layout_pairs_propagates_to_parallel_block(self): + xml = """ + + + + שלום + Hi + + + """.encode("utf-8") + f = self._create("p", "input.xml", xml) + typography = TypographyConfig(layout=ParallelLayout.PAIRS) + with patch.object(latex_module, "projects_source_root", self.test_dir): + out = transform_xml_to_tex(f, typography=typography) + self.assertIn(r"\begin{pairs}", out) + self.assertIn(r"\Columns", out) + + def 
test_integrates_licenses_into_postamble(self): + xml = b""" + + + + My License + + + x + """ + f = self._create("p", "input.xml", xml) + with patch.object(latex_module, "projects_source_root", self.test_dir): + out = transform_xml_to_tex(f) + self.assertIn(r"\chapter{Legal}", out) + self.assertIn("My License", out) + + +if __name__ == "__main__": + unittest.main() diff --git a/opensiddur/tests/exporter/test_marker_reconstruct.py b/opensiddur/tests/exporter/test_marker_reconstruct.py index dbdb90f..ea2e90d 100644 --- a/opensiddur/tests/exporter/test_marker_reconstruct.py +++ b/opensiddur/tests/exporter/test_marker_reconstruct.py @@ -5,7 +5,6 @@ from lxml import etree -from opensiddur.common.xslt import xslt_transform_string from opensiddur.exporter.external_compiler import PROCESSING_NAMESPACE, TEI_NS from opensiddur.exporter.marker_reconstruct import ( doc_needs_marker_reconstruction, @@ -14,7 +13,6 @@ substantive_content, ) from opensiddur.exporter import marker_reconstruct as mr -from opensiddur.exporter.tex.xelatex import XSLT_FILE P_NS = PROCESSING_NAMESPACE @@ -103,25 +101,6 @@ def test_empty_segment_dropped_and_relabeled(self): self.assertIsNone(ps[0].get(f"{{{P_NS}}}part")) self.assertIn("Only", "".join(ps[0].itertext())) - def test_xslt_parallel_row_after_reconstruct(self): - xml = f""" - - - שלום - Hello - - """ - root = etree.fromstring(xml.encode()) - reconstruct_markered_document(root) - out = xslt_transform_string( - XSLT_FILE, - etree.tostring(root, encoding="unicode"), - xslt_params={"additional-preamble": "", "additional-postamble": ""}, - ) - self.assertIn(r"\begin{paracol}{2}", out) - self.assertIn("שלום", out) - self.assertIn("Hello", out) - def test_substantive_content_milestone_tail(self): xml = f""" text diff --git a/opensiddur/tests/exporter/test_pdf.py b/opensiddur/tests/exporter/test_pdf.py index 3dea1b3..fc9ed7e 100644 --- a/opensiddur/tests/exporter/test_pdf.py +++ b/opensiddur/tests/exporter/test_pdf.py @@ -1,15 +1,15 @@ -"""Tests for 
the PDF exporter module.""" +"""Tests for the PDF exporter module (LuaLaTeX + reledmac/reledpar pipeline).""" -import unittest +import subprocess import tempfile +import unittest from pathlib import Path -from unittest.mock import patch, MagicMock, Mock -import subprocess +from unittest.mock import MagicMock, Mock, patch from opensiddur.exporter.pdf.pdf import ( - generate_tex, compile_tex_to_pdf, export_to_pdf, + generate_tex, ) @@ -17,640 +17,324 @@ class TestGenerateTex(unittest.TestCase): """Test the generate_tex function.""" def setUp(self): - """Set up test fixtures.""" self.temp_dir = tempfile.TemporaryDirectory() self.addCleanup(self.temp_dir.cleanup) self.test_dir = Path(self.temp_dir.name) def test_generate_tex_success(self): - """Test successful TeX generation.""" + """generate_tex calls transform_xml_to_tex and propagates success.""" input_file = self.test_dir / "input.xml" output_file = self.test_dir / "output.tex" - - # Create a minimal valid XML file - input_file.write_text('Test') - - with patch('opensiddur.exporter.pdf.pdf.transform_xml_to_tex') as mock_transform: - # transform_xml_to_tex writes to the file, so we need to mock it properly - def mock_transform_side_effect(input_file, output_file=None, **kwargs): - if output_file: - Path(output_file).write_text("\\documentclass{book}\n\\begin{document}Test\\end{document}") - return "\\documentclass{book}\n\\begin{document}Test\\end{document}" - - mock_transform.side_effect = mock_transform_side_effect + input_file.write_text("") + + def write_tex(path, output_file=None, **kwargs): + if output_file: + Path(output_file).write_text(r"\documentclass{book}") + return r"\documentclass{book}" + + with patch( + "opensiddur.exporter.pdf.pdf.transform_xml_to_tex", + side_effect=write_tex, + ) as mock_transform: result = generate_tex(input_file, output_file) - + self.assertTrue(result) self.assertTrue(output_file.exists()) - mock_transform.assert_called_once_with(str(input_file), output_file=str(output_file)) + 
mock_transform.assert_called_once_with( + str(input_file), + output_file=str(output_file), + settings_file=None, + ) + + def test_generate_tex_forwards_settings_file(self): + """A settings_file argument is forwarded to transform_xml_to_tex unchanged.""" + input_file = self.test_dir / "input.xml" + output_file = self.test_dir / "output.tex" + settings_file = self.test_dir / "settings.yaml" + input_file.write_text("") + + with patch("opensiddur.exporter.pdf.pdf.transform_xml_to_tex") as mock_transform: + generate_tex(input_file, output_file, settings_file=settings_file) + + mock_transform.assert_called_once_with( + str(input_file), + output_file=str(output_file), + settings_file=settings_file, + ) def test_generate_tex_handles_exception(self): - """Test that generate_tex handles exceptions gracefully.""" + """generate_tex returns False when transform_xml_to_tex raises.""" input_file = self.test_dir / "input.xml" output_file = self.test_dir / "output.tex" - - input_file.write_text('Test') - - with patch('opensiddur.exporter.pdf.pdf.transform_xml_to_tex') as mock_transform: - mock_transform.side_effect = Exception("Transformation failed") + input_file.write_text("") + + with patch( + "opensiddur.exporter.pdf.pdf.transform_xml_to_tex", + side_effect=RuntimeError("boom"), + ): result = generate_tex(input_file, output_file) - + self.assertFalse(result) self.assertFalse(output_file.exists()) -class TestCompileTexToPdf(unittest.TestCase): - """Test the compile_tex_to_pdf function.""" +class TestCompileTexToPdfLatexmk(unittest.TestCase): + """Tests for the latexmk-driven path of compile_tex_to_pdf. + + When ``latexmk`` is present on $PATH, compile_tex_to_pdf shells out once + to ``latexmk -lualatex`` and copies the resulting PDF out of the temp + build directory. 
+ """ def setUp(self): - """Set up test fixtures.""" self.temp_dir = tempfile.TemporaryDirectory() self.addCleanup(self.temp_dir.cleanup) self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "test.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}Test\end{document}") + self.output_pdf = self.test_dir / "out.pdf" + + def _have_command(self, name): + return name in {"lualatex", "latexmk", "bibtex"} + + def _latexmk_run(self, cmd, **kwargs): + """Simulate latexmk producing a PDF in -output-directory.""" + out_dir = next( + Path(arg.split("=", 1)[1]) + for arg in cmd + if arg.startswith("-output-directory=") + ) + (out_dir / f"{self.tex_file.stem}.pdf").write_bytes(b"%PDF-1.4 fake") + result = MagicMock() + result.returncode = 0 + result.stdout = "" + result.stderr = "" + return result + + def test_uses_latexmk_when_available(self): + """When tools resolve on $PATH, we still produce a PDF successfully.""" + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._have_command, + ): + with patch( + "subprocess.run", side_effect=self._latexmk_run + ) as mock_run: + result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - def test_compile_tex_to_pdf_success_no_bibtex(self): - """Test successful PDF compilation without bibliography.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - # Mock subprocess.run for xelatex - mock_result = Mock() - mock_result.returncode = 0 - mock_result.stdout = "This is XeTeX" - mock_result.stderr = "" - - with patch('subprocess.run', return_value=mock_result) as mock_run: - with patch('tempfile.TemporaryDirectory') as mock_temp: - # Create a real temp directory for the context manager - real_temp_dir = tempfile.mkdtemp() - self.addCleanup(lambda: __import__('shutil').rmtree(real_temp_dir, ignore_errors=True)) - temp_pdf = Path(real_temp_dir) / 
"test.pdf" - temp_pdf.write_bytes(b"fake pdf content") - - # Mock the context manager - mock_temp.return_value.__enter__ = Mock(return_value=real_temp_dir) - mock_temp.return_value.__exit__ = Mock(return_value=None) - - # Create a set of paths that should exist - paths_that_exist = {str(temp_pdf)} - - original_exists = Path.exists - def patched_exists(path_self): - if str(path_self) in paths_that_exist: - return True - return original_exists(path_self) - - with patch.object(Path, 'exists', side_effect=patched_exists): - with patch('shutil.copy2') as mock_copy: - result = compile_tex_to_pdf(tex_file, output_pdf) - - # Verify xelatex was called - this is the key behavior we're testing - xelatex_calls = [call for call in mock_run.call_args_list - if len(call[0]) > 0 and len(call[0][0]) > 0 and call[0][0][0] == 'xelatex'] - self.assertGreater(len(xelatex_calls), 0, "XeLaTeX should have been called") - # The result may be False if PDF doesn't exist, but the function should attempt compilation - self.assertIsInstance(result, bool) - - def test_compile_tex_to_pdf_with_bibtex(self): - """Test PDF compilation with bibliography.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - # Mock subprocess.run - mock_xelatex_result = Mock() - mock_xelatex_result.returncode = 0 - mock_xelatex_result.stdout = "This is XeTeX" - mock_xelatex_result.stderr = "" - - mock_bibtex_result = Mock() - mock_bibtex_result.returncode = 0 - mock_bibtex_result.stdout = "BibTeX output" - mock_bibtex_result.stderr = "" - - bibtex_called = [False] - def mock_run_side_effect(cmd, *args, **kwargs): - if cmd[0] == 'xelatex': - return mock_xelatex_result - elif cmd[0] == 'bibtex': - bibtex_called[0] = True - return mock_bibtex_result - return mock_xelatex_result - - # The test verifies that the function handles bibliography correctly - # We can't easily test the full flow due to 
tempfile complexity, but we verify - # that BibTeX would be called when aux file exists - with patch('subprocess.run', side_effect=mock_run_side_effect): - # Just verify the function doesn't crash - full integration is complex - # The actual behavior depends on aux file existence which is hard to mock - result = compile_tex_to_pdf(tex_file, output_pdf) - - # The function may succeed or fail depending on PDF existence, but shouldn't crash - self.assertIsInstance(result, bool) - - def test_compile_tex_to_pdf_xelatex_failure(self): - """Test PDF compilation when XeLaTeX fails.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - # Mock subprocess.run to return failure - mock_result = Mock() - mock_result.returncode = 1 - mock_result.stdout = "Error" - mock_result.stderr = "XeLaTeX compilation failed" - - with patch('subprocess.run', return_value=mock_result): - result = compile_tex_to_pdf(tex_file, output_pdf) - - self.assertFalse(result) - - def test_compile_tex_to_pdf_rerun_on_undefined_references(self): - """Test that XeLaTeX is rerun when undefined references are detected.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - # First run has undefined references, second run succeeds - run_count = [0] - - def mock_run_side_effect(cmd, *args, **kwargs): - run_count[0] += 1 - mock_result = Mock() - if cmd[0] == 'xelatex': - if run_count[0] == 1: - mock_result.returncode = 0 - mock_result.stdout = "There were undefined references" - mock_result.stderr = "" - else: - mock_result.returncode = 0 - mock_result.stdout = "Success" - mock_result.stderr = "" - else: - mock_result.returncode = 0 - mock_result.stdout = "" - mock_result.stderr = "" - return mock_result - - with patch('subprocess.run', side_effect=mock_run_side_effect): - 
with tempfile.TemporaryDirectory() as temp_dir: - temp_pdf = Path(temp_dir) / "test.pdf" - temp_pdf.write_bytes(b"fake pdf") - - original_exists = Path.exists - def patched_exists(path_instance): - if str(path_instance) == str(temp_pdf): - return True - return original_exists(path_instance) - - with patch.object(Path, 'exists', side_effect=patched_exists): - result = compile_tex_to_pdf(tex_file, output_pdf, max_runs=3) - - # Should have run XeLaTeX at least twice (but may fail if PDF doesn't exist) - # The test verifies the rerun logic is triggered - self.assertGreaterEqual(run_count[0], 1) - - def test_compile_tex_to_pdf_max_runs_reached(self): - """Test that compilation stops when max_runs is reached.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - run_count = [0] - - def mock_run_side_effect(cmd, *args, **kwargs): - run_count[0] += 1 - mock_result = Mock() - mock_result.returncode = 0 - mock_result.stdout = "Rerun to get cross-references right" # Always needs rerun - mock_result.stderr = "" - return mock_result - - with patch('subprocess.run', side_effect=mock_run_side_effect): - with tempfile.TemporaryDirectory() as temp_dir: - temp_pdf = Path(temp_dir) / "test.pdf" - temp_pdf.write_bytes(b"fake pdf") - - original_exists = Path.exists - def patched_exists(path_instance): - if str(path_instance) == str(temp_pdf): - return True - return original_exists(path_instance) - - with patch.object(Path, 'exists', side_effect=patched_exists): - result = compile_tex_to_pdf(tex_file, output_pdf, max_runs=3) - - # Should have run up to max_runs - self.assertLessEqual(run_count[0], 3) - - def test_compile_tex_to_pdf_pdf_not_found(self): - """Test that compilation fails when PDF is not generated.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - 
tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - mock_result = Mock() - mock_result.returncode = 0 - mock_result.stdout = "Success" - mock_result.stderr = "" - - with patch('subprocess.run', return_value=mock_result): - with tempfile.TemporaryDirectory() as temp_dir: - # Don't create PDF file - with patch.object(Path, 'exists', return_value=False): - result = compile_tex_to_pdf(tex_file, output_pdf) - - self.assertFalse(result) + self.assertTrue(result) + self.assertTrue(self.output_pdf.exists()) + cmds = [c.args[0][0] for c in mock_run.call_args_list] + self.assertIn("lualatex", cmds) + + def test_fails_when_lualatex_missing(self): + """Without lualatex on $PATH, compile_tex_to_pdf returns False up-front.""" + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + return_value=False, + ): + with patch("subprocess.run") as mock_run: + result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - def test_compile_tex_to_pdf_bibtex_error_handling(self): - """Test that BibTeX errors are handled gracefully.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - mock_xelatex_result = Mock() - mock_xelatex_result.returncode = 0 - mock_xelatex_result.stdout = "Success" - mock_xelatex_result.stderr = "" - - mock_bibtex_result = Mock() - mock_bibtex_result.returncode = 0 - mock_bibtex_result.stdout = "error message: Something went wrong" - mock_bibtex_result.stderr = "" - - def mock_run_side_effect(cmd, *args, **kwargs): - if cmd[0] == 'xelatex': - return mock_xelatex_result - elif cmd[0] == 'bibtex': - return mock_bibtex_result - return mock_xelatex_result - - with patch('subprocess.run', side_effect=mock_run_side_effect): - with tempfile.TemporaryDirectory() as temp_dir: - aux_file = Path(temp_dir) / "test.aux" - aux_file.write_text("\\bibdata{test}") - - original_exists = Path.exists - def 
patched_exists(path_instance): - if str(path_instance) == str(aux_file): - return True - return original_exists(path_instance) - - with patch.object(Path, 'exists', side_effect=patched_exists): - temp_pdf = Path(temp_dir) / "test.pdf" - temp_pdf.write_bytes(b"fake pdf") - - def patched_exists2(path_instance): - path_str = str(path_instance) - if path_str == str(temp_pdf): - return True - return patched_exists(path_instance) - - with patch.object(Path, 'exists', side_effect=patched_exists2): - with patch('pathlib.Path.read_text') as mock_read: - def read_text_side_effect(path_instance): - if str(path_instance) == str(aux_file): - return aux_file.read_text() - return "" - mock_read.side_effect = read_text_side_effect - result = compile_tex_to_pdf(tex_file, output_pdf) - - # Should continue despite BibTeX warning - # The function should still attempt to complete - - def test_compile_tex_to_pdf_file_not_found_error(self): - """Test handling of FileNotFoundError when command is missing.""" - tex_file = self.test_dir / "test.tex" - output_pdf = self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - with patch('subprocess.run', side_effect=FileNotFoundError("xelatex: command not found")): - result = compile_tex_to_pdf(tex_file, output_pdf) - self.assertFalse(result) + mock_run.assert_not_called() + + def test_returns_false_when_pdf_not_produced(self): + """A run that doesn't write a PDF is treated as a failed build.""" + def latexmk_no_pdf(cmd, **kwargs): + r = MagicMock() + r.returncode = 0 + r.stdout = "" + r.stderr = "" + return r + + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._have_command, + ): + with patch("subprocess.run", side_effect=latexmk_no_pdf): + result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - def test_compile_tex_to_pdf_bibtex_not_found(self): - """Test handling of BibTeX not found error.""" - tex_file = self.test_dir / "test.tex" - output_pdf = 
self.test_dir / "output.pdf" - - tex_file.write_text("\\documentclass{book}\\begin{document}Test\\end{document}") - - def mock_run_side_effect(cmd, *args, **kwargs): - if cmd[0] == 'xelatex': - mock_result = Mock() - mock_result.returncode = 0 - mock_result.stdout = "Success" - mock_result.stderr = "" - return mock_result - elif cmd[0] == 'bibtex': - raise FileNotFoundError("bibtex: command not found") - return Mock() - - with patch('subprocess.run', side_effect=mock_run_side_effect): - with tempfile.TemporaryDirectory() as temp_dir: - aux_file = Path(temp_dir) / "test.aux" - aux_file.write_text("\\bibdata{test}") - - original_exists = Path.exists - def patched_exists(path_instance): - if str(path_instance) == str(aux_file): - return True - return original_exists(path_instance) - - with patch.object(Path, 'exists', side_effect=patched_exists): - with patch('pathlib.Path.read_text') as mock_read: - def read_text_side_effect(path_instance): - if str(path_instance) == str(aux_file): - return aux_file.read_text() - return "" - mock_read.side_effect = read_text_side_effect - result = compile_tex_to_pdf(tex_file, output_pdf) - self.assertFalse(result) -class TestRunBibtexBehavior(unittest.TestCase): - """Tests for the run_bibtex() nested function invoked inside compile_tex_to_pdf. - - run_bibtex() is reached only when the xelatex .aux file contains \\bibdata or - \\citation. It calls ['bibtex', stem] with cwd set to xelatex's output directory - (where the .aux file was written), ignores the bibtex exit code, and checks - result.stdout case-insensitively for "error message" to decide success. 
- """ +class TestCompileTexToPdfManualLoop(unittest.TestCase): + """Tests for the manual fallback path (no latexmk on the system).""" def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() self.addCleanup(self.temp_dir.cleanup) - self.tex_dir = Path(self.temp_dir.name) - self.tex_file = self.tex_dir / "mydoc.tex" - self.tex_file.write_text(r"\documentclass{article}\begin{document}Hello\end{document}") - self.output_pdf = self.tex_dir / "output.pdf" + self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "test.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}Test\end{document}") + self.output_pdf = self.test_dir / "out.pdf" - def _make_run_side_effect(self, aux_content=None, bibtex_stdout="", bibtex_returncode=0): - """Return a subprocess.run side_effect that simulates xelatex and bibtex. + def _only_lualatex(self, name): + # Pretend lualatex is installed but latexmk is not. + return name in {"lualatex", "bibtex"} - xelatex writes aux_content (if given) and a fake PDF to its -output-directory. - bibtex returns the configured stdout and returncode. 
- """ - tex_stem = self.tex_file.stem + def _make_lualatex_run(self, aux_bib=False, rerun_passes=0): + """Return a subprocess.run side_effect that simulates lualatex (and bibtex).""" + call_count = [0] def side_effect(cmd, **kwargs): - result = MagicMock() - if cmd[0] == 'xelatex': - out_dir = Path(cmd[cmd.index('-output-directory') + 1]) - if aux_content is not None: - (out_dir / f"{tex_stem}.aux").write_text(aux_content) - (out_dir / f"{tex_stem}.pdf").write_bytes(b"%PDF-1.4 fake") - result.returncode = 0 - result.stdout = "" - result.stderr = "" - elif cmd[0] == 'bibtex': - result.returncode = bibtex_returncode - result.stdout = bibtex_stdout - result.stderr = "" - return result - - return side_effect - - def _bibtex_calls(self, mock_run): - return [c for c in mock_run.call_args_list if c.args[0][0] == 'bibtex'] - - def _xelatex_calls(self, mock_run): - return [c for c in mock_run.call_args_list if c.args[0][0] == 'xelatex'] - - # --- triggering conditions --- - - def test_bibtex_invoked_when_aux_contains_bibdata(self): - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{references}" - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - self.assertEqual(len(self._bibtex_calls(mock_run)), 1) - - def test_bibtex_invoked_when_aux_contains_citation(self): - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\citation{smith2020}" - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) + r = MagicMock() + r.returncode = 0 + r.stdout = "" + r.stderr = "" + if cmd[0] == "lualatex": + call_count[0] += 1 + out_dir = next( + Path(arg.split("=", 1)[1]) + for arg in cmd + if arg.startswith("-output-directory=") + ) + if call_count[0] == 1 and aux_bib: + (out_dir / f"{self.tex_file.stem}.aux").write_text("\\bibdata{job}\\n") + if call_count[0] <= rerun_passes: + r.stdout = "Rerun to get cross-references right" + (out_dir 
/ f"{self.tex_file.stem}.pdf").write_bytes(b"%PDF fake") + elif cmd[0] == "bibtex": + r.stdout = "" + return r - self.assertTrue(result) - self.assertEqual(len(self._bibtex_calls(mock_run)), 1) + return side_effect, call_count - def test_bibtex_not_invoked_when_aux_has_no_bibliography_markers(self): - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\relax" - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) + def test_manual_loop_runs_lualatex(self): + side_effect, call_count = self._make_lualatex_run() + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._only_lualatex, + ): + with patch("subprocess.run", side_effect=side_effect): + result = compile_tex_to_pdf(self.tex_file, self.output_pdf) self.assertTrue(result) - self.assertEqual(len(self._bibtex_calls(mock_run)), 0) - - def test_bibtex_not_invoked_when_no_aux_file_produced(self): - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=None - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertEqual(len(self._bibtex_calls(mock_run)), 0) - - # --- command construction --- - - def test_bibtex_called_with_stem_only_and_output_cwd(self): - """bibtex receives just the file stem (no extension) and runs in xelatex's output directory.""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}" - )) as mock_run: - compile_tex_to_pdf(self.tex_file, self.output_pdf) - - bibtex_calls = self._bibtex_calls(mock_run) - xelatex_calls = self._xelatex_calls(mock_run) - xelatex_out_dir = Path(xelatex_calls[0].args[0][xelatex_calls[0].args[0].index('-output-directory') + 1]) + self.assertGreaterEqual(call_count[0], 1) + + def test_manual_loop_invokes_bibtex_when_aux_indicates_bibliography(self): + """A first-pass .aux with \\bibdata triggers bibtex + at least one extra lualatex pass.""" + side_effect, _ = 
self._make_lualatex_run(aux_bib=True) + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._only_lualatex, + ): + with patch("subprocess.run", side_effect=side_effect) as mock_run: + compile_tex_to_pdf(self.tex_file, self.output_pdf) + + bibtex_calls = [c for c in mock_run.call_args_list if c.args[0][0] == "bibtex"] + lualatex_calls = [ + c for c in mock_run.call_args_list if c.args[0][0] == "lualatex" + ] self.assertEqual(len(bibtex_calls), 1) - self.assertEqual(bibtex_calls[0].args[0], ['bibtex', 'mydoc']) - self.assertEqual(bibtex_calls[0].kwargs['cwd'], str(xelatex_out_dir)) + # At least 2 lualatex passes: the initial one plus the post-bibtex rerun. + self.assertGreaterEqual(len(lualatex_calls), 2) - # --- success path --- + def test_manual_loop_caps_at_max_runs(self): + """The loop must not run lualatex more than max_runs times.""" + side_effect, call_count = self._make_lualatex_run(rerun_passes=99) + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._only_lualatex, + ): + with patch("subprocess.run", side_effect=side_effect): + compile_tex_to_pdf(self.tex_file, self.output_pdf, max_runs=3) - def test_bibtex_success_returns_true_and_overall_compile_succeeds(self): - """Clean bibtex output (no 'error message') → run_bibtex returns True.""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout="This is BibTeX, Version 0.99d (TeX Live 2023)" - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - self.assertEqual(len(self._bibtex_calls(mock_run)), 1) - - def test_bibtex_forces_xelatex_rerun_regardless_of_success(self): - """After bibtex, needs_rerun is always set to True, causing ≥2 xelatex passes.""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout="" # clean output, no rerun indicators from xelatex either - )) as 
mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - self.assertGreaterEqual(len(self._xelatex_calls(mock_run)), 2) - - def test_bibtex_nonzero_returncode_is_not_checked(self): - """BibTeX exit code is ignored; only stdout matters for error detection.""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout="Warnings, but no error message", - bibtex_returncode=2 - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - self.assertEqual(len(self._bibtex_calls(mock_run)), 1) - - def test_bibtex_error_only_in_stderr_is_not_detected(self): - """Errors appearing in bibtex stderr (not stdout) do not trigger the error path.""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout="", # stdout clean - bibtex_returncode=1 # stderr would have errors - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - - # --- error path --- - - def test_bibtex_error_message_in_stdout_is_nonfatal(self): - """'error message' in bibtex stdout → run_bibtex returns False, but - compile_tex_to_pdf still continues and succeeds (bibtex errors are warnings).""" - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout="I found no \\bibdata command---error message: missing entry" - )) as mock_run: - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertTrue(result) - # xelatex still reruns after bibtex even when bibtex "failed" - self.assertGreaterEqual(len(self._xelatex_calls(mock_run)), 2) - - def test_bibtex_error_message_check_is_case_insensitive(self): - """The 'error message' match uses .lower(), so any capitalisation triggers it.""" - for variant in ["error message: bad", "Error Message: bad", "ERROR MESSAGE: bad"]: - with 
self.subTest(stdout_variant=variant): - with patch('subprocess.run', side_effect=self._make_run_side_effect( - aux_content=r"\bibdata{refs}", - bibtex_stdout=variant - )): - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - # All capitalisation variants are treated identically (nonfatal) - self.assertTrue(result) - - def test_bibtex_file_not_found_returns_false(self): - """FileNotFoundError from bibtex (not installed) → compile_tex_to_pdf returns False.""" - def side_effect(cmd, **kwargs): - result = MagicMock() - if cmd[0] == 'xelatex': - out_dir = Path(cmd[cmd.index('-output-directory') + 1]) - (out_dir / f"{self.tex_file.stem}.aux").write_text(r"\bibdata{refs}") - (out_dir / f"{self.tex_file.stem}.pdf").write_bytes(b"%PDF fake") - result.returncode = 0 - result.stdout = result.stderr = "" - elif cmd[0] == 'bibtex': - raise FileNotFoundError("bibtex: No such file or directory") - return result - - with patch('subprocess.run', side_effect=side_effect): - result = compile_tex_to_pdf(self.tex_file, self.output_pdf) - - self.assertFalse(result) + self.assertLessEqual(call_count[0], 3) class TestExportToPdf(unittest.TestCase): """Test the export_to_pdf function.""" def setUp(self): - """Set up test fixtures.""" self.temp_dir = tempfile.TemporaryDirectory() self.addCleanup(self.temp_dir.cleanup) self.test_dir = Path(self.temp_dir.name) def test_export_to_pdf_success(self): - """Test successful PDF export.""" input_file = self.test_dir / "input.xml" - output_pdf = self.test_dir / "output.pdf" - - # Create minimal valid XML - input_file.write_text('Test') - - with patch('opensiddur.exporter.pdf.pdf.generate_tex', return_value=True) as mock_generate: - with patch('opensiddur.exporter.pdf.pdf.compile_tex_to_pdf', return_value=True) as mock_compile: + output_pdf = self.test_dir / "out.pdf" + input_file.write_text("") + + with patch( + "opensiddur.exporter.pdf.pdf.generate_tex", return_value=True + ) as mock_gen: + with patch( + 
"opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", return_value=True + ) as mock_compile: result = export_to_pdf(input_file, output_pdf) - + self.assertTrue(result) - mock_generate.assert_called_once() + mock_gen.assert_called_once() mock_compile.assert_called_once() + def test_export_to_pdf_forwards_settings_file_to_generate(self): + """settings_file is forwarded to generate_tex (the consumer of typography).""" + input_file = self.test_dir / "input.xml" + output_pdf = self.test_dir / "out.pdf" + settings_file = self.test_dir / "settings.yaml" + input_file.write_text("") + + with patch( + "opensiddur.exporter.pdf.pdf.generate_tex", return_value=True + ) as mock_gen: + with patch( + "opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", return_value=True + ): + export_to_pdf(input_file, output_pdf, settings_file=settings_file) + + kwargs = mock_gen.call_args.kwargs + self.assertEqual(kwargs.get("settings_file"), settings_file) + + def test_export_to_pdf_writes_intermediate_tex_when_requested(self): + """When tex_output is provided, generate_tex writes to that path and it is kept.""" + input_file = self.test_dir / "input.xml" + output_pdf = self.test_dir / "out.pdf" + tex_output = self.test_dir / "intermediate.tex" + input_file.write_text("") + + with patch("opensiddur.exporter.pdf.pdf.generate_tex", return_value=True) as mock_gen: + with patch("opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", return_value=True): + result = export_to_pdf(input_file, output_pdf, tex_output=tex_output) + + self.assertTrue(result) + self.assertEqual(mock_gen.call_args.args[1], tex_output) + def test_export_to_pdf_input_file_not_found(self): - """Test export when input file doesn't exist.""" - input_file = self.test_dir / "nonexistent.xml" - output_pdf = self.test_dir / "output.pdf" - + input_file = self.test_dir / "nope.xml" + output_pdf = self.test_dir / "out.pdf" + result = export_to_pdf(input_file, output_pdf) - + self.assertFalse(result) def test_export_to_pdf_generate_tex_failure(self): 
- """Test export when generate_tex fails.""" input_file = self.test_dir / "input.xml" - output_pdf = self.test_dir / "output.pdf" - - input_file.write_text('Test') - - with patch('opensiddur.exporter.pdf.pdf.generate_tex', return_value=False): + output_pdf = self.test_dir / "out.pdf" + input_file.write_text("") + + with patch( + "opensiddur.exporter.pdf.pdf.generate_tex", return_value=False + ): result = export_to_pdf(input_file, output_pdf) - + self.assertFalse(result) def test_export_to_pdf_compile_failure(self): - """Test export when compile_tex_to_pdf fails.""" input_file = self.test_dir / "input.xml" - output_pdf = self.test_dir / "output.pdf" - - input_file.write_text('Test') - - with patch('opensiddur.exporter.pdf.pdf.generate_tex', return_value=True): - with patch('opensiddur.exporter.pdf.pdf.compile_tex_to_pdf', return_value=False): + output_pdf = self.test_dir / "out.pdf" + input_file.write_text("") + + with patch( + "opensiddur.exporter.pdf.pdf.generate_tex", return_value=True + ): + with patch( + "opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", return_value=False + ): result = export_to_pdf(input_file, output_pdf) - - self.assertFalse(result) - def test_export_to_pdf_full_integration(self): - """Test full integration with mocked dependencies.""" - input_file = self.test_dir / "input.xml" - output_pdf = self.test_dir / "output.pdf" - - input_file.write_text('Test') - - # Mock the full pipeline - with patch('opensiddur.exporter.pdf.pdf.generate_tex', return_value=True): - with patch('opensiddur.exporter.pdf.pdf.compile_tex_to_pdf', return_value=True): - result = export_to_pdf(input_file, output_pdf) - - # Should succeed when both steps succeed - self.assertTrue(result) + self.assertFalse(result) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() - diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py new file mode 100644 index 0000000..53a5253 --- /dev/null +++ 
b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -0,0 +1,479 @@ +"""Tests for the reledmac/reledpar XSLT (`opensiddur/exporter/tex/reledmac.xslt`). + +These tests live one level above the LaTeX engine: they call the XSLT +transformation directly and assert structural properties of the emitted +``.tex`` text. The actual ``lualatex`` invocation is mocked everywhere it +might be triggered, since CI doesn't have a TeXLive install. + +Two invariants are critical for reledpar to align verses across page +breaks: + +1. Both streams of a ``p:parallel`` block must emit the **same number of** + ``\\pstart`` (and ``\\pend``) markers, in document order. +2. Each ``tei:milestone[@unit='verse']`` stays inline as a ``\\vno{n}`` marker + inside its block-level ``\\pstart``; verses do not open a ``\\pstart`` of their own. + +Editorial notes must come out as well-formed +``\\edtext{...}{\\Bfootnote{...}}`` constructs so reledmac places them in the apparatus, +not as floating ``\\footnote``s; instructional notes stay inline via ``\\instructionnote{...}``. +""" + +import re +import unittest +from pathlib import Path + +from lxml import etree + +from opensiddur.common.xslt import xslt_transform_string +from opensiddur.exporter.tex.latex import XSLT_FILE +from opensiddur.exporter.marker_reconstruct import reconstruct_markered_document + + +def _transform(xml: str, **params) -> str: + """Transform ``xml`` with the reledmac XSLT, supplying empty defaults + for the preamble/postamble parameters that the XSLT expects.""" + full_params = { + "additional-preamble": "", + "additional-postamble": "", + } + full_params.update(params) + return xslt_transform_string(XSLT_FILE, xml, xslt_params=full_params) + + +class TestPreamble(unittest.TestCase): + """The LuaLaTeX preamble must declare the engine, polyglossia, and + reledmac (plus reledpar when there's any parallel block).""" + + def test_preamble_loads_reledmac_and_polyglossia(self): + xml = """ + + Hi + """ + out = _transform(xml) + self.assertIn(r"\documentclass", out) + self.assertIn(r"\usepackage{polyglossia}", out) + 
self.assertIn(r"\usepackage{reledmac}", out) + # No parallel content → no reledpar package. + self.assertNotIn(r"\usepackage{reledpar}", out) + self.assertIn(r"\setotherlanguage{hebrew}", out) + + def test_preamble_loads_reledpar_when_parallel(self): + xml = """ + + + + שלום + Hello + + + """ + out = _transform(xml) + self.assertIn(r"\usepackage{reledpar}", out) + + def test_preamble_honors_typography_parameters(self): + xml = """ + + Hi + """ + out = _transform( + xml, + **{ + "hebrew-font": "Ezra SIL", + "latin-font": "TeX Gyre Pagella", + "paper": "letterpaper", + "fontsize": "12pt", + }, + ) + self.assertIn(r"\documentclass[12pt,letterpaper]{book}", out) + self.assertIn("Ezra SIL", out) + self.assertIn("TeX Gyre Pagella", out) + + +class TestSingleStreamMapping(unittest.TestCase): + """Single-language documents (no p:parallel) must still produce a valid + \\beginnumbering...\\endnumbering block. When there is no parallel alignment + requirement, verses should flow inline (paragraph-like), not one line per verse.""" + + XML = """ + + + + + In the beginning. + And the earth. + Let there be light. + + + """ + + def test_emits_single_numbering_block(self): + out = _transform(self.XML) + self.assertEqual(out.count(r"\beginnumbering"), 1) + self.assertEqual(out.count(r"\endnumbering"), 1) + + def test_verses_flow_inline_in_single_stream(self): + out = _transform(self.XML) + # The fixture has one tei:p containing 3 verse milestones, so we expect + # one paragraph-level \\pstart/\\pend pair (not 1 per verse). + self.assertEqual(out.count(r"\pstart"), 1) + self.assertEqual(out.count(r"\pend"), 1) + + def test_chapter_milestone_emits_eledsection(self): + out = _transform(self.XML) + # Chapter numbers are forced LTR to avoid digit reversal in RTL contexts. 
+ self.assertIn(r"\eledsection{\begingroup\textdir TLT\selectlanguage{english}1\endgroup}", out) + + def test_chapter_number_forces_ltr_digits_in_hebrew_context(self): + """Digits inside Hebrew RTL contexts can render reversed unless forced LTR.""" + xml = """ + + + + + טקסט + + + """ + out = _transform(xml) + self.assertIn(r"\eledsection{\begingroup\textdir TLT\selectlanguage{english}12\endgroup}", out) + + def test_verse_numbers_appear_as_superscripts(self): + out = _transform(self.XML) + # The \vno{} command renders as a superscript prefix. + self.assertIn(r"\vno{1}", out) + self.assertIn(r"\vno{2}", out) + self.assertIn(r"\vno{3}", out) + + +class TestParallelMapping(unittest.TestCase): + """Parallel blocks must produce two synchronized streams, both wrapped + in \\beginnumbering...\\endnumbering, surrounded by + \\begin{pages}/\\end{pages} (or \\begin{pairs}) and ended with the + matching reledpar typesetter command.""" + + XML = """ + + + + + + + בראשית. + והארץ. + ויאמר. + + + + + + In the beginning. + And the earth. + Let there be light. + + + + + """ + + def test_emits_pages_environment_by_default(self): + out = _transform(self.XML) + self.assertIn(r"\begin{pages}", out) + self.assertIn(r"\end{pages}", out) + self.assertIn(r"\Pages", out) + self.assertIn(r"\begin{Leftside}", out) + self.assertIn(r"\begin{Rightside}", out) + + def test_empty_parallel_block_is_skipped(self): + xml = """ + + + + + + + + """ + out = _transform(xml) + self.assertNotIn(r"\begin{pages}", out) + + def test_pairs_layout_uses_columns_typesetter(self): + out = _transform(self.XML, layout="pairs") + self.assertIn(r"\begin{pairs}", out) + self.assertIn(r"\Columns", out) + self.assertNotIn(r"\begin{pages}", out) + self.assertNotIn(r"\Pages", out) + + def test_each_side_has_its_own_numbering(self): + out = _transform(self.XML) + # One \beginnumbering per side, one \endnumbering per side. 
+ self.assertEqual(out.count(r"\beginnumbering"), 2) + self.assertEqual(out.count(r"\endnumbering"), 2) + + def test_pstart_counts_match_across_streams(self): + """The two streams must emit the SAME number of \\pstart markers, + else reledpar can't pair them by position.""" + out = _transform(self.XML) + # Block-level pstart/pend: one per side (one parallel block). + self.assertEqual(out.count(r"\pstart"), 2) + self.assertEqual(out.count(r"\pend"), 2) + + def test_pstart_pair_count_matches_verse_count_per_side(self): + """Within each side's numbering block, we use one block-level \\pstart, + while verse numbers remain inline via \\vno{n}.""" + out = _transform(self.XML) + left_match = re.search( + r"\\begin\{Leftside\}(.*?)\\end\{Leftside\}", out, re.DOTALL + ) + right_match = re.search( + r"\\begin\{Rightside\}(.*?)\\end\{Rightside\}", out, re.DOTALL + ) + self.assertIsNotNone(left_match) + self.assertIsNotNone(right_match) + self.assertEqual(left_match.group(1).count(r"\pstart"), 1) + self.assertEqual(right_match.group(1).count(r"\pstart"), 1) + for n in ("1", "2", "3"): + self.assertIn(rf"\vno{{{n}}}", left_match.group(1)) + self.assertIn(rf"\vno{{{n}}}", right_match.group(1)) + + def test_column_order_swaps_streams(self): + """primary_last puts the parallel (English) stream on the left.""" + xml = self.XML.replace('column-order="primary_first"', 'column-order="primary_last"') + out = _transform(xml) + left_match = re.search( + r"\\begin\{Leftside\}(.*?)\\end\{Leftside\}", out, re.DOTALL + ) + self.assertIsNotNone(left_match) + # Hebrew text should now be on the right, English on the left. 
+ self.assertIn("In the beginning", left_match.group(1)) + self.assertNotIn("בראשית", left_match.group(1)) + + def test_hebrew_stream_is_wrapped_in_polyglossia_block(self): + """Hebrew streams need to be inside a hebrew environment so direction + and font are picked up everywhere inside numbering.""" + out = _transform(self.XML) + # Look at the Leftside (which is Hebrew when column-order=primary_first + # and the primary lang=he). + left_match = re.search( + r"\\begin\{Leftside\}(.*?)\\end\{Leftside\}", out, re.DOTALL + ) + self.assertIsNotNone(left_match) + self.assertIn(r"\begin{hebrew}", left_match.group(1)) + self.assertIn(r"\end{hebrew}", left_match.group(1)) + + def test_parallel_row_after_marker_reconstruct(self): + """After marker reconstruction, the XSLT must still emit a pages-based + parallel wrapper with two numbering streams (one per side). This test + belongs with the XSLT structural invariants, not with reconstruction + mechanics. + """ + tei_ns = "http://www.tei-c.org/ns/1.0" + p_ns = "http://jewishliturgy.org/ns/processing" + xml = f""" + + + + שלום + Hello + + + """ + root = etree.fromstring(xml.encode("utf-8")) + reconstruct_markered_document(root) + out = xslt_transform_string( + XSLT_FILE, + etree.tostring(root, encoding="unicode"), + xslt_params={"additional-preamble": "", "additional-postamble": ""}, + ) + + self.assertIn(r"\begin{pages}", out) + self.assertIn(r"\Pages", out) + self.assertEqual(out.count(r"\beginnumbering"), 2) + self.assertEqual(out.count(r"\endnumbering"), 2) + self.assertIn("שלום", out) + self.assertIn("Hello", out) + + +class TestNotesMapping(unittest.TestCase): + """tei:note elements must become reledmac apparatus footnotes anchored + via \\edtext{...}{\\Bfootnote{...}}. 
Instructional notes are inline via + \\instructionnote{...}, not footnotes.""" + + def test_default_note_is_b_series_apparatus(self): + xml = """ + + + Bodycommentary + + """ + out = _transform(xml) + # \edtext{}{\Bfootnote{}} is the proper reledmac idiom for apparatus notes: + # zero-width lemma + B-series footnote at page bottom (not an endnote after \pend). + self.assertIn(r"\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn("commentary", out) + self.assertNotIn(r"\footnote{", out) + + def test_instruction_note_is_inline(self): + xml = """ + + + Bodystand + + """ + out = _transform(xml) + self.assertIn(r"\instructionnote{", out) + self.assertIn("stand", out) + + def test_standoff_note_appears_at_anchor_position(self): + """Notes stored in tei:standOff must be emitted as B-series apparatus + footnotes at the tei:anchor position in the body, not silently dropped.""" + xml = """ + + + + + Hebrew text more text + + + + English annotation + + """ + out = _transform(xml) + self.assertIn(r"\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn("English annotation", out) + # English note inside Hebrew stream must force LTR direction. + self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english}", out) + + def test_standoff_note_with_multiple_targets(self): + """A note targeting multiple anchors must appear at each anchor site.""" + xml = """ + + + + + Word1 Word2 + + + + Shared note + + """ + out = _transform(xml) + # The same note appears twice — once per anchor. + self.assertEqual(out.count("Shared note"), 2) + + def test_note_language_forces_direction(self): + """Notes must force their own direction based on the in-scope xml:lang. + + In practice an English note is wrapped in an explicit left-to-right group: + ``\\begingroup\\textdir TLT\\selectlanguage{english} ... \\endgroup``. 
+ """ + xml = """ + + + + עבריתEnglish note + Inline English instruction + + + """ + out = _transform(xml) + self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english} English note\endgroup", out) + self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english} Inline English instruction\endgroup", out) + + +class TestInlineFormatting(unittest.TestCase): + """Inline formatting elements that survived the compiler should map to + appropriate LaTeX commands while staying inside the verse's \\pstart.""" + + def test_small_caps(self): + xml = """ + + + The Lord said. + + """ + out = _transform(xml) + self.assertIn(r"\textsc{Lord}", out) + + def test_kri_ktiv_choice(self): + xml = """ + + + + + read + written + + + """ + out = _transform(xml) + self.assertIn(r"\textit{read}", out) + self.assertIn("(written)", out) + + def test_special_characters_are_tex_escaped(self): + """LaTeX-special characters in body text must be escaped to avoid + compilation failures in lualatex.""" + xml = """ + + + 50% of $5 & #1 + + """ + out = _transform(xml) + self.assertIn(r"50\% of \$5 \& \#1", out) + + def test_lb_emits_leavevmode_linebreak(self): + """tei:lb can appear at the start of a paragraph; we must ensure TeX is in + horizontal mode before emitting \\\\ to avoid 'There's no line here to end.'""" + xml = """ + + + Line 2 + + """ + out = _transform(xml) + self.assertIn(r"\leavevmode\\", out) + + +class TestStructuralElements(unittest.TestCase): + """tei:standOff and tei:pb should be skipped; head should produce a + sectioning command instead of inlining the title in the body.""" + + def test_standoff_notes_are_skipped(self): + xml = """ + + + Body + + Should not appear + + + """ + out = _transform(xml) + self.assertIn("Body", out) + self.assertNotIn("Should not appear", out) + + def test_div_head_emits_sectioning(self): + xml = """ + + + + Genesis + In the beginning. 
+ + + """ + out = _transform(xml) + # Top-level body div with head → \eledchapter + self.assertIn(r"\eledchapter{Genesis}", out) + + +if __name__ == "__main__": + unittest.main() diff --git a/opensiddur/tests/exporter/test_xelatex.py b/opensiddur/tests/exporter/test_xelatex.py deleted file mode 100644 index bbc4350..0000000 --- a/opensiddur/tests/exporter/test_xelatex.py +++ /dev/null @@ -1,1713 +0,0 @@ -#!/usr/bin/env python3 -""" -Tests for opensiddur.exporter.tex.xelatex module -""" - -import unittest -import tempfile -from pathlib import Path -from lxml import etree -from io import StringIO -from unittest.mock import patch, MagicMock -import sys - -from opensiddur.exporter.tex.xelatex import ( - extract_licenses, - group_licenses, - licenses_to_tex, - extract_credits, - group_credits, - credits_to_tex, - get_file_references, - extract_sources, - transform_xml_to_tex, - LicenseRecord, - CreditRecord, -) - - -class TestExtractLicenses(unittest.TestCase): - """Test license extraction from XML files.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) / "project" - self.test_dir.mkdir(parents=True) - - def _create_xml_file(self, filename: str, content: bytes) -> Path: - """Helper to create an XML file in a project subdirectory.""" - # Create in a test_project subdirectory to mimic real structure - project_dir = self.test_dir / "test_project" - project_dir.mkdir(parents=True, exist_ok=True) - file_path = project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_extract_single_license(self): - """Test extracting a single license from an XML file.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - - Creative Commons Attribution 4.0 - - - - -''' - - file_path = self._create_xml_file("test.xml", xml_content) - - # Patch 
projects_source_root to use our temp directory - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file_path]) - - self.assertEqual(len(result), 1) - license_record = list(result.values())[0] - self.assertIsInstance(license_record, LicenseRecord) - self.assertEqual(license_record.url, "http://creativecommons.org/licenses/by/4.0/") - self.assertEqual(license_record.name, "Creative Commons Attribution 4.0") - - def test_extract_multiple_licenses_from_multiple_files(self): - """Test extracting licenses from multiple XML files.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml1 = b''' - - - License 1 - - -''' - - xml2 = b''' - - - License 2 - - -''' - - file1 = self._create_xml_file("file1.xml", xml1) - file2 = self._create_xml_file("file2.xml", xml2) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file1, file2]) - - self.assertEqual(len(result), 2) - self.assertIn("License 1", [lic.name for lic in result.values()]) - self.assertIn("License 2", [lic.name for lic in result.values()]) - - def test_extract_license_with_no_url(self): - """Test that a license without URL is not extracted (URL is required).""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - Public Domain -''' - - file_path = self._create_xml_file("test.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file_path]) - - # License without URL should not be extracted - self.assertEqual(len(result), 0) - - def test_extract_license_with_no_text(self): - """Test extracting a license with only URL, no text.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - -''' - - file_path = self._create_xml_file("test.xml", xml_content) - - with 
patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file_path]) - - self.assertEqual(len(result), 1) - license_record = list(result.values())[0] - self.assertEqual(license_record.url, "http://example.com/license") - self.assertEqual(license_record.name, "") - - def test_extract_no_licenses(self): - """Test extracting from file with no licenses.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - Some text - -''' - - file_path = self._create_xml_file("test.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file_path]) - - self.assertEqual(len(result), 0) - - def test_extract_license_handles_invalid_xml(self): - """Test that invalid XML is handled gracefully.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - file_path = self._create_xml_file("invalid.xml", b"not valid xml") - - # Should not raise exception, just skip the file - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = extract_licenses([file_path]) - self.assertEqual(len(result), 0) - - -class TestGroupLicenses(unittest.TestCase): - """Test license grouping.""" - - def test_group_single_license(self): - """Test grouping a single license.""" - licenses = { - Path("file1.xml"): LicenseRecord(url="http://license.com", name="License 1") - } - - result = group_licenses(licenses) - - self.assertEqual(len(result), 1) - self.assertEqual(result[0].url, "http://license.com") - self.assertEqual(result[0].name, "License 1") - - def test_group_deduplicates_same_url(self): - """Test that licenses with same URL are deduplicated.""" - licenses = { - Path("file1.xml"): LicenseRecord(url="http://license.com", name="License 1"), - Path("file2.xml"): LicenseRecord(url="http://license.com", name="License 1"), - Path("file3.xml"): 
LicenseRecord(url="http://license.com", name="License 1"), - } - - result = group_licenses(licenses) - - # Should only have 1 unique license - self.assertEqual(len(result), 1) - self.assertEqual(result[0].url, "http://license.com") - - def test_group_different_licenses(self): - """Test grouping different licenses.""" - licenses = { - Path("file1.xml"): LicenseRecord(url="http://license1.com", name="License 1"), - Path("file2.xml"): LicenseRecord(url="http://license2.com", name="License 2"), - Path("file3.xml"): LicenseRecord(url="http://license3.com", name="License 3"), - } - - result = group_licenses(licenses) - - self.assertEqual(len(result), 3) - urls = [lic.url for lic in result] - self.assertIn("http://license1.com", urls) - self.assertIn("http://license2.com", urls) - self.assertIn("http://license3.com", urls) - - -class TestLicensesToTex(unittest.TestCase): - """Test LaTeX generation from licenses.""" - - def test_single_license_to_tex(self): - """Test converting a single license to LaTeX.""" - licenses = [ - LicenseRecord(url="http://creativecommons.org/licenses/by/4.0/", name="CC BY 4.0") - ] - - result = licenses_to_tex(licenses) - - self.assertIn(r'\chapter{Legal}', result) - self.assertIn('CC BY 4.0', result) - self.assertIn(r'\url{http://creativecommons.org/licenses/by/4.0/}', result) - self.assertIn(r'\begin{itemize}', result) - self.assertIn(r'\end{itemize}', result) - - def test_multiple_licenses_to_tex(self): - """Test converting multiple licenses to LaTeX.""" - licenses = [ - LicenseRecord(url="http://license1.com", name="License 1"), - LicenseRecord(url="http://license2.com", name="License 2"), - ] - - result = licenses_to_tex(licenses) - - self.assertIn('License 1', result) - self.assertIn('License 2', result) - self.assertIn(r'\url{http://license1.com}', result) - self.assertIn(r'\url{http://license2.com}', result) - # Should have \item for each license - self.assertEqual(result.count(r'\item'), 2) - - -class 
TestExtractCredits(unittest.TestCase): - """Test credit extraction from XML files.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) / "project" - self.test_dir.mkdir(parents=True) - - def _create_xml_file(self, filename: str, content: bytes) -> Path: - """Helper to create an XML file in a project subdirectory.""" - project_dir = self.test_dir / "test_project" - project_dir.mkdir(parents=True, exist_ok=True) - file_path = project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_extract_single_credit(self): - """Test extracting a single credit from an XML file.""" - xml_content = b''' - - - - - Transcriber - John Doe - - - - -''' - - file_path = self._create_xml_file("test.xml", xml_content) - result = extract_credits([file_path]) - - self.assertEqual(len(result), 1) - credits = result[file_path] - self.assertEqual(len(credits), 1) - self.assertIsInstance(credits[0], CreditRecord) - self.assertEqual(credits[0].role, "trc") - self.assertEqual(credits[0].resp_text, "Transcriber") - self.assertEqual(credits[0].name_text, "John Doe") - self.assertEqual(credits[0].namespace, "namespace") - self.assertEqual(credits[0].contributor, "contributor") - - def test_extract_multiple_credits(self): - """Test extracting multiple credits from a file.""" - xml_content = b''' - - - - Author - Author Name - - - Editor - Editor Name - - - -''' - - file_path = self._create_xml_file("test.xml", xml_content) - result = extract_credits([file_path]) - - credits = result[file_path] - self.assertEqual(len(credits), 2) - self.assertEqual(credits[0].role, "aut") - self.assertEqual(credits[1].role, "edt") - - def test_extract_credits_handles_missing_elements(self): - """Test that respStmt with missing name element is skipped (ref is required).""" - xml_content = b''' - - - Transcriber - - -''' - - file_path = 
self._create_xml_file("test.xml", xml_content) - result = extract_credits([file_path]) - - # Should have 0 credits because name (and therefore ref) is required - credits = result[file_path] - self.assertEqual(len(credits), 0) - - -class TestGroupCredits(unittest.TestCase): - """Test credit grouping.""" - - def test_group_single_credit(self): - """Test grouping a single credit.""" - credits = { - Path("file1.xml"): [ - CreditRecord( - role="aut", - resp_text="Author", - ref="urn:x-opensiddur:namespace/contributor", - name_text="John Doe", - namespace="namespace", - contributor="contributor" - ) - ] - } - - result = group_credits(credits) - - self.assertIn("aut", result) - self.assertIn("namespace", result["aut"]) - self.assertEqual(len(result["aut"]["namespace"]), 1) - self.assertEqual(result["aut"]["namespace"][0].name_text, "John Doe") - - def test_group_deduplicates_same_contributor(self): - """Test that same contributor with same role is deduplicated.""" - credit1 = CreditRecord( - role="aut", - resp_text="Author", - ref="urn:x-opensiddur:ns/person", - name_text="John Doe", - namespace="ns", - contributor="person" - ) - - credits = { - Path("file1.xml"): [credit1], - Path("file2.xml"): [credit1], # Same credit - } - - result = group_credits(credits) - - # Should only appear once - self.assertEqual(len(result["aut"]["ns"]), 1) - - def test_group_multiple_roles(self): - """Test grouping credits with different roles.""" - credits = { - Path("file1.xml"): [ - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns/p1", - name_text="Person 1", namespace="ns", contributor="p1"), - CreditRecord(role="edt", resp_text="Editor", ref="urn:x-opensiddur:ns/p2", - name_text="Person 2", namespace="ns", contributor="p2"), - ] - } - - result = group_credits(credits) - - self.assertIn("aut", result) - self.assertIn("edt", result) - self.assertEqual(len(result["aut"]["ns"]), 1) - self.assertEqual(len(result["edt"]["ns"]), 1) - - def 
test_group_multiple_namespaces(self): - """Test grouping credits from different namespaces.""" - credits = { - Path("file1.xml"): [ - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns1/p1", - name_text="Person 1", namespace="ns1", contributor="p1"), - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns2/p2", - name_text="Person 2", namespace="ns2", contributor="p2"), - ] - } - - result = group_credits(credits) - - self.assertIn("aut", result) - self.assertIn("ns1", result["aut"]) - self.assertIn("ns2", result["aut"]) - self.assertEqual(len(result["aut"]["ns1"]), 1) - self.assertEqual(len(result["aut"]["ns2"]), 1) - - -class TestCreditsToTex(unittest.TestCase): - """Test LaTeX generation from credits.""" - - def test_single_credit_to_tex(self): - """Test converting a single credit to LaTeX.""" - credits = { - "aut": { - "namespace": [ - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns/p", - name_text="John Doe", namespace="ns", contributor="p") - ] - } - } - - result = credits_to_tex(credits) - - self.assertIn(r'\chapter{Contributor credits}', result) - self.assertIn(r'\section{Author}', result) # singular - self.assertIn(r'\subsection{From namespace}', result) - self.assertIn('John Doe', result) - - def test_multiple_credits_plural(self): - """Test that role names are pluralized correctly.""" - credits = { - "aut": { - "namespace": [ - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns/p1", - name_text="Person 1", namespace="ns", contributor="p1"), - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns/p2", - name_text="Person 2", namespace="ns", contributor="p2"), - ] - } - } - - result = credits_to_tex(credits) - - self.assertIn(r'\section{Authors}', result) # plural - - def test_credits_sorted_by_contributor(self): - """Test that credits are sorted alphabetically by contributor.""" - credits = { - "aut": { - "namespace": [ - CreditRecord(role="aut", resp_text="Author", 
ref="urn:x-opensiddur:ns/zebra", - name_text="Zebra", namespace="ns", contributor="zebra"), - CreditRecord(role="aut", resp_text="Author", ref="urn:x-opensiddur:ns/apple", - name_text="Apple", namespace="ns", contributor="apple"), - ] - } - } - - result = credits_to_tex(credits) - - # Apple should come before Zebra - apple_pos = result.find("Apple") - zebra_pos = result.find("Zebra") - self.assertLess(apple_pos, zebra_pos) - - -class TestGetFileReferences(unittest.TestCase): - """Test file reference extraction from compiled XML.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) - self.project_dir = self.test_dir / "project" - self.project_dir.mkdir(parents=True) - - def _create_xml_file(self, filename: str, content: bytes) -> Path: - """Helper to create an XML file.""" - file_path = self.project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_get_file_references_no_transclusions(self): - """Test getting file references from XML with no transclusions.""" - xml_content = b''' - - Some text - -''' - - file_path = self._create_xml_file("main.xml", xml_content) - result = get_file_references(file_path, self.project_dir) - - # Should include the main file and its index - self.assertIn(self.project_dir / "test_project" / "main.xml", result) - self.assertIn(self.project_dir / "test_project" / "index.xml", result) - - def test_get_file_references_with_transclusion(self): - """Test getting file references from XML with transclusions.""" - xml_content = b''' - - - Transcluded content - - -''' - - file_path = self._create_xml_file("main.xml", xml_content) - result = get_file_references(file_path, self.project_dir) - - # Should include main file, external file, and both index files - self.assertIn(self.project_dir / "main_project" / "main.xml", result) - self.assertIn(self.project_dir / "main_project" / 
"index.xml", result) - self.assertIn(self.project_dir / "external_project" / "external.xml", result) - self.assertIn(self.project_dir / "external_project" / "index.xml", result) - - def test_get_file_references_deduplicates(self): - """Test that duplicate file references are deduplicated.""" - xml_content = b''' - - - - - -''' - - file_path = self._create_xml_file("main.xml", xml_content) - result = get_file_references(file_path, self.project_dir) - - # Count how many times project2/file.xml appears - file_count = sum(1 for p in result if str(p).endswith("project2/file.xml")) - self.assertEqual(file_count, 1, "Duplicate files should be deduplicated") - - def test_get_file_references_multiple_projects(self): - """Test getting references from multiple projects.""" - xml_content = b''' - - - - -''' - - file_path = self._create_xml_file("main.xml", xml_content) - result = get_file_references(file_path, self.project_dir) - - # Should include files from 3 projects - self.assertIn(self.project_dir / "project1" / "main.xml", result) - self.assertIn(self.project_dir / "project2" / "file2.xml", result) - self.assertIn(self.project_dir / "project3" / "file3.xml", result) - # And 3 index files - self.assertIn(self.project_dir / "project1" / "index.xml", result) - self.assertIn(self.project_dir / "project2" / "index.xml", result) - self.assertIn(self.project_dir / "project3" / "index.xml", result) - - def test_get_file_references_nested_transclusions(self): - """Test getting references with nested transclusions.""" - xml_content = b''' - - - - - -''' - - file_path = self._create_xml_file("main.xml", xml_content) - result = get_file_references(file_path, self.project_dir) - - # Should find all nested references - self.assertIn(self.project_dir / "project1" / "main.xml", result) - self.assertIn(self.project_dir / "project2" / "file2.xml", result) - self.assertIn(self.project_dir / "project3" / "file3.xml", result) - - -class TestExtractSources(unittest.TestCase): - """Test source 
extraction from index.xml files.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) / "project" - self.test_dir.mkdir(parents=True) - - def _create_xml_file(self, project: str, filename: str, content: bytes) -> Path: - """Helper to create an XML file in a project subdirectory.""" - project_dir = self.test_dir / project - project_dir.mkdir(parents=True, exist_ok=True) - file_path = project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_extract_sources_with_valid_index(self): - """Test extracting sources from a valid index.xml file with bibl elements.""" - index_content = b''' - - - - Test Book - Test Author - 2023 - - -''' - - # Create a file in project1 - file1 = self._create_xml_file("project1", "doc1.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", index_content) - - result = extract_sources([file1]) - - preamble, postamble = result - self.assertIn(r'\begin{filecontents*}{job.bib}', preamble) - self.assertIn(r'\addbibresource{job.bib}', preamble) - self.assertIn(r'\printbibliography', postamble) - self.assertIn(r'\renewcommand{\refname}{Sources}', postamble) - # Should contain BibTeX entry with the actual source information - self.assertIn('@', preamble) - self.assertIn('Test Book', preamble) - self.assertIn('Test Author', preamble) - self.assertIn('2023', preamble) - - def test_extract_sources_no_bibl_elements(self): - """Test handling of index.xml with no bibl elements.""" - index_content = b''' - - No bibliography -''' - - file1 = self._create_xml_file("project1", "doc1.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", index_content) - - preamble, postamble = extract_sources([file1]) - - # Should return empty strings when no bibliography - self.assertEqual(preamble, "") - self.assertEqual(postamble, "") - - def 
test_extract_sources_missing_index_file(self): - """Test handling of missing index.xml file (graceful skipping).""" - file1 = self._create_xml_file("project1", "doc1.xml", b"") - # Don't create index.xml - - # Should not raise exception - preamble, postamble = extract_sources([file1]) - - # Should return empty strings - self.assertEqual(preamble, "") - self.assertEqual(postamble, "") - - def test_extract_sources_invalid_xml(self): - """Test handling of invalid XML in index file.""" - file1 = self._create_xml_file("project1", "doc1.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", b"not valid xml <") - - # Should not raise exception, should skip gracefully - preamble, postamble = extract_sources([file1]) - - self.assertEqual(preamble, "") - self.assertEqual(postamble, "") - - def test_extract_sources_multiple_projects(self): - """Test extracting sources from multiple projects with index files.""" - index1_content = b''' - - - - Book 1 - Author 1 - - -''' - - index2_content = b''' - - - - Book 2 - Author 2 - - -''' - - file1 = self._create_xml_file("project1", "doc1.xml", b"") - file2 = self._create_xml_file("project2", "doc2.xml", b"") - index1 = self._create_xml_file("project1", "index.xml", index1_content) - index2 = self._create_xml_file("project2", "index.xml", index2_content) - - result = extract_sources([file1, file2]) - - preamble, postamble = result - # Should contain bibliography entries from both projects - self.assertIn(r'\begin{filecontents*}{job.bib}', preamble) - # Should contain entries from both index files - self.assertIn('Book 1', preamble) - self.assertIn('Author 1', preamble) - self.assertIn('Book 2', preamble) - self.assertIn('Author 2', preamble) - # Should have exactly 2 BibTeX entries (one from each project) - bibtex_count = preamble.count('@') - self.assertEqual(bibtex_count, 2, - f"Expected exactly 2 BibTeX entries (one per project), but found {bibtex_count}") - - def 
test_extract_sources_deduplicates_index_files(self): - """Test that same index.xml is only processed once.""" - index_content = b''' - - - - Test Book - Test Author - - -''' - - # Multiple files from same project should reference same index - file1 = self._create_xml_file("project1", "doc1.xml", b"") - file2 = self._create_xml_file("project1", "doc2.xml", b"") - file3 = self._create_xml_file("project1", "doc3.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", index_content) - - result = extract_sources([file1, file2, file3]) - - preamble, postamble = result - # Should have bibliography - self.assertIn(r'\begin{filecontents*}{job.bib}', preamble) - # BibTeX should appear exactly once (deduplicated by set) - # Count '@' symbols which appear at the start of each BibTeX entry - bibtex_count = preamble.count('@') - # Should have exactly one entry since all files reference the same index - self.assertEqual(bibtex_count, 1, - f"Expected exactly 1 BibTeX entry, but found {bibtex_count}") - - -class TestTransformXmlToTex(unittest.TestCase): - """Test the main transform_xml_to_tex function.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) / "project" - self.test_dir.mkdir(parents=True) - - def _create_xml_file(self, project: str, filename: str, content: bytes) -> Path: - """Helper to create an XML file in a project subdirectory.""" - project_dir = self.test_dir / project - project_dir.mkdir(parents=True, exist_ok=True) - file_path = project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_transform_xml_to_tex_basic(self): - """Test basic XML to LaTeX transformation.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Hello World - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - 
- with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should produce LaTeX output - self.assertIsInstance(result, str) - self.assertIn(r'\documentclass{book}', result) - self.assertIn(r'\begin{document}', result) - self.assertIn('Hello World', result) - self.assertIn(r'\end{document}', result) - # Hebrew font should be selected robustly (fonts vary per system) - self.assertIn(r'\IfFontExistsTF{Frank Ruehl CLM}', result) - self.assertIn(r'\newfontfamily\hebrewfont', result) - self.assertIn(r'FreeSerif', result) - - def test_transform_xml_to_tex_parallel_uses_paracol(self): - """Parallel rendering must be page-breakable (paracol, not minipage).""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = ''' - - - - - AB - אב - - - -'''.encode("utf-8") - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - self.assertIn(r'\usepackage{paracol}', result) - self.assertIn(r'\begin{paracol}{2}', result) - self.assertNotIn(r'\begin{minipage}', result) - # Line breaks must be safe; \\[0pt] prevents next line '[' from being parsed as \\[dimen]. 
- self.assertIn(r'\leavevmode\\[0pt]', result) - - def test_transform_xml_to_tex_parallel_skips_outer_hebrew_when_root_is_he(self): - """Hebrew root + p:parallel must not wrap the whole body in \\begin{hebrew} (breaks EN column).""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = ''' - - - - - שלום - Hi - - - -'''.encode("utf-8") - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Outer wrapper: document should go straight into parallel (paracol), not global hebrew. - self.assertRegex(result, r"\\begin\{document\}\s*\\begin\{paracol\}") - self.assertNotRegex(result, r"\\begin\{document\}\s*\\begin\{hebrew\}") - - def test_transform_xml_to_tex_ignores_standoff_notes(self): - """Standoff notes should not render as free-floating footnotes.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Body text - - This should not appear - - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - self.assertIn("Body text", result) - self.assertNotIn("This should not appear", result) - - def test_transform_xml_to_tex_wraps_inherited_hebrew_lang(self): - """Hebrew may be inherited from ancestors; output must enter Hebrew context.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - # Only the root carries xml:lang; descendants inherit it. 
- xml_content = ''' - - - - בְּרֵאשִׁית - - -'''.encode("utf-8") - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # We should enter a Hebrew context somewhere above the Hebrew text. - self.assertIn(r'\begin{hebrew}', result) - self.assertIn('בְּרֵאשִׁית', result) - - def test_transform_xml_to_tex_with_output_file(self): - """Test transformation with output file specified.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Test content - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - output_file = Path(self.temp_dir.name) / "output.tex" - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - with patch('sys.stdout'): - transform_xml_to_tex(input_file, output_file=str(output_file)) - - # Check that output file was created - self.assertTrue(output_file.exists()) - content = output_file.read_text(encoding='utf-8') - self.assertIn(r'\documentclass{book}', content) - self.assertIn('Test content', content) - - def test_transform_xml_to_tex_integrates_licenses(self): - """Test that transform integrates license extraction.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - - - Test License - - - - - - - Content - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should include license section in postamble - self.assertIn(r'\chapter{Legal}', result) - self.assertIn('Test License', result) - - def test_transform_xml_to_tex_integrates_credits(self): - """Test that transform integrates credit extraction.""" - from unittest.mock import patch - import 
opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - - - Author - Author Name - - - - - - - Content - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should include credits section in postamble - self.assertIn(r'\chapter{Contributor credits}', result) - self.assertIn('Author Name', result) - - def test_transform_xml_to_tex_integrates_sources(self): - """Test that transform integrates source extraction.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Content - - -''' - - index_content = b''' - - - - Source Book - Source Author - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - index_file = self._create_xml_file("project1", "index.xml", index_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should include bibliography in preamble and postamble - self.assertIn(r'\addbibresource{job.bib}', result) - self.assertIn(r'\printbibliography', result) - - def test_transform_xml_to_tex_handles_invalid_xml(self): - """Test error handling for invalid XML.""" - from unittest.mock import patch, Mock - import opensiddur.exporter.tex.xelatex as xelatex_module - - input_file = self._create_xml_file("project1", "invalid.xml", b"not valid xml <") - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - mock_exit = Mock() - with patch('sys.exit', mock_exit): - # The function should catch the exception and call sys.exit(1) - transform_xml_to_tex(input_file) - # Verify that sys.exit was called with exit code 1 - mock_exit.assert_called_once_with(1) - - def test_transform_xml_to_tex_with_stdout(self): - """Test transformation output to stdout when output_file is None.""" - from 
unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Content - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - mock_stdout = StringIO() - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - with patch('sys.stdout', mock_stdout): - transform_xml_to_tex(input_file, output_file=None) - output = mock_stdout.getvalue() - - # Should have written to stdout - self.assertIn(r'\documentclass{book}', output) - self.assertIn('Content', output) - - -class TestXSLTTransformation(unittest.TestCase): - """Test XSLT transformation directly.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.xslt_file = Path(__file__).parent.parent.parent / "exporter" / "tex" / "xelatex.xslt" - - def test_xslt_tei_div(self): - """Test div element conversion.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - Chapter Title - Content - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\part{Chapter Title}', result) - self.assertIn('Content', result) - - def test_xslt_tei_p(self): - """Test paragraph element conversion.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - Paragraph text - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn('Paragraph text', result) - - def test_xslt_tei_milestone_chapter(self): - """Test milestone with chapter unit.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - Text - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", 
"additional-postamble": ""}) - - self.assertIn(r'\chapter{1}', result) - - def test_xslt_tei_milestone_verse(self): - """Test milestone with verse unit.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - Text - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\textsuperscript{5}', result) - - def test_xslt_tei_choice(self): - """Test choice element (kri/ktiv).""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - - read - written - - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\textit{read}', result) - self.assertIn('(written)', result) - - def test_xslt_tei_emph(self): - """Test emphasis element.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - emphasized - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\emph{emphasized}', result) - - def test_xslt_rend_italic(self): - """Test rend attribute with italic value.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - italic text - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\textit{italic text}', result) - - def test_xslt_rend_small_caps(self): - """Test rend attribute with small-caps value.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - small caps - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\textsc{small caps}', result) - - 
def test_xslt_rend_superscript(self): - """Test rend attribute with superscript value.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - superscript - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\textsuperscript{superscript}', result) - - def test_xslt_rend_align_right(self): - """Test rend attribute with align-right value.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - right aligned - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\begin{flushright}', result) - self.assertIn(r'\end{flushright}', result) - - def test_xslt_hebrew_language_inline(self): - """Test Hebrew language handling for inline text.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - עברית - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\texthebrew{', result) - self.assertIn('עברית', result) - - def test_xslt_hebrew_language_block(self): - """Test Hebrew language handling for block elements.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - עברית - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\begin{hebrew}', result) - self.assertIn(r'\end{hebrew}', result) - - def test_xslt_tei_foreign(self): - """Test foreign text element.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - עברית - Latin - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", 
"additional-postamble": ""}) - - self.assertIn(r'\texthebrew{עברית}', result) - self.assertIn(r'\textit{Latin}', result) - - def test_xslt_tei_note(self): - """Test note element conversion to footnote.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - TextNote content - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\footnote{\textenglish{Note content}}', result) - - def test_xslt_tei_note_forces_english_inside_hebrew_context(self): - """English notes should still be LTR when surrounding content is Hebrew.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - עבריתEnglish note - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\footnote{\textenglish{English note}}', result) - - def test_xslt_tei_note_lang_inherited_from_standoff(self): - """Note language should be derived from in-scope @xml:lang, even if not on tei:note.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - עבריתEnglish note - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\footnote{\textenglish{English note}}', result) - - def test_xslt_tei_lb(self): - """Test line break element.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - Line 1Line 2 - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\\', result) - - def test_xslt_tei_pb(self): - """Test page break element (should be skipped).""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - 
Text - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - # Should not contain pb-related content - self.assertIn('Text', result) - - def test_xslt_tei_lg_l(self): - """Test line group and line elements (poetry).""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - - Line 1 - Line 2 - - - -''' - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={"additional-preamble": "", "additional-postamble": ""}) - - self.assertIn(r'\begin{verse}', result) - self.assertIn(r'\end{verse}', result) - self.assertIn('Line 1', result) - self.assertIn('Line 2', result) - - def test_xslt_additional_preamble_postamble(self): - """Test that additional-preamble and additional-postamble parameters work.""" - from opensiddur.common.xslt import xslt_transform_string - - xml_content = ''' - - - - Content - - -''' - - preamble = "\\usepackage{testpackage}\n" - postamble = "\\chapter{Appendix}\nAppendix content\n" - - result = xslt_transform_string(self.xslt_file, xml_content, - xslt_params={ - "additional-preamble": preamble, - "additional-postamble": postamble - }) - - self.assertIn(preamble, result) - self.assertIn(postamble, result) - - -class TestEdgeCases(unittest.TestCase): - """Test edge cases and error handling.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.TemporaryDirectory() - self.addCleanup(self.temp_dir.cleanup) - self.test_dir = Path(self.temp_dir.name) / "project" - self.test_dir.mkdir(parents=True) - - def _create_xml_file(self, project: str, filename: str, content: bytes) -> Path: - """Helper to create an XML file.""" - project_dir = self.test_dir / project - project_dir.mkdir(parents=True, exist_ok=True) - file_path = project_dir / filename - file_path.write_bytes(content) - return file_path - - def test_extract_sources_empty_xml(self): - """Test extract_sources with empty XML 
file.""" - file1 = self._create_xml_file("project1", "doc1.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", b"") - - preamble, postamble = extract_sources([file1]) - - self.assertEqual(preamble, "") - self.assertEqual(postamble, "") - - def test_transform_xml_to_tex_minimal_structure(self): - """Test transform with minimal valid XML structure.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should still produce valid LaTeX structure - self.assertIn(r'\documentclass{book}', result) - self.assertIn(r'\begin{document}', result) - self.assertIn(r'\end{document}', result) - - def test_transform_xml_to_tex_special_characters(self): - """Test transform with special characters that need LaTeX escaping.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Text with $special & characters - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should handle special characters (XSLT will pass through) - self.assertIn('Text with', result) - - def test_extract_sources_files_outside_project(self): - """Test extract_sources with files outside project directory.""" - # Create a file outside the project structure - outside_file = Path(self.temp_dir.name) / "outside.xml" - outside_file.write_bytes(b"") - - # Should not crash, but skip the file gracefully - # Note: extract_sources expects files in project directories, - # so outside files will be skipped - preamble, postamble = extract_sources([outside_file]) - - self.assertEqual(preamble, "") - 
self.assertEqual(postamble, "") - - def test_transform_xml_to_tex_complex_nested_structure(self): - """Test transform with complex nested divs.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - - Section 1 - - Subsection - Nested content - - - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should handle nested structures - self.assertIn(r'\part{Section 1}', result) - self.assertIn('Nested content', result) - - def test_transform_xml_to_tex_mixed_languages(self): - """Test transform with mixed English and Hebrew content.""" - from unittest.mock import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = ''' - - - - English text עברית more English - - -'''.encode('utf-8') - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should handle mixed languages - self.assertIn('English text', result) - self.assertIn(r'\texthebrew{', result) - self.assertIn('עברית', result) - - def test_extract_sources_multiple_files_same_project(self): - """Test extract_sources with multiple files from same project.""" - index_content = b''' - - - - Book - Author - - -''' - - file1 = self._create_xml_file("project1", "doc1.xml", b"") - file2 = self._create_xml_file("project1", "doc2.xml", b"") - index_file = self._create_xml_file("project1", "index.xml", index_content) - - preamble, postamble = extract_sources([file1, file2]) - - # Should extract from same index file (deduplicated) - self.assertIn(r'\begin{filecontents*}{job.bib}', preamble) - - def test_transform_xml_to_tex_empty_licenses_credits(self): - """Test transform with no licenses or credits.""" - from unittest.mock 
import patch - import opensiddur.exporter.tex.xelatex as xelatex_module - - xml_content = b''' - - - - Content only - - -''' - - input_file = self._create_xml_file("project1", "input.xml", xml_content) - - with patch.object(xelatex_module, 'projects_source_root', self.test_dir): - result = transform_xml_to_tex(input_file) - - # Should still produce valid LaTeX even without licenses/credits - self.assertIn(r'\documentclass{book}', result) - # Should not have empty metadata sections - # (The postamble will be empty if no licenses/credits/sources) - - -if __name__ == '__main__': - unittest.main() - diff --git a/scripts/tei-to-pdf.sh b/scripts/tei-to-pdf.sh index 2842c97..a6b147c 100644 --- a/scripts/tei-to-pdf.sh +++ b/scripts/tei-to-pdf.sh @@ -1,32 +1,51 @@ #!/bin/bash -# This script converts a TEI file to a PDF file using the Open Siddur TEI to PDF converter. - +# Convert a JLPTEI source file (in a project directory) to a PDF using the +# LuaLaTeX + reledmac/reledpar pipeline. +# +# Two stages run sequentially: +# 1. opensiddur.exporter.compiler — JLPTEI → linear pseudo-TEI +# 2. opensiddur.exporter.pdf.pdf — pseudo-TEI → reledmac LuaLaTeX → PDF +# +# The optional ``-s `` flag is forwarded to *both* stages: +# - the compiler reads ``priority``, ``parallel``, ``annotations``; +# - the PDF stage reads ``typography`` (fonts, layout, paper, fontsize). +# # Usage: -# ./tei-to-pdf.sh - -# Example: -# ./tei-to-pdf.sh output.pdf +# ./tei-to-pdf.sh [-s ] [--keep-tex | --tex-output ] -# The input file should be a valid TEI file. -# The output file will be a PDF file. -# Parse arguments, supporting optional -s set -e usage() { - echo "Usage: $0 [-s ] " + echo "Usage: $0 [-s ] [--keep-tex | --tex-output ] " exit 1 } SETTINGS_FILE="" -while getopts ":s:" opt; do - case "$opt" in - s) SETTINGS_FILE="$OPTARG" ;; - \?) echo "Invalid option: -$OPTARG" >&2; usage ;; - :) echo "Option -$OPTARG requires an argument." 
>&2; usage ;; +KEEP_TEX=false +TEX_OUTPUT="" + +while [[ $# -gt 0 ]]; do + case "$1" in + -s) + SETTINGS_FILE="${2:-}"; shift 2 ;; + --keep-tex) + KEEP_TEX=true; shift ;; + --tex-output) + TEX_OUTPUT="${2:-}"; shift 2 ;; + --) + shift; break ;; + -*) + echo "Invalid option: $1" >&2; usage ;; + *) + break ;; esac done -shift $((OPTIND-1)) + +if $KEEP_TEX && [ -n "$TEX_OUTPUT" ]; then + echo "Error: --keep-tex and --tex-output are mutually exclusive." >&2 + exit 2 +fi if [ "$#" -ne 3 ]; then usage @@ -37,12 +56,18 @@ FILE_NAME="$2" OUTPUT="$3" if [ -n "$SETTINGS_FILE" ]; then - SETTINGS_ARG="-s $SETTINGS_FILE" + SETTINGS_ARG=(-s "$SETTINGS_FILE") else - SETTINGS_ARG="" + SETTINGS_ARG=() fi +TEX_OUTPUT_ARGS=() +if [ -n "$TEX_OUTPUT" ]; then + TEX_OUTPUT_ARGS=(--tex-output "$TEX_OUTPUT") +elif $KEEP_TEX; then + TEX_OUTPUT_ARGS=(--keep-tex) +fi -uv run python -m opensiddur.exporter.compiler ${SETTINGS_ARG} -p $1 -f $2 -o $3.xml -uv run python -m opensiddur.exporter.pdf.pdf $3.xml $3 -rm -f $3.xml +uv run python -m opensiddur.exporter.compiler "${SETTINGS_ARG[@]}" -p "$PROJECT" -f "$FILE_NAME" -o "$OUTPUT.xml" +uv run python -m opensiddur.exporter.pdf.pdf "${SETTINGS_ARG[@]}" "${TEX_OUTPUT_ARGS[@]}" "$OUTPUT.xml" "$OUTPUT" +rm -f "$OUTPUT.xml" From 3b8d36b749193b5d4be5fb0b47ef1832748534cc Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 7 May 2026 20:58:47 -0700 Subject: [PATCH 02/13] correct margin number positioning for Hebrew --- opensiddur/exporter/pdf/pdf.py | 26 +++++++-- opensiddur/exporter/settings.py | 2 +- opensiddur/exporter/tex/reledmac.xslt | 57 +++++++++++++++++-- .../tests/exporter/test_reledmac_xslt.py | 20 +++++++ 4 files changed, 95 insertions(+), 10 deletions(-) diff --git a/opensiddur/exporter/pdf/pdf.py b/opensiddur/exporter/pdf/pdf.py index 0a3482d..3d3a143 100755 --- a/opensiddur/exporter/pdf/pdf.py +++ b/opensiddur/exporter/pdf/pdf.py @@ -23,6 +23,7 @@ import sys import tempfile from pathlib import Path +from contextlib import 
nullcontext from typing import Optional # Add the project root to the Python path @@ -168,6 +169,7 @@ def _run_manual_loop(tex_file: Path, output_dir: Path, max_runs: int) -> bool: if not success: print("lualatex reported errors (pass 1):", file=sys.stderr) print(output, file=sys.stderr) + return False # Run bibtex once after the first pass when needed; force a rerun afterwards # because bibtex updates the .bbl that lualatex needs to read. @@ -187,7 +189,7 @@ def _run_manual_loop(tex_file: Path, output_dir: Path, max_runs: int) -> bool: if not success: print(f"lualatex reported errors (pass {run_count}):", file=sys.stderr) print(output, file=sys.stderr) - break + return False if run_count >= max_runs: print( @@ -203,6 +205,7 @@ def compile_tex_to_pdf( tex_file: Path, output_pdf: Path, max_runs: int = 6, + build_dir: Optional[Path] = None, ) -> bool: """Compile a LuaLaTeX .tex file to PDF. @@ -224,7 +227,13 @@ def compile_tex_to_pdf( print(f"Compiling {tex_file} to PDF...", file=sys.stderr) tex_stem = tex_file.stem - with tempfile.TemporaryDirectory() as temp_dir_str: + if build_dir is not None: + temp_dir = build_dir + temp_dir.mkdir(parents=True, exist_ok=True) + else: + temp_dir = None + + with tempfile.TemporaryDirectory() if temp_dir is None else nullcontext(str(temp_dir)) as temp_dir_str: temp_dir = Path(temp_dir_str) # latexmk can attempt to invoke biber based on .bcf detection even when # biblatex is configured for BibTeX. Since biber is frequently broken or @@ -235,7 +244,8 @@ def compile_tex_to_pdf( file=sys.stderr, ) return False - _run_manual_loop(tex_file, temp_dir, max_runs) + if not _run_manual_loop(tex_file, temp_dir, max_runs): + return False generated_pdf = temp_dir / f"{tex_stem}.pdf" if not generated_pdf.exists(): @@ -264,6 +274,7 @@ def export_to_pdf( output_pdf: Path, settings_file: Optional[Path] = None, tex_output: Optional[Path] = None, + build_dir: Optional[Path] = None, ) -> bool: """Convert a compiled JLPTEI XML file to PDF. 
@@ -288,7 +299,7 @@ def export_to_pdf( if not generate_tex(input_file, temp_tex_file, settings_file=settings_file): return False - if not compile_tex_to_pdf(temp_tex_file, output_pdf): + if not compile_tex_to_pdf(temp_tex_file, output_pdf, build_dir=build_dir): return False print(f"Successfully generated PDF: {output_pdf}", file=sys.stderr) @@ -336,6 +347,12 @@ def main(): # pragma: no cover default=None, help="Path to write the intermediate TeX file (implies --keep-tex).", ) + parser.add_argument( + "--build-dir", + type=Path, + default=None, + help="Directory to keep LaTeX build artifacts (.log, .aux, etc.) for debugging.", + ) args = parser.parse_args() @@ -354,6 +371,7 @@ def main(): # pragma: no cover args.output_pdf, settings_file=args.settings_file, tex_output=tex_output, + build_dir=args.build_dir, ): sys.exit(1) diff --git a/opensiddur/exporter/settings.py b/opensiddur/exporter/settings.py index bee652c..d9a3b9f 100644 --- a/opensiddur/exporter/settings.py +++ b/opensiddur/exporter/settings.py @@ -74,7 +74,7 @@ class TypographyConfig(BaseModel): hebrew_font: str = "Frank Ruehl CLM" latin_font: str = "Linux Libertine O" layout: ParallelLayout = ParallelLayout.PAIRS - paper: PaperType = PaperType.A4PAPER + paper: PaperType = PaperType.LETTERPAPER fontsize: str = "11pt" diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index 5041100..d2c7aab 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -120,10 +120,44 @@ reledmac exposes \linenumberstyle; reledpar uses \linenumrepR and a right-side flag. 
--> \renewcommand*{\linenumberstyle}[1]{\begingroup\textdir TLT\selectlanguage{english}#1\endgroup} + + \lineation{page} + + + \linenummargin{outer} + \linenummarginR{outer} + + + \linenummarginColumns{right} + \linenummarginColumnsR{right} + + \columnsposition{C} + + \setlength{\Lcolwidth}{0.43\textwidth} + \setlength{\Rcolwidth}{0.43\textwidth} + + \makeatletter - \renewcommand*{\linenumrepR}[1]{\begingroup\textdir TLT\selectlanguage{english}\@arabic{#1}\endgroup} - \renewcommand*{\sublinenumrepR}[1]{\begingroup\textdir TLT\selectlanguage{english}\@arabic{#1}\endgroup} - \setRlineflag{\begingroup\textdir TLT R\endgroup} + + \setlength{\linenumsep}{1em} + + \renewcommand*{\linenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} + \renewcommand*{\sublinenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} + + \setRlineflag{} \makeatother \setlength{\parindent}{0pt} @@ -158,9 +192,17 @@ - - + + @@ -563,6 +605,11 @@ + + + + + diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index 53a5253..982d2ab 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -201,6 +201,26 @@ def test_empty_parallel_block_is_skipped(self): out = _transform(xml) self.assertNotIn(r"\begin{pages}", out) + def test_parallel_inside_transclude_is_still_grouped(self): + """The compiled XML can wrap p:parallel blocks in p:transclude; the TeX stage + must expand the wrapper so parallel blocks still become a reledpar environment.""" + xml = """ + + + + + שלום + Hello + + + + """ + out = _transform(xml, layout="pairs") + self.assertIn(r"\begin{pairs}", out) + self.assertIn(r"\begin{Leftside}", out) + self.assertIn(r"\begin{Rightside}", out) + def test_pairs_layout_uses_columns_typesetter(self): out = _transform(self.XML, layout="pairs") self.assertIn(r"\begin{pairs}", out) From 5c1b0ee09c5ef208916e4141af18d4dc57ba1d41 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 7 May 2026 
22:15:50 -0700 Subject: [PATCH 03/13] fix the legal/credits pages ending up on their own pages --- opensiddur/exporter/tex/latex.py | 18 ++++++++++-------- opensiddur/exporter/tex/reledmac.xslt | 18 +++++++++++------- opensiddur/tests/exporter/test_latex.py | 11 ++++++----- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/opensiddur/exporter/tex/latex.py b/opensiddur/exporter/tex/latex.py index 2addc2a..649f585 100644 --- a/opensiddur/exporter/tex/latex.py +++ b/opensiddur/exporter/tex/latex.py @@ -100,7 +100,7 @@ def licenses_to_tex(licenses: list[LicenseRecord]) -> str: f"\\item {license.name} (\\url{{{license.url}}})" for license in licenses ) return ( - "\\chapter{Legal}\n" + "\\section*{Legal}\n" "This document includes copyrighted texts licensed under the following licenses.\n" "The full text of the licenses can be found at the given URLs:\n\n" "\\begin{itemize}\n" @@ -189,14 +189,14 @@ def credits_to_tex(credits: dict[str, dict[str, list[CreditRecord]]]) -> str: """Convert grouped credits into a LaTeX appendix section.""" if not credits: return "" - tex = "\\chapter{Contributor credits}\n" + tex = "\\section*{Contributor credits}\n" for role, namespace_dict in credits.items(): total = sum(len(c) for c in namespace_dict.values()) role_name = contributor_keys_to_roles.get(role, role) + ("s" if total > 1 else "") - tex += f"\\section{{{role_name}}}\n" + tex += f"\\subsection*{{{role_name}}}\n" for namespace, namespace_credits in namespace_dict.items(): sorted_credits = sorted(namespace_credits, key=lambda x: x.contributor) - tex += f"\\subsection{{From {namespace}}}\n" + tex += f"\\subsubsection*{{From {namespace}}}\n" tex += "\\begin{itemize}\n" for credit in sorted_credits: tex += f"\\item {credit.name_text}\n" @@ -243,10 +243,10 @@ def extract_sources(xml_file_paths: list[Path]) -> tuple[str, str]: "\\addbibresource{job.bib}\n" ) postamble_tex = ( - "\n\\begingroup\n" - "\\renewcommand{\\refname}{Sources}\n" + "\n\\section*{Sources}\n" + 
"\\begingroup\n" "\\nocite{*}\n" - "\\printbibliography\n" + "\\printbibliography[heading=none]\n" "\\endgroup\n" ) return preamble_tex, postamble_tex @@ -353,7 +353,9 @@ def transform_xml_to_tex( xslt_params={ "additional-preamble": sources_preamble_tex, "additional-postamble": ( - "\\part{Metadata}\n" + "\\par\\bigskip\n" + "\\hrule\\bigskip\n" + "\\section*{Metadata}\n" + licenses_tex + "\n" + credits_tex diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index d2c7aab..ed8fbf5 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -151,13 +151,17 @@ \setlength{\linenumsep}{1em} - - \renewcommand*{\linenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} - \renewcommand*{\sublinenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} - - \setRlineflag{} + + + + \renewcommand*{\linenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} + \renewcommand*{\sublinenumrepR}[1]{\hbox{\textdir TLT\@arabic{#1}}} + + \setRlineflag{} + \makeatother \setlength{\parindent}{0pt} diff --git a/opensiddur/tests/exporter/test_latex.py b/opensiddur/tests/exporter/test_latex.py index 20d5e5d..aa9f537 100644 --- a/opensiddur/tests/exporter/test_latex.py +++ b/opensiddur/tests/exporter/test_latex.py @@ -98,7 +98,7 @@ def test_emits_legal_chapter(self): out = licenses_to_tex( [LicenseRecord(url="http://creativecommons.org/cc", name="CC")] ) - self.assertIn(r"\chapter{Legal}", out) + self.assertIn(r"\section*{Legal}", out) self.assertIn("CC", out) self.assertIn(r"\url{http://creativecommons.org/cc}", out) @@ -182,7 +182,7 @@ def test_pluralizes_role_when_multiple_contributors(self): name_text="B", namespace="ns", contributor="b", ) out = credits_to_tex({"aut": {"ns": [c1, c2]}}) - self.assertIn(r"\section{Authors}", out) + self.assertIn(r"\subsection*{Authors}", out) def test_emits_singular_when_one_contributor(self): c1 = CreditRecord( @@ -190,8 +190,8 @@ def test_emits_singular_when_one_contributor(self): name_text="A", 
namespace="ns", contributor="a", ) out = credits_to_tex({"aut": {"ns": [c1]}}) - self.assertIn(r"\section{Author}", out) - self.assertNotIn(r"\section{Authors}", out) + self.assertIn(r"\subsection*{Author}", out) + self.assertNotIn(r"\subsection*{Authors}", out) class TestExtractSources(unittest.TestCase): @@ -444,7 +444,8 @@ def test_integrates_licenses_into_postamble(self): f = self._create("p", "input.xml", xml) with patch.object(latex_module, "projects_source_root", self.test_dir): out = transform_xml_to_tex(f) - self.assertIn(r"\chapter{Legal}", out) + self.assertIn(r"\section*{Metadata}", out) + self.assertIn(r"\section*{Legal}", out) self.assertIn("My License", out) From f2267638136669aa38bf1a78b3a270559766da84 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 7 May 2026 23:10:02 -0700 Subject: [PATCH 04/13] fix a missing text problem --- opensiddur/exporter/tex/reledmac.xslt | 73 +++++++++++++------ .../tests/exporter/test_reledmac_xslt.py | 14 ++-- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index ed8fbf5..fe5f4a9 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -104,22 +104,34 @@ misconfigured on some systems. --> \usepackage[backend=bibtex]{biblatex} \usepackage{hyperref} + + \pdfstringdefDisableCommands{ + \def\textdir#1{} + \def\selectlanguage#1{} + } - \newcommand{\vno}[1]{\textsuperscript{\begingroup\textdir TLT\selectlanguage{english}#1\endgroup}\,} + \newcommand{\vno}[1]{\textsuperscript{{\textdir TLT\selectlanguage{english}#1}}\,} - \newcommand{\instructionnote}[1]{\begingroup\bfseries #1\endgroup} - \newcommand{\notenote}[1]{\begingroup\bfseries #1\endgroup} + - Styling lives in macros so it can be changed in one place. + - Use {{\bfseries ...}} (regular braces) not \begingroup/\endgroup — the latter + can prematurely close reledmac's internal groups inside \edtext/\Bfootnote. 
--> + \newcommand{\instructionnote}[1]{{\bfseries #1}} + \newcommand{\notenote}[1]{{\bfseries #1}} - \renewcommand*{\linenumberstyle}[1]{\begingroup\textdir TLT\selectlanguage{english}#1\endgroup} + right-side flag. + Use \hbox to contain direction/language changes without leaking + \begingroup/\endgroup into reledmac's aux-file write machinery. --> + \renewcommand*{\linenumberstyle}[1]{\hbox{\textdir TLT\selectlanguage{english}#1}} \lineation{page} @@ -406,9 +418,9 @@ - \begingroup\textdir TLT\selectlanguage{english} + {\textdir TLT\selectlanguage{english} - \endgroup + } @@ -423,19 +435,24 @@ \pend + + \pstart \skipnumbering \eledsection{ - \begingroup\textdir TLT\selectlanguage{english} + {\textdir TLT\selectlanguage{english} - \endgroup + } - } + } \pend @@ -474,9 +491,13 @@ - \edtext{}{\Bfootnote{Parsha: + + {\@RTLfalse\edtext{\mbox{}}{\Bfootnote{Parsha: - }} + }}} @@ -792,14 +813,18 @@ - - \edtext{}{\Bfootnote{\notenote{ + appear as an endnote or inline run rather than a bottom-of-page note. + \@RTLfalse forces reledmac's LTR code path for .1-file writes: in RTL + mode reledmac writes ] before \@ref[N][ for single-line lemmas, which + corrupts the catcode-group that controls [ ] delimiters when the .1 + file is re-read on the next pass. --> + {\@RTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{ - }}} + }}}} @@ -807,18 +832,20 @@ - \begingroup\textdir TRT\selectlanguage{hebrew} + in an LTR context. + Use {{\textdir TRT ...}} (regular braces) to avoid leaking + \begingroup/\endgroup into reledmac's aux-file write machinery. 
--> + {{\textdir TRT\selectlanguage{hebrew} - \endgroup + }} - \begingroup\textdir TLT\selectlanguage{english} + {{\textdir TLT\selectlanguage{english} - \endgroup + }} diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index 982d2ab..f8ffe7e 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -123,7 +123,7 @@ def test_verses_flow_inline_in_single_stream(self): def test_chapter_milestone_emits_eledsection(self): out = _transform(self.XML) # Chapter numbers are forced LTR to avoid digit reversal in RTL contexts. - self.assertIn(r"\eledsection{\begingroup\textdir TLT\selectlanguage{english}1\endgroup}", out) + self.assertIn(r"\eledsection{{\textdir TLT\selectlanguage{english}1}}", out) def test_chapter_number_forces_ltr_digits_in_hebrew_context(self): """Digits inside Hebrew RTL contexts can render reversed unless forced LTR.""" @@ -137,7 +137,7 @@ def test_chapter_number_forces_ltr_digits_in_hebrew_context(self): """ out = _transform(xml) - self.assertIn(r"\eledsection{\begingroup\textdir TLT\selectlanguage{english}12\endgroup}", out) + self.assertIn(r"\eledsection{{\textdir TLT\selectlanguage{english}12}}", out) def test_verse_numbers_appear_as_superscripts(self): out = _transform(self.XML) @@ -333,7 +333,7 @@ def test_default_note_is_b_series_apparatus(self): out = _transform(xml) # \edtext{}{\Bfootnote{}} is the proper reledmac idiom for apparatus notes: # zero-width lemma + B-series footnote at page bottom (not an endnote after \pend). 
- self.assertIn(r"\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn(r"{\@RTLfalse\edtext{}{\Bfootnote{\notenote{", out) self.assertIn("commentary", out) self.assertNotIn(r"\footnote{", out) @@ -364,10 +364,10 @@ def test_standoff_note_appears_at_anchor_position(self): """ out = _transform(xml) - self.assertIn(r"\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn(r"{\@RTLfalse\edtext{}{\Bfootnote{\notenote{", out) self.assertIn("English annotation", out) # English note inside Hebrew stream must force LTR direction. - self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english}", out) + self.assertIn(r"{{\textdir TLT\selectlanguage{english}", out) def test_standoff_note_with_multiple_targets(self): """A note targeting multiple anchors must appear at each anchor site.""" @@ -403,8 +403,8 @@ def test_note_language_forces_direction(self): """ out = _transform(xml) - self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english} English note\endgroup", out) - self.assertIn(r"\begingroup\textdir TLT\selectlanguage{english} Inline English instruction\endgroup", out) + self.assertIn(r"{{\textdir TLT\selectlanguage{english} English note}}", out) + self.assertIn(r"{{\textdir TLT\selectlanguage{english} Inline English instruction}}", out) class TestInlineFormatting(unittest.TestCase): From 1096dd5f2e7e642fffb092317d02115e25f98a7f Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Thu, 7 May 2026 23:26:19 -0700 Subject: [PATCH 05/13] fix more bugs (and hopefully not introduce others) --- opensiddur/exporter/tex/reledmac.xslt | 9 ++++++--- opensiddur/tests/exporter/test_reledmac_xslt.py | 12 +++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index fe5f4a9..19080ee 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -160,6 +160,9 @@ \makeatletter + + \newcommand*{\OSRTLfalse}{\@RTLfalse} \setlength{\linenumsep}{1em} @@ 
-495,7 +498,7 @@ can cause reledmac to drop surrounding text or corrupt its .1 aux file. Use an explicit zero-width box lemma to keep the argument structure stable. --> - {\@RTLfalse\edtext{\mbox{}}{\Bfootnote{Parsha: + \leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{Parsha: }}} @@ -818,11 +821,11 @@ content to the B-series apparatus at the page bottom. Plain \footnote inside \pstart...\pend is flushed by reledmac after \pend, making it appear as an endnote or inline run rather than a bottom-of-page note. - \@RTLfalse forces reledmac's LTR code path for .1-file writes: in RTL + \OSRTLfalse forces reledmac's LTR code path for .1-file writes: in RTL mode reledmac writes ] before \@ref[N][ for single-line lemmas, which corrupts the catcode-group that controls [ ] delimiters when the .1 file is re-read on the next pass. --> - {\@RTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{ + \leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{ }}}} diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index f8ffe7e..49bdb83 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -116,9 +116,11 @@ def test_emits_single_numbering_block(self): def test_verses_flow_inline_in_single_stream(self): out = _transform(self.XML) # The fixture has one tei:p containing 3 verse milestones, so we expect - # one paragraph-level \\pstart/\\pend pair (not 1 per verse). - self.assertEqual(out.count(r"\pstart"), 1) - self.assertEqual(out.count(r"\pend"), 1) + # one verse-paragraph-level \\pstart/\\pend pair (not 1 per verse). + # + # Note: chapter milestones may be emitted in their own skipnumbering pstart. 
+ self.assertEqual(out.count(r"\pstart \vno{"), 1) + self.assertEqual(out.count(r"\pend"), 2) def test_chapter_milestone_emits_eledsection(self): out = _transform(self.XML) @@ -333,7 +335,7 @@ def test_default_note_is_b_series_apparatus(self): out = _transform(xml) # \edtext{}{\Bfootnote{}} is the proper reledmac idiom for apparatus notes: # zero-width lemma + B-series footnote at page bottom (not an endnote after \pend). - self.assertIn(r"{\@RTLfalse\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{", out) self.assertIn("commentary", out) self.assertNotIn(r"\footnote{", out) @@ -364,7 +366,7 @@ def test_standoff_note_appears_at_anchor_position(self): """ out = _transform(xml) - self.assertIn(r"{\@RTLfalse\edtext{}{\Bfootnote{\notenote{", out) + self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{", out) self.assertIn("English annotation", out) # English note inside Hebrew stream must force LTR direction. self.assertIn(r"{{\textdir TLT\selectlanguage{english}", out) From e4c5d4ad2d094e7740f9cf0039d014bbb1424526 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Fri, 8 May 2026 12:36:20 -0700 Subject: [PATCH 06/13] improve documentation for exporter, provide an example settings file --- README.md | 59 +++++++++++++++++++++++++++++- doc/exporter-settings.example.yaml | 45 +++++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 doc/exporter-settings.example.yaml diff --git a/README.md b/README.md index bfec545..138568b 100644 --- a/README.md +++ b/README.md @@ -47,4 +47,61 @@ cross-reference mappings for changed files, and removes stale entries for projects or files that no longer exist. It prints a per-project summary on completion. -You must re-sync before running the compiler on any newly-added project. \ No newline at end of file +You must re-sync before running the compiler on any newly-added project. 
+ +## Compilation (JLPTEI → compiled linear XML) + +The compiler takes a `project//` file, resolves transclusions, annotations, and parallel texts, +and outputs a single “compiled” XML file that can be +converted into a final printable format (eg, PDF). + +Example (compile `project/wlc/ruth.xml` to `compiled.xml`): + +```bash +uv run python -m opensiddur.exporter.compiler \ + --project wlc \ + --file_name ruth.xml \ + --output_file compiled.xml +``` + +Example with a settings YAML (controls project priorities, annotations, and optional parallel lookup): + +```bash +uv run python -m opensiddur.exporter.compiler \ + --project wlc \ + --file_name ruth.xml \ + --settings doc/exporter-settings.example.yaml \ + --output_file compiled.xml +``` + +## TeX export (compiled XML → LuaLaTeX) + +Convert the compiled XML file to LuaLaTeX using the `reledmac`/`reledpar` pipeline: + +```bash +uv run python -m opensiddur.exporter.tex.latex \ + compiled.xml \ + --settings doc/exporter-settings.example.yaml \ + --output compiled.tex +``` + +## PDF export (compiled XML → PDF) + +Export directly to PDF (generates TeX internally, then runs LuaLaTeX/latexmk): + +```bash +uv run python -m opensiddur.exporter.pdf.pdf \ + --settings doc/exporter-settings.example.yaml \ + compiled.xml \ + output.pdf +``` + +Keep the intermediate TeX (helpful for debugging LaTeX issues): + +```bash +uv run python -m opensiddur.exporter.pdf.pdf \ + --settings doc/exporter-settings.example.yaml \ + --keep-tex \ + compiled.xml \ + output.pdf +``` \ No newline at end of file diff --git a/doc/exporter-settings.example.yaml b/doc/exporter-settings.example.yaml new file mode 100644 index 0000000..fd85706 --- /dev/null +++ b/doc/exporter-settings.example.yaml @@ -0,0 +1,45 @@ +--- +# Example exporter settings file. +# This file includes every currently supported exporter settings +# +# Notes: +# - Project names must correspond to directories under `project/`. 
+# - `parallel.column_order` values: `primary_first` | `primary_last` +# - `typography.layout` values: `pages` | `pairs` +# - `typography.paper` values: `a4paper` | `letterpaper` | `legalpaper` | `a5paper` | `b5paper` | `executivepaper` + +priority: + # Project priority for resolving `urn:x-opensiddur:` references to *texts* (transclusions) + # The first entry will typically be the project you are compiling. + # The lower priority projects can be used when the first project wants to import from others (eg, a siddur that borrows Tanach text from another source) + transclusion: + - wlc + - jps1917 + + # Project priority for resolving `urn:x-opensiddur:` references to *instructions* + instructions: + - wlc + - jps1917 + +# Projects from which to include annotations +# *All* annotations from the given projects will be included. +annotations: + - wlc + - jps1917 + +# Optional parallel-text configuration. Omit to disable. +parallel: + # Projects to search for parallel text content (in priority order) + projects: + - wlc + - jps1917 + # Column order for parallel text display in printed output + column_order: primary_first + +# Output-format settings consumed by the TeX/PDF stage only. 
+typography: + hebrew_font: "Frank Ruehl CLM" + latin_font: "Linux Libertine O" + layout: pairs + paper: letterpaper + fontsize: "11pt" From 6fc798b67dd8944d81e9a2d6c254dea3547d64ad Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Mon, 11 May 2026 20:12:03 -0700 Subject: [PATCH 07/13] reconcile schema documentation with the actual schema --- .../exporter/validate_urn_references.py | 183 ++++++++++++++++++ .../exporter/test_validate_urn_references.py | 79 ++++++++ opensiddur/tests/schema/test_jlptei_odd.py | 58 ++++++ schema/JLPTEI-3.md | 36 ++-- schema/jlptei.odd.xml | 22 ++- 5 files changed, 352 insertions(+), 26 deletions(-) create mode 100644 opensiddur/exporter/validate_urn_references.py create mode 100644 opensiddur/tests/exporter/test_validate_urn_references.py create mode 100644 opensiddur/tests/schema/test_jlptei_odd.py diff --git a/opensiddur/exporter/validate_urn_references.py b/opensiddur/exporter/validate_urn_references.py new file mode 100644 index 0000000..274acca --- /dev/null +++ b/opensiddur/exporter/validate_urn_references.py @@ -0,0 +1,183 @@ +"""Post-conversion validation for resolvable URN references. + +This validator is intentionally optional and is meant to be run after an entire +project/source has been converted and the reference DB has been populated. 
+""" + +from __future__ import annotations + +import argparse +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, Optional + +from lxml import etree + +from opensiddur.common.constants import PROJECT_DIRECTORY +from opensiddur.exporter.refdb import INDEX_DB_FILE, ReferenceDatabase +from opensiddur.exporter.urn import UrnResolver + + +TEI_NS = "http://www.tei-c.org/ns/1.0" +JLPTEI_NS = "http://jewishliturgy.org/ns/jlptei/2" + + +@dataclass(frozen=True) +class UnresolvableUrnReference: + project: str + file_name: str + element_path: str + attribute_name: str + urn: str + + +def _iter_project_xml_files(project_path: Path) -> Iterable[Path]: + yield from sorted(project_path.glob("*.xml")) + + +def validate_project_urn_references( + project: str, + *, + project_directory: Path = PROJECT_DIRECTORY, + reference_db_path: Path = INDEX_DB_FILE, + index_before_validate: bool = False, +) -> list[UnresolvableUrnReference]: + """Validate that compilation-relevant URN references are resolvable via refdb. + + This checks URNs in: + - tei:ptr/@target + - tei:ref/@target + - j:transclude/@target and j:transclude/@targetEnd + """ + + project_path = Path(project_directory) / project + if not project_path.exists() or not project_path.is_dir(): + raise ValueError(f"Project directory does not exist: {project_path}") + + refdb = ReferenceDatabase(reference_db_path) + try: + if index_before_validate: + refdb.index_project(project, project_directory=project_directory) + + resolver = UrnResolver(refdb) + ns = {"tei": TEI_NS, "j": JLPTEI_NS} + + failures: list[UnresolvableUrnReference] = [] + for xml_file in _iter_project_xml_files(project_path): + tree = etree.parse(str(xml_file)) + root = tree.getroot() + + # Only targets that are URNs participate in this validation. + # Non-URN targets (e.g., local paths or URLs) are out of scope. 
+ ptrs = root.xpath("//tei:ptr[@target]", namespaces=ns) + refs = root.xpath("//tei:ref[@target]", namespaces=ns) + transcludes = root.xpath("//j:transclude[@target]", namespaces=ns) + + for el in [*ptrs, *refs]: + urn = el.get("target") + if not urn or not urn.startswith("urn:x-opensiddur:"): + continue + if not resolver.resolve_range(urn): + failures.append( + UnresolvableUrnReference( + project=project, + file_name=xml_file.name, + element_path=tree.getpath(el), + attribute_name="target", + urn=urn, + ) + ) + + for el in transcludes: + target = el.get("target") + if target and target.startswith("urn:x-opensiddur:"): + start_candidates = resolver.resolve_range(target) + if not start_candidates: + failures.append( + UnresolvableUrnReference( + project=project, + file_name=xml_file.name, + element_path=tree.getpath(el), + attribute_name="target", + urn=target, + ) + ) + continue + + # Prefer resolving the transclude within the current project when possible, + # since that's the common compilation expectation. 
+ start = UrnResolver.prioritize_range(start_candidates, [project]) or start_candidates[0] + + target_end = el.get("targetEnd") + if target_end and target_end.startswith("urn:x-opensiddur:"): + end_candidates = resolver.resolve_range(target_end) + if not end_candidates: + failures.append( + UnresolvableUrnReference( + project=project, + file_name=xml_file.name, + element_path=tree.getpath(el), + attribute_name="targetEnd", + urn=target_end, + ) + ) + continue + if not UrnResolver.prioritize_range(end_candidates, [start.project]): + failures.append( + UnresolvableUrnReference( + project=project, + file_name=xml_file.name, + element_path=tree.getpath(el), + attribute_name="targetEnd", + urn=target_end, + ) + ) + finally: + refdb.close() + + return failures + + +def _format_failure(f: UnresolvableUrnReference) -> str: + return f"{f.project}/{f.file_name}: {f.element_path} @{f.attribute_name}={f.urn}" + + +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser( + description="Validate that compilation-relevant URN references are resolvable using refdb (post-conversion)." 
+ ) + parser.add_argument("project", help="Project name under project/ (e.g., wlc, jps1917)") + parser.add_argument( + "--project-directory", + default=str(PROJECT_DIRECTORY), + help="Base project directory (defaults to repo project/)", + ) + parser.add_argument( + "--reference-db", + default=str(INDEX_DB_FILE), + help="Path to reference.db (defaults to opensiddur database/reference.db)", + ) + parser.add_argument( + "--index", + action="store_true", + help="(Optional) index the project into refdb before validating", + ) + + args = parser.parse_args(argv) + + failures = validate_project_urn_references( + args.project, + project_directory=Path(args.project_directory), + reference_db_path=Path(args.reference_db), + index_before_validate=args.index, + ) + if failures: + for f in failures: + print(_format_failure(f)) + return 2 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/opensiddur/tests/exporter/test_validate_urn_references.py b/opensiddur/tests/exporter/test_validate_urn_references.py new file mode 100644 index 0000000..28aa43c --- /dev/null +++ b/opensiddur/tests/exporter/test_validate_urn_references.py @@ -0,0 +1,79 @@ +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from lxml import etree + +from opensiddur.exporter.refdb import ReferenceDatabase +from opensiddur.exporter.validate_urn_references import validate_project_urn_references + + +TEI_NS = "http://www.tei-c.org/ns/1.0" +NSMAP = {"tei": TEI_NS} + + +class TestValidateUrnReferences(unittest.TestCase): + def test_validates_ptr_and_ref_targets(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + (base / project).mkdir(parents=True, exist_ok=True) + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="urn:x-opensiddur:test:doc1") + 
etree.SubElement(body, f"{{{TEI_NS}}}ref", target="urn:x-opensiddur:test:doc2/1") + + xml_path = base / project / "a.xml" + etree.ElementTree(xml).write(str(xml_path), encoding="utf-8", xml_declaration=True) + + db_path = base / "ref.db" + db = ReferenceDatabase(db_path) + try: + # Add URN mappings so resolver can resolve the references. + e1 = etree.Element(f"{{{TEI_NS}}}milestone") + e1.set("corresp", "urn:x-opensiddur:test:doc1") + db.add_urn_mapping(project, "a.xml", e1) + + e2 = etree.Element(f"{{{TEI_NS}}}milestone") + e2.set("corresp", "urn:x-opensiddur:test:doc2/1") + db.add_urn_mapping(project, "a.xml", e2) + finally: + db.close() + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + index_before_validate=False, + ) + self.assertEqual(failures, []) + + def test_reports_unresolvable_urns(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + (base / project).mkdir(parents=True, exist_ok=True) + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="urn:x-opensiddur:test:missing") + + xml_path = base / project / "a.xml" + etree.ElementTree(xml).write(str(xml_path), encoding="utf-8", xml_declaration=True) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + index_before_validate=False, + ) + self.assertEqual(len(failures), 1) + self.assertEqual(failures[0].file_name, "a.xml") + self.assertEqual(failures[0].attribute_name, "target") + diff --git a/opensiddur/tests/schema/test_jlptei_odd.py b/opensiddur/tests/schema/test_jlptei_odd.py new file mode 100644 index 0000000..543e82b --- /dev/null +++ b/opensiddur/tests/schema/test_jlptei_odd.py @@ -0,0 +1,58 @@ +import unittest +from pathlib 
import Path + +from lxml import etree + + +ODD_PATH = Path(__file__).resolve().parents[3] / "schema" / "jlptei.odd.xml" + + +class TestJlpteiOddConstraints(unittest.TestCase): + def setUp(self): + self.tree = etree.parse(str(ODD_PATH)) + self.ns = { + "tei": "http://www.tei-c.org/ns/1.0", + "sch": "http://purl.oclc.org/dsdl/schematron", + } + + def test_requires_xml_lang_on_tei_root(self): + asserts = self.tree.xpath( + "//tei:elementSpec[@ident='TEI']//sch:assert[@test='@xml:lang']", + namespaces=self.ns, + ) + self.assertTrue(asserts, "Expected schematron assert requiring tei:TEI/@xml:lang") + + def test_standoff_type_is_closed_list(self): + vals = self.tree.xpath( + "//tei:elementSpec[@ident='standOff']//tei:attDef[@ident='type']//tei:valItem/@ident", + namespaces=self.ns, + ) + self.assertEqual(set(vals), {"notes", "settings", "conditions"}) + + def test_transclude_type_is_closed_list(self): + vals = self.tree.xpath( + "//tei:elementSpec[@ident='transclude']//tei:attDef[@ident='type']//tei:valItem/@ident", + namespaces=self.ns, + ) + self.assertEqual(set(vals), {"external", "inline"}) + + def test_paragraph_type_is_closed_list(self): + vals = self.tree.xpath( + "//tei:elementSpec[@ident='p']//tei:attDef[@ident='type']//tei:valItem/@ident", + namespaces=self.ns, + ) + self.assertEqual(set(vals), {"open-1", "closed-1", "open-3"}) + + def test_divine_name_exists_and_is_agent_like(self): + divine = self.tree.xpath( + "//tei:elementSpec[@ident='divineName']", + namespaces=self.ns, + ) + self.assertTrue(divine, "Expected j:divineName elementSpec to exist in ODD") + + member = self.tree.xpath( + "//tei:elementSpec[@ident='divineName']//tei:memberOf[@key='model.nameLike.agent']", + namespaces=self.ns, + ) + self.assertTrue(member, "Expected j:divineName to be member of model.nameLike.agent") + diff --git a/schema/JLPTEI-3.md b/schema/JLPTEI-3.md index 4f40cc1..4956f29 100644 --- a/schema/JLPTEI-3.md +++ b/schema/JLPTEI-3.md @@ -102,31 +102,24 @@ All URIs reference the 
following scopes: 2. If the URI is on an empty milestone like element (`milestone`, `pb`, `lb`, etc.) it references that milestone unit until the next milestone of the same unit *or* the end of the file if no subsequent milestone of the same unit exists. 3. If the URI is on an empty anchor (`anchor`), it references that specific point in the document. -#### The user's contributor profile -Every contributor to the project has their contribution identified via a -URN. +#### Contributors and contributor URNs +Contributions are credited in the file header using `tei:respStmt` entries, with a contributor URN stored in `tei:name/@ref`. -The contributor URN is referenced as `urn:x-opensiddur:contributor:{contributor_space}/{identifier}`. +Contributor URNs use the form `urn:x-opensiddur:/`. -The `contribuor_space` indicates where the contributor made the contribution or where -their identifier is meaningful. For example, contributors to Hebrew Wikisource will be in the `he.wikisource.org` space, and the `identifier` will -reference their Wikisource username. +The `namespace` indicates where the identifier is meaningful. For example: +- `en.wikisource.org/{username}` for English Wikisource contributors +- `he.wikisource.org/{username}` for Hebrew Wikisource contributors +- `opensiddur.org/{identifier}` for original Open Siddur contributors -Contributors to Open Siddur will be in the `opensiddur.org` space and the identifier will identify a file in the `contributors` directory. - -Contributor profiles have the following XML form: +Example: ```xml - - {contributor name} - {optional contributor's organization name} - {contributor contact email} - {website name} - + + Transcribed by + Prosody (English Wikisource contributor) + ``` -Only the `tei:name` and `tei:email` are required. - ### Project index Every project has an entry point file called `index.xml`. This file contains the project metadata, including the project header. 
@@ -207,9 +200,6 @@ Every document has a TEI header with a standardized structure. - - {revision_message} - ``` @@ -382,7 +372,7 @@ The following rules apply to anchors: ### Inclusions To include one text inside another, use the `j:transclude` tag inline in the text. Preferentially, use the URN reference of the text to be included, using the `target` attribute for the pointer target. -Two types of inclusions are supported. The intended type is indicated by the `type` attribute on the `ptr` element: +Two types of inclusions are supported. The intended type is indicated by the `type` attribute on the `j:transclude` element: * `inline`: The text is to be included in place. Any XML hierarchy (including paragraphs, line groups, etc) within the included text are excluded. * `external`: The text and its XML hierarchy are to be included in place. diff --git a/schema/jlptei.odd.xml b/schema/jlptei.odd.xml index 3ce0112..a5f301b 100644 --- a/schema/jlptei.odd.xml +++ b/schema/jlptei.odd.xml @@ -200,12 +200,28 @@ Characterizes the element in some sense, using any convenient classification scheme or typology. - - - + + + + + + + + + Indicates that the enclosed text is a divine name (e.g., the Tetragrammaton or other epithet) for special handling by downstream processors. + + + + + + + + + + From d2505afff23d8db7ea7cb1b2730f42fa56c06658 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Wed, 13 May 2026 23:36:36 -0700 Subject: [PATCH 08/13] fix duplicate notes bug --- opensiddur/exporter/tex/reledmac.xslt | 85 ++++++++++++++++++--------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/opensiddur/exporter/tex/reledmac.xslt b/opensiddur/exporter/tex/reledmac.xslt index 19080ee..cd54637 100644 --- a/opensiddur/exporter/tex/reledmac.xslt +++ b/opensiddur/exporter/tex/reledmac.xslt @@ -10,8 +10,11 @@ nodes (text, milestones, and non-block inline elements) and then walked with `xsl:iterate` to emit `\pstart`/`\pend` pairs per verse. 
Chapter milestones break out of the current `\pstart` to emit `\eledsection{N}` headings between - verses. Notes become reledmac apparatus footnotes (`\Bfootnote` for editorial - notes). Instructional notes are rendered inline via a dedicated macro so they + verses. Editorial notes in the body become reledmac apparatus footnotes (`\Bfootnote` for editorial + notes) with interlinear serial marks (`\OSInterlinearNotemark`) matching the + apparatus prefix (`\OSFootnotemark`). The compiler materializes stand-off notes + into the body; this stylesheet does not resolve `tei:standOff` or `tei:anchor` + targets into apparatus. Instructional notes are rendered inline via a dedicated macro so they can be styled independently without entering the apparatus. Parallel mode wraps two such streams in `\begin{pages}` / `\Pages` (facing @@ -33,12 +36,6 @@ - - - @@ -91,6 +88,9 @@ } } } + + \let\hebrewfontsf\hebrewfont \newcommand{\instructionnote}[1]{{\bfseries #1}} \newcommand{\notenote}[1]{{\bfseries #1}} + + \newcommand{\OSInterlinearNotemark}[1]{% + \leavevmode\hbox to 0pt{\hss{\textdir TLT\raisebox{1.1ex}{{\selectlanguage{english}\kern0.05em\normalfont\scriptsize\sffamily #1\kern0.05em}}}\hss}% + } + \newcommand{\OSFootnotemark}[1]{% + {\textdir TLT\selectlanguage{english}\scriptsize\sffamily #1}\space + } + + \Xnonumber[B] + \Xnolemmaseparator[B] + \Xinplaceofnumber[B]{0pt} - \linenummarginColumns{right} + + \linenummarginColumns{left} \linenummarginColumnsR{right} \setlength{\Lcolwidth}{0.43\textwidth} \setlength{\Rcolwidth}{0.43\textwidth} + + \let\OSreledparColumnsOrig\Columns + \renewcommand{\Columns}{\begingroup\pardir TLT\relax\textdir TLT\relax\OSreledparColumnsOrig\endgroup} \makeatletter @@ -709,12 +724,7 @@ \leavevmode\\ - - - - - + @@ -815,17 +825,29 @@ } - + + + + + + + + + - \leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{ + \leavevmode{\OSRTLfalse\edtext{\OSInterlinearNotemark{ + + }}{\Bfootnote{\OSFootnotemark{ + + }\notenote{ }}}} @@ -862,6 +884,11 
@@ TeX escaping helpers ==================================================================== --> + + + + + \newcommand{\OSInterlinearNotemark}[1]{% - \leavevmode\hbox to 0pt{\hss{\textdir TLT\raisebox{1.1ex}{{\selectlanguage{english}\kern0.05em\normalfont\scriptsize\sffamily #1\kern0.05em}}}\hss}% + \leavevmode\hbox to 0pt{\hss{\textdir TLT\raisebox{1.5ex}{{\selectlanguage{english}\kern0.05em\normalfont\scriptsize\sffamily #1\kern0.05em}}}\hss}% } \newcommand{\OSFootnotemark}[1]{% {\textdir TLT\selectlanguage{english}\scriptsize\sffamily #1}\space } - + \Xnonumber[B] \Xnolemmaseparator[B] \Xinplaceofnumber[B]{0pt} @@ -175,6 +175,10 @@ \makeatletter + + \Xwraplemma[B]{\@gobble} \newcommand*{\OSRTLfalse}{\@RTLfalse} From 2c921abc1f22d29006f48d0b6839488face387c7 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Sat, 23 May 2026 23:17:02 -0700 Subject: [PATCH 10/13] remove some overshoots by ai --- opensiddur/common/xslt.py | 16 ++- opensiddur/exporter/compiler.py | 21 ++-- .../tests/exporter/test_reledmac_xslt.py | 112 ++++++++++++++---- 3 files changed, 112 insertions(+), 37 deletions(-) diff --git a/opensiddur/common/xslt.py b/opensiddur/common/xslt.py index 9d8a3a8..cde3dd6 100644 --- a/opensiddur/common/xslt.py +++ b/opensiddur/common/xslt.py @@ -64,15 +64,19 @@ def xslt_transform_string( def xslt_transform( xslt_file: Path, - input_file: Path, - output_file: Optional[Path] = None): - + input_file: Path, + output_file: Optional[Path] = None, + *, + xslt_params: Optional[dict[str, Any]] = None, +): try: # Read the input XML with open(input_file, 'r', encoding='utf-8') as input_fd: - input_xml = input_fd.read() - - result = xslt_transform_string(xslt_file, input_xml) + input_xml = input_fd.read() + + result = xslt_transform_string( + xslt_file, input_xml, xslt_params=xslt_params + ) if output_file: with open(output_file, 'w', encoding='utf-8') as output_fd: diff --git a/opensiddur/exporter/compiler.py b/opensiddur/exporter/compiler.py index f0ddbcf..f602ea2 
100644 --- a/opensiddur/exporter/compiler.py +++ b/opensiddur/exporter/compiler.py @@ -323,8 +323,9 @@ def _annotate(self, element: ElementBase, root: Optional[ElementBase] = None) -> will continue to use this instruction as-is. 2. If the element has standoff annotation (a commentary or editorial note), - we need to determine which project's commentary set should be used to provide the corresponding commentary. - All selected commentaries should be loaded and returned. + we need to determine which project's commentary set should be used to provide + the corresponding commentary. All selected commentaries should be loaded and + returned. Args: element: The element to annotate @@ -403,12 +404,13 @@ def _annotate(self, element: ElementBase, root: Optional[ElementBase] = None) -> # For commentary/editorial notes, select all annotations for corresp or xml_id # May be standoff annotation, or inline. references = self._refdb.get_references_to(corresp, xml_id, project, file_name) - note_references = [r for r in references + note_references = [r for r in references if r.element_tag =="{http://www.tei-c.org/ns/1.0}note"] - limited_references = self._urn_resolver.prioritize_range(note_references, self.linear_data.annotation_projects, return_all=True) - + limited_references = self._urn_resolver.prioritize_range( + note_references, self.linear_data.annotation_projects, return_all=True) + result_elements = [] - if limited_references: # Handle case where prioritize_range returns None + if limited_references: for reference in limited_references: processor = CompilerProcessor( reference.project, @@ -423,22 +425,19 @@ def _annotate(self, element: ElementBase, root: Optional[ElementBase] = None) -> processed_element = processor.process(reference_element) if not(reference.project == self.project and reference.file_name == self.file_name): self._mark_file_source(processed_element, project=reference.project, file_name=reference.file_name) - - # Check if language differs and add xml:lang 
if needed + annotation_lang = processor.root_language insertion_context_lang = self._get_in_scope_language(element) if annotation_lang and annotation_lang != insertion_context_lang: processed_element.set('{http://www.w3.org/XML/1998/namespace}lang', annotation_lang) - + result_elements.append(processed_element) if result_elements: annotation_command = _AnnotationCommand.INSERT else: annotation_command = _AnnotationCommand.NONE return result_elements, annotation_command - - @staticmethod def _insert_first_element(element: ElementBase, new_child: ElementBase) -> ElementBase: """ diff --git a/opensiddur/tests/exporter/test_reledmac_xslt.py b/opensiddur/tests/exporter/test_reledmac_xslt.py index 49bdb83..e67d5fe 100644 --- a/opensiddur/tests/exporter/test_reledmac_xslt.py +++ b/opensiddur/tests/exporter/test_reledmac_xslt.py @@ -53,6 +53,8 @@ def test_preamble_loads_reledmac_and_polyglossia(self): self.assertIn(r"\documentclass", out) self.assertIn(r"\usepackage{polyglossia}", out) self.assertIn(r"\usepackage{reledmac}", out) + self.assertIn(r"\Xnonumber[B]", out) + self.assertIn(r"\newcommand{\OSInterlinearNotemark}", out) # No parallel content → no reledpar package. self.assertNotIn(r"\usepackage{reledpar}", out) self.assertIn(r"\setotherlanguage{hebrew}", out) @@ -230,6 +232,19 @@ def test_pairs_layout_uses_columns_typesetter(self): self.assertNotIn(r"\begin{pages}", out) self.assertNotIn(r"\Pages", out) + def test_pairs_layout_line_numbers_on_outer_column_margins(self): + """Leftside is the physical left column; Rightside is the physical right column. 
+ Both margins {right} regresses Hebrew numbers into the gutter.""" + out = _transform(self.XML, layout="pairs") + self.assertIn(r"\linenummarginColumns{left}", out) + self.assertIn(r"\linenummarginColumnsR{right}", out) + + def test_pairs_layout_forces_ltr_for_columns_assembly(self): + r"""Avoid RTL \pardir flipping the visual order of the two-column row.""" + out = _transform(self.XML, layout="pairs") + self.assertIn(r"\let\OSreledparColumnsOrig\Columns", out) + self.assertIn(r"\pardir TLT\relax\textdir TLT\relax\OSreledparColumnsOrig", out) + def test_each_side_has_its_own_numbering(self): out = _transform(self.XML) # One \beginnumbering per side, one \endnumbering per side. @@ -321,9 +336,10 @@ def test_parallel_row_after_marker_reconstruct(self): class TestNotesMapping(unittest.TestCase): - """tei:note elements must become reledmac apparatus footnotes anchored - via \\edtext{...}{\\Bfootnote{...}}. Instructional notes are inline via - \\instructionnote{...}, not footnotes.""" + """Body ``tei:note`` elements (materialized by the compiler) become reledmac + apparatus footnotes via ``\\edtext{...}{\\Bfootnote{...}}``. ``tei:standOff`` + is not expanded at the TeX stage. Instructional notes use ``\\instructionnote``. + """ def test_default_note_is_b_series_apparatus(self): xml = """ @@ -333,9 +349,9 @@ def test_default_note_is_b_series_apparatus(self): """ out = _transform(xml) - # \edtext{}{\Bfootnote{}} is the proper reledmac idiom for apparatus notes: - # zero-width lemma + B-series footnote at page bottom (not an endnote after \pend). - self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{", out) + # \edtext{\OSInterlinearNotemark}{... \Bfootnote{\OSFootnotemark ...}}: interlinear + # serial mark + B-series footnote at page bottom (not an endnote after \pend). 
+ self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\OSInterlinearNotemark{1}}{\Bfootnote{\OSFootnotemark{1}\notenote{", out) self.assertIn("commentary", out) self.assertNotIn(r"\footnote{", out) @@ -350,9 +366,24 @@ def test_instruction_note_is_inline(self): self.assertIn(r"\instructionnote{", out) self.assertIn("stand", out) - def test_standoff_note_appears_at_anchor_position(self): - """Notes stored in tei:standOff must be emitted as B-series apparatus - footnotes at the tei:anchor position in the body, not silently dropped.""" + def test_body_editorial_note_emits_apparatus(self): + """Compiler inlines editorial tei:note in the body; XSLT maps it to B-series.""" + xml = """ + + + + + Hebrew textEnglish annotation more + + + """ + out = _transform(xml) + self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\OSInterlinearNotemark{1}}{\Bfootnote{\OSFootnotemark{1}\notenote{", out) + self.assertIn("English annotation", out) + self.assertIn(r"{{\textdir TLT\selectlanguage{english}", out) + + def test_standoff_not_resolved_at_tex_stage(self): + """tei:standOff is not expanded here; only body notes become apparatus.""" xml = """ @@ -362,32 +393,73 @@ def test_standoff_note_appears_at_anchor_position(self): - English annotation + StandOff only. """ out = _transform(xml) - self.assertIn(r"\leavevmode{\OSRTLfalse\edtext{\mbox{}}{\Bfootnote{\notenote{", out) - self.assertIn("English annotation", out) - # English note inside Hebrew stream must force LTR direction. 
- self.assertIn(r"{{\textdir TLT\selectlanguage{english}", out) + self.assertNotIn("StandOff only.", out) + self.assertNotIn(r"\leavevmode{\OSRTLfalse\edtext", out) - def test_standoff_note_with_multiple_targets(self): - """A note targeting multiple anchors must appear at each anchor site.""" + def test_inline_note_with_standoff_duplicate_emits_once(self): + """Body note plus matching tei:standOff (compiler leaves both) must not double TeX.""" xml = """ - Word1 Word2 + HebrewTranscription uncertain.after - Shared note + Transcription uncertain. """ out = _transform(xml) - # The same note appears twice — once per anchor. - self.assertEqual(out.count("Shared note"), 2) + self.assertEqual(out.count("Transcription uncertain."), 1) + self.assertEqual(out.count(r"\leavevmode{\OSRTLfalse\edtext{\OSInterlinearNotemark{1}"), 1) + + def test_two_body_editorial_notes_distinct_serials(self): + xml = """ + + + + + AFirst BSecond + + + """ + out = _transform(xml) + self.assertIn(r"\OSInterlinearNotemark{1}", out) + self.assertIn(r"\OSInterlinearNotemark{2}", out) + self.assertEqual(out.count("First"), 1) + self.assertEqual(out.count("Second"), 1) + + def test_parallel_body_note_per_column(self): + """Each parallel stream numbers its own editorial notes from 1.""" + xml = """ + + + + + אHeb note + + + AEng note + + + + """ + out = _transform(xml) + left = re.search(r"\\begin\{Leftside\}(.*?)\\end\{Leftside\}", out, re.DOTALL) + right = re.search(r"\\begin\{Rightside\}(.*?)\\end\{Rightside\}", out, re.DOTALL) + self.assertIsNotNone(left) + self.assertIsNotNone(right) + self.assertIn(r"\OSInterlinearNotemark{1}", left.group(1)) + self.assertIn("Heb note", left.group(1)) + # Editorial serials count preceding notes in document order (Leftside before Rightside). + self.assertIn(r"\OSInterlinearNotemark{2}", right.group(1)) + self.assertIn("Eng note", right.group(1)) def test_note_language_forces_direction(self): """Notes must force their own direction based on the in-scope xml:lang. 
From e47353aec65c817b611baee2072faed6568835df Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Sat, 23 May 2026 23:28:02 -0700 Subject: [PATCH 11/13] derive sources from another repo --- opensiddur/importer/agent/common.py | 4 +- .../importer/agent/text_encoding_agent.py | 17 +++- .../importer/jps1917/convert_wikisource.py | 80 ++++++++++++---- .../importer/jps1917/template_finder.py | 44 ++++++--- opensiddur/importer/jps1917/wikisource.py | 95 +++++++++++++----- opensiddur/importer/util/constants.py | 3 - opensiddur/importer/util/pages.py | 45 +++++++-- opensiddur/importer/wlc/download_tanach.py | 96 ++++++++++++------- opensiddur/importer/wlc/transform_index.xslt | 6 +- opensiddur/importer/wlc/wlc.py | 94 +++++++++++++----- .../jps1917/test_convert_wikisource.py | 37 +++---- opensiddur/tests/importer/util/test_pages.py | 4 +- .../importer/wlc/test_transform_index.py | 8 +- opensiddur/tests/importer/wlc/test_wlc.py | 72 +++++++++----- 14 files changed, 430 insertions(+), 175 deletions(-) diff --git a/opensiddur/importer/agent/common.py b/opensiddur/importer/agent/common.py index 9bb9250..1d3a5bb 100644 --- a/opensiddur/importer/agent/common.py +++ b/opensiddur/importer/agent/common.py @@ -1,10 +1,8 @@ from pydantic import BaseModel, Field from pathlib import Path + BASE_PATH = Path(__file__).absolute().parent.parent.parent.parent -DATA_PATH = BASE_PATH / "sources" / "jps1917" -TEXT_PATH = DATA_PATH / "text" -CREDITS_PATH = DATA_PATH / "credits" SCHEMA_PATH = BASE_PATH / "schema" SCHEMA_DOCUMENTATION_PATH = SCHEMA_PATH / "JLPTEI-3.md" SCHEMA_ODD_PATH = SCHEMA_PATH / "jlptei.odd.xml" diff --git a/opensiddur/importer/agent/text_encoding_agent.py b/opensiddur/importer/agent/text_encoding_agent.py index 64d6c9c..7e52b9d 100644 --- a/opensiddur/importer/agent/text_encoding_agent.py +++ b/opensiddur/importer/agent/text_encoding_agent.py @@ -1,4 +1,5 @@ from typing import TypedDict, Annotated, Literal, Optional +from pathlib import Path from langgraph.graph import 
StateGraph, END from langgraph.checkpoint.memory import MemorySaver from pydantic import BaseModel, Field @@ -55,6 +56,9 @@ class TextEncodingAgentState(TypedDict): session_id: Optional[str] last_checkpoint_time: Optional[str] + # JPS 1917 page files live under /jps1917 (None = default /sources) + sourcetexts_root: Optional[str] + class TextEncodingAgentInput(BaseModel): """Input for the text encoding agent""" @@ -67,6 +71,10 @@ class TextEncodingAgentInput(BaseModel): max_errors: int = Field(default=5, description="Maximum number of error correction attempts") session_id: Optional[str] = Field(default=None, description="Session ID to resume from") enable_checkpointing: bool = Field(default=True, description="Enable checkpointing for this session") + sourcetexts_root: Optional[Path] = Field( + default=None, + description="Root of sourcetexts repo; Wikisource page dumps under /jps1917 (default: /sources).", + ) def create_session_id(input_data: TextEncodingAgentInput) -> str: @@ -275,7 +283,7 @@ def advance_page(state: TextEncodingAgentState) -> TextEncodingAgentState: print(f"Setting page to {state['current_page']}...") new_page = state['current_page'] previous_page_content = "" - current_page_obj = get_page(new_page) + current_page_obj = get_page(new_page, Path(state["sourcetexts_root"]) if state.get("sourcetexts_root") else None) current_page_content = current_page_obj.content if current_page_obj else "" else: @@ -287,7 +295,7 @@ def advance_page(state: TextEncodingAgentState) -> TextEncodingAgentState: current_page_content = state["next_page_content"] # Update next page content - next_page_obj = get_page(new_page + 1) + next_page_obj = get_page(new_page + 1, Path(state["sourcetexts_root"]) if state.get("sourcetexts_root") else None) next_page_content = next_page_obj.content if next_page_obj else "" return { @@ -486,7 +494,10 @@ def run_text_encoding_agent(input_data: TextEncodingAgentInput) -> TextEncodingA "error_count": 0, "max_errors": input_data.max_errors, 
"session_id": session_id, - "last_checkpoint_time": None + "last_checkpoint_time": None, + "sourcetexts_root": str(input_data.sourcetexts_root.resolve()) + if input_data.sourcetexts_root is not None + else None, } # Create and run the agent with checkpointing diff --git a/opensiddur/importer/jps1917/convert_wikisource.py b/opensiddur/importer/jps1917/convert_wikisource.py index 5a4c48a..b87aa5d 100644 --- a/opensiddur/importer/jps1917/convert_wikisource.py +++ b/opensiddur/importer/jps1917/convert_wikisource.py @@ -1,10 +1,15 @@ +import argparse from pathlib import Path from typing import Any, Optional import urllib from pydantic import BaseModel -from opensiddur.importer.util.pages import get_credits, get_page +from opensiddur.importer.util.pages import ( + default_sourcetexts_root, + get_credits, + get_page, +) from opensiddur.importer.jps1917.mediawiki_processor import create_processor from opensiddur.importer.util.prettify import prettify_xml from opensiddur.importer.util.validation import validate @@ -362,10 +367,12 @@ class Index(BaseModel): ), ] -def get_credits_pages(start_page: int, end_page: int) -> list[str]: +def get_credits_pages( + start_page: int, end_page: int, sourcetexts_root: Path | None = None +) -> list[str]: credits = set() for page in range(start_page, end_page + 1): - page_credits = get_credits(page) + page_credits = get_credits(page, sourcetexts_root) if page_credits is not None: credits.update(page_credits) return sorted(credits) @@ -472,9 +479,10 @@ def mediawiki_xml_to_tei(xml_content: str, } def process_mediawiki( - start_page: int, - end_page: int, + start_page: int, + end_page: int, wrapper_element: str, + sourcetexts_root: Path | None = None, **kwargs, ) -> str: mw_processor = create_processor() @@ -484,7 +492,12 @@ def process_mediawiki( content = "" for page in range(start_page, end_page + 1): print(f"Processing page {page}") - page_content = get_page(page).content + page_obj = get_page(page, sourcetexts_root) + if page_obj is 
None: + raise FileNotFoundError( + f"JPS 1917 page file missing for page {page} (check sourcetexts tree)" + ) + page_content = page_obj.content content += " " + mw_processor.process_wikitext(page_content).xml_content pre_xml = f""" @@ -506,17 +519,24 @@ def validate_and_write_tei_file(tei_content: str, file_name: str): with open(out_path, "w") as f: f.write(pretty_xml) -def book_file(book: Book) -> str: - transcription_credits = get_credits_pages(book.start_page, book.end_page) +def book_file(book: Book, sourcetexts_root: Path | None = None) -> str: + transcription_credits = get_credits_pages( + book.start_page, book.end_page, sourcetexts_root + ) header_content = header( book_name_he = book.book_name_he, book_name_en = book.book_name_en, transcription_credits = transcription_credits, ) - xml_dict = process_mediawiki(book.start_page, book.end_page, "body", + xml_dict = process_mediawiki( + book.start_page, + book.end_page, + "body", + sourcetexts_root=sourcetexts_root, wrapper_div_type="book", book_name=book.file_name, - is_section=book.is_section) + is_section=book.is_section, + ) tei_content = tei_file( header = header_content, @@ -529,9 +549,11 @@ def book_file(book: Book) -> str: return tei_content -def index_file(idx: Index) -> str: +def index_file(idx: Index, sourcetexts_root: Path | None = None) -> str: if idx.start_page is not None and idx.end_page is not None: - transcription_credits = get_credits_pages(idx.start_page, idx.end_page) + transcription_credits = get_credits_pages( + idx.start_page, idx.end_page, sourcetexts_root + ) else: transcription_credits = None header_content = header( @@ -542,9 +564,14 @@ def index_file(idx: Index) -> str: transcription_credits = transcription_credits, ) if idx.start_page is not None and idx.end_page is not None: - xml_dict = process_mediawiki(idx.start_page, idx.end_page, "front", + xml_dict = process_mediawiki( + idx.start_page, + idx.end_page, + "front", + sourcetexts_root=sourcetexts_root, wrapper_div_type="", - 
book_name="") + book_name="", + ) else: xml_dict = {} @@ -571,16 +598,31 @@ def index_file(idx: Index) -> str: for transclusion in idx.transclusions: if isinstance(transclusion, Index): - index_file(transclusion) + index_file(transclusion, sourcetexts_root) else: - book_file(transclusion) - + book_file(transclusion, sourcetexts_root) + return tei_content -def main(): # pragma: no cover + +def main(argv: list[str] | None = None) -> None: # pragma: no cover + parser = argparse.ArgumentParser( + description="Convert JPS 1917 Wikisource page dumps to JLPTEI under project/jps1917." + ) + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=default_sourcetexts_root(), + help=( + "Root of the opensiddur/sourcetexts repository; page text is read from " + "/jps1917 (default: /sources)." + ), + ) + args = parser.parse_args(argv) (PROJECT_DIRECTORY / "jps1917").mkdir(parents=True, exist_ok=True) for part in JPS_1917: - index_file(part) + index_file(part, args.sourcetexts_root) + if __name__ == "__main__": # pragma: no cover main() \ No newline at end of file diff --git a/opensiddur/importer/jps1917/template_finder.py b/opensiddur/importer/jps1917/template_finder.py index a6cff43..da3af2e 100644 --- a/opensiddur/importer/jps1917/template_finder.py +++ b/opensiddur/importer/jps1917/template_finder.py @@ -11,10 +11,14 @@ from pathlib import Path # Import the get_page function from the agent tools -from opensiddur.importer.util.pages import get_page +from opensiddur.importer.util.pages import default_sourcetexts_root, get_page -def find_all_tags(start_page: int = 1, end_page: Optional[int] = None) -> Dict[str, Dict]: +def find_all_tags( + start_page: int = 1, + end_page: Optional[int] = None, + sourcetexts_root: Optional[Path] = None, +) -> Dict[str, Dict]: """ Find all MediaWiki tags used across all pages. 
@@ -42,13 +46,13 @@ def find_all_tags(start_page: int = 1, end_page: Optional[int] = None) -> Dict[s # If end_page not specified, try to find the last page if end_page is None: - start_page, end_page = find_page_range() + start_page, end_page = find_page_range(sourcetexts_root) print(f"Scanning pages {start_page} to {end_page} for MediaWiki tags...") for page_num in range(start_page, end_page + 1): try: - page_obj = get_page(page_num) + page_obj = get_page(page_num, sourcetexts_root) if page_obj is None: print(f"Page {page_num} not found, stopping scan") break @@ -89,7 +93,11 @@ def find_all_tags(start_page: int = 1, end_page: Optional[int] = None) -> Dict[s } -def find_all_templates(start_page: int = 1, end_page: Optional[int] = None) -> Dict[str, Dict]: +def find_all_templates( + start_page: int = 1, + end_page: Optional[int] = None, + sourcetexts_root: Optional[Path] = None, +) -> Dict[str, Dict]: """ Find all MediaWiki templates used across all pages. @@ -117,13 +125,13 @@ def find_all_templates(start_page: int = 1, end_page: Optional[int] = None) -> D # If end_page not specified, try to find the last page if end_page is None: - start_page, end_page = find_page_range() + start_page, end_page = find_page_range(sourcetexts_root) print(f"Scanning pages {start_page} to {end_page} for MediaWiki templates...") for page_num in range(start_page, end_page + 1): try: - page_obj = get_page(page_num) + page_obj = get_page(page_num, sourcetexts_root) if page_obj is None: print(f"Page {page_num} not found, stopping scan") break @@ -264,7 +272,7 @@ def extract_templates_from_wikitext(wikitext: str) -> Dict[str, Dict]: return templates -def find_page_range() -> tuple[int, int]: +def find_page_range(sourcetexts_root: Optional[Path] = None) -> tuple[int, int]: """ Find the last available page by checking for consecutive missing pages. 
@@ -276,13 +284,13 @@ def find_page_range() -> tuple[int, int]: # Start from a reasonable point and work backwards for page_num in range(1200, 0, -1): - page_obj = get_page(page_num) + page_obj = get_page(page_num, sourcetexts_root) if page_obj is not None: last_page = page_num break for page_num in range(last_page, 0, -1): - page_obj = get_page(page_num) + page_obj = get_page(page_num, sourcetexts_root) if page_obj is None: first_page = page_num + 1 break @@ -407,6 +415,18 @@ def save_template_analysis(template_data: Dict, output_file: str = "template_ana if __name__ == "__main__": # pragma: no cover + import argparse + + parser = argparse.ArgumentParser(description="Analyze JPS 1917 Wikisource MediaWiki templates and tags.") + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=default_sourcetexts_root(), + help="Root of sourcetexts repo; JPS pages under /jps1917 (default: /sources).", + ) + args = parser.parse_args() + root = args.sourcetexts_root + # Example usage print("Starting MediaWiki template and tag analysis...") @@ -414,7 +434,7 @@ def save_template_analysis(template_data: Dict, output_file: str = "template_ana print("\n" + "="*50) print("ANALYZING TEMPLATES") print("="*50) - template_data = find_all_templates() + template_data = find_all_templates(sourcetexts_root=root) print_template_summary(template_data) save_template_analysis(template_data, "jps1917_template_analysis.json") @@ -422,7 +442,7 @@ def save_template_analysis(template_data: Dict, output_file: str = "template_ana print("\n" + "="*50) print("ANALYZING TAGS") print("="*50) - tag_data = find_all_tags() + tag_data = find_all_tags(sourcetexts_root=root) print_tag_summary(tag_data) save_tag_analysis(tag_data, "jps1917_tag_analysis.json") diff --git a/opensiddur/importer/jps1917/wikisource.py b/opensiddur/importer/jps1917/wikisource.py index 5060ed9..49a5825 100644 --- a/opensiddur/importer/jps1917/wikisource.py +++ b/opensiddur/importer/jps1917/wikisource.py @@ -1,59 +1,79 @@ 
+import argparse import random -import requests -import os +import sys import time import xml.etree.ElementTree as et from pathlib import Path -# constants -server="en.wikisource.org" -output_directory = Path(__file__).parent.parent.parent.parent / "sources" / "jps1917" +import requests + +from opensiddur.importer.util.pages import default_sourcetexts_root, jps1917_data_directory + +server = "en.wikisource.org" wiki_namespace = "Page" book_name = "JPS-1917-Universal.djvu" -start_page = 443 #7 -pages = range(start_page,1158+1) +start_page = 443 # 7 +pages = range(start_page, 1158 + 1) + +# Backward-compatible name for JPS tree under sourcetexts +jps1917_output_directory = jps1917_data_directory + +# Default layout (legacy): /sources/jps1917 +output_directory = jps1917_data_directory() + def wiki_url(book_name, page_num, action="raw", namespace=wiki_namespace): return f"/w/index.php?title={wiki_namespace}:{book_name}/{page_num}&action={action}" + def get_wiki_page(book_name, page_num, dry_run=True): path = "https://" + server + wiki_url(book_name, page_num) headers = { - 'User-Agent': 'OpenSiddur-AI/1.0 (https://github.com/opensiddur/opensiddur-ai; opensiddur@example.com)', - 'Accept-Encoding': 'gzip, deflate' + "User-Agent": "OpenSiddur-AI/1.0 (https://github.com/opensiddur/opensiddur-ai; opensiddur@example.com)", + "Accept-Encoding": "gzip, deflate", } if dry_run: print(f"Would retrieve text: {page_num} from {path}") else: - r = requests.get(path, headers=headers) + r = requests.get(path, headers=headers, timeout=60) if r.status_code >= 400: print(f"Error retrieving page {page_num}") else: return r.text + def get_wiki_contributors(book_name, page_num, dry_run=True): path = "https://" + server + wiki_url(book_name, page_num, action="history&feed=atom") headers = { - 'User-Agent': 'OpenSiddur-AI/1.0 (https://github.com/opensiddur/opensiddur-ai; opensiddur@example.com)', - 'Accept-Encoding': 'gzip, deflate' + "User-Agent": "OpenSiddur-AI/1.0 
(https://github.com/opensiddur/opensiddur-ai; opensiddur@example.com)", + "Accept-Encoding": "gzip, deflate", } if dry_run: print(f"Would retrieve history: {page_num} from {path}") else: - r = requests.get(path, headers=headers) + r = requests.get(path, headers=headers, timeout=60) if r.status_code >= 400: print(f"Error retrieving history {page_num}: {r.status_code} {r.text}") else: feed = et.XML(r.text) - return list(set([element.find("{http://www.w3.org/2005/Atom}name").text for element in feed.findall(".//{http://www.w3.org/2005/Atom}author")])) + return list( + set( + [ + element.find("{http://www.w3.org/2005/Atom}name").text + for element in feed.findall(".//{http://www.w3.org/2005/Atom}author") + ] + ) + ) + -def download_book(dry_run=True): +def download_book(dry_run: bool = True, sourcetexts_root: Path | None = None) -> None: + out = jps1917_output_directory(sourcetexts_root) digits = len(str(max(pages))) format_string = "%%0%dd" % digits - output_directory.mkdir(parents=True, exist_ok=True) - (output_directory / "text").mkdir(parents=True, exist_ok=True) - (output_directory / "credits").mkdir(parents=True, exist_ok=True) + out.mkdir(parents=True, exist_ok=True) + (out / "text").mkdir(parents=True, exist_ok=True) + (out / "credits").mkdir(parents=True, exist_ok=True) for page_num in pages: print("Page: %d" % page_num) @@ -71,18 +91,45 @@ def download_book(dry_run=True): raise time.sleep(5.0) output_filename = (format_string % page_num) + ".txt" - text_path = os.path.join(output_directory, "text", output_filename) - credits_path = os.path.join(output_directory, "credits", output_filename) + text_path = out / "text" / output_filename + credits_path = out / "credits" / output_filename if dry_run: print(f"{page_num}: {text_path=}, {credits_path=}") else: - with open(text_path, "w") as f: + with open(text_path, "w", encoding="utf-8") as f: f.write(wp) time.sleep(1.3 + random.random()) - with open(credits_path, "w") as f: + with open(credits_path, "w", 
encoding="utf-8") as f: f.write("\n".join(w for w in wc if w != "Wikisource-bot")) time.sleep(1.3 + random.random()) + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Download JPS 1917 Bible pages from English Wikisource into the sourcetexts tree." + ) + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=default_sourcetexts_root(), + help=( + "Root of the opensiddur/sourcetexts repository; page text is written under " + "/jps1917 (default: /sources for legacy sources/jps1917)." + ), + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print planned fetches and paths without writing files.", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _build_arg_parser().parse_args(argv) + download_book(dry_run=args.dry_run, sourcetexts_root=args.sourcetexts_root) + return 0 + + if __name__ == "__main__": - download_book(dry_run=False) - \ No newline at end of file + sys.exit(main()) diff --git a/opensiddur/importer/util/constants.py b/opensiddur/importer/util/constants.py index 4c20b57..1535b78 100644 --- a/opensiddur/importer/util/constants.py +++ b/opensiddur/importer/util/constants.py @@ -3,9 +3,6 @@ from pydantic import BaseModel, Field BASE_PATH = Path(__file__).absolute().parent.parent.parent.parent -DATA_PATH = BASE_PATH / "sources" / "jps1917" -TEXT_PATH = DATA_PATH / "text" -CREDITS_PATH = DATA_PATH / "credits" SCHEMA_PATH = BASE_PATH / "schema" SCHEMA_DOCUMENTATION_PATH = SCHEMA_PATH / "JLPTEI-3.md" SCHEMA_ODD_PATH = SCHEMA_PATH / "jlptei.odd.xml" diff --git a/opensiddur/importer/util/pages.py b/opensiddur/importer/util/pages.py index 69ac822..117d25a 100644 --- a/opensiddur/importer/util/pages.py +++ b/opensiddur/importer/util/pages.py @@ -1,24 +1,53 @@ +from pathlib import Path from typing import Optional -from opensiddur.importer.util.constants import CREDITS_PATH, TEXT_PATH, Page +from opensiddur.importer.util.constants import BASE_PATH, 
Page -def get_page(page_number: str | int) -> Optional[Page]: - """Return the wikitext of the given Page, or None if it does not exist""" +def default_sourcetexts_root() -> Path: + """Default opensiddur/sourcetexts checkout root (legacy layout: /sources).""" + return BASE_PATH / "sources" + + +def jps1917_data_directory(sourcetexts_root: Path | None = None) -> Path: + """JPS 1917 raw dumps: /jps1917.""" + root = ( + sourcetexts_root.resolve() + if sourcetexts_root is not None + else default_sourcetexts_root() + ) + return root / "jps1917" + + +def jps1917_text_directory(sourcetexts_root: Path | None = None) -> Path: + """Directory of per-page .txt wikitext files.""" + return jps1917_data_directory(sourcetexts_root) / "text" + + +def jps1917_credits_directory(sourcetexts_root: Path | None = None) -> Path: + """Directory of per-page contributor credit files.""" + return jps1917_data_directory(sourcetexts_root) / "credits" + + +def get_page(page_number: str | int, sourcetexts_root: Path | None = None) -> Optional[Page]: + """Return the wikitext of the given Page, or None if it does not exist.""" page_num = int(page_number) page_file_name = f"{page_num:04d}.txt" + path = jps1917_text_directory(sourcetexts_root) / page_file_name try: - with open(TEXT_PATH / page_file_name, "r") as f: + with open(path, "r", encoding="utf-8") as f: return Page.model_validate(dict(number=page_num, content=f.read())) except FileNotFoundError: return None -def get_credits(page_number: str | int) -> Optional[list[str]]: - """ Return the credits of the given Page, or None if it does not exist """ + +def get_credits(page_number: str | int, sourcetexts_root: Path | None = None) -> Optional[list[str]]: + """Return the credits of the given Page, or None if it does not exist.""" page_num = int(page_number) page_file_name = f"{page_num:04d}.txt" + path = jps1917_credits_directory(sourcetexts_root) / page_file_name try: - with open(CREDITS_PATH / page_file_name, "r") as f: + with open(path, "r", 
encoding="utf-8") as f: return [line.strip() for line in f.read().split("\n") if line.strip()] except FileNotFoundError: - return None \ No newline at end of file + return None diff --git a/opensiddur/importer/wlc/download_tanach.py b/opensiddur/importer/wlc/download_tanach.py index 04e3c5d..f3f74e6 100644 --- a/opensiddur/importer/wlc/download_tanach.py +++ b/opensiddur/importer/wlc/download_tanach.py @@ -1,48 +1,80 @@ -import os -import requests +import argparse +import logging +import sys from pathlib import Path from zipfile import ZipFile -import logging -# Set up logging +import requests + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -def download_and_unzip_tanach(): - """Download and unzip the latest Tanach XML file from tanach.us.""" - # URL of the Tanach XML zip file - url = "https://tanach.us/Books/Tanach.xml.zip" - - # Create target directory if it doesn't exist - target_dir = Path(__file__).parent.parent / "sources/wlc" + +def _repo_root() -> Path: + return Path(__file__).resolve().parent.parent.parent.parent + + +def _default_sourcetexts_root() -> Path: + return _repo_root() / "sources" + + +def download_and_unzip_tanach(sourcetexts_root: Path | None = None) -> None: + """Download and unzip the latest Tanach XML file from tanach.us into /wlc.""" + root = ( + sourcetexts_root.resolve() + if sourcetexts_root is not None + else _default_sourcetexts_root() + ) + target_dir = root / "wlc" target_dir.mkdir(parents=True, exist_ok=True) - - # Download the zip file - logger.info(f"Downloading {url}...") - response = requests.get(url) - response.raise_for_status() # Raise an exception for bad status codes - - # Save the zip file + + url = "https://tanach.us/Books/Tanach.xml.zip" + + logger.info("Downloading %s...", url) + response = requests.get(url, timeout=120) + response.raise_for_status() + zip_path = target_dir / "Tanach.xml.zip" - with open(zip_path, 'wb') as f: + with open(zip_path, "wb") as f: f.write(response.content) - - 
logger.info(f"Downloaded file saved to {zip_path}") - - # Unzip the file - logger.info(f"Unzipping {zip_path}...") - with ZipFile(zip_path, 'r') as zip_ref: + + logger.info("Downloaded file saved to %s", zip_path) + + logger.info("Unzipping %s...", zip_path) + with ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(target_dir) - - logger.info(f"Successfully extracted files to {target_dir}") - - # Clean up the zip file + + logger.info("Successfully extracted files to %s", target_dir) + zip_path.unlink() - logger.info(f"Removed temporary zip file") + logger.info("Removed temporary zip file") + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Download WLC Tanach XML from tanach.us into the sourcetexts tree." + ) + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=_default_sourcetexts_root(), + help=( + "Root of the opensiddur/sourcetexts repository; files are written under " + "/wlc (default: /sources so output matches legacy sources/wlc)." 
+ ), + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _build_arg_parser().parse_args(argv) + download_and_unzip_tanach(args.sourcetexts_root) + return 0 + if __name__ == "__main__": try: - download_and_unzip_tanach() + sys.exit(main()) except Exception as e: - logger.error(f"Error downloading/unzipping Tanach: {str(e)}") + logger.error("Error downloading/unzipping Tanach: %s", e) raise diff --git a/opensiddur/importer/wlc/transform_index.xslt b/opensiddur/importer/wlc/transform_index.xslt index 26ec6f3..6294c86 100644 --- a/opensiddur/importer/wlc/transform_index.xslt +++ b/opensiddur/importer/wlc/transform_index.xslt @@ -9,6 +9,10 @@ exclude-result-prefixes="xs"> + + + + @@ -38,7 +42,7 @@ - + diff --git a/opensiddur/importer/wlc/wlc.py b/opensiddur/importer/wlc/wlc.py index 515aa70..bc69f1b 100644 --- a/opensiddur/importer/wlc/wlc.py +++ b/opensiddur/importer/wlc/wlc.py @@ -1,41 +1,88 @@ -import sys +import argparse import os +import sys from pathlib import Path -# Import the XMLTransformer from its new location from opensiddur.common.xslt import xslt_transform from opensiddur.importer.util.validation import validate -# Define TEI namespace with 'tei' prefix -TEI_NS = "http://www.tei-c.org/ns/1.0" + +def _repo_root() -> Path: + return Path(__file__).resolve().parent.parent.parent.parent -def make_project_directory() -> Path: - """ Make the project/wlc directory if it doesn't exist """ - default_directory = Path(__file__).parent.parent.parent.parent / "project/wlc" - default_directory.mkdir(parents=True, exist_ok=True) - return default_directory +def make_project_directory(project_dir: Path | None = None) -> Path: + """Create the WLC project directory if missing; return its path.""" + directory = ( + project_dir.resolve() if project_dir is not None else _repo_root() / "project" / "wlc" + ) + directory.mkdir(parents=True, exist_ok=True) + return directory + + +def get_source_directory(sourcetexts_root: Path | None = None) -> Path: + 
"""Directory containing WLC Books/ (i.e. /wlc).""" + root = ( + sourcetexts_root.resolve() + if sourcetexts_root is not None + else _repo_root() / "sources" + ) + return root / "wlc" -def get_source_directory() -> Path: - return Path(__file__).parent.parent.parent.parent / "sources/wlc" def get_xslt_directory() -> Path: - return Path(__file__).parent + return Path(__file__).resolve().parent + -def main(): - project_directory = make_project_directory() - source_directory = get_source_directory() +def _wlc_directory_uri(wlc_directory: Path) -> str: + """File URI of the WLC tree root, with trailing slash, for XSLT resolve-uri base.""" + u = wlc_directory.resolve().as_uri() + return u if u.endswith("/") else u + "/" + + +def _build_arg_parser() -> argparse.ArgumentParser: + repo = _repo_root() + parser = argparse.ArgumentParser( + description="Transform WLC UXLC XML from sourcetexts into JLPTEI project files." + ) + parser.add_argument( + "--project-dir", + type=Path, + default=repo / "project" / "wlc", + help="Output directory for generated JLPTEI (default: /project/wlc).", + ) + parser.add_argument( + "--sourcetexts-root", + type=Path, + default=repo / "sources", + help=( + "Root of the opensiddur/sourcetexts repository; WLC files are read from " + "/wlc/Books (default: /sources so legacy layout stays /sources/wlc)." 
+ ), + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _build_arg_parser().parse_args(argv) + project_directory = make_project_directory(args.project_dir) + wlc_directory = get_source_directory(args.sourcetexts_root) xslt_directory = get_xslt_directory() - xslt_transform(xslt_directory / "transform_index.xslt", - source_directory / "Books" / "TanachHeader.xml", - project_directory / "index.xml") - for book in os.listdir(source_directory / "Books"): + xslt_transform( + xslt_directory / "transform_index.xslt", + wlc_directory / "Books" / "TanachHeader.xml", + project_directory / "index.xml", + xslt_params={"wlc-root-uri": _wlc_directory_uri(wlc_directory)}, + ) + for book in os.listdir(wlc_directory / "Books"): if book not in ["TanachHeader.xml", "TanachIndex.xml"] and not book.endswith(".DH.xml"): print(f"Transforming {book}") - xslt_transform(xslt_directory / "transform_book.xslt", - source_directory / "Books" / book, - project_directory / book.lower()) + xslt_transform( + xslt_directory / "transform_book.xslt", + wlc_directory / "Books" / book, + project_directory / book.lower(), + ) for book in os.listdir(project_directory): if book.endswith(".xml"): @@ -45,5 +92,6 @@ def main(): print(f"Errors in {book}: {errors}") return 0 + if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/opensiddur/tests/importer/jps1917/test_convert_wikisource.py b/opensiddur/tests/importer/jps1917/test_convert_wikisource.py index 58acc81..7b93b62 100644 --- a/opensiddur/tests/importer/jps1917/test_convert_wikisource.py +++ b/opensiddur/tests/importer/jps1917/test_convert_wikisource.py @@ -31,9 +31,9 @@ def test_normal_case(self, mock_get_credits): # Verify get_credits was called for each page self.assertEqual(mock_get_credits.call_count, 3) - mock_get_credits.assert_any_call(1) - mock_get_credits.assert_any_call(2) - mock_get_credits.assert_any_call(3) + mock_get_credits.assert_any_call(1, None) + 
mock_get_credits.assert_any_call(2, None) + mock_get_credits.assert_any_call(3, None) @patch('opensiddur.importer.jps1917.convert_wikisource.get_credits') def test_repeated_strings(self, mock_get_credits): @@ -104,7 +104,7 @@ def test_single_page(self, mock_get_credits): # Should return sorted credits from single page self.assertEqual(result, ["Alice", "Bob"]) self.assertEqual(mock_get_credits.call_count, 1) - mock_get_credits.assert_called_once_with(5) + mock_get_credits.assert_called_once_with(5, None) @patch('opensiddur.importer.jps1917.convert_wikisource.get_credits') def test_sorting(self, mock_get_credits): @@ -657,7 +657,7 @@ def test_process_mediawiki_single_page(self, mock_get_page, mock_create_processo result = process_mediawiki(1, 1, "body", book_name="test_book") # Verify get_page was called with correct page number - mock_get_page.assert_called_once_with(1) + mock_get_page.assert_called_once_with(1, None) # Verify processor was created and used mock_create_processor.assert_called_once() @@ -719,9 +719,9 @@ def test_process_mediawiki_multiple_pages(self, mock_get_page, mock_create_proce # Verify get_page was called for each page self.assertEqual(mock_get_page.call_count, 3) - mock_get_page.assert_any_call(1) - mock_get_page.assert_any_call(2) - mock_get_page.assert_any_call(3) + mock_get_page.assert_any_call(1, None) + mock_get_page.assert_any_call(2, None) + mock_get_page.assert_any_call(3, None) # Verify processor was used for each page self.assertEqual(mock_processor.process_wikitext.call_count, 3) @@ -1133,7 +1133,7 @@ def test_book_file_basic_flow(self, mock_file, mock_get_credits, mock_header, result = book_file(self.test_book) # Verify get_credits_pages was called with correct page range - mock_get_credits.assert_called_once_with(1, 5) + mock_get_credits.assert_called_once_with(1, 5, None) # Verify header was called with correct parameters mock_header.assert_called_once_with( @@ -1145,9 +1145,10 @@ def test_book_file_basic_flow(self, mock_file, 
mock_get_credits, mock_header, # Verify process_mediawiki was called with correct parameters mock_process_mediawiki.assert_called_once_with( 1, 5, "body", + sourcetexts_root=None, wrapper_div_type="book", book_name="genesis", - is_section=False + is_section=False, ) # Verify tei_file was called with correct parameters @@ -1203,9 +1204,10 @@ def test_book_file_with_section(self, mock_file, mock_get_credits, mock_header, # Verify process_mediawiki was called with is_section=True mock_process_mediawiki.assert_called_once_with( 10, 15, "body", + sourcetexts_root=None, wrapper_div_type="book", book_name="genesis_ch1", - is_section=True + is_section=True, ) # Verify return value @@ -1305,7 +1307,7 @@ def test_index_file_with_pages_and_transclusions(self, mock_file, mock_get_credi result = index_file(test_index) # Verify get_credits_pages was called for the index - mock_get_credits.assert_any_call(1, 3) + mock_get_credits.assert_any_call(1, 3, None) # Verify header was called with correct parameters for the index mock_header.assert_any_call( @@ -1319,8 +1321,9 @@ def test_index_file_with_pages_and_transclusions(self, mock_file, mock_get_credi # Verify process_mediawiki was called for front matter mock_process_mediawiki.assert_called_once_with( 1, 3, "front", + sourcetexts_root=None, wrapper_div_type="", - book_name="" + book_name="", ) # Verify tei_file was called with correct body containing transclusions @@ -1433,8 +1436,8 @@ def test_index_file_recursive_processing(self, mock_file, mock_get_credits, mock # Verify that book_file was called for each Book transclusion self.assertEqual(mock_book_file.call_count, 2) - mock_book_file.assert_any_call(self.test_book1) - mock_book_file.assert_any_call(self.test_book2) + mock_book_file.assert_any_call(self.test_book1, None) + mock_book_file.assert_any_call(self.test_book2, None) # Verify index_file was not called recursively (no Index transclusions) mock_index_file.assert_not_called() @@ -1476,10 +1479,10 @@ def 
test_index_file_with_nested_index(self, mock_file, mock_header, mock_tei_fil result = index_file(parent_index) # Verify that book_file was called for Book transclusion - mock_book_file.assert_called_once_with(self.test_book2) + mock_book_file.assert_called_once_with(self.test_book2, None) # Verify that index_file was called recursively for Index transclusion - mock_index_file.assert_called_once_with(child_index) + mock_index_file.assert_called_once_with(child_index, None) # Verify return value self.assertEqual(result, "Parent TEI") diff --git a/opensiddur/tests/importer/util/test_pages.py b/opensiddur/tests/importer/util/test_pages.py index f414eb9..e0a48e2 100644 --- a/opensiddur/tests/importer/util/test_pages.py +++ b/opensiddur/tests/importer/util/test_pages.py @@ -2,7 +2,7 @@ from unittest.mock import patch, mock_open, MagicMock from pathlib import Path -from opensiddur.importer.util.pages import get_page, get_credits +from opensiddur.importer.util.pages import get_page, get_credits, jps1917_text_directory from opensiddur.importer.util.constants import Page @@ -20,7 +20,7 @@ def test_get_page_success_with_integer(self, mock_file): self.assertEqual(result.content, 'Page content here') # Should open the correct file - expected_path = Path(__file__).parent.parent.parent.parent.parent / "sources/jps1917/text/0025.txt" + expected_path = jps1917_text_directory() / "0025.txt" mock_file.assert_called_once() # Check that the path used ends with the expected filename actual_call = str(mock_file.call_args[0][0]) diff --git a/opensiddur/tests/importer/wlc/test_transform_index.py b/opensiddur/tests/importer/wlc/test_transform_index.py index 1433d71..83c1426 100644 --- a/opensiddur/tests/importer/wlc/test_transform_index.py +++ b/opensiddur/tests/importer/wlc/test_transform_index.py @@ -19,12 +19,10 @@ def setUp(self): self.xslt_source = Path(__file__).parent.parent.parent.parent / "importer/wlc/transform_index.xslt" xslt_content = self.xslt_source.read_text() - # Replace 
the doc() call by matching on the key part - # Original line 37 contains: doc('../../../sources/wlc/Books/TanachIndex.xml')//book - # We'll replace just the doc() function call to avoid it being evaluated + # Neuter TanachIndex lookup so tests do not need TanachIndex.xml on disk xslt_content = xslt_content.replace( - "doc('../../../sources/wlc/Books/TanachIndex.xml')//book", - "(.)[false()]" # XPath that returns empty sequence + "select=\"if (normalize-space($wlc-root-uri)) then doc(resolve-uri('Books/TanachIndex.xml', $wlc-root-uri))//book else doc(resolve-uri('../../../sources/wlc/Books/TanachIndex.xml', static-base-uri()))//book\"", + "select=\"()\"", ) # Also change mode from "fail" to "shallow-copy" for testing diff --git a/opensiddur/tests/importer/wlc/test_wlc.py b/opensiddur/tests/importer/wlc/test_wlc.py index 1b2682c..0ea552c 100644 --- a/opensiddur/tests/importer/wlc/test_wlc.py +++ b/opensiddur/tests/importer/wlc/test_wlc.py @@ -1,18 +1,26 @@ import unittest -from unittest.mock import patch, MagicMock, call +from unittest.mock import patch, call from pathlib import Path from opensiddur.importer.wlc.wlc import ( make_project_directory, get_source_directory, get_xslt_directory, - main + main, ) class TestWLCPathFunctions(unittest.TestCase): """Test path-related utility functions in wlc module""" + @patch('pathlib.Path.mkdir') + def test_make_project_directory_custom_path(self, mock_mkdir): + """make_project_directory respects an explicit project directory.""" + explicit = Path('/tmp/wlc-out') + result = make_project_directory(explicit) + self.assertEqual(result, explicit.resolve()) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + @patch('pathlib.Path.mkdir') def test_make_project_directory(self, mock_mkdir): """Test that make_project_directory creates the correct directory""" @@ -39,6 +47,11 @@ def test_get_source_directory(self): # Verify the path contains 'opensiddur-ai' (the project root) self.assertIn('opensiddur-ai', str(result)) + 
+ def test_get_source_directory_custom_sourcetexts_root(self): + custom = Path('/tmp/opensiddur-sourcetexts') + result = get_source_directory(custom) + self.assertEqual(result, custom / 'wlc') def test_get_xslt_directory(self): """Test that get_xslt_directory returns the wlc module directory""" @@ -65,12 +78,10 @@ class TestWLCMain(unittest.TestCase): @patch('opensiddur.importer.wlc.wlc.os.listdir') @patch('opensiddur.importer.wlc.wlc.xslt_transform') @patch('opensiddur.importer.wlc.wlc.get_xslt_directory') - @patch('opensiddur.importer.wlc.wlc.get_source_directory') @patch('opensiddur.importer.wlc.wlc.make_project_directory') def test_main_transforms_and_validates_all_files( - self, - mock_make_project_dir, - mock_get_source_dir, + self, + mock_make_project_directory, mock_get_xslt_dir, mock_xslt_transform, mock_listdir, @@ -81,10 +92,10 @@ def test_main_transforms_and_validates_all_files( # Set up mock paths mock_project_dir = Path('/mock/project/wlc') mock_source_dir = Path('/mock/sources/wlc') + mock_sourcetexts_root = Path('/mock/sources') mock_xslt_dir = Path('/mock/opensiddur/importer/wlc') - mock_make_project_dir.return_value = mock_project_dir - mock_get_source_dir.return_value = mock_source_dir + mock_make_project_directory.return_value = mock_project_dir mock_get_xslt_dir.return_value = mock_xslt_dir # Mock os.listdir to return different values for different calls @@ -111,17 +122,27 @@ def test_main_transforms_and_validates_all_files( # Mock validate to return success mock_validate.return_value = (True, []) - # Run main - result = main() + # Run main with explicit dirs (sourcetexts root contains wlc/Books) + result = main( + [ + "--project-dir", + str(mock_project_dir), + "--sourcetexts-root", + str(mock_sourcetexts_root), + ] + ) # Verify return value self.assertEqual(result, 0) - # Verify xslt_transform was called for index + mock_wlc_uri = mock_source_dir.resolve().as_uri() + if not mock_wlc_uri.endswith("/"): + mock_wlc_uri += "/" expected_index_call 
= call( mock_xslt_dir / "transform_index.xslt", mock_source_dir / "Books" / "TanachHeader.xml", - mock_project_dir / "index.xml" + mock_project_dir / "index.xml", + xslt_params={"wlc-root-uri": mock_wlc_uri}, ) self.assertIn(expected_index_call, mock_xslt_transform.call_args_list) @@ -130,18 +151,18 @@ def test_main_transforms_and_validates_all_files( call( mock_xslt_dir / "transform_book.xslt", mock_source_dir / "Books" / "Genesis.xml", - mock_project_dir / "genesis.xml" + mock_project_dir / "genesis.xml", ), call( mock_xslt_dir / "transform_book.xslt", mock_source_dir / "Books" / "Exodus.xml", - mock_project_dir / "exodus.xml" + mock_project_dir / "exodus.xml", ), call( mock_xslt_dir / "transform_book.xslt", mock_source_dir / "Books" / "Leviticus.xml", - mock_project_dir / "leviticus.xml" - ) + mock_project_dir / "leviticus.xml", + ), ] for expected_call in expected_book_calls: @@ -168,12 +189,10 @@ def test_main_transforms_and_validates_all_files( @patch('opensiddur.importer.wlc.wlc.os.listdir') @patch('opensiddur.importer.wlc.wlc.xslt_transform') @patch('opensiddur.importer.wlc.wlc.get_xslt_directory') - @patch('opensiddur.importer.wlc.wlc.get_source_directory') @patch('opensiddur.importer.wlc.wlc.make_project_directory') def test_main_handles_validation_errors( - self, - mock_make_project_dir, - mock_get_source_dir, + self, + mock_make_project_directory, mock_get_xslt_dir, mock_xslt_transform, mock_listdir, @@ -184,10 +203,10 @@ def test_main_handles_validation_errors( # Set up mock paths mock_project_dir = Path('/mock/project/wlc') mock_source_dir = Path('/mock/sources/wlc') + mock_sourcetexts_root = Path('/mock/sources') mock_xslt_dir = Path('/mock/opensiddur/importer/wlc') - mock_make_project_dir.return_value = mock_project_dir - mock_get_source_dir.return_value = mock_source_dir + mock_make_project_directory.return_value = mock_project_dir mock_get_xslt_dir.return_value = mock_xslt_dir # Mock os.listdir @@ -202,7 +221,14 @@ def 
test_main_handles_validation_errors( ] # Run main - should not raise an exception - result = main() + result = main( + [ + "--project-dir", + str(mock_project_dir), + "--sourcetexts-root", + str(mock_sourcetexts_root), + ] + ) # Should still return 0 even with validation errors self.assertEqual(result, 0) From 2d164f3a7a19fc5bcd33cdd7ed8a0dadea4552aa Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Sat, 23 May 2026 23:41:32 -0700 Subject: [PATCH 12/13] test coverage increase --- opensiddur/tests/exporter/test_pdf.py | 318 ++++++++++++++++ .../exporter/test_validate_urn_references.py | 350 +++++++++++++++++- 2 files changed, 648 insertions(+), 20 deletions(-) diff --git a/opensiddur/tests/exporter/test_pdf.py b/opensiddur/tests/exporter/test_pdf.py index fc9ed7e..fbc6201 100644 --- a/opensiddur/tests/exporter/test_pdf.py +++ b/opensiddur/tests/exporter/test_pdf.py @@ -7,6 +7,11 @@ from unittest.mock import MagicMock, Mock, patch from opensiddur.exporter.pdf.pdf import ( + _have_command, + _run_bibtex, + _run_latexmk, + _run_lualatex, + _run_manual_loop, compile_tex_to_pdf, export_to_pdf, generate_tex, @@ -336,5 +341,318 @@ def test_export_to_pdf_compile_failure(self): self.assertFalse(result) +class TestHaveCommand(unittest.TestCase): + def test_returns_true_when_command_on_path(self): + with patch("opensiddur.exporter.pdf.pdf.shutil.which", return_value="/usr/bin/lualatex"): + self.assertTrue(_have_command("lualatex")) + + def test_returns_false_when_command_missing(self): + with patch("opensiddur.exporter.pdf.pdf.shutil.which", return_value=None): + self.assertFalse(_have_command("missing-tool")) + + +class TestRunLatexmk(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "doc.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}X\end{document}") + self.output_dir = self.test_dir / "build" + 
self.output_dir.mkdir() + + def test_success(self): + result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=result) as mock_run: + self.assertTrue(_run_latexmk(self.tex_file, self.output_dir)) + + cmd = mock_run.call_args.args[0] + self.assertEqual(cmd[0], "latexmk") + self.assertIn("-lualatex", cmd) + + def test_failure(self): + result = MagicMock(returncode=1, stdout="stdout", stderr="stderr") + with patch("subprocess.run", return_value=result): + self.assertFalse(_run_latexmk(self.tex_file, self.output_dir)) + + +class TestRunLualatex(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "doc.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}X\end{document}") + self.output_dir = self.test_dir / "build" + self.output_dir.mkdir() + + def test_detects_rerun_markers_from_log(self): + def side_effect(cmd, **kwargs): + out_dir = Path(next(arg.split("=", 1)[1] for arg in cmd if arg.startswith("-output-directory="))) + (out_dir / "doc.log").write_text("Rerun to get cross-references right") + return MagicMock(returncode=0) + + with patch("subprocess.run", side_effect=side_effect): + success, output, needs_rerun = _run_lualatex(self.tex_file, self.output_dir) + + self.assertTrue(success) + self.assertTrue(needs_rerun) + self.assertIn("Rerun to get cross-references right", output) + + def test_handles_log_read_failure(self): + log_path = self.output_dir / "doc.log" + + def side_effect(cmd, **kwargs): + log_path.write_text("ok") + return MagicMock(returncode=0) + + with patch("subprocess.run", side_effect=side_effect): + with patch.object( + type(log_path), + "read_text", + side_effect=OSError("cannot read log"), + ): + success, output, needs_rerun = _run_lualatex(self.tex_file, self.output_dir) + + self.assertTrue(success) + self.assertEqual(output, "") + 
self.assertFalse(needs_rerun) + + def test_reports_lualatex_failure(self): + with patch("subprocess.run", return_value=MagicMock(returncode=1)): + success, output, needs_rerun = _run_lualatex(self.tex_file, self.output_dir) + + self.assertFalse(success) + self.assertFalse(needs_rerun) + + +class TestRunBibtex(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.output_dir = Path(self.temp_dir.name) + + def test_skips_when_aux_missing(self): + with patch("subprocess.run") as mock_run: + self.assertTrue(_run_bibtex("doc", self.output_dir)) + mock_run.assert_not_called() + + def test_skips_when_aux_has_no_bibliography(self): + (self.output_dir / "doc.aux").write_text("\\relax") + with patch("subprocess.run") as mock_run: + self.assertTrue(_run_bibtex("doc", self.output_dir)) + mock_run.assert_not_called() + + def test_runs_bibtex_when_aux_indicates_bibliography(self): + (self.output_dir / "doc.aux").write_text("\\bibdata{job}") + result = MagicMock(returncode=0, stdout="") + with patch("subprocess.run", return_value=result) as mock_run: + self.assertTrue(_run_bibtex("doc", self.output_dir)) + self.assertEqual(mock_run.call_args.args[0][0], "bibtex") + + def test_reports_bibtex_errors(self): + (self.output_dir / "doc.aux").write_text("\\citation{ref}") + result = MagicMock(returncode=0, stdout="error message in output") + with patch("subprocess.run", return_value=result): + self.assertFalse(_run_bibtex("doc", self.output_dir)) + + +class TestRunManualLoop(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "doc.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}X\end{document}") + self.output_dir = self.test_dir / "build" + self.output_dir.mkdir() + + def test_returns_false_when_first_pass_fails(self): + with patch( + 
"opensiddur.exporter.pdf.pdf._run_lualatex", + return_value=(False, "log output", False), + ): + self.assertFalse(_run_manual_loop(self.tex_file, self.output_dir, max_runs=3)) + + def test_returns_false_when_later_pass_fails(self): + with patch( + "opensiddur.exporter.pdf.pdf._run_lualatex", + side_effect=[ + (True, "", True), + (False, "later pass failed", False), + ], + ): + self.assertFalse(_run_manual_loop(self.tex_file, self.output_dir, max_runs=3)) + + def test_warns_when_max_runs_reached(self): + with patch( + "opensiddur.exporter.pdf.pdf._run_lualatex", + return_value=(True, "Rerun to get cross-references right", True), + ): + with patch("sys.stderr", new_callable=Mock) as mock_stderr: + self.assertTrue(_run_manual_loop(self.tex_file, self.output_dir, max_runs=2)) + + stderr_output = "".join(str(c) for c in mock_stderr.write.call_args_list) + self.assertIn("max_runs", stderr_output) + + +class TestCompileTexToPdfEdgeCases(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + self.tex_file = self.test_dir / "doc.tex" + self.tex_file.write_text(r"\documentclass{book}\begin{document}X\end{document}") + + def _tools_available(self, name): + return name in {"lualatex", "bibtex"} + + def test_uses_build_dir_when_provided(self): + build_dir = self.test_dir / "build" + output_pdf = self.test_dir / "out.pdf" + + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._tools_available, + ): + with patch( + "opensiddur.exporter.pdf.pdf._run_manual_loop", + return_value=True, + ) as mock_loop: + with patch( + "opensiddur.exporter.pdf.pdf.shutil.copy2", + ) as mock_copy: + build_dir.mkdir() + (build_dir / "doc.pdf").write_bytes(b"%PDF fake") + result = compile_tex_to_pdf( + self.tex_file, + output_pdf, + build_dir=build_dir, + ) + + self.assertTrue(result) + self.assertTrue(build_dir.exists()) + 
mock_loop.assert_called_once_with(self.tex_file, build_dir, 6) + mock_copy.assert_called_once() + + def test_fails_when_bibtex_missing(self): + def have(name): + return name == "lualatex" + + with patch("opensiddur.exporter.pdf.pdf._have_command", side_effect=have): + with patch("opensiddur.exporter.pdf.pdf._run_manual_loop") as mock_loop: + result = compile_tex_to_pdf(self.tex_file, self.test_dir / "out.pdf") + + self.assertFalse(result) + mock_loop.assert_not_called() + + def test_fails_when_manual_loop_fails(self): + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._tools_available, + ): + with patch( + "opensiddur.exporter.pdf.pdf._run_manual_loop", + return_value=False, + ): + result = compile_tex_to_pdf(self.tex_file, self.test_dir / "out.pdf") + + self.assertFalse(result) + + def test_skips_copy_when_output_pdf_is_build_artifact(self): + build_dir = self.test_dir / "build" + output_pdf = build_dir / "doc.pdf" + + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._tools_available, + ): + with patch( + "opensiddur.exporter.pdf.pdf._run_manual_loop", + return_value=True, + ): + with patch("opensiddur.exporter.pdf.pdf.shutil.copy2") as mock_copy: + build_dir.mkdir(parents=True) + output_pdf.write_bytes(b"%PDF fake") + result = compile_tex_to_pdf( + self.tex_file, + output_pdf, + build_dir=build_dir, + ) + + self.assertTrue(result) + mock_copy.assert_not_called() + + def test_handles_file_not_found_error(self): + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=self._tools_available, + ): + with patch( + "opensiddur.exporter.pdf.pdf._run_manual_loop", + return_value=True, + ): + with patch( + "opensiddur.exporter.pdf.pdf.shutil.copy2", + side_effect=FileNotFoundError("lualatex"), + ): + with patch("tempfile.TemporaryDirectory") as mock_temp: + temp_path = self.test_dir / "tempbuild" + temp_path.mkdir() + mock_temp.return_value.__enter__.return_value = str(temp_path) + (temp_path / 
"doc.pdf").write_bytes(b"%PDF fake") + result = compile_tex_to_pdf( + self.tex_file, + self.test_dir / "out.pdf", + ) + + self.assertFalse(result) + + def test_handles_unexpected_exception(self): + with patch( + "opensiddur.exporter.pdf.pdf._have_command", + side_effect=RuntimeError("unexpected"), + ): + result = compile_tex_to_pdf(self.tex_file, self.test_dir / "out.pdf") + + self.assertFalse(result) + + +class TestExportToPdfExtras(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.addCleanup(self.temp_dir.cleanup) + self.test_dir = Path(self.temp_dir.name) + + def test_forwards_build_dir_to_compile(self): + input_file = self.test_dir / "input.xml" + output_pdf = self.test_dir / "out.pdf" + build_dir = self.test_dir / "build" + input_file.write_text("") + + with patch("opensiddur.exporter.pdf.pdf.generate_tex", return_value=True): + with patch( + "opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", + return_value=True, + ) as mock_compile: + export_to_pdf(input_file, output_pdf, build_dir=build_dir) + + self.assertEqual(mock_compile.call_args.kwargs.get("build_dir"), build_dir) + + def test_prints_intermediate_tex_message(self): + input_file = self.test_dir / "input.xml" + output_pdf = self.test_dir / "out.pdf" + tex_output = self.test_dir / "intermediate.tex" + input_file.write_text("") + + with patch("opensiddur.exporter.pdf.pdf.generate_tex", return_value=True): + with patch("opensiddur.exporter.pdf.pdf.compile_tex_to_pdf", return_value=True): + with patch("sys.stderr", new_callable=Mock) as mock_stderr: + export_to_pdf(input_file, output_pdf, tex_output=tex_output) + + stderr_output = "".join(str(c) for c in mock_stderr.write.call_args_list) + self.assertIn("Intermediate TeX saved to", stderr_output) + + if __name__ == "__main__": unittest.main() diff --git a/opensiddur/tests/exporter/test_validate_urn_references.py b/opensiddur/tests/exporter/test_validate_urn_references.py index 28aa43c..a12fe4e 100644 --- 
a/opensiddur/tests/exporter/test_validate_urn_references.py +++ b/opensiddur/tests/exporter/test_validate_urn_references.py @@ -1,15 +1,43 @@ +import io +import subprocess +import sys import unittest from pathlib import Path from tempfile import TemporaryDirectory +from unittest.mock import patch from lxml import etree from opensiddur.exporter.refdb import ReferenceDatabase -from opensiddur.exporter.validate_urn_references import validate_project_urn_references +from opensiddur.exporter.validate_urn_references import ( + UnresolvableUrnReference, + _format_failure, + main, + validate_project_urn_references, +) TEI_NS = "http://www.tei-c.org/ns/1.0" -NSMAP = {"tei": TEI_NS} +JLPTEI_NS = "http://jewishliturgy.org/ns/jlptei/2" +NSMAP = {"tei": TEI_NS, "j": JLPTEI_NS} + + +def _write_project_xml(base: Path, project: str, filename: str, root: etree._Element) -> Path: + project_path = base / project + project_path.mkdir(parents=True, exist_ok=True) + xml_path = project_path / filename + etree.ElementTree(root).write(str(xml_path), encoding="utf-8", xml_declaration=True) + return xml_path + + +def _add_urn_mapping(db_path: Path, project: str, file_name: str, urn: str) -> None: + db = ReferenceDatabase(db_path) + try: + element = etree.Element(f"{{{TEI_NS}}}milestone") + element.set("corresp", urn) + db.add_urn_mapping(project, file_name, element) + finally: + db.close() class TestValidateUrnReferences(unittest.TestCase): @@ -17,7 +45,6 @@ def test_validates_ptr_and_ref_targets(self): with TemporaryDirectory() as td: base = Path(td) project = "proj1" - (base / project).mkdir(parents=True, exist_ok=True) xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) text = etree.SubElement(xml, f"{{{TEI_NS}}}text") @@ -25,22 +52,11 @@ def test_validates_ptr_and_ref_targets(self): etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="urn:x-opensiddur:test:doc1") etree.SubElement(body, f"{{{TEI_NS}}}ref", target="urn:x-opensiddur:test:doc2/1") - xml_path = base / project / "a.xml" - 
etree.ElementTree(xml).write(str(xml_path), encoding="utf-8", xml_declaration=True) + _write_project_xml(base, project, "a.xml", xml) db_path = base / "ref.db" - db = ReferenceDatabase(db_path) - try: - # Add URN mappings so resolver can resolve the references. - e1 = etree.Element(f"{{{TEI_NS}}}milestone") - e1.set("corresp", "urn:x-opensiddur:test:doc1") - db.add_urn_mapping(project, "a.xml", e1) - - e2 = etree.Element(f"{{{TEI_NS}}}milestone") - e2.set("corresp", "urn:x-opensiddur:test:doc2/1") - db.add_urn_mapping(project, "a.xml", e2) - finally: - db.close() + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1") + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc2/1") failures = validate_project_urn_references( project, @@ -54,15 +70,13 @@ def test_reports_unresolvable_urns(self): with TemporaryDirectory() as td: base = Path(td) project = "proj1" - (base / project).mkdir(parents=True, exist_ok=True) xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) text = etree.SubElement(xml, f"{{{TEI_NS}}}text") body = etree.SubElement(text, f"{{{TEI_NS}}}body") etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="urn:x-opensiddur:test:missing") - xml_path = base / project / "a.xml" - etree.ElementTree(xml).write(str(xml_path), encoding="utf-8", xml_declaration=True) + _write_project_xml(base, project, "a.xml", xml) db_path = base / "ref.db" ReferenceDatabase(db_path).close() @@ -77,3 +91,299 @@ def test_reports_unresolvable_urns(self): self.assertEqual(failures[0].file_name, "a.xml") self.assertEqual(failures[0].attribute_name, "target") + def test_raises_when_project_directory_missing(self): + with TemporaryDirectory() as td: + base = Path(td) + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + with self.assertRaises(ValueError) as ctx: + validate_project_urn_references( + "missing_project", + project_directory=base, + reference_db_path=db_path, + ) + self.assertIn("Project directory does not exist", 
str(ctx.exception)) + + def test_skips_non_urn_targets(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="http://example.com/doc") + etree.SubElement(body, f"{{{TEI_NS}}}ref", target="local/path.xml") + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(failures, []) + + def test_index_before_validate(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + div = etree.SubElement(xml, f"{{{TEI_NS}}}div") + div.set("corresp", "urn:x-opensiddur:test:doc1") + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + index_before_validate=True, + ) + self.assertEqual(failures, []) + + def test_validates_resolvable_transclude(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement( + body, + f"{{{JLPTEI_NS}}}transclude", + target="urn:x-opensiddur:test:doc1", + ) + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1") + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(failures, []) + + def 
test_validates_transclude_with_target_end_in_same_project(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement( + body, + f"{{{JLPTEI_NS}}}transclude", + target="urn:x-opensiddur:test:doc1/1", + targetEnd="urn:x-opensiddur:test:doc1/2", + ) + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1/1") + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1/2") + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(failures, []) + + def test_reports_unresolvable_transclude_target(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement( + body, + f"{{{JLPTEI_NS}}}transclude", + target="urn:x-opensiddur:test:missing", + ) + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(len(failures), 1) + self.assertEqual(failures[0].attribute_name, "target") + self.assertEqual(failures[0].urn, "urn:x-opensiddur:test:missing") + + def test_reports_unresolvable_transclude_target_end(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement( + body, + f"{{{JLPTEI_NS}}}transclude", + 
target="urn:x-opensiddur:test:doc1/1", + targetEnd="urn:x-opensiddur:test:doc1/missing", + ) + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1/1") + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(len(failures), 1) + self.assertEqual(failures[0].attribute_name, "targetEnd") + + def test_reports_transclude_target_end_in_wrong_project(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement( + body, + f"{{{JLPTEI_NS}}}transclude", + target="urn:x-opensiddur:test:doc1/1", + targetEnd="urn:x-opensiddur:test:doc1/2", + ) + + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + _add_urn_mapping(db_path, project, "a.xml", "urn:x-opensiddur:test:doc1/1") + _add_urn_mapping(db_path, "proj2", "b.xml", "urn:x-opensiddur:test:doc1/1") + _add_urn_mapping(db_path, "proj2", "b.xml", "urn:x-opensiddur:test:doc1/2") + + failures = validate_project_urn_references( + project, + project_directory=base, + reference_db_path=db_path, + ) + self.assertEqual(len(failures), 1) + self.assertEqual(failures[0].attribute_name, "targetEnd") + + def test_format_failure(self): + failure = UnresolvableUrnReference( + project="proj1", + file_name="a.xml", + element_path="/TEI/text/body/ptr[1]", + attribute_name="target", + urn="urn:x-opensiddur:test:missing", + ) + self.assertEqual( + _format_failure(failure), + "proj1/a.xml: /TEI/text/body/ptr[1] @target=urn:x-opensiddur:test:missing", + ) + + +class TestValidateUrnReferencesMain(unittest.TestCase): + def test_main_success(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = 
etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + div = etree.SubElement(xml, f"{{{TEI_NS}}}div") + div.set("corresp", "urn:x-opensiddur:test:doc1") + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + + result = main( + [ + project, + "--project-directory", + str(base), + "--reference-db", + str(db_path), + "--index", + ] + ) + self.assertEqual(result, 0) + + def test_main_reports_failures(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + + xml = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP) + text = etree.SubElement(xml, f"{{{TEI_NS}}}text") + body = etree.SubElement(text, f"{{{TEI_NS}}}body") + etree.SubElement(body, f"{{{TEI_NS}}}ptr", target="urn:x-opensiddur:test:missing") + _write_project_xml(base, project, "a.xml", xml) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + stdout = io.StringIO() + with patch("sys.stdout", stdout): + result = main( + [ + project, + "--project-directory", + str(base), + "--reference-db", + str(db_path), + ] + ) + + self.assertEqual(result, 2) + self.assertIn("urn:x-opensiddur:test:missing", stdout.getvalue()) + + def test_main_module_entry_point(self): + with TemporaryDirectory() as td: + base = Path(td) + project = "proj1" + (base / project).mkdir(parents=True) + + db_path = base / "ref.db" + ReferenceDatabase(db_path).close() + + result = subprocess.run( + [ + sys.executable, + "-m", + "opensiddur.exporter.validate_urn_references", + project, + "--project-directory", + str(base), + "--reference-db", + str(db_path), + ], + capture_output=True, + text=True, + check=False, + ) + self.assertEqual(result.returncode, 0) + + +if __name__ == "__main__": + unittest.main() From 2e4b73190b4d492d4593d35cc1a6b69e519654e4 Mon Sep 17 00:00:00 2001 From: Efraim Feinstein Date: Sat, 23 May 2026 23:46:34 -0700 Subject: [PATCH 13/13] ignore tanach download script --- .coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/.coveragerc b/.coveragerc index 
21d6e4e..ebeee6f 100644 --- a/.coveragerc +++ b/.coveragerc @@ -4,6 +4,7 @@ omit = opensiddur/tests/* */__pycache__/* */test_*.py + opensiddur/importer/wlc/download_tanach.py [report] precision = 2