native-test:
  # Runs the test suite natively on non-x86 / non-Linux runners.
  strategy:
    matrix:
      include:
        # `pytest-args` is passed verbatim to pytest: either `--ignore=...`
        # flags (run everything except ...) or an explicit list of test
        # paths (run only ...).
        - runs-on: macos-latest
          install-poppler: brew install poppler
          pytest-args: --ignore=tests/cv_models
        - runs-on: ubuntu-24.04-arm
          # Refresh the package index first; installing against a stale
          # index on a hosted runner can fail with 404s.
          install-poppler: sudo apt-get update && sudo apt-get install -y poppler-utils
          pytest-args: tests/external/pdfalto/url_test.py tests/external/pdfalto/parser_test.py tests/document tests/utils tests/config tests/lookup
  runs-on: ${{ matrix.runs-on }}
  steps:
    - name: Check out repository code
      uses: actions/checkout@v5

    - name: Install uv
      uses: astral-sh/setup-uv@v5
      with:
        python-version: '3.11'

    - name: Install poppler (required by pdf2image)
      run: ${{ matrix.install-poppler }}

    - name: Install dependencies
      run: uv sync --frozen --group dev --extra delft
      if: runner.os != 'Linux' || runner.arch != 'ARM64'

    - name: Install dependencies (Linux arm64 - skip delft extras with no arm64 wheels)
      run: |
        uv sync --frozen --group dev
        uv pip install "sciencebeam-trainer-delft>=0.0.36"
      if: runner.os == 'Linux' && runner.arch == 'ARM64'

    - name: Run pytest
      run: uv run python -m pytest -p no:cacheprovider ${{ matrix.pytest-args }}
PDFALTO_VERSION = 'v0.6.0'
_PDFALTO_BASE_URL = (
    f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
)

# Maps lower-cased ``platform.machine()`` values to the architecture label
# used in the zip name and the in-zip path of the pdfalto URL.
_PDFALTO_ARCH_BY_MACHINE = {
    'x86_64': '64',
    'amd64': '64',
    'arm64': 'arm64',
    'aarch64': 'arm64',
}


def get_default_pdfalto_url(
    sys_platform: Optional[str] = None,
    machine: Optional[str] = None
) -> str:
    """Return the default pdfalto download URL for a platform.

    The result has the form ``<zip url>!/<path inside zip>``.

    Args:
        sys_platform: Platform identifier in ``sys.platform`` format
            (e.g. ``'linux'``, ``'darwin'``). Defaults to ``sys.platform``.
        machine: Machine type in ``platform.machine()`` format
            (e.g. ``'x86_64'``, ``'aarch64'``), matched case-insensitively.
            Defaults to ``platform.machine()``.

    Raises:
        RuntimeError: If the platform is neither macOS nor Linux, or if the
            machine type is not a recognised x86-64 / arm64 name.
    """
    # Resolve defaults at call time (not in the signature) so that tests
    # patching ``sys.platform`` / ``platform.machine`` keep working.
    if sys_platform is None:
        sys_platform = sys.platform
    if machine is None:
        machine = platform.machine()

    if sys_platform == 'darwin':
        os_name = 'mac'
    elif sys_platform.startswith('linux'):
        os_name = 'linux'
    else:
        raise RuntimeError(f'Unsupported platform: {sys_platform!r}')

    # Fail early instead of silently selecting the x86-64 binary for an
    # architecture it cannot run on (e.g. armv7l, i686, riscv64).
    arch = _PDFALTO_ARCH_BY_MACHINE.get(machine.lower())
    if arch is None:
        raise RuntimeError(f'Unsupported machine architecture: {machine!r}')

    zip_name = f'pdfalto-bin-{os_name}-{arch}.zip'
    internal_path = f'pdfalto/{os_name}/{arch}/pdfalto'
    return f'{_PDFALTO_BASE_URL}/{zip_name}!/{internal_path}'
class TestGetDefaultPdfaltoUrl:
    """Tests for the platform-dependent default pdfalto download URL."""

    @staticmethod
    def _get_url(sys_platform: str, machine: str) -> str:
        """Call ``get_default_pdfalto_url`` with a simulated platform."""
        with patch('sys.platform', sys_platform), \
                patch('platform.machine', return_value=machine):
            return get_default_pdfalto_url()

    @staticmethod
    def _expected_url(os_name: str, arch: str) -> str:
        """Build the URL expected for the given OS and architecture labels."""
        return (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-{os_name}-{arch}.zip!/pdfalto/{os_name}/{arch}/pdfalto'
        )

    def test_linux_x86_64(self):
        url = self._get_url('linux', 'x86_64')
        assert url == self._expected_url('linux', '64')

    def test_linux_aarch64(self):
        url = self._get_url('linux', 'aarch64')
        assert url == self._expected_url('linux', 'arm64')

    def test_macos_x86_64(self):
        url = self._get_url('darwin', 'x86_64')
        assert url == self._expected_url('mac', '64')

    def test_macos_arm64(self):
        url = self._get_url('darwin', 'arm64')
        assert url == self._expected_url('mac', 'arm64')

    def test_machine_name_is_matched_case_insensitively(self):
        url = self._get_url('linux', 'AArch64')
        assert url == self._expected_url('linux', 'arm64')

    def test_unsupported_platform_raises_runtime_error(self):
        try:
            self._get_url('win32', 'AMD64')
        except RuntimeError as exc:
            # The message should name the offending platform.
            assert 'win32' in str(exc)
        else:
            raise AssertionError('expected RuntimeError for unsupported platform')