Skip to content
37 changes: 37 additions & 0 deletions .github/workflows/github-actions-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,43 @@ jobs:
path: ./dist-export/dist/


native-test:
  # Runs pytest via uv on each runner listed in the matrix below.
  strategy:
    matrix:
      include:
        - runs-on: macos-latest
          install-poppler: brew install poppler
          pytest-ignore-args: --ignore=tests/cv_models
        - runs-on: ubuntu-24.04-arm
          # Refresh the apt package index first: a stale index on the runner
          # image can make `apt-get install` fail with 404 errors.
          install-poppler: sudo apt-get update && sudo apt-get install -y poppler-utils
          # Despite the key name, this entry lists the test paths to run on
          # this runner; the value is passed to pytest unchanged.
          pytest-ignore-args: >-
            tests/external/pdfalto/url_test.py
            tests/external/pdfalto/parser_test.py
            tests/document
            tests/utils
            tests/config
            tests/lookup
  runs-on: ${{ matrix.runs-on }}
  steps:
    - name: Check out repository code
      uses: actions/checkout@v5

    - name: Install uv
      uses: astral-sh/setup-uv@v5
      with:
        python-version: '3.11'

    - name: Install poppler (required by pdf2image)
      run: ${{ matrix.install-poppler }}

    # Exactly one of the two dependency steps below runs, selected by the
    # runner's OS and architecture.
    - name: Install dependencies
      run: uv sync --frozen --group dev --extra delft
      if: runner.os != 'Linux' || runner.arch != 'ARM64'

    - name: Install dependencies (Linux arm64 - skip delft extras with no arm64 wheels)
      run: |
        uv sync --frozen --group dev
        uv pip install "sciencebeam-trainer-delft>=0.0.36"
      if: runner.os == 'Linux' && runner.arch == 'ARM64'

    - name: Run pytest
      run: uv run python -m pytest -p no:cacheprovider ${{ matrix.pytest-ignore-args }}


testpypi-publish:
if: github.ref == 'refs/heads/main'
needs: ["build-and-test"]
Expand Down
4 changes: 2 additions & 2 deletions sciencebeam_parser/app/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from sciencebeam_parser.app.context import AppContext
from sciencebeam_parser.config.config import AppConfig, get_download_dir
from sciencebeam_parser.external.pdfalto.wrapper import PdfAltoWrapper
from sciencebeam_parser.external.pdfalto.wrapper import PdfAltoWrapper, get_default_pdfalto_url
from sciencebeam_parser.external.pdfalto.parser import parse_alto_root
from sciencebeam_parser.external.wapiti.wrapper import LazyWapitiBinaryWrapper
from sciencebeam_parser.lookup.loader import load_lookup_from_config
Expand Down Expand Up @@ -177,7 +177,7 @@ def __init__(self, config: AppConfig):
self.pdfalto_wrapper = PdfAltoWrapper(
download_with_zip_path_support(
self.download_manager,
config['pdfalto']['path']
config['pdfalto'].get('path') or get_default_pdfalto_url()
)
)
self.pdfalto_wrapper.ensure_executable()
Expand Down
22 changes: 22 additions & 0 deletions sciencebeam_parser/external/pdfalto/wrapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import os
import platform
import stat
import sys
from typing import Optional

from sciencebeam_parser.utils.background_process import exec_with_logging
Expand All @@ -9,6 +11,26 @@
LOGGER = logging.getLogger(__name__)


# Release tag of the pdfalto binaries that the default download URL refers to.
PDFALTO_VERSION = 'v0.6.0'
# Common URL prefix of the GitHub release assets for PDFALTO_VERSION.
_PDFALTO_BASE_URL = (
    f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
)


def get_default_pdfalto_url() -> str:
    """
    Build the default pdfalto download URL for the current platform.

    The operating system is taken from ``sys.platform`` and the CPU
    architecture from ``platform.machine()`` (compared case-insensitively).

    Returns:
        The URL of the pdfalto release zip for ``PDFALTO_VERSION``, followed
        by ``!/`` and the path of the ``pdfalto`` binary inside that zip.

    Raises:
        RuntimeError: if ``sys.platform`` is neither ``darwin`` nor a
            ``linux*`` value.
    """
    machine = platform.machine().lower()
    if sys.platform == 'darwin':
        os_name = 'mac'
    elif sys.platform.startswith('linux'):
        os_name = 'linux'
    else:
        raise RuntimeError(f'Unsupported platform: {sys.platform!r}')
    if machine in ('arm64', 'aarch64'):
        arch = 'arm64'
    else:
        if machine not in ('x86_64', 'amd64'):
            # Keep the previous best-effort fallback, but make it visible:
            # the x86-64 binary is unlikely to run on an unrecognised machine
            # type, and the failure would otherwise only show up later.
            LOGGER.warning(
                'Unrecognised machine type %r, falling back to the x86-64 pdfalto binary',
                machine
            )
        arch = '64'
    zip_name = f'pdfalto-bin-{os_name}-{arch}.zip'
    internal_path = f'pdfalto/{os_name}/{arch}/pdfalto'
    return f'{_PDFALTO_BASE_URL}/{zip_name}!/{internal_path}'


class PdfAltoWrapper:
def __init__(self, binary_path: str):
self.binary_path = binary_path
Expand Down
5 changes: 4 additions & 1 deletion sciencebeam_parser/resources/default_config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ logging:
download_dir: '~/.cache/sciencebeam-parser/downloads'

pdfalto:
  # When `path` is null, the download URL is auto-detected for the current platform.
  # To override it, set an explicit URL, for example:
  # path: https://github.com/kermitt2/pdfalto/releases/download/v0.6.0/pdfalto-bin-linux-64.zip!/pdfalto/linux/64/pdfalto
  path: null
wapiti:
install_source: 'https://github.com/kermitt2/Wapiti/archive/a9c25d2bcccd60f1a54a7019689bd5229e866f00.tar.gz'
xslt:
Expand Down
44 changes: 44 additions & 0 deletions tests/external/pdfalto/url_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from unittest.mock import patch

from sciencebeam_parser.external.pdfalto.wrapper import (
PDFALTO_VERSION,
get_default_pdfalto_url
)


class TestGetDefaultPdfaltoUrl:
    """Tests for the platform-specific URL returned by ``get_default_pdfalto_url``.

    ``sys.platform`` and ``platform.machine`` are patched so that every
    combination can be checked regardless of the machine running the tests.
    """

    def test_linux_x86_64(self):
        with patch('sys.platform', 'linux'), \
                patch('platform.machine', return_value='x86_64'):
            url = get_default_pdfalto_url()
        assert url == (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-linux-64.zip!/pdfalto/linux/64/pdfalto'
        )

    def test_linux_aarch64(self):
        with patch('sys.platform', 'linux'), \
                patch('platform.machine', return_value='aarch64'):
            url = get_default_pdfalto_url()
        assert url == (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-linux-arm64.zip!/pdfalto/linux/arm64/pdfalto'
        )

    def test_macos_x86_64(self):
        with patch('sys.platform', 'darwin'), \
                patch('platform.machine', return_value='x86_64'):
            url = get_default_pdfalto_url()
        assert url == (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-mac-64.zip!/pdfalto/mac/64/pdfalto'
        )

    def test_macos_arm64(self):
        with patch('sys.platform', 'darwin'), \
                patch('platform.machine', return_value='arm64'):
            url = get_default_pdfalto_url()
        assert url == (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-mac-arm64.zip!/pdfalto/mac/arm64/pdfalto'
        )

    def test_machine_name_is_matched_case_insensitively(self):
        # The machine name is lower-cased before it is compared, so an
        # upper-case value must select the same arm64 binary.
        with patch('sys.platform', 'linux'), \
                patch('platform.machine', return_value='AARCH64'):
            url = get_default_pdfalto_url()
        assert url == (
            f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}'
            f'/pdfalto-bin-linux-arm64.zip!/pdfalto/linux/arm64/pdfalto'
        )

    def test_unsupported_platform_raises_runtime_error(self):
        with patch('sys.platform', 'win32'), \
                patch('platform.machine', return_value='AMD64'):
            try:
                get_default_pdfalto_url()
            except RuntimeError as exc:
                # The error message includes the offending platform value.
                assert 'win32' in str(exc)
            else:
                raise AssertionError('expected RuntimeError for an unsupported platform')
5 changes: 3 additions & 2 deletions tests/external/pdfalto/wrapper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from sciencebeam_parser.config.config import get_download_dir
from sciencebeam_parser.external.pdfalto.wrapper import (
PdfAltoWrapper
PdfAltoWrapper,
get_default_pdfalto_url
)
from sciencebeam_parser.utils.download import download_with_zip_path_support

Expand All @@ -22,7 +23,7 @@ def _pdfalto_wrapper(sciencebeam_parser_config: dict) -> PdfAltoWrapper:
pdfalto_wrapper = PdfAltoWrapper(
download_with_zip_path_support(
download_manager,
sciencebeam_parser_config['pdfalto']['path']
sciencebeam_parser_config['pdfalto'].get('path') or get_default_pdfalto_url()
)
)
pdfalto_wrapper.ensure_executable()
Expand Down
Loading