From d6b366c951d7c993a2bbd4b5325a1b5a5c31ebb8 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:02:24 +0100 Subject: [PATCH 01/16] fix macos build --- .github/workflows/ci.yml | 3 +- .../python/tools/prepare_build_environment.sh | 48 ------------------- .../tools/prepare_build_environment_linux.sh | 31 ++++++++++++ .../tools/prepare_build_environment_macos.sh | 37 ++++++++++++++ 4 files changed, 70 insertions(+), 49 deletions(-) delete mode 100755 bindings/python/tools/prepare_build_environment.sh create mode 100755 bindings/python/tools/prepare_build_environment_linux.sh create mode 100755 bindings/python/tools/prepare_build_environment_macos.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09027305..3426e3c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -113,7 +113,8 @@ jobs: output-dir: wheelhouse env: CIBW_ENVIRONMENT_WINDOWS: TOKENIZER_ROOT='${{ github.workspace }}\install' - CIBW_BEFORE_ALL: bindings/python/tools/prepare_build_environment.sh + CIBW_BEFORE_ALL_LINUX: bindings/python/tools/prepare_build_environment_linux.sh + CIBW_BEFORE_ALL_MACOS: bindings/python/tools/prepare_build_environment_macos.sh CIBW_BEFORE_ALL_WINDOWS: bash bindings/python/tools/prepare_build_environment_windows.sh CIBW_BEFORE_BUILD: pip install pybind11==2.10.1 CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 diff --git a/bindings/python/tools/prepare_build_environment.sh b/bindings/python/tools/prepare_build_environment.sh deleted file mode 100755 index 7ec955d3..00000000 --- a/bindings/python/tools/prepare_build_environment.sh +++ /dev/null @@ -1,48 +0,0 @@ -#! /bin/bash - -set -e -set -x - -ROOT_DIR=$PWD -ICU_ROOT=$ROOT_DIR/icu -CMAKE_EXTRA_ARGS="" - -if [ "$CIBW_ARCHS" == "arm64" ]; then - - # Download ICU ARM64 binaries from Homebrew. - brew fetch --force --bottle-tag=arm64_big_sur icu4c \ - | grep "Downloaded to" \ - | awk '{ print $3 }' \ - | xargs -I{} tar xf {} -C $ROOT_DIR - - mv icu4c/*.* $ICU_ROOT - - # Remove dynamic libraries to force static link. - rm $ICU_ROOT/lib/*.dylib - - CMAKE_EXTRA_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64" - -else - - # Download and compile ICU from sources. - ICU_VERSION=${ICU_VERSION:-73.2} - curl -L -O https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz - tar xf icu4c-*-src.tgz - cd icu/source - CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" ./configure --disable-shared --enable-static --prefix=$ICU_ROOT - make -j2 install - -fi - -cd $ROOT_DIR - -# Install cmake. -pip install "cmake==3.18.*" - -# Build Tokenizer. -rm -rf build -mkdir build -cd build -cmake -DLIB_ONLY=ON -DICU_ROOT=$ICU_ROOT $CMAKE_EXTRA_ARGS .. -VERBOSE=1 make -j2 install -cd $ROOT_DIR diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh new file mode 100755 index 00000000..2b5d465f --- /dev/null +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -0,0 +1,31 @@ +#! /bin/bash + +set -e +set -x + +ROOT_DIR="$PWD" +ICU_ROOT="$ROOT_DIR/icu" +CMAKE_EXTRA_ARGS="" + +# Download and compile ICU from sources. +ICU_VERSION="${ICU_VERSION:-73.2}" +curl -L -O "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz" +tar xf icu4c-*-src.tgz + +cd icu/source +CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" \ + ./configure --disable-shared --enable-static --prefix="$ICU_ROOT" +make -j2 install +cd "$ROOT_DIR" + +# Install cmake. +pip install "cmake==3.18.*" + +# Build Tokenizer. +rm -rf build +mkdir build +cd build +cmake -DLIB_ONLY=ON -DICU_ROOT="$ICU_ROOT" .. +VERBOSE=1 make -j2 install +cd "$ROOT_DIR" + diff --git a/bindings/python/tools/prepare_build_environment_macos.sh b/bindings/python/tools/prepare_build_environment_macos.sh new file mode 100755 index 00000000..a35d0281 --- /dev/null +++ b/bindings/python/tools/prepare_build_environment_macos.sh @@ -0,0 +1,37 @@ +#! /bin/bash + +set -e +set -x + +ROOT_DIR="$PWD" +ICU_ROOT="$ROOT_DIR/icu" +CMAKE_EXTRA_ARGS="" + +mkdir -p "$ICU_ROOT" + +# Install ICU via Homebrew +brew install icu4c +ICU_PREFIX="$(brew --prefix icu4c)" + +# Copy ICU into local prefix +rsync -a "$ICU_PREFIX/" "$ICU_ROOT/" + +# Remove dynamic libraries to force static linking +rm -f "$ICU_ROOT/lib/"*.dylib || true + +# Explicit Apple Silicon handling +if [[ "$(uname -m)" == "arm64" ]]; then + CMAKE_EXTRA_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64" +fi + +# Install cmake +pip install "cmake==3.18.*" + +# Build Tokenizer +rm -rf build +mkdir build +cd build +cmake -DLIB_ONLY=ON -DICU_ROOT="$ICU_ROOT" $CMAKE_EXTRA_ARGS .. +VERBOSE=1 make -j2 install +cd "$ROOT_DIR" + From cb89b8f6d9d31b91ab9905df593eacc82d5f4c48 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:08:09 +0100 Subject: [PATCH 02/16] unpin cmake --- bindings/python/tools/prepare_build_environment_linux.sh | 2 +- bindings/python/tools/prepare_build_environment_macos.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 2b5d465f..3c84ddc8 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -19,7 +19,7 @@ make -j2 install cd "$ROOT_DIR" # Install cmake. -pip install "cmake==3.18.*" +pip install cmake # Build Tokenizer. rm -rf build diff --git a/bindings/python/tools/prepare_build_environment_macos.sh b/bindings/python/tools/prepare_build_environment_macos.sh index a35d0281..b361ac57 100755 --- a/bindings/python/tools/prepare_build_environment_macos.sh +++ b/bindings/python/tools/prepare_build_environment_macos.sh @@ -25,7 +25,7 @@ if [[ "$(uname -m)" == "arm64" ]]; then fi # Install cmake -pip install "cmake==3.18.*" +pip install cmake # Build Tokenizer rm -rf build From a1d9246938617fba26955e8b5a05935cd2f948b7 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:18:09 +0100 Subject: [PATCH 03/16] new fix macos --- .github/workflows/ci.yml | 5 +++-- bindings/python/tools/prepare_build_environment_macos.sh | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3426e3c5..ff85221d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,10 +57,10 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.11" - name: Install dependencies run: | @@ -119,6 +119,7 @@ jobs: CIBW_BEFORE_BUILD: pip install pybind11==2.10.1 CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014 + CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" CIBW_TEST_COMMAND: pytest {project}/bindings/python/test/test.py CIBW_TEST_REQUIRES: pytest CIBW_ARCHS: ${{ matrix.arch }} diff --git a/bindings/python/tools/prepare_build_environment_macos.sh b/bindings/python/tools/prepare_build_environment_macos.sh index b361ac57..773783ef 100755 --- a/bindings/python/tools/prepare_build_environment_macos.sh +++ b/bindings/python/tools/prepare_build_environment_macos.sh @@ -31,7 +31,13 @@ pip install cmake rm -rf build mkdir build cd build -cmake -DLIB_ONLY=ON -DICU_ROOT="$ICU_ROOT" $CMAKE_EXTRA_ARGS .. +cmake \ + -DLIB_ONLY=ON \ + -DICU_ROOT="$ICU_ROOT" \ + -DCMAKE_INSTALL_PREFIX="$ROOT_DIR/build/install" \ + $CMAKE_EXTRA_ARGS \ + .. + VERBOSE=1 make -j2 install cd "$ROOT_DIR" From 2d42353f52b23561172be9191d3bf4e5210ce0ec Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:29:40 +0100 Subject: [PATCH 04/16] fix bindings --- .github/workflows/ci.yml | 4 ++-- bindings/python/tools/prepare_build_environment_linux.sh | 2 ++ bindings/python/tools/prepare_build_environment_macos.sh | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff85221d..7e0942f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,14 +112,14 @@ jobs: package-dir: bindings/python output-dir: wheelhouse env: - CIBW_ENVIRONMENT_WINDOWS: TOKENIZER_ROOT='${{ github.workspace }}\install' + CIBW_ENVIRONMENT: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install CIBW_BEFORE_ALL_LINUX: bindings/python/tools/prepare_build_environment_linux.sh CIBW_BEFORE_ALL_MACOS: bindings/python/tools/prepare_build_environment_macos.sh CIBW_BEFORE_ALL_WINDOWS: bash bindings/python/tools/prepare_build_environment_windows.sh CIBW_BEFORE_BUILD: pip install pybind11==2.10.1 CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014 - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" CIBW_TEST_COMMAND: pytest {project}/bindings/python/test/test.py CIBW_TEST_REQUIRES: pytest CIBW_ARCHS: ${{ matrix.arch }} diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 3c84ddc8..601a10cf 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -28,4 +28,6 @@ cd build cmake -DLIB_ONLY=ON -DICU_ROOT="$ICU_ROOT" .. VERBOSE=1 make -j2 install cd "$ROOT_DIR" +echo "TOKENIZER_ROOT=$ROOT_DIR/build/install" >> $GITHUB_ENV + diff --git a/bindings/python/tools/prepare_build_environment_macos.sh b/bindings/python/tools/prepare_build_environment_macos.sh index 773783ef..2317294c 100755 --- a/bindings/python/tools/prepare_build_environment_macos.sh +++ b/bindings/python/tools/prepare_build_environment_macos.sh @@ -40,4 +40,6 @@ cmake \ VERBOSE=1 make -j2 install cd "$ROOT_DIR" +echo "TOKENIZER_ROOT=$ROOT_DIR/build/install" >> $GITHUB_ENV + From 9125594696fe1f48c43b3857351720bd07ed0240 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:42:15 +0100 Subject: [PATCH 05/16] another try --- .github/workflows/ci.yml | 1 + .../tools/prepare_build_environment_linux.sh | 60 +++++++++++++------ .../tools/prepare_build_environment_macos.sh | 2 - 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e0942f1..9a317502 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,6 +125,7 @@ jobs: CIBW_ARCHS: ${{ matrix.arch }} CIBW_SKIP: pp* *-musllinux_* CIBW_TEST_SKIP: "*-macosx_arm64" + CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" - name: Upload Python wheels uses: actions/upload-artifact@v4 diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 601a10cf..258b8296 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -1,33 +1,55 @@ -#! /bin/bash - -set -e +#!/usr/bin/env bash +set -euo pipefail set -x ROOT_DIR="$PWD" -ICU_ROOT="$ROOT_DIR/icu" -CMAKE_EXTRA_ARGS="" +INSTALL_PREFIX="$ROOT_DIR/build/install" +ICU_ROOT="$INSTALL_PREFIX" + +mkdir -p "$INSTALL_PREFIX" + +# ----------------------------- +# Build ICU from source (static) +# ----------------------------- +ICU_VERSION=${ICU_VERSION:-73.2} -# Download and compile ICU from sources. -ICU_VERSION="${ICU_VERSION:-73.2}" -curl -L -O "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz" +curl -LO https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz tar xf icu4c-*-src.tgz -cd icu/source -CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" \ - ./configure --disable-shared --enable-static --prefix="$ICU_ROOT" -make -j2 install -cd "$ROOT_DIR" +pushd icu/source + +CFLAGS="-O3 -fPIC" \ +CXXFLAGS="-O3 -fPIC" \ +./configure \ + --disable-shared \ + --enable-static \ + --prefix="$ICU_ROOT" -# Install cmake. -pip install cmake +make -j$(nproc) +make install + +popd + +# ----------------------------- +# Build Tokenizer (C++) +# ----------------------------- +pip install "cmake<4" -# Build Tokenizer. rm -rf build mkdir build cd build -cmake -DLIB_ONLY=ON -DICU_ROOT="$ICU_ROOT" .. -VERBOSE=1 make -j2 install + +cmake \ + -DLIB_ONLY=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX" \ + -DICU_ROOT="$ICU_ROOT" \ + .. + +make -j$(nproc) +make install + cd "$ROOT_DIR" -echo "TOKENIZER_ROOT=$ROOT_DIR/build/install" >> $GITHUB_ENV +echo "Linux build complete" diff --git a/bindings/python/tools/prepare_build_environment_macos.sh b/bindings/python/tools/prepare_build_environment_macos.sh index 2317294c..773783ef 100755 --- a/bindings/python/tools/prepare_build_environment_macos.sh +++ b/bindings/python/tools/prepare_build_environment_macos.sh @@ -40,6 +40,4 @@ cmake \ VERBOSE=1 make -j2 install cd "$ROOT_DIR" -echo "TOKENIZER_ROOT=$ROOT_DIR/build/install" >> $GITHUB_ENV - From e20b8f8cfecd85787d15568b3f82d5e269b8a337 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 09:51:45 +0100 Subject: [PATCH 06/16] fix linux again --- .../tools/prepare_build_environment_linux.sh | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 258b8296..08ed3ffc 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -1,55 +1,43 @@ #!/usr/bin/env bash -set -euo pipefail -set -x +set -euxo pipefail ROOT_DIR="$PWD" -INSTALL_PREFIX="$ROOT_DIR/build/install" -ICU_ROOT="$INSTALL_PREFIX" +ICU_ROOT="$ROOT_DIR/icu" -mkdir -p "$INSTALL_PREFIX" +# manylinux compiler flags (required) +export CFLAGS="-O3 -fPIC" +export CXXFLAGS="-O3 -fPIC" -# ----------------------------- -# Build ICU from source (static) -# ----------------------------- +# Build ICU from source ICU_VERSION=${ICU_VERSION:-73.2} +ICU_TGZ="icu4c-${ICU_VERSION/./_}-src.tgz" -curl -LO https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz -tar xf icu4c-*-src.tgz +curl -LO "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/${ICU_TGZ}" +tar xf "$ICU_TGZ" pushd icu/source - -CFLAGS="-O3 -fPIC" \ -CXXFLAGS="-O3 -fPIC" \ ./configure \ --disable-shared \ --enable-static \ --prefix="$ICU_ROOT" - make -j$(nproc) make install - popd -# ----------------------------- -# Build Tokenizer (C++) -# ----------------------------- -pip install "cmake<4" - +# Build Tokenizer C++ library rm -rf build mkdir build -cd build +pushd build cmake \ -DLIB_ONLY=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX" \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DICU_ROOT="$ICU_ROOT" \ .. make -j$(nproc) -make install - -cd "$ROOT_DIR" +make install DESTDIR="$ROOT_DIR/build" -echo "Linux build complete" +popd From 903968b0009513654d1acdea8cd2a67a49e9c963 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:02:57 +0100 Subject: [PATCH 07/16] another try --- .../python/tools/prepare_build_environment_linux.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 08ed3ffc..7510d75a 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -3,6 +3,7 @@ set -euxo pipefail ROOT_DIR="$PWD" ICU_ROOT="$ROOT_DIR/icu" +INSTALL_DIR="${ROOT_DIR}/build/install" # manylinux compiler flags (required) export CFLAGS="-O3 -fPIC" @@ -11,10 +12,8 @@ export CXXFLAGS="-O3 -fPIC" # Build ICU from source ICU_VERSION=${ICU_VERSION:-73.2} ICU_TGZ="icu4c-${ICU_VERSION/./_}-src.tgz" - curl -LO "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/${ICU_TGZ}" tar xf "$ICU_TGZ" - pushd icu/source ./configure \ --disable-shared \ @@ -26,18 +25,15 @@ popd # Build Tokenizer C++ library rm -rf build -mkdir build +mkdir -p build pushd build - cmake \ -DLIB_ONLY=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ -DICU_ROOT="$ICU_ROOT" \ .. - make -j$(nproc) -make install DESTDIR="$ROOT_DIR/build" - +make install popd - From 981f7818ccf650718d6c248b2957e6657c536bbd Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:08:56 +0100 Subject: [PATCH 08/16] another --- .../python/tools/prepare_build_environment_linux.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 7510d75a..bd9928db 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -1,9 +1,10 @@ #!/usr/bin/env bash set -euxo pipefail -ROOT_DIR="$PWD" +# Use /project as the root since that's where cibuildwheel mounts the repo +ROOT_DIR="/project" ICU_ROOT="$ROOT_DIR/icu" -INSTALL_DIR="${ROOT_DIR}/build/install" +INSTALL_DIR="$ROOT_DIR/build/install" # manylinux compiler flags (required) export CFLAGS="-O3 -fPIC" @@ -24,16 +25,16 @@ make install popd # Build Tokenizer C++ library -rm -rf build -mkdir -p build -pushd build +rm -rf "$ROOT_DIR/build" +mkdir -p "$ROOT_DIR/build" +pushd "$ROOT_DIR/build" cmake \ -DLIB_ONLY=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ -DICU_ROOT="$ICU_ROOT" \ - .. + "$ROOT_DIR" make -j$(nproc) make install popd From 7d9dbfb11323c0c6f197fa6f55a370e01147dc4a Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:15:40 +0100 Subject: [PATCH 09/16] pain --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a317502..bc1371d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,7 +112,9 @@ jobs: package-dir: bindings/python output-dir: wheelhouse env: - CIBW_ENVIRONMENT: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install + CIBW_ENVIRONMENT_LINUX: TOKENIZER_ROOT=/project/build/install + CIBW_ENVIRONMENT_MACOS: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install + CIBW_ENVIRONMENT_WINDOWS: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install CIBW_BEFORE_ALL_LINUX: bindings/python/tools/prepare_build_environment_linux.sh CIBW_BEFORE_ALL_MACOS: bindings/python/tools/prepare_build_environment_macos.sh CIBW_BEFORE_ALL_WINDOWS: bash bindings/python/tools/prepare_build_environment_windows.sh From ebea96d3f7850def2dfe8d8649ebe804f91992f5 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:22:33 +0100 Subject: [PATCH 10/16] more --- bindings/python/tools/prepare_build_environment_linux.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index bd9928db..19b894c0 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -30,6 +30,7 @@ mkdir -p "$ROOT_DIR/build" pushd "$ROOT_DIR/build" cmake \ -DLIB_ONLY=ON \ + -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ From a17bcf9d5354ce69e9732a6665311bf7838b6e70 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:35:59 +0100 Subject: [PATCH 11/16] oops --- .github/workflows/ci.yml | 2 +- bindings/python/setup.py | 48 ++++++++++++++----- .../tools/prepare_build_environment_linux.sh | 2 +- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bc1371d2..5407fa8c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,7 +112,7 @@ jobs: package-dir: bindings/python output-dir: wheelhouse env: - CIBW_ENVIRONMENT_LINUX: TOKENIZER_ROOT=/project/build/install + CIBW_ENVIRONMENT_LINUX: TOKENIZER_ROOT=/project/build/install ICU_ROOT=/project/icu CIBW_ENVIRONMENT_MACOS: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install CIBW_ENVIRONMENT_WINDOWS: TOKENIZER_ROOT=${GITHUB_WORKSPACE}/build/install CIBW_BEFORE_ALL_LINUX: bindings/python/tools/prepare_build_environment_linux.sh diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 5c2b441a..bc72112c 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -1,20 +1,18 @@ import os import sys - import pybind11 - from setuptools import Extension, find_packages, setup include_dirs = [pybind11.get_include()] library_dirs = [] - +libraries = [] +extra_objects = [] def _get_long_description(): readme_path = "README.md" with open(readme_path, encoding="utf-8") as readme_file: return readme_file.read() - def _get_project_version(): base_dir = os.path.dirname(os.path.abspath(__file__)) version_path = os.path.join(base_dir, "pyonmttok", "version.py") @@ -23,25 +21,50 @@ def _get_project_version(): exec(fp.read(), version) return version["__version__"] - def _maybe_add_library_root(lib_name, header_only=False): root = os.environ.get("%s_ROOT" % lib_name) if root is None: - return + return None include_dirs.append(os.path.join(root, "include")) if not header_only: for lib_subdir in ("lib64", "lib"): lib_dir = os.path.join(root, lib_subdir) if os.path.isdir(lib_dir): library_dirs.append(lib_dir) - break + return lib_dir + return root +tokenizer_root = _maybe_add_library_root("TOKENIZER") +icu_root = os.environ.get("ICU_ROOT") -_maybe_add_library_root("TOKENIZER") +# Handle static linking on Linux (manylinux) +if sys.platform == "linux" and tokenizer_root: + # Link statically against OpenNMTTokenizer + tokenizer_lib = os.path.join(tokenizer_root, "lib", "libOpenNMTTokenizer.a") + if os.path.exists(tokenizer_lib): + extra_objects.append(tokenizer_lib) + else: + libraries.append("OpenNMTTokenizer") + + # Link statically against ICU libraries + if icu_root: + for lib_subdir in ("lib64", "lib"): + icu_lib_dir = os.path.join(icu_root, lib_subdir) + if os.path.isdir(icu_lib_dir): + # ICU libraries must be linked in the correct order + for icu_lib in ["icui18n", "icuuc", "icudata"]: + icu_lib_path = os.path.join(icu_lib_dir, f"lib{icu_lib}.a") + if os.path.exists(icu_lib_path): + extra_objects.append(icu_lib_path) + break +else: + # Dynamic linking for macOS and Windows + libraries.append("OpenNMTTokenizer") cflags = ["-std=c++17", "-fvisibility=hidden"] ldflags = [] package_data = {} + if sys.platform == "darwin": cflags.append("-mmacosx-version-min=10.14") ldflags.append("-Wl,-rpath,/usr/local/lib") @@ -56,7 +79,8 @@ def _maybe_add_library_root(lib_name, header_only=False): extra_link_args=ldflags, include_dirs=include_dirs, library_dirs=library_dirs, - libraries=["OpenNMTTokenizer"], + libraries=libraries, + extra_objects=extra_objects, ) setup( @@ -78,12 +102,10 @@ def _maybe_add_library_root(lib_name, header_only=False): "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Text Processing :: Linguistic", "Topic :: Software Development :: Libraries :: Python Modules", ], @@ -94,7 +116,7 @@ def _maybe_add_library_root(lib_name, header_only=False): keywords="tokenization opennmt unicode bpe sentencepiece subword", packages=find_packages(), package_data=package_data, - python_requires=">=3.6", + python_requires=">=3.9", setup_requires=["pytest-runner"], tests_require=["pytest"], ext_modules=[tokenizer_module], diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 19b894c0..393599f0 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -24,7 +24,7 @@ make -j$(nproc) make install popd -# Build Tokenizer C++ library +# Build Tokenizer C++ library as static rm -rf "$ROOT_DIR/build" mkdir -p "$ROOT_DIR/build" pushd "$ROOT_DIR/build" From cade716f77041ac3f1b3aa4d7c3968cbbfd5107c Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:41:30 +0100 Subject: [PATCH 12/16] black --- bindings/python/setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index bc72112c..9d839d89 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -8,11 +8,13 @@ libraries = [] extra_objects = [] + def _get_long_description(): readme_path = "README.md" with open(readme_path, encoding="utf-8") as readme_file: return readme_file.read() + def _get_project_version(): base_dir = os.path.dirname(os.path.abspath(__file__)) version_path = os.path.join(base_dir, "pyonmttok", "version.py") @@ -21,6 +23,7 @@ def _get_project_version(): exec(fp.read(), version) return version["__version__"] + def _maybe_add_library_root(lib_name, header_only=False): root = os.environ.get("%s_ROOT" % lib_name) if root is None: @@ -34,6 +37,7 @@ def _maybe_add_library_root(lib_name, header_only=False): return lib_dir return root + tokenizer_root = _maybe_add_library_root("TOKENIZER") icu_root = os.environ.get("ICU_ROOT") @@ -45,7 +49,7 @@ def _maybe_add_library_root(lib_name, header_only=False): extra_objects.append(tokenizer_lib) else: libraries.append("OpenNMTTokenizer") - + # Link statically against ICU libraries if icu_root: for lib_subdir in ("lib64", "lib"): From ecbbecc015d5d47ec3ffec08cac3aaedfc1042f9 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 10:48:13 +0100 Subject: [PATCH 13/16] . --- bindings/python/setup.py | 49 +++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 9d839d89..a2211f74 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -7,6 +7,7 @@ library_dirs = [] libraries = [] extra_objects = [] +extra_link_args = [] def _get_long_description(): @@ -42,25 +43,38 @@ def _maybe_add_library_root(lib_name, header_only=False): icu_root = os.environ.get("ICU_ROOT") # Handle static linking on Linux (manylinux) -if sys.platform == "linux" and tokenizer_root: +if sys.platform.startswith("linux") and tokenizer_root and icu_root: + print(f"Using static linking for Linux") + print(f"TOKENIZER_ROOT: {tokenizer_root}") + print(f"ICU_ROOT: {icu_root}") + # Link statically against OpenNMTTokenizer tokenizer_lib = os.path.join(tokenizer_root, "lib", "libOpenNMTTokenizer.a") if os.path.exists(tokenizer_lib): + print(f"Found tokenizer static lib: {tokenizer_lib}") extra_objects.append(tokenizer_lib) else: + print( + f"Tokenizer static lib not found at {tokenizer_lib}, using dynamic linking" + ) libraries.append("OpenNMTTokenizer") - # Link statically against ICU libraries - if icu_root: - for lib_subdir in ("lib64", "lib"): - icu_lib_dir = os.path.join(icu_root, lib_subdir) - if os.path.isdir(icu_lib_dir): - # ICU libraries must be linked in the correct order - for icu_lib in ["icui18n", "icuuc", "icudata"]: - icu_lib_path = os.path.join(icu_lib_dir, f"lib{icu_lib}.a") - if os.path.exists(icu_lib_path): - extra_objects.append(icu_lib_path) - break + # Link statically against ICU libraries in correct order + for lib_subdir in ("lib64", "lib"): + icu_lib_dir = os.path.join(icu_root, lib_subdir) + if os.path.isdir(icu_lib_dir): + # ICU libraries must be linked in this specific order + for icu_lib in ["icui18n", "icuuc", "icudata"]: + icu_lib_path = os.path.join(icu_lib_dir, f"lib{icu_lib}.a") + if os.path.exists(icu_lib_path): + print(f"Found ICU static lib: {icu_lib_path}") + extra_objects.append(icu_lib_path) + else: + print(f"WARNING: ICU lib not found: {icu_lib_path}") + break + + # Add necessary system libraries for static linking + libraries.extend(["stdc++", "m", "dl", "pthread"]) else: # Dynamic linking for macOS and Windows libraries.append("OpenNMTTokenizer") @@ -76,11 +90,20 @@ def _maybe_add_library_root(lib_name, header_only=False): cflags = ["/std:c++17", "/d2FH4-"] package_data["pyonmttok"] = ["*.dll"] +# Combine ldflags with extra_link_args +extra_link_args.extend(ldflags) + +print(f"include_dirs: {include_dirs}") +print(f"library_dirs: {library_dirs}") +print(f"libraries: {libraries}") +print(f"extra_objects: {extra_objects}") +print(f"extra_link_args: {extra_link_args}") + tokenizer_module = Extension( "pyonmttok._ext", sources=["pyonmttok/Python.cc"], extra_compile_args=cflags, - extra_link_args=ldflags, + extra_link_args=extra_link_args, include_dirs=include_dirs, library_dirs=library_dirs, libraries=libraries, From 9ab5de2188f6a6efc088e481c592b6f7cf743ed4 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 11:03:35 +0100 Subject: [PATCH 14/16] . --- bindings/python/setup.py | 66 +++---------------- .../tools/prepare_build_environment_linux.sh | 60 +++++++---------- 2 files changed, 32 insertions(+), 94 deletions(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index a2211f74..1888efc2 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -1,13 +1,12 @@ import os import sys + import pybind11 + from setuptools import Extension, find_packages, setup include_dirs = [pybind11.get_include()] library_dirs = [] -libraries = [] -extra_objects = [] -extra_link_args = [] def _get_long_description(): @@ -28,61 +27,21 @@ def _get_project_version(): def _maybe_add_library_root(lib_name, header_only=False): root = os.environ.get("%s_ROOT" % lib_name) if root is None: - return None + return include_dirs.append(os.path.join(root, "include")) if not header_only: for lib_subdir in ("lib64", "lib"): lib_dir = os.path.join(root, lib_subdir) if os.path.isdir(lib_dir): library_dirs.append(lib_dir) - return lib_dir - return root - - -tokenizer_root = _maybe_add_library_root("TOKENIZER") -icu_root = os.environ.get("ICU_ROOT") - -# Handle static linking on Linux (manylinux) -if sys.platform.startswith("linux") and tokenizer_root and icu_root: - print(f"Using static linking for Linux") - print(f"TOKENIZER_ROOT: {tokenizer_root}") - print(f"ICU_ROOT: {icu_root}") + break - # Link statically against OpenNMTTokenizer - tokenizer_lib = os.path.join(tokenizer_root, "lib", "libOpenNMTTokenizer.a") - if os.path.exists(tokenizer_lib): - print(f"Found tokenizer static lib: {tokenizer_lib}") - extra_objects.append(tokenizer_lib) - else: - print( - f"Tokenizer static lib not found at {tokenizer_lib}, using dynamic linking" - ) - libraries.append("OpenNMTTokenizer") - # Link statically against ICU libraries in correct order - for lib_subdir in ("lib64", "lib"): - icu_lib_dir = os.path.join(icu_root, lib_subdir) - if os.path.isdir(icu_lib_dir): - # ICU libraries must be linked in this specific order - for icu_lib in ["icui18n", "icuuc", "icudata"]: - icu_lib_path = os.path.join(icu_lib_dir, f"lib{icu_lib}.a") - if os.path.exists(icu_lib_path): - print(f"Found ICU static lib: {icu_lib_path}") - extra_objects.append(icu_lib_path) - else: - print(f"WARNING: ICU lib not found: {icu_lib_path}") - break - - # Add necessary system libraries for static linking - libraries.extend(["stdc++", "m", "dl", "pthread"]) -else: - # Dynamic linking for macOS and Windows - libraries.append("OpenNMTTokenizer") +_maybe_add_library_root("TOKENIZER") cflags = ["-std=c++17", "-fvisibility=hidden"] ldflags = [] package_data = {} - if sys.platform == "darwin": cflags.append("-mmacosx-version-min=10.14") ldflags.append("-Wl,-rpath,/usr/local/lib") @@ -90,24 +49,14 @@ def _maybe_add_library_root(lib_name, header_only=False): cflags = ["/std:c++17", "/d2FH4-"] package_data["pyonmttok"] = ["*.dll"] -# Combine ldflags with extra_link_args -extra_link_args.extend(ldflags) - -print(f"include_dirs: {include_dirs}") -print(f"library_dirs: {library_dirs}") -print(f"libraries: {libraries}") -print(f"extra_objects: {extra_objects}") -print(f"extra_link_args: {extra_link_args}") - tokenizer_module = Extension( "pyonmttok._ext", sources=["pyonmttok/Python.cc"], extra_compile_args=cflags, - extra_link_args=extra_link_args, + extra_link_args=ldflags, include_dirs=include_dirs, library_dirs=library_dirs, - libraries=libraries, - extra_objects=extra_objects, + libraries=["OpenNMTTokenizer"], ) setup( @@ -148,3 +97,4 @@ def _maybe_add_library_root(lib_name, header_only=False): tests_require=["pytest"], ext_modules=[tokenizer_module], ) + diff --git a/bindings/python/tools/prepare_build_environment_linux.sh b/bindings/python/tools/prepare_build_environment_linux.sh index 393599f0..6495c328 100755 --- a/bindings/python/tools/prepare_build_environment_linux.sh +++ b/bindings/python/tools/prepare_build_environment_linux.sh @@ -1,41 +1,29 @@ -#!/usr/bin/env bash -set -euxo pipefail +#! /bin/bash -# Use /project as the root since that's where cibuildwheel mounts the repo -ROOT_DIR="/project" -ICU_ROOT="$ROOT_DIR/icu" -INSTALL_DIR="$ROOT_DIR/build/install" +set -e +set -x -# manylinux compiler flags (required) -export CFLAGS="-O3 -fPIC" -export CXXFLAGS="-O3 -fPIC" +ROOT_DIR=$PWD +ICU_ROOT=$ROOT_DIR/icu +CMAKE_EXTRA_ARGS="" -# Build ICU from source +# Download and compile ICU from sources. ICU_VERSION=${ICU_VERSION:-73.2} -ICU_TGZ="icu4c-${ICU_VERSION/./_}-src.tgz" -curl -LO "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/${ICU_TGZ}" -tar xf "$ICU_TGZ" -pushd icu/source -./configure \ - --disable-shared \ - --enable-static \ - --prefix="$ICU_ROOT" -make -j$(nproc) -make install -popd +curl -L -O https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION/./-}/icu4c-${ICU_VERSION/./_}-src.tgz +tar xf icu4c-*-src.tgz +cd icu/source +CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" ./configure --disable-shared --enable-static --prefix=$ICU_ROOT +make -j2 install -# Build Tokenizer C++ library as static -rm -rf "$ROOT_DIR/build" -mkdir -p "$ROOT_DIR/build" -pushd "$ROOT_DIR/build" -cmake \ - -DLIB_ONLY=ON \ - -DBUILD_SHARED_LIBS=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DICU_ROOT="$ICU_ROOT" \ - "$ROOT_DIR" -make -j$(nproc) -make install -popd +cd $ROOT_DIR + +# Install cmake. +pip install cmake + +# Build Tokenizer. +rm -rf build +mkdir build +cd build +cmake -DLIB_ONLY=ON -DICU_ROOT=$ICU_ROOT $CMAKE_EXTRA_ARGS .. +VERBOSE=1 make -j2 install +cd $ROOT_DIR From 4fd659c9880b29d2ccb639aefd6ac7e1a5daa3bc Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 11:10:33 +0100 Subject: [PATCH 15/16] fix --- bindings/python/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 1888efc2..01f1d286 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -97,4 +97,3 @@ def _maybe_add_library_root(lib_name, header_only=False): tests_require=["pytest"], ext_modules=[tokenizer_module], ) - From fe486b89afe6966ad815b730a652ac75c822e246 Mon Sep 17 00:00:00 2001 From: vince62s Date: Tue, 30 Dec 2025 12:12:20 +0100 Subject: [PATCH 16/16] builds seem ok --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 9 +++++++++ bindings/python/pyonmttok/version.py | 2 +- bindings/python/setup.py | 3 +-- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5407fa8c..05e67938 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -121,7 +121,7 @@ jobs: CIBW_BEFORE_BUILD: pip install pybind11==2.10.1 CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014 - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-*" CIBW_TEST_COMMAND: pytest {project}/bindings/python/test/test.py CIBW_TEST_REQUIRES: pytest CIBW_ARCHS: ${{ matrix.arch }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 157e34fc..62aa903a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,15 @@ The project follows [semantic versioning 2.0.0](https://semver.org/). The API co ### Fixes and improvements +## [v1.38.0](https://github.com/OpenNMT/Tokenizer/releases/tag/v1.38.0) (2025-12-30) + +### Fixes and improvements + +* drop python 3.9 and under +* add python 3.12 + +### Fixes and improvements + ## [v1.37.1](https://github.com/OpenNMT/Tokenizer/releases/tag/v1.37.1) (2023-03-01) ### Fixes and improvements diff --git a/bindings/python/pyonmttok/version.py b/bindings/python/pyonmttok/version.py index 4769d763..46362d72 100644 --- a/bindings/python/pyonmttok/version.py +++ b/bindings/python/pyonmttok/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "1.37.1" +__version__ = "1.38.0" diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 01f1d286..178eda86 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -78,7 +78,6 @@ def _maybe_add_library_root(lib_name, header_only=False): "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -92,7 +91,7 @@ def _maybe_add_library_root(lib_name, header_only=False): keywords="tokenization opennmt unicode bpe sentencepiece subword", packages=find_packages(), package_data=package_data, - python_requires=">=3.9", + python_requires=">=3.10", setup_requires=["pytest-runner"], tests_require=["pytest"], ext_modules=[tokenizer_module],