From 8ce5c9ba9a86dc8a06de6349f28a4ab43af45af0 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 15:53:47 +0900
Subject: [PATCH 1/8] docs: tweak benchmark table units and rename conda env to
 'sfa'

- Move the time units (ms, s) out of the column headers and onto each
  cell in the Small networks and Large networks tables. The numbers
  now carry their own unit, so a partial copy of the table no longer
  loses the units; column headers are reserved for precision modes.
- Rename the heading "Benchmarks" -> "Performance benchmarks" so the
  section is unambiguous in the table of contents.
- Rename the conda environment shipped in environment-cuda.yml from
  "sfa-cu132" to "sfa". The env name was an arbitrary label; matching
  the project name makes the install snippets read more naturally
  (`conda activate sfa` instead of `conda activate sfa-cu132`). PyPI
  package names (`sfa-cu128`, `sfa-cu132`, `sfa-cu133`) are unrelated
  to the conda env name and remain unchanged.
- Update README, INSTALL.md, and doc/install.md to use the new env
  name everywhere it appears.
---
 INSTALL.md           | 14 +++++++-------
 README.md            | 34 +++++++++++++++++-----------------
 doc/install.md       |  2 +-
 environment-cuda.yml |  2 +-
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 3221340..79fba78 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -38,7 +38,7 @@ that is the maximum CUDA version your driver supports.
 | Package     | CUDA bundled | Minimum NVIDIA driver | When to pick                                              |
 |-------------|--------------|------------------------|-----------------------------------------------------------|
 | `sfa-cu133` | 13.3.x       | 580                    | Newest hardware / drivers; default for fresh installs.    |
-| `sfa-cu132` | 13.2.x       | 580                    | Matches the `sfa-cu132` conda env used for development.   |
+| `sfa-cu132` | 13.2.x       | 580                    | Matches the CUDA 13.2 conda env in `environment-cuda.yml`. |
 | `sfa-cu128` | 12.8.x       | 570                    | Older driver (CUDA 12 line); broadest backwards compat.   |
 
 Example (install the newest one):
@@ -81,15 +81,15 @@ the host compiler, and `conda` will not install it for you.
 git clone https://github.com/dwgoon/sfa.git && cd sfa
 
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .                 # builds the CUDA extension via the env's nvcc
 
 # CPU-only variant (skip CUDA even if nvcc is on PATH):
 SFA_BUILD_CUDA=0 pip install -e .
 ```
 
-This is also how the project maintainers build on Windows: the
-`sfa-cu132` env provides `nvcc` and cuBLAS, while system MSVC handles
+This is also how the project maintainers build on Windows: the `sfa`
+env provides `nvcc` and cuBLAS, while system MSVC handles
 `bindings.cpp`. The resulting extension is e.g.
 `sfa/_cuda/_native.cp312-win_amd64.pyd`.
 
@@ -180,7 +180,7 @@ and falls through to a CPU-only build (printing
 ### What `environment-cuda.yml` provides
 
 The shipped conda environment file creates a self-contained build
-environment named `sfa-cu132` that does **not** require any
+environment named `sfa` that does **not** require any
 system-wide CUDA install. Everything the build needs - the CUDA
 compiler, the CUDA runtime, cuBLAS headers and import libs, plus the
 Python build and runtime dependencies - is pulled in from the
@@ -199,14 +199,14 @@ Concretely, the file pins:
 
 The `cuda-toolkit` meta-package pulls in `nvcc`, `cudart`, `nvrtc`,
 `cccl`, `cupti`, the profiler API, and the rest of the CUDA dev
-toolchain. After `conda activate sfa-cu132`, `nvcc` is on `PATH` and
+toolchain. After `conda activate sfa`, `nvcc` is on `PATH` and
 `setup.py`'s CUDA-extension build picks it up automatically.
 
 Notes for adjusting the file:
 
 - To target a different CUDA major version, change the two `nvidia::`
   pins (`cuda-version` and `cuda-toolkit`) in lockstep. The env name
-  on the first line (`sfa-cu132`) is just a label; rename it freely.
+  on the first line (`sfa`) is just a label; rename it freely.
 - A host C++ compiler is still required (MSVC on Windows, GCC on
   Linux). The toolchain itself is not bundled by `cuda-toolkit`;
   conda will not install it for you.
diff --git a/README.md b/README.md
index 9d2a990..475da09 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ self-contained env):
 ```bash
 git clone https://github.com/dwgoon/sfa.git && cd sfa
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .
 ```
 
@@ -280,7 +280,7 @@ S_gpu = compute_influence(
 )
 ```
 
-## Benchmarks
+## Performance benchmarks
 
 ### Hardware setup
 
@@ -329,24 +329,24 @@ S_gpu = compute_influence(
 
 ### Small networks
 
-| # Nodes | # Edges  | CPU iter (FP64) ms | CPU LAPACK (FP64) ms | CUDA (FP64) ms        |
-|---------|----------|--------------------|----------------------|-----------------------|
-|    32   | 992      | 0.1 ± 0.0          | 0.2 ± 0.0 (0.4x)     | 1.3 ± 0.2 (0.06x)     |
-|    64   |  ~4.0 K  | 0.2 ± 0.0          | 0.2 ± 0.0 (0.8x)     | 1.4 ± 0.1 (0.13x)     |
-|   128   | ~16.3 K  | 2.5 ± 0.0          | 0.4 ± 0.0 (**7.2x**) | 1.9 ± 0.1 (1.3x)      |
-|   256   | ~65.3 K  | 6.9 ± 0.2          | 2.4 ± 0.1 (**2.8x**) | 3.1 ± 0.8 (2.2x)      |
-|   512   |  ~262 K  | 38.8 ± 1.7         | 190 ± 46 (0.2x)      | 6.4 ± 0.2 (**6.0x**)  |
-|  1024   | ~1.05 M  | 180 ± 8            | 486 ± 89 (0.4x)      | 47 ± 10 (**3.8x**)    |
-|  2048   | ~4.19 M  | 2140 ± 320         | 3880 ± 2990 (0.6x)   | 245 ± 2 (**8.7x**)    |
-|  4096   | ~16.8 M  | 12520 ± 2380       | 5690 ± 1390 (2.2x)   | 4320 ± 580 (**2.9x**) |
+| # Nodes | # Edges  | CPU iter (FP64)    | CPU LAPACK (FP64)         | CUDA (FP64)                 |
+|---------|----------|--------------------|---------------------------|-----------------------------|
+|    32   | 992      | 0.1 ± 0.0 ms       | 0.2 ± 0.0 ms (0.4x)       | 1.3 ± 0.2 ms (0.06x)        |
+|    64   |  ~4.0 K  | 0.2 ± 0.0 ms       | 0.2 ± 0.0 ms (0.8x)       | 1.4 ± 0.1 ms (0.13x)        |
+|   128   | ~16.3 K  | 2.5 ± 0.0 ms       | 0.4 ± 0.0 ms (**7.2x**)   | 1.9 ± 0.1 ms (1.3x)         |
+|   256   | ~65.3 K  | 6.9 ± 0.2 ms       | 2.4 ± 0.1 ms (**2.8x**)   | 3.1 ± 0.8 ms (2.2x)         |
+|   512   |  ~262 K  | 38.8 ± 1.7 ms      | 190 ± 46 ms (0.2x)        | 6.4 ± 0.2 ms (**6.0x**)     |
+|  1024   | ~1.05 M  | 180 ± 8 ms         | 486 ± 89 ms (0.4x)        | 47 ± 10 ms (**3.8x**)       |
+|  2048   | ~4.19 M  | 2140 ± 320 ms      | 3880 ± 2990 ms (0.6x)     | 245 ± 2 ms (**8.7x**)       |
+|  4096   | ~16.8 M  | 12520 ± 2380 ms    | 5690 ± 1390 ms (2.2x)     | 4320 ± 580 ms (**2.9x**)    |
 
 ### Large networks
 
-| # Nodes | # Edges | CPU LAPACK (FP64) s | CUDA TF32 (FP32) s   | CUDA FP32 (no TF32) s | CUDA FP16 s              |
-|---------|---------|---------------------|----------------------|-----------------------|--------------------------|
-|  5000   |  ~25 M  |  5.10 ± 2.24             | 0.366 ± 0.027 (14x)  | 0.356 ± 0.034 (14x)   | 0.349 ± 0.037 (**15x**)  |
-| 10000   | ~100 M  | 17.60 ± 0.57             | 1.55 ± 0.05 (11x)    | 4.07 ± 0.06 (4.3x)    | 1.13 ± 0.16 (**16x**)    |
-| 20000   | ~400 M  | 70.88 ± 0.79             | 9.13 ± 0.10 (7.8x)   | 16.30 ± 0.28 (4.3x)   | 4.28 ± 0.02 (**17x**)    |
+| # Nodes | # Edges | CPU LAPACK (FP64) | CUDA TF32 (FP32)         | CUDA FP32 (no TF32)        | CUDA FP16                  |
+|---------|---------|-------------------|--------------------------|----------------------------|----------------------------|
+|  5000   |  ~25 M  |  5.10 ± 2.24 s    | 0.366 ± 0.027 s (14x)    | 0.356 ± 0.034 s (14x)      | 0.349 ± 0.037 s (**15x**)  |
+| 10000   | ~100 M  | 17.60 ± 0.57 s    | 1.55 ± 0.05 s (11x)      | 4.07 ± 0.06 s (4.3x)       | 1.13 ± 0.16 s (**16x**)    |
+| 20000   | ~400 M  | 70.88 ± 0.79 s    | 9.13 ± 0.10 s (7.8x)     | 16.30 ± 0.28 s (4.3x)      | 4.28 ± 0.02 s (**17x**)    |
 
 - CPU paths show noticeably higher variance than GPU paths (CPU
   LAPACK FP64 stddev reaches ~25-77% of the mean at small `N`),
diff --git a/doc/install.md b/doc/install.md
index ad3f27c..98d5fcc 100644
--- a/doc/install.md
+++ b/doc/install.md
@@ -102,7 +102,7 @@ and the runtime Python deps:
 
 ```bash
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .
 ```
 
diff --git a/environment-cuda.yml b/environment-cuda.yml
index eec67e6..4525cb0 100644
--- a/environment-cuda.yml
+++ b/environment-cuda.yml
@@ -1,4 +1,4 @@
-name: sfa-cu132
+name: sfa
 channels:
   - nvidia
   - conda-forge

From 960152c57c7699167cfbae7f5eeb06b285747e11 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 16:35:38 +0900
Subject: [PATCH 2/8] CI(wheels): fix all five failure modes from the dry-run
 release build

Run 27055508425, a manual trigger of wheels.yml, surfaced one CPU and
four CUDA failure modes. This rewrites the workflow so the full
matrix is expected to pass:

CPU wheel - cibuildwheel rejected the pure-Python wheel
- Symptom: 'Build failed because a pure Python wheel was generated.'
- Fix: drop cibuildwheel for the CPU target and use `python -m build`
  to produce one universal sfa-<ver>-py3-none-any.whl. The 3-OS x 4-py
  matrix collapses to a single job; pure-Python wheels are not platform
  or interpreter specific.

CUDA wheels - Jimver/cuda-toolkit was too old for CUDA 13.x
- Symptom: 'Error: Version not available: 13.2.0 / 13.3.0' on every
  cu132 / cu133 cell.
- Fix: bump Jimver/cuda-toolkit v0.2.21 -> v0.2.35 (2026-03-29 release;
  default CUDA is 13.2 there).

CUDA wheels - sub-package names rejected by Ubuntu apt
- Symptom: 'Unable to locate package cuda-cublas-12-8 /
  cuda-cublas_dev-12-8 / cuda-nvrtc_dev-12-8'.
- Cause: Jimver prefixes every `sub-packages` entry with `cuda-`, but
  Ubuntu's CUDA apt repos ship cuBLAS and NVRTC as `libcublas-*` /
  `libnvrtc-*`. They must live under the separate `non-cuda-sub-packages`
  input, which is passed through verbatim.
- Fix:
    sub-packages: '["nvcc", "cudart", "cudart-dev"]'
    non-cuda-sub-packages: '["libcublas", "libcublas-dev",
                            "libnvrtc", "libnvrtc-dev"]'

CUDA wheels - cibuildwheel rejected CIBW_ENVIRONMENT
- Symptom: 'cibuildwheel: Malformed environment option ...'.
- Cause: cibuildwheel parses CIBW_ENVIRONMENT with bashlex. The
  unquoted semicolons inside SFA_CUDA_ARCH=sm_70;sm_75;... are
  interpreted as Bash statement terminators. Also, the original block
  unconditionally exported Linux-only CUDA_PATH=/usr/local/cuda and
  PATH=/usr/local/cuda/bin:$PATH, which broke the Windows runs since
  Jimver on Windows installs CUDA under
  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vXX.Y.
- Fix: quote every value that contains ';' or '<' (SFA_CUDA_ARCH,
  SFA_CUDA_RUNTIME_REQUIRES), and split the workflow env into
  CIBW_ENVIRONMENT_LINUX (with the bind-mount paths) and
  CIBW_ENVIRONMENT_WINDOWS (which inherits CUDA_PATH from the Jimver
  step automatically).

CUDA wheels - cuBLAS / NVRTC headers and libs were scattered
- Cause: the `network` method drops cuBLAS / NVRTC headers in
  /usr/include and shared libs in /usr/lib/x86_64-linux-gnu/, while
  setup.py only looks under $CUDA_HOME/{include,lib64}/. Inside the
  manylinux container the additional host dirs are not mounted, so
  the build would have failed at link time even after the previous
  fixes.
- Fix: add a Linux-only staging step that copies cublas*.h, nvrtc.h,
  libcublas*.so*, libnvrtc*.so* into /usr/local/cuda/{include,lib64}/
  before cibuildwheel runs. A single `-v /usr/local/cuda:/usr/local/cuda:ro`
  bind mount then exposes everything the build needs to the container.

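Publish job 'needs' updated to reference build_cpu_wheel (singular).
The `if: false` gate stays in place; PyPI upload is still off.

Also in this change: the sfa-cu128 arch list drops sm_100 and sm_120
(it now covers sm_70 through sm_90), and the sdist job installs its
build tooling with `python -m pip install --upgrade pip build`, the
same command the CPU wheel job uses.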
---
 .github/workflows/wheels.yml | 171 +++++++++++++++++++++--------------
 1 file changed, 101 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a23c066..cb0d725 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -11,69 +11,52 @@ on:
 # ---------------------------------------------------------------------------
 # Coverage matrix (what gets produced on a tag push)
 # ---------------------------------------------------------------------------
-#   CPU wheels  (package name: sfa)
-#     - Linux   manylinux2014 x86_64
-#     - Windows AMD64
-#     - macOS   x86_64        (Intel)
-#     - macOS   arm64         (Apple Silicon)
-#   CUDA wheels (package name: sfa-cu130)
+#   CPU wheel   (package name: sfa)
+#     - one universal py3-none-any.whl. CPU sfa has no native extension,
+#       so a single pure-Python wheel covers Linux / macOS / Windows and
+#       every supported Python version (3.10 - 3.13).
+#   CUDA wheels (package names: sfa-cu128, sfa-cu132, sfa-cu133)
 #     - Linux   manylinux2014 x86_64
 #     - Windows AMD64
+#     - Each wheel is built per Python version by cibuildwheel.
 #   sdist
 #     - one source distribution
 #
 # Not covered (and why):
-#   - Linux aarch64   : achievable via QEMU; add an `aarch64` include line
-#                       below to opt in. Slow on x86_64 runners.
-#   - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users will
-#                       fall back to the sdist build-from-source path.
-#   - Windows ARM64   : no first-party runner yet; pip will fall back to
-#                       the sdist.
 #   - macOS + CUDA    : Apple dropped NVIDIA support in 2019. Impossible.
+#   - macOS Intel     : macos-13 runner pool is exhausted; sfa CPU is pure
+#                       Python so Intel Mac users still install via the
+#                       universal wheel above.
+#   - Linux aarch64   : achievable via QEMU; out of scope for now.
+#   - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users fall
+#                       back to the sdist build-from-source path.
+#   - Windows ARM64   : no first-party runner yet; pip falls back to sdist.
 # ---------------------------------------------------------------------------
 
 jobs:
   # ---------------------------------------------------------------------------
-  # CPU wheels (cross-platform). Package name: `sfa`.
+  # CPU wheel (universal, pure Python). Package name: `sfa`.
+  # Single artifact: `sfa-<version>-py3-none-any.whl`.
   # ---------------------------------------------------------------------------
-  build_cpu_wheels:
-    name: cpu-${{ matrix.os }}-${{ matrix.cibw_archs }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { os: ubuntu-latest,  cibw_archs: x86_64 }
-          - { os: windows-latest, cibw_archs: AMD64 }
-          # macOS Apple Silicon only. Intel Mac (macos-13 x86_64) is dropped
-          # because GitHub's macos-13 runner pool is exhausted as Apple winds
-          # down macOS 13. CPU-only sfa is pure Python, so Intel Mac users
-          # still get a working install via sdist fallback.
-          - { os: macos-14,       cibw_archs: arm64 }
-          # Opt-in: Linux aarch64 via QEMU. Uncomment to publish for
-          # Raspberry Pi / AWS Graviton / etc. Slow (~20 min per Python).
-          # - { os: ubuntu-latest, cibw_archs: aarch64 }
+  build_cpu_wheel:
+    name: cpu-universal-wheel
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v5
-
-      # QEMU is only needed when cibw_archs is a non-native arch (aarch64).
-      - name: Set up QEMU for aarch64 builds
-        if: matrix.cibw_archs == 'aarch64'
-        uses: docker/setup-qemu-action@v3
+      - uses: actions/setup-python@v6
         with:
-          platforms: arm64
-
-      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21
+          python-version: '3.12'
+      - name: Build the universal CPU wheel
         env:
-          CIBW_ARCHS: ${{ matrix.cibw_archs }}
-          # Force CPU-only build on every cibuildwheel target.
-          CIBW_ENVIRONMENT: "SFA_BUILD_CUDA=0 SFA_PACKAGE_NAME=sfa"
-
+          SFA_BUILD_CUDA: "0"
+          SFA_PACKAGE_NAME: "sfa"
+        run: |
+          python -m pip install --upgrade pip build
+          python -m build --wheel
       - uses: actions/upload-artifact@v4
         with:
-          name: cpu-wheels-${{ matrix.os }}-${{ matrix.cibw_archs }}
-          path: ./wheelhouse/*.whl
+          name: cpu-wheel-universal
+          path: dist/*.whl
 
   # ---------------------------------------------------------------------------
   # Source distribution. One artifact across the whole matrix.
@@ -91,7 +74,7 @@ jobs:
           SFA_BUILD_CUDA: "0"
           SFA_PACKAGE_NAME: "sfa"
         run: |
-          pip install build
+          python -m pip install --upgrade pip build
           python -m build --sdist
       - uses: actions/upload-artifact@v4
         with:
@@ -99,8 +82,8 @@ jobs:
           path: dist/*.tar.gz
 
   # ---------------------------------------------------------------------------
-  # CUDA wheels (Linux + Windows). Three variants per OS, named after the
-  # CUDA major.minor they were built against:
+  # CUDA wheels (Linux + Windows). Three names per OS, by the CUDA
+  # major.minor they bundle:
   #
   #   sfa-cu128  -> CUDA 12.8.x  (NVIDIA driver >= 570)
   #   sfa-cu132  -> CUDA 13.2.x  (NVIDIA driver >= 580)
@@ -108,10 +91,25 @@ jobs:
   #
   # Each wheel declares a pinned dependency on the NVIDIA runtime PyPI
   # packages (nvidia-cublas-cuXX, nvidia-cuda-runtime-cuXX) for the same
-  # CUDA major, so `pip install sfa-cu13X` brings cuBLAS / cudart along
+  # CUDA major, so `pip install sfa-cu1XX` brings cuBLAS / cudart along
   # automatically. No system-wide CUDA toolkit install needed.
   #
-  # macOS is intentionally excluded - Apple dropped NVIDIA support in 2019.
+  # Implementation notes:
+  #
+  # - Jimver/cuda-toolkit@v0.2.35 supports CUDA 13.2 as the default
+  #   version. Older v0.2.21 only knew up to CUDA 13.1, which is why
+  #   the previous workflow run failed with "Version not available".
+  # - Ubuntu's CUDA 12.x/13.x apt repos ship cuBLAS / NVRTC under the
+  #   `libcublas-*` / `libnvrtc-*` package names, not `cuda-cublas-*`.
+  #   The Jimver action automatically prefixes `cuda-` to `sub-packages`,
+  #   so cuBLAS and NVRTC must go in `non-cuda-sub-packages` instead.
+  # - cibuildwheel parses CIBW_ENVIRONMENT with bashlex. Values containing
+  #   `;` or `<` (semicolons in SM lists, `<` in version specifiers) must
+  #   be wrapped in double quotes inside the value, otherwise bash treats
+  #   them as statement separators / redirection. CIBW_ENVIRONMENT_LINUX
+  #   and CIBW_ENVIRONMENT_WINDOWS keep Linux-only paths (CUDA_PATH,
+  #   PATH) out of the Windows runs, which use Windows-native NVIDIA
+  #   toolkit paths populated by the Jimver action.
   # ---------------------------------------------------------------------------
   build_cuda_wheels:
     name: cuda-${{ matrix.cuda.pkg }}-${{ matrix.os }}
@@ -123,7 +121,7 @@ jobs:
         cuda:
           - pkg: sfa-cu128
             toolkit: '12.8.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
+            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90'
             runtime_requires: >-
               nvidia-cublas-cu12>=12.8,<12.9
               nvidia-cuda-runtime-cu12>=12.8,<12.9
@@ -146,38 +144,71 @@ jobs:
       - uses: actions/checkout@v5
 
       - name: Install CUDA toolkit (host)
-        # Jimver/cuda-toolkit fetches the official NVIDIA installers and
-        # places nvcc + cudart-dev + libcublas-dev on the runner. On Linux
-        # the host install is then bind-mounted into the manylinux
-        # container via CIBW_CONTAINER_ENGINE below. Bump the action
-        # version if a newer CUDA release does not resolve here.
-        uses: Jimver/cuda-toolkit@v0.2.21
+        uses: Jimver/cuda-toolkit@v0.2.35
         with:
           cuda: ${{ matrix.cuda.toolkit }}
           method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
+          # nvcc + cudart get the `cuda-` apt prefix; cuBLAS and NVRTC
+          # ship as `libcublas*` / `libnvrtc*` and must be listed under
+          # non-cuda-sub-packages to avoid the wrong prefix on Ubuntu.
+          sub-packages: '["nvcc", "cudart", "cudart-dev"]'
+          non-cuda-sub-packages: '["libcublas", "libcublas-dev", "libnvrtc", "libnvrtc-dev"]'
+
+      - name: Stage cuBLAS / NVRTC under CUDA_HOME (Linux only)
+        # The network method puts cuBLAS / NVRTC headers in /usr/include
+        # and shared libs in /usr/lib/x86_64-linux-gnu/, but setup.py
+        # looks for them under $CUDA_HOME/{include,lib64}/. Consolidate
+        # so a single bind mount of /usr/local/cuda is enough for the
+        # manylinux container to see everything.
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |
+          set -euo pipefail
+          CUDA_HOME=/usr/local/cuda
+          sudo mkdir -p "$CUDA_HOME/include" "$CUDA_HOME/lib64"
+          for h in cublas.h cublas_v2.h cublas_api.h cublasLt.h nvrtc.h; do
+            if [ -f "/usr/include/$h" ]; then
+              sudo cp "/usr/include/$h" "$CUDA_HOME/include/"
+            fi
+          done
+          for pat in 'libcublas*.so*' 'libnvrtc*.so*'; do
+            sudo find /usr/lib/x86_64-linux-gnu/ -maxdepth 1 -name "$pat" \
+              -exec cp -a {} "$CUDA_HOME/lib64/" \;
+          done
+          echo "--- $CUDA_HOME/include ---"
+          ls -la "$CUDA_HOME/include/" | head
+          echo "--- $CUDA_HOME/lib64 ---"
+          ls -la "$CUDA_HOME/lib64/" | head
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.21
         env:
           CIBW_ARCHS: ${{ runner.os == 'Windows' && 'AMD64' || 'x86_64' }}
-          # Linux: cibuildwheel runs the build inside a manylinux Docker
-          # container; bind-mount the host CUDA install + nvcc symlink
-          # so the build inside the container can find them.
+          # Linux build runs inside a manylinux container; bind-mount
+          # the consolidated host CUDA tree (`/usr/local/cuda` symlink
+          # is resolved by Docker to its versioned target dir).
           CIBW_CONTAINER_ENGINE: >-
             docker; create_args: -v /usr/local/cuda:/usr/local/cuda:ro
-          # Build env passed through to setup.py.
-          CIBW_ENVIRONMENT: >-
+          # Linux-only env: bind-mounted CUDA path + PATH update.
+          CIBW_ENVIRONMENT_LINUX: >-
             SFA_BUILD_CUDA=1
             SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
-            SFA_CUDA_ARCH=${{ matrix.cuda.archs }}
+            SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
             SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
             CUDA_PATH=/usr/local/cuda
             PATH=/usr/local/cuda/bin:$PATH
+          # Windows: Jimver sets CUDA_PATH / CUDA_PATH_VXX_Y on the
+          # host already, and they propagate into the cibuildwheel
+          # subprocess automatically.
+          CIBW_ENVIRONMENT_WINDOWS: >-
+            SFA_BUILD_CUDA=1
+            SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
+            SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
+            SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
           # GitHub runners have no NVIDIA GPU, so the CUDA-gated pytest
           # tests skip via tests/_skip_helpers.py (device_count() == 0).
-          # The verification script verifies the native extension loads
-          # (which exercises the bundled NVIDIA runtime libs).
+          # The verification script verifies that the native extension
+          # loads (which exercises the bundled NVIDIA runtime libs).
           CIBW_TEST_REQUIRES: "pytest"
           CIBW_TEST_COMMAND: >-
             python {package}/tests/verification.py
@@ -190,13 +221,13 @@ jobs:
 
   # ---------------------------------------------------------------------------
   # Optional: publish to PyPI on tag pushes. Requires the OIDC trusted-
-  # publisher relationship to be configured at https://pypi.org for both
-  # the `sfa` and `sfa-cu130` projects. Disabled by default; change the
-  # `if:` guard to enable.
+  # publisher relationship to be configured at https://pypi.org for each
+  # of `sfa`, `sfa-cu128`, `sfa-cu132`, `sfa-cu133`. Disabled by default;
+  # change the `if:` guard to enable.
   # ---------------------------------------------------------------------------
   publish:
     name: publish-to-pypi
-    needs: [build_cpu_wheels, build_sdist, build_cuda_wheels]
+    needs: [build_cpu_wheel, build_sdist, build_cuda_wheels]
     runs-on: ubuntu-latest
     if: false  # set to `startsWith(github.ref, 'refs/tags/v')` to enable
     permissions:

From 6df05fb1e77b74f7ba555e47c4c58465baf7b01c Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 16:39:59 +0900
Subject: [PATCH 3/8] CI(wheels): split CUDA toolkit install per OS, drop wrong
 NVRTC apt name

Second wheels-build dry run (27056380944) made progress (CPU + sdist
now pass) but the 6 CUDA cells still failed at the CUDA toolkit install:

- Linux: 'Unable to locate package libnvrtc-12-8 / libnvrtc-dev-12-8'.
  Only cuBLAS lives in Ubuntu's `lib*` package family on the NVIDIA
  apt repo. NVRTC ships with the standard `cuda-` prefix
  (cuda-nvrtc-12-8, cuda-nvrtc-dev-12-8), so it belongs back in
  Jimver's `sub-packages` input, not in `non-cuda-sub-packages`.
- Windows: the NVIDIA Windows installer rejected `cudart-dev_12.8`
  (exit code 3772776473). Windows uses unprefixed names with
  underscores (cublas_dev, nvrtc_dev, ...) and does not split a
  separate cudart-dev sub-package - the headers ship inside cudart.

Fix:

- Move the toolkit install to two OS-conditional steps, each with the
  sub-package naming convention that matches its target installer.
- Linux: sub-packages now ["nvcc", "cudart", "cudart-dev", "nvrtc",
  "nvrtc-dev"] (all cuda- prefixed) and non-cuda-sub-packages reduced
  to just ["libcublas", "libcublas-dev"].
- Windows: sub-packages ["nvcc", "cudart", "cublas", "cublas_dev",
  "nvrtc", "nvrtc_dev"] - the working configuration before the
  cudart-dev typo was introduced.
- Drop NVRTC from the Linux staging step. With NVRTC pulled in via the
  cuda- prefix it lands in $CUDA_HOME/{include,lib64} directly; only
  cuBLAS (still installed as a lib* package) needs to be moved out of
  /usr/include and /usr/lib/x86_64-linux-gnu/ so the bind mount of
  /usr/local/cuda sees everything.

CPU wheel and sdist are unchanged.
---
 .github/workflows/wheels.yml | 60 +++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index cb0d725..1ce597a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -143,42 +143,60 @@ jobs:
     steps:
       - uses: actions/checkout@v5
 
-      - name: Install CUDA toolkit (host)
+      # Linux and Windows use different sub-package naming conventions:
+      #
+      # - Linux apt repos use a `cuda-` prefix for most CUDA components
+      #   (cuda-nvcc-12-8, cuda-cudart-dev-12-8, cuda-nvrtc-12-8, ...).
+      #   The exception is cuBLAS, which ships as a separate `lib*`
+      #   package family (libcublas-12-8, libcublas-dev-12-8) and must
+      #   therefore go through Jimver's `non-cuda-sub-packages` input.
+      # - Windows uses the NVIDIA Windows installer's sub-package names,
+      #   which are unprefixed and use underscores for the dev variants
+      #   (nvcc, cudart, cublas, cublas_dev, nvrtc, nvrtc_dev). There is
+      #   no separate cudart-dev on Windows; the cudart sub-package
+      #   already includes the headers.
+
+      - name: Install CUDA toolkit (Linux)
+        if: runner.os == 'Linux'
+        uses: Jimver/cuda-toolkit@v0.2.35
+        with:
+          cuda: ${{ matrix.cuda.toolkit }}
+          method: 'network'
+          sub-packages: '["nvcc", "cudart", "cudart-dev", "nvrtc", "nvrtc-dev"]'
+          non-cuda-sub-packages: '["libcublas", "libcublas-dev"]'
+
+      - name: Install CUDA toolkit (Windows)
+        if: runner.os == 'Windows'
         uses: Jimver/cuda-toolkit@v0.2.35
         with:
           cuda: ${{ matrix.cuda.toolkit }}
           method: 'network'
-          # nvcc + cudart get the `cuda-` apt prefix; cuBLAS and NVRTC
-          # ship as `libcublas*` / `libnvrtc*` and must be listed under
-          # non-cuda-sub-packages to avoid the wrong prefix on Ubuntu.
-          sub-packages: '["nvcc", "cudart", "cudart-dev"]'
-          non-cuda-sub-packages: '["libcublas", "libcublas-dev", "libnvrtc", "libnvrtc-dev"]'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
 
-      - name: Stage cuBLAS / NVRTC under CUDA_HOME (Linux only)
-        # The network method puts cuBLAS / NVRTC headers in /usr/include
-        # and shared libs in /usr/lib/x86_64-linux-gnu/, but setup.py
-        # looks for them under $CUDA_HOME/{include,lib64}/. Consolidate
-        # so a single bind mount of /usr/local/cuda is enough for the
-        # manylinux container to see everything.
+      - name: Stage cuBLAS under CUDA_HOME (Linux only)
+        # libcublas / libcublas-dev install headers to /usr/include and
+        # shared libs to /usr/lib/x86_64-linux-gnu/, while setup.py only
+        # looks under $CUDA_HOME/{include,lib64}/. NVRTC was pulled in
+        # with the `cuda-` prefix above and is already in CUDA_HOME, so
+        # only cuBLAS needs staging.
         if: runner.os == 'Linux'
         shell: bash
         run: |
           set -euo pipefail
           CUDA_HOME=/usr/local/cuda
           sudo mkdir -p "$CUDA_HOME/include" "$CUDA_HOME/lib64"
-          for h in cublas.h cublas_v2.h cublas_api.h cublasLt.h nvrtc.h; do
+          for h in cublas.h cublas_v2.h cublas_api.h cublasLt.h; do
             if [ -f "/usr/include/$h" ]; then
               sudo cp "/usr/include/$h" "$CUDA_HOME/include/"
             fi
           done
-          for pat in 'libcublas*.so*' 'libnvrtc*.so*'; do
-            sudo find /usr/lib/x86_64-linux-gnu/ -maxdepth 1 -name "$pat" \
-              -exec cp -a {} "$CUDA_HOME/lib64/" \;
-          done
-          echo "--- $CUDA_HOME/include ---"
-          ls -la "$CUDA_HOME/include/" | head
-          echo "--- $CUDA_HOME/lib64 ---"
-          ls -la "$CUDA_HOME/lib64/" | head
+          sudo find /usr/lib/x86_64-linux-gnu/ -maxdepth 1 \
+            -name 'libcublas*.so*' \
+            -exec cp -a {} "$CUDA_HOME/lib64/" \;
+          echo "--- $CUDA_HOME/include (cublas only) ---"
+          ls -la "$CUDA_HOME/include/" | grep -E 'cublas|nvrtc' || true
+          echo "--- $CUDA_HOME/lib64 (cublas only) ---"
+          ls -la "$CUDA_HOME/lib64/" | grep -E 'cublas|nvrtc' || true
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.21

From 219ff48e46607b4084b4ef6caadc192facf2ce9b Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 16:49:32 +0900
Subject: [PATCH 4/8] CI(wheels): add MSVC env on Windows, exclude CUDA libs
 from auditwheel, drop cu133

Third dry run (27056467626) made it past CUDA install on Linux for
cu128 / cu132 (good) but surfaced three new blocking issues. This
commit fixes each one so the matrix can come up green:

Issue 1 - cu133 cells fail at install with 'Version not available: 13.3.0'
- Jimver/cuda-toolkit v0.2.35 does not have CUDA 13.3 in its version
  table yet. Drop the sfa-cu133 row from the matrix until a newer
  Jimver release supports it; re-add at that point. Update the docs
  to match: README.md, INSTALL.md, and doc/install.md (including the
  SFA_PACKAGE_NAME example, the conda-env note about which CUDA
  majors CI tests, and the `pip install` snippet) now reference
  cu132 instead of cu133.

Issue 2 - Linux Build wheels fails inside auditwheel
- Error: 'Cannot repair wheel, because required library "libcudart.so.12"
  could not be located'.
- auditwheel was trying to vendor the NVIDIA runtime shared libs into
  the wheel. We don't want that - the wheel declares pinned PyPI
  dependencies on nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX /
  nvidia-cuda-nvrtc-cuXX through SFA_CUDA_RUNTIME_REQUIRES, so the
  libs arrive via pip at install time.
- Fix: override CIBW_REPAIR_WHEEL_COMMAND_LINUX to pass --exclude for
  libcudart, libcublas, libcublasLt, libnvrtc, and libnvrtc-builtins
  in both soname.12 and soname.13 forms (covers cu128 and cu132).

Issue 3 - Windows Build wheels fails with 'nvcc fatal : Cannot find
compiler cl.exe in PATH'
- cibuildwheel spawns the build in a subprocess that does not inherit
  the Developer Command Prompt environment, so cl.exe is not visible
  to nvcc even though MSVC is installed on the runner.
- Fix: insert ilammy/msvc-dev-cmd@v1 step (Windows only) after the
  Jimver toolkit step; it exports VCINSTALLDIR / PATH and friends so
  any subsequent process can find cl.exe.

CPU wheel, sdist, CUDA install on Linux (cu128/cu132), and CUDA install
on Windows (cu128/cu132) are unchanged.
---
 .github/workflows/wheels.yml | 37 +++++++++++++++++++++++++++++-------
 INSTALL.md                   | 12 +++++-------
 README.md                    |  3 +--
 doc/install.md               | 10 ++++------
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 1ce597a..821d36f 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -133,13 +133,11 @@ jobs:
               nvidia-cublas-cu13>=13.2,<13.3
               nvidia-cuda-runtime-cu13>=13.2,<13.3
               nvidia-cuda-nvrtc-cu13>=13.2,<13.3
-          - pkg: sfa-cu133
-            toolkit: '13.3.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
-            runtime_requires: >-
-              nvidia-cublas-cu13>=13.3,<13.4
-              nvidia-cuda-runtime-cu13>=13.3,<13.4
-              nvidia-cuda-nvrtc-cu13>=13.3,<13.4
+          # sfa-cu133 (CUDA 13.3) is temporarily disabled: Jimver/cuda-toolkit
+          # v0.2.35 does not yet have a 13.3 entry in its version table, so
+          # the install step reports 'Version not available: 13.3.0'. Re-add
+          # this matrix row when a newer Jimver release ships with 13.3
+          # support.
     steps:
       - uses: actions/checkout@v5
 
@@ -173,6 +171,18 @@ jobs:
           method: 'network'
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
 
+      - name: Set up MSVC environment (Windows)
+        # cibuildwheel spawns the build in a subprocess that does NOT
+        # inherit a Developer Command Prompt, so cl.exe is not on PATH
+        # and nvcc fails with 'Cannot find compiler cl.exe in PATH'.
+        # This action sets the MSVC env vars (VCINSTALLDIR, PATH, ...)
+        # for subsequent steps on the runner so the cibuildwheel
+        # subprocess can find cl.exe.
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+
       - name: Stage cuBLAS under CUDA_HOME (Linux only)
         # libcublas / libcublas-dev install headers to /usr/include and
         # shared libs to /usr/lib/x86_64-linux-gnu/, while setup.py only
@@ -227,6 +237,19 @@ jobs:
           # tests skip via tests/_skip_helpers.py (device_count() == 0).
           # The verification script verifies that the native extension
           # loads (which exercises the bundled NVIDIA runtime libs).
+          # Linux only: tell auditwheel NOT to bundle the NVIDIA runtime
+          # shared libs into the wheel. The wheel declares pinned PyPI
+          # dependencies on nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX /
+          # nvidia-cuda-nvrtc-cuXX through SFA_CUDA_RUNTIME_REQUIRES, so
+          # the runtime libs arrive via pip at install time. The excludes
+          # cover both major sonames (.so.12 for cu128, .so.13 for cu132).
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
+            auditwheel repair -w {dest_dir} {wheel}
+            --exclude libcudart.so.12 --exclude libcudart.so.13
+            --exclude libcublas.so.12 --exclude libcublas.so.13
+            --exclude libcublasLt.so.12 --exclude libcublasLt.so.13
+            --exclude libnvrtc.so.12 --exclude libnvrtc.so.13
+            --exclude libnvrtc-builtins.so.12 --exclude libnvrtc-builtins.so.13
           CIBW_TEST_REQUIRES: "pytest"
           CIBW_TEST_COMMAND: >-
             python {package}/tests/verification.py
diff --git a/INSTALL.md b/INSTALL.md
index 79fba78..0c0d9b5 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -10,7 +10,6 @@ NVIDIA driver new enough for that CUDA version.
 | `sfa`        | none    | -                  | Linux, macOS, Windows  |
 | `sfa-cu128`  | 12.8.x  | 570 (Linux / Win)  | Linux, Windows         |
 | `sfa-cu132`  | 13.2.x  | 580                | Linux, Windows         |
-| `sfa-cu133`  | 13.3.x  | 580                | Linux, Windows         |
 
 All CUDA wheels share the same AOT-compiled SASS matrix (SM 7.0
 through SM 12.0: Volta, Turing, Ampere, Ada, Hopper, Blackwell), plus
@@ -37,14 +36,13 @@ that is the maximum CUDA version your driver supports.
 
 | Package     | CUDA bundled | Minimum NVIDIA driver | When to pick                                              |
 |-------------|--------------|------------------------|-----------------------------------------------------------|
-| `sfa-cu133` | 13.3.x       | 580                    | Newest hardware / drivers; default for fresh installs.    |
-| `sfa-cu132` | 13.2.x       | 580                    | Matches the CUDA 13.2 conda env in `environment-cuda.yml`. |
+| `sfa-cu132` | 13.2.x       | 580                    | Newest CUDA stack; matches `environment-cuda.yml`.        |
 | `sfa-cu128` | 12.8.x       | 570                    | Older driver (CUDA 12 line); broadest backwards compat.   |
 
 Example (install the newest one):
 
 ```bash
-pip install sfa-cu133
+pip install sfa-cu132
 ```
 
 Requires Python 3.10+. macOS is not supported because Apple ended
@@ -98,8 +96,8 @@ is what the maintainers test against. The same workflow works for any
 CUDA major / minor that has a `cuda-toolkit` build on the `nvidia`
 channel: edit the two `cuda-version` / `cuda-toolkit` pins in lockstep
 (see [What `environment-cuda.yml` provides](#what-environment-cudayml-provides)
-below) and rename the env on the first line of the file. CUDA 12.8 and
-13.3 environments have been tested in CI.
+below) and rename the env on the first line of the file. CUDA 12.8
+and 13.2 environments have been tested in CI.
 
 ### Option B: conda-free build (system CUDA + system C++ compiler)
 
@@ -220,7 +218,7 @@ Notes for adjusting the file:
 |----------------------|------------------------------------------------------------------------|
 | `SFA_BUILD_CUDA`     | `0` to force a pure-Python install. Default: build if `nvcc` is found. |
 | `SFA_CUDA_ARCH`      | Semicolon-separated SM list, e.g. `sm_89` (dev) or `sm_70;sm_80;sm_89`. Default: the full wheel-wide AOT matrix. |
-| `SFA_PACKAGE_NAME`   | Override the PyPI name (used by CI to produce e.g. `sfa-cu132` or `sfa-cu133` from the same source tree). |
+| `SFA_PACKAGE_NAME`   | Override the PyPI name (used by CI to produce e.g. `sfa-cu128` or `sfa-cu132` from the same source tree). |
 
 ## Verify the install
 
diff --git a/README.md b/README.md
index 475da09..ab91746 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,6 @@ set of CUDA optimized `sfa-cuXYZ` versions:
 | `sfa`         | none   | -                   | Linux, macOS, Windows  |
 | `sfa-cu128`   | 12.8.x | 570 (Linux / Win)   | Linux, Windows         |
 | `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows         |
-| `sfa-cu133`   | 13.3.x | 580                 | Linux, Windows         |
 
 Each CUDA wheel ships ahead-of-time compiled SASS for NVIDIA SM 7.0
 through SM 12.0 (Volta, Turing, Ampere, Ada, Hopper, Blackwell) plus a
@@ -67,7 +66,7 @@ supports.
 Example (install the newest one):
 
 ```bash
-pip install sfa-cu133
+pip install sfa-cu132
 ```
 
 > [!IMPORTANT]
diff --git a/doc/install.md b/doc/install.md
index 98d5fcc..4e95166 100644
--- a/doc/install.md
+++ b/doc/install.md
@@ -9,8 +9,7 @@ one** into a given environment.
 |---------------|--------|---------------------|-----------------------------|
 | `sfa`         | none   | -                   | Linux, macOS, Windows       |
 | `sfa-cu128`   | 12.8.x | 570 (Linux / Win)   | Linux, Windows              |
-| `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows              |
-| `sfa-cu133`   | 13.3.x | 580                 | Linux, Windows (newest)     |
+| `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows (newest)     |
 
 ## Requirements
 
@@ -33,10 +32,9 @@ Run `nvidia-smi` and look at the "CUDA Version" column. That is the
 number:
 
 ```text
-nvidia-smi -> "CUDA Version: 13.3"  -> any of sfa-cu128 / cu132 / cu133
-nvidia-smi -> "CUDA Version: 13.0"  -> sfa-cu128
-nvidia-smi -> "CUDA Version: 12.8"  -> sfa-cu128
-nvidia-smi -> "CUDA Version: 12.6"  -> upgrade your driver or use `sfa` (CPU)
+nvidia-smi -> "CUDA Version: 13.2" or higher  -> sfa-cu132 or sfa-cu128
+nvidia-smi -> "CUDA Version: 12.8" - 13.1     -> sfa-cu128
+nvidia-smi -> "CUDA Version: 12.6"             -> upgrade your driver or use `sfa` (CPU)
 ```
 
 When in doubt, start with `sfa-cu128` for the widest driver coverage

From 961edb15df06b49e432cd7fad80721ac0cee3280 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 17:09:23 +0900
Subject: [PATCH 5/8] CI(wheels): drop sm_70 from cu132 archs, bump manylinux
 image to 2_28

Fourth dry run (27056657014) graduated cu128-windows to a full pass
(11 minutes including the test phase). Three other CUDA cells failed
with two new error classes:

Issue 1 - 'nvcc fatal: Unsupported gpu architecture compute_70' on
both cu132-ubuntu and cu132-windows
- CUDA 13 nvcc no longer accepts -gencode for sm_70. The deprecation
  warning had been visible since CUDA 12 ('Support for offline
  compilation for architectures prior to sm_75 will be removed in a
  future release'); CUDA 13 is that release.
- Fix: remove sm_70 from the cu132 archs list. cu128 keeps sm_70
  because CUDA 12.8 still supports it. Volta users (V100, Titan V,
  Quadro GV100, etc.) install sfa-cu128.

Issue 2 - cu128-ubuntu test phase failed compiling scipy from source
- The sfa wheel itself built and was repaired (auditwheel exclude
  rules worked). The failure was when cibuildwheel ran the test
  command in a fresh venv: cp310 installed scipy 1.15.3 from a
  manylinux2014 wheel and succeeded, but cp311 picked up scipy 1.16+
  which has dropped manylinux2014 wheels. pip then fell back to an
  sdist build, the manylinux2014 container had no OpenBLAS, and
  meson aborted with 'Dependency OpenBLAS not found'.
- Fix: set CIBW_MANYLINUX_X86_64_IMAGE to manylinux_2_28. That is the
  base image scipy 1.16+ targets and matches what the rest of the
  scientific-Python wheel matrix has converged to. The bind-mount of
  /usr/local/cuda continues to work the same way.

After these two changes the expected matrix outcome is:
- CPU universal wheel        : pass (already passing)
- sdist                       : pass (already passing)
- cuda-sfa-cu128-ubuntu       : should pass (manylinux_2_28 scipy)
- cuda-sfa-cu128-windows      : pass (already passing)
- cuda-sfa-cu132-ubuntu       : should pass (no sm_70, manylinux_2_28)
- cuda-sfa-cu132-windows      : should pass (no sm_70)
---
 .github/workflows/wheels.yml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 821d36f..ce8cd52 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -128,7 +128,10 @@ jobs:
               nvidia-cuda-nvrtc-cu12>=12.8,<12.9
           - pkg: sfa-cu132
             toolkit: '13.2.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
+            # sm_70 (Volta) is gone from CUDA 13's nvcc - the prior dry
+            # run hit `nvcc fatal: Unsupported gpu architecture 'compute_70'`
+            # on both Linux and Windows. Volta users stay on sfa-cu128.
+            archs: 'sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
             runtime_requires: >-
               nvidia-cublas-cu13>=13.2,<13.3
               nvidia-cuda-runtime-cu13>=13.2,<13.3
@@ -212,6 +215,13 @@ jobs:
         uses: pypa/cibuildwheel@v2.21
         env:
           CIBW_ARCHS: ${{ runner.os == 'Windows' && 'AMD64' || 'x86_64' }}
+          # Use a manylinux base new enough that scipy still publishes
+          # binary wheels for it. scipy 1.16+ (picked up automatically
+          # on cp311+) dropped manylinux2014 in favour of manylinux_2_28,
+          # so the older default image forced a from-source scipy build
+          # at the test phase; meson then aborted because the container
+          # has no OpenBLAS.
+          CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           # Linux build runs inside a manylinux container; bind-mount
           # the consolidated host CUDA tree (`/usr/local/cuda` symlink
           # is resolved by Docker to its versioned target dir).

From 7c68298c7a6cf4d50d3e091a191d91ad984c5123 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 17:22:36 +0900
Subject: [PATCH 6/8] CI(wheels): add cudart_dev to Windows CUDA sub-packages

Fifth dry run (27057065902) graduated 5 of 6 cells; cu132-windows
was the lone holdout, dying with

  CUDA\v13.2\include\cuda_runtime.h(82): fatal error C1083:
  Cannot open include file: 'crt/host_config.h':
  No such file or directory

CUDA 13's Windows installer split the runtime developer headers
(including crt/host_config.h, which cuda_runtime.h pulls in to wire
nvcc up to the host MSVC) into a separate `cudart_dev` sub-package.
The CUDA 12.8 Windows installer kept those headers inside `cudart`,
so the prior config covered cu128-windows by accident but had no
chance against cu132.

Fix: add cudart_dev to the Windows sub-packages list. The package
also exists on CUDA 12.8 (where it is a thin no-op overlay), so the
same list works for both wheels.

Linux is unaffected: the Linux toolkit install already lists
"cudart-dev" alongside "cudart", and those headers landed in
/usr/local/cuda/include/crt/ as expected.

Expected outcome of the next run: all 6 cells green.
---
 .github/workflows/wheels.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index ce8cd52..40f95b9 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -172,7 +172,12 @@ jobs:
         with:
           cuda: ${{ matrix.cuda.toolkit }}
           method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
+          # `cudart_dev` is needed in addition to `cudart` starting with
+          # CUDA 13: the Windows installer split the runtime developer
+          # headers (including crt/host_config.h, required by nvcc to
+          # talk to MSVC) out of `cudart` into a separate sub-package.
+          # Listing both also works on CUDA 12.8.
+          sub-packages: '["nvcc", "cudart", "cudart_dev", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
 
       - name: Set up MSVC environment (Windows)
         # cibuildwheel spawns the build in a subprocess that does NOT

From edeca22b1c8010af011aef1343e984ae7b16c734 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 17:26:45 +0900
Subject: [PATCH 7/8] CI(wheels): switch Windows CUDA install to local
 installer

Sixth dry run (27057337822) regressed both Windows cells: adding
`cudart_dev` to the Windows sub-packages list (which I thought would
just be a no-op on CUDA 12.8) instead broke the install on BOTH
cu128 and cu132 because that sub-package name does not exist on
Windows for either CUDA version. Linux jobs were unaffected.

Root cause of the underlying problem: NVIDIA's Windows installer
reorganised which sub-package carries crt/host_config.h between
CUDA 12 and CUDA 13, and the new owner is not consistently called
`cudart_dev`. Different secondary sources name it differently and
none of those names work for both 12.8 and 13.2.

Fix: skip the sub-package guessing game entirely on Windows by
switching to method: 'local'. Jimver then downloads the full NVIDIA
installer .exe and runs it silently, which lays down the complete
include tree (crt/host_config.h included) regardless of how NVIDIA
re-tags individual chunks in future point releases.

Cost: one extra ~3 GB download per Windows cell. cu128-windows had
been completing in 11 minutes on the network method, so the local
method should land it somewhere around 14-16 minutes - well within
the GitHub Actions job budget.

Linux keeps the network method + curated sub-packages + cuBLAS
staging step. That combination was verified green in the previous
run (cu128-ubuntu 4m55s, cu132-ubuntu 4m57s).
---
 .github/workflows/wheels.yml | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 40f95b9..a3f0e45 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -171,13 +171,18 @@ jobs:
         uses: Jimver/cuda-toolkit@v0.2.35
         with:
           cuda: ${{ matrix.cuda.toolkit }}
-          method: 'network'
-          # `cudart_dev` is needed in addition to `cudart` starting with
-          # CUDA 13: the Windows installer split the runtime developer
-          # headers (including crt/host_config.h, required by nvcc to
-          # talk to MSVC) out of `cudart` into a separate sub-package.
-          # Listing both also works on CUDA 12.8.
-          sub-packages: '["nvcc", "cudart", "cudart_dev", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
+          # Use the full local installer on Windows. The `network` method
+          # with cherry-picked sub-packages worked on CUDA 12.8 but on
+          # CUDA 13 nvcc fails with "Cannot open include file:
+          # 'crt/host_config.h'" because that header has been moved
+          # behind a sub-package whose exact name varies between minor
+          # CUDA versions (and isn't always called `cudart_dev` -
+          # adding that string broke even the 12.8 install). The local
+          # installer ships the whole toolkit unconditionally, so every
+          # CUDA major / minor we plug into the matrix is guaranteed to
+          # land a complete include tree. Trade-off: a one-time ~3 GB
+          # download per cell.
+          method: 'local'
 
       - name: Set up MSVC environment (Windows)
         # cibuildwheel spawns the build in a subprocess that does NOT

From 5b9f900419d05ca12ac41e9dac7edabe5aa572d5 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 18:20:09 +0900
Subject: [PATCH 8/8] Release 0.2.0

Bump version 0.2.0.dev0 -> 0.2.0 in pyproject.toml and sfa/__init__.py,
and enable the publish-to-pypi job in wheels.yml (the prior `if: false`
guard is replaced by a tag-ref check) so that pushing a v0.2.0 tag from
main triggers wheel build + PyPI upload.

The wheels.yml matrix itself is unchanged. The same six cells (CPU
universal, sdist, sfa-cu128 / sfa-cu132 on ubuntu and windows) that
were just verified green in dry run 27057416779 will run again on the
tag push, this time producing 0.2.0 artifacts and shipping them to
PyPI through the configured trusted-publisher relationships for sfa,
sfa-cu128, and sfa-cu132.
---
 .github/workflows/wheels.yml | 2 +-
 pyproject.toml               | 2 +-
 sfa/__init__.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a3f0e45..efe4eb2 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -290,7 +290,7 @@ jobs:
     name: publish-to-pypi
     needs: [build_cpu_wheel, build_sdist, build_cuda_wheels]
     runs-on: ubuntu-latest
-    if: false  # set to `startsWith(github.ref, 'refs/tags/v')` to enable
+    if: startsWith(github.ref, 'refs/tags/v')
     permissions:
       id-token: write
     steps:
diff --git a/pyproject.toml b/pyproject.toml
index 408dfb5..395ca2c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 # wheels under the SFA_PACKAGE_NAME env var.
 [project]
 name = "sfa"
-version = "0.2.0.dev0"
+version = "0.2.0"
 description = "Signal flow analysis"
 readme = "README.md"
 license = { text = "MIT" }
diff --git a/sfa/__init__.py b/sfa/__init__.py
index b79d3de..1cf5359 100644
--- a/sfa/__init__.py
+++ b/sfa/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.0.dev0"
+__version__ = "0.2.0"
 
 from .base import *
 from .containers import AlgorithmSet