diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a23c066..efe4eb2 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -11,69 +11,52 @@ on:
 # ---------------------------------------------------------------------------
 # Coverage matrix (what gets produced on a tag push)
 # ---------------------------------------------------------------------------
-#   CPU wheels  (package name: sfa)
-#     - Linux   manylinux2014 x86_64
-#     - Windows AMD64
-#     - macOS   x86_64        (Intel)
-#     - macOS   arm64         (Apple Silicon)
-#   CUDA wheels (package name: sfa-cu130)
+#   CPU wheel   (package name: sfa)
+#     - one universal py3-none-any.whl. CPU sfa has no native extension,
+#       so a single pure-Python wheel covers Linux / macOS / Windows and
+#       every supported Python version (3.10 - 3.13).
+#   CUDA wheels (package names: sfa-cu128, sfa-cu132; sfa-cu133 is disabled)
 #     - Linux   manylinux2014 x86_64
 #     - Windows AMD64
+#     - Each wheel is built per Python version by cibuildwheel.
 #   sdist
 #     - one source distribution
 #
 # Not covered (and why):
-#   - Linux aarch64   : achievable via QEMU; add an `aarch64` include line
-#                       below to opt in. Slow on x86_64 runners.
-#   - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users will
-#                       fall back to the sdist build-from-source path.
-#   - Windows ARM64   : no first-party runner yet; pip will fall back to
-#                       the sdist.
 #   - macOS + CUDA    : Apple dropped NVIDIA support in 2019. Impossible.
+#   - macOS Intel     : macos-13 runner pool is exhausted; sfa CPU is pure
+#                       Python so Intel Mac users still install via the
+#                       universal wheel above.
+#   - Linux aarch64   : achievable via QEMU; out of scope for now.
+#   - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users fall
+#                       back to the sdist build-from-source path.
+#   - Windows ARM64   : no first-party runner yet; pip falls back to sdist.
 # ---------------------------------------------------------------------------
 
 jobs:
   # ---------------------------------------------------------------------------
-  # CPU wheels (cross-platform). Package name: `sfa`.
+  # CPU wheel (universal, pure Python). Package name: `sfa`.
+  # Single artifact: `sfa-<version>-py3-none-any.whl`.
   # ---------------------------------------------------------------------------
-  build_cpu_wheels:
-    name: cpu-${{ matrix.os }}-${{ matrix.cibw_archs }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { os: ubuntu-latest,  cibw_archs: x86_64 }
-          - { os: windows-latest, cibw_archs: AMD64 }
-          # macOS Apple Silicon only. Intel Mac (macos-13 x86_64) is dropped
-          # because GitHub's macos-13 runner pool is exhausted as Apple winds
-          # down macOS 13. CPU-only sfa is pure Python, so Intel Mac users
-          # still get a working install via sdist fallback.
-          - { os: macos-14,       cibw_archs: arm64 }
-          # Opt-in: Linux aarch64 via QEMU. Uncomment to publish for
-          # Raspberry Pi / AWS Graviton / etc. Slow (~20 min per Python).
-          # - { os: ubuntu-latest, cibw_archs: aarch64 }
+  build_cpu_wheel:
+    name: cpu-universal-wheel
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v5
-
-      # QEMU is only needed when cibw_archs is a non-native arch (aarch64).
-      - name: Set up QEMU for aarch64 builds
-        if: matrix.cibw_archs == 'aarch64'
-        uses: docker/setup-qemu-action@v3
+      - uses: actions/setup-python@v6
         with:
-          platforms: arm64
-
-      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21
+          python-version: '3.12'
+      - name: Build the universal CPU wheel
         env:
-          CIBW_ARCHS: ${{ matrix.cibw_archs }}
-          # Force CPU-only build on every cibuildwheel target.
-          CIBW_ENVIRONMENT: "SFA_BUILD_CUDA=0 SFA_PACKAGE_NAME=sfa"
-
+          SFA_BUILD_CUDA: "0"
+          SFA_PACKAGE_NAME: "sfa"
+        run: |
+          python -m pip install --upgrade pip build
+          python -m build --wheel
       - uses: actions/upload-artifact@v4
         with:
-          name: cpu-wheels-${{ matrix.os }}-${{ matrix.cibw_archs }}
-          path: ./wheelhouse/*.whl
+          name: cpu-wheel-universal
+          path: dist/*.whl
 
   # ---------------------------------------------------------------------------
   # Source distribution. One artifact across the whole matrix.
@@ -91,7 +74,7 @@ jobs:
           SFA_BUILD_CUDA: "0"
           SFA_PACKAGE_NAME: "sfa"
         run: |
-          pip install build
+          python -m pip install --upgrade pip build
           python -m build --sdist
       - uses: actions/upload-artifact@v4
         with:
@@ -99,8 +82,8 @@ jobs:
           path: dist/*.tar.gz
 
   # ---------------------------------------------------------------------------
-  # CUDA wheels (Linux + Windows). Three variants per OS, named after the
-  # CUDA major.minor they were built against:
+  # CUDA wheels (Linux + Windows). One package name per CUDA
+  # major.minor the wheels are built against:
   #
   #   sfa-cu128  -> CUDA 12.8.x  (NVIDIA driver >= 570)
   #   sfa-cu132  -> CUDA 13.2.x  (NVIDIA driver >= 580)
@@ -108,10 +91,25 @@ jobs:
   #
   # Each wheel declares a pinned dependency on the NVIDIA runtime PyPI
   # packages (nvidia-cublas-cuXX, nvidia-cuda-runtime-cuXX) for the same
-  # CUDA major, so `pip install sfa-cu13X` brings cuBLAS / cudart along
+  # CUDA major, so `pip install sfa-cu1XX` brings cuBLAS / cudart along
   # automatically. No system-wide CUDA toolkit install needed.
   #
-  # macOS is intentionally excluded - Apple dropped NVIDIA support in 2019.
+  # Implementation notes:
+  #
+  # - Jimver/cuda-toolkit@v0.2.35 supports CUDA 13.2 as the default
+  #   version. Older v0.2.21 only knew up to CUDA 13.1, which is why
+  #   the previous workflow run failed with "Version not available".
+  # - Ubuntu's CUDA 12.x/13.x apt repos ship cuBLAS under the
+  #   `libcublas-*` package names, not `cuda-cublas-*` (NVRTC keeps the
+  #   `cuda-` prefix). The Jimver action automatically prefixes `cuda-`
+  #   to `sub-packages`, so cuBLAS must go in `non-cuda-sub-packages`.
+  # - cibuildwheel parses CIBW_ENVIRONMENT with bashlex. Values containing
+  #   `;` or `<` (semicolons in SM lists, `<` in version specifiers) must
+  #   be wrapped in double quotes inside the value, otherwise bash treats
+  #   them as statement separators / redirection. CIBW_ENVIRONMENT_LINUX
+  #   and CIBW_ENVIRONMENT_WINDOWS keep Linux-only paths (CUDA_PATH,
+  #   PATH) out of the Windows runs, which use Windows-native NVIDIA
+  #   toolkit paths populated by the Jimver action.
   # ---------------------------------------------------------------------------
   build_cuda_wheels:
     name: cuda-${{ matrix.cuda.pkg }}-${{ matrix.os }}
@@ -123,61 +121,155 @@ jobs:
         cuda:
           - pkg: sfa-cu128
             toolkit: '12.8.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
+            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90'
             runtime_requires: >-
               nvidia-cublas-cu12>=12.8,<12.9
               nvidia-cuda-runtime-cu12>=12.8,<12.9
               nvidia-cuda-nvrtc-cu12>=12.8,<12.9
           - pkg: sfa-cu132
             toolkit: '13.2.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
+            # sm_70 (Volta) is gone from CUDA 13's nvcc - the prior dry
+            # run hit `nvcc fatal: Unsupported gpu architecture 'compute_70'`
+            # on both Linux and Windows. Volta users stay on sfa-cu128.
+            archs: 'sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
             runtime_requires: >-
               nvidia-cublas-cu13>=13.2,<13.3
               nvidia-cuda-runtime-cu13>=13.2,<13.3
               nvidia-cuda-nvrtc-cu13>=13.2,<13.3
-          - pkg: sfa-cu133
-            toolkit: '13.3.0'
-            archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120'
-            runtime_requires: >-
-              nvidia-cublas-cu13>=13.3,<13.4
-              nvidia-cuda-runtime-cu13>=13.3,<13.4
-              nvidia-cuda-nvrtc-cu13>=13.3,<13.4
+          # sfa-cu133 (CUDA 13.3) is temporarily disabled: Jimver/cuda-toolkit
+          # v0.2.35 does not yet have a 13.3 entry in its version table, so
+          # the install step reports 'Version not available: 13.3.0'. Re-add
+          # this matrix row when a newer Jimver release ships with 13.3
+          # support.
     steps:
       - uses: actions/checkout@v5
 
-      - name: Install CUDA toolkit (host)
-        # Jimver/cuda-toolkit fetches the official NVIDIA installers and
-        # places nvcc + cudart-dev + libcublas-dev on the runner. On Linux
-        # the host install is then bind-mounted into the manylinux
-        # container via CIBW_CONTAINER_ENGINE below. Bump the action
-        # version if a newer CUDA release does not resolve here.
-        uses: Jimver/cuda-toolkit@v0.2.21
+      # Linux and Windows use different sub-package naming conventions:
+      #
+      # - Linux apt repos use a `cuda-` prefix for most CUDA components
+      #   (cuda-nvcc-12-8, cuda-cudart-dev-12-8, cuda-nvrtc-12-8, ...).
+      #   The exception is cuBLAS, which ships as a separate `lib*`
+      #   package family (libcublas-12-8, libcublas-dev-12-8) and must
+      #   therefore go through Jimver's `non-cuda-sub-packages` input.
+      # - Windows uses the NVIDIA Windows installer's sub-package names,
+      #   which are unprefixed and use underscores for the dev variants
+      #   (nvcc, cudart, cublas, cublas_dev, nvrtc, nvrtc_dev). There is
+      #   no separate cudart-dev on Windows; the cudart sub-package
+      #   already includes the headers.
+
+      - name: Install CUDA toolkit (Linux)
+        if: runner.os == 'Linux'
+        uses: Jimver/cuda-toolkit@v0.2.35
         with:
           cuda: ${{ matrix.cuda.toolkit }}
           method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]'
+          sub-packages: '["nvcc", "cudart", "cudart-dev", "nvrtc", "nvrtc-dev"]'
+          non-cuda-sub-packages: '["libcublas", "libcublas-dev"]'
+
+      - name: Install CUDA toolkit (Windows)
+        if: runner.os == 'Windows'
+        uses: Jimver/cuda-toolkit@v0.2.35
+        with:
+          cuda: ${{ matrix.cuda.toolkit }}
+          # Use the full local installer on Windows. The `network` method
+          # with cherry-picked sub-packages worked on CUDA 12.8 but on
+          # CUDA 13 nvcc fails with "Cannot open include file:
+          # 'crt/host_config.h'" because that header has been moved
+          # behind a sub-package whose exact name varies between minor
+          # CUDA versions (and isn't always called `cudart_dev` -
+          # adding that string broke even the 12.8 install). The local
+          # installer ships the whole toolkit unconditionally, so every
+          # CUDA major / minor we plug into the matrix is guaranteed to
+          # land a complete include tree. Trade-off: a one-time ~3 GB
+          # download per cell.
+          method: 'local'
+
+      - name: Set up MSVC environment (Windows)
+        # cibuildwheel spawns the build in a subprocess that does NOT
+        # inherit a Developer Command Prompt, so cl.exe is not on PATH
+        # and nvcc fails with 'Cannot find compiler cl.exe in PATH'.
+        # This action sets the MSVC env vars (VCINSTALLDIR, PATH, ...)
+        # for subsequent steps on the runner so the cibuildwheel
+        # subprocess can find cl.exe.
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+
+      - name: Stage cuBLAS under CUDA_HOME (Linux only)
+        # libcublas / libcublas-dev install headers to /usr/include and
+        # shared libs to /usr/lib/x86_64-linux-gnu/, while setup.py only
+        # looks under $CUDA_HOME/{include,lib64}/. NVRTC was pulled in
+        # with the `cuda-` prefix above and is already in CUDA_HOME, so
+        # only cuBLAS needs staging.
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |
+          set -euo pipefail
+          CUDA_HOME=/usr/local/cuda
+          sudo mkdir -p "$CUDA_HOME/include" "$CUDA_HOME/lib64"
+          for h in cublas.h cublas_v2.h cublas_api.h cublasLt.h; do
+            if [ -f "/usr/include/$h" ]; then
+              sudo cp "/usr/include/$h" "$CUDA_HOME/include/"
+            fi
+          done
+          sudo find /usr/lib/x86_64-linux-gnu/ -maxdepth 1 \
+            -name 'libcublas*.so*' \
+            -exec cp -a {} "$CUDA_HOME/lib64/" \;
+          echo "--- $CUDA_HOME/include (cublas only) ---"
+          ls -la "$CUDA_HOME/include/" | grep -E 'cublas|nvrtc' || true
+          echo "--- $CUDA_HOME/lib64 (cublas only) ---"
+          ls -la "$CUDA_HOME/lib64/" | grep -E 'cublas|nvrtc' || true
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.21
         env:
           CIBW_ARCHS: ${{ runner.os == 'Windows' && 'AMD64' || 'x86_64' }}
-          # Linux: cibuildwheel runs the build inside a manylinux Docker
-          # container; bind-mount the host CUDA install + nvcc symlink
-          # so the build inside the container can find them.
+          # Use a manylinux base new enough that scipy still publishes
+          # binary wheels for it. scipy 1.16+ (picked up automatically
+          # on cp311+) dropped manylinux2014 in favour of manylinux_2_28,
+          # so the older default image forced a from-source scipy build
+          # at the test phase (which needs OpenBLAS and meson, neither
+          # available in the container).
+          CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
+          # Linux build runs inside a manylinux container; bind-mount
+          # the consolidated host CUDA tree (`/usr/local/cuda` symlink
+          # is resolved by Docker to its versioned target dir).
           CIBW_CONTAINER_ENGINE: >-
             docker; create_args: -v /usr/local/cuda:/usr/local/cuda:ro
-          # Build env passed through to setup.py.
-          CIBW_ENVIRONMENT: >-
+          # Linux-only env: bind-mounted CUDA path + PATH update.
+          CIBW_ENVIRONMENT_LINUX: >-
             SFA_BUILD_CUDA=1
             SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
-            SFA_CUDA_ARCH=${{ matrix.cuda.archs }}
+            SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
             SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
             CUDA_PATH=/usr/local/cuda
             PATH=/usr/local/cuda/bin:$PATH
+          # Windows: Jimver sets CUDA_PATH / CUDA_PATH_VXX_Y on the
+          # host already, and they propagate into the cibuildwheel
+          # subprocess automatically.
+          CIBW_ENVIRONMENT_WINDOWS: >-
+            SFA_BUILD_CUDA=1
+            SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
+            SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
+            SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
           # GitHub runners have no NVIDIA GPU, so the CUDA-gated pytest
           # tests skip via tests/_skip_helpers.py (device_count() == 0).
-          # The verification script verifies the native extension loads
-          # (which exercises the bundled NVIDIA runtime libs).
+          # The verification script checks that the native extension
+          # loads (which exercises the pip-installed NVIDIA runtime libs).
+          # Linux only: tell auditwheel NOT to bundle the NVIDIA runtime
+          # shared libs into the wheel. The wheel declares pinned PyPI
+          # dependencies on nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX /
+          # nvidia-cuda-nvrtc-cuXX through SFA_CUDA_RUNTIME_REQUIRES, so
+          # the runtime libs arrive via pip at install time. The excludes
+          # cover both major sonames (.so.12 for cu128, .so.13 for cu132).
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
+            auditwheel repair -w {dest_dir} {wheel}
+            --exclude libcudart.so.12 --exclude libcudart.so.13
+            --exclude libcublas.so.12 --exclude libcublas.so.13
+            --exclude libcublasLt.so.12 --exclude libcublasLt.so.13
+            --exclude libnvrtc.so.12 --exclude libnvrtc.so.13
+            --exclude libnvrtc-builtins.so.12 --exclude libnvrtc-builtins.so.13
           CIBW_TEST_REQUIRES: "pytest"
           CIBW_TEST_COMMAND: >-
             python {package}/tests/verification.py
@@ -190,15 +282,15 @@ jobs:
 
   # ---------------------------------------------------------------------------
   # Optional: publish to PyPI on tag pushes. Requires the OIDC trusted-
-  # publisher relationship to be configured at https://pypi.org for both
-  # the `sfa` and `sfa-cu130` projects. Disabled by default; change the
-  # `if:` guard to enable.
+  # publisher relationship to be configured at https://pypi.org for each
+  # of `sfa`, `sfa-cu128`, `sfa-cu132` (`sfa-cu133` is not built yet).
+  # Runs on `v*` tag pushes only; see the `if:` guard below.
   # ---------------------------------------------------------------------------
   publish:
     name: publish-to-pypi
-    needs: [build_cpu_wheels, build_sdist, build_cuda_wheels]
+    needs: [build_cpu_wheel, build_sdist, build_cuda_wheels]
     runs-on: ubuntu-latest
-    if: false  # set to `startsWith(github.ref, 'refs/tags/v')` to enable
+    if: startsWith(github.ref, 'refs/tags/v')
     permissions:
       id-token: write
     steps:
diff --git a/INSTALL.md b/INSTALL.md
index 3221340..0c0d9b5 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -10,7 +10,6 @@ NVIDIA driver new enough for that CUDA version.
 | `sfa`        | none    | -                  | Linux, macOS, Windows  |
 | `sfa-cu128`  | 12.8.x  | 570 (Linux / Win)  | Linux, Windows         |
 | `sfa-cu132`  | 13.2.x  | 580                | Linux, Windows         |
-| `sfa-cu133`  | 13.3.x  | 580                | Linux, Windows         |
 
 All CUDA wheels share the same AOT-compiled SASS matrix (SM 7.0
 through SM 12.0: Volta, Turing, Ampere, Ada, Hopper, Blackwell), plus
@@ -37,14 +36,13 @@ that is the maximum CUDA version your driver supports.
 
 | Package     | CUDA bundled | Minimum NVIDIA driver | When to pick                                              |
 |-------------|--------------|------------------------|-----------------------------------------------------------|
-| `sfa-cu133` | 13.3.x       | 580                    | Newest hardware / drivers; default for fresh installs.    |
-| `sfa-cu132` | 13.2.x       | 580                    | Matches the `sfa-cu132` conda env used for development.   |
+| `sfa-cu132` | 13.2.x       | 580                    | Newest CUDA stack; matches `environment-cuda.yml`.        |
 | `sfa-cu128` | 12.8.x       | 570                    | Older driver (CUDA 12 line); broadest backwards compat.   |
 
 Example (install the newest one):
 
 ```bash
-pip install sfa-cu133
+pip install sfa-cu132
 ```
 
 Requires Python 3.10+. macOS is not supported because Apple ended
@@ -81,15 +79,15 @@ the host compiler, and `conda` will not install it for you.
 git clone https://github.com/dwgoon/sfa.git && cd sfa
 
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .                 # builds the CUDA extension via the env's nvcc
 
 # CPU-only variant (skip CUDA even if nvcc is on PATH):
 SFA_BUILD_CUDA=0 pip install -e .
 ```
 
-This is also how the project maintainers build on Windows: the
-`sfa-cu132` env provides `nvcc` and cuBLAS, while system MSVC handles
+This is also how the project maintainers build on Windows: the `sfa`
+env provides `nvcc` and cuBLAS, while system MSVC handles
 `bindings.cpp`. The resulting extension is e.g.
 `sfa/_cuda/_native.cp312-win_amd64.pyd`.
 
@@ -98,8 +96,8 @@ is what the maintainers test against. The same workflow works for any
 CUDA major / minor that has a `cuda-toolkit` build on the `nvidia`
 channel: edit the two `cuda-version` / `cuda-toolkit` pins in lockstep
 (see [What `environment-cuda.yml` provides](#what-environment-cudayml-provides)
-below) and rename the env on the first line of the file. CUDA 12.8 and
-13.3 environments have been tested in CI.
+below) and rename the env on the first line of the file. CUDA 12.8
+and 13.2 environments have been tested in CI.
 
 ### Option B: conda-free build (system CUDA + system C++ compiler)
 
@@ -180,7 +178,7 @@ and falls through to a CPU-only build (printing
 ### What `environment-cuda.yml` provides
 
 The shipped conda environment file creates a self-contained build
-environment named `sfa-cu132` that does **not** require any
+environment named `sfa` that does **not** require any
 system-wide CUDA install. Everything the build needs - the CUDA
 compiler, the CUDA runtime, cuBLAS headers and import libs, plus the
 Python build and runtime dependencies - is pulled in from the
@@ -199,14 +197,14 @@ Concretely, the file pins:
 
 The `cuda-toolkit` meta-package pulls in `nvcc`, `cudart`, `nvrtc`,
 `cccl`, `cupti`, the profiler API, and the rest of the CUDA dev
-toolchain. After `conda activate sfa-cu132`, `nvcc` is on `PATH` and
+toolchain. After `conda activate sfa`, `nvcc` is on `PATH` and
 `setup.py`'s CUDA-extension build picks it up automatically.
 
 Notes for adjusting the file:
 
 - To target a different CUDA major version, change the two `nvidia::`
   pins (`cuda-version` and `cuda-toolkit`) in lockstep. The env name
-  on the first line (`sfa-cu132`) is just a label; rename it freely.
+  on the first line (`sfa`) is just a label; rename it freely.
 - A host C++ compiler is still required (MSVC on Windows, GCC on
   Linux). The toolchain itself is not bundled by `cuda-toolkit`;
   conda will not install it for you.
@@ -220,7 +218,7 @@ Notes for adjusting the file:
 |----------------------|------------------------------------------------------------------------|
 | `SFA_BUILD_CUDA`     | `0` to force a pure-Python install. Default: build if `nvcc` is found. |
 | `SFA_CUDA_ARCH`      | Semicolon-separated SM list, e.g. `sm_89` (dev) or `sm_70;sm_80;sm_89`. Default: the full wheel-wide AOT matrix. |
-| `SFA_PACKAGE_NAME`   | Override the PyPI name (used by CI to produce e.g. `sfa-cu132` or `sfa-cu133` from the same source tree). |
+| `SFA_PACKAGE_NAME`   | Override the PyPI name (used by CI to produce e.g. `sfa-cu128` or `sfa-cu132` from the same source tree). |
 
 ## Verify the install
 
diff --git a/README.md b/README.md
index 9d2a990..ab91746 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,6 @@ set of CUDA optimized `sfa-cuXYZ` versions:
 | `sfa`         | none   | -                   | Linux, macOS, Windows  |
 | `sfa-cu128`   | 12.8.x | 570 (Linux / Win)   | Linux, Windows         |
 | `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows         |
-| `sfa-cu133`   | 13.3.x | 580                 | Linux, Windows         |
 
 Each CUDA wheel ships ahead-of-time compiled SASS for NVIDIA SM 7.0
 through SM 12.0 (Volta, Turing, Ampere, Ada, Hopper, Blackwell) plus a
@@ -67,7 +66,7 @@ supports.
 Example (install the newest one):
 
 ```bash
-pip install sfa-cu133
+pip install sfa-cu132
 ```
 
 > [!IMPORTANT]
@@ -87,7 +86,7 @@ self-contained env):
 ```bash
 git clone https://github.com/dwgoon/sfa.git && cd sfa
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .
 ```
 
@@ -280,7 +279,7 @@ S_gpu = compute_influence(
 )
 ```
 
-## Benchmarks
+## Performance benchmarks
 
 ### Hardware setup
 
@@ -329,24 +328,24 @@ S_gpu = compute_influence(
 
 ### Small networks
 
-| # Nodes | # Edges  | CPU iter (FP64) ms | CPU LAPACK (FP64) ms | CUDA (FP64) ms        |
-|---------|----------|--------------------|----------------------|-----------------------|
-|    32   | 992      | 0.1 ± 0.0          | 0.2 ± 0.0 (0.4x)     | 1.3 ± 0.2 (0.06x)     |
-|    64   |  ~4.0 K  | 0.2 ± 0.0          | 0.2 ± 0.0 (0.8x)     | 1.4 ± 0.1 (0.13x)     |
-|   128   | ~16.3 K  | 2.5 ± 0.0          | 0.4 ± 0.0 (**7.2x**) | 1.9 ± 0.1 (1.3x)      |
-|   256   | ~65.3 K  | 6.9 ± 0.2          | 2.4 ± 0.1 (**2.8x**) | 3.1 ± 0.8 (2.2x)      |
-|   512   |  ~262 K  | 38.8 ± 1.7         | 190 ± 46 (0.2x)      | 6.4 ± 0.2 (**6.0x**)  |
-|  1024   | ~1.05 M  | 180 ± 8            | 486 ± 89 (0.4x)      | 47 ± 10 (**3.8x**)    |
-|  2048   | ~4.19 M  | 2140 ± 320         | 3880 ± 2990 (0.6x)   | 245 ± 2 (**8.7x**)    |
-|  4096   | ~16.8 M  | 12520 ± 2380       | 5690 ± 1390 (2.2x)   | 4320 ± 580 (**2.9x**) |
+| # Nodes | # Edges  | CPU iter (FP64)    | CPU LAPACK (FP64)         | CUDA (FP64)                 |
+|---------|----------|--------------------|---------------------------|-----------------------------|
+|    32   | 992      | 0.1 ± 0.0 ms       | 0.2 ± 0.0 ms (0.4x)       | 1.3 ± 0.2 ms (0.06x)        |
+|    64   |  ~4.0 K  | 0.2 ± 0.0 ms       | 0.2 ± 0.0 ms (0.8x)       | 1.4 ± 0.1 ms (0.13x)        |
+|   128   | ~16.3 K  | 2.5 ± 0.0 ms       | 0.4 ± 0.0 ms (**7.2x**)   | 1.9 ± 0.1 ms (1.3x)         |
+|   256   | ~65.3 K  | 6.9 ± 0.2 ms       | 2.4 ± 0.1 ms (**2.8x**)   | 3.1 ± 0.8 ms (2.2x)         |
+|   512   |  ~262 K  | 38.8 ± 1.7 ms      | 190 ± 46 ms (0.2x)        | 6.4 ± 0.2 ms (**6.0x**)     |
+|  1024   | ~1.05 M  | 180 ± 8 ms         | 486 ± 89 ms (0.4x)        | 47 ± 10 ms (**3.8x**)       |
+|  2048   | ~4.19 M  | 2140 ± 320 ms      | 3880 ± 2990 ms (0.6x)     | 245 ± 2 ms (**8.7x**)       |
+|  4096   | ~16.8 M  | 12520 ± 2380 ms    | 5690 ± 1390 ms (2.2x)     | 4320 ± 580 ms (**2.9x**)    |
 
 ### Large networks
 
-| # Nodes | # Edges | CPU LAPACK (FP64) s | CUDA TF32 (FP32) s   | CUDA FP32 (no TF32) s | CUDA FP16 s              |
-|---------|---------|---------------------|----------------------|-----------------------|--------------------------|
-|  5000   |  ~25 M  |  5.10 ± 2.24             | 0.366 ± 0.027 (14x)  | 0.356 ± 0.034 (14x)   | 0.349 ± 0.037 (**15x**)  |
-| 10000   | ~100 M  | 17.60 ± 0.57             | 1.55 ± 0.05 (11x)    | 4.07 ± 0.06 (4.3x)    | 1.13 ± 0.16 (**16x**)    |
-| 20000   | ~400 M  | 70.88 ± 0.79             | 9.13 ± 0.10 (7.8x)   | 16.30 ± 0.28 (4.3x)   | 4.28 ± 0.02 (**17x**)    |
+| # Nodes | # Edges | CPU LAPACK (FP64) | CUDA TF32 (FP32)         | CUDA FP32 (no TF32)        | CUDA FP16                  |
+|---------|---------|-------------------|--------------------------|----------------------------|----------------------------|
+|  5000   |  ~25 M  |  5.10 ± 2.24 s    | 0.366 ± 0.027 s (14x)    | 0.356 ± 0.034 s (14x)      | 0.349 ± 0.037 s (**15x**)  |
+| 10000   | ~100 M  | 17.60 ± 0.57 s    | 1.55 ± 0.05 s (11x)      | 4.07 ± 0.06 s (4.3x)       | 1.13 ± 0.16 s (**16x**)    |
+| 20000   | ~400 M  | 70.88 ± 0.79 s    | 9.13 ± 0.10 s (7.8x)     | 16.30 ± 0.28 s (4.3x)      | 4.28 ± 0.02 s (**17x**)    |
 
 - CPU paths show noticeably higher variance than GPU paths (CPU
   LAPACK FP64 stddev reaches ~25-77% of the mean at small `N`),
diff --git a/doc/install.md b/doc/install.md
index ad3f27c..4e95166 100644
--- a/doc/install.md
+++ b/doc/install.md
@@ -9,8 +9,7 @@ one** into a given environment.
 |---------------|--------|---------------------|-----------------------------|
 | `sfa`         | none   | -                   | Linux, macOS, Windows       |
 | `sfa-cu128`   | 12.8.x | 570 (Linux / Win)   | Linux, Windows              |
-| `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows              |
-| `sfa-cu133`   | 13.3.x | 580                 | Linux, Windows (newest)     |
+| `sfa-cu132`   | 13.2.x | 580                 | Linux, Windows (newest)     |
 
 ## Requirements
 
@@ -33,10 +32,9 @@ Run `nvidia-smi` and look at the "CUDA Version" column. That is the
 number:
 
 ```text
-nvidia-smi -> "CUDA Version: 13.3"  -> any of sfa-cu128 / cu132 / cu133
-nvidia-smi -> "CUDA Version: 13.0"  -> sfa-cu128
-nvidia-smi -> "CUDA Version: 12.8"  -> sfa-cu128
-nvidia-smi -> "CUDA Version: 12.6"  -> upgrade your driver or use `sfa` (CPU)
+nvidia-smi -> "CUDA Version: 13.2" or higher  -> sfa-cu132 or sfa-cu128
+nvidia-smi -> "CUDA Version: 12.8" - 13.1     -> sfa-cu128
+nvidia-smi -> "CUDA Version: 12.7" or lower   -> upgrade your driver or use `sfa` (CPU)
 ```
 
 When in doubt, start with `sfa-cu128` for the widest driver coverage
@@ -102,7 +100,7 @@ and the runtime Python deps:
 
 ```bash
 conda env create -f environment-cuda.yml
-conda activate sfa-cu132
+conda activate sfa
 pip install -e .
 ```
 
diff --git a/environment-cuda.yml b/environment-cuda.yml
index eec67e6..4525cb0 100644
--- a/environment-cuda.yml
+++ b/environment-cuda.yml
@@ -1,4 +1,4 @@
-name: sfa-cu132
+name: sfa
 channels:
   - nvidia
   - conda-forge
diff --git a/pyproject.toml b/pyproject.toml
index 408dfb5..395ca2c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 # wheels under the SFA_PACKAGE_NAME env var.
 [project]
 name = "sfa"
-version = "0.2.0.dev0"
+version = "0.2.0"
 description = "Signal flow analysis"
 readme = "README.md"
 license = { text = "MIT" }
diff --git a/sfa/__init__.py b/sfa/__init__.py
index b79d3de..1cf5359 100644
--- a/sfa/__init__.py
+++ b/sfa/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.0.dev0"
+__version__ = "0.2.0"
 
 from .base import *
 from .containers import AlgorithmSet