diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index a23c066..efe4eb2 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -11,69 +11,52 @@ on: # --------------------------------------------------------------------------- # Coverage matrix (what gets produced on a tag push) # --------------------------------------------------------------------------- -# CPU wheels (package name: sfa) -# - Linux manylinux2014 x86_64 -# - Windows AMD64 -# - macOS x86_64 (Intel) -# - macOS arm64 (Apple Silicon) -# CUDA wheels (package name: sfa-cu130) +# CPU wheel (package name: sfa) +# - one universal py3-none-any.whl. CPU sfa has no native extension, +# so a single pure-Python wheel covers Linux / macOS / Windows and +# every supported Python version (3.10 - 3.13). +# CUDA wheels (package names: sfa-cu128, sfa-cu132; sfa-cu133 is disabled for now) # - Linux manylinux2014 x86_64 # - Windows AMD64 +# - Each wheel is built per Python version by cibuildwheel. # sdist # - one source distribution # # Not covered (and why): -# - Linux aarch64 : achievable via QEMU; add an `aarch64` include line -# below to opt in. Slow on x86_64 runners. -# - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users will -# fall back to the sdist build-from-source path. -# - Windows ARM64 : no first-party runner yet; pip will fall back to -# the sdist. # - macOS + CUDA : Apple dropped NVIDIA support in 2019. Impossible. +# - macOS Intel : macos-13 runner pool is exhausted; sfa CPU is pure +# Python so Intel Mac users still install via the +# universal wheel above. +# - Linux aarch64 : achievable via QEMU; out of scope for now. +# - Linux musllinux : skipped via cibuildwheel `skip`. Alpine users fall +# back to the sdist build-from-source path. +# - Windows ARM64 : no first-party runner yet; pip falls back to sdist.
# --------------------------------------------------------------------------- jobs: # --------------------------------------------------------------------------- - # CPU wheels (cross-platform). Package name: `sfa`. + # CPU wheel (universal, pure Python). Package name: `sfa`. + # Single artifact: `sfa-<version>-py3-none-any.whl`. # --------------------------------------------------------------------------- - build_cpu_wheels: - name: cpu-${{ matrix.os }}-${{ matrix.cibw_archs }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - { os: ubuntu-latest, cibw_archs: x86_64 } - - { os: windows-latest, cibw_archs: AMD64 } - # macOS Apple Silicon only. Intel Mac (macos-13 x86_64) is dropped - # because GitHub's macos-13 runner pool is exhausted as Apple winds - # down macOS 13. CPU-only sfa is pure Python, so Intel Mac users - # still get a working install via sdist fallback. - - { os: macos-14, cibw_archs: arm64 } - # Opt-in: Linux aarch64 via QEMU. Uncomment to publish for - # Raspberry Pi / AWS Graviton / etc. Slow (~20 min per Python). - # - { os: ubuntu-latest, cibw_archs: aarch64 } + build_cpu_wheel: + name: cpu-universal-wheel + runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - - # QEMU is only needed when cibw_archs is a non-native arch (aarch64). - - name: Set up QEMU for aarch64 builds - if: matrix.cibw_archs == 'aarch64' - uses: docker/setup-qemu-action@v3 + - uses: actions/setup-python@v6 with: - platforms: arm64 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.21 + python-version: '3.12' + - name: Build the universal CPU wheel env: - CIBW_ARCHS: ${{ matrix.cibw_archs }} - # Force CPU-only build on every cibuildwheel target.
- CIBW_ENVIRONMENT: "SFA_BUILD_CUDA=0 SFA_PACKAGE_NAME=sfa" - + SFA_BUILD_CUDA: "0" + SFA_PACKAGE_NAME: "sfa" + run: | + python -m pip install --upgrade pip build + python -m build --wheel - uses: actions/upload-artifact@v4 with: - name: cpu-wheels-${{ matrix.os }}-${{ matrix.cibw_archs }} - path: ./wheelhouse/*.whl + name: cpu-wheel-universal + path: dist/*.whl # --------------------------------------------------------------------------- # Source distribution. One artifact across the whole matrix. @@ -91,7 +74,7 @@ jobs: SFA_BUILD_CUDA: "0" SFA_PACKAGE_NAME: "sfa" run: | - pip install build + python -m pip install --upgrade pip build python -m build --sdist - uses: actions/upload-artifact@v4 with: @@ -99,8 +82,8 @@ jobs: path: dist/*.tar.gz # --------------------------------------------------------------------------- - # CUDA wheels (Linux + Windows). Three variants per OS, named after the - # CUDA major.minor they were built against: + # CUDA wheels (Linux + Windows). One package name per CUDA release, + # after the CUDA major.minor it is built against: # # sfa-cu128 -> CUDA 12.8.x (NVIDIA driver >= 570) # sfa-cu132 -> CUDA 13.2.x (NVIDIA driver >= 580) @@ -108,10 +91,25 @@ jobs: # # Each wheel declares a pinned dependency on the NVIDIA runtime PyPI # packages (nvidia-cublas-cuXX, nvidia-cuda-runtime-cuXX) for the same - # CUDA major, so `pip install sfa-cu13X` brings cuBLAS / cudart along + # CUDA major, so `pip install sfa-cu1XX` brings cuBLAS / cudart along # automatically. No system-wide CUDA toolkit install needed. # - # macOS is intentionally excluded - Apple dropped NVIDIA support in 2019. + # Implementation notes: + # + # - Jimver/cuda-toolkit@v0.2.35 supports CUDA 13.2 as the default + # version. Older v0.2.21 only knew up to CUDA 13.1, which is why + # the previous workflow run failed with "Version not available". + # - Ubuntu's CUDA 12.x/13.x apt repos ship cuBLAS under the `libcublas-*` + # package names, not `cuda-cublas-*`; NVRTC keeps the `cuda-nvrtc-*` names.
+ # The Jimver action prefixes `cuda-` to every `sub-packages` entry, so + # cuBLAS goes in `non-cuda-sub-packages`; NVRTC stays in `sub-packages`. + # - cibuildwheel parses CIBW_ENVIRONMENT with bashlex. Values containing + # `;` or `<` (semicolons in SM lists, `<` in version specifiers) must + # be wrapped in double quotes inside the value, otherwise bash treats + # them as statement separators / redirection. CIBW_ENVIRONMENT_LINUX + # and CIBW_ENVIRONMENT_WINDOWS keep Linux-only paths (CUDA_PATH, + # PATH) out of the Windows runs, which use Windows-native NVIDIA + # toolkit paths populated by the Jimver action. # --------------------------------------------------------------------------- build_cuda_wheels: name: cuda-${{ matrix.cuda.pkg }}-${{ matrix.os }} @@ -123,61 +121,155 @@ jobs: cuda: - pkg: sfa-cu128 toolkit: '12.8.0' - archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120' + archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90' runtime_requires: >- nvidia-cublas-cu12>=12.8,<12.9 nvidia-cuda-runtime-cu12>=12.8,<12.9 nvidia-cuda-nvrtc-cu12>=12.8,<12.9 - pkg: sfa-cu132 toolkit: '13.2.0' - archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120' + # sm_70 (Volta) is gone from CUDA 13's nvcc - the prior dry + # run hit `nvcc fatal: Unsupported gpu architecture 'compute_70'` + # on both Linux and Windows. Volta users stay on sfa-cu128. + archs: 'sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120' runtime_requires: >- nvidia-cublas-cu13>=13.2,<13.3 nvidia-cuda-runtime-cu13>=13.2,<13.3 nvidia-cuda-nvrtc-cu13>=13.2,<13.3 - - pkg: sfa-cu133 - toolkit: '13.3.0' - archs: 'sm_70;sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120' - runtime_requires: >- - nvidia-cublas-cu13>=13.3,<13.4 - nvidia-cuda-runtime-cu13>=13.3,<13.4 - nvidia-cuda-nvrtc-cu13>=13.3,<13.4 + # sfa-cu133 (CUDA 13.3) is temporarily disabled: Jimver/cuda-toolkit + # v0.2.35 does not yet have a 13.3 entry in its version table, so + # the install step reports 'Version not available: 13.3.0'.
Re-add + # this matrix row when a newer Jimver release ships with 13.3 + # support. steps: - uses: actions/checkout@v5 - - name: Install CUDA toolkit (host) - # Jimver/cuda-toolkit fetches the official NVIDIA installers and - # places nvcc + cudart-dev + libcublas-dev on the runner. On Linux - # the host install is then bind-mounted into the manylinux - # container via CIBW_CONTAINER_ENGINE below. Bump the action - # version if a newer CUDA release does not resolve here. - uses: Jimver/cuda-toolkit@v0.2.21 + # Linux and Windows use different sub-package naming conventions: + # + # - Linux apt repos use a `cuda-` prefix for most CUDA components + # (cuda-nvcc-12-8, cuda-cudart-dev-12-8, cuda-nvrtc-12-8, ...). + # The exception is cuBLAS, which ships as a separate `lib*` + # package family (libcublas-12-8, libcublas-dev-12-8) and must + # therefore go through Jimver's `non-cuda-sub-packages` input. + # - Windows uses the NVIDIA Windows installer's sub-package names, + # which are unprefixed and use underscores for the dev variants + # (nvcc, cudart, cublas, cublas_dev, nvrtc, nvrtc_dev). There is + # no separate cudart-dev on Windows; the cudart sub-package + # already includes the headers. + + - name: Install CUDA toolkit (Linux) + if: runner.os == 'Linux' + uses: Jimver/cuda-toolkit@v0.2.35 with: cuda: ${{ matrix.cuda.toolkit }} method: 'network' - sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "nvrtc", "nvrtc_dev"]' + sub-packages: '["nvcc", "cudart", "cudart-dev", "nvrtc", "nvrtc-dev"]' + non-cuda-sub-packages: '["libcublas", "libcublas-dev"]' + + - name: Install CUDA toolkit (Windows) + if: runner.os == 'Windows' + uses: Jimver/cuda-toolkit@v0.2.35 + with: + cuda: ${{ matrix.cuda.toolkit }} + # Use the full local installer on Windows. 
The `network` method + # with cherry-picked sub-packages worked on CUDA 12.8 but on + # CUDA 13 nvcc fails with "Cannot open include file: + # 'crt/host_config.h'" because that header has been moved + # behind a sub-package whose exact name varies between minor + # CUDA versions (and isn't always called `cudart_dev` - + # adding that string broke even the 12.8 install). The local + # installer ships the whole toolkit unconditionally, so every + # CUDA major / minor we plug into the matrix is guaranteed to + # land a complete include tree. Trade-off: a one-time ~3 GB + # download per cell. + method: 'local' + + - name: Set up MSVC environment (Windows) + # cibuildwheel spawns the build in a subprocess that does NOT + # inherit a Developer Command Prompt, so cl.exe is not on PATH + # and nvcc fails with 'Cannot find compiler cl.exe in PATH'. + # This action sets the MSVC env vars (VCINSTALLDIR, PATH, ...) + # for subsequent steps on the runner so the cibuildwheel + # subprocess can find cl.exe. + if: runner.os == 'Windows' + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + - name: Stage cuBLAS under CUDA_HOME (Linux only) + # libcublas / libcublas-dev install headers to /usr/include and + # shared libs to /usr/lib/x86_64-linux-gnu/, while setup.py only + # looks under $CUDA_HOME/{include,lib64}/. NVRTC was pulled in + # with the `cuda-` prefix above and is already in CUDA_HOME, so + # only cuBLAS needs staging. 
+ if: runner.os == 'Linux' + shell: bash + run: | + set -euo pipefail + CUDA_HOME=/usr/local/cuda + sudo mkdir -p "$CUDA_HOME/include" "$CUDA_HOME/lib64" + for h in cublas.h cublas_v2.h cublas_api.h cublasLt.h; do + if [ -f "/usr/include/$h" ]; then + sudo cp "/usr/include/$h" "$CUDA_HOME/include/" + fi + done + sudo find /usr/lib/x86_64-linux-gnu/ -maxdepth 1 \ + -name 'libcublas*.so*' \ + -exec cp -a {} "$CUDA_HOME/lib64/" \; + echo "--- $CUDA_HOME/include (cublas only) ---" + ls -la "$CUDA_HOME/include/" | grep -E 'cublas|nvrtc' || true + echo "--- $CUDA_HOME/lib64 (cublas only) ---" + ls -la "$CUDA_HOME/lib64/" | grep -E 'cublas|nvrtc' || true - name: Build wheels uses: pypa/cibuildwheel@v2.21 env: CIBW_ARCHS: ${{ runner.os == 'Windows' && 'AMD64' || 'x86_64' }} - # Linux: cibuildwheel runs the build inside a manylinux Docker - # container; bind-mount the host CUDA install + nvcc symlink - # so the build inside the container can find them. + # Use a manylinux base new enough that scipy still publishes + # binary wheels for it. scipy 1.16+ (picked up automatically + # on cp311+) dropped manylinux2014 in favour of manylinux_2_28, + # so the older default image forced a from-source scipy build + # at the test phase (which needs OpenBLAS and meson, neither + # available in the container). + CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 + # Linux build runs inside a manylinux container; bind-mount + # the consolidated host CUDA tree (`/usr/local/cuda` symlink + # is resolved by Docker to its versioned target dir). CIBW_CONTAINER_ENGINE: >- docker; create_args: -v /usr/local/cuda:/usr/local/cuda:ro - # Build env passed through to setup.py. - CIBW_ENVIRONMENT: >- + # Linux-only env: bind-mounted CUDA path + PATH update. 
+ CIBW_ENVIRONMENT_LINUX: >- SFA_BUILD_CUDA=1 SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }} - SFA_CUDA_ARCH=${{ matrix.cuda.archs }} + SFA_CUDA_ARCH="${{ matrix.cuda.archs }}" SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}" CUDA_PATH=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH + # Windows: Jimver sets CUDA_PATH / CUDA_PATH_VXX_Y on the + # host already, and they propagate into the cibuildwheel + # subprocess automatically. + CIBW_ENVIRONMENT_WINDOWS: >- + SFA_BUILD_CUDA=1 + SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }} + SFA_CUDA_ARCH="${{ matrix.cuda.archs }}" + SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}" # GitHub runners have no NVIDIA GPU, so the CUDA-gated pytest # tests skip via tests/_skip_helpers.py (device_count() == 0). - # The verification script verifies the native extension loads - # (which exercises the bundled NVIDIA runtime libs). + # The verification script verifies that the native extension + # loads (which exercises the bundled NVIDIA runtime libs). + # Linux only: tell auditwheel NOT to bundle the NVIDIA runtime + # shared libs into the wheel. The wheel declares pinned PyPI + # dependencies on nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX / + # nvidia-cuda-nvrtc-cuXX through SFA_CUDA_RUNTIME_REQUIRES, so + # the runtime libs arrive via pip at install time. Covering + # both major sonames (.so.12 for cu128, .so.13 for cu132). + CIBW_REPAIR_WHEEL_COMMAND_LINUX: >- + auditwheel repair -w {dest_dir} {wheel} + --exclude libcudart.so.12 --exclude libcudart.so.13 + --exclude libcublas.so.12 --exclude libcublas.so.13 + --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 + --exclude libnvrtc.so.12 --exclude libnvrtc.so.13 + --exclude libnvrtc-builtins.so.12 --exclude libnvrtc-builtins.so.13 CIBW_TEST_REQUIRES: "pytest" CIBW_TEST_COMMAND: >- python {package}/tests/verification.py @@ -190,15 +282,15 @@ jobs: # --------------------------------------------------------------------------- # Optional: publish to PyPI on tag pushes. 
Requires the OIDC trusted- - # publisher relationship to be configured at https://pypi.org for both - # the `sfa` and `sfa-cu130` projects. Disabled by default; change the - # `if:` guard to enable. + # publisher relationship to be configured at https://pypi.org for each + # of `sfa`, `sfa-cu128`, `sfa-cu132` (plus `sfa-cu133` once re-enabled). + # Runs only for `v*` tag pushes; set the `if:` guard to `false` to disable. # --------------------------------------------------------------------------- publish: name: publish-to-pypi - needs: [build_cpu_wheels, build_sdist, build_cuda_wheels] + needs: [build_cpu_wheel, build_sdist, build_cuda_wheels] runs-on: ubuntu-latest - if: false # set to `startsWith(github.ref, 'refs/tags/v')` to enable + if: startsWith(github.ref, 'refs/tags/v') permissions: id-token: write steps: diff --git a/INSTALL.md b/INSTALL.md index 3221340..0c0d9b5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -10,7 +10,6 @@ NVIDIA driver new enough for that CUDA version. | `sfa` | none | - | Linux, macOS, Windows | | `sfa-cu128` | 12.8.x | 570 (Linux / Win) | Linux, Windows | | `sfa-cu132` | 13.2.x | 580 | Linux, Windows | -| `sfa-cu133` | 13.3.x | 580 | Linux, Windows | All CUDA wheels share the same AOT-compiled SASS matrix (SM 7.0 through SM 12.0: Volta, Turing, Ampere, Ada, Hopper, Blackwell), plus @@ -37,14 +36,13 @@ that is the maximum CUDA version your driver supports. | Package | CUDA bundled | Minimum NVIDIA driver | When to pick | |-------------|--------------|------------------------|-----------------------------------------------------------| -| `sfa-cu133` | 13.3.x | 580 | Newest hardware / drivers; default for fresh installs. | -| `sfa-cu132` | 13.2.x | 580 | Matches the `sfa-cu132` conda env used for development. | +| `sfa-cu132` | 13.2.x | 580 | Newest CUDA stack; matches `environment-cuda.yml`. | | `sfa-cu128` | 12.8.x | 570 | Older driver (CUDA 12 line); broadest backwards compat.
| Example (install the newest one): ```bash -pip install sfa-cu133 +pip install sfa-cu132 ``` Requires Python 3.10+. macOS is not supported because Apple ended @@ -81,15 +79,15 @@ the host compiler, and `conda` will not install it for you. git clone https://github.com/dwgoon/sfa.git && cd sfa conda env create -f environment-cuda.yml -conda activate sfa-cu132 +conda activate sfa pip install -e . # builds the CUDA extension via the env's nvcc # CPU-only variant (skip CUDA even if nvcc is on PATH): SFA_BUILD_CUDA=0 pip install -e . ``` -This is also how the project maintainers build on Windows: the -`sfa-cu132` env provides `nvcc` and cuBLAS, while system MSVC handles +This is also how the project maintainers build on Windows: the `sfa` +env provides `nvcc` and cuBLAS, while system MSVC handles `bindings.cpp`. The resulting extension is e.g. `sfa/_cuda/_native.cp312-win_amd64.pyd`. @@ -98,8 +96,8 @@ is what the maintainers test against. The same workflow works for any CUDA major / minor that has a `cuda-toolkit` build on the `nvidia` channel: edit the two `cuda-version` / `cuda-toolkit` pins in lockstep (see [What `environment-cuda.yml` provides](#what-environment-cudayml-provides) -below) and rename the env on the first line of the file. CUDA 12.8 and -13.3 environments have been tested in CI. +below) and rename the env on the first line of the file. CUDA 12.8 +and 13.2 environments have been tested in CI. ### Option B: conda-free build (system CUDA + system C++ compiler) @@ -180,7 +178,7 @@ and falls through to a CPU-only build (printing ### What `environment-cuda.yml` provides The shipped conda environment file creates a self-contained build -environment named `sfa-cu132` that does **not** require any +environment named `sfa` that does **not** require any system-wide CUDA install. 
Everything the build needs - the CUDA compiler, the CUDA runtime, cuBLAS headers and import libs, plus the Python build and runtime dependencies - is pulled in from the @@ -199,14 +197,14 @@ Concretely, the file pins: The `cuda-toolkit` meta-package pulls in `nvcc`, `cudart`, `nvrtc`, `cccl`, `cupti`, the profiler API, and the rest of the CUDA dev -toolchain. After `conda activate sfa-cu132`, `nvcc` is on `PATH` and +toolchain. After `conda activate sfa`, `nvcc` is on `PATH` and `setup.py`'s CUDA-extension build picks it up automatically. Notes for adjusting the file: - To target a different CUDA major version, change the two `nvidia::` pins (`cuda-version` and `cuda-toolkit`) in lockstep. The env name - on the first line (`sfa-cu132`) is just a label; rename it freely. + on the first line (`sfa`) is just a label; rename it freely. - A host C++ compiler is still required (MSVC on Windows, GCC on Linux). The toolchain itself is not bundled by `cuda-toolkit`; conda will not install it for you. @@ -220,7 +218,7 @@ Notes for adjusting the file: |----------------------|------------------------------------------------------------------------| | `SFA_BUILD_CUDA` | `0` to force a pure-Python install. Default: build if `nvcc` is found. | | `SFA_CUDA_ARCH` | Semicolon-separated SM list, e.g. `sm_89` (dev) or `sm_70;sm_80;sm_89`. Default: the full wheel-wide AOT matrix. | -| `SFA_PACKAGE_NAME` | Override the PyPI name (used by CI to produce e.g. `sfa-cu132` or `sfa-cu133` from the same source tree). | +| `SFA_PACKAGE_NAME` | Override the PyPI name (used by CI to produce e.g. `sfa-cu128` or `sfa-cu132` from the same source tree). 
| ## Verify the install diff --git a/README.md b/README.md index 9d2a990..ab91746 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,6 @@ set of CUDA optimized `sfa-cuXYZ` versions: | `sfa` | none | - | Linux, macOS, Windows | | `sfa-cu128` | 12.8.x | 570 (Linux / Win) | Linux, Windows | | `sfa-cu132` | 13.2.x | 580 | Linux, Windows | -| `sfa-cu133` | 13.3.x | 580 | Linux, Windows | Each CUDA wheel ships ahead-of-time compiled SASS for NVIDIA SM 7.0 through SM 12.0 (Volta, Turing, Ampere, Ada, Hopper, Blackwell) plus a @@ -67,7 +66,7 @@ supports. Example (install the newest one): ```bash -pip install sfa-cu133 +pip install sfa-cu132 ``` > [!IMPORTANT] @@ -87,7 +86,7 @@ self-contained env): ```bash git clone https://github.com/dwgoon/sfa.git && cd sfa conda env create -f environment-cuda.yml -conda activate sfa-cu132 +conda activate sfa pip install -e . ``` @@ -280,7 +279,7 @@ S_gpu = compute_influence( ) ``` -## Benchmarks +## Performance benchmarks ### Hardware setup @@ -329,24 +328,24 @@ S_gpu = compute_influence( ### Small networks -| # Nodes | # Edges | CPU iter (FP64) ms | CPU LAPACK (FP64) ms | CUDA (FP64) ms | -|---------|----------|--------------------|----------------------|-----------------------| -| 32 | 992 | 0.1 ± 0.0 | 0.2 ± 0.0 (0.4x) | 1.3 ± 0.2 (0.06x) | -| 64 | ~4.0 K | 0.2 ± 0.0 | 0.2 ± 0.0 (0.8x) | 1.4 ± 0.1 (0.13x) | -| 128 | ~16.3 K | 2.5 ± 0.0 | 0.4 ± 0.0 (**7.2x**) | 1.9 ± 0.1 (1.3x) | -| 256 | ~65.3 K | 6.9 ± 0.2 | 2.4 ± 0.1 (**2.8x**) | 3.1 ± 0.8 (2.2x) | -| 512 | ~262 K | 38.8 ± 1.7 | 190 ± 46 (0.2x) | 6.4 ± 0.2 (**6.0x**) | -| 1024 | ~1.05 M | 180 ± 8 | 486 ± 89 (0.4x) | 47 ± 10 (**3.8x**) | -| 2048 | ~4.19 M | 2140 ± 320 | 3880 ± 2990 (0.6x) | 245 ± 2 (**8.7x**) | -| 4096 | ~16.8 M | 12520 ± 2380 | 5690 ± 1390 (2.2x) | 4320 ± 580 (**2.9x**) | +| # Nodes | # Edges | CPU iter (FP64) | CPU LAPACK (FP64) | CUDA (FP64) | +|---------|----------|--------------------|---------------------------|-----------------------------| +| 32 | 992 | 0.1 
± 0.0 ms | 0.2 ± 0.0 ms (0.4x) | 1.3 ± 0.2 ms (0.06x) | +| 64 | ~4.0 K | 0.2 ± 0.0 ms | 0.2 ± 0.0 ms (0.8x) | 1.4 ± 0.1 ms (0.13x) | +| 128 | ~16.3 K | 2.5 ± 0.0 ms | 0.4 ± 0.0 ms (**7.2x**) | 1.9 ± 0.1 ms (1.3x) | +| 256 | ~65.3 K | 6.9 ± 0.2 ms | 2.4 ± 0.1 ms (**2.8x**) | 3.1 ± 0.8 ms (2.2x) | +| 512 | ~262 K | 38.8 ± 1.7 ms | 190 ± 46 ms (0.2x) | 6.4 ± 0.2 ms (**6.0x**) | +| 1024 | ~1.05 M | 180 ± 8 ms | 486 ± 89 ms (0.4x) | 47 ± 10 ms (**3.8x**) | +| 2048 | ~4.19 M | 2140 ± 320 ms | 3880 ± 2990 ms (0.6x) | 245 ± 2 ms (**8.7x**) | +| 4096 | ~16.8 M | 12520 ± 2380 ms | 5690 ± 1390 ms (2.2x) | 4320 ± 580 ms (**2.9x**) | ### Large networks -| # Nodes | # Edges | CPU LAPACK (FP64) s | CUDA TF32 (FP32) s | CUDA FP32 (no TF32) s | CUDA FP16 s | -|---------|---------|---------------------|----------------------|-----------------------|--------------------------| -| 5000 | ~25 M | 5.10 ± 2.24 | 0.366 ± 0.027 (14x) | 0.356 ± 0.034 (14x) | 0.349 ± 0.037 (**15x**) | -| 10000 | ~100 M | 17.60 ± 0.57 | 1.55 ± 0.05 (11x) | 4.07 ± 0.06 (4.3x) | 1.13 ± 0.16 (**16x**) | -| 20000 | ~400 M | 70.88 ± 0.79 | 9.13 ± 0.10 (7.8x) | 16.30 ± 0.28 (4.3x) | 4.28 ± 0.02 (**17x**) | +| # Nodes | # Edges | CPU LAPACK (FP64) | CUDA TF32 (FP32) | CUDA FP32 (no TF32) | CUDA FP16 | +|---------|---------|-------------------|--------------------------|----------------------------|----------------------------| +| 5000 | ~25 M | 5.10 ± 2.24 s | 0.366 ± 0.027 s (14x) | 0.356 ± 0.034 s (14x) | 0.349 ± 0.037 s (**15x**) | +| 10000 | ~100 M | 17.60 ± 0.57 s | 1.55 ± 0.05 s (11x) | 4.07 ± 0.06 s (4.3x) | 1.13 ± 0.16 s (**16x**) | +| 20000 | ~400 M | 70.88 ± 0.79 s | 9.13 ± 0.10 s (7.8x) | 16.30 ± 0.28 s (4.3x) | 4.28 ± 0.02 s (**17x**) | - CPU paths show noticeably higher variance than GPU paths (CPU LAPACK FP64 stddev reaches ~25-77% of the mean at small `N`), diff --git a/doc/install.md b/doc/install.md index ad3f27c..4e95166 100644 --- a/doc/install.md +++ b/doc/install.md @@ -9,8 +9,7 @@ one** into a 
given environment. |---------------|--------|---------------------|-----------------------------| | `sfa` | none | - | Linux, macOS, Windows | | `sfa-cu128` | 12.8.x | 570 (Linux / Win) | Linux, Windows | -| `sfa-cu132` | 13.2.x | 580 | Linux, Windows | -| `sfa-cu133` | 13.3.x | 580 | Linux, Windows (newest) | +| `sfa-cu132` | 13.2.x | 580 | Linux, Windows (newest) | ## Requirements @@ -33,10 +32,9 @@ Run `nvidia-smi` and look at the "CUDA Version" column. That is the number: ```text -nvidia-smi -> "CUDA Version: 13.3" -> any of sfa-cu128 / cu132 / cu133 -nvidia-smi -> "CUDA Version: 13.0" -> sfa-cu128 -nvidia-smi -> "CUDA Version: 12.8" -> sfa-cu128 -nvidia-smi -> "CUDA Version: 12.6" -> upgrade your driver or use `sfa` (CPU) +nvidia-smi -> "CUDA Version: 13.2" or higher -> sfa-cu132 or sfa-cu128 +nvidia-smi -> "CUDA Version: 12.8" - 13.1 -> sfa-cu128 +nvidia-smi -> "CUDA Version: 12.6" -> upgrade your driver or use `sfa` (CPU) ``` When in doubt, start with `sfa-cu128` for the widest driver coverage @@ -102,7 +100,7 @@ and the runtime Python deps: ```bash conda env create -f environment-cuda.yml -conda activate sfa-cu132 +conda activate sfa pip install -e . ``` diff --git a/environment-cuda.yml b/environment-cuda.yml index eec67e6..4525cb0 100644 --- a/environment-cuda.yml +++ b/environment-cuda.yml @@ -1,4 +1,4 @@ -name: sfa-cu132 +name: sfa channels: - nvidia - conda-forge diff --git a/pyproject.toml b/pyproject.toml index 408dfb5..395ca2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta" # wheels under the SFA_PACKAGE_NAME env var. 
[project] name = "sfa" -version = "0.2.0.dev0" +version = "0.2.0" description = "Signal flow analysis" readme = "README.md" license = { text = "MIT" } diff --git a/sfa/__init__.py b/sfa/__init__.py index b79d3de..1cf5359 100644 --- a/sfa/__init__.py +++ b/sfa/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.0.dev0" +__version__ = "0.2.0" from .base import * from .containers import AlgorithmSet