From 8e5aca1206a415d328d6e2ad9e50581f37d0b026 Mon Sep 17 00:00:00 2001 From: Daewon Lee Date: Sat, 6 Jun 2026 21:43:59 +0900 Subject: [PATCH 1/3] Distribute 0.2.0 via GitHub Releases while PyPI publish is unavailable The maintainer's PyPI account is currently unavailable, so the publish-to-pypi job in wheels.yml is moved back behind `if: false` and a sibling publish-github-release job is added. The new job runs on the same `v*` tag push, downloads every build artifact (CPU universal wheel, sdist, plus all per-Python / per-OS / per-CUDA wheels), and uses softprops/action-gh-release@v2 to attach them to the corresponding GitHub Release. End users install with pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/ The wheels.yml matrix itself is unchanged from the 6/6-green dry run at 27057416779; only the tail-end publish path changes. When the PyPI account is restored, re-enabling that channel is a one-line edit to flip `if: false` back to the tag-ref check; the GitHub Releases channel can stay on permanently. README and INSTALL.md gain a "Install from GitHub Releases" section documenting the wheel-filename pattern `sfa_cuXYZ-VER-cpPP-cpPP-PLAT.whl` so users can construct the right URL for their Python version / OS / CUDA combination, plus example commands for CPU, CUDA 13.2 Linux, CUDA 12.8 Windows, sdist, and the git-tag source fallback. --- .github/workflows/wheels.yml | 42 ++++++++++++++++++++++--- INSTALL.md | 59 ++++++++++++++++++++++++++++++++++++ README.md | 19 ++++++++++++ 3 files changed, 116 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index efe4eb2..6207882 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -280,17 +280,51 @@ jobs: name: cuda-wheels-${{ matrix.cuda.pkg }}-${{ matrix.os }} path: ./wheelhouse/*.whl + # --------------------------------------------------------------------------- + # Publish all build artifacts (CPU universal wheel, sdist, and every + # CUDA wheel produced by the matrix) to the GitHub Release that the + # `v*` tag triggers. End users install from these URLs directly: + # + # pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/ + # + # This is the primary distribution channel while the PyPI publish job + # below is disabled. + # --------------------------------------------------------------------------- + publish_github_release: + name: publish-github-release + needs: [build_cpu_wheel, build_sdist, build_cuda_wheels] + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + permissions: + contents: write + steps: + - uses: actions/download-artifact@v4 + with: + path: dist + merge-multiple: true + - name: List collected artifacts + run: ls -la dist/ + - uses: softprops/action-gh-release@v2 + with: + files: dist/* + generate_release_notes: true + prerelease: false + fail_on_unmatched_files: true + # --------------------------------------------------------------------------- # Optional: publish to PyPI on tag pushes. Requires the OIDC trusted- # publisher relationship to be configured at https://pypi.org for each - # of `sfa`, `sfa-cu128`, `sfa-cu132`, `sfa-cu133`. Disabled by default; - # change the `if:` guard to enable. + # of `sfa`, `sfa-cu128`, `sfa-cu132`. Currently disabled because the + # maintainer's PyPI account is unavailable; re-enable by flipping the + # `if:` guard back to `startsWith(github.ref, 'refs/tags/v')` once + # trusted publishers are in place. Existing GitHub Release artifacts + # are unaffected by this gate. # --------------------------------------------------------------------------- - publish: + publish_pypi: name: publish-to-pypi needs: [build_cpu_wheel, build_sdist, build_cuda_wheels] runs-on: ubuntu-latest - if: startsWith(github.ref, 'refs/tags/v') + if: false permissions: id-token: write steps: diff --git a/INSTALL.md b/INSTALL.md index 0c0d9b5..3d97957 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -20,6 +20,15 @@ runtime they link against. This file is a quick reference. See [doc/install.md](doc/install.md) for the full guide. +> [!NOTE] +> For the **0.2.0** line, the primary distribution channel is the +> [GitHub Releases page](https://github.com/dwgoon/sfa/releases), +> not PyPI. The `pip install sfa-cuXYZ` shortcuts shown below assume +> the wheels are on PyPI; until they land there, use the explicit +> release URLs in the "Install from GitHub Releases" section. The +> 0.1.0 `sfa` CPU package remains installable from PyPI through the +> `pip install sfa` snippet. + ## CPU (any OS) ```bash @@ -53,6 +62,56 @@ NVIDIA driver development in 2019. > Install **only one** `sfa-cuXYZ` per environment; mixing them causes > import conflicts. +## Install from GitHub Releases + +When a `v*` tag is pushed, `wheels.yml` attaches every artifact (one +universal CPU wheel, the sdist, and a CUDA wheel for every supported +Python + OS + CUDA combination) to the corresponding GitHub Release. +`pip` can install those wheels directly: + +```bash +# CPU (universal; any OS, any Python from 3.10 to 3.13). +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0-py3-none-any.whl +``` + +For CUDA wheels, the filename encodes Python version, OS, and CUDA +major.minor. The pattern is + +``` +sfa_cuXYZ-VER-cpPP-cpPP-PLAT.whl +``` + +with + +| Token | Meaning | Possible values | +|--------|------------------------------------------------------|--------------------------------------------------| +| `XYZ` | CUDA major.minor (no dot) | `128` (CUDA 12.8) or `132` (CUDA 13.2) | +| `VER` | sfa version (matches the tag without the `v` prefix) | e.g. `0.2.0` | +| `PP` | CPython major+minor (no dot) | `310`, `311`, `312`, `313` | +| `PLAT` | Wheel platform tag | `manylinux_2_28_x86_64` (Linux) / `win_amd64` (Windows) | + +Examples: + +```bash +# Linux + Python 3.12 + CUDA 13.2 +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu132-0.2.0-cp312-cp312-manylinux_2_28_x86_64.whl + +# Windows + Python 3.10 + CUDA 12.8 +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu128-0.2.0-cp310-cp310-win_amd64.whl +``` + +If none of the prebuilt wheels match your environment, install the +sdist or a tagged git source instead: + +```bash +# sdist (compiles a pure-Python install; no CUDA extension). +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0.tar.gz + +# git source at the v0.2.0 tag (set SFA_BUILD_CUDA=1 + have nvcc to +# build the CUDA extension; otherwise a pure-Python install is made). +pip install git+https://github.com/dwgoon/sfa.git@v0.2.0 +``` + ## Optional extras Matplotlib-based plotting helpers in `sfa.plot`: diff --git a/README.md b/README.md index ab91746..ff5903c 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,25 @@ pip install sfa-cu132 > `sfa-cuXYZ` share the `sfa` Python namespace and will conflict if > stacked. +### Install from GitHub Releases (current 0.2.0 primary channel) + +The 0.2.0 line is distributed through the project's +[GitHub Releases page](https://github.com/dwgoon/sfa/releases) until +the new CUDA wheels land on PyPI. Each `v*` tag attaches one universal +CPU wheel, the sdist, and a per-Python / per-OS / per-CUDA wheel for +each `sfa-cuXYZ` variant. Examples: + +```bash +# CPU (universal, any OS / Python 3.10 - 3.13) +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0-py3-none-any.whl + +# CUDA 13.2, Linux, Python 3.12 +pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu132-0.2.0-cp312-cp312-manylinux_2_28_x86_64.whl +``` + +See [INSTALL.md](INSTALL.md#install-from-github-releases) for the +full wheel-filename pattern and Windows / older-Python URLs. + ### Build from source For a new CUDA major version, a custom GPU architecture, or From ca9227aea30b369f6587e74f314890555c23fe37 Mon Sep 17 00:00:00 2001 From: Daewon Lee Date: Sat, 6 Jun 2026 22:00:17 +0900 Subject: [PATCH 2/3] Fix wheel name: CUDA cells produced 'sfa' instead of 'sfa-cuXYZ' Downloading an artifact from the green dry run (27057416779) revealed every CUDA cell's wheels were named `sfa-0.2.0.dev0-...whl` instead of `sfa_cu128-...` / `sfa_cu132-...`. The wheel metadata Name was `sfa` for every cell, so a release in that state would have collapsed the three planned distributions into a single namespace collision. Root cause: setup.py read SFA_PACKAGE_NAME from the env and passed it as `name=` to setup(), but PEP 621 makes pyproject.toml's `[project] name` table authoritative whenever it is present, and our [project] had a static `name = "sfa"`. setup.py's name argument was silently ignored, so SFA_PACKAGE_NAME never reached the wheel. PEP 621 also forbids declaring `name` as dynamic, ruling out the "setup.py supplies it" route. The fix: - Keep `[project] name = "sfa"` static (CPU + sdist default). For each CUDA cell, the new "Rename wheel project to ..." step sed-rewrites that line to the matrix wheel name before cibuildwheel runs. The same source tree therefore yields `sfa`, `sfa-cu128`, and `sfa-cu132` wheels by simply changing one line in pyproject.toml. - Move `dependencies` to `dynamic` in pyproject.toml. With no entry under `[tool.setuptools.dynamic]`, setuptools reads the resolved list from setup.py's `install_requires`, which keeps the SFA_CUDA_RUNTIME_REQUIRES env-var path alive so each CUDA wheel pins `nvidia-cublas-cu1X`, `nvidia-cuda-runtime-cu1X`, etc. setup.py is trimmed to just what pyproject.toml does not cover (install_requires composition, the CUDA build extension, and the build_ext command class). Verified locally: sed name="sfa-cu132" + SFA_CUDA_RUNTIME_REQUIRES=... build -> file: sfa_cu132-0.2.0-py3-none-any.whl METADATA Name: sfa-cu132 METADATA Requires-Dist includes nvidia-cublas-cu13<13.3,>=13.2 The CPU and sdist jobs lose the now-redundant `SFA_PACKAGE_NAME: "sfa"` env entries; the CUDA matrix loses `SFA_PACKAGE_NAME=...` from both CIBW_ENVIRONMENT_LINUX and CIBW_ENVIRONMENT_WINDOWS because the sed step has already changed the in-tree name by the time cibuildwheel starts. --- .github/workflows/wheels.yml | 22 ++++++++++++++++++---- pyproject.toml | 27 +++++++++++++++------------ setup.py | 29 ++++++----------------------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6207882..f1a3d1d 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -49,7 +49,6 @@ jobs: - name: Build the universal CPU wheel env: SFA_BUILD_CUDA: "0" - SFA_PACKAGE_NAME: "sfa" run: | python -m pip install --upgrade pip build python -m build --wheel @@ -72,7 +71,6 @@ jobs: - name: Build sdist env: SFA_BUILD_CUDA: "0" - SFA_PACKAGE_NAME: "sfa" run: | python -m pip install --upgrade pip build python -m build --sdist @@ -221,6 +219,22 @@ jobs: echo "--- $CUDA_HOME/lib64 (cublas only) ---" ls -la "$CUDA_HOME/lib64/" | grep -E 'cublas|nvrtc' || true + - name: Rename wheel project to ${{ matrix.cuda.pkg }} + # PEP 621 forbids `name` from being declared dynamic in + # pyproject.toml, so the same source tree's static + # `name = "sfa"` must be sed-replaced in place per CUDA cell + # to produce the per-CUDA wheel names (sfa-cu128 / sfa-cu132). + # The CPU and sdist jobs leave this line untouched and ship + # under the default `sfa` name. Runs on both Linux and + # Windows runners via Git Bash; cibuildwheel afterwards reads + # the modified pyproject.toml inside its build sandbox. + shell: bash + run: | + set -euo pipefail + sed -i 's/^name = "sfa"$/name = "${{ matrix.cuda.pkg }}"/' pyproject.toml + echo "--- name line after rename ---" + grep '^name' pyproject.toml + - name: Build wheels uses: pypa/cibuildwheel@v2.21 env: @@ -238,9 +252,10 @@ jobs: CIBW_CONTAINER_ENGINE: >- docker; create_args: -v /usr/local/cuda:/usr/local/cuda:ro # Linux-only env: bind-mounted CUDA path + PATH update. + # The wheel project name is set by the sed step above + # (pyproject.toml's `[project] name`), not by an env var. CIBW_ENVIRONMENT_LINUX: >- SFA_BUILD_CUDA=1 - SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }} SFA_CUDA_ARCH="${{ matrix.cuda.archs }}" SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}" CUDA_PATH=/usr/local/cuda @@ -250,7 +265,6 @@ jobs: # subprocess automatically. CIBW_ENVIRONMENT_WINDOWS: >- SFA_BUILD_CUDA=1 - SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }} SFA_CUDA_ARCH="${{ matrix.cuda.archs }}" SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}" # GitHub runners have no NVIDIA GPU, so the CUDA-gated pytest diff --git a/pyproject.toml b/pyproject.toml index 395ca2c..89b9ac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,17 @@ requires = ["setuptools>=68", "wheel", "pybind11>=2.13", "numpy>=2.0"] build-backend = "setuptools.build_meta" -# The project `name` is set dynamically from setup.py so the same source -# tree can produce both "sfa" (CPU-only) and "sfa-cu130" (CUDA-enabled) -# wheels under the SFA_PACKAGE_NAME env var. +# PEP 621 requires `name` to be a static string here (it explicitly +# forbids name from being dynamic). The CPU and sdist builds use this +# default `sfa`; CUDA build cells in `wheels.yml` overwrite this line +# in-place with `sed` so the same source tree produces sfa-cu128 / +# sfa-cu132 wheels under their own PyPI names. +# +# `dependencies` is left dynamic so setup.py can append the per-CUDA +# nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX runtime packages via +# the SFA_CUDA_RUNTIME_REQUIRES env var. With no entry under +# [tool.setuptools.dynamic], setuptools reads the resolved value from +# setup.py's `install_requires=` call. [project] name = "sfa" version = "0.2.0" @@ -13,13 +21,7 @@ readme = "README.md" license = { text = "MIT" } authors = [{ name = "Daewon Lee", email = "daewon4you@gmail.com" }] requires-python = ">=3.10" -dependencies = [ - "numpy", - "scipy", - "pandas", - "networkx", - "threadpoolctl", -] +dynamic = ["dependencies"] [project.optional-dependencies] cuda = [] # toolchain provided by conda env (environment-cuda.yml) @@ -40,8 +42,9 @@ include = ["sfa*"] # Builds CPU-only wheels by default. To produce CUDA wheels, set # environment variables in the GitHub Actions workflow: # SFA_BUILD_CUDA=1 -# SFA_PACKAGE_NAME=sfa-cu130 -# SFA_CUDA_ARCH=sm_70;sm_75;sm_80;sm_86;sm_89;sm_90 (AOT fat binary) +# SFA_CUDA_ARCH=sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120 +# and rewrite the static `[project] name` line above to the target +# wheel name (sfa-cu128 / sfa-cu132) via a sed step before running. # --------------------------------------------------------------------------- [tool.cibuildwheel] build = "cp310-* cp311-* cp312-* cp313-*" diff --git a/setup.py b/setup.py index 5b6758d..d179953 100644 --- a/setup.py +++ b/setup.py @@ -253,14 +253,6 @@ def _cuda_extension() -> Optional[Extension]: ext_modules.append(cuda_ext) -# PyPI package name. Default is the cross-platform CPU-only package "sfa". -# CUDA-enabled wheels are published under per-CUDA-version names -# (sfa-cu128, sfa-cu132, sfa-cu133) so users can pick the variant that -# matches their NVIDIA driver. Override at build time: -# SFA_PACKAGE_NAME=sfa-cu132 SFA_BUILD_CUDA=1 python -m build --wheel -_pkg_name = os.environ.get("SFA_PACKAGE_NAME", "sfa") - - # Per-build extra install_requires fed in by the CI matrix. Used to pin # the NVIDIA runtime PyPI packages (nvidia-cublas-cuXX, # nvidia-cuda-runtime-cuXX) that match the CUDA major version we are @@ -268,6 +260,12 @@ def _cuda_extension() -> Optional[Extension]: # requirement specifiers, e.g. # SFA_CUDA_RUNTIME_REQUIRES="nvidia-cublas-cu13>=13.2,<13.3 \ # nvidia-cuda-runtime-cu13>=13.2,<13.3" +# +# The wheel's static metadata (name, version, description, author, etc.) +# lives in pyproject.toml's [project] table; setup.py only supplies what +# isn't there: the CUDA build extension, the dynamic install_requires +# composed from the base list plus _extra_runtime, and the +# build-extension command class. _extra_runtime = [ spec for spec in os.environ.get("SFA_CUDA_RUNTIME_REQUIRES", "").split() if spec.strip() @@ -275,24 +273,9 @@ def _cuda_extension() -> Optional[Extension]: setup( - name=_pkg_name, - version="0.2.0.dev0", - description="Signal flow analysis", - url="http://github.com/dwgoon/sfa", - author="Daewon Lee", - author_email="daewon4you@gmail.com", - license="MIT", - packages=find_packages(), - package_data={"": ["*.tsv", "*.sif", "*.json"]}, - python_requires=">=3.10", install_requires=(["numpy", "scipy", "pandas", "networkx", "threadpoolctl"] + _extra_runtime), - extras_require={ - "plot": ["matplotlib", "seaborn"], - "cuda": [], # toolchain provided by conda env (environment-cuda.yml) - "test": ["pytest"], - }, ext_modules=ext_modules, cmdclass={"build_ext": CudaBuildExt} if ext_modules else {}, zip_safe=False, From 4a42c2e9b2e2beb1b242bdfeb4372d48504829a6 Mon Sep 17 00:00:00 2001 From: Daewon Lee Date: Thu, 11 Jun 2026 10:40:17 +0900 Subject: [PATCH 3/3] Fix propagate_iterative returning a stale iterate on non-convergence - algorithms/sp: the ping-pong swap left the final swap in place when lim_iter was exhausted, so the returned x was the second-to-last iterate and disagreed with trajectory[-1]. Track convergence and return the correct buffer; this also stops lim_iter==0 from returning an uninitialized array. Returned x now always equals trajectory[-1]. - _blas_ctypes.solve: snapshot and restore the backend thread count so a one-off num_threads no longer leaks process-wide. - control/influence: raise ValueError on an unknown device; unify the convergence comparison to '<= tol' (dense + sparse); drop the dead np.inf self-influence assignment that was always overwritten. - tests: cover the non-converged and lim_iter==0 propagate_iterative paths. --- sfa/_blas_ctypes.py | 68 ++++++++++++++++++++------ sfa/algorithms/sp.py | 12 ++++- sfa/control/influence.py | 11 +++-- tests/test_cpu_dtype_and_trajectory.py | 46 +++++++++++++++++ 4 files changed, 115 insertions(+), 22 deletions(-) diff --git a/sfa/_blas_ctypes.py b/sfa/_blas_ctypes.py index 4effbe8..c475a9a 100644 --- a/sfa/_blas_ctypes.py +++ b/sfa/_blas_ctypes.py @@ -79,8 +79,34 @@ def __init__(self, name: str, lib: ctypes.CDLL): ctypes.POINTER(ctypes.c_float), ctypes.c_int, ] - # Thread setters are backend-specific. + # Thread setters/getters are backend-specific. self.set_num_threads = self._resolve_thread_setter() + self.get_num_threads = self._resolve_thread_getter() + + def _resolve_thread_getter(self): + """Return a callable ``f() -> int`` reporting the backend thread count. + + Used to snapshot and restore the process-wide thread count around a + single ``solve`` so a one-off ``num_threads`` does not leak into the + rest of the process. Returns ``None`` when the backend exposes no + getter (then the count is simply left as set). + """ + lib = self.lib + # MKL: int MKL_Get_Max_Threads(void) / int mkl_get_max_threads(void) + for sym in ('MKL_Get_Max_Threads', 'mkl_get_max_threads'): + fn = getattr(lib, sym, None) + if fn is not None: + fn.argtypes = [] + fn.restype = ctypes.c_int + return fn + # OpenBLAS: int openblas_get_num_threads(void) / goto_get_num_threads + for sym in ('openblas_get_num_threads', 'goto_get_num_threads'): + fn = getattr(lib, sym, None) + if fn is not None: + fn.argtypes = [] + fn.restype = ctypes.c_int + return fn + return None def _resolve_thread_setter(self): """Return a callable ``f(n: int) -> None`` for this backend.""" @@ -322,7 +348,15 @@ def solve(A: np.ndarray, B: np.ndarray, *, else: be = _load(canon) + # Snapshot the backend's current thread count so a one-off num_threads + # does not leak into the rest of the process; restore it after the solve. + prev_threads = None if num_threads is not None: + if be.get_num_threads is not None: + try: + prev_threads = int(be.get_num_threads()) + except Exception: + prev_threads = None be.set_num_threads(int(num_threads)) # Make sure layout is C-contiguous (LAPACK_ROW_MAJOR friendly). @@ -332,20 +366,24 @@ def solve(A: np.ndarray, B: np.ndarray, *, NRHS = B.shape[1] ipiv = np.empty(N, dtype=np.int32) - if A.dtype == np.float64: - info = be.dgesv( - _LAPACK_ROW_MAJOR, N, NRHS, - A.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), N, - ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)), - B.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), NRHS, - ) - else: # float32 - info = be.sgesv( - _LAPACK_ROW_MAJOR, N, NRHS, - A.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), N, - ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)), - B.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), NRHS, - ) + try: + if A.dtype == np.float64: + info = be.dgesv( + _LAPACK_ROW_MAJOR, N, NRHS, + A.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), N, + ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)), + B.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), NRHS, + ) + else: # float32 + info = be.sgesv( + _LAPACK_ROW_MAJOR, N, NRHS, + A.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), N, + ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)), + B.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), NRHS, + ) + finally: + if prev_threads is not None: + be.set_num_threads(prev_threads) if info != 0: raise np.linalg.LinAlgError( diff --git a/sfa/algorithms/sp.py b/sfa/algorithms/sp.py index 1718d33..7129875 100644 --- a/sfa/algorithms/sp.py +++ b/sfa/algorithms/sp.py @@ -125,6 +125,7 @@ def propagate_iterative(self, trj[0] = x_t1 num_iter = 0 + converged = False with thread_limit(num_threads): for i in range(lim_iter): # x_t2 = a*W @ x_t1 + (1-a)*b (no temporary allocations) @@ -135,15 +136,22 @@ def propagate_iterative(self, if get_trj: trj[num_iter] = x_t2 if np.linalg.norm(x_t2 - x_t1) <= tol: + converged = True break # Swap views instead of copying: x_t1 becomes the freshest # state, and the old x_t1 buffer is recycled as next x_t2. x_t1, x_t2 = x_t2, x_t1 + # When we converge we break *before* the swap, so x_t2 holds the + # latest iterate. When we exhaust lim_iter the final swap left the + # latest iterate in x_t1. Pick the right buffer so the returned x is + # always the last computed state (and matches trj[-1]). This also + # avoids returning the uninitialized x_t2 buffer when lim_iter == 0. + x_fin = x_t2 if converged else x_t1 if get_trj: # Slice to actual rows (no copy). - return x_t2, trj[: num_iter + 1] - return x_t2, num_iter + return x_fin, trj[: num_iter + 1] + return x_fin, num_iter # end of def propagate_iterative diff --git a/sfa/control/influence.py b/sfa/control/influence.py index d92a2cd..5e6cd36 100644 --- a/sfa/control/influence.py +++ b/sfa/control/influence.py @@ -156,6 +156,10 @@ def compute_influence(W, import cupy as cp if isinstance(ret, cp.ndarray): ret = cp.asnumpy(ret) + else: + raise ValueError( + f"Unknown device: {device!r}. " + "Expected one of 'cpu', 'cuda[:N]', or 'gpu:N'.") if get_iter: S_ret, num_iter = ret @@ -173,9 +177,6 @@ def compute_influence(W, for trg in outputs: for src in n2i: - if src == trg: - df.loc[src, trg] = np.inf - idx_src = n2i[src] idx_trg = n2i[trg] df.loc[src, trg] = S_ret[idx_trg, idx_src] @@ -272,7 +273,7 @@ def _compute_influence_cpu(W, alpha=0.5, beta=0.5, S=None, np.dot(S1, aW, out=S2) S2 += I norm = np.linalg.norm(S2 - S1) - if norm < tol: + if norm <= tol: break S1[:, :] = S2 @@ -299,7 +300,7 @@ def _compute_influence_cpu_sparse(W, alpha, beta, S, for cnt in range(1, max_iter + 1): S2[:, :] = S1.dot(aW) + I norm = sp.sparse.linalg.norm(S2 - S1) - if norm < tol: + if norm <= tol: break # end of if S1[:, :] = S2 diff --git a/tests/test_cpu_dtype_and_trajectory.py b/tests/test_cpu_dtype_and_trajectory.py index 92ce07f..3c8e371 100644 --- a/tests/test_cpu_dtype_and_trajectory.py +++ b/tests/test_cpu_dtype_and_trajectory.py @@ -102,3 +102,49 @@ def test_cuda_influence_use_tf32_off_higher_precision(): err_fp32 = np.abs(S_fp32.astype(np.float64) - S_cpu).max() # FP32 path should be at least as accurate as TF32 (often much more so). assert err_fp32 <= err_tf32 + 1e-12 + + +def test_cpu_sp_nonconvergence_returns_last_iterate(): + """Non-converged runs must return the *last* iterate, not the one before. + + Regression test: the ping-pong swap optimization left the final swap in + place when lim_iter was exhausted, so the function returned the + second-to-last iterate (and a value that disagreed with trajectory[-1]). + """ + alg = SignalPropagation("SP") + N = 16 + W = _W(N, seed=3) + xi = np.zeros(N) + rng = np.random.default_rng(3) + b = rng.standard_normal(N) * 0.1 + a = 0.5 + n_steps = 5 # small lim_iter + unreachable tol => guaranteed non-convergence + + x, num_iter = alg.propagate_iterative( + W, xi, b, a=a, lim_iter=n_steps, tol=1e-30, get_trj=False) + assert num_iter == n_steps + + # Manual reference: exactly n_steps fixed-point updates from xi. + ref = xi.astype(np.float64).copy() + for _ in range(n_steps): + ref = a * W.dot(ref) + (1 - a) * b + np.testing.assert_allclose(x, ref, rtol=1e-12, atol=1e-12) + + # The returned state must equal the last trajectory row. + x_trj, trj = alg.propagate_iterative( + W, xi, b, a=a, lim_iter=n_steps, tol=1e-30, get_trj=True) + assert trj.shape[0] == n_steps + 1 + np.testing.assert_allclose(trj[-1], x_trj, rtol=1e-12, atol=1e-12) + + +def test_cpu_sp_zero_iterations_returns_initial(): + """lim_iter == 0 must return the initial state, not an uninitialized buffer.""" + alg = SignalPropagation("SP") + N = 8 + W = _W(N, seed=5) + xi = np.arange(N, dtype=np.float64) + b = np.zeros(N) + x, num_iter = alg.propagate_iterative( + W, xi, b, a=0.5, lim_iter=0, tol=1e-5) + assert num_iter == 0 + np.testing.assert_allclose(x, xi)