From 8e5aca1206a415d328d6e2ad9e50581f37d0b026 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 21:43:59 +0900
Subject: [PATCH 1/3] Distribute 0.2.0 via GitHub Releases while PyPI publish
 is unavailable

The maintainer's PyPI account is currently unavailable, so the
publish-to-pypi job in wheels.yml is moved back behind `if: false` and
a sibling publish-github-release job is added. The new job runs on
the same `v*` tag push, downloads every build artifact (CPU universal
wheel, sdist, plus all per-Python / per-OS / per-CUDA wheels), and
uses softprops/action-gh-release@v2 to attach them to the
corresponding GitHub Release. End users install with

  pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/<wheel>

The wheels.yml matrix itself is unchanged from the 6/6-green dry run
at 27057416779; only the tail-end publish path changes. When the
PyPI account is restored, re-enabling that channel is a one-line
edit to flip `if: false` back to the tag-ref check; the GitHub
Releases channel can stay on permanently.

README and INSTALL.md gain a "Install from GitHub Releases" section
documenting the wheel-filename pattern
`sfa_cuXYZ-VER-cpPP-cpPP-PLAT.whl` so users can construct the right
URL for their Python version / OS / CUDA combination, plus example
commands for CPU, CUDA 13.2 Linux, CUDA 12.8 Windows, sdist, and the
git-tag source fallback.
---
 .github/workflows/wheels.yml | 42 ++++++++++++++++++++++---
 INSTALL.md                   | 59 ++++++++++++++++++++++++++++++++++++
 README.md                    | 19 ++++++++++++
 3 files changed, 116 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index efe4eb2..6207882 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -280,17 +280,51 @@ jobs:
           name: cuda-wheels-${{ matrix.cuda.pkg }}-${{ matrix.os }}
           path: ./wheelhouse/*.whl
 
+  # ---------------------------------------------------------------------------
+  # Publish all build artifacts (CPU universal wheel, sdist, and every
+  # CUDA wheel produced by the matrix) to the GitHub Release that the
+  # `v*` tag triggers. End users install from these URLs directly:
+  #
+  #   pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/<wheel-name>
+  #
+  # This is the primary distribution channel while the PyPI publish job
+  # below is disabled.
+  # ---------------------------------------------------------------------------
+  publish_github_release:
+    name: publish-github-release
+    needs: [build_cpu_wheel, build_sdist, build_cuda_wheels]
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/v')
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          merge-multiple: true
+      - name: List collected artifacts
+        run: ls -la dist/
+      - uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
+          generate_release_notes: true
+          prerelease: false
+          fail_on_unmatched_files: true
+
   # ---------------------------------------------------------------------------
   # Optional: publish to PyPI on tag pushes. Requires the OIDC trusted-
   # publisher relationship to be configured at https://pypi.org for each
-  # of `sfa`, `sfa-cu128`, `sfa-cu132`, `sfa-cu133`. Disabled by default;
-  # change the `if:` guard to enable.
+  # of `sfa`, `sfa-cu128`, `sfa-cu132`. Currently disabled because the
+  # maintainer's PyPI account is unavailable; re-enable by flipping the
+  # `if:` guard back to `startsWith(github.ref, 'refs/tags/v')` once
+  # trusted publishers are in place. Existing GitHub Release artifacts
+  # are unaffected by this gate.
   # ---------------------------------------------------------------------------
-  publish:
+  publish_pypi:
     name: publish-to-pypi
     needs: [build_cpu_wheel, build_sdist, build_cuda_wheels]
     runs-on: ubuntu-latest
-    if: startsWith(github.ref, 'refs/tags/v')
+    if: false
     permissions:
       id-token: write
     steps:
diff --git a/INSTALL.md b/INSTALL.md
index 0c0d9b5..3d97957 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -20,6 +20,15 @@ runtime they link against.
 This file is a quick reference. See
 [doc/install.md](doc/install.md) for the full guide.
 
+> [!NOTE]
+> For the **0.2.0** line, the primary distribution channel is the
+> [GitHub Releases page](https://github.com/dwgoon/sfa/releases),
+> not PyPI. The `pip install sfa-cuXYZ` shortcuts shown below assume
+> the wheels are on PyPI; until they land there, use the explicit
+> release URLs in the "Install from GitHub Releases" section. The
+> 0.1.0 `sfa` CPU package remains installable from PyPI through the
+> `pip install sfa` snippet.
+
 ## CPU (any OS)
 
 ```bash
@@ -53,6 +62,56 @@ NVIDIA driver development in 2019.
 > Install **only one** `sfa-cuXYZ` per environment; mixing them causes
 > import conflicts.
 
+## Install from GitHub Releases
+
+When a `v*` tag is pushed, `wheels.yml` attaches every artifact (one
+universal CPU wheel, the sdist, and a CUDA wheel for every supported
+Python + OS + CUDA combination) to the corresponding GitHub Release.
+`pip` can install those wheels directly:
+
+```bash
+# CPU (universal; any OS, any Python from 3.10 to 3.13).
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0-py3-none-any.whl
+```
+
+For CUDA wheels, the filename encodes Python version, OS, and CUDA
+major.minor. The pattern is
+
+```
+sfa_cuXYZ-VER-cpPP-cpPP-PLAT.whl
+```
+
+with
+
+| Token  | Meaning                                              | Possible values                                  |
+|--------|------------------------------------------------------|--------------------------------------------------|
+| `XYZ`  | CUDA major.minor (no dot)                            | `128` (CUDA 12.8) or `132` (CUDA 13.2)           |
+| `VER`  | sfa version (matches the tag without the `v` prefix) | e.g. `0.2.0`                                     |
+| `PP`   | CPython major+minor (no dot)                         | `310`, `311`, `312`, `313`                       |
+| `PLAT` | Wheel platform tag                                   | `manylinux_2_28_x86_64` (Linux) / `win_amd64` (Windows) |
+
+Examples:
+
+```bash
+# Linux + Python 3.12 + CUDA 13.2
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu132-0.2.0-cp312-cp312-manylinux_2_28_x86_64.whl
+
+# Windows + Python 3.10 + CUDA 12.8
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu128-0.2.0-cp310-cp310-win_amd64.whl
+```
+
+If none of the prebuilt wheels match your environment, install the
+sdist or a tagged git source instead:
+
+```bash
+# sdist (compiles a pure-Python install; no CUDA extension).
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0.tar.gz
+
+# git source at the v0.2.0 tag (set SFA_BUILD_CUDA=1 + have nvcc to
+# build the CUDA extension; otherwise a pure-Python install is made).
+pip install git+https://github.com/dwgoon/sfa.git@v0.2.0
+```
+
 ## Optional extras
 
 Matplotlib-based plotting helpers in `sfa.plot`:
diff --git a/README.md b/README.md
index ab91746..ff5903c 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,25 @@ pip install sfa-cu132
 > `sfa-cuXYZ` share the `sfa` Python namespace and will conflict if
 > stacked.
 
+### Install from GitHub Releases (current 0.2.0 primary channel)
+
+The 0.2.0 line is distributed through the project's
+[GitHub Releases page](https://github.com/dwgoon/sfa/releases) until
+the new CUDA wheels land on PyPI. Each `v*` tag attaches one universal
+CPU wheel, the sdist, and a per-Python / per-OS / per-CUDA wheel for
+each `sfa-cuXYZ` variant. Examples:
+
+```bash
+# CPU (universal, any OS / Python 3.10 - 3.13)
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa-0.2.0-py3-none-any.whl
+
+# CUDA 13.2, Linux, Python 3.12
+pip install https://github.com/dwgoon/sfa/releases/download/v0.2.0/sfa_cu132-0.2.0-cp312-cp312-manylinux_2_28_x86_64.whl
+```
+
+See [INSTALL.md](INSTALL.md#install-from-github-releases) for the
+full wheel-filename pattern and Windows / older-Python URLs.
+
 ### Build from source
 
 For a new CUDA major version, a custom GPU architecture, or

From ca9227aea30b369f6587e74f314890555c23fe37 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Sat, 6 Jun 2026 22:00:17 +0900
Subject: [PATCH 2/3] Fix wheel name: CUDA cells produced 'sfa' instead of
 'sfa-cuXYZ'

Downloading an artifact from the green dry run (27057416779) revealed
every CUDA cell's wheels were named `sfa-0.2.0.dev0-...whl` instead
of `sfa_cu128-...` / `sfa_cu132-...`. The wheel metadata Name was
`sfa` for every cell, so a release in that state would have collapsed
the three planned distributions into a single namespace collision.

Root cause: setup.py read SFA_PACKAGE_NAME from the env and passed it
as `name=` to setup(), but PEP 621 makes pyproject.toml's
`[project] name` table authoritative whenever it is present, and our
[project] had a static `name = "sfa"`. setup.py's name argument was
silently ignored, so SFA_PACKAGE_NAME never reached the wheel.

PEP 621 also forbids declaring `name` as dynamic, ruling out the
"setup.py supplies it" route. The fix:

- Keep `[project] name = "sfa"` static (CPU + sdist default). For each
  CUDA cell, the new "Rename wheel project to ..." step sed-rewrites
  that line to the matrix wheel name before cibuildwheel runs. The
  same source tree therefore yields `sfa`, `sfa-cu128`, and
  `sfa-cu132` wheels by simply changing one line in pyproject.toml.
- Move `dependencies` to `dynamic` in pyproject.toml. With no entry
  under `[tool.setuptools.dynamic]`, setuptools reads the resolved
  list from setup.py's `install_requires`, which keeps the
  SFA_CUDA_RUNTIME_REQUIRES env-var path alive so each CUDA wheel
  pins `nvidia-cublas-cu1X`, `nvidia-cuda-runtime-cu1X`, etc.

setup.py is trimmed to just what pyproject.toml does not cover
(install_requires composition, the CUDA build extension, and the
build_ext command class). Verified locally:

  sed name="sfa-cu132" + SFA_CUDA_RUNTIME_REQUIRES=... build ->
    file:    sfa_cu132-0.2.0-py3-none-any.whl
    METADATA Name: sfa-cu132
    METADATA Requires-Dist includes nvidia-cublas-cu13<13.3,>=13.2

The CPU and sdist jobs lose the now-redundant
`SFA_PACKAGE_NAME: "sfa"` env entries; the CUDA matrix loses
`SFA_PACKAGE_NAME=...` from both CIBW_ENVIRONMENT_LINUX and
CIBW_ENVIRONMENT_WINDOWS because the sed step has already changed
the in-tree name by the time cibuildwheel starts.
---
 .github/workflows/wheels.yml | 22 ++++++++++++++++++----
 pyproject.toml               | 27 +++++++++++++++------------
 setup.py                     | 29 ++++++-----------------------
 3 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 6207882..f1a3d1d 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -49,7 +49,6 @@ jobs:
       - name: Build the universal CPU wheel
         env:
           SFA_BUILD_CUDA: "0"
-          SFA_PACKAGE_NAME: "sfa"
         run: |
           python -m pip install --upgrade pip build
           python -m build --wheel
@@ -72,7 +71,6 @@ jobs:
       - name: Build sdist
         env:
           SFA_BUILD_CUDA: "0"
-          SFA_PACKAGE_NAME: "sfa"
         run: |
           python -m pip install --upgrade pip build
           python -m build --sdist
@@ -221,6 +219,22 @@ jobs:
           echo "--- $CUDA_HOME/lib64 (cublas only) ---"
           ls -la "$CUDA_HOME/lib64/" | grep -E 'cublas|nvrtc' || true
 
+      - name: Rename wheel project to ${{ matrix.cuda.pkg }}
+        # PEP 621 forbids `name` from being declared dynamic in
+        # pyproject.toml, so the same source tree's static
+        # `name = "sfa"` must be sed-replaced in place per CUDA cell
+        # to produce the per-CUDA wheel names (sfa-cu128 / sfa-cu132).
+        # The CPU and sdist jobs leave this line untouched and ship
+        # under the default `sfa` name. Runs on both Linux and
+        # Windows runners via Git Bash; cibuildwheel afterwards reads
+        # the modified pyproject.toml inside its build sandbox.
+        shell: bash
+        run: |
+          set -euo pipefail
+          sed -i 's/^name = "sfa"$/name = "${{ matrix.cuda.pkg }}"/' pyproject.toml
+          echo "--- name line after rename ---"
+          grep '^name' pyproject.toml
+
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.21
         env:
@@ -238,9 +252,10 @@ jobs:
           CIBW_CONTAINER_ENGINE: >-
             docker; create_args: -v /usr/local/cuda:/usr/local/cuda:ro
           # Linux-only env: bind-mounted CUDA path + PATH update.
+          # The wheel project name is set by the sed step above
+          # (pyproject.toml's `[project] name`), not by an env var.
           CIBW_ENVIRONMENT_LINUX: >-
             SFA_BUILD_CUDA=1
-            SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
             SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
             SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
             CUDA_PATH=/usr/local/cuda
@@ -250,7 +265,6 @@ jobs:
           # subprocess automatically.
           CIBW_ENVIRONMENT_WINDOWS: >-
             SFA_BUILD_CUDA=1
-            SFA_PACKAGE_NAME=${{ matrix.cuda.pkg }}
             SFA_CUDA_ARCH="${{ matrix.cuda.archs }}"
             SFA_CUDA_RUNTIME_REQUIRES="${{ matrix.cuda.runtime_requires }}"
           # GitHub runners have no NVIDIA GPU, so the CUDA-gated pytest
diff --git a/pyproject.toml b/pyproject.toml
index 395ca2c..89b9ac1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,9 +2,17 @@
 requires = ["setuptools>=68", "wheel", "pybind11>=2.13", "numpy>=2.0"]
 build-backend = "setuptools.build_meta"
 
-# The project `name` is set dynamically from setup.py so the same source
-# tree can produce both "sfa" (CPU-only) and "sfa-cu130" (CUDA-enabled)
-# wheels under the SFA_PACKAGE_NAME env var.
+# PEP 621 requires `name` to be a static string here (it explicitly
+# forbids name from being dynamic). The CPU and sdist builds use this
+# default `sfa`; CUDA build cells in `wheels.yml` overwrite this line
+# in-place with `sed` so the same source tree produces sfa-cu128 /
+# sfa-cu132 wheels under their own PyPI names.
+#
+# `dependencies` is left dynamic so setup.py can append the per-CUDA
+# nvidia-cublas-cuXX / nvidia-cuda-runtime-cuXX runtime packages via
+# the SFA_CUDA_RUNTIME_REQUIRES env var. With no entry under
+# [tool.setuptools.dynamic], setuptools reads the resolved value from
+# setup.py's `install_requires=` call.
 [project]
 name = "sfa"
 version = "0.2.0"
@@ -13,13 +21,7 @@ readme = "README.md"
 license = { text = "MIT" }
 authors = [{ name = "Daewon Lee", email = "daewon4you@gmail.com" }]
 requires-python = ">=3.10"
-dependencies = [
-    "numpy",
-    "scipy",
-    "pandas",
-    "networkx",
-    "threadpoolctl",
-]
+dynamic = ["dependencies"]
 
 [project.optional-dependencies]
 cuda = []  # toolchain provided by conda env (environment-cuda.yml)
@@ -40,8 +42,9 @@ include = ["sfa*"]
 # Builds CPU-only wheels by default. To produce CUDA wheels, set
 # environment variables in the GitHub Actions workflow:
 #   SFA_BUILD_CUDA=1
-#   SFA_PACKAGE_NAME=sfa-cu130
-#   SFA_CUDA_ARCH=sm_70;sm_75;sm_80;sm_86;sm_89;sm_90  (AOT fat binary)
+#   SFA_CUDA_ARCH=sm_75;sm_80;sm_86;sm_89;sm_90;sm_100;sm_120
+# and rewrite the static `[project] name` line above to the target
+# wheel name (sfa-cu128 / sfa-cu132) via a sed step before running.
 # ---------------------------------------------------------------------------
 [tool.cibuildwheel]
 build = "cp310-* cp311-* cp312-* cp313-*"
diff --git a/setup.py b/setup.py
index 5b6758d..d179953 100644
--- a/setup.py
+++ b/setup.py
@@ -253,14 +253,6 @@ def _cuda_extension() -> Optional[Extension]:
     ext_modules.append(cuda_ext)
 
 
-# PyPI package name. Default is the cross-platform CPU-only package "sfa".
-# CUDA-enabled wheels are published under per-CUDA-version names
-# (sfa-cu128, sfa-cu132, sfa-cu133) so users can pick the variant that
-# matches their NVIDIA driver. Override at build time:
-#   SFA_PACKAGE_NAME=sfa-cu132 SFA_BUILD_CUDA=1 python -m build --wheel
-_pkg_name = os.environ.get("SFA_PACKAGE_NAME", "sfa")
-
-
 # Per-build extra install_requires fed in by the CI matrix. Used to pin
 # the NVIDIA runtime PyPI packages (nvidia-cublas-cuXX,
 # nvidia-cuda-runtime-cuXX) that match the CUDA major version we are
@@ -268,6 +260,12 @@ def _cuda_extension() -> Optional[Extension]:
 # requirement specifiers, e.g.
 #   SFA_CUDA_RUNTIME_REQUIRES="nvidia-cublas-cu13>=13.2,<13.3 \
 #                              nvidia-cuda-runtime-cu13>=13.2,<13.3"
+#
+# The wheel's static metadata (name, version, description, author, etc.)
+# lives in pyproject.toml's [project] table; setup.py only supplies what
+# isn't there: the CUDA build extension, the dynamic install_requires
+# composed from the base list plus _extra_runtime, and the
+# build-extension command class.
 _extra_runtime = [
     spec for spec in os.environ.get("SFA_CUDA_RUNTIME_REQUIRES", "").split()
     if spec.strip()
@@ -275,24 +273,9 @@ def _cuda_extension() -> Optional[Extension]:
 
 
 setup(
-    name=_pkg_name,
-    version="0.2.0.dev0",
-    description="Signal flow analysis",
-    url="http://github.com/dwgoon/sfa",
-    author="Daewon Lee",
-    author_email="daewon4you@gmail.com",
-    license="MIT",
-    packages=find_packages(),
-    package_data={"": ["*.tsv", "*.sif", "*.json"]},
-    python_requires=">=3.10",
     install_requires=(["numpy", "scipy", "pandas", "networkx",
                        "threadpoolctl"]
                       + _extra_runtime),
-    extras_require={
-        "plot": ["matplotlib", "seaborn"],
-        "cuda": [],  # toolchain provided by conda env (environment-cuda.yml)
-        "test": ["pytest"],
-    },
     ext_modules=ext_modules,
     cmdclass={"build_ext": CudaBuildExt} if ext_modules else {},
     zip_safe=False,

From 4a42c2e9b2e2beb1b242bdfeb4372d48504829a6 Mon Sep 17 00:00:00 2001
From: Daewon Lee <daewon4you@gmail.com>
Date: Thu, 11 Jun 2026 10:40:17 +0900
Subject: [PATCH 3/3] Fix propagate_iterative returning a stale iterate on
 non-convergence

- algorithms/sp: the ping-pong swap left the final swap in place when
  lim_iter was exhausted, so the returned x was the second-to-last
  iterate and disagreed with trajectory[-1]. Track convergence and
  return the correct buffer; this also stops lim_iter==0 from returning
  an uninitialized array. Returned x now always equals trajectory[-1].
- _blas_ctypes.solve: snapshot and restore the backend thread count so a
  one-off num_threads no longer leaks process-wide.
- control/influence: raise ValueError on an unknown device; unify the
  convergence comparison to '<= tol' (dense + sparse); drop the dead
  np.inf self-influence assignment that was always overwritten.
- tests: cover the non-converged and lim_iter==0 propagate_iterative paths.
---
 sfa/_blas_ctypes.py                    | 68 ++++++++++++++++++++------
 sfa/algorithms/sp.py                   | 12 ++++-
 sfa/control/influence.py               | 11 +++--
 tests/test_cpu_dtype_and_trajectory.py | 46 +++++++++++++++++
 4 files changed, 115 insertions(+), 22 deletions(-)

diff --git a/sfa/_blas_ctypes.py b/sfa/_blas_ctypes.py
index 4effbe8..c475a9a 100644
--- a/sfa/_blas_ctypes.py
+++ b/sfa/_blas_ctypes.py
@@ -79,8 +79,34 @@ def __init__(self, name: str, lib: ctypes.CDLL):
             ctypes.POINTER(ctypes.c_float), ctypes.c_int,
         ]
 
-        # Thread setters are backend-specific.
+        # Thread setters/getters are backend-specific.
         self.set_num_threads = self._resolve_thread_setter()
+        self.get_num_threads = self._resolve_thread_getter()
+
+    def _resolve_thread_getter(self):
+        """Return a callable ``f() -> int`` reporting the backend thread count.
+
+        Used to snapshot and restore the process-wide thread count around a
+        single ``solve`` so a one-off ``num_threads`` does not leak into the
+        rest of the process. Returns ``None`` when the backend exposes no
+        getter (then the count is simply left as set).
+        """
+        lib = self.lib
+        # MKL: int MKL_Get_Max_Threads(void) / int mkl_get_max_threads(void)
+        for sym in ('MKL_Get_Max_Threads', 'mkl_get_max_threads'):
+            fn = getattr(lib, sym, None)
+            if fn is not None:
+                fn.argtypes = []
+                fn.restype = ctypes.c_int
+                return fn
+        # OpenBLAS: int openblas_get_num_threads(void) / goto_get_num_threads
+        for sym in ('openblas_get_num_threads', 'goto_get_num_threads'):
+            fn = getattr(lib, sym, None)
+            if fn is not None:
+                fn.argtypes = []
+                fn.restype = ctypes.c_int
+                return fn
+        return None
 
     def _resolve_thread_setter(self):
         """Return a callable ``f(n: int) -> None`` for this backend."""
@@ -322,7 +348,15 @@ def solve(A: np.ndarray, B: np.ndarray, *,
     else:
         be = _load(canon)
 
+    # Snapshot the backend's current thread count so a one-off num_threads
+    # does not leak into the rest of the process; restore it after the solve.
+    prev_threads = None
     if num_threads is not None:
+        if be.get_num_threads is not None:
+            try:
+                prev_threads = int(be.get_num_threads())
+            except Exception:
+                prev_threads = None
         be.set_num_threads(int(num_threads))
 
     # Make sure layout is C-contiguous (LAPACK_ROW_MAJOR friendly).
@@ -332,20 +366,24 @@ def solve(A: np.ndarray, B: np.ndarray, *,
     NRHS = B.shape[1]
     ipiv = np.empty(N, dtype=np.int32)
 
-    if A.dtype == np.float64:
-        info = be.dgesv(
-            _LAPACK_ROW_MAJOR, N, NRHS,
-            A.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), N,
-            ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
-            B.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), NRHS,
-        )
-    else:  # float32
-        info = be.sgesv(
-            _LAPACK_ROW_MAJOR, N, NRHS,
-            A.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), N,
-            ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
-            B.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), NRHS,
-        )
+    try:
+        if A.dtype == np.float64:
+            info = be.dgesv(
+                _LAPACK_ROW_MAJOR, N, NRHS,
+                A.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), N,
+                ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
+                B.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), NRHS,
+            )
+        else:  # float32
+            info = be.sgesv(
+                _LAPACK_ROW_MAJOR, N, NRHS,
+                A.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), N,
+                ipiv.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
+                B.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), NRHS,
+            )
+    finally:
+        if prev_threads is not None:
+            be.set_num_threads(prev_threads)
 
     if info != 0:
         raise np.linalg.LinAlgError(
diff --git a/sfa/algorithms/sp.py b/sfa/algorithms/sp.py
index 1718d33..7129875 100644
--- a/sfa/algorithms/sp.py
+++ b/sfa/algorithms/sp.py
@@ -125,6 +125,7 @@ def propagate_iterative(self,
             trj[0] = x_t1
 
         num_iter = 0
+        converged = False
         with thread_limit(num_threads):
             for i in range(lim_iter):
                 # x_t2 = a*W @ x_t1 + (1-a)*b  (no temporary allocations)
@@ -135,15 +136,22 @@ def propagate_iterative(self,
                 if get_trj:
                     trj[num_iter] = x_t2
                 if np.linalg.norm(x_t2 - x_t1) <= tol:
+                    converged = True
                     break
                 # Swap views instead of copying: x_t1 becomes the freshest
                 # state, and the old x_t1 buffer is recycled as next x_t2.
                 x_t1, x_t2 = x_t2, x_t1
 
+        # When we converge we break *before* the swap, so x_t2 holds the
+        # latest iterate. When we exhaust lim_iter the final swap left the
+        # latest iterate in x_t1. Pick the right buffer so the returned x is
+        # always the last computed state (and matches trj[-1]). This also
+        # avoids returning the uninitialized x_t2 buffer when lim_iter == 0.
+        x_fin = x_t2 if converged else x_t1
         if get_trj:
             # Slice to actual rows (no copy).
-            return x_t2, trj[: num_iter + 1]
-        return x_t2, num_iter
+            return x_fin, trj[: num_iter + 1]
+        return x_fin, num_iter
 
     # end of def propagate_iterative
 
diff --git a/sfa/control/influence.py b/sfa/control/influence.py
index d92a2cd..5e6cd36 100644
--- a/sfa/control/influence.py
+++ b/sfa/control/influence.py
@@ -156,6 +156,10 @@ def compute_influence(W,
             import cupy as cp
             if isinstance(ret, cp.ndarray):
                 ret = cp.asnumpy(ret)
+    else:
+        raise ValueError(
+            f"Unknown device: {device!r}. "
+            "Expected one of 'cpu', 'cuda[:N]', or 'gpu:N'.")
 
     if get_iter:
         S_ret, num_iter = ret
@@ -173,9 +177,6 @@ def compute_influence(W,
 
         for trg in outputs:
             for src in n2i:
-                if src == trg:
-                    df.loc[src, trg] = np.inf
-
                 idx_src = n2i[src]
                 idx_trg = n2i[trg]
                 df.loc[src, trg] = S_ret[idx_trg, idx_src]
@@ -272,7 +273,7 @@ def _compute_influence_cpu(W, alpha=0.5, beta=0.5, S=None,
             np.dot(S1, aW, out=S2)
             S2 += I
             norm = np.linalg.norm(S2 - S1)
-            if norm < tol:
+            if norm <= tol:
                 break
             S1[:, :] = S2
 
@@ -299,7 +300,7 @@ def _compute_influence_cpu_sparse(W, alpha, beta, S,
     for cnt in range(1, max_iter + 1):
         S2[:, :] = S1.dot(aW) + I
         norm = sp.sparse.linalg.norm(S2 - S1)
-        if norm < tol:
+        if norm <= tol:
             break
         # end of if
         S1[:, :] = S2
diff --git a/tests/test_cpu_dtype_and_trajectory.py b/tests/test_cpu_dtype_and_trajectory.py
index 92ce07f..3c8e371 100644
--- a/tests/test_cpu_dtype_and_trajectory.py
+++ b/tests/test_cpu_dtype_and_trajectory.py
@@ -102,3 +102,49 @@ def test_cuda_influence_use_tf32_off_higher_precision():
     err_fp32 = np.abs(S_fp32.astype(np.float64) - S_cpu).max()
     # FP32 path should be at least as accurate as TF32 (often much more so).
     assert err_fp32 <= err_tf32 + 1e-12
+
+
+def test_cpu_sp_nonconvergence_returns_last_iterate():
+    """Non-converged runs must return the *last* iterate, not the one before.
+
+    Regression test: the ping-pong swap optimization left the final swap in
+    place when lim_iter was exhausted, so the function returned the
+    second-to-last iterate (and a value that disagreed with trajectory[-1]).
+    """
+    alg = SignalPropagation("SP")
+    N = 16
+    W = _W(N, seed=3)
+    xi = np.zeros(N)
+    rng = np.random.default_rng(3)
+    b = rng.standard_normal(N) * 0.1
+    a = 0.5
+    n_steps = 5  # small lim_iter + unreachable tol => guaranteed non-convergence
+
+    x, num_iter = alg.propagate_iterative(
+        W, xi, b, a=a, lim_iter=n_steps, tol=1e-30, get_trj=False)
+    assert num_iter == n_steps
+
+    # Manual reference: exactly n_steps fixed-point updates from xi.
+    ref = xi.astype(np.float64).copy()
+    for _ in range(n_steps):
+        ref = a * W.dot(ref) + (1 - a) * b
+    np.testing.assert_allclose(x, ref, rtol=1e-12, atol=1e-12)
+
+    # The returned state must equal the last trajectory row.
+    x_trj, trj = alg.propagate_iterative(
+        W, xi, b, a=a, lim_iter=n_steps, tol=1e-30, get_trj=True)
+    assert trj.shape[0] == n_steps + 1
+    np.testing.assert_allclose(trj[-1], x_trj, rtol=1e-12, atol=1e-12)
+
+
+def test_cpu_sp_zero_iterations_returns_initial():
+    """lim_iter == 0 must return the initial state, not an uninitialized buffer."""
+    alg = SignalPropagation("SP")
+    N = 8
+    W = _W(N, seed=5)
+    xi = np.arange(N, dtype=np.float64)
+    b = np.zeros(N)
+    x, num_iter = alg.propagate_iterative(
+        W, xi, b, a=0.5, lim_iter=0, tol=1e-5)
+    assert num_iter == 0
+    np.testing.assert_allclose(x, xi)