From 3fd75a97618cb9096c63a2fff0e1f6ff6cd9fef7 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 14 Jan 2025 13:02:52 +0100
Subject: [PATCH 01/33] refactor API, add `precomputed` affinity

---
 heat/cluster/spectral.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 39bec27d32..9759d3ec30 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -9,7 +9,7 @@
 from heat.core.dndarray import DNDarray
 
 
-class Spectral(ht.ClusteringMixin, ht.BaseEstimator):
+class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     """
     Spectral clustering
 
@@ -18,18 +18,19 @@ class Spectral(ht.ClusteringMixin, ht.BaseEstimator):
     n_clusters : int
         Number of clusters to fit
     gamma : float
-        Kernel coefficient sigma for 'rbf', ignored for metric='euclidean'
-    metric : string
-        How to construct the similarity matrix.
+        Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean'
+    affinity : string
+        How to construct the similarity (affinity) matrix.
 
             - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel.
             - 'euclidean' : construct the similarity matrix as only euclidean distance.
+            - 'precomputed' : interpret ``X`` as precomputed affinity matrix.
     laplacian : str
         How to calculate the graph laplacian (affinity)
         Currently supported : 'fully_connected', 'eNeighbour'
     threshold : float
         Threshold for affinity matrix if laplacian='eNeighbour'
-        Ignorded for laplacian='fully_connected'
+        Ignored for laplacian='fully_connected'
     boundary : str
         How to interpret threshold: 'upper', 'lower'
         Ignorded for laplacian='fully_connected'
@@ -45,7 +46,7 @@ def __init__(
         self,
         n_clusters: int = None,
         gamma: float = 1.0,
-        metric: str = "rbf",
+        affinity: str = "rbf",
         laplacian: str = "fully_connected",
         threshold: float = 1.0,
         boundary: str = "upper",
@@ -55,14 +56,14 @@ def __init__(
     ):
         self.n_clusters = n_clusters
         self.gamma = gamma
-        self.metric = metric
+        self.affinity = affinity
         self.laplacian = laplacian
         self.threshold = threshold
         self.boundary = boundary
         self.n_lanczos = n_lanczos
         self.assign_labels = assign_labels
 
-        if metric == "rbf":
+        if affinity == "rbf":
             sig = math.sqrt(1 / (2 * gamma))
             self._laplacian = ht.graph.Laplacian(
                 lambda x: ht.spatial.rbf(x, sigma=sig, quadratic_expansion=True),
@@ -72,7 +73,7 @@ def __init__(
                 threshold_value=threshold,
             )
 
-        elif metric == "euclidean":
+        elif affinity == "euclidean":
             self._laplacian = ht.graph.Laplacian(
                 lambda x: ht.spatial.cdist(x, quadratic_expansion=True),
                 definition="norm_sym",
@@ -80,6 +81,14 @@ def __init__(
                 threshold_key=boundary,
                 threshold_value=threshold,
             )
+        elif affinity == "precomputed":
+            self._laplacian = ht.graph.Laplacian(
+                lambda x: x,
+                definition="norm_sym",
+                mode=laplacian,
+                threshold_key=boundary,
+                threshold_value=threshold,
+            )
         else:
             raise NotImplementedError("Other kernels currently not supported")
 

From 3114725188c38c8c8c6b326f8063e6d91d568e88 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 14 Jan 2025 13:03:14 +0100
Subject: [PATCH 02/33] adapt tests

---
 heat/cluster/tests/test_spectral.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index 9e24dddfc5..bb80a4e88a 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -8,12 +8,12 @@
 
 class TestSpectral(TestCase):
     def test_clusterer(self):
-        spectral = ht.cluster.Spectral()
+        spectral = ht.cluster.SpectralClustering()
         self.assertTrue(ht.is_estimator(spectral))
         self.assertTrue(ht.is_clusterer(spectral))
 
     def test_get_and_set_params(self):
-        spectral = ht.cluster.Spectral()
+        spectral = ht.cluster.SpectralClustering()
         params = spectral.get_params()
 
         self.assertEqual(
@@ -21,7 +21,7 @@ def test_get_and_set_params(self):
             {
                 "n_clusters": None,
                 "gamma": 1.0,
-                "metric": "rbf",
+                "affinity": "rbf",
                 "laplacian": "fully_connected",
                 "threshold": 1.0,
                 "boundary": "upper",
@@ -39,14 +39,14 @@ def test_fit_iris(self):
         iris = ht.load("heat/datasets/iris.csv", sep=";", split=0)
         m = 10
         # fit the clusters
-        spectral = ht.cluster.Spectral(
-            n_clusters=3, gamma=1.0, metric="rbf", laplacian="fully_connected", n_lanczos=m
+        spectral = ht.cluster.SpectralClustering(
+            n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m
         )
         spectral.fit(iris)
         self.assertIsInstance(spectral.labels_, ht.DNDarray)
 
-        spectral = ht.cluster.Spectral(
-            metric="euclidean",
+        spectral = ht.cluster.SpectralClustering(
+            affinity="euclidean",
             laplacian="eNeighbour",
             threshold=0.5,
             boundary="upper",
@@ -55,9 +55,9 @@ def test_fit_iris(self):
         labels = spectral.fit_predict(iris)
         self.assertIsInstance(labels, ht.DNDarray)
 
-        spectral = ht.cluster.Spectral(
+        spectral = ht.cluster.SpectralClustering(
             gamma=0.1,
-            metric="rbf",
+            affinity="rbf",
             laplacian="eNeighbour",
             threshold=0.5,
             boundary="upper",
@@ -67,7 +67,7 @@ def test_fit_iris(self):
         self.assertIsInstance(labels, ht.DNDarray)
 
         kmeans = {"kmeans++": "kmeans++", "max_iter": 30, "tol": -1}
-        spectral = ht.cluster.Spectral(
+        spectral = ht.cluster.SpectralClustering(
             n_clusters=3, gamma=1.0, normalize=True, n_lanczos=m, params=kmeans
         )
         labels = spectral.fit_predict(iris)
@@ -75,9 +75,9 @@ def test_fit_iris(self):
 
         # Errors
         with self.assertRaises(NotImplementedError):
-            spectral = ht.cluster.Spectral(metric="ahalanobis", n_lanczos=m)
+            spectral = ht.cluster.SpectralClustering(affinity="ahalanobis", n_lanczos=m)
 
         iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1)
-        spectral = ht.cluster.Spectral(n_lanczos=20)
+        spectral = ht.cluster.SpectralClustering(n_lanczos=20)
         with self.assertRaises(NotImplementedError):
             spectral.fit(iris_split)

From 441ad2ca1f98c96c19702c37301b3dd89fccfc5f Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Fri, 21 Mar 2025 10:03:32 +0100
Subject: [PATCH 03/33] introduce eigen_solver parameter

---
 heat/cluster/spectral.py | 74 +++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 9759d3ec30..1de89333aa 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -11,21 +11,23 @@
 
 class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     """
-    Spectral clustering
+    Spectral clustering of large memory-distributed datasets.
 
     Attributes
     ----------
-    n_clusters : int
+    n_clusters : int, default=8
         Number of clusters to fit
-    gamma : float
+    eigen_solver : str, default='lanczos'
+        Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported
+    gamma : float, default=1.0
         Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean'
-    affinity : string
+    affinity : str, default='rbf'
         How to construct the similarity (affinity) matrix.
 
             - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel.
             - 'euclidean' : construct the similarity matrix as only euclidean distance.
             - 'precomputed' : interpret ``X`` as precomputed affinity matrix.
-    laplacian : str
+    laplacian : str, default='fully_connected'
         How to calculate the graph laplacian (affinity)
         Currently supported : 'fully_connected', 'eNeighbour'
     threshold : float
@@ -34,9 +36,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     boundary : str
         How to interpret threshold: 'upper', 'lower'
         Ignorded for laplacian='fully_connected'
-    n_lanczos : int
+    n_lanczos : int, default=300
         number of Lanczos iterations for Eigenvalue decomposition
-    assign_labels: str
+    assign_labels: str, default='kmeans'
          The strategy to use to assign labels in the embedding space.
     **params: dict
           Parameter dictionary for the assign_labels estimator
@@ -45,6 +47,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     def __init__(
         self,
         n_clusters: int = None,
+        eigen_solver: str = "lanczos",
         gamma: float = 1.0,
         affinity: str = "rbf",
         laplacian: str = "fully_connected",
@@ -55,6 +58,7 @@ def __init__(
         **params,
     ):
         self.n_clusters = n_clusters
+        self.eigen_solver = eigen_solver
         self.gamma = gamma
         self.affinity = affinity
         self.laplacian = laplacian
@@ -109,34 +113,39 @@ def labels_(self) -> DNDarray:
         """
         return self._labels
 
-    def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]:
+    def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray, DNDarray]:
         """
-        Helper function for dataset x embedding.
-        Returns Tupel(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix.
+        Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix.
 
         Parameters
         ----------
         x : DNDarray
             Sample Matrix for which the embedding should be calculated
+        eigen_solver : str
+            Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported
+
+        See Also
+        --------
+        :func:`heat.linalg.lanczos`
 
         Notes
         -----
-        This will throw out the complex side of the eigenvalues found during this.
-
+        The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors
+        are real.
         """
         L = self._laplacian.construct(x)
-        # 3. Eigenvalue and -vector calculation via Lanczos Algorithm
-        v0 = ht.full(
-            (L.shape[0],),
-            fill_value=1.0 / math.sqrt(L.shape[0]),
-            dtype=L.dtype,
-            split=0,
-            device=L.device,
-        )
-        V, T = ht.lanczos(L, self.n_lanczos, v0)
-
-        # if int(torch.__version__.split(".")[1]) >= 9:
-        try:
+        # 3. Eigenvalue and -vector calculation
+        # for now via Lanczos Algorithm only
+        if eigen_solver == "lanczos":
+            v0 = ht.full(
+                (L.shape[0],),
+                fill_value=1.0 / math.sqrt(L.shape[0]),
+                dtype=L.dtype,
+                split=0,
+                device=L.device,
+            )
+            V, T = ht.lanczos(L, self.n_lanczos, v0)
+
             # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T
             eval, evec = torch.linalg.eig(T.larray)
 
@@ -146,15 +155,10 @@ def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]:
             eigenvectors = ht.matmul(V, ht.array(evec))[:, idx]
 
             return eigenvalues.real, eigenvectors.real
-        except AttributeError:  # torch version is less than 1.9.0
-            # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T
-            eval, evec = torch.eig(T.larray, eigenvectors=True)
-            # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L
-            eval, idx = torch.sort(eval[:, 0], dim=0)
-            eigenvalues = ht.array(eval)
-            eigenvectors = ht.matmul(V, ht.array(evec))[:, idx]
-
-            return eigenvalues, eigenvectors
+        else:
+            raise NotImplementedError(
+                "Other Eigenvalue Decomposition methods are not yet supported"
+            )
 
     def fit(self, x: DNDarray):
         """
@@ -177,7 +181,7 @@ def fit(self, x: DNDarray):
         if x.split is not None and x.split != 0:
             raise NotImplementedError("Not implemented for other splitting-axes")
         # 2. Embed Dataset into lower-dimensional Eigenvector space
-        eigenvalues, eigenvectors = self._spectral_embedding(x)
+        eigenvalues, eigenvectors = self.spectral_embedding(x, self.eigen_solver)
 
         # 3. Find the spectral gap, if number of clusters is not defined from the outside
         if self.n_clusters is None:
@@ -219,7 +223,7 @@ def predict(self, x: DNDarray) -> DNDarray:
         if x.split is not None and x.split != 0:
             raise NotImplementedError("Not implemented for other splitting-axes")
 
-        _, eigenvectors = self._spectral_embedding(x)
+        _, eigenvectors = self.spectral_embedding(x, self.eigen_solver)
 
         components = eigenvectors[:, : self.n_clusters].copy()
 

From bd8033376575bc5eb11bc76c9a046aca89a7b017 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Fri, 21 Mar 2025 10:03:50 +0100
Subject: [PATCH 04/33] adapt tests

---
 heat/cluster/tests/test_spectral.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index bb80a4e88a..8ebb2fd657 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -20,6 +20,7 @@ def test_get_and_set_params(self):
             params,
             {
                 "n_clusters": None,
+                "eigen_solver": "lanczos",
                 "gamma": 1.0,
                 "affinity": "rbf",
                 "laplacian": "fully_connected",

From 0ec780e244e43604d166b478a2e43e1093c65f7c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 4 Apr 2025 09:35:21 +0000
Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 heat/cluster/tests/test_spectral.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index 6b334cff3d..b7bc456dc4 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -37,14 +37,14 @@ def test_get_and_set_params(self):
         self.assertEqual(10, spectral.n_clusters)
 
     def test_fit_iris(self):
-        if not self.is_mps:       
+        if not self.is_mps:
             # get some test data
             iris = ht.load("heat/datasets/iris.csv", sep=";", split=0)
             m = 10
             # fit the clusters
             spectral = ht.cluster.SpectralClustering(
                 n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m
-            ) 
+            )
             spectral.fit(iris)
             self.assertIsInstance(spectral.labels_, ht.DNDarray)
 
@@ -83,4 +83,4 @@ def test_fit_iris(self):
             iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1)
             spectral = ht.cluster.SpectralClustering(n_lanczos=20)
             with self.assertRaises(NotImplementedError):
-                spectral.fit(iris_split)
\ No newline at end of file
+                spectral.fit(iris_split)

From 1fb1fb061c29e170552c493ad3de7cbbe3524c3f Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 6 May 2025 05:17:42 +0200
Subject: [PATCH 06/33] add fit_predict

---
 heat/cluster/spectral.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 1de89333aa..3866e1003a 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -200,6 +200,24 @@ def fit(self, x: DNDarray):
 
         return self
 
+    def fit_predict(self, x: DNDarray) -> DNDarray:
+        """
+        Fit the model to the data and return the labels.
+
+        Parameters
+        ----------
+        x : DNDarray
+            Training instances to cluster. Shape = (n_samples, n_features)
+
+        Returns
+        -------
+        DNDarray
+            Labels of each point.
+        """
+        self.fit(x)
+        return self.labels_
+        # return self._cluster.predict(components)
+
     def predict(self, x: DNDarray) -> DNDarray:
         """
         Return the label each sample in X belongs to.

From 8868d34cf4ff7f31135848ec2a503c5adb9f26d3 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 20 May 2025 10:46:00 +0200
Subject: [PATCH 07/33] support eigh decomposition

---
 heat/cluster/spectral.py | 76 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 3866e1003a..e846824835 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -5,8 +5,9 @@
 import heat as ht
 import math
 import torch
-from typing import Tuple
+from typing import Tuple, Union
 from heat.core.dndarray import DNDarray
+from heat.core.linalg import eigh
 
 
 class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
@@ -18,7 +19,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     n_clusters : int, default=8
         Number of clusters to fit
     eigen_solver : str, default='lanczos'
-        Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported
+        Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'.
+    n_components : int, default=None
+        Number of components to use for the embedding. If None, n_clusters is used
     gamma : float, default=1.0
         Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean'
     affinity : str, default='rbf'
@@ -113,7 +116,27 @@ def labels_(self) -> DNDarray:
         """
         return self._labels
 
-    def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray, DNDarray]:
+    def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool) -> DNDarray:
+        """
+        Set the diagonal of the matrix to a specific value.
+        """
+        # TODO see scikit learn
+        n_nodes = x.shape[0]
+        # for now we assume that the matrix is dense
+        if norm_laplacian:
+            # set diagonal to `value`
+            # tensor.view(-1) == ndarray.flat
+            x.larray.view(-1)[:: n_nodes + 1] = value
+        return x
+
+    def spectral_embedding(
+        self,
+        x: DNDarray,
+        n_components: int = 8,
+        eigen_solver: str = "zolotarev",
+        norm_laplacian: bool = True,
+        drop_first: bool = True,
+    ) -> Tuple[DNDarray, DNDarray]:
         """
         Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix.
 
@@ -121,22 +144,61 @@ def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray,
         ----------
         x : DNDarray
             Sample Matrix for which the embedding should be calculated
+        n_components : int, default=8
+            Number of components to use for the embedding
         eigen_solver : str
-            Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported
+            Eigenvalue decomposition strategy to use. Default is 'zolotarev' #TODO expand
+        norm_laplacian : bool, default=True
+            Whether to use the normalized Laplacian
+        drop_first : bool, default=True
+            Whether to drop the first component (the smallest eigenvalue)
 
         See Also
         --------
         :func:`heat.linalg.lanczos`
+        :func:`heat.linalg.eigh`
+        :func:`heat.linalg.polar`
 
         Notes
         -----
         The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors
-        are real.
+        are real. TODO: check if this is still correct
         """
         L = self._laplacian.construct(x)
         # 3. Eigenvalue and -vector calculation
-        # for now via Lanczos Algorithm only
-        if eigen_solver == "lanczos":
+        if eigen_solver == "zolotarev":
+            # TODO: adapt this to Heat
+            L = self.__set_diag(L, 1, norm_laplacian)  # TODO should it be a method?
+            # extract the diagonal
+            dd = L.diagonal()
+            #            try:
+            # we skip multiplying by -1 as we are not using ARPACK
+            # keeping comment below as historical record for now
+            ####
+            # # We are computing the opposite of the laplacian inplace so as
+            # # to spare a memory allocation of a possibly very large array
+            # L *= -1
+            ####
+            # compute the Zolo-PD eigenvalue decomposition
+            _, diffusion_map = eigh(L)
+            # ht.linalg.eigh returns the eigenvalues in descending order
+            # select the first n_components, no need to reverse order
+            diffusion_map = diffusion_map[:, :n_components]
+            embedding = diffusion_map.T
+            if norm_laplacian:
+                # REVERT FROM DIVISION TO MULTIPLICATION
+                # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62)
+                # TODO: generalize / adapt to current scikit-learn
+                embedding = embedding * dd
+            # except RuntimeError:
+            #     # When submatrices are exactly singular, an LU decomposition
+            #     # in arpack fails. We fallback to lobpcg
+            #     eigen_solver = "lobpcg"
+            #     # Revert the laplacian to its opposite to have lobpcg work
+            #     L *= -1
+
+        #  Lanczos Algorithm
+        elif eigen_solver == "lanczos":
             v0 = ht.full(
                 (L.shape[0],),
                 fill_value=1.0 / math.sqrt(L.shape[0]),

From 631a4aa7e34fa5e29522960f7f716605ed7af3fe Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 20 May 2025 10:52:05 +0200
Subject: [PATCH 08/33] add argument n_components

---
 heat/cluster/spectral.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index e846824835..b91f795658 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -51,6 +51,7 @@ def __init__(
         self,
         n_clusters: int = None,
         eigen_solver: str = "lanczos",
+        n_components: int = None,
         gamma: float = 1.0,
         affinity: str = "rbf",
         laplacian: str = "fully_connected",
@@ -62,6 +63,7 @@ def __init__(
     ):
         self.n_clusters = n_clusters
         self.eigen_solver = eigen_solver
+        self.n_components = n_components
         self.gamma = gamma
         self.affinity = affinity
         self.laplacian = laplacian
@@ -167,7 +169,6 @@ def spectral_embedding(
         L = self._laplacian.construct(x)
         # 3. Eigenvalue and -vector calculation
         if eigen_solver == "zolotarev":
-            # TODO: adapt this to Heat
             L = self.__set_diag(L, 1, norm_laplacian)  # TODO should it be a method?
             # extract the diagonal
             dd = L.diagonal()
@@ -215,7 +216,8 @@ def spectral_embedding(
             eval, idx = torch.sort(eval.real, dim=0)
             eigenvalues = ht.array(eval)
             eigenvectors = ht.matmul(V, ht.array(evec))[:, idx]
-
+            # TODO: spectral_embedding should only return the embedding (eigenvectors)
+            # TODO: move components calculation from fit() to here and only return n_components eigenvalues as embedding
             return eigenvalues.real, eigenvectors.real
         else:
             raise NotImplementedError(

From 46fa36dad0e03a42543ddfd6c3ea5994fbad0cae Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:30:54 +0200
Subject: [PATCH 09/33] return n_components eigenval only

---
 heat/cluster/spectral.py | 53 ++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index b91f795658..3019c22487 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -63,7 +63,7 @@ def __init__(
     ):
         self.n_clusters = n_clusters
         self.eigen_solver = eigen_solver
-        self.n_components = n_components
+        self.n_components = n_components if n_components is not None else n_clusters
         self.gamma = gamma
         self.affinity = affinity
         self.laplacian = laplacian
@@ -140,7 +140,7 @@ def spectral_embedding(
         drop_first: bool = True,
     ) -> Tuple[DNDarray, DNDarray]:
         """
-        Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix.
+        Returns the embedding (eigenvectors) of the graph's Laplacian matrix.
 
         Parameters
         ----------
@@ -149,7 +149,8 @@ def spectral_embedding(
         n_components : int, default=8
             Number of components to use for the embedding
         eigen_solver : str
-            Eigenvalue decomposition strategy to use. Default is 'zolotarev' #TODO expand
+            Eigenvalue decomposition strategy to use. Default is 'zolotarev' (see documentation in :func:`heat.linalg.polar`).
+            TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets?
         norm_laplacian : bool, default=True
             Whether to use the normalized Laplacian
         drop_first : bool, default=True
@@ -172,7 +173,6 @@ def spectral_embedding(
             L = self.__set_diag(L, 1, norm_laplacian)  # TODO should it be a method?
             # extract the diagonal
             dd = L.diagonal()
-            #            try:
             # we skip multiplying by -1 as we are not using ARPACK
             # keeping comment below as historical record for now
             ####
@@ -184,19 +184,12 @@ def spectral_embedding(
             _, diffusion_map = eigh(L)
             # ht.linalg.eigh returns the eigenvalues in descending order
             # select the first n_components, no need to reverse order
-            diffusion_map = diffusion_map[:, :n_components]
-            embedding = diffusion_map.T
+            embedding = diffusion_map.T[:n_components]
             if norm_laplacian:
                 # REVERT FROM DIVISION TO MULTIPLICATION
                 # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62)
                 # TODO: generalize / adapt to current scikit-learn
                 embedding = embedding * dd
-            # except RuntimeError:
-            #     # When submatrices are exactly singular, an LU decomposition
-            #     # in arpack fails. We fallback to lobpcg
-            #     eigen_solver = "lobpcg"
-            #     # Revert the laplacian to its opposite to have lobpcg work
-            #     L *= -1
 
         #  Lanczos Algorithm
         elif eigen_solver == "lanczos":
@@ -213,16 +206,28 @@ def spectral_embedding(
             eval, evec = torch.linalg.eig(T.larray)
 
             # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L
+            # sort local eval, and evec accordingly
             eval, idx = torch.sort(eval.real, dim=0)
-            eigenvalues = ht.array(eval)
-            eigenvectors = ht.matmul(V, ht.array(evec))[:, idx]
-            # TODO: spectral_embedding should only return the embedding (eigenvectors)
-            # TODO: move components calculation from fit() to here and only return n_components eigenvalues as embedding
-            return eigenvalues.real, eigenvectors.real
+            evec = evec[:, idx]
+            # Calculate global eigenvectors of L
+            eigenvectors = ht.matmul(V, ht.array(evec))
+            # transpose to have the eigenvectors as rows
+            eigenvectors = eigenvectors.T
+            embedding = eigenvectors[:n_components].real
+            # Set n_clusters to spectral gap, if it is not defined by the user
+            if self.n_clusters is None:
+                diff = eval[1:] - eval[:-1]
+                tmp = torch.argmax(diff).item()
+                self.n_clusters = tmp + 1
         else:
             raise NotImplementedError(
-                "Other Eigenvalue Decomposition methods are not yet supported"
+                f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'zolotarev' and 'lanczos'."
             )
+        # 5. Drop first component if requested
+        if drop_first:
+            embedding = embedding[1:]
+        # 6. Return the embedding transposed back to original shape
+        return embedding.T
 
     def fit(self, x: DNDarray):
         """
@@ -245,15 +250,9 @@ def fit(self, x: DNDarray):
         if x.split is not None and x.split != 0:
             raise NotImplementedError("Not implemented for other splitting-axes")
         # 2. Embed Dataset into lower-dimensional Eigenvector space
-        eigenvalues, eigenvectors = self.spectral_embedding(x, self.eigen_solver)
-
-        # 3. Find the spectral gap, if number of clusters is not defined from the outside
-        if self.n_clusters is None:
-            diff = eigenvalues[1:] - eigenvalues[:-1]
-            tmp = ht.argmax(diff).item()
-            self.n_clusters = tmp + 1
-
-        components = eigenvectors[:, : self.n_clusters].copy()
+        components = self.spectral_embedding(
+            x, n_components=self.n_components, eigen_solver=self.eigen_solver
+        )
 
         params = self._cluster.get_params()
         params["n_clusters"] = self.n_clusters

From 07c140adacc42294aa612330b00fc6d427450649 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:34:09 +0200
Subject: [PATCH 10/33] introduce random_state and docs editing

---
 heat/cluster/spectral.py | 54 +++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 3019c22487..cc030c8572 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -22,11 +22,12 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
         Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'.
     n_components : int, default=None
         Number of components to use for the embedding. If None, n_clusters is used
+    random_state : int, default=None
+        Random seed for reproducibility. If None, no random seed is set.
     gamma : float, default=1.0
         Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean'
     affinity : str, default='rbf'
         How to construct the similarity (affinity) matrix.
-
             - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel.
             - 'euclidean' : construct the similarity matrix as only euclidean distance.
             - 'precomputed' : interpret ``X`` as precomputed affinity matrix.
@@ -38,9 +39,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
         Ignored for laplacian='fully_connected'
     boundary : str
         How to interpret threshold: 'upper', 'lower'
-        Ignorded for laplacian='fully_connected'
+        Ignored for laplacian='fully_connected'
     n_lanczos : int, default=300
-        number of Lanczos iterations for Eigenvalue decomposition
+        number of Lanczos iterations for Eigenvalue decomposition. Ignored if eigen_solver='zolotarev'.
     assign_labels: str, default='kmeans'
          The strategy to use to assign labels in the embedding space.
     **params: dict
@@ -52,6 +53,7 @@ def __init__(
         n_clusters: int = None,
         eigen_solver: str = "lanczos",
         n_components: int = None,
+        random_state: Union[int, None] = None,
         gamma: float = 1.0,
         affinity: str = "rbf",
         laplacian: str = "fully_connected",
@@ -64,6 +66,9 @@ def __init__(
         self.n_clusters = n_clusters
         self.eigen_solver = eigen_solver
         self.n_components = n_components if n_components is not None else n_clusters
+        self.random_state = random_state
+        if self.random_state is not None:
+            ht.random.seed(self.random_state)
         self.gamma = gamma
         self.affinity = affinity
         self.laplacian = laplacian
@@ -121,17 +126,18 @@ def labels_(self) -> DNDarray:
     def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool) -> DNDarray:
         """
         Set the diagonal of the matrix to a specific value.
+        Modified for dense PyTorch tensors from scikit-learn.manifold._spectral_embedding._set_diag
+        https://github.com/scikit-learn/scikit-learn/blob/031d2f83b7c9d1027d1477abb2bf34652621d603/sklearn/manifold/_spectral_embedding.py#L108
         """
-        # TODO see scikit learn
         n_nodes = x.shape[0]
-        # for now we assume that the matrix is dense
+        # we assume that the matrix is dense
         if norm_laplacian:
             # set diagonal to `value`
-            # tensor.view(-1) == ndarray.flat
+            # NB: tensor.view(-1) == ndarray.flat
             x.larray.view(-1)[:: n_nodes + 1] = value
         return x
 
-    def spectral_embedding(
+    def __spectral_embedding(
         self,
         x: DNDarray,
         n_components: int = 8,
@@ -232,25 +238,28 @@ def spectral_embedding(
     def fit(self, x: DNDarray):
         """
         Clusters dataset X via spectral embedding.
-        Computes the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of the graph
-        laplacian from the similarity matrix and fits the eigenvectors that correspond to the k lowest eigenvalues with
-        a seperate clustering algorithm (currently only kmeans is supported). Similarity metrics for adjacency
-        calculations are supported via spatial.distance. The eigenvalues and eigenvectors are computed by reducing the
-        Laplacian via lanczos iterations and using the torch eigenvalue solver on this smaller matrix. If other
-        eigenvalue decompostion methods are supported, this will be expanded.
+        Computes the low-dim representation by calculation of the spectral embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of the
+        graph laplacian computed from the similarity matrix. Similarity metrics for adjacency
+        calculations are supported via :func:`heat.spatial.distance`.
+
+        See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian.
 
         Parameters
         ----------
         x : DNDarray
             Training instances to cluster. Shape = (n_samples, n_features)
+
+        See Also
+        --------
+        :func:`__spectral_embedding`
         """
         # 1. input sanitation
         if not isinstance(x, DNDarray):
             raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}")
-        if x.split is not None and x.split != 0:
-            raise NotImplementedError("Not implemented for other splitting-axes")
+        if x.is_distributed() and x.split != 0:
+            raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
         # 2. Embed Dataset into lower-dimensional Eigenvector space
-        components = self.spectral_embedding(
+        components = self.__spectral_embedding(
             x, n_components=self.n_components, eigen_solver=self.eigen_solver
         )
 
@@ -284,8 +293,8 @@ def fit_predict(self, x: DNDarray) -> DNDarray:
     def predict(self, x: DNDarray) -> DNDarray:
         """
         Return the label each sample in X belongs to.
-        X is transformed to the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of
-        the graph laplacian from the similarity matrix. Inference of lables is done by extraction of the closest
+        X is transformed to the low-dim representation by calculation of the embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of
+        the graph laplacian from the similarity matrix. Inference of labels is done by extraction of the closest
         centroid of the n_clusters eigenvectors from the previously fitted clustering algorithm (kmeans).
 
         Parameters
@@ -301,11 +310,10 @@ def predict(self, x: DNDarray) -> DNDarray:
         # input sanitation
         if not isinstance(x, DNDarray):
             raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}")
-        if x.split is not None and x.split != 0:
-            raise NotImplementedError("Not implemented for other splitting-axes")
-
-        _, eigenvectors = self.spectral_embedding(x, self.eigen_solver)
+        if x.is_distributed() and x.split != 0:
+            raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
 
-        components = eigenvectors[:, : self.n_clusters].copy()
+        # TODO is copy necessary?
+        components = self.__spectral_embedding(x, self.eigen_solver).copy()
 
         return self._cluster.predict(components)

From 34883fc9eca7f8894dd738f8c5668eda53af8ea6 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:46:44 +0200
Subject: [PATCH 11/33] introduce method DNDarray.diagonal

---
 heat/core/manipulations.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/heat/core/manipulations.py b/heat/core/manipulations.py
index 30f5b917cf..41df7ba6a1 100644
--- a/heat/core/manipulations.py
+++ b/heat/core/manipulations.py
@@ -867,6 +867,10 @@ def diagonal(a: DNDarray, offset: int = 0, dim1: int = 0, dim2: int = 1) -> DNDa
     return factories.array(result, dtype=a.dtype, is_split=split, device=a.device, comm=a.comm)
 
 
+DNDarray.diagonal = lambda self, offset=0, dim1=0, dim2=1: diagonal(self, offset, dim1, dim2)
+DNDarray.diagonal.__doc__ = diagonal.__doc__
+
+
 def dsplit(x: Sequence[DNDarray, ...], indices_or_sections: Iterable) -> List[DNDarray, ...]:
     """
     Split array into multiple sub-DNDarrays along the 3rd axis (depth).

From 8dce6fad2caf0148302eadb0333ec3a5ba249126 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:47:17 +0200
Subject: [PATCH 12/33] update parameters

---
 heat/cluster/tests/test_spectral.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index b7bc456dc4..73e0df4c5b 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -21,7 +21,9 @@ def test_get_and_set_params(self):
             params,
             {
                 "n_clusters": None,
-                "eigen_solver": "lanczos",
+                "eigen_solver": "zolotarev",
+                "n_components": None,
+                "random_state": None,
                 "gamma": 1.0,
                 "affinity": "rbf",
                 "laplacian": "fully_connected",

From 7b07624109c3ff451c9d8e7cc19640f9e11c1706 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:47:57 +0200
Subject: [PATCH 13/33] set default eigendec to zolotarev

---
 heat/cluster/spectral.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index cc030c8572..af11291329 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -18,7 +18,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     ----------
     n_clusters : int, default=8
         Number of clusters to fit
-    eigen_solver : str, default='lanczos'
+    eigen_solver : str, default='zolotarev'
         Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'.
     n_components : int, default=None
         Number of components to use for the embedding. If None, n_clusters is used
@@ -51,7 +51,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     def __init__(
         self,
         n_clusters: int = None,
-        eigen_solver: str = "lanczos",
+        eigen_solver: str = "zolotarev",
         n_components: int = None,
         random_state: Union[int, None] = None,
         gamma: float = 1.0,

From f932362572e78ac58cf382cdfd5f402b2ca8b62b Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:56:38 +0200
Subject: [PATCH 14/33] fix tests

---
 heat/cluster/spectral.py            | 2 +-
 heat/cluster/tests/test_spectral.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index af11291329..5969bf131e 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -50,7 +50,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
 
     def __init__(
         self,
-        n_clusters: int = None,
+        n_clusters: int = 8,
         eigen_solver: str = "zolotarev",
         n_components: int = None,
         random_state: Union[int, None] = None,
diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index 73e0df4c5b..e6a13db62b 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -20,9 +20,9 @@ def test_get_and_set_params(self):
         self.assertEqual(
             params,
             {
-                "n_clusters": None,
+                "n_clusters": 8,
                 "eigen_solver": "zolotarev",
-                "n_components": None,
+                "n_components": 8,
                 "random_state": None,
                 "gamma": 1.0,
                 "affinity": "rbf",

From f45c67d14a4a15d074e7cfc1b981800eb6fa04db Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 17 Jun 2025 15:18:10 +0200
Subject: [PATCH 15/33] expand tests

---
 heat/cluster/spectral.py            | 1 -
 heat/cluster/tests/test_spectral.py | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 5969bf131e..100d971815 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -288,7 +288,6 @@ def fit_predict(self, x: DNDarray) -> DNDarray:
         """
         self.fit(x)
         return self.labels_
-        # return self._cluster.predict(components)
 
     def predict(self, x: DNDarray) -> DNDarray:
         """
diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py
index e6a13db62b..800719a93c 100644
--- a/heat/cluster/tests/test_spectral.py
+++ b/heat/cluster/tests/test_spectral.py
@@ -45,12 +45,13 @@ def test_fit_iris(self):
             m = 10
             # fit the clusters
             spectral = ht.cluster.SpectralClustering(
-                n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m
+                n_clusters=3, random_state=0, gamma=1.0, affinity="rbf", laplacian="fully_connected"
             )
             spectral.fit(iris)
             self.assertIsInstance(spectral.labels_, ht.DNDarray)
 
             spectral = ht.cluster.SpectralClustering(
+                eigen_solver="lanczos",
                 affinity="euclidean",
                 laplacian="eNeighbour",
                 threshold=0.5,
@@ -80,7 +81,7 @@ def test_fit_iris(self):
 
             # Errors
             with self.assertRaises(NotImplementedError):
-                spectral = ht.cluster.SpectralClustering(affinity="ahalanobis", n_lanczos=m)
+                spectral = ht.cluster.SpectralClustering(affinity="mahalanobis", n_lanczos=m)
 
             iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1)
             spectral = ht.cluster.SpectralClustering(n_lanczos=20)

From fc36c8c301ad829b560550ef6b07f75081927d43 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Fri, 4 Jul 2025 11:52:45 +0200
Subject: [PATCH 16/33] spectral_emb instead of __spectral_emb

---
 heat/cluster/spectral.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 100d971815..09a7e68cff 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -137,7 +137,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool
             x.larray.view(-1)[:: n_nodes + 1] = value
         return x
 
-    def __spectral_embedding(
+    def spectral_embedding(
         self,
         x: DNDarray,
         n_components: int = 8,
@@ -242,7 +242,7 @@ def fit(self, x: DNDarray):
         graph laplacian computed from the similarity matrix. Similarity metrics for adjacency
         calculations are supported via :func:`heat.spatial.distance`.
 
-        See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian.
+        See :func:`spectral_embedding` for more details on the decomposition of the graph laplacian.
 
         Parameters
         ----------
@@ -251,7 +251,7 @@ def fit(self, x: DNDarray):
 
         See Also
         --------
-        :func:`__spectral_embedding`
+        :func:`spectral_embedding`
         """
         # 1. input sanitation
         if not isinstance(x, DNDarray):
@@ -259,7 +259,7 @@ def fit(self, x: DNDarray):
         if x.is_distributed() and x.split != 0:
             raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
         # 2. Embed Dataset into lower-dimensional Eigenvector space
-        components = self.__spectral_embedding(
+        components = self.spectral_embedding(
             x, n_components=self.n_components, eigen_solver=self.eigen_solver
         )
 
@@ -313,6 +313,6 @@ def predict(self, x: DNDarray) -> DNDarray:
             raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
 
         # TODO is copy necessary?
-        components = self.__spectral_embedding(x, self.eigen_solver).copy()
+        components = self.spectral_embedding(x, self.eigen_solver).copy()
 
         return self._cluster.predict(components)

From 45486af7f4d64f4684e9018069c0e948494fcb3a Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Fri, 4 Jul 2025 12:23:57 +0200
Subject: [PATCH 17/33] reinstate __spectral_amb

---
 heat/cluster/spectral.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 09a7e68cff..100d971815 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -137,7 +137,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool
             x.larray.view(-1)[:: n_nodes + 1] = value
         return x
 
-    def spectral_embedding(
+    def __spectral_embedding(
         self,
         x: DNDarray,
         n_components: int = 8,
@@ -242,7 +242,7 @@ def fit(self, x: DNDarray):
         graph laplacian computed from the similarity matrix. Similarity metrics for adjacency
         calculations are supported via :func:`heat.spatial.distance`.
 
-        See :func:`spectral_embedding` for more details on the decomposition of the graph laplacian.
+        See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian.
 
         Parameters
         ----------
@@ -251,7 +251,7 @@ def fit(self, x: DNDarray):
 
         See Also
         --------
-        :func:`spectral_embedding`
+        :func:`__spectral_embedding`
         """
         # 1. input sanitation
         if not isinstance(x, DNDarray):
@@ -259,7 +259,7 @@ def fit(self, x: DNDarray):
         if x.is_distributed() and x.split != 0:
             raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
         # 2. Embed Dataset into lower-dimensional Eigenvector space
-        components = self.spectral_embedding(
+        components = self.__spectral_embedding(
             x, n_components=self.n_components, eigen_solver=self.eigen_solver
         )
 
@@ -313,6 +313,6 @@ def predict(self, x: DNDarray) -> DNDarray:
             raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.")
 
         # TODO is copy necessary?
-        components = self.spectral_embedding(x, self.eigen_solver).copy()
+        components = self.__spectral_embedding(x, self.eigen_solver).copy()
 
         return self._cluster.predict(components)

From ee42e636291a10eb5ef8e01006b16b9f6ba875eb Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:15:16 +0100
Subject: [PATCH 18/33] Update docstring and args defaults

---
 heat/cluster/spectral.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index b842d83c95..3aaf2a7856 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -12,26 +12,26 @@
 
 class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     """
-    Spectral clustering of large memory-distributed datasets.
+    Spectral clustering of large memory-distributed arrays.
 
     Attributes
     ----------
-    n_clusters : int, default=8
+    n_clusters : int, optional
         Number of clusters to fit
     eigen_solver : str, default='randomized'
         The eigenvalue decomposition strategy to use.
-            - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver.
             - 'randomized' : Use a randomized algorithm to compute the approximate eigenvalues and eigenvectors.
-    n_components : int, default=None
-        Number of components to use for the embedding. If None, n_clusters is used
-    random_state : int, default=None
+            - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver.
+    n_components : int, optional
+        Number of components to use for the embedding. If None, it is set to ``n_clusters``
+    random_state : int, optional
         Random seed for reproducibility. If None, no random seed is set.
     gamma : float, default=1.0
         Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean'
     affinity : str, default='rbf'
         How to construct the similarity (affinity) matrix.
             - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel.
-            - 'euclidean' : construct the similarity matrix as only euclidean distance.
+            - 'euclidean' : construct the similarity matrix as euclidean distance.
             - 'precomputed' : interpret ``X`` as precomputed affinity matrix.
     laplacian : str, default='fully_connected'
         How to calculate the graph laplacian (affinity)
@@ -42,18 +42,18 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     boundary : str
         How to interpret threshold: 'upper', 'lower'
         Ignored for laplacian='fully_connected'
-    reigh_rank : int
+    reigh_rank : int, default: 100
         number of samples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'.
-        It must hold reigh_rank >= n_clusters. If n_clusters is None (automatic selection of number of clusters),
-        reigh_rank gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should
+        It must hold :math:`reigh_rank >= n_clusters`. If ``n_clusters`` is None (automatic selection of number of clusters),
+        ``reigh_rank`` gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should
         be set high enough to capture the expected number of clusters in that case.
-    reigh_n_oversamples : int
-        number of oversamples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 10.
-    reigh_power_iter : int
-        number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 0.
+    reigh_n_oversamples : int, default: 10
+        number of oversamples for randomized eigenvalue decomposition. Only used if ``eigen_solver``='randomized'. Default is 10.
+    reigh_power_iter : int, default: 0
+        number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. 
         Consider increasing this value if the eigen-spectrum of the Laplacian decays slowly.
-    lanczos_n_iter : int
-        number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. Default is 300.
+    lanczos_n_iter : int, default: 300
+        number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. 
     assign_labels: str, default='kmeans'
          The strategy to use to assign labels in the embedding space.
     **params: dict
@@ -62,9 +62,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
 
     def __init__(
         self,
-        n_clusters: int = None,
+        n_clusters: Union[int, None] = None,
         eigen_solver: str = "randomized",
-        n_components: int = None,
+        n_components: Union[int, None] = None,
         random_state: Union[int, None] = None,
         gamma: float = 1.0,
         affinity: str = "rbf",

From ad63d2ceacbab1fbd6cab24ab858361a686498a5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Mar 2026 09:15:26 +0000
Subject: [PATCH 19/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 heat/cluster/spectral.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 3aaf2a7856..fbb553e9a9 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -50,10 +50,10 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator):
     reigh_n_oversamples : int, default: 10
         number of oversamples for randomized eigenvalue decomposition. Only used if ``eigen_solver``='randomized'. Default is 10.
     reigh_power_iter : int, default: 0
-        number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. 
+        number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'.
         Consider increasing this value if the eigen-spectrum of the Laplacian decays slowly.
     lanczos_n_iter : int, default: 300
-        number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. 
+        number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'.
     assign_labels: str, default='kmeans'
          The strategy to use to assign labels in the embedding space.
     **params: dict

From 3414f197e18cbfd5ae7ede0a3f6b0d35e15a9d29 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:17:50 +0100
Subject: [PATCH 20/33] Update __spectral_embedding docs

---
 heat/cluster/spectral.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index fbb553e9a9..6c11e75d10 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -168,7 +168,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool
     def __spectral_embedding(
         self,
         x: DNDarray,
-        n_components: int = 8,
+        n_components: Union[int, None] = None,
         eigen_solver: str = "randomized",
         norm_laplacian: bool = True,
         drop_first: bool = True,
@@ -180,10 +180,10 @@ def __spectral_embedding(
         ----------
         x : DNDarray
             Sample Matrix for which the embedding should be calculated
-        n_components : int, default=8
-            Number of components to use for the embedding
-        eigen_solver : str
-            Eigenvalue decomposition strategy to use. Default is 'randomized' (see documentation in :func:`heat.linalg.reigh`).
+        n_components : int, optional
+            Number of components to use for the embedding. If ``n_components`` is None, it will be set to ``SpectralClustering.n_clusters``.
+        eigen_solver : str, default: `randomized`
+            Eigenvalue decomposition strategy to use.
             TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets?
         norm_laplacian : bool, default=True
             Whether to use the normalized Laplacian
@@ -193,7 +193,7 @@ def __spectral_embedding(
         See Also
         --------
         :func:`heat.linalg.lanczos`
-        :func:`heat.linalg.eigh`
+        :func:`heat.linalg.reigh`
         :func:`heat.linalg.polar`
 
         Notes
@@ -203,6 +203,9 @@ def __spectral_embedding(
         """
         L = self._laplacian.construct(x)
 
+        # After sklearn.manifold._spectral_embedding.py
+        # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/manifold/_spectral_embedding.py#L295
+
         # Eigenvalue and -vector calculation
         if eigen_solver == "randomized":
             L = self.__set_diag(L, 1, norm_laplacian)  # TODO should it be a method?

From 3b51b3c86cefeb629f294f0117154c876494a244 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:19:31 +0100
Subject: [PATCH 21/33] Edits

---
 heat/cluster/spectral.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 6c11e75d10..2762e6e145 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -208,7 +208,7 @@ def __spectral_embedding(
 
         # Eigenvalue and -vector calculation
         if eigen_solver == "randomized":
-            L = self.__set_diag(L, 1, norm_laplacian)  # TODO should it be a method?
+            L = self.__set_diag(L, 1, norm_laplacian)  
             # extract the diagonal
             dd = L.diagonal()
             # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK
@@ -219,7 +219,7 @@ def __spectral_embedding(
             # L *= -1
             ####
             # compute the randomized eigenvalue decomposition
-            # ht.linalg.reigh returns the eigenvalues in descending order
+            # NB:  ht.linalg.reigh returns the eigenvalues in descending order
             eval, evec = reigh(
                 L,
                 n_eigenvalues=self.reigh_rank,

From 3b386a93b3559193c8a97bddf23154f27e80d9af Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Mar 2026 09:19:40 +0000
Subject: [PATCH 22/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 heat/cluster/spectral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 2762e6e145..6a3b2e3eaa 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -208,7 +208,7 @@ def __spectral_embedding(
 
         # Eigenvalue and -vector calculation
         if eigen_solver == "randomized":
-            L = self.__set_diag(L, 1, norm_laplacian)  
+            L = self.__set_diag(L, 1, norm_laplacian)
             # extract the diagonal
             dd = L.diagonal()
             # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK

From 116ecef72fa203327ad3e66dfc797be982f025f7 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:21:11 +0100
Subject: [PATCH 23/33] Remove dead code

---
 heat/cluster/spectral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 6a3b2e3eaa..15764f907d 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -231,7 +231,7 @@ def __spectral_embedding(
                 diff = eval[1:] - eval[:-1]
                 tmp = ht.argmax(diff).item()
                 self.n_clusters = tmp + 1
-            # select the first n_components
+            # select the largest n_components
             embedding = evec.T[:n_components]
             if norm_laplacian:
                 # REVERT FROM DIVISION TO MULTIPLICATION

From 82b0eb66bbced49dbfe8f4fbc8ade32eb953c720 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:22:09 +0100
Subject: [PATCH 24/33] Remove dead code

---
 heat/cluster/spectral.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 15764f907d..1c15b7660d 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -184,7 +184,6 @@ def __spectral_embedding(
             Number of components to use for the embedding. If ``n_components`` is None, it will be set to ``SpectralClustering.n_clusters``.
         eigen_solver : str, default: `randomized`
             Eigenvalue decomposition strategy to use.
-            TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets?
         norm_laplacian : bool, default=True
             Whether to use the normalized Laplacian
         drop_first : bool, default=True

From 1c40715b8fdd6eb6efd887324222c49b3d095900 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:22:51 +0100
Subject: [PATCH 25/33] Remove outdated reference to linalg.polar

---
 heat/cluster/spectral.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 1c15b7660d..4552e656bb 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -193,7 +193,6 @@ def __spectral_embedding(
         --------
         :func:`heat.linalg.lanczos`
         :func:`heat.linalg.reigh`
-        :func:`heat.linalg.polar`
 
         Notes
         -----

From bee92cca83176bbd372031b6fb5089eb81998f0e Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:23:21 +0100
Subject: [PATCH 26/33] remove dead code

---
 heat/cluster/spectral.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 4552e656bb..3c7c1487f9 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -209,13 +209,6 @@ def __spectral_embedding(
             L = self.__set_diag(L, 1, norm_laplacian)
             # extract the diagonal
             dd = L.diagonal()
-            # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK
-            # keeping comment below as historical record for now
-            ####
-            # # We are computing the opposite of the laplacian inplace so as
-            # # to spare a memory allocation of a possibly very large array
-            # L *= -1
-            ####
             # compute the randomized eigenvalue decomposition
             # NB:  ht.linalg.reigh returns the eigenvalues in descending order
             eval, evec = reigh(

From f71a0752f068e8289754e608475ade6bea9f58ec Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:24:24 +0100
Subject: [PATCH 27/33] sklearn API consistency

---
 heat/cluster/spectral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 3c7c1487f9..10f41ad057 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -213,7 +213,7 @@ def __spectral_embedding(
             # NB:  ht.linalg.reigh returns the eigenvalues in descending order
             eval, evec = reigh(
                 L,
-                n_eigenvalues=self.reigh_rank,
+                n_eigenvalues=n_components,
                 n_oversamples=self.reigh_n_oversamples,
                 power_iter=self.reigh_power_iter,
             )

From 03bbc018558b2911715919f04bbf4257ed56a96d Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:28:35 +0100
Subject: [PATCH 28/33] __spectral_embedding returns embedding only

Change return type from Tuple to DNDarray for the function.
---
 heat/cluster/spectral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 10f41ad057..6439f4e4fb 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -172,7 +172,7 @@ def __spectral_embedding(
         eigen_solver: str = "randomized",
         norm_laplacian: bool = True,
         drop_first: bool = True,
-    ) -> Tuple[DNDarray, DNDarray]:
+    ) -> DNDarray:
         """
         Returns the embedding (eigenvectors) of the graph's Laplacian matrix.
 

From 870dc7a06f8c2876bc2b77911026d42e1cca162e Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:35:12 +0100
Subject: [PATCH 29/33] assess n_components before decomposition

---
 heat/cluster/spectral.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 6439f4e4fb..ba56e35749 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -209,6 +209,18 @@ def __spectral_embedding(
             L = self.__set_diag(L, 1, norm_laplacian)
             # extract the diagonal
             dd = L.diagonal()
+            
+            # assess n_components = n_clusters if not specified by user
+            if self.n_clusters is None:
+                # set n_clusters to spectral gap
+                diff = eval[1:] - eval[:-1]
+                tmp = ht.argmax(diff).item()
+                self.n_clusters = tmp + 1
+            if n_components is None:
+                n_components = self.n_clusters
+            if drop_first:
+                n_components += 1
+                
             # compute the randomized eigenvalue decomposition
             # NB:  ht.linalg.reigh returns the eigenvalues in descending order
             eval, evec = reigh(
@@ -217,11 +229,7 @@ def __spectral_embedding(
                 n_oversamples=self.reigh_n_oversamples,
                 power_iter=self.reigh_power_iter,
             )
-            # Set n_clusters to spectral gap, if it is not defined by the user
-            if self.n_clusters is None:
-                diff = eval[1:] - eval[:-1]
-                tmp = ht.argmax(diff).item()
-                self.n_clusters = tmp + 1
+
             # select the largest n_components
             embedding = evec.T[:n_components]
             if norm_laplacian:

From 673d43d59e19799bc9ebdfe15393c1c65207e393 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Mar 2026 09:35:25 +0000
Subject: [PATCH 30/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 heat/cluster/spectral.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index ba56e35749..49f4a8e3b4 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -209,7 +209,7 @@ def __spectral_embedding(
             L = self.__set_diag(L, 1, norm_laplacian)
             # extract the diagonal
             dd = L.diagonal()
-            
+
             # assess n_components = n_clusters if not specified by user
             if self.n_clusters is None:
                 # set n_clusters to spectral gap
@@ -220,7 +220,7 @@ def __spectral_embedding(
                 n_components = self.n_clusters
             if drop_first:
                 n_components += 1
-                
+
             # compute the randomized eigenvalue decomposition
             # NB:  ht.linalg.reigh returns the eigenvalues in descending order
             eval, evec = reigh(

From d5ca6d0f325583b5e9e88d4a25f29da6af93eafb Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:36:26 +0100
Subject: [PATCH 31/33] apply standard normalization

---
 heat/cluster/spectral.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 49f4a8e3b4..90f498630c 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -233,10 +233,7 @@ def __spectral_embedding(
             # select the largest n_components
             embedding = evec.T[:n_components]
             if norm_laplacian:
-                # REVERT FROM DIVISION TO MULTIPLICATION
-                # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62)
-                # TODO: generalize / adapt to current scikit-learn
-                embedding = embedding * dd
+                embedding = embedding / dd
             # Drop smallest component if requested
             if drop_first:
                 embedding = embedding[:-1]

From ee9bdabc6fc60043faaae9f9cd56dd7ad5192a49 Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:37:33 +0100
Subject: [PATCH 32/33] Update error message

---
 heat/cluster/spectral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 90f498630c..2f7993cb8a 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -273,7 +273,7 @@ def __spectral_embedding(
                 embedding = embedding[1:]
         else:
             raise NotImplementedError(
-                f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'zolotarev' and 'lanczos'."
+                f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'randomized' and 'lanczos'."
             )
         # Return the embedding transposed back to original shape
         return embedding.T

From 68a31c4a3dbcb557039065ad1088b38c06e489ea Mon Sep 17 00:00:00 2001
From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:47:48 +0100
Subject: [PATCH 33/33] Refactor comments in spectral.py for clarity

Removed outdated notes about eigenvalues and added comments regarding the Laplacian matrix properties.
---
 heat/cluster/spectral.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py
index 2f7993cb8a..cb195cd53a 100644
--- a/heat/cluster/spectral.py
+++ b/heat/cluster/spectral.py
@@ -193,11 +193,6 @@ def __spectral_embedding(
         --------
         :func:`heat.linalg.lanczos`
         :func:`heat.linalg.reigh`
-
-        Notes
-        -----
-        The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors
-        are real. TODO: check if this is still correct
         """
         L = self._laplacian.construct(x)
 
@@ -267,6 +262,10 @@ def __spectral_embedding(
                 tmp = torch.argmax(diff).item()
                 self.n_clusters = tmp + 1
             # n_components = n_clusters if not specified
+            if n_components is None:
+                n_components = self.n_clusters
+            # the Laplacian is symmetric and the eigenvectors are real
+            # TODO: check the returned order of evec, eval here
             embedding = eigenvectors[:n_components].real
             # Drop smallest component if requested
             if drop_first: