From 3fd75a97618cb9096c63a2fff0e1f6ff6cd9fef7 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:02:52 +0100 Subject: [PATCH 01/33] refactor API, add `precomputed` affinity --- heat/cluster/spectral.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 39bec27d32..9759d3ec30 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -9,7 +9,7 @@ from heat.core.dndarray import DNDarray -class Spectral(ht.ClusteringMixin, ht.BaseEstimator): +class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): """ Spectral clustering @@ -18,18 +18,19 @@ class Spectral(ht.ClusteringMixin, ht.BaseEstimator): n_clusters : int Number of clusters to fit gamma : float - Kernel coefficient sigma for 'rbf', ignored for metric='euclidean' - metric : string - How to construct the similarity matrix. + Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' + affinity : string + How to construct the similarity (affinity) matrix. - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel. - 'euclidean' : construct the similarity matrix as only euclidean distance. + - 'precomputed' : interpret ``X`` as precomputed affinity matrix. laplacian : str How to calculate the graph laplacian (affinity) Currently supported : 'fully_connected', 'eNeighbour' threshold : float Threshold for affinity matrix if laplacian='eNeighbour' - Ignorded for laplacian='fully_connected' + Ignored for laplacian='fully_connected' boundary : str How to interpret threshold: 'upper', 'lower' Ignorded for laplacian='fully_connected' @@ -45,7 +46,7 @@ def __init__( self, n_clusters: int = None, gamma: float = 1.0, - metric: str = "rbf", + affinity: str = "rbf", laplacian: str = "fully_connected", threshold: float = 1.0, boundary: str = "upper", @@ -55,14 +56,14 @@ def __init__( ): self.n_clusters = n_clusters self.gamma = gamma - self.metric = metric + self.affinity = affinity self.laplacian = laplacian self.threshold = threshold self.boundary = boundary self.n_lanczos = n_lanczos self.assign_labels = assign_labels - if metric == "rbf": + if affinity == "rbf": sig = math.sqrt(1 / (2 * gamma)) self._laplacian = ht.graph.Laplacian( lambda x: ht.spatial.rbf(x, sigma=sig, quadratic_expansion=True), @@ -72,7 +73,7 @@ def __init__( threshold_value=threshold, ) - elif metric == "euclidean": + elif affinity == "euclidean": self._laplacian = ht.graph.Laplacian( lambda x: ht.spatial.cdist(x, quadratic_expansion=True), definition="norm_sym", @@ -80,6 +81,14 @@ def __init__( threshold_key=boundary, threshold_value=threshold, ) + elif affinity == "precomputed": + self._laplacian = ht.graph.Laplacian( + lambda x: x, + definition="norm_sym", + mode=laplacian, + threshold_key=boundary, + threshold_value=threshold, + ) else: raise NotImplementedError("Other kernels currently not supported") From 3114725188c38c8c8c6b326f8063e6d91d568e88 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:03:14 +0100 Subject: [PATCH 02/33] adapt tests --- heat/cluster/tests/test_spectral.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index 9e24dddfc5..bb80a4e88a 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -8,12 +8,12 @@ class TestSpectral(TestCase): def test_clusterer(self): - spectral = ht.cluster.Spectral() + spectral = ht.cluster.SpectralClustering() self.assertTrue(ht.is_estimator(spectral)) self.assertTrue(ht.is_clusterer(spectral)) def test_get_and_set_params(self): - spectral = ht.cluster.Spectral() + spectral = ht.cluster.SpectralClustering() params = spectral.get_params() self.assertEqual( @@ -21,7 +21,7 @@ def test_get_and_set_params(self): { "n_clusters": None, "gamma": 1.0, - "metric": "rbf", + "affinity": "rbf", "laplacian": "fully_connected", "threshold": 1.0, "boundary": "upper", @@ -39,14 +39,14 @@ def test_fit_iris(self): iris = ht.load("heat/datasets/iris.csv", sep=";", split=0) m = 10 # fit the clusters - spectral = ht.cluster.Spectral( - n_clusters=3, gamma=1.0, metric="rbf", laplacian="fully_connected", n_lanczos=m + spectral = ht.cluster.SpectralClustering( + n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m ) spectral.fit(iris) self.assertIsInstance(spectral.labels_, ht.DNDarray) - spectral = ht.cluster.Spectral( - metric="euclidean", + spectral = ht.cluster.SpectralClustering( + affinity="euclidean", laplacian="eNeighbour", threshold=0.5, boundary="upper", @@ -55,9 +55,9 @@ def test_fit_iris(self): labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) - spectral = ht.cluster.Spectral( + spectral = ht.cluster.SpectralClustering( gamma=0.1, - metric="rbf", + affinity="rbf", laplacian="eNeighbour", threshold=0.5, boundary="upper", @@ -67,7 +67,7 @@ def test_fit_iris(self): self.assertIsInstance(labels, ht.DNDarray) kmeans = {"kmeans++": "kmeans++", "max_iter": 30, "tol": -1} - spectral = ht.cluster.Spectral( + spectral = ht.cluster.SpectralClustering( n_clusters=3, gamma=1.0, normalize=True, n_lanczos=m, params=kmeans ) labels = spectral.fit_predict(iris) @@ -75,9 +75,9 @@ def test_fit_iris(self): # Errors with self.assertRaises(NotImplementedError): - spectral = ht.cluster.Spectral(metric="ahalanobis", n_lanczos=m) + spectral = ht.cluster.SpectralClustering(affinity="ahalanobis", n_lanczos=m) iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) - spectral = ht.cluster.Spectral(n_lanczos=20) + spectral = ht.cluster.SpectralClustering(n_lanczos=20) with self.assertRaises(NotImplementedError): spectral.fit(iris_split) From 441ad2ca1f98c96c19702c37301b3dd89fccfc5f Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Fri, 21 Mar 2025 10:03:32 +0100 Subject: [PATCH 03/33] introduce eigen_solver parameter --- heat/cluster/spectral.py | 74 +++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 9759d3ec30..1de89333aa 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -11,21 +11,23 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): """ - Spectral clustering + Spectral clustering of large memory-distributed datasets. Attributes ---------- - n_clusters : int + n_clusters : int, default=8 Number of clusters to fit - gamma : float + eigen_solver : str, default='lanczos' + Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported + gamma : float, default=1.0 Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' - affinity : string + affinity : str, default='rbf' How to construct the similarity (affinity) matrix. - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel. - 'euclidean' : construct the similarity matrix as only euclidean distance. - 'precomputed' : interpret ``X`` as precomputed affinity matrix. - laplacian : str + laplacian : str, default='fully_connected' How to calculate the graph laplacian (affinity) Currently supported : 'fully_connected', 'eNeighbour' threshold : float @@ -34,9 +36,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): boundary : str How to interpret threshold: 'upper', 'lower' Ignorded for laplacian='fully_connected' - n_lanczos : int + n_lanczos : int, default=300 number of Lanczos iterations for Eigenvalue decomposition - assign_labels: str + assign_labels: str, default='kmeans' The strategy to use to assign labels in the embedding space. **params: dict Parameter dictionary for the assign_labels estimator @@ -45,6 +47,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): def __init__( self, n_clusters: int = None, + eigen_solver: str = "lanczos", gamma: float = 1.0, affinity: str = "rbf", laplacian: str = "fully_connected", @@ -55,6 +58,7 @@ def __init__( **params, ): self.n_clusters = n_clusters + self.eigen_solver = eigen_solver self.gamma = gamma self.affinity = affinity self.laplacian = laplacian @@ -109,34 +113,39 @@ def labels_(self) -> DNDarray: """ return self._labels - def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]: + def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray, DNDarray]: """ - Helper function for dataset x embedding. - Returns Tupel(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix. + Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix. Parameters ---------- x : DNDarray Sample Matrix for which the embedding should be calculated + eigen_solver : str + Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported + + See Also + -------- + :func:`heat.linalg.lanczos` Notes ----- - This will throw out the complex side of the eigenvalues found during this. - + The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors + are real. """ L = self._laplacian.construct(x) - # 3. Eigenvalue and -vector calculation via Lanczos Algorithm - v0 = ht.full( - (L.shape[0],), - fill_value=1.0 / math.sqrt(L.shape[0]), - dtype=L.dtype, - split=0, - device=L.device, - ) - V, T = ht.lanczos(L, self.n_lanczos, v0) - - # if int(torch.__version__.split(".")[1]) >= 9: - try: + # 3. Eigenvalue and -vector calculation + # for now via Lanczos Algorithm only + if eigen_solver == "lanczos": + v0 = ht.full( + (L.shape[0],), + fill_value=1.0 / math.sqrt(L.shape[0]), + dtype=L.dtype, + split=0, + device=L.device, + ) + V, T = ht.lanczos(L, self.n_lanczos, v0) + # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T eval, evec = torch.linalg.eig(T.larray) @@ -146,15 +155,10 @@ def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]: eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] return eigenvalues.real, eigenvectors.real - except AttributeError: # torch version is less than 1.9.0 - # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T - eval, evec = torch.eig(T.larray, eigenvectors=True) - # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L - eval, idx = torch.sort(eval[:, 0], dim=0) - eigenvalues = ht.array(eval) - eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] - - return eigenvalues, eigenvectors + else: + raise NotImplementedError( + "Other Eigenvalue Decomposition methods are not yet supported" + ) def fit(self, x: DNDarray): """ @@ -177,7 +181,7 @@ def fit(self, x: DNDarray): if x.split is not None and x.split != 0: raise NotImplementedError("Not implemented for other splitting-axes") # 2. Embed Dataset into lower-dimensional Eigenvector space - eigenvalues, eigenvectors = self._spectral_embedding(x) + eigenvalues, eigenvectors = self.spectral_embedding(x, self.eigen_solver) # 3. Find the spectral gap, if number of clusters is not defined from the outside if self.n_clusters is None: @@ -219,7 +223,7 @@ def predict(self, x: DNDarray) -> DNDarray: if x.split is not None and x.split != 0: raise NotImplementedError("Not implemented for other splitting-axes") - _, eigenvectors = self._spectral_embedding(x) + _, eigenvectors = self.spectral_embedding(x, self.eigen_solver) components = eigenvectors[:, : self.n_clusters].copy() From bd8033376575bc5eb11bc76c9a046aca89a7b017 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Fri, 21 Mar 2025 10:03:50 +0100 Subject: [PATCH 04/33] adapt tests --- heat/cluster/tests/test_spectral.py | 1 + 1 file changed, 1 insertion(+) diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index bb80a4e88a..8ebb2fd657 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -20,6 +20,7 @@ def test_get_and_set_params(self): params, { "n_clusters": None, + "eigen_solver": "lanczos", "gamma": 1.0, "affinity": "rbf", "laplacian": "fully_connected", From 0ec780e244e43604d166b478a2e43e1093c65f7c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Apr 2025 09:35:21 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- heat/cluster/tests/test_spectral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index 6b334cff3d..b7bc456dc4 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -37,14 +37,14 @@ def test_get_and_set_params(self): self.assertEqual(10, spectral.n_clusters) def test_fit_iris(self): - if not self.is_mps: + if not self.is_mps: # get some test data iris = ht.load("heat/datasets/iris.csv", sep=";", split=0) m = 10 # fit the clusters spectral = ht.cluster.SpectralClustering( n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m - ) + ) spectral.fit(iris) self.assertIsInstance(spectral.labels_, ht.DNDarray) @@ -83,4 +83,4 @@ def test_fit_iris(self): iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) spectral = ht.cluster.SpectralClustering(n_lanczos=20) with self.assertRaises(NotImplementedError): - spectral.fit(iris_split) \ No newline at end of file + spectral.fit(iris_split) From 1fb1fb061c29e170552c493ad3de7cbbe3524c3f Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 6 May 2025 05:17:42 +0200 Subject: [PATCH 06/33] add fit_predict --- heat/cluster/spectral.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 1de89333aa..3866e1003a 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -200,6 +200,24 @@ def fit(self, x: DNDarray): return self + def fit_predict(self, x: DNDarray) -> DNDarray: + """ + Fit the model to the data and return the labels. + + Parameters + ---------- + x : DNDarray + Training instances to cluster. Shape = (n_samples, n_features) + + Returns + ------- + DNDarray + Labels of each point. + """ + self.fit(x) + return self.labels_ + # return self._cluster.predict(components) + def predict(self, x: DNDarray) -> DNDarray: """ Return the label each sample in X belongs to. From 8868d34cf4ff7f31135848ec2a503c5adb9f26d3 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 20 May 2025 10:46:00 +0200 Subject: [PATCH 07/33] support eigh decomposition --- heat/cluster/spectral.py | 76 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 7 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 3866e1003a..e846824835 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -5,8 +5,9 @@ import heat as ht import math import torch -from typing import Tuple +from typing import Tuple, Union from heat.core.dndarray import DNDarray +from heat.core.linalg import eigh class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): @@ -18,7 +19,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): n_clusters : int, default=8 Number of clusters to fit eigen_solver : str, default='lanczos' - Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported + Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'. + n_components : int, default=None + Number of components to use for the embedding. If None, n_clusters is used gamma : float, default=1.0 Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' affinity : str, default='rbf' @@ -113,7 +116,27 @@ def labels_(self) -> DNDarray: """ return self._labels - def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray, DNDarray]: + def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool) -> DNDarray: + """ + Set the diagonal of the matrix to a specific value. + """ + # TODO see scikit learn + n_nodes = x.shape[0] + # for now we assume that the matrix is dense + if norm_laplacian: + # set diagonal to `value` + # tensor.view(-1) == ndarray.flat + x.larray.view(-1)[:: n_nodes + 1] = value + return x + + def spectral_embedding( + self, + x: DNDarray, + n_components: int = 8, + eigen_solver: str = "zolotarev", + norm_laplacian: bool = True, + drop_first: bool = True, + ) -> Tuple[DNDarray, DNDarray]: """ Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix. @@ -121,22 +144,61 @@ def spectral_embedding(self, x: DNDarray, eigen_solver: str) -> Tuple[DNDarray, ---------- x : DNDarray Sample Matrix for which the embedding should be calculated + n_components : int, default=8 + Number of components to use for the embedding eigen_solver : str - Eigenvalue decomposition strategy to use. Currently only 'lanczos' is supported + Eigenvalue decomposition strategy to use. Default is 'zolotarev' #TODO expand + norm_laplacian : bool, default=True + Whether to use the normalized Laplacian + drop_first : bool, default=True + Whether to drop the first component (the smallest eigenvalue) See Also -------- :func:`heat.linalg.lanczos` + :func:`heat.linalg.eigh` + :func:`heat.linalg.polar` Notes ----- The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors - are real. + are real. TODO: check if this is still correct """ L = self._laplacian.construct(x) # 3. Eigenvalue and -vector calculation - # for now via Lanczos Algorithm only - if eigen_solver == "lanczos": + if eigen_solver == "zolotarev": + # TODO: adapt this to Heat + L = self.__set_diag(L, 1, norm_laplacian) # TODO should it be a method? + # extract the diagonal + dd = L.diagonal() + # try: + # we skip multiplying by -1 as we are not using ARPACK + # keeping comment below as historical record for now + #### + # # We are computing the opposite of the laplacian inplace so as + # # to spare a memory allocation of a possibly very large array + # L *= -1 + #### + # compute the Zolo-PD eigenvalue decomposition + _, diffusion_map = eigh(L) + # ht.linalg.eigh returns the eigenvalues in descending order + # select the first n_components, no need to reverse order + diffusion_map = diffusion_map[:, :n_components] + embedding = diffusion_map.T + if norm_laplacian: + # REVERT FROM DIVISION TO MULTIPLICATION + # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62) + # TODO: generalize / adapt to current scikit-learn + embedding = embedding * dd + # except RuntimeError: + # # When submatrices are exactly singular, an LU decomposition + # # in arpack fails. We fallback to lobpcg + # eigen_solver = "lobpcg" + # # Revert the laplacian to its opposite to have lobpcg work + # L *= -1 + + # Lanczos Algorithm + elif eigen_solver == "lanczos": v0 = ht.full( (L.shape[0],), fill_value=1.0 / math.sqrt(L.shape[0]), From 631a4aa7e34fa5e29522960f7f716605ed7af3fe Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 20 May 2025 10:52:05 +0200 Subject: [PATCH 08/33] add argument n_components --- heat/cluster/spectral.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index e846824835..b91f795658 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -51,6 +51,7 @@ def __init__( self, n_clusters: int = None, eigen_solver: str = "lanczos", + n_components: int = None, gamma: float = 1.0, affinity: str = "rbf", laplacian: str = "fully_connected", @@ -62,6 +63,7 @@ def __init__( ): self.n_clusters = n_clusters self.eigen_solver = eigen_solver + self.n_components = n_components self.gamma = gamma self.affinity = affinity self.laplacian = laplacian @@ -167,7 +169,6 @@ def spectral_embedding( L = self._laplacian.construct(x) # 3. Eigenvalue and -vector calculation if eigen_solver == "zolotarev": - # TODO: adapt this to Heat L = self.__set_diag(L, 1, norm_laplacian) # TODO should it be a method? # extract the diagonal dd = L.diagonal() @@ -215,7 +216,8 @@ def spectral_embedding( eval, idx = torch.sort(eval.real, dim=0) eigenvalues = ht.array(eval) eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] - + # TODO: spectral_embedding should only return the embedding (eigenvectors) + # TODO: move components calculation from fit() to here and only return n_components eigenvalues as embedding return eigenvalues.real, eigenvectors.real else: raise NotImplementedError( From 46fa36dad0e03a42543ddfd6c3ea5994fbad0cae Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Fri, 6 Jun 2025 10:30:54 +0200 Subject: [PATCH 09/33] return n_components eigenval only --- heat/cluster/spectral.py | 53 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index b91f795658..3019c22487 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -63,7 +63,7 @@ def __init__( ): self.n_clusters = n_clusters self.eigen_solver = eigen_solver - self.n_components = n_components + self.n_components = n_components if n_components is not None else n_clusters self.gamma = gamma self.affinity = affinity self.laplacian = laplacian @@ -140,7 +140,7 @@ def spectral_embedding( drop_first: bool = True, ) -> Tuple[DNDarray, DNDarray]: """ - Returns Tuple(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix. + Returns the embedding (eigenvectors) of the graph's Laplacian matrix. Parameters ---------- @@ -149,7 +149,8 @@ def spectral_embedding( n_components : int, default=8 Number of components to use for the embedding eigen_solver : str - Eigenvalue decomposition strategy to use. Default is 'zolotarev' #TODO expand + Eigenvalue decomposition strategy to use. Default is 'zolotarev' (see documentation in :func:`heat.linalg.polar`). + TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets? norm_laplacian : bool, default=True Whether to use the normalized Laplacian drop_first : bool, default=True @@ -172,7 +173,6 @@ def spectral_embedding( L = self.__set_diag(L, 1, norm_laplacian) # TODO should it be a method? # extract the diagonal dd = L.diagonal() - # try: # we skip multiplying by -1 as we are not using ARPACK # keeping comment below as historical record for now #### @@ -184,19 +184,12 @@ def spectral_embedding( _, diffusion_map = eigh(L) # ht.linalg.eigh returns the eigenvalues in descending order # select the first n_components, no need to reverse order - diffusion_map = diffusion_map[:, :n_components] - embedding = diffusion_map.T + embedding = diffusion_map.T[:n_components] if norm_laplacian: # REVERT FROM DIVISION TO MULTIPLICATION # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62) # TODO: generalize / adapt to current scikit-learn embedding = embedding * dd - # except RuntimeError: - # # When submatrices are exactly singular, an LU decomposition - # # in arpack fails. We fallback to lobpcg - # eigen_solver = "lobpcg" - # # Revert the laplacian to its opposite to have lobpcg work - # L *= -1 # Lanczos Algorithm elif eigen_solver == "lanczos": @@ -213,16 +206,28 @@ def spectral_embedding( eval, evec = torch.linalg.eig(T.larray) # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L + # sort local eval, and evec accordingly eval, idx = torch.sort(eval.real, dim=0) - eigenvalues = ht.array(eval) - eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] - # TODO: spectral_embedding should only return the embedding (eigenvectors) - # TODO: move components calculation from fit() to here and only return n_components eigenvalues as embedding - return eigenvalues.real, eigenvectors.real + evec = evec[:, idx] + # Calculate global eigenvectors of L + eigenvectors = ht.matmul(V, ht.array(evec)) + # transpose to have the eigenvectors as rows + eigenvectors = eigenvectors.T + embedding = eigenvectors[:n_components].real + # Set n_clusters to spectral gap, if it is not defined by the user + if self.n_clusters is None: + diff = eval[1:] - eval[:-1] + tmp = torch.argmax(diff).item() + self.n_clusters = tmp + 1 else: raise NotImplementedError( - "Other Eigenvalue Decomposition methods are not yet supported" + f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'zolotarev' and 'lanczos'." ) + # 5. Drop first component if requested + if drop_first: + embedding = embedding[1:] + # 6. Return the embedding transposed back to original shape + return embedding.T def fit(self, x: DNDarray): """ @@ -245,15 +250,9 @@ def fit(self, x: DNDarray): if x.split is not None and x.split != 0: raise NotImplementedError("Not implemented for other splitting-axes") # 2. Embed Dataset into lower-dimensional Eigenvector space - eigenvalues, eigenvectors = self.spectral_embedding(x, self.eigen_solver) - - # 3. Find the spectral gap, if number of clusters is not defined from the outside - if self.n_clusters is None: - diff = eigenvalues[1:] - eigenvalues[:-1] - tmp = ht.argmax(diff).item() - self.n_clusters = tmp + 1 - - components = eigenvectors[:, : self.n_clusters].copy() + components = self.spectral_embedding( + x, n_components=self.n_components, eigen_solver=self.eigen_solver + ) params = self._cluster.get_params() params["n_clusters"] = self.n_clusters From 07c140adacc42294aa612330b00fc6d427450649 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:34:09 +0200 Subject: [PATCH 10/33] introduce random_state and docs editing --- heat/cluster/spectral.py | 54 +++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 3019c22487..cc030c8572 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -22,11 +22,12 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'. n_components : int, default=None Number of components to use for the embedding. If None, n_clusters is used + random_state : int, default=None + Random seed for reproducibility. If None, no random seed is set. gamma : float, default=1.0 Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' affinity : str, default='rbf' How to construct the similarity (affinity) matrix. - - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel. - 'euclidean' : construct the similarity matrix as only euclidean distance. - 'precomputed' : interpret ``X`` as precomputed affinity matrix. @@ -38,9 +39,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): Ignored for laplacian='fully_connected' boundary : str How to interpret threshold: 'upper', 'lower' - Ignorded for laplacian='fully_connected' + Ignored for laplacian='fully_connected' n_lanczos : int, default=300 - number of Lanczos iterations for Eigenvalue decomposition + number of Lanczos iterations for Eigenvalue decomposition. Ignored if eigen_solver='zolotarev'. assign_labels: str, default='kmeans' The strategy to use to assign labels in the embedding space. **params: dict @@ -52,6 +53,7 @@ def __init__( n_clusters: int = None, eigen_solver: str = "lanczos", n_components: int = None, + random_state: Union[int, None] = None, gamma: float = 1.0, affinity: str = "rbf", laplacian: str = "fully_connected", @@ -64,6 +66,9 @@ def __init__( self.n_clusters = n_clusters self.eigen_solver = eigen_solver self.n_components = n_components if n_components is not None else n_clusters + self.random_state = random_state + if self.random_state is not None: + ht.random.seed(self.random_state) self.gamma = gamma self.affinity = affinity self.laplacian = laplacian @@ -121,17 +126,18 @@ def labels_(self) -> DNDarray: def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool) -> DNDarray: """ Set the diagonal of the matrix to a specific value. + Modified for dense PyTorch tensors from scikit-learn.manifold._spectral_embedding._set_diag + https://github.com/scikit-learn/scikit-learn/blob/031d2f83b7c9d1027d1477abb2bf34652621d603/sklearn/manifold/_spectral_embedding.py#L108 """ - # TODO see scikit learn n_nodes = x.shape[0] - # for now we assume that the matrix is dense + # we assume that the matrix is dense if norm_laplacian: # set diagonal to `value` - # tensor.view(-1) == ndarray.flat + # NB: tensor.view(-1) == ndarray.flat x.larray.view(-1)[:: n_nodes + 1] = value return x - def spectral_embedding( + def __spectral_embedding( self, x: DNDarray, n_components: int = 8, @@ -232,25 +238,28 @@ def spectral_embedding( def fit(self, x: DNDarray): """ Clusters dataset X via spectral embedding. - Computes the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of the graph - laplacian from the similarity matrix and fits the eigenvectors that correspond to the k lowest eigenvalues with - a seperate clustering algorithm (currently only kmeans is supported). Similarity metrics for adjacency - calculations are supported via spatial.distance. The eigenvalues and eigenvectors are computed by reducing the - Laplacian via lanczos iterations and using the torch eigenvalue solver on this smaller matrix. If other - eigenvalue decompostion methods are supported, this will be expanded. + Computes the low-dim representation by calculation of the spectral embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of the + graph laplacian computed from the similarity matrix. Similarity metrics for adjacency + calculations are supported via :func:`heat.spatial.distance`. + + See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian. Parameters ---------- x : DNDarray Training instances to cluster. Shape = (n_samples, n_features) + + See Also + -------- + :func:`__spectral_embedding` """ # 1. input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") - if x.split is not None and x.split != 0: - raise NotImplementedError("Not implemented for other splitting-axes") + if x.is_distributed() and x.split != 0: + raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") # 2. Embed Dataset into lower-dimensional Eigenvector space - components = self.spectral_embedding( + components = self.__spectral_embedding( x, n_components=self.n_components, eigen_solver=self.eigen_solver ) @@ -284,8 +293,8 @@ def fit_predict(self, x: DNDarray) -> DNDarray: def predict(self, x: DNDarray) -> DNDarray: """ Return the label each sample in X belongs to. - X is transformed to the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of - the graph laplacian from the similarity matrix. Inference of lables is done by extraction of the closest + X is transformed to the low-dim representation by calculation of the embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of + the graph laplacian from the similarity matrix. Inference of labels is done by extraction of the closest centroid of the n_clusters eigenvectors from the previously fitted clustering algorithm (kmeans). Parameters @@ -301,11 +310,10 @@ def predict(self, x: DNDarray) -> DNDarray: # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") - if x.split is not None and x.split != 0: - raise NotImplementedError("Not implemented for other splitting-axes") - - _, eigenvectors = self.spectral_embedding(x, self.eigen_solver) + if x.is_distributed() and x.split != 0: + raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") - components = eigenvectors[:, : self.n_clusters].copy() + # TODO is copy necessary? + components = self.__spectral_embedding(x, self.eigen_solver).copy() return self._cluster.predict(components) From 34883fc9eca7f8894dd738f8c5668eda53af8ea6 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:46:44 +0200 Subject: [PATCH 11/33] introduce method DNDarray.diagonal --- heat/core/manipulations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/heat/core/manipulations.py b/heat/core/manipulations.py index 30f5b917cf..41df7ba6a1 100644 --- a/heat/core/manipulations.py +++ b/heat/core/manipulations.py @@ -867,6 +867,10 @@ def diagonal(a: DNDarray, offset: int = 0, dim1: int = 0, dim2: int = 1) -> DNDa return factories.array(result, dtype=a.dtype, is_split=split, device=a.device, comm=a.comm) +DNDarray.diagonal = lambda self, offset=0, dim1=0, dim2=1: diagonal(self, offset, dim1, dim2) +DNDarray.diagonal.__doc__ = diagonal.__doc__ + + def dsplit(x: Sequence[DNDarray, ...], indices_or_sections: Iterable) -> List[DNDarray, ...]: """ Split array into multiple sub-DNDarrays along the 3rd axis (depth). From 8dce6fad2caf0148302eadb0333ec3a5ba249126 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:47:17 +0200 Subject: [PATCH 12/33] update parameters --- heat/cluster/tests/test_spectral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index b7bc456dc4..73e0df4c5b 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -21,7 +21,9 @@ def test_get_and_set_params(self): params, { "n_clusters": None, - "eigen_solver": "lanczos", + "eigen_solver": "zolotarev", + "n_components": None, + "random_state": None, "gamma": 1.0, "affinity": "rbf", "laplacian": "fully_connected", From 7b07624109c3ff451c9d8e7cc19640f9e11c1706 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:47:57 +0200 Subject: [PATCH 13/33] set default eigendec to zolotarev --- heat/cluster/spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index cc030c8572..af11291329 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -18,7 +18,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): ---------- n_clusters : int, default=8 Number of clusters to fit - eigen_solver : str, default='lanczos' + eigen_solver : str, default='zolotarev' Eigenvalue decomposition strategy to use. Supported: 'lanczos', 'zolotarev'. n_components : int, default=None Number of components to use for the embedding. If None, n_clusters is used @@ -51,7 +51,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): def __init__( self, n_clusters: int = None, - eigen_solver: str = "lanczos", + eigen_solver: str = "zolotarev", n_components: int = None, random_state: Union[int, None] = None, gamma: float = 1.0, From f932362572e78ac58cf382cdfd5f402b2ca8b62b Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:56:38 +0200 Subject: [PATCH 14/33] fix tests --- heat/cluster/spectral.py | 2 +- heat/cluster/tests/test_spectral.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index af11291329..5969bf131e 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -50,7 +50,7 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): def __init__( self, - n_clusters: int = None, + n_clusters: int = 8, eigen_solver: str = "zolotarev", n_components: int = None, random_state: Union[int, None] = None, diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index 73e0df4c5b..e6a13db62b 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -20,9 +20,9 @@ def test_get_and_set_params(self): self.assertEqual( params, { - "n_clusters": None, + "n_clusters": 8, "eigen_solver": "zolotarev", - "n_components": None, + "n_components": 8, "random_state": None, "gamma": 1.0, "affinity": "rbf", From f45c67d14a4a15d074e7cfc1b981800eb6fa04db Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 17 Jun 2025 15:18:10 +0200 Subject: [PATCH 15/33] expand tests --- heat/cluster/spectral.py | 1 - heat/cluster/tests/test_spectral.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 5969bf131e..100d971815 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -288,7 +288,6 @@ def fit_predict(self, x: DNDarray) -> DNDarray: """ self.fit(x) return self.labels_ - # return self._cluster.predict(components) def predict(self, x: DNDarray) -> DNDarray: """ diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index e6a13db62b..800719a93c 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -45,12 +45,13 @@ def test_fit_iris(self): m = 10 # fit the clusters spectral = ht.cluster.SpectralClustering( - n_clusters=3, gamma=1.0, affinity="rbf", laplacian="fully_connected", n_lanczos=m + n_clusters=3, random_state=0, gamma=1.0, affinity="rbf", laplacian="fully_connected" ) spectral.fit(iris) self.assertIsInstance(spectral.labels_, ht.DNDarray) spectral = ht.cluster.SpectralClustering( + eigen_solver="lanczos", affinity="euclidean", laplacian="eNeighbour", threshold=0.5, @@ -80,7 +81,7 @@ def test_fit_iris(self): # Errors with self.assertRaises(NotImplementedError): - spectral = ht.cluster.SpectralClustering(affinity="ahalanobis", n_lanczos=m) + spectral = ht.cluster.SpectralClustering(affinity="mahalanobis", n_lanczos=m) iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) spectral = ht.cluster.SpectralClustering(n_lanczos=20) From fc36c8c301ad829b560550ef6b07f75081927d43 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Fri, 4 Jul 2025 11:52:45 +0200 Subject: [PATCH 16/33] spectral_emb instead of __spectral_emb --- heat/cluster/spectral.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 100d971815..09a7e68cff 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -137,7 +137,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool x.larray.view(-1)[:: n_nodes + 1] = value return x - def __spectral_embedding( + def spectral_embedding( self, x: DNDarray, n_components: int = 8, @@ -242,7 +242,7 @@ def fit(self, x: DNDarray): graph laplacian computed from the similarity matrix. Similarity metrics for adjacency calculations are supported via :func:`heat.spatial.distance`. - See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian. + See :func:`spectral_embedding` for more details on the decomposition of the graph laplacian. Parameters ---------- @@ -251,7 +251,7 @@ def fit(self, x: DNDarray): See Also -------- - :func:`__spectral_embedding` + :func:`spectral_embedding` """ # 1. input sanitation if not isinstance(x, DNDarray): @@ -259,7 +259,7 @@ def fit(self, x: DNDarray): if x.is_distributed() and x.split != 0: raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") # 2. Embed Dataset into lower-dimensional Eigenvector space - components = self.__spectral_embedding( + components = self.spectral_embedding( x, n_components=self.n_components, eigen_solver=self.eigen_solver ) @@ -313,6 +313,6 @@ def predict(self, x: DNDarray) -> DNDarray: raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") # TODO is copy necessary? - components = self.__spectral_embedding(x, self.eigen_solver).copy() + components = self.spectral_embedding(x, self.eigen_solver).copy() return self._cluster.predict(components) From 45486af7f4d64f4684e9018069c0e948494fcb3a Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:23:57 +0200 Subject: [PATCH 17/33] reinstate __spectral_amb --- heat/cluster/spectral.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 09a7e68cff..100d971815 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -137,7 +137,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool x.larray.view(-1)[:: n_nodes + 1] = value return x - def spectral_embedding( + def __spectral_embedding( self, x: DNDarray, n_components: int = 8, @@ -242,7 +242,7 @@ def fit(self, x: DNDarray): graph laplacian computed from the similarity matrix. Similarity metrics for adjacency calculations are supported via :func:`heat.spatial.distance`. - See :func:`spectral_embedding` for more details on the decomposition of the graph laplacian. + See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian. Parameters ---------- @@ -251,7 +251,7 @@ def fit(self, x: DNDarray): See Also -------- - :func:`spectral_embedding` + :func:`__spectral_embedding` """ # 1. input sanitation if not isinstance(x, DNDarray): @@ -259,7 +259,7 @@ def fit(self, x: DNDarray): if x.is_distributed() and x.split != 0: raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") # 2. Embed Dataset into lower-dimensional Eigenvector space - components = self.spectral_embedding( + components = self.__spectral_embedding( x, n_components=self.n_components, eigen_solver=self.eigen_solver ) @@ -313,6 +313,6 @@ def predict(self, x: DNDarray) -> DNDarray: raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") # TODO is copy necessary? - components = self.spectral_embedding(x, self.eigen_solver).copy() + components = self.__spectral_embedding(x, self.eigen_solver).copy() return self._cluster.predict(components) From ee42e636291a10eb5ef8e01006b16b9f6ba875eb Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:15:16 +0100 Subject: [PATCH 18/33] Update docstring and args defaults --- heat/cluster/spectral.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index b842d83c95..3aaf2a7856 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -12,26 +12,26 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): """ - Spectral clustering of large memory-distributed datasets. + Spectral clustering of large memory-distributed arrays. Attributes ---------- - n_clusters : int, default=8 + n_clusters : int, optional Number of clusters to fit eigen_solver : str, default='randomized' The eigenvalue decomposition strategy to use. - - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver. - 'randomized' : Use a randomized algorithm to compute the approximate eigenvalues and eigenvectors. - n_components : int, default=None - Number of components to use for the embedding. If None, n_clusters is used - random_state : int, default=None + - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver. + n_components : int, optional + Number of components to use for the embedding. If None, it is set to ``n_clusters`` + random_state : int, optional Random seed for reproducibility. If None, no random seed is set. gamma : float, default=1.0 Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' affinity : str, default='rbf' How to construct the similarity (affinity) matrix. - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel. - - 'euclidean' : construct the similarity matrix as only euclidean distance. + - 'euclidean' : construct the similarity matrix as euclidean distance. - 'precomputed' : interpret ``X`` as precomputed affinity matrix. laplacian : str, default='fully_connected' How to calculate the graph laplacian (affinity) @@ -42,18 +42,18 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): boundary : str How to interpret threshold: 'upper', 'lower' Ignored for laplacian='fully_connected' - reigh_rank : int + reigh_rank : int, default: 100 number of samples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. - It must hold reigh_rank >= n_clusters. If n_clusters is None (automatic selection of number of clusters), - reigh_rank gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should + It must hold :math:`reigh_rank >= n_clusters`. If ``n_clusters`` is None (automatic selection of number of clusters), + ``reigh_rank`` gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should be set high enough to capture the expected number of clusters in that case. - reigh_n_oversamples : int - number of oversamples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 10. - reigh_power_iter : int - number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 0. + reigh_n_oversamples : int, default: 10 + number of oversamples for randomized eigenvalue decomposition. Only used if ``eigen_solver``='randomized'. Default is 10. + reigh_power_iter : int, default: 0 + number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Consider increasing this value if the eigen-spectrum of the Laplacian decays slowly. - lanczos_n_iter : int - number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. Default is 300. + lanczos_n_iter : int, default: 300 + number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. assign_labels: str, default='kmeans' The strategy to use to assign labels in the embedding space. **params: dict @@ -62,9 +62,9 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): def __init__( self, - n_clusters: int = None, + n_clusters: Union[int, None] = None, eigen_solver: str = "randomized", - n_components: int = None, + n_components: Union[int, None] = None, random_state: Union[int, None] = None, gamma: float = 1.0, affinity: str = "rbf", From ad63d2ceacbab1fbd6cab24ab858361a686498a5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:15:26 +0000 Subject: [PATCH 19/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- heat/cluster/spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 3aaf2a7856..fbb553e9a9 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -50,10 +50,10 @@ class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): reigh_n_oversamples : int, default: 10 number of oversamples for randomized eigenvalue decomposition. Only used if ``eigen_solver``='randomized'. Default is 10. reigh_power_iter : int, default: 0 - number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. + number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Consider increasing this value if the eigen-spectrum of the Laplacian decays slowly. lanczos_n_iter : int, default: 300 - number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. + number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. assign_labels: str, default='kmeans' The strategy to use to assign labels in the embedding space. **params: dict From 3414f197e18cbfd5ae7ede0a3f6b0d35e15a9d29 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:17:50 +0100 Subject: [PATCH 20/33] Update __spectral_embedding docs --- heat/cluster/spectral.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index fbb553e9a9..6c11e75d10 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -168,7 +168,7 @@ def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool def __spectral_embedding( self, x: DNDarray, - n_components: int = 8, + n_components: Union[int, None] = None, eigen_solver: str = "randomized", norm_laplacian: bool = True, drop_first: bool = True, @@ -180,10 +180,10 @@ def __spectral_embedding( ---------- x : DNDarray Sample Matrix for which the embedding should be calculated - n_components : int, default=8 - Number of components to use for the embedding - eigen_solver : str - Eigenvalue decomposition strategy to use. Default is 'randomized' (see documentation in :func:`heat.linalg.reigh`). + n_components : int, optional + Number of components to use for the embedding. If ``n_components`` is None, it will be set to ``SpectralClustering.n_clusters``. + eigen_solver : str, default: `randomized` + Eigenvalue decomposition strategy to use. TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets? norm_laplacian : bool, default=True Whether to use the normalized Laplacian @@ -193,7 +193,7 @@ def __spectral_embedding( See Also -------- :func:`heat.linalg.lanczos` - :func:`heat.linalg.eigh` + :func:`heat.linalg.reigh` :func:`heat.linalg.polar` Notes @@ -203,6 +203,9 @@ def __spectral_embedding( """ L = self._laplacian.construct(x) + # After sklearn.manifold._spectral_embedding.py + # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/manifold/_spectral_embedding.py#L295 + # Eigenvalue and -vector calculation if eigen_solver == "randomized": L = self.__set_diag(L, 1, norm_laplacian) # TODO should it be a method? From 3b51b3c86cefeb629f294f0117154c876494a244 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:19:31 +0100 Subject: [PATCH 21/33] Edits --- heat/cluster/spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 6c11e75d10..2762e6e145 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -208,7 +208,7 @@ def __spectral_embedding( # Eigenvalue and -vector calculation if eigen_solver == "randomized": - L = self.__set_diag(L, 1, norm_laplacian) # TODO should it be a method? + L = self.__set_diag(L, 1, norm_laplacian) # extract the diagonal dd = L.diagonal() # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK @@ -219,7 +219,7 @@ def __spectral_embedding( # L *= -1 #### # compute the randomized eigenvalue decomposition - # ht.linalg.reigh returns the eigenvalues in descending order + # NB: ht.linalg.reigh returns the eigenvalues in descending order eval, evec = reigh( L, n_eigenvalues=self.reigh_rank, From 3b386a93b3559193c8a97bddf23154f27e80d9af Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:19:40 +0000 Subject: [PATCH 22/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- heat/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 2762e6e145..6a3b2e3eaa 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -208,7 +208,7 @@ def __spectral_embedding( # Eigenvalue and -vector calculation if eigen_solver == "randomized": - L = self.__set_diag(L, 1, norm_laplacian) + L = self.__set_diag(L, 1, norm_laplacian) # extract the diagonal dd = L.diagonal() # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK From 116ecef72fa203327ad3e66dfc797be982f025f7 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:21:11 +0100 Subject: [PATCH 23/33] Remove dead code --- heat/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 6a3b2e3eaa..15764f907d 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -231,7 +231,7 @@ def __spectral_embedding( diff = eval[1:] - eval[:-1] tmp = ht.argmax(diff).item() self.n_clusters = tmp + 1 - # select the first n_components + # select the largest n_components embedding = evec.T[:n_components] if norm_laplacian: # REVERT FROM DIVISION TO MULTIPLICATION From 82b0eb66bbced49dbfe8f4fbc8ade32eb953c720 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:22:09 +0100 Subject: [PATCH 24/33] Remove dead code --- heat/cluster/spectral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 15764f907d..1c15b7660d 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -184,7 +184,6 @@ def __spectral_embedding( Number of components to use for the embedding. If ``n_components`` is None, it will be set to ``SpectralClustering.n_clusters``. eigen_solver : str, default: `randomized` Eigenvalue decomposition strategy to use. - TODO: add 'lanczos' as an option, maybe a default torch option for smaller (non-distr) datasets? norm_laplacian : bool, default=True Whether to use the normalized Laplacian drop_first : bool, default=True From 1c40715b8fdd6eb6efd887324222c49b3d095900 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:22:51 +0100 Subject: [PATCH 25/33] Remove outdated reference to linalg.polar --- heat/cluster/spectral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 1c15b7660d..4552e656bb 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -193,7 +193,6 @@ def __spectral_embedding( -------- :func:`heat.linalg.lanczos` :func:`heat.linalg.reigh` - :func:`heat.linalg.polar` Notes ----- From bee92cca83176bbd372031b6fb5089eb81998f0e Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:23:21 +0100 Subject: [PATCH 26/33] remove dead code --- heat/cluster/spectral.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 4552e656bb..3c7c1487f9 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -209,13 +209,6 @@ def __spectral_embedding( L = self.__set_diag(L, 1, norm_laplacian) # extract the diagonal dd = L.diagonal() - # unlike in sklearn, we skip multiplying by -1 as we are not using ARPACK - # keeping comment below as historical record for now - #### - # # We are computing the opposite of the laplacian inplace so as - # # to spare a memory allocation of a possibly very large array - # L *= -1 - #### # compute the randomized eigenvalue decomposition # NB: ht.linalg.reigh returns the eigenvalues in descending order eval, evec = reigh( From f71a0752f068e8289754e608475ade6bea9f58ec Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:24:24 +0100 Subject: [PATCH 27/33] sklearn API consistency --- heat/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 3c7c1487f9..10f41ad057 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -213,7 +213,7 @@ def __spectral_embedding( # NB: ht.linalg.reigh returns the eigenvalues in descending order eval, evec = reigh( L, - n_eigenvalues=self.reigh_rank, + n_eigenvalues=n_components, n_oversamples=self.reigh_n_oversamples, power_iter=self.reigh_power_iter, ) From 03bbc018558b2911715919f04bbf4257ed56a96d Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:28:35 +0100 Subject: [PATCH 28/33] __spectral_embedding returns embedding only Change return type from Tuple to DNDarray for the function. --- heat/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 10f41ad057..6439f4e4fb 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -172,7 +172,7 @@ def __spectral_embedding( eigen_solver: str = "randomized", norm_laplacian: bool = True, drop_first: bool = True, - ) -> Tuple[DNDarray, DNDarray]: + ) -> DNDarray: """ Returns the embedding (eigenvectors) of the graph's Laplacian matrix. From 870dc7a06f8c2876bc2b77911026d42e1cca162e Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:35:12 +0100 Subject: [PATCH 29/33] assess n_components before decomposition --- heat/cluster/spectral.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 6439f4e4fb..ba56e35749 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -209,6 +209,18 @@ def __spectral_embedding( L = self.__set_diag(L, 1, norm_laplacian) # extract the diagonal dd = L.diagonal() + + # assess n_components = n_clusters if not specified by user + if self.n_clusters is None: + # set n_clusters to spectral gap + diff = eval[1:] - eval[:-1] + tmp = ht.argmax(diff).item() + self.n_clusters = tmp + 1 + if n_components is None: + n_components = self.n_clusters + if drop_first: + n_components += 1 + # compute the randomized eigenvalue decomposition # NB: ht.linalg.reigh returns the eigenvalues in descending order eval, evec = reigh( @@ -217,11 +229,7 @@ def __spectral_embedding( n_oversamples=self.reigh_n_oversamples, power_iter=self.reigh_power_iter, ) - # Set n_clusters to spectral gap, if it is not defined by the user - if self.n_clusters is None: - diff = eval[1:] - eval[:-1] - tmp = ht.argmax(diff).item() - self.n_clusters = tmp + 1 + # select the largest n_components embedding = evec.T[:n_components] if norm_laplacian: From 673d43d59e19799bc9ebdfe15393c1c65207e393 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:35:25 +0000 Subject: [PATCH 30/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- heat/cluster/spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index ba56e35749..49f4a8e3b4 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -209,7 +209,7 @@ def __spectral_embedding( L = self.__set_diag(L, 1, norm_laplacian) # extract the diagonal dd = L.diagonal() - + # assess n_components = n_clusters if not specified by user if self.n_clusters is None: # set n_clusters to spectral gap @@ -220,7 +220,7 @@ def __spectral_embedding( n_components = self.n_clusters if drop_first: n_components += 1 - + # compute the randomized eigenvalue decomposition # NB: ht.linalg.reigh returns the eigenvalues in descending order eval, evec = reigh( From d5ca6d0f325583b5e9e88d4a25f29da6af93eafb Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:36:26 +0100 Subject: [PATCH 31/33] apply standard normalization --- heat/cluster/spectral.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 49f4a8e3b4..90f498630c 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -233,10 +233,7 @@ def __spectral_embedding( # select the largest n_components embedding = evec.T[:n_components] if norm_laplacian: - # REVERT FROM DIVISION TO MULTIPLICATION - # (user requirement, see https://codebase.helmholtz.cloud/helmholtz-analytics/SCIMES/-/blob/master/scimes/old_spectral_embedding.py?ref_type=heads#L62) - # TODO: generalize / adapt to current scikit-learn - embedding = embedding * dd + embedding = embedding / dd # Drop smallest component if requested if drop_first: embedding = embedding[:-1] From ee9bdabc6fc60043faaae9f9cd56dd7ad5192a49 Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:37:33 +0100 Subject: [PATCH 32/33] Update error message --- heat/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 90f498630c..2f7993cb8a 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -273,7 +273,7 @@ def __spectral_embedding( embedding = embedding[1:] else: raise NotImplementedError( - f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'zolotarev' and 'lanczos'." + f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'randomized' and 'lanczos'." ) # Return the embedding transposed back to original shape return embedding.T From 68a31c4a3dbcb557039065ad1088b38c06e489ea Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:47:48 +0100 Subject: [PATCH 33/33] Refactor comments in spectral.py for clarity Removed outdated notes about eigenvalues and added comments regarding the Laplacian matrix properties. --- heat/cluster/spectral.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 2f7993cb8a..cb195cd53a 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -193,11 +193,6 @@ def __spectral_embedding( -------- :func:`heat.linalg.lanczos` :func:`heat.linalg.reigh` - - Notes - ----- - The imaginary part of the eigenvalues is discarded, as the Laplacian matrix is symmetric and the eigenvectors - are real. TODO: check if this is still correct """ L = self._laplacian.construct(x) @@ -267,6 +262,10 @@ def __spectral_embedding( tmp = torch.argmax(diff).item() self.n_clusters = tmp + 1 # n_components = n_clusters if not specified + if n_components is None: + n_components = self.n_clusters + # the Laplacian is symmetric and the eigenvectors are real + # TODO: check the returned order of evec, eval here embedding = eigenvectors[:n_components].real # Drop smallest component if requested if drop_first: