diff --git a/.talismanrc b/.talismanrc index 976c575f94..9199d0b98d 100644 --- a/.talismanrc +++ b/.talismanrc @@ -1,3 +1,10 @@ threshold: medium allowed_patterns: - 'uses: [A-Za-z-\/]+@[\w\d]+' + +fileignoreconfig: +- filename: .github/workflows/ci.yaml + checksum: 3bf095a6ff388eb7aacb80e3b33ed600b377bbce6a97f01ac1579dc702024a13 +- filename: .github/workflows/ci_full.yaml + checksum: 54389cc7b9ddd0010ed99d1194da724dbcc51922358f739a5aa30e65e8e2c0e0 +version: "1.0" diff --git a/heat/cluster/spectral.py b/heat/cluster/spectral.py index 6f7fd9d6e0..cb195cd53a 100644 --- a/heat/cluster/spectral.py +++ b/heat/cluster/spectral.py @@ -5,51 +5,56 @@ import heat as ht import math import torch -from typing import Tuple +from typing import Tuple, Union from heat.core.dndarray import DNDarray +from heat.core.linalg import reigh -class Spectral(ht.ClusteringMixin, ht.BaseEstimator): +class SpectralClustering(ht.ClusteringMixin, ht.BaseEstimator): """ - Spectral clustering + Spectral clustering of large memory-distributed arrays. Attributes ---------- - n_clusters : int + n_clusters : int, optional Number of clusters to fit - gamma : float - Kernel coefficient sigma for 'rbf', ignored for metric='euclidean' - metric : string - How to construct the similarity matrix. - + eigen_solver : str, default='randomized' + The eigenvalue decomposition strategy to use. + - 'randomized' : Use a randomized algorithm to compute the approximate eigenvalues and eigenvectors. + - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver. + n_components : int, optional + Number of components to use for the embedding. If None, it is set to ``n_clusters`` + random_state : int, optional + Random seed for reproducibility. If None, no random seed is set. + gamma : float, default=1.0 + Kernel coefficient sigma for 'rbf', ignored for affinity='euclidean' + affinity : str, default='rbf' + How to construct the similarity (affinity) matrix. - 'rbf' : construct the similarity matrix using a radial basis function (RBF) kernel. - - 'euclidean' : construct the similarity matrix as only euclidean distance. - laplacian : str + - 'euclidean' : construct the similarity matrix as euclidean distance. + - 'precomputed' : interpret ``X`` as precomputed affinity matrix. + laplacian : str, default='fully_connected' How to calculate the graph laplacian (affinity) Currently supported : 'fully_connected', 'eNeighbour' threshold : float Threshold for affinity matrix if laplacian='eNeighbour' - Ignorded for laplacian='fully_connected' + Ignored for laplacian='fully_connected' boundary : str How to interpret threshold: 'upper', 'lower' - Ignorded for laplacian='fully_connected' - eigen_solver : str - The eigenvalue decomposition strategy to use. - - 'lanczos' : Use Lanczos iterations to reduce the Laplacian matrix size before applying the torch eigenvalue solver. - - 'randomized' : Use a randomized algorithm to compute the approximate eigenvalues and eigenvectors. - reigh_rank : int + Ignored for laplacian='fully_connected' + reigh_rank : int, default: 100 number of samples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. - It must hold reigh_rank >= n_clusters. If n_clusters is None (automatic selection of number of clusters), - reigh_rank gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should + It must hold :math:`reigh_rank >= n_clusters`. If ``n_clusters`` is None (automatic selection of number of clusters), + ``reigh_rank`` gives an upper bound on the number of clusters that can be found. Therefore, reigh_rank should be set high enough to capture the expected number of clusters in that case. - reigh_n_oversamples : int - number of oversamples for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 10. - reigh_power_iter : int - number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Default is 0. + reigh_n_oversamples : int, default: 10 + number of oversamples for randomized eigenvalue decomposition. Only used if ``eigen_solver``='randomized'. Default is 10. + reigh_power_iter : int, default: 0 + number of power iterations for randomized eigenvalue decomposition. Only used if eigen_solver='randomized'. Consider increasing this value if the eigen-spectrum of the Laplacian decays slowly. - lanczos_n_iter : int - number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. Default is 300. - assign_labels: str + lanczos_n_iter : int, default: 300 + number of Lanczos iterations for Eigenvalue decomposition. Only used if eigen_solver='lanczos'. + assign_labels: str, default='kmeans' The strategy to use to assign labels in the embedding space. **params: dict Parameter dictionary for the assign_labels estimator @@ -57,13 +62,15 @@ class Spectral(ht.ClusteringMixin, ht.BaseEstimator): def __init__( self, - n_clusters: int = None, + n_clusters: Union[int, None] = None, + eigen_solver: str = "randomized", + n_components: Union[int, None] = None, + random_state: Union[int, None] = None, gamma: float = 1.0, - metric: str = "rbf", + affinity: str = "rbf", laplacian: str = "fully_connected", threshold: float = 1.0, boundary: str = "upper", - eigen_solver: str = "randomized", reigh_rank: int = 100, reigh_n_oversamples: int = 10, reigh_power_iter: int = 0, @@ -72,8 +79,13 @@ def __init__( **params, ): self.n_clusters = n_clusters + self.eigen_solver = eigen_solver + self.n_components = n_components if n_components is not None else n_clusters + self.random_state = random_state + if self.random_state is not None: + ht.random.seed(self.random_state) self.gamma = gamma - self.metric = metric + self.affinity = affinity self.laplacian = laplacian self.threshold = threshold self.boundary = boundary @@ -93,7 +105,7 @@ def __init__( ): raise ValueError("reigh_rank must be at least equal to n_clusters") - if metric == "rbf": + if affinity == "rbf": sig = math.sqrt(1 / (2 * gamma)) self._laplacian = ht.graph.Laplacian( lambda x: ht.spatial.rbf(x, sigma=sig, quadratic_expansion=True), @@ -103,7 +115,7 @@ def __init__( threshold_value=threshold, ) - elif metric == "euclidean": + elif affinity == "euclidean": self._laplacian = ht.graph.Laplacian( lambda x: ht.spatial.cdist(x, quadratic_expansion=True), definition="norm_sym", @@ -111,6 +123,14 @@ def __init__( threshold_key=boundary, threshold_value=threshold, ) + elif affinity == "precomputed": + self._laplacian = ht.graph.Laplacian( + lambda x: x, + definition="norm_sym", + mode=laplacian, + threshold_key=boundary, + threshold_value=threshold, + ) else: raise NotImplementedError("Other kernels currently not supported") @@ -131,25 +151,90 @@ def labels_(self) -> DNDarray: """ return self._labels - def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]: + def __set_diag(self, x: DNDarray, value: Union[int, float], norm_laplacian: bool) -> DNDarray: + """ + Set the diagonal of the matrix to a specific value. + Modified for dense PyTorch tensors from scikit-learn.manifold._spectral_embedding._set_diag + https://github.com/scikit-learn/scikit-learn/blob/031d2f83b7c9d1027d1477abb2bf34652621d603/sklearn/manifold/_spectral_embedding.py#L108 + """ + n_nodes = x.shape[0] + # we assume that the matrix is dense + if norm_laplacian: + # set diagonal to `value` + # NB: tensor.view(-1) == ndarray.flat + x.larray.view(-1)[:: n_nodes + 1] = value + return x + + def __spectral_embedding( + self, + x: DNDarray, + n_components: Union[int, None] = None, + eigen_solver: str = "randomized", + norm_laplacian: bool = True, + drop_first: bool = True, + ) -> DNDarray: """ - Helper function for dataset x embedding. - Returns Tupel(Eigenvalues, Eigenvectors) of the graph's Laplacian matrix. + Returns the embedding (eigenvectors) of the graph's Laplacian matrix. Parameters ---------- x : DNDarray Sample Matrix for which the embedding should be calculated - - Notes - ----- - This will throw out the complex side of the eigenvalues found during this. - + n_components : int, optional + Number of components to use for the embedding. If ``n_components`` is None, it will be set to ``SpectralClustering.n_clusters``. + eigen_solver : str, default: `randomized` + Eigenvalue decomposition strategy to use. + norm_laplacian : bool, default=True + Whether to use the normalized Laplacian + drop_first : bool, default=True + Whether to drop the the smallest eigenvalue and corresponding eigenvector. + + See Also + -------- + :func:`heat.linalg.lanczos` + :func:`heat.linalg.reigh` """ L = self._laplacian.construct(x) - if self.eigen_solver == "lanczos": - # use Lanczos Algorithm: for Eigenvalue and -vector calculation + # After sklearn.manifold._spectral_embedding.py + # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/manifold/_spectral_embedding.py#L295 + + # Eigenvalue and -vector calculation + if eigen_solver == "randomized": + L = self.__set_diag(L, 1, norm_laplacian) + # extract the diagonal + dd = L.diagonal() + + # assess n_components = n_clusters if not specified by user + if self.n_clusters is None: + # set n_clusters to spectral gap + diff = eval[1:] - eval[:-1] + tmp = ht.argmax(diff).item() + self.n_clusters = tmp + 1 + if n_components is None: + n_components = self.n_clusters + if drop_first: + n_components += 1 + + # compute the randomized eigenvalue decomposition + # NB: ht.linalg.reigh returns the eigenvalues in descending order + eval, evec = reigh( + L, + n_eigenvalues=n_components, + n_oversamples=self.reigh_n_oversamples, + power_iter=self.reigh_power_iter, + ) + + # select the largest n_components + embedding = evec.T[:n_components] + if norm_laplacian: + embedding = embedding / dd + # Drop smallest component if requested + if drop_first: + embedding = embedding[:-1] + + # Lanczos Algorithm + elif eigen_solver == "lanczos": v0 = ht.full( (L.shape[0],), fill_value=1.0 / math.sqrt(L.shape[0]), @@ -159,65 +244,66 @@ def _spectral_embedding(self, x: DNDarray) -> Tuple[DNDarray, DNDarray]: ) V, T = ht.lanczos(L, self.lanczos_n_iter, v0) - # if int(torch.__version__.split(".")[1]) >= 9: - try: - # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T - eval, evec = torch.linalg.eig(T.larray) - - # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L - eval, idx = torch.sort(eval.real, dim=0) - eigenvalues = ht.array(eval) - eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] - - return eigenvalues.real, eigenvectors.real - except AttributeError: # torch version is less than 1.9.0 - # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T - eval, evec = torch.eig(T.larray, eigenvectors=True) - # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L - eval, idx = torch.sort(eval[:, 0], dim=0) - eigenvalues = ht.array(eval) - eigenvectors = ht.matmul(V, ht.array(evec))[:, idx] - return eigenvalues, eigenvectors + # Calculate and sort eigenvalues and eigenvectors of tridiagonal matrix T + # T is non-distributed + eval, evec = torch.linalg.eig(T.larray) + + # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L + # sort local eval, and evec accordingly + eval, idx = torch.sort(eval.real, dim=0) + evec = evec[:, idx] + # Calculate global eigenvectors of L + eigenvectors = ht.matmul(V, ht.array(evec)) + # transpose to have the eigenvectors as rows + eigenvectors = eigenvectors.T + # Set n_clusters to spectral gap, if it is not defined by the user + if self.n_clusters is None: + diff = eval[1:] - eval[:-1] + tmp = torch.argmax(diff).item() + self.n_clusters = tmp + 1 + # n_components = n_clusters if not specified + if n_components is None: + n_components = self.n_clusters + # the Laplacian is symmetric and the eigenvectors are real + # TODO: check the returned order of evec, eval here + embedding = eigenvectors[:n_components].real + # Drop smallest component if requested + if drop_first: + embedding = embedding[1:] else: - # use randomized eigenvalue decomposition with the chosen arguments - eigenvalues, eigenvectors = ht.linalg.reigh( - L, - self.reigh_rank, - n_oversamples=self.reigh_n_oversamples, - power_iter=self.reigh_power_iter, + raise NotImplementedError( + f"Eigenvalue decomposition method {eigen_solver} is not supported. Supported methods are 'randomized' and 'lanczos'." ) - return eigenvalues, eigenvectors + # Return the embedding transposed back to original shape + return embedding.T def fit(self, x: DNDarray): """ Clusters dataset X via spectral embedding. - Computes the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of the graph - laplacian from the similarity matrix and fits the eigenvectors that correspond to the k lowest eigenvalues with - a seperate clustering algorithm (currently only kmeans is supported). Similarity metrics for adjacency - calculations are supported via spatial.distance. The eigenvalues and eigenvectors are computed by reducing the - Laplacian via lanczos iterations and using the torch eigenvalue solver on this smaller matrix. If other - eigenvalue decompostion methods are supported, this will be expanded. + Computes the low-dim representation by calculation of the spectral embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of the + graph laplacian computed from the similarity matrix. Similarity metrics for adjacency + calculations are supported via :func:`heat.spatial.distance`. + + See :func:`__spectral_embedding` for more details on the decomposition of the graph laplacian. Parameters ---------- x : DNDarray Training instances to cluster. Shape = (n_samples, n_features) + + See Also + -------- + :func:`__spectral_embedding` """ - # 1. input sanitation + # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") - if x.split is not None and x.split != 0: - raise NotImplementedError("Not implemented for other splitting-axes") - # 2. Embed Dataset into lower-dimensional Eigenvector space - eigenvalues, eigenvectors = self._spectral_embedding(x) - - # 3. Find the spectral gap, if number of clusters is not defined from the outside - if self.n_clusters is None: - diff = eigenvalues[1:] - eigenvalues[:-1] - tmp = ht.argmax(diff).item() - self.n_clusters = tmp + 1 - - components = eigenvectors[:, : self.n_clusters].copy() + if x.is_distributed() and x.split != 0: + raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") + # Embed dataset into lower-dimensional eigenvector space + components = self.__spectral_embedding( + x, n_components=self.n_components, eigen_solver=self.eigen_solver + ) params = self._cluster.get_params() params["n_clusters"] = self.n_clusters @@ -228,11 +314,28 @@ def fit(self, x: DNDarray): return self + def fit_predict(self, x: DNDarray) -> DNDarray: + """ + Fit the model to the data and return the labels. + + Parameters + ---------- + x : DNDarray + Training instances to cluster. Shape = (n_samples, n_features) + + Returns + ------- + DNDarray + Labels of each point. + """ + self.fit(x) + return self.labels_ + def predict(self, x: DNDarray) -> DNDarray: """ Return the label each sample in X belongs to. - X is transformed to the low-dim representation by calculation of eigenspectrum (eigenvalues and eigenvectors) of - the graph laplacian from the similarity matrix. Inference of lables is done by extraction of the closest + X is transformed to the low-dim representation by calculation of the embedding (eigenvectors corresponding to the lowest `n_clusters` eigenvalues) of + the graph laplacian from the similarity matrix. Inference of labels is done by extraction of the closest centroid of the n_clusters eigenvectors from the previously fitted clustering algorithm (kmeans). Parameters @@ -248,11 +351,10 @@ def predict(self, x: DNDarray) -> DNDarray: # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") - if x.split is not None and x.split != 0: - raise NotImplementedError("Not implemented for other splitting-axes") - - _, eigenvectors = self._spectral_embedding(x) + if x.is_distributed() and x.split != 0: + raise NotImplementedError(f"Distribution along axis {x.split} is not supported yet.") - components = eigenvectors[:, : self.n_clusters].copy() + # TODO is copy necessary? + components = self.__spectral_embedding(x, self.eigen_solver).copy() return self._cluster.predict(components) diff --git a/heat/cluster/tests/test_spectral.py b/heat/cluster/tests/test_spectral.py index 3bdabc2682..cacec16742 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/heat/cluster/tests/test_spectral.py @@ -9,26 +9,28 @@ class TestSpectral(TestCase): def test_clusterer(self): - spectral = ht.cluster.Spectral() + spectral = ht.cluster.SpectralClustering() self.assertTrue(ht.is_estimator(spectral)) self.assertTrue(ht.is_clusterer(spectral)) def test_get_and_set_params(self): - spectral = ht.cluster.Spectral() + spectral = ht.cluster.SpectralClustering() params = spectral.get_params() self.assertEqual( params, { "n_clusters": None, + "eigen_solver": "randomized", + "n_components": None, + "random_state": None, "gamma": 1.0, - "metric": "rbf", + "affinity": "rbf", "laplacian": "fully_connected", "threshold": 1.0, "boundary": "upper", "lanczos_n_iter": 300, "assign_labels": "kmeans", - "eigen_solver": "randomized", "reigh_rank": 100, "reigh_n_oversamples": 10, "reigh_power_iter": 0, @@ -40,40 +42,39 @@ def test_get_and_set_params(self): self.assertEqual(10, spectral.n_clusters) def test_fit_iris(self): - # skip on MPS, matmul on ComplexFloat not supported as of PyTorch 2.5 if not self.is_mps: # get some test data iris = ht.load("heat/datasets/iris.csv", sep=";", split=0) lanczosniter = 10 reighrank = 10 # fit the clusters - spectral = ht.cluster.Spectral( - n_clusters=3, gamma=1.0, metric="rbf", laplacian="fully_connected", eigen_solver="lanczos", lanczos_n_iter=lanczosniter + spectral = ht.cluster.SpectralClustering( + n_clusters=3, random_state=0, gamma=1.0, affinity="rbf", laplacian="fully_connected", eigen_solver="lanczos", lanczos_n_iter=lanczosniter ) spectral.fit(iris) self.assertIsInstance(spectral.labels_, ht.DNDarray) - spectral = ht.cluster.Spectral( - metric="euclidean", + spectral = ht.cluster.SpectralClustering( + eigen_solver="randomized", + affinity="euclidean", laplacian="eNeighbour", threshold=0.5, boundary="upper", - eigen_solver="randomized", reigh_rank=reighrank, ) labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) kmeans = {"kmeans++": "kmeans++", "max_iter": 30, "tol": -1} - spectral = ht.cluster.Spectral( + spectral = ht.cluster.SpectralClustering( n_clusters=3, gamma=1.0, normalize=True, params=kmeans ) labels = spectral.fit_predict(iris) self.assertIsInstance(labels, ht.DNDarray) - spectral = ht.cluster.Spectral( + spectral = ht.cluster.SpectralClustering( gamma=0.1, - metric="rbf", + affinity="rbf", laplacian="eNeighbour", threshold=0.5, boundary="upper", @@ -87,15 +88,15 @@ def test_fit_iris(self): # Errors with self.assertRaises(NotImplementedError): - spectral = ht.cluster.Spectral(eigen_solver="unknown") - - with self.assertRaises(ValueError): - spectral = ht.cluster.Spectral(eigen_solver="randomized", reigh_rank=2, n_clusters=5) + spectral = ht.cluster.SpectralClustering(affinity="mahalanobis", lanczos_n_iter=2) with self.assertRaises(NotImplementedError): - spectral = ht.cluster.Spectral(metric="ahalanobis", lanczos_n_iter=2) + spectral = ht.cluster.SpectralClustering(eigen_solver="unknown") + + with self.assertRaises(ValueError): + spectral = ht.cluster.SpectralClustering(eigen_solver="randomized", reigh_rank=2, n_clusters=5) iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) - spectral = ht.cluster.Spectral(eigen_solver="lanczos", lanczos_n_iter=20) + spectral = ht.cluster.SpectralClustering(eigen_solver="lanczos", lanczos_n_iter=20) with self.assertRaises(NotImplementedError): spectral.fit(iris_split) diff --git a/heat/core/manipulations.py b/heat/core/manipulations.py index 6f297515b7..53a173ab88 100644 --- a/heat/core/manipulations.py +++ b/heat/core/manipulations.py @@ -869,6 +869,10 @@ def diagonal(a: DNDarray, offset: int = 0, dim1: int = 0, dim2: int = 1) -> DNDa return factories.array(result, dtype=a.dtype, is_split=split, device=a.device, comm=a.comm) +DNDarray.diagonal = lambda self, offset=0, dim1=0, dim2=1: diagonal(self, offset, dim1, dim2) +DNDarray.diagonal.__doc__ = diagonal.__doc__ + + def dsplit(x: Sequence[DNDarray, ...], indices_or_sections: Iterable) -> List[DNDarray, ...]: """ Split array into multiple sub-DNDarrays along the 3rd axis (depth).