Skip to content

Commit dee5866

Browse files
committed
Integrate PCA projection support from PR #24
1 parent a9e30cc commit dee5866

File tree

5 files changed

+447
-20
lines changed

5 files changed

+447
-20
lines changed

src/hyperview/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,9 @@ def _build_parser() -> argparse.ArgumentParser:
119119
)
120120
parser.add_argument(
121121
"--method",
122-
choices=["umap"],
122+
choices=["umap", "pca"],
123123
default="umap",
124-
help="Projection method (currently only 'umap')",
124+
help="Projection method: 'umap' (default) or 'pca'",
125125
)
126126
parser.add_argument(
127127
"--layout",

src/hyperview/core/dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -588,12 +588,12 @@ def compute_visualization(
588588
589589
Args:
590590
space_key: Embedding space to project. If None, uses the first available.
591-
method: Projection method ('umap' supported).
591+
method: Projection method ('umap' or 'pca').
592592
layout: Layout spec like 'euclidean', 'euclidean:3d', or 'spherical'.
593593
Omitting the suffix defaults to 2D for Euclidean/Poincare and 3D for spherical.
594-
n_neighbors: Number of neighbors for UMAP.
595-
min_dist: Minimum distance for UMAP.
596-
metric: Distance metric for UMAP.
594+
n_neighbors: Number of neighbors for UMAP (ignored for PCA).
595+
min_dist: Minimum distance for UMAP (ignored for PCA).
596+
metric: Distance metric for UMAP (ignored for PCA).
597597
force: Force recomputation even if layout exists.
598598
599599
Returns:

src/hyperview/embeddings/pipelines.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def compute_layout(
137137
Args:
138138
storage: Storage backend with embeddings.
139139
space_key: Embedding space to project. If None, uses the first available.
140-
method: Projection method ('umap' supported).
140+
method: Projection method ('umap' or 'pca').
141141
geometry: Output geometry type ('euclidean', 'poincare', or 'spherical').
142142
layout_dimension: Visualization dimension (2D or 3D).
143143
n_neighbors: Number of neighbors for UMAP.
@@ -154,8 +154,10 @@ def compute_layout(
154154
"""
155155
from hyperview.embeddings.projection import ProjectionEngine
156156

157-
if method != "umap":
158-
raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
157+
if method not in ("umap", "pca"):
158+
raise ValueError(
159+
f"Invalid method: {method}. Supported methods: 'umap', 'pca'."
160+
)
159161
layout_dimension = normalize_layout_dimension(layout_dimension)
160162

161163
if geometry not in ("euclidean", "poincare", "spherical"):
@@ -191,14 +193,21 @@ def compute_layout(
191193
if len(ids) == 0:
192194
raise ValueError(f"No embeddings in space '{space_key}'. Call compute_embeddings() first.")
193195

194-
if len(ids) < 3:
195-
raise ValueError(f"Need at least 3 samples for visualization, have {len(ids)}")
196+
min_samples = 3 if method == "umap" else 2
197+
if len(ids) < min_samples:
198+
raise ValueError(
199+
f"Need at least {min_samples} samples for {method} visualization, have {len(ids)}"
200+
)
196201

197-
layout_params = {
198-
"n_neighbors": n_neighbors,
199-
"min_dist": min_dist,
200-
"metric": metric,
201-
}
202+
layout_params: dict[str, Any] | None
203+
if method == "umap":
204+
layout_params = {
205+
"n_neighbors": n_neighbors,
206+
"min_dist": min_dist,
207+
"metric": metric,
208+
}
209+
else:
210+
layout_params = None
202211

203212
normalize_input = geometry == "spherical"
204213

src/hyperview/embeddings/projection.py

Lines changed: 179 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
logger = logging.getLogger(__name__)
1010

11+
# Smallest number of samples the PCA projection path accepts; fewer raises ValueError.
_MIN_PCA_SAMPLES = 2
12+
# Norms at or below this threshold are treated as zero so they are never used as divisors.
_EPS = 1e-7
13+
1114

1215
class ProjectionEngine:
1316
"""Engine for projecting high-dimensional embeddings to low-dimensional layouts."""
@@ -18,6 +21,61 @@ def l2_normalize_rows(self, embeddings: np.ndarray) -> np.ndarray:
1821
norms = np.maximum(norms, 1e-12)
1922
return (embeddings / norms).astype(np.float32)
2023

24+
def logmap_0_hyperboloid(
    self,
    embeddings: np.ndarray,
    curvature: float = 1.0,
) -> np.ndarray:
    """Map hyperboloid points to the tangent space at the origin.

    Each row is read as ``(t, x_1, ..., x_D)``: column 0 is the time-like
    coordinate and the remaining columns are the spatial part. The result
    is the spatial direction rescaled to length ``arccosh(sqrt(c) * t) / sqrt(c)``.

    Args:
        embeddings: Array of shape (N, D+1) with D >= 1.
        curvature: Positive curvature parameter ``c``.

    Returns:
        float32 array of shape (N, D) holding the tangent vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D with at least two columns,
            or if ``curvature`` is not a positive finite number.
    """
    if embeddings.ndim != 2 or embeddings.shape[1] < 2:
        raise ValueError(
            "embeddings must have shape (N, D+1) with D >= 1, "
            f"got {embeddings.shape}"
        )

    c = float(curvature)
    # A non-positive or non-finite curvature would make sqrt/arccosh produce
    # NaN for every row; fail loudly instead of returning a NaN layout.
    if not (np.isfinite(c) and c > 0.0):
        raise ValueError(f"curvature must be a positive finite number, got {curvature}")
    sqrt_c = np.sqrt(c)

    t = embeddings[:, 0]
    x = embeddings[:, 1:]

    # arccosh is only defined on [1, inf); clamp so rows whose time coordinate
    # falls slightly below 1/sqrt(c) do not turn into NaN.
    arg = np.clip(sqrt_c * t, 1.0, None)
    theta = np.arccosh(arg) / sqrt_c

    norm_x = np.linalg.norm(x, axis=1)
    # Rows with (near-)zero spatial norm map to the zero tangent vector; the
    # substitute divisor of 1.0 only keeps the division free of warnings.
    safe_norm = np.where(norm_x > _EPS, norm_x, 1.0)
    scale = np.where(norm_x > _EPS, theta / safe_norm, 0.0)

    return (x * scale[:, np.newaxis]).astype(np.float32)
50+
51+
def expmap_0_hyperboloid(
    self,
    tangent_vectors: np.ndarray,
    curvature: float = 1.0,
) -> np.ndarray:
    """Map tangent vectors at the origin back to the hyperboloid.

    For a tangent vector ``v`` the output row is ``(t, x)`` with
    ``t = cosh(sqrt(c) * |v|) / sqrt(c)`` and
    ``x = v * sinh(sqrt(c) * |v|) / (sqrt(c) * |v|)``.

    Args:
        tangent_vectors: Array of shape (N, D).
        curvature: Positive curvature parameter ``c``.

    Returns:
        float32 array of shape (N, D+1); column 0 is the time-like coordinate.

    Raises:
        ValueError: If ``tangent_vectors`` is not 2-D, or if ``curvature``
            is not a positive finite number.
    """
    if tangent_vectors.ndim != 2:
        raise ValueError(
            f"tangent_vectors must be 2-D, got shape {tangent_vectors.shape}"
        )

    c = float(curvature)
    # sqrt_c is used as a divisor below; zero, negative or non-finite values
    # would silently produce inf/NaN coordinates, so reject them up front.
    if not (np.isfinite(c) and c > 0.0):
        raise ValueError(f"curvature must be a positive finite number, got {curvature}")
    sqrt_c = np.sqrt(c)

    norm_v = np.linalg.norm(tangent_vectors, axis=1)
    scaled_norm = sqrt_c * norm_v

    t = np.cosh(scaled_norm) / sqrt_c
    # sinh(r) / r tends to 1 as r -> 0, so near-zero vectors use a coefficient
    # of 1.0 instead of evaluating 0 / 0.
    safe_scaled = np.where(scaled_norm > _EPS, scaled_norm, 1.0)
    coeff = np.where(
        scaled_norm > _EPS,
        np.sinh(scaled_norm) / safe_scaled,
        1.0,
    )
    spatial = tangent_vectors * coeff[:, np.newaxis]

    return np.column_stack([t, spatial]).astype(np.float32)
78+
2179
def to_poincare_ball(
2280
self,
2381
hyperboloid_embeddings: np.ndarray,
@@ -86,7 +144,7 @@ def project(
86144
87145
This separates two concerns:
88146
1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré)
89-
2) Dimensionality reduction / layout (currently UMAP)
147+
2) Dimensionality reduction / layout (UMAP or PCA)
90148
91149
Args:
92150
embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
@@ -95,7 +153,7 @@ def project(
95153
n_components: Number of output dimensions.
96154
normalize_input: Whether to L2-normalize vectors before projection.
97155
curvature: Curvature parameter for hyperbolic embeddings (positive c).
98-
method: Layout method (currently only 'umap').
156+
method: Layout method ('umap' or 'pca').
99157
n_neighbors: UMAP neighbors.
100158
min_dist: UMAP min_dist.
101159
metric: Input metric (used for euclidean inputs).
@@ -104,11 +162,24 @@ def project(
104162
Returns:
105163
Layout coordinates (N x n_components).
106164
"""
107-
if method != "umap":
108-
raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
165+
if method not in ("umap", "pca"):
166+
raise ValueError(
167+
f"Invalid method: {method}. Supported methods: 'umap', 'pca'."
168+
)
109169
if n_components < 2:
110170
raise ValueError(f"n_components must be >= 2, got {n_components}")
111171

172+
if method == "pca":
173+
return self._project_pca(
174+
embeddings,
175+
input_geometry=input_geometry,
176+
output_geometry=output_geometry,
177+
n_components=n_components,
178+
normalize_input=normalize_input,
179+
curvature=curvature or 1.0,
180+
verbose=verbose,
181+
)
182+
112183
prepared = embeddings
113184
prepared_metric: str = metric
114185

@@ -148,6 +219,110 @@ def project(
148219
"Must be 'euclidean', 'poincare', or 'spherical'."
149220
)
150221

222+
def _project_pca(
    self,
    embeddings: np.ndarray,
    *,
    input_geometry: str,
    output_geometry: str,
    n_components: int,
    normalize_input: bool,
    curvature: float,
    verbose: bool,
) -> np.ndarray:
    """Reduce embeddings with PCA and express the result in the target geometry.

    Hyperboloid inputs are first moved to the tangent space at the origin;
    any other input is used as-is (optionally L2-normalized per row). The
    PCA coordinates are then post-processed according to ``output_geometry``.

    Args:
        embeddings: Input embeddings, one row per sample.
        input_geometry: ``'hyperboloid'`` triggers the log-map; other values
            are treated as plain vectors.
        output_geometry: ``'euclidean'``, ``'poincare'`` or ``'spherical'``.
        n_components: Number of output dimensions (must be 2 for Poincare).
        normalize_input: L2-normalize rows before PCA (non-hyperboloid only).
        curvature: Curvature passed to the hyperboloid maps.
        verbose: Log the explained variance ratios when true.

    Returns:
        float32 layout coordinates.

    Raises:
        ValueError: On an unknown output geometry, a non-2D Poincare request,
            too few samples, or non-finite input values.
    """
    wants_poincare = output_geometry == "poincare"
    if output_geometry not in ("euclidean", "poincare", "spherical"):
        raise ValueError(
            f"Invalid output_geometry: {output_geometry}. "
            "Must be 'euclidean', 'poincare', or 'spherical'."
        )
    if wants_poincare and n_components != 2:
        raise ValueError("Poincare layouts currently require 2D output")

    n_samples = len(embeddings)
    if n_samples < _MIN_PCA_SAMPLES:
        raise ValueError(
            f"PCA requires at least {_MIN_PCA_SAMPLES} samples, got {n_samples}"
        )
    if not np.isfinite(embeddings).all():
        raise ValueError("Embeddings contain NaN or Inf values.")

    # Choose the matrix PCA runs on.
    from_hyperboloid = input_geometry == "hyperboloid"
    if from_hyperboloid:
        features = self.logmap_0_hyperboloid(embeddings, curvature=curvature)
    elif normalize_input:
        features = self.l2_normalize_rows(embeddings)
    else:
        features = embeddings

    coords, explained = self._pca_svd(features, n_components=n_components)

    if verbose:
        logger.info(
            "PCA explained variance ratio: [%s]",
            ", ".join(f"{ratio:.4f}" for ratio in explained),
        )

    # Euclidean / spherical outputs share the coordinate normalization step.
    if not wants_poincare:
        layout = self._normalize_coords(coords)
        if output_geometry == "spherical":
            layout = self.l2_normalize_rows(layout)
        return layout.astype(np.float32)

    # Poincare output: hyperboloid inputs go back through the exp-map so the
    # ball conversion sees hyperboloid points; other inputs are squashed
    # directly into the disk.
    if from_hyperboloid:
        on_hyperboloid = self.expmap_0_hyperboloid(coords, curvature=curvature)
        disk = self.to_poincare_ball(on_hyperboloid, curvature=curvature)
    else:
        disk = self._map_to_poincare_disk(coords)

    disk = self._center_poincare(disk)
    disk = self._scale_poincare(disk, factor=0.65)
    return disk.astype(np.float32)
277+
278+
def _pca_svd(
    self,
    data: np.ndarray,
    n_components: int = 2,
) -> tuple[np.ndarray, np.ndarray]:
    """Run PCA through a thin SVD of the mean-centered data.

    When fewer principal axes exist than ``n_components``, the missing
    coordinate columns (and their explained-variance entries) are zero.

    Args:
        data: Sample matrix, one row per sample.
        n_components: Number of output columns.

    Returns:
        ``(coords, explained)``: float32 coordinates of shape
        (N, n_components) and the float32 explained-variance ratio per
        component.
    """
    samples = data.astype(np.float64, copy=False)
    n_samples = len(samples)
    centered = samples - samples.mean(axis=0, keepdims=True)

    _, sigma, axes = np.linalg.svd(centered, full_matrices=False)

    # Number of real principal axes available; the rest stays zero-filled.
    n_kept = min(n_components, axes.shape[0])
    coords = np.zeros((n_samples, n_components), dtype=np.float64)
    if n_kept > 0:
        coords[:, :n_kept] = centered @ axes[:n_kept].T

    # Sample variance along each axis (divisor guarded for a single sample).
    variances = sigma**2 / max(n_samples - 1, 1)
    ratios = np.zeros(n_components, dtype=np.float32)
    total = float(variances.sum())
    if n_kept > 0 and total > 0:
        ratios[:n_kept] = variances[:n_kept] / total

    return coords.astype(np.float32), ratios
306+
307+
def _map_to_poincare_disk(
    self,
    coords_2d: np.ndarray,
    alpha: float = 1.0,
) -> np.ndarray:
    """Squash 2D Euclidean coordinates into the Poincare disk.

    Coordinates are divided by their largest absolute entry, then each
    point's radius ``r`` is replaced by ``tanh(alpha * r)`` while its
    direction is kept.

    Args:
        coords_2d: Array of shape (N, 2).
        alpha: Multiplier applied to the radius before ``tanh``.

    Returns:
        float32 array of shape (N, 2).

    Raises:
        ValueError: If ``coords_2d`` does not have shape (N, 2).
    """
    if not (coords_2d.ndim == 2 and coords_2d.shape[1] == 2):
        raise ValueError(f"coords_2d must have shape (N, 2), got {coords_2d.shape}")

    # Rescale by the largest absolute coordinate unless the layout is
    # (near-)degenerate, in which case it is left untouched.
    peak = np.abs(coords_2d).max()
    if peak <= _EPS:
        unit_box = coords_2d
    else:
        unit_box = coords_2d / peak

    r = np.linalg.norm(unit_box, axis=1)
    has_radius = r > _EPS
    squashed = np.tanh(alpha * r)
    # Points at the origin keep a zero factor; the 1.0 divisor is a placeholder
    # that avoids dividing by ~0.
    factor = np.where(has_radius, squashed / np.where(has_radius, r, 1.0), 0.0)

    return (unit_box * factor[:, np.newaxis]).astype(np.float32)
325+
151326
def project_umap(
152327
self,
153328
embeddings: np.ndarray,

0 commit comments

Comments
 (0)