88
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Smallest number of rows the PCA projection path accepts before raising.
_MIN_PCA_SAMPLES = 2
# Norms/radii at or below this threshold are treated as zero so the
# rescaling code never divides by a near-zero value.
_EPS = 1e-7
13+
1114
1215class ProjectionEngine :
1316 """Engine for projecting high-dimensional embeddings to low-dimensional layouts."""
@@ -18,6 +21,61 @@ def l2_normalize_rows(self, embeddings: np.ndarray) -> np.ndarray:
1821 norms = np .maximum (norms , 1e-12 )
1922 return (embeddings / norms ).astype (np .float32 )
2023
def logmap_0_hyperboloid(
    self,
    embeddings: np.ndarray,
    curvature: float = 1.0,
) -> np.ndarray:
    """Map hyperboloid points to the tangent space at the origin.

    Each row is read as ``(t, x_1, ..., x_D)``: column 0 is the time-like
    coordinate and the remaining columns are the spatial part. The tangent
    vector keeps the direction of the spatial part and gets the length
    ``arccosh(sqrt(c) * t) / sqrt(c)``.

    Args:
        embeddings: Array of shape (N, D+1) with D >= 1.
        curvature: Curvature parameter ``c``; must be finite and positive.

    Returns:
        float32 array of shape (N, D). Rows whose spatial norm does not
        exceed ``_EPS`` map to the zero vector.

    Raises:
        ValueError: If ``embeddings`` is not 2-D with at least two columns,
            or if ``curvature`` is not a finite positive number.
    """
    if embeddings.ndim != 2 or embeddings.shape[1] < 2:
        raise ValueError(
            "embeddings must have shape (N, D+1) with D >= 1, "
            f"got {embeddings.shape} "
        )

    c = float(curvature)
    # sqrt(c) is used as a divisor below: c <= 0 or a non-finite c would
    # silently turn the whole result into NaN, so reject it up front.
    if not np.isfinite(c) or c <= 0.0:
        raise ValueError(
            f"curvature must be a finite positive number, got {curvature!r}"
        )
    sqrt_c = np.sqrt(c)

    # Do the intermediate math in float64; only the result is cast down.
    points = embeddings.astype(np.float64, copy=False)
    t = points[:, 0]
    x = points[:, 1:]

    # arccosh is only defined on [1, inf); clamp values that fall slightly
    # below 1 (e.g. from rounding) instead of producing NaN.
    arg = np.clip(sqrt_c * t, 1.0, None)
    theta = np.arccosh(arg) / sqrt_c

    norm_x = np.linalg.norm(x, axis=1)
    has_direction = norm_x > _EPS
    # np.where evaluates both branches, so substitute a harmless divisor
    # for rows that have (numerically) no spatial direction.
    safe_norm = np.where(has_direction, norm_x, 1.0)
    scale = np.where(has_direction, theta / safe_norm, 0.0)

    return (x * scale[:, np.newaxis]).astype(np.float32)
50+
def expmap_0_hyperboloid(
    self,
    tangent_vectors: np.ndarray,
    curvature: float = 1.0,
) -> np.ndarray:
    """Map tangent vectors at the origin back to the hyperboloid.

    For a tangent vector ``v`` with ``r = sqrt(c) * ||v||`` the output row is
    ``(cosh(r) / sqrt(c), (sinh(r) / r) * v)``.

    Args:
        tangent_vectors: Array of shape (N, D).
        curvature: Curvature parameter ``c``; must be finite and positive.

    Returns:
        float32 array of shape (N, D+1); column 0 holds the time-like
        coordinate and the remaining columns hold the spatial part.

    Raises:
        ValueError: If ``tangent_vectors`` is not 2-D, or if ``curvature``
            is not a finite positive number.
    """
    if tangent_vectors.ndim != 2:
        raise ValueError(
            f"tangent_vectors must be 2-D, got shape {tangent_vectors.shape} "
        )

    c = float(curvature)
    # sqrt(c) is used as a divisor below: c <= 0 or a non-finite c would
    # silently produce inf/NaN coordinates, so reject it up front.
    if not np.isfinite(c) or c <= 0.0:
        raise ValueError(
            f"curvature must be a finite positive number, got {curvature!r}"
        )
    sqrt_c = np.sqrt(c)

    # Do the intermediate math in float64; only the result is cast down.
    vectors = tangent_vectors.astype(np.float64, copy=False)
    norm_v = np.linalg.norm(vectors, axis=1)
    scaled_norm = sqrt_c * norm_v

    t = np.cosh(scaled_norm) / sqrt_c

    is_nonzero = scaled_norm > _EPS
    # np.where evaluates both branches, so substitute a harmless divisor
    # for (near-)zero vectors; sinh(r) / r tends to 1 as r -> 0.
    safe_scaled = np.where(is_nonzero, scaled_norm, 1.0)
    coeff = np.where(is_nonzero, np.sinh(scaled_norm) / safe_scaled, 1.0)
    spatial = vectors * coeff[:, np.newaxis]

    return np.column_stack([t, spatial]).astype(np.float32)
78+
2179 def to_poincare_ball (
2280 self ,
2381 hyperboloid_embeddings : np .ndarray ,
@@ -86,7 +144,7 @@ def project(
86144
87145 This separates two concerns:
88146 1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré)
89- 2) Dimensionality reduction / layout (currently UMAP)
147+ 2) Dimensionality reduction / layout (UMAP or PCA )
90148
91149 Args:
92150 embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
@@ -95,7 +153,7 @@ def project(
95153 n_components: Number of output dimensions.
96154 normalize_input: Whether to L2-normalize vectors before projection.
97155 curvature: Curvature parameter for hyperbolic embeddings (positive c).
98- method: Layout method (currently only 'umap ').
156+ method: Layout method ('umap' or 'pca ').
99157 n_neighbors: UMAP neighbors.
100158 min_dist: UMAP min_dist.
101159 metric: Input metric (used for euclidean inputs).
@@ -104,11 +162,24 @@ def project(
104162 Returns:
105163 Layout coordinates (N x n_components).
106164 """
107- if method != "umap" :
108- raise ValueError (f"Invalid method: { method } . Only 'umap' is supported." )
165+ if method not in ("umap" , "pca" ):
166+ raise ValueError (
167+ f"Invalid method: { method } . Supported methods: 'umap', 'pca'."
168+ )
109169 if n_components < 2 :
110170 raise ValueError (f"n_components must be >= 2, got { n_components } " )
111171
172+ if method == "pca" :
173+ return self ._project_pca (
174+ embeddings ,
175+ input_geometry = input_geometry ,
176+ output_geometry = output_geometry ,
177+ n_components = n_components ,
178+ normalize_input = normalize_input ,
179+ curvature = curvature or 1.0 ,
180+ verbose = verbose ,
181+ )
182+
112183 prepared = embeddings
113184 prepared_metric : str = metric
114185
@@ -148,6 +219,110 @@ def project(
148219 "Must be 'euclidean', 'poincare', or 'spherical'."
149220 )
150221
def _project_pca(
    self,
    embeddings: np.ndarray,
    *,
    input_geometry: str,
    output_geometry: str,
    n_components: int,
    normalize_input: bool,
    curvature: float,
    verbose: bool,
) -> np.ndarray:
    """Lay out embeddings with PCA and map them to the requested geometry.

    Steps: validate the arguments, prepare the input (log-map for
    hyperboloid input, optional L2 row normalization otherwise), run
    ``_pca_svd``, then post-process for the output geometry. The
    post-processing helpers (``to_poincare_ball``, ``_center_poincare``,
    ``_scale_poincare``, ``_normalize_coords``) are defined elsewhere in
    the class.

    Args:
        embeddings: Input rows to project.
        input_geometry: ``"hyperboloid"`` triggers the log-map; any other
            value is treated as plain vectors.
        output_geometry: ``"euclidean"``, ``"poincare"`` or ``"spherical"``.
        n_components: Number of output columns (must be 2 for Poincare).
        normalize_input: L2-normalize rows first; only applied when the
            input is not hyperboloid.
        curvature: Curvature forwarded to the hyperboloid maps.
        verbose: Log the explained variance ratios when true.

    Returns:
        float32 layout coordinates.

    Raises:
        ValueError: On an unknown output geometry, a non-2D Poincare
            request, too few samples, or non-finite input values.
    """
    allowed_outputs = ("euclidean", "poincare", "spherical")
    if output_geometry not in allowed_outputs:
        raise ValueError(
            f"Invalid output_geometry: {output_geometry} . "
            "Must be 'euclidean', 'poincare', or 'spherical'."
        )

    wants_poincare = output_geometry == "poincare"
    if wants_poincare and n_components != 2:
        raise ValueError("Poincare layouts currently require 2D output")

    if len(embeddings) < _MIN_PCA_SAMPLES:
        raise ValueError(
            f"PCA requires at least {_MIN_PCA_SAMPLES} samples, got {len(embeddings)} "
        )
    if not np.all(np.isfinite(embeddings)):
        raise ValueError("Embeddings contain NaN or Inf values.")

    # Prepare the matrix that PCA actually sees.
    from_hyperboloid = input_geometry == "hyperboloid"
    if from_hyperboloid:
        features = self.logmap_0_hyperboloid(embeddings, curvature=curvature)
    elif normalize_input:
        features = self.l2_normalize_rows(embeddings)
    else:
        features = embeddings

    coords, explained = self._pca_svd(features, n_components=n_components)

    if verbose:
        logger.info(
            "PCA explained variance ratio: [%s]",
            ", ".join(f"{ratio:.4f} " for ratio in explained),
        )

    # Euclidean / spherical outputs share the same normalization step.
    if not wants_poincare:
        layout = self._normalize_coords(coords)
        if output_geometry == "spherical":
            layout = self.l2_normalize_rows(layout)
        return layout.astype(np.float32)

    # Poincare output: hyperboloid input goes back through the manifold,
    # everything else is squashed into the disk directly.
    if from_hyperboloid:
        on_hyperboloid = self.expmap_0_hyperboloid(coords, curvature=curvature)
        disk = self.to_poincare_ball(on_hyperboloid, curvature=curvature)
    else:
        disk = self._map_to_poincare_disk(coords)

    disk = self._center_poincare(disk)
    disk = self._scale_poincare(disk, factor=0.65)
    return disk.astype(np.float32)
277+
def _pca_svd(
    self,
    data: np.ndarray,
    n_components: int = 2,
) -> tuple[np.ndarray, np.ndarray]:
    """Run PCA through NumPy's SVD, zero-padding components that do not exist.

    Args:
        data: Sample matrix with one row per sample.
        n_components: Number of output columns requested.

    Returns:
        A pair ``(coords, explained)``: ``coords`` is a float32 array with
        ``n_components`` columns holding the centered data projected onto
        the leading right-singular vectors (extra columns are zero when
        fewer components are available), and ``explained`` is a float32
        vector of explained-variance ratios, zero for padded components.
    """
    samples = data.astype(np.float64, copy=False)
    n_rows = len(samples)
    centered = samples - samples.mean(axis=0, keepdims=True)

    _, sing_vals, components = np.linalg.svd(centered, full_matrices=False)

    # The SVD cannot deliver more directions than it has rows in `components`.
    n_kept = min(n_components, components.shape[0])

    # Start from an all-zero layout so missing components are already padded.
    coords = np.zeros((n_rows, n_components), dtype=np.float64)
    if n_kept > 0:
        coords[:, :n_kept] = centered @ components[:n_kept].T

    # Per-component variance from the singular values (sample variance,
    # with the divisor floored at 1).
    eigenvalues = np.square(sing_vals) / max(n_rows - 1, 1)
    total = float(eigenvalues.sum())

    ratios = np.zeros(n_components, dtype=np.float32)
    if n_kept > 0 and total > 0:
        ratios[:n_kept] = eigenvalues[:n_kept] / total

    return coords.astype(np.float32), ratios
306+
def _map_to_poincare_disk(
    self,
    coords_2d: np.ndarray,
    alpha: float = 1.0,
) -> np.ndarray:
    """Squash 2D Euclidean coordinates into the Poincare disk.

    The coordinates are first divided by their largest absolute entry
    (skipped when that entry does not exceed ``_EPS``), then every point is
    moved along its ray from the origin so that its new radius is
    ``tanh(alpha * radius)``.

    Args:
        coords_2d: Array of shape (N, 2).
        alpha: Multiplier applied to the radius before ``tanh``.

    Returns:
        float32 array of shape (N, 2). Points whose radius does not exceed
        ``_EPS`` are mapped to the origin.

    Raises:
        ValueError: If ``coords_2d`` does not have shape (N, 2).
    """
    if not (coords_2d.ndim == 2 and coords_2d.shape[1] == 2):
        raise ValueError(f"coords_2d must have shape (N, 2), got {coords_2d.shape} ")

    peak = np.abs(coords_2d).max()
    if peak <= _EPS:
        # Degenerate layout: nothing meaningful to rescale by.
        unit_box = coords_2d
    else:
        unit_box = coords_2d / peak

    radius = np.linalg.norm(unit_box, axis=1)
    off_origin = radius > _EPS
    # np.where evaluates both branches, so divide by 1 where the radius is
    # (numerically) zero; those rows receive a scale of 0 anyway.
    divisor = np.where(off_origin, radius, 1.0)
    shrink = np.where(off_origin, np.tanh(alpha * radius) / divisor, 0.0)

    return (unit_box * shrink[:, None]).astype(np.float32)
325+
151326 def project_umap (
152327 self ,
153328 embeddings : np .ndarray ,
0 commit comments