From a9de25220b27317178f8ea6a9004b95c12736d35 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 14:56:23 +0000
Subject: [PATCH 01/23] Adding an initial test knn weighter

---
 weighting/knn_weighter.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 weighting/knn_weighter.py

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
new file mode 100644
index 0000000..e69de29

From c973acdffc089df00e6dc1e19519ef5e0959dd90 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:42:14 +0000
Subject: [PATCH 02/23] Testing HMF weighting

---
 weighting/knn_weighter.py       | 1078 +++++++++++++++++++++++++++++++
 weighting/test_hmf_weighting.py |  634 ++++++++++++++++++
 2 files changed, 1712 insertions(+)
 create mode 100644 weighting/test_hmf_weighting.py

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index e69de29..9f25a47 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -0,0 +1,1078 @@
+"""k-NN reweighting of a biased sample to match a parent population.
+
+This script reads two HDF5 files:
+
+* A "parent" (global) population.
+* A "sample" population (which may be biased).
+
+The set of kernels is defined by the sample file: every group
+``Grids/Kernel_*`` in the sample must also exist in the parent. Each such
+group must contain a dataset ``GridPointOverDensities``.
+
+For each kernel:
+
+* Read overdensities from parent and sample.
+* Transform as ``log10(1 + delta)``.
+* Stack over all kernels into a multi-dimensional feature space.
+
+We then use k-NN density-ratio estimation to compute weights for the sample
+so that its multi-scale environment distribution matches the parent.
+
+An optional auto-tuning step searches over candidate k values, choosing the
+one that:
+
+* Satisfies an effective sample size (ESS) floor, if possible.
+* Minimizes the mean 1D KS distance between parent and weighted sample
+  across all dimensions.
+
+Example
+-------
+    python knn_weighter.py \\
+        --parent parent.hdf5 \\
+        --sample sample.hdf5 \\
+        --autotune_k 35,50,80,100 \\
+        --ess_floor 0.20
+"""
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import h5py
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+# ---------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------
+
+
+def effective_sample_size(w: np.ndarray) -> float:
+    """Compute the effective sample size (ESS) of a weight vector.
+
+    ESS is defined as:
+
+    .. math::
+
+        \\text{ESS} = \\frac{(\\sum_i w_i)^2}{\\sum_i w_i^2}.
+
+    Args:
+        w: One-dimensional array of non-negative weights.
+
+    Returns:
+        Effective sample size as a float.
+    """
+    s = float(w.sum())
+    return (s * s) / float(np.sum(w * w) + 1e-18)
+
+
+def weighted_ks(
+    x_parent: np.ndarray,
+    x_sample: np.ndarray,
+    w_sample: np.ndarray,
+    rng: np.random.Generator,
+    n_parent: int = 100_000,
+    n_sample: int = 100_000,
+) -> float:
+    """Compute a weighted 1D Kolmogorov–Smirnov distance.
+
+    The parent distribution is unweighted; the sample distribution is
+    weighted. For speed, both are optionally downsampled before computing
+    the KS distance.
+
+    Args:
+        x_parent: Parent data values (1D).
+        x_sample: Sample data values (1D).
+        w_sample: Weights for ``x_sample``, same length as ``x_sample``.
+        rng: NumPy random number generator instance.
+        n_parent: Maximum number of parent points to use.
+        n_sample: Maximum number of sample points to use.
+
+    Returns:
+        The KS distance as a float.
+    """
+    # Downsample parent.
+    if len(x_parent) > n_parent:
+        xp = x_parent[rng.choice(len(x_parent), n_parent, replace=False)]
+    else:
+        xp = x_parent
+
+    # Downsample sample + weights.
+    if len(x_sample) > n_sample:
+        idx = rng.choice(len(x_sample), n_sample, replace=False)
+        xs = x_sample[idx]
+        ws = w_sample[idx]
+    else:
+        xs, ws = x_sample, w_sample
+
+    # Sort and build CDFs.
+    xp = np.sort(xp)
+    order = np.argsort(xs)
+    xs = xs[order]
+    ws = np.clip(ws[order], 0, None)
+    ws_sum = ws.sum()
+    if ws_sum < 1e-15:
+        # All weights are effectively zero - return worst-case KS distance
+        return 1.0
+    ws /= ws_sum
+    cdf_s = np.cumsum(ws)
+
+    grid = np.unique(np.concatenate([xp, xs]))
+    cdf_p = np.searchsorted(xp, grid, side="right") / len(xp)
+
+    idx = np.searchsorted(xs, grid, side="right") - 1
+    idx = np.clip(idx, -1, len(xs) - 1)
+    cdf_s_at_grid = np.where(idx >= 0, cdf_s[idx], 0.0)
+
+    return float(np.max(np.abs(cdf_p - cdf_s_at_grid)))
+
+
+def whiten_by_parent(
+    Xp: np.ndarray,
+    Xs: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Apply parent-covariance whitening (Mahalanobis transform).
+
+    The parent distribution defines the mean and covariance used to whiten
+    both the parent and the sample data:
+
+    .. math::
+
+        X' = (X - \\mu_\\text{parent}) L^{-\\top},
+
+    where ``L`` is the Cholesky factor of the parent covariance, so that
+    Euclidean distances in the whitened space are Mahalanobis distances.
+
+    Args:
+        Xp: Parent data array of shape ``(N, D)``.
+        Xs: Sample data array of shape ``(M, D)``.
+
+    Returns:
+        Tuple ``(Xp_whitened, Xs_whitened)`` with the same shapes as inputs.
+    """
+    mu = Xp.mean(axis=0)
+    Xm = Xp - mu
+    Sigma = np.cov(Xm, rowvar=False)
+    d = Sigma.shape[0]
+    # Tiny ridge for numerical stability.
+    Sigma = Sigma + (1e-12 * np.trace(Sigma) / max(d, 1)) * np.eye(d)
+    L = np.linalg.cholesky(Sigma)
+    # Multiply by L^{-T} (not L^{-1}) so the whitened covariance is the
+    # identity: cov((X - mu) L^{-T}) = L^{-1} Sigma L^{-T} = I.
+    Linv = np.linalg.inv(L)
+    return (Xp - mu) @ Linv.T, (Xs - mu) @ Linv.T
+
+
+def drop_nan_rows_per_scale(
+    samples: Dict[float, np.ndarray],
+) -> Dict[float, np.ndarray]:
+    """Remove entries that are NaN in any dimension for a population.
+
+    For a dictionary of 1D arrays (one per smoothing scale), this function
+    removes indices where *any* scale is NaN, ensuring consistent length
+    across all scales.
+
+    Args:
+        samples: Mapping from scale (e.g., in Mpc/h) to 1D NumPy arrays of
+            equal length.
+
+    Returns:
+        New dictionary with NaN-containing entries removed consistently
+        across all scales.
+
+    Raises:
+        ValueError: If the arrays in ``samples`` do not all have the same
+            length.
+    """
+    if not samples:
+        return samples
+
+    first = next(iter(samples.values()))
+    length = len(first)
+    mask = np.ones(length, dtype=bool)
+
+    for arr in samples.values():
+        if len(arr) != length:
+            raise ValueError(
+                "All arrays in a population must have the same length; "
+                "found mismatched lengths."
+            )
+        mask &= ~np.isnan(arr)
+
+    n_dropped = length - np.sum(mask)
+    if n_dropped > 0:
+        print(
+            f"[NaN] Dropped {n_dropped:,}/{length:,} "
+            f"({100.0 * n_dropped / length:.2f}%) entries containing NaN."
+        )
+
+    return {scale: arr[mask] for scale, arr in samples.items()}
+
+
+# ---------------------------------------------------------------------
+# Core k-NN reweighter with auto-tuning
+# ---------------------------------------------------------------------
+
+
+@dataclass
+class KNNReweighterConfig:
+    """Configuration for :class:`KNNReweighter`.
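+
+    A minimal usage sketch (values are illustrative, not tuned
+    recommendations):
+
+    .. code-block:: python
+
+        cfg = KNNReweighterConfig(autotune_k=[35, 50, 80], ess_floor=0.2)
+        weights, info = KNNReweighter(cfg).fit(parent_samples, sample_samples)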
+ + Attributes: + k: Default number of neighbors for k-NN if auto-tuning is disabled. + autotune_k: Candidate k values to try when auto-tuning. If None, + a single fixed k is used. + ess_floor: Minimum ESS fraction (ESS / M) for a candidate k to be + considered "feasible" during auto-tuning. + preprocess: Preprocessing mode: ``"whiten"``, ``"standardize"``, or + ``"none"``. + standardize_by_parent: If True and ``preprocess == "standardize"``, + use parent statistics only; otherwise, use pooled stats. + algorithm: Nearest-neighbors algorithm (as in scikit-learn). + metric: Distance metric (e.g., ``"minkowski"``, ``"euclidean"``). + leaf_size: Leaf size for tree-based nearest-neighbor algorithms. + tau: Temperature exponent applied to the weights (``w := w**tau``). + Use tau < 1 to reduce weight variance (smoother, higher ESS). + Use tau > 1 to increase contrast (sharper weights, lower ESS). + Default tau=1 preserves density ratio exactly. + clip_range: Tuple ``(min, max)`` for clipping weights. + ks_eval_parent: Maximum number of parent points for KS evaluation. + ks_eval_sample: Maximum number of sample points for KS evaluation. + random_state: Random seed for diagnostics and KS subsampling. + """ + + # k / tuning + k: int = 50 + autotune_k: Optional[List[int]] = None + ess_floor: float = 0.20 # as fraction of M + + # preprocessing + preprocess: str = "whiten" # whiten | standardize | none + standardize_by_parent: bool = True # only used if preprocess=standardize + + # neighbors + algorithm: str = "auto" + metric: str = "minkowski" + leaf_size: int = 40 + + # stabilization + tau: float = 1.0 # tau < 1: reduce variance, tau > 1: increase contrast + clip_range: Tuple[float, float] = (0.01, 50.0) + + # diagnostics + ks_eval_parent: int = 100_000 + ks_eval_sample: int = 100_000 + random_state: int = 42 + + +class KNNReweighter: + """k-NN density-ratio reweighter for multi-scale environments.""" + + def __init__(self, cfg: KNNReweighterConfig): + """Initialize the reweighter. + + Args: + cfg: Configuration object controlling k-NN and diagnostics. + """ + self.cfg = cfg + self.scales: List[float] = [] + self.weights_: Optional[np.ndarray] = None + self.info_: Optional[Dict] = None + + def fit( + self, + parent_samples: Dict[float, np.ndarray], + sample_samples: Dict[float, np.ndarray], + ) -> Tuple[np.ndarray, Dict]: + """Compute weights that map the sample to the parent distribution. + + The features are constructed by column-stacking the overdensities at + different smoothing scales. + + Args: + parent_samples: Mapping from scale to parent overdensity array + (1D, same length per scale). + sample_samples: Mapping from scale to sample overdensity array + (1D, same length per scale). + + Returns: + Tuple ``(weights, info)``: + * ``weights``: 1D array of length M with reweighting factors. + * ``info``: Dictionary of diagnostics and configuration values. + + Raises: + ValueError: If the parent and sample do not share the same set + of scales. + """ + if set(parent_samples.keys()) != set(sample_samples.keys()): + raise ValueError( + "parent_samples and sample_samples must have identical keys " + "(same set of smoothing scales)." 
+ ) + + self.scales = sorted(parent_samples.keys()) + Xp = np.column_stack([parent_samples[s] for s in self.scales]) # [N, D] + Xs = np.column_stack([sample_samples[s] for s in self.scales]) # [M, D] + N, d = Xp.shape + M = Xs.shape[0] + rng = np.random.default_rng(self.cfg.random_state) + + print(f"[kNN] {d}D features | Parent N={N:,} | Sample M={M:,}") + Xp_prep, Xs_prep = self._preprocess(Xp, Xs) + + # Solve for weights, optionally auto-tuning k. + if self.cfg.autotune_k: + best_k, w, tuning_info = self._autotune_k( + Xp_prep, Xs_prep, Xp, Xs, N, M, d, rng + ) + k_used = best_k + else: + k_used = self.cfg.k + w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k_used) + tuning_info = None + + # Basic diagnostics: ESS, clipping, KS. + ess = effective_sample_size(w) + clip_frac = float( + np.mean( + (w <= self.cfg.clip_range[0] + 1e-15) + | (w >= self.cfg.clip_range[1] - 1e-15) + ) + ) + + ks_vals = [ + weighted_ks( + Xp[:, j], + Xs[:, j], + w, + rng, + n_parent=min(self.cfg.ks_eval_parent, N), + n_sample=min(self.cfg.ks_eval_sample, M), + ) + for j in range(d) + ] + ks_mean = float(np.mean(ks_vals)) + + info = { + "method": "knn_density_ratio", + "k": int(k_used), + "ks_mean": ks_mean, + "ks_per_dim": ks_vals, + "preprocess": self.cfg.preprocess, + "standardize_by_parent": self.cfg.standardize_by_parent, + "scales": self.scales, + "parent_samples": N, + "sample_samples": M, + "algorithm": self.cfg.algorithm, + "metric": self.cfg.metric, + "leaf_size": self.cfg.leaf_size, + "tau": self.cfg.tau, + "clip_range": self.cfg.clip_range, + "clip_fraction": clip_frac, + "ess": float(ess), + "mean_weight": float(np.mean(w)), + "std_weight": float(np.std(w)), + "min_weight": float(np.min(w)), + "max_weight": float(np.max(w)), + "tuning": tuning_info, + } + + print( + f"[kNN] k={k_used} | KS(mean)={ks_mean:.4f} | " + f"ESS={ess:,.0f}/{M:,} | clipped={100 * clip_frac:.1f}%" + ) + + self.weights_ = w + self.info_ = info + return w, info + + # --------------------- internals --------------------- + + def _preprocess( + self, + Xp: np.ndarray, + Xs: np.ndarray, + ) -> Tuple[np.ndarray, np.ndarray]: + """Apply the requested preprocessing to parent and sample data. + + Args: + Xp: Parent data array of shape ``(N, D)``. + Xs: Sample data array of shape ``(M, D)``. + + Returns: + Tuple ``(Xp_processed, Xs_processed)``. + + Raises: + ValueError: If ``preprocess`` is not one of the supported modes. + """ + mode = self.cfg.preprocess.lower() + + if mode == "whiten": + print("[kNN] Preprocess: parent-covariance whitening.") + return whiten_by_parent(Xp, Xs) + + if mode == "standardize": + if self.cfg.standardize_by_parent: + mu = Xp.mean(axis=0) + sd = Xp.std(axis=0) + 1e-12 + src = "parent" + else: + both = np.vstack([Xp, Xs]) + mu = both.mean(axis=0) + sd = both.std(axis=0) + 1e-12 + src = "pooled" + print(f"[kNN] Preprocess: z-score (by {src} stats).") + return (Xp - mu) / sd, (Xs - mu) / sd + + if mode == "none": + print("[kNN] Preprocess: none.") + return Xp, Xs + + raise ValueError('preprocess must be "whiten", "standardize", or "none"') + + def _knn_dists( + self, + X_query: np.ndarray, + X_ref: np.ndarray, + *, + k: int, + ) -> np.ndarray: + """Compute distances to the k nearest neighbors. + + Args: + X_query: Query points of shape ``(Q, D)``. + X_ref: Reference points of shape ``(R, D)``. + k: Number of neighbors to retrieve. + + Returns: + Array of shape ``(Q, k)`` with distances to the k nearest neighbors. 
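+
+        Note:
+            Distances come back sorted, so column ``[:, -1]`` is the k-th
+            neighbor distance used in the density-ratio estimate.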
+ """ + nn = NearestNeighbors( + n_neighbors=k, + algorithm=self.cfg.algorithm, + leaf_size=self.cfg.leaf_size, + metric=self.cfg.metric, + ).fit(X_ref) + return nn.kneighbors(X_query, return_distance=True)[0] + + def _weights_for_k( + self, + Xp_prep: np.ndarray, + Xs_prep: np.ndarray, + N: int, + M: int, + d: int, + k: int, + ) -> np.ndarray: + """Compute stabilized weights for a single k. + + The core k-NN density-ratio formula is: + + .. math:: + + w_i \\propto \\frac{M}{N}\\left(\\frac{r_\\text{sample, i}} + {r_\\text{parent, i}}\\right)^d, + + with optional temperature exponent, clipping, and renormalization. + + Args: + Xp_prep: Preprocessed parent data, shape ``(N, D)``. + Xs_prep: Preprocessed sample data, shape ``(M, D)``. + N: Number of parent points. + M: Number of sample points. + d: Dimensionality of the feature space. + k: Number of neighbors. + + Returns: + One-dimensional array of weights of length M. + + Raises: + ValueError: If k is out of valid range. + """ + if k < 1: + raise ValueError(f"k must be >= 1, got k={k}") + if k >= N: + raise ValueError( + f"k must be < N (parent size), got k={k}, N={N}" + ) + if k + 1 > M: + raise ValueError( + f"k+1 must be <= M (sample size), got k={k}, M={M}" + ) + + dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] + r_parent = dist_p[:, -1] + + dist_s = self._knn_dists(Xs_prep, Xs_prep, k=k + 1) # include self + r_sample = dist_s[:, -1] + + eps = 1e-12 + w = (M / N) * ((r_sample + eps) / (r_parent + eps)) ** d + + if self.cfg.tau != 1.0: + w = np.power(w, self.cfg.tau) + + w = np.clip(w, self.cfg.clip_range[0], self.cfg.clip_range[1]) + w *= len(w) / (w.sum() + 1e-18) + return w + + def _autotune_k( + self, + Xp_prep: np.ndarray, + Xs_prep: np.ndarray, + Xp_raw: np.ndarray, + Xs_raw: np.ndarray, + N: int, + M: int, + d: int, + rng: np.random.Generator, + ) -> Tuple[int, np.ndarray, Dict]: + """Grid-search over candidate k values with an ESS floor. + + Args: + Xp_prep: Preprocessed parent data, shape ``(N, D)``. + Xs_prep: Preprocessed sample data, shape ``(M, D)``. + Xp_raw: Raw parent data for KS diagnostics, shape ``(N, D)``. + Xs_raw: Raw sample data for KS diagnostics, shape ``(M, D)``. + N: Number of parent points. + M: Number of sample points. + d: Dimensionality of the feature space. + rng: NumPy random generator for KS subsampling. + + Returns: + Tuple ``(best_k, best_weights, summary_dict)`` where + ``summary_dict`` contains per-k diagnostics. + """ + candidates = sorted( + {int(k) for k in self.cfg.autotune_k if 1 <= int(k) < min(N, M)} + ) + if not candidates: + raise ValueError( + "autotune_k is non-empty but no valid candidates remain. " + "Check that k < min(N, M)." + ) + + print( + f"[kNN] Auto-tune k over {candidates} with ESS floor " + f"{int(self.cfg.ess_floor * 100)}%..." + ) + + results = [] + best_tuple = None + best_k = None + best_w = None + + for k in candidates: + w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k) + ess = effective_sample_size(w) + + ks_vals = [ + weighted_ks( + Xp_raw[:, j], + Xs_raw[:, j], + w, + rng, + n_parent=min(self.cfg.ks_eval_parent, N), + n_sample=min(self.cfg.ks_eval_sample, M), + ) + for j in range(d) + ] + ks_mean = float(np.mean(ks_vals)) + feasible = ess >= self.cfg.ess_floor * M + results.append( + {"k": k, "ess": float(ess), "ks_mean": ks_mean, "feasible": feasible} + ) + + # Selection criteria (lexicographic ordering): + # 1. Prefer feasible (ESS >= floor) over infeasible + # 2. Among same feasibility, prefer lower KS distance + # 3. 
Among same KS, prefer higher ESS
+            # 4. Among same ESS, prefer smaller k (simpler model)
+            selection_key = (not feasible, ks_mean, -ess, k)
+            if (best_tuple is None) or (selection_key < best_tuple):
+                best_tuple, best_k, best_w = selection_key, k, w
+
+        print(
+            "[kNN] Tuning results: "
+            + ", ".join(
+                f"k={r['k']}: KS={r['ks_mean']:.3f}, ESS={int(r['ess'])}"
+                + ("" if r["feasible"] else " (infeasible)")
+                for r in results
+            )
+        )
+
+        summary = {
+            "candidates": candidates,
+            "results": results,
+            "best_k": best_k,
+            "ess_floor": self.cfg.ess_floor,
+        }
+        return best_k, best_w, summary
+
+
+# ---------------------------------------------------------------------
+# HDF5 loading
+# ---------------------------------------------------------------------
+
+
+def load_multiscale_overdensities(
+    parent_path: str,
+    sample_path: str,
+) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]:
+    """Load log10(1 + delta) overdensities from parent and sample HDF5 files.
+
+    The set of kernels is defined by the *sample* file. For every group
+    ``Grids/Kernel_N`` present in the sample file:
+
+    * The same group must exist in the parent file.
+    * Each group must contain a dataset ``GridPointOverDensities``.
+    * The kernel radius is read from the group's ``kernel_radius`` attribute.
+    * The values are read from both files and transformed as
+      ``log10(1 + delta)``.
+
+    The returned dictionaries map the smoothing scale (from the kernel_radius
+    attribute) to a 1D NumPy array of overdensities.
+
+    Args:
+        parent_path: Path to the parent/global population HDF5 file.
+        sample_path: Path to the sample/observed population HDF5 file.
+
+    Returns:
+        Tuple ``(parent_data, sample_data)``, where each is a mapping from
+        scale to 1D NumPy arrays.
+
+    Raises:
+        KeyError: If required groups/datasets are missing.
+        ValueError: If no kernels are found; if kernel_radius attribute is
+            missing; or if the parent is missing kernels found in the sample
+            file.
+    """
+    parent_data: Dict[float, np.ndarray] = {}
+    sample_data: Dict[float, np.ndarray] = {}
+
+    with (
+        h5py.File(sample_path, "r") as f_sample,
+        h5py.File(parent_path, "r") as f_parent,
+    ):
+        if "Grids" not in f_sample:
+            raise KeyError("Sample file is missing group 'Grids'.")
+        if "Grids" not in f_parent:
+            raise KeyError("Parent file is missing group 'Grids'.")
+
+        sample_grids = f_sample["Grids"]
+        parent_grids = f_parent["Grids"]
+
+        kernel_keys = [k for k in sample_grids.keys() if k.startswith("Kernel_")]
+        if not kernel_keys:
+            raise ValueError(
+                "No kernel groups found in sample file under 'Grids'. "
+                "Expected groups named like 'Kernel_0', 'Kernel_1', etc."
+            )
+
+        missing = [k for k in kernel_keys if k not in parent_grids]
+        if missing:
+            raise ValueError(
+                "Parent file is missing the following kernel groups "
+                f"present in the sample file: {', '.join(missing)}"
+            )
+
+        print(
+            f"[HDF5] Found {len(kernel_keys)} kernels in sample file: "
+            + ", ".join(kernel_keys)
+        )
+
+        for key in kernel_keys:
+            # Read kernel radius from group attribute
+            try:
+                scale = float(sample_grids[key].attrs["kernel_radius"])
+            except KeyError as exc:
+                raise ValueError(
+                    f"Kernel group '{key}' is missing 'kernel_radius' attribute. "
+                    "Cannot determine smoothing scale."
+                ) from exc
+
+            try:
+                parent_ds = parent_grids[key]["GridPointOverDensities"][...]
+                sample_ds = sample_grids[key]["GridPointOverDensities"][...]
+            except KeyError as exc:
+                raise KeyError(
+                    f"Missing 'GridPointOverDensities' for kernel '{key}'."
+                ) from exc
+
+            # Validate overdensity values before log transform
+            parent_min = np.min(parent_ds)
+            sample_min = np.min(sample_ds)
+            if parent_min <= -1.0:
+                print(
+                    f"[WARNING] Parent overdensities for kernel '{key}' (R={scale}) "
+                    f"contain δ <= -1 (min={parent_min:.3f}). This will produce NaN "
+                    f"after log10(1 + δ) transform."
+                )
+            if sample_min <= -1.0:
+                print(
+                    f"[WARNING] Sample overdensities for kernel '{key}' (R={scale}) "
+                    f"contain δ <= -1 (min={sample_min:.3f}). This will produce NaN "
+                    f"after log10(1 + δ) transform."
+                )
+
+            parent_data[scale] = np.log10(np.asarray(parent_ds, dtype=float) + 1.0)
+            sample_data[scale] = np.log10(np.asarray(sample_ds, dtype=float) + 1.0)
+
+    return parent_data, sample_data
+
+
+# ---------------------------------------------------------------------
+# Plotting and simple diagnostics
+# ---------------------------------------------------------------------
+
+
+def plot_diagnostics(
+    parent_samples: Dict[float, np.ndarray],
+    sample_samples: Dict[float, np.ndarray],
+    weights: np.ndarray,
+    info: Dict,
+    save_path: Optional[str] = None,
+) -> plt.Figure:
+    """Plot simple diagnostics: 1D distributions and weight histogram.
+
+    The plot has three panels:
+
+    * Parent vs sample vs weighted sample for the first scale.
+    * Parent vs sample vs weighted sample for the second scale (if present).
+    * Histogram of the weights.
+
+    Args:
+        parent_samples: Mapping from scale to parent overdensity array.
+        sample_samples: Mapping from scale to sample overdensity array.
+        weights: Weights for the sample population.
+        info: Diagnostics dictionary returned by :meth:`KNNReweighter.fit`.
+        save_path: Optional path to save the diagnostic PNG.
+
+    Returns:
+        The Matplotlib Figure instance.
+    """
+    scales = sorted(parent_samples.keys())
+    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
+    axes = axes.flatten()
+
+    def _plot_1d(ax, scale: float) -> None:
+        parent = parent_samples[scale]
+        sample = sample_samples[scale]
+        bins = np.linspace(
+            min(parent.min(), sample.min()), max(parent.max(), sample.max()), 50
+        )
+        ax.hist(
+            parent,
+            bins=bins,
+            density=True,
+            label="Parent",
+            histtype="step",
+            lw=2,
+        )
+        ax.hist(
+            sample,
+            bins=bins,
+            density=True,
+            label="Sample (raw)",
+            histtype="step",
+            lw=2,
+        )
+        ax.hist(
+            sample,
+            bins=bins,
+            weights=weights,
+            density=True,
+            label="Sample (weighted)",
+            histtype="step",
+            lw=2,
+        )
+        ax.set_xlabel(f"log10(1 + δ) (R={scale} Mpc/h)")
+        ax.set_ylabel("PDF")
+        ax.set_title(f"Scale {scale} Mpc/h")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    # First scale.
+    _plot_1d(axes[0], scales[0])
+
+    # Second scale (if available).
+    if len(scales) > 1:
+        _plot_1d(axes[1], scales[1])
+    else:
+        axes[1].axis("off")
+
+    # Weight histogram.
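+    # A heavy right tail here means a few points dominate the reweighting;
+    # the ESS quoted in the panel title quantifies that (ESS much smaller
+    # than M implies high weight variance).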
+ axw = axes[2] + axw.hist(weights, bins=60, alpha=0.85, edgecolor="black") + axw.axvline( + np.mean(weights), + color="red", + linestyle="--", + lw=2, + label=f"Mean={np.mean(weights):.3f}", + ) + axw.set_xlabel("Weight") + axw.set_ylabel("Count") + ess = int(info["ess"]) + M = info["sample_samples"] + ks_mean = info["ks_mean"] + axw.set_title(f"w distribution | ESS={ess}/{M:,}, KS(mean)={ks_mean:.3f}") + axw.legend() + axw.grid(True, alpha=0.3) + + plt.tight_layout() + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print(f"[plot] Saved: {save_path}") + return fig + + +# --------------------------------------------------------------------- +# Pipeline +# --------------------------------------------------------------------- + + +def run_full_analysis( + parent_samples: Dict[float, np.ndarray], + sample_samples: Dict[float, np.ndarray], + show_plots: bool = True, + save_path: Optional[str] = None, + **knn_kwargs, +) -> Tuple[np.ndarray, Dict]: + """Run the full k-NN reweighting and simple diagnostics. + + Args: + parent_samples: Mapping from scale to parent overdensity array. + sample_samples: Mapping from scale to sample overdensity array. + show_plots: If True, display the diagnostic plot. + save_path: Optional path to save the diagnostic PNG. + **knn_kwargs: Keyword arguments used to configure + :class:`KNNReweighterConfig`. + + Returns: + Tuple ``(weights, info)``: + * ``weights``: 1D array of weights for the sample population. + * ``info``: Diagnostics dictionary from + :meth:`KNNReweighter.fit`. + """ + print("=== Multi-Scale Environment Weighting (k-NN, auto-k) ===") + + cfg = KNNReweighterConfig(**knn_kwargs) + rw = KNNReweighter(cfg) + + weights, info = rw.fit(parent_samples, sample_samples) + + t = info.get("tuning") + tune_str = "" + if t: + tune_str = ( + f" | auto-k over {t['candidates']} " + f"-> best={t['best_k']} (ESS floor {int(t['ess_floor'] * 100)}%)" + ) + + print( + "\n[Summary] " + f"k={info['k']}, KS(mean)={info['ks_mean']:.4f}, " + f"ESS={int(info['ess'])}/{info['sample_samples']}, " + f"clip={info['clip_range']}, " + f"clipped={100 * info['clip_fraction']:.1f}%" + f"{tune_str}" + ) + + if show_plots: + print("\n[Plot] Rendering diagnostics...") + _ = plot_diagnostics(parent_samples, sample_samples, weights, info, save_path) + plt.show() + + return weights, info + + +# --------------------------------------------------------------------- +# __main__ (argparse) +# --------------------------------------------------------------------- + + +def _build_arg_parser() -> argparse.ArgumentParser: + """Create the command-line argument parser. + + Returns: + Configured :class:`argparse.ArgumentParser` instance. + """ + parser = argparse.ArgumentParser( + description=( + "k-NN density-ratio weighting for multi-scale environments.\n\n" + "The sample file defines which Grids/Kernel_* groups are used; " + "the same kernels must exist in the parent file." + ) + ) + + # Required I/O. + parser.add_argument( + "--parent", + type=str, + required=True, + help="Path to parent/global population HDF5 file.", + ) + parser.add_argument( + "--sample", + type=str, + required=True, + help="Path to sample/observed population HDF5 file.", + ) + + # k / auto-tuning. + parser.add_argument( + "--k", + type=int, + default=50, + help="Default number of neighbors (used if --autotune_k is empty).", + ) + parser.add_argument( + "--autotune_k", + type=str, + default="", + help=( + "Comma-separated candidate k values for auto-tuning, " + "e.g. '35,50,80,100'. 
If empty, k is used directly." + ), + ) + parser.add_argument( + "--ess_floor", + type=float, + default=0.20, + help="Minimum ESS fraction (ESS / M) for a candidate k to be feasible.", + ) + + # Preprocessing. + parser.add_argument( + "--preprocess", + choices=["whiten", "standardize", "none"], + default="whiten", + help="Feature preprocessing mode.", + ) + parser.add_argument( + "--standardize_by_parent", + action="store_true", + help="If preprocess=standardize, use parent stats (default True).", + ) + parser.add_argument( + "--standardize_by_pooled", + dest="standardize_by_parent", + action="store_false", + help="If preprocess=standardize, use pooled parent+sample stats.", + ) + parser.set_defaults(standardize_by_parent=True) + + # Neighbors. + parser.add_argument( + "--metric", + type=str, + default="minkowski", + help="Distance metric (e.g., 'minkowski', 'euclidean').", + ) + parser.add_argument( + "--algorithm", + type=str, + default="auto", + help="NearestNeighbors algorithm (scikit-learn).", + ) + parser.add_argument( + "--leaf_size", + type=int, + default=40, + help="Leaf size for tree-based nearest-neighbor algorithms.", + ) + + # Stabilization. + parser.add_argument( + "--tau", + type=float, + default=1.0, + help="Temperature exponent (weights := weights ** tau).", + ) + parser.add_argument( + "--clip_min", + type=float, + default=0.01, + help="Lower clip bound for weights.", + ) + parser.add_argument( + "--clip_max", + type=float, + default=50.0, + help="Upper clip bound for weights.", + ) + + # Misc / plotting. + parser.add_argument( + "--random_state", + type=int, + default=42, + help="Random seed for diagnostics and KS subsampling.", + ) + parser.add_argument( + "--save", + type=str, + default=None, + help="Path to save diagnostic PNG.", + ) + parser.add_argument( + "--no_plots", + action="store_true", + help="Disable plotting.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save computed weights as .npy file.", + ) + + return parser + + +if __name__ == "__main__": + parser = _build_arg_parser() + args = parser.parse_args() + + # Load data from the two HDF5 files. + parent_data, sample_data = load_multiscale_overdensities( + parent_path=args.parent, + sample_path=args.sample, + ) + + # Remove NaNs consistently within each population. + parent_data = drop_nan_rows_per_scale(parent_data) + sample_data = drop_nan_rows_per_scale(sample_data) + + # Build kwargs for run_full_analysis. + auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] + kwargs = dict( + k=args.k, + autotune_k=auto_list if auto_list else None, + ess_floor=args.ess_floor, + preprocess=args.preprocess, + standardize_by_parent=args.standardize_by_parent, + algorithm=args.algorithm, + metric=args.metric, + leaf_size=args.leaf_size, + tau=args.tau, + clip_range=(args.clip_min, args.clip_max), + random_state=args.random_state, + ) + + # Run pipeline. + weights, info = run_full_analysis( + parent_data, + sample_data, + show_plots=not args.no_plots, + save_path=args.save, + **kwargs, + ) + + # Save weights to file if requested. + if args.output: + np.save(args.output, weights) + print(f"[Output] Saved weights to: {args.output}") + + print(f"\n[Done] Generated {len(weights)} weights.") diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py new file mode 100644 index 0000000..e679d2a --- /dev/null +++ b/weighting/test_hmf_weighting.py @@ -0,0 +1,634 @@ +"""Test k-NN weighting on halo mass functions. 
+ +This script evaluates how well k-NN density-ratio weighting can reproduce +the true halo mass function (HMF) from a small subsample. + +Workflow: +1. Load multi-scale overdensity features from parent (global) and sample (full box) HDF5 files +2. Load halo masses for the full sample population (the truth) +3. Subsample N halos randomly from the full sample +4. Apply k-NN weighting to the subsample using parent environment as reference +5. Compare: Truth (full sample) vs Subsample (raw) vs Subsample (weighted) + +The goal is to test whether a small weighted subsample can reproduce the +true HMF of the full sample. + +The script produces diagnostic plots showing: +- HMF comparison: Truth vs raw subsample vs weighted subsample +- Environment distributions at different scales +- Weight distribution statistics + +Example +------- + python test_hmf_weighting.py \\ + --parent parent_grids.hdf5 \\ + --sample sample_grids.hdf5 \\ + --masses sample_halo_masses.txt \\ + --subsample 10000 \\ + --autotune_k 35,50,80,100 \\ + --output weighted_masses.npy \\ + --save hmf_diagnostics.png +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Dict, Optional, Tuple + +import h5py +import matplotlib.pyplot as plt +import numpy as np + +# Import the k-NN weighting machinery +from knn_weighter import ( + KNNReweighter, + KNNReweighterConfig, + drop_nan_rows_per_scale, + effective_sample_size, + load_multiscale_overdensities, +) + + +def load_halo_masses(mass_file: str) -> np.ndarray: + """Load halo masses from a text file. + + Args: + mass_file: Path to text file containing one mass per line. + + Returns: + 1D array of halo masses. + + Raises: + ValueError: If file is empty or contains invalid data. + """ + masses = np.loadtxt(mass_file) + if masses.ndim != 1: + raise ValueError( + f"Mass file should contain a single column, got shape {masses.shape}" + ) + if len(masses) == 0: + raise ValueError("Mass file is empty") + + print(f"[Masses] Loaded {len(masses):,} halo masses from {mass_file}") + print(f"[Masses] Mass range: {masses.min():.2e} - {masses.max():.2e}") + + return masses + + +def subsample_data( + sample_overdens: Dict[float, np.ndarray], + masses: np.ndarray, + n_subsample: int, + random_state: int = 42, +) -> Tuple[Dict[float, np.ndarray], np.ndarray, np.ndarray]: + """Randomly subsample halos and their environment features. + + Args: + sample_overdens: Mapping from scale to sample overdensity arrays. + masses: Halo masses (same length as overdensity arrays). + n_subsample: Number of halos to randomly select. + random_state: Random seed for reproducibility. + + Returns: + Tuple (subsampled_overdens, subsampled_masses, indices). + + Raises: + ValueError: If n_subsample > sample size. 
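+
+    Example (names and sizes are hypothetical):
+        >>> sub_env, sub_masses, idx = subsample_data(env, masses, 10_000)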
+ """ + n_total = len(masses) + if n_subsample > n_total: + raise ValueError( + f"Requested subsample size {n_subsample:,} exceeds " + f"sample size {n_total:,}" + ) + + rng = np.random.default_rng(random_state) + indices = rng.choice(n_total, size=n_subsample, replace=False) + + subsampled_overdens = { + scale: arr[indices] for scale, arr in sample_overdens.items() + } + subsampled_masses = masses[indices] + + print( + f"[Subsample] Randomly selected {n_subsample:,}/{n_total:,} halos " + f"(seed={random_state})" + ) + print( + f"[Subsample] Subsampled mass range: " + f"{subsampled_masses.min():.2e} - {subsampled_masses.max():.2e}" + ) + + return subsampled_overdens, subsampled_masses, indices + + +def compute_hmf( + masses: np.ndarray, + weights: Optional[np.ndarray] = None, + n_bins: int = 20, + mass_range: Optional[Tuple[float, float]] = None, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Compute halo mass function (differential number density). + + Args: + masses: Halo masses. + weights: Optional weights for each halo. + n_bins: Number of mass bins. + mass_range: Optional (min, max) mass range. If None, uses data range. + + Returns: + Tuple (bin_centers, dn_dlogM, bin_edges) where dn_dlogM is the + differential number density per logarithmic mass bin. + """ + if weights is None: + weights = np.ones_like(masses) + + if mass_range is None: + mass_range = (masses.min(), masses.max()) + + log_masses = np.log10(masses) + log_range = (np.log10(mass_range[0]), np.log10(mass_range[1])) + + counts, bin_edges = np.histogram( + log_masses, bins=n_bins, range=log_range, weights=weights + ) + bin_widths = np.diff(bin_edges) + bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1]) + + # Differential number density: dn/dlog10(M) + dn_dlogM = counts / bin_widths + # Normalize to total number + dn_dlogM *= len(masses) / dn_dlogM.sum() / bin_widths.mean() + + return bin_centers, dn_dlogM, bin_edges + + +def plot_hmf_comparison( + truth_masses: np.ndarray, + subsample_masses: np.ndarray, + weights: np.ndarray, + info: Dict, + parent_overdens: Dict[float, np.ndarray], + subsample_overdens: Dict[float, np.ndarray], + save_path: Optional[str] = None, +) -> plt.Figure: + """Plot HMF comparison and diagnostics. + + Creates a multi-panel figure with: + - HMF comparison (truth vs raw subsample vs weighted subsample) + - Environment distributions at two scales + - Weight distribution histogram + + Args: + truth_masses: True halo masses (full sample). + subsample_masses: Subsampled halo masses. + weights: Weights for subsampled halos. + info: Diagnostics from KNNReweighter. + parent_overdens: Parent overdensity features (global reference). + subsample_overdens: Subsample overdensity features. + save_path: Optional path to save figure. + + Returns: + Figure instance. 
+    """
+    fig = plt.figure(figsize=(18, 10))
+    gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)
+
+    scales = sorted(parent_overdens.keys())
+
+    # Determine common mass range for HMF
+    all_masses = np.concatenate([truth_masses, subsample_masses])
+    mass_range = (all_masses.min(), all_masses.max())
+
+    # --- Panel 1: Halo Mass Function ---
+    ax_hmf = fig.add_subplot(gs[0, :])
+
+    # Compute HMFs
+    bc_truth, hmf_truth, _ = compute_hmf(
+        truth_masses, n_bins=25, mass_range=mass_range
+    )
+    bc_subsample, hmf_subsample, _ = compute_hmf(
+        subsample_masses, n_bins=25, mass_range=mass_range
+    )
+    bc_weighted, hmf_weighted, _ = compute_hmf(
+        subsample_masses, weights=weights, n_bins=25, mass_range=mass_range
+    )
+
+    ax_hmf.plot(
+        bc_truth, hmf_truth, "k-", lw=2.5, label=f"Truth (full sample, N={len(truth_masses):,})"
+    )
+    ax_hmf.plot(
+        bc_subsample,
+        hmf_subsample,
+        "r--",
+        lw=2,
+        alpha=0.7,
+        label=f"Subsample (raw, N={len(subsample_masses):,})",
+    )
+    ax_hmf.plot(
+        bc_weighted,
+        hmf_weighted,
+        "b-",
+        lw=2,
+        alpha=0.8,
+        label=f"Subsample (weighted, ESS={int(info['ess'])}/{len(subsample_masses):,})",
+    )
+
+    ax_hmf.set_xlabel(r"$\log_{10}(M_{\rm halo} \, [M_\odot])$", fontsize=13)
+    ax_hmf.set_ylabel(r"$dn / d\log_{10}M$ (normalized)", fontsize=13)
+    ax_hmf.set_title(
+        f"Halo Mass Function | k={info['k']}, KS(env)={info['ks_mean']:.3f}",
+        fontsize=14,
+        fontweight="bold",
+    )
+    ax_hmf.legend(fontsize=11, loc="upper right")
+    ax_hmf.grid(True, alpha=0.3)
+    ax_hmf.set_yscale("log")
+
+    # --- Panel 2: Environment at scale 0 ---
+    ax_env1 = fig.add_subplot(gs[1, 0])
+    _plot_environment_dist(
+        ax_env1, parent_overdens[scales[0]], subsample_overdens[scales[0]], weights, scales[0]
+    )
+
+    # --- Panel 3: Environment at scale 1 (if available) ---
+    ax_env2 = fig.add_subplot(gs[1, 1])
+    if len(scales) > 1:
+        _plot_environment_dist(
+            ax_env2,
+            parent_overdens[scales[1]],
+            subsample_overdens[scales[1]],
+            weights,
+            scales[1],
+        )
+    else:
+        ax_env2.text(
+            0.5, 0.5, "Only 1 scale available", ha="center", va="center", fontsize=12
+        )
+        ax_env2.axis("off")
+
+    # --- Panel 4: Weight distribution ---
+    ax_weights = fig.add_subplot(gs[1, 2])
+    ax_weights.hist(weights, bins=50, alpha=0.85, edgecolor="black", color="steelblue")
+    ax_weights.axvline(
+        np.mean(weights),
+        color="red",
+        linestyle="--",
+        lw=2,
+        label=f"Mean={np.mean(weights):.2f}",
+    )
+    ax_weights.axvline(
+        np.median(weights),
+        color="orange",
+        linestyle=":",
+        lw=2,
+        label=f"Median={np.median(weights):.2f}",
+    )
+    ax_weights.set_xlabel("Weight", fontsize=12)
+    ax_weights.set_ylabel("Count", fontsize=12)
+    ax_weights.set_title(
+        f"Weight Distribution\nESS={int(info['ess'])}/{len(weights):,}, "
+        f"clip={100*info['clip_fraction']:.1f}%",
+        fontsize=11,
+    )
+    ax_weights.legend(fontsize=10)
+    ax_weights.grid(True, alpha=0.3)
+
+    plt.suptitle(
+        "k-NN Halo Mass Function Weighting Test",
+        fontsize=16,
+        fontweight="bold",
+        y=0.995,
+    )
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches="tight")
+        print(f"[Plot] Saved: {save_path}")
+
+    return fig
+
+
+def _plot_environment_dist(
+    ax: plt.Axes,
+    parent_env: np.ndarray,
+    subsample_env: np.ndarray,
+    weights: np.ndarray,
+    scale: float,
+) -> None:
+    """Plot environment distribution at a single scale.
+
+    Args:
+        ax: Matplotlib axes.
+        parent_env: Parent (global) overdensities.
+        subsample_env: Subsample overdensities.
+        weights: Subsample weights.
+        scale: Smoothing scale (Mpc/h).
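+
+    Note:
+        The weighted histogram should track the parent curve; a persistent
+        gap at any scale suggests the k-NN weights are under-correcting.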
+ """ + bins = np.linspace( + min(parent_env.min(), subsample_env.min()), + max(parent_env.max(), subsample_env.max()), + 40, + ) + + ax.hist( + parent_env, + bins=bins, + density=True, + alpha=0.6, + label="Parent (global)", + histtype="stepfilled", + color="gray", + ) + ax.hist( + subsample_env, + bins=bins, + density=True, + alpha=0.7, + label="Subsample (raw)", + histtype="step", + lw=2, + color="red", + ) + ax.hist( + subsample_env, + bins=bins, + weights=weights, + density=True, + alpha=0.8, + label="Subsample (weighted)", + histtype="step", + lw=2, + color="blue", + ) + + ax.set_xlabel(f"log10(1 + δ) at R={scale:.2f} Mpc/h", fontsize=11) + ax.set_ylabel("PDF", fontsize=11) + ax.set_title(f"Environment (scale {scale:.2f})", fontsize=11) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + + +def print_summary_statistics( + truth_masses: np.ndarray, + subsample_masses: np.ndarray, + weights: np.ndarray, + info: Dict, +) -> None: + """Print summary statistics for the weighting test. + + Args: + truth_masses: True halo masses (full sample). + subsample_masses: Subsampled halo masses. + weights: Subsample weights. + info: Diagnostics from KNNReweighter. + """ + print("\n" + "=" * 70) + print("SUMMARY STATISTICS") + print("=" * 70) + + print(f"\nPopulation Sizes:") + print(f" Truth (full sample): {len(truth_masses):,} halos") + print(f" Subsample: {len(subsample_masses):,} halos ({100*len(subsample_masses)/len(truth_masses):.1f}% of truth)") + print(f" ESS: {int(info['ess']):,} (effective sample size)") + print(f" ESS/N_subsample: {info['ess']/len(subsample_masses):.1%}") + + print(f"\nk-NN Configuration:") + print(f" k: {info['k']}") + print(f" Preprocess: {info['preprocess']}") + print(f" Metric: {info['metric']}") + print(f" Tau: {info['tau']}") + + print(f"\nEnvironment Matching (KS distance):") + print(f" Mean across scales: {info['ks_mean']:.4f}") + for i, (scale, ks) in enumerate(zip(info['scales'], info['ks_per_dim'])): + print(f" Scale {i} (R={scale:.2f}): {ks:.4f}") + + print(f"\nWeight Statistics:") + print(f" Mean: {info['mean_weight']:.3f}") + print(f" Std: {info['std_weight']:.3f}") + print(f" Min: {info['min_weight']:.3f}") + print(f" Max: {info['max_weight']:.3f}") + print(f" Clip frac: {100*info['clip_fraction']:.1f}%") + print(f" Clip range: {info['clip_range']}") + + # Mass statistics + print(f"\nMass Range (log10 Msun):") + print(f" Truth: {np.log10(truth_masses.min()):.2f} - {np.log10(truth_masses.max()):.2f}") + print(f" Subsample: {np.log10(subsample_masses.min()):.2f} - {np.log10(subsample_masses.max()):.2f}") + + # Weighted vs unweighted mean log mass + mean_log_m_truth = np.mean(np.log10(truth_masses)) + mean_log_m_subsample = np.mean(np.log10(subsample_masses)) + mean_log_m_weighted = np.average(np.log10(subsample_masses), weights=weights) + + print(f"\nMean log10(M) [Msun]:") + print(f" Truth: {mean_log_m_truth:.3f}") + print(f" Subsample: {mean_log_m_subsample:.3f} (bias: {mean_log_m_subsample - mean_log_m_truth:+.3f})") + print(f" Weighted: {mean_log_m_weighted:.3f} (bias: {mean_log_m_weighted - mean_log_m_truth:+.3f})") + + print("=" * 70 + "\n") + + +def main(): + """Main execution function.""" + parser = argparse.ArgumentParser( + description="Test k-NN weighting on halo mass functions", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required inputs + parser.add_argument( + "--parent", + type=str, + required=True, + help="Path to parent/global population HDF5 file (gridded overdensities, global reference)", + ) + 
parser.add_argument( + "--sample", + type=str, + required=True, + help="Path to full sample HDF5 file (gridded overdensities, this is the truth)", + ) + parser.add_argument( + "--masses", + type=str, + required=True, + help="Path to text file with full sample halo masses (one per line, truth HMF)", + ) + + # Subsampling + parser.add_argument( + "--subsample", + type=int, + required=True, + help="Number of halos to randomly select from full sample for weighting test", + ) + + # k-NN parameters + parser.add_argument( + "--k", + type=int, + default=50, + help="Number of neighbors (default: 50)", + ) + parser.add_argument( + "--autotune_k", + type=str, + default="", + help="Comma-separated k values for auto-tuning (e.g., '35,50,80,100')", + ) + parser.add_argument( + "--ess_floor", + type=float, + default=0.20, + help="Minimum ESS fraction for auto-tuning (default: 0.20)", + ) + parser.add_argument( + "--preprocess", + choices=["whiten", "standardize", "none"], + default="whiten", + help="Feature preprocessing mode (default: whiten)", + ) + parser.add_argument( + "--tau", + type=float, + default=1.0, + help="Temperature parameter for weights (default: 1.0)", + ) + parser.add_argument( + "--clip_min", + type=float, + default=0.01, + help="Lower clip bound for weights (default: 0.01)", + ) + parser.add_argument( + "--clip_max", + type=float, + default=50.0, + help="Upper clip bound for weights (default: 50.0)", + ) + + # Output + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save weighted masses as .npy file", + ) + parser.add_argument( + "--save", + type=str, + default=None, + help="Path to save diagnostic plot", + ) + parser.add_argument( + "--no_plots", + action="store_true", + help="Disable interactive plotting", + ) + parser.add_argument( + "--random_state", + type=int, + default=42, + help="Random seed for subsampling and diagnostics (default: 42)", + ) + + args = parser.parse_args() + + # Load overdensity features + print("\n" + "=" * 70) + print("LOADING DATA") + print("=" * 70 + "\n") + + parent_overdens, sample_overdens = load_multiscale_overdensities( + parent_path=args.parent, + sample_path=args.sample, + ) + + # Load halo masses (for the full sample = truth) + truth_masses = load_halo_masses(args.masses) + + # Check consistency + first_scale = next(iter(sample_overdens.values())) + if len(first_scale) != len(truth_masses): + raise ValueError( + f"Mismatch: sample overdensities have {len(first_scale):,} entries " + f"but mass file has {len(truth_masses):,} entries" + ) + + # Drop NaNs + parent_overdens = drop_nan_rows_per_scale(parent_overdens) + sample_overdens = drop_nan_rows_per_scale(sample_overdens) + + # After NaN drop, masses may need filtering + first_scale_after = next(iter(sample_overdens.values())) + if len(first_scale_after) != len(first_scale): + print( + f"[WARNING] NaN filtering reduced sample from {len(first_scale):,} " + f"to {len(first_scale_after):,}. This requires the mass file to have " + f"matching NaN entries, which is not currently supported." + ) + print("[WARNING] Proceeding anyway - results may be incorrect!") + + # Subsample from the full sample + if args.subsample is None: + raise ValueError( + "--subsample is required. Specify how many halos to randomly select " + "from the full sample for weighting." 
+        )
+
+    subsample_overdens, subsample_masses, subsample_indices = subsample_data(
+        sample_overdens, truth_masses, args.subsample, args.random_state
+    )
+
+    # Run k-NN weighting
+    # Goal: weight subsample to match parent (global) environment distribution
+    print("\n" + "=" * 70)
+    print("COMPUTING WEIGHTS")
+    print("=" * 70 + "\n")
+
+    auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()]
+    cfg = KNNReweighterConfig(
+        k=args.k,
+        autotune_k=auto_list if auto_list else None,
+        ess_floor=args.ess_floor,
+        preprocess=args.preprocess,
+        tau=args.tau,
+        clip_range=(args.clip_min, args.clip_max),
+        random_state=args.random_state,
+    )
+
+    reweighter = KNNReweighter(cfg)
+    weights, info = reweighter.fit(parent_overdens, subsample_overdens)
+
+    # Print summary
+    print_summary_statistics(truth_masses, subsample_masses, weights, info)
+
+    # Save weighted masses if requested
+    if args.output:
+        weighted_data = np.column_stack([subsample_masses, weights])
+        np.save(args.output, weighted_data)
+        print(f"[Output] Saved (subsample_masses, weights) to: {args.output}")
+
+    # Plot results
+    if not args.no_plots:
+        print("\n[Plot] Generating diagnostics...\n")
+        fig = plot_hmf_comparison(
+            truth_masses,
+            subsample_masses,
+            weights,
+            info,
+            parent_overdens,
+            subsample_overdens,
+            save_path=args.save,
+        )
+        plt.show()
+
+    print("\n[Done] HMF weighting test complete.\n")
+
+
+if __name__ == "__main__":
+    main()

From a3d5142e058271aa7e0b320b19e4981809b4240d Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:47:34 +0000
Subject: [PATCH 03/23] Correcting attribute

---
 weighting/knn_weighter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index 9f25a47..d4c2ad9 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -626,11 +626,11 @@ def load_multiscale_overdensities(
 
     * The same group must exist in the parent file.
     * Each group must contain a dataset ``GridPointOverDensities``.
-    * The kernel radius is read from the group's ``kernel_radius`` attribute.
+    * The kernel radius is read from the group's ``KernelRadius`` attribute.
     * The values are read from both files and transformed as
       ``log10(1 + delta)``.
 
-    The returned dictionaries map the smoothing scale (from the kernel_radius
+    The returned dictionaries map the smoothing scale (from the KernelRadius
     attribute) to a 1D NumPy array of overdensities.
 
     Args:
@@ -643,7 +643,7 @@ def load_multiscale_overdensities(
 
     Raises:
         KeyError: If required groups/datasets are missing.
-        ValueError: If no kernels are found; if kernel_radius attribute is
+        ValueError: If no kernels are found; if KernelRadius attribute is
             missing; or if the parent is missing kernels found in the sample
             file.
     """
@@ -684,10 +684,10 @@ def load_multiscale_overdensities(
         for key in kernel_keys:
             # Read kernel radius from group attribute
             try:
-                scale = float(sample_grids[key].attrs["kernel_radius"])
+                scale = float(sample_grids[key].attrs["KernelRadius"])
             except KeyError as exc:
                 raise ValueError(
-                    f"Kernel group '{key}' is missing 'kernel_radius' attribute. "
+                    f"Kernel group '{key}' is missing 'KernelRadius' attribute. "
                     "Cannot determine smoothing scale."
                 ) from exc

From 75f178eb80f1afd3d165921a6d55a19272f09736 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:51:43 +0000
Subject: [PATCH 04/23] Clean out nans

---
 weighting/test_hmf_weighting.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py
index e679d2a..473c0a9 100644
--- a/weighting/test_hmf_weighting.py
+++ b/weighting/test_hmf_weighting.py
@@ -559,21 +559,30 @@ def main():
             f"but mass file has {len(truth_masses):,} entries"
         )
 
-    # Drop NaNs
+    # Clean parent data
+    print("\n[Cleaning] Removing NaN entries from parent...")
     parent_overdens = drop_nan_rows_per_scale(parent_overdens)
-    sample_overdens = drop_nan_rows_per_scale(sample_overdens)
 
-    # After NaN drop, masses may need filtering
-    first_scale_after = next(iter(sample_overdens.values()))
-    if len(first_scale_after) != len(first_scale):
+    # Clean full sample data - need to synchronize masses with NaN filtering
+    print("[Cleaning] Removing NaN entries from full sample...")
+    n_before = len(next(iter(sample_overdens.values())))
+
+    # Build NaN mask from overdensities
+    mask = np.ones(n_before, dtype=bool)
+    for arr in sample_overdens.values():
+        mask &= ~np.isnan(arr)
+
+    n_dropped = n_before - np.sum(mask)
+    if n_dropped > 0:
         print(
-            f"[WARNING] NaN filtering reduced sample from {len(first_scale):,} "
-            f"to {len(first_scale_after):,}. This requires the mass file to have "
-            f"matching NaN entries, which is not currently supported."
+            f"[NaN] Dropped {n_dropped:,}/{n_before:,} "
+            f"({100.0 * n_dropped / n_before:.2f}%) entries containing NaN."
         )
-        print("[WARNING] Proceeding anyway - results may be incorrect!")
+    # Apply mask to both overdensities and masses
+    sample_overdens = {scale: arr[mask] for scale, arr in sample_overdens.items()}
+    truth_masses = truth_masses[mask]
 
-    # Subsample from the full sample
+    # Subsample from the cleaned full sample
     if args.subsample is None:
         raise ValueError(
             "--subsample is required. Specify how many halos to randomly select "

From ed8c4f3642b25f7402bae55931d9233761b1de40 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 19:10:46 +0000
Subject: [PATCH 05/23] Sanitising inputs

---
 weighting/knn_weighter.py       | 83 ++++++++++++++++++++++++---------
 weighting/test_hmf_weighting.py | 41 +++++++++-------
 2 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index d4c2ad9..ace80f9 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -163,6 +163,56 @@ def whiten_by_parent(
     return (Xp - mu) @ Linv.T, (Xs - mu) @ Linv.T
 
 
+def filter_and_transform_overdensities(
+    overdens: Dict[float, np.ndarray],
+) -> Tuple[Dict[float, np.ndarray], np.ndarray]:
+    """Filter out -1 values and apply log10(1 + delta) transform.
+
+    Grid points with overdensity = -1 indicate no particles nearby.
+    These must be removed before applying the log transform to avoid NaN.
+
+    Args:
+        overdens: Mapping from scale to raw overdensity arrays (may contain -1).
+
+    Returns:
+        Tuple (transformed_overdens, mask) where:
+        - transformed_overdens: log10(1 + delta) for valid points
+        - mask: Boolean array indicating which rows were kept
+
+    Raises:
+        ValueError: If arrays have mismatched lengths.
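+
+    Example (names are hypothetical):
+        >>> clean, mask = filter_and_transform_overdensities(raw_overdens)
+        >>> masses = masses[mask]  # keep companion arrays synchronized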
+    """
+    if not overdens:
+        return overdens, np.array([], dtype=bool)
+
+    first = next(iter(overdens.values()))
+    length = len(first)
+
+    # Build mask: keep rows where ALL scales have delta > -1
+    mask = np.ones(length, dtype=bool)
+    for scale, arr in overdens.items():
+        if len(arr) != length:
+            raise ValueError(
+                "All arrays must have the same length; found mismatched lengths."
+            )
+        mask &= (arr > -1.0)
+
+    n_dropped = length - np.sum(mask)
+    if n_dropped > 0:
+        print(
+            f"[Filter] Dropped {n_dropped:,}/{length:,} "
+            f"({100.0 * n_dropped / length:.2f}%) grid points with delta <= -1 "
+            f"(no particles nearby)."
+        )
+
+    # Apply log transform to filtered data
+    transformed = {
+        scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()
+    }
+
+    return transformed, mask
+
+
 def drop_nan_rows_per_scale(
     samples: Dict[float, np.ndarray],
 ) -> Dict[float, np.ndarray]:
@@ -619,7 +669,7 @@ def load_multiscale_overdensities(
     parent_path: str,
     sample_path: str,
 ) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]:
-    """Load log10(1 + delta) overdensities from parent and sample HDF5 files.
+    """Load raw overdensities from parent and sample HDF5 files.
 
     The set of kernels is defined by the *sample* file. For every group
     ``Grids/Kernel_N`` present in the sample file:
@@ -627,11 +677,14 @@ def load_multiscale_overdensities(
 
     * The same group must exist in the parent file.
     * Each group must contain a dataset ``GridPointOverDensities``.
     * The kernel radius is read from the group's ``KernelRadius`` attribute.
-    * The values are read from both files and transformed as
-      ``log10(1 + delta)``.
+    * Raw overdensity values are returned (NOT log-transformed).
+
+    Note: Values of -1 indicate grid points with no particles nearby.
+    Use ``filter_and_transform_overdensities()`` to remove these and
+    apply the log10(1 + delta) transform.
 
     The returned dictionaries map the smoothing scale (from the KernelRadius
-    attribute) to a 1D NumPy array of overdensities.
+    attribute) to a 1D NumPy array of raw overdensities.
 
     Args:
         parent_path: Path to the parent/global population HDF5 file.
@@ -699,24 +752,10 @@ def load_multiscale_overdensities(
                     f"Missing 'GridPointOverDensities' for kernel '{key}'."
                 ) from exc
 
-            # Validate overdensity values before log transform
-            parent_min = np.min(parent_ds)
-            sample_min = np.min(sample_ds)
-            if parent_min <= -1.0:
-                print(
-                    f"[WARNING] Parent overdensities for kernel '{key}' (R={scale}) "
-                    f"contain δ <= -1 (min={parent_min:.3f}). This will produce NaN "
-                    f"after log10(1 + δ) transform."
-                )
-            if sample_min <= -1.0:
-                print(
-                    f"[WARNING] Sample overdensities for kernel '{key}' (R={scale}) "
-                    f"contain δ <= -1 (min={sample_min:.3f}). This will produce NaN "
-                    f"after log10(1 + δ) transform."
- ) - - parent_data[scale] = np.log10(np.asarray(parent_ds, dtype=float) + 1.0) - sample_data[scale] = np.log10(np.asarray(sample_ds, dtype=float) + 1.0) + # Store raw overdensities (will be filtered and transformed later) + # Note: -1 values indicate grid points with no particles nearby + parent_data[scale] = np.asarray(parent_ds, dtype=float) + sample_data[scale] = np.asarray(sample_ds, dtype=float) return parent_data, sample_data diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py index 473c0a9..574bd2a 100644 --- a/weighting/test_hmf_weighting.py +++ b/weighting/test_hmf_weighting.py @@ -47,6 +47,7 @@ KNNReweighterConfig, drop_nan_rows_per_scale, effective_sample_size, + filter_and_transform_overdensities, load_multiscale_overdensities, ) @@ -559,28 +560,32 @@ def main(): f"but mass file has {len(truth_masses):,} entries" ) - # Clean parent data - print("\n[Cleaning] Removing NaN entries from parent...") + # Filter and transform parent data (removes delta <= -1, applies log transform) + print("\n[Cleaning] Filtering and transforming parent overdensities...") + parent_overdens, _ = filter_and_transform_overdensities(parent_overdens) + + # Additional NaN check on parent (shouldn't be any after filtering) parent_overdens = drop_nan_rows_per_scale(parent_overdens) - # Clean full sample data - need to synchronize masses with NaN filtering - print("[Cleaning] Removing NaN entries from full sample...") - n_before = len(next(iter(sample_overdens.values()))) + # Filter and transform full sample data - need to synchronize with masses + print("[Cleaning] Filtering and transforming full sample overdensities...") + sample_overdens, mask = filter_and_transform_overdensities(sample_overdens) - # Build NaN mask from overdensities - mask = np.ones(n_before, dtype=bool) - for arr in sample_overdens.values(): - mask &= ~np.isnan(arr) + # Apply mask to masses to keep them synchronized + truth_masses = truth_masses[mask] - n_dropped = n_before - np.sum(mask) - if n_dropped > 0: - print( - f"[NaN] Dropped {n_dropped:,}/{n_before:,} " - f"({100.0 * n_dropped / n_before:.2f}%) entries containing NaN." 
-        )
-
-    # Apply mask to both overdensities and masses
-    sample_overdens = {scale: arr[mask] for scale, arr in sample_overdens.items()}
-    truth_masses = truth_masses[mask]
+    # Additional NaN check on sample (shouldn't be any after filtering)
+    n_before = len(next(iter(sample_overdens.values())))
+    mask_nan = np.ones(n_before, dtype=bool)
+    for arr in sample_overdens.values():
+        mask_nan &= ~np.isnan(arr)
+
+    # Apply the same mask to both overdensities and masses so they stay
+    # synchronized even if the transform produced stray NaNs
+    sample_overdens = {scale: arr[mask_nan] for scale, arr in sample_overdens.items()}
+    truth_masses = truth_masses[mask_nan]
+
+    n_after = int(np.sum(mask_nan))
+    if n_after != n_before:
+        print(f"[WARNING] Dropped {n_before - n_after} additional NaN values after transform!")
 
     # Subsample from the cleaned full sample
     if args.subsample is None:
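The synchronized filtering above is easy to get wrong, so here is a minimal standalone sketch of the intended behaviour. The names `overdens` and `masses` are illustrative stand-ins for the sample overdensity dictionary and its matching mass array, not the module's API.

import numpy as np

def filter_with_masses(overdens, masses):
    """Drop entries with delta <= -1 in any scale, keeping masses aligned."""
    mask = np.ones(len(masses), dtype=bool)
    for arr in overdens.values():
        mask &= (arr > -1.0)
    filtered = {s: np.log10(arr[mask] + 1.0) for s, arr in overdens.items()}
    return filtered, masses[mask]

overdens = {0.5: np.array([0.2, -1.0, 3.0]), 2.0: np.array([0.1, 0.4, -1.0])}
masses = np.array([1.0, 2.0, 3.0])
filtered, masses = filter_with_masses(overdens, masses)  # keeps only index 0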
From 246583305606b8730a8041974c2a624cf9107841 Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Wed, 19 Nov 2025 20:15:34 +0000
Subject: [PATCH 06/23] Expanding the test suite to debug grid points

---
 CMakeLists.txt                  |   1 +
 scripts/build_and_test_debug.sh | 104 +++++++
 src/debugging_utils.cpp         | 411 +++++++++++++++++++++++++++
 src/debugging_utils.hpp         |  85 ++++++
 src/gridder.cpp                 |  34 +++
 tests/test_file_grid params.yml |  23 ++
 tests/test_file_gridding.py     | 474 ++++++++++++++++++++++++++++++++
 7 files changed, 1132 insertions(+)
 create mode 100755 scripts/build_and_test_debug.sh
 create mode 100644 src/debugging_utils.cpp
 create mode 100644 src/debugging_utils.hpp
 create mode 100644 tests/test_file_grid params.yml
 create mode 100755 tests/test_file_gridding.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9149d6..3833e3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,7 @@ set(SOURCE_FILES
     src/partition.cpp
     src/params.cpp
     src/simulation.cpp
+    src/debugging_utils.cpp
 )
 
 # ========== Target ==========
diff --git a/scripts/build_and_test_debug.sh b/scripts/build_and_test_debug.sh
new file mode 100755
index 0000000..047775b
--- /dev/null
+++ b/scripts/build_and_test_debug.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# Build in debug mode and run comprehensive tests
+# This script compiles with DEBUGGING_CHECKS enabled and runs the full test suite
+#
+
+set -e  # Exit on error during the build phase
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}=============================================="
+echo "  BUILD AND TEST IN DEBUG MODE"
+echo -e "==============================================${NC}"
+echo ""
+
+# Get script directory and project root
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+cd "$PROJECT_ROOT"
+
+# Clean previous debug build
+echo -e "${YELLOW}Cleaning previous debug build...${NC}"
+rm -rf build_debug
+
+# Configure with debug mode
+echo -e "${YELLOW}Configuring debug build...${NC}"
+cmake -B build_debug -DCMAKE_BUILD_TYPE=Debug
+
+# Build
+echo -e "${YELLOW}Building in debug mode...${NC}"
+cmake --build build_debug
+
+if [ $? -ne 0 ]; then
+    echo -e "${RED}✗ Build failed${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✓ Build successful${NC}"
+echo ""
+
+# Check executable
+if [ ! -f "build_debug/parent_gridder" ]; then
+    echo -e "${RED}✗ Executable not found: build_debug/parent_gridder${NC}"
+    exit 1
+fi
+
+echo -e "${BLUE}Running comprehensive test suite...${NC}"
+echo ""
+
+# Tests are allowed to fail without aborting the script; we collect their
+# exit codes below (under set -e the $? captures would never see a failure)
+set +e
+
+# Run file-based gridding tests
+echo -e "${YELLOW}=== File-Based Gridding Tests ===${NC}"
+python3 tests/test_file_gridding.py build_debug/parent_gridder
+
+FILE_TEST_EXIT=$?
+
+echo ""
+
+# Run simple tests
+echo -e "${YELLOW}=== Simple Test ===${NC}"
+bash tests/run_simple_test.sh
+
+SIMPLE_TEST_EXIT=$?
+
+echo ""
+
+# Summary
+echo -e "${BLUE}=============================================="
+echo "  TEST SUMMARY"
+echo -e "==============================================${NC}"
+
+TOTAL_FAILED=0
+
+if [ $FILE_TEST_EXIT -eq 0 ]; then
+    echo -e "${GREEN}✓${NC} File-based gridding tests: PASSED"
+else
+    echo -e "${RED}✗${NC} File-based gridding tests: FAILED"
+    TOTAL_FAILED=$((TOTAL_FAILED + 1))
+fi
+
+if [ $SIMPLE_TEST_EXIT -eq 0 ]; then
+    echo -e "${GREEN}✓${NC} Simple tests: PASSED"
+else
+    echo -e "${RED}✗${NC} Simple tests: FAILED"
+    TOTAL_FAILED=$((TOTAL_FAILED + 1))
+fi
+
+echo -e "${BLUE}==============================================${NC}"
+
+if [ $TOTAL_FAILED -eq 0 ]; then
+    echo -e "${GREEN}ALL TESTS PASSED!${NC}"
+    exit 0
+else
+    echo -e "${RED}$TOTAL_FAILED TEST SUITE(S) FAILED${NC}"
+    echo ""
+    echo "Debug build is available at: build_debug/parent_gridder"
+    echo "You can run individual tests or use a debugger to investigate failures."
+    exit 1
+fi
diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp
new file mode 100644
index 0000000..4eea66f
--- /dev/null
+++ b/src/debugging_utils.cpp
@@ -0,0 +1,411 @@
+// Standard includes
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// Local includes
+#include "debugging_utils.hpp"
+#include "cell.hpp"
+#include "grid_point.hpp"
+#include "logger.hpp"
+#include "metadata.hpp"
+#include "simulation.hpp"
+
+#ifdef DEBUGGING_CHECKS
+
+/**
+ * @brief Find the smallest distance dx along one axis within a box of size
+ *        box_size (handles periodic boundaries)
+ */
+static double nearest(const double dx, const double box_size) {
+  return ((dx > 0.5 * box_size)
+              ? (dx - box_size)
+              : ((dx < -0.5 * box_size) ? (dx + box_size) : dx));
+}
+
+/**
+ * @brief Validate that grid points are correctly assigned to their containing
+ *        cells
+ *
+ * This checks that each grid point is spatially within the bounds of the cell
+ * it's been assigned to.
+ */
+void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating grid point to cell assignments...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (GridPoint *gp : cell->grid_points) {
+      // Check if grid point is within cell bounds
+      bool inside_x = (gp->loc[0] >= cell->loc[0]) &&
+                      (gp->loc[0] < cell->loc[0] + cell->width[0]);
+      bool inside_y = (gp->loc[1] >= cell->loc[1]) &&
+                      (gp->loc[1] < cell->loc[1] + cell->width[1]);
+      bool inside_z = (gp->loc[2] >= cell->loc[2]) &&
+                      (gp->loc[2] < cell->loc[2] + cell->width[2]);
+
+      if (!inside_x || !inside_y || !inside_z) {
+        message("[DEBUG] ERROR: Grid point at (%.3f, %.3f, %.3f) assigned to "
+                "cell %zu "
+                "with bounds [%.3f-%.3f, %.3f-%.3f, %.3f-%.3f]",
+                gp->loc[0], gp->loc[1], gp->loc[2], cid, cell->loc[0],
+                cell->loc[0] + cell->width[0], cell->loc[1],
+                cell->loc[1] + cell->width[1], cell->loc[2],
+                cell->loc[2] + cell->width[2]);
+        errors++;
+      }
+    }
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d grid points incorrectly assigned to cells", errors);
+  } else {
+    message("[DEBUG] ✓ All grid points correctly assigned to cells");
+  }
+}
+
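The invariant this check enforces can be stated compactly: for a uniform top-level cell grid, the owning cell follows from the point position alone, so any point/cell pair can be re-derived and compared. A hedged Python sketch, assuming a cubic domain anchored at the origin; all names are illustrative, not the gridder's API:

import numpy as np

def owning_cell(point, cdim, box_size):
    """Return the (i, j, k) top-level cell index that should own `point`."""
    width = np.asarray(box_size) / np.asarray(cdim)   # per-axis cell width
    idx = np.floor(np.asarray(point) / width).astype(int)
    return np.minimum(idx, np.asarray(cdim) - 1)      # guard the upper edge

assert tuple(owning_cell([9.9, 0.1, 5.0], cdim=[2, 2, 2],
                         box_size=[10.0] * 3)) == (1, 0, 1)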
+/**
+ * @brief Validate that all useful cells have been correctly identified
+ *
+ * Checks that:
+ * 1. All cells with grid points are marked as useful
+ * 2. All neighbors of cells with grid points are marked as useful
+ */
+void validateUsefulCells(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating useful cell flagging...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    // Check 1: Cells with grid points must be useful
+    if (cell->grid_points.size() > 0 && !cell->is_useful) {
+      message("[DEBUG] ERROR: Cell %zu has %zu grid points but is_useful = "
+              "false",
+              cid, cell->grid_points.size());
+      errors++;
+    }
+
+    // Check 2: Neighbors of cells with grid points must be useful
+    if (cell->grid_points.size() > 0) {
+      for (Cell *neighbor : cell->neighbours) {
+        if (!neighbor->is_useful) {
+          message(
+              "[DEBUG] ERROR: Cell %zu has grid points but its neighbor "
+              "(cell at %.3f,%.3f,%.3f) is not marked as useful",
+              cid, neighbor->loc[0], neighbor->loc[1], neighbor->loc[2]);
+          errors++;
+        }
+      }
+    }
+  }
+
+  // Count total useful cells and cells with grid points
+  int useful_count = 0;
+  int cells_with_gps = 0;
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    if (cells[cid].is_useful)
+      useful_count++;
+    if (cells[cid].grid_points.size() > 0)
+      cells_with_gps++;
+  }
+
+  message("[DEBUG] Useful cells: %d, Cells with grid points: %d", useful_count,
+          cells_with_gps);
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d useful cell flagging errors", errors);
+  } else {
+    message("[DEBUG] ✓ All useful cells correctly flagged");
+  }
+}
+
+/**
+ * @brief Validate that particles are in the correct cells
+ *
+ * Checks that each particle's position is within the bounds of its assigned
+ * cell.
+ */
+void validateParticleCellAssignment(Simulation *sim) {
+  message("[DEBUG] Validating particle to cell assignments...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (Particle *part : cell->particles) {
+      // Check if particle is within cell bounds
+      bool inside_x = (part->pos[0] >= cell->loc[0]) &&
+                      (part->pos[0] < cell->loc[0] + cell->width[0]);
+      bool inside_y = (part->pos[1] >= cell->loc[1]) &&
+                      (part->pos[1] < cell->loc[1] + cell->width[1]);
+      bool inside_z = (part->pos[2] >= cell->loc[2]) &&
+                      (part->pos[2] < cell->loc[2] + cell->width[2]);
+
+      if (!inside_x || !inside_y || !inside_z) {
+        message("[DEBUG] ERROR: Particle at (%.3f, %.3f, %.3f) assigned to "
+                "cell %zu "
+                "with bounds [%.3f-%.3f, %.3f-%.3f, %.3f-%.3f]",
+                part->pos[0], part->pos[1], part->pos[2], cid, cell->loc[0],
+                cell->loc[0] + cell->width[0], cell->loc[1],
+                cell->loc[1] + cell->width[1], cell->loc[2],
+                cell->loc[2] + cell->width[2]);
+        errors++;
+        if (errors > 10) { // Limit error spam
+          message("[DEBUG] ... (suppressing further particle assignment "
+                  "errors)");
+          break;
+        }
+      }
+    }
+    if (errors > 10)
+      break;
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d particles incorrectly assigned to cells", errors);
+  } else {
+    message("[DEBUG] ✓ All particles correctly assigned to cells");
+  }
+}
+
+/**
+ * @brief Count particles within radius of a grid point (brute force check)
+ */
+int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim,
+                             double radius) {
+  int count = 0;
+  double radius2 = radius * radius;
+  double *dim = sim->dim;
+
+  std::vector<Cell> &cells = sim->cells;
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (Particle *part : cell->particles) {
+      double dx = nearest(part->pos[0] - grid_point->loc[0], dim[0]);
+      double dy = nearest(part->pos[1] - grid_point->loc[1], dim[1]);
+      double dz = nearest(part->pos[2] - grid_point->loc[2], dim[2]);
+      double r2 = dx * dx + dy * dy + dz * dz;
+
+      if (r2 <= radius2) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
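The same brute-force count is straightforward to reproduce in NumPy for cross-checking gridder output from Python. This sketch assumes a cubic periodic box and applies the minimum-image convention per axis, mirroring nearest() above; the array names are illustrative.

import numpy as np

def brute_force_count(point, positions, radius, box_size):
    """Count particles within `radius` of `point` in a periodic box."""
    d = positions - np.asarray(point)        # (N, 3) raw offsets
    d -= box_size * np.round(d / box_size)   # minimum-image per axis
    return int(np.sum(np.einsum("ij,ij->i", d, d) <= radius * radius))

pos = np.array([[0.1, 0.0, 0.0], [9.9, 0.0, 0.0], [5.0, 5.0, 5.0]])
assert brute_force_count([0.0, 0.0, 0.0], pos, radius=0.5, box_size=10.0) == 2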
+/**
+ * @brief Check if grid points can find any particles within kernel radii
+ *
+ * For each grid point, performs a brute force search to count particles within
+ * the largest kernel radius, then compares to what the gridder found.
+ */
+void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating grid points can find particles...");
+
+  // Find the maximum kernel radius
+  double max_kernel = *std::max_element(grid->kernel_radii.begin(),
+                                        grid->kernel_radii.end());
+
+  int empty_count = 0;
+  int mismatch_count = 0;
+  int checked_count = 0;
+
+  // Sample a subset of grid points to avoid excessive computation
+  size_t sample_interval = std::max(size_t(1), grid->grid_points.size() / 100);
+
+  for (size_t i = 0; i < grid->grid_points.size(); i += sample_interval) {
+    GridPoint *gp = &grid->grid_points[i];
+    checked_count++;
+
+    // Brute force count
+    int brute_count = bruteForceCountParticles(gp, sim, max_kernel);
+
+    // Get what gridder found for largest kernel
+    int gridder_count = 0;
+    if (gp->counts.find(max_kernel) != gp->counts.end()) {
+      gridder_count = gp->counts[max_kernel];
+    }
+
+    if (brute_count == 0) {
+      message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles "
+              "within %.3f Mpc/h",
+              i, gp->loc[0], gp->loc[1], gp->loc[2], max_kernel);
+      empty_count++;
+    } else if (brute_count != gridder_count) {
+      message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): Brute force "
+              "found %d particles, gridder found %d (within %.3f Mpc/h)",
+              i, gp->loc[0], gp->loc[1], gp->loc[2], brute_count,
+              gridder_count, max_kernel);
+      mismatch_count++;
+    }
+  }
+
+  message("[DEBUG] Checked %d grid points:", checked_count);
+  message("[DEBUG]   - %d had no particles within max kernel radius",
+          empty_count);
+  message("[DEBUG]   - %d had count mismatches between brute force and "
+          "gridder",
+          mismatch_count);
+
+  if (mismatch_count > 0) {
+    error("[DEBUG] Found %d grid points with particle count mismatches",
+          mismatch_count);
+  } else if (empty_count > checked_count / 2) {
+    message("[DEBUG] WARNING: More than 50%% of sampled grid points have no "
+            "particles nearby");
+  } else {
+    message("[DEBUG] ✓ Grid point particle counts look reasonable");
+  }
+}
+
+/**
+ * @brief Validate octree structure integrity
+ *
+ * Recursively checks that:
+ * 1. Child cells properly partition their parent
+ * 2. Particle counts match between parent and children
+ * 3. Grid point counts match between parent and children
+ */
+static void validateCellRecursive(Cell *cell, int *errors) {
+  if (!cell->is_split)
+    return;
+
+  // Check particle count consistency
+  size_t child_part_count = 0;
+  for (int i = 0; i < Cell::OCTREE_CHILDREN; i++) {
+    if (cell->children[i] != nullptr) {
+      child_part_count += cell->children[i]->part_count;
+      validateCellRecursive(cell->children[i], errors);
+    }
+  }
+
+  if (child_part_count != cell->part_count) {
+    message(
+        "[DEBUG] ERROR: Cell at (%.3f, %.3f, %.3f) has %zu particles but "
+        "children have %zu total",
+        cell->loc[0], cell->loc[1], cell->loc[2], cell->part_count,
+        child_part_count);
+    (*errors)++;
+  }
+
+  // Check grid point count consistency
+  size_t child_gp_count = 0;
+  for (int i = 0; i < Cell::OCTREE_CHILDREN; i++) {
+    if (cell->children[i] != nullptr) {
+      child_gp_count += cell->children[i]->grid_points.size();
+    }
+  }
+
+  if (child_gp_count != cell->grid_points.size()) {
+    message("[DEBUG] ERROR: Cell at (%.3f, %.3f, %.3f) has %zu grid points "
+            "but children have %zu total",
+            cell->loc[0], cell->loc[1], cell->loc[2], cell->grid_points.size(),
+            child_gp_count);
+    (*errors)++;
+  }
+}
+
+void validateOctreeStructure(Simulation *sim) {
+  message("[DEBUG] Validating octree structure...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    validateCellRecursive(&cells[cid], &errors);
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d octree structure errors", errors);
+  } else {
+    message("[DEBUG] ✓ Octree structure is valid");
+  }
+}
+
+/**
+ * @brief Check that file-based grid points are within simulation boundaries
+ */
+void validateFileGridPoints(Grid *grid, Simulation *sim) {
+  if (!grid->grid_from_file)
+    return;
+
+  message("[DEBUG] Validating file-based grid points are within simulation "
+          "boundaries...");
+
+  int outside_count = 0;
+  double *dim = sim->dim;
+
+  for (GridPoint &gp : grid->grid_points) {
+    if (gp.loc[0] < 0 || gp.loc[0] >= dim[0] || gp.loc[1] < 0 ||
+        gp.loc[1] >= dim[1] || gp.loc[2] < 0 || gp.loc[2] >= dim[2]) {
+      message("[DEBUG] ERROR: Grid point at (%.3f, %.3f, %.3f) is outside "
+              "simulation box [0-%.3f, 0-%.3f, 0-%.3f]",
+              gp.loc[0], gp.loc[1], gp.loc[2], dim[0], dim[1], dim[2]);
+      outside_count++;
+    }
+  }
+
+  if (outside_count > 0) {
+    error("[DEBUG] Found %d grid points outside simulation boundaries",
+          outside_count);
+  } else {
+    message("[DEBUG] ✓ All file-based grid points within simulation "
+            "boundaries");
+  }
+}
+
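Since out-of-bounds grid points abort the run here, a pre-flight check on the grid file is cheap insurance. A sketch assuming the plain "x y z" text format the test suite writes; the function and variable names are illustrative.

import numpy as np

def check_grid_file(path, box_size):
    """Report grid points outside [0, box_size) in any dimension."""
    points = np.loadtxt(path, comments="#").reshape(-1, 3)
    bad = np.any((points < 0) | (points >= box_size), axis=1)
    if bad.any():
        print(f"{bad.sum()} of {len(points)} grid points fall outside the box")
    return points[~bad]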
"true" : "false"); + + // Check neighbors + message("[DEBUG] Cell has %zu neighbors", cell->neighbours.size()); + size_t total_neighbor_particles = 0; + for (Cell *neighbor : cell->neighbours) { + total_neighbor_particles += neighbor->part_count; + } + message("[DEBUG] Neighbors have %zu total particles", total_neighbor_particles); + + // Check particle counts for each kernel + message("[DEBUG] Particle counts by kernel radius:"); + for (double radius : grid->kernel_radii) { + int count = 0; + if (grid_point->counts.find(radius) != grid_point->counts.end()) { + count = grid_point->counts[radius]; + } + int brute_count = bruteForceCountParticles(grid_point, sim, radius); + message("[DEBUG] %.3f Mpc/h: gridder=%d, brute_force=%d", radius, count, + brute_count); + } + + message("[DEBUG] ================================"); +} + +#endif // DEBUGGING_CHECKS diff --git a/src/debugging_utils.hpp b/src/debugging_utils.hpp new file mode 100644 index 0000000..39040bf --- /dev/null +++ b/src/debugging_utils.hpp @@ -0,0 +1,85 @@ +#ifndef DEBUGGING_UTILS_HPP +#define DEBUGGING_UTILS_HPP + +// Header for comprehensive debugging utilities +// These are active only when DEBUGGING_CHECKS is defined + +#include "cell.hpp" +#include "grid_point.hpp" +#include "logger.hpp" +#include "metadata.hpp" +#include "simulation.hpp" +#include + +#ifdef DEBUGGING_CHECKS + +/** + * @brief Validate that grid points are correctly assigned to their containing + * cells + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateGridPointCellAssignment(Simulation *sim, Grid *grid); + +/** + * @brief Validate that all useful cells have been correctly identified + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateUsefulCells(Simulation *sim, Grid *grid); + +/** + * @brief Validate that particles are in the correct cells + * + * @param sim Simulation object + */ +void validateParticleCellAssignment(Simulation *sim); + +/** + * @brief Check if grid points can find any particles within kernel radii + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateGridPointsHaveParticles(Simulation *sim, Grid *grid); + +/** + * @brief Validate octree structure integrity + * + * @param sim Simulation object + */ +void validateOctreeStructure(Simulation *sim); + +/** + * @brief Check that file-based grid points are within simulation boundaries + * + * @param grid Grid object + * @param sim Simulation object + */ +void validateFileGridPoints(Grid *grid, Simulation *sim); + +/** + * @brief Print detailed diagnostics about a specific grid point + * + * @param grid_point The grid point to diagnose + * @param sim Simulation object + * @param grid Grid object + */ +void diagnoseGridPoint(GridPoint *grid_point, Simulation *sim, Grid *grid); + +/** + * @brief Count particles within radius of a grid point (brute force) + * + * @param grid_point The grid point + * @param sim Simulation object + * @param radius Search radius + * @return Number of particles found + */ +int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim, + double radius); + +#endif // DEBUGGING_CHECKS + +#endif // DEBUGGING_UTILS_HPP diff --git a/src/gridder.cpp b/src/gridder.cpp index c18fecb..ad026d6 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -22,6 +22,10 @@ #include "simulation.hpp" #include "talking.hpp" +#ifdef DEBUGGING_CHECKS +#include "debugging_utils.hpp" +#endif + /** * @brief Function to handle the command line arguments using robust parser * @@ -243,6 +247,11 @@ int 
main(int argc, char *argv[]) { return 0; // Clean exit, not an error } +#ifdef DEBUGGING_CHECKS + // Validate file-based grid points are within simulation boundaries + validateFileGridPoints(grid, sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -255,6 +264,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate grid points are correctly assigned to cells + validateGridPointCellAssignment(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -269,6 +283,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate useful cells are correctly flagged + validateUsefulCells(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); @@ -322,6 +341,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate particles are in the correct cells + validateParticleCellAssignment(sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); @@ -360,6 +384,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate octree structure integrity + validateOctreeStructure(sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -373,6 +402,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate grid points have particles (sample check with brute force) + validateGridPointsHaveParticles(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif diff --git a/tests/test_file_grid params.yml b/tests/test_file_grid params.yml new file mode 100644 index 0000000..4846101 --- /dev/null +++ b/tests/test_file_grid params.yml @@ -0,0 +1,23 @@ +# Test parameter file for file-based gridding +# Tests gridding with coordinates loaded from a file + +Kernels: + nkernels: 3 + kernel_radius_1: 0.5 # Small scale + kernel_radius_2: 2.0 # Medium scale + kernel_radius_3: 5.0 # Large scale + +Grid: + type: file + grid_file: tests/data/test_grid_coordinates.txt + +Tree: + max_leaf_count: 200 + +Input: + filepath: tests/data/test_snapshot.hdf5 + +Output: + filepath: tests/data/ + basename: file_grid_output.hdf5 + write_masses: 1 diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py new file mode 100755 index 0000000..a910317 --- /dev/null +++ b/tests/test_file_gridding.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python3 +""" +Comprehensive test suite for file-based gridding functionality. + +This tests the critical case where grid points are loaded from a file, +which is the workflow that's failing for halo center gridding. 
+""" + +import argparse +import h5py +import numpy as np +import os +import subprocess +import sys +import tempfile + + +class FileGriddingTest: + """Test suite for file-based gridding""" + + def __init__(self, executable, test_dir="tests/data"): + self.executable = executable + self.test_dir = test_dir + os.makedirs(test_dir, exist_ok=True) + + # Test configuration + self.box_size = 10.0 # Mpc/h + self.n_particles = 1000 + self.kernel_radii = [0.5, 2.0, 5.0] + + def create_test_snapshot(self, filename): + """Create a simple test snapshot with particles in clusters""" + print(f"Creating test snapshot: {filename}") + + with h5py.File(filename, 'w') as f: + # Header + header = f.create_group('Header') + header.attrs['BoxSize'] = self.box_size + header.attrs['NumPart_Total'] = [0, self.n_particles, 0, 0, 0, 0] + header.attrs['NumPart_ThisFile'] = [0, self.n_particles, 0, 0, 0, 0] + header.attrs['Redshift'] = 7.0 + + # Units + units = f.create_group('Units') + units.attrs['Unit mass in cgs (U_M)'] = 1.989e43 # 10^10 Msun + units.attrs['Unit length in cgs (U_L)'] = 3.086e24 # Mpc + + # Cosmology + cosmo = f.create_group('Cosmology') + cosmo.attrs['Critical density [internal units]'] = 1.0 + + # Create particle distributions: 5 clusters + n_clusters = 5 + particles_per_cluster = self.n_particles // n_clusters + cluster_centers = [ + [2.0, 2.0, 2.0], + [5.0, 5.0, 5.0], + [8.0, 2.0, 5.0], + [2.0, 8.0, 5.0], + [5.0, 5.0, 8.0], + ] + + positions = [] + masses = [] + + for center in cluster_centers: + # Gaussian distribution around center + cluster_pos = np.random.normal( + loc=center, + scale=0.3, # Tight cluster + size=(particles_per_cluster, 3) + ) + # Wrap to box + cluster_pos = np.mod(cluster_pos, self.box_size) + positions.append(cluster_pos) + masses.extend([1.0] * particles_per_cluster) + + # Add remaining particles uniformly + remainder = self.n_particles - (n_clusters * particles_per_cluster) + if remainder > 0: + uniform_pos = np.random.uniform( + 0, self.box_size, size=(remainder, 3) + ) + positions.append(uniform_pos) + masses.extend([1.0] * remainder) + + positions = np.vstack(positions) + masses = np.array(masses) + + # Write particle data + part1 = f.create_group('PartType1') + part1.create_dataset('Coordinates', data=positions) + part1.create_dataset('Masses', data=masses) + part1.create_dataset('ParticleIDs', data=np.arange(len(masses))) + + print(f" Created {len(masses)} particles in {n_clusters} clusters") + return cluster_centers + + def create_grid_file(self, filename, grid_points): + """Create a text file with grid point coordinates""" + print(f"Creating grid file: {filename}") + + with open(filename, 'w') as f: + f.write("# Test grid point coordinates\n") + f.write("# Format: x y z (Mpc/h)\n") + f.write(f"# {len(grid_points)} grid points\n") + f.write("\n") + + for point in grid_points: + f.write(f"{point[0]:.6f} {point[1]:.6f} {point[2]:.6f}\n") + + print(f" Wrote {len(grid_points)} grid points") + + def create_param_file(self, filename, snapshot_file, grid_file, + output_file): + """Create a parameter file for the gridder""" + print(f"Creating parameter file: {filename}") + + content = f"""# Test parameter file for file-based gridding +Kernels: + nkernels: {len(self.kernel_radii)} +""" + for i, radius in enumerate(self.kernel_radii, 1): + content += f" kernel_radius_{i}: {radius}\n" + + content += f""" +Grid: + type: file + grid_file: {grid_file} + +Tree: + max_leaf_count: 200 + +Input: + filepath: {snapshot_file} + +Output: + filepath: {os.path.dirname(output_file)}/ + 
basename: {os.path.basename(output_file)} + write_masses: 1 +""" + + with open(filename, 'w') as f: + f.write(content) + + def run_gridder(self, param_file, nthreads=4, verbosity=2): + """Run the gridder executable""" + print(f"\nRunning gridder: {self.executable}") + print(f" Params: {param_file}") + print(f" Threads: {nthreads}") + print(f" Verbosity: {verbosity}") + + cmd = [self.executable, param_file, str(nthreads), str(verbosity)] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True + ) + print(" Gridder completed successfully") + if verbosity >= 2: + print("\n--- Gridder Output ---") + print(result.stdout) + if result.stderr: + print("\n--- Stderr ---") + print(result.stderr) + print("--- End Output ---\n") + return True + except subprocess.CalledProcessError as e: + print(f" ERROR: Gridder failed with exit code {e.returncode}") + print("\n--- Stdout ---") + print(e.stdout) + print("\n--- Stderr ---") + print(e.stderr) + return False + + def validate_output(self, output_file, expected_grid_points): + """Validate the gridder output""" + print(f"\nValidating output: {output_file}") + + if not os.path.exists(output_file): + print(f" ERROR: Output file not found: {output_file}") + return False + + errors = [] + + with h5py.File(output_file, 'r') as f: + # Check basic structure + required_groups = ['Header', 'Grids', 'Cells'] + for group in required_groups: + if group not in f: + errors.append(f"Missing required group: {group}") + + if 'Grids' not in f: + return False + + # Check kernels + kernels = list(f['Grids'].keys()) + print(f" Found {len(kernels)} kernels: {kernels}") + + if len(kernels) != len(self.kernel_radii): + errors.append( + f"Expected {len(self.kernel_radii)} kernels, " + f"found {len(kernels)}" + ) + + # Check each kernel + for i, expected_radius in enumerate(self.kernel_radii): + kernel_name = f'Kernel_{i}' + + if kernel_name not in f['Grids']: + errors.append(f"Missing kernel: {kernel_name}") + continue + + kernel_grp = f['Grids'][kernel_name] + + # Check kernel radius attribute + if 'KernelRadius' in kernel_grp.attrs: + actual_radius = kernel_grp.attrs['KernelRadius'] + if not np.isclose(actual_radius, expected_radius): + errors.append( + f"{kernel_name}: Expected radius {expected_radius}, " + f"got {actual_radius}" + ) + else: + errors.append( + f"{kernel_name}: Missing KernelRadius attribute" + ) + + # Check datasets + required_datasets = ['GridPointOverDensities', 'GridPointMasses'] + for dataset in required_datasets: + if dataset not in kernel_grp: + errors.append( + f"{kernel_name}: Missing dataset {dataset}" + ) + continue + + data = kernel_grp[dataset][:] + print(f" {kernel_name}/{dataset}: shape={data.shape}, " + f"min={data.min():.3f}, max={data.max():.3f}, " + f"mean={data.mean():.3f}") + + # Check for data quality issues + n_empty = np.sum(data == -1) + n_zero = np.sum(data == 0) + n_positive = np.sum(data > 0) + + print(f" Values: {n_positive} positive, " + f"{n_zero} zero, {n_empty} empty (-1)") + + # CRITICAL CHECK: For halo center gridding, we expect + # most grid points to find particles + if dataset == 'GridPointMasses': + if n_empty > len(data) * 0.5: + errors.append( + f"{kernel_name}: More than 50% of grid points " + f"have no particles ({n_empty}/{len(data)}). " + f"This suggests a major problem!" 
+ ) + + if errors: + print("\n VALIDATION ERRORS:") + for error in errors: + print(f" - {error}") + return False + + print(" ✓ Validation passed") + return True + + def test_cluster_centers(self): + """ + Test 1: Grid points at cluster centers + This is the critical test - grid points at known high-density regions + """ + print("\n" + "="*60) + print("TEST 1: Grid points at cluster centers") + print("="*60) + + # Create test files + snapshot_file = os.path.join(self.test_dir, "test_clusters.hdf5") + grid_file = os.path.join(self.test_dir, "cluster_centers.txt") + param_file = os.path.join(self.test_dir, "test_clusters.yml") + output_file = os.path.join(self.test_dir, "cluster_output.hdf5") + + # Create snapshot with known cluster centers + cluster_centers = self.create_test_snapshot(snapshot_file) + + # Create grid file with grid points AT cluster centers + # These points MUST find particles + self.create_grid_file(grid_file, cluster_centers) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run gridder + if not self.run_gridder(param_file): + print("\n✗ TEST 1 FAILED: Gridder execution failed") + return False + + # Validate output + if not self.validate_output(output_file, cluster_centers): + print("\n✗ TEST 1 FAILED: Output validation failed") + return False + + print("\n✓ TEST 1 PASSED: Cluster center gridding works") + return True + + def test_random_points(self): + """ + Test 2: Random grid points throughout box + """ + print("\n" + "="*60) + print("TEST 2: Random grid points") + print("="*60) + + snapshot_file = os.path.join(self.test_dir, "test_random.hdf5") + grid_file = os.path.join(self.test_dir, "random_points.txt") + param_file = os.path.join(self.test_dir, "test_random.yml") + output_file = os.path.join(self.test_dir, "random_output.hdf5") + + # Create snapshot + self.create_test_snapshot(snapshot_file) + + # Create random grid points + n_grid = 50 + grid_points = np.random.uniform(0, self.box_size, size=(n_grid, 3)) + self.create_grid_file(grid_file, grid_points) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run and validate + if not self.run_gridder(param_file): + print("\n✗ TEST 2 FAILED: Gridder execution failed") + return False + + if not self.validate_output(output_file, grid_points): + print("\n✗ TEST 2 FAILED: Output validation failed") + return False + + print("\n✓ TEST 2 PASSED: Random point gridding works") + return True + + def test_boundary_points(self): + """ + Test 3: Grid points near box boundaries (periodic BC test) + """ + print("\n" + "="*60) + print("TEST 3: Boundary and periodic BC test") + print("="*60) + + snapshot_file = os.path.join(self.test_dir, "test_boundary.hdf5") + grid_file = os.path.join(self.test_dir, "boundary_points.txt") + param_file = os.path.join(self.test_dir, "test_boundary.yml") + output_file = os.path.join(self.test_dir, "boundary_output.hdf5") + + # Create snapshot + self.create_test_snapshot(snapshot_file) + + # Create grid points at box edges and corners + eps = 0.1 # Small offset from boundary + grid_points = [ + # Corners + [eps, eps, eps], + [self.box_size - eps, eps, eps], + [eps, self.box_size - eps, eps], + [eps, eps, self.box_size - eps], + [self.box_size - eps, self.box_size - eps, eps], + [self.box_size - eps, eps, self.box_size - eps], + [eps, self.box_size - eps, self.box_size - eps], + [self.box_size - eps, self.box_size - eps, self.box_size - eps], + # Face centers + [self.box_size / 2, eps, 
self.box_size / 2], + [self.box_size / 2, self.box_size - eps, self.box_size / 2], + [eps, self.box_size / 2, self.box_size / 2], + [self.box_size - eps, self.box_size / 2, self.box_size / 2], + ] + self.create_grid_file(grid_file, grid_points) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run and validate + if not self.run_gridder(param_file): + print("\n✗ TEST 3 FAILED: Gridder execution failed") + return False + + if not self.validate_output(output_file, grid_points): + print("\n✗ TEST 3 FAILED: Output validation failed") + return False + + print("\n✓ TEST 3 PASSED: Boundary point gridding works") + return True + + def run_all_tests(self): + """Run all tests and report results""" + print("\n" + "="*60) + print("FILE-BASED GRIDDING TEST SUITE") + print("="*60) + print(f"Executable: {self.executable}") + print(f"Test directory: {self.test_dir}") + print("") + + tests = [ + ("Cluster Centers", self.test_cluster_centers), + ("Random Points", self.test_random_points), + ("Boundary Points", self.test_boundary_points), + ] + + results = {} + for name, test_func in tests: + try: + results[name] = test_func() + except Exception as e: + print(f"\n✗ TEST '{name}' CRASHED: {e}") + import traceback + traceback.print_exc() + results[name] = False + + # Print summary + print("\n" + "="*60) + print("TEST SUMMARY") + print("="*60) + + for name, passed in results.items(): + status = "✓ PASSED" if passed else "✗ FAILED" + print(f" {status}: {name}") + + n_passed = sum(results.values()) + n_total = len(results) + + print("") + print(f"Total: {n_passed}/{n_total} tests passed") + print("="*60) + + return all(results.values()) + + +def main(): + parser = argparse.ArgumentParser( + description="Test suite for file-based gridding functionality" + ) + parser.add_argument( + "executable", + help="Path to parent_gridder executable" + ) + parser.add_argument( + "--test-dir", + default="tests/data", + help="Directory for test files (default: tests/data)" + ) + + args = parser.parse_args() + + # Check executable exists + if not os.path.exists(args.executable): + print(f"ERROR: Executable not found: {args.executable}") + return 1 + + # Run tests + tester = FileGriddingTest(args.executable, args.test_dir) + success = tester.run_all_tests() + + return 0 if success else 1 + + +if __name__ == "__main__": + sys.exit(main()) From 084f4f80dfad86bef0f911a64a44c49f1fb72b3f Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 08:28:27 +0000 Subject: [PATCH 07/23] Fixing debugging compilation --- src/debugging_utils.cpp | 26 ++++++-------------------- src/grid_point.cpp | 11 +++++++++++ src/grid_point.hpp | 1 + 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp index 4eea66f..13d23fc 100644 --- a/src/debugging_utils.cpp +++ b/src/debugging_utils.cpp @@ -13,16 +13,6 @@ #ifdef DEBUGGING_CHECKS -/** - * @brief Find the smallest distance dx along one axis within a box of size - * box_size (handles periodic boundaries) - */ -static double nearest(const double dx, const double box_size) { - return ((dx > 0.5 * box_size) - ? (dx - box_size) - : ((dx < -0.5 * box_size) ? (dx + box_size) : dx)); -} - /** * @brief Validate that grid points are correctly assigned to their containing * cells @@ -30,7 +20,8 @@ static double nearest(const double dx, const double box_size) { * This checks that each grid point is spatially within the bounds of the cell * it's been assigned to. 
 */
-void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
+void validateGridPointCellAssignment(Simulation *sim,
+                                     [[maybe_unused]] Grid *grid) {
   message("[DEBUG] Validating grid point to cell assignments...");
 
   int errors = 0;
@@ -75,7 +66,7 @@ void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
  * 1. All cells with grid points are marked as useful
  * 2. All neighbors of cells with grid points are marked as useful
  */
-void validateUsefulCells(Simulation *sim, Grid *grid) {
+void validateUsefulCells(Simulation *sim, [[maybe_unused]] Grid *grid) {
   message("[DEBUG] Validating useful cell flagging...");
 
   int errors = 0;
@@ -233,10 +224,7 @@ void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) {
     int brute_count = bruteForceCountParticles(gp, sim, max_kernel);
 
     // Get what gridder found for largest kernel
-    int gridder_count = 0;
-    if (gp->counts.find(max_kernel) != gp->counts.end()) {
-      gridder_count = gp->counts[max_kernel];
-    }
+    int gridder_count = gp->getCount(max_kernel);
 
     if (brute_count == 0) {
       message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles "
@@ -396,10 +384,8 @@ void diagnoseGridPoint(GridPoint *grid_point, Simulation *sim, Grid *grid) {
   // Check particle counts for each kernel
   message("[DEBUG] Particle counts by kernel radius:");
   for (double radius : grid->kernel_radii) {
-    int count = 0;
-    if (grid_point->counts.find(radius) != grid_point->counts.end()) {
-      count = grid_point->counts[radius];
-    }
+    // Get count from gridder
+    int count = grid_point->getCount(radius);
     int brute_count = bruteForceCountParticles(grid_point, sim, radius);
     message("[DEBUG]   %.3f Mpc/h: gridder=%d, brute_force=%d", radius, count,
             brute_count);
diff --git a/src/grid_point.cpp b/src/grid_point.cpp
index f4a9e67..a9f78d9 100644
--- a/src/grid_point.cpp
+++ b/src/grid_point.cpp
@@ -86,6 +86,17 @@ double GridPoint::getMass(const double kernel_radius) const {
   }
 }
 
+// Method to get the particle count inside the kernel radius
+int GridPoint::getCount(const double kernel_radius) const {
+  // Check if the kernel radius exists in the count map
+  auto it = this->count_map.find(kernel_radius);
+  if (it != this->count_map.end()) {
+    return static_cast<int>(it->second);
+  } else {
+    return 0; // Return 0 if the kernel radius is not found
+  }
+}
+
 /**
  * @brief Construct a new Grid object
  *
diff --git a/src/grid_point.hpp b/src/grid_point.hpp
index ede437e..e103b6f 100644
--- a/src/grid_point.hpp
+++ b/src/grid_point.hpp
@@ -28,6 +28,7 @@ class GridPoint {
                             double kernel_radius);
   double getOverDensity(const double kernel_radius, Simulation *sim) const;
   double getMass(const double kernel_radius) const;
+  int getCount(const double kernel_radius) const;
 
 private:
   //! 
The count of particles in each kernel radius From 54df4c1163e9b9131cda4ef9d658f2788948c73a Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 09:34:46 +0000 Subject: [PATCH 08/23] Adjusting tests --- tests/test_file_gridding.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index a910317..b49d352 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -49,6 +49,32 @@ def create_test_snapshot(self, filename): cosmo = f.create_group('Cosmology') cosmo.attrs['Critical density [internal units]'] = 1.0 + # Cells metadata (required by gridder) + cells = f.create_group('Cells') + cells_meta = cells.create_group('Meta-data') + cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) + cells_meta.attrs['nr_cells'] = 8 + cells_meta.attrs['size'] = self.box_size + + # Cells Centres + cell_width = self.box_size / 2.0 + centres = [] + for i in range(2): + for j in range(2): + for k in range(2): + centres.append([ + (i + 0.5) * cell_width, + (j + 0.5) * cell_width, + (k + 0.5) * cell_width + ]) + cells.create_dataset('Centres', data=np.array(centres)) + + # Cells Counts (will be filled correctly after we know particle positions) + cells.create_dataset('Counts', data=np.zeros((8, 6), dtype=np.int32)) + + # Cells OffsetsInFile + cells.create_dataset('OffsetsInFile', data=np.zeros((8, 6), dtype=np.int64)) + # Create particle distributions: 5 clusters n_clusters = 5 particles_per_cluster = self.n_particles // n_clusters @@ -87,6 +113,32 @@ def create_test_snapshot(self, filename): positions = np.vstack(positions) masses = np.array(masses) + # Compute cell counts + cell_counts = np.zeros((8, 6), dtype=np.int32) + cell_offsets = np.zeros((8, 6), dtype=np.int64) + + # Assign particles to cells + for i, pos in enumerate(positions): + ix = int(pos[0] / cell_width) + iy = int(pos[1] / cell_width) + iz = int(pos[2] / cell_width) + # Ensure within bounds + ix = max(0, min(1, ix)) + iy = max(0, min(1, iy)) + iz = max(0, min(1, iz)) + cell_id = ix * 4 + iy * 2 + iz + cell_counts[cell_id, 1] += 1 # PartType1 + + # Set offsets (cumulative) + offset = 0 + for cell_id in range(8): + cell_offsets[cell_id, 1] = offset + offset += cell_counts[cell_id, 1] + + # Update cell datasets + f['Cells/Counts'][:] = cell_counts + f['Cells/OffsetsInFile'][:] = cell_offsets + # Write particle data part1 = f.create_group('PartType1') part1.create_dataset('Coordinates', data=positions) From 817e40a392701135dc28efb97b26de4a0ddffb1c Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 09:44:23 +0000 Subject: [PATCH 09/23] Fixing test suite --- tests/test_file_gridding.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index b49d352..1df8fcc 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -69,11 +69,13 @@ def create_test_snapshot(self, filename): ]) cells.create_dataset('Centres', data=np.array(centres)) - # Cells Counts (will be filled correctly after we know particle positions) - cells.create_dataset('Counts', data=np.zeros((8, 6), dtype=np.int32)) + # Create Counts and OffsetsInFile groups (not datasets!) 
+ counts_grp = cells.create_group('Counts') + offsets_grp = cells.create_group('OffsetsInFile') - # Cells OffsetsInFile - cells.create_dataset('OffsetsInFile', data=np.zeros((8, 6), dtype=np.int64)) + # Will be filled after we know particle positions + counts_grp.create_dataset('PartType1', data=np.zeros(8, dtype=np.int32)) + offsets_grp.create_dataset('PartType1', data=np.zeros(8, dtype=np.int32)) # Create particle distributions: 5 clusters n_clusters = 5 @@ -113,9 +115,9 @@ def create_test_snapshot(self, filename): positions = np.vstack(positions) masses = np.array(masses) - # Compute cell counts - cell_counts = np.zeros((8, 6), dtype=np.int32) - cell_offsets = np.zeros((8, 6), dtype=np.int64) + # Compute cell counts for PartType1 + cell_counts = np.zeros(8, dtype=np.int32) + cell_offsets = np.zeros(8, dtype=np.int32) # Assign particles to cells for i, pos in enumerate(positions): @@ -127,17 +129,17 @@ def create_test_snapshot(self, filename): iy = max(0, min(1, iy)) iz = max(0, min(1, iz)) cell_id = ix * 4 + iy * 2 + iz - cell_counts[cell_id, 1] += 1 # PartType1 + cell_counts[cell_id] += 1 # Set offsets (cumulative) offset = 0 for cell_id in range(8): - cell_offsets[cell_id, 1] = offset - offset += cell_counts[cell_id, 1] + cell_offsets[cell_id] = offset + offset += cell_counts[cell_id] # Update cell datasets - f['Cells/Counts'][:] = cell_counts - f['Cells/OffsetsInFile'][:] = cell_offsets + f['Cells/Counts/PartType1'][:] = cell_counts + f['Cells/OffsetsInFile/PartType1'][:] = cell_offsets # Write particle data part1 = f.create_group('PartType1') From f5662473d57ac5cf49458b5111c6efb371713920 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 10:57:04 +0000 Subject: [PATCH 10/23] More test fixing... --- tests/test_file_gridding.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index 1df8fcc..a286a23 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -214,13 +214,23 @@ def run_gridder(self, param_file, nthreads=4, verbosity=2): check=True ) print(" Gridder completed successfully") - if verbosity >= 2: + + # Always print debug output (contains validation results) + if result.stdout: print("\n--- Gridder Output ---") print(result.stdout) - if result.stderr: - print("\n--- Stderr ---") - print(result.stderr) - print("--- End Output ---\n") + if result.stderr: + print("\n--- Stderr ---") + print(result.stderr) + print("--- End Output ---\n") + + # Check for debug validation failures in output + if '[DEBUG]' in result.stdout or '[DEBUG]' in result.stderr: + debug_output = result.stdout + result.stderr + if 'ERROR' in debug_output or 'FAILED' in debug_output: + print("\n!!! 
DEBUG CHECKS DETECTED ERRORS !!!") + print("See debug output above for details.\n") + return True except subprocess.CalledProcessError as e: print(f" ERROR: Gridder failed with exit code {e.returncode}") @@ -250,9 +260,12 @@ def validate_output(self, output_file, expected_grid_points): if 'Grids' not in f: return False - # Check kernels - kernels = list(f['Grids'].keys()) + # Check kernels (filter out non-kernel datasets like GridPointPositions) + all_keys = list(f['Grids'].keys()) + kernels = [k for k in all_keys if k.startswith('Kernel_')] print(f" Found {len(kernels)} kernels: {kernels}") + if len(all_keys) != len(kernels): + print(f" (Ignoring non-kernel datasets: {[k for k in all_keys if not k.startswith('Kernel_')]})") if len(kernels) != len(self.kernel_radii): errors.append( @@ -308,12 +321,22 @@ def validate_output(self, output_file, expected_grid_points): # CRITICAL CHECK: For halo center gridding, we expect # most grid points to find particles if dataset == 'GridPointMasses': + # Check for empty (-1) values if n_empty > len(data) * 0.5: errors.append( f"{kernel_name}: More than 50% of grid points " f"have no particles ({n_empty}/{len(data)}). " f"This suggests a major problem!" ) + # CRITICAL: Check for zero values when we expect particles + # (cluster centers should have particles!) + if 'cluster' in output_file.lower() and n_zero > len(data) * 0.5: + errors.append( + f"CRITICAL BUG: {kernel_name}: Grid points at cluster " + f"centers found ZERO particles ({n_zero}/{len(data)}). " + f"Particles exist at these locations but gridder " + f"is not finding them!" + ) if errors: print("\n VALIDATION ERRORS:") From 563a12c1a53440f2d721a9fe47b04dc94ae9ba52 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 11:04:42 +0000 Subject: [PATCH 11/23] More info in debugging tests --- src/cell.cpp | 21 +++++++++++++++++++++ src/construct_grid_points.cpp | 12 ++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/cell.cpp b/src/cell.cpp index 790f0cd..bf09178 100644 --- a/src/cell.cpp +++ b/src/cell.cpp @@ -1222,6 +1222,27 @@ void assignGridPointsToCells([[maybe_unused]] Simulation *sim, Grid *grid) { // Get the cells for debugging checks std::vector &cells = sim->cells; + // Print cell assignment summary + message("[DEBUG] Grid point to cell assignment summary:"); + int cells_with_gps = 0; + for (size_t cid = 0; cid < sim->nr_cells; cid++) { + Cell *cell = &cells[cid]; + if (cell->grid_points.size() > 0) { + cells_with_gps++; + message("[DEBUG] Cell %zu at (%.3f, %.3f, %.3f) width (%.3f, %.3f, %.3f) has %zu grid points", + cid, cell->loc[0], cell->loc[1], cell->loc[2], + cell->width[0], cell->width[1], cell->width[2], + cell->grid_points.size()); + // Print first grid point in this cell + if (cell->grid_points.size() > 0) { + GridPoint *gp = cell->grid_points[0]; + message("[DEBUG] First GP: (%.6f, %.6f, %.6f)", + gp->loc[0], gp->loc[1], gp->loc[2]); + } + } + } + message("[DEBUG] Total cells with grid points: %d", cells_with_gps); + // Check grid points are in the right cells for (size_t cid = 0; cid < sim->nr_cells; cid++) { Cell *cell = &cells[cid]; diff --git a/src/construct_grid_points.cpp b/src/construct_grid_points.cpp index b25b43a..8bd1976 100644 --- a/src/construct_grid_points.cpp +++ b/src/construct_grid_points.cpp @@ -413,6 +413,18 @@ static void createGridPointsFromFile(Simulation *sim, Grid *grid) { message("Created %d grid points from file %s", valid_points, grid->grid_file.c_str()); +#ifdef DEBUGGING_CHECKS + // Print first few grid points to verify they loaded 
correctly
+  message("[DEBUG] First 5 grid points loaded from file:");
+  int n_to_print = std::min(5, valid_points);
+  for (int i = 0; i < n_to_print; i++) {
+    message("[DEBUG]   Point %d: (%.6f, %.6f, %.6f)", i,
+            grid->grid_points[i].loc[0],
+            grid->grid_points[i].loc[1],
+            grid->grid_points[i].loc[2]);
+  }
+#endif
+
   // Initialize mass and count maps for all grid points
   message("Initializing grid point maps for %d kernel radii", grid->nkernels);
   for (GridPoint &gp : grid->grid_points) {
From e4833913fa14ce4b882075d698021b3e5d627bdb Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Thu, 20 Nov 2025 11:12:40 +0000
Subject: [PATCH 12/23] Fixing test

---
 tests/test_file_gridding.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py
index a286a23..caa0cc2 100755
--- a/tests/test_file_gridding.py
+++ b/tests/test_file_gridding.py
@@ -35,7 +35,8 @@ def create_test_snapshot(self, filename):
         with h5py.File(filename, 'w') as f:
             # Header
             header = f.create_group('Header')
-            header.attrs['BoxSize'] = self.box_size
+            # BoxSize must be an array [x, y, z] not a scalar!
+            header.attrs['BoxSize'] = np.array([self.box_size, self.box_size, self.box_size])
             header.attrs['NumPart_Total'] = [0, self.n_particles, 0, 0, 0, 0]
             header.attrs['NumPart_ThisFile'] = [0, self.n_particles, 0, 0, 0, 0]
             header.attrs['Redshift'] = 7.0
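Downstream readers can defend against the scalar-versus-vector BoxSize ambiguity this fix addresses. A hedged sketch; the helper name is illustrative and only the Header/BoxSize layout above is assumed.

import numpy as np
import h5py

def read_box_size(path):
    """Return BoxSize as a length-3 array whether stored scalar or vector."""
    with h5py.File(path, "r") as f:
        box = np.atleast_1d(f["Header"].attrs["BoxSize"]).astype(float)
    return box if box.size == 3 else np.repeat(box[0], 3)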
From 90590d76a7d7d1e582e0a233e9978518d7e91f23 Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Thu, 20 Nov 2025 12:01:44 +0000
Subject: [PATCH 13/23] Introducing a brute force sanity check in debug mode

---
 src/construct_cells.cpp                      |  45 +++++---
 src/debugging_utils.cpp                      | 105 +++++++++++--------
 src/grid_point.cpp                           |  17 +++
 src/grid_point.hpp                           |  10 ++
 src/output.cpp                               |  59 +++++++++++
 tests/analyze_debug_output.py                |  76 ++++++++++++++
 make_test_snap.py => tests/make_test_snap.py |   0
 tests/run_simple_test.sh                     |   2 +-
 tests/test_file_gridding.py                  |   5 +-
 tests/test_gridder.py                        |   2 +-
 10 files changed, 255 insertions(+), 66 deletions(-)
 create mode 100644 tests/analyze_debug_output.py
 rename make_test_snap.py => tests/make_test_snap.py (100%)

diff --git a/src/construct_cells.cpp b/src/construct_cells.cpp
index b6f648d..c94598b 100644
--- a/src/construct_cells.cpp
+++ b/src/construct_cells.cpp
@@ -47,23 +47,28 @@ void getTopCells(Simulation *sim, Grid *grid) {
   // neighbouring cells (this simplifies boilerplate elsewhere)
 
   // How many cells do we need to walk out for the biggest kernel? This is
-  // the maximum distance at which we will need to consider another cell
-  const int nwalk = std::ceil(grid->max_kernel_radius / width[0]) + 1;
-  int nwalk_upper = nwalk;
-  int nwalk_lower = nwalk;
-
-  // If nwalk is greater than the number of cells in the simulation, we need
-  // to walk out to the edge of the simulation
-  if (nwalk > cdim[0] / 2) {
-    nwalk_upper = cdim[0] / 2;
-    nwalk_lower = cdim[0] / 2;
+  // the maximum distance at which we will need to consider another cell.
+  // We compute this separately for each dimension since cell widths and
+  // grid dimensions may differ.
+  int nwalk[3];
+  for (int dim = 0; dim < 3; dim++) {
+    nwalk[dim] = std::ceil(grid->max_kernel_radius / width[dim]) + 1;
+
+    // Clamp to half the grid dimension to prevent duplicate neighbors
+    // through periodic wrapping. If we walk more than cdim/2, we'll
+    // encounter the same cell from multiple periodic images.
+    if (nwalk[dim] > cdim[dim] / 2) {
+      nwalk[dim] = cdim[dim] / 2;
+    }
   }
 
-  message("Looking for neighbours within %d cells", nwalk);
+  message("Looking for neighbours within [%d, %d, %d] cells",
+          nwalk[0], nwalk[1], nwalk[2]);
 
-  // Calculate maximum neighbors
+  // Calculate maximum neighbors (use the maximum nwalk for reservation)
+  const int max_nwalk = std::max({nwalk[0], nwalk[1], nwalk[2]});
   const int max_neighbors =
-      (2 * nwalk + 1) * (2 * nwalk + 1) * (2 * nwalk + 1) -
+      (2 * max_nwalk + 1) * (2 * max_nwalk + 1) * (2 * max_nwalk + 1) -
       1; // -1 excludes self
 
   // Loop over the cells attaching the pointers the neighbouring cells (taking
@@ -82,10 +87,11 @@ void getTopCells(Simulation *sim, Grid *grid) {
     // Reserve space for neighbors
     cell->neighbours.reserve(max_neighbors);
 
-    // Loop over the neighbours
-    for (int ii = -nwalk_lower; ii < nwalk_upper + 1; ii++) {
-      for (int jj = -nwalk_lower; jj < nwalk_upper + 1; jj++) {
-        for (int kk = -nwalk_lower; kk < nwalk_upper + 1; kk++) {
+    // Loop over the neighbours using dimension-specific nwalk values
+    // The nwalk values are already clamped to cdim/2, preventing duplicates
+    for (int ii = -nwalk[0]; ii <= nwalk[0]; ii++) {
+      for (int jj = -nwalk[1]; jj <= nwalk[1]; jj++) {
+        for (int kk = -nwalk[2]; kk <= nwalk[2]; kk++) {
 
           // Skip the cell itself
           if (ii == 0 && jj == 0 && kk == 0)
@@ -97,6 +103,11 @@ void getTopCells(Simulation *sim, Grid *grid) {
           int kkk = (k + kk + cdim[2]) % cdim[2];
           int cjd = iii * cdim[1] * cdim[2] + jjj * cdim[2] + kkk;
 
+          // Skip if this wraps back to the cell itself
+          // (can happen with periodic boundaries in small boxes)
+          if (cjd == static_cast<int>(cid))
+            continue;
+
           // Attach the neighbour to the cell
           cell->neighbours.push_back(&cells[cjd]);
         }
diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp
index 13d23fc..6de561f 100644
--- a/src/debugging_utils.cpp
+++ b/src/debugging_utils.cpp
@@ -199,62 +199,75 @@ int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim,
 /**
  * @brief Check if grid points can find any particles within kernel radii
  *
- * For each grid point, performs a brute force search to count particles within
- * the largest kernel radius, then compares to what the gridder found.
+ * For ALL grid points, performs a brute force search to count particles within
+ * each kernel radius, stores the counts, then compares to what the gridder found.
+ * In debug mode, these brute force counts will be written to the output file.
*/ void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) { - message("[DEBUG] Validating grid points can find particles..."); + message("[DEBUG] Computing brute force particle counts for all grid points and all kernels..."); + message("[DEBUG] This will be written to output file for comparison."); - // Find the maximum kernel radius - double max_kernel = *std::max_element(grid->kernel_radii.begin(), - grid->kernel_radii.end()); + int total_mismatches = 0; + int total_empty_kernels = 0; + int total_checks = 0; - int empty_count = 0; - int mismatch_count = 0; - int checked_count = 0; + // For ALL grid points (not sampled), compute brute force counts for ALL kernels + for (size_t i = 0; i < grid->grid_points.size(); i++) { + GridPoint *gp = &grid->grid_points[i]; - // Sample a subset of grid points to avoid excessive computation - size_t sample_interval = std::max(size_t(1), grid->grid_points.size() / 100); + // Compute brute force for each kernel radius + for (double kernel_rad : grid->kernel_radii) { + int brute_count = bruteForceCountParticles(gp, sim, kernel_rad); + int gridder_count = gp->getCount(kernel_rad); - for (size_t i = 0; i < grid->grid_points.size(); i += sample_interval) { - GridPoint *gp = &grid->grid_points[i]; - checked_count++; - - // Brute force count - int brute_count = bruteForceCountParticles(gp, sim, max_kernel); - - // Get what gridder found for largest kernel - int gridder_count = gp->getCount(max_kernel); - - if (brute_count == 0) { - message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles " - "within %.3f Mpc/h", - i, gp->loc[0], gp->loc[1], gp->loc[2], max_kernel); - empty_count++; - } else if (brute_count != gridder_count) { - message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): Brute force " - "found %d particles, gridder found %d (within %.3f Mpc/h)", - i, gp->loc[0], gp->loc[1], gp->loc[2], brute_count, - gridder_count, max_kernel); - mismatch_count++; + // Store the brute force count + gp->setBruteForceCount(kernel_rad, brute_count); + + total_checks++; + + if (brute_count == 0) { + total_empty_kernels++; + } + + if (brute_count != gridder_count) { + total_mismatches++; + } } } - message("[DEBUG] Checked %d grid points:", checked_count); - message("[DEBUG] - %d had no particles within max kernel radius", - empty_count); - message("[DEBUG] - %d had count mismatches between brute force and " - "gridder", - mismatch_count); - - if (mismatch_count > 0) { - error("[DEBUG] Found %d grid points with particle count mismatches", - mismatch_count); - } else if (empty_count > checked_count / 2) { - message("[DEBUG] WARNING: More than 50%% of sampled grid points have no " - "particles nearby"); + message("[DEBUG] Brute force validation complete:"); + message("[DEBUG] - Checked %zu grid points × %d kernels = %d combinations", + grid->grid_points.size(), grid->nkernels, total_checks); + message("[DEBUG] - %d had no particles (expected for small kernels/sparse regions)", + total_empty_kernels); + message("[DEBUG] - %d had count mismatches between brute force and gridder", + total_mismatches); + + if (total_mismatches > 0) { + message("[DEBUG] ╔═══════════════════════════════════════════════════════════════╗"); + message("[DEBUG] ║ WARNING: PARTICLE COUNT MISMATCHES DETECTED ║"); + message("[DEBUG] ╠═══════════════════════════════════════════════════════════════╣"); + message("[DEBUG] ║ Found %5d mismatches between octree and brute force ║", total_mismatches); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ THE RESULTS IN THE OUTPUT FILE ARE 
INCORRECT! ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ The output file contains both: ║"); + message("[DEBUG] ║ - GridPointCounts: ║"); + message("[DEBUG] ║ Counts from octree traversal (INCORRECT - has bugs) ║"); + message("[DEBUG] ║ - BruteForceGridPointCounts: ║"); + message("[DEBUG] ║ Counts from exhaustive search (CORRECT - ground truth) ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ Use BruteForceGridPointCounts for analysis. ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ This likely indicates a bug in periodic boundary handling ║"); + message("[DEBUG] ║ or octree traversal, especially for grid points near box ║"); + message("[DEBUG] ║ boundaries or with small kernel radii. ║"); + message("[DEBUG] ╚═══════════════════════════════════════════════════════════════╝"); + } else if (total_empty_kernels > total_checks / 2) { + message("[DEBUG] WARNING: More than 50%% of grid point/kernel combinations " + "have no particles nearby"); } else { - message("[DEBUG] ✓ Grid point particle counts look reasonable"); + message("[DEBUG] ✓ Grid point particle counts match brute force validation"); } } diff --git a/src/grid_point.cpp b/src/grid_point.cpp index a9f78d9..e33afdf 100644 --- a/src/grid_point.cpp +++ b/src/grid_point.cpp @@ -97,6 +97,23 @@ int GridPoint::getCount(const double kernel_radius) const { } } +#ifdef DEBUGGING_CHECKS +// Method to set the brute force count for a kernel radius +void GridPoint::setBruteForceCount(const double kernel_radius, int count) { + this->brute_force_count_map[kernel_radius] = count; +} + +// Method to get the brute force count for a kernel radius +int GridPoint::getBruteForceCount(const double kernel_radius) const { + auto it = this->brute_force_count_map.find(kernel_radius); + if (it != this->brute_force_count_map.end()) { + return it->second; + } else { + return -1; // Return -1 if not computed (shouldn't happen) + } +} +#endif + /** * @brief Construct a new Grid object * diff --git a/src/grid_point.hpp b/src/grid_point.hpp index e103b6f..375832e 100644 --- a/src/grid_point.hpp +++ b/src/grid_point.hpp @@ -30,12 +30,22 @@ class GridPoint { double getMass(const double kernel_radius) const; int getCount(const double kernel_radius) const; +#ifdef DEBUGGING_CHECKS + void setBruteForceCount(const double kernel_radius, int count); + int getBruteForceCount(const double kernel_radius) const; +#endif + private: //! The count of particles in each kernel radius std::unordered_map count_map; //! The mass of particles in each kernel radius std::unordered_map mass_map; + +#ifdef DEBUGGING_CHECKS + //! 
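Debug-only mirror of count_map, filled in by the brute-force validation pass.
+  //! 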
Brute force particle counts (for validation in debug mode) + std::unordered_map brute_force_count_map; +#endif }; class Grid { diff --git a/src/output.cpp b/src/output.cpp index 6ffb54f..b6d0d14 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -148,6 +148,27 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { } } + // Create particle count dataset from tree traversal algorithm + if (!hdf5.createDataset("Grids/" + kernel_name + + "/GridPointCounts", + grid_point_overdens_dims)) { + error("Failed to create GridPointCounts dataset for kernel %zu (radius=%f)", + kernel_idx, kernel_rad); + continue; + } + +#ifdef DEBUGGING_CHECKS + // In debug mode, also write brute force counts for validation + // These are computed by exhaustively checking every particle's distance + if (!hdf5.createDataset("Grids/" + kernel_name + + "/BruteForceGridPointCounts", + grid_point_overdens_dims)) { + error("Failed to create BruteForceGridPointCounts dataset for kernel %zu (radius=%f)", + kernel_idx, kernel_rad); + continue; + } +#endif + // Process each cell for (size_t cid = 0; cid < sim->nr_cells; cid++) { Cell *cell = &cells[cid]; @@ -163,12 +184,18 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { // Prepare data arrays std::vector cell_grid_overdens; std::vector cell_grid_masses; + std::vector cell_grid_counts; std::vector cell_grid_pos; cell_grid_overdens.reserve(count); + cell_grid_counts.reserve(count); cell_grid_pos.reserve(count * 3); if (metadata->write_masses) { cell_grid_masses.reserve(count); } +#ifdef DEBUGGING_CHECKS + std::vector cell_grid_brute_counts; + cell_grid_brute_counts.reserve(count); +#endif // Extract data from grid points for (const GridPoint *gp : cell->grid_points) { @@ -180,6 +207,14 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { cell_grid_masses.push_back(gp->getMass(kernel_rad)); } + // Get particle counts from gridder algorithm + cell_grid_counts.push_back(gp->getCount(kernel_rad)); + +#ifdef DEBUGGING_CHECKS + // Get brute force counts in debug mode + cell_grid_brute_counts.push_back(gp->getBruteForceCount(kernel_rad)); +#endif + // Store positions if not done yet if (!written_positions) { cell_grid_pos.push_back(gp->loc[0]); @@ -209,6 +244,30 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { } } + // Write gridder particle counts + if (!cell_grid_counts.empty()) { + if (!hdf5.writeDatasetSlice( + "Grids/" + kernel_name + "/GridPointCounts", cell_grid_counts, + {static_cast(start_idx)}, + {static_cast(count)})) { + error("Failed to write counts slice for cell %zu, kernel %zu (radius=%f)", + cid, kernel_idx, kernel_rad); + } + } + +#ifdef DEBUGGING_CHECKS + // Write brute force counts in debug mode + if (!cell_grid_brute_counts.empty()) { + if (!hdf5.writeDatasetSlice( + "Grids/" + kernel_name + "/BruteForceGridPointCounts", cell_grid_brute_counts, + {static_cast(start_idx)}, + {static_cast(count)})) { + error("Failed to write brute force counts slice for cell %zu, kernel %zu (radius=%f)", + cid, kernel_idx, kernel_rad); + } + } +#endif + // Write position slice if needed if (!written_positions && !cell_grid_pos.empty()) { if (!hdf5.writeDatasetSlice( diff --git a/tests/analyze_debug_output.py b/tests/analyze_debug_output.py new file mode 100644 index 0000000..e140510 --- /dev/null +++ b/tests/analyze_debug_output.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +""" +Analyze debug output to diagnose tree traversal bugs +""" +import h5py +import numpy as np +import sys + +def analyze_file(filename): + print('='*70) + print(f'ANALYZING: 
{filename}') + print('='*70) + + with h5py.File(filename, 'r') as f: + # Get grid point positions + positions = np.array(f['Grids/GridPointPositions']) + + print(f'\nGrid point locations:') + for i, pos in enumerate(positions): + print(f' GP {i:2d}: ({pos[0]:5.1f}, {pos[1]:5.1f}, {pos[2]:5.1f})') + + print('\n' + '='*70) + print('Per-kernel comparison:') + print('='*70) + + for kernel_name in sorted(f['Grids'].keys()): + if kernel_name == 'GridPointPositions': + continue + + kernel_rad = f[f'Grids/{kernel_name}'].attrs['KernelRadius'] + octree_counts = np.array(f[f'Grids/{kernel_name}/GridPointCounts']) + brute_counts = np.array(f[f'Grids/{kernel_name}/BruteForceGridPointCounts']) + + print(f'\n{kernel_name} (radius = {kernel_rad} Mpc/h):') + print(f' GP | Position | Octree | Brute | Diff | Status') + print(f' ----+----------------------+--------+--------+--------+-------') + + for i in range(len(octree_counts)): + pos_str = f'({positions[i][0]:5.1f},{positions[i][1]:5.1f},{positions[i][2]:5.1f})' + diff = octree_counts[i] - brute_counts[i] + + if diff == 0: + status = 'OK' + elif octree_counts[i] == 0 and brute_counts[i] > 0: + status = 'MISS' # Octree missed particles + elif octree_counts[i] > brute_counts[i]: + status = 'OVER' # Octree over-counted + else: + status = 'UNDER' # Octree under-counted + + print(f' {i:3d} | {pos_str:20s} | {octree_counts[i]:6d} | {brute_counts[i]:6d} | {diff:+7d} | {status}') + + # Summary + total_mismatches = np.sum(octree_counts != brute_counts) + total_miss = np.sum((octree_counts == 0) & (brute_counts > 0)) + total_over = np.sum(octree_counts > brute_counts) + + print(f'\n Summary: {total_mismatches}/{len(octree_counts)} mismatches') + print(f' - Complete misses (0 when should find particles): {total_miss}') + print(f' - Over-counting (found more than truth): {total_over}') + +if __name__ == '__main__': + if len(sys.argv) > 1: + analyze_file(sys.argv[1]) + else: + # Default files to analyze + files = [ + 'tests/data/boundary_output.hdf5', + 'tests/data/cluster_output.hdf5', + ] + for f in files: + try: + analyze_file(f) + print('\n') + except FileNotFoundError: + print(f'File not found: {f}\n') diff --git a/make_test_snap.py b/tests/make_test_snap.py similarity index 100% rename from make_test_snap.py rename to tests/make_test_snap.py diff --git a/tests/run_simple_test.sh b/tests/run_simple_test.sh index 473e9ca..c4f5633 100755 --- a/tests/run_simple_test.sh +++ b/tests/run_simple_test.sh @@ -67,7 +67,7 @@ run_test() { # Create simple test snapshot cd "$ROOT_DIR" - python3 make_test_snap.py \ + python3 tests/make_test_snap.py \ --output "$TEST_SNAPSHOT" \ --cdim 3 \ --boxsize 10.0 \ diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index caa0cc2..33f3748 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -55,7 +55,10 @@ def create_test_snapshot(self, filename): cells_meta = cells.create_group('Meta-data') cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) cells_meta.attrs['nr_cells'] = 8 - cells_meta.attrs['size'] = self.box_size + # Cell size must be an array [x, y, z] for each cell dimension, not a scalar! 
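+        # (the gridder reads one cell width per axis from this attribute)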
+ # With dimension [2, 2, 2] and box size 10.0, each cell is 5.0 x 5.0 x 5.0 + cell_width = self.box_size / 2.0 + cells_meta.attrs['size'] = np.array([cell_width, cell_width, cell_width]) # Cells Centres cell_width = self.box_size / 2.0 diff --git a/tests/test_gridder.py b/tests/test_gridder.py index a11925c..d52309e 100644 --- a/tests/test_gridder.py +++ b/tests/test_gridder.py @@ -54,7 +54,7 @@ def test_snapshot(): DATA_DIR.mkdir(parents=True, exist_ok=True) # Create a minimal test snapshot using make_test_snap.py - make_snap_script = PROJECT_ROOT / "make_test_snap.py" + make_snap_script = TESTS_DIR / "make_test_snap.py" # Find a donor snapshot (use any existing HDF5 file or create minimal one) donor_path = DATA_DIR / "donor_minimal.hdf5" From d2201333d3f792c0631cb6093648181d57796e21 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 13:59:52 +0000 Subject: [PATCH 14/23] Get mean density from cosmology not particles We don't want to load everything just to calculate a mean density! --- example_params.yml | 5 + src/cell.cpp | 5 - src/gridder.cpp | 9 + src/simulation.cpp | 51 +++++ src/simulation.hpp | 3 + tests/file_grid_test_params.yml | 6 + tests/simple_test_params.yml | 6 + tests/test_cosmology.py | 346 ++++++++++++++++++++++++++++++++ tests/test_file_grid params.yml | 6 + 9 files changed, 432 insertions(+), 5 deletions(-) create mode 100644 tests/test_cosmology.py diff --git a/example_params.yml b/example_params.yml index 66c3e0f..9a2e000 100644 --- a/example_params.yml +++ b/example_params.yml @@ -13,6 +13,11 @@ Grid: n_grid_points: 1000000 # number of grid points to sample (applicable to random) random_seed: 42 # seed for random grid point generation (applicable to random) +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + Performance: Tree: diff --git a/src/cell.cpp b/src/cell.cpp index bf09178..1ba30c8 100644 --- a/src/cell.cpp +++ b/src/cell.cpp @@ -843,11 +843,6 @@ void assignPartsToCells(Simulation *sim) { total_mass = global_total_mass; #endif - // Compute the mean comoving density - sim->mean_density = total_mass / sim->volume; - - message("Mean comoving density: %e 10**10 Msun / cMpc^3", sim->mean_density); - #ifdef DEBUGGING_CHECKS // Make sure we have attached all the particles (only count local cells in // MPI) diff --git a/src/gridder.cpp b/src/gridder.cpp index ad026d6..8f9f7d8 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -219,6 +219,15 @@ int main(int argc, char *argv[]) { message("Number of top level cells: %d", sim->nr_cells); + // Calculate mean density from cosmological parameters + // This must be done before any overdensity calculations + try { + sim->calculateMeanDensityFromCosmology(params); + } catch (const std::exception &e) { + std::cerr << "Failed to calculate mean density from cosmology: " << e.what() << std::endl; + return 1; + } + #ifdef WITH_MPI // Partition the cells across MPI ranks (work-based partition) try { diff --git a/src/simulation.cpp b/src/simulation.cpp index 912db54..77c9c7d 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -105,3 +105,54 @@ void Simulation::readSimulationData() { // Compute the comoving volume of the simulation this->volume = this->dim[0] * this->dim[1] * this->dim[2]; } + +/** + * @brief Calculate mean comoving density from cosmological parameters + * + * Computes the mean matter density at the simulation redshift using: + * ρ_mean = ρ_crit(z=0) × Ω_m × (1+z)³ + * + * where ρ_crit(z=0) = 
3H₀²/(8πG) is the critical density today + * + * @param params The parameters object containing cosmology + */ +void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { + + // Read cosmology parameters + double h = params->getParameterNoDefault("Cosmology/h"); + double Omega_cdm = params->getParameterNoDefault("Cosmology/Omega_cdm"); + double Omega_b = params->getParameterNoDefault("Cosmology/Omega_b"); + + // Total matter density parameter + double Omega_m = Omega_cdm + Omega_b; + + // Physical constants in internal units (10^10 Msun, Mpc, km/s) + // H0 = 100 h km/s/Mpc + double H0_kmsMpc = 100.0 * h; // km/s/Mpc + + // Convert to internal time units (H0 in units of 1/time where time is Mpc/(km/s)) + // H0 = 100 h km/s/Mpc = 100 h / Mpc * (km/s) + // In our units: [H0] = km/s/Mpc + + // Critical density today: ρ_crit = 3H₀²/(8πG) + // G = 4.3009e-6 (10^10 Msun)^-1 Mpc (km/s)^2 in internal units + const double G = 4.300917270069976e-6; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 + + // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 + double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); + + // Mean density at redshift z: ρ_mean(z) = ρ_crit(0) × Ω_m × (1+z)³ + double scale_factor = 1.0 / (1.0 + this->redshift); + double density_evolution = 1.0 / (scale_factor * scale_factor * scale_factor); // (1+z)³ + + this->mean_density = rho_crit_0 * Omega_m * density_evolution; + + Metadata *metadata = &Metadata::getInstance(); + if (metadata->rank == 0) { + message("Cosmology: h=%.4f, Omega_m=%.6f (Omega_cdm=%.6f + Omega_b=%.6f)", + h, Omega_m, Omega_cdm, Omega_b); + message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); + message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/Mpc^3", + this->redshift, this->mean_density); + } +} diff --git a/src/simulation.hpp b/src/simulation.hpp index 729a933..900bfa2 100644 --- a/src/simulation.hpp +++ b/src/simulation.hpp @@ -76,6 +76,9 @@ class Simulation { // Prototype for reader function (defined in simulation.cpp) void readSimulationData(); + // Calculate mean density from cosmological parameters + void calculateMeanDensityFromCosmology(Parameters *params); + private: // Helper function for cleanup void deleteChildCells(Cell *cell); diff --git a/tests/file_grid_test_params.yml b/tests/file_grid_test_params.yml index 44cd162..b3f8f55 100644 --- a/tests/file_grid_test_params.yml +++ b/tests/file_grid_test_params.yml @@ -9,6 +9,12 @@ Grid: grid_file: tests/data/grid_points_files/valid_points.txt # n_grid_points is optional for file type - will be counted from file +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 diff --git a/tests/simple_test_params.yml b/tests/simple_test_params.yml index 33e6d01..db1956f 100644 --- a/tests/simple_test_params.yml +++ b/tests/simple_test_params.yml @@ -8,6 +8,12 @@ Grid: cdim: 3 # Small grid (3x3x3), grid point at center type: uniform +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py new file mode 100644 index 0000000..7a7a6ec --- /dev/null +++ b/tests/test_cosmology.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Unit tests for cosmological mean density calculations. 
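+
+The reference relations under test (implemented again in
+calculate_expected_density below) are:
+
+    ρ_crit(z=0) = 3H₀²/(8πG)
+    ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³,  with Ω_m = Ω_cdm + Ω_b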
+ +Tests verify that the C++ cosmology implementation correctly computes +mean comoving density at various redshifts and with different cosmological +parameters. +""" + +import subprocess +import pytest +import h5py +import numpy as np +from pathlib import Path +import yaml +import tempfile +import os + +# Get paths +PROJECT_ROOT = Path(__file__).parent.parent +BUILD_DIR = PROJECT_ROOT / "build" +GRIDDER_EXE = BUILD_DIR / "parent_gridder" +TESTS_DIR = PROJECT_ROOT / "tests" +DATA_DIR = TESTS_DIR / "data" + + +def create_test_snapshot(filepath, redshift, boxsize=10.0): + """Create a minimal test snapshot at specified redshift""" + with h5py.File(filepath, 'w') as f: + # Header + header = f.create_group('Header') + header.attrs['Redshift'] = redshift + header.attrs['BoxSize'] = np.array([boxsize, boxsize, boxsize]) + header.attrs['NumPart_Total'] = np.array([0, 1, 0, 0, 0, 0], dtype=np.uint64) + + # Units (10^10 Msun, Mpc, km/s) + units = f.create_group('Units') + units.attrs['Unit mass in cgs (U_M)'] = 1.989e43 + units.attrs['Unit length in cgs (U_L)'] = 3.086e24 + units.attrs['Unit time in cgs (U_t)'] = 3.086e19 + + # Cells metadata + cells_meta = f.create_group('Cells/Meta-data') + cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) + cells_meta.attrs['size'] = np.array([boxsize/2, boxsize/2, boxsize/2]) + + # One particle at center + part1 = f.create_group('PartType1') + part1.create_dataset('Coordinates', data=np.array([[boxsize/2, boxsize/2, boxsize/2]])) + part1.create_dataset('Masses', data=np.array([1.0])) + + # Cell counts (one particle in central cell) + f.create_dataset('Cells/Counts/PartType1', data=np.array([0, 0, 0, 1, 0, 0, 0, 0], dtype=np.int32)) + f.create_dataset('Cells/OffsetsInFile/PartType1', data=np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)) + + +def create_test_params(filepath, snapshot_path, cosmology): + """Create test parameter file with specified cosmology""" + params = { + 'Kernels': { + 'nkernels': 1, + 'kernel_radius_1': 1.0 + }, + 'Grid': { + 'type': 'uniform', + 'cdim': 2 + }, + 'Cosmology': cosmology, + 'Tree': { + 'max_leaf_count': 200 + }, + 'Input': { + 'filepath': str(snapshot_path), + 'placeholder': '0000' + }, + 'Output': { + 'filepath': str(filepath.parent) + '/', + 'basename': filepath.name, + 'write_masses': 0 + } + } + + with open(filepath, 'w') as f: + yaml.dump(params, f, default_flow_style=False) + + +def run_gridder_and_get_density(snapshot_path, params_path, output_path): + """Run gridder and extract mean density from output""" + result = subprocess.run( + [str(GRIDDER_EXE), str(params_path), "1"], + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode != 0: + raise RuntimeError(f"Gridder failed:\nstdout: {result.stdout}\nstderr: {result.stderr}") + + # Extract mean density from stdout + for line in result.stdout.split('\n'): + if 'Mean comoving density at z=' in line: + # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/Mpc^3" + parts = line.split(':') + if len(parts) >= 2: + density_str = parts[1].strip().split()[0] + return float(density_str) + + raise RuntimeError(f"Could not extract mean density from output:\n{result.stdout}") + + +def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): + """ + Calculate expected mean density using standard cosmological formula. 
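+
+    Numerically, the (1+z)³ factor below is 8 at z=1 and 512 at z=7.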
+ + ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³ + + where ρ_crit(z=0) = 3H₀²/(8πG) + + Units: 10^10 Msun/Mpc^3 + """ + # Physical constants in internal units + H0_kmsMpc = 100.0 * h # km/s/Mpc + G = 4.300917270069976e-6 # (10^10 Msun)^-1 Mpc (km/s)^2 + + # Critical density today + rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) + + # Total matter density parameter + Omega_m = Omega_cdm + Omega_b + + # Mean density at redshift z + density_evolution = (1.0 + redshift)**3 + mean_density = rho_crit_0 * Omega_m * density_evolution + + return mean_density + + +class TestCosmologyCalculations: + """Test suite for cosmological mean density calculations""" + + @pytest.fixture(scope="class") + def test_workspace(self): + """Create temporary workspace for tests""" + with tempfile.TemporaryDirectory() as tmpdir: + workspace = Path(tmpdir) + yield workspace + + def test_planck2018_z0(self, test_workspace): + """Test Planck 2018 cosmology at z=0""" + # Planck 2018 parameters + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 0.0 + + # Create test files + snapshot_path = test_workspace / "snapshot_z0.hdf5" + params_path = test_workspace / "params_z0.yml" + output_path = test_workspace / "output_z0.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + # Run gridder and extract density + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + + # Calculate expected density + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # Check agreement (within 0.01% - account for floating point precision) + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_planck2018_z1(self, test_workspace): + """Test Planck 2018 cosmology at z=1""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 1.0 + + snapshot_path = test_workspace / "snapshot_z1.hdf5" + params_path = test_workspace / "params_z1.yml" + output_path = test_workspace / "output_z1.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # At z=1, density should be 8x higher than z=0 (since (1+z)³ = 8) + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_high_redshift_z7(self, test_workspace): + """Test at high redshift z=7 (relevant for JWST observations)""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 7.0 + + snapshot_path = test_workspace / "snapshot_z7.hdf5" + params_path = test_workspace / "params_z7.yml" + output_path = test_workspace / "output_z7.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], 
cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # At z=7, density should be (1+7)³ = 512x higher than z=0 + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_wmap9_cosmology(self, test_workspace): + """Test WMAP9 cosmology (different from Planck)""" + # WMAP9 parameters + cosmology = { + 'h': 0.697, + 'Omega_cdm': 0.233, + 'Omega_b': 0.0463 + } + redshift = 2.0 + + snapshot_path = test_workspace / "snapshot_wmap9.hdf5" + params_path = test_workspace / "params_wmap9.yml" + output_path = test_workspace / "output_wmap9.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with WMAP9 cosmology: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_low_omega_matter(self, test_workspace): + """Test with artificially low matter density""" + cosmology = { + 'h': 0.7, + 'Omega_cdm': 0.15, + 'Omega_b': 0.03 + } + redshift = 0.5 + + snapshot_path = test_workspace / "snapshot_low_om.hdf5" + params_path = test_workspace / "params_low_om.yml" + output_path = test_workspace / "output_low_om.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with low Omega_m: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_high_h(self, test_workspace): + """Test with high Hubble constant""" + cosmology = { + 'h': 0.9, # Unrealistically high but good test + 'Omega_cdm': 0.25, + 'Omega_b': 0.05 + } + redshift = 1.5 + + snapshot_path = test_workspace / "snapshot_high_h.hdf5" + params_path = test_workspace / "params_high_h.yml" + output_path = test_workspace / "output_high_h.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with high h: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_density_evolution(self, test_workspace): + """Test that density evolves correctly with redshift""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + + redshifts = [0.0, 1.0, 2.0, 5.0] + densities = [] + + for z in redshifts: + snapshot_path = test_workspace / f"snapshot_z{z}.hdf5" + params_path = test_workspace / f"params_z{z}.yml" + output_path = test_workspace / f"output_z{z}.hdf5" + + 
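# Build a snapshot and parameter file at this z, run the gridder,
+            # and record the mean density it reports.
+            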
create_test_snapshot(snapshot_path, z) + create_test_params(params_path, snapshot_path, cosmology) + + density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + densities.append(density) + + # Check that density increases monotonically with redshift + for i in range(len(redshifts) - 1): + assert densities[i+1] > densities[i], \ + f"Density should increase with redshift: rho(z={redshifts[i]})={densities[i]:.6e}, rho(z={redshifts[i+1]})={densities[i+1]:.6e}" + + # Check that density ratios match (1+z)³ scaling + for i in range(len(redshifts) - 1): + z1, z2 = redshifts[i], redshifts[i+1] + expected_ratio = ((1 + z2) / (1 + z1))**3 + actual_ratio = densities[i+1] / densities[i] + relative_error = abs(actual_ratio - expected_ratio) / expected_ratio + assert relative_error < 1e-4, \ + f"Density ratio mismatch between z={z1} and z={z2}: actual={actual_ratio:.6f}, expected={expected_ratio:.6f}" + + +if __name__ == '__main__': + # Run tests with pytest + pytest.main([__file__, '-v', '--tb=short']) diff --git a/tests/test_file_grid params.yml b/tests/test_file_grid params.yml index 4846101..90dd7d3 100644 --- a/tests/test_file_grid params.yml +++ b/tests/test_file_grid params.yml @@ -11,6 +11,12 @@ Grid: type: file grid_file: tests/data/test_grid_coordinates.txt +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 From 1a64a10235c887a02ac59e25e7dda0f195a20d69 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:20:44 +0000 Subject: [PATCH 15/23] HAndling missing pi --- src/simulation.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/simulation.cpp b/src/simulation.cpp index 77c9c7d..2ee596b 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -1,9 +1,17 @@ +// Standard includes +#include + // Local includes #include "simulation.hpp" #include "cell.hpp" #include "hdf_io.hpp" #include "metadata.hpp" +// Define M_PI if not available (POSIX extension, not standard C++) +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + /** * @brief Construct a new Simulation object * From c56c440fc41310ac73f43ab2ed33876690df8d6c Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:38:32 +0000 Subject: [PATCH 16/23] Ensuring everything is comoving --- src/simulation.cpp | 14 +++++++------- tests/test_cosmology.py | 39 +++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/simulation.cpp b/src/simulation.cpp index 2ee596b..d85d4d3 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -149,18 +149,18 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); - // Mean density at redshift z: ρ_mean(z) = ρ_crit(0) × Ω_m × (1+z)³ - double scale_factor = 1.0 / (1.0 + this->redshift); - double density_evolution = 1.0 / (scale_factor * scale_factor * scale_factor); // (1+z)³ - - this->mean_density = rho_crit_0 * Omega_m * density_evolution; + // Mean COMOVING density: ρ_comoving = ρ_crit(0) × Ω_m + // Note: In comoving coordinates, density does NOT evolve with redshift + // The (1+z)³ factor would convert to physical density, but SWIFT uses comoving coordinates + this->mean_density = rho_crit_0 * Omega_m; Metadata *metadata = &Metadata::getInstance(); if (metadata->rank == 0) { message("Cosmology: h=%.4f, Omega_m=%.6f 
(Omega_cdm=%.6f + Omega_b=%.6f)", h, Omega_m, Omega_cdm, Omega_b); message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); - message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/Mpc^3", - this->redshift, this->mean_density); + message("Mean comoving density (constant with z): %.6e 10^10 Msun/cMpc^3", + this->mean_density); + message("(Snapshot at z=%.4f, but density in comoving coordinates)", this->redshift); } } diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index 7a7a6ec..cdcb5c0 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -110,13 +110,16 @@ def run_gridder_and_get_density(snapshot_path, params_path, output_path): def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): """ - Calculate expected mean density using standard cosmological formula. + Calculate expected mean COMOVING density using standard cosmological formula. - ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³ + ρ_comoving = ρ_crit(z=0) × Ω_m + + Note: In comoving coordinates, density does NOT evolve with redshift. + The (1+z)³ factor converts to physical density, but SWIFT uses comoving coords. where ρ_crit(z=0) = 3H₀²/(8πG) - Units: 10^10 Msun/Mpc^3 + Units: 10^10 Msun/cMpc^3 """ # Physical constants in internal units H0_kmsMpc = 100.0 * h # km/s/Mpc @@ -128,9 +131,8 @@ def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): # Total matter density parameter Omega_m = Omega_cdm + Omega_b - # Mean density at redshift z - density_evolution = (1.0 + redshift)**3 - mean_density = rho_crit_0 * Omega_m * density_evolution + # Mean COMOVING density (constant with redshift!) + mean_density = rho_crit_0 * Omega_m return mean_density @@ -304,15 +306,15 @@ def test_high_h(self, test_workspace): assert relative_error < 1e-4, \ f"Mean density mismatch with high h: gridder={gridder_density:.6e}, expected={expected_density:.6e}" - def test_density_evolution(self, test_workspace): - """Test that density evolves correctly with redshift""" + def test_density_constant_with_redshift(self, test_workspace): + """Test that COMOVING density is constant with redshift""" cosmology = { 'h': 0.6736, 'Omega_cdm': 0.2607, 'Omega_b': 0.0493 } - redshifts = [0.0, 1.0, 2.0, 5.0] + redshifts = [0.0, 1.0, 2.0, 5.0, 7.0] densities = [] for z in redshifts: @@ -326,19 +328,12 @@ def test_density_evolution(self, test_workspace): density = run_gridder_and_get_density(snapshot_path, params_path, output_path) densities.append(density) - # Check that density increases monotonically with redshift - for i in range(len(redshifts) - 1): - assert densities[i+1] > densities[i], \ - f"Density should increase with redshift: rho(z={redshifts[i]})={densities[i]:.6e}, rho(z={redshifts[i+1]})={densities[i+1]:.6e}" - - # Check that density ratios match (1+z)³ scaling - for i in range(len(redshifts) - 1): - z1, z2 = redshifts[i], redshifts[i+1] - expected_ratio = ((1 + z2) / (1 + z1))**3 - actual_ratio = densities[i+1] / densities[i] - relative_error = abs(actual_ratio - expected_ratio) / expected_ratio - assert relative_error < 1e-4, \ - f"Density ratio mismatch between z={z1} and z={z2}: actual={actual_ratio:.6f}, expected={expected_ratio:.6f}" + # Check that comoving density is constant with redshift (within 0.01%) + reference_density = densities[0] + for i, z in enumerate(redshifts): + relative_diff = abs(densities[i] - reference_density) / reference_density + assert relative_diff < 1e-4, \ + f"Comoving density should be constant: rho(z={z})={densities[i]:.6e}, rho(z=0)={reference_density:.6e}, 
diff={relative_diff:.6e}" if __name__ == '__main__': From 9401bffefbd007f74f34e3fd037a879753cfc879 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:48:54 +0000 Subject: [PATCH 17/23] Fixing mean density calculation (must use internal units) --- src/simulation.cpp | 10 +++---- tests/test_cosmology.py | 59 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/simulation.cpp b/src/simulation.cpp index d85d4d3..5eab1ed 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -143,8 +143,9 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { // In our units: [H0] = km/s/Mpc // Critical density today: ρ_crit = 3H₀²/(8πG) - // G = 4.3009e-6 (10^10 Msun)^-1 Mpc (km/s)^2 in internal units - const double G = 4.300917270069976e-6; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 + // G in (10^10 Msun)^-1 Mpc (km/s)^2 internal units + // Derived from G_SI = 6.674e-11 m^3 kg^-1 s^-2 with proper unit conversion + const double G = 4.301744232015554e+01; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); @@ -159,8 +160,7 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { message("Cosmology: h=%.4f, Omega_m=%.6f (Omega_cdm=%.6f + Omega_b=%.6f)", h, Omega_m, Omega_cdm, Omega_b); message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); - message("Mean comoving density (constant with z): %.6e 10^10 Msun/cMpc^3", - this->mean_density); - message("(Snapshot at z=%.4f, but density in comoving coordinates)", this->redshift); + message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/cMpc^3", + this->redshift, this->mean_density); } } diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index cdcb5c0..d9d67bf 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -16,6 +16,14 @@ import tempfile import os +# Try to import astropy for direct comparison tests +try: + from astropy.cosmology import FlatLambdaCDM + from astropy import units as u + ASTROPY_AVAILABLE = True +except ImportError: + ASTROPY_AVAILABLE = False + # Get paths PROJECT_ROOT = Path(__file__).parent.parent BUILD_DIR = PROJECT_ROOT / "build" @@ -99,7 +107,7 @@ def run_gridder_and_get_density(snapshot_path, params_path, output_path): # Extract mean density from stdout for line in result.stdout.split('\n'): if 'Mean comoving density at z=' in line: - # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/Mpc^3" + # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/cMpc^3" parts = line.split(':') if len(parts) >= 2: density_str = parts[1].strip().split()[0] @@ -123,7 +131,7 @@ def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): """ # Physical constants in internal units H0_kmsMpc = 100.0 * h # km/s/Mpc - G = 4.300917270069976e-6 # (10^10 Msun)^-1 Mpc (km/s)^2 + G = 4.301744232015554e+01 # (10^10 Msun)^-1 Mpc (km/s)^2 # Critical density today rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) @@ -335,6 +343,53 @@ def test_density_constant_with_redshift(self, test_workspace): assert relative_diff < 1e-4, \ f"Comoving density should be constant: rho(z={z})={densities[i]:.6e}, rho(z=0)={reference_density:.6e}, diff={relative_diff:.6e}" + @pytest.mark.skipif(not ASTROPY_AVAILABLE, reason="astropy not available") + def test_astropy_comparison(self, test_workspace): + """Test that our calculation matches astropy 
exactly""" + # Planck 2018 parameters + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 2.0 + + # Create test files + snapshot_path = test_workspace / "snapshot_astropy.hdf5" + params_path = test_workspace / "params_astropy.yml" + output_path = test_workspace / "output_astropy.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + # Get density from our C++ code + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + + # Calculate with astropy + h = cosmology['h'] + Omega_m = cosmology['Omega_cdm'] + cosmology['Omega_b'] + H0 = h * 100.0 # km/s/Mpc + + # Create astropy cosmology (flat ΛCDM with Omega_Lambda = 1 - Omega_m) + cosmo = FlatLambdaCDM(H0=H0, Om0=Omega_m) + + # Get critical density at z=0 in comoving coordinates + rho_crit_0 = cosmo.critical_density(0) + + # Convert to our units: 10^10 Msun/Mpc^3 + # astropy gives g/cm^3, we need 10^10 Msun/Mpc^3 + Msun_in_g = 1.989e33 + Mpc_in_cm = 3.086e24 + rho_crit_0_our_units = rho_crit_0.to(u.g / u.cm**3).value * (Mpc_in_cm**3) / (1e10 * Msun_in_g) + + # Mean comoving density + astropy_density = rho_crit_0_our_units * Omega_m + + # Check agreement (within 0.1% - account for different constants) + relative_error = abs(gridder_density - astropy_density) / astropy_density + assert relative_error < 1e-3, \ + f"Density mismatch vs astropy: gridder={gridder_density:.6e}, astropy={astropy_density:.6e}, error={relative_error:.6e}" + if __name__ == '__main__': # Run tests with pytest From 7d1bbeb76af87437b7f5641822935d90552c168b Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 21 Nov 2025 09:31:00 +0000 Subject: [PATCH 18/23] Formatting --- weighting/knn_weighter.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py index ace80f9..fc54981 100644 --- a/weighting/knn_weighter.py +++ b/weighting/knn_weighter.py @@ -77,7 +77,7 @@ def weighted_ks( n_parent: int = 100_000, n_sample: int = 100_000, ) -> float: - """Compute a weighted 1D KolmogorovSmirnov distance. + """Compute a weighted 1D Kolmogorov-Smirnov distance. The parent distribution is unweighted; the sample distribution is weighted. For speed, both are optionally downsampled before computing @@ -195,7 +195,7 @@ def filter_and_transform_overdensities( raise ValueError( "All arrays must have the same length; found mismatched lengths." 
) - mask &= (arr > -1.0) + mask &= arr > -1.0 n_dropped = length - np.sum(mask) if n_dropped > 0: @@ -206,9 +206,7 @@ def filter_and_transform_overdensities( ) # Apply log transform to filtered data - transformed = { - scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items() - } + transformed = {scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()} return transformed, mask @@ -542,13 +540,9 @@ def _weights_for_k( if k < 1: raise ValueError(f"k must be >= 1, got k={k}") if k >= N: - raise ValueError( - f"k must be < N (parent size), got k={k}, N={N}" - ) + raise ValueError(f"k must be < N (parent size), got k={k}, N={N}") if k + 1 > M: - raise ValueError( - f"k+1 must be <= M (sample size), got k={k}, M={M}" - ) + raise ValueError(f"k+1 must be <= M (sample size), got k={k}, M={M}") dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] r_parent = dist_p[:, -1] From cac953c4ea5181651d3f634d88dd136cb3c0f892 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 21 Nov 2025 09:50:08 +0000 Subject: [PATCH 19/23] Writing an empty file when the grid is empty --- src/construct_grid_points.cpp | 4 ++-- src/gridder.cpp | 23 +++++++++++++++++++++-- src/output.cpp | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/construct_grid_points.cpp b/src/construct_grid_points.cpp index 8bd1976..326cce7 100644 --- a/src/construct_grid_points.cpp +++ b/src/construct_grid_points.cpp @@ -307,8 +307,8 @@ readGridPointCoordinates(const std::string &filename, message("WARNING: No valid grid point coordinates found in file: %s", filename.c_str()); message("The file is either empty or contains only comments/invalid lines."); - message("Exiting gracefully - no grid points to process."); - throw std::runtime_error("Empty grid file - no grid points to process"); + // Return 0 - the caller will handle the empty grid case + return 0; } message("Read %d valid grid point coordinates from %s", valid_points, diff --git a/src/gridder.cpp b/src/gridder.cpp index 8f9f7d8..2d5f69b 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -252,8 +252,27 @@ int main(int argc, char *argv[]) { // Check if we actually have grid points to process if (grid->n_grid_points == 0) { message("No grid points available for processing."); - message("Program will now exit."); - return 0; // Clean exit, not an error + message("Writing empty output file before exiting..."); + + // Write empty output file +#ifdef WITH_MPI + try { + writeGridFileParallel(sim, grid); + } catch (const std::exception &e) { + error("Failed to write empty output file: %s", e.what()); + return 1; + } +#else + try { + writeGridFileSerial(sim, grid); + } catch (const std::exception &e) { + error("Failed to write empty output file: %s", e.what()); + return 1; + } +#endif + + message("No grid points were created - empty output file written."); + return 1; // Exit with error code since no work was done } #ifdef DEBUGGING_CHECKS diff --git a/src/output.cpp b/src/output.cpp index b6d0d14..fdd9cfa 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -98,6 +98,15 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { return; } + // If there are no grid points, skip dataset creation and exit early + if (grid->n_grid_points == 0) { + message("No grid points to write - creating empty output file with header only"); + hdf5.close(); + message("Successfully wrote empty grid data (serial mode)"); + toc("Writing output (in serial)"); + return; + } + // Create dataset for grid positions std::array grid_point_positions_dims = { 
static_cast(grid->n_grid_points), static_cast(3)}; @@ -390,6 +399,16 @@ void writeGridFileParallel(Simulation *sim, Grid *grid) { hdf5.writeAttribute("Header", "MaxKernelRadius", grid->max_kernel_radius); + // If there are no local grid points, skip dataset creation and exit early + if (total_local_grid_points == 0) { + message("Rank %d: No local grid points to write - creating empty output file with header only", + metadata->rank); + hdf5.close(); + message("Rank %d: Successfully wrote empty grid data (parallel mode)", metadata->rank); + toc("Writing output (in parallel)"); + return; + } + // Write local cell information if (!local_cell_ids.empty()) { std::array local_cell_dims = { From 001d831194cca51edc17a9458f8255fb7b479999 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 22:47:25 +0000 Subject: [PATCH 20/23] Removing unfinished weightings --- weighting/knn_weighter.py | 1111 ------------------------------- weighting/test_hmf_weighting.py | 648 ------------------ 2 files changed, 1759 deletions(-) delete mode 100644 weighting/knn_weighter.py delete mode 100644 weighting/test_hmf_weighting.py diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py deleted file mode 100644 index fc54981..0000000 --- a/weighting/knn_weighter.py +++ /dev/null @@ -1,1111 +0,0 @@ -"""k-NN reweighting of a biased sample to match a parent population. - -This script reads two HDF5 files: - -* A "parent" (global) population. -* A "sample" population (which may be biased). - -The set of kernels is defined by the sample file: every group -``Grids/Kernel_*`` in the sample must also exist in the parent. Each such -group must contain a dataset ``GridPointOverDensities``. - -For each kernel: - -* Read overdensities from parent and sample. -* Transform as ``log10(1 + delta)``. -* Stack over all kernels into a multi-dimensional feature space. - -We then use k-NN density-ratio estimation to compute weights for the sample -so that its multi-scale environment distribution matches the parent. - -An optional auto-tuning step searches over candidate k values, choosing the -one that: - -* Satisfies an effective sample size (ESS) floor, if possible. -* Minimizes the mean 1D KS distance between parent and weighted sample - across all dimensions. - -Example -------- - python knn_weighter.py \\ - --parent parent.hdf5 \\ - --sample sample.hdf5 \\ - --autotune_k 35,50,80,100 \\ - --ess_floor 0.20 -""" - -from __future__ import annotations - -import argparse -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import h5py -import matplotlib.pyplot as plt -import numpy as np -from sklearn.neighbors import NearestNeighbors - -# --------------------------------------------------------------------- -# Utilities -# --------------------------------------------------------------------- - - -def effective_sample_size(w: np.ndarray) -> float: - """Compute the effective sample size (ESS) of a weight vector. - - ESS is defined as: - - .. math:: - - \\text{ESS} = \\frac{(\\sum_i w_i)^2}{\\sum_i w_i^2}. - - Args: - w: One-dimensional array of non-negative weights. - - Returns: - Effective sample size as a float. - """ - s = float(w.sum()) - return (s * s) / float(np.sum(w * w) + 1e-18) - - -def weighted_ks( - x_parent: np.ndarray, - x_sample: np.ndarray, - w_sample: np.ndarray, - rng: np.random.Generator, - n_parent: int = 100_000, - n_sample: int = 100_000, -) -> float: - """Compute a weighted 1D Kolmogorov-Smirnov distance. 
- - The parent distribution is unweighted; the sample distribution is - weighted. For speed, both are optionally downsampled before computing - the KS distance. - - Args: - x_parent: Parent data values (1D). - x_sample: Sample data values (1D). - w_sample: Weights for ``x_sample``, same length as ``x_sample``. - rng: NumPy random number generator instance. - n_parent: Maximum number of parent points to use. - n_sample: Maximum number of sample points to use. - - Returns: - The KS distance as a float. - """ - # Downsample parent. - if len(x_parent) > n_parent: - xp = x_parent[rng.choice(len(x_parent), n_parent, replace=False)] - else: - xp = x_parent - - # Downsample sample + weights. - if len(x_sample) > n_sample: - idx = rng.choice(len(x_sample), n_sample, replace=False) - xs = x_sample[idx] - ws = w_sample[idx] - else: - xs, ws = x_sample, w_sample - - # Sort and build CDFs. - xp = np.sort(xp) - order = np.argsort(xs) - xs = xs[order] - ws = np.clip(ws[order], 0, None) - ws_sum = ws.sum() - if ws_sum < 1e-15: - # All weights are effectively zero - return worst-case KS distance - return 1.0 - ws /= ws_sum - cdf_s = np.cumsum(ws) - - grid = np.unique(np.concatenate([xp, xs])) - cdf_p = np.searchsorted(xp, grid, side="right") / len(xp) - - idx = np.searchsorted(xs, grid, side="right") - 1 - idx = np.clip(idx, -1, len(xs) - 1) - cdf_s_at_grid = np.where(idx >= 0, cdf_s[idx], 0.0) - - return float(np.max(np.abs(cdf_p - cdf_s_at_grid))) - - -def whiten_by_parent( - Xp: np.ndarray, - Xs: np.ndarray, -) -> Tuple[np.ndarray, np.ndarray]: - """Apply parent-covariance whitening (Mahalanobis transform). - - The parent distribution defines the mean and covariance used to whiten - both the parent and the sample data: - - .. math:: - - X' = (X - \\mu_\\text{parent}) L^{-1}, - - where ``L`` is the Cholesky factor of the parent covariance. - - Args: - Xp: Parent data array of shape ``(N, D)``. - Xs: Sample data array of shape ``(M, D)``. - - Returns: - Tuple ``(Xp_whitened, Xs_whitened)`` with the same shapes as inputs. - """ - mu = Xp.mean(axis=0) - Xm = Xp - mu - Sigma = np.cov(Xm, rowvar=False) - d = Sigma.shape[0] - # Tiny ridge for numerical stability. - Sigma = Sigma + (1e-12 * np.trace(Sigma) / max(d, 1)) * np.eye(d) - L = np.linalg.cholesky(Sigma) - Linv = np.linalg.inv(L) - return (Xp - mu) @ Linv, (Xs - mu) @ Linv - - -def filter_and_transform_overdensities( - overdens: Dict[float, np.ndarray], -) -> Tuple[Dict[float, np.ndarray], np.ndarray]: - """Filter out -1 values and apply log10(1 + delta) transform. - - Grid points with overdensity = -1 indicate no particles nearby. - These must be removed before applying the log transform to avoid NaN. - - Args: - overdens: Mapping from scale to raw overdensity arrays (may contain -1). - - Returns: - Tuple (transformed_overdens, mask) where: - - transformed_overdens: log10(1 + delta) for valid points - - mask: Boolean array indicating which rows were kept - - Raises: - ValueError: If arrays have mismatched lengths. - """ - if not overdens: - return overdens, np.array([], dtype=bool) - - first = next(iter(overdens.values())) - length = len(first) - - # Build mask: keep rows where ALL scales have delta > -1 - mask = np.ones(length, dtype=bool) - for scale, arr in overdens.items(): - if len(arr) != length: - raise ValueError( - "All arrays must have the same length; found mismatched lengths." 
- ) - mask &= arr > -1.0 - - n_dropped = length - np.sum(mask) - if n_dropped > 0: - print( - f"[Filter] Dropped {n_dropped:,}/{length:,} " - f"({100.0 * n_dropped / length:.2f}%) grid points with delta <= -1 " - f"(no particles nearby)." - ) - - # Apply log transform to filtered data - transformed = {scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()} - - return transformed, mask - - -def drop_nan_rows_per_scale( - samples: Dict[float, np.ndarray], -) -> Dict[float, np.ndarray]: - """Remove entries that are NaN in any dimension for a population. - - For a dictionary of 1D arrays (one per smoothing scale), this function - removes indices where *any* scale is NaN, ensuring consistent length - across all scales. - - Args: - samples: Mapping from scale (e.g., in Mpc/h) to 1D NumPy arrays of - equal length. - - Returns: - New dictionary with NaN-containing entries removed consistently - across all scales. - - Raises: - ValueError: If the arrays in ``samples`` do not all have the same - length. - """ - if not samples: - return samples - - first = next(iter(samples.values())) - length = len(first) - mask = np.ones(length, dtype=bool) - - for arr in samples.values(): - if len(arr) != length: - raise ValueError( - "All arrays in a population must have the same length; " - "found mismatched lengths." - ) - mask &= ~np.isnan(arr) - - n_dropped = length - np.sum(mask) - if n_dropped > 0: - print( - f"[NaN] Dropped {n_dropped:,}/{length:,} " - f"({100.0 * n_dropped / length:.2f}%) entries containing NaN." - ) - - return {scale: arr[mask] for scale, arr in samples.items()} - - -# --------------------------------------------------------------------- -# Core k-NN reweighter with auto-tuning -# --------------------------------------------------------------------- - - -@dataclass -class KNNReweighterConfig: - """Configuration for :class:`KNNReweighter`. - - Attributes: - k: Default number of neighbors for k-NN if auto-tuning is disabled. - autotune_k: Candidate k values to try when auto-tuning. If None, - a single fixed k is used. - ess_floor: Minimum ESS fraction (ESS / M) for a candidate k to be - considered "feasible" during auto-tuning. - preprocess: Preprocessing mode: ``"whiten"``, ``"standardize"``, or - ``"none"``. - standardize_by_parent: If True and ``preprocess == "standardize"``, - use parent statistics only; otherwise, use pooled stats. - algorithm: Nearest-neighbors algorithm (as in scikit-learn). - metric: Distance metric (e.g., ``"minkowski"``, ``"euclidean"``). - leaf_size: Leaf size for tree-based nearest-neighbor algorithms. - tau: Temperature exponent applied to the weights (``w := w**tau``). - Use tau < 1 to reduce weight variance (smoother, higher ESS). - Use tau > 1 to increase contrast (sharper weights, lower ESS). - Default tau=1 preserves density ratio exactly. - clip_range: Tuple ``(min, max)`` for clipping weights. - ks_eval_parent: Maximum number of parent points for KS evaluation. - ks_eval_sample: Maximum number of sample points for KS evaluation. - random_state: Random seed for diagnostics and KS subsampling. 
- """ - - # k / tuning - k: int = 50 - autotune_k: Optional[List[int]] = None - ess_floor: float = 0.20 # as fraction of M - - # preprocessing - preprocess: str = "whiten" # whiten | standardize | none - standardize_by_parent: bool = True # only used if preprocess=standardize - - # neighbors - algorithm: str = "auto" - metric: str = "minkowski" - leaf_size: int = 40 - - # stabilization - tau: float = 1.0 # tau < 1: reduce variance, tau > 1: increase contrast - clip_range: Tuple[float, float] = (0.01, 50.0) - - # diagnostics - ks_eval_parent: int = 100_000 - ks_eval_sample: int = 100_000 - random_state: int = 42 - - -class KNNReweighter: - """k-NN density-ratio reweighter for multi-scale environments.""" - - def __init__(self, cfg: KNNReweighterConfig): - """Initialize the reweighter. - - Args: - cfg: Configuration object controlling k-NN and diagnostics. - """ - self.cfg = cfg - self.scales: List[float] = [] - self.weights_: Optional[np.ndarray] = None - self.info_: Optional[Dict] = None - - def fit( - self, - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - ) -> Tuple[np.ndarray, Dict]: - """Compute weights that map the sample to the parent distribution. - - The features are constructed by column-stacking the overdensities at - different smoothing scales. - - Args: - parent_samples: Mapping from scale to parent overdensity array - (1D, same length per scale). - sample_samples: Mapping from scale to sample overdensity array - (1D, same length per scale). - - Returns: - Tuple ``(weights, info)``: - * ``weights``: 1D array of length M with reweighting factors. - * ``info``: Dictionary of diagnostics and configuration values. - - Raises: - ValueError: If the parent and sample do not share the same set - of scales. - """ - if set(parent_samples.keys()) != set(sample_samples.keys()): - raise ValueError( - "parent_samples and sample_samples must have identical keys " - "(same set of smoothing scales)." - ) - - self.scales = sorted(parent_samples.keys()) - Xp = np.column_stack([parent_samples[s] for s in self.scales]) # [N, D] - Xs = np.column_stack([sample_samples[s] for s in self.scales]) # [M, D] - N, d = Xp.shape - M = Xs.shape[0] - rng = np.random.default_rng(self.cfg.random_state) - - print(f"[kNN] {d}D features | Parent N={N:,} | Sample M={M:,}") - Xp_prep, Xs_prep = self._preprocess(Xp, Xs) - - # Solve for weights, optionally auto-tuning k. - if self.cfg.autotune_k: - best_k, w, tuning_info = self._autotune_k( - Xp_prep, Xs_prep, Xp, Xs, N, M, d, rng - ) - k_used = best_k - else: - k_used = self.cfg.k - w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k_used) - tuning_info = None - - # Basic diagnostics: ESS, clipping, KS. 
- ess = effective_sample_size(w) - clip_frac = float( - np.mean( - (w <= self.cfg.clip_range[0] + 1e-15) - | (w >= self.cfg.clip_range[1] - 1e-15) - ) - ) - - ks_vals = [ - weighted_ks( - Xp[:, j], - Xs[:, j], - w, - rng, - n_parent=min(self.cfg.ks_eval_parent, N), - n_sample=min(self.cfg.ks_eval_sample, M), - ) - for j in range(d) - ] - ks_mean = float(np.mean(ks_vals)) - - info = { - "method": "knn_density_ratio", - "k": int(k_used), - "ks_mean": ks_mean, - "ks_per_dim": ks_vals, - "preprocess": self.cfg.preprocess, - "standardize_by_parent": self.cfg.standardize_by_parent, - "scales": self.scales, - "parent_samples": N, - "sample_samples": M, - "algorithm": self.cfg.algorithm, - "metric": self.cfg.metric, - "leaf_size": self.cfg.leaf_size, - "tau": self.cfg.tau, - "clip_range": self.cfg.clip_range, - "clip_fraction": clip_frac, - "ess": float(ess), - "mean_weight": float(np.mean(w)), - "std_weight": float(np.std(w)), - "min_weight": float(np.min(w)), - "max_weight": float(np.max(w)), - "tuning": tuning_info, - } - - print( - f"[kNN] k={k_used} | KS(mean)={ks_mean:.4f} | " - f"ESS={ess:,.0f}/{M:,} | clipped={100 * clip_frac:.1f}%" - ) - - self.weights_ = w - self.info_ = info - return w, info - - # --------------------- internals --------------------- - - def _preprocess( - self, - Xp: np.ndarray, - Xs: np.ndarray, - ) -> Tuple[np.ndarray, np.ndarray]: - """Apply the requested preprocessing to parent and sample data. - - Args: - Xp: Parent data array of shape ``(N, D)``. - Xs: Sample data array of shape ``(M, D)``. - - Returns: - Tuple ``(Xp_processed, Xs_processed)``. - - Raises: - ValueError: If ``preprocess`` is not one of the supported modes. - """ - mode = self.cfg.preprocess.lower() - - if mode == "whiten": - print("[kNN] Preprocess: parent-covariance whitening.") - return whiten_by_parent(Xp, Xs) - - if mode == "standardize": - if self.cfg.standardize_by_parent: - mu = Xp.mean(axis=0) - sd = Xp.std(axis=0) + 1e-12 - src = "parent" - else: - both = np.vstack([Xp, Xs]) - mu = both.mean(axis=0) - sd = both.std(axis=0) + 1e-12 - src = "pooled" - print(f"[kNN] Preprocess: z-score (by {src} stats).") - return (Xp - mu) / sd, (Xs - mu) / sd - - if mode == "none": - print("[kNN] Preprocess: none.") - return Xp, Xs - - raise ValueError('preprocess must be "whiten", "standardize", or "none"') - - def _knn_dists( - self, - X_query: np.ndarray, - X_ref: np.ndarray, - *, - k: int, - ) -> np.ndarray: - """Compute distances to the k nearest neighbors. - - Args: - X_query: Query points of shape ``(Q, D)``. - X_ref: Reference points of shape ``(R, D)``. - k: Number of neighbors to retrieve. - - Returns: - Array of shape ``(Q, k)`` with distances to the k nearest neighbors. - """ - nn = NearestNeighbors( - n_neighbors=k, - algorithm=self.cfg.algorithm, - leaf_size=self.cfg.leaf_size, - metric=self.cfg.metric, - ).fit(X_ref) - return nn.kneighbors(X_query, return_distance=True)[0] - - def _weights_for_k( - self, - Xp_prep: np.ndarray, - Xs_prep: np.ndarray, - N: int, - M: int, - d: int, - k: int, - ) -> np.ndarray: - """Compute stabilized weights for a single k. - - The core k-NN density-ratio formula is: - - .. math:: - - w_i \\propto \\frac{M}{N}\\left(\\frac{r_\\text{sample, i}} - {r_\\text{parent, i}}\\right)^d, - - with optional temperature exponent, clipping, and renormalization. - - Args: - Xp_prep: Preprocessed parent data, shape ``(N, D)``. - Xs_prep: Preprocessed sample data, shape ``(M, D)``. - N: Number of parent points. - M: Number of sample points. 
- d: Dimensionality of the feature space. - k: Number of neighbors. - - Returns: - One-dimensional array of weights of length M. - - Raises: - ValueError: If k is out of valid range. - """ - if k < 1: - raise ValueError(f"k must be >= 1, got k={k}") - if k >= N: - raise ValueError(f"k must be < N (parent size), got k={k}, N={N}") - if k + 1 > M: - raise ValueError(f"k+1 must be <= M (sample size), got k={k}, M={M}") - - dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] - r_parent = dist_p[:, -1] - - dist_s = self._knn_dists(Xs_prep, Xs_prep, k=k + 1) # include self - r_sample = dist_s[:, -1] - - eps = 1e-12 - w = (M / N) * ((r_sample + eps) / (r_parent + eps)) ** d - - if self.cfg.tau != 1.0: - w = np.power(w, self.cfg.tau) - - w = np.clip(w, self.cfg.clip_range[0], self.cfg.clip_range[1]) - w *= len(w) / (w.sum() + 1e-18) - return w - - def _autotune_k( - self, - Xp_prep: np.ndarray, - Xs_prep: np.ndarray, - Xp_raw: np.ndarray, - Xs_raw: np.ndarray, - N: int, - M: int, - d: int, - rng: np.random.Generator, - ) -> Tuple[int, np.ndarray, Dict]: - """Grid-search over candidate k values with an ESS floor. - - Args: - Xp_prep: Preprocessed parent data, shape ``(N, D)``. - Xs_prep: Preprocessed sample data, shape ``(M, D)``. - Xp_raw: Raw parent data for KS diagnostics, shape ``(N, D)``. - Xs_raw: Raw sample data for KS diagnostics, shape ``(M, D)``. - N: Number of parent points. - M: Number of sample points. - d: Dimensionality of the feature space. - rng: NumPy random generator for KS subsampling. - - Returns: - Tuple ``(best_k, best_weights, summary_dict)`` where - ``summary_dict`` contains per-k diagnostics. - """ - candidates = sorted( - {int(k) for k in self.cfg.autotune_k if 1 <= int(k) < min(N, M)} - ) - if not candidates: - raise ValueError( - "autotune_k is non-empty but no valid candidates remain. " - "Check that k < min(N, M)." - ) - - print( - f"[kNN] Auto-tune k over {candidates} with ESS floor " - f"{int(self.cfg.ess_floor * 100)}%..." - ) - - results = [] - best_tuple = None - best_k = None - best_w = None - - for k in candidates: - w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k) - ess = effective_sample_size(w) - - ks_vals = [ - weighted_ks( - Xp_raw[:, j], - Xs_raw[:, j], - w, - rng, - n_parent=min(self.cfg.ks_eval_parent, N), - n_sample=min(self.cfg.ks_eval_sample, M), - ) - for j in range(d) - ] - ks_mean = float(np.mean(ks_vals)) - feasible = ess >= self.cfg.ess_floor * M - results.append( - {"k": k, "ess": float(ess), "ks_mean": ks_mean, "feasible": feasible} - ) - - # Selection criteria (lexicographic ordering): - # 1. Prefer feasible (ESS >= floor) over infeasible - # 2. Among same feasibility, prefer lower KS distance - # 3. Among same KS, prefer higher ESS - # 4. 
Among same ESS, prefer smaller k (simpler model) - selection_key = (not feasible, ks_mean, -ess, k) - if (best_tuple is None) or (selection_key < best_tuple): - best_tuple, best_k, best_w = selection_key, k, w - - print( - "[kNN] Tuning results: " - + ", ".join( - f"k={r['k']}: KS={r['ks_mean']:.3f}, ESS={int(r['ess'])}" - + ("" if r["feasible"] else " [infeasible]") - for r in results - ) - ) - - summary = { - "candidates": candidates, - "results": results, - "best_k": best_k, - "ess_floor": self.cfg.ess_floor, - } - return best_k, best_w, summary - - -# --------------------------------------------------------------------- -# HDF5 loading -# --------------------------------------------------------------------- - - -def load_multiscale_overdensities( - parent_path: str, - sample_path: str, -) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]: - """Load raw overdensities from parent and sample HDF5 files. - - The set of kernels is defined by the *sample* file. For every group - ``Grids/Kernel_N`` present in the sample file: - - * The same group must exist in the parent file. - * Each group must contain a dataset ``GridPointOverDensities``. - * The kernel radius is read from the group's ``KernelRadius`` attribute. - * Raw overdensity values are returned (NOT log-transformed). - - Note: Values of -1 indicate grid points with no particles nearby. - Use ``filter_and_transform_overdensities()`` to remove these and - apply the log10(1 + delta) transform. - - The returned dictionaries map the smoothing scale (from the KernelRadius - attribute) to a 1D NumPy array of raw overdensities. - - Args: - parent_path: Path to the parent/global population HDF5 file. - sample_path: Path to the sample/observed population HDF5 file. - - Returns: - Tuple ``(parent_data, sample_data)``, where each is a mapping from - scale to 1D NumPy arrays. - - Raises: - KeyError: If required groups/datasets are missing. - ValueError: If no kernels are found; if KernelRadius attribute is - missing; or if the parent is missing kernels found in the sample - file. - """ - parent_data: Dict[float, np.ndarray] = {} - sample_data: Dict[float, np.ndarray] = {} - - with ( - h5py.File(sample_path, "r") as f_sample, - h5py.File(parent_path, "r") as f_parent, - ): - if "Grids" not in f_sample: - raise KeyError("Sample file is missing group 'Grids'.") - if "Grids" not in f_parent: - raise KeyError("Parent file is missing group 'Grids'.") - - sample_grids = f_sample["Grids"] - parent_grids = f_parent["Grids"] - - kernel_keys = [k for k in sample_grids.keys() if k.startswith("Kernel_")] - if not kernel_keys: - raise ValueError( - "No kernel groups found in sample file under 'Grids'. " - "Expected groups named like 'Kernel_0', 'Kernel_1', etc." - ) - - missing = [k for k in kernel_keys if k not in parent_grids] - if missing: - raise ValueError( - "Parent file is missing the following kernel groups " - f"present in the sample file: {', '.join(missing)}" - ) - - print( - f"[HDF5] Found {len(kernel_keys)} kernels in sample file: " - + ", ".join(kernel_keys) - ) - - for key in kernel_keys: - # Read kernel radius from group attribute - try: - scale = float(sample_grids[key].attrs["KernelRadius"]) - except KeyError as exc: - raise ValueError( - f"Kernel group '{key}' is missing 'KernelRadius' attribute. " - "Cannot determine smoothing scale." - ) from exc - - try: - parent_ds = parent_grids[key]["GridPointOverDensities"][...] - sample_ds = sample_grids[key]["GridPointOverDensities"][...] 
- except KeyError as exc: - raise KeyError( - f"Missing 'GridPointOverDensities' for kernel '{key}'." - ) from exc - - # Store raw overdensities (will be filtered and transformed later) - # Note: -1 values indicate grid points with no particles nearby - parent_data[scale] = np.asarray(parent_ds, dtype=float) - sample_data[scale] = np.asarray(sample_ds, dtype=float) - - return parent_data, sample_data - - -# --------------------------------------------------------------------- -# Plotting and simple diagnostics -# --------------------------------------------------------------------- - - -def plot_diagnostics( - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - weights: np.ndarray, - info: Dict, - save_path: Optional[str] = None, -) -> plt.Figure: - """Plot simple diagnostics: 1D distributions and weight histogram. - - The plot has three panels: - - * Parent vs sample vs weighted sample for the first scale. - * Parent vs sample vs weighted sample for the second scale (if present). - * Histogram of the weights. - - Args: - parent_samples: Mapping from scale to parent overdensity array. - sample_samples: Mapping from scale to sample overdensity array. - weights: Weights for the sample population. - info: Diagnostics dictionary returned by :meth:`KNNReweighter.fit`. - save_path: Optional path to save the diagnostic PNG. - - Returns: - The Matplotlib Figure instance. - """ - scales = sorted(parent_samples.keys()) - fig, axes = plt.subplots(1, 3, figsize=(15, 4)) - axes = axes.flatten() - - def _plot_1d(ax, scale: float) -> None: - parent = parent_samples[scale] - sample = sample_samples[scale] - bins = np.linspace( - min(parent.min(), sample.min()), max(parent.max(), sample.max()), 50 - ) - ax.hist( - parent, - bins=bins, - density=True, - label="Parent", - histtype="step", - lw=2, - ) - ax.hist( - sample, - bins=bins, - density=True, - label="Sample (raw)", - histtype="step", - lw=2, - ) - ax.hist( - sample, - bins=bins, - weights=weights, - density=True, - label="Sample (weighted)", - histtype="step", - lw=2, - ) - ax.set_xlabel(f"δ (R={scale} Mpc/h)") - ax.set_ylabel("PDF") - ax.set_title(f"Scale {scale} Mpc/h") - ax.legend() - ax.grid(True, alpha=0.3) - - # First scale. - _plot_1d(axes[0], scales[0]) - - # Second scale (if available). - if len(scales) > 1: - _plot_1d(axes[1], scales[1]) - else: - axes[1].axis("off") - - # Weight histogram. - axw = axes[2] - axw.hist(weights, bins=60, alpha=0.85, edgecolor="black") - axw.axvline( - np.mean(weights), - color="red", - linestyle="--", - lw=2, - label=f"Mean={np.mean(weights):.3f}", - ) - axw.set_xlabel("Weight") - axw.set_ylabel("Count") - ess = int(info["ess"]) - M = info["sample_samples"] - ks_mean = info["ks_mean"] - axw.set_title(f"w distribution | ESS={ess}/{M:,}, KS(mean)={ks_mean:.3f}") - axw.legend() - axw.grid(True, alpha=0.3) - - plt.tight_layout() - if save_path: - plt.savefig(save_path, dpi=300, bbox_inches="tight") - print(f"[plot] Saved: {save_path}") - return fig - - -# --------------------------------------------------------------------- -# Pipeline -# --------------------------------------------------------------------- - - -def run_full_analysis( - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - show_plots: bool = True, - save_path: Optional[str] = None, - **knn_kwargs, -) -> Tuple[np.ndarray, Dict]: - """Run the full k-NN reweighting and simple diagnostics. - - Args: - parent_samples: Mapping from scale to parent overdensity array. 
- sample_samples: Mapping from scale to sample overdensity array. - show_plots: If True, display the diagnostic plot. - save_path: Optional path to save the diagnostic PNG. - **knn_kwargs: Keyword arguments used to configure - :class:`KNNReweighterConfig`. - - Returns: - Tuple ``(weights, info)``: - * ``weights``: 1D array of weights for the sample population. - * ``info``: Diagnostics dictionary from - :meth:`KNNReweighter.fit`. - """ - print("=== Multi-Scale Environment Weighting (k-NN, auto-k) ===") - - cfg = KNNReweighterConfig(**knn_kwargs) - rw = KNNReweighter(cfg) - - weights, info = rw.fit(parent_samples, sample_samples) - - t = info.get("tuning") - tune_str = "" - if t: - tune_str = ( - f" | auto-k over {t['candidates']} " - f"-> best={t['best_k']} (ESS floor {int(t['ess_floor'] * 100)}%)" - ) - - print( - "\n[Summary] " - f"k={info['k']}, KS(mean)={info['ks_mean']:.4f}, " - f"ESS={int(info['ess'])}/{info['sample_samples']}, " - f"clip={info['clip_range']}, " - f"clipped={100 * info['clip_fraction']:.1f}%" - f"{tune_str}" - ) - - if show_plots: - print("\n[Plot] Rendering diagnostics...") - _ = plot_diagnostics(parent_samples, sample_samples, weights, info, save_path) - plt.show() - - return weights, info - - -# --------------------------------------------------------------------- -# __main__ (argparse) -# --------------------------------------------------------------------- - - -def _build_arg_parser() -> argparse.ArgumentParser: - """Create the command-line argument parser. - - Returns: - Configured :class:`argparse.ArgumentParser` instance. - """ - parser = argparse.ArgumentParser( - description=( - "k-NN density-ratio weighting for multi-scale environments.\n\n" - "The sample file defines which Grids/Kernel_* groups are used; " - "the same kernels must exist in the parent file." - ) - ) - - # Required I/O. - parser.add_argument( - "--parent", - type=str, - required=True, - help="Path to parent/global population HDF5 file.", - ) - parser.add_argument( - "--sample", - type=str, - required=True, - help="Path to sample/observed population HDF5 file.", - ) - - # k / auto-tuning. - parser.add_argument( - "--k", - type=int, - default=50, - help="Default number of neighbors (used if --autotune_k is empty).", - ) - parser.add_argument( - "--autotune_k", - type=str, - default="", - help=( - "Comma-separated candidate k values for auto-tuning, " - "e.g. '35,50,80,100'. If empty, k is used directly." - ), - ) - parser.add_argument( - "--ess_floor", - type=float, - default=0.20, - help="Minimum ESS fraction (ESS / M) for a candidate k to be feasible.", - ) - - # Preprocessing. - parser.add_argument( - "--preprocess", - choices=["whiten", "standardize", "none"], - default="whiten", - help="Feature preprocessing mode.", - ) - parser.add_argument( - "--standardize_by_parent", - action="store_true", - help="If preprocess=standardize, use parent stats (default True).", - ) - parser.add_argument( - "--standardize_by_pooled", - dest="standardize_by_parent", - action="store_false", - help="If preprocess=standardize, use pooled parent+sample stats.", - ) - parser.set_defaults(standardize_by_parent=True) - - # Neighbors. 
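# (Illustrative aside, not from the original file: the three flags below
#  are passed straight through to scikit-learn. A minimal sketch of the
#  equivalent direct query, assuming whitened arrays Xp_prep and Xs_prep
#  as built inside KNNReweighter:
#
#      from sklearn.neighbors import NearestNeighbors
#
#      nn = NearestNeighbors(
#          n_neighbors=50, algorithm="auto", leaf_size=40, metric="minkowski"
#      ).fit(Xp_prep)
#      r_parent = nn.kneighbors(Xs_prep)[0][:, -1]  # k-th neighbour distance
#
#  With metric="minkowski" and scikit-learn's default p=2 this is plain
#  Euclidean distance in the whitened feature space.)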
- parser.add_argument( - "--metric", - type=str, - default="minkowski", - help="Distance metric (e.g., 'minkowski', 'euclidean').", - ) - parser.add_argument( - "--algorithm", - type=str, - default="auto", - help="NearestNeighbors algorithm (scikit-learn).", - ) - parser.add_argument( - "--leaf_size", - type=int, - default=40, - help="Leaf size for tree-based nearest-neighbor algorithms.", - ) - - # Stabilization. - parser.add_argument( - "--tau", - type=float, - default=1.0, - help="Temperature exponent (weights := weights ** tau).", - ) - parser.add_argument( - "--clip_min", - type=float, - default=0.01, - help="Lower clip bound for weights.", - ) - parser.add_argument( - "--clip_max", - type=float, - default=50.0, - help="Upper clip bound for weights.", - ) - - # Misc / plotting. - parser.add_argument( - "--random_state", - type=int, - default=42, - help="Random seed for diagnostics and KS subsampling.", - ) - parser.add_argument( - "--save", - type=str, - default=None, - help="Path to save diagnostic PNG.", - ) - parser.add_argument( - "--no_plots", - action="store_true", - help="Disable plotting.", - ) - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save computed weights as .npy file.", - ) - - return parser - - -if __name__ == "__main__": - parser = _build_arg_parser() - args = parser.parse_args() - - # Load data from the two HDF5 files. - parent_data, sample_data = load_multiscale_overdensities( - parent_path=args.parent, - sample_path=args.sample, - ) - - # Remove NaNs consistently within each population. - parent_data = drop_nan_rows_per_scale(parent_data) - sample_data = drop_nan_rows_per_scale(sample_data) - - # Build kwargs for run_full_analysis. - auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] - kwargs = dict( - k=args.k, - autotune_k=auto_list if auto_list else None, - ess_floor=args.ess_floor, - preprocess=args.preprocess, - standardize_by_parent=args.standardize_by_parent, - algorithm=args.algorithm, - metric=args.metric, - leaf_size=args.leaf_size, - tau=args.tau, - clip_range=(args.clip_min, args.clip_max), - random_state=args.random_state, - ) - - # Run pipeline. - weights, info = run_full_analysis( - parent_data, - sample_data, - show_plots=not args.no_plots, - save_path=args.save, - **kwargs, - ) - - # Save weights to file if requested. - if args.output: - np.save(args.output, weights) - print(f"[Output] Saved weights to: {args.output}") - - print(f"\n[Done] Generated {len(weights)} weights.") diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py deleted file mode 100644 index 574bd2a..0000000 --- a/weighting/test_hmf_weighting.py +++ /dev/null @@ -1,648 +0,0 @@ -"""Test k-NN weighting on halo mass functions. - -This script evaluates how well k-NN density-ratio weighting can reproduce -the true halo mass function (HMF) from a small subsample. - -Workflow: -1. Load multi-scale overdensity features from parent (global) and sample (full box) HDF5 files -2. Load halo masses for the full sample population (the truth) -3. Subsample N halos randomly from the full sample -4. Apply k-NN weighting to the subsample using parent environment as reference -5. Compare: Truth (full sample) vs Subsample (raw) vs Subsample (weighted) - -The goal is to test whether a small weighted subsample can reproduce the -true HMF of the full sample. 
- -The script produces diagnostic plots showing: -- HMF comparison: Truth vs raw subsample vs weighted subsample -- Environment distributions at different scales -- Weight distribution statistics - -Example -------- - python test_hmf_weighting.py \\ - --parent parent_grids.hdf5 \\ - --sample sample_grids.hdf5 \\ - --masses sample_halo_masses.txt \\ - --subsample 10000 \\ - --autotune_k 35,50,80,100 \\ - --output weighted_masses.npy \\ - --save hmf_diagnostics.png -""" - -from __future__ import annotations - -import argparse -import sys -from pathlib import Path -from typing import Dict, Optional, Tuple - -import h5py -import matplotlib.pyplot as plt -import numpy as np - -# Import the k-NN weighting machinery -from knn_weighter import ( - KNNReweighter, - KNNReweighterConfig, - drop_nan_rows_per_scale, - effective_sample_size, - filter_and_transform_overdensities, - load_multiscale_overdensities, -) - - -def load_halo_masses(mass_file: str) -> np.ndarray: - """Load halo masses from a text file. - - Args: - mass_file: Path to text file containing one mass per line. - - Returns: - 1D array of halo masses. - - Raises: - ValueError: If file is empty or contains invalid data. - """ - masses = np.loadtxt(mass_file) - if masses.ndim != 1: - raise ValueError( - f"Mass file should contain a single column, got shape {masses.shape}" - ) - if len(masses) == 0: - raise ValueError("Mass file is empty") - - print(f"[Masses] Loaded {len(masses):,} halo masses from {mass_file}") - print(f"[Masses] Mass range: {masses.min():.2e} - {masses.max():.2e}") - - return masses - - -def subsample_data( - sample_overdens: Dict[float, np.ndarray], - masses: np.ndarray, - n_subsample: int, - random_state: int = 42, -) -> Tuple[Dict[float, np.ndarray], np.ndarray, np.ndarray]: - """Randomly subsample halos and their environment features. - - Args: - sample_overdens: Mapping from scale to sample overdensity arrays. - masses: Halo masses (same length as overdensity arrays). - n_subsample: Number of halos to randomly select. - random_state: Random seed for reproducibility. - - Returns: - Tuple (subsampled_overdens, subsampled_masses, indices). - - Raises: - ValueError: If n_subsample > sample size. - """ - n_total = len(masses) - if n_subsample > n_total: - raise ValueError( - f"Requested subsample size {n_subsample:,} exceeds " - f"sample size {n_total:,}" - ) - - rng = np.random.default_rng(random_state) - indices = rng.choice(n_total, size=n_subsample, replace=False) - - subsampled_overdens = { - scale: arr[indices] for scale, arr in sample_overdens.items() - } - subsampled_masses = masses[indices] - - print( - f"[Subsample] Randomly selected {n_subsample:,}/{n_total:,} halos " - f"(seed={random_state})" - ) - print( - f"[Subsample] Subsampled mass range: " - f"{subsampled_masses.min():.2e} - {subsampled_masses.max():.2e}" - ) - - return subsampled_overdens, subsampled_masses, indices - - -def compute_hmf( - masses: np.ndarray, - weights: Optional[np.ndarray] = None, - n_bins: int = 20, - mass_range: Optional[Tuple[float, float]] = None, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Compute halo mass function (differential number density). - - Args: - masses: Halo masses. - weights: Optional weights for each halo. - n_bins: Number of mass bins. - mass_range: Optional (min, max) mass range. If None, uses data range. - - Returns: - Tuple (bin_centers, dn_dlogM, bin_edges) where dn_dlogM is the - differential number density per logarithmic mass bin. 
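Example (illustrative sketch, not from the original file)::

    rng = np.random.default_rng(0)
    masses = 10 ** rng.uniform(10.0, 14.0, 1000)
    centers, dn_dlogM, edges = compute_hmf(masses, n_bins=10)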
- """ - if weights is None: - weights = np.ones_like(masses) - - if mass_range is None: - mass_range = (masses.min(), masses.max()) - - log_masses = np.log10(masses) - log_range = (np.log10(mass_range[0]), np.log10(mass_range[1])) - - counts, bin_edges = np.histogram( - log_masses, bins=n_bins, range=log_range, weights=weights - ) - bin_widths = np.diff(bin_edges) - bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1]) - - # Differential number density: dn/dlog10(M) - dn_dlogM = counts / bin_widths - # Normalize to total number - dn_dlogM *= len(masses) / dn_dlogM.sum() / bin_widths.mean() - - return bin_centers, dn_dlogM, bin_edges - - -def plot_hmf_comparison( - truth_masses: np.ndarray, - subsample_masses: np.ndarray, - weights: np.ndarray, - info: Dict, - parent_overdens: Dict[float, np.ndarray], - subsample_overdens: Dict[float, np.ndarray], - save_path: Optional[str] = None, -) -> plt.Figure: - """Plot HMF comparison and diagnostics. - - Creates a multi-panel figure with: - - HMF comparison (truth vs raw subsample vs weighted subsample) - - Environment distributions at two scales - - Weight distribution histogram - - Args: - truth_masses: True halo masses (full sample). - subsample_masses: Subsampled halo masses. - weights: Weights for subsampled halos. - info: Diagnostics from KNNReweighter. - parent_overdens: Parent overdensity features (global reference). - subsample_overdens: Subsample overdensity features. - save_path: Optional path to save figure. - - Returns: - Figure instance. - """ - fig = plt.figure(figsize=(18, 10)) - gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3) - - scales = sorted(parent_overdens.keys()) - - # Determine common mass range for HMF - all_masses = np.concatenate([truth_masses, subsample_masses]) - mass_range = (all_masses.min(), all_masses.max()) - - # --- Panel 1: Halo Mass Function --- - ax_hmf = fig.add_subplot(gs[0, :]) - - # Compute HMFs - bc_truth, hmf_truth, _ = compute_hmf( - truth_masses, n_bins=25, mass_range=mass_range - ) - bc_subsample, hmf_subsample, _ = compute_hmf( - subsample_masses, n_bins=25, mass_range=mass_range - ) - bc_weighted, hmf_weighted, _ = compute_hmf( - subsample_masses, weights=weights, n_bins=25, mass_range=mass_range - ) - - ax_hmf.plot( - bc_truth, hmf_truth, "k-", lw=2.5, label=f"Truth (full sample, N={len(truth_masses):,})" - ) - ax_hmf.plot( - bc_subsample, - hmf_subsample, - "r--", - lw=2, - alpha=0.7, - label=f"Subsample (raw, N={len(subsample_masses):,})", - ) - ax_hmf.plot( - bc_weighted, - hmf_weighted, - "b-", - lw=2, - alpha=0.8, - label=f"Subsample (weighted, ESS={int(info['ess'])}/{len(subsample_masses):,})", - ) - - ax_hmf.set_xlabel(r"$\log_{10}(M_{\rm halo} \, [M_\odot])$", fontsize=13) - ax_hmf.set_ylabel(r"$dn / d\log_{10}M$ (normalized)", fontsize=13) - ax_hmf.set_title( - f"Halo Mass Function | k={info['k']}, KS(env)={info['ks_mean']:.3f}", - fontsize=14, - fontweight="bold", - ) - ax_hmf.legend(fontsize=11, loc="upper right") - ax_hmf.grid(True, alpha=0.3) - ax_hmf.set_yscale("log") - - # --- Panel 2: Environment at scale 0 --- - ax_env1 = fig.add_subplot(gs[1, 0]) - _plot_environment_dist( - ax_env1, parent_overdens[scales[0]], sample_overdens[scales[0]], weights, scales[0] - ) - - # --- Panel 3: Environment at scale 1 (if available) --- - ax_env2 = fig.add_subplot(gs[1, 1]) - if len(scales) > 1: - _plot_environment_dist( - ax_env2, - parent_overdens[scales[1]], - sample_overdens[scales[1]], - weights, - scales[1], - ) - else: - ax_env2.text( - 0.5, 0.5, "Only 1 scale available", ha="center", 
va="center", fontsize=12 - ) - ax_env2.axis("off") - - # --- Panel 4: Weight distribution --- - ax_weights = fig.add_subplot(gs[1, 2]) - ax_weights.hist(weights, bins=50, alpha=0.85, edgecolor="black", color="steelblue") - ax_weights.axvline( - np.mean(weights), - color="red", - linestyle="--", - lw=2, - label=f"Mean={np.mean(weights):.2f}", - ) - ax_weights.axvline( - np.median(weights), - color="orange", - linestyle=":", - lw=2, - label=f"Median={np.median(weights):.2f}", - ) - ax_weights.set_xlabel("Weight", fontsize=12) - ax_weights.set_ylabel("Count", fontsize=12) - ax_weights.set_title( - f"Weight Distribution\nESS={int(info['ess'])}/{len(weights):,}, " - f"clip={100*info['clip_fraction']:.1f}%", - fontsize=11, - ) - ax_weights.legend(fontsize=10) - ax_weights.grid(True, alpha=0.3) - - plt.suptitle( - "k-NN Halo Mass Function Weighting Test", - fontsize=16, - fontweight="bold", - y=0.995, - ) - - if save_path: - plt.savefig(save_path, dpi=300, bbox_inches="tight") - print(f"[Plot] Saved: {save_path}") - - return fig - - -def _plot_environment_dist( - ax: plt.Axes, - parent_env: np.ndarray, - subsample_env: np.ndarray, - weights: np.ndarray, - scale: float, -) -> None: - """Plot environment distribution at a single scale. - - Args: - ax: Matplotlib axes. - parent_env: Parent (global) overdensities. - subsample_env: Subsample overdensities. - weights: Subsample weights. - scale: Smoothing scale (Mpc/h). - """ - bins = np.linspace( - min(parent_env.min(), subsample_env.min()), - max(parent_env.max(), subsample_env.max()), - 40, - ) - - ax.hist( - parent_env, - bins=bins, - density=True, - alpha=0.6, - label="Parent (global)", - histtype="stepfilled", - color="gray", - ) - ax.hist( - subsample_env, - bins=bins, - density=True, - alpha=0.7, - label="Subsample (raw)", - histtype="step", - lw=2, - color="red", - ) - ax.hist( - subsample_env, - bins=bins, - weights=weights, - density=True, - alpha=0.8, - label="Subsample (weighted)", - histtype="step", - lw=2, - color="blue", - ) - - ax.set_xlabel(f"log10(1 + δ) at R={scale:.2f} Mpc/h", fontsize=11) - ax.set_ylabel("PDF", fontsize=11) - ax.set_title(f"Environment (scale {scale:.2f})", fontsize=11) - ax.legend(fontsize=9) - ax.grid(True, alpha=0.3) - - -def print_summary_statistics( - truth_masses: np.ndarray, - subsample_masses: np.ndarray, - weights: np.ndarray, - info: Dict, -) -> None: - """Print summary statistics for the weighting test. - - Args: - truth_masses: True halo masses (full sample). - subsample_masses: Subsampled halo masses. - weights: Subsample weights. - info: Diagnostics from KNNReweighter. 
- """ - print("\n" + "=" * 70) - print("SUMMARY STATISTICS") - print("=" * 70) - - print(f"\nPopulation Sizes:") - print(f" Truth (full sample): {len(truth_masses):,} halos") - print(f" Subsample: {len(subsample_masses):,} halos ({100*len(subsample_masses)/len(truth_masses):.1f}% of truth)") - print(f" ESS: {int(info['ess']):,} (effective sample size)") - print(f" ESS/N_subsample: {info['ess']/len(subsample_masses):.1%}") - - print(f"\nk-NN Configuration:") - print(f" k: {info['k']}") - print(f" Preprocess: {info['preprocess']}") - print(f" Metric: {info['metric']}") - print(f" Tau: {info['tau']}") - - print(f"\nEnvironment Matching (KS distance):") - print(f" Mean across scales: {info['ks_mean']:.4f}") - for i, (scale, ks) in enumerate(zip(info['scales'], info['ks_per_dim'])): - print(f" Scale {i} (R={scale:.2f}): {ks:.4f}") - - print(f"\nWeight Statistics:") - print(f" Mean: {info['mean_weight']:.3f}") - print(f" Std: {info['std_weight']:.3f}") - print(f" Min: {info['min_weight']:.3f}") - print(f" Max: {info['max_weight']:.3f}") - print(f" Clip frac: {100*info['clip_fraction']:.1f}%") - print(f" Clip range: {info['clip_range']}") - - # Mass statistics - print(f"\nMass Range (log10 Msun):") - print(f" Truth: {np.log10(truth_masses.min()):.2f} - {np.log10(truth_masses.max()):.2f}") - print(f" Subsample: {np.log10(subsample_masses.min()):.2f} - {np.log10(subsample_masses.max()):.2f}") - - # Weighted vs unweighted mean log mass - mean_log_m_truth = np.mean(np.log10(truth_masses)) - mean_log_m_subsample = np.mean(np.log10(subsample_masses)) - mean_log_m_weighted = np.average(np.log10(subsample_masses), weights=weights) - - print(f"\nMean log10(M) [Msun]:") - print(f" Truth: {mean_log_m_truth:.3f}") - print(f" Subsample: {mean_log_m_subsample:.3f} (bias: {mean_log_m_subsample - mean_log_m_truth:+.3f})") - print(f" Weighted: {mean_log_m_weighted:.3f} (bias: {mean_log_m_weighted - mean_log_m_truth:+.3f})") - - print("=" * 70 + "\n") - - -def main(): - """Main execution function.""" - parser = argparse.ArgumentParser( - description="Test k-NN weighting on halo mass functions", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - # Required inputs - parser.add_argument( - "--parent", - type=str, - required=True, - help="Path to parent/global population HDF5 file (gridded overdensities, global reference)", - ) - parser.add_argument( - "--sample", - type=str, - required=True, - help="Path to full sample HDF5 file (gridded overdensities, this is the truth)", - ) - parser.add_argument( - "--masses", - type=str, - required=True, - help="Path to text file with full sample halo masses (one per line, truth HMF)", - ) - - # Subsampling - parser.add_argument( - "--subsample", - type=int, - required=True, - help="Number of halos to randomly select from full sample for weighting test", - ) - - # k-NN parameters - parser.add_argument( - "--k", - type=int, - default=50, - help="Number of neighbors (default: 50)", - ) - parser.add_argument( - "--autotune_k", - type=str, - default="", - help="Comma-separated k values for auto-tuning (e.g., '35,50,80,100')", - ) - parser.add_argument( - "--ess_floor", - type=float, - default=0.20, - help="Minimum ESS fraction for auto-tuning (default: 0.20)", - ) - parser.add_argument( - "--preprocess", - choices=["whiten", "standardize", "none"], - default="whiten", - help="Feature preprocessing mode (default: whiten)", - ) - parser.add_argument( - "--tau", - type=float, - default=1.0, - help="Temperature parameter for weights (default: 1.0)", - ) - 
parser.add_argument( - "--clip_min", - type=float, - default=0.01, - help="Lower clip bound for weights (default: 0.01)", - ) - parser.add_argument( - "--clip_max", - type=float, - default=50.0, - help="Upper clip bound for weights (default: 50.0)", - ) - - # Output - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save weighted masses as .npy file", - ) - parser.add_argument( - "--save", - type=str, - default=None, - help="Path to save diagnostic plot", - ) - parser.add_argument( - "--no_plots", - action="store_true", - help="Disable interactive plotting", - ) - parser.add_argument( - "--random_state", - type=int, - default=42, - help="Random seed for subsampling and diagnostics (default: 42)", - ) - - args = parser.parse_args() - - # Load overdensity features - print("\n" + "=" * 70) - print("LOADING DATA") - print("=" * 70 + "\n") - - parent_overdens, sample_overdens = load_multiscale_overdensities( - parent_path=args.parent, - sample_path=args.sample, - ) - - # Load halo masses (for the full sample = truth) - truth_masses = load_halo_masses(args.masses) - - # Check consistency - first_scale = next(iter(sample_overdens.values())) - if len(first_scale) != len(truth_masses): - raise ValueError( - f"Mismatch: sample overdensities have {len(first_scale):,} entries " - f"but mass file has {len(truth_masses):,} entries" - ) - - # Filter and transform parent data (removes delta <= -1, applies log transform) - print("\n[Cleaning] Filtering and transforming parent overdensities...") - parent_overdens, _ = filter_and_transform_overdensities(parent_overdens) - - # Additional NaN check on parent (shouldn't be any after filtering) - parent_overdens = drop_nan_rows_per_scale(parent_overdens) - - # Filter and transform full sample data - need to synchronize with masses - print("[Cleaning] Filtering and transforming full sample overdensities...") - sample_overdens, mask = filter_and_transform_overdensities(sample_overdens) - - # Apply mask to masses to keep them synchronized - truth_masses = truth_masses[mask] - - # Additional NaN check on sample (shouldn't be any after filtering) - n_before = len(next(iter(sample_overdens.values()))) - sample_overdens = drop_nan_rows_per_scale(sample_overdens) - n_after = len(next(iter(sample_overdens.values()))) - - if n_after != n_before: - # Rows dropped here can no longer be matched to the mass array, - # so fail loudly rather than continue with desynchronized data. - raise ValueError( - f"Found {n_before - n_after} additional NaN values after the " - "transform; masses are out of sync with the overdensities." - ) - - # Subsample from the cleaned full sample - if args.subsample is None: - raise ValueError( - "--subsample is required. Specify how many halos to randomly select " - "from the full sample for weighting." 
- ) - - subsample_overdens, subsample_masses, subsample_indices = subsample_data( - sample_overdens, truth_masses, args.subsample, args.random_state - ) - - # Run k-NN weighting - # Goal: weight subsample to match parent (global) environment distribution - print("\n" + "=" * 70) - print("COMPUTING WEIGHTS") - print("=" * 70 + "\n") - - auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] - cfg = KNNReweighterConfig( - k=args.k, - autotune_k=auto_list if auto_list else None, - ess_floor=args.ess_floor, - preprocess=args.preprocess, - tau=args.tau, - clip_range=(args.clip_min, args.clip_max), - random_state=args.random_state, - ) - - reweighter = KNNReweighter(cfg) - weights, info = reweighter.fit(parent_overdens, subsample_overdens) - - # Print summary - print_summary_statistics(truth_masses, subsample_masses, weights, info) - - # Save weighted masses if requested - if args.output: - weighted_data = np.column_stack([subsample_masses, weights]) - np.save(args.output, weighted_data) - print(f"[Output] Saved (subsample_masses, weights) to: {args.output}") - - # Plot results - if not args.no_plots: - print("\n[Plot] Generating diagnostics...\n") - fig = plot_hmf_comparison( - truth_masses, - subsample_masses, - weights, - info, - parent_overdens, - subsample_overdens, - save_path=args.save, - ) - plt.show() - - print("\n[Done] HMF weighting test complete.\n") - - -if __name__ == "__main__": - main() From d09ceaa19c0ddd901a37f09eb9e5f1b7f4df1e45 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 23:04:13 +0000 Subject: [PATCH 21/23] Fixing the test suite --- tests/test_suite.py | 51 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/tests/test_suite.py b/tests/test_suite.py index d187c4d..284781b 100755 --- a/tests/test_suite.py +++ b/tests/test_suite.py @@ -120,10 +120,29 @@ def create_dense_distribution(filename, npart=10000, boxsize=10.0): return npart @staticmethod - def _write_snapshot(filename, coords, masses, boxsize): - """Write HDF5 snapshot in SWIFT format""" + def _write_snapshot(filename, coords, masses, boxsize, h=0.681, Omega_cdm=0.256011, Omega_b=0.048600): + """Write HDF5 snapshot in SWIFT format + + If masses are uniform (all equal), they will be rescaled to match the + cosmological mean density. If masses vary, they are used as-is (assumed + to already represent the correct density field). 
+ """ os.makedirs(os.path.dirname(filename), exist_ok=True) + # Calculate cosmological mean density + Omega_m = Omega_cdm + Omega_b + H0_kmsMpc = 100.0 * h + G = 4.301744232015554e+01 # (10^10 Msun)^-1 Mpc (km/s)^2 + rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) + mean_density = rho_crit_0 * Omega_m + + # For uniform mass distributions, rescale to match cosmological mean density + # This ensures uniform distributions give zero mean overdensity + if np.allclose(masses, masses[0]): # All masses equal + volume = boxsize**3 + total_mass = mean_density * volume + masses = np.full(len(masses), total_mass / len(masses)) + with h5py.File(filename, 'w') as f: # Header header = f.create_group('Header') @@ -207,6 +226,11 @@ def _create_param_file(self, snapshot_file, output_file, cdim=3, kernel_radii=[0 f.write(f" type: uniform\n") f.write(f" cdim: {cdim}\n") f.write(f"\n") + f.write(f"Cosmology:\n") + f.write(f" h: 0.681 # Reduced Hubble constant\n") + f.write(f" Omega_cdm: 0.256011 # Cold dark matter density parameter\n") + f.write(f" Omega_b: 0.048600 # Baryon density parameter\n") + f.write(f"\n") f.write(f"Tree:\n") f.write(f" max_leaf_count: 200\n") f.write(f"\n") @@ -243,6 +267,10 @@ def test_single_particle_center(self): boxsize = 10.0 TestDataGenerator.create_single_particle(snapshot, position, mass, boxsize) + # Read actual particle mass from snapshot (will be rescaled to match cosmology) + with h5py.File(snapshot, 'r') as f: + actual_mass = f['PartType1/Masses'][0] + # Create parameters kernel_radii = [0.5, 1.0, 2.0] param_file = self._create_param_file(snapshot, "single_particle_out.hdf5", @@ -275,8 +303,8 @@ def test_single_particle_center(self): # Particle at center should be within kernel of center grid point # for radii >= ~0.8 (half cell diagonal is ~0.866) if radius >= 0.8: - if masses[center_idx] != mass: - return False, f"Kernel {i} (r={radius}): Expected mass {mass} at center, got {masses[center_idx]}" + if not np.isclose(masses[center_idx], actual_mass): + return False, f"Kernel {i} (r={radius}): Expected mass {actual_mass} at center, got {masses[center_idx]}" return True, "Single particle test passed" @@ -563,6 +591,10 @@ def test_boundary_particles(self): # Write snapshot TestDataGenerator._write_snapshot(snapshot, positions, masses, boxsize) + # Read actual total mass from snapshot (will be rescaled to match cosmology) + with h5py.File(snapshot, 'r') as f: + expected_mass = np.sum(f['PartType1/Masses'][:]) + # Grid with points that should capture particles across boundaries param_file = self._create_param_file(snapshot, "boundary_parts_out.hdf5", cdim=3, kernel_radii=[2.0]) @@ -581,7 +613,6 @@ def test_boundary_particles(self): # All particles should be captured by at least one grid point # Check that the total mass is conserved (allowing small tolerance) total_mass_captured = np.sum(masses_out) - expected_mass = len(positions) # All particles have mass=1 if abs(total_mass_captured - expected_mass) > expected_mass * 0.1: return False, f"Mass not conserved: {total_mass_captured} vs {expected_mass}" @@ -757,6 +788,11 @@ def test_random_grid_mpi(self): 'type': 'random', 'n_grid_points': 500 }, + 'Cosmology': { + 'h': 0.681, + 'Omega_cdm': 0.256011, + 'Omega_b': 0.048600 + }, 'Tree': { 'max_leaf_count': 50 }, @@ -851,6 +887,11 @@ def _create_param_file(self, snapshot, basename, cdim=5, kernel_radii=[0.5, 1.0] 'type': 'uniform', 'cdim': cdim }, + 'Cosmology': { + 'h': 0.681, + 'Omega_cdm': 0.256011, + 'Omega_b': 0.048600 + }, 'Tree': { 'max_leaf_count': 50 }, From 
87dca02a2e50d8e25b29521bce42f5ce658fa9a6 Mon Sep 17 00:00:00 2001 From: Will Roper Date: Fri, 28 Nov 2025 23:05:38 +0000 Subject: [PATCH 22/23] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/test_cosmology.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index d9d67bf..4a1e197 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -14,7 +14,6 @@ from pathlib import Path import yaml import tempfile -import os # Try to import astropy for direct comparison tests try: From 8f497043c3e0ca20692c68f5ed585ae82db4b494 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 23:12:20 +0000 Subject: [PATCH 23/23] Actually fixing the test params... --- tests/test_gridder.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/test_gridder.py b/tests/test_gridder.py index d52309e..6e13129 100644 --- a/tests/test_gridder.py +++ b/tests/test_gridder.py @@ -162,6 +162,11 @@ def test_grid_points_with_comments(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/with_comments.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -215,6 +220,11 @@ def test_malformed_grid_points(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/malformed.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -270,6 +280,11 @@ def test_missing_grid_file(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/nonexistent.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -405,6 +420,11 @@ def test_random_grid_reproducibility(self, build_gridder, test_snapshot): n_grid_points: 50 random_seed: {seed} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -468,6 +488,11 @@ def test_random_grid_different_seeds(self, build_gridder, test_snapshot): n_grid_points: 50 random_seed: {seed} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -533,6 +558,11 @@ def test_sparse_grid_chunk_preparation(self, build_gridder, test_snapshot): type: uniform cdim: 5 # 5^3 = 125 grid points +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -598,6 +628,11 @@ def test_cell_mass_summation(self, build_gridder, test_snapshot): type: uniform cdim: 3 +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -668,6 +703,11 @@ def test_uniform_grid_on_particles_unit_overdensity(self, test_snapshot): type: uniform cdim: 3 # 3^3 = 27 grid points +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -737,6 +777,11 @@ def test_sparse_grid_on_particles_unit_overdensity(self, test_snapshot): type: file grid_file: {grid_file} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200
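Appendix (editorial sketch, not part of any patch above): the mean-density
rescaling that PATCH 21 wires into the test snapshots, written out
standalone. The constant G and the rho_crit formula are copied from
tests/test_suite.py; the box size and particle count are illustrative.

    import numpy as np

    h = 0.681
    Omega_m = 0.256011 + 0.048600  # Omega_cdm + Omega_b
    G = 4.301744232015554e+01      # (10^10 Msun)^-1 Mpc (km/s)^2
    H0 = 100.0 * h                 # km/s/Mpc

    # Critical density today, in 10^10 Msun / Mpc^3.
    rho_crit_0 = 3.0 * H0**2 / (8.0 * np.pi * G)
    mean_density = rho_crit_0 * Omega_m

    # Give every particle an equal share of the box's total cosmological
    # mass, so a uniform distribution has zero mean overdensity.
    boxsize, npart = 10.0, 10_000
    particle_mass = mean_density * boxsize**3 / npart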