From a9de25220b27317178f8ea6a9004b95c12736d35 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 14:56:23 +0000
Subject: [PATCH 01/23] Adding an initial test knn weighter

---
 weighting/knn_weighter.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 weighting/knn_weighter.py

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
new file mode 100644
index 0000000..e69de29

From c973acdffc089df00e6dc1e19519ef5e0959dd90 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:42:14 +0000
Subject: [PATCH 02/23] Testing HMF weighting

---
 weighting/knn_weighter.py       | 1078 +++++++++++++++++++++++++++++++
 weighting/test_hmf_weighting.py |  634 ++++++++++++++++++
 2 files changed, 1712 insertions(+)
 create mode 100644 weighting/test_hmf_weighting.py

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index e69de29..9f25a47 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -0,0 +1,1078 @@
+"""k-NN reweighting of a biased sample to match a parent population.
+
+This script reads two HDF5 files:
+
+* A "parent" (global) population.
+* A "sample" population (which may be biased).
+
+The set of kernels is defined by the sample file: every group
+``Grids/Kernel_*`` in the sample must also exist in the parent. Each such
+group must contain a dataset ``GridPointOverDensities``.
+
+For each kernel:
+
+* Read overdensities from parent and sample.
+* Transform as ``log10(1 + delta)``.
+* Stack over all kernels into a multi-dimensional feature space.
+
+We then use k-NN density-ratio estimation to compute weights for the sample
+so that its multi-scale environment distribution matches the parent.
+
+An optional auto-tuning step searches over candidate k values, choosing the
+one that:
+
+* Satisfies an effective sample size (ESS) floor, if possible.
+* Minimizes the mean 1D KS distance between parent and weighted sample
+  across all dimensions.
+
+Example
+-------
+    python knn_weighter.py \\
+        --parent parent.hdf5 \\
+        --sample sample.hdf5 \\
+        --autotune_k 35,50,80,100 \\
+        --ess_floor 0.20
+"""
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import h5py
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+# ---------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------
+
+
+def effective_sample_size(w: np.ndarray) -> float:
+    """Compute the effective sample size (ESS) of a weight vector.
+
+    ESS is defined as:
+
+    .. math::
+
+        \\text{ESS} = \\frac{(\\sum_i w_i)^2}{\\sum_i w_i^2}.
+
+    Args:
+        w: One-dimensional array of non-negative weights.
+
+    Returns:
+        Effective sample size as a float.
+    """
+    s = float(w.sum())
+    return (s * s) / float(np.sum(w * w) + 1e-18)
+
+
+def weighted_ks(
+    x_parent: np.ndarray,
+    x_sample: np.ndarray,
+    w_sample: np.ndarray,
+    rng: np.random.Generator,
+    n_parent: int = 100_000,
+    n_sample: int = 100_000,
+) -> float:
+    """Compute a weighted 1D Kolmogorov–Smirnov distance.
+
+    The parent distribution is unweighted; the sample distribution is
+    weighted. For speed, both are optionally downsampled before computing
+    the KS distance.
+
+    Args:
+        x_parent: Parent data values (1D).
+        x_sample: Sample data values (1D).
+        w_sample: Weights for ``x_sample``, same length as ``x_sample``.
+        rng: NumPy random number generator instance.
+        n_parent: Maximum number of parent points to use.
+        n_sample: Maximum number of sample points to use.
+
+    Returns:
+        The KS distance as a float.
+    """
+    # Downsample parent.
+    if len(x_parent) > n_parent:
+        xp = x_parent[rng.choice(len(x_parent), n_parent, replace=False)]
+    else:
+        xp = x_parent
+
+    # Downsample sample + weights.
+    if len(x_sample) > n_sample:
+        idx = rng.choice(len(x_sample), n_sample, replace=False)
+        xs = x_sample[idx]
+        ws = w_sample[idx]
+    else:
+        xs, ws = x_sample, w_sample
+
+    # Sort and build CDFs.
+    xp = np.sort(xp)
+    order = np.argsort(xs)
+    xs = xs[order]
+    ws = np.clip(ws[order], 0, None)
+    ws_sum = ws.sum()
+    if ws_sum < 1e-15:
+        # All weights are effectively zero - return worst-case KS distance
+        return 1.0
+    ws /= ws_sum
+    cdf_s = np.cumsum(ws)
+
+    grid = np.unique(np.concatenate([xp, xs]))
+    cdf_p = np.searchsorted(xp, grid, side="right") / len(xp)
+
+    idx = np.searchsorted(xs, grid, side="right") - 1
+    idx = np.clip(idx, -1, len(xs) - 1)
+    cdf_s_at_grid = np.where(idx >= 0, cdf_s[idx], 0.0)
+
+    return float(np.max(np.abs(cdf_p - cdf_s_at_grid)))
+
+
+def whiten_by_parent(
+    Xp: np.ndarray,
+    Xs: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Apply parent-covariance whitening (Mahalanobis transform).
+
+    The parent distribution defines the mean and covariance used to whiten
+    both the parent and the sample data:
+
+    .. math::
+
+        X' = (X - \\mu_\\text{parent}) L^{-\\top},
+
+    where ``L`` is the Cholesky factor of the parent covariance, so that
+    Euclidean distances in the whitened space are Mahalanobis distances.
+
+    Args:
+        Xp: Parent data array of shape ``(N, D)``.
+        Xs: Sample data array of shape ``(M, D)``.
+
+    Returns:
+        Tuple ``(Xp_whitened, Xs_whitened)`` with the same shapes as inputs.
+    """
+    mu = Xp.mean(axis=0)
+    Xm = Xp - mu
+    Sigma = np.cov(Xm, rowvar=False)
+    d = Sigma.shape[0]
+    # Tiny ridge for numerical stability.
+    Sigma = Sigma + (1e-12 * np.trace(Sigma) / max(d, 1)) * np.eye(d)
+    L = np.linalg.cholesky(Sigma)
+    # Multiply by L^{-T} (not L^{-1}) so the whitened covariance is the
+    # identity: cov((X - mu) L^{-T}) = L^{-1} Sigma L^{-T} = I.
+    Linv = np.linalg.inv(L)
+    return (Xp - mu) @ Linv.T, (Xs - mu) @ Linv.T
+
+
+def drop_nan_rows_per_scale(
+    samples: Dict[float, np.ndarray],
+) -> Dict[float, np.ndarray]:
+    """Remove entries that are NaN in any dimension for a population.
+
+    For a dictionary of 1D arrays (one per smoothing scale), this function
+    removes indices where *any* scale is NaN, ensuring consistent length
+    across all scales.
+
+    Args:
+        samples: Mapping from scale (e.g., in Mpc/h) to 1D NumPy arrays of
+            equal length.
+
+    Returns:
+        New dictionary with NaN-containing entries removed consistently
+        across all scales.
+
+    Raises:
+        ValueError: If the arrays in ``samples`` do not all have the same
+            length.
+    """
+    if not samples:
+        return samples
+
+    first = next(iter(samples.values()))
+    length = len(first)
+    mask = np.ones(length, dtype=bool)
+
+    for arr in samples.values():
+        if len(arr) != length:
+            raise ValueError(
+                "All arrays in a population must have the same length; "
+                "found mismatched lengths."
+            )
+        mask &= ~np.isnan(arr)
+
+    n_dropped = length - np.sum(mask)
+    if n_dropped > 0:
+        print(
+            f"[NaN] Dropped {n_dropped:,}/{length:,} "
+            f"({100.0 * n_dropped / length:.2f}%) entries containing NaN."
+        )
+
+    return {scale: arr[mask] for scale, arr in samples.items()}
+
+
+# ---------------------------------------------------------------------
+# Core k-NN reweighter with auto-tuning
+# ---------------------------------------------------------------------
+
+
+@dataclass
+class KNNReweighterConfig:
+    """Configuration for :class:`KNNReweighter`.
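+
+    A minimal usage sketch (values are illustrative, not tuned
+    recommendations):
+
+    .. code-block:: python
+
+        cfg = KNNReweighterConfig(autotune_k=[35, 50, 80], ess_floor=0.2)
+        weights, info = KNNReweighter(cfg).fit(parent_samples, sample_samples)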
+ + Attributes: + k: Default number of neighbors for k-NN if auto-tuning is disabled. + autotune_k: Candidate k values to try when auto-tuning. If None, + a single fixed k is used. + ess_floor: Minimum ESS fraction (ESS / M) for a candidate k to be + considered "feasible" during auto-tuning. + preprocess: Preprocessing mode: ``"whiten"``, ``"standardize"``, or + ``"none"``. + standardize_by_parent: If True and ``preprocess == "standardize"``, + use parent statistics only; otherwise, use pooled stats. + algorithm: Nearest-neighbors algorithm (as in scikit-learn). + metric: Distance metric (e.g., ``"minkowski"``, ``"euclidean"``). + leaf_size: Leaf size for tree-based nearest-neighbor algorithms. + tau: Temperature exponent applied to the weights (``w := w**tau``). + Use tau < 1 to reduce weight variance (smoother, higher ESS). + Use tau > 1 to increase contrast (sharper weights, lower ESS). + Default tau=1 preserves density ratio exactly. + clip_range: Tuple ``(min, max)`` for clipping weights. + ks_eval_parent: Maximum number of parent points for KS evaluation. + ks_eval_sample: Maximum number of sample points for KS evaluation. + random_state: Random seed for diagnostics and KS subsampling. + """ + + # k / tuning + k: int = 50 + autotune_k: Optional[List[int]] = None + ess_floor: float = 0.20 # as fraction of M + + # preprocessing + preprocess: str = "whiten" # whiten | standardize | none + standardize_by_parent: bool = True # only used if preprocess=standardize + + # neighbors + algorithm: str = "auto" + metric: str = "minkowski" + leaf_size: int = 40 + + # stabilization + tau: float = 1.0 # tau < 1: reduce variance, tau > 1: increase contrast + clip_range: Tuple[float, float] = (0.01, 50.0) + + # diagnostics + ks_eval_parent: int = 100_000 + ks_eval_sample: int = 100_000 + random_state: int = 42 + + +class KNNReweighter: + """k-NN density-ratio reweighter for multi-scale environments.""" + + def __init__(self, cfg: KNNReweighterConfig): + """Initialize the reweighter. + + Args: + cfg: Configuration object controlling k-NN and diagnostics. + """ + self.cfg = cfg + self.scales: List[float] = [] + self.weights_: Optional[np.ndarray] = None + self.info_: Optional[Dict] = None + + def fit( + self, + parent_samples: Dict[float, np.ndarray], + sample_samples: Dict[float, np.ndarray], + ) -> Tuple[np.ndarray, Dict]: + """Compute weights that map the sample to the parent distribution. + + The features are constructed by column-stacking the overdensities at + different smoothing scales. + + Args: + parent_samples: Mapping from scale to parent overdensity array + (1D, same length per scale). + sample_samples: Mapping from scale to sample overdensity array + (1D, same length per scale). + + Returns: + Tuple ``(weights, info)``: + * ``weights``: 1D array of length M with reweighting factors. + * ``info``: Dictionary of diagnostics and configuration values. + + Raises: + ValueError: If the parent and sample do not share the same set + of scales. + """ + if set(parent_samples.keys()) != set(sample_samples.keys()): + raise ValueError( + "parent_samples and sample_samples must have identical keys " + "(same set of smoothing scales)." 
+ ) + + self.scales = sorted(parent_samples.keys()) + Xp = np.column_stack([parent_samples[s] for s in self.scales]) # [N, D] + Xs = np.column_stack([sample_samples[s] for s in self.scales]) # [M, D] + N, d = Xp.shape + M = Xs.shape[0] + rng = np.random.default_rng(self.cfg.random_state) + + print(f"[kNN] {d}D features | Parent N={N:,} | Sample M={M:,}") + Xp_prep, Xs_prep = self._preprocess(Xp, Xs) + + # Solve for weights, optionally auto-tuning k. + if self.cfg.autotune_k: + best_k, w, tuning_info = self._autotune_k( + Xp_prep, Xs_prep, Xp, Xs, N, M, d, rng + ) + k_used = best_k + else: + k_used = self.cfg.k + w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k_used) + tuning_info = None + + # Basic diagnostics: ESS, clipping, KS. + ess = effective_sample_size(w) + clip_frac = float( + np.mean( + (w <= self.cfg.clip_range[0] + 1e-15) + | (w >= self.cfg.clip_range[1] - 1e-15) + ) + ) + + ks_vals = [ + weighted_ks( + Xp[:, j], + Xs[:, j], + w, + rng, + n_parent=min(self.cfg.ks_eval_parent, N), + n_sample=min(self.cfg.ks_eval_sample, M), + ) + for j in range(d) + ] + ks_mean = float(np.mean(ks_vals)) + + info = { + "method": "knn_density_ratio", + "k": int(k_used), + "ks_mean": ks_mean, + "ks_per_dim": ks_vals, + "preprocess": self.cfg.preprocess, + "standardize_by_parent": self.cfg.standardize_by_parent, + "scales": self.scales, + "parent_samples": N, + "sample_samples": M, + "algorithm": self.cfg.algorithm, + "metric": self.cfg.metric, + "leaf_size": self.cfg.leaf_size, + "tau": self.cfg.tau, + "clip_range": self.cfg.clip_range, + "clip_fraction": clip_frac, + "ess": float(ess), + "mean_weight": float(np.mean(w)), + "std_weight": float(np.std(w)), + "min_weight": float(np.min(w)), + "max_weight": float(np.max(w)), + "tuning": tuning_info, + } + + print( + f"[kNN] k={k_used} | KS(mean)={ks_mean:.4f} | " + f"ESS={ess:,.0f}/{M:,} | clipped={100 * clip_frac:.1f}%" + ) + + self.weights_ = w + self.info_ = info + return w, info + + # --------------------- internals --------------------- + + def _preprocess( + self, + Xp: np.ndarray, + Xs: np.ndarray, + ) -> Tuple[np.ndarray, np.ndarray]: + """Apply the requested preprocessing to parent and sample data. + + Args: + Xp: Parent data array of shape ``(N, D)``. + Xs: Sample data array of shape ``(M, D)``. + + Returns: + Tuple ``(Xp_processed, Xs_processed)``. + + Raises: + ValueError: If ``preprocess`` is not one of the supported modes. + """ + mode = self.cfg.preprocess.lower() + + if mode == "whiten": + print("[kNN] Preprocess: parent-covariance whitening.") + return whiten_by_parent(Xp, Xs) + + if mode == "standardize": + if self.cfg.standardize_by_parent: + mu = Xp.mean(axis=0) + sd = Xp.std(axis=0) + 1e-12 + src = "parent" + else: + both = np.vstack([Xp, Xs]) + mu = both.mean(axis=0) + sd = both.std(axis=0) + 1e-12 + src = "pooled" + print(f"[kNN] Preprocess: z-score (by {src} stats).") + return (Xp - mu) / sd, (Xs - mu) / sd + + if mode == "none": + print("[kNN] Preprocess: none.") + return Xp, Xs + + raise ValueError('preprocess must be "whiten", "standardize", or "none"') + + def _knn_dists( + self, + X_query: np.ndarray, + X_ref: np.ndarray, + *, + k: int, + ) -> np.ndarray: + """Compute distances to the k nearest neighbors. + + Args: + X_query: Query points of shape ``(Q, D)``. + X_ref: Reference points of shape ``(R, D)``. + k: Number of neighbors to retrieve. + + Returns: + Array of shape ``(Q, k)`` with distances to the k nearest neighbors. 
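+
+        Note:
+            Distances come back sorted, so column ``[:, -1]`` is the k-th
+            neighbor distance used in the density-ratio estimate.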
+ """ + nn = NearestNeighbors( + n_neighbors=k, + algorithm=self.cfg.algorithm, + leaf_size=self.cfg.leaf_size, + metric=self.cfg.metric, + ).fit(X_ref) + return nn.kneighbors(X_query, return_distance=True)[0] + + def _weights_for_k( + self, + Xp_prep: np.ndarray, + Xs_prep: np.ndarray, + N: int, + M: int, + d: int, + k: int, + ) -> np.ndarray: + """Compute stabilized weights for a single k. + + The core k-NN density-ratio formula is: + + .. math:: + + w_i \\propto \\frac{M}{N}\\left(\\frac{r_\\text{sample, i}} + {r_\\text{parent, i}}\\right)^d, + + with optional temperature exponent, clipping, and renormalization. + + Args: + Xp_prep: Preprocessed parent data, shape ``(N, D)``. + Xs_prep: Preprocessed sample data, shape ``(M, D)``. + N: Number of parent points. + M: Number of sample points. + d: Dimensionality of the feature space. + k: Number of neighbors. + + Returns: + One-dimensional array of weights of length M. + + Raises: + ValueError: If k is out of valid range. + """ + if k < 1: + raise ValueError(f"k must be >= 1, got k={k}") + if k >= N: + raise ValueError( + f"k must be < N (parent size), got k={k}, N={N}" + ) + if k + 1 > M: + raise ValueError( + f"k+1 must be <= M (sample size), got k={k}, M={M}" + ) + + dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] + r_parent = dist_p[:, -1] + + dist_s = self._knn_dists(Xs_prep, Xs_prep, k=k + 1) # include self + r_sample = dist_s[:, -1] + + eps = 1e-12 + w = (M / N) * ((r_sample + eps) / (r_parent + eps)) ** d + + if self.cfg.tau != 1.0: + w = np.power(w, self.cfg.tau) + + w = np.clip(w, self.cfg.clip_range[0], self.cfg.clip_range[1]) + w *= len(w) / (w.sum() + 1e-18) + return w + + def _autotune_k( + self, + Xp_prep: np.ndarray, + Xs_prep: np.ndarray, + Xp_raw: np.ndarray, + Xs_raw: np.ndarray, + N: int, + M: int, + d: int, + rng: np.random.Generator, + ) -> Tuple[int, np.ndarray, Dict]: + """Grid-search over candidate k values with an ESS floor. + + Args: + Xp_prep: Preprocessed parent data, shape ``(N, D)``. + Xs_prep: Preprocessed sample data, shape ``(M, D)``. + Xp_raw: Raw parent data for KS diagnostics, shape ``(N, D)``. + Xs_raw: Raw sample data for KS diagnostics, shape ``(M, D)``. + N: Number of parent points. + M: Number of sample points. + d: Dimensionality of the feature space. + rng: NumPy random generator for KS subsampling. + + Returns: + Tuple ``(best_k, best_weights, summary_dict)`` where + ``summary_dict`` contains per-k diagnostics. + """ + candidates = sorted( + {int(k) for k in self.cfg.autotune_k if 1 <= int(k) < min(N, M)} + ) + if not candidates: + raise ValueError( + "autotune_k is non-empty but no valid candidates remain. " + "Check that k < min(N, M)." + ) + + print( + f"[kNN] Auto-tune k over {candidates} with ESS floor " + f"{int(self.cfg.ess_floor * 100)}%..." + ) + + results = [] + best_tuple = None + best_k = None + best_w = None + + for k in candidates: + w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k) + ess = effective_sample_size(w) + + ks_vals = [ + weighted_ks( + Xp_raw[:, j], + Xs_raw[:, j], + w, + rng, + n_parent=min(self.cfg.ks_eval_parent, N), + n_sample=min(self.cfg.ks_eval_sample, M), + ) + for j in range(d) + ] + ks_mean = float(np.mean(ks_vals)) + feasible = ess >= self.cfg.ess_floor * M + results.append( + {"k": k, "ess": float(ess), "ks_mean": ks_mean, "feasible": feasible} + ) + + # Selection criteria (lexicographic ordering): + # 1. Prefer feasible (ESS >= floor) over infeasible + # 2. Among same feasibility, prefer lower KS distance + # 3. 
Among same KS, prefer higher ESS
+            # 4. Among same ESS, prefer smaller k (simpler model)
+            selection_key = (not feasible, ks_mean, -ess, k)
+            if (best_tuple is None) or (selection_key < best_tuple):
+                best_tuple, best_k, best_w = selection_key, k, w
+
+        print(
+            "[kNN] Tuning results: "
+            + ", ".join(
+                f"k={r['k']}: KS={r['ks_mean']:.3f}, ESS={int(r['ess'])}"
+                + ("" if r["feasible"] else " (infeasible)")
+                for r in results
+            )
+        )
+
+        summary = {
+            "candidates": candidates,
+            "results": results,
+            "best_k": best_k,
+            "ess_floor": self.cfg.ess_floor,
+        }
+        return best_k, best_w, summary
+
+
+# ---------------------------------------------------------------------
+# HDF5 loading
+# ---------------------------------------------------------------------
+
+
+def load_multiscale_overdensities(
+    parent_path: str,
+    sample_path: str,
+) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]:
+    """Load log10(1 + delta) overdensities from parent and sample HDF5 files.
+
+    The set of kernels is defined by the *sample* file. For every group
+    ``Grids/Kernel_N`` present in the sample file:
+
+    * The same group must exist in the parent file.
+    * Each group must contain a dataset ``GridPointOverDensities``.
+    * The kernel radius is read from the group's ``kernel_radius`` attribute.
+    * The values are read from both files and transformed as
+      ``log10(1 + delta)``.
+
+    The returned dictionaries map the smoothing scale (from the kernel_radius
+    attribute) to a 1D NumPy array of overdensities.
+
+    Args:
+        parent_path: Path to the parent/global population HDF5 file.
+        sample_path: Path to the sample/observed population HDF5 file.
+
+    Returns:
+        Tuple ``(parent_data, sample_data)``, where each is a mapping from
+        scale to 1D NumPy arrays.
+
+    Raises:
+        KeyError: If required groups/datasets are missing.
+        ValueError: If no kernels are found; if kernel_radius attribute is
+            missing; or if the parent is missing kernels found in the sample
+            file.
+    """
+    parent_data: Dict[float, np.ndarray] = {}
+    sample_data: Dict[float, np.ndarray] = {}
+
+    with (
+        h5py.File(sample_path, "r") as f_sample,
+        h5py.File(parent_path, "r") as f_parent,
+    ):
+        if "Grids" not in f_sample:
+            raise KeyError("Sample file is missing group 'Grids'.")
+        if "Grids" not in f_parent:
+            raise KeyError("Parent file is missing group 'Grids'.")
+
+        sample_grids = f_sample["Grids"]
+        parent_grids = f_parent["Grids"]
+
+        kernel_keys = [k for k in sample_grids.keys() if k.startswith("Kernel_")]
+        if not kernel_keys:
+            raise ValueError(
+                "No kernel groups found in sample file under 'Grids'. "
+                "Expected groups named like 'Kernel_0', 'Kernel_1', etc."
+            )
+
+        missing = [k for k in kernel_keys if k not in parent_grids]
+        if missing:
+            raise ValueError(
+                "Parent file is missing the following kernel groups "
+                f"present in the sample file: {', '.join(missing)}"
+            )
+
+        print(
+            f"[HDF5] Found {len(kernel_keys)} kernels in sample file: "
+            + ", ".join(kernel_keys)
+        )
+
+        for key in kernel_keys:
+            # Read kernel radius from group attribute
+            try:
+                scale = float(sample_grids[key].attrs["kernel_radius"])
+            except KeyError as exc:
+                raise ValueError(
+                    f"Kernel group '{key}' is missing 'kernel_radius' attribute. "
+                    "Cannot determine smoothing scale."
+                ) from exc
+
+            try:
+                parent_ds = parent_grids[key]["GridPointOverDensities"][...]
+                sample_ds = sample_grids[key]["GridPointOverDensities"][...]
+            except KeyError as exc:
+                raise KeyError(
+                    f"Missing 'GridPointOverDensities' for kernel '{key}'."
+                ) from exc
+
+            # Validate overdensity values before log transform
+            parent_min = np.min(parent_ds)
+            sample_min = np.min(sample_ds)
+            if parent_min <= -1.0:
+                print(
+                    f"[WARNING] Parent overdensities for kernel '{key}' (R={scale}) "
+                    f"contain δ <= -1 (min={parent_min:.3f}). This will produce NaN "
+                    f"after log10(1 + δ) transform."
+                )
+            if sample_min <= -1.0:
+                print(
+                    f"[WARNING] Sample overdensities for kernel '{key}' (R={scale}) "
+                    f"contain δ <= -1 (min={sample_min:.3f}). This will produce NaN "
+                    f"after log10(1 + δ) transform."
+                )
+
+            parent_data[scale] = np.log10(np.asarray(parent_ds, dtype=float) + 1.0)
+            sample_data[scale] = np.log10(np.asarray(sample_ds, dtype=float) + 1.0)
+
+    return parent_data, sample_data
+
+
+# ---------------------------------------------------------------------
+# Plotting and simple diagnostics
+# ---------------------------------------------------------------------
+
+
+def plot_diagnostics(
+    parent_samples: Dict[float, np.ndarray],
+    sample_samples: Dict[float, np.ndarray],
+    weights: np.ndarray,
+    info: Dict,
+    save_path: Optional[str] = None,
+) -> plt.Figure:
+    """Plot simple diagnostics: 1D distributions and weight histogram.
+
+    The plot has three panels:
+
+    * Parent vs sample vs weighted sample for the first scale.
+    * Parent vs sample vs weighted sample for the second scale (if present).
+    * Histogram of the weights.
+
+    Args:
+        parent_samples: Mapping from scale to parent overdensity array.
+        sample_samples: Mapping from scale to sample overdensity array.
+        weights: Weights for the sample population.
+        info: Diagnostics dictionary returned by :meth:`KNNReweighter.fit`.
+        save_path: Optional path to save the diagnostic PNG.
+
+    Returns:
+        The Matplotlib Figure instance.
+    """
+    scales = sorted(parent_samples.keys())
+    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
+    axes = axes.flatten()
+
+    def _plot_1d(ax, scale: float) -> None:
+        parent = parent_samples[scale]
+        sample = sample_samples[scale]
+        bins = np.linspace(
+            min(parent.min(), sample.min()), max(parent.max(), sample.max()), 50
+        )
+        ax.hist(
+            parent,
+            bins=bins,
+            density=True,
+            label="Parent",
+            histtype="step",
+            lw=2,
+        )
+        ax.hist(
+            sample,
+            bins=bins,
+            density=True,
+            label="Sample (raw)",
+            histtype="step",
+            lw=2,
+        )
+        ax.hist(
+            sample,
+            bins=bins,
+            weights=weights,
+            density=True,
+            label="Sample (weighted)",
+            histtype="step",
+            lw=2,
+        )
+        ax.set_xlabel(f"log10(1 + δ) (R={scale} Mpc/h)")
+        ax.set_ylabel("PDF")
+        ax.set_title(f"Scale {scale} Mpc/h")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    # First scale.
+    _plot_1d(axes[0], scales[0])
+
+    # Second scale (if available).
+    if len(scales) > 1:
+        _plot_1d(axes[1], scales[1])
+    else:
+        axes[1].axis("off")
+
+    # Weight histogram.
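+    # A heavy right tail here means a few points dominate the reweighting;
+    # the ESS quoted in the panel title quantifies that (ESS much smaller
+    # than M implies high weight variance).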
+ axw = axes[2] + axw.hist(weights, bins=60, alpha=0.85, edgecolor="black") + axw.axvline( + np.mean(weights), + color="red", + linestyle="--", + lw=2, + label=f"Mean={np.mean(weights):.3f}", + ) + axw.set_xlabel("Weight") + axw.set_ylabel("Count") + ess = int(info["ess"]) + M = info["sample_samples"] + ks_mean = info["ks_mean"] + axw.set_title(f"w distribution | ESS={ess}/{M:,}, KS(mean)={ks_mean:.3f}") + axw.legend() + axw.grid(True, alpha=0.3) + + plt.tight_layout() + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print(f"[plot] Saved: {save_path}") + return fig + + +# --------------------------------------------------------------------- +# Pipeline +# --------------------------------------------------------------------- + + +def run_full_analysis( + parent_samples: Dict[float, np.ndarray], + sample_samples: Dict[float, np.ndarray], + show_plots: bool = True, + save_path: Optional[str] = None, + **knn_kwargs, +) -> Tuple[np.ndarray, Dict]: + """Run the full k-NN reweighting and simple diagnostics. + + Args: + parent_samples: Mapping from scale to parent overdensity array. + sample_samples: Mapping from scale to sample overdensity array. + show_plots: If True, display the diagnostic plot. + save_path: Optional path to save the diagnostic PNG. + **knn_kwargs: Keyword arguments used to configure + :class:`KNNReweighterConfig`. + + Returns: + Tuple ``(weights, info)``: + * ``weights``: 1D array of weights for the sample population. + * ``info``: Diagnostics dictionary from + :meth:`KNNReweighter.fit`. + """ + print("=== Multi-Scale Environment Weighting (k-NN, auto-k) ===") + + cfg = KNNReweighterConfig(**knn_kwargs) + rw = KNNReweighter(cfg) + + weights, info = rw.fit(parent_samples, sample_samples) + + t = info.get("tuning") + tune_str = "" + if t: + tune_str = ( + f" | auto-k over {t['candidates']} " + f"-> best={t['best_k']} (ESS floor {int(t['ess_floor'] * 100)}%)" + ) + + print( + "\n[Summary] " + f"k={info['k']}, KS(mean)={info['ks_mean']:.4f}, " + f"ESS={int(info['ess'])}/{info['sample_samples']}, " + f"clip={info['clip_range']}, " + f"clipped={100 * info['clip_fraction']:.1f}%" + f"{tune_str}" + ) + + if show_plots: + print("\n[Plot] Rendering diagnostics...") + _ = plot_diagnostics(parent_samples, sample_samples, weights, info, save_path) + plt.show() + + return weights, info + + +# --------------------------------------------------------------------- +# __main__ (argparse) +# --------------------------------------------------------------------- + + +def _build_arg_parser() -> argparse.ArgumentParser: + """Create the command-line argument parser. + + Returns: + Configured :class:`argparse.ArgumentParser` instance. + """ + parser = argparse.ArgumentParser( + description=( + "k-NN density-ratio weighting for multi-scale environments.\n\n" + "The sample file defines which Grids/Kernel_* groups are used; " + "the same kernels must exist in the parent file." + ) + ) + + # Required I/O. + parser.add_argument( + "--parent", + type=str, + required=True, + help="Path to parent/global population HDF5 file.", + ) + parser.add_argument( + "--sample", + type=str, + required=True, + help="Path to sample/observed population HDF5 file.", + ) + + # k / auto-tuning. + parser.add_argument( + "--k", + type=int, + default=50, + help="Default number of neighbors (used if --autotune_k is empty).", + ) + parser.add_argument( + "--autotune_k", + type=str, + default="", + help=( + "Comma-separated candidate k values for auto-tuning, " + "e.g. '35,50,80,100'. 
If empty, k is used directly." + ), + ) + parser.add_argument( + "--ess_floor", + type=float, + default=0.20, + help="Minimum ESS fraction (ESS / M) for a candidate k to be feasible.", + ) + + # Preprocessing. + parser.add_argument( + "--preprocess", + choices=["whiten", "standardize", "none"], + default="whiten", + help="Feature preprocessing mode.", + ) + parser.add_argument( + "--standardize_by_parent", + action="store_true", + help="If preprocess=standardize, use parent stats (default True).", + ) + parser.add_argument( + "--standardize_by_pooled", + dest="standardize_by_parent", + action="store_false", + help="If preprocess=standardize, use pooled parent+sample stats.", + ) + parser.set_defaults(standardize_by_parent=True) + + # Neighbors. + parser.add_argument( + "--metric", + type=str, + default="minkowski", + help="Distance metric (e.g., 'minkowski', 'euclidean').", + ) + parser.add_argument( + "--algorithm", + type=str, + default="auto", + help="NearestNeighbors algorithm (scikit-learn).", + ) + parser.add_argument( + "--leaf_size", + type=int, + default=40, + help="Leaf size for tree-based nearest-neighbor algorithms.", + ) + + # Stabilization. + parser.add_argument( + "--tau", + type=float, + default=1.0, + help="Temperature exponent (weights := weights ** tau).", + ) + parser.add_argument( + "--clip_min", + type=float, + default=0.01, + help="Lower clip bound for weights.", + ) + parser.add_argument( + "--clip_max", + type=float, + default=50.0, + help="Upper clip bound for weights.", + ) + + # Misc / plotting. + parser.add_argument( + "--random_state", + type=int, + default=42, + help="Random seed for diagnostics and KS subsampling.", + ) + parser.add_argument( + "--save", + type=str, + default=None, + help="Path to save diagnostic PNG.", + ) + parser.add_argument( + "--no_plots", + action="store_true", + help="Disable plotting.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save computed weights as .npy file.", + ) + + return parser + + +if __name__ == "__main__": + parser = _build_arg_parser() + args = parser.parse_args() + + # Load data from the two HDF5 files. + parent_data, sample_data = load_multiscale_overdensities( + parent_path=args.parent, + sample_path=args.sample, + ) + + # Remove NaNs consistently within each population. + parent_data = drop_nan_rows_per_scale(parent_data) + sample_data = drop_nan_rows_per_scale(sample_data) + + # Build kwargs for run_full_analysis. + auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] + kwargs = dict( + k=args.k, + autotune_k=auto_list if auto_list else None, + ess_floor=args.ess_floor, + preprocess=args.preprocess, + standardize_by_parent=args.standardize_by_parent, + algorithm=args.algorithm, + metric=args.metric, + leaf_size=args.leaf_size, + tau=args.tau, + clip_range=(args.clip_min, args.clip_max), + random_state=args.random_state, + ) + + # Run pipeline. + weights, info = run_full_analysis( + parent_data, + sample_data, + show_plots=not args.no_plots, + save_path=args.save, + **kwargs, + ) + + # Save weights to file if requested. + if args.output: + np.save(args.output, weights) + print(f"[Output] Saved weights to: {args.output}") + + print(f"\n[Done] Generated {len(weights)} weights.") diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py new file mode 100644 index 0000000..e679d2a --- /dev/null +++ b/weighting/test_hmf_weighting.py @@ -0,0 +1,634 @@ +"""Test k-NN weighting on halo mass functions. 
+ +This script evaluates how well k-NN density-ratio weighting can reproduce +the true halo mass function (HMF) from a small subsample. + +Workflow: +1. Load multi-scale overdensity features from parent (global) and sample (full box) HDF5 files +2. Load halo masses for the full sample population (the truth) +3. Subsample N halos randomly from the full sample +4. Apply k-NN weighting to the subsample using parent environment as reference +5. Compare: Truth (full sample) vs Subsample (raw) vs Subsample (weighted) + +The goal is to test whether a small weighted subsample can reproduce the +true HMF of the full sample. + +The script produces diagnostic plots showing: +- HMF comparison: Truth vs raw subsample vs weighted subsample +- Environment distributions at different scales +- Weight distribution statistics + +Example +------- + python test_hmf_weighting.py \\ + --parent parent_grids.hdf5 \\ + --sample sample_grids.hdf5 \\ + --masses sample_halo_masses.txt \\ + --subsample 10000 \\ + --autotune_k 35,50,80,100 \\ + --output weighted_masses.npy \\ + --save hmf_diagnostics.png +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Dict, Optional, Tuple + +import h5py +import matplotlib.pyplot as plt +import numpy as np + +# Import the k-NN weighting machinery +from knn_weighter import ( + KNNReweighter, + KNNReweighterConfig, + drop_nan_rows_per_scale, + effective_sample_size, + load_multiscale_overdensities, +) + + +def load_halo_masses(mass_file: str) -> np.ndarray: + """Load halo masses from a text file. + + Args: + mass_file: Path to text file containing one mass per line. + + Returns: + 1D array of halo masses. + + Raises: + ValueError: If file is empty or contains invalid data. + """ + masses = np.loadtxt(mass_file) + if masses.ndim != 1: + raise ValueError( + f"Mass file should contain a single column, got shape {masses.shape}" + ) + if len(masses) == 0: + raise ValueError("Mass file is empty") + + print(f"[Masses] Loaded {len(masses):,} halo masses from {mass_file}") + print(f"[Masses] Mass range: {masses.min():.2e} - {masses.max():.2e}") + + return masses + + +def subsample_data( + sample_overdens: Dict[float, np.ndarray], + masses: np.ndarray, + n_subsample: int, + random_state: int = 42, +) -> Tuple[Dict[float, np.ndarray], np.ndarray, np.ndarray]: + """Randomly subsample halos and their environment features. + + Args: + sample_overdens: Mapping from scale to sample overdensity arrays. + masses: Halo masses (same length as overdensity arrays). + n_subsample: Number of halos to randomly select. + random_state: Random seed for reproducibility. + + Returns: + Tuple (subsampled_overdens, subsampled_masses, indices). + + Raises: + ValueError: If n_subsample > sample size. 
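+
+    Example (names and sizes are hypothetical):
+        >>> sub_env, sub_masses, idx = subsample_data(env, masses, 10_000)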
+ """ + n_total = len(masses) + if n_subsample > n_total: + raise ValueError( + f"Requested subsample size {n_subsample:,} exceeds " + f"sample size {n_total:,}" + ) + + rng = np.random.default_rng(random_state) + indices = rng.choice(n_total, size=n_subsample, replace=False) + + subsampled_overdens = { + scale: arr[indices] for scale, arr in sample_overdens.items() + } + subsampled_masses = masses[indices] + + print( + f"[Subsample] Randomly selected {n_subsample:,}/{n_total:,} halos " + f"(seed={random_state})" + ) + print( + f"[Subsample] Subsampled mass range: " + f"{subsampled_masses.min():.2e} - {subsampled_masses.max():.2e}" + ) + + return subsampled_overdens, subsampled_masses, indices + + +def compute_hmf( + masses: np.ndarray, + weights: Optional[np.ndarray] = None, + n_bins: int = 20, + mass_range: Optional[Tuple[float, float]] = None, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Compute halo mass function (differential number density). + + Args: + masses: Halo masses. + weights: Optional weights for each halo. + n_bins: Number of mass bins. + mass_range: Optional (min, max) mass range. If None, uses data range. + + Returns: + Tuple (bin_centers, dn_dlogM, bin_edges) where dn_dlogM is the + differential number density per logarithmic mass bin. + """ + if weights is None: + weights = np.ones_like(masses) + + if mass_range is None: + mass_range = (masses.min(), masses.max()) + + log_masses = np.log10(masses) + log_range = (np.log10(mass_range[0]), np.log10(mass_range[1])) + + counts, bin_edges = np.histogram( + log_masses, bins=n_bins, range=log_range, weights=weights + ) + bin_widths = np.diff(bin_edges) + bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1]) + + # Differential number density: dn/dlog10(M) + dn_dlogM = counts / bin_widths + # Normalize to total number + dn_dlogM *= len(masses) / dn_dlogM.sum() / bin_widths.mean() + + return bin_centers, dn_dlogM, bin_edges + + +def plot_hmf_comparison( + truth_masses: np.ndarray, + subsample_masses: np.ndarray, + weights: np.ndarray, + info: Dict, + parent_overdens: Dict[float, np.ndarray], + subsample_overdens: Dict[float, np.ndarray], + save_path: Optional[str] = None, +) -> plt.Figure: + """Plot HMF comparison and diagnostics. + + Creates a multi-panel figure with: + - HMF comparison (truth vs raw subsample vs weighted subsample) + - Environment distributions at two scales + - Weight distribution histogram + + Args: + truth_masses: True halo masses (full sample). + subsample_masses: Subsampled halo masses. + weights: Weights for subsampled halos. + info: Diagnostics from KNNReweighter. + parent_overdens: Parent overdensity features (global reference). + subsample_overdens: Subsample overdensity features. + save_path: Optional path to save figure. + + Returns: + Figure instance. 
+    """
+    fig = plt.figure(figsize=(18, 10))
+    gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)
+
+    scales = sorted(parent_overdens.keys())
+
+    # Determine common mass range for HMF
+    all_masses = np.concatenate([truth_masses, subsample_masses])
+    mass_range = (all_masses.min(), all_masses.max())
+
+    # --- Panel 1: Halo Mass Function ---
+    ax_hmf = fig.add_subplot(gs[0, :])
+
+    # Compute HMFs
+    bc_truth, hmf_truth, _ = compute_hmf(
+        truth_masses, n_bins=25, mass_range=mass_range
+    )
+    bc_subsample, hmf_subsample, _ = compute_hmf(
+        subsample_masses, n_bins=25, mass_range=mass_range
+    )
+    bc_weighted, hmf_weighted, _ = compute_hmf(
+        subsample_masses, weights=weights, n_bins=25, mass_range=mass_range
+    )
+
+    ax_hmf.plot(
+        bc_truth, hmf_truth, "k-", lw=2.5, label=f"Truth (full sample, N={len(truth_masses):,})"
+    )
+    ax_hmf.plot(
+        bc_subsample,
+        hmf_subsample,
+        "r--",
+        lw=2,
+        alpha=0.7,
+        label=f"Subsample (raw, N={len(subsample_masses):,})",
+    )
+    ax_hmf.plot(
+        bc_weighted,
+        hmf_weighted,
+        "b-",
+        lw=2,
+        alpha=0.8,
+        label=f"Subsample (weighted, ESS={int(info['ess'])}/{len(subsample_masses):,})",
+    )
+
+    ax_hmf.set_xlabel(r"$\log_{10}(M_{\rm halo} \, [M_\odot])$", fontsize=13)
+    ax_hmf.set_ylabel(r"$dn / d\log_{10}M$ (normalized)", fontsize=13)
+    ax_hmf.set_title(
+        f"Halo Mass Function | k={info['k']}, KS(env)={info['ks_mean']:.3f}",
+        fontsize=14,
+        fontweight="bold",
+    )
+    ax_hmf.legend(fontsize=11, loc="upper right")
+    ax_hmf.grid(True, alpha=0.3)
+    ax_hmf.set_yscale("log")
+
+    # --- Panel 2: Environment at scale 0 ---
+    ax_env1 = fig.add_subplot(gs[1, 0])
+    _plot_environment_dist(
+        ax_env1, parent_overdens[scales[0]], subsample_overdens[scales[0]], weights, scales[0]
+    )
+
+    # --- Panel 3: Environment at scale 1 (if available) ---
+    ax_env2 = fig.add_subplot(gs[1, 1])
+    if len(scales) > 1:
+        _plot_environment_dist(
+            ax_env2,
+            parent_overdens[scales[1]],
+            subsample_overdens[scales[1]],
+            weights,
+            scales[1],
+        )
+    else:
+        ax_env2.text(
+            0.5, 0.5, "Only 1 scale available", ha="center", va="center", fontsize=12
+        )
+        ax_env2.axis("off")
+
+    # --- Panel 4: Weight distribution ---
+    ax_weights = fig.add_subplot(gs[1, 2])
+    ax_weights.hist(weights, bins=50, alpha=0.85, edgecolor="black", color="steelblue")
+    ax_weights.axvline(
+        np.mean(weights),
+        color="red",
+        linestyle="--",
+        lw=2,
+        label=f"Mean={np.mean(weights):.2f}",
+    )
+    ax_weights.axvline(
+        np.median(weights),
+        color="orange",
+        linestyle=":",
+        lw=2,
+        label=f"Median={np.median(weights):.2f}",
+    )
+    ax_weights.set_xlabel("Weight", fontsize=12)
+    ax_weights.set_ylabel("Count", fontsize=12)
+    ax_weights.set_title(
+        f"Weight Distribution\nESS={int(info['ess'])}/{len(weights):,}, "
+        f"clip={100*info['clip_fraction']:.1f}%",
+        fontsize=11,
+    )
+    ax_weights.legend(fontsize=10)
+    ax_weights.grid(True, alpha=0.3)
+
+    plt.suptitle(
+        "k-NN Halo Mass Function Weighting Test",
+        fontsize=16,
+        fontweight="bold",
+        y=0.995,
+    )
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches="tight")
+        print(f"[Plot] Saved: {save_path}")
+
+    return fig
+
+
+def _plot_environment_dist(
+    ax: plt.Axes,
+    parent_env: np.ndarray,
+    subsample_env: np.ndarray,
+    weights: np.ndarray,
+    scale: float,
+) -> None:
+    """Plot environment distribution at a single scale.
+
+    Args:
+        ax: Matplotlib axes.
+        parent_env: Parent (global) overdensities.
+        subsample_env: Subsample overdensities.
+        weights: Subsample weights.
+        scale: Smoothing scale (Mpc/h).
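+
+    Note:
+        The weighted histogram should track the parent curve; a persistent
+        gap at any scale suggests the k-NN weights are under-correcting.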
+ """ + bins = np.linspace( + min(parent_env.min(), subsample_env.min()), + max(parent_env.max(), subsample_env.max()), + 40, + ) + + ax.hist( + parent_env, + bins=bins, + density=True, + alpha=0.6, + label="Parent (global)", + histtype="stepfilled", + color="gray", + ) + ax.hist( + subsample_env, + bins=bins, + density=True, + alpha=0.7, + label="Subsample (raw)", + histtype="step", + lw=2, + color="red", + ) + ax.hist( + subsample_env, + bins=bins, + weights=weights, + density=True, + alpha=0.8, + label="Subsample (weighted)", + histtype="step", + lw=2, + color="blue", + ) + + ax.set_xlabel(f"log10(1 + δ) at R={scale:.2f} Mpc/h", fontsize=11) + ax.set_ylabel("PDF", fontsize=11) + ax.set_title(f"Environment (scale {scale:.2f})", fontsize=11) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + + +def print_summary_statistics( + truth_masses: np.ndarray, + subsample_masses: np.ndarray, + weights: np.ndarray, + info: Dict, +) -> None: + """Print summary statistics for the weighting test. + + Args: + truth_masses: True halo masses (full sample). + subsample_masses: Subsampled halo masses. + weights: Subsample weights. + info: Diagnostics from KNNReweighter. + """ + print("\n" + "=" * 70) + print("SUMMARY STATISTICS") + print("=" * 70) + + print(f"\nPopulation Sizes:") + print(f" Truth (full sample): {len(truth_masses):,} halos") + print(f" Subsample: {len(subsample_masses):,} halos ({100*len(subsample_masses)/len(truth_masses):.1f}% of truth)") + print(f" ESS: {int(info['ess']):,} (effective sample size)") + print(f" ESS/N_subsample: {info['ess']/len(subsample_masses):.1%}") + + print(f"\nk-NN Configuration:") + print(f" k: {info['k']}") + print(f" Preprocess: {info['preprocess']}") + print(f" Metric: {info['metric']}") + print(f" Tau: {info['tau']}") + + print(f"\nEnvironment Matching (KS distance):") + print(f" Mean across scales: {info['ks_mean']:.4f}") + for i, (scale, ks) in enumerate(zip(info['scales'], info['ks_per_dim'])): + print(f" Scale {i} (R={scale:.2f}): {ks:.4f}") + + print(f"\nWeight Statistics:") + print(f" Mean: {info['mean_weight']:.3f}") + print(f" Std: {info['std_weight']:.3f}") + print(f" Min: {info['min_weight']:.3f}") + print(f" Max: {info['max_weight']:.3f}") + print(f" Clip frac: {100*info['clip_fraction']:.1f}%") + print(f" Clip range: {info['clip_range']}") + + # Mass statistics + print(f"\nMass Range (log10 Msun):") + print(f" Truth: {np.log10(truth_masses.min()):.2f} - {np.log10(truth_masses.max()):.2f}") + print(f" Subsample: {np.log10(subsample_masses.min()):.2f} - {np.log10(subsample_masses.max()):.2f}") + + # Weighted vs unweighted mean log mass + mean_log_m_truth = np.mean(np.log10(truth_masses)) + mean_log_m_subsample = np.mean(np.log10(subsample_masses)) + mean_log_m_weighted = np.average(np.log10(subsample_masses), weights=weights) + + print(f"\nMean log10(M) [Msun]:") + print(f" Truth: {mean_log_m_truth:.3f}") + print(f" Subsample: {mean_log_m_subsample:.3f} (bias: {mean_log_m_subsample - mean_log_m_truth:+.3f})") + print(f" Weighted: {mean_log_m_weighted:.3f} (bias: {mean_log_m_weighted - mean_log_m_truth:+.3f})") + + print("=" * 70 + "\n") + + +def main(): + """Main execution function.""" + parser = argparse.ArgumentParser( + description="Test k-NN weighting on halo mass functions", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required inputs + parser.add_argument( + "--parent", + type=str, + required=True, + help="Path to parent/global population HDF5 file (gridded overdensities, global reference)", + ) + 
parser.add_argument( + "--sample", + type=str, + required=True, + help="Path to full sample HDF5 file (gridded overdensities, this is the truth)", + ) + parser.add_argument( + "--masses", + type=str, + required=True, + help="Path to text file with full sample halo masses (one per line, truth HMF)", + ) + + # Subsampling + parser.add_argument( + "--subsample", + type=int, + required=True, + help="Number of halos to randomly select from full sample for weighting test", + ) + + # k-NN parameters + parser.add_argument( + "--k", + type=int, + default=50, + help="Number of neighbors (default: 50)", + ) + parser.add_argument( + "--autotune_k", + type=str, + default="", + help="Comma-separated k values for auto-tuning (e.g., '35,50,80,100')", + ) + parser.add_argument( + "--ess_floor", + type=float, + default=0.20, + help="Minimum ESS fraction for auto-tuning (default: 0.20)", + ) + parser.add_argument( + "--preprocess", + choices=["whiten", "standardize", "none"], + default="whiten", + help="Feature preprocessing mode (default: whiten)", + ) + parser.add_argument( + "--tau", + type=float, + default=1.0, + help="Temperature parameter for weights (default: 1.0)", + ) + parser.add_argument( + "--clip_min", + type=float, + default=0.01, + help="Lower clip bound for weights (default: 0.01)", + ) + parser.add_argument( + "--clip_max", + type=float, + default=50.0, + help="Upper clip bound for weights (default: 50.0)", + ) + + # Output + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save weighted masses as .npy file", + ) + parser.add_argument( + "--save", + type=str, + default=None, + help="Path to save diagnostic plot", + ) + parser.add_argument( + "--no_plots", + action="store_true", + help="Disable interactive plotting", + ) + parser.add_argument( + "--random_state", + type=int, + default=42, + help="Random seed for subsampling and diagnostics (default: 42)", + ) + + args = parser.parse_args() + + # Load overdensity features + print("\n" + "=" * 70) + print("LOADING DATA") + print("=" * 70 + "\n") + + parent_overdens, sample_overdens = load_multiscale_overdensities( + parent_path=args.parent, + sample_path=args.sample, + ) + + # Load halo masses (for the full sample = truth) + truth_masses = load_halo_masses(args.masses) + + # Check consistency + first_scale = next(iter(sample_overdens.values())) + if len(first_scale) != len(truth_masses): + raise ValueError( + f"Mismatch: sample overdensities have {len(first_scale):,} entries " + f"but mass file has {len(truth_masses):,} entries" + ) + + # Drop NaNs + parent_overdens = drop_nan_rows_per_scale(parent_overdens) + sample_overdens = drop_nan_rows_per_scale(sample_overdens) + + # After NaN drop, masses may need filtering + first_scale_after = next(iter(sample_overdens.values())) + if len(first_scale_after) != len(first_scale): + print( + f"[WARNING] NaN filtering reduced sample from {len(first_scale):,} " + f"to {len(first_scale_after):,}. This requires the mass file to have " + f"matching NaN entries, which is not currently supported." + ) + print("[WARNING] Proceeding anyway - results may be incorrect!") + + # Subsample from the full sample + if args.subsample is None: + raise ValueError( + "--subsample is required. Specify how many halos to randomly select " + "from the full sample for weighting." 
+        )
+
+    subsample_overdens, subsample_masses, subsample_indices = subsample_data(
+        sample_overdens, truth_masses, args.subsample, args.random_state
+    )
+
+    # Run k-NN weighting
+    # Goal: weight subsample to match parent (global) environment distribution
+    print("\n" + "=" * 70)
+    print("COMPUTING WEIGHTS")
+    print("=" * 70 + "\n")
+
+    auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()]
+    cfg = KNNReweighterConfig(
+        k=args.k,
+        autotune_k=auto_list if auto_list else None,
+        ess_floor=args.ess_floor,
+        preprocess=args.preprocess,
+        tau=args.tau,
+        clip_range=(args.clip_min, args.clip_max),
+        random_state=args.random_state,
+    )
+
+    reweighter = KNNReweighter(cfg)
+    weights, info = reweighter.fit(parent_overdens, subsample_overdens)
+
+    # Print summary
+    print_summary_statistics(truth_masses, subsample_masses, weights, info)
+
+    # Save weighted masses if requested
+    if args.output:
+        weighted_data = np.column_stack([subsample_masses, weights])
+        np.save(args.output, weighted_data)
+        print(f"[Output] Saved (subsample_masses, weights) to: {args.output}")
+
+    # Plot results
+    if not args.no_plots:
+        print("\n[Plot] Generating diagnostics...\n")
+        fig = plot_hmf_comparison(
+            truth_masses,
+            subsample_masses,
+            weights,
+            info,
+            parent_overdens,
+            subsample_overdens,
+            save_path=args.save,
+        )
+        plt.show()
+
+    print("\n[Done] HMF weighting test complete.\n")
+
+
+if __name__ == "__main__":
+    main()

From a3d5142e058271aa7e0b320b19e4981809b4240d Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:47:34 +0000
Subject: [PATCH 03/23] Correcting attribute

---
 weighting/knn_weighter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index 9f25a47..d4c2ad9 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -626,11 +626,11 @@ def load_multiscale_overdensities(
 
     * The same group must exist in the parent file.
     * Each group must contain a dataset ``GridPointOverDensities``.
-    * The kernel radius is read from the group's ``kernel_radius`` attribute.
+    * The kernel radius is read from the group's ``KernelRadius`` attribute.
     * The values are read from both files and transformed as
       ``log10(1 + delta)``.
 
-    The returned dictionaries map the smoothing scale (from the kernel_radius
+    The returned dictionaries map the smoothing scale (from the KernelRadius
     attribute) to a 1D NumPy array of overdensities.
 
     Args:
@@ -643,7 +643,7 @@ def load_multiscale_overdensities(
 
     Raises:
         KeyError: If required groups/datasets are missing.
-        ValueError: If no kernels are found; if kernel_radius attribute is
+        ValueError: If no kernels are found; if KernelRadius attribute is
             missing; or if the parent is missing kernels found in the sample
             file.
     """
@@ -684,10 +684,10 @@ def load_multiscale_overdensities(
         for key in kernel_keys:
             # Read kernel radius from group attribute
             try:
-                scale = float(sample_grids[key].attrs["kernel_radius"])
+                scale = float(sample_grids[key].attrs["KernelRadius"])
             except KeyError as exc:
                 raise ValueError(
-                    f"Kernel group '{key}' is missing 'kernel_radius' attribute. "
+                    f"Kernel group '{key}' is missing 'KernelRadius' attribute. "
                     "Cannot determine smoothing scale."
                 ) from exc

From 75f178eb80f1afd3d165921a6d55a19272f09736 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 18:51:43 +0000
Subject: [PATCH 04/23] Clean out nans

---
 weighting/test_hmf_weighting.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py
index e679d2a..473c0a9 100644
--- a/weighting/test_hmf_weighting.py
+++ b/weighting/test_hmf_weighting.py
@@ -559,21 +559,30 @@ def main():
             f"but mass file has {len(truth_masses):,} entries"
         )
 
-    # Drop NaNs
+    # Clean parent data
+    print("\n[Cleaning] Removing NaN entries from parent...")
     parent_overdens = drop_nan_rows_per_scale(parent_overdens)
-    sample_overdens = drop_nan_rows_per_scale(sample_overdens)
 
-    # After NaN drop, masses may need filtering
-    first_scale_after = next(iter(sample_overdens.values()))
-    if len(first_scale_after) != len(first_scale):
+    # Clean full sample data - need to synchronize masses with NaN filtering
+    print("[Cleaning] Removing NaN entries from full sample...")
+    n_before = len(next(iter(sample_overdens.values())))
+
+    # Build NaN mask from overdensities
+    mask = np.ones(n_before, dtype=bool)
+    for arr in sample_overdens.values():
+        mask &= ~np.isnan(arr)
+
+    n_dropped = n_before - np.sum(mask)
+    if n_dropped > 0:
         print(
-            f"[WARNING] NaN filtering reduced sample from {len(first_scale):,} "
-            f"to {len(first_scale_after):,}. This requires the mass file to have "
-            f"matching NaN entries, which is not currently supported."
+            f"[NaN] Dropped {n_dropped:,}/{n_before:,} "
+            f"({100.0 * n_dropped / n_before:.2f}%) entries containing NaN."
         )
-        print("[WARNING] Proceeding anyway - results may be incorrect!")
+    # Apply mask to both overdensities and masses
+    sample_overdens = {scale: arr[mask] for scale, arr in sample_overdens.items()}
+    truth_masses = truth_masses[mask]
 
-    # Subsample from the full sample
+    # Subsample from the cleaned full sample
     if args.subsample is None:
         raise ValueError(
             "--subsample is required. Specify how many halos to randomly select "

From ed8c4f3642b25f7402bae55931d9233761b1de40 Mon Sep 17 00:00:00 2001
From: wjr21
Date: Wed, 19 Nov 2025 19:10:46 +0000
Subject: [PATCH 05/23] Sanitising inputs

---
 weighting/knn_weighter.py       | 83 ++++++++++++++++++++++++---------
 weighting/test_hmf_weighting.py | 41 +++++++++-------
 2 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py
index d4c2ad9..ace80f9 100644
--- a/weighting/knn_weighter.py
+++ b/weighting/knn_weighter.py
@@ -163,6 +163,56 @@ def whiten_by_parent(
     return (Xp - mu) @ Linv.T, (Xs - mu) @ Linv.T
 
 
+def filter_and_transform_overdensities(
+    overdens: Dict[float, np.ndarray],
+) -> Tuple[Dict[float, np.ndarray], np.ndarray]:
+    """Filter out -1 values and apply log10(1 + delta) transform.
+
+    Grid points with overdensity = -1 indicate no particles nearby.
+    These must be removed before applying the log transform to avoid NaN.
+
+    Args:
+        overdens: Mapping from scale to raw overdensity arrays (may contain -1).
+
+    Returns:
+        Tuple (transformed_overdens, mask) where:
+        - transformed_overdens: log10(1 + delta) for valid points
+        - mask: Boolean array indicating which rows were kept
+
+    Raises:
+        ValueError: If arrays have mismatched lengths.
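+
+    Example (names are hypothetical):
+        >>> clean, mask = filter_and_transform_overdensities(raw_overdens)
+        >>> masses = masses[mask]  # keep companion arrays synchronized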
+    """
+    if not overdens:
+        return overdens, np.array([], dtype=bool)
+
+    first = next(iter(overdens.values()))
+    length = len(first)
+
+    # Build mask: keep rows where ALL scales have delta > -1
+    mask = np.ones(length, dtype=bool)
+    for scale, arr in overdens.items():
+        if len(arr) != length:
+            raise ValueError(
+                "All arrays must have the same length; found mismatched lengths."
+            )
+        mask &= (arr > -1.0)
+
+    n_dropped = length - np.sum(mask)
+    if n_dropped > 0:
+        print(
+            f"[Filter] Dropped {n_dropped:,}/{length:,} "
+            f"({100.0 * n_dropped / length:.2f}%) grid points with delta <= -1 "
+            f"(no particles nearby)."
+        )
+
+    # Apply log transform to filtered data
+    transformed = {
+        scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()
+    }
+
+    return transformed, mask
+
+
 def drop_nan_rows_per_scale(
     samples: Dict[float, np.ndarray],
 ) -> Dict[float, np.ndarray]:
@@ -619,7 +669,7 @@ def load_multiscale_overdensities(
     parent_path: str,
     sample_path: str,
 ) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]:
-    """Load log10(1 + delta) overdensities from parent and sample HDF5 files.
+    """Load raw overdensities from parent and sample HDF5 files.
 
     The set of kernels is defined by the *sample* file. For every group
     ``Grids/Kernel_N`` present in the sample file:
@@ -627,11 +677,14 @@ def load_multiscale_overdensities(
 
     * The same group must exist in the parent file.
     * Each group must contain a dataset ``GridPointOverDensities``.
     * The kernel radius is read from the group's ``KernelRadius`` attribute.
-    * The values are read from both files and transformed as
-      ``log10(1 + delta)``.
+    * Raw overdensity values are returned (NOT log-transformed).
+
+    Note: Values of -1 indicate grid points with no particles nearby.
+    Use ``filter_and_transform_overdensities()`` to remove these and
+    apply the log10(1 + delta) transform.
 
     The returned dictionaries map the smoothing scale (from the KernelRadius
-    attribute) to a 1D NumPy array of overdensities.
+    attribute) to a 1D NumPy array of raw overdensities.
 
     Args:
         parent_path: Path to the parent/global population HDF5 file.
@@ -699,24 +752,10 @@ def load_multiscale_overdensities(
                     f"Missing 'GridPointOverDensities' for kernel '{key}'."
                 ) from exc
 
-            # Validate overdensity values before log transform
-            parent_min = np.min(parent_ds)
-            sample_min = np.min(sample_ds)
-            if parent_min <= -1.0:
-                print(
-                    f"[WARNING] Parent overdensities for kernel '{key}' (R={scale}) "
-                    f"contain δ <= -1 (min={parent_min:.3f}). This will produce NaN "
-                    f"after log10(1 + δ) transform."
-                )
-            if sample_min <= -1.0:
-                print(
-                    f"[WARNING] Sample overdensities for kernel '{key}' (R={scale}) "
-                    f"contain δ <= -1 (min={sample_min:.3f}). This will produce NaN "
-                    f"after log10(1 + δ) transform."
- ) - - parent_data[scale] = np.log10(np.asarray(parent_ds, dtype=float) + 1.0) - sample_data[scale] = np.log10(np.asarray(sample_ds, dtype=float) + 1.0) + # Store raw overdensities (will be filtered and transformed later) + # Note: -1 values indicate grid points with no particles nearby + parent_data[scale] = np.asarray(parent_ds, dtype=float) + sample_data[scale] = np.asarray(sample_ds, dtype=float) return parent_data, sample_data diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py index 473c0a9..574bd2a 100644 --- a/weighting/test_hmf_weighting.py +++ b/weighting/test_hmf_weighting.py @@ -47,6 +47,7 @@ KNNReweighterConfig, drop_nan_rows_per_scale, effective_sample_size, + filter_and_transform_overdensities, load_multiscale_overdensities, ) @@ -559,28 +560,32 @@ def main(): f"but mass file has {len(truth_masses):,} entries" ) - # Clean parent data - print("\n[Cleaning] Removing NaN entries from parent...") + # Filter and transform parent data (removes delta <= -1, applies log transform) + print("\n[Cleaning] Filtering and transforming parent overdensities...") + parent_overdens, _ = filter_and_transform_overdensities(parent_overdens) + + # Additional NaN check on parent (shouldn't be any after filtering) parent_overdens = drop_nan_rows_per_scale(parent_overdens) - # Clean full sample data - need to synchronize masses with NaN filtering - print("[Cleaning] Removing NaN entries from full sample...") - n_before = len(next(iter(sample_overdens.values()))) + # Filter and transform full sample data - need to synchronize with masses + print("[Cleaning] Filtering and transforming full sample overdensities...") + sample_overdens, mask = filter_and_transform_overdensities(sample_overdens) - # Build NaN mask from overdensities - mask = np.ones(n_before, dtype=bool) - for arr in sample_overdens.values(): - mask &= ~np.isnan(arr) + # Apply mask to masses to keep them synchronized + truth_masses = truth_masses[mask] - n_dropped = n_before - np.sum(mask) - if n_dropped > 0: - print( - f"[NaN] Dropped {n_dropped:,}/{n_before:,} " - f"({100.0 * n_dropped / n_before:.2f}%) entries containing NaN." 
-        )
-
-    # Apply mask to both overdensities and masses
-    sample_overdens = {scale: arr[mask] for scale, arr in sample_overdens.items()}
-    truth_masses = truth_masses[mask]
+    # Additional NaN check on sample (shouldn't be any after filtering)
+    n_before = len(next(iter(sample_overdens.values())))
+    mask_nan = np.ones(n_before, dtype=bool)
+    for arr in sample_overdens.values():
+        mask_nan &= ~np.isnan(arr)
+
+    # Apply the same mask to both overdensities and masses so they stay
+    # synchronized even if the transform produced stray NaNs
+    sample_overdens = {scale: arr[mask_nan] for scale, arr in sample_overdens.items()}
+    truth_masses = truth_masses[mask_nan]
+
+    n_after = int(np.sum(mask_nan))
+    if n_after != n_before:
+        print(f"[WARNING] Dropped {n_before - n_after} additional NaN values after transform!")
 
     # Subsample from the cleaned full sample
     if args.subsample is None:
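The synchronized filtering above is easy to get wrong, so here is a minimal standalone sketch of the intended behaviour. The names `overdens` and `masses` are illustrative stand-ins for the sample overdensity dictionary and its matching mass array, not the module's API.

import numpy as np

def filter_with_masses(overdens, masses):
    """Drop entries with delta <= -1 in any scale, keeping masses aligned."""
    mask = np.ones(len(masses), dtype=bool)
    for arr in overdens.values():
        mask &= (arr > -1.0)
    filtered = {s: np.log10(arr[mask] + 1.0) for s, arr in overdens.items()}
    return filtered, masses[mask]

overdens = {0.5: np.array([0.2, -1.0, 3.0]), 2.0: np.array([0.1, 0.4, -1.0])}
masses = np.array([1.0, 2.0, 3.0])
filtered, masses = filter_with_masses(overdens, masses)  # keeps only index 0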
From 246583305606b8730a8041974c2a624cf9107841 Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Wed, 19 Nov 2025 20:15:34 +0000
Subject: [PATCH 06/23] Expanding the test suite to debug grid points

---
 CMakeLists.txt                  |   1 +
 scripts/build_and_test_debug.sh | 104 +++++++
 src/debugging_utils.cpp         | 411 +++++++++++++++++++++++++++
 src/debugging_utils.hpp         |  85 ++++++
 src/gridder.cpp                 |  34 +++
 tests/test_file_grid params.yml |  23 ++
 tests/test_file_gridding.py     | 474 ++++++++++++++++++++++++++++++++
 7 files changed, 1132 insertions(+)
 create mode 100755 scripts/build_and_test_debug.sh
 create mode 100644 src/debugging_utils.cpp
 create mode 100644 src/debugging_utils.hpp
 create mode 100644 tests/test_file_grid params.yml
 create mode 100755 tests/test_file_gridding.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9149d6..3833e3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,7 @@ set(SOURCE_FILES
     src/partition.cpp
     src/params.cpp
     src/simulation.cpp
+    src/debugging_utils.cpp
 )
 
 # ========== Target ==========
diff --git a/scripts/build_and_test_debug.sh b/scripts/build_and_test_debug.sh
new file mode 100755
index 0000000..047775b
--- /dev/null
+++ b/scripts/build_and_test_debug.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# Build in debug mode and run comprehensive tests
+# This script compiles with DEBUGGING_CHECKS enabled and runs the full test suite
+#
+
+set -e  # Exit on error during the build phase
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}=============================================="
+echo "  BUILD AND TEST IN DEBUG MODE"
+echo -e "==============================================${NC}"
+echo ""
+
+# Get script directory and project root
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+cd "$PROJECT_ROOT"
+
+# Clean previous debug build
+echo -e "${YELLOW}Cleaning previous debug build...${NC}"
+rm -rf build_debug
+
+# Configure with debug mode
+echo -e "${YELLOW}Configuring debug build...${NC}"
+cmake -B build_debug -DCMAKE_BUILD_TYPE=Debug
+
+# Build
+echo -e "${YELLOW}Building in debug mode...${NC}"
+cmake --build build_debug
+
+if [ $? -ne 0 ]; then
+    echo -e "${RED}✗ Build failed${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✓ Build successful${NC}"
+echo ""
+
+# Check executable
+if [ ! -f "build_debug/parent_gridder" ]; then
+    echo -e "${RED}✗ Executable not found: build_debug/parent_gridder${NC}"
+    exit 1
+fi
+
+echo -e "${BLUE}Running comprehensive test suite...${NC}"
+echo ""
+
+# Tests are allowed to fail without aborting the script; we collect their
+# exit codes below (under set -e the $? captures would never see a failure)
+set +e
+
+# Run file-based gridding tests
+echo -e "${YELLOW}=== File-Based Gridding Tests ===${NC}"
+python3 tests/test_file_gridding.py build_debug/parent_gridder
+
+FILE_TEST_EXIT=$?
+
+echo ""
+
+# Run simple tests
+echo -e "${YELLOW}=== Simple Test ===${NC}"
+bash tests/run_simple_test.sh
+
+SIMPLE_TEST_EXIT=$?
+
+echo ""
+
+# Summary
+echo -e "${BLUE}=============================================="
+echo "  TEST SUMMARY"
+echo -e "==============================================${NC}"
+
+TOTAL_FAILED=0
+
+if [ $FILE_TEST_EXIT -eq 0 ]; then
+    echo -e "${GREEN}✓${NC} File-based gridding tests: PASSED"
+else
+    echo -e "${RED}✗${NC} File-based gridding tests: FAILED"
+    TOTAL_FAILED=$((TOTAL_FAILED + 1))
+fi
+
+if [ $SIMPLE_TEST_EXIT -eq 0 ]; then
+    echo -e "${GREEN}✓${NC} Simple tests: PASSED"
+else
+    echo -e "${RED}✗${NC} Simple tests: FAILED"
+    TOTAL_FAILED=$((TOTAL_FAILED + 1))
+fi
+
+echo -e "${BLUE}==============================================${NC}"
+
+if [ $TOTAL_FAILED -eq 0 ]; then
+    echo -e "${GREEN}ALL TESTS PASSED!${NC}"
+    exit 0
+else
+    echo -e "${RED}$TOTAL_FAILED TEST SUITE(S) FAILED${NC}"
+    echo ""
+    echo "Debug build is available at: build_debug/parent_gridder"
+    echo "You can run individual tests or use a debugger to investigate failures."
+    exit 1
+fi
diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp
new file mode 100644
index 0000000..4eea66f
--- /dev/null
+++ b/src/debugging_utils.cpp
@@ -0,0 +1,411 @@
+// Standard includes
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// Local includes
+#include "debugging_utils.hpp"
+#include "cell.hpp"
+#include "grid_point.hpp"
+#include "logger.hpp"
+#include "metadata.hpp"
+#include "simulation.hpp"
+
+#ifdef DEBUGGING_CHECKS
+
+/**
+ * @brief Find the smallest distance dx along one axis within a box of size
+ *        box_size (handles periodic boundaries)
+ */
+static double nearest(const double dx, const double box_size) {
+  return ((dx > 0.5 * box_size)
+              ? (dx - box_size)
+              : ((dx < -0.5 * box_size) ? (dx + box_size) : dx));
+}
+
+/**
+ * @brief Validate that grid points are correctly assigned to their containing
+ *        cells
+ *
+ * This checks that each grid point is spatially within the bounds of the cell
+ * it's been assigned to.
+ */
+void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating grid point to cell assignments...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (GridPoint *gp : cell->grid_points) {
+      // Check if grid point is within cell bounds
+      bool inside_x = (gp->loc[0] >= cell->loc[0]) &&
+                      (gp->loc[0] < cell->loc[0] + cell->width[0]);
+      bool inside_y = (gp->loc[1] >= cell->loc[1]) &&
+                      (gp->loc[1] < cell->loc[1] + cell->width[1]);
+      bool inside_z = (gp->loc[2] >= cell->loc[2]) &&
+                      (gp->loc[2] < cell->loc[2] + cell->width[2]);
+
+      if (!inside_x || !inside_y || !inside_z) {
+        message("[DEBUG] ERROR: Grid point at (%.3f, %.3f, %.3f) assigned to "
+                "cell %zu "
+                "with bounds [%.3f-%.3f, %.3f-%.3f, %.3f-%.3f]",
+                gp->loc[0], gp->loc[1], gp->loc[2], cid, cell->loc[0],
+                cell->loc[0] + cell->width[0], cell->loc[1],
+                cell->loc[1] + cell->width[1], cell->loc[2],
+                cell->loc[2] + cell->width[2]);
+        errors++;
+      }
+    }
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d grid points incorrectly assigned to cells", errors);
+  } else {
+    message("[DEBUG] ✓ All grid points correctly assigned to cells");
+  }
+}
+
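The invariant this check enforces can be stated compactly: for a uniform top-level cell grid, the owning cell follows from the point position alone, so any point/cell pair can be re-derived and compared. A hedged Python sketch, assuming a cubic domain anchored at the origin; all names are illustrative, not the gridder's API:

import numpy as np

def owning_cell(point, cdim, box_size):
    """Return the (i, j, k) top-level cell index that should own `point`."""
    width = np.asarray(box_size) / np.asarray(cdim)   # per-axis cell width
    idx = np.floor(np.asarray(point) / width).astype(int)
    return np.minimum(idx, np.asarray(cdim) - 1)      # guard the upper edge

assert tuple(owning_cell([9.9, 0.1, 5.0], cdim=[2, 2, 2],
                         box_size=[10.0] * 3)) == (1, 0, 1)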
+/**
+ * @brief Validate that all useful cells have been correctly identified
+ *
+ * Checks that:
+ * 1. All cells with grid points are marked as useful
+ * 2. All neighbors of cells with grid points are marked as useful
+ */
+void validateUsefulCells(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating useful cell flagging...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    // Check 1: Cells with grid points must be useful
+    if (cell->grid_points.size() > 0 && !cell->is_useful) {
+      message("[DEBUG] ERROR: Cell %zu has %zu grid points but is_useful = "
+              "false",
+              cid, cell->grid_points.size());
+      errors++;
+    }
+
+    // Check 2: Neighbors of cells with grid points must be useful
+    if (cell->grid_points.size() > 0) {
+      for (Cell *neighbor : cell->neighbours) {
+        if (!neighbor->is_useful) {
+          message(
+              "[DEBUG] ERROR: Cell %zu has grid points but its neighbor "
+              "(cell at %.3f,%.3f,%.3f) is not marked as useful",
+              cid, neighbor->loc[0], neighbor->loc[1], neighbor->loc[2]);
+          errors++;
+        }
+      }
+    }
+  }
+
+  // Count total useful cells and cells with grid points
+  int useful_count = 0;
+  int cells_with_gps = 0;
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    if (cells[cid].is_useful)
+      useful_count++;
+    if (cells[cid].grid_points.size() > 0)
+      cells_with_gps++;
+  }
+
+  message("[DEBUG] Useful cells: %d, Cells with grid points: %d", useful_count,
+          cells_with_gps);
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d useful cell flagging errors", errors);
+  } else {
+    message("[DEBUG] ✓ All useful cells correctly flagged");
+  }
+}
+
+/**
+ * @brief Validate that particles are in the correct cells
+ *
+ * Checks that each particle's position is within the bounds of its assigned
+ * cell.
+ */
+void validateParticleCellAssignment(Simulation *sim) {
+  message("[DEBUG] Validating particle to cell assignments...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (Particle *part : cell->particles) {
+      // Check if particle is within cell bounds
+      bool inside_x = (part->pos[0] >= cell->loc[0]) &&
+                      (part->pos[0] < cell->loc[0] + cell->width[0]);
+      bool inside_y = (part->pos[1] >= cell->loc[1]) &&
+                      (part->pos[1] < cell->loc[1] + cell->width[1]);
+      bool inside_z = (part->pos[2] >= cell->loc[2]) &&
+                      (part->pos[2] < cell->loc[2] + cell->width[2]);
+
+      if (!inside_x || !inside_y || !inside_z) {
+        message("[DEBUG] ERROR: Particle at (%.3f, %.3f, %.3f) assigned to "
+                "cell %zu "
+                "with bounds [%.3f-%.3f, %.3f-%.3f, %.3f-%.3f]",
+                part->pos[0], part->pos[1], part->pos[2], cid, cell->loc[0],
+                cell->loc[0] + cell->width[0], cell->loc[1],
+                cell->loc[1] + cell->width[1], cell->loc[2],
+                cell->loc[2] + cell->width[2]);
+        errors++;
+        if (errors > 10) { // Limit error spam
+          message("[DEBUG] ... (suppressing further particle assignment "
+                  "errors)");
+          break;
+        }
+      }
+    }
+    if (errors > 10)
+      break;
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d particles incorrectly assigned to cells", errors);
+  } else {
+    message("[DEBUG] ✓ All particles correctly assigned to cells");
+  }
+}
+
+/**
+ * @brief Count particles within radius of a grid point (brute force check)
+ */
+int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim,
+                             double radius) {
+  int count = 0;
+  double radius2 = radius * radius;
+  double *dim = sim->dim;
+
+  std::vector<Cell> &cells = sim->cells;
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    Cell *cell = &cells[cid];
+
+    for (Particle *part : cell->particles) {
+      double dx = nearest(part->pos[0] - grid_point->loc[0], dim[0]);
+      double dy = nearest(part->pos[1] - grid_point->loc[1], dim[1]);
+      double dz = nearest(part->pos[2] - grid_point->loc[2], dim[2]);
+      double r2 = dx * dx + dy * dy + dz * dz;
+
+      if (r2 <= radius2) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
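The same brute-force count is straightforward to reproduce in NumPy for cross-checking gridder output from Python. This sketch assumes a cubic periodic box and applies the minimum-image convention per axis, mirroring nearest() above; the array names are illustrative.

import numpy as np

def brute_force_count(point, positions, radius, box_size):
    """Count particles within `radius` of `point` in a periodic box."""
    d = positions - np.asarray(point)        # (N, 3) raw offsets
    d -= box_size * np.round(d / box_size)   # minimum-image per axis
    return int(np.sum(np.einsum("ij,ij->i", d, d) <= radius * radius))

pos = np.array([[0.1, 0.0, 0.0], [9.9, 0.0, 0.0], [5.0, 5.0, 5.0]])
assert brute_force_count([0.0, 0.0, 0.0], pos, radius=0.5, box_size=10.0) == 2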
+/**
+ * @brief Check if grid points can find any particles within kernel radii
+ *
+ * For each grid point, performs a brute force search to count particles within
+ * the largest kernel radius, then compares to what the gridder found.
+ */
+void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) {
+  message("[DEBUG] Validating grid points can find particles...");
+
+  // Find the maximum kernel radius
+  double max_kernel = *std::max_element(grid->kernel_radii.begin(),
+                                        grid->kernel_radii.end());
+
+  int empty_count = 0;
+  int mismatch_count = 0;
+  int checked_count = 0;
+
+  // Sample a subset of grid points to avoid excessive computation
+  size_t sample_interval = std::max(size_t(1), grid->grid_points.size() / 100);
+
+  for (size_t i = 0; i < grid->grid_points.size(); i += sample_interval) {
+    GridPoint *gp = &grid->grid_points[i];
+    checked_count++;
+
+    // Brute force count
+    int brute_count = bruteForceCountParticles(gp, sim, max_kernel);
+
+    // Get what gridder found for largest kernel
+    int gridder_count = 0;
+    if (gp->counts.find(max_kernel) != gp->counts.end()) {
+      gridder_count = gp->counts[max_kernel];
+    }
+
+    if (brute_count == 0) {
+      message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles "
+              "within %.3f Mpc/h",
+              i, gp->loc[0], gp->loc[1], gp->loc[2], max_kernel);
+      empty_count++;
+    } else if (brute_count != gridder_count) {
+      message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): Brute force "
+              "found %d particles, gridder found %d (within %.3f Mpc/h)",
+              i, gp->loc[0], gp->loc[1], gp->loc[2], brute_count,
+              gridder_count, max_kernel);
+      mismatch_count++;
+    }
+  }
+
+  message("[DEBUG] Checked %d grid points:", checked_count);
+  message("[DEBUG]   - %d had no particles within max kernel radius",
+          empty_count);
+  message("[DEBUG]   - %d had count mismatches between brute force and "
+          "gridder",
+          mismatch_count);
+
+  if (mismatch_count > 0) {
+    error("[DEBUG] Found %d grid points with particle count mismatches",
+          mismatch_count);
+  } else if (empty_count > checked_count / 2) {
+    message("[DEBUG] WARNING: More than 50%% of sampled grid points have no "
+            "particles nearby");
+  } else {
+    message("[DEBUG] ✓ Grid point particle counts look reasonable");
+  }
+}
+
+/**
+ * @brief Validate octree structure integrity
+ *
+ * Recursively checks that:
+ * 1. Child cells properly partition their parent
+ * 2. Particle counts match between parent and children
+ * 3. Grid point counts match between parent and children
+ */
+static void validateCellRecursive(Cell *cell, int *errors) {
+  if (!cell->is_split)
+    return;
+
+  // Check particle count consistency
+  size_t child_part_count = 0;
+  for (int i = 0; i < Cell::OCTREE_CHILDREN; i++) {
+    if (cell->children[i] != nullptr) {
+      child_part_count += cell->children[i]->part_count;
+      validateCellRecursive(cell->children[i], errors);
+    }
+  }
+
+  if (child_part_count != cell->part_count) {
+    message(
+        "[DEBUG] ERROR: Cell at (%.3f, %.3f, %.3f) has %zu particles but "
+        "children have %zu total",
+        cell->loc[0], cell->loc[1], cell->loc[2], cell->part_count,
+        child_part_count);
+    (*errors)++;
+  }
+
+  // Check grid point count consistency
+  size_t child_gp_count = 0;
+  for (int i = 0; i < Cell::OCTREE_CHILDREN; i++) {
+    if (cell->children[i] != nullptr) {
+      child_gp_count += cell->children[i]->grid_points.size();
+    }
+  }
+
+  if (child_gp_count != cell->grid_points.size()) {
+    message("[DEBUG] ERROR: Cell at (%.3f, %.3f, %.3f) has %zu grid points "
+            "but children have %zu total",
+            cell->loc[0], cell->loc[1], cell->loc[2], cell->grid_points.size(),
+            child_gp_count);
+    (*errors)++;
+  }
+}
+
+void validateOctreeStructure(Simulation *sim) {
+  message("[DEBUG] Validating octree structure...");
+
+  int errors = 0;
+  std::vector<Cell> &cells = sim->cells;
+
+  for (size_t cid = 0; cid < sim->nr_cells; cid++) {
+    validateCellRecursive(&cells[cid], &errors);
+  }
+
+  if (errors > 0) {
+    error("[DEBUG] Found %d octree structure errors", errors);
+  } else {
+    message("[DEBUG] ✓ Octree structure is valid");
+  }
+}
+
+/**
+ * @brief Check that file-based grid points are within simulation boundaries
+ */
+void validateFileGridPoints(Grid *grid, Simulation *sim) {
+  if (!grid->grid_from_file)
+    return;
+
+  message("[DEBUG] Validating file-based grid points are within simulation "
+          "boundaries...");
+
+  int outside_count = 0;
+  double *dim = sim->dim;
+
+  for (GridPoint &gp : grid->grid_points) {
+    if (gp.loc[0] < 0 || gp.loc[0] >= dim[0] || gp.loc[1] < 0 ||
+        gp.loc[1] >= dim[1] || gp.loc[2] < 0 || gp.loc[2] >= dim[2]) {
+      message("[DEBUG] ERROR: Grid point at (%.3f, %.3f, %.3f) is outside "
+              "simulation box [0-%.3f, 0-%.3f, 0-%.3f]",
+              gp.loc[0], gp.loc[1], gp.loc[2], dim[0], dim[1], dim[2]);
+      outside_count++;
+    }
+  }
+
+  if (outside_count > 0) {
+    error("[DEBUG] Found %d grid points outside simulation boundaries",
+          outside_count);
+  } else {
+    message("[DEBUG] ✓ All file-based grid points within simulation "
+            "boundaries");
+  }
+}
+
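Since out-of-bounds grid points abort the run here, a pre-flight check on the grid file is cheap insurance. A sketch assuming the plain "x y z" text format the test suite writes; the function and variable names are illustrative.

import numpy as np

def check_grid_file(path, box_size):
    """Report grid points outside [0, box_size) in any dimension."""
    points = np.loadtxt(path, comments="#").reshape(-1, 3)
    bad = np.any((points < 0) | (points >= box_size), axis=1)
    if bad.any():
        print(f"{bad.sum()} of {len(points)} grid points fall outside the box")
    return points[~bad]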
"true" : "false"); + + // Check neighbors + message("[DEBUG] Cell has %zu neighbors", cell->neighbours.size()); + size_t total_neighbor_particles = 0; + for (Cell *neighbor : cell->neighbours) { + total_neighbor_particles += neighbor->part_count; + } + message("[DEBUG] Neighbors have %zu total particles", total_neighbor_particles); + + // Check particle counts for each kernel + message("[DEBUG] Particle counts by kernel radius:"); + for (double radius : grid->kernel_radii) { + int count = 0; + if (grid_point->counts.find(radius) != grid_point->counts.end()) { + count = grid_point->counts[radius]; + } + int brute_count = bruteForceCountParticles(grid_point, sim, radius); + message("[DEBUG] %.3f Mpc/h: gridder=%d, brute_force=%d", radius, count, + brute_count); + } + + message("[DEBUG] ================================"); +} + +#endif // DEBUGGING_CHECKS diff --git a/src/debugging_utils.hpp b/src/debugging_utils.hpp new file mode 100644 index 0000000..39040bf --- /dev/null +++ b/src/debugging_utils.hpp @@ -0,0 +1,85 @@ +#ifndef DEBUGGING_UTILS_HPP +#define DEBUGGING_UTILS_HPP + +// Header for comprehensive debugging utilities +// These are active only when DEBUGGING_CHECKS is defined + +#include "cell.hpp" +#include "grid_point.hpp" +#include "logger.hpp" +#include "metadata.hpp" +#include "simulation.hpp" +#include + +#ifdef DEBUGGING_CHECKS + +/** + * @brief Validate that grid points are correctly assigned to their containing + * cells + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateGridPointCellAssignment(Simulation *sim, Grid *grid); + +/** + * @brief Validate that all useful cells have been correctly identified + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateUsefulCells(Simulation *sim, Grid *grid); + +/** + * @brief Validate that particles are in the correct cells + * + * @param sim Simulation object + */ +void validateParticleCellAssignment(Simulation *sim); + +/** + * @brief Check if grid points can find any particles within kernel radii + * + * @param sim Simulation object + * @param grid Grid object + */ +void validateGridPointsHaveParticles(Simulation *sim, Grid *grid); + +/** + * @brief Validate octree structure integrity + * + * @param sim Simulation object + */ +void validateOctreeStructure(Simulation *sim); + +/** + * @brief Check that file-based grid points are within simulation boundaries + * + * @param grid Grid object + * @param sim Simulation object + */ +void validateFileGridPoints(Grid *grid, Simulation *sim); + +/** + * @brief Print detailed diagnostics about a specific grid point + * + * @param grid_point The grid point to diagnose + * @param sim Simulation object + * @param grid Grid object + */ +void diagnoseGridPoint(GridPoint *grid_point, Simulation *sim, Grid *grid); + +/** + * @brief Count particles within radius of a grid point (brute force) + * + * @param grid_point The grid point + * @param sim Simulation object + * @param radius Search radius + * @return Number of particles found + */ +int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim, + double radius); + +#endif // DEBUGGING_CHECKS + +#endif // DEBUGGING_UTILS_HPP diff --git a/src/gridder.cpp b/src/gridder.cpp index c18fecb..ad026d6 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -22,6 +22,10 @@ #include "simulation.hpp" #include "talking.hpp" +#ifdef DEBUGGING_CHECKS +#include "debugging_utils.hpp" +#endif + /** * @brief Function to handle the command line arguments using robust parser * @@ -243,6 +247,11 @@ int 
main(int argc, char *argv[]) { return 0; // Clean exit, not an error } +#ifdef DEBUGGING_CHECKS + // Validate file-based grid points are within simulation boundaries + validateFileGridPoints(grid, sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -255,6 +264,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate grid points are correctly assigned to cells + validateGridPointCellAssignment(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -269,6 +283,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate useful cells are correctly flagged + validateUsefulCells(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); @@ -322,6 +341,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate particles are in the correct cells + validateParticleCellAssignment(sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); @@ -360,6 +384,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate octree structure integrity + validateOctreeStructure(sim); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif @@ -373,6 +402,11 @@ int main(int argc, char *argv[]) { return 1; } +#ifdef DEBUGGING_CHECKS + // Validate grid points have particles (sample check with brute force) + validateGridPointsHaveParticles(sim, grid); +#endif + #ifdef WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif diff --git a/tests/test_file_grid params.yml b/tests/test_file_grid params.yml new file mode 100644 index 0000000..4846101 --- /dev/null +++ b/tests/test_file_grid params.yml @@ -0,0 +1,23 @@ +# Test parameter file for file-based gridding +# Tests gridding with coordinates loaded from a file + +Kernels: + nkernels: 3 + kernel_radius_1: 0.5 # Small scale + kernel_radius_2: 2.0 # Medium scale + kernel_radius_3: 5.0 # Large scale + +Grid: + type: file + grid_file: tests/data/test_grid_coordinates.txt + +Tree: + max_leaf_count: 200 + +Input: + filepath: tests/data/test_snapshot.hdf5 + +Output: + filepath: tests/data/ + basename: file_grid_output.hdf5 + write_masses: 1 diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py new file mode 100755 index 0000000..a910317 --- /dev/null +++ b/tests/test_file_gridding.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python3 +""" +Comprehensive test suite for file-based gridding functionality. + +This tests the critical case where grid points are loaded from a file, +which is the workflow that's failing for halo center gridding. 
+""" + +import argparse +import h5py +import numpy as np +import os +import subprocess +import sys +import tempfile + + +class FileGriddingTest: + """Test suite for file-based gridding""" + + def __init__(self, executable, test_dir="tests/data"): + self.executable = executable + self.test_dir = test_dir + os.makedirs(test_dir, exist_ok=True) + + # Test configuration + self.box_size = 10.0 # Mpc/h + self.n_particles = 1000 + self.kernel_radii = [0.5, 2.0, 5.0] + + def create_test_snapshot(self, filename): + """Create a simple test snapshot with particles in clusters""" + print(f"Creating test snapshot: {filename}") + + with h5py.File(filename, 'w') as f: + # Header + header = f.create_group('Header') + header.attrs['BoxSize'] = self.box_size + header.attrs['NumPart_Total'] = [0, self.n_particles, 0, 0, 0, 0] + header.attrs['NumPart_ThisFile'] = [0, self.n_particles, 0, 0, 0, 0] + header.attrs['Redshift'] = 7.0 + + # Units + units = f.create_group('Units') + units.attrs['Unit mass in cgs (U_M)'] = 1.989e43 # 10^10 Msun + units.attrs['Unit length in cgs (U_L)'] = 3.086e24 # Mpc + + # Cosmology + cosmo = f.create_group('Cosmology') + cosmo.attrs['Critical density [internal units]'] = 1.0 + + # Create particle distributions: 5 clusters + n_clusters = 5 + particles_per_cluster = self.n_particles // n_clusters + cluster_centers = [ + [2.0, 2.0, 2.0], + [5.0, 5.0, 5.0], + [8.0, 2.0, 5.0], + [2.0, 8.0, 5.0], + [5.0, 5.0, 8.0], + ] + + positions = [] + masses = [] + + for center in cluster_centers: + # Gaussian distribution around center + cluster_pos = np.random.normal( + loc=center, + scale=0.3, # Tight cluster + size=(particles_per_cluster, 3) + ) + # Wrap to box + cluster_pos = np.mod(cluster_pos, self.box_size) + positions.append(cluster_pos) + masses.extend([1.0] * particles_per_cluster) + + # Add remaining particles uniformly + remainder = self.n_particles - (n_clusters * particles_per_cluster) + if remainder > 0: + uniform_pos = np.random.uniform( + 0, self.box_size, size=(remainder, 3) + ) + positions.append(uniform_pos) + masses.extend([1.0] * remainder) + + positions = np.vstack(positions) + masses = np.array(masses) + + # Write particle data + part1 = f.create_group('PartType1') + part1.create_dataset('Coordinates', data=positions) + part1.create_dataset('Masses', data=masses) + part1.create_dataset('ParticleIDs', data=np.arange(len(masses))) + + print(f" Created {len(masses)} particles in {n_clusters} clusters") + return cluster_centers + + def create_grid_file(self, filename, grid_points): + """Create a text file with grid point coordinates""" + print(f"Creating grid file: {filename}") + + with open(filename, 'w') as f: + f.write("# Test grid point coordinates\n") + f.write("# Format: x y z (Mpc/h)\n") + f.write(f"# {len(grid_points)} grid points\n") + f.write("\n") + + for point in grid_points: + f.write(f"{point[0]:.6f} {point[1]:.6f} {point[2]:.6f}\n") + + print(f" Wrote {len(grid_points)} grid points") + + def create_param_file(self, filename, snapshot_file, grid_file, + output_file): + """Create a parameter file for the gridder""" + print(f"Creating parameter file: {filename}") + + content = f"""# Test parameter file for file-based gridding +Kernels: + nkernels: {len(self.kernel_radii)} +""" + for i, radius in enumerate(self.kernel_radii, 1): + content += f" kernel_radius_{i}: {radius}\n" + + content += f""" +Grid: + type: file + grid_file: {grid_file} + +Tree: + max_leaf_count: 200 + +Input: + filepath: {snapshot_file} + +Output: + filepath: {os.path.dirname(output_file)}/ + 
basename: {os.path.basename(output_file)} + write_masses: 1 +""" + + with open(filename, 'w') as f: + f.write(content) + + def run_gridder(self, param_file, nthreads=4, verbosity=2): + """Run the gridder executable""" + print(f"\nRunning gridder: {self.executable}") + print(f" Params: {param_file}") + print(f" Threads: {nthreads}") + print(f" Verbosity: {verbosity}") + + cmd = [self.executable, param_file, str(nthreads), str(verbosity)] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True + ) + print(" Gridder completed successfully") + if verbosity >= 2: + print("\n--- Gridder Output ---") + print(result.stdout) + if result.stderr: + print("\n--- Stderr ---") + print(result.stderr) + print("--- End Output ---\n") + return True + except subprocess.CalledProcessError as e: + print(f" ERROR: Gridder failed with exit code {e.returncode}") + print("\n--- Stdout ---") + print(e.stdout) + print("\n--- Stderr ---") + print(e.stderr) + return False + + def validate_output(self, output_file, expected_grid_points): + """Validate the gridder output""" + print(f"\nValidating output: {output_file}") + + if not os.path.exists(output_file): + print(f" ERROR: Output file not found: {output_file}") + return False + + errors = [] + + with h5py.File(output_file, 'r') as f: + # Check basic structure + required_groups = ['Header', 'Grids', 'Cells'] + for group in required_groups: + if group not in f: + errors.append(f"Missing required group: {group}") + + if 'Grids' not in f: + return False + + # Check kernels + kernels = list(f['Grids'].keys()) + print(f" Found {len(kernels)} kernels: {kernels}") + + if len(kernels) != len(self.kernel_radii): + errors.append( + f"Expected {len(self.kernel_radii)} kernels, " + f"found {len(kernels)}" + ) + + # Check each kernel + for i, expected_radius in enumerate(self.kernel_radii): + kernel_name = f'Kernel_{i}' + + if kernel_name not in f['Grids']: + errors.append(f"Missing kernel: {kernel_name}") + continue + + kernel_grp = f['Grids'][kernel_name] + + # Check kernel radius attribute + if 'KernelRadius' in kernel_grp.attrs: + actual_radius = kernel_grp.attrs['KernelRadius'] + if not np.isclose(actual_radius, expected_radius): + errors.append( + f"{kernel_name}: Expected radius {expected_radius}, " + f"got {actual_radius}" + ) + else: + errors.append( + f"{kernel_name}: Missing KernelRadius attribute" + ) + + # Check datasets + required_datasets = ['GridPointOverDensities', 'GridPointMasses'] + for dataset in required_datasets: + if dataset not in kernel_grp: + errors.append( + f"{kernel_name}: Missing dataset {dataset}" + ) + continue + + data = kernel_grp[dataset][:] + print(f" {kernel_name}/{dataset}: shape={data.shape}, " + f"min={data.min():.3f}, max={data.max():.3f}, " + f"mean={data.mean():.3f}") + + # Check for data quality issues + n_empty = np.sum(data == -1) + n_zero = np.sum(data == 0) + n_positive = np.sum(data > 0) + + print(f" Values: {n_positive} positive, " + f"{n_zero} zero, {n_empty} empty (-1)") + + # CRITICAL CHECK: For halo center gridding, we expect + # most grid points to find particles + if dataset == 'GridPointMasses': + if n_empty > len(data) * 0.5: + errors.append( + f"{kernel_name}: More than 50% of grid points " + f"have no particles ({n_empty}/{len(data)}). " + f"This suggests a major problem!" 
+ ) + + if errors: + print("\n VALIDATION ERRORS:") + for error in errors: + print(f" - {error}") + return False + + print(" ✓ Validation passed") + return True + + def test_cluster_centers(self): + """ + Test 1: Grid points at cluster centers + This is the critical test - grid points at known high-density regions + """ + print("\n" + "="*60) + print("TEST 1: Grid points at cluster centers") + print("="*60) + + # Create test files + snapshot_file = os.path.join(self.test_dir, "test_clusters.hdf5") + grid_file = os.path.join(self.test_dir, "cluster_centers.txt") + param_file = os.path.join(self.test_dir, "test_clusters.yml") + output_file = os.path.join(self.test_dir, "cluster_output.hdf5") + + # Create snapshot with known cluster centers + cluster_centers = self.create_test_snapshot(snapshot_file) + + # Create grid file with grid points AT cluster centers + # These points MUST find particles + self.create_grid_file(grid_file, cluster_centers) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run gridder + if not self.run_gridder(param_file): + print("\n✗ TEST 1 FAILED: Gridder execution failed") + return False + + # Validate output + if not self.validate_output(output_file, cluster_centers): + print("\n✗ TEST 1 FAILED: Output validation failed") + return False + + print("\n✓ TEST 1 PASSED: Cluster center gridding works") + return True + + def test_random_points(self): + """ + Test 2: Random grid points throughout box + """ + print("\n" + "="*60) + print("TEST 2: Random grid points") + print("="*60) + + snapshot_file = os.path.join(self.test_dir, "test_random.hdf5") + grid_file = os.path.join(self.test_dir, "random_points.txt") + param_file = os.path.join(self.test_dir, "test_random.yml") + output_file = os.path.join(self.test_dir, "random_output.hdf5") + + # Create snapshot + self.create_test_snapshot(snapshot_file) + + # Create random grid points + n_grid = 50 + grid_points = np.random.uniform(0, self.box_size, size=(n_grid, 3)) + self.create_grid_file(grid_file, grid_points) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run and validate + if not self.run_gridder(param_file): + print("\n✗ TEST 2 FAILED: Gridder execution failed") + return False + + if not self.validate_output(output_file, grid_points): + print("\n✗ TEST 2 FAILED: Output validation failed") + return False + + print("\n✓ TEST 2 PASSED: Random point gridding works") + return True + + def test_boundary_points(self): + """ + Test 3: Grid points near box boundaries (periodic BC test) + """ + print("\n" + "="*60) + print("TEST 3: Boundary and periodic BC test") + print("="*60) + + snapshot_file = os.path.join(self.test_dir, "test_boundary.hdf5") + grid_file = os.path.join(self.test_dir, "boundary_points.txt") + param_file = os.path.join(self.test_dir, "test_boundary.yml") + output_file = os.path.join(self.test_dir, "boundary_output.hdf5") + + # Create snapshot + self.create_test_snapshot(snapshot_file) + + # Create grid points at box edges and corners + eps = 0.1 # Small offset from boundary + grid_points = [ + # Corners + [eps, eps, eps], + [self.box_size - eps, eps, eps], + [eps, self.box_size - eps, eps], + [eps, eps, self.box_size - eps], + [self.box_size - eps, self.box_size - eps, eps], + [self.box_size - eps, eps, self.box_size - eps], + [eps, self.box_size - eps, self.box_size - eps], + [self.box_size - eps, self.box_size - eps, self.box_size - eps], + # Face centers + [self.box_size / 2, eps, 
self.box_size / 2], + [self.box_size / 2, self.box_size - eps, self.box_size / 2], + [eps, self.box_size / 2, self.box_size / 2], + [self.box_size - eps, self.box_size / 2, self.box_size / 2], + ] + self.create_grid_file(grid_file, grid_points) + + # Create parameter file + self.create_param_file(param_file, snapshot_file, grid_file, + output_file) + + # Run and validate + if not self.run_gridder(param_file): + print("\n✗ TEST 3 FAILED: Gridder execution failed") + return False + + if not self.validate_output(output_file, grid_points): + print("\n✗ TEST 3 FAILED: Output validation failed") + return False + + print("\n✓ TEST 3 PASSED: Boundary point gridding works") + return True + + def run_all_tests(self): + """Run all tests and report results""" + print("\n" + "="*60) + print("FILE-BASED GRIDDING TEST SUITE") + print("="*60) + print(f"Executable: {self.executable}") + print(f"Test directory: {self.test_dir}") + print("") + + tests = [ + ("Cluster Centers", self.test_cluster_centers), + ("Random Points", self.test_random_points), + ("Boundary Points", self.test_boundary_points), + ] + + results = {} + for name, test_func in tests: + try: + results[name] = test_func() + except Exception as e: + print(f"\n✗ TEST '{name}' CRASHED: {e}") + import traceback + traceback.print_exc() + results[name] = False + + # Print summary + print("\n" + "="*60) + print("TEST SUMMARY") + print("="*60) + + for name, passed in results.items(): + status = "✓ PASSED" if passed else "✗ FAILED" + print(f" {status}: {name}") + + n_passed = sum(results.values()) + n_total = len(results) + + print("") + print(f"Total: {n_passed}/{n_total} tests passed") + print("="*60) + + return all(results.values()) + + +def main(): + parser = argparse.ArgumentParser( + description="Test suite for file-based gridding functionality" + ) + parser.add_argument( + "executable", + help="Path to parent_gridder executable" + ) + parser.add_argument( + "--test-dir", + default="tests/data", + help="Directory for test files (default: tests/data)" + ) + + args = parser.parse_args() + + # Check executable exists + if not os.path.exists(args.executable): + print(f"ERROR: Executable not found: {args.executable}") + return 1 + + # Run tests + tester = FileGriddingTest(args.executable, args.test_dir) + success = tester.run_all_tests() + + return 0 if success else 1 + + +if __name__ == "__main__": + sys.exit(main()) From 084f4f80dfad86bef0f911a64a44c49f1fb72b3f Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 08:28:27 +0000 Subject: [PATCH 07/23] Fixing debugging compilation --- src/debugging_utils.cpp | 26 ++++++-------------------- src/grid_point.cpp | 11 +++++++++++ src/grid_point.hpp | 1 + 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp index 4eea66f..13d23fc 100644 --- a/src/debugging_utils.cpp +++ b/src/debugging_utils.cpp @@ -13,16 +13,6 @@ #ifdef DEBUGGING_CHECKS -/** - * @brief Find the smallest distance dx along one axis within a box of size - * box_size (handles periodic boundaries) - */ -static double nearest(const double dx, const double box_size) { - return ((dx > 0.5 * box_size) - ? (dx - box_size) - : ((dx < -0.5 * box_size) ? (dx + box_size) : dx)); -} - /** * @brief Validate that grid points are correctly assigned to their containing * cells @@ -30,7 +20,8 @@ static double nearest(const double dx, const double box_size) { * This checks that each grid point is spatially within the bounds of the cell * it's been assigned to. 
 */
-void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
+void validateGridPointCellAssignment(Simulation *sim,
+                                     [[maybe_unused]] Grid *grid) {
   message("[DEBUG] Validating grid point to cell assignments...");
 
   int errors = 0;
@@ -75,7 +66,7 @@ void validateGridPointCellAssignment(Simulation *sim, Grid *grid) {
  * 1. All cells with grid points are marked as useful
  * 2. All neighbors of cells with grid points are marked as useful
  */
-void validateUsefulCells(Simulation *sim, Grid *grid) {
+void validateUsefulCells(Simulation *sim, [[maybe_unused]] Grid *grid) {
   message("[DEBUG] Validating useful cell flagging...");
 
   int errors = 0;
@@ -233,10 +224,7 @@ void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) {
     int brute_count = bruteForceCountParticles(gp, sim, max_kernel);
 
     // Get what gridder found for largest kernel
-    int gridder_count = 0;
-    if (gp->counts.find(max_kernel) != gp->counts.end()) {
-      gridder_count = gp->counts[max_kernel];
-    }
+    int gridder_count = gp->getCount(max_kernel);
 
     if (brute_count == 0) {
       message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles "
@@ -396,10 +384,8 @@ void diagnoseGridPoint(GridPoint *grid_point, Simulation *sim, Grid *grid) {
   // Check particle counts for each kernel
   message("[DEBUG] Particle counts by kernel radius:");
   for (double radius : grid->kernel_radii) {
-    int count = 0;
-    if (grid_point->counts.find(radius) != grid_point->counts.end()) {
-      count = grid_point->counts[radius];
-    }
+    // Get count from gridder
+    int count = grid_point->getCount(radius);
     int brute_count = bruteForceCountParticles(grid_point, sim, radius);
     message("[DEBUG]   %.3f Mpc/h: gridder=%d, brute_force=%d", radius, count,
             brute_count);
diff --git a/src/grid_point.cpp b/src/grid_point.cpp
index f4a9e67..a9f78d9 100644
--- a/src/grid_point.cpp
+++ b/src/grid_point.cpp
@@ -86,6 +86,17 @@ double GridPoint::getMass(const double kernel_radius) const {
   }
 }
 
+// Method to get the particle count inside the kernel radius
+int GridPoint::getCount(const double kernel_radius) const {
+  // Check if the kernel radius exists in the count map
+  auto it = this->count_map.find(kernel_radius);
+  if (it != this->count_map.end()) {
+    return static_cast<int>(it->second);
+  } else {
+    return 0; // Return 0 if the kernel radius is not found
+  }
+}
+
 /**
  * @brief Construct a new Grid object
  *
diff --git a/src/grid_point.hpp b/src/grid_point.hpp
index ede437e..e103b6f 100644
--- a/src/grid_point.hpp
+++ b/src/grid_point.hpp
@@ -28,6 +28,7 @@ class GridPoint {
                             double kernel_radius);
   double getOverDensity(const double kernel_radius, Simulation *sim) const;
   double getMass(const double kernel_radius) const;
+  int getCount(const double kernel_radius) const;
 
 private:
   //! 
The count of particles in each kernel radius From 54df4c1163e9b9131cda4ef9d658f2788948c73a Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 09:34:46 +0000 Subject: [PATCH 08/23] Adjusting tests --- tests/test_file_gridding.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index a910317..b49d352 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -49,6 +49,32 @@ def create_test_snapshot(self, filename): cosmo = f.create_group('Cosmology') cosmo.attrs['Critical density [internal units]'] = 1.0 + # Cells metadata (required by gridder) + cells = f.create_group('Cells') + cells_meta = cells.create_group('Meta-data') + cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) + cells_meta.attrs['nr_cells'] = 8 + cells_meta.attrs['size'] = self.box_size + + # Cells Centres + cell_width = self.box_size / 2.0 + centres = [] + for i in range(2): + for j in range(2): + for k in range(2): + centres.append([ + (i + 0.5) * cell_width, + (j + 0.5) * cell_width, + (k + 0.5) * cell_width + ]) + cells.create_dataset('Centres', data=np.array(centres)) + + # Cells Counts (will be filled correctly after we know particle positions) + cells.create_dataset('Counts', data=np.zeros((8, 6), dtype=np.int32)) + + # Cells OffsetsInFile + cells.create_dataset('OffsetsInFile', data=np.zeros((8, 6), dtype=np.int64)) + # Create particle distributions: 5 clusters n_clusters = 5 particles_per_cluster = self.n_particles // n_clusters @@ -87,6 +113,32 @@ def create_test_snapshot(self, filename): positions = np.vstack(positions) masses = np.array(masses) + # Compute cell counts + cell_counts = np.zeros((8, 6), dtype=np.int32) + cell_offsets = np.zeros((8, 6), dtype=np.int64) + + # Assign particles to cells + for i, pos in enumerate(positions): + ix = int(pos[0] / cell_width) + iy = int(pos[1] / cell_width) + iz = int(pos[2] / cell_width) + # Ensure within bounds + ix = max(0, min(1, ix)) + iy = max(0, min(1, iy)) + iz = max(0, min(1, iz)) + cell_id = ix * 4 + iy * 2 + iz + cell_counts[cell_id, 1] += 1 # PartType1 + + # Set offsets (cumulative) + offset = 0 + for cell_id in range(8): + cell_offsets[cell_id, 1] = offset + offset += cell_counts[cell_id, 1] + + # Update cell datasets + f['Cells/Counts'][:] = cell_counts + f['Cells/OffsetsInFile'][:] = cell_offsets + # Write particle data part1 = f.create_group('PartType1') part1.create_dataset('Coordinates', data=positions) From 817e40a392701135dc28efb97b26de4a0ddffb1c Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 09:44:23 +0000 Subject: [PATCH 09/23] Fixing test suite --- tests/test_file_gridding.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index b49d352..1df8fcc 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -69,11 +69,13 @@ def create_test_snapshot(self, filename): ]) cells.create_dataset('Centres', data=np.array(centres)) - # Cells Counts (will be filled correctly after we know particle positions) - cells.create_dataset('Counts', data=np.zeros((8, 6), dtype=np.int32)) + # Create Counts and OffsetsInFile groups (not datasets!) 
+ counts_grp = cells.create_group('Counts') + offsets_grp = cells.create_group('OffsetsInFile') - # Cells OffsetsInFile - cells.create_dataset('OffsetsInFile', data=np.zeros((8, 6), dtype=np.int64)) + # Will be filled after we know particle positions + counts_grp.create_dataset('PartType1', data=np.zeros(8, dtype=np.int32)) + offsets_grp.create_dataset('PartType1', data=np.zeros(8, dtype=np.int32)) # Create particle distributions: 5 clusters n_clusters = 5 @@ -113,9 +115,9 @@ def create_test_snapshot(self, filename): positions = np.vstack(positions) masses = np.array(masses) - # Compute cell counts - cell_counts = np.zeros((8, 6), dtype=np.int32) - cell_offsets = np.zeros((8, 6), dtype=np.int64) + # Compute cell counts for PartType1 + cell_counts = np.zeros(8, dtype=np.int32) + cell_offsets = np.zeros(8, dtype=np.int32) # Assign particles to cells for i, pos in enumerate(positions): @@ -127,17 +129,17 @@ def create_test_snapshot(self, filename): iy = max(0, min(1, iy)) iz = max(0, min(1, iz)) cell_id = ix * 4 + iy * 2 + iz - cell_counts[cell_id, 1] += 1 # PartType1 + cell_counts[cell_id] += 1 # Set offsets (cumulative) offset = 0 for cell_id in range(8): - cell_offsets[cell_id, 1] = offset - offset += cell_counts[cell_id, 1] + cell_offsets[cell_id] = offset + offset += cell_counts[cell_id] # Update cell datasets - f['Cells/Counts'][:] = cell_counts - f['Cells/OffsetsInFile'][:] = cell_offsets + f['Cells/Counts/PartType1'][:] = cell_counts + f['Cells/OffsetsInFile/PartType1'][:] = cell_offsets # Write particle data part1 = f.create_group('PartType1') From f5662473d57ac5cf49458b5111c6efb371713920 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 10:57:04 +0000 Subject: [PATCH 10/23] More test fixing... --- tests/test_file_gridding.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index 1df8fcc..a286a23 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -214,13 +214,23 @@ def run_gridder(self, param_file, nthreads=4, verbosity=2): check=True ) print(" Gridder completed successfully") - if verbosity >= 2: + + # Always print debug output (contains validation results) + if result.stdout: print("\n--- Gridder Output ---") print(result.stdout) - if result.stderr: - print("\n--- Stderr ---") - print(result.stderr) - print("--- End Output ---\n") + if result.stderr: + print("\n--- Stderr ---") + print(result.stderr) + print("--- End Output ---\n") + + # Check for debug validation failures in output + if '[DEBUG]' in result.stdout or '[DEBUG]' in result.stderr: + debug_output = result.stdout + result.stderr + if 'ERROR' in debug_output or 'FAILED' in debug_output: + print("\n!!! 
DEBUG CHECKS DETECTED ERRORS !!!") + print("See debug output above for details.\n") + return True except subprocess.CalledProcessError as e: print(f" ERROR: Gridder failed with exit code {e.returncode}") @@ -250,9 +260,12 @@ def validate_output(self, output_file, expected_grid_points): if 'Grids' not in f: return False - # Check kernels - kernels = list(f['Grids'].keys()) + # Check kernels (filter out non-kernel datasets like GridPointPositions) + all_keys = list(f['Grids'].keys()) + kernels = [k for k in all_keys if k.startswith('Kernel_')] print(f" Found {len(kernels)} kernels: {kernels}") + if len(all_keys) != len(kernels): + print(f" (Ignoring non-kernel datasets: {[k for k in all_keys if not k.startswith('Kernel_')]})") if len(kernels) != len(self.kernel_radii): errors.append( @@ -308,12 +321,22 @@ def validate_output(self, output_file, expected_grid_points): # CRITICAL CHECK: For halo center gridding, we expect # most grid points to find particles if dataset == 'GridPointMasses': + # Check for empty (-1) values if n_empty > len(data) * 0.5: errors.append( f"{kernel_name}: More than 50% of grid points " f"have no particles ({n_empty}/{len(data)}). " f"This suggests a major problem!" ) + # CRITICAL: Check for zero values when we expect particles + # (cluster centers should have particles!) + if 'cluster' in output_file.lower() and n_zero > len(data) * 0.5: + errors.append( + f"CRITICAL BUG: {kernel_name}: Grid points at cluster " + f"centers found ZERO particles ({n_zero}/{len(data)}). " + f"Particles exist at these locations but gridder " + f"is not finding them!" + ) if errors: print("\n VALIDATION ERRORS:") From 563a12c1a53440f2d721a9fe47b04dc94ae9ba52 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 11:04:42 +0000 Subject: [PATCH 11/23] More info in debugging tests --- src/cell.cpp | 21 +++++++++++++++++++++ src/construct_grid_points.cpp | 12 ++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/cell.cpp b/src/cell.cpp index 790f0cd..bf09178 100644 --- a/src/cell.cpp +++ b/src/cell.cpp @@ -1222,6 +1222,27 @@ void assignGridPointsToCells([[maybe_unused]] Simulation *sim, Grid *grid) { // Get the cells for debugging checks std::vector &cells = sim->cells; + // Print cell assignment summary + message("[DEBUG] Grid point to cell assignment summary:"); + int cells_with_gps = 0; + for (size_t cid = 0; cid < sim->nr_cells; cid++) { + Cell *cell = &cells[cid]; + if (cell->grid_points.size() > 0) { + cells_with_gps++; + message("[DEBUG] Cell %zu at (%.3f, %.3f, %.3f) width (%.3f, %.3f, %.3f) has %zu grid points", + cid, cell->loc[0], cell->loc[1], cell->loc[2], + cell->width[0], cell->width[1], cell->width[2], + cell->grid_points.size()); + // Print first grid point in this cell + if (cell->grid_points.size() > 0) { + GridPoint *gp = cell->grid_points[0]; + message("[DEBUG] First GP: (%.6f, %.6f, %.6f)", + gp->loc[0], gp->loc[1], gp->loc[2]); + } + } + } + message("[DEBUG] Total cells with grid points: %d", cells_with_gps); + // Check grid points are in the right cells for (size_t cid = 0; cid < sim->nr_cells; cid++) { Cell *cell = &cells[cid]; diff --git a/src/construct_grid_points.cpp b/src/construct_grid_points.cpp index b25b43a..8bd1976 100644 --- a/src/construct_grid_points.cpp +++ b/src/construct_grid_points.cpp @@ -413,6 +413,18 @@ static void createGridPointsFromFile(Simulation *sim, Grid *grid) { message("Created %d grid points from file %s", valid_points, grid->grid_file.c_str()); +#ifdef DEBUGGING_CHECKS + // Print first few grid points to verify they loaded 
correctly
+  message("[DEBUG] First 5 grid points loaded from file:");
+  int n_to_print = std::min(5, valid_points);
+  for (int i = 0; i < n_to_print; i++) {
+    message("[DEBUG]   Point %d: (%.6f, %.6f, %.6f)", i,
+            grid->grid_points[i].loc[0],
+            grid->grid_points[i].loc[1],
+            grid->grid_points[i].loc[2]);
+  }
+#endif
+
   // Initialize mass and count maps for all grid points
   message("Initializing grid point maps for %d kernel radii", grid->nkernels);
   for (GridPoint &gp : grid->grid_points) {
From e4833913fa14ce4b882075d698021b3e5d627bdb Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Thu, 20 Nov 2025 11:12:40 +0000
Subject: [PATCH 12/23] Fixing test

---
 tests/test_file_gridding.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py
index a286a23..caa0cc2 100755
--- a/tests/test_file_gridding.py
+++ b/tests/test_file_gridding.py
@@ -35,7 +35,8 @@ def create_test_snapshot(self, filename):
         with h5py.File(filename, 'w') as f:
             # Header
             header = f.create_group('Header')
-            header.attrs['BoxSize'] = self.box_size
+            # BoxSize must be an array [x, y, z] not a scalar!
+            header.attrs['BoxSize'] = np.array([self.box_size, self.box_size, self.box_size])
             header.attrs['NumPart_Total'] = [0, self.n_particles, 0, 0, 0, 0]
             header.attrs['NumPart_ThisFile'] = [0, self.n_particles, 0, 0, 0, 0]
             header.attrs['Redshift'] = 7.0
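Downstream readers can defend against the scalar-versus-vector BoxSize ambiguity this fix addresses. A hedged sketch; the helper name is illustrative and only the Header/BoxSize layout above is assumed.

import numpy as np
import h5py

def read_box_size(path):
    """Return BoxSize as a length-3 array whether stored scalar or vector."""
    with h5py.File(path, "r") as f:
        box = np.atleast_1d(f["Header"].attrs["BoxSize"]).astype(float)
    return box if box.size == 3 else np.repeat(box[0], 3)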
From 90590d76a7d7d1e582e0a233e9978518d7e91f23 Mon Sep 17 00:00:00 2001
From: wjr21 
Date: Thu, 20 Nov 2025 12:01:44 +0000
Subject: [PATCH 13/23] Introducing a brute force sanity check in debug mode

---
 src/construct_cells.cpp                      |  45 +++++---
 src/debugging_utils.cpp                      | 105 +++++++++++--------
 src/grid_point.cpp                           |  17 +++
 src/grid_point.hpp                           |  10 ++
 src/output.cpp                               |  59 +++++++++++
 tests/analyze_debug_output.py                |  76 ++++++++++++++
 make_test_snap.py => tests/make_test_snap.py |   0
 tests/run_simple_test.sh                     |   2 +-
 tests/test_file_gridding.py                  |   5 +-
 tests/test_gridder.py                        |   2 +-
 10 files changed, 255 insertions(+), 66 deletions(-)
 create mode 100644 tests/analyze_debug_output.py
 rename make_test_snap.py => tests/make_test_snap.py (100%)

diff --git a/src/construct_cells.cpp b/src/construct_cells.cpp
index b6f648d..c94598b 100644
--- a/src/construct_cells.cpp
+++ b/src/construct_cells.cpp
@@ -47,23 +47,28 @@ void getTopCells(Simulation *sim, Grid *grid) {
   // neighbouring cells (this simplifies boilerplate elsewhere)
 
   // How many cells do we need to walk out for the biggest kernel? This is
-  // the maximum distance at which we will need to consider another cell
-  const int nwalk = std::ceil(grid->max_kernel_radius / width[0]) + 1;
-  int nwalk_upper = nwalk;
-  int nwalk_lower = nwalk;
-
-  // If nwalk is greater than the number of cells in the simulation, we need
-  // to walk out to the edge of the simulation
-  if (nwalk > cdim[0] / 2) {
-    nwalk_upper = cdim[0] / 2;
-    nwalk_lower = cdim[0] / 2;
+  // the maximum distance at which we will need to consider another cell.
+  // We compute this separately for each dimension since cell widths and
+  // grid dimensions may differ.
+  int nwalk[3];
+  for (int dim = 0; dim < 3; dim++) {
+    nwalk[dim] = std::ceil(grid->max_kernel_radius / width[dim]) + 1;
+
+    // Clamp to half the grid dimension to prevent duplicate neighbors
+    // through periodic wrapping. If we walk more than cdim/2, we'll
+    // encounter the same cell from multiple periodic images.
+    if (nwalk[dim] > cdim[dim] / 2) {
+      nwalk[dim] = cdim[dim] / 2;
+    }
   }
 
-  message("Looking for neighbours within %d cells", nwalk);
+  message("Looking for neighbours within [%d, %d, %d] cells",
+          nwalk[0], nwalk[1], nwalk[2]);
 
-  // Calculate maximum neighbors
+  // Calculate maximum neighbors (use the maximum nwalk for reservation)
+  const int max_nwalk = std::max({nwalk[0], nwalk[1], nwalk[2]});
   const int max_neighbors =
-      (2 * nwalk + 1) * (2 * nwalk + 1) * (2 * nwalk + 1) -
+      (2 * max_nwalk + 1) * (2 * max_nwalk + 1) * (2 * max_nwalk + 1) -
       1; // -1 excludes self
 
   // Loop over the cells attaching the pointers the neighbouring cells (taking
@@ -82,10 +87,11 @@ void getTopCells(Simulation *sim, Grid *grid) {
     // Reserve space for neighbors
     cell->neighbours.reserve(max_neighbors);
 
-    // Loop over the neighbours
-    for (int ii = -nwalk_lower; ii < nwalk_upper + 1; ii++) {
-      for (int jj = -nwalk_lower; jj < nwalk_upper + 1; jj++) {
-        for (int kk = -nwalk_lower; kk < nwalk_upper + 1; kk++) {
+    // Loop over the neighbours using dimension-specific nwalk values
+    // The nwalk values are already clamped to cdim/2, preventing duplicates
+    for (int ii = -nwalk[0]; ii <= nwalk[0]; ii++) {
+      for (int jj = -nwalk[1]; jj <= nwalk[1]; jj++) {
+        for (int kk = -nwalk[2]; kk <= nwalk[2]; kk++) {
 
           // Skip the cell itself
           if (ii == 0 && jj == 0 && kk == 0)
@@ -97,6 +103,11 @@ void getTopCells(Simulation *sim, Grid *grid) {
           int kkk = (k + kk + cdim[2]) % cdim[2];
           int cjd = iii * cdim[1] * cdim[2] + jjj * cdim[2] + kkk;
 
+          // Skip if this wraps back to the cell itself
+          // (can happen with periodic boundaries in small boxes)
+          if (cjd == static_cast<int>(cid))
+            continue;
+
           // Attach the neighbour to the cell
           cell->neighbours.push_back(&cells[cjd]);
         }
diff --git a/src/debugging_utils.cpp b/src/debugging_utils.cpp
index 13d23fc..6de561f 100644
--- a/src/debugging_utils.cpp
+++ b/src/debugging_utils.cpp
@@ -199,62 +199,75 @@ int bruteForceCountParticles(GridPoint *grid_point, Simulation *sim,
 /**
  * @brief Check if grid points can find any particles within kernel radii
  *
- * For each grid point, performs a brute force search to count particles within
- * the largest kernel radius, then compares to what the gridder found.
+ * For ALL grid points, performs a brute force search to count particles within
+ * each kernel radius, stores the counts, then compares to what the gridder found.
+ * In debug mode, these brute force counts will be written to the output file.
*/ void validateGridPointsHaveParticles(Simulation *sim, Grid *grid) { - message("[DEBUG] Validating grid points can find particles..."); + message("[DEBUG] Computing brute force particle counts for all grid points and all kernels..."); + message("[DEBUG] This will be written to output file for comparison."); - // Find the maximum kernel radius - double max_kernel = *std::max_element(grid->kernel_radii.begin(), - grid->kernel_radii.end()); + int total_mismatches = 0; + int total_empty_kernels = 0; + int total_checks = 0; - int empty_count = 0; - int mismatch_count = 0; - int checked_count = 0; + // For ALL grid points (not sampled), compute brute force counts for ALL kernels + for (size_t i = 0; i < grid->grid_points.size(); i++) { + GridPoint *gp = &grid->grid_points[i]; - // Sample a subset of grid points to avoid excessive computation - size_t sample_interval = std::max(size_t(1), grid->grid_points.size() / 100); + // Compute brute force for each kernel radius + for (double kernel_rad : grid->kernel_radii) { + int brute_count = bruteForceCountParticles(gp, sim, kernel_rad); + int gridder_count = gp->getCount(kernel_rad); - for (size_t i = 0; i < grid->grid_points.size(); i += sample_interval) { - GridPoint *gp = &grid->grid_points[i]; - checked_count++; - - // Brute force count - int brute_count = bruteForceCountParticles(gp, sim, max_kernel); - - // Get what gridder found for largest kernel - int gridder_count = gp->getCount(max_kernel); - - if (brute_count == 0) { - message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): No particles " - "within %.3f Mpc/h", - i, gp->loc[0], gp->loc[1], gp->loc[2], max_kernel); - empty_count++; - } else if (brute_count != gridder_count) { - message("[DEBUG] Grid point %zu at (%.3f, %.3f, %.3f): Brute force " - "found %d particles, gridder found %d (within %.3f Mpc/h)", - i, gp->loc[0], gp->loc[1], gp->loc[2], brute_count, - gridder_count, max_kernel); - mismatch_count++; + // Store the brute force count + gp->setBruteForceCount(kernel_rad, brute_count); + + total_checks++; + + if (brute_count == 0) { + total_empty_kernels++; + } + + if (brute_count != gridder_count) { + total_mismatches++; + } } } - message("[DEBUG] Checked %d grid points:", checked_count); - message("[DEBUG] - %d had no particles within max kernel radius", - empty_count); - message("[DEBUG] - %d had count mismatches between brute force and " - "gridder", - mismatch_count); - - if (mismatch_count > 0) { - error("[DEBUG] Found %d grid points with particle count mismatches", - mismatch_count); - } else if (empty_count > checked_count / 2) { - message("[DEBUG] WARNING: More than 50%% of sampled grid points have no " - "particles nearby"); + message("[DEBUG] Brute force validation complete:"); + message("[DEBUG] - Checked %zu grid points × %d kernels = %d combinations", + grid->grid_points.size(), grid->nkernels, total_checks); + message("[DEBUG] - %d had no particles (expected for small kernels/sparse regions)", + total_empty_kernels); + message("[DEBUG] - %d had count mismatches between brute force and gridder", + total_mismatches); + + if (total_mismatches > 0) { + message("[DEBUG] ╔═══════════════════════════════════════════════════════════════╗"); + message("[DEBUG] ║ WARNING: PARTICLE COUNT MISMATCHES DETECTED ║"); + message("[DEBUG] ╠═══════════════════════════════════════════════════════════════╣"); + message("[DEBUG] ║ Found %5d mismatches between octree and brute force ║", total_mismatches); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ THE RESULTS IN THE OUTPUT FILE ARE 
INCORRECT! ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ The output file contains both: ║"); + message("[DEBUG] ║ - GridPointCounts: ║"); + message("[DEBUG] ║ Counts from octree traversal (INCORRECT - has bugs) ║"); + message("[DEBUG] ║ - BruteForceGridPointCounts: ║"); + message("[DEBUG] ║ Counts from exhaustive search (CORRECT - ground truth) ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ Use BruteForceGridPointCounts for analysis. ║"); + message("[DEBUG] ║ ║"); + message("[DEBUG] ║ This likely indicates a bug in periodic boundary handling ║"); + message("[DEBUG] ║ or octree traversal, especially for grid points near box ║"); + message("[DEBUG] ║ boundaries or with small kernel radii. ║"); + message("[DEBUG] ╚═══════════════════════════════════════════════════════════════╝"); + } else if (total_empty_kernels > total_checks / 2) { + message("[DEBUG] WARNING: More than 50%% of grid point/kernel combinations " + "have no particles nearby"); } else { - message("[DEBUG] ✓ Grid point particle counts look reasonable"); + message("[DEBUG] ✓ Grid point particle counts match brute force validation"); } } diff --git a/src/grid_point.cpp b/src/grid_point.cpp index a9f78d9..e33afdf 100644 --- a/src/grid_point.cpp +++ b/src/grid_point.cpp @@ -97,6 +97,23 @@ int GridPoint::getCount(const double kernel_radius) const { } } +#ifdef DEBUGGING_CHECKS +// Method to set the brute force count for a kernel radius +void GridPoint::setBruteForceCount(const double kernel_radius, int count) { + this->brute_force_count_map[kernel_radius] = count; +} + +// Method to get the brute force count for a kernel radius +int GridPoint::getBruteForceCount(const double kernel_radius) const { + auto it = this->brute_force_count_map.find(kernel_radius); + if (it != this->brute_force_count_map.end()) { + return it->second; + } else { + return -1; // Return -1 if not computed (shouldn't happen) + } +} +#endif + /** * @brief Construct a new Grid object * diff --git a/src/grid_point.hpp b/src/grid_point.hpp index e103b6f..375832e 100644 --- a/src/grid_point.hpp +++ b/src/grid_point.hpp @@ -30,12 +30,22 @@ class GridPoint { double getMass(const double kernel_radius) const; int getCount(const double kernel_radius) const; +#ifdef DEBUGGING_CHECKS + void setBruteForceCount(const double kernel_radius, int count); + int getBruteForceCount(const double kernel_radius) const; +#endif + private: //! The count of particles in each kernel radius std::unordered_map count_map; //! The mass of particles in each kernel radius std::unordered_map mass_map; + +#ifdef DEBUGGING_CHECKS + //! 
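Debug-only mirror of count_map, filled in by the brute-force validation pass.
+  //! 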
Brute force particle counts (for validation in debug mode) + std::unordered_map brute_force_count_map; +#endif }; class Grid { diff --git a/src/output.cpp b/src/output.cpp index 6ffb54f..b6d0d14 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -148,6 +148,27 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { } } + // Create particle count dataset from tree traversal algorithm + if (!hdf5.createDataset("Grids/" + kernel_name + + "/GridPointCounts", + grid_point_overdens_dims)) { + error("Failed to create GridPointCounts dataset for kernel %zu (radius=%f)", + kernel_idx, kernel_rad); + continue; + } + +#ifdef DEBUGGING_CHECKS + // In debug mode, also write brute force counts for validation + // These are computed by exhaustively checking every particle's distance + if (!hdf5.createDataset("Grids/" + kernel_name + + "/BruteForceGridPointCounts", + grid_point_overdens_dims)) { + error("Failed to create BruteForceGridPointCounts dataset for kernel %zu (radius=%f)", + kernel_idx, kernel_rad); + continue; + } +#endif + // Process each cell for (size_t cid = 0; cid < sim->nr_cells; cid++) { Cell *cell = &cells[cid]; @@ -163,12 +184,18 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { // Prepare data arrays std::vector cell_grid_overdens; std::vector cell_grid_masses; + std::vector cell_grid_counts; std::vector cell_grid_pos; cell_grid_overdens.reserve(count); + cell_grid_counts.reserve(count); cell_grid_pos.reserve(count * 3); if (metadata->write_masses) { cell_grid_masses.reserve(count); } +#ifdef DEBUGGING_CHECKS + std::vector cell_grid_brute_counts; + cell_grid_brute_counts.reserve(count); +#endif // Extract data from grid points for (const GridPoint *gp : cell->grid_points) { @@ -180,6 +207,14 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { cell_grid_masses.push_back(gp->getMass(kernel_rad)); } + // Get particle counts from gridder algorithm + cell_grid_counts.push_back(gp->getCount(kernel_rad)); + +#ifdef DEBUGGING_CHECKS + // Get brute force counts in debug mode + cell_grid_brute_counts.push_back(gp->getBruteForceCount(kernel_rad)); +#endif + // Store positions if not done yet if (!written_positions) { cell_grid_pos.push_back(gp->loc[0]); @@ -209,6 +244,30 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { } } + // Write gridder particle counts + if (!cell_grid_counts.empty()) { + if (!hdf5.writeDatasetSlice( + "Grids/" + kernel_name + "/GridPointCounts", cell_grid_counts, + {static_cast(start_idx)}, + {static_cast(count)})) { + error("Failed to write counts slice for cell %zu, kernel %zu (radius=%f)", + cid, kernel_idx, kernel_rad); + } + } + +#ifdef DEBUGGING_CHECKS + // Write brute force counts in debug mode + if (!cell_grid_brute_counts.empty()) { + if (!hdf5.writeDatasetSlice( + "Grids/" + kernel_name + "/BruteForceGridPointCounts", cell_grid_brute_counts, + {static_cast(start_idx)}, + {static_cast(count)})) { + error("Failed to write brute force counts slice for cell %zu, kernel %zu (radius=%f)", + cid, kernel_idx, kernel_rad); + } + } +#endif + // Write position slice if needed if (!written_positions && !cell_grid_pos.empty()) { if (!hdf5.writeDatasetSlice( diff --git a/tests/analyze_debug_output.py b/tests/analyze_debug_output.py new file mode 100644 index 0000000..e140510 --- /dev/null +++ b/tests/analyze_debug_output.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +""" +Analyze debug output to diagnose tree traversal bugs +""" +import h5py +import numpy as np +import sys + +def analyze_file(filename): + print('='*70) + print(f'ANALYZING: 
{filename}') + print('='*70) + + with h5py.File(filename, 'r') as f: + # Get grid point positions + positions = np.array(f['Grids/GridPointPositions']) + + print(f'\nGrid point locations:') + for i, pos in enumerate(positions): + print(f' GP {i:2d}: ({pos[0]:5.1f}, {pos[1]:5.1f}, {pos[2]:5.1f})') + + print('\n' + '='*70) + print('Per-kernel comparison:') + print('='*70) + + for kernel_name in sorted(f['Grids'].keys()): + if kernel_name == 'GridPointPositions': + continue + + kernel_rad = f[f'Grids/{kernel_name}'].attrs['KernelRadius'] + octree_counts = np.array(f[f'Grids/{kernel_name}/GridPointCounts']) + brute_counts = np.array(f[f'Grids/{kernel_name}/BruteForceGridPointCounts']) + + print(f'\n{kernel_name} (radius = {kernel_rad} Mpc/h):') + print(f' GP | Position | Octree | Brute | Diff | Status') + print(f' ----+----------------------+--------+--------+--------+-------') + + for i in range(len(octree_counts)): + pos_str = f'({positions[i][0]:5.1f},{positions[i][1]:5.1f},{positions[i][2]:5.1f})' + diff = octree_counts[i] - brute_counts[i] + + if diff == 0: + status = 'OK' + elif octree_counts[i] == 0 and brute_counts[i] > 0: + status = 'MISS' # Octree missed particles + elif octree_counts[i] > brute_counts[i]: + status = 'OVER' # Octree over-counted + else: + status = 'UNDER' # Octree under-counted + + print(f' {i:3d} | {pos_str:20s} | {octree_counts[i]:6d} | {brute_counts[i]:6d} | {diff:+7d} | {status}') + + # Summary + total_mismatches = np.sum(octree_counts != brute_counts) + total_miss = np.sum((octree_counts == 0) & (brute_counts > 0)) + total_over = np.sum(octree_counts > brute_counts) + + print(f'\n Summary: {total_mismatches}/{len(octree_counts)} mismatches') + print(f' - Complete misses (0 when should find particles): {total_miss}') + print(f' - Over-counting (found more than truth): {total_over}') + +if __name__ == '__main__': + if len(sys.argv) > 1: + analyze_file(sys.argv[1]) + else: + # Default files to analyze + files = [ + 'tests/data/boundary_output.hdf5', + 'tests/data/cluster_output.hdf5', + ] + for f in files: + try: + analyze_file(f) + print('\n') + except FileNotFoundError: + print(f'File not found: {f}\n') diff --git a/make_test_snap.py b/tests/make_test_snap.py similarity index 100% rename from make_test_snap.py rename to tests/make_test_snap.py diff --git a/tests/run_simple_test.sh b/tests/run_simple_test.sh index 473e9ca..c4f5633 100755 --- a/tests/run_simple_test.sh +++ b/tests/run_simple_test.sh @@ -67,7 +67,7 @@ run_test() { # Create simple test snapshot cd "$ROOT_DIR" - python3 make_test_snap.py \ + python3 tests/make_test_snap.py \ --output "$TEST_SNAPSHOT" \ --cdim 3 \ --boxsize 10.0 \ diff --git a/tests/test_file_gridding.py b/tests/test_file_gridding.py index caa0cc2..33f3748 100755 --- a/tests/test_file_gridding.py +++ b/tests/test_file_gridding.py @@ -55,7 +55,10 @@ def create_test_snapshot(self, filename): cells_meta = cells.create_group('Meta-data') cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) cells_meta.attrs['nr_cells'] = 8 - cells_meta.attrs['size'] = self.box_size + # Cell size must be an array [x, y, z] for each cell dimension, not a scalar! 
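+        # (the gridder reads one cell width per axis from this attribute)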
+ # With dimension [2, 2, 2] and box size 10.0, each cell is 5.0 x 5.0 x 5.0 + cell_width = self.box_size / 2.0 + cells_meta.attrs['size'] = np.array([cell_width, cell_width, cell_width]) # Cells Centres cell_width = self.box_size / 2.0 diff --git a/tests/test_gridder.py b/tests/test_gridder.py index a11925c..d52309e 100644 --- a/tests/test_gridder.py +++ b/tests/test_gridder.py @@ -54,7 +54,7 @@ def test_snapshot(): DATA_DIR.mkdir(parents=True, exist_ok=True) # Create a minimal test snapshot using make_test_snap.py - make_snap_script = PROJECT_ROOT / "make_test_snap.py" + make_snap_script = TESTS_DIR / "make_test_snap.py" # Find a donor snapshot (use any existing HDF5 file or create minimal one) donor_path = DATA_DIR / "donor_minimal.hdf5" From d2201333d3f792c0631cb6093648181d57796e21 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 13:59:52 +0000 Subject: [PATCH 14/23] Get mean density from cosmology not particles We don't want to load everything just to calculate a mean density! --- example_params.yml | 5 + src/cell.cpp | 5 - src/gridder.cpp | 9 + src/simulation.cpp | 51 +++++ src/simulation.hpp | 3 + tests/file_grid_test_params.yml | 6 + tests/simple_test_params.yml | 6 + tests/test_cosmology.py | 346 ++++++++++++++++++++++++++++++++ tests/test_file_grid params.yml | 6 + 9 files changed, 432 insertions(+), 5 deletions(-) create mode 100644 tests/test_cosmology.py diff --git a/example_params.yml b/example_params.yml index 66c3e0f..9a2e000 100644 --- a/example_params.yml +++ b/example_params.yml @@ -13,6 +13,11 @@ Grid: n_grid_points: 1000000 # number of grid points to sample (applicable to random) random_seed: 42 # seed for random grid point generation (applicable to random) +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + Performance: Tree: diff --git a/src/cell.cpp b/src/cell.cpp index bf09178..1ba30c8 100644 --- a/src/cell.cpp +++ b/src/cell.cpp @@ -843,11 +843,6 @@ void assignPartsToCells(Simulation *sim) { total_mass = global_total_mass; #endif - // Compute the mean comoving density - sim->mean_density = total_mass / sim->volume; - - message("Mean comoving density: %e 10**10 Msun / cMpc^3", sim->mean_density); - #ifdef DEBUGGING_CHECKS // Make sure we have attached all the particles (only count local cells in // MPI) diff --git a/src/gridder.cpp b/src/gridder.cpp index ad026d6..8f9f7d8 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -219,6 +219,15 @@ int main(int argc, char *argv[]) { message("Number of top level cells: %d", sim->nr_cells); + // Calculate mean density from cosmological parameters + // This must be done before any overdensity calculations + try { + sim->calculateMeanDensityFromCosmology(params); + } catch (const std::exception &e) { + std::cerr << "Failed to calculate mean density from cosmology: " << e.what() << std::endl; + return 1; + } + #ifdef WITH_MPI // Partition the cells across MPI ranks (work-based partition) try { diff --git a/src/simulation.cpp b/src/simulation.cpp index 912db54..77c9c7d 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -105,3 +105,54 @@ void Simulation::readSimulationData() { // Compute the comoving volume of the simulation this->volume = this->dim[0] * this->dim[1] * this->dim[2]; } + +/** + * @brief Calculate mean comoving density from cosmological parameters + * + * Computes the mean matter density at the simulation redshift using: + * ρ_mean = ρ_crit(z=0) × Ω_m × (1+z)³ + * + * where ρ_crit(z=0) = 
3H₀²/(8πG) is the critical density today + * + * @param params The parameters object containing cosmology + */ +void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { + + // Read cosmology parameters + double h = params->getParameterNoDefault("Cosmology/h"); + double Omega_cdm = params->getParameterNoDefault("Cosmology/Omega_cdm"); + double Omega_b = params->getParameterNoDefault("Cosmology/Omega_b"); + + // Total matter density parameter + double Omega_m = Omega_cdm + Omega_b; + + // Physical constants in internal units (10^10 Msun, Mpc, km/s) + // H0 = 100 h km/s/Mpc + double H0_kmsMpc = 100.0 * h; // km/s/Mpc + + // Convert to internal time units (H0 in units of 1/time where time is Mpc/(km/s)) + // H0 = 100 h km/s/Mpc = 100 h / Mpc * (km/s) + // In our units: [H0] = km/s/Mpc + + // Critical density today: ρ_crit = 3H₀²/(8πG) + // G = 4.3009e-6 (10^10 Msun)^-1 Mpc (km/s)^2 in internal units + const double G = 4.300917270069976e-6; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 + + // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 + double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); + + // Mean density at redshift z: ρ_mean(z) = ρ_crit(0) × Ω_m × (1+z)³ + double scale_factor = 1.0 / (1.0 + this->redshift); + double density_evolution = 1.0 / (scale_factor * scale_factor * scale_factor); // (1+z)³ + + this->mean_density = rho_crit_0 * Omega_m * density_evolution; + + Metadata *metadata = &Metadata::getInstance(); + if (metadata->rank == 0) { + message("Cosmology: h=%.4f, Omega_m=%.6f (Omega_cdm=%.6f + Omega_b=%.6f)", + h, Omega_m, Omega_cdm, Omega_b); + message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); + message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/Mpc^3", + this->redshift, this->mean_density); + } +} diff --git a/src/simulation.hpp b/src/simulation.hpp index 729a933..900bfa2 100644 --- a/src/simulation.hpp +++ b/src/simulation.hpp @@ -76,6 +76,9 @@ class Simulation { // Prototype for reader function (defined in simulation.cpp) void readSimulationData(); + // Calculate mean density from cosmological parameters + void calculateMeanDensityFromCosmology(Parameters *params); + private: // Helper function for cleanup void deleteChildCells(Cell *cell); diff --git a/tests/file_grid_test_params.yml b/tests/file_grid_test_params.yml index 44cd162..b3f8f55 100644 --- a/tests/file_grid_test_params.yml +++ b/tests/file_grid_test_params.yml @@ -9,6 +9,12 @@ Grid: grid_file: tests/data/grid_points_files/valid_points.txt # n_grid_points is optional for file type - will be counted from file +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 diff --git a/tests/simple_test_params.yml b/tests/simple_test_params.yml index 33e6d01..db1956f 100644 --- a/tests/simple_test_params.yml +++ b/tests/simple_test_params.yml @@ -8,6 +8,12 @@ Grid: cdim: 3 # Small grid (3x3x3), grid point at center type: uniform +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py new file mode 100644 index 0000000..7a7a6ec --- /dev/null +++ b/tests/test_cosmology.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Unit tests for cosmological mean density calculations. 
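+
+The reference relations under test (implemented again in
+calculate_expected_density below) are:
+
+    ρ_crit(z=0) = 3H₀²/(8πG)
+    ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³,  with Ω_m = Ω_cdm + Ω_b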
+ +Tests verify that the C++ cosmology implementation correctly computes +mean comoving density at various redshifts and with different cosmological +parameters. +""" + +import subprocess +import pytest +import h5py +import numpy as np +from pathlib import Path +import yaml +import tempfile +import os + +# Get paths +PROJECT_ROOT = Path(__file__).parent.parent +BUILD_DIR = PROJECT_ROOT / "build" +GRIDDER_EXE = BUILD_DIR / "parent_gridder" +TESTS_DIR = PROJECT_ROOT / "tests" +DATA_DIR = TESTS_DIR / "data" + + +def create_test_snapshot(filepath, redshift, boxsize=10.0): + """Create a minimal test snapshot at specified redshift""" + with h5py.File(filepath, 'w') as f: + # Header + header = f.create_group('Header') + header.attrs['Redshift'] = redshift + header.attrs['BoxSize'] = np.array([boxsize, boxsize, boxsize]) + header.attrs['NumPart_Total'] = np.array([0, 1, 0, 0, 0, 0], dtype=np.uint64) + + # Units (10^10 Msun, Mpc, km/s) + units = f.create_group('Units') + units.attrs['Unit mass in cgs (U_M)'] = 1.989e43 + units.attrs['Unit length in cgs (U_L)'] = 3.086e24 + units.attrs['Unit time in cgs (U_t)'] = 3.086e19 + + # Cells metadata + cells_meta = f.create_group('Cells/Meta-data') + cells_meta.attrs['dimension'] = np.array([2, 2, 2], dtype=np.int32) + cells_meta.attrs['size'] = np.array([boxsize/2, boxsize/2, boxsize/2]) + + # One particle at center + part1 = f.create_group('PartType1') + part1.create_dataset('Coordinates', data=np.array([[boxsize/2, boxsize/2, boxsize/2]])) + part1.create_dataset('Masses', data=np.array([1.0])) + + # Cell counts (one particle in central cell) + f.create_dataset('Cells/Counts/PartType1', data=np.array([0, 0, 0, 1, 0, 0, 0, 0], dtype=np.int32)) + f.create_dataset('Cells/OffsetsInFile/PartType1', data=np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)) + + +def create_test_params(filepath, snapshot_path, cosmology): + """Create test parameter file with specified cosmology""" + params = { + 'Kernels': { + 'nkernels': 1, + 'kernel_radius_1': 1.0 + }, + 'Grid': { + 'type': 'uniform', + 'cdim': 2 + }, + 'Cosmology': cosmology, + 'Tree': { + 'max_leaf_count': 200 + }, + 'Input': { + 'filepath': str(snapshot_path), + 'placeholder': '0000' + }, + 'Output': { + 'filepath': str(filepath.parent) + '/', + 'basename': filepath.name, + 'write_masses': 0 + } + } + + with open(filepath, 'w') as f: + yaml.dump(params, f, default_flow_style=False) + + +def run_gridder_and_get_density(snapshot_path, params_path, output_path): + """Run gridder and extract mean density from output""" + result = subprocess.run( + [str(GRIDDER_EXE), str(params_path), "1"], + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode != 0: + raise RuntimeError(f"Gridder failed:\nstdout: {result.stdout}\nstderr: {result.stderr}") + + # Extract mean density from stdout + for line in result.stdout.split('\n'): + if 'Mean comoving density at z=' in line: + # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/Mpc^3" + parts = line.split(':') + if len(parts) >= 2: + density_str = parts[1].strip().split()[0] + return float(density_str) + + raise RuntimeError(f"Could not extract mean density from output:\n{result.stdout}") + + +def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): + """ + Calculate expected mean density using standard cosmological formula. 
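+
+    Numerically, the (1+z)³ factor below is 8 at z=1 and 512 at z=7.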
+ + ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³ + + where ρ_crit(z=0) = 3H₀²/(8πG) + + Units: 10^10 Msun/Mpc^3 + """ + # Physical constants in internal units + H0_kmsMpc = 100.0 * h # km/s/Mpc + G = 4.300917270069976e-6 # (10^10 Msun)^-1 Mpc (km/s)^2 + + # Critical density today + rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) + + # Total matter density parameter + Omega_m = Omega_cdm + Omega_b + + # Mean density at redshift z + density_evolution = (1.0 + redshift)**3 + mean_density = rho_crit_0 * Omega_m * density_evolution + + return mean_density + + +class TestCosmologyCalculations: + """Test suite for cosmological mean density calculations""" + + @pytest.fixture(scope="class") + def test_workspace(self): + """Create temporary workspace for tests""" + with tempfile.TemporaryDirectory() as tmpdir: + workspace = Path(tmpdir) + yield workspace + + def test_planck2018_z0(self, test_workspace): + """Test Planck 2018 cosmology at z=0""" + # Planck 2018 parameters + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 0.0 + + # Create test files + snapshot_path = test_workspace / "snapshot_z0.hdf5" + params_path = test_workspace / "params_z0.yml" + output_path = test_workspace / "output_z0.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + # Run gridder and extract density + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + + # Calculate expected density + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # Check agreement (within 0.01% - account for floating point precision) + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_planck2018_z1(self, test_workspace): + """Test Planck 2018 cosmology at z=1""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 1.0 + + snapshot_path = test_workspace / "snapshot_z1.hdf5" + params_path = test_workspace / "params_z1.yml" + output_path = test_workspace / "output_z1.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # At z=1, density should be 8x higher than z=0 (since (1+z)³ = 8) + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_high_redshift_z7(self, test_workspace): + """Test at high redshift z=7 (relevant for JWST observations)""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 7.0 + + snapshot_path = test_workspace / "snapshot_z7.hdf5" + params_path = test_workspace / "params_z7.yml" + output_path = test_workspace / "output_z7.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], 
cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + # At z=7, density should be (1+7)³ = 512x higher than z=0 + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch at z={redshift}: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_wmap9_cosmology(self, test_workspace): + """Test WMAP9 cosmology (different from Planck)""" + # WMAP9 parameters + cosmology = { + 'h': 0.697, + 'Omega_cdm': 0.233, + 'Omega_b': 0.0463 + } + redshift = 2.0 + + snapshot_path = test_workspace / "snapshot_wmap9.hdf5" + params_path = test_workspace / "params_wmap9.yml" + output_path = test_workspace / "output_wmap9.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with WMAP9 cosmology: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_low_omega_matter(self, test_workspace): + """Test with artificially low matter density""" + cosmology = { + 'h': 0.7, + 'Omega_cdm': 0.15, + 'Omega_b': 0.03 + } + redshift = 0.5 + + snapshot_path = test_workspace / "snapshot_low_om.hdf5" + params_path = test_workspace / "params_low_om.yml" + output_path = test_workspace / "output_low_om.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with low Omega_m: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_high_h(self, test_workspace): + """Test with high Hubble constant""" + cosmology = { + 'h': 0.9, # Unrealistically high but good test + 'Omega_cdm': 0.25, + 'Omega_b': 0.05 + } + redshift = 1.5 + + snapshot_path = test_workspace / "snapshot_high_h.hdf5" + params_path = test_workspace / "params_high_h.yml" + output_path = test_workspace / "output_high_h.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + expected_density = calculate_expected_density( + cosmology['h'], cosmology['Omega_cdm'], cosmology['Omega_b'], redshift + ) + + relative_error = abs(gridder_density - expected_density) / expected_density + assert relative_error < 1e-4, \ + f"Mean density mismatch with high h: gridder={gridder_density:.6e}, expected={expected_density:.6e}" + + def test_density_evolution(self, test_workspace): + """Test that density evolves correctly with redshift""" + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + + redshifts = [0.0, 1.0, 2.0, 5.0] + densities = [] + + for z in redshifts: + snapshot_path = test_workspace / f"snapshot_z{z}.hdf5" + params_path = test_workspace / f"params_z{z}.yml" + output_path = test_workspace / f"output_z{z}.hdf5" + + 
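# Build a snapshot and parameter file at this z, run the gridder,
+            # and record the mean density it reports.
+            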
create_test_snapshot(snapshot_path, z) + create_test_params(params_path, snapshot_path, cosmology) + + density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + densities.append(density) + + # Check that density increases monotonically with redshift + for i in range(len(redshifts) - 1): + assert densities[i+1] > densities[i], \ + f"Density should increase with redshift: rho(z={redshifts[i]})={densities[i]:.6e}, rho(z={redshifts[i+1]})={densities[i+1]:.6e}" + + # Check that density ratios match (1+z)³ scaling + for i in range(len(redshifts) - 1): + z1, z2 = redshifts[i], redshifts[i+1] + expected_ratio = ((1 + z2) / (1 + z1))**3 + actual_ratio = densities[i+1] / densities[i] + relative_error = abs(actual_ratio - expected_ratio) / expected_ratio + assert relative_error < 1e-4, \ + f"Density ratio mismatch between z={z1} and z={z2}: actual={actual_ratio:.6f}, expected={expected_ratio:.6f}" + + +if __name__ == '__main__': + # Run tests with pytest + pytest.main([__file__, '-v', '--tb=short']) diff --git a/tests/test_file_grid params.yml b/tests/test_file_grid params.yml index 4846101..90dd7d3 100644 --- a/tests/test_file_grid params.yml +++ b/tests/test_file_grid params.yml @@ -11,6 +11,12 @@ Grid: type: file grid_file: tests/data/test_grid_coordinates.txt +Cosmology: + h: 0.681 # Reduced Hubble constant + Omega_cdm: 0.256011 # Cold dark matter density parameter + Omega_b: 0.048600 # Baryon density parameter + + Tree: max_leaf_count: 200 From 1a64a10235c887a02ac59e25e7dda0f195a20d69 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:20:44 +0000 Subject: [PATCH 15/23] HAndling missing pi --- src/simulation.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/simulation.cpp b/src/simulation.cpp index 77c9c7d..2ee596b 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -1,9 +1,17 @@ +// Standard includes +#include + // Local includes #include "simulation.hpp" #include "cell.hpp" #include "hdf_io.hpp" #include "metadata.hpp" +// Define M_PI if not available (POSIX extension, not standard C++) +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + /** * @brief Construct a new Simulation object * From c56c440fc41310ac73f43ab2ed33876690df8d6c Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:38:32 +0000 Subject: [PATCH 16/23] Ensuring everything is comoving --- src/simulation.cpp | 14 +++++++------- tests/test_cosmology.py | 39 +++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/simulation.cpp b/src/simulation.cpp index 2ee596b..d85d4d3 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -149,18 +149,18 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); - // Mean density at redshift z: ρ_mean(z) = ρ_crit(0) × Ω_m × (1+z)³ - double scale_factor = 1.0 / (1.0 + this->redshift); - double density_evolution = 1.0 / (scale_factor * scale_factor * scale_factor); // (1+z)³ - - this->mean_density = rho_crit_0 * Omega_m * density_evolution; + // Mean COMOVING density: ρ_comoving = ρ_crit(0) × Ω_m + // Note: In comoving coordinates, density does NOT evolve with redshift + // The (1+z)³ factor would convert to physical density, but SWIFT uses comoving coordinates + this->mean_density = rho_crit_0 * Omega_m; Metadata *metadata = &Metadata::getInstance(); if (metadata->rank == 0) { message("Cosmology: h=%.4f, Omega_m=%.6f 
(Omega_cdm=%.6f + Omega_b=%.6f)", h, Omega_m, Omega_cdm, Omega_b); message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); - message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/Mpc^3", - this->redshift, this->mean_density); + message("Mean comoving density (constant with z): %.6e 10^10 Msun/cMpc^3", + this->mean_density); + message("(Snapshot at z=%.4f, but density in comoving coordinates)", this->redshift); } } diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index 7a7a6ec..cdcb5c0 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -110,13 +110,16 @@ def run_gridder_and_get_density(snapshot_path, params_path, output_path): def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): """ - Calculate expected mean density using standard cosmological formula. + Calculate expected mean COMOVING density using standard cosmological formula. - ρ_mean(z) = ρ_crit(z=0) × Ω_m × (1+z)³ + ρ_comoving = ρ_crit(z=0) × Ω_m + + Note: In comoving coordinates, density does NOT evolve with redshift. + The (1+z)³ factor converts to physical density, but SWIFT uses comoving coords. where ρ_crit(z=0) = 3H₀²/(8πG) - Units: 10^10 Msun/Mpc^3 + Units: 10^10 Msun/cMpc^3 """ # Physical constants in internal units H0_kmsMpc = 100.0 * h # km/s/Mpc @@ -128,9 +131,8 @@ def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): # Total matter density parameter Omega_m = Omega_cdm + Omega_b - # Mean density at redshift z - density_evolution = (1.0 + redshift)**3 - mean_density = rho_crit_0 * Omega_m * density_evolution + # Mean COMOVING density (constant with redshift!) + mean_density = rho_crit_0 * Omega_m return mean_density @@ -304,15 +306,15 @@ def test_high_h(self, test_workspace): assert relative_error < 1e-4, \ f"Mean density mismatch with high h: gridder={gridder_density:.6e}, expected={expected_density:.6e}" - def test_density_evolution(self, test_workspace): - """Test that density evolves correctly with redshift""" + def test_density_constant_with_redshift(self, test_workspace): + """Test that COMOVING density is constant with redshift""" cosmology = { 'h': 0.6736, 'Omega_cdm': 0.2607, 'Omega_b': 0.0493 } - redshifts = [0.0, 1.0, 2.0, 5.0] + redshifts = [0.0, 1.0, 2.0, 5.0, 7.0] densities = [] for z in redshifts: @@ -326,19 +328,12 @@ def test_density_evolution(self, test_workspace): density = run_gridder_and_get_density(snapshot_path, params_path, output_path) densities.append(density) - # Check that density increases monotonically with redshift - for i in range(len(redshifts) - 1): - assert densities[i+1] > densities[i], \ - f"Density should increase with redshift: rho(z={redshifts[i]})={densities[i]:.6e}, rho(z={redshifts[i+1]})={densities[i+1]:.6e}" - - # Check that density ratios match (1+z)³ scaling - for i in range(len(redshifts) - 1): - z1, z2 = redshifts[i], redshifts[i+1] - expected_ratio = ((1 + z2) / (1 + z1))**3 - actual_ratio = densities[i+1] / densities[i] - relative_error = abs(actual_ratio - expected_ratio) / expected_ratio - assert relative_error < 1e-4, \ - f"Density ratio mismatch between z={z1} and z={z2}: actual={actual_ratio:.6f}, expected={expected_ratio:.6f}" + # Check that comoving density is constant with redshift (within 0.01%) + reference_density = densities[0] + for i, z in enumerate(redshifts): + relative_diff = abs(densities[i] - reference_density) / reference_density + assert relative_diff < 1e-4, \ + f"Comoving density should be constant: rho(z={z})={densities[i]:.6e}, rho(z=0)={reference_density:.6e}, 
diff={relative_diff:.6e}" if __name__ == '__main__': From 9401bffefbd007f74f34e3fd037a879753cfc879 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Thu, 20 Nov 2025 14:48:54 +0000 Subject: [PATCH 17/23] Fixing mean density calculation (must use internal units) --- src/simulation.cpp | 10 +++---- tests/test_cosmology.py | 59 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/simulation.cpp b/src/simulation.cpp index d85d4d3..5eab1ed 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -143,8 +143,9 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { // In our units: [H0] = km/s/Mpc // Critical density today: ρ_crit = 3H₀²/(8πG) - // G = 4.3009e-6 (10^10 Msun)^-1 Mpc (km/s)^2 in internal units - const double G = 4.300917270069976e-6; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 + // G in (10^10 Msun)^-1 Mpc (km/s)^2 internal units + // Derived from G_SI = 6.674e-11 m^3 kg^-1 s^-2 with proper unit conversion + const double G = 4.301744232015554e+01; // Gravitational constant in (10^10 Msun)^-1 Mpc (km/s)^2 // ρ_crit(z=0) = 3H₀²/(8πG) in units of 10^10 Msun / Mpc^3 double rho_crit_0 = (3.0 * H0_kmsMpc * H0_kmsMpc) / (8.0 * M_PI * G); @@ -159,8 +160,7 @@ void Simulation::calculateMeanDensityFromCosmology(Parameters *params) { message("Cosmology: h=%.4f, Omega_m=%.6f (Omega_cdm=%.6f + Omega_b=%.6f)", h, Omega_m, Omega_cdm, Omega_b); message("Critical density today: %.6e 10^10 Msun/Mpc^3", rho_crit_0); - message("Mean comoving density (constant with z): %.6e 10^10 Msun/cMpc^3", - this->mean_density); - message("(Snapshot at z=%.4f, but density in comoving coordinates)", this->redshift); + message("Mean comoving density at z=%.4f: %.6e 10^10 Msun/cMpc^3", + this->redshift, this->mean_density); } } diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index cdcb5c0..d9d67bf 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -16,6 +16,14 @@ import tempfile import os +# Try to import astropy for direct comparison tests +try: + from astropy.cosmology import FlatLambdaCDM + from astropy import units as u + ASTROPY_AVAILABLE = True +except ImportError: + ASTROPY_AVAILABLE = False + # Get paths PROJECT_ROOT = Path(__file__).parent.parent BUILD_DIR = PROJECT_ROOT / "build" @@ -99,7 +107,7 @@ def run_gridder_and_get_density(snapshot_path, params_path, output_path): # Extract mean density from stdout for line in result.stdout.split('\n'): if 'Mean comoving density at z=' in line: - # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/Mpc^3" + # Parse: "Mean comoving density at z=X.XXXX: Y.YYYYYYe+XX 10^10 Msun/cMpc^3" parts = line.split(':') if len(parts) >= 2: density_str = parts[1].strip().split()[0] @@ -123,7 +131,7 @@ def calculate_expected_density(h, Omega_cdm, Omega_b, redshift): """ # Physical constants in internal units H0_kmsMpc = 100.0 * h # km/s/Mpc - G = 4.300917270069976e-6 # (10^10 Msun)^-1 Mpc (km/s)^2 + G = 4.301744232015554e+01 # (10^10 Msun)^-1 Mpc (km/s)^2 # Critical density today rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) @@ -335,6 +343,53 @@ def test_density_constant_with_redshift(self, test_workspace): assert relative_diff < 1e-4, \ f"Comoving density should be constant: rho(z={z})={densities[i]:.6e}, rho(z=0)={reference_density:.6e}, diff={relative_diff:.6e}" + @pytest.mark.skipif(not ASTROPY_AVAILABLE, reason="astropy not available") + def test_astropy_comparison(self, test_workspace): + """Test that our calculation matches astropy 
exactly""" + # Planck 2018 parameters + cosmology = { + 'h': 0.6736, + 'Omega_cdm': 0.2607, + 'Omega_b': 0.0493 + } + redshift = 2.0 + + # Create test files + snapshot_path = test_workspace / "snapshot_astropy.hdf5" + params_path = test_workspace / "params_astropy.yml" + output_path = test_workspace / "output_astropy.hdf5" + + create_test_snapshot(snapshot_path, redshift) + create_test_params(params_path, snapshot_path, cosmology) + + # Get density from our C++ code + gridder_density = run_gridder_and_get_density(snapshot_path, params_path, output_path) + + # Calculate with astropy + h = cosmology['h'] + Omega_m = cosmology['Omega_cdm'] + cosmology['Omega_b'] + H0 = h * 100.0 # km/s/Mpc + + # Create astropy cosmology (flat ΛCDM with Omega_Lambda = 1 - Omega_m) + cosmo = FlatLambdaCDM(H0=H0, Om0=Omega_m) + + # Get critical density at z=0 in comoving coordinates + rho_crit_0 = cosmo.critical_density(0) + + # Convert to our units: 10^10 Msun/Mpc^3 + # astropy gives g/cm^3, we need 10^10 Msun/Mpc^3 + Msun_in_g = 1.989e33 + Mpc_in_cm = 3.086e24 + rho_crit_0_our_units = rho_crit_0.to(u.g / u.cm**3).value * (Mpc_in_cm**3) / (1e10 * Msun_in_g) + + # Mean comoving density + astropy_density = rho_crit_0_our_units * Omega_m + + # Check agreement (within 0.1% - account for different constants) + relative_error = abs(gridder_density - astropy_density) / astropy_density + assert relative_error < 1e-3, \ + f"Density mismatch vs astropy: gridder={gridder_density:.6e}, astropy={astropy_density:.6e}, error={relative_error:.6e}" + if __name__ == '__main__': # Run tests with pytest From 7d1bbeb76af87437b7f5641822935d90552c168b Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 21 Nov 2025 09:31:00 +0000 Subject: [PATCH 18/23] Formatting --- weighting/knn_weighter.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py index ace80f9..fc54981 100644 --- a/weighting/knn_weighter.py +++ b/weighting/knn_weighter.py @@ -77,7 +77,7 @@ def weighted_ks( n_parent: int = 100_000, n_sample: int = 100_000, ) -> float: - """Compute a weighted 1D KolmogorovSmirnov distance. + """Compute a weighted 1D Kolmogorov-Smirnov distance. The parent distribution is unweighted; the sample distribution is weighted. For speed, both are optionally downsampled before computing @@ -195,7 +195,7 @@ def filter_and_transform_overdensities( raise ValueError( "All arrays must have the same length; found mismatched lengths." 
) - mask &= (arr > -1.0) + mask &= arr > -1.0 n_dropped = length - np.sum(mask) if n_dropped > 0: @@ -206,9 +206,7 @@ def filter_and_transform_overdensities( ) # Apply log transform to filtered data - transformed = { - scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items() - } + transformed = {scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()} return transformed, mask @@ -542,13 +540,9 @@ def _weights_for_k( if k < 1: raise ValueError(f"k must be >= 1, got k={k}") if k >= N: - raise ValueError( - f"k must be < N (parent size), got k={k}, N={N}" - ) + raise ValueError(f"k must be < N (parent size), got k={k}, N={N}") if k + 1 > M: - raise ValueError( - f"k+1 must be <= M (sample size), got k={k}, M={M}" - ) + raise ValueError(f"k+1 must be <= M (sample size), got k={k}, M={M}") dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] r_parent = dist_p[:, -1] From cac953c4ea5181651d3f634d88dd136cb3c0f892 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 21 Nov 2025 09:50:08 +0000 Subject: [PATCH 19/23] Writing an empty file when the grid is empty --- src/construct_grid_points.cpp | 4 ++-- src/gridder.cpp | 23 +++++++++++++++++++++-- src/output.cpp | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/construct_grid_points.cpp b/src/construct_grid_points.cpp index 8bd1976..326cce7 100644 --- a/src/construct_grid_points.cpp +++ b/src/construct_grid_points.cpp @@ -307,8 +307,8 @@ readGridPointCoordinates(const std::string &filename, message("WARNING: No valid grid point coordinates found in file: %s", filename.c_str()); message("The file is either empty or contains only comments/invalid lines."); - message("Exiting gracefully - no grid points to process."); - throw std::runtime_error("Empty grid file - no grid points to process"); + // Return 0 - the caller will handle the empty grid case + return 0; } message("Read %d valid grid point coordinates from %s", valid_points, diff --git a/src/gridder.cpp b/src/gridder.cpp index 8f9f7d8..2d5f69b 100644 --- a/src/gridder.cpp +++ b/src/gridder.cpp @@ -252,8 +252,27 @@ int main(int argc, char *argv[]) { // Check if we actually have grid points to process if (grid->n_grid_points == 0) { message("No grid points available for processing."); - message("Program will now exit."); - return 0; // Clean exit, not an error + message("Writing empty output file before exiting..."); + + // Write empty output file +#ifdef WITH_MPI + try { + writeGridFileParallel(sim, grid); + } catch (const std::exception &e) { + error("Failed to write empty output file: %s", e.what()); + return 1; + } +#else + try { + writeGridFileSerial(sim, grid); + } catch (const std::exception &e) { + error("Failed to write empty output file: %s", e.what()); + return 1; + } +#endif + + message("No grid points were created - empty output file written."); + return 1; // Exit with error code since no work was done } #ifdef DEBUGGING_CHECKS diff --git a/src/output.cpp b/src/output.cpp index b6d0d14..fdd9cfa 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -98,6 +98,15 @@ void writeGridFileSerial(Simulation *sim, Grid *grid) { return; } + // If there are no grid points, skip dataset creation and exit early + if (grid->n_grid_points == 0) { + message("No grid points to write - creating empty output file with header only"); + hdf5.close(); + message("Successfully wrote empty grid data (serial mode)"); + toc("Writing output (in serial)"); + return; + } + // Create dataset for grid positions std::array grid_point_positions_dims = { 
static_cast(grid->n_grid_points), static_cast(3)}; @@ -390,6 +399,16 @@ void writeGridFileParallel(Simulation *sim, Grid *grid) { hdf5.writeAttribute("Header", "MaxKernelRadius", grid->max_kernel_radius); + // If there are no local grid points, skip dataset creation and exit early + if (total_local_grid_points == 0) { + message("Rank %d: No local grid points to write - creating empty output file with header only", + metadata->rank); + hdf5.close(); + message("Rank %d: Successfully wrote empty grid data (parallel mode)", metadata->rank); + toc("Writing output (in parallel)"); + return; + } + // Write local cell information if (!local_cell_ids.empty()) { std::array local_cell_dims = { From 001d831194cca51edc17a9458f8255fb7b479999 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 22:47:25 +0000 Subject: [PATCH 20/23] Removing unfinished weightings --- weighting/knn_weighter.py | 1111 ------------------------------- weighting/test_hmf_weighting.py | 648 ------------------ 2 files changed, 1759 deletions(-) delete mode 100644 weighting/knn_weighter.py delete mode 100644 weighting/test_hmf_weighting.py diff --git a/weighting/knn_weighter.py b/weighting/knn_weighter.py deleted file mode 100644 index fc54981..0000000 --- a/weighting/knn_weighter.py +++ /dev/null @@ -1,1111 +0,0 @@ -"""k-NN reweighting of a biased sample to match a parent population. - -This script reads two HDF5 files: - -* A "parent" (global) population. -* A "sample" population (which may be biased). - -The set of kernels is defined by the sample file: every group -``Grids/Kernel_*`` in the sample must also exist in the parent. Each such -group must contain a dataset ``GridPointOverDensities``. - -For each kernel: - -* Read overdensities from parent and sample. -* Transform as ``log10(1 + delta)``. -* Stack over all kernels into a multi-dimensional feature space. - -We then use k-NN density-ratio estimation to compute weights for the sample -so that its multi-scale environment distribution matches the parent. - -An optional auto-tuning step searches over candidate k values, choosing the -one that: - -* Satisfies an effective sample size (ESS) floor, if possible. -* Minimizes the mean 1D KS distance between parent and weighted sample - across all dimensions. - -Example -------- - python knn_weighter.py \\ - --parent parent.hdf5 \\ - --sample sample.hdf5 \\ - --autotune_k 35,50,80,100 \\ - --ess_floor 0.20 -""" - -from __future__ import annotations - -import argparse -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import h5py -import matplotlib.pyplot as plt -import numpy as np -from sklearn.neighbors import NearestNeighbors - -# --------------------------------------------------------------------- -# Utilities -# --------------------------------------------------------------------- - - -def effective_sample_size(w: np.ndarray) -> float: - """Compute the effective sample size (ESS) of a weight vector. - - ESS is defined as: - - .. math:: - - \\text{ESS} = \\frac{(\\sum_i w_i)^2}{\\sum_i w_i^2}. - - Args: - w: One-dimensional array of non-negative weights. - - Returns: - Effective sample size as a float. - """ - s = float(w.sum()) - return (s * s) / float(np.sum(w * w) + 1e-18) - - -def weighted_ks( - x_parent: np.ndarray, - x_sample: np.ndarray, - w_sample: np.ndarray, - rng: np.random.Generator, - n_parent: int = 100_000, - n_sample: int = 100_000, -) -> float: - """Compute a weighted 1D Kolmogorov-Smirnov distance. 
- - The parent distribution is unweighted; the sample distribution is - weighted. For speed, both are optionally downsampled before computing - the KS distance. - - Args: - x_parent: Parent data values (1D). - x_sample: Sample data values (1D). - w_sample: Weights for ``x_sample``, same length as ``x_sample``. - rng: NumPy random number generator instance. - n_parent: Maximum number of parent points to use. - n_sample: Maximum number of sample points to use. - - Returns: - The KS distance as a float. - """ - # Downsample parent. - if len(x_parent) > n_parent: - xp = x_parent[rng.choice(len(x_parent), n_parent, replace=False)] - else: - xp = x_parent - - # Downsample sample + weights. - if len(x_sample) > n_sample: - idx = rng.choice(len(x_sample), n_sample, replace=False) - xs = x_sample[idx] - ws = w_sample[idx] - else: - xs, ws = x_sample, w_sample - - # Sort and build CDFs. - xp = np.sort(xp) - order = np.argsort(xs) - xs = xs[order] - ws = np.clip(ws[order], 0, None) - ws_sum = ws.sum() - if ws_sum < 1e-15: - # All weights are effectively zero - return worst-case KS distance - return 1.0 - ws /= ws_sum - cdf_s = np.cumsum(ws) - - grid = np.unique(np.concatenate([xp, xs])) - cdf_p = np.searchsorted(xp, grid, side="right") / len(xp) - - idx = np.searchsorted(xs, grid, side="right") - 1 - idx = np.clip(idx, -1, len(xs) - 1) - cdf_s_at_grid = np.where(idx >= 0, cdf_s[idx], 0.0) - - return float(np.max(np.abs(cdf_p - cdf_s_at_grid))) - - -def whiten_by_parent( - Xp: np.ndarray, - Xs: np.ndarray, -) -> Tuple[np.ndarray, np.ndarray]: - """Apply parent-covariance whitening (Mahalanobis transform). - - The parent distribution defines the mean and covariance used to whiten - both the parent and the sample data: - - .. math:: - - X' = (X - \\mu_\\text{parent}) L^{-1}, - - where ``L`` is the Cholesky factor of the parent covariance. - - Args: - Xp: Parent data array of shape ``(N, D)``. - Xs: Sample data array of shape ``(M, D)``. - - Returns: - Tuple ``(Xp_whitened, Xs_whitened)`` with the same shapes as inputs. - """ - mu = Xp.mean(axis=0) - Xm = Xp - mu - Sigma = np.cov(Xm, rowvar=False) - d = Sigma.shape[0] - # Tiny ridge for numerical stability. - Sigma = Sigma + (1e-12 * np.trace(Sigma) / max(d, 1)) * np.eye(d) - L = np.linalg.cholesky(Sigma) - Linv = np.linalg.inv(L) - return (Xp - mu) @ Linv, (Xs - mu) @ Linv - - -def filter_and_transform_overdensities( - overdens: Dict[float, np.ndarray], -) -> Tuple[Dict[float, np.ndarray], np.ndarray]: - """Filter out -1 values and apply log10(1 + delta) transform. - - Grid points with overdensity = -1 indicate no particles nearby. - These must be removed before applying the log transform to avoid NaN. - - Args: - overdens: Mapping from scale to raw overdensity arrays (may contain -1). - - Returns: - Tuple (transformed_overdens, mask) where: - - transformed_overdens: log10(1 + delta) for valid points - - mask: Boolean array indicating which rows were kept - - Raises: - ValueError: If arrays have mismatched lengths. - """ - if not overdens: - return overdens, np.array([], dtype=bool) - - first = next(iter(overdens.values())) - length = len(first) - - # Build mask: keep rows where ALL scales have delta > -1 - mask = np.ones(length, dtype=bool) - for scale, arr in overdens.items(): - if len(arr) != length: - raise ValueError( - "All arrays must have the same length; found mismatched lengths." 
- ) - mask &= arr > -1.0 - - n_dropped = length - np.sum(mask) - if n_dropped > 0: - print( - f"[Filter] Dropped {n_dropped:,}/{length:,} " - f"({100.0 * n_dropped / length:.2f}%) grid points with delta <= -1 " - f"(no particles nearby)." - ) - - # Apply log transform to filtered data - transformed = {scale: np.log10(arr[mask] + 1.0) for scale, arr in overdens.items()} - - return transformed, mask - - -def drop_nan_rows_per_scale( - samples: Dict[float, np.ndarray], -) -> Dict[float, np.ndarray]: - """Remove entries that are NaN in any dimension for a population. - - For a dictionary of 1D arrays (one per smoothing scale), this function - removes indices where *any* scale is NaN, ensuring consistent length - across all scales. - - Args: - samples: Mapping from scale (e.g., in Mpc/h) to 1D NumPy arrays of - equal length. - - Returns: - New dictionary with NaN-containing entries removed consistently - across all scales. - - Raises: - ValueError: If the arrays in ``samples`` do not all have the same - length. - """ - if not samples: - return samples - - first = next(iter(samples.values())) - length = len(first) - mask = np.ones(length, dtype=bool) - - for arr in samples.values(): - if len(arr) != length: - raise ValueError( - "All arrays in a population must have the same length; " - "found mismatched lengths." - ) - mask &= ~np.isnan(arr) - - n_dropped = length - np.sum(mask) - if n_dropped > 0: - print( - f"[NaN] Dropped {n_dropped:,}/{length:,} " - f"({100.0 * n_dropped / length:.2f}%) entries containing NaN." - ) - - return {scale: arr[mask] for scale, arr in samples.items()} - - -# --------------------------------------------------------------------- -# Core k-NN reweighter with auto-tuning -# --------------------------------------------------------------------- - - -@dataclass -class KNNReweighterConfig: - """Configuration for :class:`KNNReweighter`. - - Attributes: - k: Default number of neighbors for k-NN if auto-tuning is disabled. - autotune_k: Candidate k values to try when auto-tuning. If None, - a single fixed k is used. - ess_floor: Minimum ESS fraction (ESS / M) for a candidate k to be - considered "feasible" during auto-tuning. - preprocess: Preprocessing mode: ``"whiten"``, ``"standardize"``, or - ``"none"``. - standardize_by_parent: If True and ``preprocess == "standardize"``, - use parent statistics only; otherwise, use pooled stats. - algorithm: Nearest-neighbors algorithm (as in scikit-learn). - metric: Distance metric (e.g., ``"minkowski"``, ``"euclidean"``). - leaf_size: Leaf size for tree-based nearest-neighbor algorithms. - tau: Temperature exponent applied to the weights (``w := w**tau``). - Use tau < 1 to reduce weight variance (smoother, higher ESS). - Use tau > 1 to increase contrast (sharper weights, lower ESS). - Default tau=1 preserves density ratio exactly. - clip_range: Tuple ``(min, max)`` for clipping weights. - ks_eval_parent: Maximum number of parent points for KS evaluation. - ks_eval_sample: Maximum number of sample points for KS evaluation. - random_state: Random seed for diagnostics and KS subsampling. 
- """ - - # k / tuning - k: int = 50 - autotune_k: Optional[List[int]] = None - ess_floor: float = 0.20 # as fraction of M - - # preprocessing - preprocess: str = "whiten" # whiten | standardize | none - standardize_by_parent: bool = True # only used if preprocess=standardize - - # neighbors - algorithm: str = "auto" - metric: str = "minkowski" - leaf_size: int = 40 - - # stabilization - tau: float = 1.0 # tau < 1: reduce variance, tau > 1: increase contrast - clip_range: Tuple[float, float] = (0.01, 50.0) - - # diagnostics - ks_eval_parent: int = 100_000 - ks_eval_sample: int = 100_000 - random_state: int = 42 - - -class KNNReweighter: - """k-NN density-ratio reweighter for multi-scale environments.""" - - def __init__(self, cfg: KNNReweighterConfig): - """Initialize the reweighter. - - Args: - cfg: Configuration object controlling k-NN and diagnostics. - """ - self.cfg = cfg - self.scales: List[float] = [] - self.weights_: Optional[np.ndarray] = None - self.info_: Optional[Dict] = None - - def fit( - self, - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - ) -> Tuple[np.ndarray, Dict]: - """Compute weights that map the sample to the parent distribution. - - The features are constructed by column-stacking the overdensities at - different smoothing scales. - - Args: - parent_samples: Mapping from scale to parent overdensity array - (1D, same length per scale). - sample_samples: Mapping from scale to sample overdensity array - (1D, same length per scale). - - Returns: - Tuple ``(weights, info)``: - * ``weights``: 1D array of length M with reweighting factors. - * ``info``: Dictionary of diagnostics and configuration values. - - Raises: - ValueError: If the parent and sample do not share the same set - of scales. - """ - if set(parent_samples.keys()) != set(sample_samples.keys()): - raise ValueError( - "parent_samples and sample_samples must have identical keys " - "(same set of smoothing scales)." - ) - - self.scales = sorted(parent_samples.keys()) - Xp = np.column_stack([parent_samples[s] for s in self.scales]) # [N, D] - Xs = np.column_stack([sample_samples[s] for s in self.scales]) # [M, D] - N, d = Xp.shape - M = Xs.shape[0] - rng = np.random.default_rng(self.cfg.random_state) - - print(f"[kNN] {d}D features | Parent N={N:,} | Sample M={M:,}") - Xp_prep, Xs_prep = self._preprocess(Xp, Xs) - - # Solve for weights, optionally auto-tuning k. - if self.cfg.autotune_k: - best_k, w, tuning_info = self._autotune_k( - Xp_prep, Xs_prep, Xp, Xs, N, M, d, rng - ) - k_used = best_k - else: - k_used = self.cfg.k - w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k_used) - tuning_info = None - - # Basic diagnostics: ESS, clipping, KS. 
- ess = effective_sample_size(w) - clip_frac = float( - np.mean( - (w <= self.cfg.clip_range[0] + 1e-15) - | (w >= self.cfg.clip_range[1] - 1e-15) - ) - ) - - ks_vals = [ - weighted_ks( - Xp[:, j], - Xs[:, j], - w, - rng, - n_parent=min(self.cfg.ks_eval_parent, N), - n_sample=min(self.cfg.ks_eval_sample, M), - ) - for j in range(d) - ] - ks_mean = float(np.mean(ks_vals)) - - info = { - "method": "knn_density_ratio", - "k": int(k_used), - "ks_mean": ks_mean, - "ks_per_dim": ks_vals, - "preprocess": self.cfg.preprocess, - "standardize_by_parent": self.cfg.standardize_by_parent, - "scales": self.scales, - "parent_samples": N, - "sample_samples": M, - "algorithm": self.cfg.algorithm, - "metric": self.cfg.metric, - "leaf_size": self.cfg.leaf_size, - "tau": self.cfg.tau, - "clip_range": self.cfg.clip_range, - "clip_fraction": clip_frac, - "ess": float(ess), - "mean_weight": float(np.mean(w)), - "std_weight": float(np.std(w)), - "min_weight": float(np.min(w)), - "max_weight": float(np.max(w)), - "tuning": tuning_info, - } - - print( - f"[kNN] k={k_used} | KS(mean)={ks_mean:.4f} | " - f"ESS={ess:,.0f}/{M:,} | clipped={100 * clip_frac:.1f}%" - ) - - self.weights_ = w - self.info_ = info - return w, info - - # --------------------- internals --------------------- - - def _preprocess( - self, - Xp: np.ndarray, - Xs: np.ndarray, - ) -> Tuple[np.ndarray, np.ndarray]: - """Apply the requested preprocessing to parent and sample data. - - Args: - Xp: Parent data array of shape ``(N, D)``. - Xs: Sample data array of shape ``(M, D)``. - - Returns: - Tuple ``(Xp_processed, Xs_processed)``. - - Raises: - ValueError: If ``preprocess`` is not one of the supported modes. - """ - mode = self.cfg.preprocess.lower() - - if mode == "whiten": - print("[kNN] Preprocess: parent-covariance whitening.") - return whiten_by_parent(Xp, Xs) - - if mode == "standardize": - if self.cfg.standardize_by_parent: - mu = Xp.mean(axis=0) - sd = Xp.std(axis=0) + 1e-12 - src = "parent" - else: - both = np.vstack([Xp, Xs]) - mu = both.mean(axis=0) - sd = both.std(axis=0) + 1e-12 - src = "pooled" - print(f"[kNN] Preprocess: z-score (by {src} stats).") - return (Xp - mu) / sd, (Xs - mu) / sd - - if mode == "none": - print("[kNN] Preprocess: none.") - return Xp, Xs - - raise ValueError('preprocess must be "whiten", "standardize", or "none"') - - def _knn_dists( - self, - X_query: np.ndarray, - X_ref: np.ndarray, - *, - k: int, - ) -> np.ndarray: - """Compute distances to the k nearest neighbors. - - Args: - X_query: Query points of shape ``(Q, D)``. - X_ref: Reference points of shape ``(R, D)``. - k: Number of neighbors to retrieve. - - Returns: - Array of shape ``(Q, k)`` with distances to the k nearest neighbors. - """ - nn = NearestNeighbors( - n_neighbors=k, - algorithm=self.cfg.algorithm, - leaf_size=self.cfg.leaf_size, - metric=self.cfg.metric, - ).fit(X_ref) - return nn.kneighbors(X_query, return_distance=True)[0] - - def _weights_for_k( - self, - Xp_prep: np.ndarray, - Xs_prep: np.ndarray, - N: int, - M: int, - d: int, - k: int, - ) -> np.ndarray: - """Compute stabilized weights for a single k. - - The core k-NN density-ratio formula is: - - .. math:: - - w_i \\propto \\frac{M}{N}\\left(\\frac{r_\\text{sample, i}} - {r_\\text{parent, i}}\\right)^d, - - with optional temperature exponent, clipping, and renormalization. - - Args: - Xp_prep: Preprocessed parent data, shape ``(N, D)``. - Xs_prep: Preprocessed sample data, shape ``(M, D)``. - N: Number of parent points. - M: Number of sample points. 
- d: Dimensionality of the feature space. - k: Number of neighbors. - - Returns: - One-dimensional array of weights of length M. - - Raises: - ValueError: If k is out of valid range. - """ - if k < 1: - raise ValueError(f"k must be >= 1, got k={k}") - if k >= N: - raise ValueError(f"k must be < N (parent size), got k={k}, N={N}") - if k + 1 > M: - raise ValueError(f"k+1 must be <= M (sample size), got k={k}, M={M}") - - dist_p = self._knn_dists(Xs_prep, Xp_prep, k=k) # [M, k] - r_parent = dist_p[:, -1] - - dist_s = self._knn_dists(Xs_prep, Xs_prep, k=k + 1) # include self - r_sample = dist_s[:, -1] - - eps = 1e-12 - w = (M / N) * ((r_sample + eps) / (r_parent + eps)) ** d - - if self.cfg.tau != 1.0: - w = np.power(w, self.cfg.tau) - - w = np.clip(w, self.cfg.clip_range[0], self.cfg.clip_range[1]) - w *= len(w) / (w.sum() + 1e-18) - return w - - def _autotune_k( - self, - Xp_prep: np.ndarray, - Xs_prep: np.ndarray, - Xp_raw: np.ndarray, - Xs_raw: np.ndarray, - N: int, - M: int, - d: int, - rng: np.random.Generator, - ) -> Tuple[int, np.ndarray, Dict]: - """Grid-search over candidate k values with an ESS floor. - - Args: - Xp_prep: Preprocessed parent data, shape ``(N, D)``. - Xs_prep: Preprocessed sample data, shape ``(M, D)``. - Xp_raw: Raw parent data for KS diagnostics, shape ``(N, D)``. - Xs_raw: Raw sample data for KS diagnostics, shape ``(M, D)``. - N: Number of parent points. - M: Number of sample points. - d: Dimensionality of the feature space. - rng: NumPy random generator for KS subsampling. - - Returns: - Tuple ``(best_k, best_weights, summary_dict)`` where - ``summary_dict`` contains per-k diagnostics. - """ - candidates = sorted( - {int(k) for k in self.cfg.autotune_k if 1 <= int(k) < min(N, M)} - ) - if not candidates: - raise ValueError( - "autotune_k is non-empty but no valid candidates remain. " - "Check that k < min(N, M)." - ) - - print( - f"[kNN] Auto-tune k over {candidates} with ESS floor " - f"{int(self.cfg.ess_floor * 100)}%..." - ) - - results = [] - best_tuple = None - best_k = None - best_w = None - - for k in candidates: - w = self._weights_for_k(Xp_prep, Xs_prep, N, M, d, k) - ess = effective_sample_size(w) - - ks_vals = [ - weighted_ks( - Xp_raw[:, j], - Xs_raw[:, j], - w, - rng, - n_parent=min(self.cfg.ks_eval_parent, N), - n_sample=min(self.cfg.ks_eval_sample, M), - ) - for j in range(d) - ] - ks_mean = float(np.mean(ks_vals)) - feasible = ess >= self.cfg.ess_floor * M - results.append( - {"k": k, "ess": float(ess), "ks_mean": ks_mean, "feasible": feasible} - ) - - # Selection criteria (lexicographic ordering): - # 1. Prefer feasible (ESS >= floor) over infeasible - # 2. Among same feasibility, prefer lower KS distance - # 3. Among same KS, prefer higher ESS - # 4. 
Among same ESS, prefer smaller k (simpler model) - selection_key = (not feasible, ks_mean, -ess, k) - if (best_tuple is None) or (selection_key < best_tuple): - best_tuple, best_k, best_w = selection_key, k, w - - print( - "[kNN] Tuning results: " - + ", ".join( - f"k={r['k']}: KS={r['ks_mean']:.3f}, ESS={int(r['ess'])}" - + ("" if r["feasible"] else " [infeasible]") - for r in results - ) - ) - - summary = { - "candidates": candidates, - "results": results, - "best_k": best_k, - "ess_floor": self.cfg.ess_floor, - } - return best_k, best_w, summary - - -# --------------------------------------------------------------------- -# HDF5 loading -# --------------------------------------------------------------------- - - -def load_multiscale_overdensities( - parent_path: str, - sample_path: str, -) -> Tuple[Dict[float, np.ndarray], Dict[float, np.ndarray]]: - """Load raw overdensities from parent and sample HDF5 files. - - The set of kernels is defined by the *sample* file. For every group - ``Grids/Kernel_N`` present in the sample file: - - * The same group must exist in the parent file. - * Each group must contain a dataset ``GridPointOverDensities``. - * The kernel radius is read from the group's ``KernelRadius`` attribute. - * Raw overdensity values are returned (NOT log-transformed). - - Note: Values of -1 indicate grid points with no particles nearby. - Use ``filter_and_transform_overdensities()`` to remove these and - apply the log10(1 + delta) transform. - - The returned dictionaries map the smoothing scale (from the KernelRadius - attribute) to a 1D NumPy array of raw overdensities. - - Args: - parent_path: Path to the parent/global population HDF5 file. - sample_path: Path to the sample/observed population HDF5 file. - - Returns: - Tuple ``(parent_data, sample_data)``, where each is a mapping from - scale to 1D NumPy arrays. - - Raises: - KeyError: If required groups/datasets are missing. - ValueError: If no kernels are found; if KernelRadius attribute is - missing; or if the parent is missing kernels found in the sample - file. - """ - parent_data: Dict[float, np.ndarray] = {} - sample_data: Dict[float, np.ndarray] = {} - - with ( - h5py.File(sample_path, "r") as f_sample, - h5py.File(parent_path, "r") as f_parent, - ): - if "Grids" not in f_sample: - raise KeyError("Sample file is missing group 'Grids'.") - if "Grids" not in f_parent: - raise KeyError("Parent file is missing group 'Grids'.") - - sample_grids = f_sample["Grids"] - parent_grids = f_parent["Grids"] - - kernel_keys = [k for k in sample_grids.keys() if k.startswith("Kernel_")] - if not kernel_keys: - raise ValueError( - "No kernel groups found in sample file under 'Grids'. " - "Expected groups named like 'Kernel_0', 'Kernel_1', etc." - ) - - missing = [k for k in kernel_keys if k not in parent_grids] - if missing: - raise ValueError( - "Parent file is missing the following kernel groups " - f"present in the sample file: {', '.join(missing)}" - ) - - print( - f"[HDF5] Found {len(kernel_keys)} kernels in sample file: " - + ", ".join(kernel_keys) - ) - - for key in kernel_keys: - # Read kernel radius from group attribute - try: - scale = float(sample_grids[key].attrs["KernelRadius"]) - except KeyError as exc: - raise ValueError( - f"Kernel group '{key}' is missing 'KernelRadius' attribute. " - "Cannot determine smoothing scale." - ) from exc - - try: - parent_ds = parent_grids[key]["GridPointOverDensities"][...] - sample_ds = sample_grids[key]["GridPointOverDensities"][...] 
- except KeyError as exc: - raise KeyError( - f"Missing 'GridPointOverDensities' for kernel '{key}'." - ) from exc - - # Store raw overdensities (will be filtered and transformed later) - # Note: -1 values indicate grid points with no particles nearby - parent_data[scale] = np.asarray(parent_ds, dtype=float) - sample_data[scale] = np.asarray(sample_ds, dtype=float) - - return parent_data, sample_data - - -# --------------------------------------------------------------------- -# Plotting and simple diagnostics -# --------------------------------------------------------------------- - - -def plot_diagnostics( - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - weights: np.ndarray, - info: Dict, - save_path: Optional[str] = None, -) -> plt.Figure: - """Plot simple diagnostics: 1D distributions and weight histogram. - - The plot has three panels: - - * Parent vs sample vs weighted sample for the first scale. - * Parent vs sample vs weighted sample for the second scale (if present). - * Histogram of the weights. - - Args: - parent_samples: Mapping from scale to parent overdensity array. - sample_samples: Mapping from scale to sample overdensity array. - weights: Weights for the sample population. - info: Diagnostics dictionary returned by :meth:`KNNReweighter.fit`. - save_path: Optional path to save the diagnostic PNG. - - Returns: - The Matplotlib Figure instance. - """ - scales = sorted(parent_samples.keys()) - fig, axes = plt.subplots(1, 3, figsize=(15, 4)) - axes = axes.flatten() - - def _plot_1d(ax, scale: float) -> None: - parent = parent_samples[scale] - sample = sample_samples[scale] - bins = np.linspace( - min(parent.min(), sample.min()), max(parent.max(), sample.max()), 50 - ) - ax.hist( - parent, - bins=bins, - density=True, - label="Parent", - histtype="step", - lw=2, - ) - ax.hist( - sample, - bins=bins, - density=True, - label="Sample (raw)", - histtype="step", - lw=2, - ) - ax.hist( - sample, - bins=bins, - weights=weights, - density=True, - label="Sample (weighted)", - histtype="step", - lw=2, - ) - ax.set_xlabel(f"δ (R={scale} Mpc/h)") - ax.set_ylabel("PDF") - ax.set_title(f"Scale {scale} Mpc/h") - ax.legend() - ax.grid(True, alpha=0.3) - - # First scale. - _plot_1d(axes[0], scales[0]) - - # Second scale (if available). - if len(scales) > 1: - _plot_1d(axes[1], scales[1]) - else: - axes[1].axis("off") - - # Weight histogram. - axw = axes[2] - axw.hist(weights, bins=60, alpha=0.85, edgecolor="black") - axw.axvline( - np.mean(weights), - color="red", - linestyle="--", - lw=2, - label=f"Mean={np.mean(weights):.3f}", - ) - axw.set_xlabel("Weight") - axw.set_ylabel("Count") - ess = int(info["ess"]) - M = info["sample_samples"] - ks_mean = info["ks_mean"] - axw.set_title(f"w distribution | ESS={ess}/{M:,}, KS(mean)={ks_mean:.3f}") - axw.legend() - axw.grid(True, alpha=0.3) - - plt.tight_layout() - if save_path: - plt.savefig(save_path, dpi=300, bbox_inches="tight") - print(f"[plot] Saved: {save_path}") - return fig - - -# --------------------------------------------------------------------- -# Pipeline -# --------------------------------------------------------------------- - - -def run_full_analysis( - parent_samples: Dict[float, np.ndarray], - sample_samples: Dict[float, np.ndarray], - show_plots: bool = True, - save_path: Optional[str] = None, - **knn_kwargs, -) -> Tuple[np.ndarray, Dict]: - """Run the full k-NN reweighting and simple diagnostics. - - Args: - parent_samples: Mapping from scale to parent overdensity array. 
- sample_samples: Mapping from scale to sample overdensity array. - show_plots: If True, display the diagnostic plot. - save_path: Optional path to save the diagnostic PNG. - **knn_kwargs: Keyword arguments used to configure - :class:`KNNReweighterConfig`. - - Returns: - Tuple ``(weights, info)``: - * ``weights``: 1D array of weights for the sample population. - * ``info``: Diagnostics dictionary from - :meth:`KNNReweighter.fit`. - """ - print("=== Multi-Scale Environment Weighting (k-NN, auto-k) ===") - - cfg = KNNReweighterConfig(**knn_kwargs) - rw = KNNReweighter(cfg) - - weights, info = rw.fit(parent_samples, sample_samples) - - t = info.get("tuning") - tune_str = "" - if t: - tune_str = ( - f" | auto-k over {t['candidates']} " - f"-> best={t['best_k']} (ESS floor {int(t['ess_floor'] * 100)}%)" - ) - - print( - "\n[Summary] " - f"k={info['k']}, KS(mean)={info['ks_mean']:.4f}, " - f"ESS={int(info['ess'])}/{info['sample_samples']}, " - f"clip={info['clip_range']}, " - f"clipped={100 * info['clip_fraction']:.1f}%" - f"{tune_str}" - ) - - if show_plots: - print("\n[Plot] Rendering diagnostics...") - _ = plot_diagnostics(parent_samples, sample_samples, weights, info, save_path) - plt.show() - - return weights, info - - -# --------------------------------------------------------------------- -# __main__ (argparse) -# --------------------------------------------------------------------- - - -def _build_arg_parser() -> argparse.ArgumentParser: - """Create the command-line argument parser. - - Returns: - Configured :class:`argparse.ArgumentParser` instance. - """ - parser = argparse.ArgumentParser( - description=( - "k-NN density-ratio weighting for multi-scale environments.\n\n" - "The sample file defines which Grids/Kernel_* groups are used; " - "the same kernels must exist in the parent file." - ) - ) - - # Required I/O. - parser.add_argument( - "--parent", - type=str, - required=True, - help="Path to parent/global population HDF5 file.", - ) - parser.add_argument( - "--sample", - type=str, - required=True, - help="Path to sample/observed population HDF5 file.", - ) - - # k / auto-tuning. - parser.add_argument( - "--k", - type=int, - default=50, - help="Default number of neighbors (used if --autotune_k is empty).", - ) - parser.add_argument( - "--autotune_k", - type=str, - default="", - help=( - "Comma-separated candidate k values for auto-tuning, " - "e.g. '35,50,80,100'. If empty, k is used directly." - ), - ) - parser.add_argument( - "--ess_floor", - type=float, - default=0.20, - help="Minimum ESS fraction (ESS / M) for a candidate k to be feasible.", - ) - - # Preprocessing. - parser.add_argument( - "--preprocess", - choices=["whiten", "standardize", "none"], - default="whiten", - help="Feature preprocessing mode.", - ) - parser.add_argument( - "--standardize_by_parent", - action="store_true", - help="If preprocess=standardize, use parent stats (default True).", - ) - parser.add_argument( - "--standardize_by_pooled", - dest="standardize_by_parent", - action="store_false", - help="If preprocess=standardize, use pooled parent+sample stats.", - ) - parser.set_defaults(standardize_by_parent=True) - - # Neighbors. 
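# (Illustrative aside, not from the original file: the three flags below
#  are passed straight through to scikit-learn. A minimal sketch of the
#  equivalent direct query, assuming whitened arrays Xp_prep and Xs_prep
#  as built inside KNNReweighter:
#
#      from sklearn.neighbors import NearestNeighbors
#
#      nn = NearestNeighbors(
#          n_neighbors=50, algorithm="auto", leaf_size=40, metric="minkowski"
#      ).fit(Xp_prep)
#      r_parent = nn.kneighbors(Xs_prep)[0][:, -1]  # k-th neighbour distance
#
#  With metric="minkowski" and scikit-learn's default p=2 this is plain
#  Euclidean distance in the whitened feature space.)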
- parser.add_argument( - "--metric", - type=str, - default="minkowski", - help="Distance metric (e.g., 'minkowski', 'euclidean').", - ) - parser.add_argument( - "--algorithm", - type=str, - default="auto", - help="NearestNeighbors algorithm (scikit-learn).", - ) - parser.add_argument( - "--leaf_size", - type=int, - default=40, - help="Leaf size for tree-based nearest-neighbor algorithms.", - ) - - # Stabilization. - parser.add_argument( - "--tau", - type=float, - default=1.0, - help="Temperature exponent (weights := weights ** tau).", - ) - parser.add_argument( - "--clip_min", - type=float, - default=0.01, - help="Lower clip bound for weights.", - ) - parser.add_argument( - "--clip_max", - type=float, - default=50.0, - help="Upper clip bound for weights.", - ) - - # Misc / plotting. - parser.add_argument( - "--random_state", - type=int, - default=42, - help="Random seed for diagnostics and KS subsampling.", - ) - parser.add_argument( - "--save", - type=str, - default=None, - help="Path to save diagnostic PNG.", - ) - parser.add_argument( - "--no_plots", - action="store_true", - help="Disable plotting.", - ) - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save computed weights as .npy file.", - ) - - return parser - - -if __name__ == "__main__": - parser = _build_arg_parser() - args = parser.parse_args() - - # Load data from the two HDF5 files. - parent_data, sample_data = load_multiscale_overdensities( - parent_path=args.parent, - sample_path=args.sample, - ) - - # Remove NaNs consistently within each population. - parent_data = drop_nan_rows_per_scale(parent_data) - sample_data = drop_nan_rows_per_scale(sample_data) - - # Build kwargs for run_full_analysis. - auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] - kwargs = dict( - k=args.k, - autotune_k=auto_list if auto_list else None, - ess_floor=args.ess_floor, - preprocess=args.preprocess, - standardize_by_parent=args.standardize_by_parent, - algorithm=args.algorithm, - metric=args.metric, - leaf_size=args.leaf_size, - tau=args.tau, - clip_range=(args.clip_min, args.clip_max), - random_state=args.random_state, - ) - - # Run pipeline. - weights, info = run_full_analysis( - parent_data, - sample_data, - show_plots=not args.no_plots, - save_path=args.save, - **kwargs, - ) - - # Save weights to file if requested. - if args.output: - np.save(args.output, weights) - print(f"[Output] Saved weights to: {args.output}") - - print(f"\n[Done] Generated {len(weights)} weights.") diff --git a/weighting/test_hmf_weighting.py b/weighting/test_hmf_weighting.py deleted file mode 100644 index 574bd2a..0000000 --- a/weighting/test_hmf_weighting.py +++ /dev/null @@ -1,648 +0,0 @@ -"""Test k-NN weighting on halo mass functions. - -This script evaluates how well k-NN density-ratio weighting can reproduce -the true halo mass function (HMF) from a small subsample. - -Workflow: -1. Load multi-scale overdensity features from parent (global) and sample (full box) HDF5 files -2. Load halo masses for the full sample population (the truth) -3. Subsample N halos randomly from the full sample -4. Apply k-NN weighting to the subsample using parent environment as reference -5. Compare: Truth (full sample) vs Subsample (raw) vs Subsample (weighted) - -The goal is to test whether a small weighted subsample can reproduce the -true HMF of the full sample. 
- -The script produces diagnostic plots showing: -- HMF comparison: Truth vs raw subsample vs weighted subsample -- Environment distributions at different scales -- Weight distribution statistics - -Example -------- - python test_hmf_weighting.py \\ - --parent parent_grids.hdf5 \\ - --sample sample_grids.hdf5 \\ - --masses sample_halo_masses.txt \\ - --subsample 10000 \\ - --autotune_k 35,50,80,100 \\ - --output weighted_masses.npy \\ - --save hmf_diagnostics.png -""" - -from __future__ import annotations - -import argparse -import sys -from pathlib import Path -from typing import Dict, Optional, Tuple - -import h5py -import matplotlib.pyplot as plt -import numpy as np - -# Import the k-NN weighting machinery -from knn_weighter import ( - KNNReweighter, - KNNReweighterConfig, - drop_nan_rows_per_scale, - effective_sample_size, - filter_and_transform_overdensities, - load_multiscale_overdensities, -) - - -def load_halo_masses(mass_file: str) -> np.ndarray: - """Load halo masses from a text file. - - Args: - mass_file: Path to text file containing one mass per line. - - Returns: - 1D array of halo masses. - - Raises: - ValueError: If file is empty or contains invalid data. - """ - masses = np.loadtxt(mass_file) - if masses.ndim != 1: - raise ValueError( - f"Mass file should contain a single column, got shape {masses.shape}" - ) - if len(masses) == 0: - raise ValueError("Mass file is empty") - - print(f"[Masses] Loaded {len(masses):,} halo masses from {mass_file}") - print(f"[Masses] Mass range: {masses.min():.2e} - {masses.max():.2e}") - - return masses - - -def subsample_data( - sample_overdens: Dict[float, np.ndarray], - masses: np.ndarray, - n_subsample: int, - random_state: int = 42, -) -> Tuple[Dict[float, np.ndarray], np.ndarray, np.ndarray]: - """Randomly subsample halos and their environment features. - - Args: - sample_overdens: Mapping from scale to sample overdensity arrays. - masses: Halo masses (same length as overdensity arrays). - n_subsample: Number of halos to randomly select. - random_state: Random seed for reproducibility. - - Returns: - Tuple (subsampled_overdens, subsampled_masses, indices). - - Raises: - ValueError: If n_subsample > sample size. - """ - n_total = len(masses) - if n_subsample > n_total: - raise ValueError( - f"Requested subsample size {n_subsample:,} exceeds " - f"sample size {n_total:,}" - ) - - rng = np.random.default_rng(random_state) - indices = rng.choice(n_total, size=n_subsample, replace=False) - - subsampled_overdens = { - scale: arr[indices] for scale, arr in sample_overdens.items() - } - subsampled_masses = masses[indices] - - print( - f"[Subsample] Randomly selected {n_subsample:,}/{n_total:,} halos " - f"(seed={random_state})" - ) - print( - f"[Subsample] Subsampled mass range: " - f"{subsampled_masses.min():.2e} - {subsampled_masses.max():.2e}" - ) - - return subsampled_overdens, subsampled_masses, indices - - -def compute_hmf( - masses: np.ndarray, - weights: Optional[np.ndarray] = None, - n_bins: int = 20, - mass_range: Optional[Tuple[float, float]] = None, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Compute halo mass function (differential number density). - - Args: - masses: Halo masses. - weights: Optional weights for each halo. - n_bins: Number of mass bins. - mass_range: Optional (min, max) mass range. If None, uses data range. - - Returns: - Tuple (bin_centers, dn_dlogM, bin_edges) where dn_dlogM is the - differential number density per logarithmic mass bin. 
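Example (illustrative sketch, not from the original file)::

    rng = np.random.default_rng(0)
    masses = 10 ** rng.uniform(10.0, 14.0, 1000)
    centers, dn_dlogM, edges = compute_hmf(masses, n_bins=10)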
- """ - if weights is None: - weights = np.ones_like(masses) - - if mass_range is None: - mass_range = (masses.min(), masses.max()) - - log_masses = np.log10(masses) - log_range = (np.log10(mass_range[0]), np.log10(mass_range[1])) - - counts, bin_edges = np.histogram( - log_masses, bins=n_bins, range=log_range, weights=weights - ) - bin_widths = np.diff(bin_edges) - bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1]) - - # Differential number density: dn/dlog10(M) - dn_dlogM = counts / bin_widths - # Normalize to total number - dn_dlogM *= len(masses) / dn_dlogM.sum() / bin_widths.mean() - - return bin_centers, dn_dlogM, bin_edges - - -def plot_hmf_comparison( - truth_masses: np.ndarray, - subsample_masses: np.ndarray, - weights: np.ndarray, - info: Dict, - parent_overdens: Dict[float, np.ndarray], - subsample_overdens: Dict[float, np.ndarray], - save_path: Optional[str] = None, -) -> plt.Figure: - """Plot HMF comparison and diagnostics. - - Creates a multi-panel figure with: - - HMF comparison (truth vs raw subsample vs weighted subsample) - - Environment distributions at two scales - - Weight distribution histogram - - Args: - truth_masses: True halo masses (full sample). - subsample_masses: Subsampled halo masses. - weights: Weights for subsampled halos. - info: Diagnostics from KNNReweighter. - parent_overdens: Parent overdensity features (global reference). - subsample_overdens: Subsample overdensity features. - save_path: Optional path to save figure. - - Returns: - Figure instance. - """ - fig = plt.figure(figsize=(18, 10)) - gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3) - - scales = sorted(parent_overdens.keys()) - - # Determine common mass range for HMF - all_masses = np.concatenate([truth_masses, subsample_masses]) - mass_range = (all_masses.min(), all_masses.max()) - - # --- Panel 1: Halo Mass Function --- - ax_hmf = fig.add_subplot(gs[0, :]) - - # Compute HMFs - bc_truth, hmf_truth, _ = compute_hmf( - truth_masses, n_bins=25, mass_range=mass_range - ) - bc_subsample, hmf_subsample, _ = compute_hmf( - subsample_masses, n_bins=25, mass_range=mass_range - ) - bc_weighted, hmf_weighted, _ = compute_hmf( - subsample_masses, weights=weights, n_bins=25, mass_range=mass_range - ) - - ax_hmf.plot( - bc_truth, hmf_truth, "k-", lw=2.5, label=f"Truth (full sample, N={len(truth_masses):,})" - ) - ax_hmf.plot( - bc_subsample, - hmf_subsample, - "r--", - lw=2, - alpha=0.7, - label=f"Subsample (raw, N={len(subsample_masses):,})", - ) - ax_hmf.plot( - bc_weighted, - hmf_weighted, - "b-", - lw=2, - alpha=0.8, - label=f"Subsample (weighted, ESS={int(info['ess'])}/{len(subsample_masses):,})", - ) - - ax_hmf.set_xlabel(r"$\log_{10}(M_{\rm halo} \, [M_\odot])$", fontsize=13) - ax_hmf.set_ylabel(r"$dn / d\log_{10}M$ (normalized)", fontsize=13) - ax_hmf.set_title( - f"Halo Mass Function | k={info['k']}, KS(env)={info['ks_mean']:.3f}", - fontsize=14, - fontweight="bold", - ) - ax_hmf.legend(fontsize=11, loc="upper right") - ax_hmf.grid(True, alpha=0.3) - ax_hmf.set_yscale("log") - - # --- Panel 2: Environment at scale 0 --- - ax_env1 = fig.add_subplot(gs[1, 0]) - _plot_environment_dist( - ax_env1, parent_overdens[scales[0]], sample_overdens[scales[0]], weights, scales[0] - ) - - # --- Panel 3: Environment at scale 1 (if available) --- - ax_env2 = fig.add_subplot(gs[1, 1]) - if len(scales) > 1: - _plot_environment_dist( - ax_env2, - parent_overdens[scales[1]], - sample_overdens[scales[1]], - weights, - scales[1], - ) - else: - ax_env2.text( - 0.5, 0.5, "Only 1 scale available", ha="center", 
va="center", fontsize=12 - ) - ax_env2.axis("off") - - # --- Panel 4: Weight distribution --- - ax_weights = fig.add_subplot(gs[1, 2]) - ax_weights.hist(weights, bins=50, alpha=0.85, edgecolor="black", color="steelblue") - ax_weights.axvline( - np.mean(weights), - color="red", - linestyle="--", - lw=2, - label=f"Mean={np.mean(weights):.2f}", - ) - ax_weights.axvline( - np.median(weights), - color="orange", - linestyle=":", - lw=2, - label=f"Median={np.median(weights):.2f}", - ) - ax_weights.set_xlabel("Weight", fontsize=12) - ax_weights.set_ylabel("Count", fontsize=12) - ax_weights.set_title( - f"Weight Distribution\nESS={int(info['ess'])}/{len(weights):,}, " - f"clip={100*info['clip_fraction']:.1f}%", - fontsize=11, - ) - ax_weights.legend(fontsize=10) - ax_weights.grid(True, alpha=0.3) - - plt.suptitle( - "k-NN Halo Mass Function Weighting Test", - fontsize=16, - fontweight="bold", - y=0.995, - ) - - if save_path: - plt.savefig(save_path, dpi=300, bbox_inches="tight") - print(f"[Plot] Saved: {save_path}") - - return fig - - -def _plot_environment_dist( - ax: plt.Axes, - parent_env: np.ndarray, - subsample_env: np.ndarray, - weights: np.ndarray, - scale: float, -) -> None: - """Plot environment distribution at a single scale. - - Args: - ax: Matplotlib axes. - parent_env: Parent (global) overdensities. - subsample_env: Subsample overdensities. - weights: Subsample weights. - scale: Smoothing scale (Mpc/h). - """ - bins = np.linspace( - min(parent_env.min(), subsample_env.min()), - max(parent_env.max(), subsample_env.max()), - 40, - ) - - ax.hist( - parent_env, - bins=bins, - density=True, - alpha=0.6, - label="Parent (global)", - histtype="stepfilled", - color="gray", - ) - ax.hist( - subsample_env, - bins=bins, - density=True, - alpha=0.7, - label="Subsample (raw)", - histtype="step", - lw=2, - color="red", - ) - ax.hist( - subsample_env, - bins=bins, - weights=weights, - density=True, - alpha=0.8, - label="Subsample (weighted)", - histtype="step", - lw=2, - color="blue", - ) - - ax.set_xlabel(f"log10(1 + δ) at R={scale:.2f} Mpc/h", fontsize=11) - ax.set_ylabel("PDF", fontsize=11) - ax.set_title(f"Environment (scale {scale:.2f})", fontsize=11) - ax.legend(fontsize=9) - ax.grid(True, alpha=0.3) - - -def print_summary_statistics( - truth_masses: np.ndarray, - subsample_masses: np.ndarray, - weights: np.ndarray, - info: Dict, -) -> None: - """Print summary statistics for the weighting test. - - Args: - truth_masses: True halo masses (full sample). - subsample_masses: Subsampled halo masses. - weights: Subsample weights. - info: Diagnostics from KNNReweighter. 
- """ - print("\n" + "=" * 70) - print("SUMMARY STATISTICS") - print("=" * 70) - - print(f"\nPopulation Sizes:") - print(f" Truth (full sample): {len(truth_masses):,} halos") - print(f" Subsample: {len(subsample_masses):,} halos ({100*len(subsample_masses)/len(truth_masses):.1f}% of truth)") - print(f" ESS: {int(info['ess']):,} (effective sample size)") - print(f" ESS/N_subsample: {info['ess']/len(subsample_masses):.1%}") - - print(f"\nk-NN Configuration:") - print(f" k: {info['k']}") - print(f" Preprocess: {info['preprocess']}") - print(f" Metric: {info['metric']}") - print(f" Tau: {info['tau']}") - - print(f"\nEnvironment Matching (KS distance):") - print(f" Mean across scales: {info['ks_mean']:.4f}") - for i, (scale, ks) in enumerate(zip(info['scales'], info['ks_per_dim'])): - print(f" Scale {i} (R={scale:.2f}): {ks:.4f}") - - print(f"\nWeight Statistics:") - print(f" Mean: {info['mean_weight']:.3f}") - print(f" Std: {info['std_weight']:.3f}") - print(f" Min: {info['min_weight']:.3f}") - print(f" Max: {info['max_weight']:.3f}") - print(f" Clip frac: {100*info['clip_fraction']:.1f}%") - print(f" Clip range: {info['clip_range']}") - - # Mass statistics - print(f"\nMass Range (log10 Msun):") - print(f" Truth: {np.log10(truth_masses.min()):.2f} - {np.log10(truth_masses.max()):.2f}") - print(f" Subsample: {np.log10(subsample_masses.min()):.2f} - {np.log10(subsample_masses.max()):.2f}") - - # Weighted vs unweighted mean log mass - mean_log_m_truth = np.mean(np.log10(truth_masses)) - mean_log_m_subsample = np.mean(np.log10(subsample_masses)) - mean_log_m_weighted = np.average(np.log10(subsample_masses), weights=weights) - - print(f"\nMean log10(M) [Msun]:") - print(f" Truth: {mean_log_m_truth:.3f}") - print(f" Subsample: {mean_log_m_subsample:.3f} (bias: {mean_log_m_subsample - mean_log_m_truth:+.3f})") - print(f" Weighted: {mean_log_m_weighted:.3f} (bias: {mean_log_m_weighted - mean_log_m_truth:+.3f})") - - print("=" * 70 + "\n") - - -def main(): - """Main execution function.""" - parser = argparse.ArgumentParser( - description="Test k-NN weighting on halo mass functions", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - # Required inputs - parser.add_argument( - "--parent", - type=str, - required=True, - help="Path to parent/global population HDF5 file (gridded overdensities, global reference)", - ) - parser.add_argument( - "--sample", - type=str, - required=True, - help="Path to full sample HDF5 file (gridded overdensities, this is the truth)", - ) - parser.add_argument( - "--masses", - type=str, - required=True, - help="Path to text file with full sample halo masses (one per line, truth HMF)", - ) - - # Subsampling - parser.add_argument( - "--subsample", - type=int, - required=True, - help="Number of halos to randomly select from full sample for weighting test", - ) - - # k-NN parameters - parser.add_argument( - "--k", - type=int, - default=50, - help="Number of neighbors (default: 50)", - ) - parser.add_argument( - "--autotune_k", - type=str, - default="", - help="Comma-separated k values for auto-tuning (e.g., '35,50,80,100')", - ) - parser.add_argument( - "--ess_floor", - type=float, - default=0.20, - help="Minimum ESS fraction for auto-tuning (default: 0.20)", - ) - parser.add_argument( - "--preprocess", - choices=["whiten", "standardize", "none"], - default="whiten", - help="Feature preprocessing mode (default: whiten)", - ) - parser.add_argument( - "--tau", - type=float, - default=1.0, - help="Temperature parameter for weights (default: 1.0)", - ) - 
parser.add_argument( - "--clip_min", - type=float, - default=0.01, - help="Lower clip bound for weights (default: 0.01)", - ) - parser.add_argument( - "--clip_max", - type=float, - default=50.0, - help="Upper clip bound for weights (default: 50.0)", - ) - - # Output - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save weighted masses as .npy file", - ) - parser.add_argument( - "--save", - type=str, - default=None, - help="Path to save diagnostic plot", - ) - parser.add_argument( - "--no_plots", - action="store_true", - help="Disable interactive plotting", - ) - parser.add_argument( - "--random_state", - type=int, - default=42, - help="Random seed for subsampling and diagnostics (default: 42)", - ) - - args = parser.parse_args() - - # Load overdensity features - print("\n" + "=" * 70) - print("LOADING DATA") - print("=" * 70 + "\n") - - parent_overdens, sample_overdens = load_multiscale_overdensities( - parent_path=args.parent, - sample_path=args.sample, - ) - - # Load halo masses (for the full sample = truth) - truth_masses = load_halo_masses(args.masses) - - # Check consistency - first_scale = next(iter(sample_overdens.values())) - if len(first_scale) != len(truth_masses): - raise ValueError( - f"Mismatch: sample overdensities have {len(first_scale):,} entries " - f"but mass file has {len(truth_masses):,} entries" - ) - - # Filter and transform parent data (removes delta <= -1, applies log transform) - print("\n[Cleaning] Filtering and transforming parent overdensities...") - parent_overdens, _ = filter_and_transform_overdensities(parent_overdens) - - # Additional NaN check on parent (shouldn't be any after filtering) - parent_overdens = drop_nan_rows_per_scale(parent_overdens) - - # Filter and transform full sample data - need to synchronize with masses - print("[Cleaning] Filtering and transforming full sample overdensities...") - sample_overdens, mask = filter_and_transform_overdensities(sample_overdens) - - # Apply mask to masses to keep them synchronized - truth_masses = truth_masses[mask] - - # Additional NaN check on sample (shouldn't be any after filtering) - n_before = len(next(iter(sample_overdens.values()))) - sample_overdens = drop_nan_rows_per_scale(sample_overdens) - n_after = len(next(iter(sample_overdens.values()))) - - if n_after != n_before: - # Rows dropped here can no longer be matched to the mass array, - # so fail loudly rather than continue with desynchronized data. - raise ValueError( - f"Found {n_before - n_after} additional NaN values after the " - "transform; masses are out of sync with the overdensities." - ) - - # Subsample from the cleaned full sample - if args.subsample is None: - raise ValueError( - "--subsample is required. Specify how many halos to randomly select " - "from the full sample for weighting." 
- ) - - subsample_overdens, subsample_masses, subsample_indices = subsample_data( - sample_overdens, truth_masses, args.subsample, args.random_state - ) - - # Run k-NN weighting - # Goal: weight subsample to match parent (global) environment distribution - print("\n" + "=" * 70) - print("COMPUTING WEIGHTS") - print("=" * 70 + "\n") - - auto_list = [int(x) for x in args.autotune_k.split(",") if x.strip().isdigit()] - cfg = KNNReweighterConfig( - k=args.k, - autotune_k=auto_list if auto_list else None, - ess_floor=args.ess_floor, - preprocess=args.preprocess, - tau=args.tau, - clip_range=(args.clip_min, args.clip_max), - random_state=args.random_state, - ) - - reweighter = KNNReweighter(cfg) - weights, info = reweighter.fit(parent_overdens, subsample_overdens) - - # Print summary - print_summary_statistics(truth_masses, subsample_masses, weights, info) - - # Save weighted masses if requested - if args.output: - weighted_data = np.column_stack([subsample_masses, weights]) - np.save(args.output, weighted_data) - print(f"[Output] Saved (subsample_masses, weights) to: {args.output}") - - # Plot results - if not args.no_plots: - print("\n[Plot] Generating diagnostics...\n") - fig = plot_hmf_comparison( - truth_masses, - subsample_masses, - weights, - info, - parent_overdens, - subsample_overdens, - save_path=args.save, - ) - plt.show() - - print("\n[Done] HMF weighting test complete.\n") - - -if __name__ == "__main__": - main() From d09ceaa19c0ddd901a37f09eb9e5f1b7f4df1e45 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 23:04:13 +0000 Subject: [PATCH 21/23] Fixing the test suite --- tests/test_suite.py | 51 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/tests/test_suite.py b/tests/test_suite.py index d187c4d..284781b 100755 --- a/tests/test_suite.py +++ b/tests/test_suite.py @@ -120,10 +120,29 @@ def create_dense_distribution(filename, npart=10000, boxsize=10.0): return npart @staticmethod - def _write_snapshot(filename, coords, masses, boxsize): - """Write HDF5 snapshot in SWIFT format""" + def _write_snapshot(filename, coords, masses, boxsize, h=0.681, Omega_cdm=0.256011, Omega_b=0.048600): + """Write HDF5 snapshot in SWIFT format + + If masses are uniform (all equal), they will be rescaled to match the + cosmological mean density. If masses vary, they are used as-is (assumed + to already represent the correct density field). 
+ """ os.makedirs(os.path.dirname(filename), exist_ok=True) + # Calculate cosmological mean density + Omega_m = Omega_cdm + Omega_b + H0_kmsMpc = 100.0 * h + G = 4.301744232015554e+01 # (10^10 Msun)^-1 Mpc (km/s)^2 + rho_crit_0 = (3.0 * H0_kmsMpc**2) / (8.0 * np.pi * G) + mean_density = rho_crit_0 * Omega_m + + # For uniform mass distributions, rescale to match cosmological mean density + # This ensures uniform distributions give zero mean overdensity + if np.allclose(masses, masses[0]): # All masses equal + volume = boxsize**3 + total_mass = mean_density * volume + masses = np.full(len(masses), total_mass / len(masses)) + with h5py.File(filename, 'w') as f: # Header header = f.create_group('Header') @@ -207,6 +226,11 @@ def _create_param_file(self, snapshot_file, output_file, cdim=3, kernel_radii=[0 f.write(f" type: uniform\n") f.write(f" cdim: {cdim}\n") f.write(f"\n") + f.write(f"Cosmology:\n") + f.write(f" h: 0.681 # Reduced Hubble constant\n") + f.write(f" Omega_cdm: 0.256011 # Cold dark matter density parameter\n") + f.write(f" Omega_b: 0.048600 # Baryon density parameter\n") + f.write(f"\n") f.write(f"Tree:\n") f.write(f" max_leaf_count: 200\n") f.write(f"\n") @@ -243,6 +267,10 @@ def test_single_particle_center(self): boxsize = 10.0 TestDataGenerator.create_single_particle(snapshot, position, mass, boxsize) + # Read actual particle mass from snapshot (will be rescaled to match cosmology) + with h5py.File(snapshot, 'r') as f: + actual_mass = f['PartType1/Masses'][0] + # Create parameters kernel_radii = [0.5, 1.0, 2.0] param_file = self._create_param_file(snapshot, "single_particle_out.hdf5", @@ -275,8 +303,8 @@ def test_single_particle_center(self): # Particle at center should be within kernel of center grid point # for radii >= ~0.8 (half cell diagonal is ~0.866) if radius >= 0.8: - if masses[center_idx] != mass: - return False, f"Kernel {i} (r={radius}): Expected mass {mass} at center, got {masses[center_idx]}" + if not np.isclose(masses[center_idx], actual_mass): + return False, f"Kernel {i} (r={radius}): Expected mass {actual_mass} at center, got {masses[center_idx]}" return True, "Single particle test passed" @@ -563,6 +591,10 @@ def test_boundary_particles(self): # Write snapshot TestDataGenerator._write_snapshot(snapshot, positions, masses, boxsize) + # Read actual total mass from snapshot (will be rescaled to match cosmology) + with h5py.File(snapshot, 'r') as f: + expected_mass = np.sum(f['PartType1/Masses'][:]) + # Grid with points that should capture particles across boundaries param_file = self._create_param_file(snapshot, "boundary_parts_out.hdf5", cdim=3, kernel_radii=[2.0]) @@ -581,7 +613,6 @@ def test_boundary_particles(self): # All particles should be captured by at least one grid point # Check that the total mass is conserved (allowing small tolerance) total_mass_captured = np.sum(masses_out) - expected_mass = len(positions) # All particles have mass=1 if abs(total_mass_captured - expected_mass) > expected_mass * 0.1: return False, f"Mass not conserved: {total_mass_captured} vs {expected_mass}" @@ -757,6 +788,11 @@ def test_random_grid_mpi(self): 'type': 'random', 'n_grid_points': 500 }, + 'Cosmology': { + 'h': 0.681, + 'Omega_cdm': 0.256011, + 'Omega_b': 0.048600 + }, 'Tree': { 'max_leaf_count': 50 }, @@ -851,6 +887,11 @@ def _create_param_file(self, snapshot, basename, cdim=5, kernel_radii=[0.5, 1.0] 'type': 'uniform', 'cdim': cdim }, + 'Cosmology': { + 'h': 0.681, + 'Omega_cdm': 0.256011, + 'Omega_b': 0.048600 + }, 'Tree': { 'max_leaf_count': 50 }, From 
87dca02a2e50d8e25b29521bce42f5ce658fa9a6 Mon Sep 17 00:00:00 2001 From: Will Roper Date: Fri, 28 Nov 2025 23:05:38 +0000 Subject: [PATCH 22/23] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/test_cosmology.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cosmology.py b/tests/test_cosmology.py index d9d67bf..4a1e197 100644 --- a/tests/test_cosmology.py +++ b/tests/test_cosmology.py @@ -14,7 +14,6 @@ from pathlib import Path import yaml import tempfile -import os # Try to import astropy for direct comparison tests try: From 8f497043c3e0ca20692c68f5ed585ae82db4b494 Mon Sep 17 00:00:00 2001 From: wjr21 Date: Fri, 28 Nov 2025 23:12:20 +0000 Subject: [PATCH 23/23] Actually fixing the test params... --- tests/test_gridder.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/test_gridder.py b/tests/test_gridder.py index d52309e..6e13129 100644 --- a/tests/test_gridder.py +++ b/tests/test_gridder.py @@ -162,6 +162,11 @@ def test_grid_points_with_comments(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/with_comments.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -215,6 +220,11 @@ def test_malformed_grid_points(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/malformed.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -270,6 +280,11 @@ def test_missing_grid_file(self, build_gridder, test_snapshot): type: file grid_file: tests/data/grid_points_files/nonexistent.txt +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -405,6 +420,11 @@ def test_random_grid_reproducibility(self, build_gridder, test_snapshot): n_grid_points: 50 random_seed: {seed} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -468,6 +488,11 @@ def test_random_grid_different_seeds(self, build_gridder, test_snapshot): n_grid_points: 50 random_seed: {seed} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -533,6 +558,11 @@ def test_sparse_grid_chunk_preparation(self, build_gridder, test_snapshot): type: uniform cdim: 5 # 5^3 = 125 grid points +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -598,6 +628,11 @@ def test_cell_mass_summation(self, build_gridder, test_snapshot): type: uniform cdim: 3 +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -668,6 +703,11 @@ def test_uniform_grid_on_particles_unit_overdensity(self, test_snapshot): type: uniform cdim: 3 # 3^3 = 27 grid points +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200 @@ -737,6 +777,11 @@ def test_sparse_grid_on_particles_unit_overdensity(self, test_snapshot): type: file grid_file: {grid_file} +Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + Tree: max_leaf_count: 200
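Appendix (editorial sketch, not part of any patch above): the mean-density
rescaling that PATCH 21 wires into the test snapshots, written out
standalone. The constant G and the rho_crit formula are copied from
tests/test_suite.py; the box size and particle count are illustrative.

    import numpy as np

    h = 0.681
    Omega_m = 0.256011 + 0.048600  # Omega_cdm + Omega_b
    G = 4.301744232015554e+01      # (10^10 Msun)^-1 Mpc (km/s)^2
    H0 = 100.0 * h                 # km/s/Mpc

    # Critical density today, in 10^10 Msun / Mpc^3.
    rho_crit_0 = 3.0 * H0**2 / (8.0 * np.pi * G)
    mean_density = rho_crit_0 * Omega_m

    # Give every particle an equal share of the box's total cosmological
    # mass, so a uniform distribution has zero mean overdensity.
    boxsize, npart = 10.0, 10_000
    particle_mass = mean_density * boxsize**3 / npart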