diff --git a/Cargo.toml b/Cargo.toml
index b38c520..95f856a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "hdbscan"
-version = "0.12.0"
+version = "0.13.0"
 edition = "2021"
 authors = [ "Tom Whitehead ", ]
 description = "HDBSCAN clustering in pure Rust. A huge improvement on DBSCAN, capable of identifying clusters of varying densities."
diff --git a/src/cluster_result.rs b/src/cluster_result.rs
new file mode 100644
index 0000000..b67bcf8
--- /dev/null
+++ b/src/cluster_result.rs
@@ -0,0 +1,186 @@
+use crate::data_wrappers::CondensedNode;
+use num_traits::Float;
+use std::collections::{HashMap, HashSet};
+
+/// Detailed clustering result exposing diagnostics equivalent to Python's HDBSCAN.
+///
+/// Contains cluster labels, membership probabilities, the condensed tree,
+/// outlier scores (GLOSH), and supports soft clustering via
+/// [`all_points_membership_vectors`](HdbscanResult::all_points_membership_vectors).
+#[derive(Debug, Clone)]
+pub struct HdbscanResult<T> {
+    /// Cluster labels for each data point. -1 indicates noise.
+    pub labels: Vec<i32>,
+    /// Membership probability for each point in its assigned cluster. 0 for noise points.
+    pub probabilities: Vec<T>,
+    /// The condensed cluster hierarchy.
+    pub condensed_tree: Vec<CondensedNode<T>>,
+    /// GLOSH outlier scores for each point. Range \[0, 1\], higher = more outlier-like.
+    pub outlier_scores: Vec<T>,
+    /// Internal IDs of winning clusters, indexed by label.
+    cluster_map: Vec<usize>,
+    /// Pre-computed death lambdas for each cluster.
+    death_lambdas: HashMap<usize, T>,
+    /// Number of data points.
+    n_samples: usize,
+}
+
+impl<T: Float> HdbscanResult<T> {
+    pub(crate) fn new(
+        labels: Vec<i32>,
+        probabilities: Vec<T>,
+        condensed_tree: Vec<CondensedNode<T>>,
+        outlier_scores: Vec<T>,
+        cluster_map: Vec<usize>,
+        death_lambdas: HashMap<usize, T>,
+        n_samples: usize,
+    ) -> Self {
+        HdbscanResult {
+            labels,
+            probabilities,
+            condensed_tree,
+            outlier_scores,
+            cluster_map,
+            death_lambdas,
+            n_samples,
+        }
+    }
+
+    /// Computes soft cluster membership vectors for all points.
+    ///
+    /// Returns a `Vec<Vec<T>>` of shape `(n_points, n_clusters)` where each row
+    /// sums to approximately 1.0. Each entry represents the relative affinity of
+    /// the point for the corresponding cluster.
+    ///
+    /// Equivalent to Python's `hdbscan.all_points_membership_vectors()`.
+    pub fn all_points_membership_vectors(&self) -> Vec<Vec<T>> {
+        let n_clusters = self.cluster_map.len();
+        if n_clusters == 0 {
+            return vec![vec![]; self.n_samples];
+        }
+
+        // Build cluster parent map and lambda map (cluster_id -> parent, cluster_id -> lambda_birth)
+        let mut cluster_parent: HashMap<usize, usize> = HashMap::new();
+        let mut cluster_lambda: HashMap<usize, T> = HashMap::new();
+        for node in &self.condensed_tree {
+            if node.node_id >= self.n_samples {
+                cluster_parent.insert(node.node_id, node.parent_node_id);
+                cluster_lambda.insert(node.node_id, node.lambda_birth);
+            }
+        }
+
+        // Build point info: point_id -> (parent_cluster, lambda_birth)
+        let mut point_info: Vec<(usize, T)> = vec![(self.n_samples, T::zero()); self.n_samples];
+        for node in &self.condensed_tree {
+            if node.node_id < self.n_samples {
+                point_info[node.node_id] = (node.parent_node_id, node.lambda_birth);
+            }
+        }
+
+        // For each point, compute membership vector
+        let mut result = Vec::with_capacity(self.n_samples);
+        for &(p_parent, p_lambda) in point_info.iter().take(self.n_samples) {
+            let mut row = vec![T::zero(); n_clusters];
+            let mut raw_sum = T::zero();
+
+            for (cluster_idx, &cluster_id) in self.cluster_map.iter().enumerate() {
+                let merge_lambda = Self::find_merge_lambda(
+                    p_parent,
+                    p_lambda,
+                    cluster_id,
+                    &cluster_parent,
+                    &cluster_lambda,
+                    self.n_samples,
+                );
+                let death = self
+                    .death_lambdas
+                    .get(&cluster_id)
+                    .copied()
+                    .unwrap_or(T::one());
+                let raw = if death.is_infinite() {
+                    if merge_lambda.is_infinite() {
+                        T::one()
+                    } else {
+                        T::zero()
+                    }
+                } else if death > T::zero() {
+                    merge_lambda / death
+                } else {
+                    T::zero()
+                };
+                row[cluster_idx] = raw;
+                raw_sum = raw_sum + raw;
+            }
+
+            // Normalize row to sum to 1.0
+            if raw_sum > T::zero() {
+                for val in &mut row {
+                    *val = *val / raw_sum;
+                }
+            }
+
+            result.push(row);
+        }
+        result
+    }
+
+    /// Find the merge lambda between a point's parent cluster and a target cluster.
+    ///
+    /// This is the lambda at which the point and the target cluster would be in the
+    /// same cluster when traversing the condensed tree hierarchy.
+    fn find_merge_lambda(
+        point_parent: usize,
+        point_lambda: T,
+        target_cluster: usize,
+        cluster_parent: &HashMap<usize, usize>,
+        cluster_lambda: &HashMap<usize, T>,
+        root: usize,
+    ) -> T {
+        // If point is directly in the target cluster
+        if point_parent == target_cluster {
+            return point_lambda;
+        }
+
+        // Build ancestors of point_parent
+        let mut p_ancestors: HashSet<usize> = HashSet::new();
+        let mut current = point_parent;
+        loop {
+            p_ancestors.insert(current);
+            match cluster_parent.get(&current) {
+                Some(&parent) => current = parent,
+                None => break,
+            }
+        }
+        p_ancestors.insert(root);
+
+        // Check if target_cluster is already an ancestor of point_parent
+        // (meaning the point is in the target cluster's subtree)
+        if p_ancestors.contains(&target_cluster) {
+            return point_lambda;
+        }
+
+        // Walk up from target_cluster to find LCA
+        let mut prev_on_c_path = target_cluster;
+        let mut current = target_cluster;
+        loop {
+            match cluster_parent.get(&current) {
+                Some(&parent) => {
+                    prev_on_c_path = current;
+                    current = parent;
+                    if p_ancestors.contains(&current) {
+                        // Found LCA — return lambda_birth of child-of-LCA on target's path
+                        return *cluster_lambda
+                            .get(&prev_on_c_path)
+                            .unwrap_or(&T::zero());
+                    }
+                }
+                None => {
+                    // Reached root
+                    return *cluster_lambda
+                        .get(&prev_on_c_path)
+                        .unwrap_or(&T::zero());
+                }
+            }
+        }
+    }
+}
diff --git a/src/data_wrappers.rs b/src/data_wrappers.rs
index a7f529d..88ed58b 100644
--- a/src/data_wrappers.rs
+++ b/src/data_wrappers.rs
@@ -12,9 +12,22 @@ pub(crate) struct SLTNode<T> {
     pub(crate) size: usize,
 }
 
-pub(crate) struct CondensedNode<T> {
-    pub(crate) node_id: usize,
-    pub(crate) parent_node_id: usize,
-    pub(crate) lambda_birth: T,
-    pub(crate) size: usize,
+/// A node in the condensed cluster tree.
+///
+/// Maps to the Python HDBSCAN library's condensed tree DataFrame columns:
+/// - `node_id` → `child`
+/// - `parent_node_id` → `parent`
+/// - `lambda_birth` → `lambda_val`
+/// - `size` → `child_size`
+#[derive(Clone, Debug, PartialEq)]
+pub struct CondensedNode<T> {
+    /// The ID of this node. For individual data points this is the point index (< n_samples).
+    /// For cluster nodes this is >= n_samples. Equivalent to Python's `child` column.
+    pub node_id: usize,
+    /// The ID of the parent cluster (always >= n_samples). Equivalent to Python's `parent` column.
+    pub parent_node_id: usize,
+    /// The lambda value (1/distance) at which this node was born. Equivalent to Python's `lambda_val`.
+    pub lambda_birth: T,
+    /// The number of points in this node (1 for individual points). Equivalent to Python's `child_size`.
+    pub size: usize,
 }
diff --git a/src/hdbscan.rs b/src/hdbscan.rs
index 0155c0f..b23a275 100644
--- a/src/hdbscan.rs
+++ b/src/hdbscan.rs
@@ -2,6 +2,7 @@
 use crate::core_distances::parallel::CoreDistanceCalculatorPar;
 #[cfg(feature = "serial")]
 use crate::core_distances::serial::CoreDistanceCalculator;
+use crate::cluster_result::HdbscanResult;
 use crate::data_wrappers::{CondensedNode, MSTEdge, SLTNode};
 #[cfg(feature = "parallel")]
 use crate::min_spanning_tree::parallel::PrimsMinSpanningTreePar;
@@ -10,13 +11,20 @@ use crate::min_spanning_tree::serial::PrimsMinSpanningTree;
 use crate::min_spanning_tree::MinSpanningTree;
 use crate::union_find::UnionFind;
 use crate::validation::DataValidator;
+use crate::hyper_parameters::ClusterSelectionMethod;
 use crate::{distance, Center, DistanceMetric, HdbscanError, HdbscanHyperParams};
 use num_traits::Float;
-use std::collections::{HashMap, VecDeque};
+use std::collections::{HashMap, HashSet, VecDeque};
 use std::ops::Range;
 
 type CondensedTree<T> = Vec<CondensedNode<T>>;
 
+struct ClusteringIntermediates<T> {
+    condensed_tree: Vec<CondensedNode<T>>,
+    winning_clusters: Vec<usize>,
+    labels: Vec<i32>,
+}
+
 /// The HDBSCAN clustering algorithm in Rust. Generic over floating point numeric types.
 #[derive(Debug, Clone, PartialEq)]
 pub struct Hdbscan<'a, T> {
@@ -74,12 +82,27 @@ impl Hdbscan<'_, T> {
             PrimsMinSpanningTree::new(self.data, self.hp.dist_metric, &core_distances);
         let min_spanning_tree = mst_calculator.compute();
 
-        let single_linkage_tree = self.make_single_linkage_tree(&min_spanning_tree);
-        let condensed_tree = self.condense_tree(&single_linkage_tree);
-        let winning_clusters = self.extract_winning_clusters(&condensed_tree);
-        let labelled_data = self.label_data(&winning_clusters, &condensed_tree);
+        Ok(self.run_pipeline(&min_spanning_tree).labels)
+    }
+
+    /// Performs clustering and returns detailed diagnostics including probabilities,
+    /// the condensed tree, and outlier scores.
+    ///
+    /// # Returns
+    /// * An [`HdbscanResult`] containing labels, probabilities, condensed tree, and
+    /// outlier scores. Supports further analysis via
+    /// [`all_points_membership_vectors`](HdbscanResult::all_points_membership_vectors).
+    pub fn cluster_detailed(&self) -> Result<HdbscanResult<T>, HdbscanError> {
+        DataValidator::new(self.data, &self.hp).validate_input_data()?;
+
+        let core_dist_calculator = CoreDistanceCalculator::new(self.data, &self.hp);
+        let core_distances = core_dist_calculator.calc_core_distances();
+
+        let mst_calculator =
+            PrimsMinSpanningTree::new(self.data, self.hp.dist_metric, &core_distances);
+        let min_spanning_tree = mst_calculator.compute();
 
-        Ok(labelled_data)
+        Ok(self.build_detailed_result(&min_spanning_tree))
     }
 }
 
@@ -133,12 +156,27 @@ impl Hdbscan<'_, T> {
             PrimsMinSpanningTreePar::new(self.data, self.hp.dist_metric, &core_distances);
         let min_spanning_tree = mst_calculator.compute();
 
-        let single_linkage_tree = self.make_single_linkage_tree(&min_spanning_tree);
-        let condensed_tree = self.condense_tree(&single_linkage_tree);
-        let winning_clusters = self.extract_winning_clusters(&condensed_tree);
-        let labelled_data = self.label_data(&winning_clusters, &condensed_tree);
+        Ok(self.run_pipeline(&min_spanning_tree).labels)
+    }
+
+    /// Performs parallel clustering and returns detailed diagnostics including
+    /// probabilities, the condensed tree, and outlier scores.
+    ///
+    /// # Returns
+    /// * An [`HdbscanResult`] containing labels, probabilities, condensed tree, and
+    /// outlier scores. Supports further analysis via
+    /// [`all_points_membership_vectors`](HdbscanResult::all_points_membership_vectors).
+    pub fn cluster_detailed_par(&self) -> Result<HdbscanResult<T>, HdbscanError> {
+        DataValidator::new(self.data, &self.hp).validate_input_data()?;
+
+        let core_dist_calculator = CoreDistanceCalculatorPar::new(self.data, &self.hp);
+        let core_distances = core_dist_calculator.calc_core_distances();
 
-        Ok(labelled_data)
+        let mst_calculator =
+            PrimsMinSpanningTreePar::new(self.data, self.hp.dist_metric, &core_distances);
+        let min_spanning_tree = mst_calculator.compute();
+
+        Ok(self.build_detailed_result(&min_spanning_tree))
     }
 }
 
@@ -290,6 +328,170 @@ impl<'a, T: Float> Hdbscan<'a, T> {
         ))
     }
 
+    fn run_pipeline(&self, mst: &[MSTEdge<T>]) -> ClusteringIntermediates<T> {
+        let single_linkage_tree = self.make_single_linkage_tree(mst);
+        let condensed_tree = self.condense_tree(&single_linkage_tree);
+        let winning_clusters = self.extract_winning_clusters(&condensed_tree);
+        let labels = self.label_data(&winning_clusters, &condensed_tree);
+        ClusteringIntermediates {
+            condensed_tree,
+            winning_clusters,
+            labels,
+        }
+    }
+
+    fn build_detailed_result(&self, mst: &[MSTEdge<T>]) -> HdbscanResult<T> {
+        let intermediates = self.run_pipeline(mst);
+        let death_lambdas = self.compute_death_lambdas(&intermediates.condensed_tree);
+        let probabilities =
+            self.compute_probabilities(&intermediates.labels, &intermediates.condensed_tree);
+        let outlier_scores =
+            self.compute_outlier_scores(&intermediates.condensed_tree, &death_lambdas);
+        HdbscanResult::new(
+            intermediates.labels,
+            probabilities,
+            intermediates.condensed_tree,
+            outlier_scores,
+            intermediates.winning_clusters,
+            death_lambdas,
+            self.n_samples,
+        )
+    }
+
+    fn compute_probabilities(
+        &self,
+        labels: &[i32],
+        condensed_tree: &[CondensedNode<T>],
+    ) -> Vec<T> {
+        // Max lambda per cluster: max lambda_birth among all direct children
+        let mut max_lambda: HashMap<usize, T> = HashMap::new();
+        for node in condensed_tree {
+            let entry = max_lambda.entry(node.parent_node_id).or_insert(T::zero());
+            if node.lambda_birth > *entry {
+                *entry = node.lambda_birth;
+            }
+        }
+
+        // Point info: point -> (parent_cluster, point_lambda)
+        let mut point_info: Vec<(usize, T)> = vec![(0, T::zero()); self.n_samples];
+        for node in condensed_tree {
+            if node.node_id < self.n_samples {
+                point_info[node.node_id] = (node.parent_node_id, node.lambda_birth);
+            }
+        }
+
+        let mut probabilities = vec![T::zero(); self.n_samples];
+        for p in 0..self.n_samples {
+            if labels[p] == -1 {
+                continue;
+            }
+            let (parent_cluster, point_lambda) = point_info[p];
+            let cluster_max = max_lambda.get(&parent_cluster).copied().unwrap_or(T::one());
+            if cluster_max.is_infinite() {
+                // All points merged at distance 0 (duplicates) — maximally connected.
+                probabilities[p] = T::one();
+            } else if cluster_max > T::zero() {
+                let capped = if point_lambda < cluster_max {
+                    point_lambda
+                } else {
+                    cluster_max
+                };
+                probabilities[p] = capped / cluster_max;
+            }
+        }
+        probabilities
+    }
+
+    fn compute_death_lambdas(
+        &self,
+        condensed_tree: &[CondensedNode<T>],
+    ) -> HashMap<usize, T> {
+        // Step 1: for each cluster, max lambda_birth among direct children
+        let mut max_child_lambda: HashMap<usize, T> = HashMap::new();
+        for node in condensed_tree {
+            let entry = max_child_lambda
+                .entry(node.parent_node_id)
+                .or_insert(T::zero());
+            if node.lambda_birth > *entry {
+                *entry = node.lambda_birth;
+            }
+        }
+
+        // Step 2: collect all cluster IDs
+        let mut all_cluster_ids: HashSet<usize> = HashSet::new();
+        for node in condensed_tree {
+            if node.node_id >= self.n_samples {
+                all_cluster_ids.insert(node.node_id);
+            }
+            if node.parent_node_id >= self.n_samples {
+                all_cluster_ids.insert(node.parent_node_id);
+            }
+        }
+
+        // Step 3: build child cluster map (parent -> child clusters)
+        let mut cluster_children: HashMap<usize, Vec<usize>> = HashMap::new();
+        for node in condensed_tree {
+            if node.node_id >= self.n_samples {
+                cluster_children
+                    .entry(node.parent_node_id)
+                    .or_default()
+                    .push(node.node_id);
+            }
+        }
+
+        // Step 4: sort descending for bottom-up propagation
+        let mut sorted_ids: Vec<usize> = all_cluster_ids.into_iter().collect();
+        sorted_ids.sort_unstable_by(|a, b| b.cmp(a));
+
+        // Step 5: initialize from max child lambda
+        let mut death_lambdas: HashMap<usize, T> = HashMap::new();
+        for &id in &sorted_ids {
+            death_lambdas.insert(
+                id,
+                max_child_lambda.get(&id).copied().unwrap_or(T::zero()),
+            );
+        }
+
+        // Step 6: propagate bottom-up
+        for &id in &sorted_ids {
+            if let Some(children) = cluster_children.get(&id) {
+                let max_child_death = children
+                    .iter()
+                    .filter_map(|child| death_lambdas.get(child).copied())
+                    .fold(T::zero(), |a, b| if a > b { a } else { b });
+                let current = death_lambdas[&id];
+                if max_child_death > current {
+                    death_lambdas.insert(id, max_child_death);
+                }
+            }
+        }
+
+        death_lambdas
+    }
+
+    fn compute_outlier_scores(
+        &self,
+        condensed_tree: &[CondensedNode<T>],
+        death_lambdas: &HashMap<usize, T>,
+    ) -> Vec<T> {
+        let mut scores = vec![T::zero(); self.n_samples];
+        for node in condensed_tree {
+            if node.node_id < self.n_samples {
+                let death = death_lambdas
+                    .get(&node.parent_node_id)
+                    .copied()
+                    .unwrap_or(T::one());
+                if death.is_infinite() {
+                    // Points merged at distance 0 (duplicates) — not outliers.
+                    scores[node.node_id] = T::zero();
+                } else if death > T::zero() {
+                    scores[node.node_id] = (death - node.lambda_birth) / death;
+                }
+            }
+        }
+        scores
+    }
+
     fn make_single_linkage_tree(&self, min_spanning_tree: &[MSTEdge<T>]) -> Vec<SLTNode<T>> {
         let mut single_linkage_tree: Vec<SLTNode<T>> =
             Vec::with_capacity(self.n_samples - 1);
@@ -484,9 +686,26 @@ impl<'a, T: Float> Hdbscan<'a, T> {
     }
 
     fn extract_winning_clusters(&self, condensed_tree: &CondensedTree<T>) -> Vec<usize> {
+        let mut selected_cluster_ids = match self.hp.cluster_selection_method {
+            ClusterSelectionMethod::Eom => self.extract_eom_clusters(condensed_tree),
+            ClusterSelectionMethod::Leaf => self.extract_leaf_clusters(condensed_tree),
+        };
+
         let (lower, upper) = self.get_cluster_id_bounds(condensed_tree);
         let n_clusters = upper - lower;
 
+        if self.hp.epsilon != 0.0 && n_clusters > 0 && !selected_cluster_ids.is_empty() {
+            selected_cluster_ids =
+                self.check_cluster_epsilons(selected_cluster_ids, condensed_tree);
+        }
+
+        selected_cluster_ids.sort();
+        selected_cluster_ids
+    }
+
+    fn extract_eom_clusters(&self, condensed_tree: &CondensedTree<T>) -> Vec<usize> {
+        let (lower, upper) = self.get_cluster_id_bounds(condensed_tree);
+
         let mut stabilities = self.calc_all_stabilities(lower..upper, condensed_tree);
         let mut clusters: HashMap<usize, bool> =
             stabilities.keys().map(|id| (*id, false)).collect();
@@ -520,19 +739,34 @@ impl<'a, T: Float> Hdbscan<'a, T> {
             }
         }
 
-        let mut selected_cluster_ids = clusters
+        clusters
             .into_iter()
             .filter(|(_id, should_keep)| *should_keep)
             .map(|(id, _should_keep)| id)
-            .collect();
+            .collect()
+    }
 
-        if self.hp.epsilon != 0.0 && n_clusters > 0 {
-            selected_cluster_ids =
-                self.check_cluster_epsilons(selected_cluster_ids, condensed_tree);
+    fn extract_leaf_clusters(&self, condensed_tree: &CondensedTree<T>) -> Vec<usize> {
+        let mut all_cluster_ids: HashSet<usize> = HashSet::new();
+        let mut parent_cluster_ids: HashSet<usize> = HashSet::new();
+
+        for node in condensed_tree {
+            if node.node_id >= self.n_samples {
+                all_cluster_ids.insert(node.node_id);
+            }
+            if node.parent_node_id >= self.n_samples {
+                all_cluster_ids.insert(node.parent_node_id);
+            }
+            // If a cluster is the parent of another cluster, it's not a leaf
+            if node.node_id >= self.n_samples {
+                parent_cluster_ids.insert(node.parent_node_id);
+            }
         }
 
-        selected_cluster_ids.sort();
-        selected_cluster_ids
+        all_cluster_ids
+            .difference(&parent_cluster_ids)
+            .copied()
+            .collect()
     }
 
     fn get_cluster_id_bounds(&self, condensed_tree: &CondensedTree<T>) -> (usize, usize) {
diff --git a/src/hyper_parameters.rs b/src/hyper_parameters.rs
index 89ca7e3..c041fdf 100644
--- a/src/hyper_parameters.rs
+++ b/src/hyper_parameters.rs
@@ -3,6 +3,15 @@ use crate::distance::DistanceMetric;
 use num_traits::Num;
 use std::fmt::Display;
 
+/// The method used to select flat clusters from the condensed tree.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum ClusterSelectionMethod {
+    /// Excess of Mass — maximizes cluster stability. Default.
+    Eom,
+    /// Leaf — selects the finest-grained (deepest) clusters.
+    Leaf,
+}
+
 // Defaults for parameters
 const MIN_CLUSTER_SIZE_DEFAULT: usize = 5;
 const MAX_CLUSTER_SIZE_DEFAULT: usize = usize::MAX; // Set to a value that will never be triggered
@@ -10,6 +19,7 @@ const ALLOW_SINGLE_CLUSTER_DEFAULT: bool = false;
 const EPSILON_DEFAULT: f64 = 0.0;
 const DISTANCE_METRIC_DEFAULT: DistanceMetric = DistanceMetric::Euclidean;
 const NN_ALGORITHM_DEFAULT: NnAlgorithm = NnAlgorithm::Auto;
+const CLUSTER_SELECTION_METHOD_DEFAULT: ClusterSelectionMethod = ClusterSelectionMethod::Eom;
 
 // Valid minimums/left bounds of parameters
 const MIN_CLUSTER_SIZE_MINIMUM: usize = 2;
@@ -29,6 +39,7 @@ pub struct HdbscanHyperParams {
     pub(crate) epsilon: f64,
     pub(crate) dist_metric: DistanceMetric,
     pub(crate) nn_algo: NnAlgorithm,
+    pub(crate) cluster_selection_method: ClusterSelectionMethod,
 }
 
 /// Builder object to set custom hyper parameters.
@@ -41,6 +52,7 @@ pub struct HyperParamBuilder {
     epsilon: Option<f64>,
     dist_metric: Option<DistanceMetric>,
     nn_algo: Option<NnAlgorithm>,
+    cluster_selection_method: Option<ClusterSelectionMethod>,
 }
 
 impl HdbscanHyperParams {
@@ -62,6 +74,7 @@ impl HdbscanHyperParams {
             epsilon: None,
             dist_metric: None,
             nn_algo: None,
+            cluster_selection_method: None,
         }
     }
 }
@@ -189,6 +202,23 @@ impl HyperParamBuilder {
         self
     }
 
+    /// Sets the cluster selection method. EOM (Excess of Mass) maximizes overall cluster
+    /// stability, while Leaf selects the finest-grained (deepest) clusters in the condensed
+    /// tree. Defaults to EOM.
+    ///
+    /// # Parameters
+    /// * cluster_selection_method - the cluster selection method
+    ///
+    /// # Returns
+    /// * the hyper parameter configuration builder
+    pub fn cluster_selection_method(
+        mut self,
+        cluster_selection_method: ClusterSelectionMethod,
+    ) -> HyperParamBuilder {
+        self.cluster_selection_method = Some(cluster_selection_method);
+        self
+    }
+
     /// Finishes the building of the hyper parameter configuration. A call to this method is
     /// required to exist the builder pattern and complete the construction of the hyper parameters.
     ///
@@ -206,6 +236,9 @@ impl HyperParamBuilder {
             epsilon: self.epsilon.unwrap_or(EPSILON_DEFAULT),
             dist_metric: self.dist_metric.unwrap_or(DISTANCE_METRIC_DEFAULT),
             nn_algo: self.nn_algo.unwrap_or(NN_ALGORITHM_DEFAULT),
+            cluster_selection_method: self
+                .cluster_selection_method
+                .unwrap_or(CLUSTER_SELECTION_METHOD_DEFAULT),
         }
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index 151f4f1..711dbd1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -50,13 +50,16 @@
 //! * [How HDBSCAN Works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)
 
 pub use crate::centers::Center;
+pub use crate::cluster_result::HdbscanResult;
 pub use crate::core_distances::NnAlgorithm;
+pub use crate::data_wrappers::CondensedNode;
 pub use crate::distance::DistanceMetric;
 pub use crate::error::HdbscanError;
 pub use crate::hdbscan::Hdbscan;
-pub use crate::hyper_parameters::{HdbscanHyperParams, HyperParamBuilder};
+pub use crate::hyper_parameters::{ClusterSelectionMethod, HdbscanHyperParams, HyperParamBuilder};
 
 mod centers;
+mod cluster_result;
 mod core_distances;
 mod data_wrappers;
 mod distance;
diff --git a/tests/common.rs b/tests/common.rs
index 33b2088..189b43a 100644
--- a/tests/common.rs
+++ b/tests/common.rs
@@ -1,8 +1,13 @@
-use hdbscan::{Center, DistanceMetric, Hdbscan, HdbscanError, HdbscanHyperParams, NnAlgorithm};
+use hdbscan::{
+    Center, ClusterSelectionMethod, DistanceMetric, Hdbscan, HdbscanError, HdbscanHyperParams,
+    HdbscanResult, NnAlgorithm,
+};
 use num_traits::Float;
 use std::collections::HashSet;
 
 type ClusterFn = fn(&Hdbscan<f32>) -> Result<Vec<i32>, HdbscanError>;
+pub(crate) type DetailedClusterFn =
+    fn(&Hdbscan<f32>) -> Result<HdbscanResult<f32>, HdbscanError>;
 
 pub(crate) fn test_cluster(cluster_fn: ClusterFn) {
     let data = cluster_test_data();
@@ -362,3 +367,617 @@ pub(crate) fn test_precomputed_distances(cluster_fn: ClusterFn) {
     assert_eq!(unique_clusters.len(), 1, "Should have 1 distinct cluster");
     assert_eq!(result[result.len() - 1], -1, "Should have 0 noise points");
 }
+
+// ── Detailed clustering tests ───────────────────────────────────────────────
+
+pub(crate) fn test_condensed_tree_not_empty(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    assert!(!result.condensed_tree.is_empty());
+}
+
+pub(crate) fn test_condensed_tree_has_all_points(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let n = data.len();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    let point_ids: HashSet<usize> = result
+        .condensed_tree
+        .iter()
+        .filter(|node| node.node_id < n)
+        .map(|node| node.node_id)
+        .collect();
+    for i in 0..n {
+        assert!(point_ids.contains(&i), "Point {} missing from condensed tree", i);
+    }
+}
+
+pub(crate) fn test_condensed_tree_valid_parents(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let n = data.len();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    for node in &result.condensed_tree {
+        assert!(
+            node.parent_node_id >= n,
+            "Parent {} is less than n_samples {}",
+            node.parent_node_id,
+            n
+        );
+    }
+}
+
+pub(crate) fn test_probabilities_range(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    for (i, &p) in result.probabilities.iter().enumerate() {
+        assert!(p >= 0.0 && p <= 1.0, "Probability {} out of range at index {}", p, i);
+    }
+}
+
+pub(crate) fn test_probabilities_noise_zero(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    for (i, &label) in result.labels.iter().enumerate() {
+        if label == -1 {
+            assert_eq!(
+                result.probabilities[i], 0.0,
+                "Noise point {} should have probability 0",
+                i
+            );
+        }
+    }
+}
+
+pub(crate) fn test_probabilities_clustered_nonzero(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    for (i, &label) in result.labels.iter().enumerate() {
+        if label != -1 {
+            assert!(
+                result.probabilities[i] > 0.0,
+                "Clustered point {} should have probability > 0",
+                i
+            );
+        }
+    }
+}
+
+pub(crate) fn test_probabilities_length(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    assert_eq!(result.probabilities.len(), data.len());
+}
+
+pub(crate) fn test_outlier_scores_range(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    for (i, &s) in result.outlier_scores.iter().enumerate() {
+        assert!(s >= 0.0 && s <= 1.0, "Outlier score {} out of range at index {}", s, i);
+    }
+}
+
+pub(crate) fn test_outlier_scores_length(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    assert_eq!(result.outlier_scores.len(), data.len());
+}
+
+pub(crate) fn test_outlier_scores_distant_point_high(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    // Point at index 10 is (10, 10), far from both clusters
+    let distant_score = result.outlier_scores[10];
+    let avg_clustered: f32 = result
+        .outlier_scores
+        .iter()
+        .enumerate()
+        .filter(|(i, _)| result.labels[*i] != -1)
+        .map(|(_, &s)| s)
+        .sum::<f32>()
+        / result.labels.iter().filter(|&&l| l != -1).count() as f32;
+    assert!(
+        distant_score > avg_clustered,
+        "Distant point score {} should exceed average clustered score {}",
+        distant_score,
+        avg_clustered
+    );
+}
+
+pub(crate) fn test_membership_vectors_shape(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    let vectors = result.all_points_membership_vectors();
+    let n_clusters = result
+        .labels
+        .iter()
+        .filter(|&&l| l != -1)
+        .collect::<HashSet<_>>()
+        .len();
+    assert_eq!(vectors.len(), data.len());
+    for row in &vectors {
+        assert_eq!(row.len(), n_clusters);
+    }
+}
+
+pub(crate) fn test_membership_vectors_sum_to_one(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    let vectors = result.all_points_membership_vectors();
+    for (i, row) in vectors.iter().enumerate() {
+        let sum: f32 = row.iter().sum();
+        assert!(
+            (sum - 1.0).abs() < 1e-5,
+            "Row {} sums to {} instead of 1.0",
+            i,
+            sum
+        );
+    }
+}
+
+pub(crate) fn test_membership_vectors_range(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    let vectors = result.all_points_membership_vectors();
+    for (i, row) in vectors.iter().enumerate() {
+        for (j, &v) in row.iter().enumerate() {
+            assert!(
+                v >= 0.0 && v <= 1.0,
+                "Membership [{},{}] = {} out of range",
+                i,
+                j,
+                v
+            );
+        }
+    }
+}
+
+pub(crate) fn test_membership_dominant_matches_label(cluster_fn: DetailedClusterFn) {
+    let data = cluster_test_data();
+    let clusterer = Hdbscan::default_hyper_params(&data);
+    let result = cluster_fn(&clusterer).unwrap();
+    let vectors = result.all_points_membership_vectors();
+    for (i, &label) in result.labels.iter().enumerate() {
+        if label == -1 {
+            continue;
+        }
+        let row = &vectors[i];
+        let argmax = row
+            .iter()
+            .enumerate()
+            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+            .map(|(idx, _)| idx)
+            .unwrap();
+        assert_eq!(
+            argmax, label as usize,
+            "Point {} argmax {} != label {}",
+            i, argmax, label
+        );
+    }
+}
+
+// ── Leaf cluster selection tests ────────────────────────────────────────────
+
+pub(crate) fn test_leaf_basic_clusters(cluster_fn: ClusterFn) {
+    let data = cluster_test_data();
+    let hp = HdbscanHyperParams::builder()
+        .cluster_selection_method(ClusterSelectionMethod::Leaf)
+        .build();
+    let clusterer = Hdbscan::new(&data, hp);
+    let result = cluster_fn(&clusterer).unwrap();
+ // First five points form one cluster + assert_eq!(1, result[..5].iter().collect::>().len()); + // Next five points are a second cluster + assert_eq!(1, result[5..10].iter().collect::>().len()); + // The final point is noise + assert_eq!(-1, result[10]); +} + +pub(crate) fn test_leaf_matches_eom_simple(cluster_fn: ClusterFn) { + let data = cluster_test_data(); + + let hp_eom = HdbscanHyperParams::builder() + .cluster_selection_method(ClusterSelectionMethod::Eom) + .build(); + let hp_leaf = HdbscanHyperParams::builder() + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + + let eom_result = cluster_fn(&Hdbscan::new(&data, hp_eom)).unwrap(); + let leaf_result = cluster_fn(&Hdbscan::new(&data, hp_leaf)).unwrap(); + + let eom_clusters: HashSet<_> = eom_result.iter().filter(|&&l| l != -1).collect(); + let leaf_clusters: HashSet<_> = leaf_result.iter().filter(|&&l| l != -1).collect(); + assert_eq!(eom_clusters.len(), leaf_clusters.len()); + + let eom_noise = eom_result.iter().filter(|&&l| l == -1).count(); + let leaf_noise = leaf_result.iter().filter(|&&l| l == -1).count(); + assert_eq!(eom_noise, leaf_noise); +} + +pub(crate) fn test_leaf_with_epsilon(cluster_fn: ClusterFn) { + let data = vec![ + // Sub-cluster A1 + vec![0.0, 0.0], + vec![0.1, 0.1], + vec![0.2, 0.0], + // Sub-cluster A2 + vec![1.0, 0.0], + vec![1.1, 0.1], + vec![1.2, 0.0], + // Separate cluster B + vec![10.0, 10.0], + vec![10.1, 10.1], + vec![10.2, 10.0], + ]; + + let hp_leaf = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let leaf_result = cluster_fn(&Hdbscan::new(&data, hp_leaf)).unwrap(); + let leaf_n_clusters = leaf_result + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + // With a large epsilon, sub-clusters should merge + let hp_leaf_eps = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .epsilon(2.0) + .build(); + let 
eps_result = cluster_fn(&Hdbscan::new(&data, hp_leaf_eps)).unwrap(); + let eps_n_clusters = eps_result + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + assert!( + eps_n_clusters <= leaf_n_clusters, + "Epsilon merging should not increase cluster count: {} > {}", + eps_n_clusters, + leaf_n_clusters + ); +} + +pub(crate) fn test_leaf_allow_single_cluster(cluster_fn: ClusterFn) { + let data = vec![ + vec![1.1, 1.1], + vec![1.2, 1.1], + vec![1.3, 1.2], + vec![1.1, 1.3], + vec![1.2, 1.2], + vec![3.0, 3.0], + ]; + + let hp = HdbscanHyperParams::builder() + .allow_single_cluster(true) + .min_cluster_size(4) + .min_samples(4) + .nn_algorithm(NnAlgorithm::BruteForce) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&data, hp); + let result = cluster_fn(&clusterer).unwrap(); + + // At least one cluster exists (not all noise) + assert!( + result.iter().any(|&l| l != -1), + "Should have at least one cluster" + ); + // Outlier at index 5 is noise + assert_eq!(-1, result[5]); +} + +pub(crate) fn test_leaf_allow_single_cluster_epsilon(cluster_fn: ClusterFn) { + let data = vec![ + vec![1.1, 1.1], + vec![1.2, 1.1], + vec![1.3, 1.2], + vec![2.1, 1.3], + vec![2.2, 1.2], + vec![2.0, 1.2], + vec![3.0, 3.0], + ]; + + // Baseline: Leaf without epsilon + let hp_baseline = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let baseline_result = cluster_fn(&Hdbscan::new(&data, hp_baseline)).unwrap(); + let baseline_n_clusters = baseline_result + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + // With allow_single_cluster + epsilon + let hp = HdbscanHyperParams::builder() + .allow_single_cluster(true) + .min_cluster_size(3) + .epsilon(1.2) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&data, hp); + let result = cluster_fn(&clusterer).unwrap(); + + let n_clusters = result + .iter() + 
.filter(|&&l| l != -1) + .collect::>() + .len(); + assert_eq!(1, n_clusters, "With epsilon, should have exactly 1 cluster"); + assert!( + n_clusters <= baseline_n_clusters, + "Cluster count with epsilon {} should not exceed baseline {}", + n_clusters, + baseline_n_clusters + ); + let n_noise = result.iter().filter(|&&l| l == -1).count(); + assert_eq!(1, n_noise, "Should have exactly 1 noise point"); +} + +pub(crate) fn test_leaf_detailed_probabilities_and_outlier_scores( + cluster_fn: DetailedClusterFn, +) { + let data = cluster_test_data(); + let hp = HdbscanHyperParams::builder() + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&data, hp); + let result = cluster_fn(&clusterer).unwrap(); + + // Lengths match data + assert_eq!(result.labels.len(), data.len()); + assert_eq!(result.probabilities.len(), data.len()); + assert_eq!(result.outlier_scores.len(), data.len()); + + // Probabilities in [0,1]; noise=0, clustered>0 + for (i, &p) in result.probabilities.iter().enumerate() { + assert!(p >= 0.0 && p <= 1.0, "Probability {} out of range at {}", p, i); + if result.labels[i] == -1 { + assert_eq!(p, 0.0, "Noise point {} should have probability 0", i); + } else { + assert!(p > 0.0, "Clustered point {} should have probability > 0", i); + } + } + + // Outlier scores in [0,1] + for (i, &s) in result.outlier_scores.iter().enumerate() { + assert!(s >= 0.0 && s <= 1.0, "Outlier score {} out of range at {}", s, i); + } + + // Distant point (index 10) has higher outlier score than average clustered point + let distant_score = result.outlier_scores[10]; + let avg_clustered: f32 = result + .outlier_scores + .iter() + .enumerate() + .filter(|(i, _)| result.labels[*i] != -1) + .map(|(_, &s)| s) + .sum::() + / result.labels.iter().filter(|&&l| l != -1).count() as f32; + assert!( + distant_score > avg_clustered, + "Distant point score {} should exceed average clustered score {}", + distant_score, + avg_clustered + ); + + // 
Condensed tree not empty + assert!(!result.condensed_tree.is_empty()); +} + +pub(crate) fn test_leaf_detailed_membership_vectors(cluster_fn: DetailedClusterFn) { + let data = cluster_test_data(); + let hp = HdbscanHyperParams::builder() + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&data, hp); + let result = cluster_fn(&clusterer).unwrap(); + let vectors = result.all_points_membership_vectors(); + + let n_clusters = result + .labels + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + // Shape is n_points x n_clusters + assert_eq!(vectors.len(), data.len()); + for row in &vectors { + assert_eq!(row.len(), n_clusters); + } + + // Rows sum to ~1.0; all values in [0,1] + for (i, row) in vectors.iter().enumerate() { + let sum: f32 = row.iter().sum(); + assert!( + (sum - 1.0).abs() < 1e-5, + "Row {} sums to {} instead of 1.0", + i, + sum + ); + for (j, &v) in row.iter().enumerate() { + assert!( + v >= 0.0 && v <= 1.0, + "Membership [{},{}] = {} out of range", + i, + j, + v + ); + } + } + + // Dominant membership matches label for clustered points + for (i, &label) in result.labels.iter().enumerate() { + if label == -1 { + continue; + } + let row = &vectors[i]; + let argmax = row + .iter() + .enumerate() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .map(|(idx, _)| idx) + .unwrap(); + assert_eq!( + argmax, label as usize, + "Point {} argmax {} != label {}", + i, argmax, label + ); + } +} + +pub(crate) fn test_leaf_manhattan_distance(cluster_fn: ClusterFn) { + let data = vec![ + vec![1.3, 1.1], + vec![1.3, 1.2], + vec![1.2, 1.2], + vec![1.0, 1.1], + vec![0.9, 1.0], + vec![0.9, 1.0], + vec![3.7, 4.0], + ]; + let hp = HdbscanHyperParams::builder() + .min_cluster_size(3) + .min_samples(2) + .dist_metric(DistanceMetric::Manhattan) + .nn_algorithm(NnAlgorithm::BruteForce) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&data, hp); + let result = 
cluster_fn(&clusterer).unwrap(); + + // First three points form one cluster + assert_eq!(1, result[..3].iter().collect::>().len()); + // Next three points are a second cluster + assert_eq!(1, result[3..6].iter().collect::>().len()); + // The final point is noise + assert_eq!(-1, result[6]); +} + +pub(crate) fn test_leaf_precomputed_distances(cluster_fn: ClusterFn) { + let dist_matrix = vec![ + vec![0.0, 0.1, 0.2, 0.3, 9.0], + vec![0.1, 0.0, 0.1, 0.2, 9.0], + vec![0.2, 0.1, 0.0, 0.1, 9.0], + vec![0.3, 0.2, 0.1, 0.0, 9.0], + vec![9.0, 9.0, 9.0, 9.0, 9.0], + ]; + let hp = HdbscanHyperParams::builder() + .dist_metric(DistanceMetric::Precalculated) + .allow_single_cluster(true) + .min_cluster_size(2) + .min_samples(1) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let clusterer = Hdbscan::new(&dist_matrix, hp); + let result = cluster_fn(&clusterer).unwrap(); + + // At least 1 cluster + let unique_clusters: HashSet<_> = result.iter().filter(|&&x| x != -1).collect(); + assert!( + !unique_clusters.is_empty(), + "Should have at least 1 cluster" + ); + // Last point is noise + assert_eq!(result[result.len() - 1], -1, "Last point should be noise"); +} + +pub(crate) fn test_default_is_eom(cluster_fn: ClusterFn) { + let data = vec![ + // Tight sub-cluster 1 + vec![0.0, 0.0], + vec![0.05, 0.05], + vec![0.1, 0.0], + // Tight sub-cluster 2 + vec![2.0, 0.0], + vec![2.05, 0.05], + vec![2.1, 0.0], + // Separate cluster + vec![20.0, 20.0], + vec![20.05, 20.05], + vec![20.1, 20.0], + ]; + + // Default (no cluster_selection_method specified) + let hp_default = HdbscanHyperParams::builder() + .min_cluster_size(3) + .build(); + let default_result = cluster_fn(&Hdbscan::new(&data, hp_default)).unwrap(); + + // Explicit EOM + let hp_eom = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Eom) + .build(); + let eom_result = cluster_fn(&Hdbscan::new(&data, hp_eom)).unwrap(); + + assert_eq!( + default_result, 
eom_result, + "Default cluster selection should produce identical results to explicit EOM" + ); +} + +pub(crate) fn test_leaf_finer_grained(cluster_fn: ClusterFn) { + let data = vec![ + // Tight sub-cluster 1 + vec![0.0, 0.0], + vec![0.05, 0.05], + vec![0.1, 0.0], + // Tight sub-cluster 2 + vec![2.0, 0.0], + vec![2.05, 0.05], + vec![2.1, 0.0], + // Separate cluster + vec![20.0, 20.0], + vec![20.05, 20.05], + vec![20.1, 20.0], + ]; + + let hp_eom = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Eom) + .build(); + let eom_result = cluster_fn(&Hdbscan::new(&data, hp_eom)).unwrap(); + let eom_n_clusters = eom_result + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + let hp_leaf = HdbscanHyperParams::builder() + .min_cluster_size(3) + .cluster_selection_method(ClusterSelectionMethod::Leaf) + .build(); + let leaf_result = cluster_fn(&Hdbscan::new(&data, hp_leaf)).unwrap(); + let leaf_n_clusters = leaf_result + .iter() + .filter(|&&l| l != -1) + .collect::>() + .len(); + + assert!( + leaf_n_clusters >= eom_n_clusters, + "Leaf should find at least as many clusters as EOM: {} < {}", + leaf_n_clusters, + eom_n_clusters + ); +} diff --git a/tests/parallel.rs b/tests/parallel.rs index 6522ad4..9cfd0ae 100644 --- a/tests/parallel.rs +++ b/tests/parallel.rs @@ -1,5 +1,5 @@ #![cfg(feature = "parallel")] -use hdbscan::{Hdbscan, HdbscanError}; +use hdbscan::{Hdbscan, HdbscanError, HdbscanResult}; mod common; @@ -16,6 +16,19 @@ macro_rules! define_parallel_test { }; } +macro_rules! 
define_parallel_detailed_test { + ($test_fn:ident) => { + #[test] + fn $test_fn() { + fn cluster_fn(hdb: &Hdbscan) -> Result, HdbscanError> { + hdb.cluster_detailed_par() + } + + common::$test_fn(cluster_fn); + } + }; +} + define_parallel_test!(test_cluster); define_parallel_test!(test_builder_cluster); define_parallel_test!(test_single_cluster); @@ -30,3 +43,38 @@ define_parallel_test!(test_nyc_landmarks_haversine); define_parallel_test!(test_geo_cluster_across_180th_meridian); define_parallel_test!(test_cylindrical_hsv_colours); define_parallel_test!(test_precomputed_distances); +define_parallel_test!(test_leaf_basic_clusters); +define_parallel_test!(test_leaf_matches_eom_simple); +define_parallel_test!(test_leaf_with_epsilon); +define_parallel_test!(test_leaf_finer_grained); +define_parallel_test!(test_leaf_allow_single_cluster); +define_parallel_test!(test_leaf_allow_single_cluster_epsilon); +define_parallel_test!(test_leaf_manhattan_distance); +define_parallel_test!(test_leaf_precomputed_distances); +define_parallel_test!(test_default_is_eom); + +#[test] +fn test_detailed_labels_match_cluster() { + let data = common::cluster_test_data(); + let clusterer = Hdbscan::default_hyper_params(&data); + let labels = clusterer.cluster_par().unwrap(); + let result = clusterer.cluster_detailed_par().unwrap(); + assert_eq!(labels, result.labels); +} + +define_parallel_detailed_test!(test_condensed_tree_not_empty); +define_parallel_detailed_test!(test_condensed_tree_has_all_points); +define_parallel_detailed_test!(test_condensed_tree_valid_parents); +define_parallel_detailed_test!(test_probabilities_range); +define_parallel_detailed_test!(test_probabilities_noise_zero); +define_parallel_detailed_test!(test_probabilities_clustered_nonzero); +define_parallel_detailed_test!(test_probabilities_length); +define_parallel_detailed_test!(test_outlier_scores_range); +define_parallel_detailed_test!(test_outlier_scores_length); 
+define_parallel_detailed_test!(test_outlier_scores_distant_point_high); +define_parallel_detailed_test!(test_membership_vectors_shape); +define_parallel_detailed_test!(test_membership_vectors_sum_to_one); +define_parallel_detailed_test!(test_membership_vectors_range); +define_parallel_detailed_test!(test_membership_dominant_matches_label); +define_parallel_detailed_test!(test_leaf_detailed_probabilities_and_outlier_scores); +define_parallel_detailed_test!(test_leaf_detailed_membership_vectors); diff --git a/tests/serial.rs b/tests/serial.rs index d1bb49f..5a78261 100644 --- a/tests/serial.rs +++ b/tests/serial.rs @@ -1,5 +1,5 @@ #![cfg(feature = "serial")] -use hdbscan::{Hdbscan, HdbscanError}; +use hdbscan::{Hdbscan, HdbscanError, HdbscanResult}; mod common; @@ -16,6 +16,19 @@ macro_rules! define_serial_test { }; } +macro_rules! define_serial_detailed_test { + ($test_fn:ident) => { + #[test] + fn $test_fn() { + fn cluster_fn(hdb: &Hdbscan) -> Result, HdbscanError> { + hdb.cluster_detailed() + } + + common::$test_fn(cluster_fn); + } + }; +} + define_serial_test!(test_cluster); define_serial_test!(test_builder_cluster); define_serial_test!(test_single_cluster); @@ -30,3 +43,78 @@ define_serial_test!(test_nyc_landmarks_haversine); define_serial_test!(test_geo_cluster_across_180th_meridian); define_serial_test!(test_cylindrical_hsv_colours); define_serial_test!(test_precomputed_distances); +define_serial_test!(test_leaf_basic_clusters); +define_serial_test!(test_leaf_matches_eom_simple); +define_serial_test!(test_leaf_with_epsilon); +define_serial_test!(test_leaf_finer_grained); +define_serial_test!(test_leaf_allow_single_cluster); +define_serial_test!(test_leaf_allow_single_cluster_epsilon); +define_serial_test!(test_leaf_manhattan_distance); +define_serial_test!(test_leaf_precomputed_distances); +define_serial_test!(test_default_is_eom); + +#[test] +fn test_detailed_labels_match_cluster() { + let data = common::cluster_test_data(); + let clusterer = 
Hdbscan::default_hyper_params(&data); + let labels = clusterer.cluster().unwrap(); + let result = clusterer.cluster_detailed().unwrap(); + assert_eq!(labels, result.labels); +} + +define_serial_detailed_test!(test_condensed_tree_not_empty); +define_serial_detailed_test!(test_condensed_tree_has_all_points); +define_serial_detailed_test!(test_condensed_tree_valid_parents); +define_serial_detailed_test!(test_probabilities_range); +define_serial_detailed_test!(test_probabilities_noise_zero); +define_serial_detailed_test!(test_probabilities_clustered_nonzero); +define_serial_detailed_test!(test_probabilities_length); +define_serial_detailed_test!(test_outlier_scores_range); +define_serial_detailed_test!(test_outlier_scores_length); +define_serial_detailed_test!(test_outlier_scores_distant_point_high); +define_serial_detailed_test!(test_membership_vectors_shape); +define_serial_detailed_test!(test_membership_vectors_sum_to_one); +define_serial_detailed_test!(test_membership_vectors_range); +define_serial_detailed_test!(test_membership_dominant_matches_label); +define_serial_detailed_test!(test_leaf_detailed_probabilities_and_outlier_scores); +define_serial_detailed_test!(test_leaf_detailed_membership_vectors); + +#[test] +fn test_duplicate_points_no_nan() { + // Duplicate/near-duplicate points cause distance=0, lambda=infinity. + // Probabilities and outlier scores must remain finite (no NaN from inf/inf). 
+ let data: Vec> = vec![ + // Cluster A: many exact duplicates + vec![0.0, 0.0], + vec![0.0, 0.0], + vec![0.0, 0.0], + vec![0.0, 0.0], + vec![0.0, 0.0], + vec![0.1, 0.1], + // Cluster B: many exact duplicates + vec![10.0, 10.0], + vec![10.0, 10.0], + vec![10.0, 10.0], + vec![10.0, 10.0], + vec![10.0, 10.0], + vec![10.1, 10.1], + ]; + let clusterer = Hdbscan::default_hyper_params(&data); + let result = clusterer.cluster_detailed().unwrap(); + + for (i, &p) in result.probabilities.iter().enumerate() { + assert!(!p.is_nan(), "Probability is NaN at index {}", i); + assert!(p >= 0.0 && p <= 1.0, "Probability {} out of range at index {}", p, i); + } + for (i, &s) in result.outlier_scores.iter().enumerate() { + assert!(!s.is_nan(), "Outlier score is NaN at index {}", i); + assert!(s >= 0.0 && s <= 1.0, "Outlier score {} out of range at index {}", s, i); + } + + let memberships = result.all_points_membership_vectors(); + for (i, row) in memberships.iter().enumerate() { + for (j, &v) in row.iter().enumerate() { + assert!(!v.is_nan(), "Membership NaN at point {} cluster {}", i, j); + } + } +}