Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
run: sudo apt-get update

- name: Setup | Rust toolchain
uses: dtolnay/rust-toolchain@1.79.0
uses: dtolnay/rust-toolchain@1.80.0
with:
components: clippy, rustfmt

Expand Down
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Version 0.12.0 2025-11-24
## Changes
- Improved parallel clustering via the `cluster_par` method: the calculation of Prim's minimum spanning tree is
now parallelised.

# Version 0.11.0 2025-08-03
## Changes
- Addition of optional `parallel` feature that adds a method `cluster_par` to the `Hdbscan` struct. This method
Expand Down Expand Up @@ -78,4 +83,4 @@
# Version 0.3.0 2024-02-20
## Changes
- Added `max_cluster_size` hyper parameter, with support in the hyper parameter builder
- Improved read me documentation on current state of the algorithm
- Improved read me documentation on current state of the algorithm
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "hdbscan"
version = "0.11.0"
version = "0.12.0"
edition = "2021"
authors = [ "Tom Whitehead <t.j.whitehead21@gmail.com>", ]
description = "HDBSCAN clustering in pure Rust. A huge improvement on DBSCAN, capable of identifying clusters of varying densities."
Expand Down
94 changes: 27 additions & 67 deletions src/hdbscan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ use crate::core_distances::parallel::CoreDistanceCalculatorPar;
#[cfg(feature = "serial")]
use crate::core_distances::serial::CoreDistanceCalculator;
use crate::data_wrappers::{CondensedNode, MSTEdge, SLTNode};
#[cfg(feature = "parallel")]
use crate::min_spanning_tree::parallel::PrimsMinSpanningTreePar;
#[cfg(feature = "serial")]
use crate::min_spanning_tree::serial::PrimsMinSpanningTree;
use crate::min_spanning_tree::MinSpanningTree;
use crate::union_find::UnionFind;
use crate::validation::DataValidator;
use crate::{distance, Center, DistanceMetric, HdbscanError, HdbscanHyperParams};
Expand All @@ -26,7 +31,7 @@ impl<T: Float> Hdbscan<'_, T> {
///
/// # Returns
/// * A result that, if successful, contains a list of cluster labels, with a length equal to
/// the numbe of samples passed to the constructor. Positive integers mean a data point
/// the number of samples passed to the constructor. Positive integers mean a data point
/// belongs to a cluster of that label. -1 labels mean that a data point is noise and does
/// not belong to any cluster. An Error will be returned if the dimensionality of the input
/// vectors are mismatched, if any vector contains non-finite coordinates, or if the passed
Expand Down Expand Up @@ -60,15 +65,20 @@ impl<T: Float> Hdbscan<'_, T> {
///assert_eq!(-1, labels[10]);
/// ```
pub fn cluster(&self) -> Result<Vec<i32>, HdbscanError> {
let validator = DataValidator::new(self.data, &self.hp);
validator.validate_input_data()?;
let calculator = CoreDistanceCalculator::new(self.data, &self.hp);
let core_distances = calculator.calc_core_distances();
let min_spanning_tree = self.prims_min_spanning_tree(&core_distances);
DataValidator::new(self.data, &self.hp).validate_input_data()?;

let core_dist_calculator = CoreDistanceCalculator::new(self.data, &self.hp);
let core_distances = core_dist_calculator.calc_core_distances();

let mst_calculator =
PrimsMinSpanningTree::new(self.data, self.hp.dist_metric, &core_distances);
let min_spanning_tree = mst_calculator.compute();

let single_linkage_tree = self.make_single_linkage_tree(&min_spanning_tree);
let condensed_tree = self.condense_tree(&single_linkage_tree);
let winning_clusters = self.extract_winning_clusters(&condensed_tree);
let labelled_data = self.label_data(&winning_clusters, &condensed_tree);

Ok(labelled_data)
}
}
Expand All @@ -80,7 +90,7 @@ impl<T: Float + Send + Sync> Hdbscan<'_, T> {
///
/// # Returns
/// * A result that, if successful, contains a list of cluster labels, with a length equal to
/// the numbe of samples passed to the constructor. Positive integers mean a data point
/// the number of samples passed to the constructor. Positive integers mean a data point
/// belongs to a cluster of that label. -1 labels mean that a data point is noise and does
/// not belong to any cluster. An Error will be returned if the dimensionality of the input
/// vectors are mismatched, if any vector contains non-finite coordinates, or if the passed
Expand Down Expand Up @@ -114,15 +124,20 @@ impl<T: Float + Send + Sync> Hdbscan<'_, T> {
///assert_eq!(-1, labels[10]);
/// ```
pub fn cluster_par(&self) -> Result<Vec<i32>, HdbscanError> {
let validator = DataValidator::new(self.data, &self.hp);
validator.validate_input_data()?;
let calculator = CoreDistanceCalculatorPar::new(self.data, &self.hp);
let core_distances = calculator.calc_core_distances();
let min_spanning_tree = self.prims_min_spanning_tree(&core_distances);
DataValidator::new(self.data, &self.hp).validate_input_data()?;

let core_dist_calculator = CoreDistanceCalculatorPar::new(self.data, &self.hp);
let core_distances = core_dist_calculator.calc_core_distances();

let mst_calculator =
PrimsMinSpanningTreePar::new(self.data, self.hp.dist_metric, &core_distances);
let min_spanning_tree = mst_calculator.compute();

let single_linkage_tree = self.make_single_linkage_tree(&min_spanning_tree);
let condensed_tree = self.condense_tree(&single_linkage_tree);
let winning_clusters = self.extract_winning_clusters(&condensed_tree);
let labelled_data = self.label_data(&winning_clusters, &condensed_tree);

Ok(labelled_data)
}
}
Expand Down Expand Up @@ -275,61 +290,6 @@ impl<'a, T: Float> Hdbscan<'a, T> {
))
}

fn prims_min_spanning_tree(&self, core_distances: &[T]) -> Vec<MSTEdge<T>> {
let mut in_tree = vec![false; self.n_samples];
let mut distances = vec![T::infinity(); self.n_samples];
distances[0] = T::zero();

let mut mst = Vec::with_capacity(self.n_samples);

let mut left_node_id = 0;
let mut right_node_id = 0;

for _ in 1..self.n_samples {
in_tree[left_node_id] = true;
let mut current_min_dist = T::infinity();

for i in 0..self.n_samples {
if in_tree[i] {
continue;
}
let mrd = self.calc_mutual_reachability_dist(left_node_id, i, core_distances);
if mrd < distances[i] {
distances[i] = mrd;
}
if distances[i] < current_min_dist {
right_node_id = i;
current_min_dist = distances[i];
}
}
mst.push(MSTEdge {
left_node_id,
right_node_id,
distance: current_min_dist,
});
left_node_id = right_node_id;
}
self.sort_mst_by_dist(&mut mst);
mst
}

fn calc_mutual_reachability_dist(&self, a: usize, b: usize, core_distances: &[T]) -> T {
let core_dist_a = core_distances[a];
let core_dist_b = core_distances[b];
let dist_a_b = if self.hp.dist_metric == DistanceMetric::Precalculated {
self.data[a][b]
} else {
self.hp.dist_metric.calc_dist(&self.data[a], &self.data[b])
};

core_dist_a.max(core_dist_b).max(dist_a_b)
}

fn sort_mst_by_dist(&self, min_spanning_tree: &mut [MSTEdge<T>]) {
min_spanning_tree
.sort_by(|a, b| a.distance.partial_cmp(&b.distance).expect("Invalid floats"));
}

fn make_single_linkage_tree(&self, min_spanning_tree: &[MSTEdge<T>]) -> Vec<SLTNode<T>> {
let mut single_linkage_tree: Vec<SLTNode<T>> = Vec::with_capacity(self.n_samples - 1);

Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,6 @@ mod distance;
mod error;
mod hdbscan;
mod hyper_parameters;
mod min_spanning_tree;
mod union_find;
mod validation;
178 changes: 178 additions & 0 deletions src/min_spanning_tree.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
use crate::data_wrappers::MSTEdge;
use crate::DistanceMetric;
use num_traits::Float;

/// Common interface for the minimum-spanning-tree builders defined in this file.
///
/// `T` is the element type of the edge distances carried by each returned `MSTEdge`.
pub(crate) trait MinSpanningTree<'a, T> {
/// Builds the spanning tree and returns its edges.
///
/// Both implementations in this file sort the edges by ascending distance before returning.
fn compute(&self) -> Vec<MSTEdge<T>>;
}

/// Borrowed inputs and helpers shared by the serial and parallel spanning-tree builders.
#[derive(Clone, Debug)]
struct MinSpanningTreeCommon<'a, T> {
/// One vector per sample. When `dist_metric` is `DistanceMetric::Precalculated`,
/// `data[a][b]` is read directly as the distance between samples `a` and `b`.
data: &'a [Vec<T>],
/// Metric used to measure the distance between two samples.
dist_metric: DistanceMetric,
/// Core distance of each sample, indexed by sample position.
core_distances: &'a [T],
/// Number of samples; set to `data.len()` by the constructor.
n_samples: usize,
}

impl<'a, T: Float> MinSpanningTreeCommon<'a, T> {
    /// Bundles the borrowed inputs and records the sample count (`data.len()`).
    fn new(data: &'a [Vec<T>], dist_metric: DistanceMetric, core_distances: &'a [T]) -> Self {
        let n_samples = data.len();
        Self {
            data,
            dist_metric,
            core_distances,
            n_samples,
        }
    }

    /// Mutual reachability distance between samples `a` and `b`: the largest of the two core
    /// distances and the distance between the samples themselves.
    ///
    /// With the `Precalculated` metric the sample distance is looked up as `data[a][b]`;
    /// otherwise it is computed by the metric from the two sample vectors.
    fn calc_mutual_reachability_dist(&self, a: usize, b: usize) -> T {
        let core_a = self.core_distances[a];
        let core_b = self.core_distances[b];

        let is_precalculated = self.dist_metric == DistanceMetric::Precalculated;
        let sample_dist = if is_precalculated {
            self.data[a][b]
        } else {
            let (point_a, point_b) = (&self.data[a], &self.data[b]);
            self.dist_metric.calc_dist(point_a, point_b)
        };

        let widest_core = core_a.max(core_b);
        widest_core.max(sample_dist)
    }

    /// Sorts the edges in place by ascending distance.
    ///
    /// # Panics
    /// Panics if two edge distances cannot be ordered (`partial_cmp` returns `None`).
    fn sort_mst_by_dist(&self, min_spanning_tree: &mut [MSTEdge<T>]) {
        let by_distance = |lhs: &MSTEdge<T>, rhs: &MSTEdge<T>| {
            lhs.distance
                .partial_cmp(&rhs.distance)
                .expect("Invalid floats")
        };
        min_spanning_tree.sort_by(by_distance);
    }
}

#[cfg(feature = "serial")]
pub(crate) mod serial {
    use super::*;
    use crate::data_wrappers::MSTEdge;
    use num_traits::Float;

    /// Single-threaded Prim's algorithm over mutual reachability distances.
    #[derive(Clone, Debug)]
    pub(crate) struct PrimsMinSpanningTree<'a, T> {
        common: MinSpanningTreeCommon<'a, T>,
    }

    impl<'a, T: Float> PrimsMinSpanningTree<'a, T> {
        /// Creates a builder over the given samples, distance metric and per-sample core
        /// distances. Nothing is computed until `compute` is called.
        pub(crate) fn new(
            data: &'a [Vec<T>],
            dist_metric: DistanceMetric,
            core_distances: &'a [T],
        ) -> Self {
            let common = MinSpanningTreeCommon::new(data, dist_metric, core_distances);
            PrimsMinSpanningTree { common }
        }
    }

    impl<'a, T: Float> MinSpanningTree<'a, T> for PrimsMinSpanningTree<'a, T> {
        /// Grows the tree from sample 0, attaching on every step the outside sample with the
        /// smallest known mutual reachability distance, then returns the edges sorted by
        /// ascending distance.
        ///
        /// Returns an empty vector when there are fewer than two samples.
        ///
        /// # Panics
        /// Panics while sorting if two edge distances cannot be ordered.
        fn compute(&self) -> Vec<MSTEdge<T>> {
            let n_samples = self.common.n_samples;
            // With zero or one sample there are no edges to build; returning early also
            // avoids indexing into empty work buffers.
            if n_samples < 2 {
                return Vec::new();
            }

            let mut in_tree = vec![false; n_samples];
            let mut distances = vec![T::infinity(); n_samples];
            // A spanning tree over n nodes has exactly n - 1 edges.
            let mut mst = Vec::with_capacity(n_samples - 1);
            let mut left_node_id = 0;

            for _ in 1..n_samples {
                in_tree[left_node_id] = true;

                // Closest sample still outside the tree. Starting from `None` (rather than
                // an infinite sentinel distance) guarantees a genuine outside sample is
                // selected even when every candidate distance is infinite, so an edge can
                // never point back at the node it starts from.
                let mut closest: Option<(usize, T)> = None;

                for (i, dist) in distances.iter_mut().enumerate() {
                    if in_tree[i] {
                        continue;
                    }
                    // Relax the candidate against the node that was just added.
                    let mrd = self.common.calc_mutual_reachability_dist(left_node_id, i);
                    if mrd < *dist {
                        *dist = mrd;
                    }
                    // Strict comparison keeps the lowest index on ties.
                    let is_closer = match closest {
                        Some((_, best)) => *dist < best,
                        None => true,
                    };
                    if is_closer {
                        closest = Some((i, *dist));
                    }
                }

                // Each pass adds exactly one new node, so with n - 1 passes over n nodes at
                // least one node is always still outside the tree here.
                let (right_node_id, distance) =
                    closest.expect("at least one sample must remain outside the tree");

                mst.push(MSTEdge {
                    left_node_id,
                    right_node_id,
                    distance,
                });
                left_node_id = right_node_id;
            }

            self.common.sort_mst_by_dist(&mut mst);
            mst
        }
    }
}

#[cfg(feature = "parallel")]
pub(crate) mod parallel {
    use super::*;
    use crate::data_wrappers::MSTEdge;
    use num_traits::Float;
    use rayon::prelude::*;

    /// Prim's algorithm over mutual reachability distances, with the per-step distance update
    /// and minimum search run through rayon's parallel iterators.
    #[derive(Clone, Debug)]
    pub(crate) struct PrimsMinSpanningTreePar<'a, T> {
        common: MinSpanningTreeCommon<'a, T>,
    }

    impl<'a, T: Float + Send + Sync> PrimsMinSpanningTreePar<'a, T> {
        /// Creates a builder over the given samples, distance metric and per-sample core
        /// distances. Nothing is computed until `compute` is called.
        pub(crate) fn new(
            data: &'a [Vec<T>],
            dist_metric: DistanceMetric,
            core_distances: &'a [T],
        ) -> Self {
            let common = MinSpanningTreeCommon::new(data, dist_metric, core_distances);
            PrimsMinSpanningTreePar { common }
        }
    }

    impl<'a, T: Float + Send + Sync> MinSpanningTree<'a, T> for PrimsMinSpanningTreePar<'a, T> {
        /// Grows the tree from sample 0. On every step the candidate distances of all samples
        /// outside the tree are updated against the newly added node and the smallest one is
        /// selected, both within a single parallel pass. The edges are returned sorted by
        /// ascending distance.
        ///
        /// Returns an empty vector when there are fewer than two samples.
        ///
        /// # Panics
        /// Panics if two candidate or edge distances cannot be ordered.
        fn compute(&self) -> Vec<MSTEdge<T>> {
            let n_samples = self.common.n_samples;
            // With zero or one sample there are no edges to build; returning early also
            // avoids indexing into empty work buffers.
            if n_samples < 2 {
                return Vec::new();
            }

            let mut in_tree = vec![false; n_samples];
            let mut distances = vec![T::infinity(); n_samples];
            // A spanning tree over n nodes has exactly n - 1 edges.
            let mut mst = Vec::with_capacity(n_samples - 1);

            let mut left_node_id = 0;

            for _ in 1..n_samples {
                in_tree[left_node_id] = true;

                let (min_idx, min_dist) = distances
                    .par_iter_mut()
                    .enumerate()
                    .filter_map(|(i, dist)| {
                        if in_tree[i] {
                            None
                        } else {
                            // Relax the candidate against the node that was just added.
                            let mrd = self.common.calc_mutual_reachability_dist(left_node_id, i);
                            if mrd < *dist {
                                *dist = mrd;
                            }
                            Some((i, *dist))
                        }
                    })
                    .min_by(|(_, dist_a), (_, dist_b)| {
                        dist_a.partial_cmp(dist_b).expect("Invalid floats")
                    })
                    // Each pass adds exactly one new node, so at least one sample is always
                    // still outside the tree and the filtered iterator is never empty.
                    .expect("Malformed distance array");

                mst.push(MSTEdge {
                    left_node_id,
                    right_node_id: min_idx,
                    distance: min_dist,
                });
                left_node_id = min_idx;
            }

            self.common.sort_mst_by_dist(&mut mst);
            mst
        }
    }
}
Loading