Skip to content

Commit 5bb40b8

Browse files
authored
Optimize duplicated.Splits() (#260)
1 parent 85063f4 commit 5bb40b8

File tree

3 files changed

+26
-33
lines changed

3 files changed

+26
-33
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: TreeTools
22
Title: Create, Modify and Analyse Phylogenetic Trees
3-
Version: 2.1.0.9006
3+
Version: 2.1.0.9007
44
Authors@R: c(
55
person("Martin R.", 'Smith', role = c("aut", "cre", "cph"),
66
email = "martin.smith@durham.ac.uk",

NEWS.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1-
# TreeTools 2.1.0.9006 (2026-03-13) #
1+
# TreeTools 2.1.0.9007 (2026-03-13) #
2+
3+
- `duplicated.Splits()` uses hash-based, expected O(n) de-duplication, replacing
4+
O(n²) pairwise comparison.
25

6+
# TreeTools 2.1.0.9006 (2026-03-13) #
7+
38
- `NodeDepth()` for unrooted trees rewritten as O(n) two-pass C++ algorithm,
49
replacing iterative R while-loop.
510

src/splits.cpp

Lines changed: 19 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include <Rcpp/Lightest>
22
#include <memory> // for make_unique
33
#include <stdexcept> /* for errors */
4+
#include <string> /* for string (hash key) */
5+
#include <unordered_set> /* for unordered_set */
46
#include "../inst/include/TreeTools/assert.h" /* for ASSERT */
57
#include "../inst/include/TreeTools.h"
68

@@ -197,44 +199,30 @@ LogicalVector duplicated_splits(const RawMatrix splits,
197199
}
198200
}
199201

202+
// Hash-based deduplication: expected O(n) (average-case constant-time inserts)
200203
LogicalVector ret(n_split);
204+
std::unordered_set<std::string> seen;
205+
seen.reserve(n_split * 2);
206+
std::string key(check_bins, '\0');
207+
201208
if (fromLast[0]) {
202-
for (intx it = n_split - 1; it--; ) {
203-
const intx i = it + 1; // nothing to duplicate split(0, _)
204-
if (ret[i]) {
205-
continue;
209+
// Scan from end; first seen (from end) is kept, earlier dupes are marked
210+
for (intx i = n_split; i--; ) {
211+
for (intx b = 0; b < check_bins; ++b) {
212+
key[b] = static_cast<char>(compare(i, b));
206213
}
207-
for (intx j = i; j--; ) {
208-
// Rcout << " check split " << i << " (" << uintx(compare(i, 0)) <<
209-
// ") vs " << j << " (" << uintx(compare(j, 0)) << "): ";
210-
for(intx bin = 0; compare(i, bin) == compare(j, bin); ) {
211-
// Rcout << " [bin " << bin << "] ";
212-
++bin;
213-
if (bin == check_bins) {
214-
// Rcout << "Duplicate!";
215-
ret[j] = true;
216-
break;
217-
}
218-
}
219-
// Rcout << "\n";
220-
214+
if (!seen.insert(key).second) {
215+
ret[i] = true;
221216
}
222217
}
223218
} else {
224-
for (intx i = 0; i != n_split - 1; ++i) {
225-
if (ret[i]) {
226-
continue;
219+
// Scan from start; first seen is kept, later dupes are marked
220+
for (intx i = 0; i < n_split; ++i) {
221+
for (intx b = 0; b < check_bins; ++b) {
222+
key[b] = static_cast<char>(compare(i, b));
227223
}
228-
for (intx j = i + 1; j != n_split; ++j) {
229-
230-
for(intx bin = 0; compare(i, bin) == compare(j, bin); ) {
231-
++bin;
232-
if (bin == check_bins) {
233-
ret[j] = true;
234-
break;
235-
}
236-
}
237-
224+
if (!seen.insert(key).second) {
225+
ret[i] = true;
238226
}
239227
}
240228
}

0 commit comments

Comments
 (0)