Skip to content

Commit a221fac

Browse files
authored
hash map (#258)
1 parent 906ce2c commit a221fac

File tree

5 files changed

+32
-15
lines changed

5 files changed

+32
-15
lines changed

.github/workflows/R-CMD-check.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ on:
2121
- "**.R[dD]ata"
2222
- "**.Rpro*"
2323
pull_request:
24-
branches:
25-
- main
26-
- master
24+
branches: ["*"]
2725
paths-ignore:
2826
- "Meta**"
2927
- "memcheck**"

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,4 @@ binary 0/1 in an underlying `raw` object.
3434
before moving on to the next task.
3535
- Increment the `.900X` dev version suffix in `DESCRIPTION` with each
3636
`NEWS.md` update.
37+
- Check that existing tests cover all new code. (The GHA test suite uses codecov.)

NEWS.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# TreeTools 2.1.0.9005 (2026-03-13) #
22

3-
- `SplitFrequency(reference = NULL)` split normalization moved to C++,
4-
eliminating an R-level per-split loop.
3+
- `SplitFrequency(reference = NULL)` performance improvements:
4+
split normalization moved to C++; internal split de-duplication uses
5+
hash map instead of ordered map.
56

67
# TreeTools 2.1.0.9003 (2026-03-09) #
78

src/consensus.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ using namespace Rcpp;
66

77
#include <algorithm> /* for fill */
88
#include <array> /* for array */
9-
#include <map> /* for map */
9+
#include <string> /* for string (hash key) */
10+
#include <unordered_map> /* for unordered_map */
1011

1112
using TreeTools::ct_stack_threshold;
1213
using TreeTools::ct_max_leaves_heap;
@@ -178,12 +179,15 @@ List calc_split_frequencies(
178179

179180
StackEntry *const S_start = S.data();
180181

181-
// Use a map to store unique splits and their counts
182-
// Key: split bit pattern; Value: index in output
183-
std::map<std::vector<Rbyte>, int32> split_map;
182+
// Hash map for O(1) amortized split deduplication
183+
std::unordered_map<std::string, int32> split_map;
184+
split_map.reserve(ntip_3 * 2);
184185
std::vector<std::vector<Rbyte>> split_patterns;
185186
std::vector<int32> counts;
186187

188+
// Reusable key buffer — avoids per-split heap allocation
189+
std::string key(nbin, '\0');
190+
187191
for (int32 i = 0; i < n_trees; ++i) {
188192
if (tables[i].NOSWX(ntip_3)) {
189193
continue;
@@ -247,21 +251,21 @@ List calc_split_frequencies(
247251
const int32 end = tables[i].X_right(k + 1);
248252
if (start == 0 && end == 0) continue; // No valid cluster at this position
249253

250-
// Build the bit pattern for this split
251-
std::vector<Rbyte> pattern(nbin, 0);
254+
// Build the bit pattern into the reusable key buffer
255+
std::fill(key.begin(), key.end(), '\0');
252256
for (int32 j = start; j <= end; ++j) {
253257
const int32 leaf_idx = tables[i].DECODE(j) - 1; // 0-based
254258
const int32 byte_idx = leaf_idx >> 3;
255259
const int32 bit_idx = leaf_idx & 7;
256-
pattern[byte_idx] |= (Rbyte(1) << bit_idx);
260+
key[byte_idx] |= static_cast<char>(1 << bit_idx);
257261
}
258262

259-
auto it = split_map.find(pattern);
263+
auto it = split_map.find(key);
260264
if (it == split_map.end()) {
261265
// New split: record it with count from this reference tree
262266
const int32 idx = split_patterns.size();
263-
split_map[pattern] = idx;
264-
split_patterns.push_back(std::move(pattern));
267+
split_map.emplace(key, idx);
268+
split_patterns.emplace_back(key.begin(), key.end());
265269
counts.push_back(split_count[k]);
266270
}
267271
// If already found, the first reference tree that found it has the

tests/testthat/test-Support.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,19 @@ test_that("Node support colours consistent", {
5959
c("oor", "1", "34", "67", "101"))
6060
})
6161

62+
test_that("normalize_splits() covers all branches", {
63+
# n_spare == 0 (nTip multiple of 8): complement without trailing-bit mask
64+
trees16 <- c(PectinateTree(16), PectinateTree(16))
65+
freq16 <- SplitFrequency(trees16)
66+
expect_equal(length(freq16), NSplits(trees16[[1]]))
67+
expect_true(all(attr(freq16, "count") == 2L))
68+
69+
# Trivial splits are filtered: a star tree produces no non-trivial splits
70+
star <- c(StarTree(10), StarTree(10))
71+
freq_star <- SplitFrequency(star)
72+
expect_equal(length(freq_star), 0)
73+
})
74+
6275
test_that("SplitFrequency() handles four-split trees", {
6376
trees <- AddTipEverywhere(BalancedTree(3))
6477
trees <- c(trees[1], trees)

0 commit comments

Comments
 (0)