Skip to content

Commit 70502e1

Browse files
authored
asm popcounts (#256)
1 parent a13532f commit 70502e1

File tree

7 files changed

+36
-23
lines changed

7 files changed

+36
-23
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: TreeTools
22
Title: Create, Modify and Analyse Phylogenetic Trees
3-
Version: 2.1.0.9003
3+
Version: 2.1.0.9004
44
Authors@R: c(
55
person("Martin R.", 'Smith', role = c("aut", "cre", "cph"),
66
email = "martin.smith@durham.ac.uk",

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# TreeTools 2.1.0.9004 (2026-03-12) #
2+
3+
- Rewrite popcount calculation for more efficient `TipsInSplits()`.
4+
15
# TreeTools 2.1.0.9003 (2026-03-09) #
26

37
- `inst/include/TreeTools/tree_number.h` added to support downstream packages

benchmark/_compare_results.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,18 @@ for (pr_file in pr_files) {
5050
main_iqr)
5151

5252
threshold_percent <- 6 # Changes of ~5% are frequent
53+
# Sub-millisecond benchmarks are dominated by system jitter on CI runners;
54+
# require an absolute difference floor before flagging.
55+
min_meaningful_diff <- 2e-4 # 0.2 ms (times are in seconds)
56+
abs_diff <- abs(median_pr - median_main)
5357

5458
is_faster <- matched &&
59+
abs_diff > min_meaningful_diff &&
5560
median_pr < median_main - 2 * mad_main &&
5661
median_pr < noise_range[[1]]
5762

5863
is_slower <- matched &&
64+
abs_diff > min_meaningful_diff &&
5965
median_pr > median_main + 2 * mad_main &&
6066
median_pr > noise_range[[2]]
6167

benchmark/_init.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
library("TreeTools")
22

3-
Benchmark <- function(..., min_iterations = NULL) {
4-
result <- bench::mark(..., min_iterations = min_iterations %||% 3,
5-
time_unit = "us")
3+
Benchmark <- function(..., min_iterations = NULL, min_time = NULL) {
4+
args <- list(..., min_iterations = min_iterations %||% 3, time_unit = "us")
5+
if (!is.null(min_time)) args[["min_time"]] <- min_time
6+
result <- do.call(bench::mark, args)
67
if (interactive()) {
78
print(result)
89
} else {

benchmark/bench-PathLengths.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@ source("benchmark/_init.R") # sets seed
22

33
tr80 <- rtree(80)
44

5-
Benchmark(TreeTools:::path_lengths(tr80$edge, tr80$edge.length, FALSE))
6-
Benchmark(PathLengths(tr80, full = TRUE))
5+
Benchmark(TreeTools:::path_lengths(tr80$edge, tr80$edge.length, FALSE),
6+
min_time = 2)
7+
Benchmark(PathLengths(tr80, full = TRUE), min_time = 2)
78

89
tr80Unif <- tr80
910
tr80Unif[["edge.length"]] <- NULL
10-
Benchmark(PathLengths(tr80Unif, full = TRUE))
11+
Benchmark(PathLengths(tr80Unif, full = TRUE), min_time = 2)
1112

1213
tr2000 <- rtree(2000)
1314
Benchmark(PathLengths(tr2000, full = TRUE))

inst/include/TreeTools/SplitList.h

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,25 @@ namespace TreeTools {
3838
return T(1) << bit_pos;
3939
}
4040

41-
#if __cplusplus >= 202002L
42-
#include <bit> // C++20 header for std::popcount
41+
// Hardware POPCNT: present on x86-64 CPUs from AMD Barcelona (2007) / Intel Nehalem (2008) onward; not part of the baseline x86-64 ISA.
42+
// Inline asm emits the instruction directly, without requiring -mpopcnt; a CPU lacking POPCNT will raise an illegal-instruction fault.
43+
#if (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__)
4344
inline int32 count_bits(splitbit x) {
44-
return static_cast<int32>(std::popcount(x));
45+
uint64_t result;
46+
__asm__ ("popcnt %1, %0" : "=r" (result) : "r" (x));
47+
return static_cast<int32>(result);
4548
}
46-
// Option 2: Fallback for C++17 and older
47-
#else
48-
#if defined(__GNUC__) || defined(__clang__)
49-
// GCC and Clang support __builtin_popcountll for long long
50-
inline int32 count_bits(splitbit x) {
51-
return static_cast<int32>(__builtin_popcountll(x));
52-
}
53-
#elif defined(_MSC_VER)
49+
#elif defined(_MSC_VER) && defined(_M_X64)
5450
#include <intrin.h>
5551
inline int32 count_bits(splitbit x) {
5652
return static_cast<int32>(__popcnt64(x));
5753
}
54+
#elif defined(__GNUC__) || defined(__clang__)
55+
// Other GCC/Clang targets (ARM, 32-bit x86, etc.): builtin uses a native instruction where the target provides one
56+
inline int32 count_bits(splitbit x) {
57+
return static_cast<int32>(__builtin_popcountll(x));
58+
}
5859
#else
59-
// A slower, but safe and highly portable fallback for all other compilers
60-
// This is a last resort if no built-in is available.
6160
inline int32_t count_bits(splitbit x) {
6261
int32_t count = 0;
6362
while (x != 0) {
@@ -66,9 +65,7 @@ namespace TreeTools {
6665
}
6766
return count;
6867
}
69-
#endif // Compiler check for builtins
70-
71-
#endif // C++20 check
68+
#endif
7269

7370
class SplitList {
7471
public:

src/tips_in_splits.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ Rcpp::IntegerVector tips_in_splits(Rcpp::RawMatrix splits) {
2222
const unsigned char* splits_i = splits.begin() + i;
2323
for (int32 bin = 0; bin < n_bin; ++bin) {
2424
const unsigned char* in_bin = splits_i + bin * n_split;
25+
// __builtin_popcount on 8-bit values; the inline asm POPCNT in
26+
// count_bits (SplitList.h) handles 64-bit splitbit values.
27+
// For 8-bit values, __builtin_popcount is fine — it's a tiny
28+
// fraction of runtime and not worth inline asm.
2529
ret[i] += static_cast<int>(__builtin_popcount(*in_bin));
2630
}
2731
}

0 commit comments

Comments
 (0)