From 19c81fa099737fc71dcf9d81d324c3bc3efa3dcf Mon Sep 17 00:00:00 2001 From: Youcef L Date: Mon, 21 Apr 2025 22:34:14 +0000 Subject: [PATCH 01/22] Backup intermediate state. --- lfp.hpp | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 2 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index cf1363f..c08688d 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace lfp { @@ -430,6 +431,78 @@ constexpr bool bitmask_pack::is_prime(uint8_t n) template using bitmask = bitmask_impl; + +template +class bucket +{ +public: + static_assert(Offsets >= 1, "There must be at least one offset."); + struct entry { + /// Offsets of the bits to clear corresponding to multiples of prime_ + /// The offsets to clear are offsets_[0], offsets_[0] + offsets_[1], ..., offsets_[0]+offsets_[7] + alignas(64) std::uint32_t offsets_[Offsets]; + PrimeT prime_; + }; + using entries_type = std::vector; +public: + using offsets = decltype(entry::offsets_); + constexpr explicit bucket(std::size_t initialCapacity); + constexpr std::size_t size() const; + constexpr entries_type const & entries() const; + constexpr void clear(); + constexpr offsets & add(PrimeT prime, std::uint32_t offsetOfPrime); +private: + entries_type entries_; + std::size_t size_; +}; + +template +constexpr +bucket::bucket(std::size_t initialCapacity) + : size_(0) +{ + entries_.reserve(initialCapacity); +} + +template +constexpr std::size_t +bucket::size() const +{ + return size_; +} + +template +constexpr void +bucket::clear() +{ + size_ = 0; +} + + +template +constexpr bucket::offsets & +bucket::add(PrimeT prime, std::uint32_t offsetOfPrime) +{ + if(size_ < entries_.size()) { + entries_[size_] = entry{.prime_ = prime}; + } else { + entries_.push_back(entry{.prime_ = prime}); + } + auto & offsets = entries_[size_].offsets_; + offsets[0] = offsetOfPrime; + ++size_; + return offsets; +} + +template +constexpr bucket::entries_type const & +bucket::entries() const +{ + return entries_; +} + + + inline constexpr int8_t adjt[8][14] = { {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 4, 0, 4, 2}, @@ -467,6 +540,7 @@ find_first_multiple_above(U p, V n0) template class PrimesIterator; + class Bitmap { public: @@ -482,6 +556,8 @@ class Bitmap constexpr void apply(bitmask_impl const & mask); template constexpr void apply(bitmask_pack const & maskPack); + template + constexpr void apply(bucket const & bkt); constexpr uint64_t popcount() const; void check() const; constexpr uint8_t at(std::size_t index) const; @@ -501,10 +577,12 @@ class Bitmap constexpr mask_application_data compute_mask_application_data(bitmask_impl const & bmk) const; template constexpr void apply(bitmask_impl const & bmk, mask_application_data & mappData, std::size_t endOffset); + constexpr void apply(bucket<1, value_type>::entry const & entry); + constexpr void apply(bucket<8, value_type>::entry const & entry); std::vector> vec_; std::size_t size_; - uint64_t n0_; + value_type n0_; static constexpr std::array d_{1,0,5,4,3,2,1,0,3,2,1,0,1,0,3,2,1,0,1,0,3,2,1,0,5,4,3,2,1,0}; static constexpr std::array residues_{1,7,11,13,17,19,23,29}; static constexpr std::array deltas_{6,4,2,4,2,4,6,2}; @@ -777,6 +855,56 @@ void Bitmap::apply(bitmask_pack const & maskPack) } } +template +constexpr void +Bitmap::apply(bucket const & bkt) +{ + static_assert(NumOffsets == 1 || NumOffsets == 8, + "The number of offsets must be 1 or 8, other values not supported yet!"); + auto const & entries = bkt.entries(); + if(entries.empty()) { + return; + } + for(auto const & entry : entries) { + apply(entry); + } +} + + +constexpr void +Bitmap::apply(bucket<1, value_type>::entry const & entry) +{ + assert(entry.offsets_[0] < size_); + reset(entry.offsets_[0]); +} + + +constexpr void +Bitmap::apply(bucket<8, value_type>::entry const & entry) +{ + std::size_t i = entry.offsets_[0]; + auto const & offsets = entry.offsets_; + auto p = entry.prime_; + if(offsets[7]) { + for(; i + 8 * p < size_; i += 8 * p) { + reset(i); + reset(i + offsets[1]); + reset(i + offsets[2]); + reset(i + offsets[3]); + reset(i + offsets[4]); + reset(i + offsets[5]); + reset(i + offsets[6]); + reset(i + offsets[7]); + } + } + const auto i0 = i; + for(auto j = 1; i < size_; i = i0 + offsets[j], ++j) { + reset(i); + if((j == std::size(offsets)) || !offsets[j]) { + break; + } + } +} inline constexpr uint8_t wheel[8][8] = { {6,4,2,4,2,4,6,2}, @@ -1467,4 +1595,3 @@ std::size_t count_primes(U n0, U n1, Threads const & threads) } // namespace lfp - From e11521f3a8a642ef2d0a27f1eecfab6cb3af7907 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 22 Apr 2025 22:43:42 +0000 Subject: [PATCH 02/22] Backup. --- lfp.hpp | 192 ++++++++++++++++++++++++++++++++++++++++-------------- tests.cpp | 6 +- 2 files changed, 146 insertions(+), 52 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index c08688d..f08bf80 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -71,6 +71,11 @@ struct Threads namespace details { +/// Constants +inline constexpr std::size_t bucket_sieve_threshold = 16384; +inline constexpr std::size_t max_num_primes_in_bucket = 16384+8192; +inline constexpr std::size_t initial_bucket_capacity = 65536; + template inline constexpr auto u8primes = std::to_array({ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, @@ -443,16 +448,16 @@ class bucket alignas(64) std::uint32_t offsets_[Offsets]; PrimeT prime_; }; - using entries_type = std::vector; -public: using offsets = decltype(entry::offsets_); + constexpr bucket() = default; constexpr explicit bucket(std::size_t initialCapacity); constexpr std::size_t size() const; - constexpr entries_type const & entries() const; + constexpr auto entries() const; constexpr void clear(); constexpr offsets & add(PrimeT prime, std::uint32_t offsetOfPrime); + constexpr void sort(); private: - entries_type entries_; + std::vector entries_; std::size_t size_; }; @@ -495,10 +500,19 @@ bucket::add(PrimeT prime, std::uint32_t offsetOfPrime) } template -constexpr bucket::entries_type const & +constexpr auto bucket::entries() const { - return entries_; + return std::ranges::subrange(std::begin(entries_), std::begin(entries_) + size_); +} + +template +constexpr void +bucket::sort() +{ + std::sort(std::begin(entries_), std::end(entries_), [](auto const & x, auto const & y){ + return x.offsets_[0] < y.offsets_[0]; + }); } @@ -556,8 +570,8 @@ class Bitmap constexpr void apply(bitmask_impl const & mask); template constexpr void apply(bitmask_pack const & maskPack); - template - constexpr void apply(bucket const & bkt); + template + constexpr void apply(bucket & bkt); constexpr uint64_t popcount() const; void check() const; constexpr uint8_t at(std::size_t index) const; @@ -577,8 +591,10 @@ class Bitmap constexpr mask_application_data compute_mask_application_data(bitmask_impl const & bmk) const; template constexpr void apply(bitmask_impl const & bmk, mask_application_data & mappData, std::size_t endOffset); - constexpr void apply(bucket<1, value_type>::entry const & entry); - constexpr void apply(bucket<8, value_type>::entry const & entry); + template + constexpr void apply(bucket<1, PrimeT>::entry const & entry); + template + constexpr void apply(bucket<8, PrimeT>::entry const & entry); std::vector> vec_; std::size_t size_; @@ -855,32 +871,36 @@ void Bitmap::apply(bitmask_pack const & maskPack) } } -template +template constexpr void -Bitmap::apply(bucket const & bkt) +Bitmap::apply(bucket & bkt) { static_assert(NumOffsets == 1 || NumOffsets == 8, "The number of offsets must be 1 or 8, other values not supported yet!"); - auto const & entries = bkt.entries(); + static_assert(sizeof(PrimeT) <= sizeof(value_type)); + auto entries = bkt.entries(); if(entries.empty()) { return; } + bkt.sort(); for(auto const & entry : entries) { - apply(entry); + apply(entry); } } +template constexpr void -Bitmap::apply(bucket<1, value_type>::entry const & entry) +Bitmap::apply(bucket<1, PrimeT>::entry const & entry) { assert(entry.offsets_[0] < size_); reset(entry.offsets_[0]); } +template constexpr void -Bitmap::apply(bucket<8, value_type>::entry const & entry) +Bitmap::apply(bucket<8, PrimeT>::entry const & entry) { std::size_t i = entry.offsets_[0]; auto const & offsets = entry.offsets_; @@ -959,9 +979,56 @@ Ret compute_lt_coprime(U n1) template inline constexpr std::array primesBelowSix = {2, 3, 5}; -template +template +constexpr std::int32_t compute_offsets(std::array & offsets, + Bitmap const & bmp, + U p, + V c, + U ne + ) +{ + std::ranges::fill(offsets, 0); + constexpr auto c_max = std::numeric_limits::max(); + auto count = 0; + int32_t firstIndex = -1; + auto offsIdx = 0; + auto prevDelta = 0; + for(auto j = whoffs[(p % 30) * 4 / 15][(c % 30) * 4 / 15]; + c <= ne; + c = ((c_max - c < wheel[(p % 30) * 4 / 15][j] * p) + ? ne + 1 + : c + wheel[(p % 30) * 4 / 15][j] * p), j = (j + 1) % 8) { + auto currIdx = bmp.indexOf(c); + if(offsIdx == 0) { + firstIndex = currIdx; + } + + if(offsIdx > 0 && offsIdx <= offsets.size()) { + offsets[offsIdx - 1] = currIdx - firstIndex; + } + ++offsIdx; + if(offsIdx > offsets.size()) { + break; + } + } + return firstIndex; +} + +template +struct sieve_data +{ + Bitmap * bitmap_ = nullptr; + bool have_to_initialize_bitmap_ = true; + /// Bucket used when there is one, exactly one hit in the range being sieved + bucket<1, PrimeT> bucket_1_hit_; + /// Bucket used when there are more than one hit in the range being sieved + bucket<8, PrimeT> bucket_8_hits_; +}; + + +template constexpr auto -inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool initBmp = true) +inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) { auto const & tinyPrimes = primesBelowSix; if((n0 >= n1) || (n0 > std::numeric_limits::max() - 2)) { @@ -981,30 +1048,42 @@ inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool init } } n0 = compute_gte_coprime(n0); - auto ne = compute_lt_coprime(n1); + auto ne = compute_lt_coprime(n1); if(n0 > ne) { return ff(it0, it0, nullptr); } - if(initBmp) { - bmp.assign(n0, (ne - n0)/30 * 8 + ((ne % 30) >= (n0 % 30) ? (ne%30)*4/15 - (n0%30)*4/15 : 8 - (n0%30)*4/15 + (ne%30)*4/15) + 1); + if(sievdat.have_to_initialize_bitmap_) { + sievdat.bitmap_->assign(n0, (ne - n0)/30 * 8 + + ((ne % 30) >= (n0 % 30) + ? (ne % 30) * 4 / 15 - (n0 % 30) * 4 / 15 + : 8 - (n0 % 30) * 4 / 15 + (ne % 30) * 4 / 15) + 1); } + auto & bmp = *sievdat.bitmap_; // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap constexpr unsigned int lastSmallPrime = 103; constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; - if(std::ranges::distance(smallPrimes | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }))) { - bitmask_pack::value_type, + if(std::ranges::distance(basePrimes | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }))) { + bitmask_pack::value_type, 7, 11, 13, 17, 19, 23, 29, 31> bitmasks_1; - bitmask_pack::value_type, + bitmask_pack::value_type, 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; - bitmask_pack::value_type, + bitmask_pack::value_type, 71, 73, 79, 83, 89, 97, 101, lastSmallPrime> bitmasks_3; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); } - for(auto p : smallPrimes | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; })) { + std::size_t bucketPrimesCount{}; + auto applyBucketsAndClear = [&]() { + bmp.apply(sievdat.bucket_1_hit_); + sievdat.bucket_1_hit_.clear(); + bmp.apply(sievdat.bucket_8_hits_); + sievdat.bucket_8_hits_.clear(); + }; + + for(auto p : basePrimes | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; })) { auto p2 = U{p} * p; if(p2 > ne) { break; @@ -1022,31 +1101,30 @@ inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool init continue; } - constexpr auto c_max = std::numeric_limits::max(); - auto count = 0; - int32_t firstIndex = -1; - std::array offsets{}; - auto offsIdx = 0; - auto prevDelta = 0; - for(auto j = whoffs[(p%30)*4/15][(c%30)*4/15]; c <= ne; - c = ((c_max - c < wheel[(p%30)*4/15][j]*p) ? ne + 1 : c + wheel[(p%30)*4/15][j]*p), j = (j+1)%8) { - auto currIdx = bmp.indexOf(c); - if(offsIdx == 0) { - firstIndex = currIdx; - } + std::array offsets; + std::int32_t startOffs = compute_offsets(offsets, bmp, p, c, ne); - if(offsIdx > 0 && offsIdx <= offsets.size()) { - offsets[offsIdx - 1] = currIdx - firstIndex; + if((startOffs < 0) || (startOffs >= bmp.size())) { + // No multiple of p greater or equal to p^2 in the current segment + continue; + } + + if(p >= bucket_sieve_threshold) { + if(!offsets[0]) { // exactly one hit + sievdat.bucket_1_hit_.add(p, startOffs); + } else { + auto & entryOffsets = sievdat.bucket_8_hits_.add(p, startOffs); + std::copy(std::begin(offsets), std::end(offsets), &entryOffsets[1]); } - ++offsIdx; - if(offsIdx > offsets.size()) { - break; + ++bucketPrimesCount; + if(bucketPrimesCount == max_num_primes_in_bucket) { + applyBucketsAndClear(); + bucketPrimesCount = 0; } - } - if((firstIndex < 0) || (firstIndex >= bmp.size())) { + continue; } - std::size_t i = firstIndex; + std::size_t i = startOffs; if(offsets[6]) { for(; i + 8 * p < bmp.size(); i += 8 * p) { bmp.reset(i); @@ -1067,6 +1145,10 @@ inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool init } } } + if(bucketPrimesCount) { + applyBucketsAndClear(); + } + return ff(it0, std::end(tinyPrimes), &bmp); } @@ -1386,10 +1468,11 @@ template inline constexpr auto u16primes = []() { auto sv = [] { Bitmap bmp; + sieve_data sievdat{.bitmap_ = &bmp}; return inner_sieve( u8primes, uint16_t(0), uint16_t(65535), collectSieveResults, - bmp); + sievdat); }; std::array u16primesArr; std::ranges::copy(sv(), std::begin(u16primesArr)); @@ -1425,7 +1508,11 @@ sieve(I k0, I k1, Fct ff) bitmaps.reserve((n1 - n0)/innerRangeSize + 1); std::vector prefix; prefix.reserve(3); - + + details::sieve_data sievdat{ + .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, + .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} + }; for(auto a0 = n0, a1 = std::min(n1, (maxn - innerRangeSize < n0) ? maxn : U(n0 + innerRangeSize)); a0 < n1; @@ -1459,17 +1546,22 @@ sieve(I k0, I k1, Fct ff) return u16primes; } }(); + sievdat.bitmap_ = &primesBmp; + sievdat.have_to_initialize_bitmap_ = true; details::inner_sieve(basePrimes, m0, m1, - [](auto, auto, details::Bitmap const *){ }, primesBmp); + [](auto, auto, details::Bitmap const *){ }, sievdat); + details::PrimesIterator itP{&primesBmp}, itPe{&primesBmp, true}; auto basePrimesRange = std::ranges::subrange(itP, itPe); + sievdat.bitmap_ = &currSegBmp; + sievdat.have_to_initialize_bitmap_ = m0 == 0; details::inner_sieve(basePrimesRange, a0, a1, [&](auto it, auto ite, details::Bitmap const*){ if(it != ite) { prefix = std::vector{it, ite}; } return 0; - }, currSegBmp, m0 == 0); + }, sievdat); } } return ff(prefix, bitmaps); diff --git a/tests.cpp b/tests.cpp index c132f64..4fc8e8e 100644 --- a/tests.cpp +++ b/tests.cpp @@ -166,13 +166,15 @@ TEST_CASE("Sieve of Erathostenes - primes iterator and range") { using namespace lfp::details; { Bitmap bmp; - inner_sieve(u8primes, 300u, 400u, [](auto, auto, auto) {}, bmp); + sieve_data sievdat{.bitmap_ = &bmp}; + inner_sieve(u8primes, 300u, 400u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(300, 400))); } { Bitmap bmp; - inner_sieve(u16primes, 10000u, 12000u, [](auto, auto, auto) {}, bmp); + sieve_data sievdat{.bitmap_ = &bmp}; + inner_sieve(u16primes, 10000u, 12000u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(10000, 12000))); } From eb272aff1250b4763a087e4674c4d06bfb7ccd28 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Thu, 24 Apr 2025 12:56:19 +0000 Subject: [PATCH 03/22] Bucket sieving attempt. --- lfp.hpp | 108 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 37 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index f08bf80..58a1dd0 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace lfp { @@ -73,8 +74,8 @@ namespace details { /// Constants inline constexpr std::size_t bucket_sieve_threshold = 16384; -inline constexpr std::size_t max_num_primes_in_bucket = 16384+8192; -inline constexpr std::size_t initial_bucket_capacity = 65536; +inline constexpr std::size_t max_num_primes_in_bucket = 4096; +inline constexpr std::size_t initial_bucket_capacity = 4096; template inline constexpr auto u8primes = std::to_array({ @@ -445,7 +446,7 @@ class bucket struct entry { /// Offsets of the bits to clear corresponding to multiples of prime_ /// The offsets to clear are offsets_[0], offsets_[0] + offsets_[1], ..., offsets_[0]+offsets_[7] - alignas(64) std::uint32_t offsets_[Offsets]; + std::array offsets_; PrimeT prime_; }; using offsets = decltype(entry::offsets_); @@ -454,9 +455,12 @@ class bucket constexpr std::size_t size() const; constexpr auto entries() const; constexpr void clear(); - constexpr offsets & add(PrimeT prime, std::uint32_t offsetOfPrime); + constexpr void add(PrimeT prime, std::uint32_t offsetOfPrime) requires(Offsets == 1); + constexpr void add(PrimeT prime, std::uint32_t offsetOfPrime, + std::span nextOffsets) requires(Offsets > 1); constexpr void sort(); private: + constexpr entry & add_entry(PrimeT prime, std::uint32_t offsetOfPrime); std::vector entries_; std::size_t size_; }; @@ -483,20 +487,36 @@ bucket::clear() size_ = 0; } +template +constexpr void +bucket::add(PrimeT prime, std::uint32_t offsetOfPrime) requires(Offsets == 1) +{ + add_entry(prime, offsetOfPrime); +} template -constexpr bucket::offsets & -bucket::add(PrimeT prime, std::uint32_t offsetOfPrime) +constexpr void +bucket::add(PrimeT prime, std::uint32_t offsetOfPrime, + std::span nextOffsets) requires(Offsets > 1) { + auto & addedEntry = add_entry(prime, offsetOfPrime); + std::ranges::copy(nextOffsets, &addedEntry.offsets_[1]); +} + +template +constexpr bucket::entry & +bucket::add_entry(PrimeT prime, std::uint32_t offsetOfPrime) +{ + assert(size_ <= entries_.size()); if(size_ < entries_.size()) { entries_[size_] = entry{.prime_ = prime}; } else { entries_.push_back(entry{.prime_ = prime}); } - auto & offsets = entries_[size_].offsets_; - offsets[0] = offsetOfPrime; + auto & addedEntry = entries_[size_]; + addedEntry.offsets_[0] = offsetOfPrime; ++size_; - return offsets; + return addedEntry; } template @@ -510,7 +530,12 @@ template constexpr void bucket::sort() { - std::sort(std::begin(entries_), std::end(entries_), [](auto const & x, auto const & y){ + // Sorting has bad effect on performance deactovated until figuring out what to do + // (this condition is always true). + if constexpr (Offsets > 0) { + return; + } + std::sort(std::begin(entries_), std::begin(entries_) + size_, [](auto const & x, auto const & y){ return x.offsets_[0] < y.offsets_[0]; }); } @@ -1075,8 +1100,7 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) bmp.apply(bitmasks_3); } - std::size_t bucketPrimesCount{}; - auto applyBucketsAndClear = [&]() { + auto applyAndClearBuckets = [&]() { bmp.apply(sievdat.bucket_1_hit_); sievdat.bucket_1_hit_.clear(); bmp.apply(sievdat.bucket_8_hits_); @@ -1113,13 +1137,10 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) if(!offsets[0]) { // exactly one hit sievdat.bucket_1_hit_.add(p, startOffs); } else { - auto & entryOffsets = sievdat.bucket_8_hits_.add(p, startOffs); - std::copy(std::begin(offsets), std::end(offsets), &entryOffsets[1]); + sievdat.bucket_8_hits_.add(p, startOffs, offsets); } - ++bucketPrimesCount; - if(bucketPrimesCount == max_num_primes_in_bucket) { - applyBucketsAndClear(); - bucketPrimesCount = 0; + if(sievdat.bucket_1_hit_.size() + sievdat.bucket_8_hits_.size() >= max_num_primes_in_bucket) { + applyAndClearBuckets(); } continue; @@ -1145,9 +1166,6 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) } } } - if(bucketPrimesCount) { - applyBucketsAndClear(); - } return ff(it0, std::end(tinyPrimes), &bmp); } @@ -1502,17 +1520,14 @@ sieve(I k0, I k1, Fct ff) if constexpr (is_one_of_v) { return maxn; } else { - return U{48*1024*1024}; + return U{24*1024*1024}; } }(); - std::vector bitmaps; - bitmaps.reserve((n1 - n0)/innerRangeSize + 1); std::vector prefix; prefix.reserve(3); - - details::sieve_data sievdat{ - .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, - .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} - }; + std::vector bitmaps; + bitmaps.reserve((n1 - n0)/innerRangeSize + 1); + std::vector> sievdats; + sievdats.reserve((n1 - n0)/innerRangeSize + 1); for(auto a0 = n0, a1 = std::min(n1, (maxn - innerRangeSize < n0) ? maxn : U(n0 + innerRangeSize)); a0 < n1; @@ -1532,13 +1547,18 @@ sieve(I k0, I k1, Fct ff) } }(); constexpr auto maxm = (U{1} << (std::numeric_limits::digits / 2)) - 1; bitmaps.emplace_back(details::Bitmap{}); - auto & currSegBmp = bitmaps.back(); + sievdats.emplace_back(details::sieve_data{ + .bitmap_ = &bitmaps.back(), + .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, + .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} + }); + auto & sievdat = sievdats.back(); for(U m0 = 0, m1 = rangeSize; (m0 < (U{1} << (std::numeric_limits::digits / 2)) - 1) && (m0 * m0 <= a1); m0 = m1, m1 = (maxm - m1 >= rangeSize) ? m1 + rangeSize : maxm) { - details::Bitmap primesBmp; + details::Bitmap basePrimesBmp; constexpr auto basePrimes = []() { if constexpr (std::is_same_v || std::is_same_v) { return u8primes; @@ -1546,14 +1566,21 @@ sieve(I k0, I k1, Fct ff) return u16primes; } }(); - sievdat.bitmap_ = &primesBmp; - sievdat.have_to_initialize_bitmap_ = true; + auto sievdatForBasePrimes = details::sieve_data{ .bitmap_ = &basePrimesBmp, + .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, + .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} + }; details::inner_sieve(basePrimes, m0, m1, - [](auto, auto, details::Bitmap const *){ }, sievdat); - - details::PrimesIterator itP{&primesBmp}, itPe{&primesBmp, true}; + [](auto, auto, details::Bitmap const *){ }, sievdatForBasePrimes); + // We have to this here otherwise the iterator below may encounter non-primes. + // Hoisting base primes generation would allow to avoid it... + basePrimesBmp.apply(sievdatForBasePrimes.bucket_1_hit_); + sievdatForBasePrimes.bucket_1_hit_.clear(); + basePrimesBmp.apply(sievdatForBasePrimes.bucket_8_hits_); + sievdatForBasePrimes.bucket_8_hits_.clear(); + + details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; auto basePrimesRange = std::ranges::subrange(itP, itPe); - sievdat.bitmap_ = &currSegBmp; sievdat.have_to_initialize_bitmap_ = m0 == 0; details::inner_sieve(basePrimesRange, a0, a1, [&](auto it, auto ite, details::Bitmap const*){ @@ -1564,6 +1591,13 @@ sieve(I k0, I k1, Fct ff) }, sievdat); } } + + for(std::size_t i = 0; i < bitmaps.size(); ++i) { + bitmaps[i].apply(sievdats[i].bucket_1_hit_); + sievdats[i].bucket_1_hit_.clear(); + bitmaps[i].apply(sievdats[i].bucket_8_hits_); + sievdats[i].bucket_8_hits_.clear(); + } return ff(prefix, bitmaps); } From fcfda3f1ac6e8ccbe13015e4002d3dbaac9f9606 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Fri, 25 Apr 2025 22:13:21 +0000 Subject: [PATCH 04/22] Backup. --- lfp.hpp | 855 ++++++++++++++++++++++++++++-------------------------- tests.cpp | 4 +- 2 files changed, 452 insertions(+), 407 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 58a1dd0..82b8634 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -1,9 +1,13 @@ /* * MIT License - * Copyright (c) 2024 Youcef Lemsafer + * Copyright (c) 2024-2025 Youcef Lemsafer * See LICENSE file for more details. * Creation date: december 2024. */ +/// @file lfp.hpp +/// @brief Header defining the lfp library +/// +/// #pragma once #include @@ -24,6 +28,14 @@ #include #include +#ifdef NDEBUG +# define LFP_ASSERT(...) ((void)0); +#else +# ifndef LFP_ASSERT +# define LFP_ASSERT(...) assert(__VA_ARGS__) +# endif +#endif + namespace lfp { @@ -77,6 +89,10 @@ inline constexpr std::size_t bucket_sieve_threshold = 16384; inline constexpr std::size_t max_num_primes_in_bucket = 4096; inline constexpr std::size_t initial_bucket_capacity = 4096; +/// Value meaning no offset i.e. the requested value is not present in the sequence +inline constexpr std::size_t noffs = (std::numeric_limits::max)(); + +/// All the primes below 2^8 in ascending order template inline constexpr auto u8primes = std::to_array({ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, @@ -87,6 +103,169 @@ inline constexpr auto u8primes = std::to_array({ 233, 239, 241, 251 }); +/// The really tiny primes, all numbers represented in a bitmap are coprime to +/// these, but they can still be returned by a sieve e.g. if the user requests +/// the primes in a range containing one of them. +template +inline constexpr std::array primesBelowSix = {2, 3, 5}; + + +// Some two dimensional wheels to speed-up the localization of the multiples of a prime +inline constexpr uint8_t wheel[8][8] = { + {6, 4, 2, 4, 2, 4, 6, 2}, + {4, 2, 4, 2, 4, 6, 2, 6}, + {2, 4, 2, 4, 6, 2, 6, 4}, + {4, 2, 4, 6, 2, 6, 4, 2}, + {2, 4, 6, 2, 6, 4, 2, 4}, + {4, 6, 2, 6, 4, 2, 4, 2}, + {6, 2, 6, 4, 2, 4, 2, 4}, + {2, 6, 4, 2, 4, 2, 4, 6} +}; + +inline constexpr uint8_t whoffs[8][8] = { + {0, 1, 2, 3, 4, 5, 6, 7}, + {2, 7, 5, 4, 1, 0, 6, 3}, + {0, 2, 6, 4, 7, 5, 1, 3}, + {6, 2, 1, 5, 4, 0, 7, 3}, + {2, 6, 7, 3, 4, 0, 1, 5}, + {0, 6, 2, 4, 1, 3, 7, 5}, + {6, 1, 3, 4, 7, 0, 2, 5}, + {0, 7, 6, 5, 4, 3, 2, 1} +}; + +/// "Adjustment" table used for finding the smallest multiple of a prime p +/// above max(p^2, n0) for a given n0. +/// More precisely, let p be a prime, we start to cross out the multiples of p +/// at p^2 but since we are using a mod 30 wheel we need those multiples to be +/// coprime to 30. What this table does is, once a multiple c of p of the form +/// p^2 + 2kp is found, it adds to c what needs to be added to make it coprime +/// to 30. +inline constexpr int8_t adjt[8][14] = { + {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, + {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 4, 0, 4, 2}, + {0, 4, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2}, + {0, 2, 2, 0, 4, 0, 0, 2, 0, 0, 2, 0, 4, 2}, + {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 4, 0, 2, 2}, + {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 4}, + {0, 2, 4, 0, 4, 0, 0, 2, 0, 0, 2, 0, 2, 2}, + {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 4} +}; + +/// Computes the smallest multiple m of p such that m >= max(p^2, n0) and +/// gcd(m, 30) = 1. +/// @param p the prime a multiple of which is requested +/// @param n0 lower bound for returned multiple of p +/// @return 0 when such a multiple does not fit in the return type. +template +constexpr auto +find_first_multiple_above(U p, V n0) +{ + const auto p2 = p * p; + constexpr auto c_max = std::numeric_limits::max(); + U dp2; + auto c = (p2 >= n0) ? p2 + : (((dp2 = (n0 - p2 + 2 * p - 1)/(2 * p)*(2 * p)), + c_max - p2 < dp2) ? 0 + : p2 + dp2); + if(!c) { + return c; + } + auto cmod30 = c % 30; + switch(cmod30) { + case 3: case 5: case 9: case 15: case 21: case 25: case 27: { + auto dc = adjt[(p % 30) * 4 / 15][cmod30 >> 1] * p; + c = (c_max - c < dc) ? 0 : c + dc; + cmod30 = c % 30; + } + break; + default:; + } + return c; +} + +// Returns the smallest integer coprime to 30 and greater than or equal to @param n0 +template +constexpr +Ret compute_gte_coprime(U n0) +{ + constexpr uint8_t dn[30] = { + 1, 0, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0, 1, 0, 3, + 2, 1, 0, 1, 0, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0 + }; + + return Ret{n0} + dn[n0 % 30]; +} + + +/// Returns the largest integer coprime to 30 and < @param n1 +template +constexpr +Ret compute_lt_coprime(U n1) +{ + constexpr uint8_t dn[30] = { + 1, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 1, + 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6 + }; + + return Ret{n1} - dn[n1 % 30]; +} + + +/// Returns the index of m relative to the index of m0. +/// @pre both m and m0 are coptime to 30 and m >= m0 +template +constexpr std::size_t index_of(U m, V m0) +{ + LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) && (std::gcd(m0, 30) == 1)); + + auto m_idx = (m % 30) * 4 / 15; + auto m0_idx = (m0 % 30) * 4 /15; + return (m - m0) / 30 * 8 + ((m_idx >= m0_idx) ? m_idx - m0_idx : 8 + m_idx - m0_idx); +} + +/// Computes the index of the first multiple of prime p in range [n0, ne] relative to the index of n0 +/// as well as the offsets of the following multiples of p relative to firstMultIdx. +/// @pre p is a prime number > 6, n0 <= n1, gcd(n0, 30) = 1 and gcd(ne, 30) = 1. +/// @post firstMultIdx is the index of the first multiple of p in [n0, n1[ or noffs if no such multiple, +/// offsets is filled with the offsets of the next multiples of p relative to firstMultIdx, +/// @return the number of offsets written to offs +template +constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, + std::array & offsets, + U p, + V n0, + V ne + ) +{ + firstMultIdx = noffs; + + auto c = find_first_multiple_above(p, n0); + if(!c) { + return 0; + } + constexpr auto c_max = std::numeric_limits::max(); + bool isFirstIndex = true; + std::size_t offsCount = 0; + auto prevDelta = 0; + for(auto j = whoffs[(p % 30) * 4 / 15][(c % 30) * 4 / 15]; + c <= ne; + c = ((c_max - c < wheel[(p % 30) * 4 / 15][j] * p) + ? ne + 1 + : c + wheel[(p % 30) * 4 / 15][j] * p), j = (j + 1) % 8) { + auto currIdx = index_of(c, n0); + if(isFirstIndex) { + firstMultIdx = currIdx; + isFirstIndex = false; + continue; + } + offsets[offsCount++] = currIdx - firstMultIdx; + if(offsCount == offsets.size()) { + break; + } + } + return offsCount; +} + /// Applies given mask to value starting at bit value + offs. /// @pre 0 <= offs < std::numeric_limits::digits @@ -104,6 +283,9 @@ constexpr void mask(U & value, U mask, V offs) value &= (offs ? (mask << (std::numeric_limits::digits - offs)) : U{}) | (~U{} >> offs); } + +/// @class allocator +/// An allocator class used when specific alignment is needed. template struct allocator { @@ -170,8 +352,15 @@ allocator::deallocate(U* ptr, std::size_t /*n*/) noexcept } +/// Useful when instantiating class bitmask_impl e.g. bitmask_impl<..., use_dictionary_t{true}, ...> using use_dictionary_t = bool; +/// @class bitmask_impl +/// Holds the bitmask to be applied to a bitmap in order to reset the bits corresponding to the multiples of a prime P. +/// @param U the limb type +/// @param P the prime number +/// @param UseDictionary whether to precompute and store the masks for all possible indices +/// @param Alignment alignment to use for the array holding the mask(s) template class bitmask_impl { @@ -438,145 +627,48 @@ template using bitmask = bitmask_impl; -template +template class bucket { public: - static_assert(Offsets >= 1, "There must be at least one offset."); - struct entry { - /// Offsets of the bits to clear corresponding to multiples of prime_ - /// The offsets to clear are offsets_[0], offsets_[0] + offsets_[1], ..., offsets_[0]+offsets_[7] - std::array offsets_; - PrimeT prime_; - }; - using offsets = decltype(entry::offsets_); constexpr bucket() = default; constexpr explicit bucket(std::size_t initialCapacity); constexpr std::size_t size() const; - constexpr auto entries() const; - constexpr void clear(); - constexpr void add(PrimeT prime, std::uint32_t offsetOfPrime) requires(Offsets == 1); - constexpr void add(PrimeT prime, std::uint32_t offsetOfPrime, - std::span nextOffsets) requires(Offsets > 1); - constexpr void sort(); + constexpr void add(PrimeT prime); + constexpr std::vector const & primes() const; private: - constexpr entry & add_entry(PrimeT prime, std::uint32_t offsetOfPrime); - std::vector entries_; - std::size_t size_; + std::vector primes_; }; -template +template constexpr -bucket::bucket(std::size_t initialCapacity) - : size_(0) +bucket::bucket(std::size_t initialCapacity) { - entries_.reserve(initialCapacity); + primes_.reserve(initialCapacity); } -template +template constexpr std::size_t -bucket::size() const -{ - return size_; -} - -template -constexpr void -bucket::clear() +bucket::size() const { - size_ = 0; + return primes_.size(); } -template -constexpr void -bucket::add(PrimeT prime, std::uint32_t offsetOfPrime) requires(Offsets == 1) -{ - add_entry(prime, offsetOfPrime); -} - -template +template constexpr void -bucket::add(PrimeT prime, std::uint32_t offsetOfPrime, - std::span nextOffsets) requires(Offsets > 1) +bucket::add(PrimeT prime) { - auto & addedEntry = add_entry(prime, offsetOfPrime); - std::ranges::copy(nextOffsets, &addedEntry.offsets_[1]); -} - -template -constexpr bucket::entry & -bucket::add_entry(PrimeT prime, std::uint32_t offsetOfPrime) -{ - assert(size_ <= entries_.size()); - if(size_ < entries_.size()) { - entries_[size_] = entry{.prime_ = prime}; - } else { - entries_.push_back(entry{.prime_ = prime}); - } - auto & addedEntry = entries_[size_]; - addedEntry.offsets_[0] = offsetOfPrime; - ++size_; - return addedEntry; + primes_.push_back(prime); } -template -constexpr auto -bucket::entries() const -{ - return std::ranges::subrange(std::begin(entries_), std::begin(entries_) + size_); -} - -template -constexpr void -bucket::sort() +template +constexpr std::vector const & +bucket::primes() const { - // Sorting has bad effect on performance deactovated until figuring out what to do - // (this condition is always true). - if constexpr (Offsets > 0) { - return; - } - std::sort(std::begin(entries_), std::begin(entries_) + size_, [](auto const & x, auto const & y){ - return x.offsets_[0] < y.offsets_[0]; - }); + return primes_; } - -inline constexpr int8_t adjt[8][14] = { - {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, - {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 4, 0, 4, 2}, - {0, 4, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2}, - {0, 2, 2, 0, 4, 0, 0, 2, 0, 0, 2, 0, 4, 2}, - {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 4, 0, 2, 2}, - {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 4}, - {0, 2, 4, 0, 4, 0, 0, 2, 0, 0, 2, 0, 2, 2}, - {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 4} - }; - -template -constexpr auto -find_first_multiple_above(U p, V n0) -{ - const auto p2 = p * p; - constexpr auto c_max = std::numeric_limits::max(); - U dp2; - auto c = (p2 >= n0) ? p2 : (((dp2 = (n0 - p2 + 2 * p - 1)/(2 * p)*(2 * p)), c_max - p2 < dp2) ? 0 : p2 + dp2); - if(!c) { - return c; - } - auto cmod30 = c % 30; - switch(cmod30) { - case 3: case 5: case 9: case 15: case 21: case 25: case 27: { - auto dc = adjt[(p % 30) * 4 / 15][cmod30 >> 1] * p; - c = (c_max - c < dc) ? 0 : c + dc; - cmod30 = c % 30; - } - break; - default:; - } - return c; -} - template class PrimesIterator; @@ -590,13 +682,16 @@ class Bitmap constexpr void assign(uint64_t n0, std::size_t size); constexpr std::size_t size() const; constexpr std::size_t indexOf(uint64_t val) const; + constexpr value_type first_value() const; + constexpr value_type last_value() const; + constexpr value_type value_at(std::size_t idx) const; constexpr void reset(std::size_t index); template constexpr void apply(bitmask_impl const & mask); template constexpr void apply(bitmask_pack const & maskPack); - template - constexpr void apply(bucket & bkt); + template + constexpr void apply(bucket & bkt); constexpr uint64_t popcount() const; void check() const; constexpr uint8_t at(std::size_t index) const; @@ -605,7 +700,8 @@ class Bitmap private: template static constexpr std::size_t indexInResidues(Int); - + static constexpr value_type value_at_impl(value_type n0, std::size_t offs); + struct mask_application_data { std::size_t first_composite_; std::size_t first_composite_index_; @@ -616,14 +712,13 @@ class Bitmap constexpr mask_application_data compute_mask_application_data(bitmask_impl const & bmk) const; template constexpr void apply(bitmask_impl const & bmk, mask_application_data & mappData, std::size_t endOffset); - template - constexpr void apply(bucket<1, PrimeT>::entry const & entry); - template - constexpr void apply(bucket<8, PrimeT>::entry const & entry); std::vector> vec_; std::size_t size_; + /// First value value_type n0_; + /// Last value + value_type ne_; static constexpr std::array d_{1,0,5,4,3,2,1,0,3,2,1,0,1,0,3,2,1,0,1,0,3,2,1,0,5,4,3,2,1,0}; static constexpr std::array residues_{1,7,11,13,17,19,23,29}; static constexpr std::array deltas_{6,4,2,4,2,4,6,2}; @@ -639,6 +734,7 @@ constexpr Bitmap::Bitmap() : size_(0) , n0_(0) + , ne_(0) {} constexpr @@ -647,7 +743,10 @@ Bitmap::Bitmap(uint64_t n0, std::size_t size) ~decltype(vec_)::value_type{}) , size_(size) , n0_(n0 + d_[n0 % 30]) + , ne_(size_ ? value_at_impl(n0_, size_ - 1) : n0_) { + LFP_ASSERT(size_ >= 1); + if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } @@ -661,6 +760,7 @@ Bitmap::assign(uint64_t n0, std::size_t size) ~decltype(vec_)::value_type{}); size_ = size; n0_ = n0 + d_[n0 % 30]; + ne_ = size_ ? value_at_impl(n0_, size_ - 1) : n0_; if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); @@ -676,11 +776,39 @@ Bitmap::size() const constexpr std::size_t -Bitmap::indexOf(uint64_t val) const +Bitmap::indexOf(value_type val) const +{ + return index_of(val, n0_); +} + +constexpr +Bitmap::value_type +Bitmap::first_value() const +{ + return n0_; +} + +constexpr +Bitmap::value_type +Bitmap::last_value() const +{ + return ne_; +} + +constexpr +Bitmap::value_type +Bitmap::value_at(std::size_t idx) const +{ + return value_at_impl(n0_, idx); +} + +constexpr +Bitmap::value_type +Bitmap::value_at_impl(value_type n0, size_t idx) { - auto valmod30idx = (val % 30) * 4 / 15; - auto n0mod30idx = (n0_ % 30) * 4 /15; - return (val - n0_) / 30 * 8 + ((valmod30idx >= n0mod30idx) ? valmod30idx - n0mod30idx : 8 + valmod30idx - n0mod30idx); + auto n0res = n0 % 30; + auto nxres = residues_[(n0res * 4 / 15 + idx) % 8]; + return n0 + (idx / 8) * 30 + ((nxres >= n0res) ? nxres - n0res : 30 + nxres - n0res ); } constexpr @@ -896,164 +1024,78 @@ void Bitmap::apply(bitmask_pack const & maskPack) } } -template -constexpr void -Bitmap::apply(bucket & bkt) -{ - static_assert(NumOffsets == 1 || NumOffsets == 8, - "The number of offsets must be 1 or 8, other values not supported yet!"); - static_assert(sizeof(PrimeT) <= sizeof(value_type)); - auto entries = bkt.entries(); - if(entries.empty()) { - return; - } - bkt.sort(); - for(auto const & entry : entries) { - apply(entry); - } -} - template constexpr void -Bitmap::apply(bucket<1, PrimeT>::entry const & entry) +Bitmap::apply(bucket & bkt) { - assert(entry.offsets_[0] < size_); - reset(entry.offsets_[0]); -} - + /// @todo: Do we really have to have this static assert? It seems logical because + /// base primes ought to be smaller than the ones being hunted... + static_assert(sizeof(PrimeT) <= sizeof(value_type)); -template -constexpr void -Bitmap::apply(bucket<8, PrimeT>::entry const & entry) -{ - std::size_t i = entry.offsets_[0]; - auto const & offsets = entry.offsets_; - auto p = entry.prime_; - if(offsets[7]) { - for(; i + 8 * p < size_; i += 8 * p) { - reset(i); - reset(i + offsets[1]); - reset(i + offsets[2]); - reset(i + offsets[3]); - reset(i + offsets[4]); - reset(i + offsets[5]); - reset(i + offsets[6]); - reset(i + offsets[7]); + std::array offsets; + for(auto p : bkt.primes()) { + std::size_t start = noffs; + if((n0_ == 2039522017) && (p == 17681)) { + if(!std::is_constant_evaluated()) { + std::cout << p << ", " << n0_ << std::endl; + } + } + auto offsCount = compute_offsets(start, offsets, p, n0_, ne_); + // This branch could be removed if we are absolutely sure, + // positive, that any prime in the bucket has at least one bit to reset. + // Need to do some profiling to establish whether removing the test + // improves the performances. Wait and see... + if(start == noffs) { + LFP_ASSERT(false); // should never happen, p does not belong in this bucket + continue; } - } - const auto i0 = i; - for(auto j = 1; i < size_; i = i0 + offsets[j], ++j) { - reset(i); - if((j == std::size(offsets)) || !offsets[j]) { - break; + reset(start); + // If things are done correctly the following condition is true most of the time + // because buckets are for big primes (someone like 509 has nothing to do here, + // whereas 12503 is welcome). + if(!offsCount) { + continue; + } + /// @todo: this is duplicated code, have a look at the end of inner_sieve + auto i = start; + if(offsCount == offsets.size()) { + // offsets are periodic, period is 8p + // @todo: can there be an overflow when adding 8*p? + for(; i + 8 * p < size_; i += 8 * p) { + reset(i); + reset(i + offsets[0]); + reset(i + offsets[1]); + reset(i + offsets[2]); + reset(i + offsets[3]); + reset(i + offsets[4]); + reset(i + offsets[5]); + reset(i + offsets[6]); + } + } + const auto i0 = i; + for(auto j = 0; i < size_; i = i0 + offsets[j], ++j) { + reset(i); + if(j == offsCount) { + break; + } } + } } -inline constexpr uint8_t wheel[8][8] = { - {6,4,2,4,2,4,6,2}, - {4,2,4,2,4,6,2,6}, - {2,4,2,4,6,2,6,4}, - {4,2,4,6,2,6,4,2}, - {2,4,6,2,6,4,2,4}, - {4,6,2,6,4,2,4,2}, - {6,2,6,4,2,4,2,4}, - {2,6,4,2,4,2,4,6} -}; - -inline constexpr uint8_t whoffs[8][8] = { - {0, 1, 2, 3, 4, 5, 6, 7}, - {2, 7, 5, 4, 1, 0, 6, 3}, - {0, 2, 6, 4, 7, 5, 1, 3}, - {6, 2, 1, 5, 4, 0, 7, 3}, - {2, 6, 7, 3, 4, 0, 1, 5}, - {0, 6, 2, 4, 1, 3, 7, 5}, - {6, 1, 3, 4, 7, 0, 2, 5}, - {0, 7, 6, 5, 4, 3, 2, 1} -}; - - -// Returns the smallest integer coprime to 30 and greater than or equal to @param n0 -template -constexpr -Ret compute_gte_coprime(U n0) -{ - constexpr uint8_t dn[30] = { - 1, 0, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0, 1, 0, 3, - 2, 1, 0, 1, 0, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0 - }; - - return Ret{n0} + dn[n0 % 30]; -} - - -// Returns the largest ineteger coprime to 30 and less than @param n1 -template -constexpr -Ret compute_lt_coprime(U n1) -{ - constexpr uint8_t dn[30] = { - 1, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 1, - 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6 - }; - - return Ret{n1} - dn[n1 % 30]; -} - -template -inline constexpr std::array primesBelowSix = {2, 3, 5}; - -template -constexpr std::int32_t compute_offsets(std::array & offsets, - Bitmap const & bmp, - U p, - V c, - U ne - ) -{ - std::ranges::fill(offsets, 0); - constexpr auto c_max = std::numeric_limits::max(); - auto count = 0; - int32_t firstIndex = -1; - auto offsIdx = 0; - auto prevDelta = 0; - for(auto j = whoffs[(p % 30) * 4 / 15][(c % 30) * 4 / 15]; - c <= ne; - c = ((c_max - c < wheel[(p % 30) * 4 / 15][j] * p) - ? ne + 1 - : c + wheel[(p % 30) * 4 / 15][j] * p), j = (j + 1) % 8) { - auto currIdx = bmp.indexOf(c); - if(offsIdx == 0) { - firstIndex = currIdx; - } - - if(offsIdx > 0 && offsIdx <= offsets.size()) { - offsets[offsIdx - 1] = currIdx - firstIndex; - } - ++offsIdx; - if(offsIdx > offsets.size()) { - break; - } - } - return firstIndex; -} -template struct sieve_data { Bitmap * bitmap_ = nullptr; bool have_to_initialize_bitmap_ = true; - /// Bucket used when there is one, exactly one hit in the range being sieved - bucket<1, PrimeT> bucket_1_hit_; - /// Bucket used when there are more than one hit in the range being sieved - bucket<8, PrimeT> bucket_8_hits_; + bool have_to_ignore_bucketable_primes_ = false; }; template constexpr auto -inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) +inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievdat) { auto const & tinyPrimes = primesBelowSix; if((n0 >= n1) || (n0 > std::numeric_limits::max() - 2)) { @@ -1100,76 +1142,63 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data & sievdat) bmp.apply(bitmasks_3); } - auto applyAndClearBuckets = [&]() { - bmp.apply(sievdat.bucket_1_hit_); - sievdat.bucket_1_hit_.clear(); - bmp.apply(sievdat.bucket_8_hits_); - sievdat.bucket_8_hits_.clear(); - }; - - for(auto p : basePrimes | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; })) { - auto p2 = U{p} * p; - if(p2 > ne) { - break; - } - // Early continue if p has no multiple in the current segment. The first condition is to avoid paying the cost of - // a modulo when there is a higher probability of p having a multiple in the current segment. - // This doesn't worsen things too much in the lower ranges while improving the performances in the higher ranges. - if(p > ne - n0) { - if(auto n0_mod_p = n0 % p; n0_mod_p && (p - n0_mod_p > ne - n0)) { - continue; - } - } - auto c = find_first_multiple_above(decltype(p2){p}, n0); - if(!c) { - continue; - } + for(auto p : basePrimes + | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; }) + | std::views::take_while([&sievdat](auto p){ + return !(sievdat.have_to_ignore_bucketable_primes_ && (p >= bucket_sieve_threshold)); })) { + auto p2 = U{p} * p; + if(p2 > ne) { + break; + } + // Early continue if p has no multiple in the current segment. The first condition is to avoid paying the cost of + // a modulo when there is a higher probability of p having a multiple in the current segment. + // This doesn't worsen things too much in the lower ranges while improving the performances in the higher ranges. + if(p > ne - n0) { + if(auto n0_mod_p = n0 % p; n0_mod_p && (p - n0_mod_p > ne - n0)) { + continue; + } + } + auto c = find_first_multiple_above(decltype(p2){p}, n0); + if(!c) { + continue; + } + + std::size_t startOffs{noffs}; + std::array offsets; + auto offsetsCount = compute_offsets(startOffs, offsets, p, n0, ne); - std::array offsets; - std::int32_t startOffs = compute_offsets(offsets, bmp, p, c, ne); - - if((startOffs < 0) || (startOffs >= bmp.size())) { - // No multiple of p greater or equal to p^2 in the current segment - continue; - } - - if(p >= bucket_sieve_threshold) { - if(!offsets[0]) { // exactly one hit - sievdat.bucket_1_hit_.add(p, startOffs); - } else { - sievdat.bucket_8_hits_.add(p, startOffs, offsets); - } - if(sievdat.bucket_1_hit_.size() + sievdat.bucket_8_hits_.size() >= max_num_primes_in_bucket) { - applyAndClearBuckets(); - } - - continue; - } + if(startOffs == noffs) { + // No multiple of p greater or equal to p^2 (and coprime to 30) in the current segment + continue; + } + // Unroll the loop when possible i.e. when we have a full period std::size_t i = startOffs; - if(offsets[6]) { - for(; i + 8 * p < bmp.size(); i += 8 * p) { - bmp.reset(i); - bmp.reset(i + offsets[0]); - bmp.reset(i + offsets[1]); + if(offsetsCount == offsets.size()) { + for(; i + 8 * p < bmp.size(); i += 8 * p) { + bmp.reset(i); + bmp.reset(i + offsets[0]); + bmp.reset(i + offsets[1]); bmp.reset(i + offsets[2]); - bmp.reset(i + offsets[3]); - bmp.reset(i + offsets[4]); - bmp.reset(i + offsets[5]); - bmp.reset(i + offsets[6]); - } - } - const auto i0 = i; - for(auto j = 0; i < bmp.size(); i = i0 + offsets[j], ++j) { - bmp.reset(i); - if((j == offsets.size()) || !offsets[j]) { - break; - } - } + bmp.reset(i + offsets[3]); + bmp.reset(i + offsets[4]); + bmp.reset(i + offsets[5]); + bmp.reset(i + offsets[6]); + } + } + // Deal with the remaining offsets (or the partial period) + const auto i0 = i; + for(auto j = 0; i < bmp.size(); i = i0 + offsets[j], ++j) { + bmp.reset(i); + if(j == offsetsCount) { + break; + } + } } return ff(it0, std::end(tinyPrimes), &bmp); } + template {}.cbegin())> constexpr std::vector collectSieveResults(It first, It last, Bitmap const * bmp) @@ -1486,7 +1515,7 @@ template inline constexpr auto u16primes = []() { auto sv = [] { Bitmap bmp; - sieve_data sievdat{.bitmap_ = &bmp}; + sieve_data sievdat{.bitmap_ = &bmp}; return inner_sieve( u8primes, uint16_t(0), uint16_t(65535), collectSieveResults, @@ -1501,103 +1530,119 @@ inline constexpr auto u16primes = []() { template inline constexpr bool is_one_of_v = (std::is_same_v || ...); +template +struct segment +{ + U low_; + U high_; + Bitmap * bitmap_; + bucket bucket_; +}; template constexpr auto sieve(I k0, I k1, Fct ff) { static_assert(is_one_of_v); + int32_t, uint32_t, int64_t, uint64_t>); if constexpr(std::numeric_limits::is_signed) { - k0 = (std::max)(I{0}, k0); - k1 = (std::max)(I{0}, k1); + k0 = (std::max)(I{0}, k0); + k1 = (std::max)(I{0}, k1); } using U = std::make_unsigned_t; const U n0 = U(k0), n1 = U(k1); constexpr U maxn = std::numeric_limits::max(); - constexpr U innerRangeSize = []() { - if constexpr (is_one_of_v) { - return maxn; - } else { - return U{24*1024*1024}; - } }(); + constexpr U segmentSize = []() { + if constexpr (is_one_of_v) { + return maxn; + } else { + return U{24*1024*1024}; + } }(); std::vector prefix; prefix.reserve(3); - std::vector bitmaps; - bitmaps.reserve((n1 - n0)/innerRangeSize + 1); - std::vector> sievdats; - sievdats.reserve((n1 - n0)/innerRangeSize + 1); - - for(auto a0 = n0, a1 = std::min(n1, (maxn - innerRangeSize < n0) ? maxn : U(n0 + innerRangeSize)); + std::vector bitmaps; + bitmaps.reserve((n1 - n0) / segmentSize + 1); + std::vector> segments; + segments.reserve((n1 - n0) / segmentSize + 1); + for(auto a0 = n0, a1 = std::min(n1, (maxn - segmentSize < n0) ? maxn : U(n0 + segmentSize)); a0 < n1; - a0 = (maxn - innerRangeSize < a0) ? maxn : a0 + innerRangeSize, - a1 = std::min(n1, maxn - innerRangeSize < a0 ? maxn : U(a0 + innerRangeSize))) { - - const U rangeSize = [n1](){ - if constexpr (is_one_of_v) { - return (U{1} << (std::numeric_limits::digits / 2)) - 1; - } else { - // Established through tests - if(n1 >= U{55}<<54) { - return U{16*1024*1024}; - } - return (std::min)(U{16*1024*1024}, - U{1} << ((std::bit_width(n1) + 1)/2)); - } }(); - constexpr auto maxm = (U{1} << (std::numeric_limits::digits / 2)) - 1; - bitmaps.emplace_back(details::Bitmap{}); - sievdats.emplace_back(details::sieve_data{ - .bitmap_ = &bitmaps.back(), - .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, - .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} - }); - auto & sievdat = sievdats.back(); - - for(U m0 = 0, m1 = rangeSize; - (m0 < (U{1} << (std::numeric_limits::digits / 2)) - 1) && (m0 * m0 <= a1); - m0 = m1, - m1 = (maxm - m1 >= rangeSize) ? m1 + rangeSize : maxm) { - details::Bitmap basePrimesBmp; - constexpr auto basePrimes = []() { - if constexpr (std::is_same_v || std::is_same_v) { - return u8primes; - } else { - return u16primes; - } - }(); - auto sievdatForBasePrimes = details::sieve_data{ .bitmap_ = &basePrimesBmp, - .bucket_1_hit_ = details::bucket<1, U>{initial_bucket_capacity}, - .bucket_8_hits_ = details::bucket<8, U>{initial_bucket_capacity} - }; - details::inner_sieve(basePrimes, m0, m1, - [](auto, auto, details::Bitmap const *){ }, sievdatForBasePrimes); - // We have to this here otherwise the iterator below may encounter non-primes. - // Hoisting base primes generation would allow to avoid it... - basePrimesBmp.apply(sievdatForBasePrimes.bucket_1_hit_); - sievdatForBasePrimes.bucket_1_hit_.clear(); - basePrimesBmp.apply(sievdatForBasePrimes.bucket_8_hits_); - sievdatForBasePrimes.bucket_8_hits_.clear(); - - details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; - auto basePrimesRange = std::ranges::subrange(itP, itPe); - sievdat.have_to_initialize_bitmap_ = m0 == 0; - details::inner_sieve(basePrimesRange, a0, a1, - [&](auto it, auto ite, details::Bitmap const*){ - if(it != ite) { - prefix = std::vector{it, ite}; - } - return 0; - }, sievdat); + a0 = (maxn - segmentSize < a0) ? maxn : a0 + segmentSize, + a1 = std::min(n1, maxn - segmentSize < a0 ? maxn : U(a0 + segmentSize))) { + bitmaps.push_back(Bitmap{}); + segments.emplace_back(segment{ + .low_ = a0, + .high_ = a1, + .bitmap_ = &bitmaps.back(), + .bucket_ = bucket{initial_bucket_capacity}}); + } + + const U basePrimesRangeSize = [n1](){ + if constexpr (is_one_of_v) { + return (U{1} << (std::numeric_limits::digits / 2)) - 1; + } else { + // Established through tests + if(n1 >= U{55}<<54) { + return U{16*1024*1024}; + } + return (std::min)(U{16*1024*1024}, + U{1} << ((std::bit_width(n1) + 1)/2)); + } }(); + constexpr auto maxm = (U{1} << (std::numeric_limits::digits / 2)) - 1; + + for(U m0 = 0, m1 = basePrimesRangeSize; + (m0 < (U{1} << (std::numeric_limits::digits / 2)) - 1) && (m0 * m0 <= n1); + m0 = m1, + m1 = (maxm - m1 >= basePrimesRangeSize) ? m1 + basePrimesRangeSize : maxm) { + constexpr auto basePrimes = []() { + if constexpr (std::is_same_v || std::is_same_v) { + return u8primes; + } else { + return u16primes; + } + }(); + Bitmap basePrimesBmp; + details::inner_sieve(basePrimes, m0, m1, + [](auto, auto, details::Bitmap const *){ }, sieve_data{ + .bitmap_ = &basePrimesBmp, + .have_to_initialize_bitmap_ = true + }); + + auto pSquared = U{}; + details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; + for(auto p : std::ranges::subrange(itP, itPe)) { + if(p < bucket_sieve_threshold) { + continue; + } + pSquared = p * p; + if(pSquared > n1) { + break; + } + auto kp = find_first_multiple_above(p, n0); + std::size_t segIdx = kp ? (kp - n0) / segmentSize : -1; + if((segIdx != -1) && (segIdx < segments.size()) + && (segIdx != segments.size() - 1 || kp < segments[segIdx].high_)) { + segments[segIdx].bucket_.add(p); + } + } + if(pSquared > n1) { + break; } } - for(std::size_t i = 0; i < bitmaps.size(); ++i) { - bitmaps[i].apply(sievdats[i].bucket_1_hit_); - sievdats[i].bucket_1_hit_.clear(); - bitmaps[i].apply(sievdats[i].bucket_8_hits_); - sievdats[i].bucket_8_hits_.clear(); + for(auto & currentSegment : segments) { + details::inner_sieve(u16primes, currentSegment.low_, currentSegment.high_, + [&](auto it, auto ite, details::Bitmap const*){ + if(it != ite) { + prefix = std::vector{it, ite}; + } + return 0; + }, sieve_data{ .bitmap_ = currentSegment.bitmap_, + .have_to_initialize_bitmap_ = true, + .have_to_ignore_bucketable_primes_ = true }); + currentSegment.bitmap_->apply(currentSegment.bucket_); } + return ff(prefix, bitmaps); } diff --git a/tests.cpp b/tests.cpp index 4fc8e8e..c124b08 100644 --- a/tests.cpp +++ b/tests.cpp @@ -166,14 +166,14 @@ TEST_CASE("Sieve of Erathostenes - primes iterator and range") { using namespace lfp::details; { Bitmap bmp; - sieve_data sievdat{.bitmap_ = &bmp}; + sieve_data sievdat{.bitmap_ = &bmp}; inner_sieve(u8primes, 300u, 400u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(300, 400))); } { Bitmap bmp; - sieve_data sievdat{.bitmap_ = &bmp}; + sieve_data sievdat{.bitmap_ = &bmp}; inner_sieve(u16primes, 10000u, 12000u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(10000, 12000))); From 0568f6da7957fa4ca596507121ce7fa2d48bd5d7 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sat, 26 Apr 2025 14:30:50 +0000 Subject: [PATCH 05/22] Working bucket sieving but slower. --- lfp.hpp | 169 +++++++++++++++++++++++++++++++++++++++++++++++-------- makefile | 6 +- 2 files changed, 149 insertions(+), 26 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 82b8634..9deca01 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -85,13 +85,18 @@ struct Threads namespace details { /// Constants -inline constexpr std::size_t bucket_sieve_threshold = 16384; +inline constexpr std::size_t bucket_sieve_threshold = 8192; inline constexpr std::size_t max_num_primes_in_bucket = 4096; inline constexpr std::size_t initial_bucket_capacity = 4096; /// Value meaning no offset i.e. the requested value is not present in the sequence inline constexpr std::size_t noffs = (std::numeric_limits::max)(); +/// True if and only if T is one of the types listed after it +template +inline constexpr bool is_one_of_v = sizeof...(Rest) && ((std::is_same_v || ...)); + + /// All the primes below 2^8 in ascending order template inline constexpr auto u8primes = std::to_array({ @@ -138,7 +143,7 @@ inline constexpr uint8_t whoffs[8][8] = { /// More precisely, let p be a prime, we start to cross out the multiples of p /// at p^2 but since we are using a mod 30 wheel we need those multiples to be /// coprime to 30. What this table does is, once a multiple c of p of the form -/// p^2 + 2kp is found, it adds to c what needs to be added to make it coprime +/// p^2 + 2k is found, it adds to c what needs to be added to make it coprime /// to 30. inline constexpr int8_t adjt[8][14] = { {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, @@ -161,8 +166,8 @@ constexpr auto find_first_multiple_above(U p, V n0) { const auto p2 = p * p; - constexpr auto c_max = std::numeric_limits::max(); - U dp2; + constexpr auto c_max = std::numeric_limits::max(); + V dp2; auto c = (p2 >= n0) ? p2 : (((dp2 = (n0 - p2 + 2 * p - 1)/(2 * p)*(2 * p)), c_max - p2 < dp2) ? 0 @@ -243,15 +248,27 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, if(!c) { return 0; } - constexpr auto c_max = std::numeric_limits::max(); +#if 0 + if(c < n0) { //@todo: remove. Used for debugging + c = find_first_multiple_above(p, n0); + if(!std::is_constant_evaluated()) { + std::cout << "Bad multiple, got " << c << " where a value > " << n0 << " was requested." << std::endl; + } + } +#endif + constexpr auto c_max = std::numeric_limits::max(); bool isFirstIndex = true; std::size_t offsCount = 0; auto prevDelta = 0; +// auto c0 = c; // @todo: remove. Used only by the assertion below for(auto j = whoffs[(p % 30) * 4 / 15][(c % 30) * 4 / 15]; c <= ne; c = ((c_max - c < wheel[(p % 30) * 4 / 15][j] * p) ? ne + 1 : c + wheel[(p % 30) * 4 / 15][j] * p), j = (j + 1) % 8) { +// if(c < c0) { +// LFP_ASSERT(false); +// } auto currIdx = index_of(c, n0); if(isFirstIndex) { firstMultIdx = currIdx; @@ -636,6 +653,24 @@ class bucket constexpr std::size_t size() const; constexpr void add(PrimeT prime); constexpr std::vector const & primes() const; + /// Moves primes in this bucket to the next bucket they belong to based + /// on their multiples. + /// This is the in-famous "kick the can" operation, once a prime p has had + /// its multiples in a segment crossed out, we push it forward to the next + /// segment where a multiple of p resides. + /// @param buckets a span refering to the buckets which will receive the + /// primes in this bucket. + /// @param n0 the lower bound of the segment corresponding to the first bucket in buckets + /// @param n1 the upper bound of the segment corresponding to the last bucket in buckets + /// @param segmentSize the size of a segment (the last segment can be smaller). + template + constexpr void push_primes_forward( + std::span> buckets, + U n0, + U n1, + std::size_t segmentSize + ); + private: std::vector primes_; }; @@ -668,6 +703,29 @@ bucket::primes() const return primes_; } +template +template +constexpr void +bucket::push_primes_forward( + std::span> buckets, + U n0, + U n1, + std::size_t segmentSize + ) +{ + if(buckets.empty()) { + return; + } + for(auto p : primes_) { + auto kp = find_first_multiple_above(p, n0); + if(kp && (kp < n1)) { + buckets[(kp - n0) / segmentSize].add(p); + } + } + decltype(primes_) emptyVec; + std::swap(primes_, emptyVec); +} + template class PrimesIterator; @@ -1033,14 +1091,63 @@ Bitmap::apply(bucket & bkt) /// base primes ought to be smaller than the ones being hunted... static_assert(sizeof(PrimeT) <= sizeof(value_type)); + std::vector allOffsets; + allOffsets.reserve(1024*1024); + + std::array offsets; + for(auto p : bkt.primes()) { + std::size_t start = noffs; + auto offsCount = compute_offsets(start, offsets, p, n0_, ne_); + // This branch could be removed if we are absolutely sure, + // positive, that any prime in the bucket has at least one bit to reset. + // Need to do some profiling to establish whether removing the test + // improves the performances. Wait and see... + if(start == noffs) { + LFP_ASSERT(false); // should never happen, p does not belong in this bucket + continue; + } + allOffsets.push_back(start); + // If things are done correctly the following condition is true most of the time + // because buckets are for big primes (someone like 509 has nothing to do here, + // whereas 12503 is welcome). + if(!offsCount) { + continue; + } + /// @todo: this is duplicated code, have a look at the end of inner_sieve + auto i = start; + if(offsCount == offsets.size()) { + // offsets are periodic, period is 8p + // @todo: can there be an overflow when adding 8*p? + for(; i + 8 * p < size_; i += 8 * p) { + allOffsets.push_back(i); + allOffsets.push_back(i + offsets[0]); + allOffsets.push_back(i + offsets[1]); + allOffsets.push_back(i + offsets[2]); + allOffsets.push_back(i + offsets[3]); + allOffsets.push_back(i + offsets[4]); + allOffsets.push_back(i + offsets[5]); + allOffsets.push_back(i + offsets[6]); + } + } + const auto i0 = i; + for(auto j = 0; i < size_; i = i0 + offsets[j], ++j) { + allOffsets.push_back(i); + if(j == offsCount) { + break; + } + } + } + + for(auto offsPtr = allOffsets.data(), offsPtrEnd = offsPtr + allOffsets.size(); + offsPtr != offsPtrEnd; + ++offsPtr) { + reset(*offsPtr); + } + +#if 0 std::array offsets; for(auto p : bkt.primes()) { std::size_t start = noffs; - if((n0_ == 2039522017) && (p == 17681)) { - if(!std::is_constant_evaluated()) { - std::cout << p << ", " << n0_ << std::endl; - } - } auto offsCount = compute_offsets(start, offsets, p, n0_, ne_); // This branch could be removed if we are absolutely sure, // positive, that any prime in the bucket has at least one bit to reset. @@ -1082,6 +1189,7 @@ Bitmap::apply(bucket & bkt) } } +#endif } @@ -1130,7 +1238,10 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap constexpr unsigned int lastSmallPrime = 103; constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; - if(std::ranges::distance(basePrimes | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }))) { + if(std::ranges::distance(basePrimes + | std::views::drop_while([](auto p) { return p < 7; }) + | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }) + )) { bitmask_pack::value_type, 7, 11, 13, 17, 19, 23, 29, 31> bitmasks_1; bitmask_pack::value_type, @@ -1527,16 +1638,13 @@ inline constexpr auto u16primes = []() { }(); -template -inline constexpr bool is_one_of_v = (std::is_same_v || ...); - template struct segment { U low_; U high_; Bitmap * bitmap_; - bucket bucket_; + bucket * bucket_; }; template @@ -1561,21 +1669,26 @@ sieve(I k0, I k1, Fct ff) } }(); std::vector prefix; prefix.reserve(3); + auto const estimatedNumSegments = (n1 - n0) / segmentSize + 1; std::vector bitmaps; - bitmaps.reserve((n1 - n0) / segmentSize + 1); + bitmaps.reserve(estimatedNumSegments); + std::vector> buckets; + buckets.reserve(estimatedNumSegments); std::vector> segments; - segments.reserve((n1 - n0) / segmentSize + 1); + segments.reserve(estimatedNumSegments); for(auto a0 = n0, a1 = std::min(n1, (maxn - segmentSize < n0) ? maxn : U(n0 + segmentSize)); a0 < n1; a0 = (maxn - segmentSize < a0) ? maxn : a0 + segmentSize, a1 = std::min(n1, maxn - segmentSize < a0 ? maxn : U(a0 + segmentSize))) { bitmaps.push_back(Bitmap{}); + buckets.push_back(bucket{initial_bucket_capacity}); segments.emplace_back(segment{ .low_ = a0, .high_ = a1, .bitmap_ = &bitmaps.back(), - .bucket_ = bucket{initial_bucket_capacity}}); + .bucket_ = &buckets.back()}); } + auto const numSegments = segments.size(); const U basePrimesRangeSize = [n1](){ if constexpr (is_one_of_v) { @@ -1618,11 +1731,10 @@ sieve(I k0, I k1, Fct ff) if(pSquared > n1) { break; } - auto kp = find_first_multiple_above(p, n0); - std::size_t segIdx = kp ? (kp - n0) / segmentSize : -1; - if((segIdx != -1) && (segIdx < segments.size()) - && (segIdx != segments.size() - 1 || kp < segments[segIdx].high_)) { - segments[segIdx].bucket_.add(p); + + auto kp = find_first_multiple_above(p, n0); + if(kp && (kp < n1)) { + buckets[(kp - n0) / segmentSize].add(p); } } if(pSquared > n1) { @@ -1630,6 +1742,7 @@ sieve(I k0, I k1, Fct ff) } } + int i = 0; for(auto & currentSegment : segments) { details::inner_sieve(u16primes, currentSegment.low_, currentSegment.high_, [&](auto it, auto ite, details::Bitmap const*){ @@ -1640,7 +1753,15 @@ sieve(I k0, I k1, Fct ff) }, sieve_data{ .bitmap_ = currentSegment.bitmap_, .have_to_initialize_bitmap_ = true, .have_to_ignore_bucketable_primes_ = true }); - currentSegment.bitmap_->apply(currentSegment.bucket_); + currentSegment.bitmap_->apply(*currentSegment.bucket_); + auto const nextSegLow = ((i + 1) < numSegments) ? segments[i + 1].low_ : U{}; + currentSegment.bucket_->push_primes_forward( + std::span{buckets}.subspan(i + 1), + nextSegLow, + segments.back().high_, + segmentSize + ); + ++i; } return ff(prefix, bitmaps); diff --git a/makefile b/makefile index 2e2a8de..9d1d50d 100644 --- a/makefile +++ b/makefile @@ -4,8 +4,10 @@ # Creation date: december 2024. CXX ?= g++ -CXFLAGS ?= -std=c++20 -O3 -march=native $(ADDITIONAL_CXFLAGS) +CXFLAGS ?= -std=c++20 -O3 -march=native -DNDEBUG $(ADDITIONAL_CXFLAGS) STATIC_TESTS_CXFLAGS ?= +# By default we activate assertions during testing +DYNAMIC_TESTS_FLAGS ?= -UNDEBUG SYNTAX_ONLY_FLAG=-fsyntax-only TEST_LIBS ?= -lCatch2Main -lCatch2 TEST_LIB_INSTALL ?= ./Catch2-install @@ -25,7 +27,7 @@ static_tests_%: static_tests_%.cpp lfp.hpp $(CXX) $(CXFLAGS) $(SYNTAX_ONLY_FLAG) $(STATIC_TESTS_CXFLAGS) $< tests: tests.cpp lfp.hpp - $(CXX) $(CXFLAGS) $(TEST_LIB_INCL_OPT) -o $@ $< $(TEST_LIB_LIB_OPT) $(TEST_LIBS) + $(CXX) $(CXFLAGS) $(DYNAMIC_TESTS_FLAGS) $(TEST_LIB_INCL_OPT) -o $@ $< $(TEST_LIB_LIB_OPT) $(TEST_LIBS) all: static_tests lfp tests From ed8668b5b80c9432cfba56a39167f6c91e5e03b8 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sun, 27 Apr 2025 02:07:36 +0000 Subject: [PATCH 06/22] Scratch vector shared between buckets. --- lfp.hpp | 177 ++++++++++++++++++++++++-------------------------------- 1 file changed, 76 insertions(+), 101 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 9deca01..6ae5744 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -85,7 +85,7 @@ struct Threads namespace details { /// Constants -inline constexpr std::size_t bucket_sieve_threshold = 8192; +inline constexpr std::size_t bucket_sieve_threshold = 65535; inline constexpr std::size_t max_num_primes_in_bucket = 4096; inline constexpr std::size_t initial_bucket_capacity = 4096; @@ -648,11 +648,13 @@ template class bucket { public: - constexpr bucket() = default; - constexpr explicit bucket(std::size_t initialCapacity); + constexpr bucket(); + constexpr bucket(std::vector & scratchOffsets, std::size_t initialCapacity); constexpr std::size_t size() const; constexpr void add(PrimeT prime); constexpr std::vector const & primes() const; + template + constexpr std::vector const & compute_offsets(V n0, V ne, std::size_t bmpSize); /// Moves primes in this bucket to the next bucket they belong to based /// on their multiples. /// This is the in-famous "kick the can" operation, once a prime p has had @@ -673,11 +675,21 @@ class bucket private: std::vector primes_; + std::vector * scratch_offsets_; + std::vector default_scratch_offsets_; }; template constexpr -bucket::bucket(std::size_t initialCapacity) +bucket::bucket() +{ + scratch_offsets_ = &default_scratch_offsets_; +} + +template +constexpr +bucket::bucket(std::vector & scratchOffsets, std::size_t initialCapacity) + : scratch_offsets_(&scratchOffsets) { primes_.reserve(initialCapacity); } @@ -703,6 +715,60 @@ bucket::primes() const return primes_; } +template +template +constexpr std::vector const & +bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) +{ + auto & scratchOffsets = *scratch_offsets_; + scratchOffsets.clear(); + + std::array offsets; + for(auto p : primes_) { + std::size_t start = noffs; + auto offsCount = details::compute_offsets(start, offsets, p, n0, ne); + // This branch could be removed if we are absolutely sure, + // positive, that any prime in the bucket has at least one bit to reset. + // Need to do some profiling to establish whether removing the test + // improves the performances. Wait and see... + if(start == noffs) { + LFP_ASSERT(false); // should never happen, p does not belong in this bucket + continue; + } + scratchOffsets.push_back(start); + // If things are done correctly the following condition is true most of the time + // because buckets are for big primes (someone like 509 has nothing to do here, + // whereas 12503 is welcome). + if(!offsCount) { + continue; + } + /// @todo: this is duplicated code, have a look at the end of inner_sieve + auto i = start; + if(offsCount == offsets.size()) { + // offsets are periodic, period is 8p + // @todo: can there be an overflow when adding 8*p? + for(; i + 8 * p < bmpSize; i += 8 * p) { + scratchOffsets.push_back(i); + scratchOffsets.push_back(i + offsets[0]); + scratchOffsets.push_back(i + offsets[1]); + scratchOffsets.push_back(i + offsets[2]); + scratchOffsets.push_back(i + offsets[3]); + scratchOffsets.push_back(i + offsets[4]); + scratchOffsets.push_back(i + offsets[5]); + scratchOffsets.push_back(i + offsets[6]); + } + } + const auto i0 = i; + for(auto j = 0; i < bmpSize; i = i0 + offsets[j], ++j) { + scratchOffsets.push_back(i); + if(j == offsCount) { + break; + } + } + } + return scratchOffsets; +} + template template constexpr void @@ -722,8 +788,7 @@ bucket::push_primes_forward( buckets[(kp - n0) / segmentSize].add(p); } } - decltype(primes_) emptyVec; - std::swap(primes_, emptyVec); + primes_.clear(); } @@ -1091,105 +1156,13 @@ Bitmap::apply(bucket & bkt) /// base primes ought to be smaller than the ones being hunted... static_assert(sizeof(PrimeT) <= sizeof(value_type)); - std::vector allOffsets; - allOffsets.reserve(1024*1024); - - std::array offsets; - for(auto p : bkt.primes()) { - std::size_t start = noffs; - auto offsCount = compute_offsets(start, offsets, p, n0_, ne_); - // This branch could be removed if we are absolutely sure, - // positive, that any prime in the bucket has at least one bit to reset. - // Need to do some profiling to establish whether removing the test - // improves the performances. Wait and see... - if(start == noffs) { - LFP_ASSERT(false); // should never happen, p does not belong in this bucket - continue; - } - allOffsets.push_back(start); - // If things are done correctly the following condition is true most of the time - // because buckets are for big primes (someone like 509 has nothing to do here, - // whereas 12503 is welcome). - if(!offsCount) { - continue; - } - /// @todo: this is duplicated code, have a look at the end of inner_sieve - auto i = start; - if(offsCount == offsets.size()) { - // offsets are periodic, period is 8p - // @todo: can there be an overflow when adding 8*p? - for(; i + 8 * p < size_; i += 8 * p) { - allOffsets.push_back(i); - allOffsets.push_back(i + offsets[0]); - allOffsets.push_back(i + offsets[1]); - allOffsets.push_back(i + offsets[2]); - allOffsets.push_back(i + offsets[3]); - allOffsets.push_back(i + offsets[4]); - allOffsets.push_back(i + offsets[5]); - allOffsets.push_back(i + offsets[6]); - } - } - const auto i0 = i; - for(auto j = 0; i < size_; i = i0 + offsets[j], ++j) { - allOffsets.push_back(i); - if(j == offsCount) { - break; - } - } - } + auto const & offsets = bkt.compute_offsets(n0_, ne_, size_); - for(auto offsPtr = allOffsets.data(), offsPtrEnd = offsPtr + allOffsets.size(); + for(auto offsPtr = offsets.data(), offsPtrEnd = offsPtr + offsets.size(); offsPtr != offsPtrEnd; ++offsPtr) { reset(*offsPtr); } - -#if 0 - std::array offsets; - for(auto p : bkt.primes()) { - std::size_t start = noffs; - auto offsCount = compute_offsets(start, offsets, p, n0_, ne_); - // This branch could be removed if we are absolutely sure, - // positive, that any prime in the bucket has at least one bit to reset. - // Need to do some profiling to establish whether removing the test - // improves the performances. Wait and see... - if(start == noffs) { - LFP_ASSERT(false); // should never happen, p does not belong in this bucket - continue; - } - reset(start); - // If things are done correctly the following condition is true most of the time - // because buckets are for big primes (someone like 509 has nothing to do here, - // whereas 12503 is welcome). - if(!offsCount) { - continue; - } - /// @todo: this is duplicated code, have a look at the end of inner_sieve - auto i = start; - if(offsCount == offsets.size()) { - // offsets are periodic, period is 8p - // @todo: can there be an overflow when adding 8*p? - for(; i + 8 * p < size_; i += 8 * p) { - reset(i); - reset(i + offsets[0]); - reset(i + offsets[1]); - reset(i + offsets[2]); - reset(i + offsets[3]); - reset(i + offsets[4]); - reset(i + offsets[5]); - reset(i + offsets[6]); - } - } - const auto i0 = i; - for(auto j = 0; i < size_; i = i0 + offsets[j], ++j) { - reset(i); - if(j == offsCount) { - break; - } - } - - } -#endif } @@ -1676,12 +1649,14 @@ sieve(I k0, I k1, Fct ff) buckets.reserve(estimatedNumSegments); std::vector> segments; segments.reserve(estimatedNumSegments); + std::vector scratchOffsets; + scratchOffsets.reserve(262144 * 10); for(auto a0 = n0, a1 = std::min(n1, (maxn - segmentSize < n0) ? maxn : U(n0 + segmentSize)); a0 < n1; a0 = (maxn - segmentSize < a0) ? maxn : a0 + segmentSize, a1 = std::min(n1, maxn - segmentSize < a0 ? maxn : U(a0 + segmentSize))) { bitmaps.push_back(Bitmap{}); - buckets.push_back(bucket{initial_bucket_capacity}); + buckets.push_back(bucket{scratchOffsets, initial_bucket_capacity}); segments.emplace_back(segment{ .low_ = a0, .high_ = a1, From 322b7eeeb09dbcfbcbbaaaff947774b7b455a881 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Mon, 28 Apr 2025 18:05:16 +0000 Subject: [PATCH 07/22] Backup intermediate non working state. --- lfp.hpp | 336 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 236 insertions(+), 100 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 6ae5744..eaa8eeb 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -7,7 +7,6 @@ /// @file lfp.hpp /// @brief Header defining the lfp library /// -/// #pragma once #include @@ -143,7 +142,7 @@ inline constexpr uint8_t whoffs[8][8] = { /// More precisely, let p be a prime, we start to cross out the multiples of p /// at p^2 but since we are using a mod 30 wheel we need those multiples to be /// coprime to 30. What this table does is, once a multiple c of p of the form -/// p^2 + 2k is found, it adds to c what needs to be added to make it coprime +/// p^2 + 2kp is found, it adds to c what needs to be added to make it coprime /// to 30. inline constexpr int8_t adjt[8][14] = { {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, @@ -156,16 +155,48 @@ inline constexpr int8_t adjt[8][14] = { {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 4} }; -/// Computes the smallest multiple m of p such that m >= max(p^2, n0) and -/// gcd(m, 30) = 1. -/// @param p the prime a multiple of which is requested -/// @param n0 lower bound for returned multiple of p -/// @return 0 when such a multiple does not fit in the return type. -template +template +constexpr auto cp30res_to_idx(UInt coprimeTo30Residue) +{ + return (coprimeTo30Residue * 17) >> 6; +} + + +struct ComputeSquareOfP {}; +struct NoUpperBoundCond {}; +struct ResiduesNotNeeded {}; + +template +struct first_multiple_finder +{ + constexpr auto operator()(U p, V n0); +// constexpr first_multiple_finder(SquareOfPComputer sqrOfPComputer, UpperBoundCond upperBoundCond, ResiduesReceiver residuesReceiver); + + SquareOfPComputer sqr_of_p_comp_; + UpperBoundCond upper_bound_cond_; + ResiduesReceiver residues_receiver_; +}; + +/* +template +constexpr +first_multiple_finder::first_multiple_finder(SquareOfPComputer sqrOfPComputer, UpperBoundCond upperBoundCond, ResiduesReceiver residuesReceiver) + : sqr_of_p_comp_(sqrOfPComputer) + , upper_bound_cond_(upperBoundCond) + , residues_receiver_(residuesReceiver) +{} +*/ +template constexpr auto -find_first_multiple_above(U p, V n0) +first_multiple_finder::operator()(U p, V n0) { - const auto p2 = p * p; + auto const p2 = [&](){ + if constexpr (std::is_same_v) { + return p * p; + } else { + return sqr_of_p_comp_(); + } + }(); constexpr auto c_max = std::numeric_limits::max(); V dp2; auto c = (p2 >= n0) ? p2 @@ -173,21 +204,74 @@ find_first_multiple_above(U p, V n0) c_max - p2 < dp2) ? 0 : p2 + dp2); if(!c) { - return c; + return decltype(c){}; + } + if constexpr(!std::is_same_v) { + if(!upper_bound_cond_(c)) { + return decltype(c){}; + } } + constexpr auto callerNeedsResidues = !std::is_same_v; auto cmod30 = c % 30; switch(cmod30) { case 3: case 5: case 9: case 15: case 21: case 25: case 27: { auto dc = adjt[(p % 30) * 4 / 15][cmod30 >> 1] * p; c = (c_max - c < dc) ? 0 : c + dc; - cmod30 = c % 30; + if constexpr(callerNeedsResidues) { + cmod30 = c % 30; + } } break; default:; } + if constexpr(callerNeedsResidues) { + residues_receiver_(p % 30, cmod30); + } return c; } +template +constexpr auto find_first_multiple_impl(U p, V n0) +{ + return first_multiple_finder{}(p, n0); +} + +template +constexpr auto find_first_multiple_impl(U p, V n0, SquareOfPComputer sqrOfPComp, UpperBoundCond upperBoundCond, ResiduesReceiver residuesReceiver) +{ + auto finder = first_multiple_finder{ + sqrOfPComp, + upperBoundCond, + residuesReceiver + }; + return finder(p, n0); +} + +template +constexpr auto +find_first_multiple_ae_b(U p, V n0, V n1, U pSquared, uint8_t & pmod30, uint8_t & cmod30) +{ + return find_first_multiple_impl(p, n0, + [pSquared](){ return pSquared; }, + [n1](auto c){ return c < n1; }, + [&](uint8_t pm30, uint8_t cm30){ + pmod30 = pm30; + cmod30 = cm30; + }); +} + +/// Computes the smallest multiple m of p such that m >= max(p^2, n0) and +/// gcd(m, 30) = 1. +/// @param p the prime a multiple of which is requested +/// @param n0 lower bound for returned multiple of p +/// @return 0 when such a multiple does not fit in the return type. +template +constexpr auto +find_first_multiple_ae(U p, V n0) +{ + return find_first_multiple_impl(p, n0); +} + // Returns the smallest integer coprime to 30 and greater than or equal to @param n0 template constexpr @@ -223,11 +307,27 @@ constexpr std::size_t index_of(U m, V m0) { LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) && (std::gcd(m0, 30) == 1)); - auto m_idx = (m % 30) * 4 / 15; - auto m0_idx = (m0 % 30) * 4 /15; + auto m_idx = cp30res_to_idx(m % 30); + auto m0_idx = cp30res_to_dx(m0 % 30); return (m - m0) / 30 * 8 + ((m_idx >= m0_idx) ? m_idx - m0_idx : 8 + m_idx - m0_idx); } +/// Returns the value at offset offs relative to the offset of n0 i.e. the value v such that offs == index_of(v, n0) +/// @pre n0 is coprime to 30 +template +constexpr V value_at(V n0, std::size_t offs) +{ + LFP_ASSERT(std::gcd(n0, 30) == 1); + + return n0 + (offs / 8) * 30 + (offs % 8); + /* offs%8 + 0 + n0%30 -> 1 0*/ + +} + + + /// Computes the index of the first multiple of prime p in range [n0, ne] relative to the index of n0 /// as well as the offsets of the following multiples of p relative to firstMultIdx. /// @pre p is a prime number > 6, n0 <= n1, gcd(n0, 30) = 1 and gcd(ne, 30) = 1. @@ -239,18 +339,15 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, std::array & offsets, U p, V n0, - V ne + V ne, + V c ) { firstMultIdx = noffs; - auto c = find_first_multiple_above(p, n0); - if(!c) { - return 0; - } #if 0 if(c < n0) { //@todo: remove. Used for debugging - c = find_first_multiple_above(p, n0); + c = find_first_multiple_ae(p, n0); if(!std::is_constant_evaluated()) { std::cout << "Bad multiple, got " << c << " where a value > " << n0 << " was requested." << std::endl; } @@ -259,16 +356,14 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, constexpr auto c_max = std::numeric_limits::max(); bool isFirstIndex = true; std::size_t offsCount = 0; - auto prevDelta = 0; -// auto c0 = c; // @todo: remove. Used only by the assertion below - for(auto j = whoffs[(p % 30) * 4 / 15][(c % 30) * 4 / 15]; + auto const iw = cp30res_to_idx(p % 30); + auto const jw = cp30res_to_idx(c % 30); + + for(auto j = whoffs[iw][jw]; c <= ne; - c = ((c_max - c < wheel[(p % 30) * 4 / 15][j] * p) + c = ((c_max - c < wheel[iw][j] * p) ? ne + 1 - : c + wheel[(p % 30) * 4 / 15][j] * p), j = (j + 1) % 8) { -// if(c < c0) { -// LFP_ASSERT(false); -// } + : c + wheel[iw][j] * p), j = (j + 1) % 8) { auto currIdx = index_of(c, n0); if(isFirstIndex) { firstMultIdx = currIdx; @@ -283,6 +378,39 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, return offsCount; } +template +constexpr std::size_t compute_remaining_offsets(std::size_t firstMultIdx, + std::arry offsets, + U p, + V n0, + V ne, + V c, + uint8_t whpos + ) +{ + constexpr auto c_max = std::numeric_limits::max(); + bool isFirstIndex = true; + std::size_t offsCount = 0; + auto const iw = whpos >> 3; + auto const jw = whpos & 7; + for(auto j = whoffs[iw][jw]; + c <= ne; + c = ((c_max - c < wheel[iw][j] * p) + ? ne + 1 + : c + wheel[iw][j] * p), j = (j + 1) % 8) { + if(isFirstIndex) { + isFirstIndex = false; + continue; + } + auto currIdx = index_of(c, n0); + offsets[offsCount++] = currIdx - firstMultIdx; + if(offsCount == offsets.size()) { + break; + } + } + return offsCount; +} + /// Applies given mask to value starting at bit value + offs. /// @pre 0 <= offs < std::numeric_limits::digits @@ -449,7 +577,7 @@ template ::offset(std::size_t c) const { - return offs_[(c % 30) * 4 / 15]; + return offs_[cp30res_to_idx(c % 30)]; } @@ -648,13 +776,21 @@ template class bucket { public: - constexpr bucket(); - constexpr bucket(std::vector & scratchOffsets, std::size_t initialCapacity); + // Should be sufficient because if not, it means someone created 500MiB bitmap + // i.e. a bitmap representing a range of length 2^32/8*30 = 16'106'127'360 which + // is way too much. + using offset_type = std::uint32_t; + using offsets_type = std::vector; + + // constexpr bucket(); + constexpr bucket(offsets_type & scratchOffsets, std::size_t initialCapacity); constexpr std::size_t size() const; - constexpr void add(PrimeT prime); + constexpr void add(PrimeT prime, offset_type cOffs, uint8_t whpos); + /// Returns the primes in this bucket constexpr std::vector const & primes() const; template - constexpr std::vector const & compute_offsets(V n0, V ne, std::size_t bmpSize); + constexpr offsets_type const & compute_offsets(V n0, V ne, std::size_t bmpSize); +#if 0 /// Moves primes in this bucket to the next bucket they belong to based /// on their multiples. /// This is the in-famous "kick the can" operation, once a prime p has had @@ -672,24 +808,18 @@ class bucket U n1, std::size_t segmentSize ); - +#endif private: std::vector primes_; - std::vector * scratch_offsets_; - std::vector default_scratch_offsets_; + offsets_type first_multiples_offsets_; + std::vector whpos_; + offsets_type & scratch_offsets_; }; -template -constexpr -bucket::bucket() -{ - scratch_offsets_ = &default_scratch_offsets_; -} - template constexpr bucket::bucket(std::vector & scratchOffsets, std::size_t initialCapacity) - : scratch_offsets_(&scratchOffsets) + : scratch_offsets_(scratchOffsets) { primes_.reserve(initialCapacity); } @@ -703,9 +833,11 @@ bucket::size() const template constexpr void -bucket::add(PrimeT prime) +bucket::add(PrimeT prime, offset_type cOffs, uint8_t whpos) { primes_.push_back(prime); + first_multiples_offsets_.push_back(cOffs); + whpos_.push_back(whpos); } template @@ -715,60 +847,57 @@ bucket::primes() const return primes_; } + template template constexpr std::vector const & bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) { - auto & scratchOffsets = *scratch_offsets_; - scratchOffsets.clear(); - - std::array offsets; - for(auto p : primes_) { - std::size_t start = noffs; - auto offsCount = details::compute_offsets(start, offsets, p, n0, ne); - // This branch could be removed if we are absolutely sure, - // positive, that any prime in the bucket has at least one bit to reset. - // Need to do some profiling to establish whether removing the test - // improves the performances. Wait and see... - if(start == noffs) { - LFP_ASSERT(false); // should never happen, p does not belong in this bucket - continue; - } - scratchOffsets.push_back(start); - // If things are done correctly the following condition is true most of the time + scratch_offsets_.clear(); + + for(std::size_t i{}, imax{primes_.size()}; i < imax; ++i) { + auto p = primes_[i]; + auto start = first_multiples_offsets_[i]; + scratch_offsets_.push_back(start); + auto whpos = whpos_[i]; + auto c = value_at(n0, start); + std::array offsets; + auto offsCount = compute_remaining_offsets(start, offsets, p, n0, ne, c, whpos); + // If things are done correctly the following condition is true most of the time // because buckets are for big primes (someone like 509 has nothing to do here, // whereas 12503 is welcome). - if(!offsCount) { - continue; - } + if(!offsCount) { + continue; + } /// @todo: this is duplicated code, have a look at the end of inner_sieve auto i = start; if(offsCount == offsets.size()) { // offsets are periodic, period is 8p // @todo: can there be an overflow when adding 8*p? for(; i + 8 * p < bmpSize; i += 8 * p) { - scratchOffsets.push_back(i); - scratchOffsets.push_back(i + offsets[0]); - scratchOffsets.push_back(i + offsets[1]); - scratchOffsets.push_back(i + offsets[2]); - scratchOffsets.push_back(i + offsets[3]); - scratchOffsets.push_back(i + offsets[4]); - scratchOffsets.push_back(i + offsets[5]); - scratchOffsets.push_back(i + offsets[6]); + scratch_offsets_.push_back(i); + scratch_offsets_.push_back(i + offsets[0]); + scratch_offsets_.push_back(i + offsets[1]); + scratch_offsets_.push_back(i + offsets[2]); + scratch_offsets_.push_back(i + offsets[3]); + scratch_offsets_.push_back(i + offsets[4]); + scratch_offsets_.push_back(i + offsets[5]); + scratch_offsets_.push_back(i + offsets[6]); } } const auto i0 = i; for(auto j = 0; i < bmpSize; i = i0 + offsets[j], ++j) { - scratchOffsets.push_back(i); + scratch_offsets_.push_back(i); if(j == offsCount) { break; } } } - return scratchOffsets; + + return scratch_offsets_; } +#if 0 template template constexpr void @@ -783,14 +912,14 @@ bucket::push_primes_forward( return; } for(auto p : primes_) { - auto kp = find_first_multiple_above(p, n0); + auto kp = find_first_multiple_ae(p, n0); if(kp && (kp < n1)) { buckets[(kp - n0) / segmentSize].add(p); } } primes_.clear(); } - +#endif template class PrimesIterator; @@ -1022,7 +1151,7 @@ Bitmap::compute_mask_application_data(bitmask_impl const & bmk) mask_application_data mappData{0, nopos_, nopos_, nopos_}; const auto p = bmk.prime(); - auto c = find_first_multiple_above(p, n0_); + auto c = find_first_multiple_ae(p, n0_); if(!c) { return mappData; } @@ -1242,14 +1371,14 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd continue; } } - auto c = find_first_multiple_above(decltype(p2){p}, n0); + U c = find_first_multiple_ae(decltype(p2){p}, n0); if(!c) { continue; } std::size_t startOffs{noffs}; std::array offsets; - auto offsetsCount = compute_offsets(startOffs, offsets, p, n0, ne); + auto offsetsCount = compute_offsets(startOffs, offsets, p, n0, ne, c); if(startOffs == noffs) { // No multiple of p greater or equal to p^2 (and coprime to 30) in the current segment @@ -1638,7 +1767,7 @@ sieve(I k0, I k1, Fct ff) if constexpr (is_one_of_v) { return maxn; } else { - return U{24*1024*1024}; + return U{32*1024*1024}; } }(); std::vector prefix; prefix.reserve(3); @@ -1698,26 +1827,33 @@ sieve(I k0, I k1, Fct ff) auto pSquared = U{}; details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; - for(auto p : std::ranges::subrange(itP, itPe)) { - if(p < bucket_sieve_threshold) { - continue; - } - pSquared = p * p; - if(pSquared > n1) { - break; - } - - auto kp = find_first_multiple_above(p, n0); - if(kp && (kp < n1)) { - buckets[(kp - n0) / segmentSize].add(p); - } + for(auto p : std::ranges::subrange(itP, itPe) + | std::views::drop_while([](auto q){ return q < bucket_sieve_threshold; })) { + pSquared = p * p; + + auto highest = segments.back().high_; + for(std::size_t i{}; i < numSegments;) { + uint8_t pmod30{}, cmod30{}; + auto kp = find_first_multiple_ae_b(p, segments[i].low_, highest, pSquared, pmod30, cmod30); + if(!kp) { + break; + } + i = (kp - n0) / segmentSize; + auto & segment = segments[i]; + auto indexOfMultipleInBitmap = index_of(compute_gte_coprime(segment.low_), kp); + segment.bucket_->add(p, indexOfMultipleInBitmap, (cp30res_to_idx(pmod30) << 3) | cp30res_to_idx(cmod30)); + if(kp + 2 * p >= highest) { // @todo: check for overflow!!! + break; + } + ++i; + } } - if(pSquared > n1) { + if(pSquared >= n1) { break; } } - int i = 0; + //int i = 0; for(auto & currentSegment : segments) { details::inner_sieve(u16primes, currentSegment.low_, currentSegment.high_, [&](auto it, auto ite, details::Bitmap const*){ @@ -1729,14 +1865,14 @@ sieve(I k0, I k1, Fct ff) .have_to_initialize_bitmap_ = true, .have_to_ignore_bucketable_primes_ = true }); currentSegment.bitmap_->apply(*currentSegment.bucket_); - auto const nextSegLow = ((i + 1) < numSegments) ? segments[i + 1].low_ : U{}; - currentSegment.bucket_->push_primes_forward( + //auto const nextSegLow = ((i + 1) < numSegments) ? segments[i + 1].low_ : U{}; + /*currentSegment.bucket_->push_primes_forward( std::span{buckets}.subspan(i + 1), nextSegLow, segments.back().high_, segmentSize - ); - ++i; + );*/ + //++i; } return ff(prefix, bitmaps); From a1f34c4dbdb24c725b6a7f99daa8433a6a9e3e17 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 29 Apr 2025 00:10:57 +0000 Subject: [PATCH 08/22] Backup. --- lfp.hpp | 128 ++++++++++++++++++++++---------------------------------- 1 file changed, 50 insertions(+), 78 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index eaa8eeb..19fa50c 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -95,6 +95,9 @@ inline constexpr std::size_t noffs = (std::numeric_limits::max)(); template inline constexpr bool is_one_of_v = sizeof...(Rest) && ((std::is_same_v || ...)); +/// Residues modulo 30 that are coprime to 30 +template +inline constexpr auto cp30_residues = std::to_array({1, 7, 11, 13, 17, 19, 23, 29}); /// All the primes below 2^8 in ascending order template @@ -170,22 +173,12 @@ template -constexpr -first_multiple_finder::first_multiple_finder(SquareOfPComputer sqrOfPComputer, UpperBoundCond upperBoundCond, ResiduesReceiver residuesReceiver) - : sqr_of_p_comp_(sqrOfPComputer) - , upper_bound_cond_(upperBoundCond) - , residues_receiver_(residuesReceiver) -{} -*/ template constexpr auto first_multiple_finder::operator()(U p, V n0) @@ -224,6 +217,11 @@ first_multiple_finder break; default:; } + if constexpr(!std::is_same_v) { + if(!upper_bound_cond_(c)) { + return decltype(c){}; + } + } if constexpr(callerNeedsResidues) { residues_receiver_(p % 30, cmod30); } @@ -308,7 +306,7 @@ constexpr std::size_t index_of(U m, V m0) LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) && (std::gcd(m0, 30) == 1)); auto m_idx = cp30res_to_idx(m % 30); - auto m0_idx = cp30res_to_dx(m0 % 30); + auto m0_idx = cp30res_to_idx(m0 % 30); return (m - m0) / 30 * 8 + ((m_idx >= m0_idx) ? m_idx - m0_idx : 8 + m_idx - m0_idx); } @@ -319,11 +317,12 @@ constexpr V value_at(V n0, std::size_t offs) { LFP_ASSERT(std::gcd(n0, 30) == 1); - return n0 + (offs / 8) * 30 + (offs % 8); - /* offs%8 - 0 - n0%30 -> 1 0*/ - + auto n0Idx = cp30res_to_idx(n0 % 30); + auto nxIdx = (n0Idx + offs) % 8; + return n0 + (offs / 8) * 30 + + ((nxIdx >= n0Idx) + ? cp30_residues[nxIdx] - cp30_residues[n0Idx] + : 30 + cp30_residues[nxIdx] - cp30_residues[n0Idx]); } @@ -380,7 +379,7 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, template constexpr std::size_t compute_remaining_offsets(std::size_t firstMultIdx, - std::arry offsets, + std::array offsets, U p, V n0, V ne, @@ -616,7 +615,7 @@ bitmask_impl::compute_offsets() { std::remove_cv_t offsets; for_each_coprime_to_30([&offsets](auto i, auto j) { - offsets[(i % 30) * 4 / 15] = j; + offsets[cp30res_to_idx(i % 30)] = j; }); return offsets; } @@ -782,7 +781,6 @@ class bucket using offset_type = std::uint32_t; using offsets_type = std::vector; - // constexpr bucket(); constexpr bucket(offsets_type & scratchOffsets, std::size_t initialCapacity); constexpr std::size_t size() const; constexpr void add(PrimeT prime, offset_type cOffs, uint8_t whpos); @@ -790,25 +788,7 @@ class bucket constexpr std::vector const & primes() const; template constexpr offsets_type const & compute_offsets(V n0, V ne, std::size_t bmpSize); -#if 0 - /// Moves primes in this bucket to the next bucket they belong to based - /// on their multiples. - /// This is the in-famous "kick the can" operation, once a prime p has had - /// its multiples in a segment crossed out, we push it forward to the next - /// segment where a multiple of p resides. - /// @param buckets a span refering to the buckets which will receive the - /// primes in this bucket. - /// @param n0 the lower bound of the segment corresponding to the first bucket in buckets - /// @param n1 the upper bound of the segment corresponding to the last bucket in buckets - /// @param segmentSize the size of a segment (the last segment can be smaller). - template - constexpr void push_primes_forward( - std::span> buckets, - U n0, - U n1, - std::size_t segmentSize - ); -#endif + constexpr void clear(); private: std::vector primes_; offsets_type first_multiples_offsets_; @@ -822,6 +802,17 @@ bucket::bucket(std::vector & scratchOffsets, std::size_t : scratch_offsets_(scratchOffsets) { primes_.reserve(initialCapacity); + first_multiples_offsets_.reserve(initialCapacity); + whpos_.reserve(initialCapacity); +} + +template +constexpr void +bucket::clear() +{ + primes_.clear(); + first_multiples_offsets_.clear(); + whpos_.clear(); } template @@ -855,18 +846,19 @@ bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) { scratch_offsets_.clear(); - for(std::size_t i{}, imax{primes_.size()}; i < imax; ++i) { - auto p = primes_[i]; - auto start = first_multiples_offsets_[i]; - scratch_offsets_.push_back(start); - auto whpos = whpos_[i]; + for(std::size_t k{}, kmax{primes_.size()}; k < kmax; ++k) { + auto p = primes_[k]; + auto start = first_multiples_offsets_[k]; + LFP_ASSERT(start < bmpSize); + auto whpos = whpos_[k]; auto c = value_at(n0, start); - std::array offsets; + std::array offsets{}; auto offsCount = compute_remaining_offsets(start, offsets, p, n0, ne, c, whpos); // If things are done correctly the following condition is true most of the time // because buckets are for big primes (someone like 509 has nothing to do here, // whereas 12503 is welcome). if(!offsCount) { + scratch_offsets_.push_back(start); continue; } /// @todo: this is duplicated code, have a look at the end of inner_sieve @@ -897,29 +889,6 @@ bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) return scratch_offsets_; } -#if 0 -template -template -constexpr void -bucket::push_primes_forward( - std::span> buckets, - U n0, - U n1, - std::size_t segmentSize - ) -{ - if(buckets.empty()) { - return; - } - for(auto p : primes_) { - auto kp = find_first_multiple_ae(p, n0); - if(kp && (kp < n1)) { - buckets[(kp - n0) / segmentSize].add(p); - } - } - primes_.clear(); -} -#endif template class PrimesIterator; @@ -943,6 +912,8 @@ class Bitmap template constexpr void apply(bitmask_pack const & maskPack); template + constexpr void apply_and_clear(bucket & bkt); + template constexpr void apply(bucket & bkt); constexpr uint64_t popcount() const; void check() const; @@ -1276,6 +1247,14 @@ void Bitmap::apply(bitmask_pack const & maskPack) } } +template +constexpr void +Bitmap::apply_and_clear(bucket & bkt) +{ + apply(bkt); + bkt.clear(); +} + template constexpr void @@ -1838,9 +1817,11 @@ sieve(I k0, I k1, Fct ff) if(!kp) { break; } + LFP_ASSERT(kp < highest); i = (kp - n0) / segmentSize; + LFP_ASSERT(i < numSegments); auto & segment = segments[i]; - auto indexOfMultipleInBitmap = index_of(compute_gte_coprime(segment.low_), kp); + auto indexOfMultipleInBitmap = index_of(kp, compute_gte_coprime(segment.low_)); segment.bucket_->add(p, indexOfMultipleInBitmap, (cp30res_to_idx(pmod30) << 3) | cp30res_to_idx(cmod30)); if(kp + 2 * p >= highest) { // @todo: check for overflow!!! break; @@ -1853,7 +1834,6 @@ sieve(I k0, I k1, Fct ff) } } - //int i = 0; for(auto & currentSegment : segments) { details::inner_sieve(u16primes, currentSegment.low_, currentSegment.high_, [&](auto it, auto ite, details::Bitmap const*){ @@ -1864,15 +1844,7 @@ sieve(I k0, I k1, Fct ff) }, sieve_data{ .bitmap_ = currentSegment.bitmap_, .have_to_initialize_bitmap_ = true, .have_to_ignore_bucketable_primes_ = true }); - currentSegment.bitmap_->apply(*currentSegment.bucket_); - //auto const nextSegLow = ((i + 1) < numSegments) ? segments[i + 1].low_ : U{}; - /*currentSegment.bucket_->push_primes_forward( - std::span{buckets}.subspan(i + 1), - nextSegLow, - segments.back().high_, - segmentSize - );*/ - //++i; + currentSegment.bitmap_->apply_and_clear(*currentSegment.bucket_); } return ff(prefix, bitmaps); From 12af36f492c2425ce648f189548e06933bb6207c Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 29 Apr 2025 12:00:55 +0000 Subject: [PATCH 09/22] Working bucket siever. --- lfp.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 19fa50c..0f4a138 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -298,6 +298,16 @@ Ret compute_lt_coprime(U n1) } +template +constexpr std::size_t index_of(U m, uint8_t mmod30Idx, V m0) +{ + LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) + && (mmod30Idx == cp30res_to_idx(m % 30)) && (std::gcd(m0, 30) == 1)); + + auto m0_idx = cp30res_to_idx(m0 % 30); + return (m - m0) / 30 * 8 + ((mmod30Idx >= m0_idx) ? mmod30Idx - m0_idx : 8 + mmod30Idx - m0_idx); +} + /// Returns the index of m relative to the index of m0. /// @pre both m and m0 are coptime to 30 and m >= m0 template @@ -379,7 +389,7 @@ constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, template constexpr std::size_t compute_remaining_offsets(std::size_t firstMultIdx, - std::array offsets, + std::array & offsets, U p, V n0, V ne, @@ -852,7 +862,7 @@ bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) LFP_ASSERT(start < bmpSize); auto whpos = whpos_[k]; auto c = value_at(n0, start); - std::array offsets{}; + std::array offsets; auto offsCount = compute_remaining_offsets(start, offsets, p, n0, ne, c, whpos); // If things are done correctly the following condition is true most of the time // because buckets are for big primes (someone like 509 has nothing to do here, @@ -1812,8 +1822,8 @@ sieve(I k0, I k1, Fct ff) auto highest = segments.back().high_; for(std::size_t i{}; i < numSegments;) { - uint8_t pmod30{}, cmod30{}; - auto kp = find_first_multiple_ae_b(p, segments[i].low_, highest, pSquared, pmod30, cmod30); + uint8_t pmod30{}, kpmod30{}; + auto kp = find_first_multiple_ae_b(p, segments[i].low_, highest, pSquared, pmod30, kpmod30); if(!kp) { break; } @@ -1821,8 +1831,9 @@ sieve(I k0, I k1, Fct ff) i = (kp - n0) / segmentSize; LFP_ASSERT(i < numSegments); auto & segment = segments[i]; - auto indexOfMultipleInBitmap = index_of(kp, compute_gte_coprime(segment.low_)); - segment.bucket_->add(p, indexOfMultipleInBitmap, (cp30res_to_idx(pmod30) << 3) | cp30res_to_idx(cmod30)); + auto kpmod30Idx = cp30res_to_idx(kpmod30); + auto indexOfMultipleInBitmap = index_of(kp, kpmod30Idx, compute_gte_coprime(segment.low_)); + segment.bucket_->add(p, indexOfMultipleInBitmap, (cp30res_to_idx(pmod30) << 3) | kpmod30Idx); if(kp + 2 * p >= highest) { // @todo: check for overflow!!! break; } From 5f16639ebce4f66398cfe392fb83e1bc050d8bc2 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Wed, 30 Apr 2025 09:19:03 +0000 Subject: [PATCH 10/22] Alignment of scratch vector used by buckets. --- lfp.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 0f4a138..bc0cfe5 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -85,8 +85,7 @@ namespace details { /// Constants inline constexpr std::size_t bucket_sieve_threshold = 65535; -inline constexpr std::size_t max_num_primes_in_bucket = 4096; -inline constexpr std::size_t initial_bucket_capacity = 4096; +inline constexpr std::size_t initial_bucket_capacity = 32768; /// Value meaning no offset i.e. the requested value is not present in the sequence inline constexpr std::size_t noffs = (std::numeric_limits::max)(); @@ -789,7 +788,7 @@ class bucket // i.e. a bitmap representing a range of length 2^32/8*30 = 16'106'127'360 which // is way too much. using offset_type = std::uint32_t; - using offsets_type = std::vector; + using offsets_type = std::vector>; constexpr bucket(offsets_type & scratchOffsets, std::size_t initialCapacity); constexpr std::size_t size() const; @@ -808,7 +807,7 @@ class bucket template constexpr -bucket::bucket(std::vector & scratchOffsets, std::size_t initialCapacity) +bucket::bucket(offsets_type & scratchOffsets, std::size_t initialCapacity) : scratch_offsets_(scratchOffsets) { primes_.reserve(initialCapacity); @@ -851,7 +850,7 @@ bucket::primes() const template template -constexpr std::vector const & +constexpr bucket::offsets_type const & bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) { scratch_offsets_.clear(); @@ -1767,7 +1766,7 @@ sieve(I k0, I k1, Fct ff) buckets.reserve(estimatedNumSegments); std::vector> segments; segments.reserve(estimatedNumSegments); - std::vector scratchOffsets; + std::vector> scratchOffsets; scratchOffsets.reserve(262144 * 10); for(auto a0 = n0, a1 = std::min(n1, (maxn - segmentSize < n0) ? maxn : U(n0 + segmentSize)); a0 < n1; From 34c9bd75103afc97bac2d339c229b8bda382196d Mon Sep 17 00:00:00 2001 From: Youcef L Date: Wed, 30 Apr 2025 17:51:54 +0000 Subject: [PATCH 11/22] Introducing logging. Backup. --- lfp.hpp | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/lfp.hpp b/lfp.hpp index bc0cfe5..e75238f 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -35,6 +35,16 @@ # endif #endif +#ifdef LFP_ACTIVATE_LOG +# define LFP_LOG(msg) \ + do { \ + if(!std::is_constant_evaluated()) { \ + ::lfp::details::log() << msg << "\n"; \ + } \ + } while(0) +#else +# define LFP_LOG(msg) ((void)0) +#endif namespace lfp { @@ -83,6 +93,67 @@ struct Threads namespace details { +class log_impl; + +/// Returns the unique logger +inline log_impl & log(); + +/// Logging class +class log_impl +{ + struct appender; +public: + log_impl(std::ostream & stream); + template + appender & operator <<(Arg&& arg); + +private: + class appender { + public: + template + appender & operator <<(Arg&& arg); + private: + appender() = default; + log_impl * parent_; + friend class log_impl; + }; + std::ostream & stream_; + appender appender_; +}; + +inline +log_impl::log_impl(std::ostream & stream) + : stream_(stream) +{ + appender_.parent_ = this; +} + +template +inline +log_impl::appender & log_impl::operator<<(Arg&& arg) +{ + auto ts = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch() + ).count(); + stream_ << ts << " | " << std::forward(arg); + return appender_; +} + +template +inline +log_impl::appender & log_impl::appender::operator <<(Arg&& arg) +{ + parent_->stream_ << std::forward(arg); + return *this; +} + +inline log_impl & log() +{ + static log_impl inst{std::cout}; + return inst; +} + + /// Constants inline constexpr std::size_t bucket_sieve_threshold = 65535; inline constexpr std::size_t initial_bucket_capacity = 32768; @@ -94,6 +165,13 @@ inline constexpr std::size_t noffs = (std::numeric_limits::max)(); template inline constexpr bool is_one_of_v = sizeof...(Rest) && ((std::is_same_v || ...)); +/// Returns the address of an object as an integer value +template +inline auto obj_addr(T const* ptr) +{ + return reinterpret_cast(ptr); +} + /// Residues modulo 30 that are coprime to 30 template inline constexpr auto cp30_residues = std::to_array({1, 7, 11, 13, 17, 19, 23, 29}); @@ -792,6 +870,7 @@ class bucket constexpr bucket(offsets_type & scratchOffsets, std::size_t initialCapacity); constexpr std::size_t size() const; + constexpr bool is_empty() const; constexpr void add(PrimeT prime, offset_type cOffs, uint8_t whpos); /// Returns the primes in this bucket constexpr std::vector const & primes() const; @@ -831,6 +910,13 @@ bucket::size() const return primes_.size(); } +template +constexpr bool +bucket::is_empty() const +{ + return primes_.empty(); +} + template constexpr void bucket::add(PrimeT prime, offset_type cOffs, uint8_t whpos) @@ -853,6 +939,8 @@ template constexpr bucket::offsets_type const & bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) { + LFP_LOG("bucket " << reinterpret_cast(this) << ": computing offsets, number of primes: " << primes_.size()); + scratch_offsets_.clear(); for(std::size_t k{}, kmax{primes_.size()}; k < kmax; ++k) { @@ -895,6 +983,8 @@ bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) } } + LFP_LOG("bucket " << reinterpret_cast(this) << ": found " << scratch_offsets_.size() << " offsets"); + return scratch_offsets_; } @@ -982,6 +1072,7 @@ Bitmap::Bitmap(uint64_t n0, std::size_t size) if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } + LFP_LOG("constructed bitmap " << obj_addr(this) << ", {n0 = " << n0_ << ", ne = " << ne_ << ", size = " << size_ << "}"); } constexpr @@ -997,6 +1088,7 @@ Bitmap::assign(uint64_t n0, std::size_t size) if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } + LFP_LOG("assigned bitmap " << obj_addr(this) << ", {n0 = " << n0_ << ", ne = " << ne_ << ", size = " << size_ << "}"); } constexpr @@ -1151,8 +1243,12 @@ template constexpr void Bitmap::apply(bitmask_impl const & bmk) { + LFP_LOG("bitmap " << obj_addr(this) << ": about to apply precomputed bitmask (prime = " << bmk.prime() << ")"); + auto mappData = compute_mask_application_data(bmk); apply(bmk, mappData, size()); + + LFP_LOG("bitmap " << obj_addr(this) << ": applied precomputed bitmask (prime = " << bmk.prime() << ")"); } @@ -1209,6 +1305,17 @@ template constexpr void Bitmap::apply(bitmask_pack const & maskPack) { + LFP_LOG("bitmap " << obj_addr(this) << ": about to apply pack of bitmasks (primes: " << maskPack.primes_as_string() << ")"); +#ifdef LFP_ACTIVATE_LOG + template + struct call_before_return { + call_before_return(Func f) : f_(f) {} + ~call_before_return() { + f_(); + } + } callBeforeRet; + LFP_LOG("bitmap " << obj_addr(bitmap_) << ": applied pack of bitmasks (primes: " << maskPack.primes_as_string() << ")"); +#endif constexpr auto masksCount = maskPack.size(); // We apply each mask individualy until we reach an offset where they can be applied // all together. @@ -1273,13 +1380,23 @@ Bitmap::apply(bucket & bkt) /// base primes ought to be smaller than the ones being hunted... static_assert(sizeof(PrimeT) <= sizeof(value_type)); + if(bkt.is_empty()) { + return; + } + + LFP_LOG("bitmap " << obj_addr(this) << ": about to apply bucket " << obj_addr(&bkt)); + auto const & offsets = bkt.compute_offsets(n0_, ne_, size_); + LFP_LOG("bitmap " << obj_addr(this) << ": reseting " << offsets.size() << " bit(s)"); + for(auto offsPtr = offsets.data(), offsPtrEnd = offsPtr + offsets.size(); offsPtr != offsPtrEnd; ++offsPtr) { reset(*offsPtr); } + + LFP_LOG("bitmap " << obj_addr(this) << ": bucket " << obj_addr(&bkt) << " applied"); } From 0a1a9bb5cbab15cf76df8f4912cf41155b6b163a Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sun, 4 May 2025 08:48:04 +0000 Subject: [PATCH 12/22] Some progress on logging. --- lfp.hpp | 164 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 131 insertions(+), 33 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index e75238f..49c1b43 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -27,24 +27,6 @@ #include #include -#ifdef NDEBUG -# define LFP_ASSERT(...) ((void)0); -#else -# ifndef LFP_ASSERT -# define LFP_ASSERT(...) assert(__VA_ARGS__) -# endif -#endif - -#ifdef LFP_ACTIVATE_LOG -# define LFP_LOG(msg) \ - do { \ - if(!std::is_constant_evaluated()) { \ - ::lfp::details::log() << msg << "\n"; \ - } \ - } while(0) -#else -# define LFP_LOG(msg) ((void)0) -#endif namespace lfp { @@ -90,7 +72,54 @@ struct Threads static unsigned int defaultCount(); }; +} // namespace lfp + + + +#ifdef NDEBUG +# define LFP_ASSERT(...) ((void)0); +#else +# ifndef LFP_ASSERT +# define LFP_ASSERT(...) assert(__VA_ARGS__) +# endif +#endif + +#define LFP_CONCAT_IMPL(x, y) x##y +#define LFP_CONCAT(x, y) LFP_CONCAT_IMPL(x, y) + +#ifdef LFP_ACTIVATE_LOG +# define LFP_LOG(msg) \ + do { \ + if(!std::is_constant_evaluated()) { \ + ::lfp::details::log() << msg << "\n";\ + } \ + } while(0) + +# define LFP_LOG_S(msg, msgAtEos) \ + LFP_LOG(msg);\ + auto LFP_CONCAT(blockStartTime, __LINE__) = !std::is_constant_evaluated()\ + ? std::chrono::high_resolution_clock::now()\ + : decltype(std::chrono::high_resolution_clock::now()){}; \ + auto LFP_CONCAT(logAtEob, __LINE__) = [&](){\ + if(!std::is_constant_evaluated()) {\ + return std::variant{\ + details::make_log_at_eos([&](){\ + auto duration = std::chrono::duration{\ + std::chrono::high_resolution_clock::now()\ + - LFP_CONCAT(blockStartTime, __LINE__)};\ + ::lfp::details::log() << msgAtEos << " (duration: " << duration << ")\n";\ + })\ + };\ + }\ + return std::variant{0};\ + }(); +#else +# define LFP_LOG(msg) ((void)0) +# define LFP_LOG_S(msg, msgAtEob) ((void)0) +#endif + +namespace lfp { namespace details { class log_impl; @@ -106,6 +135,17 @@ class log_impl log_impl(std::ostream & stream); template appender & operator <<(Arg&& arg); + struct log_at_eos { + log_at_eos(log_at_eos const&) = delete; + log_at_eos(log_at_eos&& other) noexcept + { + func_ = std::move(other.func_); + other.func_ = [](){}; + } + log_at_eos(std::function func); + ~log_at_eos(); + std::function func_; + }; private: class appender { @@ -121,6 +161,7 @@ class log_impl appender appender_; }; + inline log_impl::log_impl(std::ostream & stream) : stream_(stream) @@ -147,6 +188,21 @@ log_impl::appender & log_impl::appender::operator <<(Arg&& arg) return *this; } +log_impl::log_at_eos::log_at_eos(std::function func) + : func_(func) +{} + +inline log_impl::log_at_eos::~log_at_eos() +{ + func_(); +} + +template +auto make_log_at_eos(Func func) +{ + return log_impl::log_at_eos{func}; +} + inline log_impl & log() { static log_impl inst{std::cout}; @@ -758,6 +814,8 @@ class bitmask_pack static_assert((is_prime(SmallPrimes) && ...), "Expecting prime numbers below 255!"); constexpr bitmask_pack() = default; + /// Returns a comma separated list of the SmallPrimes as a string + static constexpr std::string_view primes_as_string(); /// Returns the number of bitmask objects in this pack static constexpr auto size(); /// Returns a tuple containing all bitmask objects in this pack @@ -765,7 +823,7 @@ class bitmask_pack /// Gets the bitmask object at index I template constexpr auto const & get() const; - /// Returns the largest prime in the SmallPrimes + /// Returns the largest prime in SmallPrimes static constexpr std::uint8_t max_prime(); /// Returns the combination of the masks at the given offsets template @@ -774,6 +832,8 @@ class bitmask_pack private: using primes_sequence = std::integer_sequence; + static constexpr auto primes_as_string(primes_sequence primesSeq); + template static constexpr bool is_in_sequence(std::integer_sequence); @@ -790,6 +850,11 @@ class bitmask_pack /// The masks decltype(make_masks_tuple()) masks_; + // 3 digits per prime, 2 characters for the separator ", ", and the final '\0': + // 3 * sizeof...(SmallPrimes) + 2 * (sizeof...(SmallPrimes) - 1) + 1 + static constexpr std::array str_primes_ = primes_as_string(primes_sequence{}); + static constexpr std::string_view str_primes_view_{str_primes_.cbegin(), std::find(str_primes_.cbegin(), str_primes_.cend(), '\0')}; + }; @@ -799,6 +864,36 @@ constexpr auto bitmask_pack::size() return sizeof...(SmallPrimes); } +template +constexpr std::string_view bitmask_pack::primes_as_string() +{ +// return std::string_view{str_primes_.cbegin(), std::find(str_primes_.cbegin(), str_primes_.cend(), '\0')}; + return str_primes_view_; +} + +template +constexpr auto bitmask_pack::primes_as_string(primes_sequence) +{ + // all of this because std::to_string is not constexpr in C++20 + std::array ret{}; + int i = 0; + char * sep = nullptr; + for(auto p : std::to_array({SmallPrimes...})) { + if(i) { + ret[i++] = ','; + ret[i++] = ' '; + } + char strp[8] = {}; + int j = 7; + for(;p ; p /= 10) { + strp[j--] = '0' + (p % 10); + } + std::copy(strp + j + 1, strp + sizeof(strp), &ret[i]); + i += sizeof(strp) - j - 1; + } + return ret; +} + template constexpr auto const & bitmask_pack::get_all() const { @@ -939,7 +1034,10 @@ template constexpr bucket::offsets_type const & bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) { - LFP_LOG("bucket " << reinterpret_cast(this) << ": computing offsets, number of primes: " << primes_.size()); + LFP_LOG("bucket " << reinterpret_cast(this) + << ": computing offsets, number of primes: " << primes_.size() + << ", first prime: " << (!primes_.empty() ? primes_.front() : 0) + << ", last prime: " << (!primes_.empty() ? primes_.back() : 0)); scratch_offsets_.clear(); @@ -1072,13 +1170,16 @@ Bitmap::Bitmap(uint64_t n0, std::size_t size) if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } - LFP_LOG("constructed bitmap " << obj_addr(this) << ", {n0 = " << n0_ << ", ne = " << ne_ << ", size = " << size_ << "}"); + + LFP_LOG("constructed bitmap " << obj_addr(this) << ", [" << n0_ << ", " << ne_ << "], size = " << size_); } constexpr void Bitmap::assign(uint64_t n0, std::size_t size) { + LFP_LOG("assigning bitmap " << obj_addr(this)); + vec_.assign((size + NumLim::digits - 1)/NumLim::digits, ~decltype(vec_)::value_type{}); size_ = size; @@ -1088,7 +1189,8 @@ Bitmap::assign(uint64_t n0, std::size_t size) if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } - LFP_LOG("assigned bitmap " << obj_addr(this) << ", {n0 = " << n0_ << ", ne = " << ne_ << ", size = " << size_ << "}"); + + LFP_LOG("assigned bitmap " << obj_addr(this) << ": [" << n0_ << ", " << ne_ << "], size = " << size_); } constexpr @@ -1305,17 +1407,10 @@ template constexpr void Bitmap::apply(bitmask_pack const & maskPack) { - LFP_LOG("bitmap " << obj_addr(this) << ": about to apply pack of bitmasks (primes: " << maskPack.primes_as_string() << ")"); -#ifdef LFP_ACTIVATE_LOG - template - struct call_before_return { - call_before_return(Func f) : f_(f) {} - ~call_before_return() { - f_(); - } - } callBeforeRet; - LFP_LOG("bitmap " << obj_addr(bitmap_) << ": applied pack of bitmasks (primes: " << maskPack.primes_as_string() << ")"); -#endif + LFP_LOG_S("bitmap " << obj_addr(this) << ": about to apply pack of bitmasks for primes {" + << maskPack.primes_as_string() << "}", + "bitmap " << obj_addr(this) << ": pack of bitmasks applied"); + constexpr auto masksCount = maskPack.size(); // We apply each mask individualy until we reach an offset where they can be applied // all together. @@ -1860,12 +1955,15 @@ sieve(I k0, I k1, Fct ff) { static_assert(is_one_of_v); + if constexpr(std::numeric_limits::is_signed) { k0 = (std::max)(I{0}, k0); k1 = (std::max)(I{0}, k1); } using U = std::make_unsigned_t; const U n0 = U(k0), n1 = U(k1); + + LFP_LOG("sieving range [" << n0 << ", " << n1 << "["); constexpr U maxn = std::numeric_limits::max(); constexpr U segmentSize = []() { From 24c4cb59cc90ffdc881df3db207359c4fa311eb0 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Mon, 5 May 2025 07:30:46 +0000 Subject: [PATCH 13/22] Added test, made sieve function support unsigned long long --- lfp.hpp | 2 +- tests.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lfp.hpp b/lfp.hpp index 49c1b43..87acc85 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -1954,7 +1954,7 @@ constexpr auto sieve(I k0, I k1, Fct ff) { static_assert(is_one_of_v); + int32_t, uint32_t, int64_t, uint64_t, decltype(0ull)>); if constexpr(std::numeric_limits::is_signed) { k0 = (std::max)(I{0}, k0); diff --git a/tests.cpp b/tests.cpp index c124b08..f4b5993 100644 --- a/tests.cpp +++ b/tests.cpp @@ -229,3 +229,13 @@ TEST_CASE("Sieve of Erathostenes - misc - #1") { CHECK_THAT(lfp::count_primes(uint64_t(1'005'000'000'000), uint64_t(1'006'250'000'000)), equals(45228966)); } +TEST_CASE("Sieve of Erathostenes - misc - #2") { + auto primes = lfp::sieve_to_vector(10'000'000'000ull, 10'000'140'000ull); + CHECK_THAT(primes.size(), equals(6073)); + CHECK_THAT(primes.back(), equals(10'000'139'969)); + CHECK_THAT(primes[primes.size() - 2], equals(10'000'139'953)); + CHECK_THAT(primes.front(), equals(10'000'000'019)); + // 3037th prime after 10^10 + CHECK_THAT(primes[primes.size() / 2], equals(10'000'070'131)); +} + From 132148499393060d557a6f1b585af5be8c494ecb Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 6 May 2025 06:39:37 +0000 Subject: [PATCH 14/22] Fixed range too small in count_primes. Fixed inhability to use a bucketed primes threshold higer than 65535. --- lfp.hpp | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 87acc85..a62d2fd 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -211,7 +211,7 @@ inline log_impl & log() /// Constants -inline constexpr std::size_t bucket_sieve_threshold = 65535; +inline constexpr std::size_t bucket_sieve_threshold = 204800; inline constexpr std::size_t initial_bucket_capacity = 32768; /// Value meaning no offset i.e. the requested value is not present in the sequence @@ -2025,7 +2025,7 @@ sieve(I k0, I k1, Fct ff) details::inner_sieve(basePrimes, m0, m1, [](auto, auto, details::Bitmap const *){ }, sieve_data{ .bitmap_ = &basePrimesBmp, - .have_to_initialize_bitmap_ = true + .have_to_initialize_bitmap_ = true }); auto pSquared = U{}; @@ -2059,8 +2059,22 @@ sieve(I k0, I k1, Fct ff) } } + U basePrimesUpperBound = [](){ + if constexpr(bucket_sieve_threshold > (std::numeric_limits::max)()) { + return (std::numeric_limits::max)(); + } else { + return U(bucket_sieve_threshold); + } + }(); + // @todo: make this static? + auto basePrimesForSegmentSieve = [basePrimesUpperBound](){ + Bitmap bmp; + sieve_data sievdat{.bitmap_ = &bmp}; + return inner_sieve(u16primes, U{}, basePrimesUpperBound, collectSieveResults, sievdat); + }(); + for(auto & currentSegment : segments) { - details::inner_sieve(u16primes, currentSegment.low_, currentSegment.high_, + details::inner_sieve(basePrimesForSegmentSieve, currentSegment.low_, currentSegment.high_, [&](auto it, auto ite, details::Bitmap const*){ if(it != ite) { prefix = std::vector{it, ite}; @@ -2161,7 +2175,12 @@ template constexpr std::size_t count_primes(U n0, U n1) { std::size_t count = 0; - constexpr auto rangeSize = 24*1024*1024; + const auto rangeSize = [n1]() { + if(n1 < (1u << 30)) return 24*1024*1024; + if(n1 < (1ull << 32)) return 32*1024*1024; + if(n1 < (1ull << 34)) return 128*1024*1024; + return 256*1024*1024; + } (); constexpr auto maxn = std::numeric_limits::max(); for(auto a0 = n0, a1 = std::min(n1, (maxn - rangeSize < n0) ? maxn : n0 + rangeSize); a0 < n1; From d2af44143849a17bd907c97b552bc92664f12451 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 6 May 2025 08:03:05 +0000 Subject: [PATCH 15/22] minor tweak of rangeSize --- lfp.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index a62d2fd..72ce3ea 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -2176,9 +2176,8 @@ constexpr std::size_t count_primes(U n0, U n1) { std::size_t count = 0; const auto rangeSize = [n1]() { - if(n1 < (1u << 30)) return 24*1024*1024; - if(n1 < (1ull << 32)) return 32*1024*1024; - if(n1 < (1ull << 34)) return 128*1024*1024; + if(n1 < (1ull << 32)) return 24*1024*1024; + if(n1 < (1ull << 36)) return 64*1024*1024; return 256*1024*1024; } (); constexpr auto maxn = std::numeric_limits::max(); From 1721dff30b3220b67428931289a7c713d2d1d8b7 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Tue, 6 May 2025 11:36:46 +0000 Subject: [PATCH 16/22] Fixed broken build. Added remaining primes below 128 to primes processed via precomputed bitmasks. --- lfp.hpp | 24 +++++++++++++++++------- static_tests_2.cpp | 4 ++-- tperf.sh | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 72ce3ea..3af41f7 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -211,7 +211,14 @@ inline log_impl & log() /// Constants -inline constexpr std::size_t bucket_sieve_threshold = 204800; +constexpr std::size_t bucketed_primes_threshold() +{ + if(std::is_constant_evaluated()) { + return 8192; + } else { + return 204800; + } +} inline constexpr std::size_t initial_bucket_capacity = 32768; /// Value meaning no offset i.e. the requested value is not present in the sequence @@ -1538,7 +1545,7 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd auto & bmp = *sievdat.bitmap_; // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap - constexpr unsigned int lastSmallPrime = 103; + constexpr unsigned int lastSmallPrime = 127; constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; if(std::ranges::distance(basePrimes | std::views::drop_while([](auto p) { return p < 7; }) @@ -1549,16 +1556,19 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd bitmask_pack::value_type, 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; bitmask_pack::value_type, - 71, 73, 79, 83, 89, 97, 101, lastSmallPrime> bitmasks_3; + 71, 73, 79, 83, 89, 97, 101, 103> bitmasks_3; + bitmask_pack::value_type, + 107, 109, 113, lastSmallPrime> bitmasks_4; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); + bmp.apply(bitmasks_4); } for(auto p : basePrimes | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; }) | std::views::take_while([&sievdat](auto p){ - return !(sievdat.have_to_ignore_bucketable_primes_ && (p >= bucket_sieve_threshold)); })) { + return !(sievdat.have_to_ignore_bucketable_primes_ && (p >= bucketed_primes_threshold())); })) { auto p2 = U{p} * p; if(p2 > ne) { break; @@ -2031,7 +2041,7 @@ sieve(I k0, I k1, Fct ff) auto pSquared = U{}; details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; for(auto p : std::ranges::subrange(itP, itPe) - | std::views::drop_while([](auto q){ return q < bucket_sieve_threshold; })) { + | std::views::drop_while([](auto q){ return q < bucketed_primes_threshold(); })) { pSquared = p * p; auto highest = segments.back().high_; @@ -2060,10 +2070,10 @@ sieve(I k0, I k1, Fct ff) } U basePrimesUpperBound = [](){ - if constexpr(bucket_sieve_threshold > (std::numeric_limits::max)()) { + if constexpr(bucketed_primes_threshold() > (std::numeric_limits::max)()) { return (std::numeric_limits::max)(); } else { - return U(bucket_sieve_threshold); + return U(bucketed_primes_threshold()); } }(); // @todo: make this static? diff --git a/static_tests_2.cpp b/static_tests_2.cpp index 2c58d67..a14433e 100644 --- a/static_tests_2.cpp +++ b/static_tests_2.cpp @@ -25,14 +25,14 @@ static_assert(sieve_to_vector(uint32_t(0), uint32_t(3)) == std::vector static_assert(sieve_to_vector(uint32_t(262121), uint32_t(262144)) == std::vector{262121, 262127, 262133, 262139}); static_assert(sieve_to_vector(uint32_t(1048576), uint32_t(1048700)) == std::vector{1048583, 1048589, 1048601, 1048609, 1048613, 1048627, 1048633, 1048661, 1048681}); -static_assert(sieve_to_vector(uint32_t(61075016), uint32_t(61075116)) == std::vector{61075019, 61075037, 61075057, 61075061, - 61075087, 61075099, 61075103, 61075109, 61075111}); static_assert(sieve_to_vector(uint64_t(1000000), uint64_t(1000010)) == std::vector{1000003}); // The following tests make g++-12.2 stall while g++-13.3 has no issue compiling this file. // They are active by default, this allows to deactivate them. #ifndef LFP_TESTS_DISABLE_STATIC_ASSERTS_IN_HIGHER_RANGES +static_assert(sieve_to_vector(uint32_t(61075016), uint32_t(61075116)) == std::vector{61075019, 61075037, 61075057, 61075061, + 61075087, 61075099, 61075103, 61075109, 61075111}); static_assert(sieve_to_vector(uint32_t(1074041825), uint32_t(1074041924)) == std::vector{1074041849, 1074041869}); static_assert(sieve_to_vector(uint32_t(4294967196), uint32_t(4294967295)) == std::vector{4294967197, 4294967231, 4294967279, 4294967291}); diff --git a/tperf.sh b/tperf.sh index fb2d163..ecdbbde 100755 --- a/tperf.sh +++ b/tperf.sh @@ -17,7 +17,7 @@ ranges=(0 $((10**9))\ 18446744063709551616 18446744073709551615\ ) -threads_c=(1 4 8 16 32 48) +threads_c=(1 4 8 16 32 48 64) compute_avg() { local values=("$@") From 5bf1dd7fd68ef89b18ab14fbf85993a10d92ad9f Mon Sep 17 00:00:00 2001 From: Youcef L Date: Thu, 15 May 2025 12:34:27 +0000 Subject: [PATCH 17/22] Renamed Threads to threads. sqrt(t)-weighted partition of the main range. --- lfp.hpp | 107 +++++++++++++++++++++++++++++++++--------------------- main.cpp | 2 +- tests.cpp | 6 +-- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 3af41f7..2ff7108 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include #include @@ -31,7 +32,7 @@ namespace lfp { // Forward declarations -struct Threads; +struct threads; template class SieveResults; /// Returns the result of sieving the range [n0, n1). @@ -42,7 +43,7 @@ constexpr SieveResults sieve(U n0, U n1); /// Returns the result of sieving the range [n0, n1). /// The sieving is performed using at most threads.count() concurrent threads. template -SieveResults sieve(U n0, U n1, Threads const & threads); +SieveResults sieve(U n0, U n1, threads const & threads); /// Returns a vector containing the prime numbers in range [n0, n1) template @@ -55,21 +56,21 @@ constexpr std::size_t count_primes(U n0, U n1); /// Returns the number of prime numbers in range [n0, n1) /// the sieving is performed using at most threads.count() ooncurrent threads. template -std::size_t count_primes(U n0, U n1, Threads const & threads); +std::size_t count_primes(U n0, U n1, threads const & threads); /// @struct Holding concurrency information -struct Threads +struct threads { /// Constructs an instance x such that x.count() == std::thread::hardware_concurrency(). /// If std::thread::hardware_concurrency() == 0, then x.count() is equal to 1. - Threads(); + threads(); /// Constructs an instance x such that x.count() == c - explicit Threads(unsigned int c); + explicit threads(unsigned int c); /// Returns the maximum number of concurrent threads to use during sieving. unsigned int count() const; private: unsigned int count_; - static unsigned int defaultCount(); + static unsigned int default_count(); }; } // namespace lfp @@ -1544,12 +1545,12 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd } auto & bmp = *sievdat.bitmap_; - // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap - constexpr unsigned int lastSmallPrime = 127; - constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; + // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmapi + bool const useBitmasksForAllPrimesBelow128 = n1 > (std::numeric_limits::max)(); + unsigned int const smallPrimesThreshold = useBitmasksForAllPrimesBelow128 ? 128 : 104; if(std::ranges::distance(basePrimes | std::views::drop_while([](auto p) { return p < 7; }) - | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }) + | std::views::take_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; }) )) { bitmask_pack::value_type, 7, 11, 13, 17, 19, 23, 29, 31> bitmasks_1; @@ -1557,12 +1558,14 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; bitmask_pack::value_type, 71, 73, 79, 83, 89, 97, 101, 103> bitmasks_3; - bitmask_pack::value_type, - 107, 109, 113, lastSmallPrime> bitmasks_4; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); - bmp.apply(bitmasks_4); + if(useBitmasksForAllPrimesBelow128) { + bitmask_pack::value_type, + 107, 109, 113, 127> bitmasks_4; + bmp.apply(bitmasks_4); + } } for(auto p : basePrimes @@ -2099,11 +2102,34 @@ sieve(I k0, I k1, Fct ff) return ff(prefix, bitmaps); } + +template +void partition_range(U n0, U n1, int N, FuncT processRange) +{ + if(N <= 0 || (n0 >= n1)) { + processRange(n0, n0); + } + if((N == 1) || (n1 - n0 < N)) { + processRange(n0, n1); + } + auto v = std::views::iota(1, N + 1); + auto weight = std::accumulate(std::begin(v), std::end(v), 0.0, + [](double x, auto k){ return x + 1.0 / std::sqrt(k); + }); + auto c = (n1 - n0) / weight; + auto current = n0; + std::for_each(std::begin(v), std::end(v), [&](auto k){ + auto ni = (k == N) ? n1 : current + c / std::sqrt(k); + processRange(current, ni); + current = ni; + }); +} + } // namespace details inline unsigned int -Threads::defaultCount() +threads::default_count() { static auto v = []() { auto c = std::thread::hardware_concurrency(); @@ -2112,15 +2138,15 @@ Threads::defaultCount() return v; } -inline Threads::Threads() - : count_(Threads::defaultCount()) +inline threads::threads() + : count_(threads::default_count()) {} -inline Threads::Threads(unsigned int numThreads) +inline threads::threads(unsigned int numThreads) : count_(numThreads ? numThreads : 1) {} -inline unsigned int Threads::count() const +inline unsigned int threads::count() const { return count_; } @@ -2147,36 +2173,34 @@ sieve_to_vector(U n0, U n1) template SieveResults -sieve(U n0, U n1, Threads const & threads) +sieve(U n0, U n1, threads const & threads) { if(threads.count() == 1 || (n1 <= n0) || (n1 - n0) <= threads.count()) { return sieve(n0, n1); } - auto numThreads = threads.count(); std::vector>> results; std::vector prefix; - for(auto k = n0, dk = (n1 - n0) / numThreads, ek = (n1 - n0) % numThreads; k < n1; ek = ek ? ek - 1 : 0) { - auto kmax = k + dk + (ek ? 1 : 0); + details::partition_range(n0, n1, threads.count(), [&](U v0, U v1) { results.emplace_back(std::async(std::launch::async, - [&prefix](U v0, U v1){ + [v0, v1, &prefix](){ return details::sieve(v0, v1, - [&prefix](auto & pref, std::vector & bmps) { + [&](auto & pref, std::vector & bmps) { if(!pref.empty()) { prefix = std::move(pref); } return std::move(bmps); }); - }, k, kmax)); - k = kmax; - } + })); + }); std::vector bmps = - std::accumulate(std::begin(results), std::end(results), std::vector{}, - [](auto x, auto & y) { - for(auto & b : y.get()) { - x.emplace_back(std::move(b)); - } - return std::move(x); - }); + std::accumulate(std::begin(results), std::end(results), + std::vector{}, + [](auto x, auto & y) { + for(auto & b : y.get()) { + x.emplace_back(std::move(b)); + } + return std::move(x); + }); return SieveResults{std::move(prefix), std::move(bmps)}; } @@ -2201,21 +2225,19 @@ constexpr std::size_t count_primes(U n0, U n1) } template -std::size_t count_primes(U n0, U n1, Threads const & threads) +std::size_t count_primes(U n0, U n1, threads const & threads) { if(n0 >= n1) { return 0; } auto numThreads = threads.count(); - if(n1 - n0 < numThreads) { + if((numThreads == 1) || (n1 - n0 <= numThreads)) { return count_primes(n0, n1); } std::vector> results; - for(auto k = n0, dk = (n1 - n0) / numThreads, ek = (n1 - n0) % numThreads; k < n1; ek = ek ? ek - 1 : 0) { - auto kmax = k + dk + (ek ? 1 : 0); - results.emplace_back(std::async(std::launch::async, static_cast(count_primes), k, kmax)); - k = kmax; - } + details::partition_range(n0, n1, numThreads, [&results](U v0, U v1) { + results.emplace_back(std::async(std::launch::async, static_cast(count_primes), v0, v1)); + }); return std::accumulate(std::begin(results), std::end(results), std::size_t{}, [](auto x, auto & y) { return x + y.get(); }); } @@ -2223,3 +2245,4 @@ std::size_t count_primes(U n0, U n1, Threads const & threads) } // namespace lfp + diff --git a/main.cpp b/main.cpp index cd94583..c3a08f6 100644 --- a/main.cpp +++ b/main.cpp @@ -36,7 +36,7 @@ int main(int argc, char** argv) istr1 >> n1; auto const startt = std::chrono::steady_clock::now(); - auto numPrimes = lfp::count_primes(n0, n1, lfp::Threads{numThreads < 0 ? 1u : (unsigned int)numThreads}); + auto numPrimes = lfp::count_primes(n0, n1, lfp::threads{numThreads < 0 ? 1u : (unsigned int)numThreads}); auto const endt = std::chrono::steady_clock::now(); std::cout << "The number of prime numbers in range [" << n0 << ", " << n1 << "[ is " diff --git a/tests.cpp b/tests.cpp index f4b5993..8aa350f 100644 --- a/tests.cpp +++ b/tests.cpp @@ -220,9 +220,9 @@ TEST_CASE("Sieve of Erathostenes - above 2^32 - #2") { } TEST_CASE("Sieve of Erathostenes - multithreaded sieve") { - CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::Threads{2}).count(), equals(78498)); - CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1'000'000'000), lfp::Threads{4}).count(), equals(50847534)); - CHECK_THAT(lfp::sieve(641*641, 8191*8191+1, lfp::Threads{4}).count(), equals(3922190)); + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::threads{2}).count(), equals(78498)); + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1'000'000'000), lfp::threads{4}).count(), equals(50847534)); + CHECK_THAT(lfp::sieve(641*641, 8191*8191+1, lfp::threads{4}).count(), equals(3922190)); } TEST_CASE("Sieve of Erathostenes - misc - #1") { From f557eb5f8d106a63a0d4a23c5dda3defa5031bce Mon Sep 17 00:00:00 2001 From: Youcef L Date: Thu, 15 May 2025 17:19:47 +0000 Subject: [PATCH 18/22] Fixed buggy partition_range. Restored bitmasking up to p=127. --- lfp.hpp | 44 +++++++++++++++++++++----------------------- tests.cpp | 1 + 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 2ff7108..7abbfb9 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -1545,12 +1545,12 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd } auto & bmp = *sievdat.bitmap_; - // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmapi - bool const useBitmasksForAllPrimesBelow128 = n1 > (std::numeric_limits::max)(); - unsigned int const smallPrimesThreshold = useBitmasksForAllPrimesBelow128 ? 128 : 104; + // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap + constexpr unsigned int lastSmallPrime = 127; + constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; if(std::ranges::distance(basePrimes | std::views::drop_while([](auto p) { return p < 7; }) - | std::views::take_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; }) + | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }) )) { bitmask_pack::value_type, 7, 11, 13, 17, 19, 23, 29, 31> bitmasks_1; @@ -1558,14 +1558,12 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; bitmask_pack::value_type, 71, 73, 79, 83, 89, 97, 101, 103> bitmasks_3; + bitmask_pack::value_type, + 107, 109, 113, lastSmallPrime> bitmasks_4; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); - if(useBitmasksForAllPrimesBelow128) { - bitmask_pack::value_type, - 107, 109, 113, 127> bitmasks_4; - bmp.apply(bitmasks_4); - } + bmp.apply(bitmasks_4); } for(auto p : basePrimes @@ -2108,21 +2106,21 @@ void partition_range(U n0, U n1, int N, FuncT processRange) { if(N <= 0 || (n0 >= n1)) { processRange(n0, n0); - } - if((N == 1) || (n1 - n0 < N)) { + } else if((N == 1) || (n1 - n0 < N)) { processRange(n0, n1); + } else { + auto v = std::views::iota(1, N + 1); + auto weight = std::accumulate(std::begin(v), std::end(v), 0.0, + [](double x, auto k){ return x + 1.0 / std::sqrt(k); + }); + auto c = (n1 - n0) / weight; + auto current = n0; + std::for_each(std::begin(v), std::end(v), [&](auto k){ + auto ni = (k == N) ? n1 : current + c / std::sqrt(k); + processRange(current, ni); + current = ni; + }); } - auto v = std::views::iota(1, N + 1); - auto weight = std::accumulate(std::begin(v), std::end(v), 0.0, - [](double x, auto k){ return x + 1.0 / std::sqrt(k); - }); - auto c = (n1 - n0) / weight; - auto current = n0; - std::for_each(std::begin(v), std::end(v), [&](auto k){ - auto ni = (k == N) ? n1 : current + c / std::sqrt(k); - processRange(current, ni); - current = ni; - }); } } // namespace details @@ -2231,7 +2229,7 @@ std::size_t count_primes(U n0, U n1, threads const & threads) return 0; } auto numThreads = threads.count(); - if((numThreads == 1) || (n1 - n0 <= numThreads)) { + if(n1 - n0 <= numThreads) { return count_primes(n0, n1); } std::vector> results; diff --git a/tests.cpp b/tests.cpp index 8aa350f..d92275b 100644 --- a/tests.cpp +++ b/tests.cpp @@ -220,6 +220,7 @@ TEST_CASE("Sieve of Erathostenes - above 2^32 - #2") { } TEST_CASE("Sieve of Erathostenes - multithreaded sieve") { + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::threads{1}).count(), equals(78498)); CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::threads{2}).count(), equals(78498)); CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1'000'000'000), lfp::threads{4}).count(), equals(50847534)); CHECK_THAT(lfp::sieve(641*641, 8191*8191+1, lfp::threads{4}).count(), equals(3922190)); From c03f4811c665e8c6b49aad39a2bd4ad63c6293a4 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Thu, 15 May 2025 20:13:49 +0100 Subject: [PATCH 19/22] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7b55452..569b912 100644 --- a/README.md +++ b/README.md @@ -86,13 +86,13 @@ template std::size_t count_primes(U n0, U n1, Threads const & threads); /// @struct Holding concurrency information -struct Threads +struct threads { /// Constructs an instance x such that x.count() == std::thread::hardware_concurrency(). /// If std::thread::hardware_concurrency() == 0, then x.count() is equal to 1. - Threads(); + threads(); /// Constructs an instance x such that x.count() == c - explicit Threads(unsigned int c); + explicit threads(unsigned int c); /// Returns the maximum number of concurrent threads to use during sieving. unsigned int count() const; // ...Private part omitted... @@ -137,14 +137,14 @@ The following table gives an idea of the performances to expect from the sieve ( | Range \ Threads | 1 | 4 | 8 | 16 | 32 | 48 | 64 | Number of primes | |-----------------|---|---|---|----|----|----|----|------------------| -| $\left[0, 10^{9}\right[$ | 0.211 | 0.057 | 0.030 | 0.018 | 0.015 | 0.016 | 0.015 | **50847534** | -| $\left[0, 2^{32}-1\right[$ | 0.979 | 0.263 | 0.133 | 0.070 | 0.043 | 0.036 | 0.037 | **203280221** | -| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 4.643 | 1.175 | 0.592 | 0.301 | 0.158 | 0.120 | 0.104 | **361840208** | -| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 26.475 | 6.620 | 3.324 | 1.670 | 0.863 | 0.603 | 0.521 | **289531946** | -| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 392.079 | 98.792 | 49.344 | 24.668 | 12.874 | 8.944 | 7.640 | **241272176** | -| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 1478.033 | 374.467 | 186.686 | 93.948 | 49.188 | 34.535 | 29.364 | **225402976** | +| $\left[0, 10^{9}\right[$ | 0.279 | 0.095 | 0.061 | 0.041 | 0.033 | 0.038 | 0.037 | **50847534** | +| $\left[0, 2^{32}-1\right[$ | 1.319 | 0.442 | 0.276 | 0.179 | 0.124 | 0.107 | 0.102 | **203280221** | +| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 5.041 | 1.833 | 1.201 | 0.836 | 0.591 | 0.500 | 0.463 | **361840208** | +| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 17.498 | 6.397 | 4.210 | 2.966 | 2.391 | 2.018 | 1.926 | **289531946** | +| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 72.288 | 26.501 | 17.121 | 11.614 | 8.221 | 6.592 | 6.359 | **241272176** | +| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 209.197 | 74.313 | 47.860 | 32.049 | 21.893 | 16.928 | 16.996 | **225402976** | -These timings were measured on an AMD EPYC 9R14, the compilation flags used are "-std=c++20 -O3 -march=native -mtune=native" (OS: Debian 12, compiler: g++ version 12.2). +These timings were measured on an AMD EPYC 9R14, the compilation flags used are "-std=c++20 -O3 -march=native -mtune=native -DNDEBUG" (OS: Debian 12, compiler: g++ version 12.2). From 78f96550d6697c55038fb37ec553dccb97b281c4 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sat, 17 May 2025 11:23:27 +0000 Subject: [PATCH 20/22] Restore 104 as threshold for primes handled through bitmasking (improves performance). --- lfp.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lfp.hpp b/lfp.hpp index 7abbfb9..fea5e9b 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -1546,7 +1546,7 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd auto & bmp = *sievdat.bitmap_; // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap - constexpr unsigned int lastSmallPrime = 127; + constexpr unsigned int lastSmallPrime = 103; constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; if(std::ranges::distance(basePrimes | std::views::drop_while([](auto p) { return p < 7; }) @@ -1557,13 +1557,10 @@ inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievd bitmask_pack::value_type, 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; bitmask_pack::value_type, - 71, 73, 79, 83, 89, 97, 101, 103> bitmasks_3; - bitmask_pack::value_type, - 107, 109, 113, lastSmallPrime> bitmasks_4; + 71, 73, 79, 83, 89, 97, 101, lastSmallPrime> bitmasks_3; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); - bmp.apply(bitmasks_4); } for(auto p : basePrimes From 16d7100d5ec83b6245e1b0ded4ec7c736cc0a71c Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sat, 17 May 2025 12:31:09 +0100 Subject: [PATCH 21/22] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 569b912..c64e9fb 100644 --- a/README.md +++ b/README.md @@ -137,12 +137,12 @@ The following table gives an idea of the performances to expect from the sieve ( | Range \ Threads | 1 | 4 | 8 | 16 | 32 | 48 | 64 | Number of primes | |-----------------|---|---|---|----|----|----|----|------------------| -| $\left[0, 10^{9}\right[$ | 0.279 | 0.095 | 0.061 | 0.041 | 0.033 | 0.038 | 0.037 | **50847534** | -| $\left[0, 2^{32}-1\right[$ | 1.319 | 0.442 | 0.276 | 0.179 | 0.124 | 0.107 | 0.102 | **203280221** | -| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 5.041 | 1.833 | 1.201 | 0.836 | 0.591 | 0.500 | 0.463 | **361840208** | -| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 17.498 | 6.397 | 4.210 | 2.966 | 2.391 | 2.018 | 1.926 | **289531946** | -| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 72.288 | 26.501 | 17.121 | 11.614 | 8.221 | 6.592 | 6.359 | **241272176** | -| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 209.197 | 74.313 | 47.860 | 32.049 | 21.893 | 16.928 | 16.996 | **225402976** | +| $\left[0, 10^{9}\right[$ | 0.228 | 0.079 | 0.050 | 0.035 | 0.034 | 0.036 | 0.037 | **50847534** | +| $\left[0, 2^{32}-1\right[$ | 1.077 | 0.362 | 0.224 | 0.146 | 0.104 | 0.092 | 0.088 | **203280221** | +| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 4.421 | 1.610 | 1.058 | 0.741 | 0.530 | 0.453 | 0.421 | **361840208** | +| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 16.800 | 6.148 | 4.052 | 2.878 | 2.308 | 1.983 | 1.888 | **289531946** | +| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 69.846 | 25.593 | 16.562 | 11.224 | 7.947 | 6.384 | 6.199 | **241272176** | +| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 200.867 | 71.364 | 45.911 | 30.781 | 21.050 | 16.361 | 16.497 | **225402976** | These timings were measured on an AMD EPYC 9R14, the compilation flags used are "-std=c++20 -O3 -march=native -mtune=native -DNDEBUG" (OS: Debian 12, compiler: g++ version 12.2). From f99d583ccddce147a1d5a259762696c931597d79 Mon Sep 17 00:00:00 2001 From: Youcef L Date: Sat, 17 May 2025 12:51:33 +0100 Subject: [PATCH 22/22] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c64e9fb..1fbf708 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,7 @@ This sieve leverages several optimization techniques for performance and scalabi - a range $R = [n_0, n_1[$ to be sieved is represented by a sequence of bits, called a bitmap, to each bit corresponds one and only one of the integers coprime to 30 in $R$, thus we only consume about $\frac{4}{15}\cdot(n_1 - n_0)$ bits of memory to represent R. - when crossing out the multiples of a prime $p$, we start at $p^{2}$ and, since we are using a modulo 30 wheel, we only consider the multiples of $p$ of the form $p^{2} + 2kp$ ($k$ being an integer >= 0) that are coprime to 30. - for each prime $p$ below a certain threshold (currently 104), we precompute a bitmask of length $8p$ at compile time. These bitmasks are applied in batches—e.g., in one pass, we cross out multiples of {7, 11, ..., 31}. The threshold 104 was chosen to group primes into three sets of eight (optimized for batch processing) and empirically showed better performance than higher values (though future tuning may optimize this further). + - the sieve employs bucket sieving for primes above 204800—a technique that groups large primes into cache-friendly batches. This threshold was chosen to balance overhead and gains but may be optimized further in future releases. - the sieve is segmented i.e. if the size of the range R to sieve exceeds a certain threshold S, R is split into segments of size at most S. - the sieve is multithreaded, we allocate N threads and each thread deals with part of the range to sieve. - the memory allocated for a bitmap is 64-byte aligned.