diff --git a/README.md b/README.md index 7b55452..1fbf708 100644 --- a/README.md +++ b/README.md @@ -86,13 +86,13 @@ template std::size_t count_primes(U n0, U n1, Threads const & threads); /// @struct Holding concurrency information -struct Threads +struct threads { /// Constructs an instance x such that x.count() == std::thread::hardware_concurrency(). /// If std::thread::hardware_concurrency() == 0, then x.count() is equal to 1. - Threads(); + threads(); /// Constructs an instance x such that x.count() == c - explicit Threads(unsigned int c); + explicit threads(unsigned int c); /// Returns the maximum number of concurrent threads to use during sieving. unsigned int count() const; // ...Private part omitted... @@ -129,6 +129,7 @@ This sieve leverages several optimization techniques for performance and scalabi - a range $R = [n_0, n_1[$ to be sieved is represented by a sequence of bits, called a bitmap, to each bit corresponds one and only one of the integers coprime to 30 in $R$, thus we only consume about $\frac{4}{15}\cdot(n_1 - n_0)$ bits of memory to represent R. - when crossing out the multiples of a prime $p$, we start at $p^{2}$ and, since we are using a modulo 30 wheel, we only consider the multiples of $p$ of the form $p^{2} + 2kp$ ($k$ being an integer >= 0) that are coprime to 30. - for each prime $p$ below a certain threshold (currently 104), we precompute a bitmask of length $8p$ at compile time. These bitmasks are applied in batches—e.g., in one pass, we cross out multiples of {7, 11, ..., 31}. The threshold 104 was chosen to group primes into three sets of eight (optimized for batch processing) and empirically showed better performance than higher values (though future tuning may optimize this further). + - the sieve employs bucket sieving for primes above 204800—a technique that groups large primes into cache-friendly batches. This threshold was chosen to balance overhead and gains but may be optimized further in future releases. - the sieve is segmented i.e. if the size of the range R to sieve exceeds a certain threshold S, R is split into segments of size at most S. - the sieve is multithreaded, we allocate N threads and each thread deals with part of the range to sieve. - the memory allocated for a bitmap is 64-byte aligned. @@ -137,14 +138,14 @@ The following table gives an idea of the performances to expect from the sieve ( | Range \ Threads | 1 | 4 | 8 | 16 | 32 | 48 | 64 | Number of primes | |-----------------|---|---|---|----|----|----|----|------------------| -| $\left[0, 10^{9}\right[$ | 0.211 | 0.057 | 0.030 | 0.018 | 0.015 | 0.016 | 0.015 | **50847534** | -| $\left[0, 2^{32}-1\right[$ | 0.979 | 0.263 | 0.133 | 0.070 | 0.043 | 0.036 | 0.037 | **203280221** | -| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 4.643 | 1.175 | 0.592 | 0.301 | 0.158 | 0.120 | 0.104 | **361840208** | -| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 26.475 | 6.620 | 3.324 | 1.670 | 0.863 | 0.603 | 0.521 | **289531946** | -| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 392.079 | 98.792 | 49.344 | 24.668 | 12.874 | 8.944 | 7.640 | **241272176** | -| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 1478.033 | 374.467 | 186.686 | 93.948 | 49.188 | 34.535 | 29.364 | **225402976** | +| $\left[0, 10^{9}\right[$ | 0.228 | 0.079 | 0.050 | 0.035 | 0.034 | 0.036 | 0.037 | **50847534** | +| $\left[0, 2^{32}-1\right[$ | 1.077 | 0.362 | 0.224 | 0.146 | 0.104 | 0.092 | 0.088 | **203280221** | +| $\left[10^{12}, 10^{12}+10^{10}\right[$ | 4.421 | 1.610 | 1.058 | 0.741 | 0.530 | 0.453 | 0.421 | **361840208** | +| $\left[10^{15}, 10^{15}+10^{10}\right[$ | 16.800 | 6.148 | 4.052 | 2.878 | 2.308 | 1.983 | 1.888 | **289531946** | +| $\left[10^{18}, 10^{18}+10^{10}\right[$ | 69.846 | 25.593 | 16.562 | 11.224 | 7.947 | 6.384 | 6.199 | **241272176** | +| $\left[2^{64}-10^{10}, 2^{64}-1\right[$ | 200.867 | 71.364 | 45.911 | 30.781 | 21.050 | 16.361 | 16.497 | **225402976** | -These timings were measured on an AMD EPYC 9R14, the compilation flags used are "-std=c++20 -O3 -march=native -mtune=native" (OS: Debian 12, compiler: g++ version 12.2). +These timings were measured on an AMD EPYC 9R14, the compilation flags used are "-std=c++20 -O3 -march=native -mtune=native -DNDEBUG" (OS: Debian 12, compiler: g++ version 12.2). diff --git a/lfp.hpp b/lfp.hpp index cf1363f..fea5e9b 100644 --- a/lfp.hpp +++ b/lfp.hpp @@ -1,12 +1,16 @@ /* * MIT License - * Copyright (c) 2024 Youcef Lemsafer + * Copyright (c) 2024-2025 Youcef Lemsafer * See LICENSE file for more details. * Creation date: december 2024. */ +/// @file lfp.hpp +/// @brief Header defining the lfp library +/// #pragma once #include +#include #include #include #include @@ -21,12 +25,14 @@ #include #include #include +#include +#include namespace lfp { // Forward declarations -struct Threads; +struct threads; template class SieveResults; /// Returns the result of sieving the range [n0, n1). @@ -37,7 +43,7 @@ constexpr SieveResults sieve(U n0, U n1); /// Returns the result of sieving the range [n0, n1). /// The sieving is performed using at most threads.count() concurrent threads. template -SieveResults sieve(U n0, U n1, Threads const & threads); +SieveResults sieve(U n0, U n1, threads const & threads); /// Returns a vector containing the prime numbers in range [n0, n1) template @@ -50,26 +56,191 @@ constexpr std::size_t count_primes(U n0, U n1); /// Returns the number of prime numbers in range [n0, n1) /// the sieving is performed using at most threads.count() ooncurrent threads. template -std::size_t count_primes(U n0, U n1, Threads const & threads); +std::size_t count_primes(U n0, U n1, threads const & threads); /// @struct Holding concurrency information -struct Threads +struct threads { /// Constructs an instance x such that x.count() == std::thread::hardware_concurrency(). /// If std::thread::hardware_concurrency() == 0, then x.count() is equal to 1. - Threads(); + threads(); /// Constructs an instance x such that x.count() == c - explicit Threads(unsigned int c); + explicit threads(unsigned int c); /// Returns the maximum number of concurrent threads to use during sieving. unsigned int count() const; private: unsigned int count_; - static unsigned int defaultCount(); + static unsigned int default_count(); }; +} // namespace lfp + + +#ifdef NDEBUG +# define LFP_ASSERT(...) ((void)0); +#else +# ifndef LFP_ASSERT +# define LFP_ASSERT(...) assert(__VA_ARGS__) +# endif +#endif + +#define LFP_CONCAT_IMPL(x, y) x##y +#define LFP_CONCAT(x, y) LFP_CONCAT_IMPL(x, y) + +#ifdef LFP_ACTIVATE_LOG +# define LFP_LOG(msg) \ + do { \ + if(!std::is_constant_evaluated()) { \ + ::lfp::details::log() << msg << "\n";\ + } \ + } while(0) + +# define LFP_LOG_S(msg, msgAtEos) \ + LFP_LOG(msg);\ + auto LFP_CONCAT(blockStartTime, __LINE__) = !std::is_constant_evaluated()\ + ? std::chrono::high_resolution_clock::now()\ + : decltype(std::chrono::high_resolution_clock::now()){}; \ + auto LFP_CONCAT(logAtEob, __LINE__) = [&](){\ + if(!std::is_constant_evaluated()) {\ + return std::variant{\ + details::make_log_at_eos([&](){\ + auto duration = std::chrono::duration{\ + std::chrono::high_resolution_clock::now()\ + - LFP_CONCAT(blockStartTime, __LINE__)};\ + ::lfp::details::log() << msgAtEos << " (duration: " << duration << ")\n";\ + })\ + };\ + }\ + return std::variant{0};\ + }(); +#else +# define LFP_LOG(msg) ((void)0) +# define LFP_LOG_S(msg, msgAtEob) ((void)0) +#endif + + +namespace lfp { namespace details { +class log_impl; + +/// Returns the unique logger +inline log_impl & log(); + +/// Logging class +class log_impl +{ + struct appender; +public: + log_impl(std::ostream & stream); + template + appender & operator <<(Arg&& arg); + struct log_at_eos { + log_at_eos(log_at_eos const&) = delete; + log_at_eos(log_at_eos&& other) noexcept + { + func_ = std::move(other.func_); + other.func_ = [](){}; + } + log_at_eos(std::function func); + ~log_at_eos(); + std::function func_; + }; + +private: + class appender { + public: + template + appender & operator <<(Arg&& arg); + private: + appender() = default; + log_impl * parent_; + friend class log_impl; + }; + std::ostream & stream_; + appender appender_; +}; + + +inline +log_impl::log_impl(std::ostream & stream) + : stream_(stream) +{ + appender_.parent_ = this; +} + +template +inline +log_impl::appender & log_impl::operator<<(Arg&& arg) +{ + auto ts = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch() + ).count(); + stream_ << ts << " | " << std::forward(arg); + return appender_; +} + +template +inline +log_impl::appender & log_impl::appender::operator <<(Arg&& arg) +{ + parent_->stream_ << std::forward(arg); + return *this; +} + +log_impl::log_at_eos::log_at_eos(std::function func) + : func_(func) +{} + +inline log_impl::log_at_eos::~log_at_eos() +{ + func_(); +} + +template +auto make_log_at_eos(Func func) +{ + return log_impl::log_at_eos{func}; +} + +inline log_impl & log() +{ + static log_impl inst{std::cout}; + return inst; +} + + +/// Constants +constexpr std::size_t bucketed_primes_threshold() +{ + if(std::is_constant_evaluated()) { + return 8192; + } else { + return 204800; + } +} +inline constexpr std::size_t initial_bucket_capacity = 32768; + +/// Value meaning no offset i.e. the requested value is not present in the sequence +inline constexpr std::size_t noffs = (std::numeric_limits::max)(); + +/// True if and only if T is one of the types listed after it +template +inline constexpr bool is_one_of_v = sizeof...(Rest) && ((std::is_same_v || ...)); + +/// Returns the address of an object as an integer value +template +inline auto obj_addr(T const* ptr) +{ + return reinterpret_cast(ptr); +} + +/// Residues modulo 30 that are coprime to 30 +template +inline constexpr auto cp30_residues = std::to_array({1, 7, 11, 13, 17, 19, 23, 29}); + +/// All the primes below 2^8 in ascending order template inline constexpr auto u8primes = std::to_array({ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, @@ -80,6 +251,316 @@ inline constexpr auto u8primes = std::to_array({ 233, 239, 241, 251 }); +/// The really tiny primes, all numbers represented in a bitmap are coprime to +/// these, but they can still be returned by a sieve e.g. if the user requests +/// the primes in a range containing one of them. +template +inline constexpr std::array primesBelowSix = {2, 3, 5}; + + +// Some two dimensional wheels to speed-up the localization of the multiples of a prime +inline constexpr uint8_t wheel[8][8] = { + {6, 4, 2, 4, 2, 4, 6, 2}, + {4, 2, 4, 2, 4, 6, 2, 6}, + {2, 4, 2, 4, 6, 2, 6, 4}, + {4, 2, 4, 6, 2, 6, 4, 2}, + {2, 4, 6, 2, 6, 4, 2, 4}, + {4, 6, 2, 6, 4, 2, 4, 2}, + {6, 2, 6, 4, 2, 4, 2, 4}, + {2, 6, 4, 2, 4, 2, 4, 6} +}; + +inline constexpr uint8_t whoffs[8][8] = { + {0, 1, 2, 3, 4, 5, 6, 7}, + {2, 7, 5, 4, 1, 0, 6, 3}, + {0, 2, 6, 4, 7, 5, 1, 3}, + {6, 2, 1, 5, 4, 0, 7, 3}, + {2, 6, 7, 3, 4, 0, 1, 5}, + {0, 6, 2, 4, 1, 3, 7, 5}, + {6, 1, 3, 4, 7, 0, 2, 5}, + {0, 7, 6, 5, 4, 3, 2, 1} +}; + +/// "Adjustment" table used for finding the smallest multiple of a prime p +/// above max(p^2, n0) for a given n0. +/// More precisely, let p be a prime, we start to cross out the multiples of p +/// at p^2 but since we are using a mod 30 wheel we need those multiples to be +/// coprime to 30. What this table does is, once a multiple c of p of the form +/// p^2 + 2kp is found, it adds to c what needs to be added to make it coprime +/// to 30. +inline constexpr int8_t adjt[8][14] = { + {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, + {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 4, 0, 4, 2}, + {0, 4, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2}, + {0, 2, 2, 0, 4, 0, 0, 2, 0, 0, 2, 0, 4, 2}, + {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 4, 0, 2, 2}, + {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 4}, + {0, 2, 4, 0, 4, 0, 0, 2, 0, 0, 2, 0, 2, 2}, + {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 4} +}; + +template +constexpr auto cp30res_to_idx(UInt coprimeTo30Residue) +{ + return (coprimeTo30Residue * 17) >> 6; +} + + +struct ComputeSquareOfP {}; +struct NoUpperBoundCond {}; +struct ResiduesNotNeeded {}; + +template +struct first_multiple_finder +{ + constexpr auto operator()(U p, V n0); + + SquareOfPComputer sqr_of_p_comp_; + UpperBoundCond upper_bound_cond_; + ResiduesReceiver residues_receiver_; +}; + +template +constexpr auto +first_multiple_finder::operator()(U p, V n0) +{ + auto const p2 = [&](){ + if constexpr (std::is_same_v) { + return p * p; + } else { + return sqr_of_p_comp_(); + } + }(); + constexpr auto c_max = std::numeric_limits::max(); + V dp2; + auto c = (p2 >= n0) ? p2 + : (((dp2 = (n0 - p2 + 2 * p - 1)/(2 * p)*(2 * p)), + c_max - p2 < dp2) ? 0 + : p2 + dp2); + if(!c) { + return decltype(c){}; + } + if constexpr(!std::is_same_v) { + if(!upper_bound_cond_(c)) { + return decltype(c){}; + } + } + constexpr auto callerNeedsResidues = !std::is_same_v; + auto cmod30 = c % 30; + switch(cmod30) { + case 3: case 5: case 9: case 15: case 21: case 25: case 27: { + auto dc = adjt[(p % 30) * 4 / 15][cmod30 >> 1] * p; + c = (c_max - c < dc) ? 0 : c + dc; + if constexpr(callerNeedsResidues) { + cmod30 = c % 30; + } + } + break; + default:; + } + if constexpr(!std::is_same_v) { + if(!upper_bound_cond_(c)) { + return decltype(c){}; + } + } + if constexpr(callerNeedsResidues) { + residues_receiver_(p % 30, cmod30); + } + return c; +} + +template +constexpr auto find_first_multiple_impl(U p, V n0) +{ + return first_multiple_finder{}(p, n0); +} + +template +constexpr auto find_first_multiple_impl(U p, V n0, SquareOfPComputer sqrOfPComp, UpperBoundCond upperBoundCond, ResiduesReceiver residuesReceiver) +{ + auto finder = first_multiple_finder{ + sqrOfPComp, + upperBoundCond, + residuesReceiver + }; + return finder(p, n0); +} + +template +constexpr auto +find_first_multiple_ae_b(U p, V n0, V n1, U pSquared, uint8_t & pmod30, uint8_t & cmod30) +{ + return find_first_multiple_impl(p, n0, + [pSquared](){ return pSquared; }, + [n1](auto c){ return c < n1; }, + [&](uint8_t pm30, uint8_t cm30){ + pmod30 = pm30; + cmod30 = cm30; + }); +} + +/// Computes the smallest multiple m of p such that m >= max(p^2, n0) and +/// gcd(m, 30) = 1. +/// @param p the prime a multiple of which is requested +/// @param n0 lower bound for returned multiple of p +/// @return 0 when such a multiple does not fit in the return type. +template +constexpr auto +find_first_multiple_ae(U p, V n0) +{ + return find_first_multiple_impl(p, n0); +} + +// Returns the smallest integer coprime to 30 and greater than or equal to @param n0 +template +constexpr +Ret compute_gte_coprime(U n0) +{ + constexpr uint8_t dn[30] = { + 1, 0, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0, 1, 0, 3, + 2, 1, 0, 1, 0, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0 + }; + + return Ret{n0} + dn[n0 % 30]; +} + + +/// Returns the largest integer coprime to 30 and < @param n1 +template +constexpr +Ret compute_lt_coprime(U n1) +{ + constexpr uint8_t dn[30] = { + 1, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 1, + 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6 + }; + + return Ret{n1} - dn[n1 % 30]; +} + + +template +constexpr std::size_t index_of(U m, uint8_t mmod30Idx, V m0) +{ + LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) + && (mmod30Idx == cp30res_to_idx(m % 30)) && (std::gcd(m0, 30) == 1)); + + auto m0_idx = cp30res_to_idx(m0 % 30); + return (m - m0) / 30 * 8 + ((mmod30Idx >= m0_idx) ? mmod30Idx - m0_idx : 8 + mmod30Idx - m0_idx); +} + +/// Returns the index of m relative to the index of m0. +/// @pre both m and m0 are coptime to 30 and m >= m0 +template +constexpr std::size_t index_of(U m, V m0) +{ + LFP_ASSERT((m >= m0) && (std::gcd(m, 30) == 1) && (std::gcd(m0, 30) == 1)); + + auto m_idx = cp30res_to_idx(m % 30); + auto m0_idx = cp30res_to_idx(m0 % 30); + return (m - m0) / 30 * 8 + ((m_idx >= m0_idx) ? m_idx - m0_idx : 8 + m_idx - m0_idx); +} + +/// Returns the value at offset offs relative to the offset of n0 i.e. the value v such that offs == index_of(v, n0) +/// @pre n0 is coprime to 30 +template +constexpr V value_at(V n0, std::size_t offs) +{ + LFP_ASSERT(std::gcd(n0, 30) == 1); + + auto n0Idx = cp30res_to_idx(n0 % 30); + auto nxIdx = (n0Idx + offs) % 8; + return n0 + (offs / 8) * 30 + + ((nxIdx >= n0Idx) + ? cp30_residues[nxIdx] - cp30_residues[n0Idx] + : 30 + cp30_residues[nxIdx] - cp30_residues[n0Idx]); +} + + + +/// Computes the index of the first multiple of prime p in range [n0, ne] relative to the index of n0 +/// as well as the offsets of the following multiples of p relative to firstMultIdx. +/// @pre p is a prime number > 6, n0 <= n1, gcd(n0, 30) = 1 and gcd(ne, 30) = 1. +/// @post firstMultIdx is the index of the first multiple of p in [n0, n1[ or noffs if no such multiple, +/// offsets is filled with the offsets of the next multiples of p relative to firstMultIdx, +/// @return the number of offsets written to offs +template +constexpr std::size_t compute_offsets(std::size_t & firstMultIdx, + std::array & offsets, + U p, + V n0, + V ne, + V c + ) +{ + firstMultIdx = noffs; + +#if 0 + if(c < n0) { //@todo: remove. Used for debugging + c = find_first_multiple_ae(p, n0); + if(!std::is_constant_evaluated()) { + std::cout << "Bad multiple, got " << c << " where a value > " << n0 << " was requested." << std::endl; + } + } +#endif + constexpr auto c_max = std::numeric_limits::max(); + bool isFirstIndex = true; + std::size_t offsCount = 0; + auto const iw = cp30res_to_idx(p % 30); + auto const jw = cp30res_to_idx(c % 30); + + for(auto j = whoffs[iw][jw]; + c <= ne; + c = ((c_max - c < wheel[iw][j] * p) + ? ne + 1 + : c + wheel[iw][j] * p), j = (j + 1) % 8) { + auto currIdx = index_of(c, n0); + if(isFirstIndex) { + firstMultIdx = currIdx; + isFirstIndex = false; + continue; + } + offsets[offsCount++] = currIdx - firstMultIdx; + if(offsCount == offsets.size()) { + break; + } + } + return offsCount; +} + +template +constexpr std::size_t compute_remaining_offsets(std::size_t firstMultIdx, + std::array & offsets, + U p, + V n0, + V ne, + V c, + uint8_t whpos + ) +{ + constexpr auto c_max = std::numeric_limits::max(); + bool isFirstIndex = true; + std::size_t offsCount = 0; + auto const iw = whpos >> 3; + auto const jw = whpos & 7; + for(auto j = whoffs[iw][jw]; + c <= ne; + c = ((c_max - c < wheel[iw][j] * p) + ? ne + 1 + : c + wheel[iw][j] * p), j = (j + 1) % 8) { + if(isFirstIndex) { + isFirstIndex = false; + continue; + } + auto currIdx = index_of(c, n0); + offsets[offsCount++] = currIdx - firstMultIdx; + if(offsCount == offsets.size()) { + break; + } + } + return offsCount; +} + /// Applies given mask to value starting at bit value + offs. /// @pre 0 <= offs < std::numeric_limits::digits @@ -97,6 +578,9 @@ constexpr void mask(U & value, U mask, V offs) value &= (offs ? (mask << (std::numeric_limits::digits - offs)) : U{}) | (~U{} >> offs); } + +/// @class allocator +/// An allocator class used when specific alignment is needed. template struct allocator { @@ -163,8 +647,15 @@ allocator::deallocate(U* ptr, std::size_t /*n*/) noexcept } +/// Useful when instantiating class bitmask_impl e.g. bitmask_impl<..., use_dictionary_t{true}, ...> using use_dictionary_t = bool; +/// @class bitmask_impl +/// Holds the bitmask to be applied to a bitmap in order to reset the bits corresponding to the multiples of a prime P. +/// @param U the limb type +/// @param P the prime number +/// @param UseDictionary whether to precompute and store the masks for all possible indices +/// @param Alignment alignment to use for the array holding the mask(s) template class bitmask_impl { @@ -236,7 +727,7 @@ template ::offset(std::size_t c) const { - return offs_[(c % 30) * 4 / 15]; + return offs_[cp30res_to_idx(c % 30)]; } @@ -275,7 +766,7 @@ bitmask_impl::compute_offsets() { std::remove_cv_t offsets; for_each_coprime_to_30([&offsets](auto i, auto j) { - offsets[(i % 30) * 4 / 15] = j; + offsets[cp30res_to_idx(i % 30)] = j; }); return offsets; } @@ -331,6 +822,8 @@ class bitmask_pack static_assert((is_prime(SmallPrimes) && ...), "Expecting prime numbers below 255!"); constexpr bitmask_pack() = default; + /// Returns a comma separated list of the SmallPrimes as a string + static constexpr std::string_view primes_as_string(); /// Returns the number of bitmask objects in this pack static constexpr auto size(); /// Returns a tuple containing all bitmask objects in this pack @@ -338,7 +831,7 @@ class bitmask_pack /// Gets the bitmask object at index I template constexpr auto const & get() const; - /// Returns the largest prime in the SmallPrimes + /// Returns the largest prime in SmallPrimes static constexpr std::uint8_t max_prime(); /// Returns the combination of the masks at the given offsets template @@ -347,6 +840,8 @@ class bitmask_pack private: using primes_sequence = std::integer_sequence; + static constexpr auto primes_as_string(primes_sequence primesSeq); + template static constexpr bool is_in_sequence(std::integer_sequence); @@ -363,6 +858,11 @@ class bitmask_pack /// The masks decltype(make_masks_tuple()) masks_; + // 3 digits per prime, 2 characters for the separator ", ", and the final '\0': + // 3 * sizeof...(SmallPrimes) + 2 * (sizeof...(SmallPrimes) - 1) + 1 + static constexpr std::array str_primes_ = primes_as_string(primes_sequence{}); + static constexpr std::string_view str_primes_view_{str_primes_.cbegin(), std::find(str_primes_.cbegin(), str_primes_.cend(), '\0')}; + }; @@ -372,6 +872,36 @@ constexpr auto bitmask_pack::size() return sizeof...(SmallPrimes); } +template +constexpr std::string_view bitmask_pack::primes_as_string() +{ +// return std::string_view{str_primes_.cbegin(), std::find(str_primes_.cbegin(), str_primes_.cend(), '\0')}; + return str_primes_view_; +} + +template +constexpr auto bitmask_pack::primes_as_string(primes_sequence) +{ + // all of this because std::to_string is not constexpr in C++20 + std::array ret{}; + int i = 0; + char * sep = nullptr; + for(auto p : std::to_array({SmallPrimes...})) { + if(i) { + ret[i++] = ','; + ret[i++] = ' '; + } + char strp[8] = {}; + int j = 7; + for(;p ; p /= 10) { + strp[j--] = '0' + (p % 10); + } + std::copy(strp + j + 1, strp + sizeof(strp), &ret[i]); + i += sizeof(strp) - j - 1; + } + return ret; +} + template constexpr auto const & bitmask_pack::get_all() const { @@ -430,43 +960,144 @@ constexpr bool bitmask_pack::is_prime(uint8_t n) template using bitmask = bitmask_impl; -inline constexpr int8_t adjt[8][14] = { - {0, 4, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2}, - {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 4, 0, 4, 2}, - {0, 4, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2}, - {0, 2, 2, 0, 4, 0, 0, 2, 0, 0, 2, 0, 4, 2}, - {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 4, 0, 2, 2}, - {0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 4}, - {0, 2, 4, 0, 4, 0, 0, 2, 0, 0, 2, 0, 2, 2}, - {0, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 4} - }; -template -constexpr auto -find_first_multiple_above(U p, V n0) +template +class bucket { - const auto p2 = p * p; - constexpr auto c_max = std::numeric_limits::max(); - U dp2; - auto c = (p2 >= n0) ? p2 : (((dp2 = (n0 - p2 + 2 * p - 1)/(2 * p)*(2 * p)), c_max - p2 < dp2) ? 0 : p2 + dp2); - if(!c) { - return c; - } - auto cmod30 = c % 30; - switch(cmod30) { - case 3: case 5: case 9: case 15: case 21: case 25: case 27: { - auto dc = adjt[(p % 30) * 4 / 15][cmod30 >> 1] * p; - c = (c_max - c < dc) ? 0 : c + dc; - cmod30 = c % 30; +public: + // Should be sufficient because if not, it means someone created 500MiB bitmap + // i.e. a bitmap representing a range of length 2^32/8*30 = 16'106'127'360 which + // is way too much. + using offset_type = std::uint32_t; + using offsets_type = std::vector>; + + constexpr bucket(offsets_type & scratchOffsets, std::size_t initialCapacity); + constexpr std::size_t size() const; + constexpr bool is_empty() const; + constexpr void add(PrimeT prime, offset_type cOffs, uint8_t whpos); + /// Returns the primes in this bucket + constexpr std::vector const & primes() const; + template + constexpr offsets_type const & compute_offsets(V n0, V ne, std::size_t bmpSize); + constexpr void clear(); +private: + std::vector primes_; + offsets_type first_multiples_offsets_; + std::vector whpos_; + offsets_type & scratch_offsets_; +}; + +template +constexpr +bucket::bucket(offsets_type & scratchOffsets, std::size_t initialCapacity) + : scratch_offsets_(scratchOffsets) +{ + primes_.reserve(initialCapacity); + first_multiples_offsets_.reserve(initialCapacity); + whpos_.reserve(initialCapacity); +} + +template +constexpr void +bucket::clear() +{ + primes_.clear(); + first_multiples_offsets_.clear(); + whpos_.clear(); +} + +template +constexpr std::size_t +bucket::size() const +{ + return primes_.size(); +} + +template +constexpr bool +bucket::is_empty() const +{ + return primes_.empty(); +} + +template +constexpr void +bucket::add(PrimeT prime, offset_type cOffs, uint8_t whpos) +{ + primes_.push_back(prime); + first_multiples_offsets_.push_back(cOffs); + whpos_.push_back(whpos); +} + +template +constexpr std::vector const & +bucket::primes() const +{ + return primes_; +} + + +template +template +constexpr bucket::offsets_type const & +bucket::compute_offsets(V n0, V ne, std::size_t bmpSize) +{ + LFP_LOG("bucket " << reinterpret_cast(this) + << ": computing offsets, number of primes: " << primes_.size() + << ", first prime: " << (!primes_.empty() ? primes_.front() : 0) + << ", last prime: " << (!primes_.empty() ? primes_.back() : 0)); + + scratch_offsets_.clear(); + + for(std::size_t k{}, kmax{primes_.size()}; k < kmax; ++k) { + auto p = primes_[k]; + auto start = first_multiples_offsets_[k]; + LFP_ASSERT(start < bmpSize); + auto whpos = whpos_[k]; + auto c = value_at(n0, start); + std::array offsets; + auto offsCount = compute_remaining_offsets(start, offsets, p, n0, ne, c, whpos); + // If things are done correctly the following condition is true most of the time + // because buckets are for big primes (someone like 509 has nothing to do here, + // whereas 12503 is welcome). + if(!offsCount) { + scratch_offsets_.push_back(start); + continue; + } + /// @todo: this is duplicated code, have a look at the end of inner_sieve + auto i = start; + if(offsCount == offsets.size()) { + // offsets are periodic, period is 8p + // @todo: can there be an overflow when adding 8*p? + for(; i + 8 * p < bmpSize; i += 8 * p) { + scratch_offsets_.push_back(i); + scratch_offsets_.push_back(i + offsets[0]); + scratch_offsets_.push_back(i + offsets[1]); + scratch_offsets_.push_back(i + offsets[2]); + scratch_offsets_.push_back(i + offsets[3]); + scratch_offsets_.push_back(i + offsets[4]); + scratch_offsets_.push_back(i + offsets[5]); + scratch_offsets_.push_back(i + offsets[6]); + } + } + const auto i0 = i; + for(auto j = 0; i < bmpSize; i = i0 + offsets[j], ++j) { + scratch_offsets_.push_back(i); + if(j == offsCount) { + break; + } } - break; - default:; } - return c; + + LFP_LOG("bucket " << reinterpret_cast(this) << ": found " << scratch_offsets_.size() << " offsets"); + + return scratch_offsets_; } + template class PrimesIterator; + class Bitmap { public: @@ -477,11 +1108,18 @@ class Bitmap constexpr void assign(uint64_t n0, std::size_t size); constexpr std::size_t size() const; constexpr std::size_t indexOf(uint64_t val) const; + constexpr value_type first_value() const; + constexpr value_type last_value() const; + constexpr value_type value_at(std::size_t idx) const; constexpr void reset(std::size_t index); template constexpr void apply(bitmask_impl const & mask); template constexpr void apply(bitmask_pack const & maskPack); + template + constexpr void apply_and_clear(bucket & bkt); + template + constexpr void apply(bucket & bkt); constexpr uint64_t popcount() const; void check() const; constexpr uint8_t at(std::size_t index) const; @@ -490,7 +1128,8 @@ class Bitmap private: template static constexpr std::size_t indexInResidues(Int); - + static constexpr value_type value_at_impl(value_type n0, std::size_t offs); + struct mask_application_data { std::size_t first_composite_; std::size_t first_composite_index_; @@ -504,7 +1143,10 @@ class Bitmap std::vector> vec_; std::size_t size_; - uint64_t n0_; + /// First value + value_type n0_; + /// Last value + value_type ne_; static constexpr std::array d_{1,0,5,4,3,2,1,0,3,2,1,0,1,0,3,2,1,0,1,0,3,2,1,0,5,4,3,2,1,0}; static constexpr std::array residues_{1,7,11,13,17,19,23,29}; static constexpr std::array deltas_{6,4,2,4,2,4,6,2}; @@ -520,6 +1162,7 @@ constexpr Bitmap::Bitmap() : size_(0) , n0_(0) + , ne_(0) {} constexpr @@ -528,24 +1171,34 @@ Bitmap::Bitmap(uint64_t n0, std::size_t size) ~decltype(vec_)::value_type{}) , size_(size) , n0_(n0 + d_[n0 % 30]) + , ne_(size_ ? value_at_impl(n0_, size_ - 1) : n0_) { + LFP_ASSERT(size_ >= 1); + if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } + + LFP_LOG("constructed bitmap " << obj_addr(this) << ", [" << n0_ << ", " << ne_ << "], size = " << size_); } constexpr void Bitmap::assign(uint64_t n0, std::size_t size) { + LFP_LOG("assigning bitmap " << obj_addr(this)); + vec_.assign((size + NumLim::digits - 1)/NumLim::digits, ~decltype(vec_)::value_type{}); size_ = size; n0_ = n0 + d_[n0 % 30]; + ne_ = size_ ? value_at_impl(n0_, size_ - 1) : n0_; if(size % NumLim::digits) { vec_.back() &= ~ElemType{} << (NumLim::digits - size % NumLim::digits); } + + LFP_LOG("assigned bitmap " << obj_addr(this) << ": [" << n0_ << ", " << ne_ << "], size = " << size_); } constexpr @@ -557,11 +1210,39 @@ Bitmap::size() const constexpr std::size_t -Bitmap::indexOf(uint64_t val) const +Bitmap::indexOf(value_type val) const +{ + return index_of(val, n0_); +} + +constexpr +Bitmap::value_type +Bitmap::first_value() const +{ + return n0_; +} + +constexpr +Bitmap::value_type +Bitmap::last_value() const +{ + return ne_; +} + +constexpr +Bitmap::value_type +Bitmap::value_at(std::size_t idx) const +{ + return value_at_impl(n0_, idx); +} + +constexpr +Bitmap::value_type +Bitmap::value_at_impl(value_type n0, size_t idx) { - auto valmod30idx = (val % 30) * 4 / 15; - auto n0mod30idx = (n0_ % 30) * 4 /15; - return (val - n0_) / 30 * 8 + ((valmod30idx >= n0mod30idx) ? valmod30idx - n0mod30idx : 8 + valmod30idx - n0mod30idx); + auto n0res = n0 % 30; + auto nxres = residues_[(n0res * 4 / 15 + idx) % 8]; + return n0 + (idx / 8) * 30 + ((nxres >= n0res) ? nxres - n0res : 30 + nxres - n0res ); } constexpr @@ -652,7 +1333,7 @@ Bitmap::compute_mask_application_data(bitmask_impl const & bmk) mask_application_data mappData{0, nopos_, nopos_, nopos_}; const auto p = bmk.prime(); - auto c = find_first_multiple_above(p, n0_); + auto c = find_first_multiple_ae(p, n0_); if(!c) { return mappData; } @@ -672,8 +1353,12 @@ template constexpr void Bitmap::apply(bitmask_impl const & bmk) { + LFP_LOG("bitmap " << obj_addr(this) << ": about to apply precomputed bitmask (prime = " << bmk.prime() << ")"); + auto mappData = compute_mask_application_data(bmk); apply(bmk, mappData, size()); + + LFP_LOG("bitmap " << obj_addr(this) << ": applied precomputed bitmask (prime = " << bmk.prime() << ")"); } @@ -730,6 +1415,10 @@ template constexpr void Bitmap::apply(bitmask_pack const & maskPack) { + LFP_LOG_S("bitmap " << obj_addr(this) << ": about to apply pack of bitmasks for primes {" + << maskPack.primes_as_string() << "}", + "bitmap " << obj_addr(this) << ": pack of bitmasks applied"); + constexpr auto masksCount = maskPack.size(); // We apply each mask individualy until we reach an offset where they can be applied // all together. @@ -777,63 +1466,54 @@ void Bitmap::apply(bitmask_pack const & maskPack) } } +template +constexpr void +Bitmap::apply_and_clear(bucket & bkt) +{ + apply(bkt); + bkt.clear(); +} -inline constexpr uint8_t wheel[8][8] = { - {6,4,2,4,2,4,6,2}, - {4,2,4,2,4,6,2,6}, - {2,4,2,4,6,2,6,4}, - {4,2,4,6,2,6,4,2}, - {2,4,6,2,6,4,2,4}, - {4,6,2,6,4,2,4,2}, - {6,2,6,4,2,4,2,4}, - {2,6,4,2,4,2,4,6} -}; -inline constexpr uint8_t whoffs[8][8] = { - {0, 1, 2, 3, 4, 5, 6, 7}, - {2, 7, 5, 4, 1, 0, 6, 3}, - {0, 2, 6, 4, 7, 5, 1, 3}, - {6, 2, 1, 5, 4, 0, 7, 3}, - {2, 6, 7, 3, 4, 0, 1, 5}, - {0, 6, 2, 4, 1, 3, 7, 5}, - {6, 1, 3, 4, 7, 0, 2, 5}, - {0, 7, 6, 5, 4, 3, 2, 1} -}; +template +constexpr void +Bitmap::apply(bucket & bkt) +{ + /// @todo: Do we really have to have this static assert? It seems logical because + /// base primes ought to be smaller than the ones being hunted... + static_assert(sizeof(PrimeT) <= sizeof(value_type)); + if(bkt.is_empty()) { + return; + } -// Returns the smallest integer coprime to 30 and greater than or equal to @param n0 -template -constexpr -Ret compute_gte_coprime(U n0) -{ - constexpr uint8_t dn[30] = { - 1, 0, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0, 1, 0, 3, - 2, 1, 0, 1, 0, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0 - }; + LFP_LOG("bitmap " << obj_addr(this) << ": about to apply bucket " << obj_addr(&bkt)); - return Ret{n0} + dn[n0 % 30]; -} + auto const & offsets = bkt.compute_offsets(n0_, ne_, size_); + LFP_LOG("bitmap " << obj_addr(this) << ": reseting " << offsets.size() << " bit(s)"); -// Returns the largest ineteger coprime to 30 and less than @param n1 -template -constexpr -Ret compute_lt_coprime(U n1) -{ - constexpr uint8_t dn[30] = { - 1, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 1, - 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6 - }; + for(auto offsPtr = offsets.data(), offsPtrEnd = offsPtr + offsets.size(); + offsPtr != offsPtrEnd; + ++offsPtr) { + reset(*offsPtr); + } - return Ret{n1} - dn[n1 % 30]; + LFP_LOG("bitmap " << obj_addr(this) << ": bucket " << obj_addr(&bkt) << " applied"); } -template -inline constexpr std::array primesBelowSix = {2, 3, 5}; -template +struct sieve_data +{ + Bitmap * bitmap_ = nullptr; + bool have_to_initialize_bitmap_ = true; + bool have_to_ignore_bucketable_primes_ = false; +}; + + +template constexpr auto -inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool initBmp = true) +inner_sieve(BP const & basePrimes, U n0, U n1, Func ff, sieve_data const & sievdat) { auto const & tinyPrimes = primesBelowSix; if((n0 >= n1) || (n0 > std::numeric_limits::max() - 2)) { @@ -853,95 +1533,93 @@ inner_sieve(SP const & smallPrimes, U n0, U n1, Func ff, Bitmap & bmp, bool init } } n0 = compute_gte_coprime(n0); - auto ne = compute_lt_coprime(n1); + auto ne = compute_lt_coprime(n1); if(n0 > ne) { return ff(it0, it0, nullptr); } - if(initBmp) { - bmp.assign(n0, (ne - n0)/30 * 8 + ((ne % 30) >= (n0 % 30) ? (ne%30)*4/15 - (n0%30)*4/15 : 8 - (n0%30)*4/15 + (ne%30)*4/15) + 1); + if(sievdat.have_to_initialize_bitmap_) { + sievdat.bitmap_->assign(n0, (ne - n0)/30 * 8 + + ((ne % 30) >= (n0 % 30) + ? (ne % 30) * 4 / 15 - (n0 % 30) * 4 / 15 + : 8 - (n0 % 30) * 4 / 15 + (ne % 30) * 4 / 15) + 1); } + auto & bmp = *sievdat.bitmap_; // Primes below a certain threshold are dealt with by applying precomputed masks to the bitmap constexpr unsigned int lastSmallPrime = 103; constexpr unsigned int smallPrimesThreshold = lastSmallPrime + 1; - if(std::ranges::distance(smallPrimes | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }))) { - bitmask_pack::value_type, + if(std::ranges::distance(basePrimes + | std::views::drop_while([](auto p) { return p < 7; }) + | std::views::take_while([](auto p) { return p < smallPrimesThreshold; }) + )) { + bitmask_pack::value_type, 7, 11, 13, 17, 19, 23, 29, 31> bitmasks_1; - bitmask_pack::value_type, + bitmask_pack::value_type, 37, 41, 43, 47, 53, 59, 61, 67> bitmasks_2; - bitmask_pack::value_type, + bitmask_pack::value_type, 71, 73, 79, 83, 89, 97, 101, lastSmallPrime> bitmasks_3; bmp.apply(bitmasks_1); bmp.apply(bitmasks_2); bmp.apply(bitmasks_3); } - for(auto p : smallPrimes | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; })) { - auto p2 = U{p} * p; - if(p2 > ne) { - break; - } - // Early continue if p has no multiple in the current segment. The first condition is to avoid paying the cost of - // a modulo when there is a higher probability of p having a multiple in the current segment. - // This doesn't worsen things too much in the lower ranges while improving the performances in the higher ranges. - if(p > ne - n0) { - if(auto n0_mod_p = n0 % p; n0_mod_p && (p - n0_mod_p > ne - n0)) { - continue; - } - } - auto c = find_first_multiple_above(decltype(p2){p}, n0); - if(!c) { - continue; - } + for(auto p : basePrimes + | std::views::drop_while([smallPrimesThreshold](auto p) { return p < smallPrimesThreshold; }) + | std::views::take_while([&sievdat](auto p){ + return !(sievdat.have_to_ignore_bucketable_primes_ && (p >= bucketed_primes_threshold())); })) { + auto p2 = U{p} * p; + if(p2 > ne) { + break; + } + // Early continue if p has no multiple in the current segment. The first condition is to avoid paying the cost of + // a modulo when there is a higher probability of p having a multiple in the current segment. + // This doesn't worsen things too much in the lower ranges while improving the performances in the higher ranges. + if(p > ne - n0) { + if(auto n0_mod_p = n0 % p; n0_mod_p && (p - n0_mod_p > ne - n0)) { + continue; + } + } + U c = find_first_multiple_ae(decltype(p2){p}, n0); + if(!c) { + continue; + } + + std::size_t startOffs{noffs}; + std::array offsets; + auto offsetsCount = compute_offsets(startOffs, offsets, p, n0, ne, c); - constexpr auto c_max = std::numeric_limits::max(); - auto count = 0; - int32_t firstIndex = -1; - std::array offsets{}; - auto offsIdx = 0; - auto prevDelta = 0; - for(auto j = whoffs[(p%30)*4/15][(c%30)*4/15]; c <= ne; - c = ((c_max - c < wheel[(p%30)*4/15][j]*p) ? ne + 1 : c + wheel[(p%30)*4/15][j]*p), j = (j+1)%8) { - auto currIdx = bmp.indexOf(c); - if(offsIdx == 0) { - firstIndex = currIdx; - } - - if(offsIdx > 0 && offsIdx <= offsets.size()) { - offsets[offsIdx - 1] = currIdx - firstIndex; - } - ++offsIdx; - if(offsIdx > offsets.size()) { - break; - } - } - if((firstIndex < 0) || (firstIndex >= bmp.size())) { - continue; - } - std::size_t i = firstIndex; - if(offsets[6]) { - for(; i + 8 * p < bmp.size(); i += 8 * p) { - bmp.reset(i); - bmp.reset(i + offsets[0]); - bmp.reset(i + offsets[1]); + if(startOffs == noffs) { + // No multiple of p greater or equal to p^2 (and coprime to 30) in the current segment + continue; + } + // Unroll the loop when possible i.e. when we have a full period + std::size_t i = startOffs; + if(offsetsCount == offsets.size()) { + for(; i + 8 * p < bmp.size(); i += 8 * p) { + bmp.reset(i); + bmp.reset(i + offsets[0]); + bmp.reset(i + offsets[1]); bmp.reset(i + offsets[2]); - bmp.reset(i + offsets[3]); - bmp.reset(i + offsets[4]); - bmp.reset(i + offsets[5]); - bmp.reset(i + offsets[6]); - } - } - const auto i0 = i; - for(auto j = 0; i < bmp.size(); i = i0 + offsets[j], ++j) { - bmp.reset(i); - if((j == offsets.size()) || !offsets[j]) { - break; - } - } + bmp.reset(i + offsets[3]); + bmp.reset(i + offsets[4]); + bmp.reset(i + offsets[5]); + bmp.reset(i + offsets[6]); + } + } + // Deal with the remaining offsets (or the partial period) + const auto i0 = i; + for(auto j = 0; i < bmp.size(); i = i0 + offsets[j], ++j) { + bmp.reset(i); + if(j == offsetsCount) { + break; + } + } } + return ff(it0, std::end(tinyPrimes), &bmp); } + template {}.cbegin())> constexpr std::vector collectSieveResults(It first, It last, Bitmap const * bmp) @@ -1258,10 +1936,11 @@ template inline constexpr auto u16primes = []() { auto sv = [] { Bitmap bmp; + sieve_data sievdat{.bitmap_ = &bmp}; return inner_sieve( u8primes, uint16_t(0), uint16_t(65535), collectSieveResults, - bmp); + sievdat); }; std::array u16primesArr; std::ranges::copy(sv(), std::begin(u16primesArr)); @@ -1269,89 +1948,183 @@ inline constexpr auto u16primes = []() { }(); -template -inline constexpr bool is_one_of_v = (std::is_same_v || ...); - +template +struct segment +{ + U low_; + U high_; + Bitmap * bitmap_; + bucket * bucket_; +}; template constexpr auto sieve(I k0, I k1, Fct ff) { static_assert(is_one_of_v); + int32_t, uint32_t, int64_t, uint64_t, decltype(0ull)>); + if constexpr(std::numeric_limits::is_signed) { - k0 = (std::max)(I{0}, k0); - k1 = (std::max)(I{0}, k1); + k0 = (std::max)(I{0}, k0); + k1 = (std::max)(I{0}, k1); } using U = std::make_unsigned_t; const U n0 = U(k0), n1 = U(k1); + + LFP_LOG("sieving range [" << n0 << ", " << n1 << "["); constexpr U maxn = std::numeric_limits::max(); - constexpr U innerRangeSize = []() { - if constexpr (is_one_of_v) { - return maxn; - } else { - return U{48*1024*1024}; - } }(); - std::vector bitmaps; - bitmaps.reserve((n1 - n0)/innerRangeSize + 1); + constexpr U segmentSize = []() { + if constexpr (is_one_of_v) { + return maxn; + } else { + return U{32*1024*1024}; + } }(); std::vector prefix; prefix.reserve(3); - - - for(auto a0 = n0, a1 = std::min(n1, (maxn - innerRangeSize < n0) ? maxn : U(n0 + innerRangeSize)); + auto const estimatedNumSegments = (n1 - n0) / segmentSize + 1; + std::vector bitmaps; + bitmaps.reserve(estimatedNumSegments); + std::vector> buckets; + buckets.reserve(estimatedNumSegments); + std::vector> segments; + segments.reserve(estimatedNumSegments); + std::vector> scratchOffsets; + scratchOffsets.reserve(262144 * 10); + for(auto a0 = n0, a1 = std::min(n1, (maxn - segmentSize < n0) ? maxn : U(n0 + segmentSize)); a0 < n1; - a0 = (maxn - innerRangeSize < a0) ? maxn : a0 + innerRangeSize, - a1 = std::min(n1, maxn - innerRangeSize < a0 ? maxn : U(a0 + innerRangeSize))) { - - const U rangeSize = [n1](){ - if constexpr (is_one_of_v) { - return (U{1} << (std::numeric_limits::digits / 2)) - 1; - } else { - // Established through tests - if(n1 >= U{55}<<54) { - return U{16*1024*1024}; + a0 = (maxn - segmentSize < a0) ? maxn : a0 + segmentSize, + a1 = std::min(n1, maxn - segmentSize < a0 ? maxn : U(a0 + segmentSize))) { + bitmaps.push_back(Bitmap{}); + buckets.push_back(bucket{scratchOffsets, initial_bucket_capacity}); + segments.emplace_back(segment{ + .low_ = a0, + .high_ = a1, + .bitmap_ = &bitmaps.back(), + .bucket_ = &buckets.back()}); + } + auto const numSegments = segments.size(); + + const U basePrimesRangeSize = [n1](){ + if constexpr (is_one_of_v) { + return (U{1} << (std::numeric_limits::digits / 2)) - 1; + } else { + // Established through tests + if(n1 >= U{55}<<54) { + return U{16*1024*1024}; + } + return (std::min)(U{16*1024*1024}, + U{1} << ((std::bit_width(n1) + 1)/2)); + } }(); + constexpr auto maxm = (U{1} << (std::numeric_limits::digits / 2)) - 1; + + for(U m0 = 0, m1 = basePrimesRangeSize; + (m0 < (U{1} << (std::numeric_limits::digits / 2)) - 1) && (m0 * m0 <= n1); + m0 = m1, + m1 = (maxm - m1 >= basePrimesRangeSize) ? m1 + basePrimesRangeSize : maxm) { + constexpr auto basePrimes = []() { + if constexpr (std::is_same_v || std::is_same_v) { + return u8primes; + } else { + return u16primes; + } + }(); + Bitmap basePrimesBmp; + details::inner_sieve(basePrimes, m0, m1, + [](auto, auto, details::Bitmap const *){ }, sieve_data{ + .bitmap_ = &basePrimesBmp, + .have_to_initialize_bitmap_ = true + }); + + auto pSquared = U{}; + details::PrimesIterator itP{&basePrimesBmp}, itPe{&basePrimesBmp, true}; + for(auto p : std::ranges::subrange(itP, itPe) + | std::views::drop_while([](auto q){ return q < bucketed_primes_threshold(); })) { + pSquared = p * p; + + auto highest = segments.back().high_; + for(std::size_t i{}; i < numSegments;) { + uint8_t pmod30{}, kpmod30{}; + auto kp = find_first_multiple_ae_b(p, segments[i].low_, highest, pSquared, pmod30, kpmod30); + if(!kp) { + break; } - return (std::min)(U{16*1024*1024}, - U{1} << ((std::bit_width(n1) + 1)/2)); - } }(); - constexpr auto maxm = (U{1} << (std::numeric_limits::digits / 2)) - 1; - bitmaps.emplace_back(details::Bitmap{}); - auto & currSegBmp = bitmaps.back(); - - for(U m0 = 0, m1 = rangeSize; - (m0 < (U{1} << (std::numeric_limits::digits / 2)) - 1) && (m0 * m0 <= a1); - m0 = m1, - m1 = (maxm - m1 >= rangeSize) ? m1 + rangeSize : maxm) { - details::Bitmap primesBmp; - constexpr auto basePrimes = []() { - if constexpr (std::is_same_v || std::is_same_v) { - return u8primes; - } else { - return u16primes; - } - }(); - details::inner_sieve(basePrimes, m0, m1, - [](auto, auto, details::Bitmap const *){ }, primesBmp); - details::PrimesIterator itP{&primesBmp}, itPe{&primesBmp, true}; - auto basePrimesRange = std::ranges::subrange(itP, itPe); - details::inner_sieve(basePrimesRange, a0, a1, - [&](auto it, auto ite, details::Bitmap const*){ - if(it != ite) { - prefix = std::vector{it, ite}; - } - return 0; - }, currSegBmp, m0 == 0); + LFP_ASSERT(kp < highest); + i = (kp - n0) / segmentSize; + LFP_ASSERT(i < numSegments); + auto & segment = segments[i]; + auto kpmod30Idx = cp30res_to_idx(kpmod30); + auto indexOfMultipleInBitmap = index_of(kp, kpmod30Idx, compute_gte_coprime(segment.low_)); + segment.bucket_->add(p, indexOfMultipleInBitmap, (cp30res_to_idx(pmod30) << 3) | kpmod30Idx); + if(kp + 2 * p >= highest) { // @todo: check for overflow!!! + break; + } + ++i; + } + } + if(pSquared >= n1) { + break; } } + + U basePrimesUpperBound = [](){ + if constexpr(bucketed_primes_threshold() > (std::numeric_limits::max)()) { + return (std::numeric_limits::max)(); + } else { + return U(bucketed_primes_threshold()); + } + }(); + // @todo: make this static? + auto basePrimesForSegmentSieve = [basePrimesUpperBound](){ + Bitmap bmp; + sieve_data sievdat{.bitmap_ = &bmp}; + return inner_sieve(u16primes, U{}, basePrimesUpperBound, collectSieveResults, sievdat); + }(); + + for(auto & currentSegment : segments) { + details::inner_sieve(basePrimesForSegmentSieve, currentSegment.low_, currentSegment.high_, + [&](auto it, auto ite, details::Bitmap const*){ + if(it != ite) { + prefix = std::vector{it, ite}; + } + return 0; + }, sieve_data{ .bitmap_ = currentSegment.bitmap_, + .have_to_initialize_bitmap_ = true, + .have_to_ignore_bucketable_primes_ = true }); + currentSegment.bitmap_->apply_and_clear(*currentSegment.bucket_); + } + return ff(prefix, bitmaps); } + +template +void partition_range(U n0, U n1, int N, FuncT processRange) +{ + if(N <= 0 || (n0 >= n1)) { + processRange(n0, n0); + } else if((N == 1) || (n1 - n0 < N)) { + processRange(n0, n1); + } else { + auto v = std::views::iota(1, N + 1); + auto weight = std::accumulate(std::begin(v), std::end(v), 0.0, + [](double x, auto k){ return x + 1.0 / std::sqrt(k); + }); + auto c = (n1 - n0) / weight; + auto current = n0; + std::for_each(std::begin(v), std::end(v), [&](auto k){ + auto ni = (k == N) ? n1 : current + c / std::sqrt(k); + processRange(current, ni); + current = ni; + }); + } +} + } // namespace details inline unsigned int -Threads::defaultCount() +threads::default_count() { static auto v = []() { auto c = std::thread::hardware_concurrency(); @@ -1360,15 +2133,15 @@ Threads::defaultCount() return v; } -inline Threads::Threads() - : count_(Threads::defaultCount()) +inline threads::threads() + : count_(threads::default_count()) {} -inline Threads::Threads(unsigned int numThreads) +inline threads::threads(unsigned int numThreads) : count_(numThreads ? numThreads : 1) {} -inline unsigned int Threads::count() const +inline unsigned int threads::count() const { return count_; } @@ -1395,36 +2168,34 @@ sieve_to_vector(U n0, U n1) template SieveResults -sieve(U n0, U n1, Threads const & threads) +sieve(U n0, U n1, threads const & threads) { if(threads.count() == 1 || (n1 <= n0) || (n1 - n0) <= threads.count()) { return sieve(n0, n1); } - auto numThreads = threads.count(); std::vector>> results; std::vector prefix; - for(auto k = n0, dk = (n1 - n0) / numThreads, ek = (n1 - n0) % numThreads; k < n1; ek = ek ? ek - 1 : 0) { - auto kmax = k + dk + (ek ? 1 : 0); + details::partition_range(n0, n1, threads.count(), [&](U v0, U v1) { results.emplace_back(std::async(std::launch::async, - [&prefix](U v0, U v1){ + [v0, v1, &prefix](){ return details::sieve(v0, v1, - [&prefix](auto & pref, std::vector & bmps) { + [&](auto & pref, std::vector & bmps) { if(!pref.empty()) { prefix = std::move(pref); } return std::move(bmps); }); - }, k, kmax)); - k = kmax; - } + })); + }); std::vector bmps = - std::accumulate(std::begin(results), std::end(results), std::vector{}, - [](auto x, auto & y) { - for(auto & b : y.get()) { - x.emplace_back(std::move(b)); - } - return std::move(x); - }); + std::accumulate(std::begin(results), std::end(results), + std::vector{}, + [](auto x, auto & y) { + for(auto & b : y.get()) { + x.emplace_back(std::move(b)); + } + return std::move(x); + }); return SieveResults{std::move(prefix), std::move(bmps)}; } @@ -1433,7 +2204,11 @@ template constexpr std::size_t count_primes(U n0, U n1) { std::size_t count = 0; - constexpr auto rangeSize = 24*1024*1024; + const auto rangeSize = [n1]() { + if(n1 < (1ull << 32)) return 24*1024*1024; + if(n1 < (1ull << 36)) return 64*1024*1024; + return 256*1024*1024; + } (); constexpr auto maxn = std::numeric_limits::max(); for(auto a0 = n0, a1 = std::min(n1, (maxn - rangeSize < n0) ? maxn : n0 + rangeSize); a0 < n1; @@ -1445,21 +2220,19 @@ constexpr std::size_t count_primes(U n0, U n1) } template -std::size_t count_primes(U n0, U n1, Threads const & threads) +std::size_t count_primes(U n0, U n1, threads const & threads) { if(n0 >= n1) { return 0; } auto numThreads = threads.count(); - if(n1 - n0 < numThreads) { + if(n1 - n0 <= numThreads) { return count_primes(n0, n1); } std::vector> results; - for(auto k = n0, dk = (n1 - n0) / numThreads, ek = (n1 - n0) % numThreads; k < n1; ek = ek ? ek - 1 : 0) { - auto kmax = k + dk + (ek ? 1 : 0); - results.emplace_back(std::async(std::launch::async, static_cast(count_primes), k, kmax)); - k = kmax; - } + details::partition_range(n0, n1, numThreads, [&results](U v0, U v1) { + results.emplace_back(std::async(std::launch::async, static_cast(count_primes), v0, v1)); + }); return std::accumulate(std::begin(results), std::end(results), std::size_t{}, [](auto x, auto & y) { return x + y.get(); }); } diff --git a/main.cpp b/main.cpp index cd94583..c3a08f6 100644 --- a/main.cpp +++ b/main.cpp @@ -36,7 +36,7 @@ int main(int argc, char** argv) istr1 >> n1; auto const startt = std::chrono::steady_clock::now(); - auto numPrimes = lfp::count_primes(n0, n1, lfp::Threads{numThreads < 0 ? 1u : (unsigned int)numThreads}); + auto numPrimes = lfp::count_primes(n0, n1, lfp::threads{numThreads < 0 ? 1u : (unsigned int)numThreads}); auto const endt = std::chrono::steady_clock::now(); std::cout << "The number of prime numbers in range [" << n0 << ", " << n1 << "[ is " diff --git a/makefile b/makefile index 2e2a8de..9d1d50d 100644 --- a/makefile +++ b/makefile @@ -4,8 +4,10 @@ # Creation date: december 2024. CXX ?= g++ -CXFLAGS ?= -std=c++20 -O3 -march=native $(ADDITIONAL_CXFLAGS) +CXFLAGS ?= -std=c++20 -O3 -march=native -DNDEBUG $(ADDITIONAL_CXFLAGS) STATIC_TESTS_CXFLAGS ?= +# By default we activate assertions during testing +DYNAMIC_TESTS_FLAGS ?= -UNDEBUG SYNTAX_ONLY_FLAG=-fsyntax-only TEST_LIBS ?= -lCatch2Main -lCatch2 TEST_LIB_INSTALL ?= ./Catch2-install @@ -25,7 +27,7 @@ static_tests_%: static_tests_%.cpp lfp.hpp $(CXX) $(CXFLAGS) $(SYNTAX_ONLY_FLAG) $(STATIC_TESTS_CXFLAGS) $< tests: tests.cpp lfp.hpp - $(CXX) $(CXFLAGS) $(TEST_LIB_INCL_OPT) -o $@ $< $(TEST_LIB_LIB_OPT) $(TEST_LIBS) + $(CXX) $(CXFLAGS) $(DYNAMIC_TESTS_FLAGS) $(TEST_LIB_INCL_OPT) -o $@ $< $(TEST_LIB_LIB_OPT) $(TEST_LIBS) all: static_tests lfp tests diff --git a/static_tests_2.cpp b/static_tests_2.cpp index 2c58d67..a14433e 100644 --- a/static_tests_2.cpp +++ b/static_tests_2.cpp @@ -25,14 +25,14 @@ static_assert(sieve_to_vector(uint32_t(0), uint32_t(3)) == std::vector static_assert(sieve_to_vector(uint32_t(262121), uint32_t(262144)) == std::vector{262121, 262127, 262133, 262139}); static_assert(sieve_to_vector(uint32_t(1048576), uint32_t(1048700)) == std::vector{1048583, 1048589, 1048601, 1048609, 1048613, 1048627, 1048633, 1048661, 1048681}); -static_assert(sieve_to_vector(uint32_t(61075016), uint32_t(61075116)) == std::vector{61075019, 61075037, 61075057, 61075061, - 61075087, 61075099, 61075103, 61075109, 61075111}); static_assert(sieve_to_vector(uint64_t(1000000), uint64_t(1000010)) == std::vector{1000003}); // The following tests make g++-12.2 stall while g++-13.3 has no issue compiling this file. // They are active by default, this allows to deactivate them. #ifndef LFP_TESTS_DISABLE_STATIC_ASSERTS_IN_HIGHER_RANGES +static_assert(sieve_to_vector(uint32_t(61075016), uint32_t(61075116)) == std::vector{61075019, 61075037, 61075057, 61075061, + 61075087, 61075099, 61075103, 61075109, 61075111}); static_assert(sieve_to_vector(uint32_t(1074041825), uint32_t(1074041924)) == std::vector{1074041849, 1074041869}); static_assert(sieve_to_vector(uint32_t(4294967196), uint32_t(4294967295)) == std::vector{4294967197, 4294967231, 4294967279, 4294967291}); diff --git a/tests.cpp b/tests.cpp index c132f64..d92275b 100644 --- a/tests.cpp +++ b/tests.cpp @@ -166,13 +166,15 @@ TEST_CASE("Sieve of Erathostenes - primes iterator and range") { using namespace lfp::details; { Bitmap bmp; - inner_sieve(u8primes, 300u, 400u, [](auto, auto, auto) {}, bmp); + sieve_data sievdat{.bitmap_ = &bmp}; + inner_sieve(u8primes, 300u, 400u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(300, 400))); } { Bitmap bmp; - inner_sieve(u16primes, 10000u, 12000u, [](auto, auto, auto) {}, bmp); + sieve_data sievdat{.bitmap_ = &bmp}; + inner_sieve(u16primes, 10000u, 12000u, [](auto, auto, auto) {}, sievdat); PrimesIterator it{&bmp}, ite{&bmp, true}; CHECK_THAT(std::vector(it, ite), Equals(primes_by_division(10000, 12000))); } @@ -218,12 +220,23 @@ TEST_CASE("Sieve of Erathostenes - above 2^32 - #2") { } TEST_CASE("Sieve of Erathostenes - multithreaded sieve") { - CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::Threads{2}).count(), equals(78498)); - CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1'000'000'000), lfp::Threads{4}).count(), equals(50847534)); - CHECK_THAT(lfp::sieve(641*641, 8191*8191+1, lfp::Threads{4}).count(), equals(3922190)); + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::threads{1}).count(), equals(78498)); + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1000000), lfp::threads{2}).count(), equals(78498)); + CHECK_THAT(lfp::sieve(uint64_t(0), uint64_t(1'000'000'000), lfp::threads{4}).count(), equals(50847534)); + CHECK_THAT(lfp::sieve(641*641, 8191*8191+1, lfp::threads{4}).count(), equals(3922190)); } TEST_CASE("Sieve of Erathostenes - misc - #1") { CHECK_THAT(lfp::count_primes(uint64_t(1'005'000'000'000), uint64_t(1'006'250'000'000)), equals(45228966)); } +TEST_CASE("Sieve of Erathostenes - misc - #2") { + auto primes = lfp::sieve_to_vector(10'000'000'000ull, 10'000'140'000ull); + CHECK_THAT(primes.size(), equals(6073)); + CHECK_THAT(primes.back(), equals(10'000'139'969)); + CHECK_THAT(primes[primes.size() - 2], equals(10'000'139'953)); + CHECK_THAT(primes.front(), equals(10'000'000'019)); + // 3037th prime after 10^10 + CHECK_THAT(primes[primes.size() / 2], equals(10'000'070'131)); +} + diff --git a/tperf.sh b/tperf.sh index fb2d163..ecdbbde 100755 --- a/tperf.sh +++ b/tperf.sh @@ -17,7 +17,7 @@ ranges=(0 $((10**9))\ 18446744063709551616 18446744073709551615\ ) -threads_c=(1 4 8 16 32 48) +threads_c=(1 4 8 16 32 48 64) compute_avg() { local values=("$@")