diff --git a/include/gerbil/Bundle.h b/include/gerbil/Bundle.h index 7480d6d..9606e25 100644 --- a/include/gerbil/Bundle.h +++ b/include/gerbil/Bundle.h @@ -24,6 +24,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "KMer.h" + namespace gerbil { @@ -107,6 +108,8 @@ namespace gerbil { uint64 sMerNumber; // number of s-mers uint64 kMerNumber; // number of k-mers + + SuperBundle(); ~SuperBundle(); @@ -123,6 +126,8 @@ namespace gerbil { bool merge(const SuperBundle &sb); + + //debugging, working only with "add" uint32 getSize() { return _next - data + 1; diff --git a/include/gerbil/CpuHasher.h b/include/gerbil/CpuHasher.h index 2c0f1aa..809efd1 100644 --- a/include/gerbil/CpuHasher.h +++ b/include/gerbil/CpuHasher.h @@ -453,16 +453,17 @@ namespace gerbil { // preparation next bin if (notEmpty) { binKMers = 0; + std::cout << binUKMers << " load in " << curTempFileId << std::endl; binUKMers = 0; binFKMers = 0; ratio = distributor->getSplitRatio(false, tId, curTempFileId); //printf("cpu hasher thread %i: my ratio is %f\n", tId, ratio); curTempFileId = kmb->getTempFileId(); curTempRun = kmb->getTempFileRun(); - apprUKMersNumber = _tempFiles[curTempFileId].approximateUniqueKmers( - kMersNumber ? (double) uKMersNumber / kMersNumber : START_RATIO) * ratio / _tempFiles[curTempFileId].getNumberOfRuns(); + apprUKMersNumber = _tempFiles[curTempFileId].approximateUniqueKmers(kMersNumber ? (double) uKMersNumber / kMersNumber : START_RATIO) * ratio / _tempFiles[curTempFileId].getNumberOfRuns(); // lolz put here approximation of kmers + //std::cout << "approx load" << apprUKMersNumber << " id " << curTempFileId << std::endl; //std::cout << "curTempFileId : " << curTempFileId << " curTempRun: " << curTempRun << std::endl; - KMCHT_setPartSize(); + KMCHT_setPartSize(); // lolz // reset timer duration = ms(0); } diff --git a/include/gerbil/KmerHasher.h b/include/gerbil/KmerHasher.h index 002fcac..b765d6a 100644 --- a/include/gerbil/KmerHasher.h +++ b/include/gerbil/KmerHasher.h @@ -104,7 +104,7 @@ namespace gerbil { if (_norm) { // use normalized kmers - processThreadSplit(tId, cpuKMerQueues, gpuKMerQueues); + processThreadSplit(tId, cpuKMerQueues, gpuKMerQueues); /// lolz push here stats } else { // do not use normalized kmers processThreadSplit(tId, cpuKMerQueues, gpuKMerQueues); @@ -130,7 +130,7 @@ namespace gerbil { cpu::HasherTask cpuHasher(_numCPUHasher, _distributor, &_kmcSyncSwapQueue, _tempFiles, _thresholdMin, - _maxKmcHashtableSize, _tempFolder); + _maxKmcHashtableSize, _tempFolder); /// lolz??? // create Hash Tables (GPU-side) gpu::HasherTask gpuHasher(_numGPUHasher, @@ -159,7 +159,7 @@ namespace gerbil { } // start splitting - processSplit(cpuKMerQueues, gpuKMerQueues); + processSplit(cpuKMerQueues, gpuKMerQueues); /// lolz push here stats // start hashing cpuHasher.hash(cpuKMerQueues); @@ -203,6 +203,7 @@ namespace gerbil { _btUKMersNumberCPU = cpuHasher.getBtUKMersNumber(); _kMersNumberGPU = gpuHasher.getKMersNumber(); _uKMersNumberGPU = gpuHasher.getUKMersNumber(); + std::cout << _uKMersNumberGPU << std::endl; _btUKMersNumberGPU = gpuHasher.getBtUKMersNumber(); }); diff --git a/include/gerbil/TempFile.h b/include/gerbil/TempFile.h index 528ad14..624585f 100644 --- a/include/gerbil/TempFile.h +++ b/include/gerbil/TempFile.h @@ -23,6 +23,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define BINFILE_H_ #include "Bundle.h" +#include "hyperloglog.h" namespace gerbil { @@ -42,6 +43,8 @@ namespace gerbil { uint64 _numberOfRuns; // number of runs + HyperLogLog *hll; // HyperLogLog data structure for load estimation + public: TempFile(); @@ -90,6 +93,9 @@ namespace gerbil { inline uint64 TempFile::approximateUniqueKmers(const double ratio) const { return _kmers * ratio; + // hll = new HyperLogLog(HLL_SIZE); + //return cardinality = (uint64)hll.estimate() + 1; + // delete hll; } inline const uint64 &TempFile::getKMersNumber() const { diff --git a/include/gerbil/config.h b/include/gerbil/config.h index e3543af..4f20c9c 100644 --- a/include/gerbil/config.h +++ b/include/gerbil/config.h @@ -104,7 +104,7 @@ namespace gerbil { #define DEF_TEMPFILES_NUMBER 512 #define MIN_TEMPFILES_NUMBER 2 -#define MAX_TEMPFILES_NUMBER 4 * 1024 +#define MAX_TEMPFILES_NUMBER 4 * 1024 * 100 #define DEF_THREADS_NUMBER 8 #define MIN_THREADS_NUMBER 4 diff --git a/include/gerbil/hyperloglog.h b/include/gerbil/hyperloglog.h new file mode 100644 index 0000000..22bd8f8 --- /dev/null +++ b/include/gerbil/hyperloglog.h @@ -0,0 +1,374 @@ +#ifndef HYPERLOGLOG_HPP +#define HYPERLOGLOG_HPP + +/** + * @file hyperloglog.hpp + * @brief HyperLogLog cardinality estimator + * @date Created 2013/3/20 + * @author Hideaki Ohno + */ + +#include +#include +#include +#include +#include +#include "./murmur3.h" + +#define HLL_HASH_SEED 313 + +#if defined(__has_builtin) && (defined(__GNUC__) || defined(__clang__)) + +#define _GET_CLZ(x, b) (uint8_t)std::min(b, ::__builtin_clz(x)) + 1 + +#else + +inline uint8_t _get_leading_zero_count(uint32_t x, uint8_t b) { + +#if defined (_MSC_VER) + uint32_t leading_zero_len = 32; + ::_BitScanReverse(&leading_zero_len, x); + --leading_zero_len; + return std::min(b, (uint8_t)leading_zero_len); +#else + uint8_t v = 1; + while (v <= b && !(x & 0x80000000)) { + v++; + x <<= 1; + } + return v; +#endif + +} +#define _GET_CLZ(x, b) _get_leading_zero_count(x, b) +#endif /* defined(__GNUC__) */ + +namespace hll { + +static const double pow_2_32 = 4294967296.0; ///< 2^32 +static const double neg_pow_2_32 = -4294967296.0; ///< -(2^32) + +/** @class HyperLogLog + * @brief Implement of 'HyperLogLog' estimate cardinality algorithm + */ +class HyperLogLog { +public: + + /** + * Constructor + * + * @param[in] b bit width (register size will be 2 to the b power). + * This value must be in the range[4,30].Default value is 4. + * + * @exception std::invalid_argument the argument is out of range. + */ + HyperLogLog(uint8_t b = 4) throw (std::invalid_argument) : + b_(b), m_(1 << b), M_(m_, 0) { + + if (b < 4 || 30 < b) { + throw std::invalid_argument("bit width must be in the range [4,30]"); + } + + double alpha; + switch (m_) { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / m_); + break; + } + alphaMM_ = alpha * m_ * m_; + } + + /** + * Adds element to the estimator + * + * @param[in] str string to add + * @param[in] len length of string + */ + void add(const char* str, uint32_t len) { + uint32_t hash; + MurmurHash3_x86_32(str, len, HLL_HASH_SEED, (void*) &hash); + uint32_t index = hash >> (32 - b_); + uint8_t rank = _GET_CLZ((hash << b_), 32 - b_); + if (rank > M_[index]) { + M_[index] = rank; + } + } + + /** + * Estimates cardinality value. + * + * @return Estimated cardinality value. + */ + double estimate() const { + double estimate; + double sum = 0.0; + for (uint32_t i = 0; i < m_; i++) { + sum += 1.0 / (1 << M_[i]); + } + estimate = alphaMM_ / sum; // E in the original paper + if (estimate <= 2.5 * m_) { + uint32_t zeros = 0; + for (uint32_t i = 0; i < m_; i++) { + if (M_[i] == 0) { + zeros++; + } + } + if (zeros != 0) { + estimate = m_ * std::log(static_cast(m_)/ zeros); + } + } else if (estimate > (1.0 / 30.0) * pow_2_32) { + estimate = neg_pow_2_32 * log(1.0 - (estimate / pow_2_32)); + } + return estimate; + } + + /** + * Merges the estimate from 'other' into this object, returning the estimate of their union. + * The number of registers in each must be the same. + * + * @param[in] other HyperLogLog instance to be merged + * + * @exception std::invalid_argument number of registers doesn't match. + */ + void merge(const HyperLogLog& other) throw (std::invalid_argument) { + if (m_ != other.m_) { + std::stringstream ss; + ss << "number of registers doesn't match: " << m_ << " != " << other.m_; + throw std::invalid_argument(ss.str().c_str()); + } + for (uint32_t r = 0; r < m_; ++r) { + if (M_[r] < other.M_[r]) { + M_[r] |= other.M_[r]; + } + } + } + + /** + * Clears all internal registers. + */ + void clear() { + std::fill(M_.begin(), M_.end(), 0); + } + + /** + * Returns size of register. + * + * @return Register size + */ + uint32_t registerSize() const { + return m_; + } + + /** + * Exchanges the content of the instance + * + * @param[in,out] rhs Another HyperLogLog instance + */ + void swap(HyperLogLog& rhs) { + std::swap(b_, rhs.b_); + std::swap(m_, rhs.m_); + std::swap(alphaMM_, rhs.alphaMM_); + M_.swap(rhs.M_); + } + + /** + * Dump the current status to a stream + * + * @param[out] os The output stream where the data is saved + * + * @exception std::runtime_error When failed to dump. + */ + void dump(std::ostream& os) const throw(std::runtime_error){ + os.write((char*)&b_, sizeof(b_)); + os.write((char*)&M_[0], sizeof(M_[0]) * M_.size()); + if(os.fail()){ + throw std::runtime_error("Failed to dump"); + } + } + + /** + * Restore the status from a stream + * + * @param[in] is The input stream where the status is saved + * + * @exception std::runtime_error When failed to restore. + */ + void restore(std::istream& is) throw(std::runtime_error){ + uint8_t b = 0; + is.read((char*)&b, sizeof(b)); + HyperLogLog tempHLL(b); + is.read((char*)&(tempHLL.M_[0]), sizeof(M_[0]) * tempHLL.m_); + if(is.fail()){ + throw std::runtime_error("Failed to restore"); + } + swap(tempHLL); + } + +protected: + uint8_t b_; ///< register bit width + uint32_t m_; ///< register size + double alphaMM_; ///< alpha * m^2 + std::vector M_; ///< registers +}; + +/** + * @brief HIP estimator on HyperLogLog counter. + */ +class HyperLogLogHIP : public HyperLogLog { +public: + + /** + * Constructor + * + * @param[in] b bit width (register size will be 2 to the b power). + * This value must be in the range[4,30].Default value is 4. + * + * @exception std::invalid_argument the argument is out of range. + */ + HyperLogLogHIP(uint8_t b = 4) throw (std::invalid_argument) : HyperLogLog(b), register_limit_((1 << 5) - 1), c_(0.0), p_(1 << b) { + } + + /** + * Adds element to the estimator + * + * @param[in] str string to add + * @param[in] len length of string + */ + void add(const char* str, uint32_t len) { + uint32_t hash; + MurmurHash3_x86_32(str, len, HLL_HASH_SEED, (void*) &hash); + uint32_t index = hash >> (32 - b_); + uint8_t rank = _GET_CLZ((hash << b_), 32 - b_); + rank = rank == 0 ? register_limit_ : std::min(register_limit_, rank); + const uint8_t old = M_[index]; + if (rank > old) { + c_ += 1.0 / (p_/m_); + p_ -= 1.0/(1 << old); + M_[index] = rank; + if(rank < 31){ + p_ += 1.0/(uint32_t(1) << rank); + } + } + } + + /** + * Estimates cardinality value. + * + * @return Estimated cardinality value. + */ + double estimate() const { + return c_; + } + + /** + * Merges the estimate from 'other' into this object, returning the estimate of their union. + * The number of registers in each must be the same. + * + * @param[in] other HyperLogLog instance to be merged + * + * @exception std::invalid_argument number of registers doesn't match. + */ + void merge(const HyperLogLogHIP& other) throw (std::invalid_argument) { + if (m_ != other.m_) { + std::stringstream ss; + ss << "number of registers doesn't match: " << m_ << " != " << other.m_; + throw std::invalid_argument(ss.str().c_str()); + } + for (uint32_t r = 0; r < m_; ++r) { + const uint8_t b = M_[r]; + const uint8_t b_other = other.M_[r]; + if (b < b_other) { + c_ += 1.0 / (p_/m_); + p_ -= 1.0/(1 << b); + M_[r] |= b_other; + if(b_other < register_limit_){ + p_ += 1.0/(1 << b_other); + } + } + } + } + + /** + * Clears all internal registers. + */ + void clear() { + std::fill(M_.begin(), M_.end(), 0); + c_ = 0.0; + p_ = 1 << b_; + } + + /** + * Returns size of register. + * + * @return Register size + */ + uint32_t registerSize() const { + return m_; + } + + /** + * Exchanges the content of the instance + * + * @param[in,out] rhs Another HyperLogLog instance + */ + void swap(HyperLogLogHIP& rhs) { + std::swap(b_, rhs.b_); + std::swap(m_, rhs.m_); + std::swap(c_, rhs.c_); + M_.swap(rhs.M_); + } + + /** + * Dump the current status to a stream + * + * @param[out] os The output stream where the data is saved + * + * @exception std::runtime_error When failed to dump. + */ + void dump(std::ostream& os) const throw(std::runtime_error){ + os.write((char*)&b_, sizeof(b_)); + os.write((char*)&M_[0], sizeof(M_[0]) * M_.size()); + os.write((char*)&c_, sizeof(c_)); + os.write((char*)&p_, sizeof(p_)); + if(os.fail()){ + throw std::runtime_error("Failed to dump"); + } + } + + /** + * Restore the status from a stream + * + * @param[in] is The input stream where the status is saved + * + * @exception std::runtime_error When failed to restore. + */ + void restore(std::istream& is) throw(std::runtime_error){ + uint8_t b = 0; + is.read((char*)&b, sizeof(b)); + HyperLogLogHIP tempHLL(b); + is.read((char*)&(tempHLL.M_[0]), sizeof(M_[0]) * tempHLL.m_); + is.read((char*)&(tempHLL.c_), sizeof(double)); + is.read((char*)&(tempHLL.p_), sizeof(double)); + if(is.fail()){ + throw std::runtime_error("Failed to restore"); + } + swap(tempHLL); + } +private: + const uint8_t register_limit_; + double c_; + double p_; +}; + +} // namespace hll + +#endif // !defined(HYPERLOGLOG_HPP) \ No newline at end of file diff --git a/include/gerbil/murmur3.h b/include/gerbil/murmur3.h new file mode 100644 index 0000000..c778976 --- /dev/null +++ b/include/gerbil/murmur3.h @@ -0,0 +1,151 @@ +#ifndef MURMUR3_H +#define MURMUR3_H + +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +#define FORCE_INLINE __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, uint8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +/* NO-OP for little-endian platforms */ +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define BYTESWAP(x) (x) +# endif +/* if __BYTE_ORDER__ is not predefined (like FreeBSD), use arch */ +#elif defined(__i386) || defined(__x86_64) \ + || defined(__alpha) || defined(__vax) + +# define BYTESWAP(x) (x) +/* use __builtin_bswap32 if available */ +#elif defined(__GNUC__) || defined(__clang__) +#ifdef __has_builtin +#if __has_builtin(__builtin_bswap32) +#define BYTESWAP(x) __builtin_bswap32(x) +#endif // __has_builtin(__builtin_bswap32) +#endif // __has_builtin +#endif // defined(__GNUC__) || defined(__clang__) +/* last resort (big-endian w/o __builtin_bswap) */ +#ifndef BYTESWAP +# define BYTESWAP(x) ((((x)&0xFF)<<24) \ + |(((x)>>24)&0xFF) \ + |(((x)&0x0000FF00)<<8) \ + |(((x)&0x00FF0000)>>8) ) +#endif + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +#define getblock(p, i) BYTESWAP(p[i]) + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +uint32_t fmix32( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//----------------------------------------------------------------------------- + +#ifdef __cplusplus +extern "C" +#else +extern +#endif +void MurmurHash3_x86_32( const void * key, int len, uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + int i; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + { + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + } + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +#endif \ No newline at end of file diff --git a/src/gerbil/Application.cpp b/src/gerbil/Application.cpp index 78dc404..4128be4 100644 --- a/src/gerbil/Application.cpp +++ b/src/gerbil/Application.cpp @@ -521,7 +521,7 @@ void gerbil::Application::distributeMemory2( // compute memory for hashtable uint64 maxUKMersNumber = std::min(tempFileStatistic->getMaxKMersNumber(), - tempFileStatistic->getAvg2SdKMersNumber()); + tempFileStatistic->getAvg2SdKMersNumber()); // lolz if (maxUKMersNumber > maxKmcHashtableSize) { maxUKMersNumber -= maxKmcHashtableSize; // already assured uint64 extraSize; diff --git a/src/gerbil/Bundle.cpp b/src/gerbil/Bundle.cpp index deeb9cf..432a747 100644 --- a/src/gerbil/Bundle.cpp +++ b/src/gerbil/Bundle.cpp @@ -107,6 +107,7 @@ gerbil::SuperBundle::SuperBundle() { gerbil::SuperBundle::~SuperBundle() { } + void gerbil::SuperBundle::finalize() { *_next = 0; _finalized = true; diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index 4d588ea..70366f1 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -109,7 +109,7 @@ void gerbil::SequenceSplitter::loadUhs(std::string path) std::ifstream infile(path); int str_value; std::string line; - std::cout << "lolz0" << std::endl<< std::flush; + // std::cout << "lolz0" << std::endl<< std::flush; while (std::getline(infile, line)) { str_value = 0; @@ -122,9 +122,9 @@ void gerbil::SequenceSplitter::loadUhs(std::string path) uhs[str_value] = 1; uhs[invMMer(str_value)] = 1; } - std::cout << "lolz6" << std::endl<< std::flush; + // std::cout << "lolz6" << std::endl<< std::flush; infile.close(); - std::cout << "lolz7" << std::endl<< std::flush; + // std::cout << "lolz7" << std::endl<< std::flush; } void gerbil::SequenceSplitter::loadRanks(uint32* ranks) @@ -146,7 +146,7 @@ void gerbil::SequenceSplitter::loadRanks(uint32* ranks) std::sort(localRanks, localRanks + mmersNumber, CompR(ranks)); for(uint32 j = 0; j < mmersNumber; ++j) { - std::cout << " j is = " << j << " val is " << localRanks[j] << std::endl; + // std::cout << " j is = " << j << " val is " << localRanks[j] << std::endl; ranks[localRanks[j]] = j; } infile.close(); @@ -247,7 +247,10 @@ void gerbil::SequenceSplitter::detMMerHisto() { for (uint32 i = 0; i < mMersNumber ; ++i) { - _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + if(i < invMMer(i)) + { + _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + } } std::list _stats_zero_ids; @@ -271,14 +274,18 @@ void gerbil::SequenceSplitter::detMMerHisto() { double sum = 0.0; for (auto &i : _stats) { - // i.second += 1000; + // uint32 not_ref_i = i; + // if(i.first < invMMer(i.first)) + // { + // i.second += 50; + // } //i.second += 100; sum += i.second; } double mean = sum / _tempFilesNumber; - double max_bin_size = mean; + double max_bin_size = 0.9 * mean; uint32 n = _tempFilesNumber; //one is needed for disabled signatures uint32 max_bins = _tempFilesNumber; uint32 num_zeros_in_bin = (_stats_zero_ids.size() / max_bins) + 1; @@ -295,7 +302,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { _mToBin[max.first] = bin_no++; sum -= max.second; mean = sum / (max_bins - bin_no); - //max_bin_size = 1.1 * mean; + max_bin_size = 0.9 * mean; //max_bin_size = _stats.back().second; _stats.pop_front(); @@ -321,7 +328,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { } for (auto i = group.begin(); i != group.end(); ++i) { - //std::cout << i->first << " in " << bin_no <first << " in " << bin_no <first] = bin_no; } @@ -338,7 +345,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { sum -= tmp_sum; mean = sum / (max_bins - bin_no); //if(bin_no > _tempFilesNumber / 4) - max_bin_size = 1.01 * mean; + max_bin_size = 1.1 * mean; //if(bin_no < 200) //max_bin_size = _stats.back().second; @@ -351,9 +358,14 @@ void gerbil::SequenceSplitter::detMMerHisto() { _mToBin[i->first] = _tempFilesNumber - 1 - (i->first%_tempFilesNumber); } } - if(_stats_zero_ids.size() == 0) + if(_stats_zero_ids.size() > 0) { - std::cout << "No elements in zero sized " << std::endl; + //std::cout << "Elements in zero sized " << std::endl; + for (auto i = _stats_zero_ids.begin(); i != _stats_zero_ids.end();) + { + _mToBin[*i] = _tempFilesNumber - 1 - ((*i)%_tempFilesNumber); + i = _stats_zero_ids.erase(i); + } } } @@ -501,6 +513,7 @@ void gerbil::SequenceSplitter::processThread(const uint &id) { for(uint_tfn i = 0; i < _tempFilesNumber; ++i) { curSuperBundles[i] = new SuperBundle(); curSuperBundles[i]->tempFileId = i; + } while(_readBundleSyncQueue->swapPop(rb)) { diff --git a/src/gerbil/SuperReader.cpp b/src/gerbil/SuperReader.cpp index ee82feb..f76f1a0 100644 --- a/src/gerbil/SuperReader.cpp +++ b/src/gerbil/SuperReader.cpp @@ -33,7 +33,7 @@ void gerbil::SuperReader::processThread() { StopWatch sw; sw.start(); ) - + // TODO : keep order regular, and keep number of uKmers after first file? SuperBundle *sb = new SuperBundle; std::vector> binFileOrder; for (uint_tfn tempFileId(0); tempFileId < _tempFilesNumber; ++tempFileId) diff --git a/src/gerbil/TempFile.cpp b/src/gerbil/TempFile.cpp index ef6eb13..4a4df89 100644 --- a/src/gerbil/TempFile.cpp +++ b/src/gerbil/TempFile.cpp @@ -61,28 +61,26 @@ void gerbil::TempFile::reset() { rewind(_file); } +// TODO: add kmers to HLL in tempfile + bool gerbil::TempFile::write(SuperBundle *superBundle) { _size += SUPER_BUNDLE_DATA_SIZE_B; _smers += superBundle->sMerNumber; _kmers += superBundle->kMerNumber; _filled += superBundle->getSize(); - auto x = (char *) superBundle->data; - if(x == nullptr) - { - std::cout << "nullptr"; - return true; - } - return fwrite(x, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; + return fwrite((char *) superBundle->data, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; } bool gerbil::TempFile::write(char *data, const uint64 &size, const uint64 &smers, const uint64 &kmers, const uint64 &filled) { + std::cout << " in the not used write " << std::endl; _size += size; _smers += smers; _kmers += kmers; _filled += filled; return fwrite((char *) data, 1, size, _file) == size; -} +} + bool gerbil::TempFile::read(SuperBundle *superBundle) { return fread(superBundle->data, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; @@ -110,3 +108,5 @@ void gerbil::TempFile::loadStats(string path, FILE *file) { _ukmers = 0; _filename = path + "temp" + to_string(_id) + ".bin"; } + +// TODO: get out of sequence splitter the estimation for loads. insert them to temp file. when loading