From faa40829cb8ad84e394b9712e8c6259fea5a0916 Mon Sep 17 00:00:00 2001 From: Dan Flomin Date: Sun, 14 Feb 2021 18:27:12 +0200 Subject: [PATCH 1/5] during add of UHS into UHS+signature ordering --- include/gerbil/MmerComparator.h | 0 include/gerbil/SequenceSplitter.h | 7 +++- src/gerbil/SequenceSplitter.cpp | 63 ++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 include/gerbil/MmerComparator.h diff --git a/include/gerbil/MmerComparator.h b/include/gerbil/MmerComparator.h new file mode 100644 index 0000000..e69de29 diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index e44f413..11b0ef0 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -49,9 +49,13 @@ class SequenceSplitter{ std::atomic _baseNumbers; // total number of bases + std::string uhsPath; + byte* uhs; + uint32_t invMMer(const uint32_t &mmer); // inverts a minimizer bool isAllowed(uint32 mmer); // checks whether a minimizer is allowed (special, tested strategies) void detMMerHisto(); // calculation of a histogram (special, tested strategies) + void loadUhs(std::string path = "res_7.txt"); void processThread(const uint &id); // starts a single thread @@ -68,7 +72,8 @@ class SequenceSplitter{ const uint32_t &k, // size of k-mer const uint8 &m, // Size of m-mer (minimizer) const uint_tfn &tempFilesNumber, // Number of Bin-Files - const bool &norm // normalized k-mers + const bool &norm, // normalized k-mers + std::string uhsPath ); /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index a520b20..075a40b 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -7,6 +7,8 @@ #include "../../include/gerbil/SequenceSplitter.h" +#include +#include #define SS_MMER_UNDEFINED 0x80000000 @@ -49,7 +51,6 @@ gerbil::SequenceSplitter::~SequenceSplitter() { gerbil::SyncSwapQueueMPSC** gerbil::SequenceSplitter::getSuperBundleQueues() { return _superBundleQueues; } - uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){ uint32 rev = 0; uint32 immer = ~mmer; @@ -62,6 +63,7 @@ uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){ return rev; } + // kmc2 strategy bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) { if(mmer > invMMer(mmer)) @@ -88,13 +90,69 @@ bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) { return true; } +void gerbil::SequenceSplitter::loadUhs(std::string path) +{ + uint32 mmersNumber = 1<<(2*_m); + uhs = new byte[mmersNumber]; + for(int i = 0; i< mmersNumber; i++) + uhs[i] = 0; + + char num_codes[256]; + for (int i = 0; i < 256; i++) + num_codes[i] = -1; + num_codes['A'] = num_codes['a'] = 0; + num_codes['C'] = num_codes['c'] = 1; + num_codes['G'] = num_codes['g'] = 2; + num_codes['T'] = num_codes['t'] = 3; + + std::ifstream infile(path); + int str_value; + std::string line; + std::cout << "lolz0" << std::endl<< std::flush; + while (std::getline(infile, line)) + { + str_value = 0; + const char *seq = line.c_str(); + for(uint32 i = 0; i < _m; i++) + { + str_value += num_codes[seq[i]] << (2*_m - (1+i)*2); + } + //str_value = (num_codes[seq[0]] << 16) + (num_codes[seq[1]] << 14) + (num_codes[seq[2]] << 12) + (num_codes[seq[3]] << 10) + (num_codes[seq[4]] << 8) + (num_codes[seq[5]] << 6) + (num_codes[seq[6]] << 4) + (num_codes[seq[7]]<< 2) + (num_codes[seq[8]]); + uhs[str_value] = 1; + } + std::cout << "lolz6" << std::endl<< std::flush; + infile.close(); + std::cout << "lolz7" << std::endl<< std::flush; +} + + void gerbil::SequenceSplitter::detMMerHisto() { const uint32 mMersNumber = 1 << (2 * _m); + loadUhs(); + // kmc2 strategy std::vector> kMerFrequencies; + uint32 normalized_value; for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) - kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer)); + { + if(uhs[mmer] == 0) + { + if(isAllowed(mmer)) + normalized_value = 2*mMersNumber + mmer; + else + normalized_value = 3*mMersNumber + mmer; + } + else + { + if(isAllowed(mmer)) + normalized_value = mmer; + else + normalized_value = mMersNumber + mmer; + } + kMerFrequencies.push_back(std::make_pair(normalized_value, mmer)); + } + std::sort(kMerFrequencies.begin(), kMerFrequencies.end()); uint32 rank = 0; @@ -308,6 +366,7 @@ void gerbil::SequenceSplitter::processThread(const uint &id) { int32 smer_c = (uint32) _m - _k; uint_tfn curTempFileId; + // TODO: HERE CHANGE COMPARISON for(uint32 i(_m - 1); i < reads_size; ++i) { cur_val = mmerval[i]; From eb6a431d880481134a7101cdd4c5c14dd0467180 Mon Sep 17 00:00:00 2001 From: Dan Flomin Date: Mon, 15 Feb 2021 12:33:28 +0200 Subject: [PATCH 2/5] during implement of UHS support --- build | 4 ++++ include/gerbil/SequenceSplitter.h | 4 ++-- src/gerbil/SequenceSplitter.cpp | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) create mode 100755 build diff --git a/build b/build new file mode 100755 index 0000000..dcecacc --- /dev/null +++ b/build @@ -0,0 +1,4 @@ +export CXX=/home/gaga/danflomin/usr/bin/g++ +export CC=/home/gaga/danflomin/usr/bin/gcc +/home/gaga/danflomin/usr/bin/cmake . +/home/gaga/danflomin/usr/bin/cmake --build . diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index 11b0ef0..3df502f 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -72,8 +72,8 @@ class SequenceSplitter{ const uint32_t &k, // size of k-mer const uint8 &m, // Size of m-mer (minimizer) const uint_tfn &tempFilesNumber, // Number of Bin-Files - const bool &norm, // normalized k-mers - std::string uhsPath + const bool &norm//, // normalized k-mers + //std::string uhsPath ); /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index 075a40b..a1e226c 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -9,6 +9,7 @@ #include "../../include/gerbil/SequenceSplitter.h" #include #include +#include #define SS_MMER_UNDEFINED 0x80000000 From 2f27615f5667447aff0c8b88fcf59ad01cc2cdf0 Mon Sep 17 00:00:00 2001 From: Dan Flomin Date: Mon, 1 Mar 2021 16:50:36 +0200 Subject: [PATCH 3/5] add load of ranks and stats of minimizer --- .gitignore | 1 + CMakeLists.txt | 4 +- build | 4 - include/gerbil/KmerHasher.h | 1 + include/gerbil/MmerComparator.h | 0 include/gerbil/SequenceSplitter.h | 23 +++ src/gerbil/SequenceSplitter.cpp | 246 ++++++++++++++++++++++++++---- src/gerbil/TempFile.cpp | 8 +- 8 files changed, 247 insertions(+), 40 deletions(-) create mode 100644 .gitignore delete mode 100755 build delete mode 100644 include/gerbil/MmerComparator.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt index d085b11..a7bb430 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT CMAKE_BUILD_TYPE) endif(NOT CMAKE_BUILD_TYPE) # Compile Flags -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w -g") # Include Files set(HEADER_FILES @@ -99,7 +99,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") add_executable(gerbil ${HEADER_FILES} ${SOURCE_FILES_CPP} ${CUDA_OBJECTS}) # build convert -add_executable(toFasta src/gerbil/toFasta.cpp) +# add_executable(toFasta src/gerbil/toFasta.cpp) # link against external libraries target_link_libraries(gerbil ${MY_LIBS}) diff --git a/build b/build deleted file mode 100755 index dcecacc..0000000 --- a/build +++ /dev/null @@ -1,4 +0,0 @@ -export CXX=/home/gaga/danflomin/usr/bin/g++ -export CC=/home/gaga/danflomin/usr/bin/gcc -/home/gaga/danflomin/usr/bin/cmake . -/home/gaga/danflomin/usr/bin/cmake --build . diff --git a/include/gerbil/KmerHasher.h b/include/gerbil/KmerHasher.h index 079fdc7..002fcac 100644 --- a/include/gerbil/KmerHasher.h +++ b/include/gerbil/KmerHasher.h @@ -199,6 +199,7 @@ namespace gerbil { _kMersNumberCPU = cpuHasher.getKMersNumber(); _uKMersNumberCPU = cpuHasher.getUKMersNumber(); + std::cout << _uKMersNumberCPU << std::endl; _btUKMersNumberCPU = cpuHasher.getBtUKMersNumber(); _kMersNumberGPU = gpuHasher.getKMersNumber(); _uKMersNumberGPU = gpuHasher.getUKMersNumber(); diff --git a/include/gerbil/MmerComparator.h b/include/gerbil/MmerComparator.h deleted file mode 100644 index e69de29..0000000 diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index 3df502f..60ffb85 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -56,10 +56,33 @@ class SequenceSplitter{ bool isAllowed(uint32 mmer); // checks whether a minimizer is allowed (special, tested strategies) void detMMerHisto(); // calculation of a histogram (special, tested strategies) void loadUhs(std::string path = "res_7.txt"); + void loadRanks(uint32* ranks); + void loadFrequency(uint32* freq); void processThread(const uint &id); // starts a single thread public: + class Comp + { + uint32* signature_occurrences; + public: + Comp(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){} + bool operator()(int i, int j) + { + return signature_occurrences[i] > signature_occurrences[j]; + } + }; + + class CompR + { + uint32* signature_occurrences; + public: + CompR(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){} + bool operator()(int i, int j) + { + return signature_occurrences[i] < signature_occurrences[j]; + } + }; SyncSwapQueueMPSC** getSuperBundleQueues(); // returns the SuperBundleQueues /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index a1e226c..4d588ea 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -120,60 +120,241 @@ void gerbil::SequenceSplitter::loadUhs(std::string path) } //str_value = (num_codes[seq[0]] << 16) + (num_codes[seq[1]] << 14) + (num_codes[seq[2]] << 12) + (num_codes[seq[3]] << 10) + (num_codes[seq[4]] << 8) + (num_codes[seq[5]] << 6) + (num_codes[seq[6]] << 4) + (num_codes[seq[7]]<< 2) + (num_codes[seq[8]]); uhs[str_value] = 1; + uhs[invMMer(str_value)] = 1; } std::cout << "lolz6" << std::endl<< std::flush; infile.close(); std::cout << "lolz7" << std::endl<< std::flush; } +void gerbil::SequenceSplitter::loadRanks(uint32* ranks) +{ + uint32 mmersNumber = 1<<(2*_m); + uint32 localRanks[mmersNumber]; + std::ifstream infile("ranks.txt"); + int str_value; + std::string line; + uint32 i = 0; + while (std::getline(infile, line)) + { + str_value = std::stoi(line); + ranks[i] = str_value; + localRanks[i] = i; + i++; + if(i == mmersNumber) break; + } + std::sort(localRanks, localRanks + mmersNumber, CompR(ranks)); + for(uint32 j = 0; j < mmersNumber; ++j) + { + std::cout << " j is = " << j << " val is " << localRanks[j] << std::endl; + ranks[localRanks[j]] = j; + } + infile.close(); +} + +void gerbil::SequenceSplitter::loadFrequency(uint32* freq) +{ + uint32 mmersNumber = 1<<(2*_m); + std::ifstream infile("freq.txt"); + int str_value; + std::string line; + uint32 i = 0; + while (std::getline(infile, line)) + { + str_value = std::stoi(line); + freq[i] = str_value; + i++; + if(i == mmersNumber) break; + } + infile.close(); +} + void gerbil::SequenceSplitter::detMMerHisto() { const uint32 mMersNumber = 1 << (2 * _m); + uint32 stats[mMersNumber]; loadUhs(); + loadRanks(_mVal); + + loadFrequency(stats); // kmc2 strategy - std::vector> kMerFrequencies; - uint32 normalized_value; - for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) + // std::vector> kMerFrequencies; + // for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) + // kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer)); + // uint32 normalized_value; + // for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) + // { + // if(uhs[mmer] == 0) + // { + // if(isAllowed(mmer)) + // normalized_value = 2*mMersNumber + mmer; + // else + // normalized_value = 3*mMersNumber + mmer; + // } + // else + // { + // if(isAllowed(mmer)) + // normalized_value = mmer; + // else + // normalized_value = mMersNumber + mmer; + // } + // kMerFrequencies.push_back(std::make_pair(normalized_value, mmer)); + // } + + + // std::sort(kMerFrequencies.begin(), kMerFrequencies.end()); + // uint32 rank = 0; + // for (std::vector>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) { + // _mVal[it->second] = rank++; + // } + + + + // for(uint32 i = 0; i < mMersNumber; ++i) + // { + // _mToBin[i] = i % _tempFilesNumber; + // } + + + // pyramid-shaped distribution + // uint32 bfnx2 = 2 * _tempFilesNumber; + // uint32 mMN = mMersNumber - mMersNumber % bfnx2; + // int32 p = mMN / bfnx2; + // for(uint32 i = 0; i < mMersNumber; ++i) + // if(_mVal[i] < mMN){ + // int32 d = _mVal[i] / bfnx2; + // int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2; + // int32 x = m - (int32)_tempFilesNumber; + // // if((x < 0 ? -x - 1 : x) == 335) std::cout << i << std::endl; + // _mToBin[i] = x < 0 ? -x - 1 : x; + // } + // else + // { + // // if((_tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber)) == 335) std::cout << i << std::endl; + // _mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber); + // } + + + uint32 local_sorted[mMersNumber]; + uint32 *sorted = local_sorted; + for (uint32 i = 0; i < mMersNumber ; ++i) + sorted[i] = i; + std::sort(sorted, sorted + mMersNumber, Comp(stats)); + + std::list> _stats; + + for (uint32 i = 0; i < mMersNumber ; ++i) { - if(uhs[mmer] == 0) + _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + } + + std::list _stats_zero_ids; + for (auto it = _stats.begin(); it != _stats.end();) + { + if (it->second == 0) { - if(isAllowed(mmer)) - normalized_value = 2*mMersNumber + mmer; - else - normalized_value = 3*mMersNumber + mmer; + _stats_zero_ids.push_back(it->first); + it = _stats.erase(it); } else + ++it; + } + uint32 total_zeroes = _stats_zero_ids.size(); + uint32 assigned_zeros = 0; + + + std::list> group; + uint32 bin_no = 0; + //counting sum + double sum = 0.0; + for (auto &i : _stats) + { + // i.second += 1000; + //i.second += 100; + sum += i.second; + } + + + double mean = sum / _tempFilesNumber; + double max_bin_size = mean; + uint32 n = _tempFilesNumber; //one is needed for disabled signatures + uint32 max_bins = _tempFilesNumber; + uint32 num_zeros_in_bin = (_stats_zero_ids.size() / max_bins) + 1; + + //max_bin_size = (mean + _stats.front().second) / 2; + + while (_stats.size() > n) + { + std::pair& max = _stats.front(); + + if (max.second >= mean) { - if(isAllowed(mmer)) - normalized_value = mmer; - else - normalized_value = mMersNumber + mmer; + std::cout << max.first << " innn " << bin_no <second <= max_bin_size) + { + tmp_sum += it->second; + group.push_back(*it); + it = _stats.erase(it); + ++in_current; + } + else + ++it; + } + for (auto i = group.begin(); i != group.end(); ++i) + { + //std::cout << i->first << " in " << bin_no <first] = bin_no; + } - std::sort(kMerFrequencies.begin(), kMerFrequencies.end()); - uint32 rank = 0; - for (std::vector>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) { - _mVal[it->second] = rank++; + for (auto p = std::make_pair(_stats_zero_ids.begin(),0); + p.first != _stats_zero_ids.end() && p.second < num_zeros_in_bin && assigned_zeros < total_zeroes; + ++p.second, ++assigned_zeros) + { + _mToBin[*(p.first)] = bin_no; + p.first = _stats_zero_ids.erase(p.first); + } + --n; + ++bin_no; + + sum -= tmp_sum; + mean = sum / (max_bins - bin_no); + //if(bin_no > _tempFilesNumber / 4) + max_bin_size = 1.01 * mean; + //if(bin_no < 200) + //max_bin_size = _stats.back().second; + + } } - - // pyramid-shaped distribution - uint32 bfnx2 = 2 * _tempFilesNumber; - uint32 mMN = mMersNumber - mMersNumber % bfnx2; - int32 p = mMN / bfnx2; - for(uint32 i = 0; i < mMersNumber; ++i) - if(_mVal[i] < mMN){ - int32 d = _mVal[i] / bfnx2; - int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2; - int32 x = m - (int32)_tempFilesNumber; - _mToBin[i] = x < 0 ? -x - 1 : x; + if (_stats.size() > 0) + { + for (auto i = _stats.begin(); i != _stats.end(); ++i) + { + _mToBin[i->first] = _tempFilesNumber - 1 - (i->first%_tempFilesNumber); } - else - _mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber); + } + if(_stats_zero_ids.size() == 0) + { + std::cout << "No elements in zero sized " << std::endl; + } } void gerbil::SequenceSplitter::process() { @@ -367,7 +548,6 @@ void gerbil::SequenceSplitter::processThread(const uint &id) { int32 smer_c = (uint32) _m - _k; uint_tfn curTempFileId; - // TODO: HERE CHANGE COMPARISON for(uint32 i(_m - 1); i < reads_size; ++i) { cur_val = mmerval[i]; diff --git a/src/gerbil/TempFile.cpp b/src/gerbil/TempFile.cpp index ac5913d..ef6eb13 100644 --- a/src/gerbil/TempFile.cpp +++ b/src/gerbil/TempFile.cpp @@ -66,7 +66,13 @@ bool gerbil::TempFile::write(SuperBundle *superBundle) { _smers += superBundle->sMerNumber; _kmers += superBundle->kMerNumber; _filled += superBundle->getSize(); - return fwrite((char *) superBundle->data, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; + auto x = (char *) superBundle->data; + if(x == nullptr) + { + std::cout << "nullptr"; + return true; + } + return fwrite(x, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; } bool gerbil::TempFile::write(char *data, const uint64 &size, const uint64 &smers, const uint64 &kmers, From 5137c7455fd2ff5a460c359f997498c166ed74fb Mon Sep 17 00:00:00 2001 From: Dan Flomin Date: Sun, 18 Apr 2021 13:13:41 +0300 Subject: [PATCH 4/5] remove UHS stuff --- include/gerbil/SequenceSplitter.h | 3 --- src/gerbil/SequenceSplitter.cpp | 44 ++++--------------------------- 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index 60ffb85..b0e27fe 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -49,8 +49,6 @@ class SequenceSplitter{ std::atomic _baseNumbers; // total number of bases - std::string uhsPath; - byte* uhs; uint32_t invMMer(const uint32_t &mmer); // inverts a minimizer bool isAllowed(uint32 mmer); // checks whether a minimizer is allowed (special, tested strategies) @@ -96,7 +94,6 @@ class SequenceSplitter{ const uint8 &m, // Size of m-mer (minimizer) const uint_tfn &tempFilesNumber, // Number of Bin-Files const bool &norm//, // normalized k-mers - //std::string uhsPath ); /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index 4d588ea..d0b0d6c 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -91,41 +91,6 @@ bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) { return true; } -void gerbil::SequenceSplitter::loadUhs(std::string path) -{ - uint32 mmersNumber = 1<<(2*_m); - uhs = new byte[mmersNumber]; - for(int i = 0; i< mmersNumber; i++) - uhs[i] = 0; - - char num_codes[256]; - for (int i = 0; i < 256; i++) - num_codes[i] = -1; - num_codes['A'] = num_codes['a'] = 0; - num_codes['C'] = num_codes['c'] = 1; - num_codes['G'] = num_codes['g'] = 2; - num_codes['T'] = num_codes['t'] = 3; - - std::ifstream infile(path); - int str_value; - std::string line; - std::cout << "lolz0" << std::endl<< std::flush; - while (std::getline(infile, line)) - { - str_value = 0; - const char *seq = line.c_str(); - for(uint32 i = 0; i < _m; i++) - { - str_value += num_codes[seq[i]] << (2*_m - (1+i)*2); - } - //str_value = (num_codes[seq[0]] << 16) + (num_codes[seq[1]] << 14) + (num_codes[seq[2]] << 12) + (num_codes[seq[3]] << 10) + (num_codes[seq[4]] << 8) + (num_codes[seq[5]] << 6) + (num_codes[seq[6]] << 4) + (num_codes[seq[7]]<< 2) + (num_codes[seq[8]]); - uhs[str_value] = 1; - uhs[invMMer(str_value)] = 1; - } - std::cout << "lolz6" << std::endl<< std::flush; - infile.close(); - std::cout << "lolz7" << std::endl<< std::flush; -} void gerbil::SequenceSplitter::loadRanks(uint32* ranks) { @@ -174,9 +139,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { const uint32 mMersNumber = 1 << (2 * _m); uint32 stats[mMersNumber]; - loadUhs(); - loadRanks(_mVal); - + loadRanks(_mVal); loadFrequency(stats); // kmc2 strategy @@ -247,7 +210,10 @@ void gerbil::SequenceSplitter::detMMerHisto() { for (uint32 i = 0; i < mMersNumber ; ++i) { - _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + if(i < invMMer(i)) + { + _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + } } std::list _stats_zero_ids; From bffac3d389941b5720c2c12f67d446feefe14ac9 Mon Sep 17 00:00:00 2001 From: Dan Flomin Date: Sun, 18 Apr 2021 13:22:47 +0300 Subject: [PATCH 5/5] refactor SequenceSplitter. Code looks like it should be, up to minor changes in binning rules (adding 1000 to stats or multiplying mean by 1.1) --- include/gerbil/SequenceSplitter.h | 2 +- src/gerbil/SequenceSplitter.cpp | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index b0e27fe..4dfbfc9 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -93,7 +93,7 @@ class SequenceSplitter{ const uint32_t &k, // size of k-mer const uint8 &m, // Size of m-mer (minimizer) const uint_tfn &tempFilesNumber, // Number of Bin-Files - const bool &norm//, // normalized k-mers + const bool &norm // normalized k-mers ); /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index d0b0d6c..9964212 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -237,19 +237,16 @@ void gerbil::SequenceSplitter::detMMerHisto() { double sum = 0.0; for (auto &i : _stats) { - // i.second += 1000; - //i.second += 100; + i.second += 100; sum += i.second; } double mean = sum / _tempFilesNumber; double max_bin_size = mean; - uint32 n = _tempFilesNumber; //one is needed for disabled signatures + uint32 n = _tempFilesNumber; uint32 max_bins = _tempFilesNumber; uint32 num_zeros_in_bin = (_stats_zero_ids.size() / max_bins) + 1; - - //max_bin_size = (mean + _stats.front().second) / 2; while (_stats.size() > n) { @@ -261,8 +258,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { _mToBin[max.first] = bin_no++; sum -= max.second; mean = sum / (max_bins - bin_no); - //max_bin_size = 1.1 * mean; - //max_bin_size = _stats.back().second; + max_bin_size = 1.1 * mean; _stats.pop_front(); --n; @@ -303,11 +299,7 @@ void gerbil::SequenceSplitter::detMMerHisto() { sum -= tmp_sum; mean = sum / (max_bins - bin_no); - //if(bin_no > _tempFilesNumber / 4) - max_bin_size = 1.01 * mean; - //if(bin_no < 200) - //max_bin_size = _stats.back().second; - + max_bin_size = 1.1 * mean } } if (_stats.size() > 0)