diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt index d085b11..a7bb430 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT CMAKE_BUILD_TYPE) endif(NOT CMAKE_BUILD_TYPE) # Compile Flags -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w -g") # Include Files set(HEADER_FILES @@ -99,7 +99,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") add_executable(gerbil ${HEADER_FILES} ${SOURCE_FILES_CPP} ${CUDA_OBJECTS}) # build convert -add_executable(toFasta src/gerbil/toFasta.cpp) +# add_executable(toFasta src/gerbil/toFasta.cpp) # link against external libraries target_link_libraries(gerbil ${MY_LIBS}) diff --git a/include/gerbil/KmerHasher.h b/include/gerbil/KmerHasher.h index 079fdc7..002fcac 100644 --- a/include/gerbil/KmerHasher.h +++ b/include/gerbil/KmerHasher.h @@ -199,6 +199,7 @@ namespace gerbil { _kMersNumberCPU = cpuHasher.getKMersNumber(); _uKMersNumberCPU = cpuHasher.getUKMersNumber(); + std::cout << _uKMersNumberCPU << std::endl; _btUKMersNumberCPU = cpuHasher.getBtUKMersNumber(); _kMersNumberGPU = gpuHasher.getKMersNumber(); _uKMersNumberGPU = gpuHasher.getUKMersNumber(); diff --git a/include/gerbil/SequenceSplitter.h b/include/gerbil/SequenceSplitter.h index e44f413..4dfbfc9 100644 --- a/include/gerbil/SequenceSplitter.h +++ b/include/gerbil/SequenceSplitter.h @@ -49,13 +49,38 @@ class SequenceSplitter{ std::atomic _baseNumbers; // total number of bases + uint32_t invMMer(const uint32_t &mmer); // inverts a minimizer bool isAllowed(uint32 mmer); // checks whether a minimizer is allowed (special, tested strategies) void detMMerHisto(); // calculation of a histogram (special, tested strategies) + void loadUhs(std::string path = "res_7.txt"); + void loadRanks(uint32* ranks); + void loadFrequency(uint32* freq); void processThread(const uint &id); // starts a single thread public: + class Comp + { + uint32* signature_occurrences; + public: + Comp(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){} + bool operator()(int i, int j) + { + return signature_occurrences[i] > signature_occurrences[j]; + } + }; + + class CompR + { + uint32* signature_occurrences; + public: + CompR(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){} + bool operator()(int i, int j) + { + return signature_occurrences[i] < signature_occurrences[j]; + } + }; SyncSwapQueueMPSC** getSuperBundleQueues(); // returns the SuperBundleQueues /* diff --git a/src/gerbil/SequenceSplitter.cpp b/src/gerbil/SequenceSplitter.cpp index a520b20..9964212 100644 --- a/src/gerbil/SequenceSplitter.cpp +++ b/src/gerbil/SequenceSplitter.cpp @@ -7,6 +7,9 @@ #include "../../include/gerbil/SequenceSplitter.h" +#include +#include +#include #define SS_MMER_UNDEFINED 0x80000000 @@ -49,7 +52,6 @@ gerbil::SequenceSplitter::~SequenceSplitter() { gerbil::SyncSwapQueueMPSC** gerbil::SequenceSplitter::getSuperBundleQueues() { return _superBundleQueues; } - uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){ uint32 rev = 0; uint32 immer = ~mmer; @@ -62,6 +64,7 @@ uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){ return rev; } + // kmc2 strategy bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) { if(mmer > invMMer(mmer)) @@ -88,33 +91,228 @@ bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) { return true; } + +void gerbil::SequenceSplitter::loadRanks(uint32* ranks) +{ + uint32 mmersNumber = 1<<(2*_m); + uint32 localRanks[mmersNumber]; + std::ifstream infile("ranks.txt"); + int str_value; + std::string line; + uint32 i = 0; + while (std::getline(infile, line)) + { + str_value = std::stoi(line); + ranks[i] = str_value; + localRanks[i] = i; + i++; + if(i == mmersNumber) break; + } + std::sort(localRanks, localRanks + mmersNumber, CompR(ranks)); + for(uint32 j = 0; j < mmersNumber; ++j) + { + std::cout << " j is = " << j << " val is " << localRanks[j] << std::endl; + ranks[localRanks[j]] = j; + } + infile.close(); +} + +void gerbil::SequenceSplitter::loadFrequency(uint32* freq) +{ + uint32 mmersNumber = 1<<(2*_m); + std::ifstream infile("freq.txt"); + int str_value; + std::string line; + uint32 i = 0; + while (std::getline(infile, line)) + { + str_value = std::stoi(line); + freq[i] = str_value; + i++; + if(i == mmersNumber) break; + } + infile.close(); +} + + void gerbil::SequenceSplitter::detMMerHisto() { const uint32 mMersNumber = 1 << (2 * _m); + uint32 stats[mMersNumber]; + + loadRanks(_mVal); + loadFrequency(stats); // kmc2 strategy - std::vector> kMerFrequencies; - for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) - kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer)); - - std::sort(kMerFrequencies.begin(), kMerFrequencies.end()); - uint32 rank = 0; - for (std::vector>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) { - _mVal[it->second] = rank++; - } + // std::vector> kMerFrequencies; + // for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) + // kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer)); + // uint32 normalized_value; + // for(uint32 mmer = 0; mmer < mMersNumber; ++mmer) + // { + // if(uhs[mmer] == 0) + // { + // if(isAllowed(mmer)) + // normalized_value = 2*mMersNumber + mmer; + // else + // normalized_value = 3*mMersNumber + mmer; + // } + // else + // { + // if(isAllowed(mmer)) + // normalized_value = mmer; + // else + // normalized_value = mMersNumber + mmer; + // } + // kMerFrequencies.push_back(std::make_pair(normalized_value, mmer)); + // } + + + // std::sort(kMerFrequencies.begin(), kMerFrequencies.end()); + // uint32 rank = 0; + // for (std::vector>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) { + // _mVal[it->second] = rank++; + // } + + + + // for(uint32 i = 0; i < mMersNumber; ++i) + // { + // _mToBin[i] = i % _tempFilesNumber; + // } + // pyramid-shaped distribution - uint32 bfnx2 = 2 * _tempFilesNumber; - uint32 mMN = mMersNumber - mMersNumber % bfnx2; - int32 p = mMN / bfnx2; - for(uint32 i = 0; i < mMersNumber; ++i) - if(_mVal[i] < mMN){ - int32 d = _mVal[i] / bfnx2; - int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2; - int32 x = m - (int32)_tempFilesNumber; - _mToBin[i] = x < 0 ? -x - 1 : x; + // uint32 bfnx2 = 2 * _tempFilesNumber; + // uint32 mMN = mMersNumber - mMersNumber % bfnx2; + // int32 p = mMN / bfnx2; + // for(uint32 i = 0; i < mMersNumber; ++i) + // if(_mVal[i] < mMN){ + // int32 d = _mVal[i] / bfnx2; + // int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2; + // int32 x = m - (int32)_tempFilesNumber; + // // if((x < 0 ? -x - 1 : x) == 335) std::cout << i << std::endl; + // _mToBin[i] = x < 0 ? -x - 1 : x; + // } + // else + // { + // // if((_tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber)) == 335) std::cout << i << std::endl; + // _mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber); + // } + + + uint32 local_sorted[mMersNumber]; + uint32 *sorted = local_sorted; + for (uint32 i = 0; i < mMersNumber ; ++i) + sorted[i] = i; + std::sort(sorted, sorted + mMersNumber, Comp(stats)); + + std::list> _stats; + + for (uint32 i = 0; i < mMersNumber ; ++i) + { + if(i < invMMer(i)) + { + _stats.push_back(std::make_pair(sorted[i], stats[sorted[i]])); + } + } + + std::list _stats_zero_ids; + for (auto it = _stats.begin(); it != _stats.end();) + { + if (it->second == 0) + { + _stats_zero_ids.push_back(it->first); + it = _stats.erase(it); } else - _mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber); + ++it; + } + uint32 total_zeroes = _stats_zero_ids.size(); + uint32 assigned_zeros = 0; + + + std::list> group; + uint32 bin_no = 0; + //counting sum + double sum = 0.0; + for (auto &i : _stats) + { + i.second += 100; + sum += i.second; + } + + + double mean = sum / _tempFilesNumber; + double max_bin_size = mean; + uint32 n = _tempFilesNumber; + uint32 max_bins = _tempFilesNumber; + uint32 num_zeros_in_bin = (_stats_zero_ids.size() / max_bins) + 1; + + while (_stats.size() > n) + { + std::pair& max = _stats.front(); + + if (max.second >= mean) + { + std::cout << max.first << " innn " << bin_no <second <= max_bin_size) + { + tmp_sum += it->second; + group.push_back(*it); + it = _stats.erase(it); + ++in_current; + } + else + ++it; + } + for (auto i = group.begin(); i != group.end(); ++i) + { + //std::cout << i->first << " in " << bin_no <first] = bin_no; + } + + for (auto p = std::make_pair(_stats_zero_ids.begin(),0); + p.first != _stats_zero_ids.end() && p.second < num_zeros_in_bin && assigned_zeros < total_zeroes; + ++p.second, ++assigned_zeros) + { + _mToBin[*(p.first)] = bin_no; + p.first = _stats_zero_ids.erase(p.first); + } + --n; + ++bin_no; + + sum -= tmp_sum; + mean = sum / (max_bins - bin_no); + max_bin_size = 1.1 * mean + } + } + if (_stats.size() > 0) + { + for (auto i = _stats.begin(); i != _stats.end(); ++i) + { + _mToBin[i->first] = _tempFilesNumber - 1 - (i->first%_tempFilesNumber); + } + } + if(_stats_zero_ids.size() == 0) + { + std::cout << "No elements in zero sized " << std::endl; + } } void gerbil::SequenceSplitter::process() { diff --git a/src/gerbil/TempFile.cpp b/src/gerbil/TempFile.cpp index ac5913d..ef6eb13 100644 --- a/src/gerbil/TempFile.cpp +++ b/src/gerbil/TempFile.cpp @@ -66,7 +66,13 @@ bool gerbil::TempFile::write(SuperBundle *superBundle) { _smers += superBundle->sMerNumber; _kmers += superBundle->kMerNumber; _filled += superBundle->getSize(); - return fwrite((char *) superBundle->data, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; + auto x = (char *) superBundle->data; + if(x == nullptr) + { + std::cout << "nullptr"; + return true; + } + return fwrite(x, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B; } bool gerbil::TempFile::write(char *data, const uint64 &size, const uint64 &smers, const uint64 &kmers,