Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ if(NOT CMAKE_BUILD_TYPE)
endif(NOT CMAKE_BUILD_TYPE)

# Compile Flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive -w -g")

# Include Files
set(HEADER_FILES
Expand Down Expand Up @@ -99,7 +99,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
add_executable(gerbil ${HEADER_FILES} ${SOURCE_FILES_CPP} ${CUDA_OBJECTS})

# build convert
add_executable(toFasta src/gerbil/toFasta.cpp)
# add_executable(toFasta src/gerbil/toFasta.cpp)

# link against external libraries
target_link_libraries(gerbil ${MY_LIBS})
Expand Down
1 change: 1 addition & 0 deletions include/gerbil/KmerHasher.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ namespace gerbil {

_kMersNumberCPU = cpuHasher.getKMersNumber();
_uKMersNumberCPU = cpuHasher.getUKMersNumber();
std::cout << _uKMersNumberCPU << std::endl;
_btUKMersNumberCPU = cpuHasher.getBtUKMersNumber();
_kMersNumberGPU = gpuHasher.getKMersNumber();
_uKMersNumberGPU = gpuHasher.getUKMersNumber();
Expand Down
25 changes: 25 additions & 0 deletions include/gerbil/SequenceSplitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,38 @@ class SequenceSplitter{

std::atomic<uint64> _baseNumbers; // total number of bases


uint32_t invMMer(const uint32_t &mmer); // inverts a minimizer
bool isAllowed(uint32 mmer); // checks whether a minimizer is allowed (special, tested strategies)
void detMMerHisto(); // calculation of a histogram (special, tested strategies)
void loadUhs(std::string path = "res_7.txt");
void loadRanks(uint32* ranks);
void loadFrequency(uint32* freq);

void processThread(const uint &id); // starts a single thread

public:
class Comp
{
uint32* signature_occurrences;
public:
Comp(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){}
bool operator()(int i, int j)
{
return signature_occurrences[i] > signature_occurrences[j];
}
};

class CompR
{
uint32* signature_occurrences;
public:
CompR(uint32* _signature_occurrences) : signature_occurrences(_signature_occurrences){}
bool operator()(int i, int j)
{
return signature_occurrences[i] < signature_occurrences[j];
}
};
SyncSwapQueueMPSC<SuperBundle>** getSuperBundleQueues(); // returns the SuperBundleQueues

/*
Expand Down
238 changes: 218 additions & 20 deletions src/gerbil/SequenceSplitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@


#include "../../include/gerbil/SequenceSplitter.h"
#include <iostream>
#include <string>
#include <fstream>


#define SS_MMER_UNDEFINED 0x80000000
Expand Down Expand Up @@ -49,7 +52,6 @@ gerbil::SequenceSplitter::~SequenceSplitter() {
gerbil::SyncSwapQueueMPSC<gerbil::SuperBundle>** gerbil::SequenceSplitter::getSuperBundleQueues() {
return _superBundleQueues;
}

uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){
uint32 rev = 0;
uint32 immer = ~mmer;
Expand All @@ -62,6 +64,7 @@ uint32_t gerbil::SequenceSplitter::invMMer(const uint32_t &mmer){
return rev;
}


// kmc2 strategy
bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) {
if(mmer > invMMer(mmer))
Expand All @@ -88,33 +91,228 @@ bool gerbil::SequenceSplitter::isAllowed(uint32 mmer) {
return true;
}


void gerbil::SequenceSplitter::loadRanks(uint32* ranks)
{
uint32 mmersNumber = 1<<(2*_m);
uint32 localRanks[mmersNumber];
std::ifstream infile("ranks.txt");
int str_value;
std::string line;
uint32 i = 0;
while (std::getline(infile, line))
{
str_value = std::stoi(line);
ranks[i] = str_value;
localRanks[i] = i;
i++;
if(i == mmersNumber) break;
}
std::sort(localRanks, localRanks + mmersNumber, CompR(ranks));
for(uint32 j = 0; j < mmersNumber; ++j)
{
std::cout << " j is = " << j << " val is " << localRanks[j] << std::endl;
ranks[localRanks[j]] = j;
}
infile.close();
}

void gerbil::SequenceSplitter::loadFrequency(uint32* freq)
{
uint32 mmersNumber = 1<<(2*_m);
std::ifstream infile("freq.txt");
int str_value;
std::string line;
uint32 i = 0;
while (std::getline(infile, line))
{
str_value = std::stoi(line);
freq[i] = str_value;
i++;
if(i == mmersNumber) break;
}
infile.close();
}


void gerbil::SequenceSplitter::detMMerHisto() {
const uint32 mMersNumber = 1 << (2 * _m);
uint32 stats[mMersNumber];

loadRanks(_mVal);
loadFrequency(stats);

// kmc2 strategy
std::vector<std::pair<uint32_t,uint32_t>> kMerFrequencies;
for(uint32 mmer = 0; mmer < mMersNumber; ++mmer)
kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer));

std::sort(kMerFrequencies.begin(), kMerFrequencies.end());
uint32 rank = 0;
for (std::vector<std::pair<uint32_t,uint32_t>>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) {
_mVal[it->second] = rank++;
}
// std::vector<std::pair<uint32_t,uint32_t>> kMerFrequencies;
// for(uint32 mmer = 0; mmer < mMersNumber; ++mmer)
// kMerFrequencies.push_back(std::make_pair(isAllowed(mmer) ? mmer : 0xffffffffu, mmer));
// uint32 normalized_value;
// for(uint32 mmer = 0; mmer < mMersNumber; ++mmer)
// {
// if(uhs[mmer] == 0)
// {
// if(isAllowed(mmer))
// normalized_value = 2*mMersNumber + mmer;
// else
// normalized_value = 3*mMersNumber + mmer;
// }
// else
// {
// if(isAllowed(mmer))
// normalized_value = mmer;
// else
// normalized_value = mMersNumber + mmer;
// }
// kMerFrequencies.push_back(std::make_pair(normalized_value, mmer));
// }


// std::sort(kMerFrequencies.begin(), kMerFrequencies.end());
// uint32 rank = 0;
// for (std::vector<std::pair<uint32_t,uint32_t>>::iterator it = kMerFrequencies.begin() ; it != kMerFrequencies.end(); ++it) {
// _mVal[it->second] = rank++;
// }



// for(uint32 i = 0; i < mMersNumber; ++i)
// {
// _mToBin[i] = i % _tempFilesNumber;
// }


// pyramid-shaped distribution
uint32 bfnx2 = 2 * _tempFilesNumber;
uint32 mMN = mMersNumber - mMersNumber % bfnx2;
int32 p = mMN / bfnx2;
for(uint32 i = 0; i < mMersNumber; ++i)
if(_mVal[i] < mMN){
int32 d = _mVal[i] / bfnx2;
int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2;
int32 x = m - (int32)_tempFilesNumber;
_mToBin[i] = x < 0 ? -x - 1 : x;
// uint32 bfnx2 = 2 * _tempFilesNumber;
// uint32 mMN = mMersNumber - mMersNumber % bfnx2;
// int32 p = mMN / bfnx2;
// for(uint32 i = 0; i < mMersNumber; ++i)
// if(_mVal[i] < mMN){
// int32 d = _mVal[i] / bfnx2;
// int32 m = (_mVal[i] + d * bfnx2 / p) % bfnx2;
// int32 x = m - (int32)_tempFilesNumber;
// // if((x < 0 ? -x - 1 : x) == 335) std::cout << i << std::endl;
// _mToBin[i] = x < 0 ? -x - 1 : x;
// }
// else
// {
// // if((_tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber)) == 335) std::cout << i << std::endl;
// _mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber);
// }


uint32 local_sorted[mMersNumber];
uint32 *sorted = local_sorted;
for (uint32 i = 0; i < mMersNumber ; ++i)
sorted[i] = i;
std::sort(sorted, sorted + mMersNumber, Comp(stats));

std::list<std::pair<uint32, uint64>> _stats;

for (uint32 i = 0; i < mMersNumber ; ++i)
{
if(i < invMMer(i))
{
_stats.push_back(std::make_pair(sorted[i], stats[sorted[i]]));
}
}

std::list<uint32> _stats_zero_ids;
for (auto it = _stats.begin(); it != _stats.end();)
{
if (it->second == 0)
{
_stats_zero_ids.push_back(it->first);
it = _stats.erase(it);
}
else
_mToBin[i] = _tempFilesNumber - 1 - (_mVal[i] % _tempFilesNumber);
++it;
}
uint32 total_zeroes = _stats_zero_ids.size();
uint32 assigned_zeros = 0;


std::list<std::pair<uint32, uint64>> group;
uint32 bin_no = 0;
//counting sum
double sum = 0.0;
for (auto &i : _stats)
{
i.second += 100;
sum += i.second;
}


double mean = sum / _tempFilesNumber;
double max_bin_size = mean;
uint32 n = _tempFilesNumber;
uint32 max_bins = _tempFilesNumber;
uint32 num_zeros_in_bin = (_stats_zero_ids.size() / max_bins) + 1;

while (_stats.size() > n)
{
std::pair<uint32, uint64>& max = _stats.front();

if (max.second >= mean)
{
std::cout << max.first << " innn " << bin_no <<std::endl << std::flush;
_mToBin[max.first] = bin_no++;
sum -= max.second;
mean = sum / (max_bins - bin_no);
max_bin_size = 1.1 * mean;

_stats.pop_front();
--n;
}
else
{
//heuristic
group.clear();
double tmp_sum = 0.0;
uint32 in_current = 0;
for (auto it = _stats.begin(); it != _stats.end();)
{
if (tmp_sum + it->second <= max_bin_size)
{
tmp_sum += it->second;
group.push_back(*it);
it = _stats.erase(it);
++in_current;
}
else
++it;
}
for (auto i = group.begin(); i != group.end(); ++i)
{
//std::cout << i->first << " in " << bin_no <<std::endl << std::flush;
_mToBin[i->first] = bin_no;
}

for (auto p = std::make_pair(_stats_zero_ids.begin(),0);
p.first != _stats_zero_ids.end() && p.second < num_zeros_in_bin && assigned_zeros < total_zeroes;
++p.second, ++assigned_zeros)
{
_mToBin[*(p.first)] = bin_no;
p.first = _stats_zero_ids.erase(p.first);
}
--n;
++bin_no;

sum -= tmp_sum;
mean = sum / (max_bins - bin_no);
max_bin_size = 1.1 * mean
}
}
if (_stats.size() > 0)
{
for (auto i = _stats.begin(); i != _stats.end(); ++i)
{
_mToBin[i->first] = _tempFilesNumber - 1 - (i->first%_tempFilesNumber);
}
}
if(_stats_zero_ids.size() == 0)
{
std::cout << "No elements in zero sized " << std::endl;
}
}

void gerbil::SequenceSplitter::process() {
Expand Down
8 changes: 7 additions & 1 deletion src/gerbil/TempFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,13 @@ bool gerbil::TempFile::write(SuperBundle *superBundle) {
_smers += superBundle->sMerNumber;
_kmers += superBundle->kMerNumber;
_filled += superBundle->getSize();
return fwrite((char *) superBundle->data, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B;
auto x = (char *) superBundle->data;
if(x == nullptr)
{
std::cout << "nullptr";
return true;
}
return fwrite(x, 1, SUPER_BUNDLE_DATA_SIZE_B, _file) == SUPER_BUNDLE_DATA_SIZE_B;
}

bool gerbil::TempFile::write(char *data, const uint64 &size, const uint64 &smers, const uint64 &kmers,
Expand Down