diff --git a/include/btllib/nthash.hpp b/include/btllib/nthash.hpp index 20dc16da..23a698b0 100644 --- a/include/btllib/nthash.hpp +++ b/include/btllib/nthash.hpp @@ -151,6 +151,7 @@ class NtHash bool forward() const { return forward_hash <= reverse_hash; } unsigned get_hash_num() const { return hash_num; } unsigned get_k() const { return k; } + size_t get_seq_len() const { return seq_len; } uint64_t get_forward_hash() const { return forward_hash; } uint64_t get_reverse_hash() const { return reverse_hash; } diff --git a/include/btllib/phred_nthash.hpp b/include/btllib/phred_nthash.hpp new file mode 100644 index 00000000..99ad7bbe --- /dev/null +++ b/include/btllib/phred_nthash.hpp @@ -0,0 +1,127 @@ +#ifndef BTLLIB_PHRED_NTHASH_HPP +#define BTLLIB_PHRED_NTHASH_HPP + +#include "btllib/nthash.hpp" +#include "btllib/util.hpp" + +#include +#include + +namespace btllib { +/** + * NtHash with Phred score filtering. + */ +class PhredNtHash : private NtHash +{ +public: + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, + std::string_view quality_string, + size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + size_t phred_min, + std::string_view quality_string, + size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, + const char* quality_string, + size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + size_t phred_min, + const char* quality_string, + size_t pos = 0); + + /** + * Roll the hash forward by one base. If the next k-mer contains a base with + * a Phred score below `phred_min`, the hash will be rolled forward until a + * k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the end of the sequence is reached or + * no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll(); + /** + * Roll the hash backward by one base. If the previous k-mer contains a base + * with a Phred score below `phred_min`, the hash will be rolled backward + * until a k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the start of the sequence is reached + * or no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll_back(); + const uint64_t* hashes() const { return NtHash::hashes(); } + + /** + * Get the position of last hashed k-mer or the k-mer to be hashed if roll() + * has never been called on this NtHash object. + */ + size_t get_pos() const { return NtHash::get_pos(); } + bool forward() const { return NtHash::forward(); } + unsigned get_hash_num() const { return NtHash::get_hash_num(); } + unsigned get_k() const { return NtHash::get_k(); } + size_t get_seq_len() const { return NtHash::get_seq_len(); } + uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); } + uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); } + +private: + const char* qual_seq; + unsigned char phred_min; + RangeMinimumQuery rmq; +}; + +} // namespace btllib + +#endif // BTLLIB_PHRED_NTHASH_HPP \ No newline at end of file diff --git a/include/btllib/util.hpp b/include/btllib/util.hpp index 03f73920..99a32c52 100644 --- a/include/btllib/util.hpp +++ b/include/btllib/util.hpp @@ -5,7 +5,9 @@ #define BTLLIB_UTIL_HPP #include "btllib/cstring.hpp" +#include "btllib/status.hpp" +#include #include #include #include @@ -13,6 +15,8 @@ namespace btllib { +static constexpr double PHRED_OFFSET = 33.0; + /** * Split a string into component substrings with a delimiter. * @@ -125,6 +129,93 @@ get_dirname(const std::string& path); double calc_phred_avg(const std::string& qual, size_t start_pos = 0, size_t len = 0); +/** + * Range minimum query class. + * Finds the index of the minimum value in a range of any iterable that can be + * indexed via [x] where x is the index corresponding to the element in the + * iterable. + * @tparam T The type of the iterable. + */ +template +class RangeMinimumQuery +{ +public: + /** + * Constructor for RangeMinimumQuery. + * @param array The iterable to be queried. + * @param size_of_array The size of the iterable. + */ + RangeMinimumQuery(const T& array, size_t size_of_array); + /** + * Query the index of the minimum value in the range [start, end]. + * @param start The start index of the range. + * @param end The end index of the range. + * @return The index of the minimum value in the range. + * @pre start <= end + * @pre end < size_of_array + * @pre start >= 0 + * + * @post The return value is in the range [start, end] + * @post The return value is the index of the minimum value in the range. + */ + size_t query(size_t start, size_t end); + +private: + //* The lookup table for the query. + std::vector> lookup_table; +}; + +// adapted from +// https://www.geeksforgeeks.org/range-minimum-query-for-static-array/ +template +RangeMinimumQuery::RangeMinimumQuery(const T& array, size_t size_of_array) +{ + size_t log_size = + (size_t)(log2( + size_of_array)) + // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + 1; + lookup_table.resize(log_size, std::vector(size_of_array)); + for (size_t i = 0; i < size_of_array; i++) { + lookup_table[0][i] = i; + } + for (size_t j = 1; j < log_size; j++) { + for (size_t i = 0; i + (1 << j) - 1 < size_of_array; i++) { + if (array[lookup_table[j - 1][i]] < + array[lookup_table[j - 1][i + (1 << (j - 1))]]) { + lookup_table[j][i] = lookup_table[j - 1][i]; + } else { + lookup_table[j][i] = lookup_table[j - 1][i + (1 << (j - 1))]; + } + } + } +} + +template +size_t +RangeMinimumQuery::query(size_t start, size_t end) +{ + if (start > end) { + log_error("RangeMinimumQuery::query: start > end"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (end >= lookup_table[0].size()) { + log_error("RangeMinimumQuery::query: end >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (start >= lookup_table[0].size()) { + log_error("RangeMinimumQuery::query: start >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + + auto j = (size_t)(log2( + end - // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + start + 1)); + if (lookup_table[j][start] <= lookup_table[j][end - (1 << j) + 1]) { + return lookup_table[j][start]; + } + return lookup_table[j][end - (1 << j) + 1]; +} + // This exists in C++20, but we don't support that yet /// @cond HIDDEN_SYMBOLS class Barrier diff --git a/src/btllib/phred_nthash.cpp b/src/btllib/phred_nthash.cpp new file mode 100644 index 00000000..fcd6fbe0 --- /dev/null +++ b/src/btllib/phred_nthash.cpp @@ -0,0 +1,114 @@ +#include "btllib/phred_nthash.hpp" + +namespace btllib { +PhredNtHash::PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, + std::string_view quality_string, + size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos) + , qual_seq(quality_string.data()) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) + , rmq(quality_string, seq_len) +{ +} + +PhredNtHash::PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + size_t phred_min, + std::string_view quality_string, + size_t pos) + : NtHash(seq, hash_num, k, pos) + , qual_seq(quality_string.data()) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) + , rmq(quality_string, seq.length()) +{ +} + +PhredNtHash::PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, + const char* quality_string, + size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos) + , qual_seq(quality_string) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) + , rmq(quality_string, seq_len) +{ +} + +PhredNtHash::PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + size_t phred_min, + const char* quality_string, + size_t pos) + : NtHash(seq, hash_num, k, pos) + , qual_seq(quality_string) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) + , rmq(quality_string, seq.length()) +{ +} + +bool +PhredNtHash::roll() +{ + bool success = NtHash::roll(); + if (!success) { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t seq_len = NtHash::get_seq_len(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + auto min_phred = (unsigned char)qual_seq[min_phred_idx]; + while (min_phred < phred_min) { + // check next kmer range does not exceed sequence length + if (min_phred_idx + k >= seq_len) { + return false; + } + curr_pos = min_phred_idx + 1; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } + + while (curr_pos != NtHash::get_pos()) { + success = NtHash::roll(); + } + + return success; +} + +bool +PhredNtHash::roll_back() +{ + bool success = NtHash::roll_back(); + if (!success) { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + auto min_phred = (unsigned char)qual_seq[min_phred_idx]; + while (min_phred < phred_min) { + // check next kmer range does not exceed sequence length + if (min_phred_idx - k >= NtHash::get_seq_len()) { + return false; + } + curr_pos = min_phred_idx - k; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } + + while (curr_pos != NtHash::get_pos()) { + success = NtHash::roll_back(); + } + + return success; +} +} // namespace btllib \ No newline at end of file diff --git a/src/btllib/util.cpp b/src/btllib/util.cpp index f210aae9..8cafc5e1 100644 --- a/src/btllib/util.cpp +++ b/src/btllib/util.cpp @@ -176,7 +176,6 @@ calc_phred_avg(const std::string& qual, const size_t start_pos, size_t len) phred_sum += (size_t)qual.at(i); } - static constexpr double PHRED_OFFSET = 33.0; return ((double)phred_sum / (double)len) - PHRED_OFFSET; } diff --git a/tests/phred_nthash.cpp b/tests/phred_nthash.cpp new file mode 100644 index 00000000..23977cfe --- /dev/null +++ b/tests/phred_nthash.cpp @@ -0,0 +1,206 @@ +#include "btllib/phred_nthash.hpp" +#include "btllib/seq.hpp" +#include "btllib/util.hpp" +#include "helpers.hpp" + +#include +#include +#include +#include + +int +main() +{ + + { + PRINT_TEST_NAME("Test skip due to base with low Phred score") + + std::string seq = "ACATGCCATGC"; + std::string qual = "11111011111"; + const unsigned k = 5; + const unsigned h = 3; + + const std::vector> hashes = { + { 0xf59ecb45f0e22b9c, 0x4969c33ac240c129, 0x688d616f0d7e08c3 }, + { 0x38cc00f940aebdae, 0xab7e1b110e086fc6, 0x11a1818bcfdd553 } + }; + + btllib::PhredNtHash phred_nthash(seq, h, k, 16, qual); + + for (const auto& h_vals : hashes) { + phred_nthash.roll(); + TEST_ASSERT_ARRAY_EQ(h_vals, phred_nthash.hashes(), h); + } + + phred_nthash.roll_back(); + TEST_ASSERT_EQ(phred_nthash.get_pos(), 0); + TEST_ASSERT_ARRAY_EQ(hashes[0], phred_nthash.hashes(), h); + + } + + { + PRINT_TEST_NAME("k-mer hash values") + + std::string seq = "ACATGCATGCA"; + std::string qual = "$$%%)*0)'%%"; + const unsigned k = 5; + const unsigned h = 3; + + const std::vector> hashes = { + { 0xf59ecb45f0e22b9c, 0x4969c33ac240c129, 0x688d616f0d7e08c3 }, + { 0x38cc00f940aebdae, 0xab7e1b110e086fc6, 0x11a1818bcfdd553 }, + { 0x603a48c5a11c794a, 0xe66016e61816b9c4, 0xc5b13cb146996ffe } + }; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + + for (const auto& h_vals : hashes) { + phred_nthash.roll(); + TEST_ASSERT_ARRAY_EQ(h_vals, phred_nthash.hashes(), h); + } + } + + { + PRINT_TEST_NAME("k-mer rolling") + + std::string seq = "AGTCAGTC"; + std::string qual = "$$%%)*0)"; + unsigned h = 3; + unsigned k = 4; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + std::vector hashes; + + while (phred_nthash.roll()) { + uint64_t* h_vals = new uint64_t[h]; + std::copy(phred_nthash.hashes(), phred_nthash.hashes() + h, h_vals); + hashes.push_back(h_vals); + } + + TEST_ASSERT_EQ(hashes.size(), seq.length() - k + 1); + + // check same hash value for identical k-mers (first and last) + TEST_ASSERT_ARRAY_EQ(hashes[0], hashes[hashes.size() - 1], h); + } + + { + PRINT_TEST_NAME("k-mer rolling vs ntbase hash values") + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + + btllib::PhredNtHash phred_nthash(seq, 3, seq.size() - 2, 0, qual); + /* 18-mers of kmer*/ + std::string kmer1 = "ACGTACACTGGACTGAGT"; + std::string kmer2 = "CGTACACTGGACTGAGTC"; + std::string kmer3 = "GTACACTGGACTGAGTCT"; + + btllib::PhredNtHash phred_nthash_vector[] = { + btllib::PhredNtHash(kmer1, phred_nthash.get_hash_num(), kmer1.size(), 0, qual), + btllib::PhredNtHash(kmer2, phred_nthash.get_hash_num(), kmer2.size(), 0, qual), + btllib::PhredNtHash(kmer3, phred_nthash.get_hash_num(), kmer3.size(), 0, qual) + }; + + size_t i; + for (i = 0; phred_nthash.roll() && phred_nthash_vector[i].roll(); ++i) { + for (size_t j = 0; j < phred_nthash.get_hash_num(); ++j) { + TEST_ASSERT_EQ(phred_nthash.hashes()[j], phred_nthash_vector[i].hashes()[j]); + } + } + TEST_ASSERT_EQ(i, 3); + } + + { + PRINT_TEST_NAME("canonical hashing") + + std::string seq_f = "ACGTACACTGGACTGAGTCT"; + std::string seq_r = "AGACTCAGTCCAGTGTACGT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + + unsigned h = 3; + + btllib::PhredNtHash phred_nthash_f(seq_f, h, seq_f.size(), 0, qual); + btllib::PhredNtHash phred_nthash_r(seq_r, h, seq_r.size(), 0, qual); + + phred_nthash_f.roll(); + phred_nthash_r.roll(); + TEST_ASSERT_EQ(phred_nthash_f.get_hash_num(), phred_nthash_r.get_hash_num()) + TEST_ASSERT_ARRAY_EQ(phred_nthash_f.hashes(), phred_nthash_r.hashes(), h) + } + + { + PRINT_TEST_NAME("k-mer back rolling") + + std::string seq = "ACTAGCTG"; + std::string qual = "$$%%)*0%"; + unsigned h = 3; + unsigned k = 5; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + std::stack hashes; + + while (phred_nthash.roll()) { + uint64_t* h_vals = new uint64_t[h]; + std::copy(phred_nthash.hashes(), phred_nthash.hashes() + h, h_vals); + hashes.push(h_vals); + } + + TEST_ASSERT_EQ(hashes.size(), seq.length() - k + 1) + + do { + TEST_ASSERT_ARRAY_EQ(phred_nthash.hashes(), hashes.top(), h) + hashes.pop(); + } while (phred_nthash.roll_back()); + } + + { + PRINT_TEST_NAME("skipping Ns") + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + std::string seq_with_ns = seq; + + TEST_ASSERT_GE(seq_with_ns.size(), 10) + seq_with_ns[seq_with_ns.size() / 2] = 'N'; + seq_with_ns[seq_with_ns.size() / 2 + 1] = 'N'; + unsigned k = (seq.size() - 2) / 2 - 1; + btllib::PhredNtHash phred_nthash(seq_with_ns, 3, k, 0, qual); + + std::vector positions; + for (size_t i = 0; i < seq_with_ns.size() / 2 - k + 1; i++) { + positions.push_back(i); + } + for (size_t i = seq_with_ns.size() / 2 + 2; i < seq_with_ns.size() - k + 1; + i++) { + positions.push_back(i); + } + + size_t i = 0; + while (phred_nthash.roll()) { + TEST_ASSERT_EQ(phred_nthash.get_pos(), positions[i]) + i++; + } + TEST_ASSERT_EQ(positions.size(), i) + } + + { + std::cerr << "Testing RNA" << std::endl; + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + btllib::PhredNtHash dna_phred_nthash(seq, 3, 20, 0, qual); + + std::string rna_seq = "ACGUACACUGGACUGAGUCU"; + btllib::PhredNtHash rna_phred_nthash(rna_seq, 3, 20, 0, qual); + + dna_phred_nthash.roll(); + rna_phred_nthash.roll(); + size_t i; + for (i = 0; i < dna_phred_nthash.get_hash_num(); ++i) { + TEST_ASSERT_EQ(dna_phred_nthash.hashes()[i], rna_phred_nthash.hashes()[i]); + } + TEST_ASSERT_EQ(i, 3); + } + + return 0; +} diff --git a/tests/util.cpp b/tests/util.cpp index 94db0ebc..3d8de971 100644 --- a/tests/util.cpp +++ b/tests/util.cpp @@ -2,6 +2,7 @@ #include "helpers.hpp" #include +#include int main() @@ -37,5 +38,11 @@ main() TEST_ASSERT_LT(std::abs(avg2 - 3.5), 1e-4); TEST_ASSERT_LT(std::abs(avg3 - 6.15), 1e-4); + std::vector test_vec{ 7, 2, 3, 0, 5, 10, 3, 12, 18 }; + btllib::RangeMinimumQuery> rmq(test_vec, test_vec.size()); + TEST_ASSERT_EQ(test_vec.at(rmq.query(0, 4)), 0); + TEST_ASSERT_EQ(test_vec.at(rmq.query(4, 7)), 3); + TEST_ASSERT_EQ(test_vec.at(rmq.query(7, 8)), 12); + return 0; -} \ No newline at end of file +}