From 593bf46f17a8a256d6265e84b262b3d8ea144a61 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Thu, 13 Jul 2023 17:55:38 -0700 Subject: [PATCH 1/6] util: add rmq utility --- include/btllib/util.hpp | 351 +++++++++++++++++++++++++--------------- src/btllib/util.cpp | 1 - tests/util.cpp | 9 +- 3 files changed, 230 insertions(+), 131 deletions(-) diff --git a/include/btllib/util.hpp b/include/btllib/util.hpp index 03f73920..17878444 100644 --- a/include/btllib/util.hpp +++ b/include/btllib/util.hpp @@ -5,147 +5,240 @@ #define BTLLIB_UTIL_HPP #include "btllib/cstring.hpp" +#include "btllib/status.hpp" +#include #include #include #include #include -namespace btllib { - -/** - * Split a string into component substrings with a delimiter. - * - * @param s String to split. - * @param delim Delimiter to split with. - * - * @return Vector of substrings delimited by `delim`, excluding delimiters - * themselves. - */ -std::vector -split(const std::string& s, const std::string& delim); - -/** - * Join a vector of strings into a single string with a delimiter. - * - * @param s Vector of strings to join. - * @param delim Delimiter to join the strings with. - * - * @return String with all the components joined. - */ -std::string -join(const std::vector& s, const std::string& delim); - -/** - * Trim whitespace on the left side of the given string. - * - * @param s String to trim, edited in-place. - * - */ -void -ltrim(std::string& s); -void -ltrim(btllib::CString& s); - -/** - * Trim whitespace on the right side of the given string. - * - * @param s String to trim, edited in-place. - * - */ -void -rtrim(std::string& s); -void -rtrim(btllib::CString& s); - -/** - * Trim whitespace on the left and right side of the given string. - * - * @param s String to trim, edited in-place. - * - */ -void -trim(std::string& s); -void -trim(btllib::CString& s); - -/** - * Check whether the given string starts with a prefix. - * - * @param s String to check. - * @param prefix Prefix to check for. - * - */ -bool -startswith(std::string s, std::string prefix); - -/** - * Check whether the given string ends with a suffix. - * - * @param s String to check. - * @param suffix Suffix to check for. - * - */ -bool -endswith(std::string s, std::string suffix); - -/** - * Equivalent to the GNU implementation of basename, - * but returns a string copy of the result. - * - * @param path The path to get basename from. - * - * @return The basename of the path. - */ -std::string -get_basename(const std::string& path); - -/** - * Equivalent to the GNU implementation of dirname, - * but returns a string copy of the result. - * - * @param path The path to get dirname from. - * - * @return The dirname of the path. - */ -std::string -get_dirname(const std::string& path); - -/** - * Calculate the average phred score of a string, - * depending on the start position and length. - * - * @param qual The quality string to calculate the average from. - * @param start_pos The start position of the substring. Defaults to 0. - * @param len The length of the substring. Defaults to 0. If 0, the whole string - * is used. - * - * @return The average phred score of the substring. - */ -double -calc_phred_avg(const std::string& qual, size_t start_pos = 0, size_t len = 0); - -// This exists in C++20, but we don't support that yet -/// @cond HIDDEN_SYMBOLS -class Barrier +namespace btllib { -public: - Barrier(const unsigned count) - : counter_default(count) + static constexpr double PHRED_OFFSET = 33.0; + + /** + * Split a string into component substrings with a delimiter. + * + * @param s String to split. + * @param delim Delimiter to split with. + * + * @return Vector of substrings delimited by `delim`, excluding delimiters + * themselves. + */ + std::vector + split(const std::string &s, const std::string &delim); + + /** + * Join a vector of strings into a single string with a delimiter. + * + * @param s Vector of strings to join. + * @param delim Delimiter to join the strings with. + * + * @return String with all the components joined. + */ + std::string + join(const std::vector &s, const std::string &delim); + + /** + * Trim whitespace on the left side of the given string. + * + * @param s String to trim, edited in-place. + * + */ + void + ltrim(std::string &s); + void + ltrim(btllib::CString &s); + + /** + * Trim whitespace on the right side of the given string. + * + * @param s String to trim, edited in-place. + * + */ + void + rtrim(std::string &s); + void + rtrim(btllib::CString &s); + + /** + * Trim whitespace on the left and right side of the given string. + * + * @param s String to trim, edited in-place. + * + */ + void + trim(std::string &s); + void + trim(btllib::CString &s); + + /** + * Check whether the given string starts with a prefix. + * + * @param s String to check. + * @param prefix Prefix to check for. + * + */ + bool + startswith(std::string s, std::string prefix); + + /** + * Check whether the given string ends with a suffix. + * + * @param s String to check. + * @param suffix Suffix to check for. + * + */ + bool + endswith(std::string s, std::string suffix); + + /** + * Equivalent to the GNU implementation of basename, + * but returns a string copy of the result. + * + * @param path The path to get basename from. + * + * @return The basename of the path. + */ + std::string + get_basename(const std::string &path); + + /** + * Equivalent to the GNU implementation of dirname, + * but returns a string copy of the result. + * + * @param path The path to get dirname from. + * + * @return The dirname of the path. + */ + std::string + get_dirname(const std::string &path); + + /** + * Calculate the average phred score of a string, + * depending on the start position and length. + * + * @param qual The quality string to calculate the average from. + * @param start_pos The start position of the substring. Defaults to 0. + * @param len The length of the substring. Defaults to 0. If 0, the whole string + * is used. + * + * @return The average phred score of the substring. + */ + double + calc_phred_avg(const std::string &qual, size_t start_pos = 0, size_t len = 0); + + /** + * Range minimum query class. + * Finds the index of the minimum value in a range of any iterable that can be indexed via [x] + * where x is the index corresponding to the element in the iterable. + * @tparam T The type of the iterable. + */ + template + class RangeMinimumQuery + { + public: + /** + * Constructor for RangeMinimumQuery. + * @param array The iterable to be queried. + * @param size_of_array The size of the iterable. + */ + RangeMinimumQuery(const T &array, size_t size_of_array); + /** + * Query the index of the minimum value in the range [start, end]. + * @param start The start index of the range. + * @param end The end index of the range. + * @return The index of the minimum value in the range. + * @pre start <= end + * @pre end < size_of_array + * @pre start >= 0 + * + * @post The return value is in the range [start, end] + * @post The return value is the index of the minimum value in the range. + */ + size_t query(size_t start, size_t end); + + private: + //* The lookup table for the query. + std::vector> lookup_table; + }; + + // adapted from https://www.geeksforgeeks.org/range-minimum-query-for-static-array/ + template + RangeMinimumQuery::RangeMinimumQuery(const T &array, size_t size_of_array) { + size_t log_size = (size_t)(log2(size_of_array)) + 1; // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + lookup_table.resize(log_size, std::vector(size_of_array)); + for (size_t i = 0; i < size_of_array; i++) + { + lookup_table[0][i] = i; + } + for (size_t j = 1; j < log_size; j++) + { + for (size_t i = 0; i + (1 << j) - 1 < size_of_array; i++) + { + if (array[lookup_table[j - 1][i]] < array[lookup_table[j - 1][i + (1 << (j - 1))]]) + { + lookup_table[j][i] = lookup_table[j - 1][i]; + } + else + { + lookup_table[j][i] = lookup_table[j - 1][i + (1 << (j - 1))]; + } + } + } } - void wait(); + template + size_t RangeMinimumQuery::query(size_t start, size_t end) + { + if (start > end) + { + log_error("RangeMinimumQuery::query: start > end"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (end >= lookup_table[0].size()) + { + log_error("RangeMinimumQuery::query: end >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (start >= lookup_table[0].size()) + { + log_error("RangeMinimumQuery::query: start >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + + auto j = (size_t)(log2(end - start + 1)); // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + if (lookup_table[j][start] <= lookup_table[j][end - (1 << j) + 1]) + { + return lookup_table[j][start]; + } + return lookup_table[j][end - (1 << j) + 1]; + } + + // This exists in C++20, but we don't support that yet + /// @cond HIDDEN_SYMBOLS + class Barrier + { -private: - std::mutex m; - std::condition_variable cv; - unsigned counter{ 0 }; - unsigned counter_default; - unsigned waiting{ 0 }; -}; -/// @endcond + public: + Barrier(const unsigned count) + : counter_default(count) + { + } + + void wait(); + + private: + std::mutex m; + std::condition_variable cv; + unsigned counter{0}; + unsigned counter_default; + unsigned waiting{0}; + }; + /// @endcond } // namespace btllib diff --git a/src/btllib/util.cpp b/src/btllib/util.cpp index f210aae9..8cafc5e1 100644 --- a/src/btllib/util.cpp +++ b/src/btllib/util.cpp @@ -176,7 +176,6 @@ calc_phred_avg(const std::string& qual, const size_t start_pos, size_t len) phred_sum += (size_t)qual.at(i); } - static constexpr double PHRED_OFFSET = 33.0; return ((double)phred_sum / (double)len) - PHRED_OFFSET; } diff --git a/tests/util.cpp b/tests/util.cpp index 94db0ebc..bdcf5041 100644 --- a/tests/util.cpp +++ b/tests/util.cpp @@ -2,6 +2,7 @@ #include "helpers.hpp" #include +#include int main() @@ -35,7 +36,13 @@ main() TEST_ASSERT_LT(std::abs(avg - 6.4), 1e-4); TEST_ASSERT_LT(std::abs(avg1 - 10.949), 1e-4); TEST_ASSERT_LT(std::abs(avg2 - 3.5), 1e-4); - TEST_ASSERT_LT(std::abs(avg3 - 6.15), 1e-4); + TEST_ASSERT_LT(std::abs(avg3 - 6.15), 1e-4);\ + + std::vector test_vec{ 7, 2, 3, 0, 5, 10, 3, 12, 18 }; + btllib::RangeMinimumQuery> rmq(test_vec, test_vec.size()); + TEST_ASSERT_EQ(test_vec.at(rmq.query(0, 4)), 0); + TEST_ASSERT_EQ(test_vec.at(rmq.query(4, 7)), 3); + TEST_ASSERT_EQ(test_vec.at(rmq.query(7, 8)), 12); return 0; } \ No newline at end of file From a912dcdd4ad7d85409abc0e575b231c9bb998a84 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Thu, 13 Jul 2023 17:56:55 -0700 Subject: [PATCH 2/6] add PhredNtHash class --- include/btllib/nthash.hpp | 1 + include/btllib/phred_nthash.hpp | 116 ++++++++++++++++++ src/btllib/phred_nthash.cpp | 97 +++++++++++++++ tests/phred_nthash.cpp | 202 ++++++++++++++++++++++++++++++++ 4 files changed, 416 insertions(+) create mode 100644 include/btllib/phred_nthash.hpp create mode 100644 src/btllib/phred_nthash.cpp create mode 100644 tests/phred_nthash.cpp diff --git a/include/btllib/nthash.hpp b/include/btllib/nthash.hpp index 20dc16da..23a698b0 100644 --- a/include/btllib/nthash.hpp +++ b/include/btllib/nthash.hpp @@ -151,6 +151,7 @@ class NtHash bool forward() const { return forward_hash <= reverse_hash; } unsigned get_hash_num() const { return hash_num; } unsigned get_k() const { return k; } + size_t get_seq_len() const { return seq_len; } uint64_t get_forward_hash() const { return forward_hash; } uint64_t get_reverse_hash() const { return reverse_hash; } diff --git a/include/btllib/phred_nthash.hpp b/include/btllib/phred_nthash.hpp new file mode 100644 index 00000000..62fdace2 --- /dev/null +++ b/include/btllib/phred_nthash.hpp @@ -0,0 +1,116 @@ +#ifndef BTLLIB_PHRED_NTHASH_HPP +#define BTLLIB_PHRED_NTHASH_HPP + +#include "btllib/nthash.hpp" +#include "btllib/util.hpp" + +#include +#include + +namespace btllib +{ + /** + * NtHash with Phred score filtering. + */ + class PhredNtHash : private NtHash + { + public: + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char *seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, const std::string &quality_string, size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, + size_t phred_min, const std::string &quality_string, size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char *seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, const char *quality_string, size_t pos = 0); + + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, + size_t phred_min, const char *quality_string, size_t pos = 0); + + /** + * Roll the hash forward by one base. If the next k-mer contains a base with + * a Phred score below `phred_min`, the hash will be rolled forward until a + * k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the end of the sequence is reached or + * no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll(); + /** + * Roll the hash backward by one base. If the previous k-mer contains a base + * with a Phred score below `phred_min`, the hash will be rolled backward + * until a k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the start of the sequence is reached or + * no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll_back(); + const uint64_t *hashes() const { return NtHash::hashes(); } + + /** + * Get the position of last hashed k-mer or the k-mer to be hashed if roll() + * has never been called on this NtHash object. + */ + size_t get_pos() const { return NtHash::get_pos(); } + bool forward() const { return NtHash::forward(); } + unsigned get_hash_num() const { return NtHash::get_hash_num(); } + unsigned get_k() const { return NtHash::get_k(); } + size_t get_seq_len() const { return NtHash::get_seq_len(); } + uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); } + uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); } + + private: + const char *qual_seq; + size_t phred_min; + RangeMinimumQuery rmq; + }; + +} // namespace btllib + +#endif // BTLLIB_PHRED_NTHASH_HPP \ No newline at end of file diff --git a/src/btllib/phred_nthash.cpp b/src/btllib/phred_nthash.cpp new file mode 100644 index 00000000..0af7b586 --- /dev/null +++ b/src/btllib/phred_nthash.cpp @@ -0,0 +1,97 @@ +#include "btllib/phred_nthash.hpp" + +namespace btllib +{ + PhredNtHash::PhredNtHash(const char *seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, const std::string &quality_string, size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos), qual_seq(quality_string.c_str()), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq_len) + { + } + + PhredNtHash::PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, + size_t phred_min, const std::string &quality_string, size_t pos) + : NtHash(seq, hash_num, k, pos), qual_seq(quality_string.c_str()), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq.length()) + { + } + + PhredNtHash::PhredNtHash(const char *seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + size_t phred_min, const char *quality_string, size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos), qual_seq(quality_string), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq_len) + { + } + + PhredNtHash::PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, + size_t phred_min, const char *quality_string, size_t pos) + : NtHash(seq, hash_num, k, pos), qual_seq(quality_string), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq.length()) + { + } + + bool PhredNtHash::roll() + { + bool success = NtHash::roll(); + if (!success) + { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t seq_len = NtHash::get_seq_len(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + size_t min_phred = (size_t)qual_seq[min_phred_idx]; + while (min_phred < phred_min) + { + // check next kmer range does not exceed sequence length + if (min_phred_idx + k >= seq_len) + { + return false; + } + curr_pos = min_phred_idx + 1; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } + + while (curr_pos != NtHash::get_pos()) + { + success = NtHash::roll(); + } + + return success; + } + + bool PhredNtHash::roll_back() + { + bool success = NtHash::roll_back(); + if (!success) + { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + size_t min_phred = (size_t)qual_seq[min_phred_idx]; + while (min_phred < phred_min) + { + // check next kmer range does not exceed sequence length + if (min_phred_idx - k >= NtHash::get_seq_len()) + { + return false; + } + curr_pos = min_phred_idx - k; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } + + while (curr_pos != NtHash::get_pos()) + { + success = NtHash::roll_back(); + } + + return success; + } +} // namespace btllib \ No newline at end of file diff --git a/tests/phred_nthash.cpp b/tests/phred_nthash.cpp new file mode 100644 index 00000000..cc423706 --- /dev/null +++ b/tests/phred_nthash.cpp @@ -0,0 +1,202 @@ +#include "btllib/phred_nthash.hpp" +#include "btllib/seq.hpp" +#include "btllib/util.hpp" +#include "helpers.hpp" + +#include +#include +#include +#include + +int +main() +{ + + { + PRINT_TEST_NAME("Test skip due to base with low Phred score") + + std::string seq = "ACATGCCATGC"; + std::string qual = "11111011111"; + const unsigned k = 5; + const unsigned h = 3; + + const std::vector> hashes = { + { 0xf59ecb45f0e22b9c, 0x4969c33ac240c129, 0x688d616f0d7e08c3 }, + { 0x38cc00f940aebdae, 0xab7e1b110e086fc6, 0x11a1818bcfdd553 } + }; + + btllib::PhredNtHash phred_nthash(seq, h, k, 16, qual); + + for (const auto& h_vals : hashes) { + phred_nthash.roll(); + TEST_ASSERT_ARRAY_EQ(h_vals, phred_nthash.hashes(), h); + } + + } + + { + PRINT_TEST_NAME("k-mer hash values") + + std::string seq = "ACATGCATGCA"; + std::string qual = "$$%%)*0)'%%"; + const unsigned k = 5; + const unsigned h = 3; + + const std::vector> hashes = { + { 0xf59ecb45f0e22b9c, 0x4969c33ac240c129, 0x688d616f0d7e08c3 }, + { 0x38cc00f940aebdae, 0xab7e1b110e086fc6, 0x11a1818bcfdd553 }, + { 0x603a48c5a11c794a, 0xe66016e61816b9c4, 0xc5b13cb146996ffe } + }; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + + for (const auto& h_vals : hashes) { + phred_nthash.roll(); + TEST_ASSERT_ARRAY_EQ(h_vals, phred_nthash.hashes(), h); + } + } + + { + PRINT_TEST_NAME("k-mer rolling") + + std::string seq = "AGTCAGTC"; + std::string qual = "$$%%)*0)"; + unsigned h = 3; + unsigned k = 4; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + std::vector hashes; + + while (phred_nthash.roll()) { + uint64_t* h_vals = new uint64_t[h]; + std::copy(phred_nthash.hashes(), phred_nthash.hashes() + h, h_vals); + hashes.push_back(h_vals); + } + + TEST_ASSERT_EQ(hashes.size(), seq.length() - k + 1); + + // check same hash value for identical k-mers (first and last) + TEST_ASSERT_ARRAY_EQ(hashes[0], hashes[hashes.size() - 1], h); + } + + { + PRINT_TEST_NAME("k-mer rolling vs ntbase hash values") + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + + btllib::PhredNtHash phred_nthash(seq, 3, seq.size() - 2, 0, qual); + /* 18-mers of kmer*/ + std::string kmer1 = "ACGTACACTGGACTGAGT"; + std::string kmer2 = "CGTACACTGGACTGAGTC"; + std::string kmer3 = "GTACACTGGACTGAGTCT"; + + btllib::PhredNtHash phred_nthash_vector[] = { + btllib::PhredNtHash(kmer1, phred_nthash.get_hash_num(), kmer1.size(), 0, qual), + btllib::PhredNtHash(kmer2, phred_nthash.get_hash_num(), kmer2.size(), 0, qual), + btllib::PhredNtHash(kmer3, phred_nthash.get_hash_num(), kmer3.size(), 0, qual) + }; + + size_t i; + for (i = 0; phred_nthash.roll() && phred_nthash_vector[i].roll(); ++i) { + for (size_t j = 0; j < phred_nthash.get_hash_num(); ++j) { + TEST_ASSERT_EQ(phred_nthash.hashes()[j], phred_nthash_vector[i].hashes()[j]); + } + } + TEST_ASSERT_EQ(i, 3); + } + + { + PRINT_TEST_NAME("canonical hashing") + + std::string seq_f = "ACGTACACTGGACTGAGTCT"; + std::string seq_r = "AGACTCAGTCCAGTGTACGT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + + unsigned h = 3; + + btllib::PhredNtHash phred_nthash_f(seq_f, h, seq_f.size(), 0, qual); + btllib::PhredNtHash phred_nthash_r(seq_r, h, seq_r.size(), 0, qual); + + phred_nthash_f.roll(); + phred_nthash_r.roll(); + TEST_ASSERT_EQ(phred_nthash_f.get_hash_num(), phred_nthash_r.get_hash_num()) + TEST_ASSERT_ARRAY_EQ(phred_nthash_f.hashes(), phred_nthash_r.hashes(), h) + } + + { + PRINT_TEST_NAME("k-mer back rolling") + + std::string seq = "ACTAGCTG"; + std::string qual = "$$%%)*0%"; + unsigned h = 3; + unsigned k = 5; + + btllib::PhredNtHash phred_nthash(seq, h, k, 0, qual); + std::stack hashes; + + while (phred_nthash.roll()) { + uint64_t* h_vals = new uint64_t[h]; + std::copy(phred_nthash.hashes(), phred_nthash.hashes() + h, h_vals); + hashes.push(h_vals); + } + + TEST_ASSERT_EQ(hashes.size(), seq.length() - k + 1) + + do { + TEST_ASSERT_ARRAY_EQ(phred_nthash.hashes(), hashes.top(), h) + hashes.pop(); + } while (phred_nthash.roll_back()); + } + + { + PRINT_TEST_NAME("skipping Ns") + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + std::string seq_with_ns = seq; + + TEST_ASSERT_GE(seq_with_ns.size(), 10) + seq_with_ns[seq_with_ns.size() / 2] = 'N'; + seq_with_ns[seq_with_ns.size() / 2 + 1] = 'N'; + unsigned k = (seq.size() - 2) / 2 - 1; + btllib::PhredNtHash phred_nthash(seq_with_ns, 3, k, 0, qual); + + std::vector positions; + for (size_t i = 0; i < seq_with_ns.size() / 2 - k + 1; i++) { + positions.push_back(i); + } + for (size_t i = seq_with_ns.size() / 2 + 2; i < seq_with_ns.size() - k + 1; + i++) { + positions.push_back(i); + } + + size_t i = 0; + while (phred_nthash.roll()) { + TEST_ASSERT_EQ(phred_nthash.get_pos(), positions[i]) + i++; + } + TEST_ASSERT_EQ(positions.size(), i) + } + + { + std::cerr << "Testing RNA" << std::endl; + + std::string seq = "ACGTACACTGGACTGAGTCT"; + std::string qual = "$$%%)*0)'%%%%)*0)'%%"; + btllib::PhredNtHash dna_phred_nthash(seq, 3, 20, 0, qual); + + std::string rna_seq = "ACGUACACUGGACUGAGUCU"; + btllib::PhredNtHash rna_phred_nthash(rna_seq, 3, 20, 0, qual); + + dna_phred_nthash.roll(); + rna_phred_nthash.roll(); + size_t i; + for (i = 0; i < dna_phred_nthash.get_hash_num(); ++i) { + TEST_ASSERT_EQ(dna_phred_nthash.hashes()[i], rna_phred_nthash.hashes()[i]); + } + TEST_ASSERT_EQ(i, 3); + } + + return 0; +} From 020a5a326748166f42d3411a17732ef34ee70bab Mon Sep 17 00:00:00 2001 From: jowong04 Date: Thu, 13 Jul 2023 18:31:52 -0700 Subject: [PATCH 3/6] clang-format --- include/btllib/phred_nthash.hpp | 205 ++++++++-------- include/btllib/util.hpp | 400 ++++++++++++++++---------------- src/btllib/phred_nthash.cpp | 183 ++++++++------- 3 files changed, 407 insertions(+), 381 deletions(-) diff --git a/include/btllib/phred_nthash.hpp b/include/btllib/phred_nthash.hpp index 62fdace2..7ef2462d 100644 --- a/include/btllib/phred_nthash.hpp +++ b/include/btllib/phred_nthash.hpp @@ -4,112 +4,123 @@ #include "btllib/nthash.hpp" #include "btllib/util.hpp" -#include #include +#include -namespace btllib +namespace btllib { +/** + * NtHash with Phred score filtering. + */ +class PhredNtHash : private NtHash { - /** - * NtHash with Phred score filtering. - */ - class PhredNtHash : private NtHash - { - public: - /** - * Constructor for PhredNtHash. - * @param seq Sequence to hash. - * @param seq_len Length of `seq`. - * @param hash_num Number of hashes to compute. - * @param k Length of k-mer. - * @param phred_min Minimum Phred score for a base to be included in the hash. - * @param quality_string String of Phred scores for each base in `seq`. - * @param pos Position to start hashing from. - * @return PhredNtHash object. - */ - PhredNtHash(const char *seq, - size_t seq_len, - unsigned hash_num, - unsigned k, - size_t phred_min, const std::string &quality_string, size_t pos = 0); +public: + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + std::string_view quality_string, + size_t pos = 0); - /** - * Constructor for PhredNtHash. - * @param seq Sequence to hash. - * @param hash_num Number of hashes to compute. - * @param k Length of k-mer. - * @param phred_min Minimum Phred score for a base to be included in the hash. - * @param quality_string String of Phred scores for each base in `seq`. - * @param pos Position to start hashing from. - * @return PhredNtHash object. - */ - PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, - size_t phred_min, const std::string &quality_string, size_t pos = 0); + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + std::string_view quality_string, + size_t pos = 0); - /** - * Constructor for PhredNtHash. - * @param seq Sequence to hash. - * @param seq_len Length of `seq`. - * @param hash_num Number of hashes to compute. - * @param k Length of k-mer. - * @param phred_min Minimum Phred score for a base to be included in the hash. - * @param quality_string String of Phred scores for each base in `seq`. - * @param pos Position to start hashing from. - * @return PhredNtHash object. - */ - PhredNtHash(const char *seq, - size_t seq_len, - unsigned hash_num, - unsigned k, - size_t phred_min, const char *quality_string, size_t pos = 0); + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param seq_len Length of `seq`. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + const char* quality_string, + size_t pos = 0); - /** - * Constructor for PhredNtHash. - * @param seq Sequence to hash. - * @param hash_num Number of hashes to compute. - * @param k Length of k-mer. - * @param phred_min Minimum Phred score for a base to be included in the hash. - * @param quality_string String of Phred scores for each base in `seq`. - * @param pos Position to start hashing from. - * @return PhredNtHash object. - */ - PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, - size_t phred_min, const char *quality_string, size_t pos = 0); + /** + * Constructor for PhredNtHash. + * @param seq Sequence to hash. + * @param hash_num Number of hashes to compute. + * @param k Length of k-mer. + * @param phred_min Minimum Phred score for a base to be included in the hash. + * @param quality_string String of Phred scores for each base in `seq`. + * @param pos Position to start hashing from. + * @return PhredNtHash object. + */ + PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + const char* quality_string, + size_t pos = 0); - /** - * Roll the hash forward by one base. If the next k-mer contains a base with - * a Phred score below `phred_min`, the hash will be rolled forward until a - * k-mer with all bases with a Phred score above `phred_min` is found. - * @return True if successful, false if the end of the sequence is reached or - * no k-mer with all bases with a Phred score above `phred_min` is found. - */ - bool roll(); - /** - * Roll the hash backward by one base. If the previous k-mer contains a base - * with a Phred score below `phred_min`, the hash will be rolled backward - * until a k-mer with all bases with a Phred score above `phred_min` is found. - * @return True if successful, false if the start of the sequence is reached or - * no k-mer with all bases with a Phred score above `phred_min` is found. - */ - bool roll_back(); - const uint64_t *hashes() const { return NtHash::hashes(); } + /** + * Roll the hash forward by one base. If the next k-mer contains a base with + * a Phred score below `phred_min`, the hash will be rolled forward until a + * k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the end of the sequence is reached or + * no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll(); + /** + * Roll the hash backward by one base. If the previous k-mer contains a base + * with a Phred score below `phred_min`, the hash will be rolled backward + * until a k-mer with all bases with a Phred score above `phred_min` is found. + * @return True if successful, false if the start of the sequence is reached + * or no k-mer with all bases with a Phred score above `phred_min` is found. + */ + bool roll_back(); + const uint64_t* hashes() const { return NtHash::hashes(); } - /** - * Get the position of last hashed k-mer or the k-mer to be hashed if roll() - * has never been called on this NtHash object. - */ - size_t get_pos() const { return NtHash::get_pos(); } - bool forward() const { return NtHash::forward(); } - unsigned get_hash_num() const { return NtHash::get_hash_num(); } - unsigned get_k() const { return NtHash::get_k(); } - size_t get_seq_len() const { return NtHash::get_seq_len(); } - uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); } - uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); } + /** + * Get the position of last hashed k-mer or the k-mer to be hashed if roll() + * has never been called on this NtHash object. + */ + size_t get_pos() const { return NtHash::get_pos(); } + bool forward() const { return NtHash::forward(); } + unsigned get_hash_num() const { return NtHash::get_hash_num(); } + unsigned get_k() const { return NtHash::get_k(); } + size_t get_seq_len() const { return NtHash::get_seq_len(); } + uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); } + uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); } - private: - const char *qual_seq; - size_t phred_min; - RangeMinimumQuery rmq; - }; +private: + const char* qual_seq; + unsigned char phred_min; + RangeMinimumQuery rmq; +}; } // namespace btllib diff --git a/include/btllib/util.hpp b/include/btllib/util.hpp index 17878444..99a32c52 100644 --- a/include/btllib/util.hpp +++ b/include/btllib/util.hpp @@ -13,232 +13,230 @@ #include #include -namespace btllib -{ +namespace btllib { - static constexpr double PHRED_OFFSET = 33.0; +static constexpr double PHRED_OFFSET = 33.0; - /** - * Split a string into component substrings with a delimiter. - * - * @param s String to split. - * @param delim Delimiter to split with. - * - * @return Vector of substrings delimited by `delim`, excluding delimiters - * themselves. - */ - std::vector - split(const std::string &s, const std::string &delim); +/** + * Split a string into component substrings with a delimiter. + * + * @param s String to split. + * @param delim Delimiter to split with. + * + * @return Vector of substrings delimited by `delim`, excluding delimiters + * themselves. + */ +std::vector +split(const std::string& s, const std::string& delim); - /** - * Join a vector of strings into a single string with a delimiter. - * - * @param s Vector of strings to join. - * @param delim Delimiter to join the strings with. - * - * @return String with all the components joined. - */ - std::string - join(const std::vector &s, const std::string &delim); +/** + * Join a vector of strings into a single string with a delimiter. + * + * @param s Vector of strings to join. + * @param delim Delimiter to join the strings with. + * + * @return String with all the components joined. + */ +std::string +join(const std::vector& s, const std::string& delim); - /** - * Trim whitespace on the left side of the given string. - * - * @param s String to trim, edited in-place. - * - */ - void - ltrim(std::string &s); - void - ltrim(btllib::CString &s); +/** + * Trim whitespace on the left side of the given string. + * + * @param s String to trim, edited in-place. + * + */ +void +ltrim(std::string& s); +void +ltrim(btllib::CString& s); - /** - * Trim whitespace on the right side of the given string. - * - * @param s String to trim, edited in-place. - * - */ - void - rtrim(std::string &s); - void - rtrim(btllib::CString &s); +/** + * Trim whitespace on the right side of the given string. + * + * @param s String to trim, edited in-place. + * + */ +void +rtrim(std::string& s); +void +rtrim(btllib::CString& s); - /** - * Trim whitespace on the left and right side of the given string. - * - * @param s String to trim, edited in-place. - * - */ - void - trim(std::string &s); - void - trim(btllib::CString &s); +/** + * Trim whitespace on the left and right side of the given string. + * + * @param s String to trim, edited in-place. + * + */ +void +trim(std::string& s); +void +trim(btllib::CString& s); - /** - * Check whether the given string starts with a prefix. - * - * @param s String to check. - * @param prefix Prefix to check for. - * - */ - bool - startswith(std::string s, std::string prefix); +/** + * Check whether the given string starts with a prefix. + * + * @param s String to check. + * @param prefix Prefix to check for. + * + */ +bool +startswith(std::string s, std::string prefix); - /** - * Check whether the given string ends with a suffix. - * - * @param s String to check. - * @param suffix Suffix to check for. - * - */ - bool - endswith(std::string s, std::string suffix); +/** + * Check whether the given string ends with a suffix. + * + * @param s String to check. + * @param suffix Suffix to check for. + * + */ +bool +endswith(std::string s, std::string suffix); - /** - * Equivalent to the GNU implementation of basename, - * but returns a string copy of the result. - * - * @param path The path to get basename from. - * - * @return The basename of the path. - */ - std::string - get_basename(const std::string &path); +/** + * Equivalent to the GNU implementation of basename, + * but returns a string copy of the result. + * + * @param path The path to get basename from. + * + * @return The basename of the path. + */ +std::string +get_basename(const std::string& path); - /** - * Equivalent to the GNU implementation of dirname, - * but returns a string copy of the result. - * - * @param path The path to get dirname from. - * - * @return The dirname of the path. - */ - std::string - get_dirname(const std::string &path); +/** + * Equivalent to the GNU implementation of dirname, + * but returns a string copy of the result. + * + * @param path The path to get dirname from. + * + * @return The dirname of the path. + */ +std::string +get_dirname(const std::string& path); +/** + * Calculate the average phred score of a string, + * depending on the start position and length. + * + * @param qual The quality string to calculate the average from. + * @param start_pos The start position of the substring. Defaults to 0. + * @param len The length of the substring. Defaults to 0. If 0, the whole string + * is used. + * + * @return The average phred score of the substring. + */ +double +calc_phred_avg(const std::string& qual, size_t start_pos = 0, size_t len = 0); + +/** + * Range minimum query class. + * Finds the index of the minimum value in a range of any iterable that can be + * indexed via [x] where x is the index corresponding to the element in the + * iterable. + * @tparam T The type of the iterable. + */ +template +class RangeMinimumQuery +{ +public: /** - * Calculate the average phred score of a string, - * depending on the start position and length. - * - * @param qual The quality string to calculate the average from. - * @param start_pos The start position of the substring. Defaults to 0. - * @param len The length of the substring. Defaults to 0. If 0, the whole string - * is used. - * - * @return The average phred score of the substring. + * Constructor for RangeMinimumQuery. + * @param array The iterable to be queried. + * @param size_of_array The size of the iterable. */ - double - calc_phred_avg(const std::string &qual, size_t start_pos = 0, size_t len = 0); - + RangeMinimumQuery(const T& array, size_t size_of_array); /** - * Range minimum query class. - * Finds the index of the minimum value in a range of any iterable that can be indexed via [x] - * where x is the index corresponding to the element in the iterable. - * @tparam T The type of the iterable. + * Query the index of the minimum value in the range [start, end]. + * @param start The start index of the range. + * @param end The end index of the range. + * @return The index of the minimum value in the range. + * @pre start <= end + * @pre end < size_of_array + * @pre start >= 0 + * + * @post The return value is in the range [start, end] + * @post The return value is the index of the minimum value in the range. */ - template - class RangeMinimumQuery - { - public: - /** - * Constructor for RangeMinimumQuery. - * @param array The iterable to be queried. - * @param size_of_array The size of the iterable. - */ - RangeMinimumQuery(const T &array, size_t size_of_array); - /** - * Query the index of the minimum value in the range [start, end]. - * @param start The start index of the range. - * @param end The end index of the range. - * @return The index of the minimum value in the range. - * @pre start <= end - * @pre end < size_of_array - * @pre start >= 0 - * - * @post The return value is in the range [start, end] - * @post The return value is the index of the minimum value in the range. - */ - size_t query(size_t start, size_t end); - - private: - //* The lookup table for the query. - std::vector> lookup_table; - }; - - // adapted from https://www.geeksforgeeks.org/range-minimum-query-for-static-array/ - template - RangeMinimumQuery::RangeMinimumQuery(const T &array, size_t size_of_array) - { - size_t log_size = (size_t)(log2(size_of_array)) + 1; // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - lookup_table.resize(log_size, std::vector(size_of_array)); - for (size_t i = 0; i < size_of_array; i++) - { - lookup_table[0][i] = i; - } - for (size_t j = 1; j < log_size; j++) - { - for (size_t i = 0; i + (1 << j) - 1 < size_of_array; i++) - { - if (array[lookup_table[j - 1][i]] < array[lookup_table[j - 1][i + (1 << (j - 1))]]) - { - lookup_table[j][i] = lookup_table[j - 1][i]; - } - else - { - lookup_table[j][i] = lookup_table[j - 1][i + (1 << (j - 1))]; - } + size_t query(size_t start, size_t end); + +private: + //* The lookup table for the query. + std::vector> lookup_table; +}; + +// adapted from +// https://www.geeksforgeeks.org/range-minimum-query-for-static-array/ +template +RangeMinimumQuery::RangeMinimumQuery(const T& array, size_t size_of_array) +{ + size_t log_size = + (size_t)(log2( + size_of_array)) + // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + 1; + lookup_table.resize(log_size, std::vector(size_of_array)); + for (size_t i = 0; i < size_of_array; i++) { + lookup_table[0][i] = i; + } + for (size_t j = 1; j < log_size; j++) { + for (size_t i = 0; i + (1 << j) - 1 < size_of_array; i++) { + if (array[lookup_table[j - 1][i]] < + array[lookup_table[j - 1][i + (1 << (j - 1))]]) { + lookup_table[j][i] = lookup_table[j - 1][i]; + } else { + lookup_table[j][i] = lookup_table[j - 1][i + (1 << (j - 1))]; } } } +} - template - size_t RangeMinimumQuery::query(size_t start, size_t end) - { - if (start > end) - { - log_error("RangeMinimumQuery::query: start > end"); - std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) - } - if (end >= lookup_table[0].size()) - { - log_error("RangeMinimumQuery::query: end >= lookup_table[0].size()"); - std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) - } - if (start >= lookup_table[0].size()) - { - log_error("RangeMinimumQuery::query: start >= lookup_table[0].size()"); - std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) - } +template +size_t +RangeMinimumQuery::query(size_t start, size_t end) +{ + if (start > end) { + log_error("RangeMinimumQuery::query: start > end"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (end >= lookup_table[0].size()) { + log_error("RangeMinimumQuery::query: end >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } + if (start >= lookup_table[0].size()) { + log_error("RangeMinimumQuery::query: start >= lookup_table[0].size()"); + std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe) + } - auto j = (size_t)(log2(end - start + 1)); // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - if (lookup_table[j][start] <= lookup_table[j][end - (1 << j) + 1]) - { - return lookup_table[j][start]; - } - return lookup_table[j][end - (1 << j) + 1]; + auto j = (size_t)(log2( + end - // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + start + 1)); + if (lookup_table[j][start] <= lookup_table[j][end - (1 << j) + 1]) { + return lookup_table[j][start]; } + return lookup_table[j][end - (1 << j) + 1]; +} - // This exists in C++20, but we don't support that yet - /// @cond HIDDEN_SYMBOLS - class Barrier - { +// This exists in C++20, but we don't support that yet +/// @cond HIDDEN_SYMBOLS +class Barrier +{ - public: - Barrier(const unsigned count) - : counter_default(count) - { - } +public: + Barrier(const unsigned count) + : counter_default(count) + { + } - void wait(); + void wait(); - private: - std::mutex m; - std::condition_variable cv; - unsigned counter{0}; - unsigned counter_default; - unsigned waiting{0}; - }; - /// @endcond +private: + std::mutex m; + std::condition_variable cv; + unsigned counter{ 0 }; + unsigned counter_default; + unsigned waiting{ 0 }; +}; +/// @endcond } // namespace btllib diff --git a/src/btllib/phred_nthash.cpp b/src/btllib/phred_nthash.cpp index 0af7b586..e41695f4 100644 --- a/src/btllib/phred_nthash.cpp +++ b/src/btllib/phred_nthash.cpp @@ -1,97 +1,114 @@ #include "btllib/phred_nthash.hpp" -namespace btllib +namespace btllib { +PhredNtHash::PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + std::string_view quality_string, + size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos) + , qual_seq(quality_string.data()) + , phred_min(phred_min + (size_t)PHRED_OFFSET) + , rmq(quality_string, seq_len) { - PhredNtHash::PhredNtHash(const char *seq, - size_t seq_len, - unsigned hash_num, - unsigned k, - size_t phred_min, const std::string &quality_string, size_t pos) - : NtHash(seq, seq_len, hash_num, k, pos), qual_seq(quality_string.c_str()), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq_len) - { - } +} - PhredNtHash::PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, - size_t phred_min, const std::string &quality_string, size_t pos) - : NtHash(seq, hash_num, k, pos), qual_seq(quality_string.c_str()), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq.length()) - { - } +PhredNtHash::PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + std::string_view quality_string, + size_t pos) + : NtHash(seq, hash_num, k, pos) + , qual_seq(quality_string.data()) + , phred_min(phred_min + (size_t)PHRED_OFFSET) + , rmq(quality_string, seq.length()) +{ +} - PhredNtHash::PhredNtHash(const char *seq, - size_t seq_len, - unsigned hash_num, - unsigned k, - size_t phred_min, const char *quality_string, size_t pos) - : NtHash(seq, seq_len, hash_num, k, pos), qual_seq(quality_string), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq_len) - { - } +PhredNtHash::PhredNtHash(const char* seq, + size_t seq_len, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + const char* quality_string, + size_t pos) + : NtHash(seq, seq_len, hash_num, k, pos) + , qual_seq(quality_string) + , phred_min(phred_min + (size_t)PHRED_OFFSET) + , rmq(quality_string, seq_len) +{ +} - PhredNtHash::PhredNtHash(const std::string &seq, unsigned hash_num, unsigned k, - size_t phred_min, const char *quality_string, size_t pos) - : NtHash(seq, hash_num, k, pos), qual_seq(quality_string), phred_min(phred_min + (size_t)PHRED_OFFSET), rmq(quality_string, seq.length()) - { +PhredNtHash::PhredNtHash(const std::string& seq, + unsigned hash_num, + unsigned k, + unsigned char phred_min, + const char* quality_string, + size_t pos) + : NtHash(seq, hash_num, k, pos) + , qual_seq(quality_string) + , phred_min(phred_min + (size_t)PHRED_OFFSET) + , rmq(quality_string, seq.length()) +{ +} + +bool +PhredNtHash::roll() +{ + bool success = NtHash::roll(); + if (!success) { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t seq_len = NtHash::get_seq_len(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + auto min_phred = (unsigned char)qual_seq[min_phred_idx]; + while (min_phred < phred_min) { + // check next kmer range does not exceed sequence length + if (min_phred_idx + k >= seq_len) { + return false; } + curr_pos = min_phred_idx + 1; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } - bool PhredNtHash::roll() - { - bool success = NtHash::roll(); - if (!success) - { - return false; - } - size_t curr_pos = NtHash::get_pos(); - size_t k = NtHash::get_k(); - size_t seq_len = NtHash::get_seq_len(); - size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); - size_t min_phred = (size_t)qual_seq[min_phred_idx]; - while (min_phred < phred_min) - { - // check next kmer range does not exceed sequence length - if (min_phred_idx + k >= seq_len) - { - return false; - } - curr_pos = min_phred_idx + 1; - min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); - min_phred = (size_t)qual_seq[min_phred_idx]; - } + while (curr_pos != NtHash::get_pos()) { + success = NtHash::roll(); + } - while (curr_pos != NtHash::get_pos()) - { - success = NtHash::roll(); - } + return success; +} - return success; +bool +PhredNtHash::roll_back() +{ + bool success = NtHash::roll_back(); + if (!success) { + return false; + } + size_t curr_pos = NtHash::get_pos(); + size_t k = NtHash::get_k(); + size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + auto min_phred = (unsigned char)qual_seq[min_phred_idx]; + while (min_phred < phred_min) { + // check next kmer range does not exceed sequence length + if (min_phred_idx - k >= NtHash::get_seq_len()) { + return false; } + curr_pos = min_phred_idx - k; + min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); + min_phred = (size_t)qual_seq[min_phred_idx]; + } - bool PhredNtHash::roll_back() - { - bool success = NtHash::roll_back(); - if (!success) - { - return false; - } - size_t curr_pos = NtHash::get_pos(); - size_t k = NtHash::get_k(); - size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); - size_t min_phred = (size_t)qual_seq[min_phred_idx]; - while (min_phred < phred_min) - { - // check next kmer range does not exceed sequence length - if (min_phred_idx - k >= NtHash::get_seq_len()) - { - return false; - } - curr_pos = min_phred_idx - k; - min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); - min_phred = (size_t)qual_seq[min_phred_idx]; - } + while (curr_pos != NtHash::get_pos()) { + success = NtHash::roll_back(); + } - while (curr_pos != NtHash::get_pos()) - { - success = NtHash::roll_back(); - } - - return success; - } + return success; +} } // namespace btllib \ No newline at end of file From cebddbbbf123f26df2d1ba6cbe45c361f63bc103 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Thu, 13 Jul 2023 18:32:17 -0700 Subject: [PATCH 4/6] phred_nthash: add roll_back --- tests/phred_nthash.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/phred_nthash.cpp b/tests/phred_nthash.cpp index cc423706..23977cfe 100644 --- a/tests/phred_nthash.cpp +++ b/tests/phred_nthash.cpp @@ -32,6 +32,10 @@ main() TEST_ASSERT_ARRAY_EQ(h_vals, phred_nthash.hashes(), h); } + phred_nthash.roll_back(); + TEST_ASSERT_EQ(phred_nthash.get_pos(), 0); + TEST_ASSERT_ARRAY_EQ(hashes[0], phred_nthash.hashes(), h); + } { From dc8b9b9802477f7b9bf3219030a102e2a9a6af07 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Thu, 13 Jul 2023 18:50:19 -0700 Subject: [PATCH 5/6] phred_nthash: fix constructor --- include/btllib/phred_nthash.hpp | 8 ++++---- src/btllib/phred_nthash.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/btllib/phred_nthash.hpp b/include/btllib/phred_nthash.hpp index 7ef2462d..99ad7bbe 100644 --- a/include/btllib/phred_nthash.hpp +++ b/include/btllib/phred_nthash.hpp @@ -29,7 +29,7 @@ class PhredNtHash : private NtHash size_t seq_len, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, std::string_view quality_string, size_t pos = 0); @@ -46,7 +46,7 @@ class PhredNtHash : private NtHash PhredNtHash(const std::string& seq, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, std::string_view quality_string, size_t pos = 0); @@ -65,7 +65,7 @@ class PhredNtHash : private NtHash size_t seq_len, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, const char* quality_string, size_t pos = 0); @@ -82,7 +82,7 @@ class PhredNtHash : private NtHash PhredNtHash(const std::string& seq, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, const char* quality_string, size_t pos = 0); diff --git a/src/btllib/phred_nthash.cpp b/src/btllib/phred_nthash.cpp index e41695f4..fcd6fbe0 100644 --- a/src/btllib/phred_nthash.cpp +++ b/src/btllib/phred_nthash.cpp @@ -5,12 +5,12 @@ PhredNtHash::PhredNtHash(const char* seq, size_t seq_len, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, std::string_view quality_string, size_t pos) : NtHash(seq, seq_len, hash_num, k, pos) , qual_seq(quality_string.data()) - , phred_min(phred_min + (size_t)PHRED_OFFSET) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) , rmq(quality_string, seq_len) { } @@ -18,12 +18,12 @@ PhredNtHash::PhredNtHash(const char* seq, PhredNtHash::PhredNtHash(const std::string& seq, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, std::string_view quality_string, size_t pos) : NtHash(seq, hash_num, k, pos) , qual_seq(quality_string.data()) - , phred_min(phred_min + (size_t)PHRED_OFFSET) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) , rmq(quality_string, seq.length()) { } @@ -32,12 +32,12 @@ PhredNtHash::PhredNtHash(const char* seq, size_t seq_len, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, const char* quality_string, size_t pos) : NtHash(seq, seq_len, hash_num, k, pos) , qual_seq(quality_string) - , phred_min(phred_min + (size_t)PHRED_OFFSET) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) , rmq(quality_string, seq_len) { } @@ -45,12 +45,12 @@ PhredNtHash::PhredNtHash(const char* seq, PhredNtHash::PhredNtHash(const std::string& seq, unsigned hash_num, unsigned k, - unsigned char phred_min, + size_t phred_min, const char* quality_string, size_t pos) : NtHash(seq, hash_num, k, pos) , qual_seq(quality_string) - , phred_min(phred_min + (size_t)PHRED_OFFSET) + , phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET)) , rmq(quality_string, seq.length()) { } From e03786ed10454dfc528638922f43d4bd47d467f4 Mon Sep 17 00:00:00 2001 From: JW <34543031+jwcodee@users.noreply.github.com> Date: Thu, 13 Jul 2023 18:56:25 -0700 Subject: [PATCH 6/6] util.cpp: remove typo --- tests/util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/util.cpp b/tests/util.cpp index bdcf5041..3d8de971 100644 --- a/tests/util.cpp +++ b/tests/util.cpp @@ -36,7 +36,7 @@ main() TEST_ASSERT_LT(std::abs(avg - 6.4), 1e-4); TEST_ASSERT_LT(std::abs(avg1 - 10.949), 1e-4); TEST_ASSERT_LT(std::abs(avg2 - 3.5), 1e-4); - TEST_ASSERT_LT(std::abs(avg3 - 6.15), 1e-4);\ + TEST_ASSERT_LT(std::abs(avg3 - 6.15), 1e-4); std::vector test_vec{ 7, 2, 3, 0, 5, 10, 3, 12, 18 }; btllib::RangeMinimumQuery> rmq(test_vec, test_vec.size()); @@ -45,4 +45,4 @@ main() TEST_ASSERT_EQ(test_vec.at(rmq.query(7, 8)), 12); return 0; -} \ No newline at end of file +}