diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp index 16ec54bc..9182b1a9 100644 --- a/src/analysis/hmr.cpp +++ b/src/analysis/hmr.cpp @@ -14,42 +14,42 @@ * GNU General Public License for more details. */ -#include #include +#include // for [u]int[0-9]+_t #include #include -#include +#include +#include #include +#include #include -#include // for [u]int[0-9]+_t -#include #include -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "GenomicRegion.hpp" #include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -#include "TwoStateHMM.hpp" #include "MSite.hpp" +#include "TwoStateHMM.hpp" #include "counts_header.hpp" -using std::string; -using std::vector; +using std::accumulate; +using std::cerr; using std::cout; using std::endl; -using std::cerr; -using std::numeric_limits; +using std::make_pair; using std::max; using std::min; +using std::numeric_limits; using std::pair; -using std::make_pair; +using std::reduce; using std::runtime_error; +using std::string; using std::to_string; using std::unordered_set; -using std::accumulate; -using std::reduce; +using std::vector; using bamxx::bgzf_file; @@ -58,10 +58,10 @@ struct hmr_summary { hmr_count = hmrs.size(); hmr_total_size = accumulate(cbegin(hmrs), cend(hmrs), 0ul, [](const uint64_t t, const GenomicRegion &p) { - return t + p.get_width(); }); - hmr_mean_size = - static_cast(hmr_total_size)/ - std::max(hmr_count, static_cast(1)); + return t + p.get_width(); + }); + hmr_mean_size = static_cast(hmr_total_size) / + std::max(hmr_count, static_cast(1)); } // hmr_count is the number of identified HMRs. uint64_t hmr_count{}; @@ -70,18 +70,18 @@ struct hmr_summary { // mean_hmr_size is the mean size of the identified HMRs double hmr_mean_size{}; - string tostring() { + string + tostring() { std::ostringstream oss; oss << "hmr_count: " << hmr_count << endl << "hmr_total_size: " << hmr_total_size << endl - << "hmr_mean_size: " - << std::fixed << std::setprecision(2) << hmr_mean_size; + << "hmr_mean_size: " << std::fixed << std::setprecision(2) + << hmr_mean_size; return oss.str(); } }; - static GenomicRegion as_gen_rgn(const MSite &s) { return GenomicRegion(s.chrom, s.pos, s.pos + 1); @@ -89,27 +89,29 @@ as_gen_rgn(const MSite &s) { static string format_cpg_meth_tag(const pair &m) { - return "CpG:" + to_string(static_cast(m.first)) + - ":" + to_string(static_cast(m.second)); + return "CpG:" + to_string(static_cast(m.first)) + ":" + + to_string(static_cast(m.second)); } static double get_stepup_cutoff(vector scores, const double cutoff) { - if (cutoff <= 0) return numeric_limits::max(); - else if (cutoff > 1) return numeric_limits::min(); + if (cutoff <= 0) + return numeric_limits::max(); + else if (cutoff > 1) + return numeric_limits::min(); const size_t n = scores.size(); std::sort(begin(scores), end(scores)); size_t i = 1; - while (i < n && scores[i-1] < (cutoff*i)/n) ++i; + while (i < n && scores[i - 1] < (cutoff * i) / n) + ++i; return scores[i - 1]; } static void get_domain_scores(const vector &state_ids, - const vector > &meth, - const vector &reset_points, - vector &scores) { + const vector> &meth, + const vector &reset_points, vector &scores) { size_t reset_idx = 1; bool in_domain = false; @@ -125,7 +127,7 @@ get_domain_scores(const vector &state_ids, } if (state_ids[i]) { in_domain = true; - score += 1.0 - (meth[i].first/(meth[i].first + meth[i].second)); + score += 1.0 - (meth[i].first / (meth[i].first + meth[i].second)); } else if (in_domain) { in_domain = false; @@ -138,12 +140,9 @@ get_domain_scores(const vector &state_ids, scores.push_back(score); } - static void -build_domains(const vector &cpgs, - const vector &reset_points, - const vector &state_ids, - vector &domains) { +build_domains(const vector &cpgs, const vector &reset_points, + const vector &state_ids, vector &domains) { size_t n_cpgs = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; @@ -157,7 +156,7 @@ build_domains(const vector &cpgs, } ++reset_idx; } - if (state_ids[i]) { // currently in an hmr + if (state_ids[i]) { // currently in an hmr if (!in_domain) { in_domain = true; domains.push_back(as_gen_rgn(cpgs[i])); @@ -179,13 +178,10 @@ build_domains(const vector &cpgs, } } - template static void -separate_regions(const bool verbose, - const size_t desert_size, - vector &cpgs, - vector &meth, vector &reads, +separate_regions(const bool verbose, const size_t desert_size, + vector &cpgs, vector &meth, vector &reads, vector &reset_points) { if (verbose) cerr << "[separating by cpg desert]" << endl; @@ -207,14 +203,15 @@ separate_regions(const bool verbose, // segregate cpgs size_t prev_pos = 0; for (size_t i = 0; i < cpgs.size(); ++i) { - const size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) ? - cpgs[i].pos - prev_pos : numeric_limits::max(); + const size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) + ? cpgs[i].pos - prev_pos + : numeric_limits::max(); if (dist > desert_size) { reset_points.push_back(i); if (dist < numeric_limits::max()) bases_in_deserts += dist; } - if (dist::max()) + if (dist < numeric_limits::max()) total_bases += dist; prev_pos = cpgs[i].pos; @@ -225,33 +222,29 @@ separate_regions(const bool verbose, cerr << "[cpgs retained: " << cpgs.size() << "]" << endl << "[deserts removed: " << reset_points.size() - 2 << "]" << endl << "[genome fraction covered: " - << 1.0 - (bases_in_deserts/total_bases) << "]" << endl; + << 1.0 - (bases_in_deserts / total_bases) << "]" << endl; } - /* function to "fold" the methylation profile so that the middle * methylation becomes lower methylation, and both the low and high * methylation become high. this method actually seems to work. */ static void make_partial_meth(const vector &reads, - vector > &meth) { + vector> &meth) { for (size_t i = 0; i < reads.size(); ++i) { - double m = meth[i].first/reads[i]; - m = (m <= 0.5) ? (1.0 - 2*m) : (1.0 - 2*(1.0 - m)); - meth[i].first = reads[i]*m; + double m = meth[i].first / reads[i]; + m = (m <= 0.5) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m)); + meth[i].first = reads[i] * m; meth[i].second = (reads[i] - meth[i].first); } } static void -shuffle_cpgs(const size_t rng_seed, - const TwoStateHMM &hmm, - vector > meth, - vector reset_points, - const double p_fb, const double p_bf, - const double fg_alpha, const double fg_beta, - const double bg_alpha, const double bg_beta, +shuffle_cpgs(const size_t rng_seed, const TwoStateHMM &hmm, + vector> meth, vector reset_points, + const double p_fb, const double p_bf, const double fg_alpha, + const double fg_beta, const double bg_alpha, const double bg_beta, vector &domain_scores) { auto eng = std::default_random_engine(rng_seed); @@ -259,9 +252,8 @@ shuffle_cpgs(const size_t rng_seed, vector state_ids; vector scores; - hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, - fg_alpha, fg_beta, bg_alpha, - bg_beta, state_ids, scores); + hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta, + bg_alpha, bg_beta, state_ids, scores); get_domain_scores(state_ids, meth, reset_points, domain_scores); sort(begin(domain_scores), end(domain_scores)); } @@ -272,30 +264,23 @@ assign_p_values(const vector &random_scores, vector &p_values) { const double n_randoms = random_scores.empty() ? 1 : random_scores.size(); for (size_t i = 0; i < observed_scores.size(); ++i) - p_values.push_back((end(random_scores) - - upper_bound(begin(random_scores), - end(random_scores), - observed_scores[i]))/n_randoms); + p_values.push_back((end(random_scores) - upper_bound(begin(random_scores), + end(random_scores), + observed_scores[i])) / + n_randoms); } static void -read_params_file(const bool verbose, - const string ¶ms_file, - double &fg_alpha, double &fg_beta, - double &bg_alpha, double &bg_beta, - double &p_fb, double &p_bf, +read_params_file(const bool verbose, const string ¶ms_file, + double &fg_alpha, double &fg_beta, double &bg_alpha, + double &bg_beta, double &p_fb, double &p_bf, double &domain_score_cutoff) { string jnk; std::ifstream in(params_file); if (!in) throw runtime_error("failed to parse params file: " + params_file); - in >> jnk >> fg_alpha - >> jnk >> fg_beta - >> jnk >> bg_alpha - >> jnk >> bg_beta - >> jnk >> p_fb - >> jnk >> p_bf - >> jnk >> domain_score_cutoff; + in >> jnk >> fg_alpha >> jnk >> fg_beta >> jnk >> bg_alpha >> jnk >> + bg_beta >> jnk >> p_fb >> jnk >> p_bf >> jnk >> domain_score_cutoff; if (verbose) cerr << "FG_ALPHA\t" << fg_alpha << endl << "FG_BETA\t" << fg_beta << endl @@ -307,17 +292,14 @@ read_params_file(const bool verbose, } static void -write_params_file(const string &outfile, - const double fg_alpha, - const double fg_beta, - const double bg_alpha, - const double bg_beta, - const double p_fb, - const double p_bf, +write_params_file(const string &outfile, const double fg_alpha, + const double fg_beta, const double bg_alpha, + const double bg_beta, const double p_fb, const double p_bf, const double domain_score_cutoff) { std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); out.precision(30); @@ -330,13 +312,25 @@ write_params_file(const string &outfile, << "DOMAIN_SCORE_CUTOFF\t" << domain_score_cutoff << endl; } +[[nodiscard]] +static inline double +get_n_meth(const MSite &s) { + return s.meth * s.n_reads; +} + +[[nodiscard]] +static inline double +get_n_unmeth(const MSite &s) { + return (1.0 - s.meth) * s.n_reads; +} + static void load_cpgs(const string &cpgs_file, vector &cpgs, - vector > &meth, - vector &reads) { + vector> &meth, vector &reads) { bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("failed opening file: " + cpgs_file); + if (!in) + throw runtime_error("failed opening file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); @@ -347,22 +341,25 @@ load_cpgs(const string &cpgs_file, vector &cpgs, throw runtime_error("error: input is not symmetric-CpGs: " + cpgs_file); cpgs.push_back(the_site); reads.push_back(the_site.n_reads); - meth.push_back(make_pair(the_site.n_meth(), the_site.n_unmeth())); + meth.push_back(make_pair(get_n_meth(the_site), get_n_unmeth(the_site))); prev_site = the_site; } } -template static auto +template +static auto get_mean(InputIterator first, InputIterator last) -> T { return accumulate(first, last, static_cast(0)) / std::distance(first, last); } -template static void +template +static void check_sorted_within_chroms(T first, const T last) { // empty, or a single element - if (first == last || first + 1 == last) return; + if (first == last || first + 1 == last) + return; unordered_set chroms_seen; string cur_chrom = ""; @@ -370,25 +367,19 @@ check_sorted_within_chroms(T first, const T last) { for (auto fast = first + 1; fast != last; ++fast, ++first) { if (fast->chrom != cur_chrom) { if (chroms_seen.find(fast->chrom) != end(chroms_seen)) { - throw runtime_error( - "input not grouped by chromosomes. " - "Error in the following line:\n" + - fast->tostring() - ); - + throw runtime_error("input not grouped by chromosomes. " + "Error in the following line:\n" + + fast->tostring()); } cur_chrom = fast->chrom; chroms_seen.insert(cur_chrom); // don't check ordering if chroms have changed } - else { // first and fast are in the same chrom + else { // first and fast are in the same chrom if (first->pos >= fast->pos) { - throw runtime_error( - "input file not sorted properly. " - "Error in the following lines:\n" + - first->tostring() + "\n" + - fast->tostring() - ); + throw runtime_error("input file not sorted properly. " + "Error in the following lines:\n" + + first->tostring() + "\n" + fast->tostring()); } } } @@ -412,6 +403,7 @@ main_hmr(int argc, const char **argv) { // run mode flags bool verbose = false; bool PARTIAL_METH = false; + bool allow_extra_fields = false; // corrections for small values const double tolerance = 1e-10; @@ -430,29 +422,38 @@ main_hmr(int argc, const char **argv) { file."; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), - description, ""); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", - false, outfile); - opt_parse.add_opt("desert", 'd', "max dist btwn covered cpgs in HMR", - false, desert_size); + OptionParser opt_parse(strip_path(argv[0]), description, + ""); + opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, + outfile); + opt_parse.add_opt("desert", 'd', "max dist btwn covered cpgs in HMR", false, + desert_size); opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); - opt_parse.add_opt("partial", '\0', "identify PMRs instead of HMRs", - false, PARTIAL_METH); - opt_parse.add_opt("post-hypo", '\0', "output file for single-CpG posterior " + opt_parse.add_opt("partial", '\0', "identify PMRs instead of HMRs", false, + PARTIAL_METH); + opt_parse.add_opt("post-hypo", '\0', + "output file for single-CpG posterior " "hypomethylation probability (default: none)", false, hypo_post_outfile); - opt_parse.add_opt("post-meth", '\0', "output file for single-CpG posteiror " + opt_parse.add_opt("post-meth", '\0', + "output file for single-CpG posteiror " "methylation probability (default: none)", false, meth_post_outfile); - opt_parse.add_opt("params-in", 'P', "HMM parameter file " - "(override training)", false, params_in_file); - opt_parse.add_opt("params-out", 'p', "write HMM parameters to this " - "file (default: none)", false, params_out_file); + opt_parse.add_opt("params-in", 'P', + "HMM parameter file " + "(override training)", + false, params_in_file); + opt_parse.add_opt("params-out", 'p', + "write HMM parameters to this " + "file (default: none)", + false, params_out_file); opt_parse.add_opt("seed", 's', "specify random seed", false, rng_seed); opt_parse.add_opt("summary", 'S', "write summary output here", false, summary_file); + opt_parse.add_opt("relaxed", '\0', + "input has extra fields (used for nanopore)", false, + allow_extra_fields); opt_parse.set_show_defaults(); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -476,12 +477,14 @@ main_hmr(int argc, const char **argv) { const string cpgs_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ + MSite::no_extra_fields = (allow_extra_fields == false); + if (!is_msite_file(cpgs_file)) throw runtime_error("malformed counts file: " + cpgs_file); // separate the regions by chrom and by desert vector cpgs; - vector > meth; + vector> meth; vector reads; if (verbose) cerr << "[reading methylation levels]" << endl; @@ -507,7 +510,8 @@ main_hmr(int argc, const char **argv) { << " min_coverage=" << min_coverage << endl; if (!summary_file.empty()) { std::ofstream summary_out(summary_file); - if (!summary_out) throw runtime_error("failed to open: " + summary_file); + if (!summary_out) + throw runtime_error("failed to open: " + summary_file); summary_out << hmr_summary({}).tostring() << endl; } return EXIT_SUCCESS; @@ -527,23 +531,22 @@ main_hmr(int argc, const char **argv) { double bg_alpha = 0, bg_beta = 0; double domain_score_cutoff = std::numeric_limits::max(); - if (!params_in_file.empty()) { // read parameters file - read_params_file(verbose, params_in_file, - fg_alpha, fg_beta, bg_alpha, bg_beta, - p_fb, p_bf, domain_score_cutoff); + if (!params_in_file.empty()) { // read parameters file + read_params_file(verbose, params_in_file, fg_alpha, fg_beta, bg_alpha, + bg_beta, p_fb, p_bf, domain_score_cutoff); max_iterations = 0; } else { const double n_reads = get_mean(begin(reads), end(reads)); - fg_alpha = 0.33*n_reads; - fg_beta = 0.67*n_reads; - bg_alpha = 0.67*n_reads; - bg_beta = 0.33*n_reads; + fg_alpha = 0.33 * n_reads; + fg_beta = 0.67 * n_reads; + bg_alpha = 0.67 * n_reads; + bg_beta = 0.33 * n_reads; } if (max_iterations > 0) - hmm.BaumWelchTraining(meth, reset_points, p_fb, p_bf, - fg_alpha, fg_beta, bg_alpha, bg_beta); + hmm.BaumWelchTraining(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta, + bg_alpha, bg_beta); // DECODE THE DOMAINS vector state_ids; vector posteriors; @@ -554,8 +557,8 @@ main_hmr(int argc, const char **argv) { get_domain_scores(state_ids, meth, reset_points, domain_scores); vector random_scores; - shuffle_cpgs(rng_seed, hmm, meth, reset_points, p_fb, p_bf, - fg_alpha, fg_beta, bg_alpha, bg_beta, random_scores); + shuffle_cpgs(rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha, + fg_beta, bg_alpha, bg_beta, random_scores); vector p_values; assign_p_values(random_scores, domain_scores, p_values); @@ -574,7 +577,8 @@ main_hmr(int argc, const char **argv) { build_domains(cpgs, reset_points, state_ids, domains); std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); size_t good_hmr_count = 0; @@ -586,7 +590,7 @@ main_hmr(int argc, const char **argv) { } domains.resize(good_hmr_count); - for (const auto &d: domains) + for (const auto &d : domains) out << d << '\n'; if (!hypo_post_outfile.empty()) { @@ -614,7 +618,8 @@ main_hmr(int argc, const char **argv) { } if (!summary_file.empty()) { std::ofstream summary_out(summary_file); - if (!summary_out) throw runtime_error("failed to open: " + summary_file); + if (!summary_out) + throw runtime_error("failed to open: " + summary_file); summary_out << hmr_summary(domains).tostring() << endl; } } diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index 0fe72620..33b7d1a6 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -17,14 +17,14 @@ * General Public License for more details. */ -#include -#include +#include // for [u]int[0-9]+_t #include #include #include #include +#include #include -#include // for [u]int[0-9]+_t +#include #include "OptionParser.hpp" // #include "GenomicRegion.hpp" @@ -33,23 +33,23 @@ MSite has a way to serialize into a char[] directly, then we should use it. */ // #include "MSite.hpp" -#include "bsutils.hpp" -#include "dnmt_error.hpp" #include "bam_record_utils.hpp" +#include "bsutils.hpp" #include "counts_header.hpp" +#include "dnmt_error.hpp" /* HTSlib */ #include -using std::string; -using std::vector; using std::cerr; using std::endl; -using std::unordered_set; +using std::string; using std::unordered_map; +using std::unordered_set; +using std::vector; -using bamxx::bam_rec; using bamxx::bam_header; +using bamxx::bam_rec; using bamxx::bgzf_file; struct quick_buf : public std::ostringstream, @@ -57,20 +57,21 @@ struct quick_buf : public std::ostringstream, // ADS: By user ecatmur on SO; very fast. Seems to work... quick_buf() { // ...but this seems to depend on data layout - static_cast&>(*this).rdbuf(this); + static_cast &>(*this).rdbuf(this); } - void clear() { + void + clear() { // reset buffer pointers (member functions) setp(pbase(), pbase()); } - char const* c_str() { + char const * + c_str() { /* between c_str and insertion make sure to clear() */ *pptr() = '\0'; return pbase(); } }; - // ADS: we should never have to worry about coverage over > 32767 in // any downstream analysis, so using "int16_t" here would allow to // detect wrap around and report it as some kind of weird thing, maybe @@ -78,14 +79,15 @@ struct quick_buf : public std::ostringstream, // >65535-fold coverage, we can make the change here. typedef uint16_t count_type; - static inline bool -eats_ref(const uint32_t c) {return bam_cigar_type(bam_cigar_op(c)) & 2;} - +eats_ref(const uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 2; +} static inline bool -eats_query(const uint32_t c) {return bam_cigar_type(bam_cigar_op(c)) & 1;} - +eats_query(const uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 1; +} /* The three functions below here should probably be moved into bsutils.hpp. I am not sure if the DDG function is needed, but it @@ -95,32 +97,23 @@ eats_query(const uint32_t c) {return bam_cigar_type(bam_cigar_op(c)) & 1;} plants. */ static bool is_chh(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && - is_cytosine(s[i]) && - !is_guanine(s[i + 1]) && - !is_guanine(s[i + 2]); + return (i < (s.length() - 2)) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && + !is_guanine(s[i + 2]); } - static bool is_ddg(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && - !is_cytosine(s[i]) && - !is_cytosine(s[i + 1]) && - is_guanine(s[i + 2]); + return (i < (s.length() - 2)) && !is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && is_guanine(s[i + 2]); } - static bool is_c_at_g(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && - is_cytosine(s[i]) && - !is_cytosine(s[i + 1]) && - !is_guanine(s[i + 1]) && - is_guanine(s[i + 2]); + return (i < (s.length() - 2)) && is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && !is_guanine(s[i + 1]) && + is_guanine(s[i + 2]); } - /* Right now the CountSet objects below are much larger than they need to be, for the things we are computing. However, it's not clear that the minimum information would really put the memory @@ -128,41 +121,69 @@ is_c_at_g(const std::string &s, size_t i) { all the information seems reasonable. */ struct CountSet { - string tostring() const { + string + tostring() const { std::ostringstream oss; - oss << pA << '\t' << pC << '\t' << pG << '\t' << pT << '\t' - << nA << '\t' << nC << '\t' << nG << '\t' << nT; + oss << pA << '\t' << pC << '\t' << pG << '\t' << pT << '\t' << nA << '\t' + << nC << '\t' << nG << '\t' << nT; // << '\t' << N; /* not used */ return oss.str(); } - void add_count_pos(const char x) { - if (x == 'T') ++pT; // conditions ordered for efficiency - else if (x == 'C') ++pC; - else if (x == 'G') ++pG; - else if (x == 'A') ++pA; + void + add_count_pos(const char x) { + if (x == 'T') + ++pT; // conditions ordered for efficiency + else if (x == 'C') + ++pC; + else if (x == 'G') + ++pG; + else if (x == 'A') + ++pA; // else ++N; /* not used */ } - void add_count_neg(const char x) { - if (x == 'T') ++nT; // conditions ordered for efficiency(??) - else if (x == 'C') ++nC; - else if (x == 'G') ++nG; - else if (x == 'A') ++nA; + void + add_count_neg(const char x) { + if (x == 'T') + ++nT; // conditions ordered for efficiency(??) + else if (x == 'C') + ++nC; + else if (x == 'G') + ++nG; + else if (x == 'A') + ++nA; // else ++N; /* not used */ } - count_type pos_total() const {return pA + pC + pG + pT;} - count_type neg_total() const {return nA + nC + nG + nT;} + count_type + pos_total() const { + return pA + pC + pG + pT; + } + count_type + neg_total() const { + return nA + nC + nG + nT; + } - count_type unconverted_cytosine() const {return pC;} - count_type converted_cytosine() const {return pT;} - count_type unconverted_guanine() const {return nC;} - count_type converted_guanine() const {return nT;} + count_type + unconverted_cytosine() const { + return pC; + } + count_type + converted_cytosine() const { + return pT; + } + count_type + unconverted_guanine() const { + return nC; + } + count_type + converted_guanine() const { + return nT; + } count_type pA{0}, pC{0}, pG{0}, pT{0}; count_type nA{0}, nC{0}, nG{0}, nT{0}; // count_type N; /* this wasn't used and breaks alignment */ }; - /* The "tag" returned by this function should be exclusive, so that * the order of checking conditions doesn't matter. There is also a * bit of a hack in that the unsigned "pos" could wrap, but this still @@ -172,18 +193,26 @@ struct CountSet { static uint32_t get_tag_from_genome(const string &s, const size_t pos) { if (is_cytosine(s[pos])) { - if (is_cpg(s, pos)) return 0; - else if (is_chh(s, pos)) return 1; - else if (is_c_at_g(s, pos)) return 2; - else return 3; + if (is_cpg(s, pos)) + return 0; + else if (is_chh(s, pos)) + return 1; + else if (is_c_at_g(s, pos)) + return 2; + else + return 3; } if (is_guanine(s[pos])) { - if (is_cpg(s, pos - 1)) return 0; - else if (is_ddg(s, pos - 2)) return 1; - else if (is_c_at_g(s, pos - 2)) return 2; - else return 3; + if (is_cpg(s, pos - 1)) + return 0; + else if (is_ddg(s, pos - 2)) + return 1; + else if (is_c_at_g(s, pos - 2)) + return 2; + else + return 3; } - return 4; // shouldn't be used for anything + return 4; // shouldn't be used for anything } /* This "has_mutated" function looks on the opposite strand to see @@ -199,40 +228,39 @@ has_mutated(const char base, const CountSet &cs) { } static const char *tag_values[] = { - "CpG", // 0 - "CHH", // 1 - "CXG", // 2 - "CCG", // 3 - "N", // 4 - "CpGx",// 5 <---- MUT_OFFSET - "CHHx",// 6 - "CXGx",// 7 - "CCGx",// 8 - "Nx" // 9 + "CpG", // 0 + "CHH", // 1 + "CXG", // 2 + "CCG", // 3 + "N", // 4 + "CpGx", // 5 <---- MUT_OFFSET + "CHHx", // 6 + "CXGx", // 7 + "CCGx", // 8 + "Nx" // 9 }; static const uint32_t MUT_OFFSET = 5; - static inline uint32_t tag_with_mut(const uint32_t tag, const bool mut) { return tag + (mut ? MUT_OFFSET : 0); } - template static void -write_output(const bam_header &hdr, bgzf_file &out, - const int32_t tid, const string &chrom, - const vector &counts, bool CPG_ONLY) { +write_output(const bam_header &hdr, bgzf_file &out, const int32_t tid, + const string &chrom, const vector &counts, + bool CPG_ONLY) { - quick_buf buf; // keep underlying buffer space? + quick_buf buf; // keep underlying buffer space? for (size_t i = 0; i < counts.size(); ++i) { const char base = chrom[i]; if (is_cytosine(base) || is_guanine(base)) { const uint32_t the_tag = get_tag_from_genome(chrom, i); - if (CPG_ONLY && the_tag != 0) continue; + if (CPG_ONLY && the_tag != 0) + continue; const bool is_c = is_cytosine(base); const double unconverted = is_c ? counts[i].unconverted_cytosine() @@ -241,7 +269,8 @@ write_output(const bam_header &hdr, bgzf_file &out, is_c ? counts[i].converted_cytosine() : counts[i].converted_guanine(); const uint32_t n_reads = unconverted + converted; - if (require_covered && n_reads == 0) continue; + if (require_covered && n_reads == 0) + continue; const bool mut = has_mutated(base, counts[i]); buf.clear(); @@ -257,7 +286,6 @@ write_output(const bam_header &hdr, bgzf_file &out, } } - static void count_states_pos(const bam_rec &aln, vector &counts) { /* Move through cigar, reference and read positions without @@ -266,7 +294,7 @@ count_states_pos(const bam_rec &aln, vector &counts) { const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); auto rpos = get_pos(aln); - auto qpos = 0; // to match type with b->core.l_qseq + auto qpos = 0; // to match type with b->core.l_qseq for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { const char op = bam_cigar_op(*c_itr); const uint32_t n = bam_cigar_oplen(*c_itr); @@ -293,7 +321,6 @@ count_states_pos(const bam_rec &aln, vector &counts) { assert(qpos == get_l_qseq(aln)); } - static void count_states_neg(const bam_rec &aln, vector &counts) { /* Move through cigar, reference and (*backward*) through read @@ -302,14 +329,14 @@ count_states_neg(const bam_rec &aln, vector &counts) { const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); size_t rpos = get_pos(aln); - size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { const char op = bam_cigar_op(*c_itr); const uint32_t n = bam_cigar_oplen(*c_itr); if (eats_ref(op) && eats_query(op)) { - const size_t end_qpos = qpos - n; // to match type with qpos - for (; qpos > end_qpos; --qpos) // beware ++ in macro below!!! - counts[rpos++].add_count_neg(seq_nt16_str[bam_seqi(seq, qpos-1)]); + const size_t end_qpos = qpos - n; // to match type with qpos + for (; qpos > end_qpos; --qpos) // beware ++ in macro below!!! + counts[rpos++].add_count_neg(seq_nt16_str[bam_seqi(seq, qpos - 1)]); } else if (eats_query(op)) { qpos -= n; @@ -323,7 +350,6 @@ count_states_neg(const bam_rec &aln, vector &counts) { assert(qpos == 0); } - static unordered_map get_tid_to_idx(const bam_header &hdr, const unordered_map name_to_idx) { @@ -340,15 +366,14 @@ get_tid_to_idx(const bam_header &hdr, return tid_to_idx; } - template static void output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid, const unordered_map &tid_to_idx, const bam_header &hdr, const vector::const_iterator chroms_beg, - const vector &chrom_sizes, - vector &counts, bgzf_file &out) { + const vector &chrom_sizes, vector &counts, + bgzf_file &out) { // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); @@ -364,27 +389,29 @@ output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid, write_output(hdr, out, tid, *chrom_itr, counts, CPG_ONLY); } - static bool consistent_targets(const bam_header &hdr, const unordered_map &tid_to_idx, const vector &names, const vector &sizes) { const size_t n_targets = hdr.h->n_targets; - if (n_targets != names.size()) return false; + if (n_targets != names.size()) + return false; for (size_t tid = 0; tid < n_targets; ++tid) { const string tid_name_sam = sam_hdr_tid2name(hdr, tid); const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); const auto idx_itr = tid_to_idx.find(tid); - if (idx_itr == cend(tid_to_idx)) return false; + if (idx_itr == cend(tid_to_idx)) + return false; const auto idx = idx_itr->second; - if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) return false; + if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) + return false; } return true; } - -template static void +template +static void process_reads(const bool VERBOSE, const bool show_progress, const bool compress_output, const bool include_header, const size_t n_threads, const string &infile, @@ -411,10 +438,12 @@ process_reads(const bool VERBOSE, const bool show_progress, // open the hts SAM/BAM input file and get the header bamxx::bam_in hts(infile); - if (!hts) throw dnmt_error("failed to open input file"); + if (!hts) + throw dnmt_error("failed to open input file"); // load the input file's header bam_header hdr(hts); - if (!hdr) throw dnmt_error("failed to read header"); + if (!hdr) + throw dnmt_error("failed to read header"); unordered_map tid_to_idx = get_tid_to_idx(hdr, name_to_idx); @@ -424,7 +453,8 @@ process_reads(const bool VERBOSE, const bool show_progress, // open the output file const string output_mode = compress_output ? "w" : "wu"; bgzf_file out(outfile, output_mode); - if (!out) throw dnmt_error("error opening output file: " + outfile); + if (!out) + throw dnmt_error("error opening output file: " + outfile); // set the threads for the input file decompression if (n_threads > 1) { @@ -449,25 +479,19 @@ process_reads(const bool VERBOSE, const bool show_progress, const int32_t tid = get_tid(aln); if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped continue; - if (tid == prev_tid) { - // do the work for this mapped read, depending on strand - if (bam_is_rev(aln)) - count_states_neg(aln, counts); - else - count_states_pos(aln, counts); - } - else { // chrom has changed, so output results and get the next chrom + if (tid != prev_tid) { + // chrom has changed, so output results and get the next chrom // write output if there is any; counts is empty only for first chrom if (!counts.empty()) - write_output(hdr, out, prev_tid, - *chrom_itr, counts, CPG_ONLY); + write_output(hdr, out, prev_tid, *chrom_itr, counts, + CPG_ONLY); // make sure reads are sorted chrom tid number in header if (tid < prev_tid) { const std::string message = "SAM file is not sorted " - "previous tid: " + - std::to_string(prev_tid) + - " current tid: " + std::to_string(tid); + "previous tid: " + + std::to_string(prev_tid) + + " current tid: " + std::to_string(tid); throw dnmt_error(message); } @@ -491,16 +515,22 @@ process_reads(const bool VERBOSE, const bool show_progress, counts.clear(); counts.resize(chrom_sizes[chrom_idx->second]); } + // do the work for this mapped read, depending on strand + if (bam_is_rev(aln)) + count_states_neg(aln, counts); + else + count_states_pos(aln, counts); } if (!counts.empty()) - write_output(hdr, out, prev_tid, *chrom_itr, counts, CPG_ONLY); + write_output(hdr, out, prev_tid, *chrom_itr, counts, + CPG_ONLY); // ADS: if some chroms might not be covered by reads, we have to // iterate over what remains if (!require_covered) for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) - output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, - chroms_beg, chrom_sizes, counts, out); + output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, + chrom_sizes, counts, out); } int @@ -524,12 +554,12 @@ main_counts(int argc, const char **argv) { "get methylation levels from " "mapped bisulfite sequencing reads", "-c "); - opt_parse.add_opt("threads", 't', "threads to use (few needed)", - false, n_threads); + opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, + n_threads); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", false, outfile); opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", - true , chroms_file); + true, chroms_file); opt_parse.add_opt("cpg-only", 'n', "print only CpG context cytosines", false, CPG_ONLY); opt_parse.add_opt("require-covered", 'r', "only output covered sites", @@ -558,7 +588,7 @@ main_counts(int argc, const char **argv) { throw dnmt_error("thread count cannot be negative"); std::ostringstream cmd; - copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); + copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); // file types from HTSlib use "-" for the filename to go to stdout if (outfile.empty()) @@ -567,8 +597,8 @@ main_counts(int argc, const char **argv) { if (VERBOSE) cerr << "[input BAM/SAM file: " << mapped_reads_file << "]" << endl << "[output file: " << outfile << "]" << endl - << "[output format: " - << (compress_output ? "bgzf" : "text") << "]" << endl + << "[output format: " << (compress_output ? "bgzf" : "text") << "]" + << endl << "[genome file: " << chroms_file << "]" << endl << "[threads requested: " << n_threads << "]" << endl << "[CpG only mode: " << (CPG_ONLY ? "yes" : "no") << "]" << endl diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 5e316ee8..b2721c55 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -16,20 +16,25 @@ * more details. */ +#include #include -#include // for [u]int[0-9]+_t +#include +#include +#include +#include #include #include +#include +#include #include +#include #include #include #include -#include #include #include "OptionParser.hpp" #include "bam_record_utils.hpp" -#include "bsutils.hpp" #include "counts_header.hpp" #include "dnmt_error.hpp" @@ -40,6 +45,56 @@ using bamxx::bam_header; using bamxx::bam_rec; using bamxx::bgzf_file; +[[nodiscard]] inline bool +is_cytosine(const char c) { + return c == 'c' || c == 'C'; +} + +[[nodiscard]] inline bool +is_guanine(const char c) { + return c == 'g' || c == 'G'; +} + +[[nodiscard]] inline bool +is_thymine(const char c) { + return c == 't' || c == 'T'; +} + +[[nodiscard]] inline bool +is_adenine(const char c) { + return c == 'a' || c == 'A'; +} + +[[nodiscard]] inline bool +is_cpg(const std::string &s, const std::size_t i) { + return i + 1 < std::size(s) && is_cytosine(s[i]) && is_guanine(s[i + 1]); +} + +static void +read_fasta_file(const std::string &filename, std::vector &names, + std::vector &sequences) { + bgzf_file in(filename, "r"); + if (!in) + throw std::runtime_error("error opening genome file: " + filename); + names.clear(); + sequences.clear(); + + std::string line; + while (getline(in, line)) { + if (line[0] == '>') { + const auto first_space = line.find_first_of(" \t", 1); + if (first_space == std::string::npos) + names.push_back(line.substr(1)); + else + names.emplace_back(std::cbegin(line) + 1, + std::cbegin(line) + line.find_first_of(" \t", 1)); + sequences.emplace_back(); + } + else + sequences.back() += line; + } +} + [[nodiscard]] static std::string get_basecall_model(const bam_header &hdr) { kstring_t ks{}; @@ -120,50 +175,51 @@ is_c_at_g(const std::string &s, const std::size_t i) { requirement of the program into a more reasonable range, so keeping all the information seems reasonable. */ struct CountSet { - std::string + static constexpr auto max_prob_repr = 256.0; + [[nodiscard]] std::string tostring() const { // clang-format off std::ostringstream oss; - oss << hydroxy_pos << '\t' << hydroxy_neg << '\t' - << methyl_pos << '\t' << methyl_neg << '\t' - << n_reads_pos << '\t' << n_reads_neg << '\n'; + oss << hydroxy_fwd << '\t' << hydroxy_rev << '\t' + << methyl_fwd << '\t' << methyl_rev << '\t' + << n_reads_fwd << '\t' << n_reads_rev << '\n'; // clang-format on return oss.str(); } void - add_count_pos(const std::uint8_t h, const std::uint8_t m) { - hydroxy_pos += h; - methyl_pos += m; - ++n_reads_pos; + add_count_fwd(const std::uint8_t h, const std::uint8_t m) { + hydroxy_fwd += h; + methyl_fwd += m; + ++n_reads_fwd; } void - add_count_neg(const std::uint8_t h, const std::uint8_t m) { - hydroxy_neg += h; - methyl_neg += m; - ++n_reads_neg; + add_count_rev(const std::uint8_t h, const std::uint8_t m) { + hydroxy_rev += h; + methyl_rev += m; + ++n_reads_rev; } - double + [[nodiscard]] double get_hydroxy(const bool is_c) const { - return is_c ? hydroxy_pos : hydroxy_neg; + return (is_c ? hydroxy_fwd : hydroxy_rev) / max_prob_repr; } - double + [[nodiscard]] double get_methyl(const bool is_c) const { - return is_c ? methyl_pos : methyl_neg; + return (is_c ? methyl_fwd : methyl_rev) / max_prob_repr; } - double + [[nodiscard]] double get_mods(const bool is_c) const { return get_hydroxy(is_c) + get_methyl(is_c); } - double + [[nodiscard]] double get_n_reads(const bool is_c) const { - return is_c ? n_reads_pos : n_reads_neg; - } - count_type hydroxy_pos{0}; - count_type hydroxy_neg{0}; - count_type methyl_pos{0}; - count_type methyl_neg{0}; - count_type n_reads_pos{0}; - count_type n_reads_neg{0}; + return is_c ? n_reads_fwd : n_reads_rev; + } + count_type hydroxy_fwd{0}; + count_type hydroxy_rev{0}; + count_type methyl_fwd{0}; + count_type methyl_rev{0}; + count_type n_reads_fwd{0}; + count_type n_reads_rev{0}; }; /* The "tag" returned by this function should be exclusive, so that @@ -210,6 +266,8 @@ template get_hydroxy_sites(const T mod_pos_beg, const T mod_pos_end) { const char hydroxy_tag[] = "C+h?"; const auto hydroxy_tag_size = 4; + if (mod_pos_beg == mod_pos_end) + return {}; auto hydroxy_beg = strstr(mod_pos_beg, hydroxy_tag); if (hydroxy_beg == mod_pos_end) return std::tuple{}; @@ -225,6 +283,8 @@ template get_methyl_sites(const T mod_pos_beg, const T mod_pos_end) { const char methyl_tag[] = "C+h?"; const auto methyl_tag_size = 4; + if (mod_pos_beg == mod_pos_end) + return {}; auto methyl_beg = strstr(mod_pos_beg, methyl_tag); if (methyl_beg == mod_pos_end) return std::tuple{}; @@ -245,11 +305,35 @@ get_modification_positions(const bamxx::bam_rec &aln) { return {mod_pos_beg, mod_pos_end}; } +struct prob_counter { + std::array meth_hist{}; + std::array hydro_hist{}; + std::string + json() const { + std::ostringstream oss; + oss << R"({"methyl_hist":[)"; + for (auto i = 0; i < 256; ++i) { + if (i > 0) + oss << ','; + oss << R"(")" << meth_hist[i] << R"(")"; + } + oss << R"(],"hydroxy_hist":[)"; + for (auto i = 0; i < 256; ++i) { + if (i > 0) + oss << ','; + oss << R"(")" << hydro_hist[i] << R"(")"; + } + oss << R"(]})"; + return oss.str(); + } +}; + struct mod_prob_buffer { static constexpr auto init_capacity{128 * 1024}; std::vector hydroxy_probs; std::vector methyl_probs; + prob_counter pc; mod_prob_buffer() { methyl_probs.reserve(init_capacity); @@ -274,8 +358,6 @@ struct mod_prob_buffer { auto [hydroxy_beg, hydroxy_end] = get_hydroxy_sites(mod_pos_beg, mod_pos_end); auto [methyl_beg, methyl_end] = get_methyl_sites(mod_pos_beg, mod_pos_end); - - // check if any are false if (mod_pos_beg == nullptr || hydroxy_beg == nullptr || methyl_beg == nullptr) return false; @@ -313,8 +395,13 @@ struct mod_prob_buffer { // assume that when delta hits 0 we have a CpG site if (delta != 0) return false; - methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); - hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + const std::uint8_t m_val = bam_auxB2i(mod_prob, methyl_prob_idx++); + methyl_probs[i] = m_val; + pc.meth_hist[m_val]++; + + const std::uint8_t h_val = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + hydroxy_probs[i] = h_val; + pc.hydro_hist[h_val]++; } --delta; if (delta < 0) @@ -330,8 +417,13 @@ struct mod_prob_buffer { // assume that when delta hits 0 we have a CpG site if (delta != 0) return false; - methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); - hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + const std::uint8_t m_val = bam_auxB2i(mod_prob, methyl_prob_idx++); + methyl_probs[i] = m_val; + pc.meth_hist[m_val]++; + + const std::uint8_t h_val = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + hydroxy_probs[i] = h_val; + pc.hydro_hist[h_val]++; } --delta; if (delta < 0) @@ -378,7 +470,7 @@ struct match_counter { container neg{}; void - add_pos(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + add_fwd(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, const std::uint8_t q2) { const auto r1e = encoding[r1]; const auto r2e = encoding[r2]; @@ -390,7 +482,7 @@ struct match_counter { } void - add_neg(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + add_rev(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, const std::uint8_t q2) { const auto r1e = encoding[r1]; const auto r2e = encoding[r2]; @@ -491,7 +583,7 @@ struct match_counter { }; static void -count_states_pos(const bam_rec &aln, std::vector &counts, +count_states_fwd(const bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and read positions without @@ -520,10 +612,10 @@ count_states_pos(const bam_rec &aln, std::vector &counts, const decltype(qpos) end_qpos = qpos + n; for (; qpos < end_qpos; ++qpos) { const auto r_nuc = *ref_itr++; - mc.add_pos(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, + mc.add_fwd(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, seq_nt16_str[bam_seqi(seq, qpos)], qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); - counts[rpos++].add_count_pos(*hydroxy_prob_itr, *methyl_prob_itr); + counts[rpos++].add_count_fwd(*hydroxy_prob_itr, *methyl_prob_itr); ++methyl_prob_itr; ++hydroxy_prob_itr; } @@ -546,7 +638,7 @@ count_states_pos(const bam_rec &aln, std::vector &counts, } static void -count_states_neg(const bam_rec &aln, std::vector &counts, +count_states_rev(const bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and (*backward*) through read @@ -580,11 +672,11 @@ count_states_neg(const bam_rec &aln, std::vector &counts, const auto r_nuc = *ref_itr++; const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; ++q_idx; - mc.add_neg(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, + mc.add_rev(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); --methyl_prob_itr; --hydroxy_prob_itr; - counts[rpos++].add_count_neg(*hydroxy_prob_itr, *methyl_prob_itr); + counts[rpos++].add_count_rev(*hydroxy_prob_itr, *methyl_prob_itr); } } else if (eats_query(op)) { @@ -604,31 +696,34 @@ count_states_neg(const bam_rec &aln, std::vector &counts, assert(qpos == 0); } -[[nodiscard]] static std::unordered_map +[[nodiscard]] static std::tuple, + std::set> get_tid_to_idx( const bam_header &hdr, const std::unordered_map &name_to_idx) { - std::unordered_map tid_to_idx; + std::set missing_tids; + std::map tid_to_idx; for (std::int32_t i = 0; i < hdr.h->n_targets; ++i) { // "curr_name" gives a "tid_to_name" mapping allowing to jump // through "name_to_idx" and get "tid_to_idx" const std::string curr_name(hdr.h->target_name[i]); const auto name_itr(name_to_idx.find(curr_name)); - if (name_itr == end(name_to_idx)) - throw std::runtime_error("failed to find chrom: " + curr_name); - tid_to_idx[i] = name_itr->second; + if (name_itr == std::cend(name_to_idx)) + missing_tids.insert(i); + else + tid_to_idx[i] = name_itr->second; } - return tid_to_idx; + return std::tuple, + std::set>{tid_to_idx, missing_tids}; } [[nodiscard]] static bool -consistent_targets( - const bam_header &hdr, - const std::unordered_map &tid_to_idx, - const std::vector &names, - const std::vector &sizes) { +consistent_targets(const bam_header &hdr, + const std::map &tid_to_idx, + const std::vector &names, + const std::vector &sizes) { const std::size_t n_targets = hdr.h->n_targets; - if (n_targets != names.size()) + if (n_targets != std::size(names)) return false; for (std::size_t tid = 0; tid < n_targets; ++tid) { @@ -644,6 +739,26 @@ consistent_targets( return true; } +[[nodiscard]] static bool +consistent_existing_targets( + const bam_header &hdr, const std::map &tid_to_idx, + const std::vector &names, + const std::vector &sizes) { + const std::size_t n_targets = hdr.h->n_targets; + for (std::size_t tid = 0; tid < n_targets; ++tid) { + const auto idx_itr = tid_to_idx.find(tid); + if (idx_itr == std::cend(tid_to_idx)) + continue; + const std::string tid_name_sam = sam_hdr_tid2name(hdr, tid); + const std::size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); + const auto idx = idx_itr->second; + if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) + return false; + } + return true; +} + +int counter{}; struct read_processor { static constexpr auto default_expected_basecall_model = "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"; @@ -653,6 +768,7 @@ struct read_processor { bool compress_output{}; bool include_header{}; bool require_covered{}; + bool symmetric{}; bool cpg_only{}; bool force{}; std::uint32_t n_threads{1}; @@ -668,6 +784,7 @@ struct read_processor { << "[compress_output: " << compress_output << "]\n" << "[include_header: " << include_header << "]\n" << "[require_covered: " << require_covered << "]\n" + << "[symmetric: " << symmetric << "]\n" << "[cpg_only: " << cpg_only << "]\n" << "[force: " << force << "]\n" << "[n_threads: " << n_threads << "]\n" @@ -687,7 +804,7 @@ struct read_processor { void output_skipped_chromosome( const std::int32_t tid, - const std::unordered_map &tid_to_idx, + const std::map &tid_to_idx, const bam_header &hdr, const std::vector::const_iterator chroms_beg, const std::vector &chrom_sizes, std::vector &counts, @@ -695,9 +812,13 @@ struct read_processor { // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); - if (chrom_idx == std::cend(tid_to_idx)) - throw std::runtime_error("chrom not found: " + - sam_hdr_tid2name(hdr, tid)); + if (chrom_idx == std::cend(tid_to_idx)) { + if (force) + return; + else + throw std::runtime_error("chrom not found: " + + sam_hdr_tid2name(hdr, tid)); + } const auto chrom_itr = chroms_beg + chrom_idx->second; @@ -710,9 +831,9 @@ struct read_processor { } void - write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, - const std::string &chrom, - const std::vector &counts) const { + write_output_all(const bam_header &hdr, bgzf_file &out, + const std::int32_t tid, const std::string &chrom, + const std::vector &counts) const { static constexpr auto out_fmt = "%ld\t%c\t%s\t%.6g\t%d\t%.6g\t%.6g\n"; static constexpr auto buf_size = 1024; char buffer[buf_size]; @@ -728,33 +849,33 @@ struct read_processor { throw std::runtime_error("failed to write to output buffer"); auto buffer_after_chrom = buffer + chrom_name_offset; - for (std::size_t i = 0; i < std::size(chrom); ++i) { - const char base = chrom[i]; + for (auto chrom_posn = 0ul; chrom_posn < std::size(chrom); ++chrom_posn) { + const char base = chrom[chrom_posn]; if (is_cytosine(base) || is_guanine(base)) { - const std::uint32_t the_tag = get_tag_from_genome(chrom, i); + const std::uint32_t the_tag = get_tag_from_genome(chrom, chrom_posn); if (cpg_only && the_tag != 0) continue; const bool is_c = is_cytosine(base); - const std::uint32_t n_reads = counts[i].get_n_reads(is_c); + const std::uint32_t n_reads = counts[chrom_posn].get_n_reads(is_c); if (require_covered && n_reads == 0) continue; const double denom = std::max(n_reads, 1u); - const double hydroxy = counts[i].get_hydroxy(is_c) / 256.0 / denom; - const double methyl = counts[i].get_methyl(is_c) / 256.0 / denom; - const double mods = counts[i].get_mods(is_c) / 256.0 / denom; + const double hydroxy = counts[chrom_posn].get_hydroxy(is_c) / denom; + const double methyl = counts[chrom_posn].get_methyl(is_c) / denom; + const double mods = counts[chrom_posn].get_mods(is_c) / denom; // clang-format off - int r = std::sprintf(buffer_after_chrom, out_fmt, - i, - is_c ? '+' : '-', - tag_values[the_tag], - mods, - n_reads, - hydroxy, - methyl); + const int r = std::sprintf(buffer_after_chrom, out_fmt, + chrom_posn, + is_c ? '+' : '-', + tag_values[the_tag], + mods, + n_reads, + hydroxy, + methyl); // clang-format on if (r < 0) @@ -764,25 +885,101 @@ struct read_processor { } } - [[nodiscard]] match_counter + void + write_output_sym(const bam_header &hdr, bgzf_file &out, + const std::int32_t tid, const std::string &chrom, + const std::vector &counts) const { + static constexpr auto out_fmt = "%ld\t+\tCpG\t%.6g\t%d\t%.6g\t%.6g\n"; + static constexpr auto buf_size = 1024; + char buffer[buf_size]; + + // Put chrom name in buffer and then skip that part for each site because + // it doesn't change. + const auto chrom_name = sam_hdr_tid2name(hdr.h, tid); + if (chrom_name == nullptr) + throw std::runtime_error("failed to identify chrom for tid: " + + std::to_string(tid)); + const auto chrom_name_offset = std::sprintf(buffer, "%s\t", chrom_name); + if (chrom_name_offset < 0) + throw std::runtime_error("failed to write to output buffer"); + auto buffer_after_chrom = buffer + chrom_name_offset; + + bool prev_was_c{false}; + std::uint32_t n_reads_pos{}; + double hydroxy_pos{}; + double methyl_pos{}; + + for (auto chrom_posn = 0ul; chrom_posn < std::size(chrom); ++chrom_posn) { + const char base = chrom[chrom_posn]; + if (is_cytosine(base)) { + prev_was_c = true; + n_reads_pos = counts[chrom_posn].get_n_reads(true); + hydroxy_pos = counts[chrom_posn].get_hydroxy(true); + methyl_pos = counts[chrom_posn].get_methyl(true); + } + else { + if (is_guanine(base) && prev_was_c) { + const std::uint32_t n_reads_neg = + counts[chrom_posn].get_n_reads(false); + const double hydroxy_neg = counts[chrom_posn].get_hydroxy(false); + const double methyl_neg = counts[chrom_posn].get_methyl(false); + + const auto n_reads = n_reads_pos + n_reads_neg; + if (require_covered && n_reads == 0) + continue; + + const double denom = std::max(n_reads, 1u); + const auto hydroxy = (hydroxy_neg + hydroxy_pos) / denom; + const auto methyl = (methyl_neg + methyl_pos) / denom; + const auto mods = hydroxy + methyl; + + // clang-format off + const int r = std::sprintf(buffer_after_chrom, out_fmt, + chrom_posn - 1, // for previous position + mods, + n_reads, + hydroxy, + methyl); + // clang-format on + + if (r < 0) + throw std::runtime_error("failed to write to output buffer"); + out.write(buffer); + } + prev_was_c = false; + } + } + } + + void + write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, + const std::string &chrom, + const std::vector &counts) const { + if (symmetric) + write_output_sym(hdr, out, tid, chrom, counts); + else + write_output_all(hdr, out, tid, chrom, counts); + } + + [[nodiscard]] std::tuple operator()(const std::string &infile, const std::string &outfile, const std::string &chroms_file) const { // first get the chromosome names and sequences from the FASTA file std::vector chroms, names; - read_fasta_file_short_names(chroms_file, names, chroms); + read_fasta_file(chroms_file, names, chroms); for (auto &i : chroms) - transform(std::cbegin(i), std::cend(i), std::begin(i), - [](const char c) { return std::toupper(c); }); + std::transform(std::cbegin(i), std::cend(i), std::begin(i), + [](const char c) { return std::toupper(c); }); if (verbose) - std::cerr << "[n chroms in reference: " << chroms.size() << "]" + std::cerr << "[n chroms in reference: " << std::size(chroms) << "]" << std::endl; const auto chroms_beg = std::cbegin(chroms); std::unordered_map name_to_idx; - std::vector chrom_sizes(chroms.size(), 0); - for (std::size_t i = 0; i < chroms.size(); ++i) { + std::vector chrom_sizes(std::size(chroms), 0); + for (std::size_t i = 0; i < std::size(chroms); ++i) { name_to_idx[names[i]] = i; - chrom_sizes[i] = chroms[i].size(); + chrom_sizes[i] = std::size(chroms[i]); } bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts @@ -796,10 +993,20 @@ struct read_processor { if (!hdr) throw std::runtime_error("failed to read header"); - std::unordered_map tid_to_idx = - get_tid_to_idx(hdr, name_to_idx); + auto [tid_to_idx, missing_tids] = get_tid_to_idx(hdr, name_to_idx); + if (verbose) + std::cerr << "[targets found: " << std::size(tid_to_idx) << "]\n" + << "[missing targets: " << std::size(missing_tids) << "]\n"; + + if (!force && !missing_tids.empty()) + throw std::runtime_error( + "missing targets in reference and force not specified"); + + if (!force && !consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + throw std::runtime_error("inconsistent reference genome information"); - if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + if (force && + !consistent_existing_targets(hdr, tid_to_idx, names, chrom_sizes)) throw std::runtime_error("inconsistent reference genome information"); // open the output file @@ -826,7 +1033,7 @@ struct read_processor { << "observed=" << (basecall_model.empty() ? "NA" : basecall_model) << "\n" << "expected=" << expected_basecall_model_str() << std::endl; - return {}; + return {{}, {}}; } if (include_header) @@ -845,23 +1052,16 @@ struct read_processor { match_counter mc; mod_prob_buffer mod_buf; + bool current_target_present = false; + while (hts.read(hdr, aln)) { const std::int32_t tid = get_tid(aln); if (get_l_qseq(aln) == 0) continue; - if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped continue; - if (tid == prev_tid) { - const bool is_rev = bam_is_rev(aln); - if (is_rev && strand != 1) - count_states_neg(aln, counts, mod_buf, *chrom_itr, mc); - if (!is_rev && strand != 2) - count_states_pos(aln, counts, mod_buf, *chrom_itr, mc); - } - else { // chrom has changed, so output results and get the next chrom - - // write output if there is any; counts is empty only for first chrom + if (tid != prev_tid) { // chrom changed, output results, get next chrom + // write output if any; counts is empty on first and missing chroms if (!counts.empty()) write_output(hdr, out, prev_tid, *chrom_itr, counts); // make sure reads are sorted chrom tid number in header @@ -879,31 +1079,48 @@ struct read_processor { chrom_sizes, counts, out); // get the next chrom to process - auto chrom_idx(tid_to_idx.find(tid)); - if (chrom_idx == end(tid_to_idx)) + const auto chrom_idx = tid_to_idx.find(tid); + current_target_present = (chrom_idx != std::cend(tid_to_idx)); + if (!force && !current_target_present) throw std::runtime_error("chromosome not found: " + std::string(sam_hdr_tid2name(hdr, tid))); - if (show_progress) - std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << std::endl; - prev_tid = tid; - chrom_itr = chroms_beg + chrom_idx->second; + if (show_progress && current_target_present) + std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << std::endl; // reset the counts counts.clear(); - counts.resize(chrom_sizes[chrom_idx->second]); + if (current_target_present) { + // set size of counts if current target is present + chrom_itr = chroms_beg + chrom_idx->second; + counts.resize(chrom_sizes[chrom_idx->second]); + + const bool is_rev = bam_is_rev(aln); + if (is_rev && strand != 1) + count_states_rev(aln, counts, mod_buf, *chrom_itr, mc); + if (!is_rev && strand != 2) + count_states_fwd(aln, counts, mod_buf, *chrom_itr, mc); + } + prev_tid = tid; + } + if (current_target_present) { + const bool is_rev = bam_is_rev(aln); + if (is_rev && strand != 1) + count_states_rev(aln, counts, mod_buf, *chrom_itr, mc); + if (!is_rev && strand != 2) + count_states_fwd(aln, counts, mod_buf, *chrom_itr, mc); } } if (!counts.empty()) write_output(hdr, out, prev_tid, *chrom_itr, counts); - // ADS: if some chroms might not be covered by reads, we have to - // iterate over what remains + // ADS: if some chroms might not be covered by reads, we have to iterate + // over what remains if (!require_covered) for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, counts, out); - return mc; + return std::tuple{mc, mod_buf.pc}; } }; @@ -919,7 +1136,7 @@ main_nanocount(int argc, const char **argv) { std::string stats_file; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(std::filesystem::path{argv[0]}.filename(), "get methylation levels from mapped nanopore reads", "-c "); opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, @@ -930,12 +1147,14 @@ main_nanocount(int argc, const char **argv) { true, chroms_file); opt_parse.add_opt("cpg-only", 'n', "print only CpG context cytosines", false, rp.cpg_only); + opt_parse.add_opt("sym", '\0', "collapse symmetric CpGs (implies -n)", + false, rp.symmetric); opt_parse.add_opt("require-covered", 'r', "only output covered sites", false, rp.require_covered); opt_parse.add_opt("strand", '\0', - "use strand (1=positive, 2=negative, 0=default)", false, + "strand-specific (1=forward, 2=reverse, 0=both)", false, rp.strand); - opt_parse.add_opt("stats", 's', "output match/mismatch stats to this file", + opt_parse.add_opt("stats", 's', "output summary stats in json format", false, stats_file); opt_parse.add_opt("force", '\0', "skip consistency checks", false, rp.force); @@ -964,6 +1183,9 @@ main_nanocount(int argc, const char **argv) { if (rp.force) rp.expected_basecall_model.clear(); + if (rp.symmetric) + rp.cpg_only = true; + if (rp.n_threads <= 0) throw std::runtime_error("thread count cannot be negative"); @@ -980,12 +1202,13 @@ main_nanocount(int argc, const char **argv) { << "[genome file: " << chroms_file << "]\n" << rp.tostring(); - const auto mc = rp(mapped_reads_file, outfile, chroms_file); + const auto [mc, pc] = rp(mapped_reads_file, outfile, chroms_file); if (!stats_file.empty()) { std::ofstream stats_out(stats_file); if (!stats_out) throw std::runtime_error("Error opening stats file: " + stats_file); - stats_out << mc.json() << std::endl; + stats_out << R"({"matches":)" << mc.json() << "," + << R"("probabilities":)" << pc.json() << "}\n"; } } catch (const std::exception &e) { diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index a9275929..14a67131 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -348,7 +348,7 @@ is_msite_line(const string &line) { return false; string temp; - if (iss >> temp) + if (MSite::no_extra_fields && iss >> temp) return false; else return true; diff --git a/src/common/TwoStateHMM.cpp b/src/common/TwoStateHMM.cpp index a009f281..7241145d 100644 --- a/src/common/TwoStateHMM.cpp +++ b/src/common/TwoStateHMM.cpp @@ -63,10 +63,9 @@ TwoStateBetaBin::tostring() const { double TwoStateBetaBin::operator()(const pair &val) const { - const size_t x = static_cast(val.first); - const size_t n = static_cast(x + val.second); - return gsl_sf_lnchoose(n, x) + - gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; + return gsl_sf_lngamma(val.first + val.second + 1.0) - + gsl_sf_lngamma(val.first + 1.0) - gsl_sf_lngamma(val.second + 1.0) + + gsl_sf_lnbeta(alpha + val.first, beta + val.second) - lnbeta_helper; } inline static double