From cde9a7b22f983a83a6bc0988ea8ca2c74ed55fd9 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 19:43:03 -0800 Subject: [PATCH 1/6] multiple sources: replacing GenomicRegion and SimpleGenomicRegion with Interval6 and Interval --- src/amrfinder/allelicmeth.cpp | 178 +++-- src/amrfinder/amrtester.cpp | 236 ++++--- src/analysis/cpgbins.cpp | 104 ++- src/analysis/hmr-rep.cpp | 61 +- src/analysis/hmr.cpp | 457 ++++++------- src/analysis/hypermr.cpp | 305 +++++---- src/analysis/metagene.cpp | 177 +++-- src/analysis/multimethstat.cpp | 521 +++++++-------- src/analysis/pmd.cpp | 1109 ++++++++++++++++---------------- src/analysis/roimethstat.cpp | 187 +++--- src/common/Interval.hpp | 17 +- src/common/Interval6.hpp | 20 +- src/common/LevelsCounter.cpp | 41 +- src/common/LevelsCounter.hpp | 62 +- src/common/MSite.cpp | 210 +++--- src/common/bsutils.cpp | 23 +- src/common/bsutils.hpp | 8 +- src/radmeth/dmr.cpp | 289 +++++---- src/radmeth/radmeth-merge.cpp | 139 ++-- src/utils/selectsites.cpp | 280 ++++---- 20 files changed, 2143 insertions(+), 2281 deletions(-) diff --git a/src/amrfinder/allelicmeth.cpp b/src/amrfinder/allelicmeth.cpp index e1705d3b..3b99ff58 100644 --- a/src/amrfinder/allelicmeth.cpp +++ b/src/amrfinder/allelicmeth.cpp @@ -1,6 +1,4 @@ -/* allelicmeth: - * - * Copyright (C) 2014-2022 University of Southern California, +/* Copyright (C) 2014-2022 University of Southern California, * Andrew D. Smith, and Benjamin E. Decato * * Authors: Andrew D. Smith and Benjamin E. Decato @@ -19,7 +17,6 @@ #include "Epiread.hpp" #include "MSite.hpp" -#include "GenomicRegion.hpp" #include "OptionParser.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" @@ -42,27 +39,16 @@ #include #include -using std::cerr; -using std::cout; -using std::max; -using std::min; -using std::runtime_error; -using std::string; -using std::unordered_map; -using std::unordered_set; -using std::vector; - -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) static inline double log_sum_log(const double p, const double q) { - if (p == 0) { + if (p == 0) return q; - } - else if (q == 0) { + if (q == 0) return p; - } - return p > q ? p + log1p(exp(q - p)) : q + log1p(exp(p - q)); + return p > q ? p + std::log1p(std::exp(q - p)) + : q + std::log1p(std::exp(p - q)); } static inline double @@ -77,28 +63,29 @@ lnchoose(const unsigned int n, unsigned int m) { // p(k) = C(n1, k) C(n2, t - k) / C(n1 + n2, t) static double -log_hyper_g(const size_t k, const size_t n1, const size_t n2, const size_t t) { +log_hyper_g(const std::size_t k, const std::size_t n1, const std::size_t n2, + const std::size_t t) { return lnchoose(n1, k) + lnchoose(n2, t - k) - lnchoose(n1 + n2, t); } static double -fishers_exact(size_t a, size_t b, size_t c, size_t d) { - const size_t m = a + c; // sum of first column - const size_t n = b + d; // sum of second column - const size_t k = a + b; // sum of first row +fishers_exact(std::size_t a, std::size_t b, std::size_t c, std::size_t d) { + const std::size_t m = a + c; // sum of first column + const std::size_t n = b + d; // sum of second column + const std::size_t k = a + b; // sum of first row // ADS: want more extreme than "observed" const double observed = log_hyper_g(a, m, n, k); double p = 0.0; - for (size_t i = (n > k ? 0ul : k - n); i <= std::min(k, m); ++i) { + for (std::size_t i = (n > k ? 0ul : k - n); i <= std::min(k, m); ++i) { const double curr = log_hyper_g(i, m, n, k); if (curr <= observed) p = log_sum_log(p, curr); } - return exp(p); + return std::exp(p); } -static size_t -state_pair_to_index(const string &s, const size_t idx) { +static std::size_t +state_pair_to_index(const std::string &s, const std::size_t idx) { assert(idx < s.length() - 1); const char a = s[idx]; if (a == 'C') { @@ -136,13 +123,13 @@ template struct PairStateCounter { return CC + CT + TC + TT; } - string + std::string tostring() const { return toa(CC) + '\t' + toa(CT) + '\t' + toa(TC) + '\t' + toa(TT); } void - increment(const size_t state) { + increment(const std::size_t state) { if (state == 0) ++CC; else if (state == 1) @@ -156,19 +143,20 @@ template struct PairStateCounter { template void -fit_states(const epiread &er, vector> &counts) { - for (size_t i = 0; i < er.length() - 1; ++i) { - const size_t pos = er.pos + i; +fit_states(const epiread &er, std::vector> &counts) { + for (std::size_t i = 0; i < er.length() - 1; ++i) { + const std::size_t pos = er.pos + i; assert(pos < counts.size()); - const size_t curr_state = state_pair_to_index(er.seq, i); + const std::size_t curr_state = state_pair_to_index(er.seq, i); counts[pos].increment(curr_state); } } static void -collect_cpgs(const string &s, unordered_map &cpgs) { - size_t cpg_idx = 0; - size_t nuc_idx = 0; +collect_cpgs(const std::string &s, + std::unordered_map &cpgs) { + std::size_t cpg_idx = 0; + std::size_t nuc_idx = 0; const auto lim = end(s) - (s.size() == 0 ? 0 : 1); for (auto itr = begin(s); itr != lim; ++itr, ++nuc_idx) if (*itr == 'C' && *(itr + 1) == 'G') @@ -179,53 +167,58 @@ collect_cpgs(const string &s, unordered_map &cpgs) { consecutive CpG sites. So there would be one fewer of them than the total CpGs in a chromosome. */ static void -convert_coordinates(const string &chrom, vector &sites) { - unordered_map cpgs; +convert_coordinates(const std::string &chrom, std::vector &sites) { + std::unordered_map cpgs; collect_cpgs(chrom, cpgs); - for (size_t i = 0; i < sites.size(); ++i) { + for (std::size_t i = 0; i < sites.size(); ++i) { const auto pos_itr = cpgs.find(sites[i].pos); if (pos_itr == end(cpgs)) - throw runtime_error("failed converting site:\n" + sites[i].tostring()); + throw std::runtime_error("failed converting site:\n" + + sites[i].tostring()); sites[i].pos = pos_itr->second; } } template void -add_cytosine(const string &chrom_name, const size_t start_cpg, - vector> &counts, vector &cytosines) { +add_cytosine(const std::string &chrom_name, const std::size_t start_cpg, + std::vector> &counts, + std::vector &cytosines) { std::ostringstream s; s << counts[start_cpg].score() << "\t" << counts[start_cpg].total() << "\t" << counts[start_cpg].tostring(); - const string name(s.str()); + const std::string name(s.str()); cytosines.push_back(MSite(chrom_name, start_cpg, '+', name, 0, 0)); } template void -process_chrom(const string &chrom_name, const vector &epireads, - vector &cytosines, vector> &counts) { - for (size_t i = 0; i < epireads.size(); ++i) +process_chrom(const std::string &chrom_name, + const std::vector &epireads, + std::vector &cytosines, + std::vector> &counts) { + for (std::size_t i = 0; i < epireads.size(); ++i) fit_states(epireads[i], counts); - for (size_t i = 0; i < counts.size(); ++i) + for (std::size_t i = 0; i < counts.size(); ++i) add_cytosine(chrom_name, i, counts, cytosines); } static void -update_chroms_seen(const string &chrom_name, - unordered_set &chroms_seen) { +update_chroms_seen(const std::string &chrom_name, + std::unordered_set &chroms_seen) { const auto chr_itr = chroms_seen.find(chrom_name); if (chr_itr != end(chroms_seen)) - throw runtime_error("chroms out of order: " + chrom_name); + throw std::runtime_error("chroms out of order: " + chrom_name); chroms_seen.insert(chrom_name); } static void -verify_chroms_available(const string &chrom_name, - unordered_map &chrom_lookup) { +verify_chroms_available( + const std::string &chrom_name, + std::unordered_map &chrom_lookup) { const auto chr_itr = chrom_lookup.find(chrom_name); if (chr_itr == end(chrom_lookup)) - throw runtime_error("chrom not found: " + chrom_name); + throw std::runtime_error("chrom not found: " + chrom_name); } int @@ -236,8 +229,8 @@ computes probability of allele-specific methylation at each tuple of CpGs )"; bool VERBOSE = false; - string outfile; - string chroms_dir; + std::string outfile; + std::string chroms_dir; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, ""); @@ -246,79 +239,79 @@ computes probability of allele-specific methylation at each tuple of CpGs opt_parse.add_opt("chrom", 'c', "genome sequence file/directory", true, chroms_dir); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string epi_file(leftover_args.front()); + const std::string epi_file(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - vector chrom_names; - vector chroms; + std::vector chrom_names; + std::vector chroms; read_fasta_file_short_names(chroms_dir, chrom_names, chroms); for (auto &&i : chroms) transform(begin(i), end(i), begin(i), [](const char c) { return std::toupper(c); }); // lookup to map chrom names to chrom sequences - unordered_map chrom_lookup; - for (size_t i = 0; i < chrom_names.size(); ++i) + std::unordered_map chrom_lookup; + for (std::size_t i = 0; i < chrom_names.size(); ++i) chrom_lookup.insert(make_pair(chrom_names[i], i)); - unordered_map chrom_sizes; - for (size_t i = 0; i < chrom_names.size(); ++i) { - size_t cpg_count = 0; - for (size_t j = 0; j < chroms[i].size() - 1; ++j) + std::unordered_map chrom_sizes; + for (std::size_t i = 0; i < chrom_names.size(); ++i) { + std::size_t cpg_count = 0; + for (std::size_t j = 0; j < chroms[i].size() - 1; ++j) cpg_count += (chroms[i][j] == 'C' && chroms[i][j + 1] == 'G'); chrom_sizes.insert(make_pair(chrom_names[i], cpg_count)); } if (VERBOSE) - cerr << "number of chromosomes: " << chrom_sizes.size() << '\n'; + std::cerr << "number of chromosomes: " << chrom_sizes.size() << '\n'; std::ifstream in(epi_file); if (!in) - throw runtime_error("cannot open input file: " + epi_file); + throw std::runtime_error("cannot open input file: " + epi_file); - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); - unordered_set chroms_seen; - string chrom; + std::unordered_set chroms_seen; + std::string chrom; epiread er; - vector epireads; + std::vector epireads; while (in >> er) { if (er.chr != chrom) { update_chroms_seen(er.chr, chroms_seen); verify_chroms_available(er.chr, chrom_lookup); if (VERBOSE) - cerr << "[processing " << er.chr << "]" << '\n'; + std::cerr << "[processing " << er.chr << "]" << '\n'; if (!chrom.empty()) { - vector> counts(chrom_sizes[chrom] - 1); - vector cytosines; + std::vector> counts( + chrom_sizes[chrom] - 1); + std::vector cytosines; process_chrom(chrom, epireads, cytosines, counts); - const size_t chrom_idx = chrom_lookup[chrom]; + const std::size_t chrom_idx = chrom_lookup[chrom]; convert_coordinates(chroms[chrom_idx], cytosines); - for (size_t i = 0; i < cytosines.size() - 1; ++i) { + for (std::size_t i = 0; i < cytosines.size() - 1; ++i) { out << cytosines[i].chrom << "\t" << cytosines[i].pos << "\t+\tCpG\t" << cytosines[i].context << '\n'; } @@ -329,22 +322,23 @@ computes probability of allele-specific methylation at each tuple of CpGs chrom.swap(er.chr); } if (!chrom.empty()) { - vector> counts(chrom_sizes[chrom] - 1); - vector cytosines; + std::vector> counts(chrom_sizes[chrom] - + 1); + std::vector cytosines; process_chrom(chrom, epireads, cytosines, counts); - const size_t chrom_idx = chrom_lookup[chrom]; + const std::size_t chrom_idx = chrom_lookup[chrom]; convert_coordinates(chroms[chrom_idx], cytosines); - for (size_t i = 0; i < cytosines.size() - 1; ++i) { + for (std::size_t i = 0; i < cytosines.size() - 1; ++i) { out << cytosines[i].chrom << "\t" << cytosines[i].pos << "\t+\tCpG\t" << cytosines[i].context << '\n'; } } } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index 78da4fed..80d776a2 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -1,29 +1,30 @@ -/* amrtester: A program for testing whether a genomic region has - * allele-specific methylation - * - * Copyright (C) 2014-2023 University of Southern California and - * Benjamin E Decato and Andrew D. Smith and Fang Fang +/* Copyright (C) 2014-2023 Andrew D. Smith, Benjamin E Decato and Fang Fang * * Authors: Andrew D. Smith and Benjamin E Decato and Fang Fang * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . */ +[[maybe_unused]] static constexpr auto about = R"( +amrtester: A program for testing whether a genomic region has allele-specific +methylation +)"; + #include "Epiread.hpp" #include "EpireadStats.hpp" -#include +#include #include #include #include @@ -42,44 +43,32 @@ #include #include -using std::begin; -using std::cerr; -using std::cout; -using std::end; -using std::runtime_error; -using std::streampos; -using std::string; -using std::unordered_map; -using std::vector; - -using epi_r = small_epiread; - -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) +// NOLINTBEGIN(*-narrowing-conversions) static void backup_to_start_of_current_record(std::ifstream &in) { - static const size_t assumed_max_valid_line_width = 10000; - size_t count = 0; + static constexpr std::size_t assumed_max_valid_line_width = 10000; + std::size_t count = 0; while (in.tellg() > 0 && in.peek() != '\n' && in.peek() != '\r' && count++ < assumed_max_valid_line_width) in.seekg(-1, std::ios_base::cur); if (count > assumed_max_valid_line_width) - throw runtime_error("file contains a line longer than " + - std::to_string(assumed_max_valid_line_width)); + throw std::runtime_error("file contains a line longer than " + + std::to_string(assumed_max_valid_line_width)); } -static streampos -find_first_epiread_ending_after_position(const string &query_chrom, - const size_t query_pos, +static std::streampos +find_first_epiread_ending_after_position(const std::string &query_chrom, + const std::size_t query_pos, std::ifstream &in) { in.seekg(0, std::ios_base::end); auto high_pos = in.tellg(); - size_t eof = in.tellg(); + std::size_t eof = in.tellg(); in.seekg(0, std::ios_base::beg); std::streamoff low_pos = 0; - string chrom, seq; - size_t start = 0ul; + std::string chrom, seq; + std::size_t start = 0ul; // This is just binary search on disk while (high_pos > low_pos + 1) { @@ -92,11 +81,11 @@ find_first_epiread_ending_after_position(const string &query_chrom, if (low_pos + 2 == static_cast(eof)) return -1; - if (!(in >> chrom >> start >> seq)) { - throw runtime_error("problem loading reads"); - } + if (!(in >> chrom >> start >> seq)) + throw std::runtime_error("problem loading reads"); + if (chrom < query_chrom || - (chrom == query_chrom && start + seq.length() <= query_pos)) + (chrom == query_chrom && start + std::size(seq) <= query_pos)) low_pos = mid_pos; else high_pos = mid_pos; @@ -105,62 +94,58 @@ find_first_epiread_ending_after_position(const string &query_chrom, } static void -load_reads(const string &reads_file_name, const GenomicRegion ®ion, - vector &the_reads) { +load_reads(const std::string &reads_file_name, const Interval6 ®ion, + std::vector &the_reads) { // open and check the file std::ifstream in(reads_file_name.c_str()); if (!in) - throw runtime_error("cannot open input file " + reads_file_name); + throw std::runtime_error("cannot open input file " + reads_file_name); - const string query_chrom(region.get_chrom()); - const size_t query_start = region.get_start(); - const size_t query_end = region.get_end(); - const streampos low_offset = + const std::string query_chrom(region.chrom); + const std::size_t query_start = region.start; + const std::size_t query_end = region.stop; + const std::streampos low_offset = find_first_epiread_ending_after_position(query_chrom, query_start, in); in.seekg(low_offset, std::ios_base::beg); backup_to_start_of_current_record(in); - string chrom, seq; - size_t start = 0ul; + std::string chrom, seq; + std::size_t start = 0ul; while ((in >> chrom >> start >> seq) && chrom == query_chrom && start < query_end) the_reads.emplace_back(start, seq); } static void -convert_coordinates(const vector &cpg_positions, - GenomicRegion ®ion) { - const size_t start_pos = - lower_bound(cbegin(cpg_positions), cend(cpg_positions), - region.get_start()) - - cbegin(cpg_positions); - - const size_t end_pos = - lower_bound(cbegin(cpg_positions), cend(cpg_positions), region.get_end()) - - cbegin(cpg_positions); - - region.set_start(start_pos); - region.set_end(end_pos); +convert_coordinates(const std::vector &cpg_positions, + Interval6 ®ion) { + const auto lb_pos = [](const auto &v, const auto x) { + return std::distance(std::cbegin(v), + std::lower_bound(std::cbegin(v), std::cend(v), x)); + }; + region.start = lb_pos(cpg_positions, region.start); + region.stop = lb_pos(cpg_positions, region.stop); } inline static bool -is_cpg(const string &s, const size_t idx) { - return toupper(s[idx]) == 'C' && toupper(s[idx + 1]) == 'G'; +is_cpg(const std::string &s, const std::size_t idx) { + return std::toupper(s[idx]) == 'C' && std::toupper(s[idx + 1]) == 'G'; } static void -collect_cpgs(const string &s, vector &cpgs) { - const size_t lim = s.length() - 1; - for (size_t i = 0; i < lim; ++i) +collect_cpgs(const std::string &s, std::vector &cpgs) { + const std::size_t lim = s.length() - 1; + for (std::size_t i = 0; i < lim; ++i) if (is_cpg(s, i)) cpgs.push_back(i); } static void -clip_reads(const size_t start_pos, const size_t end_pos, vector &r) { - size_t j = 0; - for (size_t i = 0; i < r.size(); ++i) { +clip_reads(const std::size_t start_pos, const std::size_t end_pos, + std::vector &r) { + std::size_t j = 0; + for (std::size_t i = 0; i < std::size(r); ++i) { if (start_pos < r[i].pos + r[i].seq.length() && r[i].pos < end_pos) { if (r[i].pos < start_pos) { assert(start_pos - r[i].pos < r[i].seq.length()); @@ -173,18 +158,16 @@ clip_reads(const size_t start_pos, const size_t end_pos, vector &r) { ++j; } } - r.erase(begin(r) + j, end(r)); + r.erase(std::begin(r) + j, std::end(r)); } // give names to regions if they do not exist static void -ensure_regions_are_named( - vector ®ions // cppcheck-suppress constParameterReference -) { +ensure_regions_are_named(std::vector ®ions) { auto region_name_idx = 0u; - for (auto region : regions) - if (region.get_name().empty()) - region.set_name("region" + std::to_string(++region_name_idx)); + for (auto ®ion : regions) + if (region.name.empty()) + region.name = "region" + std::to_string(++region_name_idx); } int @@ -197,11 +180,14 @@ main_amrtester(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) bool use_bic = false; bool correct_for_read_count = true; - string outfile; - string chrom_file; + std::string outfile; + std::string chrom_file; - size_t max_itr = 10; - double high_prob = 0.75, low_prob = 0.25; + // NOLINTBEGIN(*-avoid-magic-numbers) + std::size_t max_itr{10}; + double high_prob{0.75}; + double low_prob{0.25}; + // NOLINTEND(*-avoid-magic-numbers) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) @@ -217,111 +203,109 @@ main_amrtester(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.add_opt("progress", 'P', "show progress", false, show_progress); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + if (std::size(leftover_args) != 2) { + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } - const string regions_file(leftover_args.front()); - const string reads_file_name(leftover_args.back()); + const std::string regions_file(leftover_args.front()); + const std::string reads_file_name(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ const EpireadStats epistat{low_prob, high_prob, critical_value, max_itr, use_bic, correct_for_read_count}; if (!validate_epiread_file(reads_file_name)) - throw runtime_error("invalid states file: " + reads_file_name); + throw std::runtime_error("invalid states file: " + reads_file_name); /* first load in all the chromosome sequences and names, and make a map from chromosome name to the location of the chromosome itself */ - vector chroms; - vector chrom_names; - unordered_map chrom_lookup; + std::vector chroms; + std::vector chrom_names; + std::unordered_map chrom_lookup; read_fasta_file_short_names(chrom_file, chrom_names, chroms); - for (auto i = 0u; i < size(chroms); ++i) { - if (chrom_lookup.find(chrom_names[i]) != cend(chrom_lookup)) - throw runtime_error("repeated chromosome name: " + chrom_names[i]); + for (auto i = 0u; i < std::size(chroms); ++i) { + if (chrom_lookup.find(chrom_names[i]) != std::cend(chrom_lookup)) + throw std::runtime_error("repeated chromosome name: " + chrom_names[i]); chrom_lookup[chrom_names[i]] = i; } - vector regions; - ReadBEDFile(regions_file, regions); - if (!check_sorted(regions)) - throw runtime_error("regions not sorted in: " + regions_file); + auto regions = read_intervals6(regions_file); + if (!std::is_sorted(std::cbegin(regions), std::cend(regions))) + throw std::runtime_error("regions not sorted in: " + regions_file); ensure_regions_are_named(regions); - auto n_regions = size(regions); + auto n_regions = std::size(regions); if (verbose) - cerr << "number of regions: " << n_regions << '\n'; + std::cerr << "number of regions: " << n_regions << '\n'; - string chrom_name; - vector cpg_positions; + std::string chrom_name; + std::vector cpg_positions; - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open file: " + outfile); bool is_significant = false; - ProgressBar progress(size(regions)); + ProgressBar progress(std::size(regions)); auto progress_idx = 0u; for (auto ®ion : regions) { if (show_progress && progress.time_to_report(progress_idx)) - progress.report(cerr, progress_idx); + progress.report(std::cerr, progress_idx); ++progress_idx; // get the correct chrom if it has changed - if (region.get_chrom() != chrom_name) { - chrom_name = region.get_chrom(); + if (region.chrom != chrom_name) { + chrom_name = region.chrom; auto the_chrom = chrom_lookup.find(chrom_name); if (the_chrom == end(chrom_lookup)) - throw runtime_error("could not find chrom: " + chrom_name); + throw std::runtime_error("could not find chrom: " + chrom_name); cpg_positions.clear(); collect_cpgs(chroms[the_chrom->second], cpg_positions); } - GenomicRegion conv_region(region); + Interval6 conv_region(region); convert_coordinates(cpg_positions, conv_region); - vector reads; + std::vector reads; load_reads(reads_file_name, conv_region, reads); - clip_reads(conv_region.get_start(), conv_region.get_end(), reads); + clip_reads(conv_region.start, conv_region.stop, reads); const auto score = reads.empty() ? 1.0 : epistat.test_asm(reads, is_significant); - region.set_score(static_cast(score)); - region.set_name(region.get_name() + ":" + toa(reads.size())); + region.score = score; + region.name += ":" + std::to_string(std::size(reads)); out << region << '\n'; } if (show_progress) - cerr << "\r100%\n"; + std::cerr << "\r100%\n"; } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index 6ab28693..a76d74f4 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -15,13 +15,14 @@ * General Public License for more details. */ -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "LevelsCounter.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" #include "xcounts_utils.hpp" #include +#include #include #include #include @@ -36,67 +37,35 @@ #include #include -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) - -static std::string -format_levels_counter(const LevelsCounter &lc) { - // ... - // (7) weighted mean methylation - // (8) unweighted mean methylation - // (9) fractional methylation - // (10) number of sites in the region - // (11) number of sites covered at least once - // (12) number of observations in reads indicating methylation - // (13) total number of observations from reads in the region - std::ostringstream oss; - // clang-format off - oss << lc.mean_meth_weighted() << '\t' - << lc.mean_meth() << '\t' - << lc.fractional_meth() << '\t' - << lc.total_sites << '\t' - << lc.sites_covered << '\t' - << lc.total_c << '\t' - << (lc.total_c + lc.total_t) << '\t' - << lc.total_meth << '\t' - << lc.sites_covered; - // clang-format on - return oss.str(); -} - static std::unordered_map get_chrom_sizes(const std::string &chrom_sizes_file) { - // NOLINTBEGIN(performance-inefficient-string-concatenation) - std::unordered_map chrom_sizes; - std::ifstream in(chrom_sizes_file); if (!in) throw std::runtime_error("failed to open file: " + chrom_sizes_file); + std::unordered_map chrom_sizes; + std::string line; while (getline(in, line)) { std::istringstream iss(line); std::string chrom_name{}; std::uint64_t chrom_size{}; if (!(iss >> chrom_name >> chrom_size)) - throw std::runtime_error("bad line in " + chrom_sizes_file + ":\n" + - line); + throw std::runtime_error("bad chromosome sizes line:\n" + line); std::string dummy; if (iss >> dummy) throw std::runtime_error("too many columns: " + line); if (chrom_sizes.find(chrom_name) != std::cend(chrom_sizes)) - throw std::runtime_error("repeated entry " + chrom_name + " in " + - chrom_sizes_file); + throw std::runtime_error("repeated entry in chromosome sizes: " + line); chrom_sizes[chrom_name] = chrom_size; } return chrom_sizes; - // NOLINTEND(performance-inefficient-string-concatenation) } static std::vector get_chrom_names(const std::string &chrom_sizes_file) { - // NOLINTBEGIN(performance-inefficient-string-concatenation) std::ifstream in(chrom_sizes_file); if (!in) throw std::runtime_error("failed to open file: " + chrom_sizes_file); @@ -108,17 +77,15 @@ get_chrom_names(const std::string &chrom_sizes_file) { std::istringstream iss(line); std::string chrom_name{}; if (!(iss >> chrom_name)) - throw std::runtime_error("bad line in " + chrom_sizes_file + ":\n" + - line); - + throw std::runtime_error("bad line in chromosome sizes file: " + line); chrom_names.push_back(chrom_name); } return chrom_names; - // NOLINTEND(performance-inefficient-string-concatenation) } static void update(LevelsCounter &lc, const xcounts_entry &xce) { + static constexpr auto one_half = 0.5; const std::uint64_t n_reads = xce.n_reads(); if (n_reads > 0) { ++lc.sites_covered; @@ -127,10 +94,11 @@ update(LevelsCounter &lc, const xcounts_entry &xce) { lc.total_t += xce.n_unmeth; const auto meth = xce.frac(); lc.total_meth += meth; - double lower = 0.0, upper = 0.0; - wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper); - lc.called_meth += (lower > 0.5); - lc.called_unmeth += (upper < 0.5); + double lower{}, upper{}; + wilson_ci_for_binomial(lc.alpha, static_cast(n_reads), meth, lower, + upper); + lc.called_meth += (lower > one_half); + lc.called_unmeth += (upper < one_half); } ++lc.total_sites; } @@ -141,11 +109,11 @@ get_name(const bool report_more_info, const char level_code, static const std::string tag = "CpG"; return tag + (report_more_info ? "" - : "_" + std::to_string( - (level_code == 'w' - ? lc.coverage() - : (level_code == 'u' ? lc.sites_covered - : lc.total_called())))); + : "_" + std::to_string(level_code == 'w' + ? lc.coverage() + : (level_code == 'u' + ? lc.sites_covered + : lc.total_called()))); } static void @@ -153,9 +121,8 @@ process_chrom(const bool report_more_info, const char level_code, const std::string &chrom_name, const std::uint64_t chrom_size, const std::uint64_t bin_size, const std::vector &sites, std::ostream &out) { - GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); - - std::uint64_t j = 0; + Interval6 r(chrom_name, 0, 0, "CpG", 0.0, '+'); + std::uint64_t j{}; for (auto i = 0ul; i < chrom_size; i += bin_size) { while (j < std::size(sites) && sites[j].pos < i) ++j; @@ -164,13 +131,13 @@ process_chrom(const bool report_more_info, const char level_code, while (j < std::size(sites) && sites[j].pos < i + bin_size) update(lc, sites[j++]); - r.set_start(i); - r.set_end(std::min(i + bin_size, chrom_size)); - r.set_score(level_code == 'w' ? lc.mean_meth_weighted() - : (level_code == 'u' ? lc.mean_meth() - : lc.fractional_meth())); + r.start = i; + r.stop = std::min(i + bin_size, chrom_size); + r.score = level_code == 'w' + ? lc.mean_meth_weighted() + : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); - r.set_name(get_name(report_more_info, level_code, lc)); + r.name = get_name(report_more_info, level_code, lc); out << r; if (report_more_info) @@ -183,12 +150,12 @@ static void process_chrom(const bool report_more_info, const std::string &chrom_name, const std::uint64_t chrom_size, const std::uint64_t bin_size, std::ostream &out) { - GenomicRegion r(chrom_name, 0, 0, "CpG_0", 0.0, '+'); + Interval6 r(chrom_name, 0, 0, "CpG_0", 0.0, '+'); LevelsCounter lc; const std::string lc_formatted = format_levels_counter(lc); for (auto i = 0ul; i < chrom_size; i += bin_size) { - r.set_start(i); - r.set_end(std::min(i + bin_size, chrom_size)); + r.start = i; + r.stop = std::min(i + bin_size, chrom_size); out << r; if (report_more_info) out << '\t' << lc_formatted; @@ -216,9 +183,11 @@ Columns (beyond the first 6) in the BED format output: bool verbose = false; bool report_more_info = false; - std::uint32_t n_threads = 1; + std::int32_t n_threads = 1; // std::uint64_t bin_size = 1000; - size_t bin_size = 1000; // ADS: for macOS gcc-14.2.0 + + // ADS: for macOS gcc-14.2.0 + std::size_t bin_size = 1000; // NOLINT(*-avoid-magic-numbers) std::string level_code = "w"; std::string outfile; @@ -261,6 +230,11 @@ Columns (beyond the first 6) in the BED format output: const std::string xcounts_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ + if (n_threads <= 0) { + std::cerr << "number of threads must be at least 1\n"; + return EXIT_FAILURE; + } + if (!std::filesystem::is_regular_file(chrom_sizes_file)) throw std::runtime_error("chromosome sizes file not a regular file: " + chrom_sizes_file); @@ -295,5 +269,3 @@ Columns (beyond the first 6) in the BED format output: } return EXIT_SUCCESS; } - -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) diff --git a/src/analysis/hmr-rep.cpp b/src/analysis/hmr-rep.cpp index ec40f9ca..125af4d3 100644 --- a/src/analysis/hmr-rep.cpp +++ b/src/analysis/hmr-rep.cpp @@ -1,23 +1,22 @@ -/* Copyright (C) 2019-2022 University of Southern California - * Andrew D Smith - * Author: Andrew D. Smith, Song Qiang, Jenny Qu +/* Copyright (C) 2019-2025 Andrew D. Smith, Song Qiang, Jenny Qu * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. * - * This is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "MSite.hpp" -#include "OptionParser.hpp" #include "TwoStateHMM.hpp" +#include "OptionParser.hpp" + #include #include @@ -40,9 +39,9 @@ // NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) -static GenomicRegion +static Interval6 as_gen_rgn(const MSite &s) { - return GenomicRegion(s.chrom, s.pos, s.pos + 1); + return Interval6(s.chrom, s.pos, s.pos + 1, std::string{}, 0.0, '+'); } static double @@ -102,15 +101,15 @@ static void build_domains(const std::vector &cpgs, const std::vector &reset_points, const std::vector &state_ids, - std::vector &domains) { + std::vector &domains) { size_t n_cpgs = 0, n_domains = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; for (size_t i = 0; i < state_ids.size(); ++i) { if (reset_points[reset_idx] == i) { if (in_domain) { in_domain = false; - domains.back().set_end(prev_end); - domains.back().set_score(n_cpgs); + domains.back().stop = prev_end; + domains.back().score = n_cpgs; n_cpgs = 0; } ++reset_idx; @@ -119,14 +118,14 @@ build_domains(const std::vector &cpgs, if (!in_domain) { in_domain = true; domains.push_back(as_gen_rgn(cpgs[i])); - domains.back().set_name("HYPO" + std::to_string(n_domains++)); + domains.back().name = "HYPO" + std::to_string(n_domains++); } ++n_cpgs; } else if (in_domain) { in_domain = false; - domains.back().set_end(prev_end); - domains.back().set_score(n_cpgs); + domains.back().stop = prev_end; + domains.back().score = n_cpgs; n_cpgs = 0; } prev_end = cpgs[i].pos + 1; @@ -487,7 +486,7 @@ main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) if (fdr_cutoff == std::numeric_limits::max()) fdr_cutoff = get_stepup_cutoff(p_values, 0.01); - std::vector domains; + std::vector domains; build_domains(cpgs, reset_points, state_ids, domains); std::ofstream of; @@ -498,7 +497,7 @@ main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) size_t good_hmr_count = 0; for (size_t i = 0; i < domains.size(); ++i) if (p_values[i] < fdr_cutoff) { - domains[i].set_name("HYPO" + std::to_string(good_hmr_count++)); + domains[i].name = "HYPO" + std::to_string(good_hmr_count++); out << domains[i] << '\n'; } @@ -517,10 +516,10 @@ main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) m_reads += meth[j][i].first; u_reads += meth[j][i].second; } - GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name("CpG:" + std::to_string(m_reads) + ":" + - std::to_string(u_reads)); - cpg.set_score(posteriors[i]); + Interval6 cpg(as_gen_rgn(cpgs[i])); + cpg.name = + "CpG:" + std::to_string(m_reads) + ":" + std::to_string(u_reads); + cpg.score = posteriors[i]; out_post << cpg << '\n'; } } @@ -535,10 +534,10 @@ main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) m_reads += meth[j][i].first; u_reads += meth[j][i].second; } - GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name("CpG:" + std::to_string(m_reads) + ":" + - std::to_string(u_reads)); - cpg.set_score(1.0 - posteriors[i]); + Interval6 cpg(as_gen_rgn(cpgs[i])); + cpg.name = + "CpG:" + std::to_string(m_reads) + ":" + std::to_string(u_reads); + cpg.score = 1.0 - posteriors[i]; out_post << cpg << '\n'; } } diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp index 295ebeeb..d5faeeab 100644 --- a/src/analysis/hmr.cpp +++ b/src/analysis/hmr.cpp @@ -1,20 +1,19 @@ -/* Copyright (C) 2009-2023 University of Southern California - * Andrew D Smith +/* Copyright (C) 2009-2025 Andrew D Smith and Song Qiang * - * Author: Andrew D. Smith, Song Qiang + * Author: Andrew D. Smith and Song Qiang * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. * - * This is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "MSite.hpp" #include "OptionParser.hpp" #include "TwoStateHMM.hpp" @@ -22,6 +21,8 @@ #include +#include "nlohmann/json.hpp" + #include #include #include @@ -41,18 +42,64 @@ #include #include -// NOLINTBEGIN(*-narrowing-conversions) +template +[[nodiscard]] static auto +get_mean(InputIterator first, InputIterator last) -> T { + return std::accumulate(first, last, static_cast(0)) / + std::distance(first, last); +} + +struct hmr_params { + static constexpr auto frac_init = 0.33; + static constexpr auto trans_init = 0.25; + double fg_alpha{}; + double fg_beta{}; + double bg_alpha{}; + double bg_beta{}; + double p_fb{}; + double p_bf{}; + double domain_score_cutoff{}; + + [[nodiscard]] auto + fg_meth() const { + return fg_alpha / (fg_alpha + fg_beta); + } + + [[nodiscard]] auto + bg_meth() const { + return bg_alpha / (bg_alpha + bg_beta); + } + + auto + init(const double n_reads) { + fg_alpha = frac_init * n_reads; + fg_beta = (1.0 - frac_init) * n_reads; + bg_alpha = (1.0 - frac_init) * n_reads; + bg_beta = frac_init * n_reads; + p_fb = trans_init; + p_bf = trans_init; + domain_score_cutoff = std::numeric_limits::max(); + } + + /// Estimate methylation level using param values and the posterior + /// probability on HMR status + [[nodiscard]] auto + est_meth(const double posterior) -> double { + return fg_meth() * posterior + bg_meth() * (1.0 - posterior); + } + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(hmr_params, fg_alpha, fg_beta, bg_alpha, + bg_beta, p_fb, p_bf, domain_score_cutoff); +}; struct hmr_summary { - explicit hmr_summary(const std::vector &hmrs) : + explicit hmr_summary(const std::vector &hmrs) : hmr_count{std::size(hmrs)}, - hmr_total_size{ - std::accumulate(std::cbegin(hmrs), std::cend(hmrs), 0ul, - [](const std::uint64_t t, const GenomicRegion &p) { - return t + p.get_width(); - })} { + hmr_total_size{std::accumulate( + std::cbegin(hmrs), std::cend(hmrs), 0ul, + [](const std::uint64_t t, const Interval6 &p) { return t + size(p); })} { hmr_mean_size = static_cast(hmr_total_size) / - std::max(hmr_count, static_cast(1)); + std::max(static_cast(hmr_count), 1u); } // hmr_count is the number of identified HMRs. std::uint64_t hmr_count{}; @@ -61,39 +108,35 @@ struct hmr_summary { // mean_hmr_size is the mean size of the identified HMRs double hmr_mean_size{}; - std::string - tostring() { + auto + tostring() -> std::string { std::ostringstream oss; oss << "hmr_count: " << hmr_count << '\n' << "hmr_total_size: " << hmr_total_size << '\n' << "hmr_mean_size: " << std::fixed << std::setprecision(2) << hmr_mean_size; - return oss.str(); } + NLOHMANN_DEFINE_TYPE_INTRUSIVE(hmr_summary, hmr_count, hmr_total_size, + hmr_mean_size); }; -static GenomicRegion -as_gen_rgn(const MSite &s) { - return GenomicRegion(s.chrom, s.pos, s.pos + 1); -} - -static std::string -format_cpg_meth_tag(const std::pair &m) { - return "CpG:" + std::to_string(static_cast(m.first)) + ":" + - std::to_string(static_cast(m.second)); +static auto +to_interval(const MSite &s) -> Interval6 { + const auto p = static_cast(s.pos); + return {s.chrom, p, p + 1, {}, {}, '+'}; } -static double -get_stepup_cutoff(std::vector scores, const double cutoff) { +static auto +get_stepup_cutoff(std::vector scores, const double cutoff) -> double { if (cutoff <= 0) return std::numeric_limits::max(); - else if (cutoff > 1) + if (cutoff > 1) return std::numeric_limits::min(); - const std::size_t n = std::size(scores); + const std::uint32_t n = std::size(scores); std::sort(begin(scores), std::end(scores)); - std::size_t i = 1; + std::uint32_t i = 1; while (i < n && scores[i - 1] < (cutoff * i) / n) ++i; return scores[i - 1]; @@ -105,8 +148,8 @@ get_domain_scores(const std::vector &state_ids, const std::vector &reset_points) -> std::vector { std::size_t reset_idx = 1; - bool in_domain = false; - double score = 0; + bool in_domain{}; + double score{}; std::vector domain_scores; for (std::size_t i = 0; i < std::size(state_ids); ++i) { if (reset_points[reset_idx] == i) { @@ -136,51 +179,53 @@ static void build_domains(const std::vector &cpgs, const std::vector &reset_points, const std::vector &state_ids, - std::vector &domains) { - std::size_t n_cpgs = 0, reset_idx = 1, prev_end = 0; + std::vector &domains) { + std::uint32_t n_cpgs{}; + std::uint32_t prev_stop{}; bool in_domain = false; - for (std::size_t i = 0; i < std::size(state_ids); ++i) { - if (reset_points[reset_idx] == i) { + // assume reset points include 0 and size(cpgs) + auto r_itr = std::cbegin(reset_points) + 1; + for (std::uint32_t i = 0; i < std::size(state_ids); ++i) { + if (i == *r_itr) { if (in_domain) { in_domain = false; - domains.back().set_end(prev_end); - domains.back().set_score(n_cpgs); + domains.back().stop = prev_stop; + domains.back().score = n_cpgs; n_cpgs = 0; } - ++reset_idx; + ++r_itr; } if (state_ids[i]) { // currently in an hmr if (!in_domain) { in_domain = true; - domains.push_back(as_gen_rgn(cpgs[i])); - domains.back().set_name("HYPO" + std::to_string(std::size(domains))); + domains.push_back(to_interval(cpgs[i])); + domains.back().name = "HYPO" + std::to_string(std::size(domains)); } ++n_cpgs; } else if (in_domain) { in_domain = false; - domains.back().set_end(prev_end); - domains.back().set_score(n_cpgs); + domains.back().stop = prev_stop; + domains.back().score = n_cpgs; n_cpgs = 0; } - prev_end = cpgs[i].pos + 1; + prev_stop = cpgs[i].pos + 1; } if (in_domain) { - domains.back().set_end(prev_end); - domains.back().set_score(n_cpgs); + domains.back().stop = prev_stop; + domains.back().score = n_cpgs; } } template -static void +static auto separate_regions(const bool verbose, const std::size_t desert_size, std::vector &cpgs, std::vector &meth, - std::vector &reads, - std::vector &reset_points) { + std::vector &reads) -> std::vector { if (verbose) std::cerr << "[separating by cpg desert]\n"; // eliminate the zero-read cpgs - std::size_t j = 0; + std::int64_t j = 0; for (std::size_t i = 0; i < std::size(cpgs); ++i) if (reads[i] > 0) { cpgs[j] = cpgs[i]; @@ -192,8 +237,10 @@ separate_regions(const bool verbose, const std::size_t desert_size, meth.erase(std::begin(meth) + j, std::end(meth)); reads.erase(std::begin(reads) + j, std::end(reads)); - double total_bases = 0; - double bases_in_deserts = 0; + std::vector reset_points; + + std::size_t total_bases{}; + std::size_t bases_in_deserts{}; // segregate cpgs std::size_t prev_pos = 0; for (std::size_t i = 0; i < std::size(cpgs); ++i) { @@ -213,25 +260,28 @@ separate_regions(const bool verbose, const std::size_t desert_size, reset_points.push_back(std::size(cpgs)); if (verbose) - std::cerr << "[cpgs retained: " << std::size(cpgs) << "]" << '\n' + std::cerr << "[cpgs retained: " << std::size(cpgs) << "]\n" << "[deserts removed: " << std::size(reset_points) - 2 << "]\n" << "[genome fraction covered: " - << 1.0 - (bases_in_deserts / total_bases) << "]\n"; + << 1.0 - (static_cast(bases_in_deserts) / + static_cast(total_bases)) + << "]\n"; + return reset_points; } -/* function to "fold" the methylation profile so that the middle - * methylation becomes lower methylation, and both the low and high - * methylation become high. this method actually seems to work. +/* function to "fold" the methylation profile so that the middle methylation + * becomes lower methylation, and both the low and high methylation become + * high. this method actually seems to work. */ static void make_partial_meth(const std::vector &reads, std::vector> &meth) { - static constexpr auto half = 0.5; + static constexpr auto one_half = 0.5; for (std::size_t i = 0; i < std::size(reads); ++i) { - double m = meth[i].first / reads[i]; - m = (m <= half) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m)); - meth[i].first = reads[i] * m; - meth[i].second = (reads[i] - meth[i].first); + const auto r = reads[i]; + double m = meth[i].first / r; + m = (m <= one_half) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m)); + meth[i] = {r * m, r * (1.0 - m)}; } } @@ -256,160 +306,121 @@ shuffle_cpgs(const std::size_t rng_seed, const TwoStateHMM &hmm, [[nodiscard]] static auto assign_p_values(const std::vector &random, const std::vector &observed) -> std::vector { - const double n_randoms = random.empty() ? 1 : std::size(random); + const auto n_rand = + random.empty() ? 1.0 : static_cast(std::size(random)); const auto n_scores = std::size(observed); + const auto r_end = std::cend(random); std::vector p_values(n_scores); for (auto i = 0u; i < n_scores; ++i) { - const auto ub = - std::upper_bound(std::cbegin(random), std::cend(random), observed[i]); - p_values[i] = std::distance(ub, std::end(random)) / n_randoms; + const auto ub = std::upper_bound(std::cbegin(random), r_end, observed[i]); + p_values[i] = static_cast(std::distance(ub, r_end)) / n_rand; } return p_values; } -static void -read_params_file(const bool verbose, const std::string ¶ms_file, - double &fg_alpha, double &fg_beta, double &bg_alpha, - double &bg_beta, double &p_fb, double &p_bf, - double &domain_score_cutoff) { - std::string jnk; +[[nodiscard]] static auto +read_params_file(const std::string ¶ms_file) -> hmr_params { std::ifstream in(params_file); if (!in) - throw std::runtime_error("failed to parse params file: " + params_file); - in >> jnk >> fg_alpha >> jnk >> fg_beta >> jnk >> bg_alpha >> jnk >> - bg_beta >> jnk >> p_fb >> jnk >> p_bf >> jnk >> domain_score_cutoff; - if (verbose) - std::cerr << "FG_ALPHA\t" << fg_alpha << '\n' - << "FG_BETA\t" << fg_beta << '\n' - << "BG_ALPHA\t" << bg_alpha << '\n' - << "BG_BETA\t" << bg_beta << '\n' - << "F_B\t" << p_fb << '\n' - << "B_F\t" << p_bf << '\n' - << "DOMAIN_SCORE_CUTOFF\t" << domain_score_cutoff << '\n'; + throw std::runtime_error("failed to open params file: " + params_file); + const nlohmann::json data = nlohmann::json::parse(in); + try { + hmr_params p(data); + return p; + } + catch (const std::exception &e) { + const auto msg = "failed to parse parameters file: " + params_file; + throw std::runtime_error(std::string(e.what()) + "\n" + msg); + } } static void -write_params_file(const std::string &outfile, const double fg_alpha, - const double fg_beta, const double bg_alpha, - const double bg_beta, const double p_fb, const double p_bf, - const double domain_score_cutoff) { - static constexpr auto precision_value = 30; - std::ofstream of; - if (!outfile.empty()) - of.open(outfile.c_str()); - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); - - out.precision(precision_value); - out << "FG_ALPHA\t" << fg_alpha << '\n' - << "FG_BETA\t" << fg_beta << '\n' - << "BG_ALPHA\t" << bg_alpha << '\n' - << "BG_BETA\t" << bg_beta << '\n' - << "F_B\t" << p_fb << '\n' - << "B_F\t" << p_bf << '\n' - << "DOMAIN_SCORE_CUTOFF\t" << domain_score_cutoff << '\n'; +write_params_file(const std::string &outfile, const hmr_params ¶ms) { + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open params output file: " + outfile); + out << nlohmann::json(params); } [[nodiscard]] -static inline double -get_n_meth(const MSite &s) { - return s.meth * s.n_reads; +static inline auto +get_n_meth(const MSite &s) -> double { + return s.meth * static_cast(s.n_reads); } [[nodiscard]] -static inline double -get_n_unmeth(const MSite &s) { - return (1.0 - s.meth) * s.n_reads; +static inline auto +get_n_unmeth(const MSite &s) -> double { + return (1.0 - s.meth) * static_cast(s.n_reads); } -static void -load_cpgs(const std::string &cpgs_file, std::vector &cpgs, - std::vector> &meth, - std::vector &reads) { +static auto +load_cpgs(const std::string &cpgs_file) -> std::vector { bamxx::bgzf_file in(cpgs_file, "r"); if (!in) throw std::runtime_error("failed opening file: " + cpgs_file); - if (get_has_counts_header(cpgs_file)) skip_counts_header(in); - + std::vector cpgs; MSite prev_site, the_site; while (read_site(in, the_site)) { if (!the_site.is_cpg() || distance(prev_site, the_site) < 2) - throw std::runtime_error("error: input is not symmetric-CpGs: " + - cpgs_file); + throw std::runtime_error("input is not symmetric-CpGs: " + cpgs_file); cpgs.push_back(the_site); - reads.push_back(the_site.n_reads); - meth.push_back( - std::make_pair(get_n_meth(the_site), get_n_unmeth(the_site))); prev_site = the_site; } -} - -template -static auto -get_mean(InputIterator first, InputIterator last) -> T { - return std::accumulate(first, last, static_cast(0)) / - std::distance(first, last); + return cpgs; } template static void check_sorted_within_chroms(T first, const T last) { // empty, or a single element - if (first == last || first + 1 == last) + if (last <= first + 1) return; - std::unordered_set chroms_seen; - std::string cur_chrom = ""; - - for (auto fast = first + 1; fast != last; ++fast, ++first) { - if (fast->chrom != cur_chrom) { - if (chroms_seen.find(fast->chrom) != std::end(chroms_seen)) { + std::string cur_chrom; + for (auto next = first + 1; next != last; ++next, ++first) { + if (next->chrom != cur_chrom) { + if (chroms_seen.find(next->chrom) != std::end(chroms_seen)) throw std::runtime_error("input not grouped by chromosomes. " "Error in the following line:\n" + - fast->tostring()); - } - cur_chrom = fast->chrom; + next->tostring()); + cur_chrom = next->chrom; chroms_seen.insert(cur_chrom); // don't check ordering if chroms have changed } - else { // first and fast are in the same chrom - if (first->pos >= fast->pos) { + else { // first and next are in the same chrom + if (first->pos >= next->pos) { throw std::runtime_error("input file not sorted properly. " "Error in the following lines:\n" + - first->tostring() + "\n" + fast->tostring()); + first->tostring() + "\n" + next->tostring()); } } } } -int -main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) +auto +main_hmr(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) try { static constexpr auto min_coverage = 1.0; static constexpr auto tolerance = 1e-10; // corrections for small values static constexpr auto p_value_cutoff = 0.01; - static constexpr auto alpha_frac = 0.33; std::string outfile; - std::string hypo_post_outfile; - std::string meth_post_outfile; + std::string post_outfile; + std::string meth_outfile; - // NOLINTBEGIN(*-avoid-magic-numbers) - std::size_t desert_size{1000}; - std::size_t max_iterations{10}; - std::size_t rng_seed{408}; - double p_fb{0.25}; - double p_bf{0.25}; - // NOLINTEND(*-avoid-magic-numbers) + std::size_t desert_size{1000}; // NOLINT(*-avoid-magic-numbers) + std::size_t max_iterations{10}; // NOLINT(*-avoid-magic-numbers) + std::size_t rng_seed{408}; // NOLINT(*-avoid-magic-numbers) // run mode flags - bool verbose = false; - bool PARTIAL_METH = false; - bool allow_extra_fields = false; + bool verbose{false}; + bool PARTIAL_METH{false}; + bool allow_extra_fields{false}; std::string summary_file; - std::string params_in_file; std::string params_out_file; @@ -434,11 +445,11 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("post-hypo", '\0', "output file for single-CpG posterior " "hypomethylation probability (default: none)", - false, hypo_post_outfile); + false, post_outfile); opt_parse.add_opt("post-meth", '\0', "output file for single-CpG posteiror " "methylation probability (default: none)", - false, meth_post_outfile); + false, meth_outfile); opt_parse.add_opt("params-in", 'P', "HMM parameter file " "(override training)", @@ -486,16 +497,22 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) throw std::runtime_error("failed to open output file: " + outfile); // separate the regions by chrom and by desert - std::vector cpgs; - std::vector> meth; - std::vector reads; if (verbose) std::cerr << "[reading methylation levels]\n"; - load_cpgs(cpgs_file, cpgs, meth, reads); - + const auto orig_cpgs = load_cpgs(cpgs_file); if (verbose) std::cerr << "[checking if input is properly formatted]\n"; - check_sorted_within_chroms(std::begin(cpgs), std::end(cpgs)); + check_sorted_within_chroms(std::cbegin(orig_cpgs), std::cend(orig_cpgs)); + + auto cpgs = orig_cpgs; + + std::vector> meth; + std::vector reads; + for (const auto &cpg : cpgs) { + reads.push_back(cpg.n_reads); + meth.emplace_back(get_n_meth(cpg), get_n_unmeth(cpg)); + } + if (PARTIAL_METH) make_partial_meth(reads, meth); @@ -520,100 +537,88 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) return EXIT_SUCCESS; } - // separate the regions by chrom and by desert, and eliminate - // those isolated CpGs - std::vector reset_points; - separate_regions(verbose, desert_size, cpgs, meth, reads, reset_points); + // separate the regions by chrom and by desert, and eliminate those + // isolated CpGs + const auto reset_points = + separate_regions(verbose, desert_size, cpgs, meth, reads); const TwoStateHMM hmm(tolerance, max_iterations, verbose); - double fg_alpha{}, fg_beta{}; - double bg_alpha{}, bg_beta{}; - double domain_score_cutoff = std::numeric_limits::max(); + hmr_params params; if (!params_in_file.empty()) { // read parameters file - read_params_file(verbose, params_in_file, fg_alpha, fg_beta, bg_alpha, - bg_beta, p_fb, p_bf, domain_score_cutoff); + params = read_params_file(params_in_file); max_iterations = 0; } - else { - const double n_reads = get_mean(std::cbegin(reads), std::cend(reads)); - fg_alpha = alpha_frac * n_reads; - fg_beta = (1.0 - alpha_frac) * n_reads; - bg_alpha = (1.0 - alpha_frac) * n_reads; - bg_beta = alpha_frac * n_reads; - } + else + params.init(mean_coverage); if (max_iterations > 0) - hmm.BaumWelchTraining(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta, - bg_alpha, bg_beta); - // DECODE THE DOMAINS + hmm.BaumWelchTraining(meth, reset_points, params.p_fb, params.p_bf, + params.fg_alpha, params.fg_beta, params.bg_alpha, + params.bg_beta); + // decode the domains std::vector state_ids; std::vector posteriors; - hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta, - bg_alpha, bg_beta, state_ids, posteriors); + hmm.PosteriorDecoding(meth, reset_points, params.p_fb, params.p_bf, + params.fg_alpha, params.fg_beta, params.bg_alpha, + params.bg_beta, state_ids, posteriors); const auto domain_scores = get_domain_scores(state_ids, meth, reset_points); - const auto random_scores = - shuffle_cpgs(rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha, - fg_beta, bg_alpha, bg_beta); + const auto random_scores = shuffle_cpgs( + rng_seed, hmm, meth, reset_points, params.p_fb, params.p_bf, + params.fg_alpha, params.fg_beta, params.bg_alpha, params.bg_beta); const auto p_values = assign_p_values(random_scores, domain_scores); - if (domain_score_cutoff == std::numeric_limits::max() && + if (params.domain_score_cutoff == std::numeric_limits::max() && !domain_scores.empty()) - domain_score_cutoff = get_stepup_cutoff(p_values, p_value_cutoff); + params.domain_score_cutoff = get_stepup_cutoff(p_values, p_value_cutoff); // write parameters if requested if (!params_out_file.empty()) - write_params_file(params_out_file, fg_alpha, fg_beta, bg_alpha, bg_beta, - p_fb, p_bf, domain_score_cutoff); + write_params_file(params_out_file, params); - std::vector domains; + std::vector domains; if (!domain_scores.empty()) build_domains(cpgs, reset_points, state_ids, domains); { std::size_t good_hmr_count = 0; for (auto i = 0u; i < std::size(domains); ++i) - if (p_values[i] < domain_score_cutoff) { + if (p_values[i] < params.domain_score_cutoff) { domains[good_hmr_count] = domains[i]; - domains[good_hmr_count].set_name("HYPO" + - std::to_string(good_hmr_count)); + domains[good_hmr_count].name = + "HYPO" + std::to_string(good_hmr_count); ++good_hmr_count; } domains.resize(good_hmr_count); - for (const auto &d : domains) - out << d << '\n'; + out << to_string(d) << '\n'; } - if (!hypo_post_outfile.empty()) { + if (!post_outfile.empty()) { if (verbose) - std::cerr << "[writing=" << hypo_post_outfile << "]\n"; - std::ofstream out_hypo_post(hypo_post_outfile); - if (!out_hypo_post) - throw std::runtime_error("failed to open output hypo post file: " + - hypo_post_outfile); - for (std::size_t i = 0; i < std::size(cpgs); ++i) { - GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name(format_cpg_meth_tag(meth[i])); - cpg.set_score(posteriors[i]); - out_hypo_post << cpg << '\n'; + std::cerr << "[writing=" << post_outfile << "]\n"; + std::ofstream out_post(post_outfile); + if (!out_post) + throw std::runtime_error("failed to open output file: " + post_outfile); + auto j = 0u; + for (auto cpg : orig_cpgs) { + cpg.meth = (cpg.n_reads > 0) ? 1.0 - posteriors[j++] : 0.0; + out_post << cpg << '\n'; } } - if (!meth_post_outfile.empty()) { - std::ofstream out_meth_post(meth_post_outfile); - if (!out_meth_post) - throw std::runtime_error("failed to open meth post output file: " + - meth_post_outfile); + if (!meth_outfile.empty()) { + std::ofstream out_meth(meth_outfile); + if (!out_meth) + throw std::runtime_error("failed to open output file: " + meth_outfile); if (verbose) - std::cerr << "[writing=" << meth_post_outfile << "]\n"; - for (std::size_t i = 0; i < std::size(cpgs); ++i) { - GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name(format_cpg_meth_tag(meth[i])); - cpg.set_score(1.0 - posteriors[i]); - out_meth_post << cpg << '\n'; + std::cerr << "[writing=" << meth_outfile << "]\n"; + auto j = 0u; + for (auto cpg : orig_cpgs) { + cpg.meth = cpg.n_reads > 0 ? params.est_meth(posteriors[j++]) : 0.0; + out_meth << cpg << '\n'; } } if (!summary_file.empty()) { @@ -629,5 +634,3 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) } return EXIT_SUCCESS; } - -// NOLINTEND(*-narrowing-conversions) diff --git a/src/analysis/hypermr.cpp b/src/analysis/hypermr.cpp index 4f9d1698..12c42532 100644 --- a/src/analysis/hypermr.cpp +++ b/src/analysis/hypermr.cpp @@ -1,31 +1,34 @@ -/* Copyright (C) 2009-2023 University of Southern California - * Song Qiang and Andrew D Smith - * Author: Song Qiang, Andrew D. Smith +/* Copyright (C) 2009-2025 Song Qiang and Andrew D Smith * - * This file is part of dnmtools. + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. * - * This is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this software; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA - * 02110-1301 USA + * You should have received a copy of the GNU General Public License along + * with this software; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +[[maybe_unused]] static constexpr auto about = R"( +Identify regions of elevated methylation. Designed for methylomes of plants, +especially A. thaliana, and other organisms with a low background methylation +level, but with increased methylation associated with biological activity at a +specific regions of the genome. +)"; + #include "BetaBin.hpp" -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "MSite.hpp" -#include "OptionParser.hpp" #include "ThreeStateHMM.hpp" +#include "OptionParser.hpp" + #include #include @@ -41,170 +44,151 @@ #include #include -using std::begin; -using std::cerr; -using std::cout; -using std::end; -using std::endl; -using std::istream_iterator; -using std::make_pair; -using std::max; -using std::min; -using std::numeric_limits; -using std::pair; -using std::runtime_error; -using std::string; -using std::to_string; -using std::vector; - -using std::ofstream; -using std::ostream_iterator; - -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) - -static GenomicRegion +// NOLINTBEGIN(*-narrowing-conversions) + +static Interval6 as_gen_rgn(const MSite &s) { - return GenomicRegion(s.chrom, s.pos, s.pos + 1); + return Interval6(s.chrom, s.pos, s.pos + 1, std::string{}, 0.0, '+'); } static void -load_cpgs(const string &cpgs_file, vector &cpgs, - vector> &meth) { +load_cpgs(const std::string &cpgs_file, std::vector &cpgs, + std::vector> &meth) { bamxx::bgzf_file in(cpgs_file, "r"); if (!in) - throw runtime_error("failed opening file: " + cpgs_file); + throw std::runtime_error("failed opening file: " + cpgs_file); MSite the_site; - string line; + std::string line; while (getline(in, line)) cpgs.push_back(MSite(line)); - meth.resize(cpgs.size()); - for (size_t i = 0; i < cpgs.size(); ++i) - meth[i] = make_pair(cpgs[i].n_meth(), cpgs[i].n_unmeth()); + meth.resize(std::size(cpgs)); + for (std::size_t i = 0; i < std::size(cpgs); ++i) + meth[i] = std::make_pair(cpgs[i].n_meth(), cpgs[i].n_unmeth()); } template static void -separate_regions(const size_t desert_size, vector &cpgs, vector &meth, - vector &reset_points, size_t &total_bases, - size_t &bases_in_deserts) { +separate_regions(const std::size_t desert_size, std::vector &cpgs, + std::vector &meth, std::vector &reset_points, + std::size_t &total_bases, std::size_t &bases_in_deserts) { // eliminate the zero-read cpgs - size_t j = 0; - for (size_t i = 0; i < cpgs.size(); ++i) + std::size_t j = 0; + for (std::size_t i = 0; i < std::size(cpgs); ++i) if (cpgs[i].n_reads > 0) { cpgs[j] = cpgs[i]; meth[j] = meth[i]; ++j; } - cpgs.erase(begin(cpgs) + j, end(cpgs)); - meth.erase(begin(meth) + j, end(meth)); + cpgs.erase(std::begin(cpgs) + j, std::end(cpgs)); + meth.erase(std::begin(meth) + j, std::end(meth)); total_bases = 0; bases_in_deserts = 0; - size_t prev_pos = 0; - for (size_t i = 0; i < cpgs.size(); ++i) { - const size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) - ? cpgs[i].pos - prev_pos - : numeric_limits::max(); + std::size_t prev_pos = 0; + for (std::size_t i = 0; i < std::size(cpgs); ++i) { + const std::size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) + ? cpgs[i].pos - prev_pos + : std::numeric_limits::max(); if (dist > desert_size) { reset_points.push_back(i); - if (dist < numeric_limits::max()) + if (dist < std::numeric_limits::max()) bases_in_deserts += dist; } - if (dist < numeric_limits::max()) + if (dist < std::numeric_limits::max()) total_bases += dist; prev_pos = cpgs[i].pos; } - reset_points.push_back(cpgs.size()); + reset_points.push_back(std::size(cpgs)); } // ADS (!!!) this function seems to not be working at all right now static void -read_params_file(const string ¶ms_file, +read_params_file(const std::string ¶ms_file, // betabin &hypo_emission, // betabin &HYPER_emission, // betabin &HYPO_emission, - vector> &trans) { + std::vector> &trans) { std::ifstream in(params_file); if (!in) - throw runtime_error("failed to read param file: " + params_file); + throw std::runtime_error("failed to read param file: " + params_file); - string hypo_emission_str; + std::string hypo_emission_str; getline(in, hypo_emission_str); - string HYPER_emission_str; + std::string HYPER_emission_str; getline(in, HYPER_emission_str); - string HYPO_emission_str; + std::string HYPO_emission_str; getline(in, HYPO_emission_str); - trans.resize(3, vector(3, 0.0)); - for (size_t i = 0; i < trans.size(); ++i) - for (size_t j = 0; j < trans[i].size(); ++j) + trans.resize(3, std::vector(3, 0.0)); + for (std::size_t i = 0; i < std::size(trans); ++i) + for (std::size_t j = 0; j < std::size(trans[i]); ++j) in >> trans[i][j]; } static void -write_params_file(const string ¶ms_file, const betabin &hypo_emission, +write_params_file(const std::string ¶ms_file, const betabin &hypo_emission, const betabin &HYPER_emission, const betabin &HYPO_emission, - const vector> &trans) { - ofstream out(params_file); + const std::vector> &trans) { + std::ofstream out(params_file); out << hypo_emission.tostring() << '\n' << HYPER_emission.tostring() << '\n' << HYPO_emission.tostring() << '\n'; for (const auto &i : trans) { std::copy(std::cbegin(i), std::cend(i), - ostream_iterator(out, "\t")); + std::ostream_iterator(out, "\t")); out << '\n'; } } static void -build_domains(const vector &cpgs, - const vector> &meth, - const vector &reset_points, - const vector &classes, - vector &domains) { +build_domains(const std::vector &cpgs, + const std::vector> &meth, + const std::vector &reset_points, + const std::vector &classes, + std::vector &domains) { domains.clear(); - for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const size_t start = reset_points[i]; - const size_t end = reset_points[i + 1]; + for (std::size_t i = 0; i + 1 < std::size(reset_points); ++i) { + const std::size_t start = reset_points[i]; + const std::size_t end = reset_points[i + 1]; - GenomicRegion domain(as_gen_rgn(cpgs[start])); + Interval6 domain(as_gen_rgn(cpgs[start])); STATE_LABELS prev_state = classes[start]; - size_t n = 1; + std::size_t n = 1; double meth_sum = meth[start].first / (meth[start].first + meth[start].second); - for (size_t j = start + 1; j < end; ++j) { + for (std::size_t j = start + 1; j < end; ++j) { if ((prev_state == hypo && classes[j] == hypo) || (prev_state != hypo && classes[j] != hypo)) { ++n; meth_sum += meth[j].first / (meth[j].first + meth[j].second); } else { - domain.set_end(cpgs[j - 1].pos + 1); - const string label = (prev_state == hypo ? "hypo" : "hyper"); - domain.set_name(label + ":" + std::to_string(n)); - domain.set_score(meth_sum); - domain.set_strand('+'); + domain.stop = cpgs[j - 1].pos + 1; + const std::string label = (prev_state == hypo ? "hypo" : "hyper"); + domain.name = label + ":" + std::to_string(n); + domain.score = meth_sum; + domain.strand = '+'; if (prev_state == HYPER || prev_state == HYPO) domains.push_back(domain); - domain = GenomicRegion(as_gen_rgn(cpgs[j])); + domain = Interval6(as_gen_rgn(cpgs[j])); n = 1; prev_state = classes[j]; meth_sum = meth[j].first / (meth[j].first + meth[j].second); } } - domain.set_end(cpgs[end - 1].pos + 1); - const string label = (prev_state == hypo ? "hypo" : "hyper"); - domain.set_name(label + ":" + std::to_string(n)); - domain.set_score(meth_sum); - domain.set_strand('+'); + domain.stop = cpgs[end - 1].pos + 1; + const std::string label = (prev_state == hypo ? "hypo" : "hyper"); + domain.name = label + ":" + std::to_string(n); + domain.score = meth_sum; + domain.strand = '+'; if (prev_state == HYPER || prev_state == HYPO) domains.push_back(domain); } @@ -212,53 +196,53 @@ build_domains(const vector &cpgs, static void filter_domains(const double min_cumulative_meth, - vector &domains) { - size_t j = 0; - for (size_t i = 0; i < domains.size(); ++i) - if (domains[i].get_score() >= min_cumulative_meth) + std::vector &domains) { + std::size_t j = 0; + for (std::size_t i = 0; i < std::size(domains); ++i) + if (domains[i].score >= min_cumulative_meth) domains[j++] = domains[i]; - domains.erase(begin(domains) + j, end(domains)); + domains.erase(std::begin(domains) + j, std::end(domains)); } static void -initialize_transitions(vector> &trans) { +initialize_transitions(std::vector> &trans) { // "hypo" -> "hypo" is high, and goes only to "HYPER"; // "HYPER" -> "HYPER" or "HYPO" equally; // "HYPO" -> "HYPER" usually; never to "hypo" - trans = {{0.990, 0.010, 0.000}, {0.050, 0.475, 0.475}, {0.000, 0.666, 0.334}}; + + // NOLINTBEGIN(*-avoid-magic-numbers) + trans = { + {0.990, 0.010, 0.000}, + {0.050, 0.475, 0.475}, + {0.000, 0.666, 0.334}, + }; + // NOLINTEND(*-avoid-magic-numbers) } int main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const string description = - "Identify regions of elevated methylation. Designed for " - "methylomes of plants, especially A. thaliana, and other " - "organisms with a low background methylation level, but " - "with increased methylation associated with biological " - "activity at a specific regions of the genome."; - - string outfile; - string scores_file; + // NOLINTBEGIN(*-avoid-magic-numbers) + std::size_t desert_size = 1000; + std::size_t max_iterations = 10; + // corrections for small values (not parameters): + double tolerance = 1e-10; + double min_cumulative_meth = 4.0; - size_t desert_size = 1000; - size_t max_iterations = 10; + // NOLINTEND(*-avoid-magic-numbers) // run mode flags bool VERBOSE = false; - - // corrections for small values (not parameters): - double tolerance = 1e-10; - - double min_cumulative_meth = 4.0; bool USE_VITERBI_DECODING = false; - string params_in_file; - string params_out_file; + std::string outfile; + std::string scores_file; + std::string params_in_file; + std::string params_out_file; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) - description, ""); + about, ""); opt_parse.add_opt("out", 'o', "output file (BED format)", false, outfile); opt_parse.add_opt("scores", 's', "output file for posterior scores", false, scores_file); @@ -277,62 +261,63 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("params-out", 'p', "parameters ouptut file", false, params_out_file); opt_parse.set_show_defaults(); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << '\n'; + if (std::size(leftover_args) != 1) { + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string cpgs_file = leftover_args.front(); + const std::string cpgs_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ if (!is_msite_file(cpgs_file)) - throw runtime_error("malformed counts file: " + cpgs_file); + throw std::runtime_error("malformed counts file: " + cpgs_file); if (VERBOSE) - cerr << "[loading_data]" << '\n'; - vector cpgs; - vector> meth; + std::cerr << "[loading_data]" << '\n'; + std::vector cpgs; + std::vector> meth; load_cpgs(cpgs_file, cpgs, meth); - const size_t n_sites = cpgs.size(); + const std::size_t n_sites = std::size(cpgs); const double mean_cov = std::accumulate( std::cbegin(cpgs), std::cend(cpgs), 0.0, [](const auto a, const auto &c) { return a + c.n_reads; }) / static_cast(n_sites); - cerr << "[n_sites=" << n_sites << "]" << '\n' - << "[mean_coverage=" << mean_cov << "]" << '\n'; + std::cerr << "[n_sites=" << n_sites << "]" << '\n' + << "[mean_coverage=" << mean_cov << "]" << '\n'; // separate the regions by chrom and by desert, and eliminate // those isolated CpGs - vector reset_points; - size_t total_bases = 0, bases_in_deserts = 0; + std::vector reset_points; + std::size_t total_bases = 0, bases_in_deserts = 0; separate_regions(desert_size, cpgs, meth, reset_points, total_bases, bases_in_deserts); if (VERBOSE) { const auto des_frac = static_cast(bases_in_deserts) / total_bases; - cerr << "[n_sites_retained=" << cpgs.size() << "]" << '\n' - << "[deserts_removed=" << reset_points.size() - 2 << "]" << '\n' - << "[remaining_genome_fraction=" << 1.0 - des_frac << "]" << '\n'; + std::cerr << "[n_sites_retained=" << std::size(cpgs) << "]" << '\n' + << "[deserts_removed=" << std::size(reset_points) - 2 << "]\n" + << "[remaining_genome_fraction=" << 1.0 - des_frac << "]" + << '\n'; } ThreeStateHMM hmm(meth, reset_points, tolerance, max_iterations, VERBOSE); - vector> trans; + std::vector> trans; initialize_transitions(trans); betabin hypo_emission, HYPER_emission, HYPO_emission; @@ -344,12 +329,13 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) // HYPO_emission, trans); else { + // NOLINTBEGIN(*-avoid-magic-numbers) const double n_reads = mean_cov; const double fg_alpha = 0.33 * n_reads; const double fg_beta = 0.67 * n_reads; const double bg_alpha = 0.67 * n_reads; const double bg_beta = 0.33 * n_reads; - + // NOLINTEND(*-avoid-magic-numbers) hypo_emission = betabin(fg_alpha, fg_beta); HYPER_emission = betabin(bg_alpha, bg_beta); HYPO_emission = hypo_emission; @@ -365,7 +351,7 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) HYPO_emission, trans); // decode the states - vector classes; + std::vector classes; if (USE_VITERBI_DECODING) hmm.ViterbiDecoding(); else @@ -373,26 +359,25 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) hmm.get_classes(classes); // identify the domains of hypermethylation - vector domains; + std::vector domains; build_domains(cpgs, hmm.observations, reset_points, classes, domains); filter_domains(min_cumulative_meth, domains); // write the results - ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); - copy(begin(domains), end(domains), - ostream_iterator(out, "\n")); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open outfile: " + outfile); + std::copy(std::cbegin(domains), std::cend(domains), + std::ostream_iterator(out, "\n")); // if requested, write the posterior scores if (!scores_file.empty()) { if (USE_VITERBI_DECODING) hmm.PosteriorDecoding(); - vector scores; + std::vector scores; hmm.get_state_posteriors(scores); - ofstream score_out(scores_file); - for (size_t i = 0; i < cpgs.size(); ++i) { + std::ofstream score_out(scores_file); + for (std::size_t i = 0; i < std::size(cpgs); ++i) { score_out << cpgs[i] << "\t"; if (classes[i] == hypo) score_out << "hypo\n" << scores[i].hypo << '\n'; @@ -404,10 +389,10 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) } } catch (const std::exception &e) { - cerr << "ERROR:\t" << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/analysis/metagene.cpp b/src/analysis/metagene.cpp index e20d2f6d..4c8f574d 100644 --- a/src/analysis/metagene.cpp +++ b/src/analysis/metagene.cpp @@ -1,30 +1,29 @@ -/* metagene (tsscpgplot): get data to plot methylation level around a TSS +/* Copyright (C) 2010-2025 Andrew D. Smith * - * Copyright (C) 2010-2023 Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * Authors: Andrew D. Smith + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see - * . + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . */ -#include "GenomicRegion.hpp" +[[maybe_unused]] static constexpr auto about = R"( +metagene (tsscpgplot): get data to plot methylation level around a TSS +)"; + +#include "Interval6.hpp" #include "LevelsCounter.hpp" #include "MSite.hpp" #include "OptionParser.hpp" -#include "bamxx.hpp" +#include #include #include @@ -40,33 +39,23 @@ #include #include -using std::cerr; -using std::cout; -using std::ostream; -using std::pair; -using std::runtime_error; -using std::sort; -using std::string; -using std::to_string; -using std::unordered_map; -using std::vector; - static MSite -tss_from_gene(const GenomicRegion &r) { +tss_from_gene(const Interval6 &r) { MSite s; - s.chrom = r.get_chrom(); - s.pos = (r.pos_strand() ? r.get_start() : r.get_end()); - s.strand = r.get_strand(); + s.chrom = r.chrom; + s.pos = r.strand == '+' ? r.start : r.stop; + s.strand = r.strand; return s; } static void -process_chrom(const uint32_t region_size, const vector &genes, - const pair &bounds, - const vector &sites, vector &levels) { - const uint32_t twice_rs = 2 * region_size; - - auto comp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; }; +process_chrom(const std::uint32_t region_size, + const std::vector &genes, + const std::pair &bounds, + const std::vector &sites, + std::vector &levels) { + const auto cmp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; }; + const std::uint32_t twice_rs = 2 * region_size; for (auto i = bounds.first; i < bounds.second; ++i) { const MSite tss = tss_from_gene(genes[i]); @@ -76,8 +65,10 @@ process_chrom(const uint32_t region_size, const vector &genes, MSite right = tss; right.pos += region_size; - const auto lim = upper_bound(cbegin(sites), cend(sites), right, comp); - auto itr = lower_bound(cbegin(sites), cend(sites), left, comp); + const auto lim = + std::upper_bound(std::cbegin(sites), std::cend(sites), right, cmp); + auto itr = + std::lower_bound(std::cbegin(sites), std::cend(sites), left, cmp); for (; itr != lim; ++itr) { const auto dist = itr->pos - left.pos; const auto base_idx = tss.strand == '+' ? dist : twice_rs - dist; @@ -88,10 +79,11 @@ process_chrom(const uint32_t region_size, const vector &genes, template static void -collapse_bins(const uint32_t bin_size, vector &v) { - const uint32_t n_bins = std::ceil(static_cast(v.size()) / bin_size); - vector vv(n_bins); - for (auto i = 0u; i < v.size(); ++i) +collapse_bins(const std::uint32_t bin_size, std::vector &v) { + const std::uint32_t n_bins = + std::ceil(static_cast(std::size(v)) / bin_size); + std::vector vv(n_bins); + for (auto i = 0u; i < std::size(v); ++i) vv[i / bin_size] += v[i]; v.swap(vv); } @@ -111,11 +103,11 @@ genes, and the promoters are of interest, the strand will be used correctly. )"; try { - string outfile; - uint32_t region_size = 5000; // NOLINT(*-avoid-magic-numbers) + std::string outfile; + std::uint32_t region_size = 5000; // NOLINT(*-avoid-magic-numbers) bool verbose = false; bool show_progress = false; - uint32_t bin_size = 50; // NOLINT(*-avoid-magic-numbers) + std::uint32_t bin_size = 50; // NOLINT(*-avoid-magic-numbers) /****************** GET COMMAND LINE ARGUMENTS ***************************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) @@ -127,111 +119,110 @@ correctly. opt_parse.add_opt("bin", 'b', "bin size", false, bin_size); opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << '\n'; + if (std::size(leftover_args) != 2) { + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string features_file_name = leftover_args.front(); - const string cpg_file_name = leftover_args.back(); + const std::string features_file_name = leftover_args.front(); + const std::string cpg_file_name = leftover_args.back(); /**********************************************************************/ if (verbose) - cerr << "[loading feature annotations data]" << '\n'; - vector features; - ReadBEDFile(features_file_name, features); - sort(begin(features), end(features)); + std::cerr << "[loading feature annotations data]\n"; + auto features = read_intervals6(features_file_name); + std::sort(std::begin(features), std::end(features)); if (verbose) - cerr << "[number of features: " << features.size() << "]" << '\n'; + std::cerr << "[number of features: " << std::size(features) << "]\n"; // identify the start and end of ranges for each chromosome - unordered_map> lookup; - typedef decltype(lookup)::iterator lookup_itr; + using pos_pair = std::pair; + std::unordered_map lookup; - string chrom_name; + std::string chrom_name; auto prev_idx = 0u; - for (auto i = 0u; i < features.size(); ++i) - if (features[i].get_chrom() != chrom_name) { + for (auto i = 0u; i < std::size(features); ++i) + if (features[i].chrom != chrom_name) { if (!chrom_name.empty()) lookup.insert({chrom_name, {prev_idx, i}}); prev_idx = i; - chrom_name = features[i].get_chrom(); + chrom_name = features[i].chrom; } - lookup.insert({chrom_name, {prev_idx, features.size()}}); + lookup.insert({chrom_name, {prev_idx, std::size(features)}}); if (verbose) - cerr << "[number of chroms with features: " << lookup.size() << "]\n"; + std::cerr << "[number of chroms with features: " << std::size(lookup) + << "]\n"; - auto pair_diff = [&lookup](const lookup_itr x) { - return (x != end(lookup)) ? x->second.second - x->second.first : 0u; + const auto pair_diff = [&lookup](const auto x) { + return (x != std::cend(lookup)) ? x->second.second - x->second.first : 0u; }; - vector levels(2 * region_size); + std::vector levels(2 * region_size); bamxx::bgzf_file cpgin(cpg_file_name, "r"); if (!cpgin) - throw runtime_error("failed to open file: " + cpg_file_name); + throw std::runtime_error("failed to open file: " + cpg_file_name); - vector sites; - string line; + std::vector sites; + std::string line; chrom_name.clear(); while (getline(cpgin, line)) { - auto the_site = MSite(line); + const auto the_site = MSite(line); if (the_site.chrom != chrom_name) { if (!sites.empty()) { const auto bounds = lookup.find(chrom_name); - if (bounds != end(lookup)) + if (bounds != std::cend(lookup)) process_chrom(region_size, features, bounds->second, sites, levels); const auto n_features = pair_diff(bounds); if (show_progress) - cerr << "[sites=" << sites.size() << " features=" << n_features - << "]" << '\n'; + std::cerr << "[sites=" << std::size(sites) + << " features=" << n_features << "]" << '\n'; sites.clear(); } if (show_progress) - cerr << "[processing: " << the_site.chrom << "]"; + std::cerr << "[processing: " << the_site.chrom << "]"; chrom_name = the_site.chrom; } - sites.push_back(std::move(the_site)); + sites.push_back(the_site); } if (!sites.empty()) { const auto bounds = lookup.find(chrom_name); - if (bounds != end(lookup)) + if (bounds != std::cend(lookup)) process_chrom(region_size, features, bounds->second, sites, levels); const auto n_features = pair_diff(bounds); if (show_progress) - cerr << "[sites=" << sites.size() << " features=" << n_features << "]" - << '\n'; + std::cerr << "[sites=" << std::size(sites) << " features=" << n_features + << "]\n"; } collapse_bins(bin_size, levels); if (verbose) - cerr << "output columns:\n" << LevelsCounter::format_header() << '\n'; - - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(of.is_open() ? of.rdbuf() : std::cout.rdbuf()); + std::cerr << "output columns:\n" + << LevelsCounter::format_header() << '\n'; - for (auto i = 0u; i < levels.size(); ++i) + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open file: " + outfile); + for (auto i = 0u; i < std::size(levels); ++i) out << i * bin_size << '\t' << levels[i].format_row() << '\n'; } - catch (std::exception &e) { - cerr << e.what() << '\n'; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; diff --git a/src/analysis/multimethstat.cpp b/src/analysis/multimethstat.cpp index c745bd82..67e2ce75 100644 --- a/src/analysis/multimethstat.cpp +++ b/src/analysis/multimethstat.cpp @@ -1,25 +1,26 @@ -/* roimethstat: average methylation in each of a set of regions +/* Copyright (C) 2014-2022 Andrew D. Smith * - * Copyright (C) 2014-2022 Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * Authors: Andrew D. Smith - * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. */ -#include "GenomicRegion.hpp" +[[maybe_unused]] static constexpr auto about = R"( +multistat: average methylation in each of a set of regions for each methylome +)"; + +#include "Interval6.hpp" #include "MSite.hpp" -#include "OptionParser.hpp" #include "bsutils.hpp" +#include "OptionParser.hpp" + #include #include @@ -35,93 +36,75 @@ #include #include -using std::cerr; -using std::cout; -using std::endl; -using std::ifstream; -using std::ios_base; -using std::is_sorted; -using std::isfinite; -using std::make_pair; -using std::pair; -using std::runtime_error; -using std::string; -using std::vector; - -using bamxx::bgzf_file; - -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) - -static pair -meth_unmeth_calls(const size_t n_meth, const size_t n_unmeth) { - static const double alpha = 0.95; - // get info for binomial test - double lower = 0.0, upper = 0.0; - const size_t total = n_meth + n_unmeth; +// NOLINTBEGIN(*-narrowing-conversions) + +static std::pair +meth_unmeth_calls(const std::size_t n_meth, const std::size_t n_unmeth) { + static constexpr double alpha = 0.95; + static constexpr double one_half = 0.5; + double lower{}, upper{}; + const auto total = n_meth + n_unmeth; wilson_ci_for_binomial(alpha, total, static_cast(n_meth) / total, lower, upper); - return make_pair(lower > 0.5, upper < 0.5); + return std::make_pair(lower > one_half, upper < one_half); } -static std::pair -region_bounds(const vector &sites, const GenomicRegion ®ion) { - const string chrom(region.get_chrom()); - const char strand(region.get_strand()); - - const MSite a(chrom, region.get_start(), strand, "", 0, 0); - vector::const_iterator a_ins(lower_bound(begin(sites), end(sites), a)); - - const MSite b(chrom, region.get_end(), strand, "", 0, 0); - vector::const_iterator b_ins(lower_bound(begin(sites), end(sites), b)); - - return make_pair(a_ins - begin(sites), b_ins - begin(sites)); +static std::pair +region_bounds(const std::vector &sites, const Interval6 ®ion) { + const MSite a(region.chrom, region.start, region.strand, "", 0, 0); + const auto a_ins = std::lower_bound(std::cbegin(sites), std::cend(sites), a); + const MSite b(region.chrom, region.stop, region.strand, "", 0, 0); + const auto b_ins = std::lower_bound(std::cbegin(sites), std::cend(sites), b); + return std::make_pair(std::distance(std::cbegin(sites), a_ins), + std::distance(std::cbegin(sites), b_ins)); } /* - This function is used to make sure the output is done the same way - in the different places it might be generated. One issue is that - there is a lot of string and stream manipulation here, which might - not be desirable if there are lots of regions, as when the genome - is binned. + This function is used to make sure the output is done the same way in the + different places it might be generated. One issue is that there is a lot of + string and stream manipulation here, which might not be desirable if there + are lots of regions, as when the genome is binned. */ -static string -format_output_line( - const bool report_more_information, const GenomicRegion &r, - const vector &score, const vector &weighted_mean_meth, - const vector &mean_meth, const vector &fractional_meth, - const size_t total_cpgs, const vector &cpgs_with_reads, - const vector &total_meth, const vector &total_reads) { +static std::string +format_output_line(const bool report_more_information, const Interval6 &r, + const std::vector &score, + const std::vector &weighted_mean_meth, + const std::vector &mean_meth, + const std::vector &fractional_meth, + const std::size_t total_cpgs, + const std::vector &cpgs_with_reads, + const std::vector &total_meth, + const std::vector &total_reads) { std::ostringstream oss; - oss << r.get_chrom() << '\t' << r.get_start() << '\t' << r.get_end() << '\t' - << r.get_name() << '\t'; + oss << r.chrom << '\t' << r.start << '\t' << r.stop << '\t' << r.name << '\t'; - const size_t n_samples = score.size(); + const std::size_t n_samples = std::size(score); - for (size_t i = 0; i < n_samples; ++i) { - if (isfinite(score[i])) + for (std::size_t i = 0; i < n_samples; ++i) { + if (std::isfinite(score[i])) oss << score[i]; else oss << "NA"; // GS: commenting this out because we shouldn't be reporting // strand several times - // oss << '\t' << r.get_strand(); + // oss << '\t' << r.strand; if (report_more_information) { oss << '\t'; - if (isfinite(weighted_mean_meth[i])) + if (std::isfinite(weighted_mean_meth[i])) oss << weighted_mean_meth[i]; else oss << "NA"; oss << '\t'; - if (isfinite(mean_meth[i])) + if (std::isfinite(mean_meth[i])) oss << mean_meth[i]; else oss << "NA"; oss << '\t'; - if (isfinite(fractional_meth[i])) + if (std::isfinite(fractional_meth[i])) oss << fractional_meth[i]; else oss << "NA"; @@ -136,12 +119,12 @@ format_output_line( } static void -get_site_and_values(string &line, MSite &site, vector &values) { +get_site_and_values(std::string &line, MSite &site, + std::vector &values) { // in tabular, chrom/pos/strand is separated by colons - replace(begin(line), end(line), ':', '\t'); + std::replace(std::begin(line), std::end(line), ':', '\t'); std::istringstream iss(line); iss >> site.chrom >> site.pos >> site.strand >> site.context; - values.clear(); double v = 0.0; while (iss >> v) @@ -149,102 +132,101 @@ get_site_and_values(string &line, MSite &site, vector &values) { } static bool -all_is_finite(const vector &scores) { - for (auto it(begin(scores)); it != end(scores); ++it) - if (!isfinite(*it)) - return false; - - return true; +all_finite(const std::vector &scores) { + return std::all_of(std::cbegin(scores), std::cend(scores), + [](const auto x) { return std::isfinite(x); }); } static void -process_with_cpgs_loaded(const bool VERBOSE, +process_with_cpgs_loaded(const bool verbose, // const bool sort_data_if_needed, - const bool PRINT_NUMERIC_ONLY, + const bool print_numeric_only, const bool report_more_information, - const char level_code, const string &cpgs_file, - vector ®ions, std::ostream &out) { - bgzf_file in(cpgs_file, "r"); + const char level_code, const std::string &cpgs_file, + const std::vector ®ions, + std::ostream &out) { + bamxx::bgzf_file in(cpgs_file, "r"); if (!in) - throw runtime_error("cannot open file: " + cpgs_file); + throw std::runtime_error("cannot open file: " + cpgs_file); - string header; + std::string header; getline(in, header); std::istringstream iss(header); - string col_name; - vector col_names; + std::string col_name; + std::vector col_names; while (iss >> col_name) col_names.push_back(col_name); - auto end_uniq = std::unique(begin(col_names), end(col_names)); + const auto end_uniq = std::unique(std::begin(col_names), std::end(col_names)); + const auto n_uniq = std::distance(std::begin(col_names), end_uniq); // make sure all columns come in pairs of counts - if (2 * static_cast(distance(begin(col_names), end_uniq)) != - col_names.size()) - throw runtime_error("wrong header format:\n" + header); + if (2 * static_cast(n_uniq) != std::size(col_names)) + throw std::runtime_error("wrong header format:\n" + header); - col_names.resize(distance(begin(col_names), end_uniq)); - const size_t n_samples = col_names.size(); - if (VERBOSE) - cerr << "[n_samples=" << n_samples << "]\n"; + col_names.resize(std::distance(std::begin(col_names), end_uniq)); + const std::size_t n_samples = std::size(col_names); + if (verbose) + std::cerr << "[n_samples=" << n_samples << "]\n"; - vector cpgs; - vector> values; + std::vector cpgs; + std::vector> values; MSite the_cpg; - string line; + std::string line; while (getline(in, line)) { - values.push_back(vector()); + values.push_back(std::vector()); get_site_and_values(line, the_cpg, values.back()); cpgs.push_back(the_cpg); - if (values.back().size() != 2 * n_samples) - throw runtime_error("wrong row length. Expected " + - std::to_string(2 * n_samples) + ", got " + - std::to_string(values.back().size()) + "\n" + line); + if (std::size(values.back()) != 2 * n_samples) + throw std::runtime_error( + "wrong row length. Expected " + std::to_string(2 * n_samples) + + ", got " + std::to_string(std::size(values.back())) + "\n" + line); } - if (VERBOSE) - cerr << "[n_cpgs=" << cpgs.size() << "]\n"; + if (verbose) + std::cerr << "[n_cpgs=" << std::size(cpgs) << "]\n"; - if (!is_sorted(begin(cpgs), end(cpgs))) { + if (!std::is_sorted(std::cbegin(cpgs), std::cend(cpgs))) { /* GS: need a way to sort cpgs and values to preserve the order. * Commenting this for now until I know the best way to do this if (sort_data_if_needed) - sort(begin(cpgs), end(cpgs)); + std::sort(begin(cpgs), end(cpgs)); else*/ - throw runtime_error("data not sorted: " + cpgs_file); + throw std::runtime_error("data not sorted: " + cpgs_file); } - if (VERBOSE) - cerr << "[n_cpgs=" << cpgs.size() << "]\n"; + if (verbose) + std::cerr << "[n_cpgs=" << std::size(cpgs) << "]\n"; // write header as first column - for (size_t i = 0; i < n_samples; ++i) + for (std::size_t i = 0; i < n_samples; ++i) out << '\t' << col_names[i]; out << '\n'; // preallocate entire space for a single row - vector total_meth(n_samples, 0); - vector total_reads(n_samples, 0); - vector cpgs_with_reads(n_samples, 0); - vector called_total(n_samples, 0); - vector called_meth(n_samples, 0); - vector weighted_mean_meth(n_samples, 0); - vector fractional_meth(n_samples, 0); - vector unweighted_mean_meth(n_samples, 0); - vector score(n_samples, 0); - - for (size_t i = 0; i < regions.size(); ++i) { - const pair bounds(region_bounds(cpgs, regions[i])); - - for (size_t j = 0; j < n_samples; ++j) { + std::vector total_meth(n_samples, 0); + std::vector total_reads(n_samples, 0); + std::vector cpgs_with_reads(n_samples, 0); + std::vector called_total(n_samples, 0); + std::vector called_meth(n_samples, 0); + std::vector weighted_mean_meth(n_samples, 0); + std::vector fractional_meth(n_samples, 0); + std::vector unweighted_mean_meth(n_samples, 0); + std::vector score(n_samples, 0); + + for (std::size_t i = 0; i < std::size(regions); ++i) { + const std::pair bounds( + region_bounds(cpgs, regions[i])); + + for (std::size_t j = 0; j < n_samples; ++j) { total_meth[j] = total_reads[j] = cpgs_with_reads[j] = called_total[j] = called_meth[j] = 0; double mean_meth = 0.0; - for (size_t k = bounds.first; k < bounds.second; ++k) { - const size_t meth = values[k][2 * j + 1]; - const size_t tot = values[k][2 * j]; + for (std::size_t k = bounds.first; k < bounds.second; ++k) { + const std::size_t meth = values[k][2 * j + 1]; + const std::size_t tot = values[k][2 * j]; if (tot > 0) { total_reads[j] += tot; total_meth[j] += meth; @@ -270,8 +252,8 @@ process_with_cpgs_loaded(const bool VERBOSE, : fractional_meth[j])); } - const size_t total_cpgs = bounds.second - bounds.first; - if (!PRINT_NUMERIC_ONLY || all_is_finite(score)) + const std::size_t total_cpgs = bounds.second - bounds.first; + if (!print_numeric_only || all_finite(score)) out << format_output_line(report_more_information, regions[i], score, weighted_mean_meth, unweighted_mean_meth, fractional_meth, total_cpgs, cpgs_with_reads, @@ -286,7 +268,7 @@ process_with_cpgs_loaded(const bool VERBOSE, /// static void -move_to_start_of_line(ifstream &in) { +move_to_start_of_line(std::ifstream &in) { char next{}; while (in.good() && in.get(next) && next != '\n') { in.unget(); @@ -298,45 +280,46 @@ move_to_start_of_line(ifstream &in) { } static void -get_chr_and_idx(ifstream &in, string &chrom, size_t &pos) { - string tmp; +get_chr_and_idx(std::ifstream &in, std::string &chrom, std::size_t &pos) { + std::string tmp; in >> tmp; - replace(begin(tmp), end(tmp), ':', ' '); + std::replace(std::begin(tmp), std::end(tmp), ':', ' '); std::istringstream iss(tmp); iss >> chrom >> pos; } static void -find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { - cpg_in.seekg(0, ios_base::beg); - const size_t begin_pos = cpg_in.tellg(); - cpg_in.seekg(0, ios_base::end); - const size_t end_pos = cpg_in.tellg(); +find_start_line(const std::string &chr, const std::size_t idx, + std::ifstream &cpg_in) { + cpg_in.seekg(0, std::ios_base::beg); + const std::size_t begin_pos = cpg_in.tellg(); + cpg_in.seekg(0, std::ios_base::end); + const std::size_t end_pos = cpg_in.tellg(); if (end_pos - begin_pos < 2) - throw runtime_error("empty meth file"); + throw std::runtime_error("empty meth file"); - size_t step_size = (end_pos - begin_pos) / 2; + std::size_t step_size = (end_pos - begin_pos) / 2; - cpg_in.seekg(0, ios_base::beg); - string low_chr; - size_t low_idx = 0; + cpg_in.seekg(0, std::ios_base::beg); + std::string low_chr; + std::size_t low_idx = 0; get_chr_and_idx(cpg_in, low_chr, low_idx); // MAGIC: need the -2 here to get past the EOF and possibly a '\n' - cpg_in.seekg(-2, ios_base::end); + cpg_in.seekg(-2, std::ios_base::end); move_to_start_of_line(cpg_in); - string high_chr; - size_t high_idx{}; + std::string high_chr; + std::size_t high_idx{}; get_chr_and_idx(cpg_in, high_chr, high_idx); - size_t pos = step_size; - cpg_in.seekg(pos, ios_base::beg); + std::size_t pos = step_size; + cpg_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(cpg_in); while (step_size > 0) { - string mid_chr; - size_t mid_idx = 0; + std::string mid_chr; + std::size_t mid_idx = 0; // ADS: possible here to check that the chroms are in the right // order, at least the ones we have seen. get_chr_and_idx(cpg_in, mid_chr, mid_idx); @@ -351,49 +334,51 @@ find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { std::swap(mid_idx, low_idx); pos += step_size; } - cpg_in.seekg(pos, ios_base::beg); + cpg_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(cpg_in); } } static bool -cpg_not_past_region(const GenomicRegion ®ion, const size_t end_pos, +cpg_not_past_region(const Interval6 ®ion, const std::size_t end_pos, const MSite &cpg) { - return (cpg.chrom == region.get_chrom() && cpg.pos < end_pos) || - cpg.chrom < region.get_chrom(); + return (cpg.chrom == region.chrom && cpg.pos < end_pos) || + cpg.chrom < region.chrom; } static void -get_cpg_stats(ifstream &cpg_in, const GenomicRegion ®ion, - vector &total_meth, vector &total_reads, - size_t &total_cpgs, vector &cpgs_with_reads, - vector &called_total, vector &called_meth, - vector &mean_meth) { - const string chrom(region.get_chrom()); - const size_t start_pos = region.get_start(); - const size_t end_pos = region.get_end(); +get_cpg_stats(std::ifstream &cpg_in, const Interval6 ®ion, + std::vector &total_meth, + std::vector &total_reads, std::size_t &total_cpgs, + std::vector &cpgs_with_reads, + std::vector &called_total, + std::vector &called_meth, + std::vector &mean_meth) { + const std::string chrom(region.chrom); + const std::size_t start_pos = region.start; + const std::size_t end_pos = region.stop; find_start_line(chrom, start_pos, cpg_in); - const size_t n_samples = total_meth.size(); + const std::size_t n_samples = std::size(total_meth); MSite cpg; - vector values; + std::vector values; values.reserve(2 * n_samples); - string line; + std::string line; while (getline(cpg_in, line)) { values.clear(); get_site_and_values(line, cpg, values); - if (values.size() != 2 * n_samples) - throw runtime_error("wrong row length. Expected " + - std::to_string(2 * n_samples) + ", got " + - std::to_string(values.size()) + "\n" + line); + if (std::size(values) != 2 * n_samples) + throw std::runtime_error("wrong row length. Expected " + + std::to_string(2 * n_samples) + ", got " + + std::to_string(std::size(values)) + "\n" + line); if (cpg_not_past_region(region, end_pos, cpg)) { if (start_pos <= cpg.pos && cpg.chrom == chrom) { ++total_cpgs; - for (size_t j = 0; j < n_samples; ++j) { - const size_t meth = values[2 * j + 1]; - const size_t tot = values[2 * j]; + for (std::size_t j = 0; j < n_samples; ++j) { + const std::size_t meth = values[2 * j + 1]; + const std::size_t tot = values[2 * j]; if (values[2 * j] > 0) { total_meth[j] += meth; total_reads[j] += tot; @@ -413,51 +398,51 @@ get_cpg_stats(ifstream &cpg_in, const GenomicRegion ®ion, } static void -process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, +process_with_cpgs_on_disk(const bool print_numeric_only, const bool report_more_information, - const char level_code, const string &cpgs_file, - vector ®ions, std::ostream &out) { - ifstream in(cpgs_file); - string header; + const char level_code, const std::string &cpgs_file, + const std::vector ®ions, + std::ostream &out) { + std::ifstream in(cpgs_file); + std::string header; getline(in, header); std::istringstream iss(header); - string col_name; - vector col_names; + std::string col_name; + std::vector col_names; while (iss >> col_name) col_names.push_back(col_name); - auto end_uniq = std::unique(begin(col_names), end(col_names)); + const auto end_uniq = std::unique(std::begin(col_names), std::end(col_names)); + const auto n_uniq = std::distance(std::begin(col_names), end_uniq); // make sure all columns come in pairs of counts - if (2 * static_cast(distance(begin(col_names), end_uniq)) != - col_names.size()) - throw runtime_error("wrong header format:\n" + header); + if (2 * static_cast(n_uniq) != std::size(col_names)) + throw std::runtime_error("wrong header format:\n" + header); - col_names.resize(distance(begin(col_names), end_uniq)); - const size_t n_samples = col_names.size(); + col_names.resize(std::distance(std::begin(col_names), end_uniq)); + const std::size_t n_samples = std::size(col_names); // write header as first column - for (size_t i = 0; i < n_samples; ++i) + for (std::size_t i = 0; i < n_samples; ++i) out << '\t' << col_names[i]; out << '\n'; // preallocate entire space for a single row - vector total_meth(n_samples, 0); - vector total_reads(n_samples, 0); - vector cpgs_with_reads(n_samples, 0); - vector called_total(n_samples, 0); - vector called_meth(n_samples, 0); - vector weighted_mean_meth(n_samples, 0); - vector fractional_meth(n_samples, 0); - vector unweighted_mean_meth(n_samples, 0); - vector score(n_samples, 0.0); - vector mean_meth(n_samples, 0.0); - - vector values(2 * n_samples, 0.0); - for (size_t i = 0; i < regions.size() && in; ++i) { - size_t total_cpgs = 0; - mean_meth = vector(n_samples, 0.0); - for (size_t j = 0; j < n_samples; ++j) { + std::vector total_meth(n_samples, 0); + std::vector total_reads(n_samples, 0); + std::vector cpgs_with_reads(n_samples, 0); + std::vector called_total(n_samples, 0); + std::vector called_meth(n_samples, 0); + std::vector weighted_mean_meth(n_samples, 0); + std::vector fractional_meth(n_samples, 0); + std::vector unweighted_mean_meth(n_samples, 0); + std::vector score(n_samples, 0.0); + + for (std::size_t i = 0; i < std::size(regions) && in; ++i) { + std::size_t total_cpgs = 0; + auto mean_meth = std::vector(n_samples, 0.0); + + for (std::size_t j = 0; j < n_samples; ++j) { total_meth[j] = total_reads[j] = cpgs_with_reads[j] = called_total[j] = called_meth[j] = 0; } @@ -465,20 +450,19 @@ process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, get_cpg_stats(in, regions[i], total_meth, total_reads, total_cpgs, cpgs_with_reads, called_total, called_meth, mean_meth); - for (size_t j = 0; j < n_samples; ++j) { + for (std::size_t j = 0; j < n_samples; ++j) { fractional_meth[j] = static_cast(called_meth[j]) / called_total[j]; weighted_mean_meth[j] = static_cast(total_meth[j]) / total_reads[j]; unweighted_mean_meth[j] = mean_meth[j] / cpgs_with_reads[j]; - score[j] = (level_code == 'w' ? weighted_mean_meth[j] : (level_code == 'u' ? unweighted_mean_meth[j] : fractional_meth[j])); } - if (!PRINT_NUMERIC_ONLY || all_is_finite(score)) + if (!print_numeric_only || all_finite(score)) out << format_output_line(report_more_information, regions[i], score, weighted_mean_meth, unweighted_mean_meth, fractional_meth, total_cpgs, cpgs_with_reads, @@ -489,38 +473,36 @@ process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, // end of code for searching on disk -[[nodiscard]] static size_t -check_bed_format(const string ®ions_file) { - ifstream in(regions_file); +[[nodiscard]] static std::size_t +get_bed_columns(const std::string ®ions_file) { + std::ifstream in(regions_file); if (!in) - throw runtime_error("cannot open file: " + regions_file); - - string line; + throw std::runtime_error("cannot open file: " + regions_file); + std::string line; getline(in, line); - std::istringstream iss(line); - string token; - size_t n_columns = 0; + std::string token; + std::size_t n_columns = 0; while (iss >> token) ++n_columns; - return n_columns; } int main_multimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const string default_name_prefix = "X"; - - bool VERBOSE = false; - bool PRINT_NUMERIC_ONLY = false; - bool load_entire_file = false; - bool report_more_information = false; - bool sort_data_if_needed = false; - - string level_code = "w"; - - string outfile; + static constexpr auto default_name_prefix = "X"; + constexpr auto valid_n_cols = [](const auto n_cols) { + return n_cols != 3 && n_cols < 6; // NOLINT(*-avoid-magic-numbers) + }; + + bool verbose{false}; + bool print_numeric_only{false}; + bool load_entire_file{false}; + bool report_more_information{false}; + bool sort_data_if_needed{false}; + std::string level_code = "w"; + std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) @@ -531,7 +513,7 @@ main_multimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("output", 'o', "Name of output file (default: stdout)", false, outfile); opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", - false, PRINT_NUMERIC_ONLY); + false, print_numeric_only); opt_parse.add_opt("preload", 'L', "Load all CpG sites", false, load_entire_file); opt_parse.add_opt("sort", 's', "sort data if needed", false, @@ -542,84 +524,79 @@ main_multimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) false, level_code); opt_parse.add_opt("more-levels", 'M', "report more methylation information", false, report_more_information); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - vector leftover_args; + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << '\n'; + if (std::size(leftover_args) != 2) { + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (sort_data_if_needed && !load_entire_file) { - cerr << "cannot sort data unless all data is loaded\n"; + std::cerr << "cannot sort data unless all data is loaded\n"; return EXIT_SUCCESS; } if (level_code != "w" && level_code != "u" && level_code != "f") { - cerr << "selected level must be in {w, u, f}\n"; + std::cerr << "selected level must be in {w, u, f}\n"; return EXIT_SUCCESS; } - const string regions_file = leftover_args.front(); - const string cpgs_file = leftover_args.back(); + const std::string regions_file = leftover_args.front(); + const std::string cpgs_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ - if (VERBOSE) - cerr << "loading regions\n"; + if (verbose) + std::cerr << "loading regions\n"; - const size_t n_columns = check_bed_format(regions_file); - // MAGIC: this allows for exactly 3 or at least 6 columns in the - // bed format - if (n_columns != 3 && n_columns < 6) - throw runtime_error("format must be 3 or 6+ column bed: " + regions_file); - - vector regions; - ReadBEDFile(regions_file, regions); - if (!is_sorted(begin(regions), end(regions))) { + const auto n_columns = get_bed_columns(regions_file); + if (valid_n_cols(n_columns)) + throw std::runtime_error("format must be 3 or 6+ column bed: " + + regions_file); + auto regions = read_intervals6(regions_file); + if (!std::is_sorted(std::cbegin(regions), std::cend(regions))) { if (sort_data_if_needed) { - if (VERBOSE) - cerr << "sorting regions\n"; - sort(begin(regions), end(regions)); + if (verbose) + std::cerr << "sorting regions\n"; + std::sort(std::begin(regions), std::end(regions)); } else - throw runtime_error("regions not sorted in file: " + regions_file); + throw std::runtime_error("regions not sorted in file: " + regions_file); } if (n_columns == 3) // then we should name the regions - for (size_t i = 0; i < regions.size(); ++i) - regions[i].set_name(default_name_prefix + std::to_string(i)); - - if (VERBOSE) - cerr << "[n_regions=" << regions.size() << "]\n"; + for (std::size_t i = 0; i < std::size(regions); ++i) + regions[i].name = default_name_prefix + std::to_string(i); - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); + if (verbose) + std::cerr << "[n_regions=" << std::size(regions) << "]\n"; + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); if (load_entire_file) - process_with_cpgs_loaded(VERBOSE, // sort_data_if_needed, - PRINT_NUMERIC_ONLY, report_more_information, + process_with_cpgs_loaded(verbose, // sort_data_if_needed, + print_numeric_only, report_more_information, level_code[0], cpgs_file, regions, out); else - process_with_cpgs_on_disk(PRINT_NUMERIC_ONLY, report_more_information, + process_with_cpgs_on_disk(print_numeric_only, report_more_information, level_code[0], cpgs_file, regions, out); } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 8519c23a..02529e99 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -1,25 +1,32 @@ -/* Copyright (C) 2018 University of Southern California - * Andrew D Smith - * Authors: Andrew D. Smith, Song Qiang, Jenny Qu, - * Benjamin Decato, Guilherme Sena +/* Copyright (C) 2018-2025 Andrew D Smith and Benjamin Decato * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. * - * This is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ -#include "GenomicRegion.hpp" +[[maybe_unused]] static constexpr auto description = R"( +Identify PMDs in methylomes. Methylation must be provided in the methcounts +file format (chrom, position, strand, context, methylation, reads). See the +methcounts documentation for details. This program assumes only data at CpG +sites and that strands are collapsed so only the positive site appears in the +file, but reads counts are from both strands. +)"; + +#include "Interval.hpp" +#include "Interval6.hpp" #include "MSite.hpp" -#include "OptionParser.hpp" #include "TwoStateHMM_PMD.hpp" #include "bsutils.hpp" #include "counts_header.hpp" + +#include "OptionParser.hpp" #include "smithlab_utils.hpp" #include @@ -45,38 +52,16 @@ #include #include -// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) - -using std::cerr; -using std::cout; -using std::make_pair; -using std::max; -using std::min; -using std::numeric_limits; -using std::runtime_error; -using std::string; -using std::vector; - -using std::ofstream; -using std::ostream; -using std::to_string; - -using bamxx::bgzf_file; - -template using num_lim = std::numeric_limits; +// NOLINTBEGIN(*-narrowing-conversions) struct pmd_summary { - explicit pmd_summary(const vector &pmds) : - pmd_count{std::size(pmds)} { - // NOLINTBEGIN(*-prefer-member-initializer) - pmd_total_size = - accumulate(cbegin(pmds), cend(pmds), 0ul, - [](const std::uint64_t t, const GenomicRegion &p) { - return t + p.get_width(); - }); + explicit pmd_summary(const std::vector &pmds) : + pmd_count{std::size(pmds)}, + pmd_total_size{std::accumulate( + std::cbegin(pmds), std::cend(pmds), 0ul, + [](const std::uint64_t t, const Interval6 &p) { return t + size(p); })} { pmd_mean_size = static_cast(pmd_total_size) / std::max(pmd_count, static_cast(1)); - // NOLINTEND(*-prefer-member-initializer) } // pmd_count is the number of identified PMDs. std::uint64_t pmd_count{}; @@ -84,48 +69,47 @@ struct pmd_summary { std::uint64_t pmd_total_size{}; // mean_pmd_size is the mean size of the identified PMDs double pmd_mean_size{}; - - string - tostring() { - std::ostringstream oss; - oss << "pmd_count: " << pmd_count << '\n' - << "pmd_total_size: " << pmd_total_size << '\n' - << "pmd_mean_size: " << std::fixed << std::setprecision(2) - << pmd_mean_size; - - return oss.str(); - } }; +[[nodiscard]] static inline auto +to_string(const pmd_summary &s) -> std::string { + std::ostringstream oss; + oss << "pmd_count: " << s.pmd_count << '\n' + << "pmd_total_size: " << s.pmd_total_size << '\n' + << "pmd_mean_size: " << std::fixed << s.pmd_mean_size; + return oss.str(); +} + static void -get_adjacent_distances(const vector &pmds, - vector &dists) { - for (size_t i = 1; i < pmds.size(); ++i) - if (pmds[i].same_chrom(pmds[i - 1])) - dists.push_back(pmds[i].get_start() - pmds[i - 1].get_end()); +get_adjacent_distances(const std::vector &pmds, + std::vector &dists) { + for (std::size_t i = 1; i < std::size(pmds); ++i) + if (pmds[i].chrom == pmds[i - 1].chrom) + dists.push_back(pmds[i].start - pmds[i - 1].stop); } static bool -precedes(const string &chrom, const size_t position, const GenomicRegion &r) { - return (chrom < r.get_chrom() || - (chrom == r.get_chrom() && position < r.get_start())); +precedes(const std::string &chrom, const std::size_t position, + const Interval6 &r) { + return chrom < r.chrom || (chrom == r.chrom && position < r.start); } static bool -succeeds(const string &chrom, const size_t position, const GenomicRegion &r) { - return (r.get_chrom() < chrom || - (chrom == r.get_chrom() && r.get_end() <= position)); +succeeds(const std::string &chrom, const std::size_t position, + const Interval6 &r) { + return r.chrom < chrom || (chrom == r.chrom && r.stop <= position); } static void -merge_nearby_pmd(const size_t max_merge_dist, vector &pmds) { - size_t j = 0; - for (size_t i = 1; i < pmds.size(); ++i) { - if (pmds[j].same_chrom(pmds[i]) && - pmds[i].get_start() - pmds[j].get_end() <= max_merge_dist) { - pmds[j].set_end(pmds[i].get_end()); - const string combined_name(pmds[j].get_name() + pmds[i].get_name()); - pmds[j].set_name(combined_name); +merge_nearby_pmd(const std::size_t max_merge_dist, + std::vector &pmds) { + std::size_t j = 0; + for (std::size_t i = 1; i < std::size(pmds); ++i) { + if (pmds[j].chrom == pmds[i].chrom && + pmds[i].start - pmds[j].stop <= max_merge_dist) { + pmds[j].stop = pmds[i].stop; + const std::string combined_name(pmds[j].name + pmds[i].name); + pmds[j].name = combined_name; } else pmds[++j] = pmds[i]; @@ -152,39 +136,42 @@ beta_max_likelihood(const double fg_alpha, const double fg_beta, beta_log_likelihood(bg_alpha, bg_beta, p_hi); } -static size_t +static std::size_t find_best_bound(const bool IS_RIGHT_BOUNDARY, - std::map> &pos_meth_tot, - const vector &fg_alpha, const vector &fg_beta, - const vector &bg_alpha, const vector &bg_beta) { - vector> meth_tot; - vector positions; - for (auto &&it : pos_meth_tot) { + const std::map> + &pos_meth_tot, + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta) { + std::vector> meth_tot; + std::vector positions; + for (const auto &it : pos_meth_tot) { positions.push_back(it.first); meth_tot.push_back(it.second); } - vector cumu_left_meth(meth_tot.size(), 0); - vector cumu_left_tot(meth_tot.size(), 0); - vector cumu_right_meth(meth_tot.size(), 0); - vector cumu_right_tot(meth_tot.size(), 0); - if (meth_tot.size() > 0) - for (size_t i = 1; i < meth_tot.size() - 1; ++i) { - const size_t j = meth_tot.size() - 1 - i; + std::vector cumu_left_meth(std::size(meth_tot), 0); + std::vector cumu_left_tot(std::size(meth_tot), 0); + std::vector cumu_right_meth(std::size(meth_tot), 0); + std::vector cumu_right_tot(std::size(meth_tot), 0); + if (std::size(meth_tot) > 0) + for (std::size_t i = 1; i + 1 < std::size(meth_tot); ++i) { + const std::size_t j = std::size(meth_tot) - 1 - i; cumu_left_meth[i] = cumu_left_meth[i - 1] + meth_tot[i - 1].first; cumu_left_tot[i] = cumu_left_tot[i - 1] + meth_tot[i - 1].second; cumu_right_meth[j] = cumu_right_meth[j + 1] + meth_tot[j + 1].first; cumu_right_tot[j] = cumu_right_tot[j + 1] + meth_tot[j + 1].second; } - size_t best_idx = 0; - double best_score = -num_lim::max(); - if (meth_tot.size() > 0) - for (size_t i = 1; i < meth_tot.size() - 1; ++i) { - size_t N_low{}; - size_t k_low{}; - size_t N_hi{}; - size_t k_hi{}; + std::size_t best_idx = 0; + double best_score = -std::numeric_limits::max(); + if (std::size(meth_tot) > 0) + for (std::size_t i = 1; i + 1 < std::size(meth_tot); ++i) { + std::size_t N_low{}; + std::size_t k_low{}; + std::size_t N_hi{}; + std::size_t k_hi{}; if (!IS_RIGHT_BOUNDARY) { N_low = cumu_right_tot[i] + meth_tot[i].second; k_low = cumu_right_meth[i] + meth_tot[i].first; @@ -202,66 +189,63 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY, const double p_hi = static_cast(k_hi) / N_hi; const double p_low = static_cast(k_low) / N_low; - for (size_t j = 0; j < fg_alpha.size(); ++j) { + for (std::size_t j = 0; j < std::size(fg_alpha); ++j) { score += beta_max_likelihood(fg_alpha[j], fg_beta[j], bg_alpha[j], bg_beta[j], p_low, p_hi); } // beta max likelihood using learned emissions - score /= fg_alpha.size(); + score /= std::size(fg_alpha); if (p_hi > p_low && score > best_score) { best_idx = i; best_score = score; } } } - return (best_score > -num_lim::max()) ? positions[best_idx] - : num_lim::max(); + return (best_score > -std::numeric_limits::max()) + ? positions[best_idx] + : std::numeric_limits::max(); } static void -get_boundary_positions(vector &bounds, - const vector &pmds, - const size_t &bin_size) { - for (size_t i = 0; i < pmds.size(); ++i) { +get_boundary_positions(std::vector &bounds, + const std::vector &pmds, + const std::size_t &bin_size) { + for (std::size_t i = 0; i < std::size(pmds); ++i) { bounds.push_back(pmds[i]); - pmds[i].get_start() > bin_size - ? bounds.back().set_start(pmds[i].get_start() - bin_size) - : bounds.back().set_start(0); - bounds.back().set_end(pmds[i].get_start() + bin_size); + bounds.back().start = + pmds[i].start > bin_size ? pmds[i].start - bin_size : 0; + bounds.back().stop = pmds[i].start + bin_size; bounds.push_back(pmds[i]); - pmds[i].get_end() > bin_size - ? bounds.back().set_start(pmds[i].get_end() - bin_size) - : bounds.back().set_start(0); - bounds.back().set_end(pmds[i].get_end() + bin_size); + bounds.back().start = pmds[i].stop > bin_size ? pmds[i].stop - bin_size : 0; + bounds.back().stop = pmds[i].stop + bin_size; } } static void get_optimized_boundary_likelihoods( - const vector &cpgs_file, vector &bounds, - const vector &array_status, const vector &fg_alpha, - const vector &fg_beta, const vector &bg_alpha, - const vector &bg_beta, vector &boundary_scores, - vector &boundary_certainties) { - // MAGIC NUMBER FOR WEIGHTING ARRAY - // CONTRIBUTION TO BOUNDARY OBSERVATIONS - static const double array_coverage_constant = 10; - - vector in; - for (size_t i = 0; i < cpgs_file.size(); ++i) { + const std::vector &cpgs_file, + const std::vector &bounds, const std::vector &array_status, + const std::vector &fg_alpha, const std::vector &fg_beta, + const std::vector &bg_alpha, const std::vector &bg_beta, + std::vector &boundary_scores, + std::vector &boundary_certainties) { + // For weighting array contribution to boundary observations + static constexpr double array_coverage_constant = 10; + + std::vector in; + for (std::size_t i = 0; i < std::size(cpgs_file); ++i) { in.emplace_back(cpgs_file[i], "r"); if (get_has_counts_header(cpgs_file[i])) skip_counts_header(in.back()); } - std::map> pos_meth_tot; - size_t n_meth{}; - size_t n_reads{}; - size_t bound_idx{}; - for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary - for (size_t i = 0; i < in.size(); ++i) { + std::map> pos_meth_tot; + std::size_t n_meth{}; + std::size_t n_reads{}; + std::size_t bound_idx{}; + for (; bound_idx < std::size(bounds); ++bound_idx) { // for each boundary + for (std::size_t i = 0; i < std::size(in); ++i) { // get totals for all CpGs overlapping that boundary - MSite site; while (read_site(in[i], site) && !succeeds(site.chrom, site.pos, bounds[bound_idx])) { @@ -284,10 +268,8 @@ get_optimized_boundary_likelihoods( n_meth = site.n_meth(); n_reads = site.n_reads; } - auto it(pos_meth_tot.find(site.pos)); - if (it == end(pos_meth_tot)) + if (pos_meth_tot.find(site.pos) == std::cend(pos_meth_tot)) pos_meth_tot[site.pos] = std::make_pair(n_meth, n_reads); - else { // add this file's contribution to the site's methylation pos_meth_tot[site.pos].first += n_meth; pos_meth_tot[site.pos].second += n_reads; @@ -297,18 +279,18 @@ get_optimized_boundary_likelihoods( } // Get the boundary position - size_t boundary_position = - (bounds[bound_idx].get_start() + bounds[bound_idx].get_end()) / 2; - - size_t N_low = 0, k_low = 0, N_hi = 0, k_hi = 0; - for (auto it = begin(pos_meth_tot); it != end(pos_meth_tot); ++it) { - if (it->first < boundary_position) { - N_low += it->second.second; - k_low += it->second.first; + std::size_t boundary_position = + (bounds[bound_idx].start + bounds[bound_idx].stop) / 2; + + std::size_t N_low = 0, k_low = 0, N_hi = 0, k_hi = 0; + for (const auto &p : pos_meth_tot) { + if (p.first < boundary_position) { + N_low += p.second.second; + k_low += p.second.first; } else { - N_hi += it->second.second; - k_hi += it->second.first; + N_hi += p.second.second; + k_hi += p.second.first; } } @@ -317,47 +299,48 @@ get_optimized_boundary_likelihoods( const double p_low = static_cast(k_low) / N_low; if (bound_idx % 2) { // its a right boundary, p_low should go with fg - for (size_t j = 0; j < fg_alpha.size(); ++j) + for (std::size_t j = 0; j < std::size(fg_alpha); ++j) score += beta_max_likelihood(fg_alpha[j], fg_beta[j], bg_alpha[j], bg_beta[j], p_low, p_hi); } else { // its a left boundary, p_low should go with bg - for (size_t j = 0; j < fg_alpha.size(); ++j) + for (std::size_t j = 0; j < std::size(fg_alpha); ++j) score += beta_max_likelihood(bg_alpha[j], bg_beta[j], fg_alpha[j], fg_beta[j], p_low, p_hi); } boundary_certainties.push_back(std::min(N_low, N_hi)); - score /= fg_alpha.size(); + score /= std::size(fg_alpha); boundary_scores.push_back(exp(score)); pos_meth_tot.clear(); } } static void -find_exact_boundaries( - const vector &cpgs_file, vector &bounds, - const vector &array_status, const vector &fg_alpha, - const vector &fg_beta, const vector &bg_alpha, - const vector &bg_beta, vector &bound_site) { - // MAGIC NUMBER FOR WEIGHTING ARRAY - // CONTRIBUTION TO BOUNDARY OBSERVATIONS - static const double array_coverage_constant = 10; - - vector in; - for (size_t i = 0; i < std::size(cpgs_file); ++i) { +find_exact_boundaries(const std::vector &cpgs_file, + const std::vector &bounds, + const std::vector &array_status, + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta, + std::vector &bound_site) { + // For weighting array contribution to boundary observations + static constexpr double array_coverage_constant = 10; + + std::vector in; + for (std::size_t i = 0; i < std::size(cpgs_file); ++i) { in.emplace_back(cpgs_file[i], "r"); if (get_has_counts_header(cpgs_file[i])) skip_counts_header(in[i]); } - std::map> pos_meth_tot; - size_t n_meth{}; - size_t n_reads{}; - size_t bound_idx{}; - for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary - for (size_t i = 0; i < in.size(); ++i) { + std::map> pos_meth_tot; + std::size_t n_meth{}; + std::size_t n_reads{}; + std::size_t bound_idx{}; + for (; bound_idx < std::size(bounds); ++bound_idx) { // for each boundary + for (std::size_t i = 0; i < std::size(in); ++i) { // get totals for all CpGs overlapping that boundary - MSite site; while (read_site(in[i], site) && !succeeds(site.chrom, site.pos, bounds[bound_idx])) { @@ -398,108 +381,97 @@ find_exact_boundaries( } static void -optimize_boundaries(const size_t bin_size, const vector &cpgs_file, - vector &pmds, - const vector &array_status, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta) { - vector bounds; +optimize_boundaries( + const std::size_t bin_size, const std::vector &cpgs_file, + std::vector &pmds, const std::vector &array_status, + const std::vector &fg_alpha, const std::vector &fg_beta, + const std::vector &bg_alpha, const std::vector &bg_beta) { + std::vector bounds; get_boundary_positions(bounds, pmds, bin_size); - vector bound_site; + std::vector bound_site; find_exact_boundaries(cpgs_file, bounds, array_status, fg_alpha, fg_beta, bg_alpha, bg_beta, bound_site); - //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////// - ///// NOW RESET THE STARTS AND ENDS OF PMDs - ///// - for (size_t i = 0; i < pmds.size(); ++i) { - const size_t start_site = bound_site[2 * i]; - if (start_site != num_lim::max()) - pmds[i].set_start(start_site); - const size_t end_site = bound_site[2 * i + 1]; - if (end_site != num_lim::max()) - pmds[i].set_end(end_site + 1); + // Now reset the starts and ends of PMDs + for (std::size_t i = 0; i < std::size(pmds); ++i) { + const std::size_t start_site = bound_site[2 * i]; + if (start_site != std::numeric_limits::max()) + pmds[i].start = start_site; + const std::size_t end_site = bound_site[2 * i + 1]; + if (end_site != std::numeric_limits::max()) + pmds[i].stop = end_site + 1; } - //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////// - ///// NOW MERGE THE PMDS THAT ARE TOO CLOSE - ///// - vector dists; + // Now merge the pmds that are too close + std::vector dists; get_adjacent_distances(pmds, dists); - sort(begin(dists), end(dists)); - - //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////// - // NEED TO USE SOME RANDOMIZATION METHOD HERE TO FIGURE OUT THE - // MERGING DISTANCE - vector> dist_hist; - size_t first = 0; - for (size_t i = 1; i < dists.size(); ++i) + std::sort(std::begin(dists), std::end(dists)); + + // Need to use some randomization method here to figure out the + // merging distance + std::vector> dist_hist; + std::size_t first = 0; + for (std::size_t i = 1; i < std::size(dists); ++i) if (dists[i] != dists[i - 1]) { dist_hist.push_back(std::make_pair(dists[i - 1], i - first)); first = i; } - merge_nearby_pmd(2 * bin_size, pmds); - /////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - // LAST, GET THE CPG SITES WITHIN 1 BIN OF EACH BOUNDARY AND COMPUTE - // THE LIKELIHOOD TO GET A "SCORE" ON THE BOUNDARY - /////// + // Last, get the cpg sites within 1 bin of each boundary and compute + // the likelihood to get a "score" on the boundary bounds.clear(); // need updated boundaries after merging nearby PMDs get_boundary_positions(bounds, pmds, bin_size); - vector boundary_scores; - vector boundary_certainties; + std::vector boundary_scores; + std::vector boundary_certainties; get_optimized_boundary_likelihoods(cpgs_file, bounds, array_status, fg_alpha, fg_beta, bg_alpha, bg_beta, boundary_scores, boundary_certainties); // Add the boundary scores to the PMD names - for (size_t i = 0; i < pmds.size(); ++i) - pmds[i].set_name(pmds[i].get_name() + ":" + - to_string(boundary_scores[2 * i]) + ":" + - to_string(boundary_certainties[2 * i]) + ":" + - to_string(boundary_scores[2 * i + 1]) + ":" + - to_string(boundary_certainties[2 * i + 1])); + // clang-format off + for (std::size_t i = 0; i < std::size(pmds); ++i) + pmds[i].name = pmds[i].name + ":" + + std::to_string(boundary_scores[2 * i]) + ":" + + std::to_string(boundary_certainties[2 * i]) + ":" + + std::to_string(boundary_scores[2 * i + 1]) + ":" + + std::to_string(boundary_certainties[2 * i + 1]); + // clang-format on } double -get_score_cutoff_for_fdr(const vector &scores, const double fdr) { +get_score_cutoff_for_fdr(const std::vector &scores, const double fdr) { if (fdr <= 0) - return num_lim::max(); - else if (fdr > 1) - return num_lim::min(); - vector local(scores); - std::sort(begin(local), end(local)); - size_t i = 0; - for (; i < local.size() - 1 && - local[i + 1] < fdr * static_cast(i + 1) / local.size(); + return std::numeric_limits::max(); + if (fdr > 1) + return std::numeric_limits::min(); + std::vector local(scores); + std::sort(std::begin(local), std::end(local)); + std::size_t i = 0; + for (; i + 1 < std::size(local) && + local[i + 1] < fdr * static_cast(i + 1) / std::size(local); ++i) ; - return local[i] + 1.0 / scores.size(); + return local[i] + 1.0 / std::size(scores); } static inline double score_contribution(const std::pair &m) { const double denom = m.first + m.second; - return (denom > 0) ? 1.0 - m.first / denom : 0.0; + return denom > 0 ? 1.0 - m.first / denom : 0.0; } static void -get_domain_scores(const vector &classes, - const vector>> &meth, - const vector &reset_points, vector &scores) { - const size_t n_replicates = meth.size(); - size_t reset_idx = 1; +get_domain_scores( + const std::vector &classes, + const std::vector>> &meth, + const std::vector &reset_points, std::vector &scores) { + const std::size_t n_replicates = std::size(meth); + std::size_t reset_idx = 1; bool in_domain = false; double score = 0; - for (size_t i = 0; i < classes.size(); ++i) { + for (std::size_t i = 0; i < std::size(classes); ++i) { if (reset_points[reset_idx] == i) { if (in_domain) { scores.push_back(score); @@ -509,7 +481,7 @@ get_domain_scores(const vector &classes, ++reset_idx; } if (classes[i]) { - for (size_t r = 0; r < n_replicates; ++r) + for (std::size_t r = 0; r < n_replicates; ++r) score += score_contribution(meth[r][i]); in_domain = true; } @@ -524,16 +496,17 @@ get_domain_scores(const vector &classes, } static void -build_domains(const vector &bins, - const vector &reset_points, const vector &classes, - vector &domains) { - size_t n_bins = 0, reset_idx = 1, prev_end = 0; +build_domains(const std::vector &bins, + const std::vector &reset_points, + const std::vector &classes, + std::vector &domains) { + std::size_t n_bins = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; - for (size_t i = 0; i < classes.size(); ++i) { + for (std::size_t i = 0; i < std::size(classes); ++i) { if (reset_points[reset_idx] == i) { if (in_domain) { - domains.back().set_end(prev_end); - domains.back().set_score(n_bins); + domains.back().stop = prev_end; + domains.back().score = n_bins; n_bins = 0; in_domain = false; } @@ -541,37 +514,39 @@ build_domains(const vector &bins, } if (classes[i]) { if (!in_domain) { - domains.push_back(GenomicRegion(bins[i])); + domains.emplace_back(bins[i].chrom, bins[i].start, bins[i].stop, + std::string{}, 0, '+'); in_domain = true; } ++n_bins; } else if (in_domain) { - domains.back().set_end(prev_end); - domains.back().set_score(n_bins); + domains.back().stop = prev_end; + domains.back().score = n_bins; n_bins = 0; in_domain = false; } - prev_end = bins[i].get_end(); + prev_end = bins[i].stop; } } // Modified to take multiple replicates template static void -separate_regions(const size_t desert_size, - vector> &bins, - vector> &meth, vector> &reads, - vector &reset_points, - vector &dists_btwn_bins) { - const size_t n_replicates = bins.size(); +separate_regions(const std::size_t desert_size, + std::vector> &bins, + std::vector> &meth, + std::vector> &reads, + std::vector &reset_points, + std::vector &dists_btwn_bins) { + const std::size_t n_replicates = std::size(bins); // eliminate the zero-read cpg sites if no coverage in any rep - size_t end_coord_of_prev = 0; - size_t j = 0; - for (size_t i = 0; i < bins[0].size(); ++i) { + std::size_t end_coord_of_prev = 0; + std::size_t j = 0; + for (std::size_t i = 0; i < std::size(bins[0]); ++i) { bool all_empty = true; - size_t rep_idx = 0; + std::size_t rep_idx = 0; while (all_empty && rep_idx < n_replicates) { if (reads[rep_idx][i] == 0) ++rep_idx; @@ -579,9 +554,9 @@ separate_regions(const size_t desert_size, all_empty = false; } if (!all_empty) { - dists_btwn_bins.push_back(bins[0][i].get_start() - end_coord_of_prev); - end_coord_of_prev = bins[0][i].get_end(); - for (size_t r = 0; r < n_replicates; ++r) { + dists_btwn_bins.push_back(bins[0][i].start - end_coord_of_prev); + end_coord_of_prev = bins[0][i].stop; + for (std::size_t r = 0; r < n_replicates; ++r) { bins[r][j] = bins[r][i]; meth[r][j] = meth[r][i]; reads[r][j] = reads[r][i]; @@ -590,70 +565,74 @@ separate_regions(const size_t desert_size, } } - for (size_t r = 0; r < n_replicates; ++r) { + for (std::size_t r = 0; r < n_replicates; ++r) { bins[r].resize(j); meth[r].resize(j); reads[r].resize(j); } // segregate bins - size_t prev_cpg = 0; - for (size_t i = 0; i < bins[0].size(); ++i) { - const size_t dist = (i > 0 && bins[0][i].same_chrom(bins[0][i - 1])) - ? bins[0][i].get_start() - prev_cpg - : num_lim::max(); + std::size_t prev_cpg = 0; + for (std::size_t i = 0; i < std::size(bins[0]); ++i) { + const std::size_t dist = (i > 0 && bins[0][i].chrom == bins[0][i - 1].chrom) + ? bins[0][i].start - prev_cpg + : std::numeric_limits::max(); if (dist > desert_size) reset_points.push_back(i); - prev_cpg = bins[0][i].get_start(); + prev_cpg = bins[0][i].start; } assert(std::size(reset_points) > 0); - reset_points.push_back(bins[0].size()); + reset_points.push_back(std::size(bins[0])); } static void -shuffle_bins(const size_t rng_seed, const TwoStateHMM &hmm, - vector>> meth, - const vector &reset_points, - const vector &start_trans, - const vector> &trans, - const vector &end_trans, const vector &fg_alpha, - const vector &fg_beta, const vector &bg_alpha, - const vector &bg_beta, vector &domain_scores, - const vector &array_status) { - size_t n_replicates = meth.size(); - +shuffle_bins(const std::size_t rng_seed, const TwoStateHMM &hmm, + std::vector>> + meth, // cppcheck-suppress[passedByValue] + const std::vector &reset_points, + const std::vector &start_trans, + const std::vector> &trans, + const std::vector &end_trans, + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta, + std::vector &domain_scores, + const std::vector &array_status) { + const auto n_replicates = std::size(meth); auto eng = std::default_random_engine(rng_seed); - for (size_t r = 0; r < n_replicates; ++r) - std::shuffle(begin(meth[r]), end(meth[r]), eng); + for (std::size_t r = 0; r < n_replicates; ++r) + std::shuffle(std::begin(meth[r]), std::end(meth[r]), eng); - vector classes; - vector scores; + std::vector classes; + std::vector scores; hmm.PosteriorDecoding_rep(meth, reset_points, start_trans, trans, end_trans, fg_alpha, fg_beta, bg_alpha, bg_beta, classes, scores, array_status); get_domain_scores(classes, meth, reset_points, domain_scores); - sort(begin(domain_scores), end(domain_scores)); + std::sort(std::begin(domain_scores), std::end(domain_scores)); } static void -assign_p_values(const vector &random_scores, - const vector &observed_scores, - vector &p_values) { - const double n_randoms = random_scores.size() == 0 ? 1 : random_scores.size(); +assign_p_values(const std::vector &random_scores, + const std::vector &observed_scores, + std::vector &p_values) { + const double n_randoms = std::max(std::size(random_scores), 1ul); for (auto scr : observed_scores) { const auto scr_itr = - upper_bound(cbegin(random_scores), cend(random_scores), scr); - p_values.push_back(std::distance(scr_itr, cend(random_scores)) / n_randoms); + upper_bound(std::cbegin(random_scores), std::cend(random_scores), scr); + p_values.push_back(std::distance(scr_itr, std::cend(random_scores)) / + n_randoms); } } static void -read_params_file(const bool verbose, const string ¶ms_file, +read_params_file(const bool verbose, const std::string ¶ms_file, double &fg_alpha, double &fg_beta, double &bg_alpha, - double &bg_beta, vector &start_trans, - vector> &trans, vector &end_trans, - double &fdr_cutoff) { - string jnk; + double &bg_beta, std::vector &start_trans, + std::vector> &trans, + std::vector &end_trans, double &fdr_cutoff) { + std::string jnk; std::ifstream in(params_file.c_str()); in >> jnk >> fg_alpha >> jnk >> fg_beta >> jnk >> bg_alpha >> jnk >> @@ -663,50 +642,56 @@ read_params_file(const bool verbose, const string ¶ms_file, fdr_cutoff; if (verbose) - cerr << "Read in params from " << params_file << '\n' - << "FG_ALPHA\t" << fg_alpha << '\n' - << "FG_BETA\t" << fg_beta << '\n' - << "BG_ALPHA\t" << bg_alpha << '\n' - << "BG_BETA\t" << bg_beta << '\n' - << "S_F\t" << start_trans[0] << '\n' - << "S_B\t" << start_trans[1] << '\n' - << "F_F\t" << trans[0][0] << '\n' - << "F_B\t" << trans[0][1] << '\n' - << "B_F\t" << trans[1][0] << '\n' - << "B_B\t" << trans[1][1] << '\n' - << "F_E\t" << end_trans[0] << '\n' - << "B_E\t" << end_trans[1] << '\n' - << "FDR_CUTOFF\t" << fdr_cutoff << '\n'; + std::cerr << "Read in params from " << params_file << '\n' + << "FG_ALPHA\t" << fg_alpha << '\n' + << "FG_BETA\t" << fg_beta << '\n' + << "BG_ALPHA\t" << bg_alpha << '\n' + << "BG_BETA\t" << bg_beta << '\n' + << "S_F\t" << start_trans[0] << '\n' + << "S_B\t" << start_trans[1] << '\n' + << "F_F\t" << trans[0][0] << '\n' + << "F_B\t" << trans[0][1] << '\n' + << "B_F\t" << trans[1][0] << '\n' + << "B_B\t" << trans[1][1] << '\n' + << "F_E\t" << end_trans[0] << '\n' + << "B_E\t" << end_trans[1] << '\n' + << "FDR_CUTOFF\t" << fdr_cutoff << '\n'; } static void -write_posteriors_file(const string &posteriors_file, - const vector> &bins, - const vector &scores) { - static const size_t decimal_precision = 10; - - ofstream out(posteriors_file); +write_posteriors_file(const std::string &posteriors_file, + const std::vector> &bins, + const std::vector &scores) { + static constexpr auto decimal_precision = 10; + std::ofstream out(posteriors_file); + if (!out) + throw std::runtime_error("failed to open: " + posteriors_file); out.precision(decimal_precision); - for (size_t r = 0; r < scores.size(); ++r) + for (std::size_t r = 0; r < std::size(scores); ++r) out << bins[0][r] << '\t' << scores[r] << '\n'; } static void -write_params_file(const string &outfile, const vector &fg_alpha, - const vector &fg_beta, const vector &bg_alpha, - const vector &bg_beta, - const vector &start_trans, - const vector> &trans, - const vector &end_trans) { - static const size_t decimal_precision = 30; - ofstream out(outfile); +write_params_file(const std::string &outfile, + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta, + const std::vector &start_trans, + const std::vector> &trans, + const std::vector &end_trans) { + static constexpr auto decimal_precision = 30; + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open: " + outfile); out.precision(decimal_precision); - for (size_t r = 0; r < fg_alpha.size(); ++r) + // NOLINTBEGIN(*-avoid-magic-numbers) + for (std::size_t r = 0; r < std::size(fg_alpha); ++r) out << "FG_ALPHA_" << r + 1 << "\t" << std::setw(14) << fg_alpha[r] << "\t" << "FG_BETA_" << r + 1 << "\t" << std::setw(14) << fg_beta[r] << "\t" << "BG_ALPHA_" << r + 1 << "\t" << std::setw(14) << bg_alpha[r] << "\t" << "BG_BETA_" << r + 1 << "\t" << std::setw(14) << bg_beta[r] << '\n'; - + // NOLINTEND(*-avoid-magic-numbers) out << "S_F\t" << start_trans[0] << '\n' << "S_B\t" << start_trans[1] << '\n' << "F_F\t" << trans[0][0] << '\n' @@ -718,8 +703,8 @@ write_params_file(const string &outfile, const vector &fg_alpha, } static bool -check_if_array_data(const string &infile) { - bgzf_file in(infile, "r"); +check_if_array_data(const std::string &infile) { + bamxx::bgzf_file in(infile, "r"); if (!in) throw std::runtime_error("bad file: " + infile); @@ -735,22 +720,22 @@ check_if_array_data(const string &infile) { } static void -load_array_data(const size_t bin_size, const string &cpgs_file, - vector &bins, - vector> &meth, - vector &reads) { +load_array_data(const std::size_t bin_size, const std::string &cpgs_file, + std::vector &bins, + std::vector> &meth, + std::vector &reads) { // MAGIC. GS: minimum value for array? - static const double meth_min = 1.0e-2; + static constexpr double meth_min = 1.0e-2; - bgzf_file in(cpgs_file, "r"); + bamxx::bgzf_file in(cpgs_file, "r"); if (!in) throw std::runtime_error("bad sites file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); - string curr_chrom; - size_t prev_pos = 0ul, curr_pos = 0ul; + std::string curr_chrom; + std::size_t prev_pos = 0ul, curr_pos = 0ul; double array_meth_bin = 0.0; double num_probes_in_bin = 0.0; @@ -772,8 +757,9 @@ load_array_data(const size_t bin_size, const string &cpgs_file, if (curr_chrom != site.chrom) { if (!curr_chrom.empty()) { if (site.chrom < curr_chrom) - throw runtime_error("CpGs not sorted in file \"" + cpgs_file + "\""); - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, prev_pos + 1)); + throw std::runtime_error("CpGs not sorted in file \"" + cpgs_file + + "\""); + bins.push_back(Interval(curr_chrom, curr_pos, prev_pos + 1)); meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); if (num_probes_in_bin > 0) reads.push_back(1); @@ -789,8 +775,7 @@ load_array_data(const size_t bin_size, const string &cpgs_file, throw std::runtime_error("CpGs not sorted in file \"" + cpgs_file + "\""); } else if (site.pos > curr_pos + bin_size) { - bins.push_back( - SimpleGenomicRegion(curr_chrom, curr_pos, curr_pos + bin_size)); + bins.push_back(Interval(curr_chrom, curr_pos, curr_pos + bin_size)); meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); (num_probes_in_bin > 0) ? reads.push_back(1) : reads.push_back(0); @@ -798,8 +783,7 @@ load_array_data(const size_t bin_size, const string &cpgs_file, num_probes_in_bin = 0.0; curr_pos += bin_size; while (curr_pos + bin_size < site.pos) { - bins.push_back( - SimpleGenomicRegion(curr_chrom, curr_pos, curr_pos + bin_size)); + bins.push_back(Interval(curr_chrom, curr_pos, curr_pos + bin_size)); reads.push_back(0); meth.push_back(std::make_pair(0.0, 0.0)); curr_pos += bin_size; @@ -818,7 +802,7 @@ load_array_data(const size_t bin_size, const string &cpgs_file, } prev_pos = site.pos; if (!curr_chrom.empty()) { - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, prev_pos + 1)); + bins.push_back(Interval(curr_chrom, curr_pos, prev_pos + 1)); meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); if (num_probes_in_bin > 0) reads.push_back(1); @@ -828,51 +812,52 @@ load_array_data(const size_t bin_size, const string &cpgs_file, } static void -load_wgbs_data(const size_t bin_size, const string &cpgs_file, - vector &bins, - vector> &meth, vector &reads) { +load_wgbs_data(const std::size_t bin_size, const std::string &cpgs_file, + std::vector &bins, + std::vector> &meth, + std::vector &reads) { reads.clear(); // for safety meth.clear(); bins.clear(); // ADS: loading data each iteration should be put outside loop - bgzf_file in(cpgs_file, "r"); + bamxx::bgzf_file in(cpgs_file, "r"); if (!in) - throw runtime_error("bad sites file: " + cpgs_file); + throw std::runtime_error("bad sites file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); // keep track of the chroms we've seen - string curr_chrom; - std::unordered_set chroms_seen; + std::string curr_chrom; + std::unordered_set chroms_seen; MSite site; - size_t prev_pos = 0ul; - size_t sites_in_bin = 0ul; + std::size_t prev_pos = 0ul; + std::size_t sites_in_bin = 0ul; while (read_site(in, site)) { if (curr_chrom != site.chrom) { // handle change of chrom if (sites_in_bin > 0) - bins.back().set_end(prev_pos); - if (chroms_seen.find(site.chrom) != end(chroms_seen)) - throw runtime_error("sites not sorted"); + bins.back().stop = prev_pos; + if (chroms_seen.find(site.chrom) != std::cend(chroms_seen)) + throw std::runtime_error("sites not sorted"); chroms_seen.insert(site.chrom); curr_chrom = site.chrom; reads.push_back(0); meth.push_back(std::make_pair(0.0, 0.0)); - bins.push_back(SimpleGenomicRegion(site.chrom, 0, bin_size)); + bins.push_back(Interval(site.chrom, 0, bin_size)); sites_in_bin = 0; } prev_pos = site.pos; - if (site.pos < bins.back().get_start()) - throw runtime_error("sites not sorted"); - while (bins.back().get_end() < site.pos) { + if (site.pos < bins.back().start) + throw std::runtime_error("sites not sorted"); + while (bins.back().stop < site.pos) { sites_in_bin = 0; reads.push_back(0); meth.push_back(std::make_pair(0.0, 0.0)); - bins.push_back(SimpleGenomicRegion(site.chrom, bins.back().get_end(), - bins.back().get_end() + bin_size)); + bins.push_back( + Interval(site.chrom, bins.back().stop, bins.back().stop + bin_size)); } reads.back() += site.n_reads; meth.back().first += site.n_meth(); @@ -880,20 +865,20 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, sites_in_bin++; } if (sites_in_bin > 0) - bins.back().set_end(prev_pos); + bins.back().stop = prev_pos; } static void -remove_empty_bins_at_chrom_start(vector &bins, - vector> &meth, - vector &reads) { +remove_empty_bins_at_chrom_start(std::vector &bins, + std::vector> &meth, + std::vector &reads) { bool chrom_start = true; - size_t j = 0; - string prev_chrom = ""; - for (size_t i = 0; i < bins.size(); ++i) { - if (bins[i].get_chrom() != prev_chrom) { + std::size_t j = 0; + std::string prev_chrom = ""; + for (std::size_t i = 0; i < std::size(bins); ++i) { + if (bins[i].chrom != prev_chrom) { chrom_start = true; - prev_chrom = bins[i].get_chrom(); + prev_chrom = bins[i].chrom; } if (reads[i] > 0) chrom_start = false; @@ -904,41 +889,41 @@ remove_empty_bins_at_chrom_start(vector &bins, ++j; } } - bins.erase(begin(bins) + j, end(bins)); - meth.erase(begin(meth) + j, end(meth)); - reads.erase(begin(reads) + j, end(reads)); + bins.erase(std::begin(bins) + j, std::end(bins)); + meth.erase(std::begin(meth) + j, std::end(meth)); + reads.erase(std::begin(reads) + j, std::end(reads)); } static void -load_read_counts(const string &cpgs_file, const size_t bin_size, - vector &reads) { +load_read_counts(const std::string &cpgs_file, const std::size_t bin_size, + std::vector &reads) { reads.clear(); // for safety // ADS: loading data each iteration should be put outside loop - bgzf_file in(cpgs_file, "r"); + bamxx::bgzf_file in(cpgs_file, "r"); if (!in) - throw runtime_error("bad methcounts file: " + cpgs_file); + throw std::runtime_error("bad methcounts file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); // keep track of where we are and what we've seen - size_t bin_start = 0ul; - string curr_chrom; - std::unordered_set chroms_seen; + std::size_t bin_start = 0ul; + std::string curr_chrom; + std::unordered_set chroms_seen; MSite site; while (read_site(in, site)) { if (curr_chrom != site.chrom) { // handle change of chrom - if (chroms_seen.find(site.chrom) != end(chroms_seen)) - throw runtime_error("sites not sorted"); + if (chroms_seen.find(site.chrom) != std::cend(chroms_seen)) + throw std::runtime_error("sites not sorted"); chroms_seen.insert(site.chrom); bin_start = 0; curr_chrom = site.chrom; reads.push_back(0); } if (site.pos < bin_start) - throw runtime_error("sites not sorted"); + throw std::runtime_error("sites not sorted"); for (; bin_start + bin_size < site.pos; bin_start += bin_size) reads.push_back(0); reads.back() += site.n_reads; @@ -946,30 +931,32 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, } static double -good_bins_frac(const vector &cumulative, const size_t min_bin_size, - const size_t bin_size, const size_t min_cov_to_pass) { +good_bins_frac(const std::vector &cumulative, + const std::size_t min_bin_size, const std::size_t bin_size, + const std::size_t min_cov_to_pass) { // make sure the target bin size is a multiple of the minimum so we // have the resolution to construct the new bins assert(bin_size % min_bin_size == 0); // the step size corresponds to the number of minium sized bins that // would make up a new bin of the target size - const size_t step_size = bin_size / min_bin_size; + const std::size_t step_size = bin_size / min_bin_size; - size_t passing_bins = 0, covered_bins = 0; + std::size_t passing_bins = 0, covered_bins = 0; - size_t prev_total = 0; - for (size_t i = 0; i + step_size < cumulative.size(); i += step_size) { - const size_t curr_cumulative = cumulative[i + step_size]; - const size_t bin_count = curr_cumulative - prev_total; + std::size_t prev_total = 0; + for (std::size_t i = 0; i + step_size < std::size(cumulative); + i += step_size) { + const std::size_t curr_cumulative = cumulative[i + step_size]; + const std::size_t bin_count = curr_cumulative - prev_total; covered_bins += (bin_count > 0); passing_bins += (bin_count >= min_cov_to_pass); prev_total = curr_cumulative; } // check if there is a leftover bin at the end - if (cumulative.size() % step_size != 0) { - const size_t bin_count = cumulative.back() - prev_total; + if (std::size(cumulative) % step_size != 0) { + const std::size_t bin_count = cumulative.back() - prev_total; covered_bins += (bin_count > 0); passing_bins += (bin_count >= min_cov_to_pass); } @@ -977,11 +964,11 @@ good_bins_frac(const vector &cumulative, const size_t min_bin_size, return static_cast(passing_bins) / std::max(1ul, covered_bins); } -static size_t +static std::size_t get_min_reads_for_confidence(const double conf_level) { // ADS: value of 0.5 below important; this is where the CI is widest static const double fixed_phat = 0.5; - size_t n_reads = 0; + std::size_t n_reads = 0; double lower = 0.0, upper = 1.0; // ADS: should be doubling first, followed by bisection while (1.0 - conf_level < upper - lower) { @@ -991,35 +978,36 @@ get_min_reads_for_confidence(const double conf_level) { return n_reads; } -// ADS: this function will return num_lim::max() if the +// ADS: this function will return std::numeric_limits::max() if the // fraction of "good" bins is zero for all attempted bin sizes. -static size_t -binsize_selection(const size_t resolution, const size_t min_bin_sz, - const size_t max_bin_sz, const double conf_level, - const double min_frac_passed, const string &cpgs_file) { - const size_t min_cov_to_pass = get_min_reads_for_confidence(conf_level); +static std::size_t +binsize_selection(const std::size_t resolution, const std::size_t min_bin_sz, + const std::size_t max_bin_sz, const double conf_level, + const double min_frac_passed, const std::string &cpgs_file) { + const std::size_t min_cov_to_pass = get_min_reads_for_confidence(conf_level); - vector reads; + std::vector reads; load_read_counts(cpgs_file, resolution, reads); - std::partial_sum(begin(reads), end(reads), begin(reads)); + std::partial_sum(std::cbegin(reads), std::cend(reads), std::begin(reads)); double frac_passed = 0.0; - size_t bin_size = min_bin_sz; + std::size_t bin_size = min_bin_sz; while (bin_size < max_bin_sz && frac_passed < min_frac_passed) { frac_passed = good_bins_frac(reads, resolution, bin_size, min_cov_to_pass); if (frac_passed < min_frac_passed) bin_size += resolution; } - return frac_passed < min_frac_passed ? num_lim::max() : bin_size; + return frac_passed < min_frac_passed ? std::numeric_limits::max() + : bin_size; } static void -load_bins(const size_t bin_size, const string &cpgs_file, - vector &bins, - vector> &meth, vector &reads, - vector &array_status) { +load_bins(const std::size_t bin_size, const std::string &cpgs_file, + std::vector &bins, + std::vector> &meth, + std::vector &reads, std::vector &array_status) { const bool is_array_data = check_if_array_data(cpgs_file); array_status.push_back(is_array_data); @@ -1033,39 +1021,44 @@ load_bins(const size_t bin_size, const string &cpgs_file, } static void -get_union_of_bins(const vector> &orig, - vector &bins) { +get_union_of_bins(const std::vector> &orig, + std::vector &bins) { + const auto overlaps = [](const auto &x, const auto &y) { + const auto cmp = x.chrom.compare(y.chrom); + return cmp == 0 && std::max(x.start, x.start) < std::min(x.stop, y.stop); + }; + // flatten the set of sorted bins bins.clear(); for (auto &&i : orig) - bins.insert(end(bins), begin(i), end(i)); + bins.insert(std::end(bins), std::cbegin(i), std::cend(i)); // merge each sorted interval of bins - const auto first = begin(bins); - auto middle = begin(bins); - for (size_t i = 1; i < orig.size(); ++i) { - middle += orig[i - 1].size(); - std::inplace_merge(first, middle, middle + orig[i].size()); + const auto first = std::begin(bins); + auto middle = std::begin(bins); + for (std::size_t i = 1; i < std::size(orig); ++i) { + middle += std::size(orig[i - 1]); + std::inplace_merge(first, middle, middle + std::size(orig[i])); } // ensure unique bins - bins.erase(unique(begin(bins), end(bins)), end(bins)); + bins.erase(std::unique(std::begin(bins), std::end(bins)), std::end(bins)); bins.shrink_to_fit(); // make sure all bins are aligned at same boundaries - for (size_t i = 1; i < bins.size(); ++i) - if (bins[i - 1].overlaps(bins[i])) + for (std::size_t i = 1; i < std::size(bins); ++i) + if (overlaps(bins[i - 1], bins[i])) throw std::runtime_error("bins from reps not aligned"); } static void -add_missing_bins(const vector &all_bins, - vector &bins, - vector> &meth) { - const size_t n_bins = all_bins.size(); - vector> tmp_meth(n_bins); - - size_t j = 0; // assume j range no larger than i range - for (size_t i = 0; i < n_bins; ++i) { +add_missing_bins(const std::vector &all_bins, + std::vector &bins, + std::vector> &meth) { + const std::size_t n_bins = std::size(all_bins); + std::vector> tmp_meth(n_bins); + + std::size_t j = 0; // assume j range no larger than i range + for (std::size_t i = 0; i < n_bins; ++i) { if (all_bins[i] == bins[j]) tmp_meth[i] = meth[j++]; else @@ -1076,53 +1069,68 @@ add_missing_bins(const vector &all_bins, } static void -write_empty_summary(const string &summary_file) { - if (!summary_file.empty()) { - ofstream summary_out(summary_file); - if (!summary_out) - throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary({}).tostring() << '\n'; - } +write_empty_summary(const std::string &summary_file) { + if (summary_file.empty()) + return; + std::ofstream summary_out(summary_file); + if (!summary_out) + throw std::runtime_error("failed to open: " + summary_file); + summary_out << to_string(pmd_summary({})) << '\n'; } int main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const size_t min_observations_for_inference = 100; - static const size_t max_bin_size = 500000; - static const size_t min_bin_size = 1000; - size_t resolution = 500; + static constexpr auto min_observations_for_inference{100ul}; + static constexpr auto default_self_transition{0.99}; + static constexpr auto default_start_transition{0.5}; + static constexpr auto default_end_transition{1e-10}; + static constexpr auto default_transition{0.01}; - string outfile; + // magic numbers from paper: highest jaccard index to wgbs + static constexpr auto bin_size_for_array{1000ul}; + static constexpr auto desert_size_for_array{200000ul}; - size_t rng_seed = 408; + // MAGIC: corrections for small values (not parameters): + static constexpr auto tolerance{1e-5}; + static constexpr auto min_prob{1e-10}; + + static constexpr auto default_fg_alpha{0.05}; + static constexpr auto default_fg_beta{0.95}; + static constexpr auto default_bg_alpha{0.95}; + static constexpr auto default_bg_beta{0.05}; + const auto init_trans = [&] { + return std::vector{ + std::vector(default_self_transition, default_transition), + std::vector(default_transition, default_self_transition), + }; + }; + // > + // std::vector> t( + // 2, std::vector(2, default_transition)); + // t[0][0] = t[1][1] = default_self_transition; + // return t; + // }; + + // NOLINTBEGIN(*-avoid-magic-numbers) + std::size_t resolution = 500; + std::size_t rng_seed = 408; + std::size_t desert_size = 5000; + std::size_t bin_size = 1000; + std::size_t max_iterations = 10; + // NOLINTEND(*-avoid-magic-numbers) - bool DEBUG = false; - size_t desert_size = 5000; - size_t bin_size = 1000; - size_t max_iterations = 10; // run mode flags + bool DEBUG = false; bool verbose = false; bool ARRAY_MODE = false; bool fixed_bin_size = false; - // MAGIC: corrections for small values (not parameters): - static const double tolerance = 1e-5; - static const double min_prob = 1e-10; - - string summary_file; - - string params_in_files; - string params_out_file; - string posteriors_out_prefix; - - constexpr auto description = - R"(Identify PMDs in methylomes. Methylation must be provided in the - methcounts file format (chrom, position, strand, context, - methylation, reads). See the methcounts documentation for - details. This program assumes only data at CpG sites and that - strands are collapsed so only the positive site appears in the - file, but reads counts are from both strands.)"; + std::string summary_file; + std::string params_in_files; + std::string params_out_file; + std::string posteriors_out_prefix; + std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) @@ -1151,41 +1159,39 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) false, params_out_file); opt_parse.add_opt("seed", 's', "specify random seed", false, rng_seed); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } /****************** END COMMAND LINE OPTIONS *****************/ - resolution = min(bin_size, resolution); + resolution = std::min(bin_size, resolution); - vector cpgs_file = leftover_args; - vector params_in_file; - if (!params_in_files.empty()) { + std::vector cpgs_file = leftover_args; + std::vector params_in_file; + if (!params_in_files.empty()) params_in_file = smithlab::split(params_in_files, ",", false); - assert(cpgs_file.size() == params_in_file.size()); - } - const size_t n_replicates = cpgs_file.size(); + const std::size_t n_replicates = std::size(cpgs_file); std::for_each(std::cbegin(cpgs_file), std::cend(cpgs_file), [](const auto &x) { if (!is_msite_file(x)) - throw runtime_error("malformed counts file: " + x); + throw std::runtime_error("malformed counts file: " + x); }); bool insufficient_data = false; // ADS: this is used now to detect @@ -1196,75 +1202,81 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) // Sanity checks input file format and dynamically selects bin // size from WGBS samples. if (!fixed_bin_size && !ARRAY_MODE) { + static constexpr auto max_bin_size{500000ul}; + static constexpr auto min_bin_size{1000ul}; + static constexpr auto desert_size_multiplier{5}; // ADS: what is this? + static constexpr auto default_confidence_interval{0.80}; + static constexpr auto default_prop_accept{0.80}; if (verbose) - cerr << "[DYNAMICALLY SELECTING BIN SIZE]" << '\n'; - double confidence_interval = 0.80; - double prop_accept = 0.80; - for (size_t i = 0; i < n_replicates && !insufficient_data; ++i) { + std::cerr << "[DYNAMICALLY SELECTING BIN SIZE]" << '\n'; + double confidence_interval = default_confidence_interval; + double prop_accept = default_prop_accept; + for (std::size_t i = 0; i < n_replicates && !insufficient_data; ++i) { const bool arrayData = check_if_array_data(cpgs_file[i]); if (!arrayData) { bin_size = binsize_selection(resolution, min_bin_size, max_bin_size, confidence_interval, prop_accept, cpgs_file[i]); - if (bin_size == num_lim::max()) + if (bin_size == std::numeric_limits::max()) insufficient_data = true; - desert_size = 5 * bin_size; // TODO(ADS): what should this be? + desert_size = desert_size_multiplier * bin_size; } else { - // same as the parameters below - bin_size = 1000; - desert_size = 200000; + bin_size = bin_size_for_array; + desert_size = desert_size_for_array; } } } else if (ARRAY_MODE) { - bin_size = 1000; // MAGIC NUMBERS FROM PAPER - desert_size = 200000; // PERFORM WITH HIGHEST JACCARD INDEX TO WGBS + bin_size = bin_size_for_array; + desert_size = desert_size_for_array; } else { - desert_size = max(desert_size, bin_size); + desert_size = std::max(desert_size, bin_size); } if (insufficient_data) { // ADS: first check for insufficient data; another is needed if // fixed bin size is used if (verbose) - cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + std::cerr << "EXITING: INSUFFICIENT DATA" << '\n'; if (!summary_file.empty()) write_empty_summary(summary_file); return EXIT_SUCCESS; } if (verbose) - cerr << "[READING IN AT BIN SIZE " << bin_size << "]" << '\n'; + std::cerr << "[READING IN AT BIN SIZE " << bin_size << "]\n"; // separate the regions by chrom and by desert - vector> bins(n_replicates); - vector>> meth(n_replicates); - vector> reads(n_replicates); - vector array_status; + std::vector> bins(n_replicates); + std::vector>> meth(n_replicates); + std::vector> reads(n_replicates); + std::vector array_status; - for (size_t i = 0; i < n_replicates && !insufficient_data; ++i) { + for (std::size_t i = 0; i < n_replicates && !insufficient_data; ++i) { if (verbose) - cerr << "[READING CPGS AND METH PROPS] from " << cpgs_file[i] << '\n'; + std::cerr << "[READING CPGS AND METH PROPS] from " << cpgs_file[i] + << '\n'; load_bins(bin_size, cpgs_file[i], bins[i], meth[i], reads[i], array_status); const double total_observations = - accumulate(begin(reads[i]), end(reads[i]), 0); - if (total_observations <= num_lim::min()) + std::accumulate(std::cbegin(reads[i]), std::cend(reads[i]), 0.0); + if (total_observations <= std::numeric_limits::min()) insufficient_data = true; if (verbose) - cerr << "TOTAL BINS: " << bins[i].size() << '\n' - << "MEAN COVERAGE: " - << total_observations / std::max(1ul, reads[i].size()) << '\n'; + std::cerr << "TOTAL BINS: " << std::size(bins[i]) << '\n' + << "MEAN COVERAGE: " + << total_observations / std::max(1ul, std::size(reads[i])) + << '\n'; } if (insufficient_data) { // ADS: second check for insufficient data; another is needed if // filtered number of bins is too few if (verbose) - cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + std::cerr << "EXITING: INSUFFICIENT DATA" << '\n'; if (!summary_file.empty()) write_empty_summary(summary_file); return EXIT_SUCCESS; @@ -1272,24 +1284,24 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) if (n_replicates > 1) { bool need_to_adjust_bins = false; - const size_t n_bins_0 = bins[0].size(); - for (size_t i = 1; i < n_replicates && !need_to_adjust_bins; ++i) - if (bins[i].size() != n_bins_0) + const std::size_t n_bins_0 = std::size(bins[0]); + for (std::size_t i = 1; i < n_replicates && !need_to_adjust_bins; ++i) + if (std::size(bins[i]) != n_bins_0) need_to_adjust_bins = true; if (need_to_adjust_bins) { - vector all_bins; + std::vector all_bins; get_union_of_bins(bins, all_bins); - for (size_t i = 0; i < bins.size(); ++i) + for (std::size_t i = 0; i < std::size(bins); ++i) add_missing_bins(all_bins, bins[i], meth[i]); } } // separate regions by chrom and desert; eliminate isolated Bins - vector reset_points; - vector dists_btwn_bins; + std::vector reset_points; + std::vector dists_btwn_bins; if (verbose) - cerr << "[separating by CpG desert]" << '\n'; + std::cerr << "[separating by CpG desert]" << '\n'; separate_regions(desert_size, bins, meth, reads, reset_points, dists_btwn_bins); if (size(bins[0]) < min_observations_for_inference) @@ -1299,32 +1311,31 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) // ADS: final check for sufficient data failed; too few bins // after filtering if (verbose) - cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + std::cerr << "EXITING: INSUFFICIENT DATA" << '\n'; if (!summary_file.empty()) write_empty_summary(summary_file); return EXIT_SUCCESS; } if (verbose) - cerr << "bins retained: " << std::size(bins[0]) << '\n' - << "number of distances between: " << std::size(dists_btwn_bins) - << '\n' - << "deserts removed: " << size(reset_points) - 2 << '\n'; + std::cerr << "bins retained: " << std::size(bins[0]) << '\n' + << "number of distances between: " << std::size(dists_btwn_bins) + << '\n' + << "deserts removed: " << size(reset_points) - 2 << '\n'; /****************** Read in params *****************/ - vector start_trans(2, 0.5), end_trans(2, 1e-10); - vector> trans(2, vector(2, 0.01)); - trans[0][0] = trans[1][1] = 0.99; + auto start_trans = std::vector(2, default_start_transition); + auto end_trans = std::vector(2, default_end_transition); + auto trans = init_trans(); const TwoStateHMM hmm(min_prob, tolerance, max_iterations, verbose, DEBUG); - vector reps_fg_alpha(n_replicates, 0.05); - vector reps_fg_beta(n_replicates, 0.95); - vector reps_bg_alpha(n_replicates, 0.95); - vector reps_bg_beta(n_replicates, 0.05); - double score_cutoff_for_fdr = num_lim::max(); - + std::vector reps_fg_alpha(n_replicates, default_fg_alpha); + std::vector reps_fg_beta(n_replicates, default_fg_beta); + std::vector reps_bg_alpha(n_replicates, default_bg_alpha); + std::vector reps_bg_beta(n_replicates, default_bg_beta); + double score_cutoff_for_fdr = std::numeric_limits::max(); if (!params_in_file.empty()) { // read parameters files - for (size_t i = 0; i < n_replicates; ++i) + for (auto i = 0u; i < n_replicates; ++i) read_params_file(verbose, params_in_file[i], reps_fg_alpha[i], reps_fg_beta[i], reps_bg_alpha[i], reps_bg_beta[i], start_trans, trans, end_trans, score_cutoff_for_fdr); @@ -1346,14 +1357,14 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) /***********************************/ if (!posteriors_out_prefix.empty()) { - vector into_scores; + std::vector into_scores; hmm.TransitionPosteriors_rep(meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, reps_fg_beta, reps_bg_alpha, reps_bg_beta, array_status, 2, into_scores); write_posteriors_file(posteriors_out_prefix + ".intoTrans", bins, into_scores); - vector outof_scores; + std::vector outof_scores; hmm.TransitionPosteriors_rep(meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, reps_fg_beta, reps_bg_alpha, reps_bg_beta, array_status, 1, @@ -1362,8 +1373,8 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) outof_scores); } - vector classes; - vector scores; + std::vector classes; + std::vector scores; hmm.PosteriorDecoding_rep(meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, reps_fg_beta, reps_bg_alpha, reps_bg_beta, classes, scores, array_status); @@ -1372,38 +1383,42 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) write_posteriors_file(posteriors_out_prefix + ".posteriors", bins, scores); - vector domain_scores; + std::vector domain_scores; get_domain_scores(classes, meth, reset_points, domain_scores); if (verbose) - cerr << "[RANDOMIZING SCORES FOR FDR]" << '\n'; + std::cerr << "[RANDOMIZING SCORES FOR FDR]" << '\n'; - vector random_scores; + std::vector random_scores; shuffle_bins(rng_seed, hmm, meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, reps_fg_beta, reps_bg_alpha, reps_bg_beta, random_scores, array_status); - vector p_values; + std::vector p_values; assign_p_values(random_scores, domain_scores, p_values); - if (score_cutoff_for_fdr == num_lim::max() && !p_values.empty()) - score_cutoff_for_fdr = get_score_cutoff_for_fdr(p_values, 0.01); - + if (score_cutoff_for_fdr == std::numeric_limits::max() && + !p_values.empty()) { + static constexpr auto default_p_value_cutoff{0.01}; + score_cutoff_for_fdr = + get_score_cutoff_for_fdr(p_values, default_p_value_cutoff); + } if (!params_out_file.empty()) { - ofstream out(params_out_file, std::ios::app); - out << "FDR_CUTOFF\t" << std::setprecision(30) << score_cutoff_for_fdr - << '\n'; + std::ofstream out(params_out_file, std::ios::app); + if (!out) + throw std::runtime_error("failed to open: " + params_out_file); + out << "FDR_CUTOFF\t" << score_cutoff_for_fdr << '\n'; } - vector domains; + std::vector domains; build_domains(bins[0], reset_points, classes, domains); - size_t good_pmd_count = 0; - vector good_domains; - for (size_t i = 0; i < domains.size(); ++i) + std::size_t good_pmd_count = 0; + std::vector good_domains; + for (std::size_t i = 0; i < std::size(domains); ++i) if (p_values[i] < score_cutoff_for_fdr) { good_domains.push_back(domains[i]); - good_domains.back().set_name("PMD" + to_string(good_pmd_count++)); + good_domains.back().name = "PMD" + std::to_string(good_pmd_count++); } optimize_boundaries(bin_size, cpgs_file, good_domains, array_status, @@ -1411,25 +1426,23 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) reps_bg_beta); if (!summary_file.empty()) { - ofstream summary_out(summary_file); + std::ofstream summary_out(summary_file); if (!summary_out) - throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary(good_domains).tostring() << '\n'; + throw std::runtime_error("failed to open: " + summary_file); + summary_out << to_string(pmd_summary(good_domains)) << '\n'; } - ofstream of; - if (!outfile.empty()) - of.open(outfile); - ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); - - copy(begin(good_domains), end(good_domains), - std::ostream_iterator(out, "\n")); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open: " + outfile); + std::copy(std::cbegin(good_domains), std::cend(good_domains), + std::ostream_iterator(out, "\n")); } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/analysis/roimethstat.cpp b/src/analysis/roimethstat.cpp index 120030c6..ab393c70 100644 --- a/src/analysis/roimethstat.cpp +++ b/src/analysis/roimethstat.cpp @@ -15,7 +15,7 @@ * more details. */ -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "LevelsCounter.hpp" #include "MSite.hpp" #include "OptionParser.hpp" @@ -39,39 +39,11 @@ #include #include -// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) - -[[nodiscard]] static std::string -format_levels_counter(const LevelsCounter &lc) { - // ... - // (7) weighted mean methylation - // (8) unweighted mean methylation - // (9) fractional methylation - // (10) number of sites in the region - // (11) number of sites covered at least once - // (12) number of observations in reads indicating methylation - // (13) total number of observations from reads in the region - std::ostringstream oss; - // clang-format off - oss << lc.mean_meth_weighted() << '\t' - << lc.mean_meth() << '\t' - << lc.fractional_meth() << '\t' - << lc.total_sites << '\t' - << lc.sites_covered << '\t' - << lc.total_c << '\t' - << lc.coverage(); - // clang-format on - return oss.str(); -} - -struct genomic_interval { - std::string chrom{}; - std::uint64_t start_pos{}; - std::uint64_t end_pos{}; -}; +// NOLINTBEGIN(*-narrowing-conversions) static void update(LevelsCounter &lc, const xcounts_entry &xse) { + static constexpr auto one_half = 0.5; const std::uint64_t n_reads = xse.n_meth + xse.n_unmeth; if (n_reads > 0) { ++lc.sites_covered; @@ -80,37 +52,38 @@ update(LevelsCounter &lc, const xcounts_entry &xse) { lc.total_t += xse.n_unmeth; const auto meth = static_cast(xse.n_unmeth) / n_reads; lc.total_meth += meth; - double lower = 0.0, upper = 0.0; + double lower{}; + double upper{}; wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper); - lc.called_meth += (lower > 0.5); - lc.called_unmeth += (upper < 0.5); + lc.called_meth += (lower > one_half); + lc.called_unmeth += (upper < one_half); } ++lc.total_sites; } static void process_chrom(const bool report_more_info, const char level_code, - const std::vector &intervals, + const std::vector &intervals, const std::vector &sites, std::ostream &out) { std::uint64_t j = 0; for (auto i = 0ul; i < std::size(intervals); ++i) { - while (j < std::size(sites) && sites[j].pos < intervals[i].get_start()) + while (j < std::size(sites) && sites[j].pos < intervals[i].start) ++j; - LevelsCounter lc; - while (j < std::size(sites) && sites[j].pos < intervals[i].get_end()) + while (j < std::size(sites) && sites[j].pos < intervals[i].stop) update(lc, sites[j++]); - GenomicRegion r(intervals[i]); - r.set_score(level_code == 'w' ? lc.mean_meth_weighted() - : (level_code == 'u' ? lc.mean_meth() - : lc.fractional_meth())); - r.set_name(r.get_name() + "_" + - std::to_string((level_code == 'w' - ? lc.coverage() - : (level_code == 'u' ? lc.sites_covered - : lc.total_called())))); - out << r; + Interval6 interval(intervals[i]); + interval.score = + level_code == 'w' + ? lc.mean_meth_weighted() + : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); + interval.name += + "_" + std::to_string( + level_code == 'w' + ? lc.coverage() + : (level_code == 'u' ? lc.sites_covered : lc.total_called())); + out << to_string(interval); if (report_more_info) out << '\t' << format_levels_counter(lc); out << '\n'; @@ -119,11 +92,11 @@ process_chrom(const bool report_more_info, const char level_code, static void process_chrom(const bool report_more_info, - const std::vector &intervals, std::ostream &out) { + const std::vector &intervals, std::ostream &out) { LevelsCounter lc; const std::string lc_formatted = format_levels_counter(lc); for (const auto &r : intervals) { - out << r; + out << to_string(r); if (report_more_info) out << '\t' << lc_formatted; out << '\n'; @@ -133,23 +106,23 @@ process_chrom(const bool report_more_info, static void process_from_xcounts(const std::uint32_t n_threads, const bool report_more_info, const char level_code, const std::string &xsym_file, - const std::vector &intervals_in, + const std::vector &intervals_in, std::ostream &out) { const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xsym_file); - // const auto intervals = get_GenomicRegions(intervals_file); + // const auto intervals = get_Interval6s(intervals_file); - std::vector> intervals_by_chrom; + std::vector> intervals_by_chrom; std::string prev_chrom; for (auto i = 0u; i < std::size(intervals_in); ++i) { - if (intervals_in[i].get_chrom() != prev_chrom) { - intervals_by_chrom.push_back(std::vector()); - prev_chrom = intervals_in[i].get_chrom(); + if (intervals_in[i].chrom != prev_chrom) { + intervals_by_chrom.push_back(std::vector()); + prev_chrom = intervals_in[i].chrom; } intervals_by_chrom.back().push_back(intervals_in[i]); } for (const auto &intervals : intervals_by_chrom) { - const auto chrom_name = intervals.front().get_chrom(); + const auto chrom_name = intervals.front().chrom; const auto sites = sites_by_chrom.find(chrom_name); if (sites != std::cend(sites_by_chrom)) process_chrom(report_more_info, level_code, intervals, sites->second, @@ -160,12 +133,11 @@ process_from_xcounts(const std::uint32_t n_threads, const bool report_more_info, } [[nodiscard]] bool -cmp_within_chrom(const GenomicRegion &r1, const GenomicRegion &r2) { - return r1.get_start() < r2.get_start() || - (r1.get_start() == r2.get_start() && - (r1.get_end() < r2.get_end() || - (r1.get_end() == r2.get_end() && - (r1.get_strand() < r2.get_strand())))); +cmp_within_chrom(const Interval6 &r1, const Interval6 &r2) { + return r1.start < r2.start || + (r1.start == r2.start && + (r1.stop < r2.stop || + (r1.stop == r2.stop && (r1.strand < r2.strand)))); } [[nodiscard]] bool @@ -181,8 +153,8 @@ get_chrom_order(const T &order, const MSite &s) { template [[nodiscard]] typename T::const_iterator -get_chrom_order(const T &order, const GenomicRegion &r) { - return order.find(r.get_chrom()); +get_chrom_order(const T &order, const Interval6 &r) { + return order.find(r.chrom); } struct cmp_chrom_order { @@ -248,9 +220,9 @@ region_bounds(const std::unordered_map &chrom_order, // that region is on the negative strand const char // strand(region.get_strand()); - const std::string chrom(region.get_chrom()); - const MSite a(chrom, region.get_start(), '+', "", 0, 0); - const MSite b(chrom, region.get_end(), '+', "", 0, 0); + const std::string chrom(region.chrom); + const MSite a(chrom, region.start, '+', "", 0, 0); + const MSite b(chrom, region.stop, '+', "", 0, 0); // ADS: This function seems like std::equal_range but both elements // below use "lower_bound" because "b" is strictly following valid @@ -292,10 +264,10 @@ read_sites(const std::string &filename) { static void process_preloaded( - const bool VERBOSE, const bool report_more_info, const char level_code, + const bool verbose, const bool report_more_info, const char level_code, const std::string &sites_file, const std::unordered_map &chrom_order, - const std::vector ®ions, std::ostream &out) { + const std::vector ®ions, std::ostream &out) { const auto sites = read_sites(sites_file); if (sites.empty()) throw std::runtime_error("failed to read sites: " + sites_file); @@ -303,7 +275,7 @@ process_preloaded( if (!is_sorted_within_chrom(sites)) throw std::runtime_error("sites not sorted: " + sites_file); - if (VERBOSE) + if (verbose) std::cerr << "[n_sites=" << std::size(sites) << "]\n"; for (const auto &r : regions) { @@ -317,9 +289,9 @@ process_preloaded( level_code == 'w' ? lc.mean_meth_weighted() : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); - GenomicRegion r_scored{r}; - r_scored.set_score(score); - out << r_scored; + Interval6 r_scored{r}; + r_scored.score = score; + out << to_string(r_scored); if (report_more_info) out << '\t' << format_levels_counter(lc); out << '\n'; @@ -347,15 +319,15 @@ at_or_after(const MSite &site, const std::string &chrom, [[nodiscard]] static LevelsCounter calc_site_stats( - std::ifstream &sites_in, const GenomicRegion ®ion, + std::ifstream &sites_in, const Interval6 ®ion, const std::unordered_map &chrom_order) { - const std::string chrom{region.get_chrom()}; + const std::string chrom{region.chrom}; const auto chrom_itr = chrom_order.find(chrom); if (chrom_itr == std::cend(chrom_order)) throw std::runtime_error("lookup failure: " + chrom); const auto chrom_idx = chrom_itr->second; - const auto start_pos = region.get_start(); - const auto end_pos = region.get_end(); + const auto start_pos = region.start; + const auto end_pos = region.stop; find_offset_for_msite(chrom_order, chrom, start_pos, sites_in); LevelsCounter lc; @@ -375,7 +347,7 @@ process_on_disk( const bool report_more_info, const char level_code, const std::string &sites_file, const std::unordered_map &chrom_order, - const std::vector ®ions, std::ostream &out) { + const std::vector ®ions, std::ostream &out) { std::ifstream in(sites_file); if (!in) throw std::runtime_error("failed to open file: " + sites_file); @@ -386,9 +358,9 @@ process_on_disk( level_code == 'w' ? lc.mean_meth_weighted() : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); - GenomicRegion r{region}; - r.set_score(score); - out << r; + Interval6 r{region}; + r.score = score; + out << to_string(r); if (report_more_info) out << '\t' << format_levels_counter(lc); out << '\n'; @@ -399,22 +371,19 @@ process_on_disk( get_bed_columns(const std::string ®ions_file) { std::ifstream in(regions_file); if (!in) - throw std::runtime_error("failed to open file: " + regions_file); - + throw std::runtime_error("cannot open file: " + regions_file); std::string line; - std::getline(in, line); - + getline(in, line); std::istringstream iss(line); std::string token; std::size_t n_columns = 0; while (iss >> token) ++n_columns; - return n_columns; } int -main_roimethstat(int argc, char *argv[]) { +main_roimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static const std::string description = R"""( Compute average site methylation levels in each interval from a given @@ -430,10 +399,13 @@ Columns (beyond the first 6) in the BED format output: (12) number of observations in reads indicating methylation (13) total number of observations from reads in the region )"""; + constexpr auto valid_n_cols = [](const auto n_cols) { + return n_cols != 3 && n_cols < 6; // NOLINT(*-avoid-magic-numbers) + }; static const std::string default_name_prefix = "X"; - bool VERBOSE = false; + bool verbose = false; bool print_numeric_only = false; bool preload = false; bool report_more_info = false; @@ -446,11 +418,10 @@ Columns (beyond the first 6) in the BED format output: std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(argv[0], description, - " "); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', "Name of output file (default: stdout)", - false, outfile); + opt_parse.add_opt("output", 'o', "Name of output file", true, outfile); opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", false, print_numeric_only); opt_parse.add_opt("preload", 'L', "load all site sites", false, preload); @@ -467,7 +438,7 @@ Columns (beyond the first 6) in the BED format output: opt_parse.add_opt("relaxed", '\0', "input has extra fields (used for nanopore)", false, allow_extra_fields); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { @@ -509,27 +480,25 @@ Columns (beyond the first 6) in the BED format output: for (const auto &i : get_chroms(sites_file)) chrom_order.emplace(i, std::size(chrom_order)); - if (VERBOSE) + if (verbose) std::cerr << "loading regions" << '\n'; if (!std::filesystem::is_regular_file(regions_file)) // otherwise we could not read the file twice throw std::runtime_error("regions file must be regular file"); - // MAGIC: below allow for ==3 or >=6 columns in the bed format const auto n_columns = get_bed_columns(regions_file); - if (n_columns != 3 && n_columns < 6) + if (valid_n_cols(n_columns)) throw std::runtime_error("format must be 3 or 6+ column bed: " + regions_file); if (is_msite_file(regions_file)) throw std::runtime_error("seems to be a counts file: " + regions_file + "\ncheck order of input arguments"); - std::vector regions; - ReadBEDFile(regions_file, regions); + auto regions = read_intervals6(regions_file); if (!std::is_sorted(std::begin(regions), std::end(regions))) { if (sort_data_if_needed) { - if (VERBOSE) + if (verbose) std::cerr << "sorting regions\n"; std::sort(std::begin(regions), std::end(regions), cmp_chrom_order(chrom_order)); @@ -543,24 +512,20 @@ Columns (beyond the first 6) in the BED format output: // be empty and the output format will be wrong if (n_columns == 3) for (auto i = 0u; i < std::size(regions); ++i) - regions[i].set_name(default_name_prefix + std::to_string(i)); + regions[i].name = default_name_prefix + std::to_string(i); - if (VERBOSE) + if (verbose) std::cerr << "[n_regions=" << std::size(regions) << "]\n"; - std::ofstream of; - if (!outfile.empty()) { - of.open(outfile); - if (!of) - throw std::runtime_error("failed to open outfile: " + outfile); - } - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open outfile: " + outfile); if (is_xcounts) process_from_xcounts(n_threads, report_more_info, level_code[0], sites_file, regions, out); else if (preload) - process_preloaded(VERBOSE, report_more_info, level_code[0], sites_file, + process_preloaded(verbose, report_more_info, level_code[0], sites_file, chrom_order, regions, out); else process_on_disk(report_more_info, level_code[0], sites_file, chrom_order, @@ -573,4 +538,4 @@ Columns (beyond the first 6) in the BED format output: return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/common/Interval.hpp b/src/common/Interval.hpp index ee1fa02f..b85fda82 100644 --- a/src/common/Interval.hpp +++ b/src/common/Interval.hpp @@ -20,6 +20,7 @@ #include // #include // ADS: needs c++20 +#include #include // std::size #include #include @@ -32,9 +33,7 @@ struct Interval { Interval() = default; Interval(const std::string &chrom, const std::uint32_t start, - const std::uint32_t stop) : - chrom{chrom}, - start{start}, stop{stop} {} + const std::uint32_t stop) : chrom{chrom}, start{start}, stop{stop} {} explicit Interval(const std::string &line) { if (!initialize(line.data(), line.data() + std::size(line))) @@ -43,17 +42,27 @@ struct Interval { auto initialize(const char *, const char *) -> bool; - auto + [[nodiscard]] auto operator<(const Interval &rhs) const { return (chrom < rhs.chrom || (chrom == rhs.chrom && (start < rhs.start || (start == rhs.start && stop < rhs.stop)))); } + [[nodiscard]] auto + operator==(const Interval &rhs) const { + return chrom == rhs.chrom && start == rhs.start && stop < rhs.stop; + } + // auto // operator<=>(const Interval &) const = default; }; +inline auto +operator<<(std::ostream &os, const Interval &x) -> std::ostream & { + return os << x.chrom << "\t" << x.start << "\t" << x.stop; +} + [[nodiscard]] inline auto to_string(const Interval &x) -> std::string { return x.chrom + "\t" + std::to_string(x.start) + "\t" + diff --git a/src/common/Interval6.hpp b/src/common/Interval6.hpp index 3b84e213..94eed1f3 100644 --- a/src/common/Interval6.hpp +++ b/src/common/Interval6.hpp @@ -21,6 +21,7 @@ #include // #include // ADS: needs c++20 #include // std::size +#include #include #include #include @@ -37,8 +38,8 @@ struct Interval6 { Interval6(const std::string &chrom, const std::uint32_t start, const std::uint32_t stop, const std::string &name, const double score, const char strand) : - chrom{chrom}, - start{start}, stop{stop}, name{name}, score{score}, strand{strand} {} + chrom{chrom}, start{start}, stop{stop}, name{name}, score{score}, + strand{strand} {} explicit Interval6(const std::string &line) { if (!initialize(line.data(), line.data() + std::size(line))) @@ -58,11 +59,20 @@ struct Interval6 { // operator<=>(const Interval6 &) const = default; }; +inline auto +operator<<(std::ostream &os, const Interval6 &x) -> std::ostream & { + return os << x.chrom << "\t" << x.start << "\t" << x.stop << "\t" << x.name + << "\t" << x.score << "\t" << x.strand; +} + [[nodiscard]] inline auto to_string(const Interval6 &x) -> std::string { - return x.chrom + "\t" + std::to_string(x.start) + "\t" + - std::to_string(x.stop) + "\t" + x.name + "\t" + - std::to_string(x.score) + "\t" + std::string(1, x.strand); + std::ostringstream oss; + oss << x; + return oss.str(); + // return x.chrom + "\t" + std::to_string(x.start) + "\t" + + // std::to_string(x.stop) + "\t" + x.name + "\t" + + // std::to_string(x.score) + "\t" + std::string(1, x.strand); } // ADS: need to bump to c++20 for this diff --git a/src/common/LevelsCounter.cpp b/src/common/LevelsCounter.cpp index 92004c45..a9f6103e 100644 --- a/src/common/LevelsCounter.cpp +++ b/src/common/LevelsCounter.cpp @@ -21,6 +21,23 @@ #include #include +auto +format_levels_counter(const LevelsCounter &lc) -> std::string { + std::ostringstream oss; + // clang-format off + oss << lc.mean_meth_weighted() << '\t' + << lc.mean_meth() << '\t' + << lc.fractional_meth() << '\t' + << lc.total_sites << '\t' + << lc.sites_covered << '\t' + << lc.total_c << '\t' + << (lc.total_c + lc.total_t) << '\t' + << lc.total_meth << '\t' + << lc.sites_covered; + // clang-format on + return oss.str(); +} + void LevelsCounter::update(const MSite &s) { static constexpr auto half = 0.5; @@ -43,8 +60,8 @@ LevelsCounter::update(const MSite &s) { ++total_sites; } -std::string -LevelsCounter::tostring() const { +auto +LevelsCounter::tostring() const -> std::string { static const std::string indent = std::string(2, ' '); const bool good = (sites_covered != 0); std::ostringstream oss; @@ -74,8 +91,8 @@ LevelsCounter::tostring() const { return oss.str(); } -std::string -LevelsCounter::format_row() const { +auto +LevelsCounter::format_row() const -> std::string { const bool good = (sites_covered > 0); std::ostringstream oss; // directly counted values @@ -101,8 +118,8 @@ LevelsCounter::format_row() const { return oss.str(); } -std::string -LevelsCounter::format_header() { +auto +LevelsCounter::format_header() -> std::string { std::ostringstream oss; // directly counted values oss << "01. total_sites" << '\n' @@ -127,8 +144,8 @@ LevelsCounter::format_header() { double LevelsCounter::alpha = 0.05; // NOLINT(*-avoid-magic-numbers) -std::ostream & -operator<<(std::ostream &out, const LevelsCounter &cs) { +auto +operator<<(std::ostream &out, const LevelsCounter &cs) -> std::ostream & { return out << cs.tostring(); } @@ -139,8 +156,8 @@ check_label(const std::string &observed, const std::string &expected) { "]"); } -std::istream & -operator>>(std::istream &in, LevelsCounter &cs) { +auto +operator>>(std::istream &in, LevelsCounter &cs) -> std::istream & { in >> cs.context; // get the context cs.context = cs.context.substr(0, cs.context.find_first_of(':')); @@ -175,8 +192,8 @@ operator>>(std::istream &in, LevelsCounter &cs) { return in; } -LevelsCounter & -LevelsCounter::operator+=(const LevelsCounter &rhs) { +auto +LevelsCounter::operator+=(const LevelsCounter &rhs) -> LevelsCounter & { total_sites += rhs.total_sites; sites_covered += rhs.sites_covered; max_depth += rhs.max_depth; diff --git a/src/common/LevelsCounter.hpp b/src/common/LevelsCounter.hpp index f6c24e75..cb55c7e0 100644 --- a/src/common/LevelsCounter.hpp +++ b/src/common/LevelsCounter.hpp @@ -20,6 +20,7 @@ #include #include #include +#include struct MSite; struct LevelsCounter { @@ -72,51 +73,51 @@ struct LevelsCounter { // coverage is equal to total_c plus total_t. These are the counts that // contribute towards estimates of the various methylation levels. - [[nodiscard]] std::uint64_t - coverage() const { + [[nodiscard]] auto + coverage() const -> std::uint64_t { return total_c + total_t; } // mean_depth is the average number of reads contributing to methylation // calls over all sites. - [[nodiscard]] double - mean_depth() const { + [[nodiscard]] auto + mean_depth() const -> double { return static_cast(coverage()) / total_sites; } // mean_depth_covered is the average number of reads contributing to // methylation calls over all sites that are covered at least once. - [[nodiscard]] double - mean_depth_covered() const { + [[nodiscard]] auto + mean_depth_covered() const -> double { return static_cast(coverage()) / sites_covered; } // sites_covered_frac is the fraction of all sites that are covered at least // once. - [[nodiscard]] double - sites_covered_frac() const { + [[nodiscard]] auto + sites_covered_frac() const -> double { return static_cast(sites_covered) / total_sites; } // total_called is equal to called_meth plus called_unmeth - [[nodiscard]] std::uint64_t - total_called() const { + [[nodiscard]] auto + total_called() const -> std::uint64_t { return called_meth + called_unmeth; } // mean_meth_weighted is the unweighted mean methylation level. This // is the ratio of total_c divided by coverage. This value is always // between 0 and 1. - [[nodiscard]] double - mean_meth_weighted() const { + [[nodiscard]] auto + mean_meth_weighted() const -> double { return static_cast(total_c) / std::max(coverage(), static_cast(1)); } // mean_meth is the unweighted mean methylation level. This is the ratio of // total_meth divided by sites_covered. Value is always between 0 and 1. - [[nodiscard]] double - mean_meth() const { + [[nodiscard]] auto + mean_meth() const -> double { return static_cast(total_meth) / std::max(sites_covered, static_cast(1)); } @@ -124,36 +125,39 @@ struct LevelsCounter { // fractional_meth is the fraction of "called" sites that are called // methylated. It is the ratio of called_meth divided by total_called. This // value is always between 0 and 1. - [[nodiscard]] double - fractional_meth() const { + [[nodiscard]] auto + fractional_meth() const -> double { return static_cast(called_meth) / std::max(total_called(), static_cast(1)); } - explicit LevelsCounter(const std::string &c) : context{c} {} + explicit LevelsCounter(std::string c) : context{std::move(c)} {} LevelsCounter() = default; - LevelsCounter & - operator+=(const LevelsCounter &rhs); + auto + operator+=(const LevelsCounter &rhs) -> LevelsCounter &; void update(const MSite &s); - [[nodiscard]] std::string - tostring() const; - [[nodiscard]] std::string - format_row() const; - [[nodiscard]] static std::string - format_header(); + [[nodiscard]] auto + tostring() const -> std::string; + [[nodiscard]] auto + format_row() const -> std::string; + [[nodiscard]] static auto + format_header() -> std::string; static double alpha; }; -std::ostream & -operator<<(std::ostream &out, const LevelsCounter &cs); +auto +format_levels_counter(const LevelsCounter &lc) -> std::string; -std::istream & -operator>>(std::istream &in, LevelsCounter &cs); +auto +operator<<(std::ostream &out, const LevelsCounter &cs) -> std::ostream &; + +auto +operator>>(std::istream &in, LevelsCounter &cs) -> std::istream &; #endif diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index 053addc0..741ee59b 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -1,45 +1,31 @@ -/* - Copyright (C) 2015-2022 University of Southern California - Authors: Andrew D. Smith and Masaru Nakajima - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. -*/ +/* Copyright (C) 2015-2025 Andrew D. Smith + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ #include "MSite.hpp" - -#include "bamxx.hpp" #include "counts_header.hpp" +#include + #include #include #include #include -#include #include #include #include #include #include -using std::cbegin; -using std::cend; -using std::end; -using std::find_if; -using std::from_chars; -using std::ifstream; -using std::ios_base; -using std::regex_match; -using std::runtime_error; -using std::string; - bool MSite::no_extra_fields = true; bool @@ -51,42 +37,42 @@ MSite::initialize(const char *c, const char *c_end) { // NOLINTBEGIN(*-pointer-arithmetic) auto field_s = c; - auto field_e = find_if(field_s + 1, c_end, is_sep); + auto field_e = std::find_if(field_s + 1, c_end, is_sep); if (field_e == c_end) failed = true; { const std::uint32_t d = std::distance(field_s, field_e); - chrom = string{field_s, d}; + chrom = std::string{field_s, d}; } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e == c_end); { - const auto [ptr, ec] = from_chars(field_s, field_e, pos); + const auto [ptr, ec] = std::from_chars(field_s, field_e, pos); failed = failed || (ptr == field_s); } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e != field_s + 1 || field_e == c_end); strand = *field_s; failed = failed || (strand != '-' && strand != '+'); - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e == c_end); { const std::uint32_t d = std::distance(field_s, field_e); - context = string{field_s, d}; + context = std::string{field_s, d}; } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e == c_end); { @@ -94,15 +80,15 @@ MSite::initialize(const char *c, const char *c_end) { const int ret = std::sscanf(field_s, "%lf", &meth); failed = failed || (ret < 1); #else - const auto [ptr, ec] = from_chars(field_s, field_e, meth); + const auto [ptr, ec] = std::from_chars(field_s, field_e, meth); failed = failed || (ptr == field_s); #endif } - field_s = find_if(field_e + 1, c_end, not_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); { - const auto [ptr, ec] = from_chars(field_s, c_end, n_reads); + const auto [ptr, ec] = std::from_chars(field_s, c_end, n_reads); failed = failed || (no_extra_fields && ptr != c_end); } // ADS: the value below would work for a flag @@ -113,19 +99,19 @@ MSite::initialize(const char *c, const char *c_end) { return !failed; } -MSite::MSite(const string &line) { +MSite::MSite(const std::string &line) { // NOLINTNEXTLINE(*-pointer-arithmetic) if (!initialize(line.data(), line.data() + std::size(line))) - throw runtime_error("bad count line: " + line); + throw std::runtime_error("bad count line: " + line); } MSite::MSite(const char *line, const int n) { // NOLINTNEXTLINE(*-pointer-arithmetic) if (!initialize(line, line + n)) - throw runtime_error("bad count line: " + string(line)); + throw std::runtime_error("bad count line: " + std::string(line)); } -string +std::string MSite::tostring() const { std::ostringstream oss; oss << chrom << '\t' << pos << '\t' << strand << '\t' << context << '\t' @@ -140,7 +126,7 @@ distance(const MSite &a, const MSite &b) { } static void -move_to_start_of_line(ifstream &in) { +move_to_start_of_line(std::ifstream &in) { char next{}; // move backwards by: one step forward, two steps back while (in.good() && in.get(next) && next != '\n') { @@ -156,34 +142,34 @@ move_to_start_of_line(ifstream &in) { void find_offset_for_msite(const std::string &chr, const size_t idx, std::ifstream &site_in) { - site_in.seekg(0, ios_base::beg); + site_in.seekg(0, std::ios_base::beg); const auto begin_pos = site_in.tellg(); - site_in.seekg(0, ios_base::end); + site_in.seekg(0, std::ios_base::end); const auto end_pos = site_in.tellg(); if (end_pos - begin_pos < 2) - throw runtime_error("empty counts file"); + throw std::runtime_error("empty counts file"); std::streamoff step_size = (end_pos - begin_pos) / 2; - site_in.seekg(0, ios_base::beg); - string low_chr; - size_t low_idx = 0; + site_in.seekg(0, std::ios_base::beg); + std::string low_chr; + size_t low_idx{}; if (!(site_in >> low_chr >> low_idx)) - throw runtime_error("failed search in counts file"); + throw std::runtime_error("failed search in counts file"); // MAGIC: need the -2 here to get past the EOF and possibly a '\n' - site_in.seekg(-2, ios_base::end); + site_in.seekg(-2, std::ios_base::end); move_to_start_of_line(site_in); - string high_chr; + std::string high_chr; size_t high_idx{}; if (!(site_in >> high_chr >> high_idx)) - throw runtime_error("failed search in counts file"); + throw std::runtime_error("failed search in counts file"); std::streamoff pos = step_size; - site_in.seekg(pos, ios_base::beg); + site_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(site_in); - std::streamoff prev_pos = 0; // keep track of previous position in file + std::streamoff prev_pos{}; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -191,88 +177,89 @@ find_offset_for_msite(const std::string &chr, const size_t idx, // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx = 0; // position at mid point + std::string mid_chr; // chromosome name at mid point + size_t mid_idx{}; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) - throw runtime_error("failed navigating inside file"); + throw std::runtime_error("failed navigating inside file"); // this check will never give a false indication of unsorted, but // might catch an unsorted file if (mid_chr < low_chr || mid_chr > high_chr) { // NOLINTBEGIN(performance-inefficient-string-concatenation) - throw runtime_error("chromosomes unsorted inside file: " - "low=" + - low_chr + ",mid=" + mid_chr + ",high=" + high_chr); + throw std::runtime_error("chromosomes unsorted inside file: " + "low=" + + low_chr + ",mid=" + mid_chr + + ",high=" + high_chr); // NOLINTEND(performance-inefficient-string-concatenation) } step_size /= 2; // cut the range in half if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { // move to the left - std::swap(mid_chr, high_chr); - std::swap(mid_idx, high_idx); + std::swap(high_chr, mid_chr); + std::swap(high_idx, mid_idx); pos -= step_size; } else { // move to the left - std::swap(mid_chr, low_chr); - std::swap(mid_idx, low_idx); + std::swap(low_chr, mid_chr); + std::swap(low_idx, mid_idx); pos += step_size; } - site_in.seekg(pos, ios_base::beg); + site_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(site_in); } } void find_offset_for_msite( - const std::unordered_map &chrom_order, + const std::unordered_map &chrom_order, const std::string &chr, const size_t idx, std::ifstream &site_in) { - site_in.seekg(0, ios_base::beg); + site_in.seekg(0, std::ios_base::beg); const auto begin_pos = site_in.tellg(); - site_in.seekg(0, ios_base::end); + site_in.seekg(0, std::ios_base::end); const auto end_pos = site_in.tellg(); if (end_pos - begin_pos < 2) - throw runtime_error("empty counts file"); + throw std::runtime_error("empty counts file"); std::streamoff step_size = (end_pos - begin_pos) / 2; const auto chr_idx_itr = chrom_order.find(chr); - if (chr_idx_itr == end(chrom_order)) { - site_in.seekg(0, ios_base::end); + if (chr_idx_itr == std::cend(chrom_order)) { + site_in.seekg(0, std::ios_base::end); return; } const auto chr_idx = chr_idx_itr->second; - site_in.seekg(0, ios_base::beg); - string low_chr; + site_in.seekg(0, std::ios_base::beg); + std::string low_chr; size_t low_idx{}; if (!(site_in >> low_chr >> low_idx)) - throw runtime_error("failed search in counts file"); + throw std::runtime_error("failed search in counts file"); const auto low_chr_idx_itr = chrom_order.find(low_chr); - if (low_chr_idx_itr == end(chrom_order)) - throw runtime_error("inconsistent chromosome order info"); + if (low_chr_idx_itr == std::cend(chrom_order)) + throw std::runtime_error("inconsistent chromosome order info"); auto low_chr_idx = low_chr_idx_itr->second; // MAGIC: need the -2 here to get past the EOF and hopefully a '\n' - site_in.seekg(-2, ios_base::end); + site_in.seekg(-2, std::ios_base::end); move_to_start_of_line(site_in); - string high_chr; + std::string high_chr; size_t high_idx{}; if (!(site_in >> high_chr >> high_idx)) - throw runtime_error("failed search in counts file"); + throw std::runtime_error("failed search in counts file"); const auto high_chr_idx_itr = chrom_order.find(high_chr); - if (high_chr_idx_itr == end(chrom_order)) - throw runtime_error("inconsistent chromosome order info"); + if (high_chr_idx_itr == std::cend(chrom_order)) + throw std::runtime_error("inconsistent chromosome order info"); auto high_chr_idx = high_chr_idx_itr->second; std::streamoff pos = step_size; - site_in.seekg(pos, ios_base::beg); + site_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(site_in); - std::streamoff prev_pos = 0; // keep track of previous position in file + std::streamoff prev_pos{}; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -280,26 +267,25 @@ find_offset_for_msite( // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx{}; // position at mid point + std::string mid_chr; // chromosome name at mid point + size_t mid_idx{}; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) - throw runtime_error("failed navigating inside file"); + throw std::runtime_error("failed navigating inside file"); const auto mid_chr_idx_itr = chrom_order.find(mid_chr); - if (mid_chr_idx_itr == end(chrom_order)) - throw runtime_error("inconsistent chromosome order info"); + if (mid_chr_idx_itr == std::cend(chrom_order)) + throw std::runtime_error("inconsistent chromosome order info"); const auto mid_chr_idx = mid_chr_idx_itr->second; // this check will never give a false indication of unsorted, but // might catch an unsorted file - using std::to_string; if (mid_chr_idx < low_chr_idx || mid_chr_idx > high_chr_idx) { // NOLINTBEGIN(performance-inefficient-string-concatenation) - throw runtime_error("chromosomes unsorted inside file: " - "low=" + - to_string(low_chr_idx) + - ",mid=" + to_string(mid_chr_idx) + - ",high=" + to_string(high_chr_idx)); + throw std::runtime_error("chromosomes unsorted inside file: " + "low=" + + std::to_string(low_chr_idx) + + ",mid=" + std::to_string(mid_chr_idx) + + ",high=" + std::to_string(high_chr_idx)); // NOLINTEND(performance-inefficient-string-concatenation) } @@ -317,43 +303,43 @@ find_offset_for_msite( // low_idx = mid_idx; pos += step_size; } - site_in.seekg(pos, ios_base::beg); + site_in.seekg(pos, std::ios_base::beg); move_to_start_of_line(site_in); } } bool -is_msite_line(const string &line) { +is_msite_line(const std::string &line) { std::istringstream iss(line); - string chrom; + std::string chrom; if (!(iss >> chrom)) return false; - int64_t pos = 0; + std::int64_t pos{}; if (!(iss >> pos || pos < 0)) return false; - string strand; + std::string strand; if (!(iss >> strand) || (strand.size() != 1) || ((strand != "+") && (strand != "-"))) return false; - string context; + std::string context; // ADS: below, allowing any context // std::regex pattern("^C[pHWX][GH]$"); if (!(iss >> context)) return false; - double level = 0.0; + double level{}; if (!(iss >> level) || level < 0.0 || level > 1.0) return false; - int64_t n_reads = 0; + std::int64_t n_reads{}; if (!(iss >> n_reads || n_reads < 0)) return false; - string temp; + std::string temp; if (MSite::no_extra_fields && iss >> temp) return false; else @@ -361,12 +347,12 @@ is_msite_line(const string &line) { } bool -is_msite_file(const string &file) { +is_msite_file(const std::string &file) { bamxx::bgzf_file in(file, "r"); if (!in) - throw runtime_error("cannot open file: " + file); + throw std::runtime_error("cannot open file: " + file); - string line; + std::string line; while (getline(in, line) && is_counts_header_line(line)) ; diff --git a/src/common/bsutils.cpp b/src/common/bsutils.cpp index d1e857da..b6dd0aa6 100644 --- a/src/common/bsutils.cpp +++ b/src/common/bsutils.cpp @@ -16,7 +16,7 @@ #include "bsutils.hpp" #include "dnmtools_gaussinv.hpp" -#include +#include "Interval6.hpp" #include #include @@ -47,31 +47,30 @@ wilson_ci_for_binomial(const double alpha, const double n, const double p_hat, } void -adjust_region_ends(const std::vector> &clusters, - std::vector ®ions) { +adjust_region_ends(const std::vector> &clusters, + std::vector ®ions) { assert(std::size(clusters) == std::size(regions)); for (std::size_t i = 0; i < std::size(regions); ++i) { - std::size_t max_pos = regions[i].get_end(); - std::size_t min_pos = regions[i].get_start(); + auto max_pos = regions[i].stop; + auto min_pos = regions[i].start; for (std::size_t j = 0; j < std::size(clusters[i]); ++j) { - max_pos = std::max(clusters[i][j].get_end(), max_pos); - min_pos = std::min(clusters[i][j].get_start(), min_pos); + max_pos = std::max(clusters[i][j].stop, max_pos); + min_pos = std::min(clusters[i][j].start, min_pos); } - regions[i].set_end(max_pos); - regions[i].set_start(min_pos); + regions[i].stop = max_pos; + regions[i].start = min_pos; } } void -relative_sort(const std::vector &mapped_locations, +relative_sort(const std::vector &mapped_locations, const std::vector &names, std::vector &lookup) { std::unordered_map names_map; for (std::size_t i = 0; i < std::size(names); ++i) names_map[names[i]] = i; - for (std::size_t i = 0; i < std::size(mapped_locations); ++i) { - const auto j = names_map.find(mapped_locations[i].get_name()); + const auto j = names_map.find(mapped_locations[i].name); if (j == std::cend(names_map)) throw std::runtime_error("read sequence not found for: " + names[i]); lookup.push_back(j->second); diff --git a/src/common/bsutils.hpp b/src/common/bsutils.hpp index 30aa50d6..7beb5aa6 100644 --- a/src/common/bsutils.hpp +++ b/src/common/bsutils.hpp @@ -21,7 +21,7 @@ #include #include #include -class GenomicRegion; +class Interval6; inline bool is_cytosine(char c) { @@ -54,11 +54,11 @@ is_cpg(const std::string &s, size_t i) { } void -adjust_region_ends(const std::vector> &clusters, - std::vector ®ions); +adjust_region_ends(const std::vector> &clusters, + std::vector ®ions); void -relative_sort(const std::vector &mapped_locations, +relative_sort(const std::vector &mapped_locations, const std::vector &names, std::vector &lookup); diff --git a/src/radmeth/dmr.cpp b/src/radmeth/dmr.cpp index 779a0bb1..b688bd4d 100644 --- a/src/radmeth/dmr.cpp +++ b/src/radmeth/dmr.cpp @@ -1,10 +1,4 @@ -/* dmr: computes DMRs based on HMRs and probability of differences at - * individual CpGs - * - * Copyright (C) 2012-2023 University of Southern California and - * Andrew D. Smith - * - * Authors: Andrew D. Smith +/* Copyright (C) 2012-2025 Andrew D. Smith * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -17,15 +11,17 @@ * General Public License for more details. */ -#include "GenomicRegion.hpp" +[[maybe_unused]] static constexpr auto about = R"( +dmr: compute DMRs based on HMRs and probability of differences at each site +)"; + +#include "Interval6.hpp" #include "MSite.hpp" #include "OptionParser.hpp" -#include "smithlab_utils.hpp" #include #include -#include #include #include #include @@ -33,41 +29,32 @@ #include #include #include +#include #include #include #include #include #include -using std::cerr; -using std::cout; -using std::endl; -using std::find_if; -using std::from_chars; -using std::ifstream; -using std::max; -using std::pair; -using std::runtime_error; -using std::string; -using std::vector; - static bool -parse_methdiff_line(const char *c, const char *c_end, string &chrom, - uint32_t &pos, char &strand, string &context, - double &diffscore, uint32_t &n_meth_a, uint32_t &n_unmeth_a, - uint32_t &n_meth_b, uint32_t &n_unmeth_b) { +parse_methdiff_line(const char *c, const std::size_t c_sz, std::string &chrom, + std::uint32_t &pos, char &strand, std::string &context, + double &diffscore, std::uint32_t &n_meth_a, + std::uint32_t &n_unmeth_a, std::uint32_t &n_meth_b, + std::uint32_t &n_unmeth_b) { constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; // NOLINTBEGIN(*-pointer-arithmetic) + const auto c_end = c + c_sz; auto field_s = c; auto field_e = std::find_if(field_s + 1, c_end, is_sep); bool failed = field_e == c_end; // chromosome name { - const uint32_t d = std::distance(field_s, field_e); - chrom = string{field_s, d}; + const std::uint32_t d = std::distance(field_s, field_e); + chrom = std::string{field_s, d}; } field_s = std::find_if(field_e + 1, c_end, not_sep); @@ -95,8 +82,8 @@ parse_methdiff_line(const char *c, const char *c_end, string &chrom, // context { - const uint32_t d = std::distance(field_s, field_e); - context = string{field_s, d}; + const std::uint32_t d = std::distance(field_s, field_e); + context = std::string{field_s, d}; } field_s = std::find_if(field_e + 1, c_end, not_sep); @@ -158,25 +145,25 @@ parse_methdiff_line(const char *c, const char *c_end, string &chrom, return !failed; } -static vector -read_diffs_file(const string &diffs_file) { +static std::vector +read_diffs_file(const std::string &diffs_file) { bamxx::bgzf_file in(diffs_file, "r"); if (!in) - throw runtime_error("could not open file: " + diffs_file); + throw std::runtime_error("could not open file: " + diffs_file); - string chrom, name; + std::string chrom, name; char strand{}; double diffscore{}; - uint32_t pos{}, meth_a{}, unmeth_a{}, meth_b{}, unmeth_b{}; + std::uint32_t pos{}, meth_a{}, unmeth_a{}, meth_b{}, unmeth_b{}; - vector cpgs; - string line; + std::vector cpgs; + std::string line; while (getline(in, line)) { // NOLINTBEGIN(*-pointer-arithmetic) - if (!parse_methdiff_line(line.data(), line.data() + size(line), chrom, pos, - strand, name, diffscore, meth_a, unmeth_a, meth_b, + if (!parse_methdiff_line(line.data(), std::size(line), chrom, pos, strand, + name, diffscore, meth_a, unmeth_a, meth_b, unmeth_b)) - throw runtime_error("bad methdiff line: " + line); + throw std::runtime_error("bad methdiff line: " + line); // NOLINTEND(*-pointer-arithmetic) cpgs.emplace_back(chrom, pos, strand, name, diffscore, 1); @@ -185,63 +172,64 @@ read_diffs_file(const string &diffs_file) { } static void -complement_regions(const size_t max_end, const vector &a, - const size_t start, const size_t end, - vector &cmpl) { - cmpl.push_back(GenomicRegion(a[start])); - cmpl.back().set_start(0); - for (size_t i = start; i < end; ++i) { - cmpl.back().set_end(a[i].get_start()); - cmpl.push_back(GenomicRegion(a[i])); - cmpl.back().set_start(a[i].get_end()); +complement(const std::size_t max_end, const std::vector &a, + const std::size_t start, const std::size_t end, + std::vector &cmpl) { + cmpl.push_back(Interval6(a[start])); + cmpl.back().start = 0; + for (std::size_t i = start; i < end; ++i) { + cmpl.back().stop = a[i].start; + cmpl.push_back(Interval6(a[i])); + cmpl.back().start = a[i].stop; } - cmpl.back().set_end(max_end); + cmpl.back().stop = max_end; } -static vector -get_chrom_ends(const vector &r) { - vector ends; - for (size_t i = 0; i < r.size() - 1; ++i) - if (!r[i].same_chrom(r[i + 1])) +static std::vector +get_chrom_ends(const std::vector &r) { + std::vector ends; + for (std::size_t i = 0; i + 1 < std::size(r); ++i) + if (r[i].chrom != r[i + 1].chrom) ends.push_back(i + 1); - ends.push_back(r.size()); + ends.push_back(std::size(r)); return ends; } -static vector -complement_regions(const size_t max_end, const vector &r) { - vector r_chroms = get_chrom_ends(r); - vector r_cmpl; - size_t t = 0; - for (size_t i = 0; i < size(r_chroms); ++i) { - complement_regions(max_end, r, t, r_chroms[i], r_cmpl); +static std::vector +complement(const std::size_t max_end, const std::vector &r) { + std::vector r_chroms = get_chrom_ends(r); + std::vector r_cmpl; + std::size_t t{}; + for (std::size_t i = 0; i < std::size(r_chroms); ++i) { + complement(max_end, r, t, r_chroms[i], r_cmpl); t = r_chroms[i]; } return r_cmpl; } static bool -check_no_overlap(const vector ®ions) { - for (size_t i = 1; i < regions.size(); ++i) - if (regions[i].same_chrom(regions[i - 1]) && - regions[i].get_start() < regions[i - 1].get_end()) +check_no_overlap(const std::vector ®ions) { + for (std::size_t i = 1; i < std::size(regions); ++i) + if (regions[i].chrom == regions[i - 1].chrom && + regions[i].start < regions[i - 1].stop) return false; return true; } static inline MSite -get_left_msite(const GenomicRegion &r) { - return {r.get_chrom(), r.get_start(), r.get_strand(), r.get_name(), 0.0, 1u}; +get_left_msite(const Interval6 &r) { + return {r.chrom, r.start, r.strand, r.name, 0.0, 1u}; } static inline MSite -get_right_msite(const GenomicRegion &r) { - return {r.get_chrom(), r.get_end(), r.get_strand(), r.get_name(), 0.0, 1u}; +get_right_msite(const Interval6 &r) { + return {r.chrom, r.stop, r.strand, r.name, 0.0, 1u}; } -static vector> -separate_sites(const vector &dmrs, const vector &sites) { - vector> sep_sites; +static std::vector> +separate_sites(const std::vector &dmrs, + const std::vector &sites) { + std::vector> sep_sites; for (const auto &dmr : dmrs) { const auto a = get_left_msite(dmr); const auto b = get_right_msite(dmr); @@ -260,10 +248,11 @@ pval_from_msite(const MSite &s) { static void get_cpg_stats(const bool LOW_CUTOFF, const double sig_cutoff, - const vector &cpgs, const size_t start_idx, - const size_t end_idx, size_t &total_cpgs, size_t &total_sig) { + const std::vector &cpgs, const std::size_t start_idx, + const std::size_t end_idx, std::size_t &total_cpgs, + std::size_t &total_sig) { total_cpgs = end_idx - start_idx; - for (size_t i = start_idx; i < end_idx; ++i) { + for (std::size_t i = start_idx; i < end_idx; ++i) { const auto pval = pval_from_msite(cpgs[i]); if ((LOW_CUTOFF && (pval < sig_cutoff)) || (!LOW_CUTOFF && (pval > 1.0 - sig_cutoff))) @@ -271,10 +260,42 @@ get_cpg_stats(const bool LOW_CUTOFF, const double sig_cutoff, } } +template +[[nodiscard]] auto +intersection_by_base(const std::vector &a, + const std::vector &b) -> std::vector { + const auto end_precedes = [](const T &x, const T &y) { + const auto cmp = x.chrom.compare(y.chrom); + return cmp < 0 || (cmp == 0 && x.stop < y.stop); + }; + const auto overlaps = [](const T &x, const T &y) { + const auto cmp = x.chrom.compare(y.chrom); + return cmp == 0 && std::max(x.start, x.start) < std::min(x.stop, y.stop); + }; + + auto a_itr = std::cbegin(a); + auto a_end = std::cend(a); + auto b_itr = std::cbegin(b); + auto b_end = std::cend(b); + + std::vector c; + while (a_itr != a_end && b_itr != b_end) { + if (overlaps(*a_itr, *b_itr)) + c.emplace_back(a_itr->chrom, std::max(a_itr->start, b_itr->start), + std::min(a_itr->stop, b_itr->stop), std::string{}, 0.0, + '+'); + if (end_precedes(*a_itr, *b_itr)) + ++a_itr; + else + ++b_itr; + } + return c; +} + int main_dmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const string description = + static const std::string description = R"( computes DMRs based on HMRs and probability of differences at individual CpGs"; @@ -291,116 +312,118 @@ individual CpGs"; opt_parse.set_show_defaults(); opt_parse.add_opt("cutoff", 'c', "Significance cutoff", false, sig_cutoff); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 5) { // NOLINT(*-avoid-magic-numbers) - cerr << opt_parse.help_message() << '\n'; + if (std::size(leftover_args) != 5) { // NOLINT(*-avoid-magic-numbers) + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string diffs_file = leftover_args[0]; - const string hmr1_file = leftover_args[1]; - const string hmr2_file = leftover_args[2]; - const string outfile_a = leftover_args[3]; - const string outfile_b = leftover_args[4]; + const std::string diffs_file = leftover_args[0]; + const std::string hmr1_file = leftover_args[1]; + const std::string hmr2_file = leftover_args[2]; + const std::string outfile_a = leftover_args[3]; + const std::string outfile_b = leftover_args[4]; /****************** END COMMAND LINE OPTIONS *****************/ if (VERBOSE) - cerr << "[LOADING HMRS] " << hmr1_file << '\n'; + std::cerr << "[LOADING HMRS] " << hmr1_file << '\n'; - vector regions_a; - ReadBEDFile(hmr1_file, regions_a); - assert(check_sorted(regions_a)); - if (!check_sorted(regions_a)) - throw runtime_error("regions not sorted in file: " + hmr1_file); + const auto regions_a = read_intervals6(hmr1_file); + if (!std::is_sorted(std::cbegin(regions_a), std::cend(regions_a))) + throw std::runtime_error("regions not sorted in file: " + hmr1_file); if (!check_no_overlap(regions_a)) - throw runtime_error("regions overlap in file: " + hmr1_file); + throw std::runtime_error("regions overlap in file: " + hmr1_file); if (VERBOSE) - cerr << "[LOADING HMRS] " << hmr2_file << '\n'; + std::cerr << "[LOADING HMRS] " << hmr2_file << '\n'; - vector regions_b; - ReadBEDFile(hmr2_file, regions_b); - assert(check_sorted(regions_b)); - if (!check_sorted(regions_b)) - throw runtime_error("regions not sorted in file: " + hmr2_file); + const auto regions_b = read_intervals6(hmr2_file); + if (!std::is_sorted(std::cbegin(regions_b), std::cend(regions_b))) + throw std::runtime_error("regions not sorted in file: " + hmr2_file); if (!check_no_overlap(regions_b)) - throw runtime_error("regions overlap in file: " + hmr2_file); + throw std::runtime_error("regions overlap in file: " + hmr2_file); if (VERBOSE) - cerr << "[COMPUTING SYMMETRIC DIFFERENCE]" << '\n'; + std::cerr << "[COMPUTING SYMMETRIC DIFFERENCE]" << '\n'; - size_t max_end = 0; - for (const auto &r : regions_a) - max_end = max(max_end, r.get_end()); // cppcheck-suppress useStlAlgorithm - for (const auto &r : regions_b) - max_end = max(max_end, r.get_end()); // cppcheck-suppress useStlAlgorithm + const auto get_max_stop = [](const auto &x) { + const auto acc = [](const auto s, const auto &r) { + return std::max(s, r.stop); + }; + return std::accumulate(std::cbegin(x), std::cend(x), 0u, acc); + }; + const auto max_stop = + std::max(get_max_stop(regions_a), get_max_stop(regions_b)); - const auto a_cmpl = complement_regions(max_end, regions_a); - const auto b_cmpl = complement_regions(max_end, regions_b); + const auto a_cmpl = complement(max_stop, regions_a); + const auto b_cmpl = complement(max_stop, regions_b); - vector dmrs_a, dmrs_b; - genomic_region_intersection_by_base(regions_a, b_cmpl, dmrs_a); - genomic_region_intersection_by_base(regions_b, a_cmpl, dmrs_b); + auto dmrs_a = intersection_by_base(regions_a, b_cmpl); + auto dmrs_b = intersection_by_base(regions_b, a_cmpl); // separate the regions by chrom and by desert if (VERBOSE) - cerr << "[READING CPG METH DIFFS]" << '\n'; + std::cerr << "[READING CPG METH DIFFS]" << '\n'; const auto cpgs = read_diffs_file(diffs_file); if (VERBOSE) - cerr << "[read " << size(cpgs) << " sites from " + diffs_file << "]" - << '\n'; + std::cerr << "[read " << std::size(cpgs) << " sites from " + diffs_file + << "]\n"; - if (!check_sorted(cpgs)) - throw runtime_error("CpGs not sorted in: " + diffs_file); + if (!std::is_sorted(std::cbegin(cpgs), std::cend(cpgs))) + throw std::runtime_error("CpGs not sorted in: " + diffs_file); if (VERBOSE) - cerr << "[TOTAL CPGS]: " << cpgs.size() << '\n'; + std::cerr << "[TOTAL CPGS]: " << std::size(cpgs) << '\n'; auto sep_sites = separate_sites(dmrs_a, cpgs); - for (size_t i = 0; i < dmrs_a.size(); ++i) { - size_t total_cpgs = 0, total_sig = 0; + for (std::size_t i = 0; i < std::size(dmrs_a); ++i) { + std::size_t total_cpgs{}, total_sig{}; get_cpg_stats(true, sig_cutoff, cpgs, sep_sites[i].first, sep_sites[i].second, total_cpgs, total_sig); - dmrs_a[i].set_name(dmrs_a[i].get_name() + ":" + toa(total_cpgs)); - dmrs_a[i].set_score(static_cast(total_sig)); + dmrs_a[i].name += ":" + std::to_string(total_cpgs); + dmrs_a[i].score = static_cast(total_sig); } sep_sites = separate_sites(dmrs_b, cpgs); - for (size_t i = 0; i < dmrs_b.size(); ++i) { - size_t total_cpgs = 0, total_sig = 0; + for (std::size_t i = 0; i < std::size(dmrs_b); ++i) { + std::size_t total_cpgs{}, total_sig{}; get_cpg_stats(false, sig_cutoff, cpgs, sep_sites[i].first, sep_sites[i].second, total_cpgs, total_sig); - dmrs_b[i].set_name(dmrs_b[i].get_name() + ":" + toa(total_cpgs)); - dmrs_b[i].set_score(static_cast(total_sig)); + dmrs_b[i].name += ":" + std::to_string(total_cpgs); + dmrs_b[i].score = static_cast(total_sig); } std::ofstream out_a(outfile_a); + if (!out_a) + throw std::runtime_error("failed to open output file: " + outfile_a); std::copy(std::cbegin(dmrs_a), std::cend(dmrs_a), - std::ostream_iterator(out_a, "\n")); + std::ostream_iterator(out_a, "\n")); std::ofstream out_b(outfile_b); + if (!out_b) + throw std::runtime_error("failed to open output file: " + outfile_b); std::copy(std::cbegin(dmrs_b), std::cend(dmrs_b), - std::ostream_iterator(out_b, "\n")); + std::ostream_iterator(out_b, "\n")); if (VERBOSE) - cerr << "[OUTPUT FORMAT] COL4=NAME:N_COVERED_CPGS COL5=N_SIG_CPGS\n"; + std::cerr << "[OUTPUT FORMAT] COL4=NAME:N_COVERED_CPGS COL5=N_SIG_CPGS\n"; } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; diff --git a/src/radmeth/radmeth-merge.cpp b/src/radmeth/radmeth-merge.cpp index cc519072..0f525857 100644 --- a/src/radmeth/radmeth-merge.cpp +++ b/src/radmeth/radmeth-merge.cpp @@ -15,7 +15,7 @@ * General Public License for more details. */ -#include "GenomicRegion.hpp" +#include "Interval6.hpp" #include "OptionParser.hpp" #include @@ -27,49 +27,36 @@ #include #include -using std::cerr; -using std::cout; -using std::endl; -using std::ifstream; -using std::istream; -using std::istringstream; -using std::ofstream; -using std::ostream; -using std::runtime_error; -using std::string; -using std::vector; - // NOLINTBEGIN(*-narrowing-conversions) // Attemps to find the next significant CpG site. Returns true if one was found // and flase otherwise. static bool -read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, - double cutoff, bool &skipped_any, bool &n_sig_sites, - size_t &test_cov, size_t &test_meth, size_t &rest_cov, - size_t &rest_meth) { - GenomicRegion region; +read_next_significant_cpg(std::istream &cpg_stream, Interval6 &cpg, + double cutoff, bool &skipped_any, bool &has_sig_sites, + std::size_t &test_cov, std::size_t &test_meth, + std::size_t &rest_cov, std::size_t &rest_meth) { + Interval6 region; skipped_any = false; - n_sig_sites = false; - string cpg_encoding; + has_sig_sites = false; + std::string line; - while (getline(cpg_stream, cpg_encoding)) { - string chrom, name, sign; - size_t position{}; + while (std::getline(cpg_stream, line)) { + std::string chrom, name, sign; + std::size_t position{}; double raw_pval{}; double adjusted_pval{}; double corrected_pval{}; - - istringstream iss(cpg_encoding); - iss.exceptions(std::ios::failbit); - iss >> chrom >> position >> sign >> name >> raw_pval >> adjusted_pval >> - corrected_pval >> test_cov >> test_meth >> rest_cov >> rest_meth; - + std::istringstream iss(line); + if (!(iss >> chrom >> position >> sign >> name >> raw_pval >> + adjusted_pval >> corrected_pval >> test_cov >> test_meth >> + rest_cov >> rest_meth)) + throw std::runtime_error("failed to parse line:\n" + line); if (corrected_pval >= 0.0 && corrected_pval < cutoff) { - cpg.set_chrom(chrom); - cpg.set_start(position); - cpg.set_end(position + 1); - n_sig_sites = (0 <= raw_pval && raw_pval < cutoff); + cpg.chrom = chrom; + cpg.start = position; + cpg.stop = position + 1; + has_sig_sites = (0.0 <= raw_pval && raw_pval < cutoff); return true; } skipped_any = true; @@ -79,77 +66,75 @@ read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, } static void -merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { - GenomicRegion dmr; - dmr.set_name("dmr"); +merge(std::istream &cpg_stream, std::ostream &dmr_stream, double cutoff) { + Interval6 dmr; + dmr.name = "dmr"; - size_t dmr_test_cov = 0; - size_t dmr_test_meth = 0; - size_t dmr_rest_cov = 0; - size_t dmr_rest_meth = 0; + std::size_t dmr_test_cov{}; + std::size_t dmr_test_meth{}; + std::size_t dmr_rest_cov{}; + std::size_t dmr_rest_meth{}; - size_t test_cov = 0; - size_t test_meth = 0; - size_t rest_cov = 0; - size_t rest_meth = 0; + std::size_t test_cov{}; + std::size_t test_meth{}; + std::size_t rest_cov{}; + std::size_t rest_meth{}; // Find the first significant CpG, or terminate the function if none exist. bool skipped_last_cpg{}; - bool n_sig_sites{}; + bool has_sig_sites{}; if (!read_next_significant_cpg(cpg_stream, dmr, cutoff, skipped_last_cpg, - n_sig_sites, test_cov, test_meth, rest_cov, + has_sig_sites, test_cov, test_meth, rest_cov, rest_meth)) return; - dmr.set_score(n_sig_sites); + dmr.score = has_sig_sites ? 1.0 : 0.0; dmr_test_cov += test_cov; dmr_test_meth += test_meth; dmr_rest_cov += rest_cov; dmr_rest_meth += rest_meth; - GenomicRegion cpg; - cpg.set_name("dmr"); + Interval6 cpg; + cpg.name = "dmr"; while (read_next_significant_cpg(cpg_stream, cpg, cutoff, skipped_last_cpg, - n_sig_sites, test_cov, test_meth, rest_cov, + has_sig_sites, test_cov, test_meth, rest_cov, rest_meth)) { - if (skipped_last_cpg || cpg.get_chrom() != dmr.get_chrom()) { - if (dmr.get_score() != 0) - dmr_stream << dmr.get_chrom() << '\t' << dmr.get_start() << '\t' - << dmr.get_end() << '\t' << dmr.get_name() << '\t' - << dmr.get_score() << '\t' + if (skipped_last_cpg || cpg.chrom != dmr.chrom) { + if (dmr.score != 0) + dmr_stream << dmr.chrom << '\t' << dmr.start << '\t' << dmr.stop << '\t' + << dmr.name << '\t' << dmr.score << '\t' << static_cast(dmr_test_meth) / dmr_test_cov - static_cast(dmr_rest_meth) / dmr_rest_cov << '\n'; dmr = cpg; - dmr.set_score(n_sig_sites); + dmr.score += has_sig_sites ? 1.0 : 0.0; dmr_test_cov = test_cov; dmr_test_meth = test_meth; dmr_rest_cov = rest_cov; dmr_rest_meth = rest_meth; } else { - dmr.set_end(cpg.get_end()); - dmr.set_score(dmr.get_score() + n_sig_sites); + dmr.stop = cpg.stop; + dmr.score += has_sig_sites ? 1.0 : 0.0; dmr_test_cov += test_cov; dmr_test_meth += test_meth; dmr_rest_cov += rest_cov; dmr_rest_meth += rest_meth; } } - if (dmr.get_score() != 0) { + if (dmr.score != 0) { const double diff = static_cast(dmr_test_meth) / dmr_test_cov - static_cast(dmr_rest_meth) / dmr_rest_cov; - dmr_stream << dmr.get_chrom() << '\t' << dmr.get_start() << '\t' - << dmr.get_end() << '\t' << dmr.get_name() << '\t' - << dmr.get_score() << '\t' << diff << '\n'; + dmr_stream << dmr.chrom << '\t' << dmr.start << '\t' << dmr.stop << '\t' + << dmr.name << '\t' << dmr.score << '\t' << diff << '\n'; } } int main_radmeth_merge(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - string outfile; + std::string outfile; double cutoff = 0.01; // NOLINT(*-avoid-magic-numbers) /**************** GET COMMAND LINE ARGUMENTS *************************/ @@ -161,41 +146,39 @@ main_radmeth_merge(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, outfile); opt_parse.add_opt("cutoff", 'p', "p-value cutoff", false, cutoff); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string bed_filename = leftover_args.front(); + const std::string bed_filename = leftover_args.front(); /************************************************************************/ - ofstream of; - if (!outfile.empty()) - of.open(outfile); - ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); - - ifstream in(bed_filename); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open: " + outfile); + std::ifstream in(bed_filename); if (!in) - throw runtime_error("could not open file: " + bed_filename); + throw std::runtime_error("failed to open: " + bed_filename); merge(in, out, cutoff); } catch (const std::exception &e) { - cerr << "ERROR: " << e.what() << '\n'; - exit(EXIT_FAILURE); + std::cerr << e.what() << '\n'; + return EXIT_FAILURE; } return EXIT_SUCCESS; } diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 92bde671..59a4a2f1 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -1,22 +1,22 @@ -/* selectsites: program to select sites, specified in a methcounts - * format file, that are contained in given (bed format) intervals +/* Copyright (C) 2019-2025 Andrew D. Smith * - * Copyright (C) 2019-2023 Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * Authors: Andrew D. Smith - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. */ -#include "GenomicRegion.hpp" +[[maybe_unused]] static constexpr auto about = R"( +selectsites: Select sites, specified in a methcounts format file, that are +contained in given (bed format) intervals. +)"; + +#include "Interval6.hpp" #include "MSite.hpp" #include "OptionParser.hpp" @@ -24,8 +24,6 @@ #include -#include - #include #include #include @@ -44,42 +42,9 @@ #include #include -using std::cerr; -using std::cout; -using std::ios_base; -using std::runtime_error; -using std::string; -using std::tie; -using std::tuple; -using std::unordered_map; - -using bamxx::bgzf_file; - -namespace fs = std::filesystem; - -struct quick_buf : public std::ostringstream, - public std::basic_stringbuf { - // ADS: By user ecatmur on SO; very fast. Seems to work... - quick_buf() { - // ...but this seems to depend on data layout - static_cast &>(*this).rdbuf(this); - } - void - clear() { - // reset buffer pointers (member functions) - setp(pbase(), pbase()); - } - char const * - data() { - /* between data() and insertion make sure to clear() */ - *pptr() = '\0'; - return pbase(); - } -}; - struct selectsites_summary { - // command_line is the command used to produce this summary file and - // the corresponding results + // command_line is the command used to produce this summary file and the + // corresponding results std::string command_line{}; // n_target_regions is the number of target regions provided as @@ -89,108 +54,104 @@ struct selectsites_summary { // target_region_size is the sum of the sizes of each target region std::uint64_t target_region_size{}; - // n_target_regions_collapsed is the number of target regions after - // having collapsed the input target regions merging those that - // overlap + // n_target_regions_collapsed is the number of target regions after having + // collapsed the input target regions merging those that overlap std::uint64_t n_target_regions_collapsed{}; - // target_region_collapsed_size is the sum of the sizes of target - // regions after collapsing + // target_region_collapsed_size is the sum of the sizes of target regions + // after collapsing std::uint64_t target_region_collapsed_size{}; - // n_sites_total is the total number of sites available in the input - // counts file. This value is displayed as zero if the command - // specified to process the sites on disk. + // n_sites_total is the total number of sites available in the input counts + // file. This value is displayed as zero if the command specified to process + // the sites on disk. std::uint64_t n_sites_total{}; // n_sites_selected is the total number of sites in the output file std::uint64_t n_sites_selected{}; template - static auto - measure_target_regions(const T &t) -> tuple { + [[nodiscard]] static auto + measure_target_regions(const T &t) + -> std::tuple { return { std::size(t), - accumulate(cbegin(t), cend(t), 0ul, - [](const std::uint64_t a, const typename T::value_type &v) { - return a + v.get_width(); - }), + std::accumulate( + std::cbegin(t), std::cend(t), 0ul, + [](const std::uint64_t a, const typename T::value_type &v) { + return a + size(v); + }), }; } - - auto - tostring() const -> std::string { - std::ostringstream oss; - oss << "command_line: " << command_line << '\n' - << "n_target_regions: " << n_target_regions << '\n' - << "target_region_size: " << target_region_size << '\n' - << "n_target_regions_collapsed: " << n_target_regions_collapsed << '\n' - << "target_region_collapsed_size: " << target_region_collapsed_size - << '\n' - << "n_sites_total: " << n_sites_total << '\n' - << "n_sites_selected: " << n_sites_selected; - return oss.str(); - } }; +[[nodiscard]] auto +to_string(const selectsites_summary &s) -> std::string { + std::ostringstream oss; + oss << "command_line: " << s.command_line << '\n' + << "n_target_regions: " << s.n_target_regions << '\n' + << "target_region_size: " << s.target_region_size << '\n' + << "n_target_regions_collapsed: " << s.n_target_regions_collapsed << '\n' + << "target_region_collapsed_size: " << s.target_region_collapsed_size + << '\n' + << "n_sites_total: " << s.n_sites_total << '\n' + << "n_sites_selected: " << s.n_sites_selected; + return oss.str(); +} + static auto write_stats_output(const selectsites_summary &summary, const std::string &summary_file) -> void { if (!summary_file.empty()) { std::ofstream out_summary(summary_file); if (!out_summary) - throw runtime_error("bad summary output file"); - out_summary << summary.tostring() << '\n'; + throw std::runtime_error("bad summary output file"); + out_summary << to_string(summary) << '\n'; } } static void -collapsebed(std::vector ®ions) { +collapsebed(std::vector ®ions) { std::ptrdiff_t j = 0; - for (std::size_t i = 1; i < std::size(regions); ++i) { - if (regions[j].same_chrom(regions[i]) && - regions[i].get_start() <= regions[j].get_end()) { - regions[j].set_end(std::max(regions[j].get_end(), regions[i].get_end())); - } - else { - regions[++j] = regions[i]; - } + for (const auto &r : regions) { + if (regions[j].chrom == r.chrom && r.start <= regions[j].stop) + regions[j].stop = std::max(regions[j].stop, r.stop); + else + regions[++j] = r; } regions.erase(std::begin(regions) + j + 1, std::end(regions)); } static inline bool -precedes(const GenomicRegion &r, const MSite &s) { - return (r.get_chrom() < s.chrom || - (r.get_chrom() == s.chrom && r.get_end() <= s.pos)); +precedes(const Interval6 &r, const MSite &s) { + return (r.chrom < s.chrom || (r.chrom == s.chrom && r.stop <= s.pos)); } static inline bool -contains(const GenomicRegion &r, const MSite &s) { - return (r.get_chrom() == s.chrom && - (r.get_start() <= s.pos && s.pos < r.get_end())); +contains(const Interval6 &r, const MSite &s) { + return (r.chrom == s.chrom && (r.start <= s.pos && s.pos < r.stop)); } static auto process_all_sites( - const bool VERBOSE, const std::string &sites_file, - const unordered_map> ®ions, - bgzf_file &out) -> tuple { - bgzf_file in(sites_file, "r"); + const bool verbose, const std::string &sites_file, + const std::unordered_map> ®ions, + bamxx::bgzf_file &out) -> std::tuple { + bamxx::bgzf_file in(sites_file, "r"); if (!in) - throw runtime_error("cannot open file: " + sites_file); + throw std::runtime_error("cannot open file: " + sites_file); std::uint64_t n_sites_total = 0u; std::uint64_t n_sites_selected = 0u; MSite the_site, prev_site; - std::vector::const_iterator i, i_lim; + std::vector::const_iterator i, i_lim; bool chrom_is_relevant = false; while (read_site(in, the_site)) { ++n_sites_total; if (the_site.chrom != prev_site.chrom) { - if (VERBOSE) - cerr << "processing " << the_site.chrom << '\n'; + if (verbose) + std::cerr << "processing " << the_site.chrom << '\n'; const auto r = regions.find(the_site.chrom); chrom_is_relevant = (r != cend(regions)); if (chrom_is_relevant) { @@ -211,43 +172,39 @@ process_all_sites( return {n_sites_selected, n_sites_total}; } -static auto -get_sites_in_region(std::ifstream &site_in, const GenomicRegion ®ion, - bgzf_file &out) -> std::uint64_t { - quick_buf buf; - - const std::string chrom{region.get_chrom()}; - const std::size_t start_pos = region.get_start(); - const std::size_t end_pos = region.get_end(); +[[nodiscard]] static auto +get_sites_in_region(std::ifstream &site_in, const Interval6 ®ion, + bamxx::bgzf_file &out) -> std::uint64_t { + const std::string chrom{region.chrom}; + const std::size_t start_pos = region.start; + const std::size_t end_pos = region.stop; find_offset_for_msite(chrom, start_pos, site_in); - MSite the_site; std::uint64_t n_sites_selected = 0u; // ADS: should only happen once that "the_site.chrom < chrom" and // this is only needed because of end state of binary search on disk - while (site_in >> the_site && - (the_site.chrom < chrom || - (the_site.chrom == chrom && the_site.pos < end_pos))) + std::string line; + while (getline(site_in, line)) { + const auto the_site = MSite(line); + if (chrom <= the_site.chrom && + (the_site.chrom != chrom || end_pos <= the_site.pos)) + break; if (start_pos <= the_site.pos) { - // ADS: don't like this -- always needing to "clear()" this - // struct is bad... - buf.clear(); - buf << the_site << '\n'; - if (!out.write(buf.data(), buf.tellp())) - throw runtime_error("error writing output"); + if (!out.write(line.data(), std::size(line))) + throw std::runtime_error("error writing output"); ++n_sites_selected; } + } return n_sites_selected; } static auto process_with_sites_on_disk(const std::string &sites_file, - const std::vector ®ions, - bgzf_file &out) -> std::uint64_t { + const std::vector ®ions, + bamxx::bgzf_file &out) -> std::uint64_t { std::ifstream in(sites_file); if (!in) - throw runtime_error("cannot open file: " + sites_file); - + throw std::runtime_error("cannot open file: " + sites_file); std::uint64_t n_sites_selected = 0ul; for (auto i = 0u; i < std::size(regions) && in; ++i) n_sites_selected += get_sites_in_region(in, regions[i], out); @@ -256,30 +213,21 @@ process_with_sites_on_disk(const std::string &sites_file, static void regions_by_chrom( - std::vector ®ions, - unordered_map> &lookup) { - for (auto &&r : regions) { - const std::string chrom_name(r.get_chrom()); - if (lookup.find(chrom_name) == end(lookup)) - lookup[chrom_name] = std::vector(); - lookup[chrom_name].push_back(r); - } + std::vector ®ions, + std::unordered_map> &lookup) { + for (auto &&r : regions) + lookup[r.chrom].push_back(r); regions.clear(); regions.shrink_to_fit(); } -inline bool -file_exists(const std::string &filename) { - return (access(filename.data(), F_OK) == 0); -} - static bool is_compressed_file(const std::string &filename) { - const bgzf_file f(filename.data(), "r"); + const bamxx::bgzf_file f(filename.data(), "r"); htsFormat fmt; const int ret = hts_detect_format(f.f->fp, &fmt); if (ret != 0) - throw runtime_error("failed to detect format: " + filename); + throw std::runtime_error("failed to detect format: " + filename); return fmt.compression != no_compression; } @@ -288,10 +236,11 @@ get_command_line(const int argc, const char *const argv[] // NOLINT(*-avoid-c-arrays) ) -> std::string { if (argc == 0) - return std::string(); + return std::string{}; std::ostringstream cmd; cmd << '"'; - copy(argv, argv + (argc - 1), std::ostream_iterator(cmd, " ")); + std::copy(argv, argv + (argc - 1), + std::ostream_iterator(cmd, " ")); cmd << argv[argc - 1] << '"'; // NOLINT(*-pointer-arithmetic) return cmd.str(); } @@ -299,7 +248,7 @@ get_command_line(const int argc, int main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - bool VERBOSE = false; + bool verbose = false; bool keep_file_on_disk = false; bool compress_output = false; bool allow_extra_fields = false; @@ -326,25 +275,25 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); opt_parse.add_opt("relaxed", '\0', "input has extra fields", false, allow_extra_fields); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.set_show_defaults(); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (std::size(leftover_args) != 2) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const std::string regions_file = leftover_args.front(); @@ -356,33 +305,32 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) selectsites_summary summary; summary.command_line = get_command_line(argc, argv); - if (!fs::is_regular_file(sites_file)) - throw runtime_error("bad input sites file: " + sites_file); + if (!std::filesystem::is_regular_file(sites_file)) + throw std::runtime_error("bad input sites file: " + sites_file); if (is_compressed_file(sites_file)) { keep_file_on_disk = false; - if (VERBOSE) - cerr << "input file is so must be loaded\n"; + if (verbose) + std::cerr << "input file is so must be loaded\n"; } - std::vector regions; - ReadBEDFile(regions_file, regions); - if (!check_sorted(regions)) - throw runtime_error("regions not sorted in file: " + regions_file); + auto regions = read_intervals6(regions_file); + if (!std::is_sorted(std::cbegin(regions), std::cend(regions))) + throw std::runtime_error("regions not sorted in file: " + regions_file); std::tie(summary.n_target_regions, summary.target_region_size) = selectsites_summary::measure_target_regions(regions); const std::size_t n_orig_regions = std::size(regions); collapsebed(regions); - if (VERBOSE && n_orig_regions != std::size(regions)) - cerr << "[number of regions merged due to overlap: " - << n_orig_regions - std::size(regions) << "]\n"; + if (verbose && n_orig_regions != std::size(regions)) + std::cerr << "[number of regions merged due to overlap: " + << n_orig_regions - std::size(regions) << "]\n"; std::tie(summary.n_target_regions_collapsed, summary.target_region_collapsed_size) = selectsites_summary::measure_target_regions(regions); - unordered_map> regions_lookup; + std::unordered_map> regions_lookup; if (!keep_file_on_disk) regions_by_chrom(regions, regions_lookup); @@ -390,19 +338,19 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) const std::string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); if (!out) - throw runtime_error("error opening output file: " + outfile); + throw std::runtime_error("error opening output file: " + outfile); if (keep_file_on_disk) summary.n_sites_selected = process_with_sites_on_disk(sites_file, regions, out); else std::tie(summary.n_sites_selected, summary.n_sites_total) = - process_all_sites(VERBOSE, sites_file, regions_lookup, out); + process_all_sites(verbose, sites_file, regions_lookup, out); write_stats_output(summary, summary_file); } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From bc7af19548f5646deac324c181aa891a30288801 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 19:47:31 -0800 Subject: [PATCH 2/6] multiple sources: making output file a required argument --- src/amrfinder/allelicmeth.cpp | 3 +-- src/amrfinder/amrtester.cpp | 2 +- src/analysis/cpgbins.cpp | 3 +-- src/analysis/hmr-rep.cpp | 3 +-- src/analysis/hypermr.cpp | 2 +- src/analysis/metagene.cpp | 3 +-- src/analysis/multimethstat.cpp | 3 +-- src/analysis/pmd.cpp | 3 +-- src/analysis/roimethstat.cpp | 2 +- src/radmeth/radmeth-merge.cpp | 3 +-- src/utils/selectsites.cpp | 3 +-- 11 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/amrfinder/allelicmeth.cpp b/src/amrfinder/allelicmeth.cpp index 3b99ff58..8ac56a53 100644 --- a/src/amrfinder/allelicmeth.cpp +++ b/src/amrfinder/allelicmeth.cpp @@ -234,8 +234,7 @@ computes probability of allele-specific methylation at each tuple of CpGs /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, ""); - opt_parse.add_opt("output", 'o', "output file name (default: stdout)", - false, outfile); + opt_parse.add_opt("output", 'o', "output file name", true, outfile); opt_parse.add_opt("chrom", 'c', "genome sequence file/directory", true, chroms_dir); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index 80d776a2..d76ecfac 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -193,7 +193,7 @@ main_amrtester(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "resolve epi-alleles", " "); - opt_parse.add_opt("output", 'o', "output file", false, outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("chrom", 'c', "reference genome fasta file", true, chrom_file); opt_parse.add_opt("itr", 'i', "max iterations", false, max_itr); diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index a76d74f4..b17d102a 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -195,8 +195,7 @@ Columns (beyond the first 6) in the BED format output: OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', "name of output file (default: stdout)", - true, outfile); + opt_parse.add_opt("output", 'o', "name of output file", true, outfile); opt_parse.add_opt("bin", 'b', "bin size in base pairs", false, bin_size); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("level", 'l', diff --git a/src/analysis/hmr-rep.cpp b/src/analysis/hmr-rep.cpp index 125af4d3..d8ce5624 100644 --- a/src/analysis/hmr-rep.cpp +++ b/src/analysis/hmr-rep.cpp @@ -348,8 +348,7 @@ main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, " ..."); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, - outfile); + opt_parse.add_opt("out", 'o', "output file", true, outfile); opt_parse.add_opt("desert", 'd', "max dist btwn cpgs with reads in HMR", false, desert_size); opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations); diff --git a/src/analysis/hypermr.cpp b/src/analysis/hypermr.cpp index 12c42532..723209ff 100644 --- a/src/analysis/hypermr.cpp +++ b/src/analysis/hypermr.cpp @@ -243,7 +243,7 @@ main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) about, ""); - opt_parse.add_opt("out", 'o', "output file (BED format)", false, outfile); + opt_parse.add_opt("out", 'o', "output file", true, outfile); opt_parse.add_opt("scores", 's', "output file for posterior scores", false, scores_file); opt_parse.add_opt("tolerance", 't', "numerical tolerance", false, diff --git a/src/analysis/metagene.cpp b/src/analysis/metagene.cpp index 4c8f574d..f4e928f2 100644 --- a/src/analysis/metagene.cpp +++ b/src/analysis/metagene.cpp @@ -112,8 +112,7 @@ correctly. /****************** GET COMMAND LINE ARGUMENTS ***************************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, " "); - opt_parse.add_opt("output", 'o', "output file (default: terminal)", false, - outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("size", 's', "analyze this size in both directions", false, region_size); opt_parse.add_opt("bin", 'b', "bin size", false, bin_size); diff --git a/src/analysis/multimethstat.cpp b/src/analysis/multimethstat.cpp index 67e2ce75..3b4d1051 100644 --- a/src/analysis/multimethstat.cpp +++ b/src/analysis/multimethstat.cpp @@ -510,8 +510,7 @@ main_multimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) "methylation in each of a set of genomic intervals", " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', "Name of output file (default: stdout)", - false, outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", false, print_numeric_only); opt_parse.add_opt("preload", 'L', "Load all CpG sites", false, diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 02529e99..82580040 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -1135,8 +1135,7 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, ""); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, - outfile); + opt_parse.add_opt("out", 'o', "output file", true, outfile); opt_parse.add_opt("desert", 'd', "max dist between bins with data in PMD", false, desert_size); opt_parse.add_opt("fixedbin", 'f', "Fixed bin size", false, fixed_bin_size); diff --git a/src/analysis/roimethstat.cpp b/src/analysis/roimethstat.cpp index ab393c70..bb2b6efd 100644 --- a/src/analysis/roimethstat.cpp +++ b/src/analysis/roimethstat.cpp @@ -421,7 +421,7 @@ Columns (beyond the first 6) in the BED format output: OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', "Name of output file", true, outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", false, print_numeric_only); opt_parse.add_opt("preload", 'L', "load all site sites", false, preload); diff --git a/src/radmeth/radmeth-merge.cpp b/src/radmeth/radmeth-merge.cpp index 0f525857..15905956 100644 --- a/src/radmeth/radmeth-merge.cpp +++ b/src/radmeth/radmeth-merge.cpp @@ -143,8 +143,7 @@ main_radmeth_merge(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) " methylated CpGs into DMRs", ""); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, - outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("cutoff", 'p', "p-value cutoff", false, cutoff); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 59a4a2f1..49f21112 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -264,8 +264,7 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) description, " ", 2); - opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, - outfile); + opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("disk", 'd', "process sites on disk " "(fast if target intervals are few)", From c348cbce9f697a30f1913d225a0b043a9d51ec31 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 20:20:40 -0800 Subject: [PATCH 3/6] src/utils/selectsites.cpp: using Interval instead of Interval6 --- src/utils/selectsites.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 49f21112..3b3254e9 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -16,7 +16,7 @@ selectsites: Select sites, specified in a methcounts format file, that are contained in given (bed format) intervals. )"; -#include "Interval6.hpp" +#include "Interval.hpp" #include "MSite.hpp" #include "OptionParser.hpp" @@ -111,7 +111,7 @@ write_stats_output(const selectsites_summary &summary, } static void -collapsebed(std::vector ®ions) { +collapsebed(std::vector ®ions) { std::ptrdiff_t j = 0; for (const auto &r : regions) { if (regions[j].chrom == r.chrom && r.start <= regions[j].stop) @@ -123,19 +123,19 @@ collapsebed(std::vector ®ions) { } static inline bool -precedes(const Interval6 &r, const MSite &s) { +precedes(const Interval &r, const MSite &s) { return (r.chrom < s.chrom || (r.chrom == s.chrom && r.stop <= s.pos)); } static inline bool -contains(const Interval6 &r, const MSite &s) { +contains(const Interval &r, const MSite &s) { return (r.chrom == s.chrom && (r.start <= s.pos && s.pos < r.stop)); } static auto process_all_sites( const bool verbose, const std::string &sites_file, - const std::unordered_map> ®ions, + const std::unordered_map> ®ions, bamxx::bgzf_file &out) -> std::tuple { bamxx::bgzf_file in(sites_file, "r"); if (!in) @@ -145,7 +145,7 @@ process_all_sites( std::uint64_t n_sites_selected = 0u; MSite the_site, prev_site; - std::vector::const_iterator i, i_lim; + std::vector::const_iterator i, i_lim; bool chrom_is_relevant = false; while (read_site(in, the_site)) { ++n_sites_total; @@ -173,7 +173,7 @@ process_all_sites( } [[nodiscard]] static auto -get_sites_in_region(std::ifstream &site_in, const Interval6 ®ion, +get_sites_in_region(std::ifstream &site_in, const Interval ®ion, bamxx::bgzf_file &out) -> std::uint64_t { const std::string chrom{region.chrom}; const std::size_t start_pos = region.start; @@ -200,7 +200,7 @@ get_sites_in_region(std::ifstream &site_in, const Interval6 ®ion, static auto process_with_sites_on_disk(const std::string &sites_file, - const std::vector ®ions, + const std::vector ®ions, bamxx::bgzf_file &out) -> std::uint64_t { std::ifstream in(sites_file); if (!in) @@ -213,8 +213,8 @@ process_with_sites_on_disk(const std::string &sites_file, static void regions_by_chrom( - std::vector ®ions, - std::unordered_map> &lookup) { + std::vector ®ions, + std::unordered_map> &lookup) { for (auto &&r : regions) lookup[r.chrom].push_back(r); regions.clear(); @@ -313,7 +313,7 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) std::cerr << "input file is so must be loaded\n"; } - auto regions = read_intervals6(regions_file); + auto regions = read_intervals(regions_file); if (!std::is_sorted(std::cbegin(regions), std::cend(regions))) throw std::runtime_error("regions not sorted in file: " + regions_file); std::tie(summary.n_target_regions, summary.target_region_size) = @@ -329,7 +329,7 @@ main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) summary.target_region_collapsed_size) = selectsites_summary::measure_target_regions(regions); - std::unordered_map> regions_lookup; + std::unordered_map> regions_lookup; if (!keep_file_on_disk) regions_by_chrom(regions, regions_lookup); From dc94e7361c893b355f21240095c51b9465f1e34a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 20:21:17 -0800 Subject: [PATCH 4/6] src/common/Interval.cpp: fixing a bug that filed when the end of a line was just after the stop site --- src/common/Interval.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/Interval.cpp b/src/common/Interval.cpp index 415c2c2a..be073bb0 100644 --- a/src/common/Interval.cpp +++ b/src/common/Interval.cpp @@ -56,7 +56,6 @@ Interval::initialize(const char *c, const char *c_end) -> bool { // stop field_s = std::find_if(field_e + 1, c_end, not_sep); field_e = std::find_if(field_s + 1, c_end, is_sep); - failed = failed || (field_e == c_end); { const auto [ptr, ec] = std::from_chars(field_s, field_e, stop); failed = failed || ec != std::errc{}; From f2ce2d7015cec0bd867a8add9ca94210a63f39be Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 20:40:42 -0800 Subject: [PATCH 5/6] src/analysis/roimethstat.cpp: reverting to local function for formatting levels counter because the format used by roi has been slightly different from the general format. Also reading the input intervals from either a BED3 or BED6, depending on what the user provides --- src/analysis/roimethstat.cpp | 61 +++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/src/analysis/roimethstat.cpp b/src/analysis/roimethstat.cpp index bb2b6efd..560e5b41 100644 --- a/src/analysis/roimethstat.cpp +++ b/src/analysis/roimethstat.cpp @@ -15,7 +15,9 @@ * more details. */ +#include "Interval.hpp" #include "Interval6.hpp" + #include "LevelsCounter.hpp" #include "MSite.hpp" #include "OptionParser.hpp" @@ -41,6 +43,29 @@ // NOLINTBEGIN(*-narrowing-conversions) +[[nodiscard]] static std::string +format_levels_counter_for_roi(const LevelsCounter &lc) { + // ... + // (7) weighted mean methylation + // (8) unweighted mean methylation + // (9) fractional methylation + // (10) number of sites in the region + // (11) number of sites covered at least once + // (12) number of observations in reads indicating methylation + // (13) total number of observations from reads in the region + std::ostringstream oss; + // clang-format off + oss << lc.mean_meth_weighted() << '\t' + << lc.mean_meth() << '\t' + << lc.fractional_meth() << '\t' + << lc.total_sites << '\t' + << lc.sites_covered << '\t' + << lc.total_c << '\t' + << lc.coverage(); + // clang-format on + return oss.str(); +} + static void update(LevelsCounter &lc, const xcounts_entry &xse) { static constexpr auto one_half = 0.5; @@ -85,7 +110,7 @@ process_chrom(const bool report_more_info, const char level_code, : (level_code == 'u' ? lc.sites_covered : lc.total_called())); out << to_string(interval); if (report_more_info) - out << '\t' << format_levels_counter(lc); + out << '\t' << format_levels_counter_for_roi(lc); out << '\n'; } } @@ -94,7 +119,7 @@ static void process_chrom(const bool report_more_info, const std::vector &intervals, std::ostream &out) { LevelsCounter lc; - const std::string lc_formatted = format_levels_counter(lc); + const std::string lc_formatted = format_levels_counter_for_roi(lc); for (const auto &r : intervals) { out << to_string(r); if (report_more_info) @@ -109,8 +134,6 @@ process_from_xcounts(const std::uint32_t n_threads, const bool report_more_info, const std::vector &intervals_in, std::ostream &out) { const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xsym_file); - // const auto intervals = get_Interval6s(intervals_file); - std::vector> intervals_by_chrom; std::string prev_chrom; for (auto i = 0u; i < std::size(intervals_in); ++i) { @@ -293,7 +316,7 @@ process_preloaded( r_scored.score = score; out << to_string(r_scored); if (report_more_info) - out << '\t' << format_levels_counter(lc); + out << '\t' << format_levels_counter_for_roi(lc); out << '\n'; } } @@ -362,7 +385,7 @@ process_on_disk( r.score = score; out << to_string(r); if (report_more_info) - out << '\t' << format_levels_counter(lc); + out << '\t' << format_levels_counter_for_roi(lc); out << '\n'; } } @@ -382,6 +405,30 @@ get_bed_columns(const std::string ®ions_file) { return n_columns; } +static auto +get_intervals(const std::string &filename) -> std::vector { + const auto field_count = [&] { + bamxx::bgzf_file in(filename, "r"); + if (!in) + throw std::runtime_error("cannot open file: " + filename); + std::string line; + while (getline(in, line) && line[0] == '#') + ; + std::istringstream iss(line); + std::vector v(std::istream_iterator(iss), + (std::istream_iterator())); + return std::size(v); + }(); + if (field_count >= 6) + return read_intervals6(filename); + + auto without_names = read_intervals(filename); + std::vector intervals; + for (const auto &w : without_names) + intervals.emplace_back(w.chrom, w.start, w.stop, std::string{}, 0, '+'); + return intervals; +} + int main_roimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { @@ -495,7 +542,7 @@ Columns (beyond the first 6) in the BED format output: throw std::runtime_error("seems to be a counts file: " + regions_file + "\ncheck order of input arguments"); - auto regions = read_intervals6(regions_file); + auto regions = get_intervals(regions_file); if (!std::is_sorted(std::begin(regions), std::end(regions))) { if (sort_data_if_needed) { if (verbose) From 9b45c5ceb8d9183e915d7ff42abc64e42e70cfbc Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 21 Dec 2025 20:47:27 -0800 Subject: [PATCH 6/6] src/analysis/pmd.cpp: fixing a bug initializing transition matrix --- src/analysis/pmd.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 82580040..02c6e0ba 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -1083,9 +1083,9 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static constexpr auto min_observations_for_inference{100ul}; static constexpr auto default_self_transition{0.99}; + static constexpr auto default_transition{0.01}; static constexpr auto default_start_transition{0.5}; static constexpr auto default_end_transition{1e-10}; - static constexpr auto default_transition{0.01}; // magic numbers from paper: highest jaccard index to wgbs static constexpr auto bin_size_for_array{1000ul}; @@ -1100,17 +1100,11 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) static constexpr auto default_bg_alpha{0.95}; static constexpr auto default_bg_beta{0.05}; const auto init_trans = [&] { - return std::vector{ - std::vector(default_self_transition, default_transition), - std::vector(default_transition, default_self_transition), + return std::vector>{ + {default_self_transition, default_transition}, + {default_transition, default_self_transition}, }; }; - // > - // std::vector> t( - // 2, std::vector(2, default_transition)); - // t[0][0] = t[1][1] = default_self_transition; - // return t; - // }; // NOLINTBEGIN(*-avoid-magic-numbers) std::size_t resolution = 500;