diff --git a/.cppcheck_suppress b/.cppcheck_suppress index c925e537..e4ce3e2e 100644 --- a/.cppcheck_suppress +++ b/.cppcheck_suppress @@ -32,12 +32,8 @@ unusedStructMember # Ignore missing includes because if they are real things won't build missingInclude # Exclude external files -*:*CLI11.hpp -*:*json.hpp -*:*asio* *:*smithlab_cpp* -*:*ssl.hpp -*:*indicators.hpp +*:*popcnt.hpp # Problem caused by external files toomanyconfigs # More problems caused by external files -- with too many ifdefs diff --git a/CMakeLists.txt b/CMakeLists.txt index aaf873bf..2ab059f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,15 +4,15 @@ # # Authors: Andrew D. Smith # -# This is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# This is free software: you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation, either version 3 of the License, or (at your option) any later +# version. # -# This software is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. +# This software is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. # to find the version of cmake do # $ cmake --version @@ -41,96 +41,6 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") configure_file(data/config.h.in config.h) -# Collect any global linker options as needed and then apply them -# individually to targets -set(GLOBAL_COMPILE_OPTIONS "") -set(GLOBAL_LINKER_OPTIONS "") - -if(USE_STATIC_LIBS) - # This needs to come before finding any libraries so that the static - # versions are identified - message(STATUS "Enabling static linkage for all non-system libraries") - message(STATUS "Configuring to clone ZLib") - include(ExternalProject) - set(ZLIB_CMAKE_ARGS - -DZLIB_BUILD_EXAMPLES=off - -DSKIP_INSTALL_FILES=on - -DCMAKE_POSITION_INDEPENDENT_CODE=on - -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/src/zlib - -DCMAKE_BUILD_TYPE=Release - ) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - list(APPEND ZLIB_CMAKE_ARGS - -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET} - ) - endif() - ExternalProject_Add( - ZLIB - GIT_REPOSITORY https://github.com/madler/zlib.git - GIT_TAG master - CMAKE_ARGS ${ZLIB_CMAKE_ARGS} - ) - # Include the built zlib headers and link against the built zlib library - set(ZLIB_INCLUDE_DIR "${PROJECT_BINARY_DIR}/src/zlib/include") - set(ZLIB_LIBRARY "${PROJECT_BINARY_DIR}/src/zlib/lib/libz.a") - - # Create the CMake target for the built zlib - add_library(ZLIB_IMPORTED INTERFACE) - set_target_properties(ZLIB_IMPORTED PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES ${ZLIB_LIBRARY} - ) - - # This alias means we don't need to worry where zlib came from - add_library(ZLIB::ZLIB ALIAS ZLIB_IMPORTED) - - ExternalProject_Add( - HTSLIB - GIT_REPOSITORY https://github.com/samtools/htslib.git - GIT_TAG master - CONFIGURE_COMMAND "" - # "autoreconf -i ; /configure" - BUILD_COMMAND make -C lib-static -j - INSTALL_COMMAND "" - # make install - TEST_COMMAND "" - BUILD_BYPRODUCTS libhts.a - ) - # ExternalProject_Add_Step(HTSLIB - # bootstrap - # COMMAND autoreconf -i - # DEPENDEES download - # DEPENDERS configure - # ) - ExternalProject_Get_Property(HTSLIB SOURCE_DIR) - message(STATUS "SOURCE_DIR: ${SOURCE_DIR}") - - # Include the built zlib headers and link against the built zlib library - set(HTSLIB_INCLUDE_DIR "${SOURCE_DIR}/src/htslib/htslib") - set(HTSLIB_LIBRARY "${SOURCE_DIR}/htslib.a") - - # Create the CMake target for the built zlib - add_library(HTSLIB_IMPORTED INTERFACE) - set_target_properties(HTSLIB_IMPORTED PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${HTSLIB_INCLUDE_DIR} - INTERFACE_LINK_LIBRARIES ${HTSLIB_LIBRARY} - ) - set_target_properties(HTSLIB_IMPORTED PROPERTIES IMPORTED_LOCATION ${HTSLIB_LIBRARY}) - - # This alias means we don't need to worry where htslib came from - add_library(HTSLIB::HTSLIB ALIAS HTSLIB_IMPORTED) - - # Set static for the linker so the compiler's libraries will be static - ## ADS: using this instead of forcing -static for everything avoids the - ## static linkage that Aiso warns against, but also means it's not 100% - ## static linked ADS: can't do this if the compiler is AppleClang because - ## they don't have the libc++.a and the libgcc wouldn't make sense anyway. - if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") # macOS - list(APPEND GLOBAL_LINKER_OPTIONS -static-libgcc -static-libstdc++) - endif() - -endif() - if(ENABLE_LTO) # Turn on LTO if we are building for distribution include(CheckIPOSupported) @@ -147,17 +57,12 @@ if(STATIC_ANALYSIS) endif() # ADS: set the most stringent warnings we can -list(APPEND GLOBAL_COMPILE_OPTIONS - -Wall -Wextra -Wpedantic -Werror -Wfatal-errors +add_compile_options( + -Wall + -Wextra + -Wpedantic + -Werror + -Wfatal-errors ) -if(STRIP_PATHS_FROM_BINARIES) - # This is set if we have configured to distribute - set(PREFIX_MAP_ARG -ffile-prefix-map=) - list(TRANSFORM STRIP_SUB_LIST PREPEND ${PREFIX_MAP_ARG}) - list(APPEND GLOBAL_COMPILE_OPTIONS ${STRIP_SUB_LIST}) -endif() - -message(STATUS "Finished global compile and linker configuration") - add_subdirectory(src) diff --git a/Makefile.am b/Makefile.am index f59f583f..6b0fa70c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -222,6 +222,7 @@ dnmtools_SOURCES += src/analysis/levels.cpp dnmtools_SOURCES += src/analysis/hypermr.cpp dnmtools_SOURCES += src/analysis/metagene.cpp dnmtools_SOURCES += src/analysis/autocorr.cpp +dnmtools_SOURCES += src/analysis/methsummary.cpp dnmtools_SOURCES += src/utils/clean-hairpins.cpp dnmtools_SOURCES += src/utils/guessprotocol.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 360aaf4c..14bb74f7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -43,6 +43,7 @@ target_link_libraries(dnmtools PUBLIC nanopore symmetric-cpgs levels + methsummary hmr hmr-rep hypermr diff --git a/src/abismal b/src/abismal index f108dcd3..f31459ac 160000 --- a/src/abismal +++ b/src/abismal @@ -1 +1 @@ -Subproject commit f108dcd35ef4a5ba446c0c7e769fca0059ee2bbc +Subproject commit f31459ac3155c8cbbf35ca747097086a7236a5bb diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index a9880589..78da4fed 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -89,7 +89,7 @@ find_first_epiread_ending_after_position(const string &query_chrom, backup_to_start_of_current_record(in); // we've hit the end of file without finding an epiread - if (low_pos == eof - 2) + if (low_pos + 2 == static_cast(eof)) return -1; if (!(in >> chrom >> start >> seq)) { diff --git a/src/analysis/levels.cpp b/src/analysis/levels.cpp index 3acf6699..780ee804 100644 --- a/src/analysis/levels.cpp +++ b/src/analysis/levels.cpp @@ -121,6 +121,9 @@ main_levels(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) const std::string meth_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ + if (relaxed_mode) + MSite::no_extra_fields = false; + if (!is_msite_file(meth_file)) throw std::runtime_error{"malformed counts file: " + meth_file}; diff --git a/src/analysis/methsummary.cpp b/src/analysis/methsummary.cpp new file mode 100644 index 00000000..ed97225a --- /dev/null +++ b/src/analysis/methsummary.cpp @@ -0,0 +1,1211 @@ +/* summary: does bsrates, levels, sym, states and xsym all in one + * + * Copyright (C) 2025 Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* ADS: This code writes MSite objects to files, but does not use + MSite to do it. Possiby dangerous, but currently much faster. If + MSite has a way to serialize into a char[] directly, then we should + use it. */ +// #include "MSite.hpp" +#include "LevelsCounter.hpp" +#include "bam_record_utils.hpp" +#include "bsutils.hpp" +#include "counts_header.hpp" + +#include "OptionParser.hpp" +#include "smithlab_os.hpp" + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct quick_buf : public std::ostringstream, + public std::basic_stringbuf { + // ADS: By user ecatmur on SO; very fast. Seems to work... + quick_buf() { + // ...but this seems to depend on data layout + static_cast &>(*this).rdbuf(this); + } + void + clear() { + // reset buffer pointers (member functions) + setp(pbase(), pbase()); + } + char const * + data() { + /* between c_str and insertion make sure to clear() */ + *pptr() = '\0'; + return pbase(); + } +}; + +static const char b2c[] = "TNGNNNCNNNNNNNNNNNNA"; // NOLINT + +static void +collect_cpgs(const std::string &s, std::vector &cpgs) { + cpgs.clear(); + const std::uint64_t lim = std::size(s) - 1; + for (auto i = 0u; i < lim; ++i) + if (is_cpg(s, i)) + cpgs.push_back(i); +} + +template +// constexpr // since C++20 +OutputIt +revcomp_copy(BidirIt first, BidirIt last, OutputIt d_first) { + for (; first != last; ++d_first) + *d_first = b2c[*(--last) - 'A']; // NOLINT + return d_first; +} + +static bool +convert_meth_states_pos(const std::vector &cpgs, + const bamxx::bam_header &hdr, const bamxx::bam_rec &aln, + std::uint64_t &first_cpg_index, std::string &states) { + states.clear(); + + const std::uint64_t seq_start = get_pos(aln); + const std::uint64_t width = rlen_from_cigar(aln); + const std::uint64_t seq_end = seq_start + width; + + std::string seq_str; + get_seq_str(aln, seq_str); + apply_cigar(aln, seq_str, 'N'); + + if (std::size(seq_str) != width) + throw std::runtime_error("bad sam record format: " + to_string(hdr, aln)); + + // get the first cpg site equal to or large than seq_start + auto cpg_itr = lower_bound(std::cbegin(cpgs), std::cend(cpgs), seq_start); + auto first_cpg_itr = std::cend(cpgs); + + if (cpg_itr == std::cend(cpgs)) + return false; + + for (; cpg_itr != std::cend(cpgs) && *cpg_itr < seq_end; cpg_itr++) { + const char x = seq_str[*cpg_itr - seq_start]; + states += (x == 'T') ? 'T' : ((x == 'C') ? 'C' : 'N'); + if (first_cpg_itr == std::cend(cpgs)) + first_cpg_itr = cpg_itr; + } + + if (first_cpg_itr != std::cend(cpgs)) + first_cpg_index = distance(std::cbegin(cpgs), first_cpg_itr); + + return states.find_first_of("CT") != std::string::npos; +} + +static bool +convert_meth_states_neg(const std::vector &cpgs, + const bamxx::bam_header &hdr, const bamxx::bam_rec &aln, + std::uint64_t &first_cpg_index, std::string &states) { + /* ADS: the "revcomp" on the read sequence is needed for the cigar + to be applied, since the cigar is relative to the genome + coordinates and not the read's sequence. But the read sequence + may is assumed to have been T-rich to begin with, so it becomes + A-rich. And the position of the C in the CpG becomes the G + position. + */ + + states.clear(); + + const std::uint64_t seq_start = get_pos(aln); + const std::uint64_t width = rlen_from_cigar(aln); + const std::uint64_t seq_end = seq_start + width; + + std::string orig_seq; + get_seq_str(aln, orig_seq); + + std::string seq_str; + seq_str.resize(orig_seq.size()); + revcomp_copy(std::cbegin(orig_seq), std::cend(orig_seq), std::begin(seq_str)); + apply_cigar(aln, seq_str, 'N'); + + if (seq_str.size() != width) + throw std::runtime_error("bad sam record format: " + to_string(hdr, aln)); + + // get the first cpg site equal to or large than seq_start - 1 + // the -1 is because we look for G in the read corresponding to a + // CpG in chromosome, which are indexed in cpgs based on the position of C + auto cpg_itr = lower_bound(std::cbegin(cpgs), std::cend(cpgs), + seq_start > 0 ? seq_start - 1 : 0); + auto first_cpg_itr = std::cend(cpgs); + + if (cpg_itr == std::cend(cpgs)) { + return false; + } + else { + for (; cpg_itr != std::cend(cpgs) && *cpg_itr < seq_end - 1; cpg_itr++) { + const char x = seq_str[*cpg_itr - seq_start + 1]; + states += (x == 'G') ? 'C' : ((x == 'A') ? 'T' : 'N'); + if (first_cpg_itr == std::cend(cpgs)) + first_cpg_itr = cpg_itr; + } + } + + if (first_cpg_itr != std::cend(cpgs)) { + first_cpg_index = distance(std::cbegin(cpgs), first_cpg_itr); + } + + return states.find_first_of("CT") != std::string::npos; +} + +struct bsrate_summary { + // converted_count_positive is the number of nucleotides covering a + // cytosine in the reference that show a thymine in the read, and + // for reads mapping to the positive strand. + std::uint64_t converted_count_positive{}; + // total_count_positive is the number of nucleotides covering a + // cytosine in the reference that show either a cytosine or a + // thymine in the read, and for reads mapping to the positive + // strand. + std::uint64_t total_count_positive{}; + // bisulfite_conversion_rate_positive is equal to + // converted_count_positive divided by total_count_positive, a value + // that is always between 0 and 1. When total_count_positive is 0, + // then bisulfite_conversion_rate_positive is given a value of 0. + double bisulfite_conversion_rate_positive{}; + + // converted_count_negative is the number of nucleotides covering a + // cytosine in the reference that show a thymine in the read, and + // for reads mapping to the negative strand. + std::uint64_t converted_count_negative{}; + // total_count_negative is the number of nucleotides covering a + // cytosine in the reference that show either a cytosine or a + // thymine in the read, and for reads mapping to the negative + // strand. + std::uint64_t total_count_negative{}; + // bisulfite_conversion_rate_negative is equal to + // converted_count_negative divided by total_count_negative, a value + // that is always between 0 and 1. When total_count_negative is 0, + // then bisulfite_conversion_rate_negative is given a value of 0. + double bisulfite_conversion_rate_negative{}; + + // converted_count is equal to the sum of converted_count_positive + // and converted_count_negative. + std::uint64_t converted_count{}; + // total_count is equal to the sum of total_count_positive and + // total_count_negative + std::uint64_t total_count{}; + // bisulfite_conversion_rate is equal to converted_count divided by + // total_count, a value that is always between 0 and 1. When + // total_count is 0, then bisulfite_conversion_rate is given a value + // of 0. + double bisulfite_conversion_rate{}; + + // error_count is the number of nucleotides covering a cytosine in + // the reference shows either an A or a G in the read. + std::uint64_t error_count{}; + // valid_count is the number of nucleotides covering a cytosine in + // the reference shows any nucleotide that is not an N in the read. + std::uint64_t valid_count{}; + // error_rate is equal to error_count divided by valid_count, and is + // a value that is always between 0 and 1. When valid_count is 0, + // then error_rate is given a value of 0. + double error_rate{}; + + void + update_pos(const char nt) { + if (nt == 'C' || nt == 'T') { + ++total_count_positive; + converted_count_positive += (nt == 'T'); + } + else if (nt != 'N') + ++error_count; + } + + void + update_neg(const char nt) { + if (nt == 'C' || nt == 'T') { + ++total_count_negative; + converted_count_negative += (nt == 'T'); + } + else if (nt != 'N') + ++error_count; + } + + bsrate_summary & + operator+=(const bsrate_summary &rhs) { + // ADS: the "rates" are set to 0.0 here to ensure that they are + // computed properly using the full data after any accumulation of + // integer values has completed. + converted_count_positive += rhs.converted_count_positive; + total_count_positive += rhs.total_count_positive; + bisulfite_conversion_rate_positive = 0.0; + + converted_count_negative += rhs.converted_count_negative; + total_count_negative += rhs.total_count_negative; + bisulfite_conversion_rate_negative = 0.0; + + converted_count += rhs.converted_count; + total_count += rhs.total_count; + bisulfite_conversion_rate = 0.0; + + error_count += rhs.error_count; + valid_count += rhs.valid_count; + error_rate = 0.0; + return *this; + } + + void + set_values() { + bisulfite_conversion_rate_positive = + static_cast(converted_count_positive) / + static_cast( + std::max(total_count_positive, static_cast(1))); + + bisulfite_conversion_rate_negative = + static_cast(converted_count_negative) / + static_cast( + std::max(total_count_negative, static_cast(1))); + + converted_count = converted_count_positive + converted_count_negative; + total_count = total_count_positive + total_count_negative; + + bisulfite_conversion_rate = + static_cast(converted_count) / + static_cast(std::max(total_count, static_cast(1))); + + valid_count = total_count + error_count; + + error_rate = + static_cast(error_count) / + static_cast(std::max(valid_count, static_cast(1))); + } + + std::string + tostring_as_row() const { + static constexpr auto precision_val = 5u; + std::ostringstream oss; + oss.precision(precision_val); + oss.setf(std::ios_base::fixed, std::ios_base::floatfield); + // clang-format off + oss << total_count_positive << '\t' + << converted_count_positive << '\t' + << bisulfite_conversion_rate_positive << '\t'; + oss << total_count_negative << '\t' + << converted_count_negative << '\t' + << bisulfite_conversion_rate_negative << '\t'; + oss << total_count << '\t' + << converted_count << '\t' + << bisulfite_conversion_rate << '\t'; + oss << error_count << '\t' + << valid_count << '\t' + << error_rate; + // clang-format on + return oss.str(); + } + + std::string + tostring_as_yaml_list(const std::uint32_t position) const { + static constexpr auto precision_val = 5u; + std::ostringstream oss; + oss.precision(precision_val); + oss.setf(std::ios_base::fixed, std::ios_base::floatfield); + // clang-format off + oss << " - base: " << position << '\n' + << " ptot: " << total_count_positive << '\n' + << " pconv: " << converted_count_positive << '\n' + << " prate: " << bisulfite_conversion_rate_positive << '\n' + << " ntot: " << total_count_negative << '\n' + << " nconv: " << converted_count_negative << '\n' + << " nrate: " << bisulfite_conversion_rate_negative << '\n' + << " bthtot: " << total_count << '\n' + << " bthconv: " << converted_count << '\n' + << " bthrate: " << bisulfite_conversion_rate << '\n' + << " err: " << error_count << '\n' + << " all: " << valid_count << '\n' + << " errrate: " << error_rate << '\n'; + // clang-format on + return oss.str(); + } + + std::string + tostring() const { + static constexpr auto sep = ": "; + std::ostringstream oss; + oss << "converted_count_positive" << sep << converted_count_positive + << '\n'; + + oss << "total_count_positive" << sep << total_count_positive << '\n'; + + oss << "bisulfite_conversion_rate_positive" << sep + << bisulfite_conversion_rate_positive << '\n'; + + oss << "converted_count_negative" << sep << converted_count_negative + << '\n'; + + oss << "total_count_negative" << sep << total_count_negative << '\n'; + + oss << "bisulfite_conversion_rate_negative" << sep + << bisulfite_conversion_rate_negative << '\n'; + + oss << "converted_count" << sep << converted_count << '\n'; + oss << "total_count" << sep << total_count << '\n'; + + oss << "bisulfite_conversion_rate" << sep << bisulfite_conversion_rate + << '\n'; + + oss << "error_count" << sep << error_count << '\n'; + oss << "valid_count" << sep << valid_count << '\n'; + oss << "error_rate" << sep << error_rate; + + return oss.str(); + } +}; + +inline bsrate_summary +operator+(bsrate_summary lhs, const bsrate_summary &rhs) { + lhs += rhs; + return lhs; +} + +static void +write_summary(const std::string &summary_file, + std::vector &summaries) { + auto s = reduce(std::cbegin(summaries), std::cend(summaries)); + s.set_values(); + + std::ofstream out(summary_file); + if (!out) + throw std::runtime_error("failed to open file: " + summary_file); + out << s.tostring() << '\n'; +} + +static std::pair +count_states_pos(const bool INCLUDE_CPGS, const std::string &chrom, + const bamxx::bam_rec &aln, + std::vector &summaries, std::size_t &hanging) { + // NOLINTBEGIN(*-pointer-arithmetic) + std::uint32_t n_conv = 0, n_uconv = 0; + + /* iterate through reference, query/read and fragment */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = + beg_cig + get_n_cigar(aln); // NOLINT(*-pointer-arithmetic) + auto rpos = get_pos(aln); + auto qpos = 0; + auto fpos = 0; + + const decltype(rpos) chrom_lim = + static_cast(std::size(chrom) - 1); + + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const auto op = bam_cigar_op(*c_itr); // NOLINT(*-narrowing-conversions) + const auto n = bam_cigar_oplen(*c_itr); + if (cigar_eats_ref(op) && cigar_eats_query(op)) { + const decltype(qpos) end_qpos = static_cast(qpos + n); + for (; qpos < end_qpos; ++qpos, ++rpos, ++fpos) { + if (rpos > chrom_lim) + ++hanging; + if (is_cytosine(chrom[rpos]) && + (rpos >= chrom_lim || !is_guanine(chrom[rpos + 1]) || + INCLUDE_CPGS)) { + const auto nt = seq_nt16_str[bam_seqi(seq, qpos)]; + summaries[fpos].update_pos(nt); + n_conv += (nt == 'T'); + n_uconv += (nt == 'C'); + } + } + } + else { + if (cigar_eats_query(op)) + qpos += n; // NOLINT + if (cigar_eats_ref(op)) + rpos += n; // NOLINT + if (cigar_eats_frag(op)) + fpos += n; // NOLINT + } + } + assert(qpos == get_l_qseq(aln)); + return {n_conv, n_conv + n_uconv}; + // NOLINTEND(*-pointer-arithmetic) +} + +static std::pair +count_states_neg(const bool INCLUDE_CPGS, const std::string &chrom, + const bamxx::bam_rec &aln, + std::vector &summaries, std::size_t &hanging) { + // NOLINTBEGIN(*-pointer-arithmetic) + std::uint32_t n_conv = 0, n_uconv = 0; + + /* iterate backward over query/read positions but forward over + reference and fragment positions */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + auto rpos = get_pos(aln); + auto qpos = get_l_qseq(aln); + auto fpos = 0; + + const decltype(rpos) chrom_lim = + static_cast(std::size(chrom) - 1); + + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const auto op = bam_cigar_op(*c_itr); + const auto n = bam_cigar_oplen(*c_itr); + if (cigar_eats_ref(op) && cigar_eats_query(op)) { + const decltype(qpos) end_qpos = static_cast(qpos - n); + for (; qpos > end_qpos; --qpos, ++rpos, ++fpos) { + if (rpos > chrom_lim) + ++hanging; + if (is_guanine(chrom[rpos]) && + (rpos == 0 || !is_cytosine(chrom[rpos - 1]) || INCLUDE_CPGS)) { + const auto nt = seq_nt16_str[bam_seqi(seq, qpos - 1)]; + summaries[fpos].update_neg(nt); + n_conv += (nt == 'T'); + n_uconv += (nt == 'C'); + } + } + } + else { + if (cigar_eats_query(op)) + qpos -= static_cast(n); + if (cigar_eats_ref(op)) + rpos += static_cast(n); + if (cigar_eats_frag(op)) + fpos += static_cast(n); + } + } + assert(qpos == 0); + return {n_conv, n_conv + n_uconv}; + // NOLINTEND(*-pointer-arithmetic) +} + +// ADS: we should never have to worry about coverage over > 32767 in +// any downstream analysis, so using "int16_t" here would allow to +// detect wrap around and report it as some kind of weird thing, maybe +// zeroing it and flagging the output. As it is now, if we really need +// >65535-fold coverage, we can make the change here. +typedef std::uint16_t count_type; + +static inline bool +eats_ref(const std::uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 2; +} + +static inline bool +eats_query(const std::uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 1; +} + +/* The three functions below here should probably be moved into + bsutils.hpp. I am not sure if the DDG function is needed, but it + seems like if one considers strand, and the CHH is not symmetric, + then one needs this. Also, Qiang should be consulted on this + because he spent much time thinking about it in the context of + plants. */ +static bool +is_chh(const std::string &s, std::size_t i) { + return (i < (s.length() - 2)) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && + !is_guanine(s[i + 2]); +} + +static bool +is_ddg(const std::string &s, std::size_t i) { + return (i < (s.length() - 2)) && !is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && is_guanine(s[i + 2]); +} + +static bool +is_c_at_g(const std::string &s, std::size_t i) { + return (i < (s.length() - 2)) && is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && !is_guanine(s[i + 1]) && + is_guanine(s[i + 2]); +} + +/* Right now the CountSet objects below are much larger than they need + to be, for the things we are computing. However, it's not clear + that the minimum information would really put the memory + requirement of the program into a more reasonable range, so keeping + all the information seems reasonable. */ +struct CountSet { + std::string + tostring() const { + std::ostringstream oss; + oss << pA << '\t' << pC << '\t' << pG << '\t' << pT << '\t' << nA << '\t' + << nC << '\t' << nG << '\t' << nT; + // << '\t' << N; /* not used */ + return oss.str(); + } + void + add_count_pos(const char x) { + if (x == 'T') + ++pT; // conditions ordered for efficiency + else if (x == 'C') + ++pC; + else if (x == 'G') + ++pG; + else if (x == 'A') + ++pA; + // else ++N; /* not used */ + } + void + add_count_neg(const char x) { + if (x == 'T') + ++nT; // conditions ordered for efficiency(??) + else if (x == 'C') + ++nC; + else if (x == 'G') + ++nG; + else if (x == 'A') + ++nA; + // else ++N; /* not used */ + } + count_type + pos_total() const { + return pA + pC + pG + pT; + } + count_type + neg_total() const { + return nA + nC + nG + nT; + } + + count_type + unconverted_cytosine() const { + return pC; + } + count_type + converted_cytosine() const { + return pT; + } + count_type + unconverted_guanine() const { + return nC; + } + count_type + converted_guanine() const { + return nT; + } + + count_type pA{0}, pC{0}, pG{0}, pT{0}; + count_type nA{0}, nC{0}, nG{0}, nT{0}; + // count_type N; /* this wasn't used and breaks alignment */ +}; + +/* The "tag" returned by this function should be exclusive, so that + * the order of checking conditions doesn't matter. There is also a + * bit of a hack in that the unsigned "pos" could wrap, but this still + * works as long as the chromosome size is not the maximum size of a + * std::size_t. + */ +static std::uint32_t +get_tag_from_genome(const std::string &s, const std::size_t pos) { + if (is_cytosine(s[pos])) { + if (is_cpg(s, pos)) + return 0; + else if (is_chh(s, pos)) + return 1; + else if (is_c_at_g(s, pos)) + return 2; + else + return 3; + } + if (is_guanine(s[pos])) { + if (is_cpg(s, pos - 1)) + return 0; + else if (is_ddg(s, pos - 2)) + return 1; + else if (is_c_at_g(s, pos - 2)) + return 2; + else + return 3; + } + return 4; // shouldn't be used for anything +} + +static inline bool +is_cpg(const std::uint32_t x) { + return x == 0; +} + +static inline bool +is_chh(const std::uint32_t x) { + return x == 1; +} + +static inline bool +is_cxg(const std::uint32_t x) { + return x == 2; +} + +static inline bool +is_ccg(const std::uint32_t x) { + return x == 3; +} + +/* This "has_mutated" function looks on the opposite strand to see + * if the apparent conversion from C->T was actually already in the + * DNA because of a mutation or SNP. + */ +static bool +has_mutated(const char base, const CountSet &cs) { + static const double MUTATION_DEFINING_FRACTION = 0.5; + return is_cytosine(base) + ? (cs.nG < MUTATION_DEFINING_FRACTION * (cs.neg_total())) + : (cs.pG < MUTATION_DEFINING_FRACTION * (cs.pos_total())); +} + +// NOLINTBEGIN(*-avoid-c-arrays) +static const char *const tag_values[] = { + "CpG", // 0 + "CHH", // 1 + "CXG", // 2 + "CCG", // 3 + "N", // 4 + "CpGx", // 5 <---- MUT_OFFSET + "CHHx", // 6 + "CXGx", // 7 + "CCGx", // 8 + "Nx" // 9 +}; +// NOLINTEND(*-avoid-c-arrays) +static constexpr std::uint32_t MUT_OFFSET = 5; + +static inline std::uint32_t +tag_with_mut(const std::uint32_t tag, const bool mut) { + return tag + (mut ? MUT_OFFSET : 0); +} + +// NOLINTBEGIN(cert-err58-cpp,*-avoid-non-const-global-variables) +LevelsCounter cpg("cpg"); +LevelsCounter cpg_symmetric("cpg_symmetric"); +LevelsCounter chh("chh"); +LevelsCounter cxg("cxg"); +LevelsCounter ccg("ccg"); +LevelsCounter cytosines("cytosines"); +// NOLINTEND(cert-err58-cpp,*-avoid-non-const-global-variables) + +static void +update_lc(const bool mut, const std::uint32_t n_reads, + const std::uint32_t n_meth, LevelsCounter &lc) { + static constexpr auto alpha = 0.05; + if (mut) { + ++lc.mutations; + } + if (n_reads > 0) { + ++lc.sites_covered; + lc.max_depth = std::max(lc.max_depth, static_cast(n_reads)); + lc.total_c += n_meth; + lc.total_t += n_reads - n_meth; + const double meth = n_meth / static_cast(n_reads); + lc.total_meth += meth; + double lower{}; + double upper{}; + wilson_ci_for_binomial(alpha, n_reads, meth, lower, upper); + lc.called_meth += (lower > 0.5); // NOLINT(*-avoid-magic-numbers) + lc.called_unmeth += (upper < 0.5); // NOLINT(*-avoid-magic-numbers) + } + ++lc.total_sites; +} + +static void +write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, + bamxx::bgzf_file &out_sym, const int32_t tid, + const std::string &chrom, const std::vector &counts) { + quick_buf buf; // keep underlying buffer space? + buf.clear(); + + buf << sam_hdr_tid2name(hdr, tid) << '\n'; + if (!out.write(buf.data(), buf.tellp())) + throw std::runtime_error("error writing output"); + + std::uint32_t prev_unconverted{}; + std::uint32_t prev_n_reads{}; + std::size_t prev_idx{}; + bool prev_is_c{}; + bool prev_mut{}; + std::uint32_t offset{}; + + for (std::size_t i = 0; i < counts.size(); ++i) { + const char base = chrom[i]; + if (is_cytosine(base) || is_guanine(base)) { + const std::uint32_t the_tag = get_tag_from_genome(chrom, i); + const bool is_c = is_cytosine(base); + const auto unconverted = is_c ? counts[i].unconverted_cytosine() + : counts[i].unconverted_guanine(); + const auto converted = + is_c ? counts[i].converted_cytosine() : counts[i].converted_guanine(); + const auto n_reads = static_cast(unconverted + converted); + + const bool mut = has_mutated(base, counts[i]); + + update_lc(mut, n_reads, unconverted, cytosines); + // cytosines.update(site); + + if (is_cpg(the_tag)) { + update_lc(mut, n_reads, unconverted, cpg); + if (prev_idx + 1 == i && !is_c && prev_is_c) + update_lc(mut || prev_mut, n_reads + prev_n_reads, + unconverted + prev_unconverted, cpg_symmetric); + } + else if (is_chh(the_tag)) + update_lc(mut, n_reads, unconverted, chh); + else if (is_ccg(the_tag)) + update_lc(mut, n_reads, unconverted, ccg); + else if (is_cxg(the_tag)) + update_lc(mut, n_reads, unconverted, cxg); + + if (n_reads > 0) { + buf.clear(); + // ADS: here is where we make an MSite, but not using MSite + buf << i - offset << '\t' << unconverted << '\t' << converted << '\n'; + if (!out.write(buf.data(), buf.tellp())) + throw std::runtime_error("error writing output"); + offset = i; + } + + if (is_cpg(the_tag) && !is_c && prev_is_c) { + buf.clear(); + // ADS: here is where we make an MSite, but not using MSite + + // NOLINTBEGIN(*-constant-array-index) + buf << sam_hdr_tid2name(hdr, tid) << '\t' << prev_idx << '\t' << '+' + << '\t' << tag_values[tag_with_mut(the_tag, mut || prev_mut)] + << '\t' + << ((n_reads + prev_n_reads) > 0 + ? (unconverted + prev_unconverted) / (n_reads + prev_n_reads) + : 0.0) + << '\t' << n_reads + prev_n_reads << '\n'; + // NOLINTEND(*-constant-array-index) + if (!out_sym.write(buf.data(), buf.tellp())) + throw std::runtime_error("error writing output"); + } + prev_n_reads = n_reads; + prev_unconverted = unconverted; + prev_idx = i; + prev_is_c = is_c; + prev_mut = mut; + } + } +} + +static void +count_states_pos(const bamxx::bam_rec &aln, std::vector &counts) { + /* Move through cigar, reference and read positions without + inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = + beg_cig + get_n_cigar(aln); // NOLINT(*-pointer-arithmetic) + auto rpos = get_pos(aln); + auto qpos = 0; // to match type with b->core.l_qseq + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); // NOLINT(*-narrowing-conversions) + const std::uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + const decltype(qpos) end_qpos = static_cast(qpos + n); + for (; qpos < end_qpos; ++qpos) { + // ADS: beware!!! bam_seqi is a macro, so no "qpos++" inside + // its arguments! Why macros?!?! Just make sure the compiler + // inliens it properly ffs! + counts[rpos++].add_count_pos(seq_nt16_str[bam_seqi(seq, qpos)]); + } + } + else if (eats_query(op)) { + qpos += n; // NOLINT + } + else if (eats_ref(op)) { + rpos += n; // NOLINT + } + } + // ADS: somehow previous code included a correction for rpos going + // past the end of the chromosome; this should result at least in a + // soft-clip by any mapper. I'm not checking it here as even if it + // happens I don't want to terminate. + assert(qpos == get_l_qseq(aln)); +} + +static void +count_states_neg(const bamxx::bam_rec &aln, std::vector &counts) { + /* Move through cigar, reference and (*backward*) through read + positions without inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = + beg_cig + get_n_cigar(aln); // NOLINT(*-pointer-arithmetic) + std::size_t rpos = get_pos(aln); + std::size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); // NOLINT(*-narrowing-conversions) + const std::uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + const std::size_t end_qpos = qpos - n; // to match type with qpos + for (; qpos > end_qpos; --qpos) // beware ++ in macro below!!! + counts[rpos++].add_count_neg(seq_nt16_str[bam_seqi(seq, qpos - 1)]); + } + else if (eats_query(op)) { + qpos -= n; // NOLINT + } + else if (eats_ref(op)) { + rpos += n; // NOLINT + } + } + /* qpos is unsigned; would wrap around if < 0 */ + // ADS: Same as count_states_pos; see comment there + assert(qpos == 0); +} + +static std::unordered_map +get_tid_to_idx( + const bamxx::bam_header &hdr, + const std::unordered_map &name_to_idx) { + std::unordered_map tid_to_idx; + for (int32_t i = 0; i < hdr.h->n_targets; ++i) { + // "curr_name" gives a "tid_to_name" mapping allowing to jump + // through "name_to_idx" and get "tid_to_idx" + const std::string curr_name( + hdr.h->target_name[i]); // NOLINT(*-pointer-arithmetic) + const auto name_itr(name_to_idx.find(curr_name)); + if (name_itr == std::cend(name_to_idx)) + throw std::runtime_error("failed to find chrom: " + curr_name); + tid_to_idx[i] = name_itr->second; + } + return tid_to_idx; +} + +template +static void +output_skipped_chromosome( + const int32_t tid, const std::unordered_map &tid_to_idx, + const bamxx::bam_header &hdr, + const std::vector::const_iterator chroms_beg, + const std::vector &chrom_sizes, std::vector &counts, + bamxx::bgzf_file &out, bamxx::bgzf_file &out_sym) { + // get the index of the next chrom sequence + const auto chrom_idx = tid_to_idx.find(tid); + if (chrom_idx == std::cend(tid_to_idx)) + throw std::runtime_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); + + const auto chrom_itr = + chroms_beg + chrom_idx->second; // NOLINT(*-narrowing-conversions) + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + + write_output(hdr, out, out_sym, tid, *chrom_itr, counts); +} + +static bool +consistent_targets(const bamxx::bam_header &hdr, + const std::unordered_map &tid_to_idx, + const std::vector &names, + const std::vector &sizes) { + const std::int32_t n_targets = hdr.h->n_targets; + if (n_targets != static_cast(std::size(names))) + return false; + + for (std::int32_t tid = 0; tid < n_targets; ++tid) { + const std::string tid_name_sam = sam_hdr_tid2name(hdr, tid); // NOLINT + const std::size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); // NOLINT + const auto idx_itr = tid_to_idx.find(tid); + if (idx_itr == std::cend(tid_to_idx)) + return false; + const auto idx = idx_itr->second; + if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) + return false; + } + return true; +} + +static void +process_reads(const bool VERBOSE, const bool show_progress, + const bool compress_output, const bool include_header, + const std::int32_t n_threads, const std::string &infile, + const std::string &chroms_file, const std::string &counts_outfile, + const std::string &sym_outfile, const std::string &levels_outfile, + const std::string &bsrate_outfile, + const std::string &states_outfile) { + // assumed maximum length of a fragment for bsrate + static constexpr const std::size_t output_size = 10000; + std::size_t hanging = 0; + + // first get the chromosome names and sequences from the FASTA file + std::vector chroms, names; + read_fasta_file_short_names(chroms_file, names, chroms); + for (auto &i : chroms) + transform(std::cbegin(i), std::cend(i), std::begin(i), + [](const char c) { return std::toupper(c); }); + if (VERBOSE) + std::cerr << "[n chroms in reference: " << chroms.size() << "]\n"; + const auto chroms_beg = cbegin(chroms); + + std::unordered_map name_to_idx; + std::vector chrom_sizes(chroms.size(), 0); + for (std::size_t i = 0; i < chroms.size(); ++i) { + name_to_idx[names[i]] = i; + chrom_sizes[i] = chroms[i].size(); + } + + std::vector summaries(output_size); + + bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts + + // open the hts SAM/BAM input file and get the header + bamxx::bam_in hts(infile); + if (!hts) + throw std::runtime_error("failed to open input file"); + // load the input file's header + bamxx::bam_header hdr(hts); + if (!hdr) + throw std::runtime_error("failed to read header"); + + std::unordered_map tid_to_idx = + get_tid_to_idx(hdr, name_to_idx); + + if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + throw std::runtime_error("inconsistent reference genome information"); + + // open the output file + const std::string output_mode = compress_output ? "w" : "wu"; + bamxx::bgzf_file out(counts_outfile, output_mode); + if (!out) + throw std::runtime_error("error opening output file: " + counts_outfile); + + // open the sym output file + bamxx::bgzf_file out_sym(sym_outfile, output_mode); + if (!out_sym) + throw std::runtime_error("error opening output file: " + sym_outfile); + + quick_buf buf; // keep underlying buffer space? + buf.clear(); + + // open the states output file + bamxx::bgzf_file out_states(states_outfile, output_mode); + if (!out_states) + throw std::runtime_error("error opening output file: " + states_outfile); + + // set the threads for the input file decompression + if (n_threads > 1) { + tp.set_io(hts); + tp.set_io(out); + tp.set_io(out_sym); + } + + if (include_header) + write_counts_header_from_bam_header(hdr, out); + + // now iterate over the reads, switching chromosomes and writing + // output as needed + bamxx::bam_rec aln; + int32_t prev_tid = -1; + + // this is where all the counts are accumulated + std::vector counts; + std::vector cpgs; + + std::vector::const_iterator chrom_itr; + + while (hts.read(hdr, aln)) { + if (get_tid(aln) == prev_tid) { + // do the work for this mapped read, depending on strand + if (bam_is_rev(aln)) + count_states_neg(aln, counts); + else + count_states_pos(aln, counts); + } + else { // chrom changes, output results, get the next one + // write output if there is any; should fail only once + if (!counts.empty()) + write_output(hdr, out, out_sym, prev_tid, *chrom_itr, counts); + + const int32_t tid = get_tid(aln); + if (tid < prev_tid) + throw std::runtime_error("SAM file is not sorted"); + + for (auto i = prev_tid + 1; i < tid; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, + counts, out, out_sym); + + // get the next chrom to process + auto chrom_idx(tid_to_idx.find(tid)); + if (chrom_idx == std::cend(tid_to_idx)) + throw std::runtime_error("chromosome not found: " + + std::string(sam_hdr_tid2name(hdr, tid))); + if (show_progress) + std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << '\n'; + + prev_tid = tid; + chrom_itr = + chroms_beg + chrom_idx->second; // NOLINT(*-narrowing-conversions) + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + // use_this_chrom = seq_to_use.empty() || chrom_idx == chrom_idx_to_use; + + collect_cpgs(*chrom_itr, cpgs); + } + + static constexpr auto use_this_chrom = true; + + if (use_this_chrom) { + static constexpr auto reads_are_a_rich = false; + // do the work for the current mapped read + if (reads_are_a_rich) + flip_conversion(aln); + static constexpr auto INCLUDE_CPGS = false; + if (bam_is_rev(aln)) + count_states_neg(INCLUDE_CPGS, *chrom_itr, aln, summaries, hanging); + else + count_states_pos(INCLUDE_CPGS, *chrom_itr, aln, summaries, hanging); + } + + std::uint64_t first_cpg_index = std::numeric_limits::max(); + std::string seq; + + const bool has_cpgs = + bam_is_rev(aln) + ? convert_meth_states_neg(cpgs, hdr, aln, first_cpg_index, seq) + : convert_meth_states_pos(cpgs, hdr, aln, first_cpg_index, seq); + + if (has_cpgs) { + buf.clear(); + buf << sam_hdr_tid2name(hdr, aln) << '\t' << first_cpg_index << '\t' + << seq << '\n'; + if (!out_states.write(buf.data(), buf.tellp())) { + throw std::runtime_error{"failure writing states output"}; + } + } + } + if (!counts.empty()) + write_output(hdr, out, out_sym, prev_tid, *chrom_itr, counts); + + // ADS: if some chroms might not be covered by reads, we have to + // iterate over what remains + for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, + counts, out, out_sym); + + // before writing the output we need to ensure the derived + // quantities in bsrate_summary have been calculated + std::for_each(std::begin(summaries), std::end(summaries), + [](bsrate_summary &s) { s.set_values(); }); + write_summary(bsrate_outfile, summaries); + + std::ofstream out_levels(levels_outfile); + if (!out_levels) + throw std::runtime_error{"bad output file: " + levels_outfile}; + out_levels << cytosines << '\n' + << cpg << '\n' + << cpg_symmetric << '\n' + << chh << '\n' + << ccg << '\n' + << cxg << '\n'; +} + +int +main_summary(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) + try { + bool VERBOSE = false; + bool show_progress = false; + bool require_covered = false; + bool compress_output = false; + bool include_header = false; + + std::string chroms_file; + std::string levels_outfile; + std::string bsrate_outfile; + std::string counts_outfile; + std::string sym_outfile; + std::string states_outfile; + std::int32_t n_threads = 1; + + /****************** COMMAND LINE OPTIONS ********************/ + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + "do bsrate, levels, sym, xsym, xcouns and states", + "-c "); + opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, + n_threads); + opt_parse.add_opt("levels-out", '\0', "levels output file", true, + levels_outfile); + opt_parse.add_opt("bsrate-out", '\0', "bsrate output file", true, + bsrate_outfile); + opt_parse.add_opt("counts-out", '\0', "counts output file", true, + counts_outfile); + opt_parse.add_opt("sym-out", '\0', "sym output file", true, sym_outfile); + opt_parse.add_opt("states-out", '\0', "states output file", true, + states_outfile); + opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", + true, chroms_file); + opt_parse.add_opt("require-covered", 'r', "only output covered sites", + false, require_covered); + opt_parse.add_opt("header", 'H', "add a header to identify the reference", + false, include_header); + opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); + opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); + opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + std::vector leftover_args; + opt_parse.parse(argc, argv, leftover_args); + if (opt_parse.about_requested() || opt_parse.help_requested() || + leftover_args.empty()) { + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; + return EXIT_SUCCESS; + } + if (opt_parse.option_missing()) { + std::cerr << opt_parse.option_missing_message() << '\n'; + return EXIT_SUCCESS; + } + const std::string mapped_reads_file = leftover_args.front(); + /****************** END COMMAND LINE OPTIONS *****************/ + + if (n_threads < 0) + throw std::runtime_error("thread count cannot be negative"); + + std::ostringstream cmd; + std::copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); + + if (VERBOSE) + std::cerr << "[input BAM/SAM file: " << mapped_reads_file << "]\n" + << "[counts file: " << counts_outfile << "]\n" + << "[sym file: " << sym_outfile << "]\n" + << "[bsrate file: " << bsrate_outfile << "]\n" + << "[levels file: " << levels_outfile << "]\n" + << "[states file: " << states_outfile << "]\n" + << "[genome file: " << chroms_file << "]\n" + << "[threads requested: " << n_threads << "]\n" + << "[command line: \"" << cmd.str() << "\"]\n"; + + process_reads(VERBOSE, show_progress, compress_output, include_header, + n_threads, mapped_reads_file, chroms_file, counts_outfile, + sym_outfile, levels_outfile, bsrate_outfile, states_outfile); + } + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 29a3438f..567800c6 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -49,7 +49,37 @@ #include #include -// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-pro-bounds-constant-array-index,*-pro-bounds-pointer-arithmetic) +// NOLINTBEGIN(*-narrowing-conversions,*-pointer-arithmetic) + +// clang-format off +// NOLINTBEGIN(*-avoid-c-arrays) +static constexpr std::uint8_t encoding[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 128 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 144 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 160 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 176 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 192 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 208 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 224 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 +}; +// NOLINTEND(*-avoid-c-arrays) +static constexpr auto n_dinucs = 16u; +static const auto dinucs = std::vector{ // NOLINT(cert-err58-cpp) + "AA", "AC", "AG", "AT", + "CA", "CC", "CG", "CT", + "GA", "GC", "GG", "GT", + "TA", "TC", "TG", "TT", +}; +// clang-format on [[nodiscard]] inline bool is_cytosine(const char c) { @@ -263,13 +293,15 @@ enum class missing_code : std::uint8_t { unknown = 1, }; -static const char *tag_values[] = { +// NOLINTBEGIN(*-avoid-c-arrays) +static const char *const tag_values[] = { "CpG", // 0 "CHH", // 1 "CXG", // 2 "CCG", // 3 "N", // 4 }; +// NOLINTEND(*-avoid-c-arrays) template [[nodiscard]] static std::tuple @@ -326,20 +358,25 @@ struct prob_counter { static constexpr auto n_values = 256; std::array meth_hist{}; std::array hydro_hist{}; - std::string + + [[nodiscard]] std::string json() const { std::ostringstream oss; oss << R"({"methyl_hist":[)"; - for (auto i = 0; i < n_values; ++i) { - if (i > 0) + bool first_iter = true; + for (const auto i : meth_hist) { + if (!first_iter) oss << ','; - oss << R"(")" << meth_hist[i] << R"(")"; + oss << R"(")" << i << R"(")"; + first_iter = false; } oss << R"(],"hydroxy_hist":[)"; - for (auto i = 0; i < n_values; ++i) { - if (i > 0) + first_iter = true; + for (const auto i : hydro_hist) { + if (!first_iter) oss << ','; - oss << R"(")" << hydro_hist[i] << R"(")"; + oss << R"(")" << i << R"(")"; + first_iter = false; } oss << R"(]})"; return oss.str(); @@ -420,11 +457,11 @@ struct mod_prob_buffer { if (delta == 0) { const std::uint8_t m_val = bam_auxB2i(mod_prob, methyl_prob_idx++); methyl_probs[i] = m_val; - pc.meth_hist[m_val]++; + pc.meth_hist[m_val]++; // NOLINT(*-constant-array-index) const std::uint8_t h_val = bam_auxB2i(mod_prob, hydroxy_prob_idx++); hydroxy_probs[i] = h_val; - pc.hydro_hist[h_val]++; + pc.hydro_hist[h_val]++; // NOLINT(*-constant-array-index) } // ADS: add 'else' to use '?' and '.' properly --delta; @@ -441,11 +478,11 @@ struct mod_prob_buffer { if (delta == 0) { const std::uint8_t m_val = bam_auxB2i(mod_prob, methyl_prob_idx++); methyl_probs[i] = m_val; - pc.meth_hist[m_val]++; + pc.meth_hist[m_val]++; // NOLINT(*-constant-array-index) const std::uint8_t h_val = bam_auxB2i(mod_prob, hydroxy_prob_idx++); hydroxy_probs[i] = h_val; - pc.hydro_hist[h_val]++; + pc.hydro_hist[h_val]++; // NOLINT(*-constant-array-index) } // ADS: add 'else' to use '?' and '.' properly --delta; @@ -458,34 +495,6 @@ struct mod_prob_buffer { } }; -// clang-format off -static const std::uint8_t encoding[] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 128 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 144 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 160 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 176 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 192 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 208 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 224 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 -}; -static constexpr auto n_dinucs = 16u; -static const auto dinucs = std::vector{ // NOLINT(cert-err58-cpp) - "AA", "AC", "AG", "AT", - "CA", "CC", "CG", "CT", - "GA", "GC", "GG", "GT", - "TA", "TC", "TG", "TT", -}; -// clang-format on - struct match_counter { static constexpr auto dinuc_combos = 256; typedef std::array container; @@ -495,25 +504,39 @@ struct match_counter { void add_fwd(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, const std::uint8_t q2) { + constexpr auto get_idx = [](const auto a, const auto b, const auto c, + const auto d) { + return (a << 6) | (b << 4) | (c << 2) | d; // NOLINT(*-magic-numbers) + }; + // NOLINTBEGIN(*-constant-array-index) const auto r1e = encoding[r1]; const auto r2e = encoding[r2]; const auto q1e = encoding[q1]; const auto q2e = encoding[q2]; if ((r1e | r2e | q1e | q2e) & 4u) return; - pos[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + pos[get_idx(r1e, r2e, q1e, q2e)]++; + // NOLINTEND(*-constant-array-index) + // pos[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; } void add_rev(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, const std::uint8_t q2) { + constexpr auto get_idx = [](const auto a, const auto b, const auto c, + const auto d) { + return (a << 6) | (b << 4) | (c << 2) | d; // NOLINT(*-magic-numbers) + }; + // NOLINTBEGIN(*-constant-array-index) const auto r1e = encoding[r1]; const auto r2e = encoding[r2]; const auto q1e = encoding[q1]; const auto q2e = encoding[q2]; if ((r1e | r2e | q1e | q2e) & 4u) return; - neg[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + neg[get_idx(r1e, r2e, q1e, q2e)]++; + // neg[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + // NOLINTEND(*-constant-array-index) } [[nodiscard]] std::string @@ -901,7 +924,7 @@ struct read_processor { out_fmt, chrom_posn, is_c ? '+' : '-', - tag_values[the_tag], + tag_values[the_tag], // NOLINT mods, n_reads, hydroxy, @@ -1319,4 +1342,4 @@ main_nanocount(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) return EXIT_SUCCESS; } -// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-pro-bounds-constant-array-index,*-pro-bounds-pointer-arithmetic) +// NOLINTEND(*-narrowing-conversions,*-pointer-arithmetic) diff --git a/src/bamxx b/src/bamxx index fcd7066a..684f4cd6 160000 --- a/src/bamxx +++ b/src/bamxx @@ -1 +1 @@ -Subproject commit fcd7066a1834fd926b8503511f5382a0ac87b059 +Subproject commit 684f4cd6956b384632a4b4778ffc57d256c46a9a diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 48d14da3..327154b0 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2022-2024 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2022-2025 Andrew D. Smith and Guilherme Sena * * Authors: Andrew D. Smith and Guilherme Sena * @@ -151,6 +151,8 @@ int main_recovered(int argc, char *argv[]); int kmersites(int argc, char *argv[]); +int +main_summary(int argc, char *argv[]); // NOLINTEND(*-avoid-c-arrays) void @@ -176,62 +178,64 @@ main(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { // clang-format off vector>> command_groups = { -{{"mapping", - {{{"abismal", "map FASTQ reads to a FASTA reference genome or an index", abismal}, - {"abismalidx", "convert a FASTA reference genome to an abismal index", abismalidx}, - {"simreads", "simulate reads in a FASTA reference genome", simreads}}}}, + {{"mapping", + {{{"abismal", "map FASTQ reads to a FASTA reference genome or an index", abismal}, + {"abismalidx", "convert a FASTA reference genome to an abismal index", abismalidx}, + {"simreads", "simulate reads in a FASTA reference genome", simreads}}}}, -{"methylome construction", - {{{"format", "convert SAM/BAM mapped bs-seq reads to standard dnmtools format", main_format}, - {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, - {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, - {"counts", "get methylation levels from mapped WGBS reads", main_counts}, - {"counts-nano", "get methylation levels from mapped nanopore reads", main_nanocount}, - {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, - {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, + {"methylome construction", + {{{"format", "convert SAM/BAM mapped bs-seq reads to standard dnmtools format", main_format}, + {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, + {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, + {"counts", "get methylation levels from mapped WGBS reads", main_counts}, + {"counts-nano", "get methylation levels from mapped nanopore reads", main_nanocount}, + {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, + {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, -{"methylome analysis", - {{{"hmr", "identify hypomethylated regions", main_hmr}, - {"hmr-rep", "identify hypomethylated regions in a set of replicate methylomes", main_hmr_rep}, - {"hypermr", "identify hypermethylated regions in plant methylomes", main_hypermr}, - {"entropy", "compute methylation entropy in sliding window", main_methentropy}, - {"pmd", "identify partially methylated domains", main_pmd}, - {"roi", "get average CpG methylation in each of a set of genomic interval", main_roimethstat}, - {"autocorr", "computes statistics on the autocorrelation of methylation levels", main_autocorr}, - {"cpgbins", "get average CpG methylation in genomic bins", main_cpgbins}, - {"multistat", "same as roi except for multiple samples/input files", main_multimethstat}, - {"mlml", "program to estimate hydroxymethylation levels", main_mlml}}}}, + {"methylome analysis", + {{{"hmr", "identify hypomethylated regions", main_hmr}, + {"hmr-rep", "identify hypomethylated regions in a set of replicate methylomes", main_hmr_rep}, + {"hypermr", "identify hypermethylated regions in plant methylomes", main_hypermr}, + {"entropy", "compute methylation entropy in sliding window", main_methentropy}, + {"pmd", "identify partially methylated domains", main_pmd}, + {"roi", "get average CpG methylation in each of a set of genomic interval", main_roimethstat}, + {"autocorr", "computes statistics on the autocorrelation of methylation levels", main_autocorr}, + {"cpgbins", "get average CpG methylation in genomic bins", main_cpgbins}, + {"multistat", "same as roi except for multiple samples/input files", main_multimethstat}, + {"mlml", "program to estimate hydroxymethylation levels", main_mlml}}}}, -{"allele-specific methylation (ASM)", - {{{"states", "convert reads in SAM format into methylation states at CpGs", main_methstates}, - {"allelic", "get probability of ASM for each pair of neighboring CpGs", main_allelicmeth}, - {"amrfinder", "identify regions of ASM in the genome", main_amrfinder}, - {"amrtester", "test a set of genomic intervals for ASM", main_amrtester}}}}, + {"allele-specific methylation (ASM)", + {{{"states", "convert reads in SAM format into methylation states at CpGs", main_methstates}, + {"allelic", "get probability of ASM for each pair of neighboring CpGs", main_allelicmeth}, + {"amrfinder", "identify regions of ASM in the genome", main_amrfinder}, + {"amrtester", "test a set of genomic intervals for ASM", main_amrtester}}}}, -{"differential methylation (DM)", - {{{"dmr", "identify DMRs from genomic intervals and single-CpG DM probabilities", main_dmr}, - {"diff", "compute single-CpG DM probability between two methylomes", main_methdiff}, - {"radmeth", "compute DM probabilities for each CpG using multiple methylomes", main_radmeth}, - {"radmeth-nano", "radmeth designed for nanopore data", main_radmeth_nano}, - {"radadjust", "adjust p-values from radmeth output", main_radmeth_adjust}, - {"radmerge", "merge significant CpGs in radmeth output", main_radmeth_merge}}}}, + {"differential methylation (DM)", + {{{"dmr", "identify DMRs from genomic intervals and single-CpG DM probabilities", main_dmr}, + {"diff", "compute single-CpG DM probability between two methylomes", main_methdiff}, + {"radmeth", "compute DM probabilities for each CpG using multiple methylomes", main_radmeth}, + {"radmeth-nano", "radmeth designed for nanopore data", main_radmeth_nano}, + {"radadjust", "adjust p-values from radmeth output", main_radmeth_adjust}, + {"radmerge", "merge significant CpGs in radmeth output", main_radmeth_merge}}}}, -{"methylation visualization", - {{{"fastlift", "liftover methylation levels between species", main_fast_liftover}, - {"metagene", "summarize methylation around genomic features", metagene}, - {"liftfilter", "filter CpGs that are not CpGs in the target genome", main_lift_filter}}}}, + {"methylation visualization", + {{{"fastlift", "liftover methylation levels between species", main_fast_liftover}, + {"metagene", "summarize methylation around genomic features", metagene}, + {"liftfilter", "filter CpGs that are not CpGs in the target genome", main_lift_filter}}}}, -{"utilities", - {{{"cleanhp", "fix and stat invdup/hairpin reads", main_clean_hairpins}, - {"guessprotocol", "guess whether protocol is ordinary, pbat or random", main_guessprotocol}, - {"merge-bsrate", "merge bisulfite conversion rates files from bsrate", main_merge_bsrate}, - {"merge", "merge multiple counts files into a counts file or a table", main_merge_methcounts}, - {"covered", "filter a counts file for only covered sites", main_covered}, - {"recovered", "replace missing sites in a counts file", main_recovered}, - {"xcounts", "compress counts files by removing information", main_xcounts}, - {"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts}, - {"selectsites", "sites inside a set of genomic intervals", main_selectsites}, - {"kmersites", "make track file for sites matching kmer", kmersites}}}}}}; + {"utilities", + {{{"cleanhp", "fix and stat invdup/hairpin reads", main_clean_hairpins}, + {"guessprotocol", "guess whether protocol is ordinary, pbat or random", main_guessprotocol}, + {"merge-bsrate", "merge bisulfite conversion rates files from bsrate", main_merge_bsrate}, + {"merge", "merge multiple counts files into a counts file or a table", main_merge_methcounts}, + {"covered", "filter a counts file for only covered sites", main_covered}, + {"recovered", "replace missing sites in a counts file", main_recovered}, + {"xcounts", "compress counts files by removing information", main_xcounts}, + {"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts}, + {"selectsites", "sites inside a set of genomic intervals", main_selectsites}, + {"kmersites", "make track file for sites matching kmer", kmersites}, + {"many", "bsrate, levels, sym, xsym, xcounts and states at once", main_summary}}}}}, + }; // clang-format on if (argc < 2) { diff --git a/src/smithlab_cpp b/src/smithlab_cpp index 15cdbcb0..1318c796 160000 --- a/src/smithlab_cpp +++ b/src/smithlab_cpp @@ -1 +1 @@ -Subproject commit 15cdbcb07480ed36b5f2f511043aecae972ad0bf +Subproject commit 1318c796b96402823a60636eb0f8135b65527de6 diff --git a/src/utils/format-reads.cpp b/src/utils/format-reads.cpp index dfd7a94a..aa19e5dd 100644 --- a/src/utils/format-reads.cpp +++ b/src/utils/format-reads.cpp @@ -341,10 +341,10 @@ swap(bamxx::bam_rec &a, bamxx::bam_rec &b) noexcept { } static void -format(const std::string &cmd, const std::int32_t n_threads, - const std::string &inputfile, const std::string &outfile, - const bool bam_format, const std::string &input_format, - const std::size_t suff_len, const std::int32_t max_frag_len) { +format_reads(const std::string &cmd, const std::int32_t n_threads, + const std::string &inputfile, const std::string &outfile, + const bool bam_format, const std::string &input_format, + const std::size_t suff_len, const std::int32_t max_frag_len) { static const dnmt_error bam_write_err{"error writing bam"}; bamxx::bam_tpool tp(n_threads); @@ -573,8 +573,8 @@ main_format(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) if (verbose && !single_end) std::cerr << "[readname suffix length: " << suff_len << "]" << '\n'; - format(command, n_threads, infile, outfile, bam_format, input_format, - suff_len, max_frag_len); + format_reads(command, n_threads, infile, outfile, bam_format, input_format, + suff_len, max_frag_len); } catch (const std::exception &e) { std::cerr << e.what() << '\n'; diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index 0caf2b69..57e14460 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -37,19 +37,6 @@ #include #include -using std::cerr; -using std::cout; -using std::endl; -using std::runtime_error; -using std::string; -using std::unordered_set; - -static inline bool -found_symmetric(const MSite &prev, const MSite &curr) { - // assumes check for CpG already done - return (prev.pos + 1 == curr.pos && prev.strand == '+' && curr.strand == '-'); -} - static inline void ensure_positive_cpg(MSite &s) { s.pos -= (s.strand == '-'); @@ -61,7 +48,7 @@ static std::tuple get_first_site(T &in, T &out) { bool prev_is_cpg = false; MSite prev_site; - string line; + std::string line; bool within_header = true; while (within_header && getline(in, line)) { if (is_counts_header_line(line)) { @@ -86,7 +73,7 @@ process_sites(const bool verbose, T &in, T &out) { auto [prev_site, prev_is_cpg] = get_first_site(in, out); // setup to verfiy that chromosomes are together - unordered_set chroms_seen; + std::unordered_set chroms_seen; chroms_seen.insert(prev_site.chrom); bool sites_are_sorted = true; @@ -107,7 +94,7 @@ process_sites(const bool verbose, T &in, T &out) { } else { if (verbose) - cerr << "processing: " << curr_site.chrom << '\n'; + std::cerr << "processing: " << curr_site.chrom << '\n'; if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) return false; chroms_seen.insert(curr_site.chrom); @@ -132,13 +119,13 @@ int main_symmetric_cpgs(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { // file types from HTSlib use "-" for the filename to go to stdout - string outfile{"-"}; + std::string outfile{"-"}; bool verbose = false; bool compress_output = false; bool allow_extra_fields = false; int32_t n_threads = 1; - const string description = + const std::string description = "Get CpG sites and make methylation levels symmetric."; /****************** COMMAND LINE OPTIONS ********************/ @@ -151,44 +138,44 @@ main_symmetric_cpgs(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) opt_parse.add_opt("relaxed", '\0', "allow extra fields in input", false, allow_extra_fields); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); - std::vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << '\n' - << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << '\n'; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << '\n'; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << '\n'; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string filename(leftover_args.front()); + const std::string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ MSite::no_extra_fields = (allow_extra_fields == false); if (n_threads <= 0) - throw runtime_error("threads must be positive"); + throw std::runtime_error("threads must be positive"); bamxx::bam_tpool tp(n_threads); // const bool show_progress = VERBOSE && isatty(fileno(stderr)); bamxx::bgzf_file in(filename, "r"); if (!in) - throw runtime_error("could not open file: " + filename); + throw std::runtime_error("could not open file: " + filename); // open the output file - const string output_mode = compress_output ? "w" : "wu"; + const std::string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); if (!out) - throw runtime_error("error opening output file: " + outfile); + throw std::runtime_error("error opening output file: " + outfile); if (n_threads > 1) { if (in.is_bgzf()) @@ -199,7 +186,7 @@ main_symmetric_cpgs(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) const bool sites_are_sorted = process_sites(verbose, in, out); if (!sites_are_sorted) { - cerr << "sites are not sorted in: " << filename << '\n'; + std::cerr << "sites are not sorted in: " << filename << '\n'; const std::filesystem::path outpath{outfile}; if (std::filesystem::exists(outpath) && !std::filesystem::remove(outpath)) throw std::runtime_error("failed to remove file: " + outpath.string()); @@ -207,7 +194,7 @@ main_symmetric_cpgs(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) } } catch (const std::exception &e) { - cerr << e.what() << '\n'; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS;