From 4665d2663c788cf4d8ac772ed00514c854c739e0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 12 May 2025 16:59:08 -0700 Subject: [PATCH 1/4] src/analysis/methcounts.cpp: reporting tid along with error message for incorrect chrom sorting and skipping any unmapped reads which are identified by get_tid returning -1 --- src/analysis/methcounts.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index 4eab69e6..0fe72620 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -446,22 +446,30 @@ process_reads(const bool VERBOSE, const bool show_progress, vector::const_iterator chrom_itr; while (hts.read(hdr, aln)) { - if (get_tid(aln) == prev_tid) { + const int32_t tid = get_tid(aln); + if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped + continue; + if (tid == prev_tid) { // do the work for this mapped read, depending on strand if (bam_is_rev(aln)) count_states_neg(aln, counts); else count_states_pos(aln, counts); } - else { // chrom changes, output results, get the next one + else { // chrom has changed, so output results and get the next chrom - // write output if there is any; should fail only once + // write output if there is any; counts is empty only for first chrom if (!counts.empty()) write_output(hdr, out, prev_tid, *chrom_itr, counts, CPG_ONLY); - - const int32_t tid = get_tid(aln); - if (tid < prev_tid) throw dnmt_error("SAM file is not sorted"); + // make sure reads are sorted chrom tid number in header + if (tid < prev_tid) { + const std::string message = "SAM file is not sorted " + "previous tid: " + + std::to_string(prev_tid) + + " current tid: " + std::to_string(tid); + throw dnmt_error(message); + } if (!require_covered) for (auto i = prev_tid + 1; i < tid; ++i) From 58a0c01b6eb67c46ee4d59bbac353fc3e870f645 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 12 May 2025 17:31:27 -0700 Subject: [PATCH 2/4] src/analysis/bsrate.cpp: skipping any reads in the input that are not mapped as indicated by a -1 for tid obtained by get_tid --- src/analysis/bsrate.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/analysis/bsrate.cpp b/src/analysis/bsrate.cpp index 776077bb..fd058a12 100644 --- a/src/analysis/bsrate.cpp +++ b/src/analysis/bsrate.cpp @@ -580,12 +580,14 @@ main_bsrate(int argc, const char **argv) { unordered_set chroms_seen; while (hts.read(hdr, aln)) { + const int32_t the_tid = get_tid(aln); + if (the_tid == -1) + continue; + if (reads_are_a_rich) flip_conversion(aln); // get the correct chrom if it has changed - if (get_tid(aln) != current_tid) { - const int32_t the_tid = get_tid(aln); - + if (the_tid != current_tid) { // make sure all reads from same chrom are contiguous in the file if (chroms_seen.find(the_tid) != end(chroms_seen)) throw runtime_error("chroms out of order in mapped reads file"); From e71a8117ce80be3e646646482baeb4b9fd1e2de6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 12 May 2025 17:32:30 -0700 Subject: [PATCH 3/4] src/utils/format-reads.cpp: when processing reads to format, but not to check read names for a suffix to ignore, if the tid is found to be negative ignore the read as it is unmapped --- src/utils/format-reads.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/utils/format-reads.cpp b/src/utils/format-reads.cpp index ba110a40..c8ba480f 100644 --- a/src/utils/format-reads.cpp +++ b/src/utils/format-reads.cpp @@ -341,6 +341,10 @@ format(const string &cmd, const size_t n_threads, const string &inputfile, swap(aln, prev_aln); // start with prev_aln being first read while (hts.read(hdr, aln)) { + // ADS: skip reads that have no tid -- they are not mapped + if (get_tid(aln) == -1) + continue; + standardize_format(input_format, aln); if (same_name(prev_aln, aln, suff_len)) { // below: essentially check for dovetail From 98c8dec6166ec152f51ae207104d4802e410cfcd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 12 May 2025 17:36:26 -0700 Subject: [PATCH 4/4] src/utils/uniq.cpp: skipping any unmapped reads which have tid -1 from the get_tid function --- src/utils/uniq.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp index 41a64735..31c38be0 100644 --- a/src/utils/uniq.cpp +++ b/src/utils/uniq.cpp @@ -227,8 +227,14 @@ uniq(const bool add_dup_count, const uint32_t max_buffer_size, } bam_rec aln; - if (hts.read(hdr, aln)) { // valid SAM/BAM can have 0 reads + bool found_mapped_read{false}; // valid SAM/BAM can have 0 reads + while (!found_mapped_read && hts.read(hdr, aln)) { + // ADS: skip reads that have no tid -- they are not mapped + if (get_tid(aln) != -1) + found_mapped_read = true; + } + if (found_mapped_read) { rs_in.update(aln); // update stats for input we just got vector buffer(1, aln); // select output from this buffer @@ -238,6 +244,9 @@ uniq(const bool add_dup_count, const uint32_t max_buffer_size, int32_t cur_chrom = get_tid(aln); while (hts.read(hdr, aln)) { + // ADS: skip reads that have no tid -- they are not mapped + if (get_tid(aln) == -1) + continue; rs_in.update(aln); // below works because buffer reset at every new chrom