diff --git a/inc/rntuple/RAMNTupleRecord.h b/inc/rntuple/RAMNTupleRecord.h index 0f3d621..a6c2c5c 100644 --- a/inc/rntuple/RAMNTupleRecord.h +++ b/inc/rntuple/RAMNTupleRecord.h @@ -18,6 +18,12 @@ #include #include +namespace ROOT { +class RNTupleModel; +class RNTupleWriter; +class RNTupleReader; +} // namespace ROOT + class RAMNTupleRefs; class RAMNTupleIndex; @@ -39,6 +45,7 @@ class RAMNTupleRefs { ~RAMNTupleRefs() = default; int GetRefId(const std::string &rname); + int FindRefId(const std::string &rname) const; ///< Lookup-only; returns -1 if not found. const std::string &GetRefName(int rid) const; void Print() const; diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 7bac2bd..3f3bd7c 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -1,188 +1,88 @@ #include "ramcore/RAMNTupleView.h" -#include - -#include -#include -#include #include -#include -#include - +#include #include #include #include -#include #include #include - #include "rntuple/RAMNTupleRecord.h" +#include -namespace { - -constexpr int FLAG_UNMAPPED = 0x4; -constexpr int FLAG_SECONDARY = 0x100; -constexpr int FLAG_SUPPLEMENTARY = 0x800; -constexpr int FLAG_FILTER = FLAG_UNMAPPED | FLAG_SECONDARY | FLAG_SUPPLEMENTARY; - -// Computes how many reference bases an alignment covers using CIGAR. -// -// CIGAR (Compact Idiosyncratic Gapped Alignment Report) describes how a read -// aligns to the reference. It's a sequence of operations like "50M2D48M" meaning: -// 50 bases match, 2 bases deleted from reference, 48 bases match. -// -// BAM stores only alignment start position. To find the end position (needed -// for overlap queries), we compute: end = start + refSpan - 1 -// -// In BAM, each CIGAR op is packed into a uint32: (length << 4) | opcode -// Only some ops consume reference bases: -// M(0)=match/mismatch, D(2)=deletion, N(3)=skip, =(7)=seq match, X(8)=seq mismatch -// Ops like I(insertion), S(soft-clip), H(hard-clip) don't consume reference. -// -// Example: CIGAR "50M2D48M" -> refSpan = 50 + 2 + 48 = 100 -// (the read is 98bp but covers 100 reference bases due to 2bp deletion) -// -// See SAM spec section 1.4.6: https://samtools.github.io/hts-specs/SAMv1.pdf -int computeRefSpan(const std::vector &cigarOps) -{ - int span = 0; - for (uint32_t op : cigarOps) { - int code = op & 0xF; - if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) { - span += static_cast(op >> 4); - } - } - return span; -} - -int resolveRefId(const char *name) -{ - auto refs = RAMNTupleRecord::GetRnameRefs(); - if (!refs) - return -1; - - const auto &refVec = refs->GetRefs(); - for (size_t i = 0; i < refVec.size(); i++) { - if (refVec[i] == name) - return static_cast(i); - } - return -1; -} - -bool parseRegion(const std::string ®ion, TString &rname, Int_t &start, Int_t &end) -{ - // Default: entire chromosome - start = 0; - end = std::numeric_limits::max() - 1; - - // No colon means chromosome-only query e.g. "chr1" - const std::size_t colonPos = region.find(':'); - if (colonPos == std::string::npos) { - rname = region; - return true; - } - - // Split "chr1:1000-2000" into rname="chr1", rest="1000-2000" - rname = region.substr(0, colonPos); - const std::string rest = region.substr(colonPos + 1); - - const std::size_t dashPos = rest.find('-'); - if (dashPos == std::string::npos) { - // Single position e.g. "chr1:500" - if (rest.empty() || !std::all_of(rest.begin(), rest.end(), ::isdigit)) - return false; - start = end = std::stoi(rest) - 1; // SAM 1-based → 0-based - } else { - // Range e.g. "chr1:1000-2000" - const std::string startStr = rest.substr(0, dashPos); - const std::string endStr = rest.substr(dashPos + 1); - - // Reject empty or non-numeric strings before calling stoi - if (startStr.empty() || endStr.empty() || !std::all_of(startStr.begin(), startStr.end(), ::isdigit) || - !std::all_of(endStr.begin(), endStr.end(), ::isdigit)) - return false; - - start = std::stoi(startStr) - 1; // SAM 1-based → 0-based - end = std::stoi(endStr) - 1; - } - - // Clamp negative start (e.g. user passed position 0) - if (start < 0) - start = 0; - return true; -} - -} // namespace - -// NOLINTNEXTLINE(misc-use-internal-linkage) -Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool /*perfstats*/, - const char * /*perfstatsfilename*/) +Long64_t ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename) { TStopwatch stopwatch; stopwatch.Start(); auto reader = RAMNTupleRecord::OpenRAMFile(file); if (!reader) { - std::cerr << "ramntupleview: failed to open file " << file << std::endl; + std::cerr << "[ramntupleview] failed to open file " << file << "\n"; return 0; } - std::string region = query ? query : ""; - if (region.empty() || region == "*") { - stopwatch.Print(); - return reader->GetNEntries(); + std::string region = query; + int chrDelimiterPos = region.find(":"); + if (chrDelimiterPos == std::string::npos) { + std::cerr << "Invalid region format. Use rname:start-end\n"; + return 0; } - - TString rname; - Int_t rs, re; - if (!parseRegion(region, rname, rs, re)) { - std::cerr << "Invalid region format. Use rname[:start[-end]]\n"; + TString rname = region.substr(0, chrDelimiterPos); + int rangeDelimiterPos = region.find("-", chrDelimiterPos); + if (rangeDelimiterPos == std::string::npos) { + std::cerr << "Invalid region format. Use rname:start-end\n"; return 0; } + Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); + Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); - int refid = resolveRefId(rname.Data()); + auto refid = RAMNTupleRecord::GetRnameRefs()->FindRefId(rname.Data()); if (refid < 0) { - std::cerr << "Reference '" << rname.Data() << "' not found\n"; + std::cerr << "[ramntupleview] unknown reference sequence '" << rname.Data() << "'\n"; return 0; } - - auto flagView = reader->GetView("record.flag"); - auto refidView = reader->GetView("record.refid"); - auto posView = reader->GetView("record.pos"); - auto cigarView = reader->GetView>("record.cigar"); - auto index = RAMNTupleRecord::GetIndex(); - Long64_t start = (index && index->Size() > 0) ? index->GetRow(refid, rs) : 0; - if (start < 0) - start = 0; + + auto recordView = reader->GetView("record"); Long64_t count = 0; - const Long64_t total = reader->GetNEntries(); - for (Long64_t i = start; i < total; i++) { - if (flagView(i) & FLAG_FILTER) - continue; + if (!index || index->Size() == 0) { + + for (auto i : reader->GetEntryRange()) { + const auto &rec = recordView(i); + if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) { + count++; + } + } + } else { - int curRef = refidView(i); - if (curRef < refid) - continue; - if (curRef > refid) - break; + auto start_entry = index->GetRow(refid, range_start); + auto end_entry = index->GetRow(refid, range_end); + if (start_entry < 0) + start_entry = 0; + if (end_entry < 0) + end_entry = reader->GetNEntries(); + + for (; start_entry < end_entry; start_entry++) { + const auto &rec = recordView(start_entry); + int seqlen = rec.GetSEQLEN(); + if (rec.pos + seqlen > range_start - 1) { + break; + } + } - int pos = posView(i); - if (pos > re) - break; + Long64_t j; + for (j = start_entry; j < reader->GetNEntries(); j++) { + const auto &rec = recordView(j); - // Overlap: read_end >= rs (pos <= re guaranteed by break above) - if (pos >= rs) { + if (rec.pos >= range_end) { + break; + } count++; - } else { - int readEnd = pos + computeRefSpan(cigarView(i)) - 1; - if (readEnd >= rs) - count++; } } stopwatch.Print(); - std::cout << "Found " << count << " records in region " << region << std::endl; + return count; -} \ No newline at end of file +} diff --git a/src/rntuple/RAMNTupleRecord.cxx b/src/rntuple/RAMNTupleRecord.cxx index 7a4e477..7afc61d 100644 --- a/src/rntuple/RAMNTupleRecord.cxx +++ b/src/rntuple/RAMNTupleRecord.cxx @@ -69,6 +69,18 @@ int RAMNTupleRefs::GetRefId(const std::string &rname) return fLastId; } +int RAMNTupleRefs::FindRefId(const std::string &rname) const +{ + if (rname == "*") + return -1; + if (rname == fLastName) + return fLastId; + auto it = std::find(fRefVec.begin(), fRefVec.end(), rname); + if (it != fRefVec.end()) + return static_cast(std::distance(fRefVec.begin(), it)); + return -1; +} + const std::string &RAMNTupleRefs::GetRefName(int rid) const { static const std::string star = "*"; @@ -439,8 +451,7 @@ std::string DecodeSequence(const std::string &encoded_seq, size_t length) seq[i * 2 + 1] = kCodeToSeq[byte & 0xf]; } if (length % 2) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - seq[length - 1] = kCodeToSeq[static_cast(encoded_seq[length / 2]) >> 4]; + seq[length - 1] = kCodeToSeq[encoded_seq[length / 2] >> 4]; } return seq; diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 7dd8b24..20f03a1 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -462,4 +462,24 @@ TEST_F(ramcoreTest, QUALEncodingDecodingModes) std::remove(samFile); std::remove(ramFile); -} \ No newline at end of file +} +TEST_F(ramcoreTest, InvalidChromosomeDoesNotPolluteFRefVec) +{ + const char *samFile = "samexample.sam"; + const char *rntupleFile = "test_rntuple.root"; + + samtoramntuple(samFile, rntupleFile, true, true, true, 505, 0); + + size_t refsBefore = RAMNTupleRecord::GetRnameRefs()->Size(); + + testing::internal::CaptureStdout(); + Long64_t count = ramntupleview(rntupleFile, "chrINVALID:100-200", true, false, nullptr); + testing::internal::GetCapturedStdout(); + + size_t refsAfter = RAMNTupleRecord::GetRnameRefs()->Size(); + + EXPECT_EQ(refsBefore, refsAfter) + << "Invalid chromosome 'chrINVALID' was inserted into fRefVec (regression of issue #23)"; + EXPECT_EQ(count, 0) + << "Expected 0 records for unknown chromosome, got " << count; +}