Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions inc/rntuple/RAMNTupleRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
#include <memory>
#include <cstdint>

namespace ROOT {
class RNTupleModel;
class RNTupleWriter;
class RNTupleReader;
} // namespace ROOT

class RAMNTupleRefs;
class RAMNTupleIndex;

Expand All @@ -39,6 +45,7 @@ class RAMNTupleRefs {
~RAMNTupleRefs() = default;

int GetRefId(const std::string &rname);
int FindRefId(const std::string &rname) const; ///< Lookup-only; returns -1 if not found.
const std::string &GetRefName(int rid) const;

void Print() const;
Expand Down
200 changes: 50 additions & 150 deletions src/ramcore/RAMNTupleView.cxx
Original file line number Diff line number Diff line change
@@ -1,188 +1,88 @@
#include "ramcore/RAMNTupleView.h"
#include <algorithm>

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include <cstring>
#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleView.hxx>
#include <Rtypes.h>
#include <TStopwatch.h>
#include <TString.h>

#include "rntuple/RAMNTupleRecord.h"
#include <Rtypes.h>

namespace {

constexpr int FLAG_UNMAPPED = 0x4;
constexpr int FLAG_SECONDARY = 0x100;
constexpr int FLAG_SUPPLEMENTARY = 0x800;
constexpr int FLAG_FILTER = FLAG_UNMAPPED | FLAG_SECONDARY | FLAG_SUPPLEMENTARY;

// Computes how many reference bases an alignment covers using CIGAR.
//
// CIGAR (Compact Idiosyncratic Gapped Alignment Report) describes how a read
// aligns to the reference. It's a sequence of operations like "50M2D48M" meaning:
// 50 bases match, 2 bases deleted from reference, 48 bases match.
//
// BAM stores only alignment start position. To find the end position (needed
// for overlap queries), we compute: end = start + refSpan - 1
//
// In BAM, each CIGAR op is packed into a uint32: (length << 4) | opcode
// Only some ops consume reference bases:
// M(0)=match/mismatch, D(2)=deletion, N(3)=skip, =(7)=seq match, X(8)=seq mismatch
// Ops like I(insertion), S(soft-clip), H(hard-clip) don't consume reference.
//
// Example: CIGAR "50M2D48M" -> refSpan = 50 + 2 + 48 = 100
// (the read is 98bp but covers 100 reference bases due to 2bp deletion)
//
// See SAM spec section 1.4.6: https://samtools.github.io/hts-specs/SAMv1.pdf
int computeRefSpan(const std::vector<uint32_t> &cigarOps)
{
int span = 0;
for (uint32_t op : cigarOps) {
int code = op & 0xF;
if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) {
span += static_cast<int>(op >> 4);
}
}
return span;
}

int resolveRefId(const char *name)
{
auto refs = RAMNTupleRecord::GetRnameRefs();
if (!refs)
return -1;

const auto &refVec = refs->GetRefs();
for (size_t i = 0; i < refVec.size(); i++) {
if (refVec[i] == name)
return static_cast<int>(i);
}
return -1;
}

bool parseRegion(const std::string &region, TString &rname, Int_t &start, Int_t &end)
{
// Default: entire chromosome
start = 0;
end = std::numeric_limits<Int_t>::max() - 1;

// No colon means chromosome-only query e.g. "chr1"
const std::size_t colonPos = region.find(':');
if (colonPos == std::string::npos) {
rname = region;
return true;
}

// Split "chr1:1000-2000" into rname="chr1", rest="1000-2000"
rname = region.substr(0, colonPos);
const std::string rest = region.substr(colonPos + 1);

const std::size_t dashPos = rest.find('-');
if (dashPos == std::string::npos) {
// Single position e.g. "chr1:500"
if (rest.empty() || !std::all_of(rest.begin(), rest.end(), ::isdigit))
return false;
start = end = std::stoi(rest) - 1; // SAM 1-based → 0-based
} else {
// Range e.g. "chr1:1000-2000"
const std::string startStr = rest.substr(0, dashPos);
const std::string endStr = rest.substr(dashPos + 1);

// Reject empty or non-numeric strings before calling stoi
if (startStr.empty() || endStr.empty() || !std::all_of(startStr.begin(), startStr.end(), ::isdigit) ||
!std::all_of(endStr.begin(), endStr.end(), ::isdigit))
return false;

start = std::stoi(startStr) - 1; // SAM 1-based → 0-based
end = std::stoi(endStr) - 1;
}

// Clamp negative start (e.g. user passed position 0)
if (start < 0)
start = 0;
return true;
}

} // namespace

// NOLINTNEXTLINE(misc-use-internal-linkage)
Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool /*perfstats*/,
const char * /*perfstatsfilename*/)
Long64_t ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename)
{
TStopwatch stopwatch;
stopwatch.Start();

auto reader = RAMNTupleRecord::OpenRAMFile(file);
if (!reader) {
std::cerr << "ramntupleview: failed to open file " << file << std::endl;
std::cerr << "[ramntupleview] failed to open file " << file << "\n";
return 0;
}

std::string region = query ? query : "";
if (region.empty() || region == "*") {
stopwatch.Print();
return reader->GetNEntries();
std::string region = query;
int chrDelimiterPos = region.find(":");
if (chrDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";
return 0;
}

TString rname;
Int_t rs, re;
if (!parseRegion(region, rname, rs, re)) {
std::cerr << "Invalid region format. Use rname[:start[-end]]\n";
TString rname = region.substr(0, chrDelimiterPos);
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
if (rangeDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";
return 0;
}
Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1));

int refid = resolveRefId(rname.Data());
auto refid = RAMNTupleRecord::GetRnameRefs()->FindRefId(rname.Data());
if (refid < 0) {
std::cerr << "Reference '" << rname.Data() << "' not found\n";
std::cerr << "[ramntupleview] unknown reference sequence '" << rname.Data() << "'\n";
return 0;
}

auto flagView = reader->GetView<uint16_t>("record.flag");
auto refidView = reader->GetView<int32_t>("record.refid");
auto posView = reader->GetView<int32_t>("record.pos");
auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar");

auto index = RAMNTupleRecord::GetIndex();
Long64_t start = (index && index->Size() > 0) ? index->GetRow(refid, rs) : 0;
if (start < 0)
start = 0;

auto recordView = reader->GetView<RAMNTupleRecord>("record");

Long64_t count = 0;
const Long64_t total = reader->GetNEntries();

for (Long64_t i = start; i < total; i++) {
if (flagView(i) & FLAG_FILTER)
continue;
if (!index || index->Size() == 0) {

for (auto i : reader->GetEntryRange()) {
const auto &rec = recordView(i);
if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) {
count++;
}
}
} else {

int curRef = refidView(i);
if (curRef < refid)
continue;
if (curRef > refid)
break;
auto start_entry = index->GetRow(refid, range_start);
auto end_entry = index->GetRow(refid, range_end);
if (start_entry < 0)
start_entry = 0;
if (end_entry < 0)
end_entry = reader->GetNEntries();

for (; start_entry < end_entry; start_entry++) {
const auto &rec = recordView(start_entry);
int seqlen = rec.GetSEQLEN();
if (rec.pos + seqlen > range_start - 1) {
break;
}
}

int pos = posView(i);
if (pos > re)
break;
Long64_t j;
for (j = start_entry; j < reader->GetNEntries(); j++) {
const auto &rec = recordView(j);

// Overlap: read_end >= rs (pos <= re guaranteed by break above)
if (pos >= rs) {
if (rec.pos >= range_end) {
break;
}
count++;
} else {
int readEnd = pos + computeRefSpan(cigarView(i)) - 1;
if (readEnd >= rs)
count++;
}
}

stopwatch.Print();
std::cout << "Found " << count << " records in region " << region << std::endl;

return count;
}
}
15 changes: 13 additions & 2 deletions src/rntuple/RAMNTupleRecord.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ int RAMNTupleRefs::GetRefId(const std::string &rname)
return fLastId;
}

int RAMNTupleRefs::FindRefId(const std::string &rname) const
{
if (rname == "*")
return -1;
if (rname == fLastName)
return fLastId;
auto it = std::find(fRefVec.begin(), fRefVec.end(), rname);
if (it != fRefVec.end())
return static_cast<int>(std::distance(fRefVec.begin(), it));
return -1;
}

const std::string &RAMNTupleRefs::GetRefName(int rid) const
{
static const std::string star = "*";
Expand Down Expand Up @@ -439,8 +451,7 @@ std::string DecodeSequence(const std::string &encoded_seq, size_t length)
seq[i * 2 + 1] = kCodeToSeq[byte & 0xf];
}
if (length % 2) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
seq[length - 1] = kCodeToSeq[static_cast<uint8_t>(encoded_seq[length / 2]) >> 4];
seq[length - 1] = kCodeToSeq[encoded_seq[length / 2] >> 4];
}

return seq;
Expand Down
22 changes: 21 additions & 1 deletion test/ramcoretests.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -462,4 +462,24 @@ TEST_F(ramcoreTest, QUALEncodingDecodingModes)

std::remove(samFile);
std::remove(ramFile);
}
}
TEST_F(ramcoreTest, InvalidChromosomeDoesNotPolluteFRefVec)
{
const char *samFile = "samexample.sam";
const char *rntupleFile = "test_rntuple.root";

samtoramntuple(samFile, rntupleFile, true, true, true, 505, 0);

size_t refsBefore = RAMNTupleRecord::GetRnameRefs()->Size();

testing::internal::CaptureStdout();
Long64_t count = ramntupleview(rntupleFile, "chrINVALID:100-200", true, false, nullptr);
testing::internal::GetCapturedStdout();

size_t refsAfter = RAMNTupleRecord::GetRnameRefs()->Size();

EXPECT_EQ(refsBefore, refsAfter)
<< "Invalid chromosome 'chrINVALID' was inserted into fRefVec (regression of issue #23)";
EXPECT_EQ(count, 0)
<< "Expected 0 records for unknown chromosome, got " << count;
}