Skip to content
Closed
1 change: 1 addition & 0 deletions inc/rntuple/RAMNTupleRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class RAMNTupleRefs {
~RAMNTupleRefs() = default;

int GetRefId(const std::string &rname);
int FindRefId(const std::string &rname) const; ///< Lookup-only; returns -1 if not found.
const std::string &GetRefName(int rid) const;

void Print() const;
Expand Down
198 changes: 49 additions & 149 deletions src/ramcore/RAMNTupleView.cxx
Original file line number Diff line number Diff line change
@@ -1,121 +1,15 @@
#include "ramcore/RAMNTupleView.h"
Comment thread
ZaGrayWolf marked this conversation as resolved.
#include <algorithm>

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include <cstring>
#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleView.hxx>
#include <Rtypes.h>
#include <TStopwatch.h>
#include <TString.h>

#include "rntuple/RAMNTupleRecord.h"
#include <Rtypes.h>

namespace {

constexpr int FLAG_UNMAPPED = 0x4;
constexpr int FLAG_SECONDARY = 0x100;
constexpr int FLAG_SUPPLEMENTARY = 0x800;
constexpr int FLAG_FILTER = FLAG_UNMAPPED | FLAG_SECONDARY | FLAG_SUPPLEMENTARY;

// Computes how many reference bases an alignment covers using CIGAR.
//
// CIGAR (Compact Idiosyncratic Gapped Alignment Report) describes how a read
// aligns to the reference. It's a sequence of operations like "50M2D48M" meaning:
// 50 bases match, 2 bases deleted from reference, 48 bases match.
//
// BAM stores only alignment start position. To find the end position (needed
// for overlap queries), we compute: end = start + refSpan - 1
//
// In BAM, each CIGAR op is packed into a uint32: (length << 4) | opcode
// Only some ops consume reference bases:
// M(0)=match/mismatch, D(2)=deletion, N(3)=skip, =(7)=seq match, X(8)=seq mismatch
// Ops like I(insertion), S(soft-clip), H(hard-clip) don't consume reference.
//
// Example: CIGAR "50M2D48M" -> refSpan = 50 + 2 + 48 = 100
// (the read is 98bp but covers 100 reference bases due to 2bp deletion)
//
// See SAM spec section 1.4.6: https://samtools.github.io/hts-specs/SAMv1.pdf
int computeRefSpan(const std::vector<uint32_t> &cigarOps)
{
int span = 0;
for (uint32_t op : cigarOps) {
int code = op & 0xF;
if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) {
span += static_cast<int>(op >> 4);
}
}
return span;
}

int resolveRefId(const char *name)
{
auto refs = RAMNTupleRecord::GetRnameRefs();
if (!refs)
return -1;

const auto &refVec = refs->GetRefs();
for (size_t i = 0; i < refVec.size(); i++) {
if (refVec[i] == name)
return static_cast<int>(i);
}
return -1;
}

bool parseRegion(const std::string &region, TString &rname, Int_t &start, Int_t &end)
{
// Default: entire chromosome
start = 0;
end = std::numeric_limits<Int_t>::max() - 1;

// No colon means chromosome-only query e.g. "chr1"
const std::size_t colonPos = region.find(':');
if (colonPos == std::string::npos) {
rname = region;
return true;
}

// Split "chr1:1000-2000" into rname="chr1", rest="1000-2000"
rname = region.substr(0, colonPos);
const std::string rest = region.substr(colonPos + 1);

const std::size_t dashPos = rest.find('-');
if (dashPos == std::string::npos) {
// Single position e.g. "chr1:500"
if (rest.empty() || !std::all_of(rest.begin(), rest.end(), ::isdigit))
return false;
start = end = std::stoi(rest) - 1; // SAM 1-based → 0-based
} else {
// Range e.g. "chr1:1000-2000"
const std::string startStr = rest.substr(0, dashPos);
const std::string endStr = rest.substr(dashPos + 1);

// Reject empty or non-numeric strings before calling stoi
if (startStr.empty() || endStr.empty() || !std::all_of(startStr.begin(), startStr.end(), ::isdigit) ||
!std::all_of(endStr.begin(), endStr.end(), ::isdigit))
return false;

start = std::stoi(startStr) - 1; // SAM 1-based → 0-based
end = std::stoi(endStr) - 1;
}

// Clamp negative start (e.g. user passed position 0)
if (start < 0)
start = 0;
return true;
}

} // namespace

// NOLINTNEXTLINE(misc-use-internal-linkage)
Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool /*perfstats*/,
const char * /*perfstatsfilename*/)
Long64_t ramntupleviewstatic static (const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename)
{
TStopwatch stopwatch;
stopwatch.Start();
Expand All @@ -126,63 +20,69 @@ Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool
return 0;
}

std::string region = query ? query : "";
if (region.empty() || region == "*") {
stopwatch.Print();
return reader->GetNEntries();
std::string region = query;
Comment thread
ZaGrayWolf marked this conversation as resolved.
int chrDelimiterPos = region.find(":");
if (chrDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";
return 0;
}

TString rname;
Int_t rs, re;
if (!parseRegion(region, rname, rs, re)) {
std::cerr << "Invalid region format. Use rname[:start[-end]]\n";
TString rname = region.substr(0, chrDelimiterPos);
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
if (rangeDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";
return 0;
}
Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
Comment thread
ZaGrayWolf marked this conversation as resolved.
Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1));

int refid = resolveRefId(rname.Data());
auto refid = RAMNTupleRecord::GetRnameRefs()->FindRefId(rname.Data());
if (refid < 0) {
std::cerr << "Reference '" << rname.Data() << "' not found\n";
std::cerr << "ramntupleview: unknown reference sequence '" << rname.Data() << "'" << std::endl;
return 0;
}

auto flagView = reader->GetView<uint16_t>("record.flag");
auto refidView = reader->GetView<int32_t>("record.refid");
auto posView = reader->GetView<int32_t>("record.pos");
auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar");

auto index = RAMNTupleRecord::GetIndex();
Long64_t start = (index && index->Size() > 0) ? index->GetRow(refid, rs) : 0;
if (start < 0)
start = 0;

auto recordView = reader->GetView<RAMNTupleRecord>("record");

Long64_t count = 0;
const Long64_t total = reader->GetNEntries();

for (Long64_t i = start; i < total; i++) {
if (flagView(i) & FLAG_FILTER)
continue;
if (!index || index->Size() == 0) {

for (auto i : reader->GetEntryRange()) {
const auto &rec = recordView(i);
if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) {
count++;
}
}
} else {

int curRef = refidView(i);
if (curRef < refid)
continue;
if (curRef > refid)
break;
auto start_entry = index->GetRow(refid, range_start);
auto end_entry = index->GetRow(refid, range_end);
if (start_entry < 0)
start_entry = 0;
if (end_entry < 0)
end_entry = reader->GetNEntries();

for (; start_entry < end_entry; start_entry++) {
const auto &rec = recordView(start_entry);
int seqlen = rec.GetSEQLEN();
if (rec.pos + seqlen > range_start - 1) {
break;
}
}

int pos = posView(i);
if (pos > re)
break;
Long64_t j;
for (j = start_entry; j < reader->GetNEntries(); j++) {
const auto &rec = recordView(j);

// Overlap: read_end >= rs (pos <= re guaranteed by break above)
if (pos >= rs) {
if (rec.pos >= range_end) {
break;
}
count++;
} else {
int readEnd = pos + computeRefSpan(cigarView(i)) - 1;
if (readEnd >= rs)
count++;
}
}

stopwatch.Print();
std::cout << "Found " << count << " records in region " << region << std::endl;

return count;
}
}
2 changes: 1 addition & 1 deletion src/ramcore/SamToNTuple.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output
writeOptions.SetUseBufferedWrite(true);

auto parallel_writer =
ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions);
ROOT::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions);

const int contexts_per_file = std::min(4, num_threads);
const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file;
Expand Down
13 changes: 12 additions & 1 deletion src/rntuple/RAMNTupleRecord.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ int RAMNTupleRefs::GetRefId(const std::string &rname)
return fLastId;
}

int RAMNTupleRefs::FindRefId(const std::string &rname) const
{
if (rname == "*")
return -1;
if (rname == fLastName)
return fLastId;
auto it = std::find(fRefVec.begin(), fRefVec.end(), rname);
if (it != fRefVec.end())
return static_cast<int>(std::distance(fRefVec.begin(), it));
return -1;
}

const std::string &RAMNTupleRefs::GetRefName(int rid) const
{
static const std::string star = "*";
Expand Down Expand Up @@ -747,4 +759,3 @@ void RAMNTupleConverter::ViewRegion(const std::string &ram_file, const std::stri
std::cout << "Found " << count << " reads in region " << region << std::endl;
}
}

2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ function(add_ramcore_test test_name)

if(RAMTOOLS_BUILD_BENCHMARKS)

target_link_libraries(${test_name} benchmark::benchmark)
target_link_libraries(${test_name} PRIVATE benchmark::benchmark)
endif()

target_include_directories(${test_name}
Expand Down
Loading