From f9537021ba3360c462cdea554ef0ce61b97917cf Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Sat, 7 Mar 2026 16:24:07 +0200 Subject: [PATCH 01/11] The beginning of the RDataFrame query tool for ramtools. Currently only prints file data. --- CMakeLists.txt | 4 +++- inc/ramcore/RDF_RAMNTupleView.h | 9 +++++++++ src/ramcore/RDF_RAMNTupleView.cxx | 13 +++++++++++++ tools/CMakeLists.txt | 10 ++++++++-- tools/rdf_ramntupleview.cxx | 18 ++++++++++++++++++ 5 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 inc/ramcore/RDF_RAMNTupleView.h create mode 100644 src/ramcore/RDF_RAMNTupleView.cxx create mode 100644 tools/rdf_ramntupleview.cxx diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bc3b44..1eac1d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore inc/ramcore/SamToTTree.h inc/ramcore/SamToNTuple.h inc/ramcore/RAMNTupleView.h + inc/ramcore/RDF_RAMNTupleView.h SOURCES src/ttree/RAMRecord.cxx src/rntuple/RAMNTupleRecord.cxx @@ -50,6 +51,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore src/ramcore/SamToTTree.cxx src/ramcore/SamToNTuple.cxx src/ramcore/RAMNTupleView.cxx + src/ramcore/RDF_RAMNTupleView.cxx LINKDEF inc/ttree/LinkDef.h DEPENDENCIES @@ -83,7 +85,7 @@ endif() if(RAMTOOLS_BUILD_TESTS) enable_testing() add_subdirectory(test) - + if(ENABLE_COVERAGE) add_custom_target(coverage COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure diff --git a/inc/ramcore/RDF_RAMNTupleView.h b/inc/ramcore/RDF_RAMNTupleView.h new file mode 100644 index 0000000..295411b --- /dev/null +++ b/inc/ramcore/RDF_RAMNTupleView.h @@ -0,0 +1,9 @@ +#ifndef RAMCORE_RDF_RAMNTUPLEVIEW_H +#define RAMCORE_RDF_RAMNTUPLEVIEW_H +#include + +ULong64_t rdf_ramntupleview(const char *file, const char *query = "", bool cache = true, bool perfstats = false, + const char *perfstatsfilename = "perf.root"); + +#endif //RAMCORE_RDF_RAMNTUPLEVIEW_H + diff --git a/src/ramcore/RDF_RAMNTupleView.cxx b/src/ramcore/RDF_RAMNTupleView.cxx new file mode 
100644 index 0000000..0a91816 --- /dev/null +++ b/src/ramcore/RDF_RAMNTupleView.cxx @@ -0,0 +1,13 @@ +#include +#include + +ULong64_t rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename){ + + TStopwatch ts; + ts.Start(); + auto reader = ROOT::RDF::FromRNTuple("RAM", file); + auto description = reader.Describe(); + description.Print(); + return 0; + +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b67568f..fdcc79b 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -21,8 +21,14 @@ ROOT_EXECUTABLE(ramntupleview ROOT::Core ROOT::ROOTNTuple ) - -install(TARGETS samtoram samtoramntuple ramntupleview +ROOT_EXECUTABLE(rdf_ramntupleview + rdf_ramntupleview.cxx + LIBRARIES + ramcore + ROOT::Core + ROOT::ROOTNTuple +) +install(TARGETS samtoram samtoramntuple ramntupleview rdf_ramntupleview RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) diff --git a/tools/rdf_ramntupleview.cxx b/tools/rdf_ramntupleview.cxx new file mode 100644 index 0000000..0b5da01 --- /dev/null +++ b/tools/rdf_ramntupleview.cxx @@ -0,0 +1,18 @@ + + +#include +#include +int main(int argc, char **argv){ + +if (argc < 2){ + std::cerr << "Usage: " << argv[0] << " [rname:start-end]\n"; + std::cerr << "Example: " << argv[0] << " output.root chr1:1000-2000\n"; + return 1; +} + +const char* file = argv[1]; + +const char* query = argv[2]; +ULong64_t reads = rdf_ramntupleview(file, query); + +} From d18ecdde8a1f8b7e7063e8081f79a9579207fe20 Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Fri, 13 Mar 2026 23:21:28 +0200 Subject: [PATCH 02/11] Fix linking errors and remove unnecessary file --- .gitignore | 1 + CMakeLists.txt | 3 ++- inc/ramcore/RDF_RAMNTupleView.h | 5 ++--- src/ramcore/RDF_RAMNTupleView.cxx | 18 ++++++++++-------- tools/CMakeLists.txt | 9 +-------- tools/ramntupleview.cxx | 2 ++ tools/rdf_ramntupleview.cxx | 18 ------------------ 7 files changed, 18 insertions(+), 38 deletions(-) delete mode 100644 
tools/rdf_ramntupleview.cxx diff --git a/.gitignore b/.gitignore index 4502af8..ad7beb7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ ramexample.root *.root.idx .ipynb_checkpoints/ tmp/ +.cache *.so *.d *.pcm diff --git a/CMakeLists.txt b/CMakeLists.txt index 1eac1d3..4e960ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ if(ENABLE_COVERAGE) endif() set(ROOT_MIN_VERSION 6.26) -find_package(ROOT ${ROOT_MIN_VERSION} REQUIRED COMPONENTS Core RIO Tree ROOTNTuple ROOTNTupleUtil) +find_package(ROOT ${ROOT_MIN_VERSION} REQUIRED COMPONENTS Core RIO Tree ROOTNTuple ROOTNTupleUtil ROOTDataFrame) include(${ROOT_USE_FILE}) @@ -60,6 +60,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore ROOT::Tree ROOT::ROOTNTuple ROOT::ROOTNTupleUtil + ROOT::ROOTDataFrame INSTALL_OPTIONS DESTINATION ${CMAKE_INSTALL_LIBDIR} ) diff --git a/inc/ramcore/RDF_RAMNTupleView.h b/inc/ramcore/RDF_RAMNTupleView.h index 295411b..f4b9414 100644 --- a/inc/ramcore/RDF_RAMNTupleView.h +++ b/inc/ramcore/RDF_RAMNTupleView.h @@ -3,7 +3,6 @@ #include ULong64_t rdf_ramntupleview(const char *file, const char *query = "", bool cache = true, bool perfstats = false, - const char *perfstatsfilename = "perf.root"); - -#endif //RAMCORE_RDF_RAMNTUPLEVIEW_H + const char *perfstatsfilename = "perf.root"); +#endif // RAMCORE_RDF_RAMNTUPLEVIEW_H diff --git a/src/ramcore/RDF_RAMNTupleView.cxx b/src/ramcore/RDF_RAMNTupleView.cxx index 0a91816..c8cea65 100644 --- a/src/ramcore/RDF_RAMNTupleView.cxx +++ b/src/ramcore/RDF_RAMNTupleView.cxx @@ -1,13 +1,15 @@ #include #include -ULong64_t rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename){ - - TStopwatch ts; - ts.Start(); - auto reader = ROOT::RDF::FromRNTuple("RAM", file); - auto description = reader.Describe(); - description.Print(); - return 0; +ULong64_t +rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename) +{ + TStopwatch ts; + 
ts.Start(); + auto reader = ROOT::RDF::FromRNTuple("RAM", file); + auto description = reader.Describe(); + description.Print(); + ts.Print(); + return 0; } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index fdcc79b..7c778df 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -21,14 +21,7 @@ ROOT_EXECUTABLE(ramntupleview ROOT::Core ROOT::ROOTNTuple ) -ROOT_EXECUTABLE(rdf_ramntupleview - rdf_ramntupleview.cxx - LIBRARIES - ramcore - ROOT::Core - ROOT::ROOTNTuple -) -install(TARGETS samtoram samtoramntuple ramntupleview rdf_ramntupleview +install(TARGETS samtoram samtoramntuple ramntupleview RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) diff --git a/tools/ramntupleview.cxx b/tools/ramntupleview.cxx index feee374..b2c348c 100644 --- a/tools/ramntupleview.cxx +++ b/tools/ramntupleview.cxx @@ -1,4 +1,5 @@ #include "ramcore/RAMNTupleView.h" +#include "ramcore/RDF_RAMNTupleView.h" #include #include #include @@ -14,6 +15,7 @@ int main(int argc, char *argv[]) const char *file = argv[1]; const char *region_str = (argc > 2) ? 
argv[2] : ""; + ULong64_t s = rdf_ramntupleview(file); Long64_t read_count = ramntupleview(file, region_str); printf("Found %lld records in region %s\n", read_count, region_str); diff --git a/tools/rdf_ramntupleview.cxx b/tools/rdf_ramntupleview.cxx deleted file mode 100644 index 0b5da01..0000000 --- a/tools/rdf_ramntupleview.cxx +++ /dev/null @@ -1,18 +0,0 @@ - - -#include -#include -int main(int argc, char **argv){ - -if (argc < 2){ - std::cerr << "Usage: " << argv[0] << " [rname:start-end]\n"; - std::cerr << "Example: " << argv[0] << " output.root chr1:1000-2000\n"; - return 1; -} - -const char* file = argv[1]; - -const char* query = argv[2]; -ULong64_t reads = rdf_ramntupleview(file, query); - -} From a6c7d28d7f5b912330dec61f7dad41237ed42fce Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Tue, 17 Mar 2026 20:19:59 +0200 Subject: [PATCH 03/11] Added functionality for querying without indexes --- src/ramcore/RDF_RAMNTupleView.cxx | 72 ++++++++++++++++++++++++++++--- tools/ramntupleview.cxx | 13 +++--- 2 files changed, 74 insertions(+), 11 deletions(-) diff --git a/src/ramcore/RDF_RAMNTupleView.cxx b/src/ramcore/RDF_RAMNTupleView.cxx index c8cea65..577cf11 100644 --- a/src/ramcore/RDF_RAMNTupleView.cxx +++ b/src/ramcore/RDF_RAMNTupleView.cxx @@ -1,15 +1,77 @@ #include #include +using namespace ROOT::RDF; +namespace rdf{ + +static int GetRefId(ROOT::RDataFrame& rname_refs, const std::string& rname){ + if (rname == "*"){ + return -1; + } + int result = -1; + auto find_index = [rname, &result](ROOT::VecOps::RVec refs){ + for(int i = 0; i < refs.size(); i++){ + if(refs[i] == rname){ + result = i; + } + } + }; + rname_refs.Foreach(find_index, {"rname_refs"}); + return result; +} + +static void Display(ROOT::RDataFrame ram, ROOT::RDataFrame index, ROOT::RDataFrame metadata){ + + auto inde = index.Describe(); + inde.Print(); + auto description = ram.Describe(); + description.Print(); + auto des = metadata.Describe(); + des.Print(); + auto display = 
ram.Display({"record.qname", "record.refid", "record.refnext", "record.seq", "record.pos", "record.cigar"}); + display->Print(); + auto dis = metadata.Display({"rname_refs", "rnext_refs"}); + dis->Print(); + auto d = index.Display({"index_entries.entry", "index_entries.pos", "index_entries.refid"}); + d->Print(); +} + + +} // namespace rdf + ULong64_t rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename) { - TStopwatch ts; ts.Start(); - auto reader = ROOT::RDF::FromRNTuple("RAM", file); - auto description = reader.Describe(); - description.Print(); + std::string region = query; + int chrDelimiterPos = region.find(":"); + if (chrDelimiterPos == std::string::npos) { + std::cerr << "Invalid region format. Use rname:start-end\n"; + return 0; + } + const TString rname = region.substr(0, chrDelimiterPos); + int rangeDelimiterPos = region.find("-", chrDelimiterPos); + if (rangeDelimiterPos == std::string::npos) { + std::cerr << "Invalid region format. 
Use rname:start-end\n"; + return 0; + } + const Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); + const Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); + + + // auto index = FromRNTuple("INDEX", file); + auto metadata = FromRNTuple("METADATA", file); + const auto ref = rdf::GetRefId(metadata, rname.Data()); + ROOT::EnableImplicitMT(); + auto ram = FromRNTuple("RAM", file); + + auto check = [&range_start, &range_end, &ref](int32_t refid, int32_t pos){ + return refid == ref && pos >= range_start - 1 && pos <= range_end - 1; + }; + auto filtered = ram.Filter(check, {"record.refid", "record.pos"}); + auto num = filtered.Count(); + *num; ts.Print(); - return 0; + return *num; } diff --git a/tools/ramntupleview.cxx b/tools/ramntupleview.cxx index b2c348c..48bb797 100644 --- a/tools/ramntupleview.cxx +++ b/tools/ramntupleview.cxx @@ -3,7 +3,7 @@ #include #include #include - +//#define RDF_IMPL int main(int argc, char *argv[]) { if (argc < 2) { @@ -14,11 +14,12 @@ int main(int argc, char *argv[]) const char *file = argv[1]; const char *region_str = (argc > 2) ? 
argv[2] : ""; - - ULong64_t s = rdf_ramntupleview(file); - Long64_t read_count = ramntupleview(file, region_str); - - printf("Found %lld records in region %s\n", read_count, region_str); +//#ifdef RDF_IMPL + ULong64_t s = rdf_ramntupleview(file, region_str); +//#else + Long64_t read_count = ramntupleview(file, region_str); +//#endif + printf("Found %lld records in region %s [single thread]\n Found %lld records in region %s [multi-thread]", read_count, region_str, s, region_str); return 0; } From aa5df5b15c8d9cd948fc09a4eb7998b42fdb296f Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Wed, 25 Mar 2026 19:52:51 +0200 Subject: [PATCH 04/11] Add basic functionality for querying --- inc/ramcore/RDF_RAMNTupleView.h | 4 +- src/ramcore/RAMNTupleView.cxx | 2 - src/ramcore/RDF_RAMNTupleView.cxx | 62 ++++++++++--------------------- tools/ramntupleview.cxx | 23 ++++++++---- 4 files changed, 38 insertions(+), 53 deletions(-) diff --git a/inc/ramcore/RDF_RAMNTupleView.h b/inc/ramcore/RDF_RAMNTupleView.h index f4b9414..878efaa 100644 --- a/inc/ramcore/RDF_RAMNTupleView.h +++ b/inc/ramcore/RDF_RAMNTupleView.h @@ -2,7 +2,7 @@ #define RAMCORE_RDF_RAMNTUPLEVIEW_H #include -ULong64_t rdf_ramntupleview(const char *file, const char *query = "", bool cache = true, bool perfstats = false, - const char *perfstatsfilename = "perf.root"); +Long64_t rdf_ramntupleview(const int num_threads, const char *file, const char *query = "", bool cache = true, + bool perfstats = false, const char *perfstatsfilename = "perf.root"); #endif // RAMCORE_RDF_RAMNTUPLEVIEW_H diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index dcb18f0..560cdf4 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -39,9 +39,7 @@ Long64_t ramntupleview(const char *file, const char *query, bool cache, bool per auto index = RAMNTupleRecord::GetIndex(); auto recordView = reader->GetView("record"); - Long64_t count = 0; - if (!index || index->Size() == 0) { for (auto i : 
reader->GetEntryRange()) { diff --git a/src/ramcore/RDF_RAMNTupleView.cxx b/src/ramcore/RDF_RAMNTupleView.cxx index 577cf11..5736ef0 100644 --- a/src/ramcore/RDF_RAMNTupleView.cxx +++ b/src/ramcore/RDF_RAMNTupleView.cxx @@ -1,46 +1,23 @@ #include #include using namespace ROOT::RDF; -namespace rdf{ +namespace rdf { -static int GetRefId(ROOT::RDataFrame& rname_refs, const std::string& rname){ - if (rname == "*"){ - return -1; - } - int result = -1; - auto find_index = [rname, &result](ROOT::VecOps::RVec refs){ - for(int i = 0; i < refs.size(); i++){ - if(refs[i] == rname){ - result = i; - } - } - }; - rname_refs.Foreach(find_index, {"rname_refs"}); - return result; -} - -static void Display(ROOT::RDataFrame ram, ROOT::RDataFrame index, ROOT::RDataFrame metadata){ - - auto inde = index.Describe(); - inde.Print(); - auto description = ram.Describe(); - description.Print(); - auto des = metadata.Describe(); - des.Print(); - auto display = ram.Display({"record.qname", "record.refid", "record.refnext", "record.seq", "record.pos", "record.cigar"}); - display->Print(); - auto dis = metadata.Display({"rname_refs", "rnext_refs"}); - dis->Print(); - auto d = index.Display({"index_entries.entry", "index_entries.pos", "index_entries.refid"}); - d->Print(); -} +static int GetRefId(ROOT::RDataFrame &df, const std::string &rname) +{ + if (rname == "*") + return -1; + auto refs = df.Take>("rname_refs"); + const auto &vec = refs.GetValue()[0]; + auto it = std::find(vec.begin(), vec.end(), rname); + return (it == vec.end()) ? 
-1 : std::distance(vec.begin(), it); +} } // namespace rdf - -ULong64_t -rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstats, const char *perfstatsfilename) +Long64_t rdf_ramntupleview(const int num_threads, const char *file, const char *query, bool cache, bool perfstats, + const char *perfstatsfilename) { TStopwatch ts; ts.Start(); @@ -59,19 +36,20 @@ rdf_ramntupleview(const char *file, const char *query, bool cache, bool perfstat const Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); const Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); - - // auto index = FromRNTuple("INDEX", file); + // auto index = FromRNTuple("INDEX", file); auto metadata = FromRNTuple("METADATA", file); const auto ref = rdf::GetRefId(metadata, rname.Data()); - ROOT::EnableImplicitMT(); + if (ref == -1) { + std::cerr << "Chromosome not found:" << rname << "\n"; + } + ROOT::EnableImplicitMT(num_threads); auto ram = FromRNTuple("RAM", file); - - auto check = [&range_start, &range_end, &ref](int32_t refid, int32_t pos){ - return refid == ref && pos >= range_start - 1 && pos <= range_end - 1; + auto check = [ref, range_start, range_end](int32_t refid, int32_t pos) { + return (refid == ref) && (pos >= range_start - 1) && (pos <= range_end - 1); }; auto filtered = ram.Filter(check, {"record.refid", "record.pos"}); auto num = filtered.Count(); *num; ts.Print(); - return *num; + return *num; } diff --git a/tools/ramntupleview.cxx b/tools/ramntupleview.cxx index 48bb797..2aa3a8d 100644 --- a/tools/ramntupleview.cxx +++ b/tools/ramntupleview.cxx @@ -3,7 +3,6 @@ #include #include #include -//#define RDF_IMPL int main(int argc, char *argv[]) { if (argc < 2) { @@ -11,15 +10,25 @@ int main(int argc, char *argv[]) std::cerr << "Example: " << argv[0] << " output.root chr1:1000-2000\n"; return 1; } + bool mt_set = false; + int num_threads = 0; + if (argc > 3) { + std::string mt_flag = argv[3]; + if 
(mt_flag.find("-m") != std::string::npos) { + mt_set = true; + num_threads = std::atoi(mt_flag.substr(2).data()); + } + } const char *file = argv[1]; const char *region_str = (argc > 2) ? argv[2] : ""; -//#ifdef RDF_IMPL - ULong64_t s = rdf_ramntupleview(file, region_str); -//#else - Long64_t read_count = ramntupleview(file, region_str); -//#endif - printf("Found %lld records in region %s [single thread]\n Found %lld records in region %s [multi-thread]", read_count, region_str, s, region_str); + Long64_t read_count; + if (mt_set) { + read_count = rdf_ramntupleview(num_threads, file, region_str); + } else { + read_count = ramntupleview(file, region_str); + } + printf("Found %lld records in region %s", read_count, region_str); return 0; } From 10d62d87781498e615cbf590615e64dede9c249d Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Tue, 14 Apr 2026 18:17:39 +0300 Subject: [PATCH 05/11] Move changes into existing file --- CMakeLists.txt | 2 -- inc/ramcore/RAMNTupleView.h | 3 +- inc/ramcore/RDF_RAMNTupleView.h | 8 ----- src/ramcore/RAMNTupleView.cxx | 47 +++++++++++++++++++++++++- src/ramcore/RDF_RAMNTupleView.cxx | 55 ------------------------------- tools/ramntupleview.cxx | 5 ++- 6 files changed, 50 insertions(+), 70 deletions(-) delete mode 100644 inc/ramcore/RDF_RAMNTupleView.h delete mode 100644 src/ramcore/RDF_RAMNTupleView.cxx diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e960ef..5a5f3ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,6 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore inc/ramcore/SamToTTree.h inc/ramcore/SamToNTuple.h inc/ramcore/RAMNTupleView.h - inc/ramcore/RDF_RAMNTupleView.h SOURCES src/ttree/RAMRecord.cxx src/rntuple/RAMNTupleRecord.cxx @@ -51,7 +50,6 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore src/ramcore/SamToTTree.cxx src/ramcore/SamToNTuple.cxx src/ramcore/RAMNTupleView.cxx - src/ramcore/RDF_RAMNTupleView.cxx LINKDEF inc/ttree/LinkDef.h DEPENDENCIES diff --git a/inc/ramcore/RAMNTupleView.h b/inc/ramcore/RAMNTupleView.h 
index c7491bf..5c73cd3 100644 --- a/inc/ramcore/RAMNTupleView.h +++ b/inc/ramcore/RAMNTupleView.h @@ -4,5 +4,6 @@ Long64_t ramntupleview(const char *file, const char *query = "", bool cache = true, bool perfstats = false, const char *perfstatsfilename = "perf.root"); - +ULong64_t mt_ramntupleview(int numthreads, const char *file, const char *query = "", bool cache = true, + bool perfstats = false, const char *perfstatsfilename = "perf.root"); #endif // RAMCORE_RAMNTUPLEVIEW_H diff --git a/inc/ramcore/RDF_RAMNTupleView.h b/inc/ramcore/RDF_RAMNTupleView.h deleted file mode 100644 index 878efaa..0000000 --- a/inc/ramcore/RDF_RAMNTupleView.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef RAMCORE_RDF_RAMNTUPLEVIEW_H -#define RAMCORE_RDF_RAMNTUPLEVIEW_H -#include - -Long64_t rdf_ramntupleview(const int num_threads, const char *file, const char *query = "", bool cache = true, - bool perfstats = false, const char *perfstatsfilename = "perf.root"); - -#endif // RAMCORE_RDF_RAMNTUPLEVIEW_H diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 7bac2bd..6c7751f 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -1,4 +1,6 @@ #include "ramcore/RAMNTupleView.h" +#include +#include #include #include @@ -12,6 +14,7 @@ #include #include #include +#include #include #include @@ -53,6 +56,17 @@ int computeRefSpan(const std::vector &cigarOps) } return span; } +int GetRefId(ROOT::RDataFrame &df, const std::string &rname) +{ + if (rname == "*") + return -1; + + auto refs = df.Take>("rname_refs"); + const auto &refids = refs.GetValue()[0]; + + auto it = std::find(refids.begin(), refids.end(), rname); + return (it == refids.end()) ? 
-1 : std::distance(refids.begin(), it); +} int resolveRefId(const char *name) { @@ -185,4 +199,35 @@ Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool stopwatch.Print(); std::cout << "Found " << count << " records in region " << region << std::endl; return count; -} \ No newline at end of file +} +// NOLINTNEXTLINE(misc-use-internal-linkage) +ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *query, bool /*cache*/, + bool /*perfstats*/, const char * /*perfstatsfilename*/) +{ + TStopwatch st; + st.Start(); + TString rname; + std::string region = query; + Int_t start = 0; + Int_t end = 0; + if (!parseRegion(region, rname, start, end)) { + std::cerr << "Invalid region format. Use rname[:start[-end]]\n"; + return 0; + } + auto metadata = ROOT::RDF::FromRNTuple("METADATA", file); + const int refid = GetRefId(metadata, rname.Data()); + if (refid < 0) { + std::cerr << "Reference" << rname.Data() << " not found\n"; + } + ROOT::EnableImplicitMT(numthreads); + auto ram = ROOT::RDF::FromRNTuple("RAM", file); + auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos) { + return (refid == refidentry) && (pos >= start) && (pos <= end); + }; + + auto filtered = ram.Filter(filterfunc, {"record.refid", "record.pos"}); + auto count = filtered.Count(); + *count; + st.Print(); + return *count; +} diff --git a/src/ramcore/RDF_RAMNTupleView.cxx b/src/ramcore/RDF_RAMNTupleView.cxx deleted file mode 100644 index 5736ef0..0000000 --- a/src/ramcore/RDF_RAMNTupleView.cxx +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -using namespace ROOT::RDF; -namespace rdf { - -static int GetRefId(ROOT::RDataFrame &df, const std::string &rname) -{ - if (rname == "*") - return -1; - - auto refs = df.Take>("rname_refs"); - const auto &vec = refs.GetValue()[0]; - - auto it = std::find(vec.begin(), vec.end(), rname); - return (it == vec.end()) ? 
-1 : std::distance(vec.begin(), it); -} -} // namespace rdf - -Long64_t rdf_ramntupleview(const int num_threads, const char *file, const char *query, bool cache, bool perfstats, - const char *perfstatsfilename) -{ - TStopwatch ts; - ts.Start(); - std::string region = query; - int chrDelimiterPos = region.find(":"); - if (chrDelimiterPos == std::string::npos) { - std::cerr << "Invalid region format. Use rname:start-end\n"; - return 0; - } - const TString rname = region.substr(0, chrDelimiterPos); - int rangeDelimiterPos = region.find("-", chrDelimiterPos); - if (rangeDelimiterPos == std::string::npos) { - std::cerr << "Invalid region format. Use rname:start-end\n"; - return 0; - } - const Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); - const Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); - - // auto index = FromRNTuple("INDEX", file); - auto metadata = FromRNTuple("METADATA", file); - const auto ref = rdf::GetRefId(metadata, rname.Data()); - if (ref == -1) { - std::cerr << "Chromosome not found:" << rname << "\n"; - } - ROOT::EnableImplicitMT(num_threads); - auto ram = FromRNTuple("RAM", file); - auto check = [ref, range_start, range_end](int32_t refid, int32_t pos) { - return (refid == ref) && (pos >= range_start - 1) && (pos <= range_end - 1); - }; - auto filtered = ram.Filter(check, {"record.refid", "record.pos"}); - auto num = filtered.Count(); - *num; - ts.Print(); - return *num; -} diff --git a/tools/ramntupleview.cxx b/tools/ramntupleview.cxx index 2aa3a8d..181c29f 100644 --- a/tools/ramntupleview.cxx +++ b/tools/ramntupleview.cxx @@ -1,5 +1,4 @@ #include "ramcore/RAMNTupleView.h" -#include "ramcore/RDF_RAMNTupleView.h" #include #include #include @@ -22,9 +21,9 @@ int main(int argc, char *argv[]) const char *file = argv[1]; const char *region_str = (argc > 2) ? 
argv[2] : ""; - Long64_t read_count; + ULong64_t read_count; if (mt_set) { - read_count = rdf_ramntupleview(num_threads, file, region_str); + read_count = mt_ramntupleview(num_threads, file, region_str); } else { read_count = ramntupleview(file, region_str); } From 5a98fbabd546673b0c225fc6fb1b090356c791dd Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Fri, 17 Apr 2026 23:19:54 +0300 Subject: [PATCH 06/11] Add indexing & Remove print debugging for multithreading --- src/ramcore/RAMNTupleView.cxx | 52 +++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 6c7751f..08b954b 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -1,4 +1,6 @@ #include "ramcore/RAMNTupleView.h" +#include +#include #include #include #include @@ -7,7 +9,9 @@ #include #include #include +#include #include +#include #include #include @@ -56,6 +60,34 @@ int computeRefSpan(const std::vector &cigarOps) } return span; } +std::pair FindIndex(ROOT::RDataFrame &df, int refid, int start, int end) +{ + ULong64_t first = 0; + ULong64_t last = std::numeric_limits::max(); + auto entries = (*df.Take>("index_entries"))[0]; + std::size_t i = 0; + // loop to find the nearest inclusive starting index + for (; i < entries.size(); ++i) { + if (entries[i].refid == refid) { + if (entries[i].pos == start) { + first = entries[i].entry; + break; + } + if (entries[i].pos > start) { + first = entries[i - 1].entry; + break; + } + } + } + for (; i < entries.size(); ++i) { + if (entries[i].refid == refid && entries[i].pos >= end) { + last = entries[i].entry; + break; + } + } + return std::make_pair(first, last); +} + int GetRefId(ROOT::RDataFrame &df, const std::string &rname) { if (rname == "*") @@ -158,12 +190,10 @@ Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool std::cerr << "Reference '" << rname.Data() << "' not found\n"; return 0; } - auto 
flagView = reader->GetView("record.flag"); auto refidView = reader->GetView("record.refid"); auto posView = reader->GetView("record.pos"); auto cigarView = reader->GetView>("record.cigar"); - auto index = RAMNTupleRecord::GetIndex(); Long64_t start = (index && index->Size() > 0) ? index->GetRow(refid, rs) : 0; if (start < 0) @@ -206,21 +236,36 @@ ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *q { TStopwatch st; st.Start(); + TString rname; std::string region = query; Int_t start = 0; Int_t end = 0; + if (!parseRegion(region, rname, start, end)) { std::cerr << "Invalid region format. Use rname[:start[-end]]\n"; return 0; } auto metadata = ROOT::RDF::FromRNTuple("METADATA", file); const int refid = GetRefId(metadata, rname.Data()); + if (refid < 0) { std::cerr << "Reference" << rname.Data() << " not found\n"; } + + auto index = ROOT::RDF::FromRNTuple("INDEX", file); + + auto range = FindIndex(index, refid, start, end); ROOT::EnableImplicitMT(numthreads); - auto ram = ROOT::RDF::FromRNTuple("RAM", file); + + ROOT::RDF::Experimental::RDatasetSpec spec; + ROOT::RDF::Experimental::RSample sample("reads", "RAM", file); + spec.AddSample(sample); + ROOT::RDF::Experimental::RDatasetSpec::REntryRange entry_range(range.first, range.second); + spec.WithGlobalRange(entry_range); + + auto ram = ROOT::RDataFrame(spec); + auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos) { return (refid == refidentry) && (pos >= start) && (pos <= end); }; @@ -229,5 +274,6 @@ ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *q auto count = filtered.Count(); *count; st.Print(); + return *count; } From e5909e7543b32ba3ff39dd000ebb4076f6034f38 Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Sat, 18 Apr 2026 17:36:57 +0300 Subject: [PATCH 07/11] Fix indexing errors --- src/ramcore/RAMNTupleView.cxx | 21 +++++++++++++++++---- src/ramcore/SamToNTuple.cxx | 4 ++-- test/CMakeLists.txt | 6 +++--- 3 files changed, 22 
insertions(+), 9 deletions(-) diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 08b954b..107e45d 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -74,12 +74,21 @@ std::pair FindIndex(ROOT::RDataFrame &df, int refid, int sta break; } if (entries[i].pos > start) { + if (i == 0){ + first = entries[i].entry; + break; + } first = entries[i - 1].entry; break; } } } for (; i < entries.size(); ++i) { + if (entries[i].refid > refid) + { + last = entries[i].entry; + break; + } if (entries[i].refid == refid && entries[i].pos >= end) { last = entries[i].entry; break; @@ -254,17 +263,21 @@ ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *q } auto index = ROOT::RDF::FromRNTuple("INDEX", file); - + st.Print(); + st.Start(); auto range = FindIndex(index, refid, start, end); + std::cout << range.first << ' ' << range.second << '\n'; ROOT::EnableImplicitMT(numthreads); - + st.Print(); + st.Start(); ROOT::RDF::Experimental::RDatasetSpec spec; ROOT::RDF::Experimental::RSample sample("reads", "RAM", file); spec.AddSample(sample); ROOT::RDF::Experimental::RDatasetSpec::REntryRange entry_range(range.first, range.second); spec.WithGlobalRange(entry_range); - - auto ram = ROOT::RDataFrame(spec); + std::vector files = {file}; + auto ram = ROOT::RDF::RDFInternal::FromRNTuple("RAM", files, range); + //auto ram = ROOT::RDataFrame(spec); auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos) { return (refid == refidentry) && (pos >= start) && (pos <= end); diff --git a/src/ramcore/SamToNTuple.cxx b/src/ramcore/SamToNTuple.cxx index f6d9a78..9abb466 100644 --- a/src/ramcore/SamToNTuple.cxx +++ b/src/ramcore/SamToNTuple.cxx @@ -234,7 +234,7 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output writeOptions.SetUseBufferedWrite(true); auto parallel_writer = - ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); + 
ROOT::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); const int contexts_per_file = std::min(4, num_threads); const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file; @@ -313,4 +313,4 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output t.join(); } } -} \ No newline at end of file +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 24b54f0..d067c3a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,14 +11,14 @@ function(add_ramcore_test test_name) ROOT::ROOTNTuple ROOT::TreePlayer ROOT::RIO - sam_generator + sam_generator gtest gtest_main ) if(RAMTOOLS_BUILD_BENCHMARKS) - - target_link_libraries(${test_name} benchmark::benchmark) + + target_link_libraries(${test_name} PUBLIC benchmark::benchmark) endif() target_include_directories(${test_name} From 4c1a1f3259f9141458066ae95ad750b0a9ef6c7d Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Tue, 21 Apr 2026 23:33:46 +0300 Subject: [PATCH 08/11] Fix slow opening of file --- src/ramcore/RAMNTupleView.cxx | 76 ++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 107e45d..156f64f 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -64,33 +65,37 @@ std::pair FindIndex(ROOT::RDataFrame &df, int refid, int sta { ULong64_t first = 0; ULong64_t last = std::numeric_limits::max(); - auto entries = (*df.Take>("index_entries"))[0]; + auto entries_refid = df.Take>("index_entries_refid"); + + auto entries_pos = df.Take>("index_entries_pos"); + + auto entries_entry = df.Take>("index_entries_entry"); + std::size_t i = 0; // loop to find the nearest inclusive starting index - for (; i < entries.size(); ++i) { - if (entries[i].refid == refid) { - if (entries[i].pos == start) { - first = 
entries[i].entry; + for (; i < (*entries_entry)[0].size(); ++i) { + if ((*entries_refid)[0][i] == refid) { + if ((*entries_pos)[0][i] == start) { + first = (*entries_entry)[0][i]; /* exact position match: take the row's entry number, as the removed entries[i].entry did (not its refid) */ break; } - if (entries[i].pos > start) { + if ((*entries_pos)[0][i] > start) { if (i == 0){ - first = entries[i].entry; + first = (*entries_entry)[0][i]; break; } - first = entries[i - 1].entry; + first = (*entries_entry)[0][i - 1]; break; } } } - for (; i < entries.size(); ++i) { - if (entries[i].refid > refid) - { - last = entries[i].entry; + for (; i < (*entries_refid)[0].size(); ++i) { + if ((*entries_refid)[0][i] > refid) { + last = (*entries_entry)[0][i]; break; } - if (entries[i].refid == refid && entries[i].pos >= end) { - last = entries[i].entry; + if ((*entries_refid)[0][i] == refid && (*entries_pos)[0][i] >= end) { + last = (*entries_entry)[0][i]; break; } } @@ -262,31 +267,38 @@ ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *q std::cerr << "Reference" << rname.Data() << " not found\n"; } - auto index = ROOT::RDF::FromRNTuple("INDEX", file); + std::pair range; + try { + + auto index = ROOT::RDF::FromRNTuple("INDEX_FAST", file); + range = FindIndex(index, refid, start, end); + } catch (...) 
{ + + std::cerr << "[-]Fast index wasn't found\n[*]Creating fast index ...\n"; + auto index_old = ROOT::RDF::FromRNTuple("INDEX", file); + ROOT::RDF::RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + opts.fMode = "UPDATE"; + index_old.Snapshot("INDEX_FAST", file, {"index_entries.pos", "index_entries.refid", "index_entries.entry"}, opts); + auto index = ROOT::RDF::FromRNTuple("INDEX_FAST", file); + range = FindIndex(index, refid, start, end); + std::cerr << "[+]Index created!\n"; + } + st.Print(); st.Start(); - auto range = FindIndex(index, refid, start, end); std::cout << range.first << ' ' << range.second << '\n'; ROOT::EnableImplicitMT(numthreads); + std::vector files = {file}; + auto ram = ROOT::Internal::RDF::FromRNTuple("RAM", files, range); st.Print(); st.Start(); - ROOT::RDF::Experimental::RDatasetSpec spec; - ROOT::RDF::Experimental::RSample sample("reads", "RAM", file); - spec.AddSample(sample); - ROOT::RDF::Experimental::RDatasetSpec::REntryRange entry_range(range.first, range.second); - spec.WithGlobalRange(entry_range); - std::vector files = {file}; - auto ram = ROOT::RDF::RDFInternal::FromRNTuple("RAM", files, range); - //auto ram = ROOT::RDataFrame(spec); - - auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos) { - return (refid == refidentry) && (pos >= start) && (pos <= end); + auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos, uint16_t flag) { + return !(flag & FLAG_FILTER) && (refid == refidentry) && (pos >= start) && (pos <= end); }; - - auto filtered = ram.Filter(filterfunc, {"record.refid", "record.pos"}); + auto filtered = ram.Filter(filterfunc, {"record.refid", "record.pos", "record.flag"}); auto count = filtered.Count(); - *count; + auto res = *count; st.Print(); - - return *count; + return res; } From f84eb46ee3dab7ef89af3a4ec3fb98e13fea08ad Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Sat, 25 Apr 2026 19:05:25 +0300 Subject: [PATCH 09/11] Validate 
input --- src/ramcore/RAMNTupleView.cxx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index 156f64f..c6ab06d 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -252,7 +252,17 @@ ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *q st.Start(); TString rname; - std::string region = query; + std::string region = query ? query : ""; + if (region.empty() || region == "*") { +auto reader = RAMNTupleRecord::OpenRAMFile(file); + if (!reader) { + std::cerr << "ramntupleview: failed to open file " << file << std::endl; + return 0; + } +st.Print(); + return reader->GetNEntries(); + + } Int_t start = 0; Int_t end = 0; From 6fc5e4a55c3bda2bf0fc785b1dbd3daec43f9371 Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Sat, 25 Apr 2026 19:05:56 +0300 Subject: [PATCH 10/11] Add tests for multithreading --- test/ramcoretests.cxx | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 7dd8b24..08c6ac6 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -92,7 +92,42 @@ TEST_F(ramcoreTest, RNTupleViewRegionQueries) Long64_t zeroStart = ramntupleview(rntupleFile, "chr1:0-100", true, false, nullptr); EXPECT_GE(zeroStart, 0); } +TEST_F(ramcoreTest, MT_RNTupleViewRegionQueries) +{ + const int numthreads = 16; + const char *rntupleFile = "test_rntuple.root"; + samtoramntuple("samexample.sam", rntupleFile, true, true, true, 505, 0); + + Long64_t hit = mt_ramntupleview(numthreads, rntupleFile, "chr1:1-1000000", true, false, nullptr); + EXPECT_GE(hit, 0); + + Long64_t miss = mt_ramntupleview(numthreads, rntupleFile, "chrNonExistent:1-100", true, false, nullptr); + EXPECT_EQ(miss, 0); + + Long64_t wildcard = mt_ramntupleview(numthreads, rntupleFile, "*", true, false, nullptr); + EXPECT_EQ(wildcard, 100); + + Long64_t empty = 
mt_ramntupleview(numthreads, rntupleFile, "", true, false, nullptr); + EXPECT_EQ(empty, 100); + + Long64_t null = mt_ramntupleview(numthreads, rntupleFile, nullptr, true, false, nullptr); + EXPECT_EQ(null, 100); + + Long64_t whole = mt_ramntupleview(numthreads, rntupleFile, "chr1", true, false, nullptr); + EXPECT_GE(whole, 0); + + Long64_t single = mt_ramntupleview(numthreads, rntupleFile, "chr1:500", true, false, nullptr); + EXPECT_GE(single, 0); + + Long64_t invalid = mt_ramntupleview(numthreads, rntupleFile, "chr1:abc-def", true, false, nullptr); + EXPECT_EQ(invalid, 0); + + Long64_t lateChr = mt_ramntupleview(numthreads, rntupleFile, "chrX:1-100", true, false, nullptr); + EXPECT_GE(lateChr, 0); + Long64_t zeroStart = mt_ramntupleview(numthreads, rntupleFile, "chr1:0-100", true, false, nullptr); + EXPECT_GE(zeroStart, 0); +} TEST_F(ramcoreTest, RNTupleViewOpenFailure) { Long64_t count = ramntupleview("nonexistent_file.root", "chr1:1-100", true, false, nullptr); @@ -462,4 +497,4 @@ TEST_F(ramcoreTest, QUALEncodingDecodingModes) std::remove(samFile); std::remove(ramFile); -} \ No newline at end of file +} From c4003494797720a2189e11cbeee2b93617a4a348 Mon Sep 17 00:00:00 2001 From: Georgi Haralanov Date: Wed, 29 Apr 2026 16:47:00 +0300 Subject: [PATCH 11/11] Fix some edge cases --- src/ramcore/RAMNTupleView.cxx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index c6ab06d..d0d6718 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -89,6 +89,9 @@ std::pair FindIndex(ROOT::RDataFrame &df, int refid, int sta } } } + if (first == 0) { + i = 0; + } for (; i < (*entries_refid)[0].size(); ++i) { if ((*entries_refid)[0][i] > refid) { last = (*entries_entry)[0][i]; @@ -270,11 +273,15 @@ st.Print(); std::cerr << "Invalid region format. 
Use rname[:start[-end]]\n"; return 0; } + if (start == end) { + return 0; + } auto metadata = ROOT::RDF::FromRNTuple("METADATA", file); const int refid = GetRefId(metadata, rname.Data()); if (refid < 0) { - std::cerr << "Reference" << rname.Data() << " not found\n"; + std::cerr << "Reference " << rname.Data() << " not found\n"; + return 0; } std::pair range;