-
Notifications
You must be signed in to change notification settings - Fork 8
Adding option for multithreaded query mode #27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
f953702
d18ecdd
a6c7d28
aa5df5b
aed685f
10d62d8
5a98fba
e5909e7
4c1a1f3
f84eb46
6fc5e4a
c400349
4ca2d3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ ramexample.root | |
| *.root.idx | ||
| .ipynb_checkpoints/ | ||
| tmp/ | ||
| .cache | ||
| *.so | ||
| *.d | ||
| *.pcm | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,17 +1,25 @@ | ||
| #include "ramcore/RAMNTupleView.h" | ||
| #include <ROOT/RDF/RDatasetSpec.hxx> | ||
| #include <ROOT/RDF/RSample.hxx> | ||
| #include <ROOT/RDataFrame.hxx> | ||
| #include <ROOT/RNTupleDS.hxx> | ||
| #include <ROOT/RSnapshotOptions.hxx> | ||
| #include <algorithm> | ||
|
|
||
| #include <cctype> | ||
| #include <cstddef> | ||
| #include <cstdint> | ||
| #include <iostream> | ||
| #include <limits> | ||
| #include <string> | ||
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include <ROOT/RNTuple.hxx> | ||
| #include <ROOT/RNTupleReader.hxx> | ||
| #include <ROOT/RNTupleView.hxx> | ||
| #include <Rtypes.h> | ||
| #include <TROOT.h> | ||
| #include <TStopwatch.h> | ||
| #include <TString.h> | ||
|
|
||
|
|
@@ -53,6 +61,61 @@ int computeRefSpan(const std::vector<uint32_t> &cigarOps) | |
| } | ||
| return span; | ||
| } | ||
| std::pair<Long64_t, Long64_t> FindIndex(ROOT::RDataFrame &df, int refid, int start, int end) | ||
| { | ||
| ULong64_t first = 0; | ||
| ULong64_t last = std::numeric_limits<Long64_t>::max(); | ||
| auto entries_refid = df.Take<std::vector<uint64_t>>("index_entries_refid"); | ||
|
|
||
| auto entries_pos = df.Take<std::vector<uint64_t>>("index_entries_pos"); | ||
|
|
||
| auto entries_entry = df.Take<std::vector<uint64_t>>("index_entries_entry"); | ||
|
|
||
| std::size_t i = 0; | ||
| // loop to find the nearest inclusive starting index | ||
| for (; i < (*entries_entry)[0].size(); ++i) { | ||
| if ((*entries_refid)[0][i] == refid) { | ||
| if ((*entries_pos)[0][i] == start) { | ||
| first = (*entries_refid)[0][i]; | ||
| break; | ||
| } | ||
| if ((*entries_pos)[0][i] > start) { | ||
| if (i == 0){ | ||
| first = (*entries_entry)[0][i]; | ||
| break; | ||
| } | ||
| first = (*entries_entry)[0][i - 1]; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| if (first == 0) { | ||
| i = 0; | ||
| } | ||
| for (; i < (*entries_refid)[0].size(); ++i) { | ||
| if ((*entries_refid)[0][i] > refid) { | ||
| last = (*entries_entry)[0][i]; | ||
| break; | ||
| } | ||
| if ((*entries_refid)[0][i] == refid && (*entries_pos)[0][i] >= end) { | ||
| last = (*entries_entry)[0][i]; | ||
| break; | ||
| } | ||
| } | ||
| return std::make_pair(first, last); | ||
| } | ||
|
|
||
| int GetRefId(ROOT::RDataFrame &df, const std::string &rname) | ||
| { | ||
| if (rname == "*") | ||
| return -1; | ||
|
|
||
| auto refs = df.Take<std::vector<std::string>>("rname_refs"); | ||
| const auto &refids = refs.GetValue()[0]; | ||
|
|
||
| auto it = std::find(refids.begin(), refids.end(), rname); | ||
| return (it == refids.end()) ? -1 : std::distance(refids.begin(), it); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "std::distance" is directly included [misc-include-cleaner] src/ramcore/RAMNTupleView.cxx:9: - #include <string>
+ #include <iterator>
+ #include <string> |
||
| } | ||
|
|
||
| int resolveRefId(const char *name) | ||
| { | ||
|
|
@@ -144,12 +207,10 @@ Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool | |
| std::cerr << "Reference '" << rname.Data() << "' not found\n"; | ||
| return 0; | ||
| } | ||
|
|
||
| auto flagView = reader->GetView<uint16_t>("record.flag"); | ||
| auto refidView = reader->GetView<int32_t>("record.refid"); | ||
| auto posView = reader->GetView<int32_t>("record.pos"); | ||
| auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar"); | ||
|
|
||
| auto index = RAMNTupleRecord::GetIndex(); | ||
| Long64_t start = (index && index->Size() > 0) ? index->GetRow(refid, rs) : 0; | ||
| if (start < 0) | ||
|
|
@@ -185,4 +246,76 @@ Long64_t ramntupleview(const char *file, const char *query, bool /*cache*/, bool | |
| stopwatch.Print(); | ||
| std::cout << "Found " << count << " records in region " << region << std::endl; | ||
| return count; | ||
| } | ||
| } | ||
| // NOLINTNEXTLINE(misc-use-internal-linkage) | ||
| ULong64_t mt_ramntupleview(const int numthreads, const char *file, const char *query, bool /*cache*/, | ||
| bool /*perfstats*/, const char * /*perfstatsfilename*/) | ||
| { | ||
| TStopwatch st; | ||
| st.Start(); | ||
|
|
||
| TString rname; | ||
| std::string region = query ? query : ""; | ||
| if (region.empty() || region == "*") { | ||
| auto reader = RAMNTupleRecord::OpenRAMFile(file); | ||
| if (!reader) { | ||
| std::cerr << "ramntupleview: failed to open file " << file << std::endl; | ||
| return 0; | ||
| } | ||
| st.Print(); | ||
| return reader->GetNEntries(); | ||
|
|
||
| } | ||
| Int_t start = 0; | ||
| Int_t end = 0; | ||
|
|
||
| if (!parseRegion(region, rname, start, end)) { | ||
| std::cerr << "Invalid region format. Use rname[:start[-end]]\n"; | ||
| return 0; | ||
| } | ||
| if (start == end) { | ||
| return 0; | ||
| } | ||
| auto metadata = ROOT::RDF::FromRNTuple("METADATA", file); | ||
| const int refid = GetRefId(metadata, rname.Data()); | ||
|
|
||
| if (refid < 0) { | ||
| std::cerr << "Reference " << rname.Data() << " not found\n"; | ||
| return 0; | ||
| } | ||
|
|
||
| std::pair<Long64_t, Long64_t> range; | ||
| try { | ||
|
|
||
| auto index = ROOT::RDF::FromRNTuple("INDEX_FAST", file); | ||
| range = FindIndex(index, refid, start, end); | ||
| } catch (...) { | ||
|
|
||
| std::cerr << "[-]Fast index wasn't found\n[*]Creating fast index ...\n"; | ||
| auto index_old = ROOT::RDF::FromRNTuple("INDEX", file); | ||
| ROOT::RDF::RSnapshotOptions opts; | ||
| opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; | ||
| opts.fMode = "UPDATE"; | ||
| index_old.Snapshot("INDEX_FAST", file, {"index_entries.pos", "index_entries.refid", "index_entries.entry"}, opts); | ||
| auto index = ROOT::RDF::FromRNTuple("INDEX_FAST", file); | ||
| range = FindIndex(index, refid, start, end); | ||
| std::cerr << "[+]Index created!\n"; | ||
| } | ||
|
|
||
| st.Print(); | ||
| st.Start(); | ||
| std::cout << range.first << ' ' << range.second << '\n'; | ||
| ROOT::EnableImplicitMT(numthreads); | ||
| std::vector<std::string> files = {file}; | ||
| auto ram = ROOT::Internal::RDF::FromRNTuple("RAM", files, range); | ||
| st.Print(); | ||
| st.Start(); | ||
| auto filterfunc = [refid, start, end](int32_t refidentry, int32_t pos, uint16_t flag) { | ||
| return !(flag & FLAG_FILTER) && (refid == refidentry) && (pos >= start) && (pos <= end); | ||
| }; | ||
| auto filtered = ram.Filter(filterfunc, {"record.refid", "record.pos", "record.flag"}); | ||
| auto count = filtered.Count(); | ||
| auto res = *count; | ||
| st.Print(); | ||
| return res; | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -2,21 +2,32 @@ | |||||
| #include <iostream> | ||||||
| #include <stdio.h> | ||||||
| #include <Rtypes.h> | ||||||
|
|
||||||
| int main(int argc, char *argv[]) | ||||||
| { | ||||||
| if (argc < 2) { | ||||||
| std::cerr << "Usage: " << argv[0] << " <file.root> [rname:start-end]\n"; | ||||||
| std::cerr << "Example: " << argv[0] << " output.root chr1:1000-2000\n"; | ||||||
| return 1; | ||||||
| } | ||||||
| bool mt_set = false; | ||||||
| int num_threads = 0; | ||||||
| if (argc > 3) { | ||||||
| std::string mt_flag = argv[3]; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: do not use pointer arithmetic [cppcoreguidelines-pro-bounds-pointer-arithmetic] std::string mt_flag = argv[3];
^There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "std::string" is directly included [misc-include-cleaner] tools/ramntupleview.cxx:4: - int main(int argc, char *argv[])
+ #include <string>
+ int main(int argc, char *argv[]) |
||||||
| if (mt_flag.find("-m") != std::string::npos) { | ||||||
| mt_set = true; | ||||||
| num_threads = std::atoi(mt_flag.substr(2).data()); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "std::atoi" is directly included [misc-include-cleaner] tools/ramntupleview.cxx:1: - #include <iostream>
+ #include <cstdlib>
+ #include <iostream> |
||||||
| } | ||||||
| } | ||||||
|
|
||||||
| const char *file = argv[1]; | ||||||
| const char *region_str = (argc > 2) ? argv[2] : ""; | ||||||
|
|
||||||
| Long64_t read_count = ramntupleview(file, region_str); | ||||||
|
|
||||||
| printf("Found %lld records in region %s\n", read_count, region_str); | ||||||
| ULong64_t read_count; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "ULong64_t" is directly included [misc-include-cleaner] tools/ramntupleview.cxx:1: - #include <iostream>
+ #include <RtypesCore.h>
+ #include <iostream>There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: variable 'read_count' is not initialized [cppcoreguidelines-init-variables]
Suggested change
|
||||||
| if (mt_set) { | ||||||
| read_count = mt_ramntupleview(num_threads, file, region_str); | ||||||
| } else { | ||||||
| read_count = ramntupleview(file, region_str); | ||||||
| } | ||||||
| printf("Found %lld records in region %s", read_count, region_str); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: do not call c-style vararg functions [cppcoreguidelines-pro-type-vararg] printf("Found %lld records in region %s", read_count, region_str);
^ |
||||||
|
|
||||||
| return 0; | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: narrowing conversion from 'typename iterator_traits<__normal_iterator<const basic_string *, vector<basic_string>>>::difference_type' (aka 'long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]