-
Notifications
You must be signed in to change notification settings - Fork 8
Add ramsort tool: coordinate and name sort for RAM (RNTuple) files #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
f1a37dc
d55b669
a57b2da
bd5d9b9
e08a403
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef RAMCORE_RAMSORT_H | ||
| #define RAMCORE_RAMSORT_H | ||
|
|
||
| /// Sort a RAM (RNTuple) file by coordinate (refid, pos) or by QNAME. | ||
| /// \param inputFile Path to input .root RAM file | ||
| /// \param outputFile Path to output .root RAM file | ||
| /// \param byName If true, sort by QNAME; otherwise sort by (refid, pos) | ||
| /// \return 0 on success, 1 on error | ||
| int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: unknown type name 'bool' [clang-diagnostic-error]
int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
^
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: use of undeclared identifier 'false' [clang-diagnostic-error]
int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
^ |
||
|
|
||
| #endif // RAMCORE_RAMSORT_H | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,98 @@ | ||||||
| #include "ramcore/RAMSort.h" | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: 'ramcore/RAMSort.h' file not found [clang-diagnostic-error]
#include "ramcore/RAMSort.h"
^ |
||||||
| #include "rntuple/RAMNTupleRecord.h" | ||||||
|
|
||||||
| #include <ROOT/RNTupleModel.hxx> | ||||||
| #include <ROOT/RNTupleReader.hxx> | ||||||
| #include <ROOT/RNTupleView.hxx> | ||||||
| #include <ROOT/RNTupleWriteOptions.hxx> | ||||||
| #include <ROOT/RNTupleWriter.hxx> | ||||||
| #include <TFile.h> | ||||||
|
|
||||||
| #include <algorithm> | ||||||
| #include <cstdint> | ||||||
| #include <exception> | ||||||
| #include <iostream> | ||||||
| #include <memory> | ||||||
| #include <numeric> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: included header memory is not used directly [misc-include-cleaner]
Suggested change
|
||||||
| #include <string> | ||||||
| #include <utility> | ||||||
| #include <vector> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: included header utility is not used directly [misc-include-cleaner]
Suggested change
|
||||||
|
|
||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: included header vector is not used directly [misc-include-cleaner]
Suggested change
|
||||||
| int ramsortntuple(const char *inputFile, const char *outputFile, bool byName) | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: function 'ramsortntuple' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]
Suggested change
|
||||||
| { | ||||||
| RAMNTupleRecord::ReadAllRefs(inputFile); | ||||||
|
|
||||||
| std::unique_ptr<ROOT::RNTupleReader> reader; | ||||||
| try { | ||||||
| reader = ROOT::RNTupleReader::Open("RAM", inputFile); | ||||||
| } catch (const std::exception &e) { | ||||||
| std::cerr << "Error opening input: " << e.what() << "\n"; | ||||||
| return 1; | ||||||
| } | ||||||
|
|
||||||
| const uint64_t nEntries = reader->GetNEntries(); | ||||||
| if (nEntries == 0) { | ||||||
| std::cerr << "Input file has no entries.\n"; | ||||||
| return 1; | ||||||
| } | ||||||
|
|
||||||
| auto viewRefId = reader->GetView<int32_t>("record.refid"); | ||||||
| auto viewPos = reader->GetView<int32_t>("record.pos"); | ||||||
| auto viewQname = reader->GetView<std::string>("record.qname"); | ||||||
|
|
||||||
| std::vector<uint64_t> order(nEntries); | ||||||
| std::iota(order.begin(), order.end(), 0ULL); | ||||||
|
|
||||||
| std::cout << "Sorting " << nEntries << " records"; | ||||||
| if (byName) | ||||||
| std::cout << " by QNAME...\n"; | ||||||
| else | ||||||
| std::cout << " by coordinate (refid, pos)...\n"; | ||||||
|
|
||||||
| if (byName) { | ||||||
| std::vector<std::string> qnames(nEntries); | ||||||
| for (uint64_t i = 0; i < nEntries; ++i) | ||||||
| qnames[i] = viewQname(i); | ||||||
| std::stable_sort(order.begin(), order.end(), | ||||||
| [&](uint64_t a, uint64_t b) { return qnames[a] < qnames[b]; }); | ||||||
| } else { | ||||||
| std::vector<int32_t> refids(nEntries); | ||||||
| std::vector<int32_t> positions(nEntries); | ||||||
| for (uint64_t i = 0; i < nEntries; ++i) { | ||||||
| refids[i] = viewRefId(i); | ||||||
| positions[i] = viewPos(i); | ||||||
| } | ||||||
| std::stable_sort(order.begin(), order.end(), [&](uint64_t a, uint64_t b) { | ||||||
| if (refids[a] != refids[b]) | ||||||
| return refids[a] < refids[b]; | ||||||
| return positions[a] < positions[b]; | ||||||
| }); | ||||||
| } | ||||||
|
|
||||||
| auto viewRecord = reader->GetView<RAMNTupleRecord>("record"); | ||||||
|
|
||||||
| auto rootFile = std::unique_ptr<TFile>(TFile::Open(outputFile, /*option=*/"RECREATE")); | ||||||
| if (!rootFile || !rootFile->IsOpen()) { | ||||||
| std::cerr << "Error: could not create output file " << outputFile << "\n"; | ||||||
| return 1; | ||||||
| } | ||||||
|
|
||||||
| RAMNTupleRecord::InitializeRefs(); | ||||||
| auto model = RAMNTupleRecord::MakeModel(); | ||||||
| ROOT::RNTupleWriteOptions writeOptions; | ||||||
| writeOptions.SetCompression(/*val=*/505); | ||||||
| auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions); | ||||||
| auto entry = writer->GetModel().CreateEntry(); | ||||||
| auto recordPtr = entry->GetPtr<RAMNTupleRecord>("record"); | ||||||
|
|
||||||
| for (uint64_t idx : order) { | ||||||
| *recordPtr = viewRecord(idx); | ||||||
| writer->Fill(*entry); | ||||||
| } | ||||||
|
|
||||||
| RAMNTupleRecord::WriteAllRefs(*rootFile); | ||||||
| RAMNTupleRecord::WriteIndex(*rootFile); | ||||||
|
|
||||||
| std::cout << "Sorted output written to " << outputFile << "\n"; | ||||||
| return 0; | ||||||
| } | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,142 @@ | ||
| #include <gtest/gtest.h> | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
warning: 'gtest/gtest.h' file not found [clang-diagnostic-error]
#include <gtest/gtest.h>
^ |
||
| #include <ROOT/RNTupleReader.hxx> | ||
| #include <ROOT/RNTupleView.hxx> | ||
| #include <cstdio> | ||
| #include <fstream> | ||
|
|
||
| #include "../benchmark/generate_sam_benchmark.h" | ||
| #include "ramcore/RAMSort.h" | ||
| #include "ramcore/SamToNTuple.h" | ||
|
|
||
| namespace { | ||
|
|
||
| class RAMSortTest : public ::testing::Test { | ||
| protected: | ||
| static constexpr int kNumReads = 200; | ||
| const char *kSamFile = "sort_test.sam"; | ||
| const char *kUnsortedFile = "sort_test_unsorted.root"; | ||
| const char *kSortedFile = "sort_test_sorted.root"; | ||
| const char *kNameSortFile = "sort_test_namesort.root"; | ||
|
|
||
| void SetUp() override | ||
| { | ||
| GenerateSAMFile(kSamFile, kNumReads); | ||
| std::remove(kUnsortedFile); | ||
| std::remove(kSortedFile); | ||
| std::remove(kNameSortFile); | ||
| samtoramntuple(kSamFile, kUnsortedFile, true, true, true, 505, 0); | ||
| } | ||
|
|
||
| void TearDown() override | ||
| { | ||
| std::remove(kSamFile); | ||
| std::remove(kUnsortedFile); | ||
| std::remove(kSortedFile); | ||
| std::remove(kNameSortFile); | ||
| } | ||
| }; | ||
|
|
||
| TEST_F(RAMSortTest, EntryCountPreserved) | ||
| { | ||
| ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); | ||
| auto readerIn = ROOT::RNTupleReader::Open("RAM", kUnsortedFile); | ||
| auto readerOut = ROOT::RNTupleReader::Open("RAM", kSortedFile); | ||
| ASSERT_NE(readerIn, nullptr); | ||
| ASSERT_NE(readerOut, nullptr); | ||
| EXPECT_EQ(readerIn->GetNEntries(), readerOut->GetNEntries()); | ||
| } | ||
|
|
||
| TEST_F(RAMSortTest, CoordinateSortOrder) | ||
| { | ||
| ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); | ||
| auto reader = ROOT::RNTupleReader::Open("RAM", kSortedFile); | ||
| ASSERT_NE(reader, nullptr); | ||
| auto viewRefId = reader->GetView<int32_t>("record.refid"); | ||
| auto viewPos = reader->GetView<int32_t>("record.pos"); | ||
| int32_t prevRefId = -1, prevPos = -1; | ||
| for (uint64_t i = 0; i < reader->GetNEntries(); ++i) { | ||
| int32_t refid = viewRefId(i); | ||
| int32_t pos = viewPos(i); | ||
| if (refid == prevRefId) { | ||
| EXPECT_GE(pos, prevPos) << "pos out of order at entry " << i; | ||
| } else { | ||
| EXPECT_GE(refid, prevRefId) << "refid out of order at entry " << i; | ||
| } | ||
| prevRefId = refid; | ||
| prevPos = pos; | ||
| } | ||
| } | ||
|
|
||
| TEST_F(RAMSortTest, NameSortOrder) | ||
| { | ||
| ASSERT_EQ(ramsortntuple(kUnsortedFile, kNameSortFile, true), 0); | ||
| auto reader = ROOT::RNTupleReader::Open("RAM", kNameSortFile); | ||
| ASSERT_NE(reader, nullptr); | ||
| auto viewQname = reader->GetView<std::string>("record.qname"); | ||
| std::string prev = ""; | ||
| for (uint64_t i = 0; i < reader->GetNEntries(); ++i) { | ||
| std::string qname = viewQname(i); | ||
| EXPECT_GE(qname, prev) << "qname out of order at entry " << i; | ||
| prev = qname; | ||
| } | ||
| } | ||
|
|
||
| TEST_F(RAMSortTest, IdempotentSort) | ||
| { | ||
| ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); | ||
| const char *doubleSorted = "sort_test_double.root"; | ||
| ASSERT_EQ(ramsortntuple(kSortedFile, doubleSorted), 0); | ||
| auto r1 = ROOT::RNTupleReader::Open("RAM", kSortedFile); | ||
| auto r2 = ROOT::RNTupleReader::Open("RAM", doubleSorted); | ||
| ASSERT_NE(r1, nullptr); | ||
| ASSERT_NE(r2, nullptr); | ||
| EXPECT_EQ(r1->GetNEntries(), r2->GetNEntries()); | ||
| auto v1refid = r1->GetView<int32_t>("record.refid"); | ||
| auto v2refid = r2->GetView<int32_t>("record.refid"); | ||
| auto v1pos = r1->GetView<int32_t>("record.pos"); | ||
| auto v2pos = r2->GetView<int32_t>("record.pos"); | ||
| for (uint64_t i = 0; i < r1->GetNEntries(); ++i) { | ||
| EXPECT_EQ(v1refid(i), v2refid(i)); | ||
| EXPECT_EQ(v1pos(i), v2pos(i)); | ||
| } | ||
| std::remove(doubleSorted); | ||
| } | ||
|
|
||
| TEST(RAMSortEdgeCases, MissingInputFileReturnsError) | ||
| { | ||
| int ret = ramsortntuple("nonexistent.root", "/tmp/out.root"); | ||
| EXPECT_NE(ret, 0); | ||
| } | ||
|
|
||
| /// ramsortntuple on an empty file must return 1. | ||
| TEST(RAMSortEdgeCases, EmptyFileReturnsError) | ||
| { | ||
| // Create an empty RAM file with zero entries | ||
| const char *emptySam = "empty_sort.sam"; | ||
| const char *emptyRoot = "empty_sort.root"; | ||
| { | ||
| std::ofstream f(emptySam); | ||
| f << "@HD\tVN:1.6\n"; | ||
| } | ||
| samtoramntuple(emptySam, emptyRoot, false, false, false, 505, 0); | ||
| int ret = ramsortntuple(emptyRoot, "empty_sort_out.root"); | ||
| EXPECT_NE(ret, 0); | ||
| std::remove(emptySam); | ||
| std::remove(emptyRoot); | ||
| std::remove("empty_sort_out.root"); | ||
| } | ||
|
|
||
| /// ramsortntuple with invalid output path must return 1. | ||
| TEST(RAMSortEdgeCases, InvalidOutputPathReturnsError) | ||
| { | ||
| const char *samFile = "sort_edge.sam"; | ||
| const char *rootFile = "sort_edge.root"; | ||
| GenerateSAMFile(samFile, 10); | ||
| samtoramntuple(samFile, rootFile, false, false, false, 505, 0); | ||
| int ret = ramsortntuple(rootFile, "/nonexistent/path/out.root"); | ||
| EXPECT_NE(ret, 0); | ||
| std::remove(samFile); | ||
| std::remove(rootFile); | ||
| } | ||
|
|
||
| } // namespace | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| /// \file ramsort.cxx | ||
| /// \brief Command-line tool to sort a RAM (RNTuple) file by coordinate or name. | ||
| /// | ||
| /// Usage: | ||
| /// ./tools/ramsort <input.root> <output.root> [--by-name] | ||
|
|
||
| #include "ramcore/RAMSort.h" | ||
| #include <cstring> | ||
| #include <iostream> | ||
|
|
||
| int main(int argc, char **argv) | ||
| { | ||
| if (argc < 3) { | ||
| std::cerr << "Usage: " << argv[0] << " <input.root> <output.root> [--by-name]\n"; | ||
| std::cerr << " Sort a RAM (RNTuple) file by genomic coordinate (refid, pos).\n"; | ||
| std::cerr << " --by-name Sort by QNAME instead.\n"; | ||
| return 1; | ||
| } | ||
|
|
||
| bool byName = false; | ||
| for (int i = 3; i < argc; ++i) { | ||
| if (std::strcmp(argv[i], "--by-name") == 0) | ||
| byName = true; | ||
| } | ||
|
|
||
| return ramsortntuple(argv[1], argv[2], byName); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: header guard does not follow preferred style [llvm-header-guard]
inc/ramcore/RAMSort.h:10: