diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bc3b44..bcb5735 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore
     inc/ramcore/SamToTTree.h
     inc/ramcore/SamToNTuple.h
     inc/ramcore/RAMNTupleView.h
+    inc/ramcore/RAMSort.h
   SOURCES
     src/ttree/RAMRecord.cxx
     src/rntuple/RAMNTupleRecord.cxx
@@ -50,6 +51,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore
     src/ramcore/SamToTTree.cxx
     src/ramcore/SamToNTuple.cxx
     src/ramcore/RAMNTupleView.cxx
+    src/ramcore/RAMSort.cxx
   LINKDEF
     inc/ttree/LinkDef.h
   DEPENDENCIES
diff --git a/inc/ramcore/RAMSort.h b/inc/ramcore/RAMSort.h
new file mode 100644
index 0000000..3baea47
--- /dev/null
+++ b/inc/ramcore/RAMSort.h
@@ -0,0 +1,15 @@
+#ifndef RAMCORE_RAMSORT_H
+#define RAMCORE_RAMSORT_H
+
+/// Sort a RAM (RNTuple) file by coordinate (refid, pos) or by QNAME.
+///
+/// The sort is stable, so records with equal keys keep their input order.
+/// An input with zero entries, a null path, or an output path identical to
+/// the input path is reported as an error.
+/// \param inputFile Path to input .root RAM file
+/// \param outputFile Path to output .root RAM file (must differ from inputFile)
+/// \param byName If true, sort by QNAME; otherwise sort by (refid, pos)
+/// \return 0 on success, 1 on error
+int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
+
+#endif // RAMCORE_RAMSORT_H
diff --git a/src/ramcore/RAMSort.cxx b/src/ramcore/RAMSort.cxx
new file mode 100644
index 0000000..e642726
--- /dev/null
+++ b/src/ramcore/RAMSort.cxx
@@ -0,0 +1,131 @@
+#include "ramcore/RAMSort.h"
+#include "rntuple/RAMNTupleRecord.h"
+
+#include <ROOT/RNTupleModel.hxx>
+#include <ROOT/RNTupleReader.hxx>
+#include <ROOT/RNTupleView.hxx>
+#include <ROOT/RNTupleWriteOptions.hxx>
+#include <ROOT/RNTupleWriter.hxx>
+#include <TFile.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+
+/// Compression setting handed to RNTupleWriteOptions::SetCompression for the output.
+/// NOTE(review): presumably ZSTD level 5 in ROOT's algorithm*100+level encoding - confirm.
+constexpr int kOutputCompression = 505;
+
+/// Build the permutation of entry indices that puts the records in sorted order.
+/// Only the key columns are loaded into memory; std::stable_sort keeps records
+/// with equal keys in their input order.
+/// \param reader Open reader on the input "RAM" ntuple
+/// \param byName If true, order by record.qname; otherwise by (record.refid, record.pos)
+/// \return Entry indices of the input, listed in output order
+std::vector<uint64_t> ComputeSortOrder(ROOT::RNTupleReader &reader, bool byName)
+{
+   const uint64_t nEntries = reader.GetNEntries();
+   std::vector<uint64_t> order(nEntries);
+   std::iota(order.begin(), order.end(), 0ULL);
+
+   if (byName) {
+      auto viewQname = reader.GetView<std::string>("record.qname");
+      std::vector<std::string> qnames(nEntries);
+      for (uint64_t i = 0; i < nEntries; ++i)
+         qnames[i] = viewQname(i);
+      std::stable_sort(order.begin(), order.end(),
+                       [&](uint64_t a, uint64_t b) { return qnames[a] < qnames[b]; });
+      return order;
+   }
+
+   auto viewRefId = reader.GetView<int32_t>("record.refid");
+   auto viewPos = reader.GetView<int32_t>("record.pos");
+   std::vector<int32_t> refids(nEntries);
+   std::vector<int32_t> positions(nEntries);
+   for (uint64_t i = 0; i < nEntries; ++i) {
+      refids[i] = viewRefId(i);
+      positions[i] = viewPos(i);
+   }
+   std::stable_sort(order.begin(), order.end(), [&](uint64_t a, uint64_t b) {
+      if (refids[a] != refids[b])
+         return refids[a] < refids[b];
+      return positions[a] < positions[b];
+   });
+   return order;
+}
+
+} // namespace
+
+int ramsortntuple(const char *inputFile, const char *outputFile, bool byName)
+{
+   if (!inputFile || !outputFile) {
+      std::cerr << "Error: input and output paths must not be null.\n";
+      return 1;
+   }
+   // RECREATE on the path we are still reading from would truncate the input.
+   if (std::strcmp(inputFile, outputFile) == 0) {
+      std::cerr << "Error: output file must differ from input file " << inputFile << "\n";
+      return 1;
+   }
+
+   std::unique_ptr<ROOT::RNTupleReader> reader;
+   try {
+      // Inside the try block so that a failure here is also reported as return code 1.
+      RAMNTupleRecord::ReadAllRefs(inputFile);
+      reader = ROOT::RNTupleReader::Open("RAM", inputFile);
+   } catch (const std::exception &e) {
+      std::cerr << "Error opening input: " << e.what() << "\n";
+      return 1;
+   }
+
+   const uint64_t nEntries = reader->GetNEntries();
+   if (nEntries == 0) {
+      std::cerr << "Input file has no entries.\n";
+      return 1;
+   }
+
+   std::cout << "Sorting " << nEntries << " records"
+             << (byName ? " by QNAME...\n" : " by coordinate (refid, pos)...\n");
+
+   try {
+      const std::vector<uint64_t> order = ComputeSortOrder(*reader, byName);
+      auto viewRecord = reader->GetView<RAMNTupleRecord>("record");
+
+      auto rootFile = std::unique_ptr<TFile>(TFile::Open(outputFile, /*option=*/"RECREATE"));
+      if (!rootFile || !rootFile->IsOpen()) {
+         std::cerr << "Error: could not create output file " << outputFile << "\n";
+         return 1;
+      }
+
+      RAMNTupleRecord::InitializeRefs();
+      auto model = RAMNTupleRecord::MakeModel();
+      ROOT::RNTupleWriteOptions writeOptions;
+      writeOptions.SetCompression(kOutputCompression);
+      auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions);
+      auto entry = writer->GetModel().CreateEntry();
+      auto recordPtr = entry->GetPtr<RAMNTupleRecord>("record");
+
+      for (uint64_t idx : order) {
+         *recordPtr = viewRecord(idx);
+         writer->Fill(*entry);
+      }
+
+      RAMNTupleRecord::WriteAllRefs(*rootFile);
+      RAMNTupleRecord::WriteIndex(*rootFile);
+   } catch (const std::exception &e) {
+      std::cerr << "Error while sorting: " << e.what() << "\n";
+      return 1;
+   }
+
+   std::cout << "Sorted output written to " << outputFile << "\n";
+   return 0;
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 24b54f0..8f39e72 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -36,3 +36,4 @@ install(TARGETS ramcoretests chromosome_split_test
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
 
+add_ramcore_test(ramsorttests ramsorttests.cxx)
diff --git a/test/ramsorttests.cxx b/test/ramsorttests.cxx
new file mode 100644
index 0000000..a6519ee
--- /dev/null
+++ b/test/ramsorttests.cxx
@@ -0,0 +1,167 @@
+#include <gtest/gtest.h>
+
+#include <ROOT/RNTupleReader.hxx>
+
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <string>
+
+#include "../benchmark/generate_sam_benchmark.h"
+#include "ramcore/RAMSort.h"
+#include "ramcore/SamToNTuple.h"
+
+namespace {
+
+/// Fixture that converts a generated SAM file into an unsorted RAM file and
+/// removes every file it or the tests created afterwards.
+class RAMSortTest : public ::testing::Test {
+protected:
+   static constexpr int kNumReads = 200;
+   const char *kSamFile = "sort_test.sam";
+   const char *kUnsortedFile = "sort_test_unsorted.root";
+   const char *kSortedFile = "sort_test_sorted.root";
+   const char *kNameSortFile = "sort_test_namesort.root";
+   const char *kDoubleSortFile = "sort_test_double.root";
+
+   /// Delete all .root files the tests may have written.
+   void RemoveRootFiles()
+   {
+      std::remove(kUnsortedFile);
+      std::remove(kSortedFile);
+      std::remove(kNameSortFile);
+      std::remove(kDoubleSortFile);
+   }
+
+   void SetUp() override
+   {
+      GenerateSAMFile(kSamFile, kNumReads);
+      RemoveRootFiles();
+      samtoramntuple(kSamFile, kUnsortedFile, true, true, true, 505, 0);
+   }
+
+   void TearDown() override
+   {
+      std::remove(kSamFile);
+      RemoveRootFiles();
+   }
+};
+
+TEST_F(RAMSortTest, EntryCountPreserved)
+{
+   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);
+   auto readerIn = ROOT::RNTupleReader::Open("RAM", kUnsortedFile);
+   auto readerOut = ROOT::RNTupleReader::Open("RAM", kSortedFile);
+   ASSERT_NE(readerIn, nullptr);
+   ASSERT_NE(readerOut, nullptr);
+   EXPECT_EQ(readerIn->GetNEntries(), readerOut->GetNEntries());
+}
+
+TEST_F(RAMSortTest, CoordinateSortOrder)
+{
+   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);
+   auto reader = ROOT::RNTupleReader::Open("RAM", kSortedFile);
+   ASSERT_NE(reader, nullptr);
+   auto viewRefId = reader->GetView<int32_t>("record.refid");
+   auto viewPos = reader->GetView<int32_t>("record.pos");
+   int32_t prevRefId = -1, prevPos = -1;
+   for (uint64_t i = 0; i < reader->GetNEntries(); ++i) {
+      int32_t refid = viewRefId(i);
+      int32_t pos = viewPos(i);
+      if (refid == prevRefId) {
+         EXPECT_GE(pos, prevPos) << "pos out of order at entry " << i;
+      } else {
+         EXPECT_GE(refid, prevRefId) << "refid out of order at entry " << i;
+      }
+      prevRefId = refid;
+      prevPos = pos;
+   }
+}
+
+TEST_F(RAMSortTest, NameSortOrder)
+{
+   ASSERT_EQ(ramsortntuple(kUnsortedFile, kNameSortFile, true), 0);
+   auto reader = ROOT::RNTupleReader::Open("RAM", kNameSortFile);
+   ASSERT_NE(reader, nullptr);
+   auto viewQname = reader->GetView<std::string>("record.qname");
+   std::string prev = "";
+   for (uint64_t i = 0; i < reader->GetNEntries(); ++i) {
+      std::string qname = viewQname(i);
+      EXPECT_GE(qname, prev) << "qname out of order at entry " << i;
+      prev = qname;
+   }
+}
+
+TEST_F(RAMSortTest, IdempotentSort)
+{
+   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);
+   ASSERT_EQ(ramsortntuple(kSortedFile, kDoubleSortFile), 0);
+   auto r1 = ROOT::RNTupleReader::Open("RAM", kSortedFile);
+   auto r2 = ROOT::RNTupleReader::Open("RAM", kDoubleSortFile);
+   ASSERT_NE(r1, nullptr);
+   ASSERT_NE(r2, nullptr);
+   // ASSERT, not EXPECT: the loop below indexes r2 with r1's entry numbers.
+   ASSERT_EQ(r1->GetNEntries(), r2->GetNEntries());
+   auto v1refid = r1->GetView<int32_t>("record.refid");
+   auto v2refid = r2->GetView<int32_t>("record.refid");
+   auto v1pos = r1->GetView<int32_t>("record.pos");
+   auto v2pos = r2->GetView<int32_t>("record.pos");
+   for (uint64_t i = 0; i < r1->GetNEntries(); ++i) {
+      EXPECT_EQ(v1refid(i), v2refid(i));
+      EXPECT_EQ(v1pos(i), v2pos(i));
+   }
+}
+
+/// Sorting a file onto itself must be rejected and must leave the input readable.
+TEST_F(RAMSortTest, SameInputAndOutputReturnsError)
+{
+   EXPECT_NE(ramsortntuple(kUnsortedFile, kUnsortedFile), 0);
+   auto reader = ROOT::RNTupleReader::Open("RAM", kUnsortedFile);
+   ASSERT_NE(reader, nullptr);
+   EXPECT_GT(reader->GetNEntries(), 0u);
+}
+
+TEST(RAMSortEdgeCases, MissingInputFileReturnsError)
+{
+   const char *outFile = "missing_input_out.root";
+   EXPECT_NE(ramsortntuple("nonexistent.root", outFile), 0);
+   std::remove(outFile);
+}
+
+TEST(RAMSortEdgeCases, NullPathReturnsError)
+{
+   EXPECT_NE(ramsortntuple(nullptr, "null_path_out.root"), 0);
+   EXPECT_NE(ramsortntuple("null_path_in.root", nullptr), 0);
+}
+
+/// ramsortntuple on an empty file must report an error.
+TEST(RAMSortEdgeCases, EmptyFileReturnsError)
+{
+   // Create an empty RAM file with zero entries
+   const char *emptySam = "empty_sort.sam";
+   const char *emptyRoot = "empty_sort.root";
+   const char *emptyOut = "empty_sort_out.root";
+   {
+      std::ofstream f(emptySam);
+      f << "@HD\tVN:1.6\n";
+   }
+   samtoramntuple(emptySam, emptyRoot, false, false, false, 505, 0);
+   EXPECT_NE(ramsortntuple(emptyRoot, emptyOut), 0);
+   std::remove(emptySam);
+   std::remove(emptyRoot);
+   std::remove(emptyOut);
+}
+
+/// ramsortntuple with an invalid output path must report an error.
+TEST(RAMSortEdgeCases, InvalidOutputPathReturnsError)
+{
+   const char *samFile = "sort_edge.sam";
+   const char *rootFile = "sort_edge.root";
+   GenerateSAMFile(samFile, 10);
+   samtoramntuple(samFile, rootFile, false, false, false, 505, 0);
+   EXPECT_NE(ramsortntuple(rootFile, "/nonexistent/path/out.root"), 0);
+   std::remove(samFile);
+   std::remove(rootFile);
+}
+
+} // namespace
diff --git a/tools/ramsort.cxx b/tools/ramsort.cxx
new file mode 100644
index 0000000..76731b5
--- /dev/null
+++ b/tools/ramsort.cxx
@@ -0,0 +1,54 @@
+/// \file ramsort.cxx
+/// \brief Command-line tool to sort a RAM (RNTuple) file by coordinate or name.
+///
+/// Usage:
+///   ./tools/ramsort <input.root> <output.root> [--by-name]
+
+#include "ramcore/RAMSort.h"
+#include <cstring>
+#include <iostream>
+
+namespace {
+
+/// Print the command-line synopsis to stderr.
+void PrintUsage(const char *prog)
+{
+   std::cerr << "Usage: " << prog << " <input.root> <output.root> [--by-name]\n";
+   std::cerr << "  Sort a RAM (RNTuple) file by genomic coordinate (refid, pos).\n";
+   std::cerr << "  --by-name   Sort by QNAME instead.\n";
+}
+
+} // namespace
+
+int main(int argc, char **argv)
+{
+   const char *inputFile = nullptr;
+   const char *outputFile = nullptr;
+   bool byName = false;
+
+   // The flag may appear anywhere; every other argument is a positional path.
+   for (int i = 1; i < argc; ++i) {
+      if (std::strcmp(argv[i], "--by-name") == 0) {
+         byName = true;
+      } else if (std::strncmp(argv[i], "--", 2) == 0) {
+         std::cerr << "Unknown option: " << argv[i] << "\n";
+         PrintUsage(argv[0]);
+         return 1;
+      } else if (!inputFile) {
+         inputFile = argv[i];
+      } else if (!outputFile) {
+         outputFile = argv[i];
+      } else {
+         std::cerr << "Unexpected argument: " << argv[i] << "\n";
+         PrintUsage(argv[0]);
+         return 1;
+      }
+   }
+
+   if (!inputFile || !outputFile) {
+      PrintUsage(argv[0]);
+      return 1;
+   }
+
+   return ramsortntuple(inputFile, outputFile, byName);
+}