Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,15 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore
inc/ramcore/SamToTTree.h
inc/ramcore/SamToNTuple.h
inc/ramcore/RAMNTupleView.h
inc/ramcore/RAMSort.h
SOURCES
src/ttree/RAMRecord.cxx
src/rntuple/RAMNTupleRecord.cxx
src/ramcore/SamParser.cxx
src/ramcore/SamToTTree.cxx
src/ramcore/SamToNTuple.cxx
src/ramcore/RAMNTupleView.cxx
src/ramcore/RAMSort.cxx
LINKDEF
inc/ttree/LinkDef.h
DEPENDENCIES
Expand Down
11 changes: 11 additions & 0 deletions inc/ramcore/RAMSort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef RAMCORE_RAMSORT_H
#define RAMCORE_RAMSORT_H
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: header guard does not follow preferred style [llvm-header-guard]

Suggested change
#define RAMCORE_RAMSORT_H
#ifndef GITHUB_WORKSPACE_INC_RAMCORE_RAMSORT_H
#define GITHUB_WORKSPACE_INC_RAMCORE_RAMSORT_H

inc/ramcore/RAMSort.h:10:

- #endif // RAMCORE_RAMSORT_H
+ #endif // GITHUB_WORKSPACE_INC_RAMCORE_RAMSORT_H


/// Sort a RAM (RNTuple) file by coordinate (refid, pos) or by QNAME.
///
/// The implementation reads the RNTuple named "RAM" from the input file, orders the entries
/// with a stable sort (entries with equal keys keep their input order) and writes them to a
/// newly created output file.
/// \param inputFile Path to input .root RAM file
/// \param outputFile Path to output .root RAM file
/// \param byName If true, sort by QNAME; otherwise sort by (refid, pos)
/// \return 0 on success, 1 on error (for example: the input cannot be opened, the input
///         contains no entries, or the output file cannot be created)
int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: unknown type name 'bool' [clang-diagnostic-error]

int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
                                                                 ^

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: use of undeclared identifier 'false' [clang-diagnostic-error]

int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false);
                                                                               ^


#endif // RAMCORE_RAMSORT_H
98 changes: 98 additions & 0 deletions src/ramcore/RAMSort.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#include "ramcore/RAMSort.h"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'ramcore/RAMSort.h' file not found [clang-diagnostic-error]

#include "ramcore/RAMSort.h"
         ^

#include "rntuple/RAMNTupleRecord.h"

#include <ROOT/RNTupleModel.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleView.hxx>
#include <ROOT/RNTupleWriteOptions.hxx>
#include <ROOT/RNTupleWriter.hxx>
#include <TFile.h>

#include <algorithm>
#include <cstdint>
#include <exception>
#include <iostream>
#include <memory>
#include <numeric>
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header memory is not used directly [misc-include-cleaner]

Suggested change
#include <numeric>
#include <numeric>

#include <string>
#include <utility>
#include <vector>
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header utility is not used directly [misc-include-cleaner]

Suggested change
#include <vector>
#include <vector>


Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header vector is not used directly [misc-include-cleaner]

Suggested change

int ramsortntuple(const char *inputFile, const char *outputFile, bool byName)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: function 'ramsortntuple' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]

Suggested change
int ramsortntuple(const char *inputFile, const char *outputFile, bool byName)
static int ramsortntuple(const char *inputFile, const char *outputFile, bool byName)

{
RAMNTupleRecord::ReadAllRefs(inputFile);

std::unique_ptr<ROOT::RNTupleReader> reader;
try {
reader = ROOT::RNTupleReader::Open("RAM", inputFile);
} catch (const std::exception &e) {
std::cerr << "Error opening input: " << e.what() << "\n";
return 1;
}

const uint64_t nEntries = reader->GetNEntries();
if (nEntries == 0) {
std::cerr << "Input file has no entries.\n";
return 1;
}

auto viewRefId = reader->GetView<int32_t>("record.refid");
auto viewPos = reader->GetView<int32_t>("record.pos");
auto viewQname = reader->GetView<std::string>("record.qname");

std::vector<uint64_t> order(nEntries);
std::iota(order.begin(), order.end(), 0ULL);

std::cout << "Sorting " << nEntries << " records";
if (byName)
std::cout << " by QNAME...\n";
else
std::cout << " by coordinate (refid, pos)...\n";

if (byName) {
std::vector<std::string> qnames(nEntries);
for (uint64_t i = 0; i < nEntries; ++i)
qnames[i] = viewQname(i);
std::stable_sort(order.begin(), order.end(),
[&](uint64_t a, uint64_t b) { return qnames[a] < qnames[b]; });
} else {
std::vector<int32_t> refids(nEntries);
std::vector<int32_t> positions(nEntries);
for (uint64_t i = 0; i < nEntries; ++i) {
refids[i] = viewRefId(i);
positions[i] = viewPos(i);
}
std::stable_sort(order.begin(), order.end(), [&](uint64_t a, uint64_t b) {
if (refids[a] != refids[b])
return refids[a] < refids[b];
return positions[a] < positions[b];
});
}

auto viewRecord = reader->GetView<RAMNTupleRecord>("record");

auto rootFile = std::unique_ptr<TFile>(TFile::Open(outputFile, /*option=*/"RECREATE"));
if (!rootFile || !rootFile->IsOpen()) {
std::cerr << "Error: could not create output file " << outputFile << "\n";
return 1;
}

RAMNTupleRecord::InitializeRefs();
auto model = RAMNTupleRecord::MakeModel();
ROOT::RNTupleWriteOptions writeOptions;
writeOptions.SetCompression(/*val=*/505);
auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions);
auto entry = writer->GetModel().CreateEntry();
auto recordPtr = entry->GetPtr<RAMNTupleRecord>("record");

for (uint64_t idx : order) {
*recordPtr = viewRecord(idx);
writer->Fill(*entry);
}

RAMNTupleRecord::WriteAllRefs(*rootFile);
RAMNTupleRecord::WriteIndex(*rootFile);

std::cout << "Sorted output written to " << outputFile << "\n";
return 0;
}
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ install(TARGETS ramcoretests chromosome_split_test
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)

add_ramcore_test(ramsorttests ramsorttests.cxx)
142 changes: 142 additions & 0 deletions test/ramsorttests.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include <gtest/gtest.h>
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'gtest/gtest.h' file not found [clang-diagnostic-error]

#include <gtest/gtest.h>
         ^

#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleView.hxx>
#include <cstdio>
#include <fstream>

#include "../benchmark/generate_sam_benchmark.h"
#include "ramcore/RAMSort.h"
#include "ramcore/SamToNTuple.h"

namespace {

class RAMSortTest : public ::testing::Test {
protected:
static constexpr int kNumReads = 200;
const char *kSamFile = "sort_test.sam";
const char *kUnsortedFile = "sort_test_unsorted.root";
const char *kSortedFile = "sort_test_sorted.root";
const char *kNameSortFile = "sort_test_namesort.root";

void SetUp() override
{
GenerateSAMFile(kSamFile, kNumReads);
std::remove(kUnsortedFile);
std::remove(kSortedFile);
std::remove(kNameSortFile);
samtoramntuple(kSamFile, kUnsortedFile, true, true, true, 505, 0);
}

void TearDown() override
{
std::remove(kSamFile);
std::remove(kUnsortedFile);
std::remove(kSortedFile);
std::remove(kNameSortFile);
}
};

/// Sorting must neither drop nor duplicate entries.
TEST_F(RAMSortTest, EntryCountPreserved)
{
   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);

   auto unsortedReader = ROOT::RNTupleReader::Open("RAM", kUnsortedFile);
   auto sortedReader = ROOT::RNTupleReader::Open("RAM", kSortedFile);
   ASSERT_NE(unsortedReader, nullptr);
   ASSERT_NE(sortedReader, nullptr);

   const auto nUnsorted = unsortedReader->GetNEntries();
   const auto nSorted = sortedReader->GetNEntries();
   EXPECT_EQ(nUnsorted, nSorted);
}

/// After a coordinate sort, refid must be non-decreasing and, within one refid, pos must be
/// non-decreasing.
TEST_F(RAMSortTest, CoordinateSortOrder)
{
   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);

   auto sortedReader = ROOT::RNTupleReader::Open("RAM", kSortedFile);
   ASSERT_NE(sortedReader, nullptr);

   auto refIdView = sortedReader->GetView<int32_t>("record.refid");
   auto posView = sortedReader->GetView<int32_t>("record.pos");

   // -1 acts as the "nothing seen yet" value for both keys.
   int32_t lastRefId = -1;
   int32_t lastPos = -1;
   for (uint64_t entry = 0; entry < sortedReader->GetNEntries(); ++entry) {
      const int32_t curRefId = refIdView(entry);
      const int32_t curPos = posView(entry);
      if (curRefId != lastRefId) {
         // A new reference starts: only the refid ordering is checked, pos may restart.
         EXPECT_GE(curRefId, lastRefId) << "refid out of order at entry " << entry;
      } else {
         EXPECT_GE(curPos, lastPos) << "pos out of order at entry " << entry;
      }
      lastRefId = curRefId;
      lastPos = curPos;
   }
}

/// After a name sort, QNAMEs must be in non-decreasing lexicographic order.
TEST_F(RAMSortTest, NameSortOrder)
{
   ASSERT_EQ(ramsortntuple(kUnsortedFile, kNameSortFile, true), 0);

   auto sortedReader = ROOT::RNTupleReader::Open("RAM", kNameSortFile);
   ASSERT_NE(sortedReader, nullptr);

   auto qnameView = sortedReader->GetView<std::string>("record.qname");

   // The empty string compares less than or equal to every other string.
   std::string lastQname;
   for (uint64_t entry = 0; entry < sortedReader->GetNEntries(); ++entry) {
      const std::string curQname = qnameView(entry);
      EXPECT_GE(curQname, lastQname) << "qname out of order at entry " << entry;
      lastQname = curQname;
   }
}

/// Sorting an already coordinate-sorted file must reproduce the same (refid, pos) sequence.
TEST_F(RAMSortTest, IdempotentSort)
{
   const char *doubleSorted = "sort_test_double.root";

   // Deletes the second-pass output on every exit path, including the early returns taken by a
   // failing ASSERT_*. It is declared before the readers so that it runs after they are closed.
   struct FileRemover {
      const char *fPath;
      ~FileRemover() { std::remove(fPath); }
   };
   const FileRemover cleanup{doubleSorted};

   ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0);
   ASSERT_EQ(ramsortntuple(kSortedFile, doubleSorted), 0);

   auto r1 = ROOT::RNTupleReader::Open("RAM", kSortedFile);
   auto r2 = ROOT::RNTupleReader::Open("RAM", doubleSorted);
   ASSERT_NE(r1, nullptr);
   ASSERT_NE(r2, nullptr);

   // Fatal on mismatch: the loop below indexes both files with r1's entry count.
   ASSERT_EQ(r1->GetNEntries(), r2->GetNEntries());

   auto v1refid = r1->GetView<int32_t>("record.refid");
   auto v2refid = r2->GetView<int32_t>("record.refid");
   auto v1pos = r1->GetView<int32_t>("record.pos");
   auto v2pos = r2->GetView<int32_t>("record.pos");
   for (uint64_t i = 0; i < r1->GetNEntries(); ++i) {
      EXPECT_EQ(v1refid(i), v2refid(i));
      EXPECT_EQ(v1pos(i), v2pos(i));
   }
}

/// ramsortntuple on a non-existent input file must report failure (non-zero return).
TEST(RAMSortEdgeCases, MissingInputFileReturnsError)
{
   // Use a file in the working directory, like the other tests, instead of a hard-coded /tmp path.
   const char *outFile = "missing_input_out.root";
   const int ret = ramsortntuple("nonexistent.root", outFile);
   EXPECT_NE(ret, 0);
   // No output is expected, but clean up in case the implementation created one anyway.
   std::remove(outFile);
}

/// ramsortntuple on a RAM file converted from a header-only SAM file must report failure
/// (non-zero return).
TEST(RAMSortEdgeCases, EmptyFileReturnsError)
{
   const char *emptySam = "empty_sort.sam";
   const char *emptyRoot = "empty_sort.root";

   // Write a SAM file that contains only the @HD header line and no alignment records; the
   // inner scope closes the stream before the file is converted.
   {
      std::ofstream samStream(emptySam);
      samStream << "@HD\tVN:1.6\n";
   }
   samtoramntuple(emptySam, emptyRoot, false, false, false, 505, 0);

   const int status = ramsortntuple(emptyRoot, "empty_sort_out.root");
   EXPECT_NE(status, 0);

   std::remove(emptySam);
   std::remove(emptyRoot);
   std::remove("empty_sort_out.root");
}

/// ramsortntuple must report failure (non-zero return) when the output file cannot be created.
TEST(RAMSortEdgeCases, InvalidOutputPathReturnsError)
{
   const char *samFile = "sort_edge.sam";
   const char *rootFile = "sort_edge.root";

   // Build a small, valid input so that the failure can only come from the output path.
   GenerateSAMFile(samFile, 10);
   samtoramntuple(samFile, rootFile, false, false, false, 505, 0);

   const int status = ramsortntuple(rootFile, "/nonexistent/path/out.root");
   EXPECT_NE(status, 0);

   std::remove(samFile);
   std::remove(rootFile);
}

} // namespace
27 changes: 27 additions & 0 deletions tools/ramsort.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/// \file ramsort.cxx
/// \brief Command-line tool to sort a RAM (RNTuple) file by coordinate or name.
///
/// Usage:
/// ./tools/ramsort <input.root> <output.root> [--by-name]

#include "ramcore/RAMSort.h"
#include <cstring>
#include <iostream>

/// Entry point of the ramsort tool.
///
/// Expects `<input.root> <output.root>` followed by an optional `--by-name` flag and forwards
/// them to ramsortntuple().
/// \return the status of ramsortntuple(), or 1 if the command line is invalid
int main(int argc, char **argv)
{
   // argv[0] is not guaranteed to be available; fall back to the tool name for the usage text.
   const char *progName = (argc > 0 && argv[0] != nullptr) ? argv[0] : "ramsort";
   const auto printUsage = [progName]() {
      std::cerr << "Usage: " << progName << " <input.root> <output.root> [--by-name]\n";
      std::cerr << " Sort a RAM (RNTuple) file by genomic coordinate (refid, pos).\n";
      std::cerr << " --by-name Sort by QNAME instead.\n";
   };

   if (argc < 3) {
      printUsage();
      return 1;
   }

   bool byName = false;
   for (int i = 3; i < argc; ++i) {
      if (std::strcmp(argv[i], "--by-name") == 0) {
         byName = true;
      } else {
         // Reject unknown arguments: silently ignoring a mistyped flag would run a
         // coordinate sort when a name sort was intended.
         std::cerr << "Error: unrecognized argument '" << argv[i] << "'\n";
         printUsage();
         return 1;
      }
   }

   return ramsortntuple(argv[1], argv[2], byName);
}