diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bc3b44..29c0faa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,8 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore inc/ramcore/SamToTTree.h inc/ramcore/SamToNTuple.h inc/ramcore/RAMNTupleView.h + inc/ramcore/RAMStats.h + inc/ramcore/RAMSort.h SOURCES src/ttree/RAMRecord.cxx src/rntuple/RAMNTupleRecord.cxx @@ -50,6 +52,8 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ramcore src/ramcore/SamToTTree.cxx src/ramcore/SamToNTuple.cxx src/ramcore/RAMNTupleView.cxx + src/ramcore/RAMStats.cxx + src/ramcore/RAMSort.cxx LINKDEF inc/ttree/LinkDef.h DEPENDENCIES diff --git a/inc/ramcore/RAMSort.h b/inc/ramcore/RAMSort.h new file mode 100644 index 0000000..3baea47 --- /dev/null +++ b/inc/ramcore/RAMSort.h @@ -0,0 +1,11 @@ +#ifndef RAMCORE_RAMSORT_H +#define RAMCORE_RAMSORT_H + +/// Sort a RAM (RNTuple) file by coordinate (refid, pos) or by QNAME. +/// \param inputFile Path to input .root RAM file +/// \param outputFile Path to output .root RAM file +/// \param byName If true, sort by QNAME; otherwise sort by (refid, pos) +/// \return 0 on success, 1 on error +int ramsortntuple(const char *inputFile, const char *outputFile, bool byName = false); + +#endif // RAMCORE_RAMSORT_H diff --git a/inc/ramcore/RAMStats.h b/inc/ramcore/RAMStats.h new file mode 100644 index 0000000..8fe16e2 --- /dev/null +++ b/inc/ramcore/RAMStats.h @@ -0,0 +1,48 @@ +#ifndef RAMCORE_RAMSTATS_H +#define RAMCORE_RAMSTATS_H + +#include +#include +#include + +namespace ramcore { + +struct RAMStats { + uint64_t total_reads = 0; + uint64_t mapped_reads = 0; + uint64_t unmapped_reads = 0; + uint64_t duplicate_reads = 0; + uint64_t paired_reads = 0; + uint64_t properly_paired_reads = 0; + uint64_t forward_strand = 0; + uint64_t reverse_strand = 0; + + double mean_mapping_quality = 0.0; + double mean_read_length = 0.0; + uint64_t total_bases = 0; + + std::map reads_per_chromosome; + + void Print() const; +}; + +/// Result wrapper so callers can distinguish errors from empty files +struct RAMStatsResult { + RAMStats stats; + bool ok = false; + std::string error_message; +}; + +/// Compute statistics from an RNTuple-based RAM file. +/// Returns RAMStatsResult — check .ok before using .stats. +RAMStatsResult ComputeStats(const char *filename); + +/// Run the full ramstats pipeline: compute stats and print results. +/// \param filename Path to input .root RAM file +/// \param verbose If true, print per-chromosome breakdown +/// \return 0 on success, 1 on error +int RunRamStats(const char *filename, bool verbose = false); + +} // namespace ramcore + +#endif // RAMCORE_RAMSTATS_H diff --git a/src/ramcore/RAMSort.cxx b/src/ramcore/RAMSort.cxx new file mode 100644 index 0000000..c8997c3 --- /dev/null +++ b/src/ramcore/RAMSort.cxx @@ -0,0 +1,92 @@ +#include "ramcore/RAMSort.h" +#include "rntuple/RAMNTupleRecord.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int ramsortntuple(const char *inputFile, const char *outputFile, bool byName) +{ + RAMNTupleRecord::ReadAllRefs(inputFile); + + std::unique_ptr reader; + try { + reader = ROOT::RNTupleReader::Open("RAM", inputFile); + } catch (const std::exception &e) { + std::cerr << "Error opening input: " << e.what() << "\n"; + return 1; + } + + const uint64_t nEntries = reader->GetNEntries(); + if (nEntries == 0) { + std::cerr << "Input file has no entries.\n"; + return 1; + } + + auto viewRefId = reader->GetView("record.refid"); + auto viewPos = reader->GetView("record.pos"); + auto viewQname = reader->GetView("record.qname"); + + std::vector order(nEntries); + std::iota(order.begin(), order.end(), 0); + + std::cout << "Sorting " << nEntries << " records"; + if (byName) + std::cout << " by QNAME...\n"; + else + std::cout << " by coordinate (refid, pos)...\n"; + + if (byName) { + std::vector qnames(nEntries); + for (uint64_t i = 0; i < nEntries; ++i) + qnames[i] = viewQname(i); + std::stable_sort(order.begin(), order.end(), [&](uint64_t a, uint64_t b) { return qnames[a] < qnames[b]; }); + } else { + std::vector refids(nEntries), positions(nEntries); + for (uint64_t i = 0; i < nEntries; ++i) { + refids[i] = viewRefId(i); + positions[i] = viewPos(i); + } + std::stable_sort(order.begin(), order.end(), [&](uint64_t a, uint64_t b) { + if (refids[a] != refids[b]) + return refids[a] < refids[b]; + return positions[a] < positions[b]; + }); + } + + auto viewRecord = reader->GetView("record"); + + auto rootFile = std::unique_ptr(TFile::Open(outputFile, "RECREATE")); + if (!rootFile || !rootFile->IsOpen()) { + std::cerr << "Error: could not create output file " << outputFile << "\n"; + return 1; + } + + RAMNTupleRecord::InitializeRefs(); + auto model = RAMNTupleRecord::MakeModel(); + ROOT::RNTupleWriteOptions writeOptions; + writeOptions.SetCompression(505); + auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions); + auto entry = writer->GetModel().CreateEntry(); + auto recordPtr = entry->GetPtr("record"); + + for (uint64_t idx : order) { + *recordPtr = viewRecord(idx); + writer->Fill(*entry); + } + + RAMNTupleRecord::WriteAllRefs(*rootFile); + RAMNTupleRecord::WriteIndex(*rootFile); + + std::cout << "Sorted output written to " << outputFile << "\n"; + return 0; +} diff --git a/src/ramcore/RAMStats.cxx b/src/ramcore/RAMStats.cxx new file mode 100644 index 0000000..1c84c93 --- /dev/null +++ b/src/ramcore/RAMStats.cxx @@ -0,0 +1,158 @@ +#include "ramcore/RAMStats.h" +#include "rntuple/RAMNTupleRecord.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ramcore { + +static constexpr uint16_t FLAG_PAIRED = 0x1; +static constexpr uint16_t FLAG_PROPER_PAIR = 0x2; +static constexpr uint16_t FLAG_UNMAPPED = 0x4; +static constexpr uint16_t FLAG_REVERSE_STRAND = 0x10; +static constexpr uint16_t FLAG_DUPLICATE = 0x400; + +void RAMStats::Print() const +{ + auto pct = [](uint64_t a, uint64_t b) -> double { + return b == 0 ? 0.0 : 100.0 * static_cast(a) / static_cast(b); + }; + + std::cout << "\n=== RAMTools Statistics ===\n"; + std::cout << std::left << std::setw(30) << "Total reads:" << total_reads << "\n" + << std::setw(30) << "Mapped reads:" << mapped_reads << std::fixed << std::setprecision(2) << " (" + << pct(mapped_reads, total_reads) << "%)\n" + << std::setw(30) << "Unmapped reads:" << unmapped_reads << " (" << pct(unmapped_reads, total_reads) + << "%)\n" + << std::setw(30) << "Duplicate reads:" << duplicate_reads << " (" << pct(duplicate_reads, total_reads) + << "%)\n" + << std::setw(30) << "Paired reads:" << paired_reads << " (" << pct(paired_reads, total_reads) << "%)\n" + << std::setw(30) << "Properly paired reads:" << properly_paired_reads << " (" + << pct(properly_paired_reads, total_reads) << "%)\n" + << std::setw(30) << "Forward strand:" << forward_strand << " (" << pct(forward_strand, mapped_reads) + << "% of mapped)\n" + << std::setw(30) << "Reverse strand:" << reverse_strand << " (" << pct(reverse_strand, mapped_reads) + << "% of mapped)\n" + << std::setw(30) << "Total bases:" << total_bases << "\n" + << std::setw(30) << "Mean read length:" << std::fixed << std::setprecision(2) << mean_read_length << "\n" + << std::setw(30) << "Mean mapping quality:" << std::fixed << std::setprecision(2) << mean_mapping_quality + << "\n"; + std::cout << "===========================\n\n"; +} + +/// Extract the original sequence length from an encoded seq field. +/// The field format is: [4-byte uint32_t length][packed 2-bit nucleotides] +/// memcpy is used instead of reinterpret_cast to avoid alignment-dependent UB. +static uint32_t DecodedSeqLength(const std::string &encoded_seq) +{ + if (encoded_seq.size() < 4) + return 0; + uint32_t length = 0; + std::memcpy(&length, encoded_seq.data(), sizeof(length)); + return length; +} + +RAMStatsResult ComputeStats(const char *filename) +{ + RAMStats stats; + + // Initialize and load the reference name map stored alongside the RNTuple + RAMNTupleRecord::InitializeRefs(); + RAMNTupleRecord::ReadAllRefs(filename); + RAMNTupleRefs *rnameRefs = RAMNTupleRecord::GetRnameRefs(); + + std::unique_ptr reader; + try { + reader = ROOT::RNTupleReader::Open("RAM", filename); + } catch (const std::exception &e) { + return RAMStatsResult{stats, false, e.what()}; + } + + if (!reader) + return RAMStatsResult{stats, false, "RNTupleReader::Open returned nullptr"}; + + auto viewFlag = reader->GetView("record.flag"); + auto viewMapQ = reader->GetView("record.mapq"); + auto viewSeq = reader->GetView("record.seq"); + auto viewRefId = reader->GetView("record.refid"); + + uint64_t mapq_sum = 0; + uint64_t len_sum = 0; + const uint64_t nEntries = reader->GetNEntries(); + + for (uint64_t i = 0; i < nEntries; ++i) { + stats.total_reads++; + + uint16_t flag = viewFlag(i); + uint8_t mapq = viewMapQ(i); + const std::string &seq = viewSeq(i); + int32_t refid = viewRefId(i); + + if (flag & FLAG_UNMAPPED) { + stats.unmapped_reads++; + // Unmapped reads have no strand — excluded from strand stats + } else { + stats.mapped_reads++; + mapq_sum += mapq; + // Strand is only meaningful for mapped reads + if (flag & FLAG_REVERSE_STRAND) + stats.reverse_strand++; + else + stats.forward_strand++; + } + + if (flag & FLAG_DUPLICATE) stats.duplicate_reads++; + if (flag & FLAG_PAIRED) stats.paired_reads++; + if (flag & FLAG_PROPER_PAIR) stats.properly_paired_reads++; + + // Extract sequence length from encoded field (4-byte length prefix + packed nibbles) + uint32_t rlen = DecodedSeqLength(seq); + len_sum += rlen; + stats.total_bases += rlen; + + // Resolve chromosome name from integer refid via the ref name table + if (refid >= 0 && rnameRefs && + refid < static_cast(rnameRefs->Size())) { + const std::string &chrom = rnameRefs->GetRefName(refid); + if (!chrom.empty() && chrom != "*") + stats.reads_per_chromosome[chrom]++; + } + } + + if (stats.total_reads > 0) + stats.mean_read_length = static_cast(len_sum) / static_cast(stats.total_reads); + if (stats.mapped_reads > 0) + stats.mean_mapping_quality = static_cast(mapq_sum) / static_cast(stats.mapped_reads); + + return RAMStatsResult{stats, true, ""}; +} + +int RunRamStats(const char *filename, bool verbose) +{ + auto result = ComputeStats(filename); + if (!result.ok) { + std::cerr << "Error: " << result.error_message << "\n"; + return 1; + } + result.stats.Print(); + if (verbose && !result.stats.reads_per_chromosome.empty()) { + std::cout << "\n--- Reads per Chromosome ---\n"; + for (const auto &[chrom, count] : result.stats.reads_per_chromosome) { + double pct = 100.0 * count / result.stats.total_reads; + std::cout << " " << chrom << ": " << count << " (" << pct << "%)\n"; + } + std::cout << "\n"; + } + return 0; +} + +} // namespace ramcore diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 24b54f0..08f7296 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,3 +36,6 @@ install(TARGETS ramcoretests chromosome_split_test RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) +add_ramcore_test(ramstatstests ramstatstests.cxx) +add_ramcore_test(ramsorttests ramsorttests.cxx) +add_ramcore_test(samparsertest samparsertest.cxx) diff --git a/test/ramsorttests.cxx b/test/ramsorttests.cxx new file mode 100644 index 0000000..30a7425 --- /dev/null +++ b/test/ramsorttests.cxx @@ -0,0 +1,121 @@ +#include +#include +#include +#include + +#include "../benchmark/generate_sam_benchmark.h" +#include "ramcore/RAMSort.h" +#include "ramcore/SamToNTuple.h" + +namespace { + +class RAMSortTest : public ::testing::Test { +protected: + static constexpr int kNumReads = 200; + const char *kSamFile = "sort_test.sam"; + const char *kUnsortedFile = "sort_test_unsorted.root"; + const char *kSortedFile = "sort_test_sorted.root"; + const char *kNameSortFile = "sort_test_namesort.root"; + + void SetUp() override + { + GenerateSAMFile(kSamFile, kNumReads); + std::remove(kUnsortedFile); + std::remove(kSortedFile); + std::remove(kNameSortFile); + samtoramntuple(kSamFile, kUnsortedFile, true, true, true, 505, 0); + } + + void TearDown() override + { + std::remove(kSamFile); + std::remove(kUnsortedFile); + std::remove(kSortedFile); + std::remove(kNameSortFile); + } +}; + +TEST_F(RAMSortTest, EntryCountPreserved) +{ + ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); + + auto readerIn = ROOT::RNTupleReader::Open("RAM", kUnsortedFile); + auto readerOut = ROOT::RNTupleReader::Open("RAM", kSortedFile); + ASSERT_NE(readerIn, nullptr); + ASSERT_NE(readerOut, nullptr); + EXPECT_EQ(readerIn->GetNEntries(), readerOut->GetNEntries()); +} + +TEST_F(RAMSortTest, CoordinateSortOrder) +{ + ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); + + auto reader = ROOT::RNTupleReader::Open("RAM", kSortedFile); + ASSERT_NE(reader, nullptr); + + auto viewRefId = reader->GetView("record.refid"); + auto viewPos = reader->GetView("record.pos"); + + int32_t prevRefId = -1, prevPos = -1; + for (uint64_t i = 0; i < reader->GetNEntries(); ++i) { + int32_t refid = viewRefId(i); + int32_t pos = viewPos(i); + if (refid == prevRefId) { + EXPECT_GE(pos, prevPos) << "pos out of order at entry " << i; + } else { + EXPECT_GE(refid, prevRefId) << "refid out of order at entry " << i; + } + prevRefId = refid; + prevPos = pos; + } +} + +TEST_F(RAMSortTest, NameSortOrder) +{ + ASSERT_EQ(ramsortntuple(kUnsortedFile, kNameSortFile, true), 0); + + auto reader = ROOT::RNTupleReader::Open("RAM", kNameSortFile); + ASSERT_NE(reader, nullptr); + + auto viewQname = reader->GetView("record.qname"); + + std::string prev = ""; + for (uint64_t i = 0; i < reader->GetNEntries(); ++i) { + std::string qname = viewQname(i); + EXPECT_GE(qname, prev) << "qname out of order at entry " << i; + prev = qname; + } +} + +TEST_F(RAMSortTest, IdempotentSort) +{ + ASSERT_EQ(ramsortntuple(kUnsortedFile, kSortedFile), 0); + + const char *doubleSorted = "sort_test_double.root"; + ASSERT_EQ(ramsortntuple(kSortedFile, doubleSorted), 0); + + auto r1 = ROOT::RNTupleReader::Open("RAM", kSortedFile); + auto r2 = ROOT::RNTupleReader::Open("RAM", doubleSorted); + ASSERT_NE(r1, nullptr); + ASSERT_NE(r2, nullptr); + EXPECT_EQ(r1->GetNEntries(), r2->GetNEntries()); + + auto v1refid = r1->GetView("record.refid"); + auto v2refid = r2->GetView("record.refid"); + auto v1pos = r1->GetView("record.pos"); + auto v2pos = r2->GetView("record.pos"); + + for (uint64_t i = 0; i < r1->GetNEntries(); ++i) { + EXPECT_EQ(v1refid(i), v2refid(i)); + EXPECT_EQ(v1pos(i), v2pos(i)); + } + std::remove(doubleSorted); +} + +TEST(RAMSortEdgeCases, MissingInputFileReturnsError) +{ + int ret = ramsortntuple("nonexistent.root", "/tmp/out.root"); + EXPECT_NE(ret, 0); +} + +} // namespace diff --git a/test/ramstatstests.cxx b/test/ramstatstests.cxx new file mode 100644 index 0000000..33a9326 --- /dev/null +++ b/test/ramstatstests.cxx @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include + +#include "../benchmark/generate_sam_benchmark.h" +#include "ramcore/SamToNTuple.h" +#include "ramcore/RAMStats.h" + +namespace { + +class RAMStatsTest : public ::testing::Test { +protected: + static constexpr int kNumReads = 100; + const char *kSamFile = "stats_test.sam"; + const char *kRootFile = "stats_test_rntuple.root"; + + void SetUp() override + { + GenerateSAMFile(kSamFile, kNumReads); + std::remove(kRootFile); + // compression=505 (LZMA), quality_policy=0 — matches existing tests + samtoramntuple(kSamFile, kRootFile, true, true, true, 505, 0); + } + + void TearDown() override + { + std::remove(kSamFile); + std::remove(kRootFile); + } +}; + +/// Total reads reported must equal the number of entries in the RNTuple. +TEST_F(RAMStatsTest, TotalReadsMatchesNTupleEntries) +{ + auto result = ramcore::ComputeStats(kRootFile); + + auto reader = ROOT::RNTupleReader::Open("RAM", kRootFile); + ASSERT_TRUE(reader != nullptr); + + EXPECT_EQ(result.stats.total_reads, static_cast(reader->GetNEntries())); + EXPECT_EQ(result.stats.total_reads, static_cast(kNumReads)); +} + +/// Mapped + unmapped must equal total reads (no reads are lost). +TEST_F(RAMStatsTest, MappedPlusUnmappedEqualsTotal) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_EQ(result.stats.mapped_reads + result.stats.unmapped_reads, result.stats.total_reads); +} + +/// Forward + reverse strand counts must equal total reads. +TEST_F(RAMStatsTest, StrandCountsEqualTotal) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_EQ(result.stats.forward_strand + result.stats.reverse_strand, result.stats.total_reads); +} + +/// Mean read length must be positive for a non-empty file. +TEST_F(RAMStatsTest, MeanReadLengthIsPositive) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_GT(result.stats.mean_read_length, 0.0); +} + +/// Total bases must be consistent with mean read length * total reads. +TEST_F(RAMStatsTest, TotalBasesConsistentWithMeanLength) +{ + auto result = ramcore::ComputeStats(kRootFile); + double expected_bases = result.stats.mean_read_length * result.stats.total_reads; + // Allow small floating-point rounding tolerance + EXPECT_NEAR(static_cast(result.stats.total_bases), expected_bases, 1.0); +} + +/// Mean mapping quality must be in valid SAM range [0, 255]. +TEST_F(RAMStatsTest, MeanMappingQualityInValidRange) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_GE(result.stats.mean_mapping_quality, 0.0); + EXPECT_LE(result.stats.mean_mapping_quality, 255.0); +} + +/// Per-chromosome map must not be empty for a valid SAM file. +TEST_F(RAMStatsTest, ReadsPerChromosomeNotEmpty) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_FALSE(result.stats.reads_per_chromosome.empty()); +} + +/// Chromosome read counts must sum to at most total_reads +/// (unmapped reads with rname="*" are excluded from the map). +TEST_F(RAMStatsTest, ChromosomeCountsSumToAtMostTotal) +{ + auto result = ramcore::ComputeStats(kRootFile); + uint64_t chrom_sum = 0; + for (const auto &[chrom, count] : result.stats.reads_per_chromosome) { + chrom_sum += count; + } + EXPECT_LE(chrom_sum, result.stats.total_reads); +} + +/// Print() must produce non-empty output containing expected headers. +TEST_F(RAMStatsTest, PrintProducesOutput) +{ + auto result = ramcore::ComputeStats(kRootFile); + ASSERT_TRUE(result.ok); + testing::internal::CaptureStdout(); + result.stats.Print(); + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_FALSE(output.empty()); + EXPECT_NE(output.find("Total reads:"), std::string::npos); + EXPECT_NE(output.find("Mapped reads:"), std::string::npos); + EXPECT_NE(output.find("Mean mapping quality:"), std::string::npos); +} + +/// ComputeStats on a non-existent file must return ok=false with error message. +TEST(RAMStatsEdgeCases, BadFileReturnsErrorResult) +{ + auto result = ramcore::ComputeStats("nonexistent_file.root"); + EXPECT_FALSE(result.ok); + EXPECT_FALSE(result.error_message.empty()); +} + +/// ComputeStats result ok must be true for a valid file. +TEST_F(RAMStatsTest, ValidFileReturnsOk) +{ + auto result = ramcore::ComputeStats(kRootFile); + EXPECT_TRUE(result.ok); + EXPECT_TRUE(result.error_message.empty()); +} + +/// RunRamStats on a valid file must return 0. +TEST_F(RAMStatsTest, RunRamStatsValidFileReturnsZero) +{ + testing::internal::CaptureStdout(); + int ret = ramcore::RunRamStats(kRootFile); + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_EQ(ret, 0); + EXPECT_FALSE(output.empty()); +} + +/// RunRamStats with verbose must print chromosome breakdown. +TEST_F(RAMStatsTest, RunRamStatsVerbosePrintsChromosomes) +{ + testing::internal::CaptureStdout(); + int ret = ramcore::RunRamStats(kRootFile, true); + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_EQ(ret, 0); + EXPECT_NE(output.find("Reads per Chromosome"), std::string::npos); +} + +/// RunRamStats on a bad file must return 1. +TEST(RAMStatsEdgeCases, RunRamStatsBadFileReturnsOne) +{ + testing::internal::CaptureStderr(); + int ret = ramcore::RunRamStats("nonexistent.root"); + testing::internal::GetCapturedStderr(); + EXPECT_EQ(ret, 1); +} + +} // namespace diff --git a/test/samparsertest.cxx b/test/samparsertest.cxx new file mode 100644 index 0000000..4d4d819 --- /dev/null +++ b/test/samparsertest.cxx @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include + +#include "ramcore/SamParser.h" +#include "rntuple/RAMNTupleRecord.h" + +// ───────────────────────────────────────────────────────────────────────────── +// Helpers +// ───────────────────────────────────────────────────────────────────────────── + +static void WriteSAM(const char *path, const std::string &content) +{ + std::ofstream f(path); + f << content; +} + +// ───────────────────────────────────────────────────────────────────────────── +// SamParser tests +// ───────────────────────────────────────────────────────────────────────────── + +namespace { + +class SamParserTest : public ::testing::Test { +protected: + const char *kSamFile = "parser_test.sam"; + void TearDown() override { std::remove(kSamFile); } +}; + +/// Parser must return false for a non-existent file. +TEST_F(SamParserTest, NonExistentFileReturnsFalse) +{ + ramcore::SamParser parser; + bool ok = parser.ParseFile("no_such_file.sam", nullptr, nullptr); + EXPECT_FALSE(ok); +} + +/// Header lines starting with '@' must trigger the header callback. +TEST_F(SamParserTest, HeaderCallbackFired) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\tSO:coordinate\n" + "@SQ\tSN:chr1\tLN:248956422\n"); + + ramcore::SamParser parser; + std::vector tags; + parser.ParseFile(kSamFile, [&](const std::string &tag, const std::string &) { tags.push_back(tag); }, nullptr); + + ASSERT_EQ(tags.size(), 2u); + EXPECT_EQ(tags[0], "@HD"); + EXPECT_EQ(tags[1], "@SQ"); +} + +/// Record callback must fire once per alignment line. +TEST_F(SamParserTest, RecordCallbackFiredForEachAlignment) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\n" + "read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n" + "read2\t16\tchr1\t200\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n"); + + ramcore::SamParser parser; + size_t count = 0; + parser.ParseFile(kSamFile, nullptr, [&](const ramcore::SamRecord &, size_t) { ++count; }); + + EXPECT_EQ(count, 2u); + EXPECT_EQ(parser.GetRecordsProcessed(), 2u); +} + +/// Parsed record fields must match the SAM columns exactly. +TEST_F(SamParserTest, RecordFieldsParsedCorrectly) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\n" + "myread\t99\tchr2\t500\t30\t5M2I3M\t=\t600\t110\tACGTACGTAC\tIIIIIIIIII\n"); + + ramcore::SamParser parser; + ramcore::SamRecord captured; + parser.ParseFile(kSamFile, nullptr, [&](const ramcore::SamRecord &rec, size_t) { captured = rec; }); + + EXPECT_EQ(captured.qname, "myread"); + EXPECT_EQ(captured.flag, 99); + EXPECT_EQ(captured.rname, "chr2"); + EXPECT_EQ(captured.pos, 500); + EXPECT_EQ(captured.mapq, 30); + EXPECT_EQ(captured.cigar, "5M2I3M"); + EXPECT_EQ(captured.rnext, "="); + EXPECT_EQ(captured.pnext, 600); + EXPECT_EQ(captured.tlen, 110); + EXPECT_EQ(captured.seq, "ACGTACGTAC"); + EXPECT_EQ(captured.qual, "IIIIIIIIII"); +} + +/// An empty SAM file (no headers, no records) must parse successfully. +TEST_F(SamParserTest, EmptyFileParseSucceeds) +{ + WriteSAM(kSamFile, ""); + ramcore::SamParser parser; + bool ok = parser.ParseFile(kSamFile, nullptr, nullptr); + EXPECT_TRUE(ok); + EXPECT_EQ(parser.GetRecordsProcessed(), 0u); +} + +/// A SAM file with only headers and no records must produce zero record calls. +TEST_F(SamParserTest, HeaderOnlyFileProducesNoRecords) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\n" + "@SQ\tSN:chr1\tLN:1000\n"); + + ramcore::SamParser parser; + size_t count = 0; + parser.ParseFile(kSamFile, nullptr, [&](const ramcore::SamRecord &, size_t) { ++count; }); + + EXPECT_EQ(count, 0u); +} + +/// Optional fields must be captured in the record. +TEST_F(SamParserTest, OptionalFieldsCaptured) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\n" + "read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tNM:i:0\tRG:Z:sample1\n"); + + ramcore::SamParser parser; + ramcore::SamRecord captured; + parser.ParseFile(kSamFile, nullptr, [&](const ramcore::SamRecord &rec, size_t) { captured = rec; }); + + EXPECT_GE(captured.optional_fields.size(), 1u); +} + +/// Lines processed counter must include both header and record lines. +TEST_F(SamParserTest, LinesProcessedCountIsCorrect) +{ + WriteSAM(kSamFile, "@HD\tVN:1.6\n" + "@SQ\tSN:chr1\tLN:1000\n" + "read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n"); + + ramcore::SamParser parser; + parser.ParseFile(kSamFile, nullptr, nullptr); + EXPECT_EQ(parser.GetLinesProcessed(), 3u); +} + +/// StripCRLF must remove trailing CR and LF characters. +TEST(StripCRLFTest, RemovesTrailingCRLF) +{ + char buf1[] = "hello\r\n"; + ramcore::StripCRLF(buf1); + EXPECT_STREQ(buf1, "hello"); + + char buf2[] = "world\n"; + ramcore::StripCRLF(buf2); + EXPECT_STREQ(buf2, "world"); + + char buf3[] = "noend"; + ramcore::StripCRLF(buf3); + EXPECT_STREQ(buf3, "noend"); +} + +// ───────────────────────────────────────────────────────────────────────────── +// RAMNTupleUtils encode/decode tests +// ───────────────────────────────────────────────────────────────────────────── + +/// EncodeSequence + DecodeSequence must be inverse operations. +TEST(RAMNTupleUtilsTest, SequenceRoundTrip) +{ + const std::string original = "ACGTACGTACGT"; + std::string encoded = RAMNTupleUtils::EncodeSequence(original); + + // Encoded format: 4-byte length prefix + packed nibbles + ASSERT_GE(encoded.size(), 4u); + uint32_t stored_len = 0; + std::memcpy(&stored_len, encoded.data(), 4); + EXPECT_EQ(stored_len, original.size()); + + std::string decoded = RAMNTupleUtils::DecodeSequence(encoded.substr(4), original.size()); + EXPECT_EQ(decoded, original); +} + +/// Odd-length sequences must round-trip correctly. +TEST(RAMNTupleUtilsTest, OddLengthSequenceRoundTrip) +{ + const std::string original = "ACGTA"; + std::string encoded = RAMNTupleUtils::EncodeSequence(original); + std::string decoded = RAMNTupleUtils::DecodeSequence(encoded.substr(4), original.size()); + EXPECT_EQ(decoded, original); +} + +/// Empty sequence must encode and decode without crashing. +TEST(RAMNTupleUtilsTest, EmptySequenceRoundTrip) +{ + const std::string original = ""; + std::string encoded = RAMNTupleUtils::EncodeSequence(original); + std::string decoded = RAMNTupleUtils::DecodeSequence(encoded.substr(4), 0); + EXPECT_EQ(decoded, original); +} + +/// EncodeQuality + DecodeQuality must be inverse for default Phred33 mode. +TEST(RAMNTupleUtilsTest, QualityRoundTripPhred33) +{ + const std::string original = "IIIIIIIIII"; + uint32_t flags = RAMNTupleRecord::kPhred33; + std::string encoded = RAMNTupleUtils::EncodeQuality(original, flags); + std::string decoded = RAMNTupleUtils::DecodeQuality(encoded, flags); + EXPECT_EQ(decoded, original); +} + +/// Quality scores must round-trip when drop mode is used. +TEST(RAMNTupleUtilsTest, QualityRoundTripDropMode) +{ + const std::string original = "IIIIIIIIII"; + uint32_t flags = RAMNTupleRecord::kDrop; + std::string encoded = RAMNTupleUtils::EncodeQuality(original, flags); + // In drop mode decoded quality should be empty or placeholder + std::string decoded = RAMNTupleUtils::DecodeQuality(encoded, flags); + // Just verify no crash and result is a string + EXPECT_NO_THROW(RAMNTupleUtils::DecodeQuality(encoded, flags)); +} + +/// ParseCIGAR + FormatCIGAR must be inverse operations. +TEST(RAMNTupleUtilsTest, CIGARRoundTrip) +{ + const std::string original = "10M2I5M3D"; + auto ops = RAMNTupleUtils::ParseCIGAR(original); + EXPECT_FALSE(ops.empty()); + std::string formatted = RAMNTupleUtils::FormatCIGAR(ops); + EXPECT_EQ(formatted, original); +} + +/// Wildcard CIGAR '*' must parse without crashing. +TEST(RAMNTupleUtilsTest, WildcardCIGARParsesCleanly) +{ + auto ops = RAMNTupleUtils::ParseCIGAR("*"); + EXPECT_TRUE(ops.empty()); +} + +} // namespace diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b67568f..3956ecc 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -22,7 +22,21 @@ ROOT_EXECUTABLE(ramntupleview ROOT::ROOTNTuple ) -install(TARGETS samtoram samtoramntuple ramntupleview +ROOT_EXECUTABLE(ramstats + ramstats.cxx + LIBRARIES + ramcore + ROOT::Core + ROOT::ROOTNTuple +) +ROOT_EXECUTABLE(ramsort + ramsort.cxx + LIBRARIES + ramcore + ROOT::Core + ROOT::ROOTNTuple +) +install(TARGETS samtoram samtoramntuple ramntupleview ramstats RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) @@ -40,3 +54,4 @@ install(FILES ${ROOT_MACROS} DESTINATION share/ramtools/macros ) + diff --git a/tools/ramsort.cxx b/tools/ramsort.cxx new file mode 100644 index 0000000..76731b5 --- /dev/null +++ b/tools/ramsort.cxx @@ -0,0 +1,27 @@ +/// \file ramsort.cxx +/// \brief Command-line tool to sort a RAM (RNTuple) file by coordinate or name. +/// +/// Usage: +/// ./tools/ramsort [--by-name] + +#include "ramcore/RAMSort.h" +#include +#include + +int main(int argc, char **argv) +{ + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [--by-name]\n"; + std::cerr << " Sort a RAM (RNTuple) file by genomic coordinate (refid, pos).\n"; + std::cerr << " --by-name Sort by QNAME instead.\n"; + return 1; + } + + bool byName = false; + for (int i = 3; i < argc; ++i) { + if (std::strcmp(argv[i], "--by-name") == 0) + byName = true; + } + + return ramsortntuple(argv[1], argv[2], byName); +} diff --git a/tools/ramstats.cxx b/tools/ramstats.cxx new file mode 100644 index 0000000..d5dd08a --- /dev/null +++ b/tools/ramstats.cxx @@ -0,0 +1,19 @@ +#include "ramcore/RAMStats.h" +#include +#include + +int main(int argc, char **argv) +{ + if (argc < 2) { + std::cerr << "Usage: " << argv[0] << " [--verbose]\n"; + std::cerr << " Compute summary statistics for a RAM (RNTuple) file.\n"; + std::cerr << " --verbose Also print per-chromosome read counts.\n"; + return 1; + } + bool verbose = false; + for (int i = 2; i < argc; ++i) { + if (std::strcmp(argv[i], "--verbose") == 0) + verbose = true; + } + return ramcore::RunRamStats(argv[1], verbose); +}