diff --git a/.gitignore b/.gitignore index 4502af8..692a82b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,15 @@ docs/html/ docs/latex/ html/ +# Reference genome and index files +GRCh38.primary_assembly.genome.fa +GRCh38.primary_assembly.genome.fa.amb +GRCh38.primary_assembly.genome.fa.ann +GRCh38.primary_assembly.genome.fa.bwt +GRCh38.primary_assembly.genome.fa.pac +GRCh38.primary_assembly.genome.fa.sa +*.fastq.gz +*.filt.fastq.gz +sample.sam +--help.ram +README 2.md diff --git a/src/ramcore/SamParser.cxx b/src/ramcore/SamParser.cxx index f5dd07c..1036384 100644 --- a/src/ramcore/SamParser.cxx +++ b/src/ramcore/SamParser.cxx @@ -1,6 +1,7 @@ #include "ramcore/SamParser.h" #include #include +#include namespace ramcore { @@ -11,23 +12,41 @@ void StripCRLF(char* str) { } } -bool SamParser::ParseFile(const char* filename, - HeaderCallback header_cb, +static bool ParseInt(const char* str, int& out, const char* fieldName, int lineNum) { + if (!str || *str == '\0') { + std::cerr << "[SamParser] Warning: empty value for field '" + << fieldName << "' at line " << lineNum + << " — record skipped.\n"; + return false; + } + char* end; + long val = std::strtol(str, &end, 10); + if (*end != '\0' && *end != '\t' && *end != '\n' && *end != '\r') { + std::cerr << "[SamParser] Warning: non-numeric value '" << str + << "' for field '" << fieldName << "' at line " << lineNum + << " — record skipped.\n"; + return false; + } + out = static_cast(val); + return true; +} + +bool SamParser::ParseFile(const char* filename, + HeaderCallback header_cb, RecordCallback record_cb) { FILE* fp = fopen(filename, "r"); if (!fp) { return false; } - lines_processed_ = 0; records_processed_ = 0; - + char line[kMaxLineLength]; SamRecord record; - + while (fgets(line, kMaxLineLength, fp)) { lines_processed_++; - + if (line[0] == '@') { char* tab = strchr(line, '\t'); if (tab) { @@ -44,7 +63,7 @@ bool SamParser::ParseFile(const char* filename, } continue; } - + record.Clear(); if (ParseLine(line, record)) { if (record_cb) { @@ -53,7 +72,7 @@ bool SamParser::ParseFile(const char* filename, records_processed_++; } } - + fclose(fp); return true; } @@ -61,35 +80,47 @@ bool SamParser::ParseFile(const char* filename, bool SamParser::ParseLine(char* line, SamRecord& record) { int field_num = 0; char* token = strtok(line, "\t"); - + while (token) { switch (field_num) { case 0: record.qname = token; break; - case 1: record.flag = atoi(token); break; + case 1: + if (!ParseInt(token, record.flag, "flag", lines_processed_)) + return false; + break; case 2: record.rname = token; break; - case 3: record.pos = atoi(token); break; - case 4: record.mapq = atoi(token); break; + case 3: + if (!ParseInt(token, record.pos, "pos", lines_processed_)) + return false; + break; + case 4: + if (!ParseInt(token, record.mapq, "mapq", lines_processed_)) + return false; + break; case 5: record.cigar = token; break; case 6: record.rnext = token; break; - case 7: record.pnext = atoi(token); break; - case 8: record.tlen = atoi(token); break; + case 7: + if (!ParseInt(token, record.pnext, "pnext", lines_processed_)) + return false; + break; + case 8: + if (!ParseInt(token, record.tlen, "tlen", lines_processed_)) + return false; + break; case 9: record.seq = token; break; - case 10: + case 10: StripCRLF(token); - record.qual = token; + record.qual = token; break; default: StripCRLF(token); record.optional_fields.push_back(token); break; } - field_num++; token = strtok(nullptr, "\t"); } - - return field_num >= 11; + return field_num >= 11; } -} - +} // namespace ramcore diff --git a/src/ramcore/SamToNTuple.cxx b/src/ramcore/SamToNTuple.cxx index f6d9a78..166f315 100644 --- a/src/ramcore/SamToNTuple.cxx +++ b/src/ramcore/SamToNTuple.cxx @@ -234,7 +234,7 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output writeOptions.SetUseBufferedWrite(true); auto parallel_writer = - ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); + ROOT::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); const int contexts_per_file = std::min(4, num_threads); const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 24b54f0..62dbd41 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ function(add_ramcore_test test_name) if(RAMTOOLS_BUILD_BENCHMARKS) - target_link_libraries(${test_name} benchmark::benchmark) + target_link_libraries(${test_name} PRIVATE benchmark::benchmark) endif() target_include_directories(${test_name} diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 7dd8b24..1e703bb 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -18,6 +18,7 @@ #include "ramcore/SamToNTuple.h" #include "rntuple/RAMNTupleRecord.h" #include "ramcore/SamToTTree.h" +#include "ramcore/SamParser.h" namespace { class ramcoreTest : public ::testing::Test { @@ -383,83 +384,145 @@ TEST_F(ramcoreTest, SmartIndexRespectsPositionInterval) std::remove(customSam); std::remove(rntupleFile); } - +TEST_F(ramcoreTest, MalformedFlagSkipsRecord) { + const char* path = "/tmp/ram_test_flag.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\tabc\tchr1\t100\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + EXPECT_EQ(count, 1); +} +TEST_F(ramcoreTest, MalformedPosSkipsRecord) { + const char* path = "/tmp/ram_test_pos.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\tXYZ\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + EXPECT_EQ(count, 1); +} +TEST_F(ramcoreTest, MalformedMapqSkipsRecord) { + const char* path = "/tmp/ram_test_mapq.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\tBAD\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + EXPECT_EQ(count, 1); +} +TEST_F(ramcoreTest, MalformedPnextSkipsRecord) { + const char* path = "/tmp/ram_test_pnext.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\tNOP\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + EXPECT_EQ(count, 1); +} +TEST_F(ramcoreTest, MalformedTlenSkipsRecord) { + const char* path = "/tmp/ram_test_tlen.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\t0\tBAD\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + EXPECT_EQ(count, 1); +} +TEST_F(ramcoreTest, ValidRecordParsesCorrectly) { + const char* path = "/tmp/ram_test_valid.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "r1\t16\tchr1\t500\t30\t5M\t*\t200\t10\tACGTA\tIIIII\n"); + fclose(f); + ramcore::SamRecord captured; + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord& rec, int) { captured = rec; count++; }); + std::remove(path); + ASSERT_EQ(count, 1); + EXPECT_EQ(captured.flag, 16); + EXPECT_EQ(captured.pos, 500); + EXPECT_EQ(captured.mapq, 30); + EXPECT_EQ(captured.pnext, 200); + EXPECT_EQ(captured.tlen, 10); +} TEST_F(ramcoreTest, QUALEncodingDecodingModes) { const char *samFile = "test_qual.sam"; const char *ramFile = "test_qual.root"; - std::string seq = "AAAAAAAAAA"; std::string qual = "@@@FBIEDH!"; - { std::ofstream sam(samFile); sam << "@HD\tVN:1.6\tSO:unsorted\n"; sam << "@SQ\tSN:chr1\tLN:1000\n"; sam << "read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\t" << seq << "\t" << qual << "\n"; } - - // No compression of QUAL field, stored as it is - samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505, - /*quality_policy=*/0); - + samtoramntuple(samFile, ramFile, true, false, false, 505, 0); { auto reader = ROOT::RNTupleReader::Open("RAM", ramFile); ASSERT_NE(reader, nullptr); - auto view = reader->GetView("record"); const auto &rec = view(0); - - std::string decoded = rec.GetQUAL(); - - // MUST be identical - EXPECT_EQ(decoded, qual) << "QUAL should remain unchanged without compression"; + EXPECT_EQ(rec.GetQUAL(), qual) << "QUAL should remain unchanged without compression"; } - std::remove(ramFile); - - // Drop the quailty field , should stored as "*" - samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505, - /*quality_policy=*/RAMNTupleRecord::kDrop); - + samtoramntuple(samFile, ramFile, true, false, false, 505, RAMNTupleRecord::kDrop); { auto reader = ROOT::RNTupleReader::Open("RAM", ramFile); ASSERT_NE(reader, nullptr); - auto view = reader->GetView("record"); const auto &rec = view(0); - EXPECT_EQ(rec.GetQUAL(), "*") << "QUAL should be dropped"; } - std::remove(ramFile); - - // Illumina binning - samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505, - /*quality_policy=*/RAMNTupleRecord::kIlluminaBinning); - + samtoramntuple(samFile, ramFile, true, false, false, 505, RAMNTupleRecord::kIlluminaBinning); { auto reader = ROOT::RNTupleReader::Open("RAM", ramFile); ASSERT_NE(reader, nullptr); - auto view = reader->GetView("record"); const auto &rec = view(0); - std::string decoded = rec.GetQUAL(); - - // Must NOT be same (lossy compression done during encoding) EXPECT_NE(decoded, qual) << "Binning must change quality values"; - - // Length must stay same EXPECT_EQ(decoded.size(), qual.size()); - - // Must still be valid printable ASCII for (char c : decoded) { EXPECT_GE(c, 33); EXPECT_LE(c, 126); } } - std::remove(samFile); std::remove(ramFile); } \ No newline at end of file