From 0a14b5ea122af7cf0eacf1fffac323c15a00a887 Mon Sep 17 00:00:00 2001 From: Abhuday Singh Date: Sat, 28 Mar 2026 03:44:47 +0530 Subject: [PATCH 1/2] fix: replace atoi() with validated ParseInt() in SamParser::ParseLine() - Add ParseInt() helper using strtol(): validates numeric input, logs warning to std::cerr with field name and line number, returns false to skip malformed record - Replace atoi() calls for flag (field 1), pos (field 3), mapq (field 4), pnext (field 7), tlen (field 8) - Malformed records are now skipped with a diagnostic message instead of being silently written as zero-valued integers - Update .gitignore to exclude reference genome files and large data files Fixes #22, closes #26 --- .gitignore | 12 +++++++ src/ramcore/SamParser.cxx | 73 ++++++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 4502af8..692a82b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,15 @@ docs/html/ docs/latex/ html/ +# Reference genome and index files +GRCh38.primary_assembly.genome.fa +GRCh38.primary_assembly.genome.fa.amb +GRCh38.primary_assembly.genome.fa.ann +GRCh38.primary_assembly.genome.fa.bwt +GRCh38.primary_assembly.genome.fa.pac +GRCh38.primary_assembly.genome.fa.sa +*.fastq.gz +*.filt.fastq.gz +sample.sam +--help.ram +README 2.md diff --git a/src/ramcore/SamParser.cxx b/src/ramcore/SamParser.cxx index f5dd07c..1036384 100644 --- a/src/ramcore/SamParser.cxx +++ b/src/ramcore/SamParser.cxx @@ -1,6 +1,7 @@ #include "ramcore/SamParser.h" #include #include +#include namespace ramcore { @@ -11,23 +12,41 @@ void StripCRLF(char* str) { } } -bool SamParser::ParseFile(const char* filename, - HeaderCallback header_cb, +static bool ParseInt(const char* str, int& out, const char* fieldName, int lineNum) { + if (!str || *str == '\0') { + std::cerr << "[SamParser] Warning: empty value for field '" + << fieldName << "' at line " << lineNum + << " — record skipped.\n"; + return false; + } + char* end; + long val = std::strtol(str, &end, 10); + if (*end != '\0' && *end != '\t' && *end != '\n' && *end != '\r') { + std::cerr << "[SamParser] Warning: non-numeric value '" << str + << "' for field '" << fieldName << "' at line " << lineNum + << " — record skipped.\n"; + return false; + } + out = static_cast(val); + return true; +} + +bool SamParser::ParseFile(const char* filename, + HeaderCallback header_cb, RecordCallback record_cb) { FILE* fp = fopen(filename, "r"); if (!fp) { return false; } - lines_processed_ = 0; records_processed_ = 0; - + char line[kMaxLineLength]; SamRecord record; - + while (fgets(line, kMaxLineLength, fp)) { lines_processed_++; - + if (line[0] == '@') { char* tab = strchr(line, '\t'); if (tab) { @@ -44,7 +63,7 @@ bool SamParser::ParseFile(const char* filename, } continue; } - + record.Clear(); if (ParseLine(line, record)) { if (record_cb) { @@ -53,7 +72,7 @@ bool SamParser::ParseFile(const char* filename, records_processed_++; } } - + fclose(fp); return true; } @@ -61,35 +80,47 @@ bool SamParser::ParseFile(const char* filename, bool SamParser::ParseLine(char* line, SamRecord& record) { int field_num = 0; char* token = strtok(line, "\t"); - + while (token) { switch (field_num) { case 0: record.qname = token; break; - case 1: record.flag = atoi(token); break; + case 1: + if (!ParseInt(token, record.flag, "flag", lines_processed_)) + return false; + break; case 2: record.rname = token; break; - case 3: record.pos = atoi(token); break; - case 4: record.mapq = atoi(token); break; + case 3: + if (!ParseInt(token, record.pos, "pos", lines_processed_)) + return false; + break; + case 4: + if (!ParseInt(token, record.mapq, "mapq", lines_processed_)) + return false; + break; case 5: record.cigar = token; break; case 6: record.rnext = token; break; - case 7: record.pnext = atoi(token); break; - case 8: record.tlen = atoi(token); break; + case 7: + if (!ParseInt(token, record.pnext, "pnext", lines_processed_)) + return false; + break; + case 8: + if (!ParseInt(token, record.tlen, "tlen", lines_processed_)) + return false; + break; case 9: record.seq = token; break; - case 10: + case 10: StripCRLF(token); - record.qual = token; + record.qual = token; break; default: StripCRLF(token); record.optional_fields.push_back(token); break; } - field_num++; token = strtok(nullptr, "\t"); } - - return field_num >= 11; + return field_num >= 11; } -} - +} // namespace ramcore From d3dd824a34cca875db9a9344ef16d74044d65329 Mon Sep 17 00:00:00 2001 From: Abhuday Singh Date: Sat, 11 Apr 2026 20:04:11 +0530 Subject: [PATCH 2/2] test: regression tests for ParseInt() validation in SamParser Six tests via ParseFile() covering all five integer fields (flag, pos, mapq, pnext, tlen) for malformed input and one valid record check. Also fix CMakeLists.txt keyword mixing and ROOT 6.38 namespace in SamToNTuple.cxx (pre-existing build breaks, not related to PR logic). Closes #26 --- src/ramcore/SamToNTuple.cxx | 2 +- test/CMakeLists.txt | 2 +- test/ramcoretests.cxx | 113 ++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/src/ramcore/SamToNTuple.cxx b/src/ramcore/SamToNTuple.cxx index f6d9a78..166f315 100644 --- a/src/ramcore/SamToNTuple.cxx +++ b/src/ramcore/SamToNTuple.cxx @@ -234,7 +234,7 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output writeOptions.SetUseBufferedWrite(true); auto parallel_writer = - ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); + ROOT::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); const int contexts_per_file = std::min(4, num_threads); const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 24b54f0..62dbd41 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ function(add_ramcore_test test_name) if(RAMTOOLS_BUILD_BENCHMARKS) - target_link_libraries(${test_name} benchmark::benchmark) + target_link_libraries(${test_name} PRIVATE benchmark::benchmark) endif() target_include_directories(${test_name} diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 92cb54c..7b75d1d 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -18,6 +18,7 @@ #include "ramcore/SamToNTuple.h" #include "rntuple/RAMNTupleRecord.h" #include "ramcore/SamToTTree.h" +#include "ramcore/SamParser.h" namespace { class ramcoreTest : public ::testing::Test { @@ -312,3 +313,115 @@ TEST_F(ramcoreTest, SmartIndexRespectsPositionInterval) std::remove(customSam); std::remove(rntupleFile); } +TEST_F(ramcoreTest, MalformedFlagSkipsRecord) { + const char* path = "/tmp/ram_test_flag.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\tabc\tchr1\t100\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + + EXPECT_EQ(count, 1); +} + +TEST_F(ramcoreTest, MalformedPosSkipsRecord) { + const char* path = "/tmp/ram_test_pos.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\tXYZ\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + + EXPECT_EQ(count, 1); +} + +TEST_F(ramcoreTest, MalformedMapqSkipsRecord) { + const char* path = "/tmp/ram_test_mapq.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\tBAD\t10M\t*\t0\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + + EXPECT_EQ(count, 1); +} + +TEST_F(ramcoreTest, MalformedPnextSkipsRecord) { + const char* path = "/tmp/ram_test_pnext.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\tNOP\t0\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + + EXPECT_EQ(count, 1); +} + +TEST_F(ramcoreTest, MalformedTlenSkipsRecord) { + const char* path = "/tmp/ram_test_tlen.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\t0\tBAD\tACGT\tIIII\n"); + fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n"); + fclose(f); + + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord&, int) { count++; }); + std::remove(path); + + EXPECT_EQ(count, 1); +} + +TEST_F(ramcoreTest, ValidRecordParsesCorrectly) { + const char* path = "/tmp/ram_test_valid.sam"; + FILE* f = fopen(path, "w"); + fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + fprintf(f, "r1\t16\tchr1\t500\t30\t5M\t*\t200\t10\tACGTA\tIIIII\n"); + fclose(f); + + ramcore::SamRecord captured; + int count = 0; + ramcore::SamParser parser; + parser.ParseFile(path, + [](const std::string&, const std::string&) {}, + [&](const ramcore::SamRecord& rec, int) { captured = rec; count++; }); + std::remove(path); + + ASSERT_EQ(count, 1); + EXPECT_EQ(captured.flag, 16); + EXPECT_EQ(captured.pos, 500); + EXPECT_EQ(captured.mapq, 30); + EXPECT_EQ(captured.pnext, 200); + EXPECT_EQ(captured.tlen, 10); +}