From 6ec53ac1e0694319f78c8e0dbd81c95bb5bd03d6 Mon Sep 17 00:00:00 2001 From: lukedev45 Date: Wed, 18 Mar 2026 21:31:17 +0000 Subject: [PATCH 1/2] fix: replace atoi() with strict integer parsing in SamParser (#22) atoi() silently returns 0 for non-numeric strings, causing corrupted SAM records to be written with zeroed-out integer fields. Replace all five atoi() calls with a StrictAtoi wrapper using std::stoi that validates input and rejects records with non-numeric FLAG, POS, MAPQ, PNEXT, or TLEN values. --- src/ramcore/SamParser.cxx | 37 ++++++++++++++++++++++++++++++++----- test/ramcoretests.cxx | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/ramcore/SamParser.cxx b/src/ramcore/SamParser.cxx index f5dd07c..f6af027 100644 --- a/src/ramcore/SamParser.cxx +++ b/src/ramcore/SamParser.cxx @@ -1,9 +1,36 @@ #include "ramcore/SamParser.h" #include #include +#include +#include +#include namespace ramcore { +static bool StrictAtoi(const char* token, const char* field_name, int& result) { + try { + size_t pos = 0; + result = std::stoi(std::string(token), &pos); + if (pos != std::strlen(token)) { + std::cerr << "SamParser: invalid value '" << token + << "' for field " << field_name + << " (trailing characters)\n"; + return false; + } + return true; + } catch (const std::invalid_argument&) { + std::cerr << "SamParser: invalid value '" << token + << "' for field " << field_name + << " (not a number)\n"; + return false; + } catch (const std::out_of_range&) { + std::cerr << "SamParser: value '" << token + << "' for field " << field_name + << " is out of range\n"; + return false; + } +} + void StripCRLF(char* str) { size_t len = strlen(str); while (len > 0 && (str[len-1] == '\n' || str[len-1] == '\r')) { @@ -65,14 +92,14 @@ bool SamParser::ParseLine(char* line, SamRecord& record) { while (token) { switch (field_num) { case 0: record.qname = token; break; - case 1: record.flag = atoi(token); break; + case 1: if (!StrictAtoi(token, "FLAG", record.flag)) return false; break; case 2: record.rname = token; break; - case 3: record.pos = atoi(token); break; - case 4: record.mapq = atoi(token); break; + case 3: if (!StrictAtoi(token, "POS", record.pos)) return false; break; + case 4: if (!StrictAtoi(token, "MAPQ", record.mapq)) return false; break; case 5: record.cigar = token; break; case 6: record.rnext = token; break; - case 7: record.pnext = atoi(token); break; - case 8: record.tlen = atoi(token); break; + case 7: if (!StrictAtoi(token, "PNEXT", record.pnext)) return false; break; + case 8: if (!StrictAtoi(token, "TLEN", record.tlen)) return false; break; case 9: record.seq = token; break; case 10: StripCRLF(token); diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 7dd8b24..fdd8269 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -462,4 +462,40 @@ TEST_F(ramcoreTest, QUALEncodingDecodingModes) std::remove(samFile); std::remove(ramFile); -} \ No newline at end of file +} + +TEST_F(ramcoreTest, SamParserRejectsNonNumericFields) +{ + const char *badSam = "test_bad_fields.sam"; + const char *rntupleFile = "test_bad_fields.root"; + + { + std::ofstream sam(badSam); + sam << "@HD\tVN:1.6\tSO:unsorted\n"; + sam << "@SQ\tSN:chr1\tLN:10000\n"; + // Non-numeric FLAG + sam << "read1\tBROKEN\tchr1\t100\t60\t50M\t*\t0\t0\t" + << std::string(50, 'A') << "\t*\n"; + // Non-numeric POS + sam << "read2\t0\tchr1\tBAD\t60\t50M\t*\t0\t0\t" + << std::string(50, 'A') << "\t*\n"; + // Trailing characters in MAPQ + sam << "read4\t0\tchr1\t300\t60abc\t50M\t*\t0\t0\t" + << std::string(50, 'A') << "\t*\n"; + // Out-of-range POS + sam << "read5\t0\tchr1\t99999999999999\t60\t50M\t*\t0\t0\t" + << std::string(50, 'A') << "\t*\n"; + // Valid record with negative TLEN + sam << "read3\t0\tchr1\t200\t60\t50M\t*\t0\t-150\t" + << std::string(50, 'A') << "\t*\n"; + } + + samtoramntuple(badSam, rntupleFile, false, false, false, 505, 0); + + auto reader = ROOT::RNTupleReader::Open("RAM", rntupleFile); + EXPECT_EQ(reader->GetNEntries(), 1) + << "Only the valid record should be written; corrupted records must be rejected"; + + std::remove(badSam); + std::remove(rntupleFile); +} From 1993695f469d6b2b2c618aedead0287cbc058f23 Mon Sep 17 00:00:00 2001 From: lukedev45 Date: Thu, 19 Mar 2026 08:23:31 +0000 Subject: [PATCH 2/2] fix: use strtol instead of std::stoi in StrictAtoi strtol works directly with the char* tokens from strtok, avoiding the heap allocation from std::string construction and exception overhead on every call. This is more efficient for a hot parsing loop and fits the C-style pattern already used throughout SamParser. --- src/ramcore/SamParser.cxx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/ramcore/SamParser.cxx b/src/ramcore/SamParser.cxx index f6af027..215c628 100644 --- a/src/ramcore/SamParser.cxx +++ b/src/ramcore/SamParser.cxx @@ -1,34 +1,31 @@ #include "ramcore/SamParser.h" -#include +#include +#include #include -#include -#include +#include #include namespace ramcore { static bool StrictAtoi(const char* token, const char* field_name, int& result) { - try { - size_t pos = 0; - result = std::stoi(std::string(token), &pos); - if (pos != std::strlen(token)) { - std::cerr << "SamParser: invalid value '" << token - << "' for field " << field_name - << " (trailing characters)\n"; - return false; - } - return true; - } catch (const std::invalid_argument&) { + char* end = nullptr; + errno = 0; + long val = strtol(token, &end, 10); + + if (end == token || *end != '\0') { std::cerr << "SamParser: invalid value '" << token << "' for field " << field_name << " (not a number)\n"; return false; - } catch (const std::out_of_range&) { + } + if (errno == ERANGE || val < INT_MIN || val > INT_MAX) { std::cerr << "SamParser: value '" << token << "' for field " << field_name << " is out of range\n"; return false; } + result = static_cast(val); + return true; } void StripCRLF(char* str) {