Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,15 @@ docs/html/
docs/latex/
html/

# Reference genome and index files
GRCh38.primary_assembly.genome.fa
GRCh38.primary_assembly.genome.fa.amb
GRCh38.primary_assembly.genome.fa.ann
GRCh38.primary_assembly.genome.fa.bwt
GRCh38.primary_assembly.genome.fa.pac
GRCh38.primary_assembly.genome.fa.sa
*.fastq.gz
*.filt.fastq.gz
sample.sam
--help.ram
README 2.md
73 changes: 52 additions & 21 deletions src/ramcore/SamParser.cxx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ramcore/SamParser.h"
#include <cstring>
#include <cstdlib>
#include <iostream>

namespace ramcore {

Expand All @@ -11,23 +12,41 @@ void StripCRLF(char* str) {
}
}

bool SamParser::ParseFile(const char* filename,
HeaderCallback header_cb,
static bool ParseInt(const char* str, int& out, const char* fieldName, int lineNum) {
if (!str || *str == '\0') {
std::cerr << "[SamParser] Warning: empty value for field '"
<< fieldName << "' at line " << lineNum
<< " — record skipped.\n";
return false;
}
char* end;
long val = std::strtol(str, &end, 10);
if (*end != '\0' && *end != '\t' && *end != '\n' && *end != '\r') {
std::cerr << "[SamParser] Warning: non-numeric value '" << str
<< "' for field '" << fieldName << "' at line " << lineNum
<< " — record skipped.\n";
return false;
}
out = static_cast<int>(val);
return true;
}

bool SamParser::ParseFile(const char* filename,
HeaderCallback header_cb,
RecordCallback record_cb) {
FILE* fp = fopen(filename, "r");
if (!fp) {
return false;
}

lines_processed_ = 0;
records_processed_ = 0;

char line[kMaxLineLength];
SamRecord record;

while (fgets(line, kMaxLineLength, fp)) {
lines_processed_++;

if (line[0] == '@') {
char* tab = strchr(line, '\t');
if (tab) {
Expand All @@ -44,7 +63,7 @@ bool SamParser::ParseFile(const char* filename,
}
continue;
}

record.Clear();
if (ParseLine(line, record)) {
if (record_cb) {
Expand All @@ -53,43 +72,55 @@ bool SamParser::ParseFile(const char* filename,
records_processed_++;
}
}

fclose(fp);
return true;
}

bool SamParser::ParseLine(char* line, SamRecord& record) {
int field_num = 0;
char* token = strtok(line, "\t");
Comment thread
ZaGrayWolf marked this conversation as resolved.

while (token) {
switch (field_num) {
case 0: record.qname = token; break;
case 1: record.flag = atoi(token); break;
case 1:
if (!ParseInt(token, record.flag, "flag", lines_processed_))
return false;
break;
case 2: record.rname = token; break;
case 3: record.pos = atoi(token); break;
case 4: record.mapq = atoi(token); break;
case 3:
if (!ParseInt(token, record.pos, "pos", lines_processed_))
return false;
break;
case 4:
if (!ParseInt(token, record.mapq, "mapq", lines_processed_))
return false;
break;
case 5: record.cigar = token; break;
case 6: record.rnext = token; break;
case 7: record.pnext = atoi(token); break;
case 8: record.tlen = atoi(token); break;
case 7:
if (!ParseInt(token, record.pnext, "pnext", lines_processed_))
return false;
break;
case 8:
if (!ParseInt(token, record.tlen, "tlen", lines_processed_))
return false;
break;
case 9: record.seq = token; break;
case 10:
case 10:
StripCRLF(token);
record.qual = token;
record.qual = token;
break;
default:
StripCRLF(token);
record.optional_fields.push_back(token);
break;
}

field_num++;
token = strtok(nullptr, "\t");
}

return field_num >= 11;
return field_num >= 11;
}

}

} // namespace ramcore
2 changes: 1 addition & 1 deletion src/ramcore/SamToNTuple.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output
writeOptions.SetUseBufferedWrite(true);

auto parallel_writer =
ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions);
ROOT::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions);

const int contexts_per_file = std::min(4, num_threads);
const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file;
Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ function(add_ramcore_test test_name)

if(RAMTOOLS_BUILD_BENCHMARKS)

target_link_libraries(${test_name} benchmark::benchmark)
target_link_libraries(${test_name} PRIVATE benchmark::benchmark)
endif()

target_include_directories(${test_name}
Expand Down
137 changes: 100 additions & 37 deletions test/ramcoretests.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "ramcore/SamToNTuple.h"
#include "rntuple/RAMNTupleRecord.h"
#include "ramcore/SamToTTree.h"
#include "ramcore/SamParser.h"
namespace {

class ramcoreTest : public ::testing::Test {
Expand Down Expand Up @@ -383,83 +384,145 @@ TEST_F(ramcoreTest, SmartIndexRespectsPositionInterval)
std::remove(customSam);
std::remove(rntupleFile);
}

TEST_F(ramcoreTest, MalformedFlagSkipsRecord) {
const char* path = "/tmp/ram_test_flag.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "bad\tabc\tchr1\t100\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fclose(f);
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord&, int) { count++; });
std::remove(path);
EXPECT_EQ(count, 1);
}
TEST_F(ramcoreTest, MalformedPosSkipsRecord) {
const char* path = "/tmp/ram_test_pos.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "bad\t0\tchr1\tXYZ\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fclose(f);
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord&, int) { count++; });
std::remove(path);
EXPECT_EQ(count, 1);
}
TEST_F(ramcoreTest, MalformedMapqSkipsRecord) {
const char* path = "/tmp/ram_test_mapq.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "bad\t0\tchr1\t100\tBAD\t10M\t*\t0\t0\tACGT\tIIII\n");
fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fclose(f);
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord&, int) { count++; });
std::remove(path);
EXPECT_EQ(count, 1);
}
TEST_F(ramcoreTest, MalformedPnextSkipsRecord) {
const char* path = "/tmp/ram_test_pnext.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\tNOP\t0\tACGT\tIIII\n");
fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fclose(f);
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord&, int) { count++; });
std::remove(path);
EXPECT_EQ(count, 1);
}
TEST_F(ramcoreTest, MalformedTlenSkipsRecord) {
const char* path = "/tmp/ram_test_tlen.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "bad\t0\tchr1\t100\t60\t10M\t*\t0\tBAD\tACGT\tIIII\n");
fprintf(f, "good\t0\tchr1\t200\t60\t10M\t*\t0\t0\tACGT\tIIII\n");
fclose(f);
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord&, int) { count++; });
std::remove(path);
EXPECT_EQ(count, 1);
}
TEST_F(ramcoreTest, ValidRecordParsesCorrectly) {
const char* path = "/tmp/ram_test_valid.sam";
FILE* f = fopen(path, "w");
fprintf(f, "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
fprintf(f, "r1\t16\tchr1\t500\t30\t5M\t*\t200\t10\tACGTA\tIIIII\n");
fclose(f);
ramcore::SamRecord captured;
int count = 0;
ramcore::SamParser parser;
parser.ParseFile(path,
[](const std::string&, const std::string&) {},
[&](const ramcore::SamRecord& rec, int) { captured = rec; count++; });
std::remove(path);
ASSERT_EQ(count, 1);
EXPECT_EQ(captured.flag, 16);
EXPECT_EQ(captured.pos, 500);
EXPECT_EQ(captured.mapq, 30);
EXPECT_EQ(captured.pnext, 200);
EXPECT_EQ(captured.tlen, 10);
}
TEST_F(ramcoreTest, QUALEncodingDecodingModes)
{
const char *samFile = "test_qual.sam";
const char *ramFile = "test_qual.root";

std::string seq = "AAAAAAAAAA";
std::string qual = "@@@FBIEDH!";

{
std::ofstream sam(samFile);
sam << "@HD\tVN:1.6\tSO:unsorted\n";
sam << "@SQ\tSN:chr1\tLN:1000\n";
sam << "read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\t" << seq << "\t" << qual << "\n";
}

// No compression of QUAL field, stored as it is
samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505,
/*quality_policy=*/0);

samtoramntuple(samFile, ramFile, true, false, false, 505, 0);
{
auto reader = ROOT::RNTupleReader::Open("RAM", ramFile);
ASSERT_NE(reader, nullptr);

auto view = reader->GetView<RAMNTupleRecord>("record");
const auto &rec = view(0);

std::string decoded = rec.GetQUAL();

// MUST be identical
EXPECT_EQ(decoded, qual) << "QUAL should remain unchanged without compression";
EXPECT_EQ(rec.GetQUAL(), qual) << "QUAL should remain unchanged without compression";
}

std::remove(ramFile);

// Drop the quailty field , should stored as "*"
samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505,
/*quality_policy=*/RAMNTupleRecord::kDrop);

samtoramntuple(samFile, ramFile, true, false, false, 505, RAMNTupleRecord::kDrop);
{
auto reader = ROOT::RNTupleReader::Open("RAM", ramFile);
ASSERT_NE(reader, nullptr);

auto view = reader->GetView<RAMNTupleRecord>("record");
const auto &rec = view(0);

EXPECT_EQ(rec.GetQUAL(), "*") << "QUAL should be dropped";
}

std::remove(ramFile);

// Illumina binning
samtoramntuple(samFile, ramFile, /*index=*/true, /*split=*/false, /*cache=*/false, /*compression_algorithm=*/505,
/*quality_policy=*/RAMNTupleRecord::kIlluminaBinning);

samtoramntuple(samFile, ramFile, true, false, false, 505, RAMNTupleRecord::kIlluminaBinning);
{
auto reader = ROOT::RNTupleReader::Open("RAM", ramFile);
ASSERT_NE(reader, nullptr);

auto view = reader->GetView<RAMNTupleRecord>("record");
const auto &rec = view(0);

std::string decoded = rec.GetQUAL();

// Must NOT be same (lossy compression done during encoding)
EXPECT_NE(decoded, qual) << "Binning must change quality values";

// Length must stay same
EXPECT_EQ(decoded.size(), qual.size());

// Must still be valid printable ASCII
for (char c : decoded) {
EXPECT_GE(c, 33);
EXPECT_LE(c, 126);
}
}

std::remove(samFile);
std::remove(ramFile);
}