diff --git a/fastq_to_ubam.nf b/fastq_to_ubam.nf
index 006d7a7..b6d8cdd 100644
--- a/fastq_to_ubam.nf
+++ b/fastq_to_ubam.nf
@@ -5,23 +5,48 @@ input_glob = params.input_glob ?: ['*.{1,2}.fastq.gz']
 read_format = params.read_format ?: 'paired-end'
 params.outdir = './ubam'
 
+// Shared shell function to extract and validate the barcode from FASTQ headers.
+// Samples the first 10k reads and prints the most frequent valid barcode, or
+// "unknown" when none is found (e.g. SRA-style "@SRR... length=111" headers).
+// Defined as a single-quoted Groovy string so the shell/awk "$" references are
+// passed through literally when it is injected into a process script block.
+def extractBarcodeFunction = '''
+extract_barcode() {
+    local fastq_file="$1"
+    # Keep the working variable local: callers use the name "barcode" as well
+    local barcode
+
+    # From the first 10k read headers, drop trailing whitespace (e.g. the CR of CRLF
+    # files), keep the text after the last whitespace and take its last colon-field.
+    # Filter to valid barcodes (A/C/G/T/N with "+" or "-" index separators),
+    # count occurrences, return most frequent
+    barcode=$(zcat "$fastq_file" \
+        | head -n 40000 \
+        | awk 'NR % 4 == 1 {sub(/[[:space:]]+$/, ""); sub(/.*[[:space:]]/, ""); n=split($0,a,":"); print a[n]}' \
+        | grep -E '^[ACGTN+-]+$' \
+        | sort | uniq -c | sort -rn | head -1 | awk '{print $2}')
+
+    # Fallback to unknown if no valid barcode found
+    echo "${barcode:-unknown}"
+}
+'''
+
 process FastqToBamPaired {
     conda "bioconda::picard=3.3.0 bioconda::samtools=1.21"
     publishDir "${params.outdir}", mode: 'copy'
     memory { params.max_memory ?: 300.GB }
-    
+
     input:
     tuple val(library), path(read1), path(read2)
-    
+
     output:
     path('*.bam')
 
     script:
     """
     set +o pipefail
-
-    barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10)
-
+    ${extractBarcodeFunction}
+    barcode=\$(extract_barcode "${read1}")
     set -o pipefail
 
     picard FastqToSam TMP_DIR=/state/partition1/sge_tmp F1=${read1} F2=${read2} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true
@@ -35,19 +60,18 @@ process FastqToBamSingle {
     conda "bioconda::picard=3.3.0 bioconda::samtools=1.21"
     publishDir "${params.outdir}", mode: 'copy'
     memory { params.max_memory ?: 300.GB }
-    
+
     input:
     tuple val(library), path(read1)
-    
+
     output:
     path('*.bam')
 
     script:
     """
     set +o pipefail
-
-    barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10)
-
+    ${extractBarcodeFunction}
+    barcode=\$(extract_barcode "${read1}")
     set -o pipefail
 
     picard FastqToSam F1=${read1} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true
diff --git a/tests/fastq_to_ubam.nf.test b/tests/fastq_to_ubam.nf.test
index 1f77994..58f0a3e 100644
--- a/tests/fastq_to_ubam.nf.test
+++ b/tests/fastq_to_ubam.nf.test
@@ -40,4 +40,31 @@ nextflow_pipeline {
             )
         }
     }
+
+    test("fastq to uBam workflow - non-standard SRA headers (issue #15)") {
+        // Test that FASTQ files with SRA-style headers (e.g., @SRR... length=111)
+        // are handled gracefully without breaking due to spaces in extracted barcode
+        when {
+            params.input_glob = "$projectDir/tests/fixtures/fastq_sra/sra-test{.1,.2}.fastq.gz"
+        }
+
+        then {
+            def ubam = bam("${launchDir}/ubam/sra-test.bam").getStatistics()
+            // Read the BAM header to verify barcode is set to 'unknown' (not containing spaces)
+            def bamHeader = bam("${launchDir}/ubam/sra-test.bam").getHeader()
+            def rgLine = bamHeader.find { it.startsWith('@RG') }
+
+            assertAll(
+                { assert workflow.success },
+                // Verify the barcode field doesn't contain spaces (which would break commands).
+                // Safe navigation: a missing @RG line fails this assertion with the message
+                // below instead of throwing a NullPointerException.
+                { assert rgLine?.contains('BC:unknown') : "Expected BC:unknown for non-standard headers, got: ${rgLine}" },
+                { assert snapshot(workflow.trace,
+                    ["ubam", ubam],
+                    ).match()
+                }
+            )
+        }
+    }
 }
diff --git a/tests/fastq_to_ubam.nf.test.snap b/tests/fastq_to_ubam.nf.test.snap
index f79aa9f..2460220 100644
--- a/tests/fastq_to_ubam.nf.test.snap
+++ b/tests/fastq_to_ubam.nf.test.snap
@@ -25,6 +25,32 @@
         },
         "timestamp": "2025-08-20T09:23:01.410361919"
     },
+    "fastq to uBam workflow - non-standard SRA headers (issue #15)": {
+        "content": [
+            {
+                "tasksFailed": 0,
+                "tasksCount": 1,
+                "tasksSucceeded": 1
+            },
+            [
+                "ubam",
+                {
+                    "maxReadLength": 150,
+                    "minReadLength": 150,
+                    "meanReadLength": 150,
+                    "maxQuality": 0,
+                    "minQuality": 0,
+                    "meanQuality": 0,
+                    "readCount": 200
+                }
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.2"
+        },
+        "timestamp": "2026-01-25T16:55:47.816983"
+    },
     "fastq to uBam workflow - paired-end": {
         "content": [
             {
diff --git a/tests/fixtures/fastq_sra/sra-test.1.fastq.gz b/tests/fixtures/fastq_sra/sra-test.1.fastq.gz
new file mode 100644
index 0000000..c6654d9
Binary files /dev/null and b/tests/fixtures/fastq_sra/sra-test.1.fastq.gz differ
diff --git a/tests/fixtures/fastq_sra/sra-test.2.fastq.gz b/tests/fixtures/fastq_sra/sra-test.2.fastq.gz
new file mode 100644
index 0000000..f94cfd7
Binary files /dev/null and b/tests/fixtures/fastq_sra/sra-test.2.fastq.gz differ