Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions fastq_to_ubam.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,41 @@ input_glob = params.input_glob ?: ['*.{1,2}.fastq.gz']
read_format = params.read_format ?: 'paired-end'
params.outdir = './ubam'

// Shell snippet interpolated into the script of both FastqToBam processes.
// It defines `extract_barcode <fastq.gz>`, which samples the first 10k reads
// (40,000 lines) of a gzipped FASTQ and prints the most frequent valid barcode
// found in the read headers, or `unknown` when no header yields a valid one.
//
// The snippet lives in a single-quoted Groovy string so that every `$...`
// reaches bash untouched. The trailing backslashes are consumed by Groovy as
// line continuations, so the pipeline arrives in bash as one logical line.
def extractBarcodeFunction = '''
extract_barcode() {
    local fastq_file="$1"
    # Declared local so the function never overwrites a variable of the caller
    local barcode

    # Header lines are every 4th line starting at line 1. For each header, drop
    # everything up to the last whitespace (leaving the comment, or the whole
    # header when there is no comment) and take its last colon-separated field
    # as the barcode candidate.
    # A candidate is valid only when it consists of one or more runs of
    # A/C/G/T/N joined by a single + or -; separator-only or dangling-separator
    # values (for example a lone + used as a strand marker) are rejected.
    # The surviving candidates are counted and the most frequent one is kept.
    barcode=$(zcat "$fastq_file" \
        | head -n 40000 \
        | awk 'NR % 4 == 1 {sub(/.*[[:space:]]/, ""); n=split($0,a,":"); print a[n]}' \
        | grep -E '^[ACGTN]+([+-][ACGTN]+)*$' \
        | sort | uniq -c | sort -rn | head -1 | awk '{print $2}')

    # Fall back to "unknown" if no valid barcode was found
    echo "${barcode:-unknown}"
}
'''

process FastqToBamPaired {
conda "bioconda::picard=3.3.0 bioconda::samtools=1.21"
publishDir "${params.outdir}", mode: 'copy'
memory { params.max_memory ?: 300.GB }

input:
tuple val(library), path(read1), path(read2)

output:
path('*.bam')

script:
"""
set +o pipefail

barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10)

${extractBarcodeFunction}
barcode=\$(extract_barcode "${read1}")
set -o pipefail

picard FastqToSam TMP_DIR=/state/partition1/sge_tmp F1=${read1} F2=${read2} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true
Expand All @@ -35,19 +53,18 @@ process FastqToBamSingle {
conda "bioconda::picard=3.3.0 bioconda::samtools=1.21"
publishDir "${params.outdir}", mode: 'copy'
memory { params.max_memory ?: 300.GB }

input:
tuple val(library), path(read1)

output:
path('*.bam')

script:
"""
set +o pipefail

barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10)

${extractBarcodeFunction}
barcode=\$(extract_barcode "${read1}")
set -o pipefail

picard FastqToSam F1=${read1} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true
Expand Down
25 changes: 25 additions & 0 deletions tests/fastq_to_ubam.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,29 @@ nextflow_pipeline {
)
}
}

test("fastq to uBam workflow - non-standard SRA headers (issue #15)") {
    // FASTQ files with SRA-style headers (e.g. "@SRR... length=111") carry no
    // barcode in the header. The workflow must still succeed and record the
    // fallback barcode "unknown" in the read group instead of a bogus value.
    when {
        params.input_glob = "$projectDir/tests/fixtures/fastq_sra/sra-test{.1,.2}.fastq.gz"
    }

    then {
        def ubamPath = "${launchDir}/ubam/sra-test.bam"
        def ubam = bam(ubamPath).getStatistics()
        // Locate the read-group line of the BAM header; it carries the BC (barcode) field
        def bamHeader = bam(ubamPath).getHeader()
        def rgLine = bamHeader.find { it.startsWith('@RG') }

        assertAll(
            { assert workflow.success },
            // Fail with a readable message (not a NullPointerException) when no @RG line exists
            { assert rgLine != null : "No @RG line found in BAM header: ${bamHeader}" },
            // The barcode must be the 'unknown' fallback for headers without a barcode
            { assert rgLine?.contains('BC:unknown') : "Expected BC:unknown for non-standard headers, got: ${rgLine}" },
            { assert snapshot(workflow.trace,
                              ["ubam", ubam]
                     ).match()
            }
        )
    }
}
}
26 changes: 26 additions & 0 deletions tests/fastq_to_ubam.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,32 @@
},
"timestamp": "2025-08-20T09:23:01.410361919"
},
"fastq to uBam workflow - non-standard SRA headers (issue #15)": {
"content": [
{
"tasksFailed": 0,
"tasksCount": 1,
"tasksSucceeded": 1
},
[
"ubam",
{
"maxReadLength": 150,
"minReadLength": 150,
"meanReadLength": 150,
"maxQuality": 0,
"minQuality": 0,
"meanQuality": 0,
"readCount": 200
}
]
],
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.2"
},
"timestamp": "2026-01-25T16:55:47.816983"
},
"fastq to uBam workflow - paired-end": {
"content": [
{
Expand Down
Binary file added tests/fixtures/fastq_sra/sra-test.1.fastq.gz
Binary file not shown.
Binary file added tests/fixtures/fastq_sra/sra-test.2.fastq.gz
Binary file not shown.