diff --git a/fastq_to_ubam.nf b/fastq_to_ubam.nf index 006d7a7..727a817 100644 --- a/fastq_to_ubam.nf +++ b/fastq_to_ubam.nf @@ -6,7 +6,7 @@ read_format = params.read_format ?: 'paired-end' params.outdir = './ubam' process FastqToBamPaired { - conda "bioconda::picard=3.3.0 bioconda::samtools=1.21" + conda "bioconda::samtools=1.23" publishDir "${params.outdir}", mode: 'copy' memory { params.max_memory ?: 300.GB } @@ -18,21 +18,39 @@ process FastqToBamPaired { script: """ - set +o pipefail - - barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10) - - set -o pipefail - - picard FastqToSam TMP_DIR=/state/partition1/sge_tmp F1=${read1} F2=${read2} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true - - samtools reheader -c "sed \\"s/RG/RG\\tBC:\$barcode/\\"" temp.bam > ${library}.bam - rm temp.bam + # Extract most frequent barcode from first 10k reads + # Split on : and take last field, then remove everything after any space + # Filter to only valid barcodes (matching [ACGTN+-]+) + barcode=\$(zcat ${read1} | sed -n '1~4p' | head -n 10000 | awk -F: '{print \$NF}' | awk '{print \$1}' | grep -E '^[ACGTN+-]+\$' | sort | uniq -c | sort -rn | head -n 1 | awk '{print \$2}') + + # Only add BC tag if a valid barcode was found + if [ -n "\${barcode}" ]; then + samtools import -i \ + -r ID:${library} \ + -r SM:${library} \ + -r LB:${library} \ + -r PL:ILLUMINA \ + -r CN:"New England Biolabs" \ + -r BC:\${barcode} \ + -1 ${read1} \ + -2 ${read2} \ + -o ${library}.bam + else + samtools import -i \ + -r ID:${library} \ + -r SM:${library} \ + -r LB:${library} \ + -r PL:ILLUMINA \ + -r CN:"New England Biolabs" \ + -1 ${read1} \ + -2 ${read2} \ + -o ${library}.bam + fi """ } process FastqToBamSingle { - conda "bioconda::picard=3.3.0 bioconda::samtools=1.21" + conda "bioconda::samtools=1.23" publishDir "${params.outdir}", mode: 'copy' memory { params.max_memory ?: 300.GB } @@ -44,16 +62,32 @@ process FastqToBamSingle { script: """ - set +o pipefail - - barcode=\$(zcat ${read1} | head -n 1 | cut -d ":" -f 10) - - set -o pipefail - - picard FastqToSam F1=${read1} OUTPUT=temp.bam SM=${library} LB=${library} CN="New England Biolabs" PU=Illumina QUIET=true - - samtools reheader -c "sed \\"s/RG/RG\\tBC:\$barcode/\\"" temp.bam > ${library}.bam - rm temp.bam + # Extract most frequent barcode from first 10k reads + # Split on : and take last field, then remove everything after any space + # Filter to only valid barcodes (matching [ACGTN+-]+) + barcode=\$(zcat ${read1} | sed -n '1~4p' | head -n 10000 | awk -F: '{print \$NF}' | awk '{print \$1}' | grep -E '^[ACGTN+-]+\$' | sort | uniq -c | sort -rn | head -n 1 | awk '{print \$2}') + + # Only add BC tag if a valid barcode was found + if [ -n "\${barcode}" ]; then + samtools import -i \ + -r ID:${library} \ + -r SM:${library} \ + -r LB:${library} \ + -r PL:ILLUMINA \ + -r CN:"New England Biolabs" \ + -r BC:\${barcode} \ + -0 ${read1} \ + -o ${library}.bam + else + samtools import -i \ + -r ID:${library} \ + -r SM:${library} \ + -r LB:${library} \ + -r PL:ILLUMINA \ + -r CN:"New England Biolabs" \ + -0 ${read1} \ + -o ${library}.bam + fi """ } diff --git a/tests/fastq_to_ubam.nf.test b/tests/fastq_to_ubam.nf.test index 1f77994..2a30ccb 100644 --- a/tests/fastq_to_ubam.nf.test +++ b/tests/fastq_to_ubam.nf.test @@ -6,7 +6,7 @@ nextflow_pipeline { test("fastq to uBam workflow - paired-end") { when { - params.input_glob = "$projectDir/tests/fixtures/fastq/emseq-test*{.ds.1,.ds.2}.fastq.gz" + params.input_glob = "$projectDir/tests/fixtures/fastq/emseq-test1{.ds.1,.ds.2}.fastq.gz" } then { @@ -24,7 +24,7 @@ nextflow_pipeline { test("fastq to uBam workflow - single-end") { when { - params.input_glob = "$projectDir/tests/fixtures/fastq/emseq-test*.ds.1.fastq.gz" + params.input_glob = "$projectDir/tests/fixtures/fastq/emseq-test1.ds.1.fastq.gz" params.read_format = 'single-end' } @@ -40,4 +40,52 @@ nextflow_pipeline { ) } } + + test("fastq to uBam workflow - paired-end with non-standard header (SRR format)") { + when { + params.input_glob = "$projectDir/tests/fixtures/fastq/test-srr-format.{1,2}.fastq.gz" + } + + then { + assertAll( + { assert workflow.success }, + { assert path("${launchDir}/ubam/test-srr-format.bam").exists() } + // Note: SRR format doesn't have valid barcodes (numbers don't match [ACGTN+-]+) + // BAM should NOT contain BC tag - can be manually verified with: + // samtools view -H ubam/test-srr-format.bam | grep BC + ) + } + } + + test("fastq to uBam workflow - single-end with non-standard header (SRR format)") { + when { + params.input_glob = "$projectDir/tests/fixtures/fastq/test-srr-format.1.fastq.gz" + params.read_format = 'single-end' + } + + then { + assertAll( + { assert workflow.success }, + { assert path("${launchDir}/ubam/test-srr-format.1.bam").exists() } + // Note: SRR format doesn't have valid barcodes (numbers don't match [ACGTN+-]+) + // BAM should NOT contain BC tag - can be manually verified with: + // samtools view -H ubam/test-srr-format.1.bam | grep BC + ) + } + } + + test("fastq to uBam workflow - validates standard barcode extraction") { + when { + params.input_glob = "$projectDir/tests/fixtures/fastq/emseq-test1{.ds.1,.ds.2}.fastq.gz" + } + + then { + assertAll( + { assert workflow.success }, + { assert path("${launchDir}/ubam/emseq-test1.bam").exists() } + // Note: Standard EM-seq format should have BC:GCTTCACAAT+TAGCTTTAAC in read group + // Can be manually verified with: samtools view -H ubam/emseq-test1.bam | grep BC + ) + } + } } diff --git a/tests/fixtures/fastq/test-srr-format.1.fastq.gz b/tests/fixtures/fastq/test-srr-format.1.fastq.gz new file mode 100644 index 0000000..3b78669 Binary files /dev/null and b/tests/fixtures/fastq/test-srr-format.1.fastq.gz differ diff --git a/tests/fixtures/fastq/test-srr-format.2.fastq.gz b/tests/fixtures/fastq/test-srr-format.2.fastq.gz new file mode 100644 index 0000000..6d023a3 Binary files /dev/null and b/tests/fixtures/fastq/test-srr-format.2.fastq.gz differ