From 9e9957d95ee6082fcae7ab8c8c274b47619c3d76 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 16 Oct 2025 07:33:21 -0700 Subject: [PATCH 1/2] Get alignment fully working --- .gitignore | 4 + containers/Makefile | 63 +++ containers/bwa/align.def | 46 +++ containers/bwa/bwa-aggregate-base.def | 116 ++++++ containers/bwa/density_files.def | 67 ++++ containers/bwa/insert_size.def | 58 +++ containers/bwa/macs2.def | 53 +++ containers/bwa/mark_duplicates.def | 50 +++ containers/bwa/pysam-samtools.def | 60 +++ containers/bwa/samtools-1.12.def | 58 +++ containers/bwa/spot_score.def | 83 ++++ containers/resolved_modules.txt | 53 +++ nextflow.config | 20 +- processes/bwa/nextflow.config | 51 +++ processes/bwa/process_bwa_paired_trimmed.nf | 400 ++++++++++++-------- 15 files changed, 1014 insertions(+), 168 deletions(-) create mode 100644 containers/Makefile create mode 100644 containers/bwa/align.def create mode 100644 containers/bwa/bwa-aggregate-base.def create mode 100644 containers/bwa/density_files.def create mode 100644 containers/bwa/insert_size.def create mode 100644 containers/bwa/macs2.def create mode 100644 containers/bwa/mark_duplicates.def create mode 100644 containers/bwa/pysam-samtools.def create mode 100644 containers/bwa/samtools-1.12.def create mode 100644 containers/bwa/spot_score.def create mode 100644 containers/resolved_modules.txt diff --git a/.gitignore b/.gitignore index ad9f65f9..7e9b29fd 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ environments/ work genome_build tmp + +*.sif +containers/module_files/* +programs/* diff --git a/containers/Makefile b/containers/Makefile new file mode 100644 index 00000000..1e7b4b88 --- /dev/null +++ b/containers/Makefile @@ -0,0 +1,63 @@ +# Makefile for building BWA process containers +# +# Usage: +# make all - Build all containers +# make align - Build just the align container +# make clean - Remove all .sif files +# make list - List all available targets + +# Container definitions and their corresponding .sif files +CONTAINERS = align samtools-1.12 pysam-samtools mark_duplicates insert_size spot_score density_files macs2 bwa-aggregate-base + +# Directory where containers are stored +CONTAINER_DIR = bwa + +# Default target +.PHONY: all +all: $(addprefix $(CONTAINER_DIR)/, $(addsuffix .sif, $(CONTAINERS))) + +# Pattern rule for building .sif files from .def files +$(CONTAINER_DIR)/%.sif: $(CONTAINER_DIR)/%.def + @echo "Building container: $@" + apptainer build --force "$@" "$<" + @echo "Successfully built: $@" + +# Individual container targets for convenience +.PHONY: align samtools-1.12 pysam-samtools mark_duplicates insert_size spot_score density_files macs2 bwa-aggregate-base +align: $(CONTAINER_DIR)/align.sif +samtools-1.12: $(CONTAINER_DIR)/samtools-1.12.sif +pysam-samtools: $(CONTAINER_DIR)/pysam-samtools.sif +mark_duplicates: $(CONTAINER_DIR)/mark_duplicates.sif +insert_size: $(CONTAINER_DIR)/insert_size.sif +density_files: $(CONTAINER_DIR)/density_files.sif +spot_score: $(CONTAINER_DIR)/spot_score.sif +macs2: $(CONTAINER_DIR)/macs2.sif +bwa-aggregate-base: $(CONTAINER_DIR)/bwa-aggregate-base.sif + +# Clean target to remove all .sif files +.PHONY: clean +clean: + @echo "Removing all .sif files..." + rm -f $(CONTAINER_DIR)/*.sif + @echo "Clean complete." + +# List available targets +.PHONY: list +list: + @echo "Available container targets:" + @echo " all - Build all containers" + @echo " align - Build align container" + @echo " samtools-1.12 - Build samtools-1.12 container" + @echo " pysam-samtools - Build pysam-samtools container" + @echo " mark_duplicates - Build mark_duplicates container" + @echo " insert_size - Build insert_size container" + @echo " density_files - Build density_files container" + @echo " spot_score - Build spot_score container" + @echo " macs2 - Build macs2 container" + @echo " bwa-aggregate-base - Build bwa-aggregate-base container" + @echo " clean - Remove all .sif files" + @echo " list - Show this help" + +# Help target +.PHONY: help +help: list diff --git a/containers/bwa/align.def b/containers/bwa/align.def new file mode 100644 index 00000000..ab59962a --- /dev/null +++ b/containers/bwa/align.def @@ -0,0 +1,46 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load bwa/0.7.12 samtools/1.3 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for align process + module_files/modulefiles/bwa /etc/modulefiles/bwa + module_files/modulefiles/samtools /etc/modulefiles/samtools + + # Copy only the software needed for align process + module_files/sw/bwa /opt/sw/bwa + module_files/sw/samtools /opt/sw/samtools + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "BWA Align container - modules: bwa/0.7.12, samtools/1.3" + echo "You can use 'module avail' and 'module load' commands." + exec /bin/bash + +%test + # Test basic functionality + bwa 2>&1 | grep -q "Usage" + samtools --version | grep -q "samtools 1.3" diff --git a/containers/bwa/bwa-aggregate-base.def b/containers/bwa/bwa-aggregate-base.def new file mode 100644 index 00000000..f9b323a0 --- /dev/null +++ b/containers/bwa/bwa-aggregate-base.def @@ -0,0 +1,116 @@ +Bootstrap: docker +From: ubuntu:18.04 # We use 18.04 to get libgfortan3 + +%post -c /bin/bash + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 libgfortran3 tcl + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load openssl-dev/1.0.1t libpng/1.6.21 mysql-connector/6.1.6 htslib/1.3.1 bedops/2.4.35-typical samtools/1.3 modwt/1.0 kentutil/302 hotspot2/2.1.1 jdk/1.8.0_92 gcc/4.7.2 R/3.2.5 picard/2.8.1 git/2.3.3 coreutils/8.25 bedtools/2.25.0 python/3.5.1 pysam/0.9.0 htslib/1.6.0 numpy/1.11.0 atlas-lapack/3.10.2 scipy/1.0.0 scikit-learn/0.18.1 preseq/2.0.3 gsl/2.4 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/export \1="\2"/' | sed 's/&/\\&/g' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy all the modulefiles needed for this container + module_files/modulefiles/openssl-dev/1.0.1t /etc/modulefiles/openssl-dev/1.0.1t + module_files/modulefiles/libpng/1.6.21 /etc/modulefiles/libpng/1.6.21 + module_files/modulefiles/mysql-connector/6.1.6 /etc/modulefiles/mysql-connector/6.1.6 + module_files/modulefiles/htslib/1.3.1 /etc/modulefiles/htslib/1.3.1 + module_files/modulefiles/htslib/1.6.0 /etc/modulefiles/htslib/1.6.0 + module_files/modulefiles/bedops/2.4.35-typical /etc/modulefiles/bedops/2.4.35-typical + module_files/modulefiles/samtools/1.3 /etc/modulefiles/samtools/1.3 + module_files/modulefiles/modwt/1.0 /etc/modulefiles/modwt/1.0 + module_files/modulefiles/kentutil/302 /etc/modulefiles/kentutil/302 + module_files/modulefiles/hotspot2/2.1.1 /etc/modulefiles/hotspot2/2.1.1 + module_files/modulefiles/jdk/1.8.0_92 /etc/modulefiles/jdk/1.8.0_92 + module_files/modulefiles/gcc/4.7.2 /etc/modulefiles/gcc/4.7.2 + module_files/modulefiles/R/3.2.5 /etc/modulefiles/R/3.2.5 + module_files/modulefiles/picard/2.8.1 /etc/modulefiles/picard/2.8.1 + module_files/modulefiles/git/2.3.3 /etc/modulefiles/git/2.3.3 + module_files/modulefiles/coreutils/8.25 /etc/modulefiles/coreutils/8.25 + module_files/modulefiles/bedtools/2.25.0 /etc/modulefiles/bedtools/2.25.0 + module_files/modulefiles/python/3.5.1 /etc/modulefiles/python/3.5.1 + module_files/modulefiles/pysam/0.9.0 /etc/modulefiles/pysam/0.9.0 + module_files/modulefiles/numpy/1.11.0 /etc/modulefiles/numpy/1.11.0 + module_files/modulefiles/atlas-lapack/3.10.2 /etc/modulefiles/atlas-lapack/3.10.2 + module_files/modulefiles/scipy/1.0.0 /etc/modulefiles/scipy/1.0.0 + module_files/modulefiles/scikit-learn/0.18.1 /etc/modulefiles/scikit-learn/0.18.1 + module_files/modulefiles/preseq/2.0.3 /etc/modulefiles/preseq/2.0.3 + module_files/modulefiles/gsl/2.4 /etc/modulefiles/gsl/2.4 + module_files/modulefiles/zlib/1.2.8 /etc/modulefiles/zlib/1.2.8 + module_files/modulefiles/libssh2/1.7.0 /etc/modulefiles/libssh2/1.7.0 + module_files/modulefiles/curl/7.49.1 /etc/modulefiles/curl/7.49.1 + + # Copy all the software needed for this container + module_files/sw/openssl-dev/1.0.1t /opt/sw/openssl-dev/1.0.1t + module_files/sw/libpng/1.6.21 /opt/sw/libpng/1.6.21 + module_files/sw/mysql-connector/6.1.6 /opt/sw/mysql-connector/6.1.6 + module_files/sw/htslib/1.3.1 /opt/sw/htslib/1.3.1 + module_files/sw/htslib/1.6.0 /opt/sw/htslib/1.6.0 + module_files/sw/bedops/2.4.35-typical /opt/sw/bedops/2.4.35-typical + module_files/sw/samtools/1.3 /opt/sw/samtools/1.3 + module_files/sw/modwt/1.0 /opt/sw/modwt/1.0 + module_files/sw/kentutil/302 /opt/sw/kentutil/302 + module_files/sw/hotspot2/2.1.1 /opt/sw/hotspot2/2.1.1 + module_files/sw/jdk/1.8.0_92 /opt/sw/jdk/1.8.0_92 + module_files/sw/gcc/4.7.2 /opt/sw/gcc/4.7.2 + module_files/sw/R/3.2.5 /opt/sw/R/3.2.5 + module_files/sw/picard/2.8.1 /opt/sw/picard/2.8.1 + module_files/sw/git/2.3.3 /opt/sw/git/2.3.3 + module_files/sw/coreutils/8.25 /opt/sw/coreutils/8.25 + module_files/sw/bedtools/2.25.0 /opt/sw/bedtools/2.25.0 + module_files/sw/python/3.5.1 /opt/sw/python/3.5.1 + module_files/sw/pysam/0.9.0 /opt/sw/pysam/0.9.0 + module_files/sw/numpy/1.11.0 /opt/sw/numpy/1.11.0 + module_files/sw/atlas-lapack/3.10.2 /opt/sw/atlas-lapack/3.10.2 + module_files/sw/scipy/1.0.0 /opt/sw/scipy/1.0.0 + module_files/sw/scikit-learn/0.18.1 /opt/sw/scikit-learn/0.18.1 + module_files/sw/preseq/2.0.3 /opt/sw/preseq/2.0.3 + module_files/sw/gsl/2.4 /opt/sw/gsl/2.4 + module_files/sw/zlib/1.2.8 /opt/sw/zlib/1.2.8 + module_files/sw/libssh2/1.7.0 /opt/sw/libssh2/1.7.0 + module_files/sw/curl/7.49.1 /opt/sw/curl/7.49.1 + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "bwa-aggregate-base container" + exec /bin/bash + +%test + # Test basic functionality + samtools --version | grep -q "samtools" + python3.5 --version | grep -q "Python 3.5" + R --version | grep -q "R version" + java -version 2>&1 | grep -q "java version" + gcc --version | grep -q "gcc" + git --version | grep -q "git version" + bedtools --version | grep -q "bedtools" + preseq 2>&1 | grep -q "Usage" + + # Test Python scientific stack + python3.5 -c "import pysam; print('pysam import successful')" + python3.5 -c "import numpy; print('numpy import successful')" + python3.5 -c "import scipy; print('scipy import successful')" + python3.5 -c "import sklearn; print('scikit-learn import successful')" + + # Test bedops tools + which convert2bed + which bedops + + # Test that picard is available + picard 2>&1 | grep -q MarkDuplicates \ No newline at end of file diff --git a/containers/bwa/density_files.def b/containers/bwa/density_files.def new file mode 100644 index 00000000..8581613a --- /dev/null +++ b/containers/bwa/density_files.def @@ -0,0 +1,67 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + set -e + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 libcurl4 + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load openssl-dev/1.0.1t libpng/1.6.21 git/2.3.3 mysql-connector/6.1.6 bedops/2.4.35-typical samtools/1.3 htslib/1.6.0 kentutil/302 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/\1="\2"/' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/openssl-dev /etc/modulefiles/openssl-dev + module_files/modulefiles/libpng /etc/modulefiles/libpng + module_files/modulefiles/git /etc/modulefiles/git + module_files/modulefiles/mysql-connector /etc/modulefiles/mysql-connector + module_files/modulefiles/bedops /etc/modulefiles/bedops + module_files/modulefiles/samtools /etc/modulefiles/samtools + module_files/modulefiles/htslib /etc/modulefiles/htslib + module_files/modulefiles/kentutil /etc/modulefiles/kentutil + + # Copy only the software needed for this container + module_files/sw/openssl-dev /opt/sw/openssl-dev + module_files/sw/libpng /opt/sw/libpng + module_files/sw/git /opt/sw/git + module_files/sw/mysql-connector /opt/sw/mysql-connector + module_files/sw/bedops /opt/sw/bedops + module_files/sw/samtools /opt/sw/samtools + module_files/sw/htslib /opt/sw/htslib + module_files/sw/kentutil /opt/sw/kentutil + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "density_files container - modules: bedops/2.4.35-typical, samtools/1.3, htslib/1.6.0, kentutil/302" + exec /bin/bash + +%test + # Test basic functionality + samtools --version | grep -q "samtools" + git --version | grep -q "git version" + bgzip --version | grep -q "bgzip" + + # Test bedops tools + convert2bed --version + bedops --version + + # Test kentutil tools (common ones) + bigWigToBedGraph 2>&1 | grep -q usage + bedGraphToBigWig 2>&1 | grep -q usage diff --git a/containers/bwa/insert_size.def b/containers/bwa/insert_size.def new file mode 100644 index 00000000..dff7dc72 --- /dev/null +++ b/containers/bwa/insert_size.def @@ -0,0 +1,58 @@ +Bootstrap: docker +From: ubuntu:16.04 # We use 16.04 to get libgfortran3 and readline4 + +%post -c /bin/bash + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 libgomp1 libgfortran3 tcl libreadline-dev + #curl http://archive.ubuntu.com/ubuntu/pool/main/r/readline6/libreadline6_6.3-8ubuntu2_amd64.deb -O libreadline6_6.3-8ubuntu2_amd64.deb + #dpkg -i libreadline6_6.3-8ubuntu2_amd64.deb + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles /opt/sw -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load jdk/1.8.0_92 picard/2.8.1 samtools/1.3 R/3.2.5 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/\1="\2"/' >> "$APPTAINER_ENVIRONMENT" + + # fix libreadline dependency for R + #ln -s /usr/lib/x86_64-linux-gnu/libreadline.so.7 /usr/lib/x86_64-linux-gnu/libreadline.so.6 + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/jdk /etc/modulefiles/jdk + module_files/modulefiles/picard /etc/modulefiles/picard + module_files/modulefiles/samtools /etc/modulefiles/samtools + module_files/modulefiles/R /etc/modulefiles/R + + # Copy only the software needed for this container + module_files/sw/jdk /opt/sw/jdk + module_files/sw/picard /opt/sw/picard + module_files/sw/samtools/1.3 /opt/sw/samtools/1.3 + module_files/sw/R/3.2.5 /opt/sw/R/3.2.5 + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "insert_size container - modules: jdk/1.8.0_92, picard/2.8.1, samtools/1.3, R/3.2.5" + exec /bin/bash + +%test + # Test basic functionality + java -version 2>&1 | grep -q "java version" + samtools --version | grep -q "samtools" + R --version | grep -q "R version" + Rscript --version 2>&1 | grep -q "R scripting" + + # Test that picard is available + picard CollectInsertSizeMetrics --version 2>&1 | grep -q 2.8.1 diff --git a/containers/bwa/macs2.def b/containers/bwa/macs2.def new file mode 100644 index 00000000..acacdeb6 --- /dev/null +++ b/containers/bwa/macs2.def @@ -0,0 +1,53 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + set -e + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + export MODULEPATH=/etc/modulefiles + source /etc/profile.d/modules.sh + env > /tmp/env_before_modules.txt + + # save current ENV to file + module load openssl-dev/1.0.1t python/2.7.11 numpy MACS + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e "s/\([^=]*\)=\(.*\)/export \1='\2'/" >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/openssl-dev /etc/modulefiles/openssl-dev + module_files/modulefiles/python /etc/modulefiles/python + module_files/modulefiles/numpy /etc/modulefiles/numpy + module_files/modulefiles/MACS /etc/modulefiles/MACS + + # Copy only the software needed for this container + module_files/sw/openssl-dev /opt/sw/openssl-dev + module_files/sw/python /opt/sw/python + module_files/sw/numpy /opt/sw/numpy + module_files/sw/MACS /opt/sw/MACS + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "macs2 container - modules: python/2.7.11, numpy, MACS" + exec /bin/bash + +%test + # Test basic functionality + python2.7 --version 2>&1 | grep -q "Python 2.7" + macs2 --version 2>&1 | grep -q "macs2" + + # Test Python imports + python2.7 -c "import numpy; print('numpy import successful')" \ No newline at end of file diff --git a/containers/bwa/mark_duplicates.def b/containers/bwa/mark_duplicates.def new file mode 100644 index 00000000..e74a7d54 --- /dev/null +++ b/containers/bwa/mark_duplicates.def @@ -0,0 +1,50 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load jdk/1.8.0_92 picard/2.8.1 samtools/1.3 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/export \1="\2"/' | sed 's/&/\\&/g' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/jdk/1.8.0_92 /etc/modulefiles/jdk/1.8.0_92 + module_files/modulefiles/picard/2.8.1 /etc/modulefiles/picard/2.8.1 + module_files/modulefiles/samtools/1.3 /etc/modulefiles/samtools/1.3 + + # Copy only the software needed for this container + module_files/sw/jdk/1.8.0_92 /opt/sw/jdk/1.8.0_92 + module_files/sw/picard/2.8.1 /opt/sw/picard/2.8.1 + module_files/sw/samtools/1.3 /opt/sw/samtools/1.3 + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "mark_duplicates container - modules: jdk/1.8.0_92, picard/2.8.1, samtools/1.3" + exec /bin/bash + +%test + # Test basic functionality + java -version 2>&1 | grep -q "java version" + samtools --version | grep -q "samtools" + + # Test that picard is available + picard MarkDuplicates --version 2>&1 | grep -q 2.8.1 \ No newline at end of file diff --git a/containers/bwa/pysam-samtools.def b/containers/bwa/pysam-samtools.def new file mode 100644 index 00000000..53dd3463 --- /dev/null +++ b/containers/bwa/pysam-samtools.def @@ -0,0 +1,60 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + set -e + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load htslib/1.3.1 samtools/1.3 python/3.5.1 pysam/0.9.0 + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/export \1="\2"/' | sed 's/&/\\&/g' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/samtools/1.3 /etc/modulefiles/samtools/1.3 + module_files/modulefiles/python /etc/modulefiles/python + module_files/modulefiles/pysam /etc/modulefiles/pysam + module_files/modulefiles/htslib /etc/modulefiles/htslib + module_files/modulefiles/openssl-dev /etc/modulefiles/openssl-dev + module_files/modulefiles/libssh2 /etc/modulefiles/libssh2 + module_files/modulefiles/curl /etc/modulefiles/curl + module_files/modulefiles/zlib /etc/modulefiles/zlib + + # Copy only the software needed for this container + module_files/sw/samtools/1.3 /opt/sw/samtools/1.3 + module_files/sw/python/3.5.1 /opt/sw/python/3.5.1 + module_files/sw/pysam/0.9.0 /opt/sw/pysam/0.9.0 + module_files/sw/htslib/1.3.1 /opt/sw/htslib/1.3.1 + module_files/sw/openssl-dev/1.0.1t /opt/sw/openssl-dev/1.0.1t + module_files/sw/libssh2 /opt/sw/libssh2 + module_files/sw/curl /opt/sw/curl + module_files/sw/zlib /opt/sw/zlib + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "pysam-samtools container" + exec /bin/bash + +%test + # Test basic functionality + samtools --version | grep -q "samtools" + python3.5 --version | grep -q "Python 3.5" + + # Test Python imports + python3.5 -c "import pysam; print('pysam import successful')" diff --git a/containers/bwa/samtools-1.12.def b/containers/bwa/samtools-1.12.def new file mode 100644 index 00000000..11717c2a --- /dev/null +++ b/containers/bwa/samtools-1.12.def @@ -0,0 +1,58 @@ +Bootstrap: docker +From: staphb/samtools:1.12 + +%test + # Test that samtools is available and working + which samtools + samtools --version | grep -q "samtools 1.12" + + +# Note: Below is left for posterity - samtools 1.12 container built from ubuntu 20.04 +# Could not get libcrypto.so.10 to install correctly + +# Bootstrap: docker +# From: ubuntu:20.04 +# +# %post -c /bin/bash +# # Install Environment Modules, sed, and findutils +# apt-get update && apt-get install -y environment-modules sed findutils libcurl4-openssl-dev libssl-dev +# +# # set up symlinks for older versions of ssl libraries required by samtools 1.12 +# cd /usr/lib/x86_64-linux-gnu +# ln -s libssl.so.1.1.0 libssl.so.10 +# ln -s libcrypto.so.1.1.0 libcrypto.so.10 +# +# # Fix the hardcoded paths in all modulefiles +# find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + +# +# # Load modules +# source /etc/profile.d/modules.sh +# export MODULEPATH=/etc/modulefiles +# +# # save current ENV to file +# env > /tmp/env_before_modules.txt +# module load samtools/1.12 +# +# # diff env to store modified variables in APPTAINER_ENVIRONMENT +# env > /tmp/env_after_modules.txt +# diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' >> "$APPTAINER_ENVIRONMENT" +# +# cat "$APPTAINER_ENVIRONMENT" +# +# +# %files +# # Copy only the modulefiles needed for this container +# module_files/modulefiles/samtools /etc/modulefiles/samtools +# +# # Copy only the software needed for this container +# module_files/sw/samtools /opt/sw/samtools +# +# %environment +# # Set the path to the custom modulefiles +# export MODULEPATH=/etc/modulefiles +# +# %runscript +# # This script runs when the container is executed +# echo "Samtools 1.12 container" +# exec /bin/bash +# \ No newline at end of file diff --git a/containers/bwa/spot_score.def b/containers/bwa/spot_score.def new file mode 100644 index 00000000..66e8a19a --- /dev/null +++ b/containers/bwa/spot_score.def @@ -0,0 +1,83 @@ +Bootstrap: docker +From: ubuntu:20.04 + +%post -c /bin/bash + # Install Environment Modules, sed, and findutils + apt-get update && apt-get install -y environment-modules sed findutils libncurses5 bc + + # Fix the hardcoded paths in all modulefiles + find /etc/modulefiles -type f -exec sed -i 's|/net/module/sw|/opt/sw|g' {} + + + # Load modules + source /etc/profile.d/modules.sh + export MODULEPATH=/etc/modulefiles + + # save current ENV to file + env > /tmp/env_before_modules.txt + module load samtools/1.3 python/2.7.11 python/3.5.1 pysam/0.9.0 bedops/2.4.35-typical bedtools/2.25.0 R/3.2.5 jdk/1.8.0_92 picard/2.8.1 + + # diff env to store modified variables in APPTAINER_ENVIRONMENT + env > /tmp/env_after_modules.txt + diff /tmp/env_before_modules.txt /tmp/env_after_modules.txt | grep '^> ' | sed 's/^> //' | sed -e 's/\([^=]*\)=\(.*\)/export \1="\2"/' | sed 's/&/\\&/g' >> "$APPTAINER_ENVIRONMENT" + + +%files + # Copy only the modulefiles needed for this container + module_files/modulefiles/htslib/1.3.1 /etc/modulefiles/htslib/1.3.1 + module_files/modulefiles/openssl-dev/1.0.1t /etc/modulefiles/openssl-dev/1.0.1t + module_files/modulefiles/samtools/1.3 /etc/modulefiles/samtools/1.3 + module_files/modulefiles/python/2.7.11 /etc/modulefiles/python/2.7.11 + module_files/modulefiles/python/3.5.1 /etc/modulefiles/python/3.5.1 + module_files/modulefiles/pysam/0.9.0 /etc/modulefiles/pysam/0.9.0 + module_files/modulefiles/bedops/2.4.35-typical /etc/modulefiles/bedops/2.4.35-typical + module_files/modulefiles/bedtools/2.25.0 /etc/modulefiles/bedtools/2.25.0 + module_files/modulefiles/R/3.2.5 /etc/modulefiles/R/3.2.5 + module_files/modulefiles/jdk/1.8.0_92 /etc/modulefiles/jdk/1.8.0_92 + module_files/modulefiles/picard/2.8.1 /etc/modulefiles/picard/2.8.1 + module_files/modulefiles/curl/7.49.1 /etc/modulefiles/curl/7.49.1 + module_files/modulefiles/libssh2/1.7.0 /etc/modulefiles/libssh2/1.7.0 + module_files/modulefiles/zlib/1.2.8 /etc/modulefiles/zlib/1.2.8 + + # Copy only the software needed for this container + module_files/sw/htslib/1.3.1 /opt/sw/htslib/1.3.1 + module_files/sw/openssl-dev/1.0.1t /opt/sw/openssl-dev/1.0.1t + module_files/sw/samtools/1.3 /opt/sw/samtools/1.3 + module_files/sw/python/2.7.11 /opt/sw/python/2.7.11 + module_files/sw/python/3.5.1 /opt/sw/python/3.5.1 + module_files/sw/pysam/0.9.0 /opt/sw/pysam/0.9.0 + module_files/sw/bedops/2.4.35-typical /opt/sw/bedops/2.4.35-typical + module_files/sw/bedtools/2.25.0 /opt/sw/bedtools/2.25.0 + module_files/sw/R/3.2.5 /opt/sw/R/3.2.5 + module_files/sw/jdk/1.8.0_92 /opt/sw/jdk/1.8.0_92 + module_files/sw/picard/2.8.1 /opt/sw/picard/2.8.1 + module_files/sw/curl/7.49.1 /opt/sw/curl/7.49.1 + module_files/sw/libssh2/1.7.0 /opt/sw/libssh2/1.7.0 + module_files/sw/zlib/1.2.8 /opt/sw/zlib/1.2.8 + +%environment + # Set the path to the custom modulefiles + export MODULEPATH=/etc/modulefiles + +%runscript + # This script runs when the container is executed + echo "spot_score container - comprehensive genomics analysis tools" + exec /bin/bash + +%test + # Test basic functionality + samtools --version | grep -q "samtools" + python2.7 --version 2>&1 | grep -q "Python 2.7" + python3.5 --version | grep -q "Python 3.5" + bedtools --version | grep -q "bedtools" + R --version | grep -q "R version" + java -version 2>&1 | grep -q "java version" + + # Test Python imports + python3.5 -c "import pysam; print('pysam import successful')" + + # Test bedops tools + which convert2bed + which bedops + + # Test that picard is available + test -f /opt/sw/picard/2.8.1/bin/picard || test -f /opt/sw/picard/2.8.1/picard.jar diff --git a/containers/resolved_modules.txt b/containers/resolved_modules.txt new file mode 100644 index 00000000..3872171a --- /dev/null +++ b/containers/resolved_modules.txt @@ -0,0 +1,53 @@ +MACS/2.1.2 +R/3.2.5 +RSEM/1.2.30 +STAR/2.4.2a +STAR/2.7.11b +anaquin/2.0.1 +atlas-lapack/3.10.2 +bcl2fastq2/2.20.0.422 +bedops/2.4.19 +bedops/2.4.35-typical +bedtools/2.25.0 +bowtie/1.0.0 +bowtie/1.1.2 +bwa/0.7.12 +bwa/0.7.7 +coreutils/8.25 +cufflinks/2.2.1 +curl/7.49.1 +fastp/0.21.0 +gcc/4.7.2 +git/2.3.3 +gsl/2.4 +hotspot2/2.1.1 +htslib/1.12 +htslib/1.3.1 +htslib/1.6.0 +jdk/1.8.0_92 +jdk/11.0.16 +kallisto/0.43.1 +kentutil/302 +libpng/1.6.21 +libssh2/1.12.0 +libssh2/1.7.0 +modwt/1.0 +mysql-connector/6.1.6 +numpy/1.11.0 +openssl-dev/1.0.1t +perl/5.16.3 +picard/2.8.1 +picard/2.9.0 +preseq/2.0.3 +pysam/0.9.0 +python/2.7.11 +python/3.5.1 +samtools/1.12 +samtools/1.14 +samtools/1.3 +samtools/1.7 +scikit-learn/0.18.1 +scipy/1.0.0 +stringtie/1.3.4d +subread/1.5.1 +zlib/1.2.8 diff --git a/nextflow.config b/nextflow.config index 3cf78454..25a9ca19 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,8 +32,8 @@ profiles { cluster { process { executor = 'slurm' - queue = 'hpcz-2' - errorStrategy = { task.exitStatus in [137, 143] ? 'retry' : 'terminate' } + queue = 'hpcz-test' + errorStrategy = { task.exitStatus in [137, 143] ? 'retry' : 'ignore' } //errorStrategy = { task.exitStatus in [137, 143] ? 'retry' : 'finish' } //errorStrategy 'retry' maxRetries = 4 @@ -73,9 +73,9 @@ profiles { process.scratch = false } - singularity { - singularity.enabled = true - } + //singularity { + // singularity.enabled = true + //} docker { process.container = "fwip/stampipes:latest" @@ -86,11 +86,11 @@ profiles { docker.runOptions = '-u $(id -u):$(id -g)' } } - apptainer { - singularity { - enabled = true - } - } + //apptainer { + //singularity { + //enabled = true + //} + //} } report { diff --git a/processes/bwa/nextflow.config b/processes/bwa/nextflow.config index cdef24e2..b08df629 100644 --- a/processes/bwa/nextflow.config +++ b/processes/bwa/nextflow.config @@ -5,6 +5,56 @@ profiles { process.container = 'fwip/stampipes:latest' } + apptainer { + apptainer { + enabled = true + runOptions = "--mount type=bind,src=${System.getenv('STAMPIPES')},dst=${System.getenv('STAMPIPES')} --mount type=bind,src=${System.getenv('HOTSPOT_DIR')},dst=${System.getenv('HOTSPOT_DIR')}" + } + // temporary + process.errorStrategy = "finish" + + env { + STAMPIPES = System.getenv("STAMPIPES") + HOTSPOT_DIR = System.getenv("HOTSPOT_DIR") + } + process { + withName: 'align' { + container = "$baseDir/../../containers/bwa/align.sif" + } + withName: 'filter_bam' { + container = "$baseDir/../../containers/bwa/pysam-samtools.sif" + } + withName: 'sort_bam' { + container = "$baseDir/../../containers/bwa/samtools-1.12.sif" + } + withName: 'merge_bam' { + container = "$baseDir/../../containers/bwa/samtools-1.12.sif" + } + withName: 'filter_bam_to_unique' { + container = "$baseDir/../../containers/bwa/samtools-1.12.sif" + } + withName: 'cram' { + container = "$baseDir/../../containers/bwa/samtools-1.12.sif" + } + withName: 'mark_duplicates' { + container = "$baseDir/../../containers/bwa/mark_duplicates.sif" + } + withName: 'bam_counts' { + container = "$baseDir/../../containers/bwa/pysam-samtools.sif" + } + withName: 'insert_size' { + container = "$baseDir/../../containers/bwa/insert_size.sif" + } + withName: 'spot_score' { + container = "$baseDir/../../containers/bwa/spot_score.sif" + } + withName: 'density_files' { + container = "$baseDir/../../containers/bwa/density_files.sif" + } + + } + } + modules { process { withName: 'align' { @@ -16,6 +66,7 @@ profiles { withName: 'sort_bam' { module = 'samtools/1.12' } withName: 'merge_bam' { module = 'samtools/1.12' } withName: 'filter_bam_to_unique' { module = 'samtools/1.12' } + withName: 'cram' { module = 'samtools/1.12' } withName: 'mark_duplicates' { module = 'jdk/1.8.0_92:picard/2.8.1:samtools/1.3' } diff --git a/processes/bwa/process_bwa_paired_trimmed.nf b/processes/bwa/process_bwa_paired_trimmed.nf index 2dbf2271..861241fb 100755 --- a/processes/bwa/process_bwa_paired_trimmed.nf +++ b/processes/bwa/process_bwa_paired_trimmed.nf @@ -3,6 +3,9 @@ * This is our main DNase alignment pipeline */ +nextflow.enable.dsl = 2 + + params.help = false params.threads = 1 params.chunk_size = 16000000 @@ -17,11 +20,9 @@ params.readlength = 36 params.cramthreads = 10 -nuclear_chroms = "${params.genome}.nuclear.txt" -dataDir = "${baseDir}/../../data" - def helpMessage() { - log.info""" + log.info( + """ Usage: nextflow run process_bwa_paired_trimmed.nf \\ --r1 r1.fastq.gz \\ --r2 r2.fastq.gz \\ @@ -33,55 +34,38 @@ def helpMessage() { --UMI The reads contain UMI markers ('single-strand', 'thruplex') (false) --trim_to [length] Trim fastq reads to [length] bp (0 for no trimming) (0) --outdir [dir] Where to write results to (output) - """.stripIndent(); -} - -// Used for publishDir to avoid publishing bam files -def exclude_bam = { name -> name ==~ /.*ba[mi]$/ ? null : name } - - -if (params.help || !params.r1 || !params.r2 || !params.genome){ - helpMessage(); - exit 0; + """.stripIndent() + ) } -// Some renaming for easier usage later -genome = params.genome -genome_name = file(params.genome).baseName -threads = params.threads -/* - * Step 0: Split Fastq into chunks - */ -fastq_line_chunks = 4 * params.chunk_size process split_r1_fastq { - input: - file(r1) from file(params.r1) + path r1 val fastq_line_chunks output: - file('split_r1*gz') into split_r1 mode flatten + path 'split_r1*gz' script: """ - zcat $r1 \ - | split -l $fastq_line_chunks \ + zcat ${r1} \ + | split -l ${fastq_line_chunks} \ --filter='gzip -1 > \$FILE.gz' - 'split_r1' """ } process split_r2_fastq { input: - file(r2) from file(params.r2) + path r2 val fastq_line_chunks output: - file('split_r2*gz') into split_r2 mode flatten + path 'split_r2*gz' script: """ - zcat $r2 \ - | split -l $fastq_line_chunks \ + zcat ${r2} \ + | split -l ${fastq_line_chunks} \ --filter='gzip -1 > \$FILE.gz' - 'split_r2' """ } @@ -95,22 +79,22 @@ process trim_adapters { scratch false input: - file split_r1 - file split_r2 - file adapters from file(params.adapter_file) + path split_r1 + path split_r2 + path adapters output: - set file('trim.R1.fastq.gz'), file('trim.R2.fastq.gz') into trimmed - file('trim.counts.txt') into trim_counts + tuple path('trim.R1.fastq.gz'), path('trim.R2.fastq.gz') + path 'trim.counts.txt' script: """ trim-adapters-illumina \ - -f "$adapters" \ + -f "${adapters}" \ -1 P5 -2 P7 \ --threads=${params.threads} \ - "$split_r1" \ - "$split_r2" \ + "${split_r1}" \ + "${split_r2}" \ "trim.R1.fastq.gz" \ "trim.R2.fastq.gz" \ &> trimstats.txt @@ -127,58 +111,64 @@ process trim_adapters { process trim_to_length { scratch false + input: - set file(r1), file(r2) from trimmed + tuple path(r1), path(r2) output: - set file('r1.trim.fastq.gz'), file('r2.trim.fastq.gz') into trimmed_fastq + tuple path('r1.trim.fastq.gz'), path('r2.trim.fastq.gz') script: - if (params.trim_to != 0) + if (params.trim_to != 0) { // TODO: Add padding to length with N's """ - zcat $r1 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r1.trim.fastq.gz - zcat $r2 | awk 'NR%2==0 {print substr(\$0, 1, $params.trim_to)} NR%2!=0' | gzip -c -1 > r2.trim.fastq.gz + zcat ${r1} | awk 'NR%2==0 {print substr(\$0, 1, ${params.trim_to})} NR%2!=0' | gzip -c -1 > r1.trim.fastq.gz + zcat ${r2} | awk 'NR%2==0 {print substr(\$0, 1, ${params.trim_to})} NR%2!=0' | gzip -c -1 > r2.trim.fastq.gz """ - else + } + else { """ - ln -s $r1 r1.trim.fastq.gz - ln -s $r2 r2.trim.fastq.gz + ln -s ${r1} r1.trim.fastq.gz + ln -s ${r2} r2.trim.fastq.gz """ - + } } process add_umi_info { scratch false + input: - set file(r1), file(r2) from trimmed_fastq + tuple path(r1), path(r2) output: - set file('r1.fastq.umi.gz'), file('r2.fastq.umi.gz') into with_umi + tuple path('r1.fastq.umi.gz'), path('r2.fastq.umi.gz') script: - if (params.UMI == 'thruplex') + if (params.UMI == 'thruplex') { """ python3 \$STAMPIPES/scripts/umi/extract_umt.py \ - <(zcat $r1) \ - <(zcat $r2) \ + <(zcat ${r1}) \ + <(zcat ${r2}) \ >(gzip -c -1 > r1.fastq.umi.gz) \ >(gzip -c -1 > r2.fastq.umi.gz) """ - else if (params.UMI == 'single-strand') + } + else if (params.UMI == 'single-strand') { """ - python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r1 r1.fastq.umi.gz - python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py $r2 r2.fastq.umi.gz + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py ${r1} r1.fastq.umi.gz + python3 \$STAMPIPES/scripts/umi/fastq_umi_add.py ${r2} r2.fastq.umi.gz """ - - else if (params.UMI == false || params.UMI == "") + } + else if (params.UMI == false || params.UMI == "") { """ - ln -s $r1 r1.fastq.umi.gz - ln -s $r2 r2.fastq.umi.gz + ln -s ${r1} r1.fastq.umi.gz + ln -s ${r2} r2.fastq.umi.gz """ - else - error "--UMI must be `thruplex`, `single-strand` (for single-strand preparation), or false, got: '" + params.UMI + "'" + } + else { + error("--UMI must be `thruplex`, `single-strand` (for single-strand preparation), or false, got: '" + params.UMI + "'") + } } /* @@ -187,16 +177,17 @@ process add_umi_info { process fastq_counts { scratch false + input: - file(r1) from file(params.r1) - file(r2) from file(params.r2) + path r1 + path r2 output: - file 'fastq.counts' into fastq_counts + path 'fastq.counts' script: """ - zcat $r1 \ + zcat ${r1} \ | awk -v paired=1 -f \$STAMPIPES/awk/illumina_fastq_count.awk \ > fastq.counts """ @@ -210,19 +201,17 @@ process align { cpus params.threads input: - set file(trimmed_r1), file(trimmed_r2) from with_umi - - file genome from file(params.genome) - file '*' from file("${params.genome}.amb") - file '*' from file("${params.genome}.ann") - file '*' from file("${params.genome}.bwt") - file '*' from file("${params.genome}.fai") - file '*' from file("${params.genome}.pac") - file '*' from file("${params.genome}.sa") - + tuple path(trimmed_r1), path(trimmed_r2) + path genome + path '*' + path '*' + path '*' + path '*' + path '*' + path '*' output: - file 'out.bam' into unfiltered_bam + path 'out.bam' script: """ @@ -230,26 +219,25 @@ process align { bwa aln \ -Y -l 32 -n 0.04 \ -t "${params.threads}" \ - "$genome" \ - "$trimmed_r1" \ + "${genome}" \ + "${trimmed_r1}" \ > out1.sai bwa aln \ -Y -l 32 -n 0.04 \ - -t "$params.threads" \ - "$genome" \ - "$trimmed_r2" \ + -t "${params.threads}" \ + "${genome}" \ + "${trimmed_r2}" \ > out2.sai bwa sampe \ -n 10 -a 750 \ - "$genome" \ + "${genome}" \ out1.sai out2.sai \ - "$trimmed_r1" "$trimmed_r2" \ - | samtools view -b -t "$genome".fai - \ + "${trimmed_r1}" "${trimmed_r2}" \ + | samtools view -b -t "${genome}".fai - \ > out.bam """ - } /* @@ -260,18 +248,18 @@ process filter_bam { scratch false input: - file unfiltered_bam - file nuclear_chroms from file(nuclear_chroms) + path unfiltered_bam + path nuclear_chroms output: - file 'filtered.bam' into filtered_bam + path 'filtered.bam' script: """ python3 \$STAMPIPES/scripts/bwa/filter_reads.py \ - "$unfiltered_bam" \ + "${unfiltered_bam}" \ filtered.bam \ - "$nuclear_chroms" + "${nuclear_chroms}" """ } @@ -284,15 +272,15 @@ process sort_bam { scratch false input: - file filtered_bam + path filtered_bam output: - file 'sorted.bam' into sorted_bam + path 'sorted.bam' script: """ samtools sort \ - -l 0 -m 2G -@ "${params.threads}" "$filtered_bam" \ + -l 0 -m 2G -@ "${params.threads}" "${filtered_bam}" \ > sorted.bam """ } @@ -304,10 +292,10 @@ process merge_bam { scratch false input: - file 'sorted_bam_*' from sorted_bam.collect() + path 'sorted_bam_*' output: - file 'merged.bam' into merged_bam + path 'merged.bam' script: """ @@ -323,33 +311,30 @@ process mark_duplicates { label "high_mem" - publishDir params.outdir, saveAs: exclude_bam + publishDir params.outdir input: - file(merged_bam) from merged_bam + path merged_bam output: - file 'marked.bam' into marked_bam - file 'marked.bam' into bams_to_cram - file 'marked.bam' into marked_bam_for_counts - file 'MarkDuplicates.picard' - + path 'marked.bam' + path 'MarkDuplicates.picard' script: - if (params.UMI) + if (params.UMI) { """ picard RevertOriginalBaseQualitiesAndAddMateCigar \ - INPUT=$merged_bam OUTPUT=cigar.bam \ + INPUT=${merged_bam} OUTPUT=cigar.bam \ VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 picard UmiAwareMarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ METRICS_FILE=MarkDuplicates.picard UMI_TAG_NAME=XD ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' """ - else + } else { """ picard RevertOriginalBaseQualitiesAndAddMateCigar \ - INPUT=$merged_bam OUTPUT=cigar.bam \ + INPUT=${merged_bam} OUTPUT=cigar.bam \ VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 picard MarkDuplicatesWithMateCigar INPUT=cigar.bam OUTPUT=marked.bam \ @@ -357,35 +342,32 @@ process mark_duplicates { READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' \ MINIMUM_DISTANCE=300 """ + } } /* * Step 5: Filter bam file */ -filter_flag = 512 -if (params.UMI) - filter_flag = 1536 + process filter_bam_to_unique { scratch false + input: - file marked_bam + path marked_bam output: - set file('filtered.bam'), file('filtered.bam.bai') into uniquely_mapping_bam + tuple path('filtered.bam'), path('filtered.bam.bai') script: + filter_flag = params.UMI ? 1536 : 512 """ - samtools view $marked_bam -b -F $filter_flag > filtered.bam + samtools view ${marked_bam} -b -F ${filter_flag} > filtered.bam samtools index filtered.bam """ - } -// Marked bam file is used by several downstream results -uniquely_mapping_bam.into { bam_for_insert; bam_for_spot; bam_for_density } - /* * Metrics: bam counts */ @@ -394,15 +376,15 @@ process bam_counts { scratch false input: - file(sorted_bam) from marked_bam_for_counts + path sorted_bam output: - file('bam.counts.txt') into bam_counts + path 'bam.counts.txt' script: """ python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ - "$sorted_bam" \ + "${sorted_bam}" \ bam.counts.txt """ } @@ -415,19 +397,19 @@ process insert_size { publishDir params.outdir input: - set file(bam), file(bai) from bam_for_insert + tuple path(bam), path(bai) output: - file 'CollectInsertSizeMetrics.picard' - file 'CollectInsertSizeMetrics.picard.pdf' + path 'CollectInsertSizeMetrics.picard' + path 'CollectInsertSizeMetrics.picard.pdf' script: """ - samtools idxstats "$bam" \ + samtools idxstats "${bam}" \ | cut -f 1 \ | grep -v chrM \ | grep -v chrC \ - | xargs samtools view -b "$bam" \ + | xargs samtools view -b "${bam}" \ > nuclear.bam picard CollectInsertSizeMetrics \ @@ -448,18 +430,19 @@ process spot_score { publishDir params.outdir input: - set file(bam), file(bai) from bam_for_spot - file('*') from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") - file('*') from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + tuple path(bam), path(bai) + path '*' + path '*' output: - file 'subsample.r1.spot.out' - file 'spotdups.txt' + path 'subsample.r1.spot.out' + path 'spotdups.txt' script: + genome_name = file(params.genome).baseName """ # random sample - samtools view -h -F 12 -f 3 "$bam" \ + samtools view -h -F 12 -f 3 "${bam}" \ | awk '{if( ! index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ | samtools view -1 - \ -o paired.bam @@ -496,30 +479,28 @@ process spot_score { /* * Density tracks (for browser) */ -win = 75 -bini = 20 process density_files { label "high_mem" publishDir params.outdir input: - set file(bam), file(bai) from bam_for_density - file fai from file("${params.genome}.fai") - file density_buckets from file( - "$baseDir/../../data/densities/chrom-buckets.${genome_name}.${win}_${bini}.bed.starch" - ) + tuple path(bam), path(bai) + path fai + path density_buckets output: - file 'density.bed.starch' - file 'density.bw' - file 'density.bed.bgz' - + path 'density.bed.starch' + path 'density.bw' + path 'density.bed.bgz' script: + win = 75 + bini = 20 """ + ls -l bam2bed -d \ - < $bam \ + < ${bam} \ | cut -f1-6 \ | awk '{ if( \$6=="+" ){ s=\$2; e=\$2+1 } else { s=\$3-1; e=\$3 } print \$1 "\t" s "\t" e "\tid\t" 1 }' \ | sort-bed - \ @@ -527,13 +508,14 @@ process density_files { unstarch "${density_buckets}" \ | bedmap --faster --echo --count --delim "\t" - sample.bed \ - | awk -v binI=$bini -v win=$win \ + | awk -v binI=${bini} -v win=${win} \ 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print \$1 "\t" \$2 + shiftFactor "\t" \$3-shiftFactor "\tid\t" i \$4}' \ | starch - \ > density.bed.starch - unstarch density.bed.starch | awk -v binI=$bini -f \$STAMPIPES/awk/bedToWig.awk > density.wig - + unstarch density.bed.starch | awk -v binI=${bini} -f \$STAMPIPES/awk/bedToWig.awk > density.wig + echo "########" + ls -l wigToBigWig -clip density.wig "${fai}" density.bw @@ -550,12 +532,12 @@ process total_counts { publishDir params.outdir input: - file 'fastqcounts*' from fastq_counts.collect() - file 'trimcounts*' from trim_counts.collect() - file 'bamcounts*' from bam_counts.collect() + path 'fastqcounts*' + path 'trimcounts*' + path 'bamcounts*' output: - file 'tagcounts.txt' + path 'tagcounts.txt' script: """ @@ -574,17 +556,14 @@ process cram { cpus params.cramthreads / 2 scratch false - // TODO: put in config - module "samtools/1.12" - input: - file bam from bams_to_cram - file ref from file("${genome}.fa") - file fai from file("${genome}.fai") + path bam + path ref + path fai output: - file cramfile - file "${cramfile}.crai" + path cramfile + path "${cramfile}.crai" script: cramfile = bam.name.replace("bam", "cram") @@ -597,3 +576,108 @@ process cram { -o "${cramfile}" """ } + +workflow { + // Check for required parameters + if (params.help || !params.r1 || !params.r2 || !params.genome) { + helpMessage() + exit(0) + } + + // Setup variables + nuclear_chroms = "${params.genome}.nuclear.txt" + dataDir = "${baseDir}/../../data" + genome_name = file(params.genome).baseName + fastq_line_chunks = 4 * params.chunk_size + + // Input channels + r1_ch = Channel.fromPath(params.r1) + r2_ch = Channel.fromPath(params.r2) + adapter_file_ch = Channel.fromPath(params.adapter_file) + + // Genome files channels + genome_ch = Channel.fromPath(params.genome) + genome_amb_ch = Channel.fromPath("${params.genome}.amb") + genome_ann_ch = Channel.fromPath("${params.genome}.ann") + genome_bwt_ch = Channel.fromPath("${params.genome}.bwt") + genome_fai_ch = Channel.fromPath("${params.genome}.fai") + genome_pac_ch = Channel.fromPath("${params.genome}.pac") + genome_sa_ch = Channel.fromPath("${params.genome}.sa") + genome_fa_ch = Channel.fromPath("${params.genome}.fa") + nuclear_chroms_ch = Channel.fromPath(nuclear_chroms) + + // Annotation files + mappable_bed_ch = Channel.fromPath("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") + chrominfo_bed_ch = Channel.fromPath("${dataDir}/annotations/${genome_name}.chromInfo.bed") + density_buckets_ch = Channel.fromPath("${baseDir}/../../data/densities/chrom-buckets.${genome_name}.75_20.bed.starch") + + // Step 0: Split fastq files + split_r1_fastq(r1_ch, fastq_line_chunks) + split_r2_fastq(r2_ch, fastq_line_chunks) + + // Step 1: Trim adapters + trimmed_reads = trim_adapters( + split_r1_fastq.out.flatten(), + split_r2_fastq.out.flatten(), + adapter_file_ch, + ) + + // Step 1.1: Trim to length + length_trimmed = trim_to_length(trimmed_reads[0]) + + // Step 1.2: Add UMI info + umi_reads = add_umi_info(length_trimmed) + + // Metrics: Fastq counts + fastq_count_results = fastq_counts(r1_ch, r2_ch) + + // Step 2a: Align reads + unfiltered_bams = align( + umi_reads, + genome_ch, + genome_amb_ch, + genome_ann_ch, + genome_bwt_ch, + genome_fai_ch, + genome_pac_ch, + genome_sa_ch, + ) + + // Step 2b: Filter bam files + filtered_bams = filter_bam(unfiltered_bams, nuclear_chroms_ch) + + // Step 2c: Sort bam files + sorted_bams = sort_bam(filtered_bams) + + // Step 3: Merge alignments + merged_bam = merge_bam(sorted_bams.collect()) + + // Step 4: Mark duplicates + mark_dup_results = mark_duplicates(merged_bam) + marked_bam = mark_dup_results[0] + + // Step 5: Filter to unique reads + unique_bam = filter_bam_to_unique(marked_bam) + + // Metrics: BAM counts + bam_count_results = bam_counts(marked_bam) + + // Metrics: Insert size + insert_size(unique_bam) + + // Metrics: SPOT score + spot_score(unique_bam, mappable_bed_ch, chrominfo_bed_ch) + + // Density files + density_files(unique_bam, genome_fai_ch, density_buckets_ch) + + // Total counts + total_counts( + fastq_count_results.collect(), + trimmed_reads[1].collect(), + bam_count_results.collect(), + ) + + // CRAM conversion + cram(marked_bam, genome_fa_ch, genome_fai_ch) +} From 417184403b5cb6927d98093b72c0aa07062d573c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Fri, 17 Oct 2025 12:54:08 -0700 Subject: [PATCH 2/2] WIP - port dnase agg to new cluster --- containers/bwa/bwa-aggregate-base.def | 2 +- nextflow.config | 4 + processes/bwa/aggregate/basic.nf | 694 ++++++++++++++---------- processes/bwa/aggregate/nextflow.config | 40 +- processes/bwa/nextflow.config | 7 +- 5 files changed, 460 insertions(+), 287 deletions(-) diff --git a/containers/bwa/bwa-aggregate-base.def b/containers/bwa/bwa-aggregate-base.def index f9b323a0..1c452e10 100644 --- a/containers/bwa/bwa-aggregate-base.def +++ b/containers/bwa/bwa-aggregate-base.def @@ -113,4 +113,4 @@ From: ubuntu:18.04 # We use 18.04 to get libgfortan3 which bedops # Test that picard is available - picard 2>&1 | grep -q MarkDuplicates \ No newline at end of file + picard 2>&1 | grep -q MarkDuplicates diff --git a/nextflow.config b/nextflow.config index 25a9ca19..1d0232e7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -95,13 +95,17 @@ profiles { report { enabled = true + overwrite = true } trace { enabled = true + overwrite = true } timeline { enabled = true + overwrite = true } dag { enabled = true + overwrite = true } diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index 7003fbbf..7db22749 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -1,3 +1,5 @@ +nextflow.enable.dsl = 2 + params.help = false params.threads = 1 @@ -20,38 +22,28 @@ params.hotspot_index = "." params.cramthreads = 10 def helpMessage() { - log.info""" + log.info( + """ Usage: nextflow run basic.nf \\ --genome /path/to/genome \\ --bams '1.bam,2.bam...' \\ --UMI true/false \\ --outdir /path/to/output - """.stripIndent(); + """.stripIndent() + ) } -// Used for publishDir to avoid publishing bam files -def exclude_bam = { name -> name ==~ /.*ba[mi]$/ ? null : name } - -dataDir = "$baseDir/../../../data" -genome_name = file(params.genome).baseName - -bams = Channel.from( - params.bams.tokenize(',') -).map { - file(it) -}.collect() - -process merge { +process merge_bams { label "modules" input: // Inputs may be cram file but that's fine - file 'in*.bam' from bams + path 'in*.bam' output: - file 'merged.bam' into merged + path 'merged.bam' script: """ @@ -63,49 +55,54 @@ process merge { process dups { label "modules" label 'high_mem' - publishDir params.outdir, saveAs: exclude_bam + publishDir params.outdir, saveAs: { name -> name ==~ /.*ba[mi]$/ ? null : name } input: - file(merged) + path merged output: - file 'marked.bam' into marked_bam - file 'marked.bam' into bams_to_cram_marked - file 'marked.bam.bai' - file 'MarkDuplicates.picard' + path 'marked.bam', emit: marked_bam + path 'marked.bam', emit: bams_to_cram_marked + path 'marked.bam.bai' + path 'MarkDuplicates.picard' script: if (params.UMI) { cmd = "UmiAwareMarkDuplicatesWithMateCigar" extra = "UMI_TAG_NAME=XD" - } else { + } + else { cmd = "MarkDuplicatesWithMateCigar" extra = "MINIMUM_DISTANCE=300" } """ + env + ls -l picard RevertOriginalBaseQualitiesAndAddMateCigar \ "INPUT=${merged}" OUTPUT=cigar.bam \ VALIDATION_STRINGENCY=SILENT RESTORE_ORIGINAL_QUALITIES=false SORT_ORDER=coordinate MAX_RECORDS_TO_EXAMINE=0 - + echo "###" + ls -l picard "${cmd}" \ INPUT=cigar.bam OUTPUT=marked.bam \ - $extra \ + ${extra} \ METRICS_FILE=MarkDuplicates.picard ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \ READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]+:[a-zA-Z0-9]+:[0-9]+:([0-9]+):([0-9]+):([0-9]+).*' + echo "###" + ls -l samtools index marked.bam """ } -marked_bam.into { bam_for_counts; bam_for_adapter_counts; bam_for_filter; bam_for_diff_peaks; bam_for_multimapping_density } -process filter { +process filter_bam { label "modules" input: - file bam from bam_for_filter + path bam output: - file "filtered.bam" into filtered_bam + path "filtered.bam", emit: filtered_bam script: flag = params.UMI ? 1536 : 512 @@ -113,19 +110,16 @@ process filter { samtools view -b -F "${flag}" marked.bam > filtered.bam """ } -// TODO: Do we need to CRAM/publish this? -filtered_bam.into { bam_for_spot_score; bam_for_cutcounts; bam_for_density; bam_for_inserts; bam_for_nuclear; bam_for_footprints; bams_to_cram } process filter_nuclear { label "modules" + input: - file bam from bam_for_nuclear - file nuclear_chroms from file("${params.genome}.nuclear.txt") + path bam + path nuclear_chroms output: - file 'nuclear.bam' into nuclear_bam - file 'nuclear.bam' into bam_for_hotspot2 - file 'nuclear.bam' into bam_for_macs2 + path 'nuclear.bam', emit: nuclear_bam script: """ @@ -141,19 +135,19 @@ process macs2 { publishDir "${params.outdir}/peaks_macs2" scratch false - when: - params.peakcaller == "macs2" - input: - file bam from bam_for_macs2 + path bam output: - file 'NA_*' + path 'NA_*' + + when: + params.peakcaller == "macs2" script: """ macs2 callpeak \ - -t "$bam" \ + -t "${bam}" \ -f BAMPE \ -g hs \ -B \ @@ -163,28 +157,25 @@ process macs2 { process hotspot2 { label "modules" - + label 'high_mem' publishDir "${params.outdir}" container "fwip/hotspot2:latest" - label 'high_mem' - - when: - params.peakcaller == "hotspot2" - input: - val(hotspotid) from params.hotspot_id - file(nuclear) from bam_for_hotspot2 - file(mappable) from file(params.mappable) - file(chrom_sizes) from file(params.chrom_sizes) - file(centers) from file(params.centers) - + val hotspotid + path nuclear + path mappable + path chrom_sizes + path centers output: - file('peaks/nuclear*') - file('peaks/nuclear.hotspots.fdr0.05.starch') into hotspot_calls - file('peaks/nuclear.hotspots.fdr0.05.starch') into hotspot_calls_for_bias - file('peaks/nuclear.peaks.fdr0.001.starch') into onepercent_peaks + path 'peaks/nuclear*' + path 'peaks/nuclear.hotspots.fdr0.05.starch', emit: hotspot_calls + path 'peaks/nuclear.hotspots.fdr0.05.starch', emit: hotspot_calls_for_bias + path 'peaks/nuclear.peaks.fdr0.001.starch', emit: onepercent_peaks + + when: + params.peakcaller == "hotspot2" script: """ @@ -215,7 +206,6 @@ process hotspot2 { rm -rf "\$TMPDIR" """ - } process spot_score { @@ -223,18 +213,19 @@ process spot_score { publishDir params.outdir input: - file(bam) from bam_for_spot_score - file(mappable) from file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed") - file(chromInfo) from file("${dataDir}/annotations/${genome_name}.chromInfo.bed") + path bam + path mappable + path chromInfo + val genome_name output: - file 'r1.spot.out' - file 'r1.hotspot.info' + path 'r1.spot.out' + path 'r1.hotspot.info' script: """ # random sample - samtools view -h -F 12 -f 3 "$bam" \ + samtools view -h -F 12 -f 3 "${bam}" \ | awk '{if( ! index(\$3, "chrM") && \$3 != "chrC" && \$3 != "random"){print}}' \ | samtools view -uS - \ > nuclear.bam @@ -264,15 +255,15 @@ process bam_counts { publishDir params.outdir input: - file(bam) from bam_for_counts + path bam output: - file('tagcounts.txt') + path 'tagcounts.txt' script: """ python3 \$STAMPIPES/scripts/bwa/bamcounts.py \ - "$bam" \ + "${bam}" \ tagcounts.txt """ } @@ -282,10 +273,10 @@ process count_adapters { publishDir params.outdir input: - file(bam) from bam_for_adapter_counts + path bam output: - file('adapter.counts.txt') + path 'adapter.counts.txt' script: """ @@ -298,17 +289,18 @@ process count_adapters { process preseq { label "modules" publishDir params.outdir + input: - file nuclear_bam + path nuclear_bam + + output: + path 'preseq.txt' + path 'preseq_targets.txt' + path 'dups.hist' when: !params.UMI - output: - file 'preseq.txt' - file 'preseq_targets.txt' - file 'dups.hist' - script: """ python3 \$STAMPIPES/scripts/bam/mark_dups.py -i "${nuclear_bam}" -o /dev/null --hist dups.hist @@ -328,20 +320,20 @@ process cutcounts { label 'high_mem' input: - file(fai) from file("${params.genome}.fai") - file(filtered_bam) from bam_for_cutcounts + path fai + path filtered_bam output: - file('fragments.starch') - file('cutcounts.starch') - file('cutcounts.bw') - file('cutcounts.bed.bgz') - file('cutcounts.bed.bgz.tbi') + path 'fragments.starch' + path 'cutcounts.starch' + path 'cutcounts.bw' + path 'cutcounts.bed.bgz' + path 'cutcounts.bed.bgz.tbi' script: """ bam2bed --do-not-sort \ - < "$filtered_bam" \ + < "${filtered_bam}" \ | awk -v cutfile=cuts.bed -v fragmentfile=fragments.bed -f \$STAMPIPES/scripts/bwa/aggregate/basic/cutfragments.awk sort-bed fragments.bed | starch - > fragments.starch @@ -359,7 +351,7 @@ process cutcounts { | starch - > cutcounts.starch # Bigwig - "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ cutcounts.starch \ cutcounts.bw \ "${fai}" @@ -377,51 +369,49 @@ process density { label 'high_mem' input: - file filtered_bam from bam_for_density - file chrom_bucket from file(params.chrom_bucket) - file fai from file("${params.genome}.fai") + path filtered_bam + path chrom_bucket + path fai output: - file 'density.starch' - file 'density.bw' - file 'density.bgz' - file 'density.bgz.tbi' - set(file(filtered_bam), file('density.starch')) into to_normalize - - shell: - window_size = 75 - bin_size = 20 - scale = 1_000_000 - ''' + path 'density.starch' + path 'density.bw' + path 'density.bgz' + path 'density.bgz.tbi' + tuple path(filtered_bam), path('density.starch'), emit: to_normalize + + script: + def window_size = 75 + def bin_size = 20 + """ mkfifo density.bed bam2bed -d \ - < "!{filtered_bam}" \ + < "${filtered_bam}" \ | cut -f1-6 \ - | awk '{ if( $6=="+" ){ s=$2; e=$2+1 } else { s=$3-1; e=$3 } print $1 "\t" s "\t" e "\tid\t" 1 }' \ + | awk '{ if( \$6=="+" ){ s=\$2; e=\$2+1 } else { s=\$3-1; e=\$3 } print \$1 "\t" s "\t" e "\tid\t" 1 }' \ | sort-bed - \ > density.bed \ & - unstarch "!{chrom_bucket}" \ + unstarch "${chrom_bucket}" \ | bedmap --faster --echo --count --delim "\t" - density.bed \ - | awk -v "binI=!{bin_size}" -v "win=!{window_size}" \ - 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print $1 "\t" $2 + shiftFactor "\t" $3-shiftFactor "\tid\t" i $4}' \ + | awk -v "binI=${bin_size}" -v "win=${window_size}" \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print \$1 "\t" \$2 + shiftFactor "\t" \$3-shiftFactor "\tid\t" i \$4}' \ | starch - \ > density.starch # Bigwig - "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ density.starch \ density.bw \ - "!{fai}" \ - "!{bin_size}" + "${fai}" \ + "${bin_size}" # Tabix unstarch density.starch | bgzip > density.bgz tabix -p bed density.bgz - ''' - + """ } process multimapping_density { @@ -431,24 +421,24 @@ process multimapping_density { label 'high_mem' input: - file marked_bam from bam_for_multimapping_density - file chrom_bucket from file(params.chrom_bucket) - file fai from file("${params.genome}.fai") + path marked_bam + path chrom_bucket + path fai output: - file "mm_density.starch" - file "mm_density.bw" - file 'normalized.mm_density.starch' - file 'normalized.mm_density.bw' - - shell: - window_size = 75 - bin_size = 20 - scale = 1_000_000 - ''' + path "mm_density.starch" + path "mm_density.bw" + path 'normalized.mm_density.starch' + path 'normalized.mm_density.bw' + + script: + def window_size = 75 + def bin_size = 20 + def scale = 1_000_000 + """ # Mark multi-mapping reads as QC-pass! - samtools view -h "!{marked_bam}" | - awk 'BEGIN{OFS="\t"} /XA:Z/ {$2 = and(or($2, 2), compl(512))} 1' | + samtools view -h "${marked_bam}" | + awk 'BEGIN{OFS="\t"} /XA:Z/ {\$2 = and(or(\$2, 2), compl(512))} 1' | samtools view --threads 3 -F 512 -o filtered.bam samtools index filtered.bam @@ -459,15 +449,15 @@ process multimapping_density { bam2bed -d \ < filtered.bam \ | cut -f1-6 \ - | awk '{ if( $6=="+" ){ s=$2; e=$2+1 } else { s=$3-1; e=$3 } print $1 "\t" s "\t" e "\tid\t" 1 }' \ + | awk '{ if( \$6=="+" ){ s=\$2; e=\$2+1 } else { s=\$3-1; e=\$3 } print \$1 "\t" s "\t" e "\tid\t" 1 }' \ | sort-bed - \ > density.bed \ & - unstarch "!{chrom_bucket}" \ + unstarch "${chrom_bucket}" \ | bedmap --faster --echo --count --delim "\t" - density.bed \ - | awk -v "binI=!{bin_size}" -v "win=!{window_size}" \ - 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print $1 "\t" $2 + shiftFactor "\t" $3-shiftFactor "\tid\t" i $4}' \ + | awk -v "binI=${bin_size}" -v "win=${window_size}" \ + 'BEGIN{ halfBin=binI/2; shiftFactor=win-halfBin } { print \$1 "\t" \$2 + shiftFactor "\t" \$3-shiftFactor "\tid\t" i \$4}' \ | starch - \ > mm_density.starch @@ -475,8 +465,8 @@ process multimapping_density { "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ mm_density.starch \ mm_density.bw \ - "!{fai}" \ - "!{bin_size}" + "${fai}" \ + "${bin_size}" # # Tabix # unstarch density.starch | bgzip > density.bgz @@ -486,21 +476,21 @@ process multimapping_density { # Normalized density unstarch mm_density.starch \ - | awk -v allcounts=$(samtools view -c filtered.bam) \ - -v extranuclear_counts=$(samtools view -c "filtered.bam" chrM chrC) \ - -v scale=!{scale} \ + | awk -v allcounts=\$(samtools view -c filtered.bam) \ + -v extranuclear_counts=\$(samtools view -c "filtered.bam" chrM chrC) \ + -v scale=${scale} \ 'BEGIN{ tagcount=allcounts-extranuclear_counts } - { z=$5; + { z=\$5; n=(z/tagcount)*scale; - print $1 "\t" $2 "\t" $3 "\t" $4 "\t" n }' \ + print \$1 "\t" \$2 "\t" \$3 "\t" \$4 "\t" n }' \ | starch - > normalized.mm_density.starch "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ normalized.mm_density.starch \ normalized.mm_density.bw \ - "!{fai}" \ - "!{bin_size}" - ''' + "${fai}" \ + "${bin_size}" + """ } process normalize_density { @@ -508,41 +498,40 @@ process normalize_density { publishDir params.outdir input: - set(file(filtered_bam), file(density)) from to_normalize - file(fai) from file("${params.genome}.fai") + tuple path(filtered_bam), path(density) + path fai output: - file 'normalized.density.starch' - file 'normalized.density.bw' - file 'normalized.density.bgz' - file 'normalized.density.bgz.tbi' - - shell: - bin_size = 20 - scale = 1_000_000 - ''' - samtools index "!{filtered_bam}" + path 'normalized.density.starch' + path 'normalized.density.bw' + path 'normalized.density.bgz' + path 'normalized.density.bgz.tbi' + + script: + def bin_size = 20 + def scale = 1_000_000 + """ + samtools index "${filtered_bam}" # Normalized density unstarch density.starch \ - | awk -v allcounts=$(samtools view -c !{filtered_bam}) \ - -v extranuclear_counts=$(samtools view -c "!{filtered_bam}" chrM chrC) \ - -v scale=!{scale} \ + | awk -v allcounts=\$(samtools view -c ${filtered_bam}) \ + -v extranuclear_counts=\$(samtools view -c "${filtered_bam}" chrM chrC) \ + -v scale=${scale} \ 'BEGIN{ tagcount=allcounts-extranuclear_counts } - { z=$5; + { z=\$5; n=(z/tagcount)*scale; - print $1 "\t" $2 "\t" $3 "\t" $4 "\t" n }' \ + print \$1 "\t" \$2 "\t" \$3 "\t" \$4 "\t" n }' \ | starch - > normalized.density.starch - "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ normalized.density.starch \ normalized.density.bw \ - "!{fai}" \ - "!{bin_size}" + "${fai}" \ + "${bin_size}" unstarch normalized.density.starch | bgzip > normalized.density.bgz tabix -p bed normalized.density.bgz - ''' - + """ } process insert_sizes { @@ -550,15 +539,18 @@ process insert_sizes { publishDir params.outdir + scratch false + input: - file nuclear_bam from bam_for_inserts - file nuclear_chroms from file("${params.genome}.nuclear.txt") + path nuclear_bam + path nuclear_chroms output: - file 'CollectInsertSizeMetrics.picard*' + path 'CollectInsertSizeMetrics.picard*' script: """ + export JAVA_TOOL_OPTIONS="-Djdk.lang.Process.launchMechanism=vfork" picard CollectInsertSizeMetrics \ "INPUT=${nuclear_bam}" \ OUTPUT=CollectInsertSizeMetrics.picard \ @@ -581,12 +573,12 @@ process motif_matrix { publishDir params.outdir input: - file hotspot_calls - file fimo_transfac from file("${dataDir}/motifs/${genome_name}.fimo.starch") - file fimo_names from file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt") + path hotspot_calls + path fimo_transfac + path fimo_names output: - file 'hs_motifs*.txt' + path 'hs_motifs*.txt' when: params.domotifs @@ -605,16 +597,16 @@ process closest_features { publishDir params.outdir input: - file hotspot_calls - file transcript_starts from file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed") - val thresholds from "0 1000 2500 5000 10000" + path hotspot_calls + path transcript_starts + val thresholds + + output: + path 'prox_dist.info' when: params.dofeatures - output: - file 'prox_dist.info' - script: """ closest-features \ @@ -630,7 +622,7 @@ process closest_features { | sed -e 's/-//g' \ > closest.clean.txt - for t in $thresholds ; do + for t in ${thresholds} ; do awk \ -v t=\$t \ '\$1 > t {sum+=1} END {print "percent-proximal-" t "bp " sum/NR}' \ @@ -649,38 +641,38 @@ process differential_hotspots { publishDir params.outdir input: - file bam from bam_for_diff_peaks - file peaks from onepercent_peaks - file index from file(params.hotspot_index) + path bam + path peaks + path index output: - file 'differential_index_report.tsv' + path 'differential_index_report.tsv' when: params.hotspot_index != "." - shell: - version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() - diffName = "dhsindex_${version}_differential_peaks" - diffPerName = "dhsindex_${version}_differential_peaks_percent" - conName = "dhsindex_${version}_constitutive_peaks" - conPerName = "dhsindex_${version}_constitutive_peaks_percent" + script: + def version = (new File(params.hotspot_index)).getAbsoluteFile().getParentFile().getName() + def diffName = "dhsindex_${version}_differential_peaks" + def diffPerName = "dhsindex_${version}_differential_peaks_percent" + def conName = "dhsindex_${version}_constitutive_peaks" + def conPerName = "dhsindex_${version}_constitutive_peaks_percent" - ''' + """ set -e -o pipefail - statOverlap=$(bedops -e 1 "!{peaks}" "!{index}" | wc -l) - statNoOverlap=$(bedops -n 1 "!{peaks}" "!{index}" | wc -l) - total=$(unstarch "!{peaks}" | wc -l) - statPercOverlap=$(echo "scale=3; $statOverlap * 100.0/$total" | bc -q) - statPercNoOverlap=$(echo "scale=3; $statNoOverlap * 100.0/$total" | bc -q) + statOverlap=\$(bedops -e 1 "${peaks}" "${index}" | wc -l) + statNoOverlap=\$(bedops -n 1 "${peaks}" "${index}" | wc -l) + total=\$(unstarch "${peaks}" | wc -l) + statPercOverlap=\$(echo "scale=3; \$statOverlap * 100.0/\$total" | bc -q) + statPercNoOverlap=\$(echo "scale=3; \$statNoOverlap * 100.0/\$total" | bc -q) { - echo -e "!{diffName}\t$statNoOverlap" - echo -e "!{diffPerName}\t$statPercNoOverlap" - echo -e "!{conName}\t$statOverlap" - echo -e "!{conPerName}\t$statPercOverlap" + echo -e "${diffName}\t\$statNoOverlap" + echo -e "${diffPerName}\t\$statPercNoOverlap" + echo -e "${conName}\t\$statOverlap" + echo -e "${conPerName}\t\$statPercOverlap" } > differential_index_report.tsv - ''' + """ } @@ -688,116 +680,111 @@ process differential_hotspots { * Footprint calling */ process learn_dispersion { - label "footprints" publishDir params.outdir - - memory = '8 GB' - cpus = 8 - - when: - params.bias != "" + memory '8 GB' + cpus 8 input: - file ref from file("${params.genome}.fa") - file bam from bam_for_footprints - //file bai - file spots from hotspot_calls_for_bias - file bias from file(params.bias) + path ref + path bam + //path bai + path spots + path bias output: - set file('dm.json'), file(bam), file ("${bam}.bai") into dispersion - file 'dm.json' into to_plot + tuple path('dm.json'), path(bam), path("${bam}.bai"), emit: dispersion + path 'dm.json', emit: to_plot + + when: + params.bias != "" script: """ - samtools index "$bam" + samtools index "${bam}" # TODO: Use nuclear file - unstarch $spots \ + unstarch ${spots} \ | grep -v "_random" \ | grep -v "chrUn" \ | grep -v "chrM" \ > intervals.bed ftd-learn-dispersion-model \ - --bm $bias \ + --bm ${bias} \ --half-win-width 5 \ --processors 8 \ - $bam \ - $ref \ + ${bam} \ + ${ref} \ intervals.bed \ > dm.json """.stripIndent() - } process make_intervals { label "footprints" + input: - file starch from hotspot_calls + path starch output: - file 'chunk_*' into intervals mode flatten + path 'chunk_*', emit: intervals script: """ - unstarch "$starch" \ + unstarch "${starch}" \ | grep -v "_random" \ | grep -v "chrUn" \ | grep -v "chrM" \ - | split -l "$params.chunksize" -a 4 -d - chunk_ + | split -l "${params.chunksize}" -a 4 -d - chunk_ """.stripIndent() - } process compute_deviation { - label "footprints" - memory = '8 GB' - cpus = 4 + memory '8 GB' + cpus 4 input: - set file(interval), file(dispersion), file(bam), file(bai) from intervals.combine(dispersion) - file(bias) from file(params.bias) - file(ref) from file("${params.genome}.fa") + tuple path(interval), path(dispersion), path(bam), path(bai) + path bias + path ref output: - file 'deviation.out' into deviations + path 'deviation.out', emit: deviations script: """ ftd-compute-deviation \ - --bm "$bias" \ + --bm "${bias}" \ --half-win-width 5 \ --smooth-half-win-width 50 \ --smooth-clip 0.01 \ - --dm "$dispersion" \ + --dm "${dispersion}" \ --fdr-shuffle-n 50 \ --processors 4 \ - "$bam" \ - "$ref" \ - "$interval" \ + "${bam}" \ + "${ref}" \ + "${interval}" \ | sort --buffer-size=8G -k1,1 -k2,2n \ > deviation.out """.stripIndent() } process merge_deviation { - label "footprints" - memory = "32 GB" - cpus = 1 - - when: - params.bias != "" + memory "32 GB" + cpus 1 input: - file 'chunk_*' from deviations.collect() + path 'chunk_*' output: - file 'interval.all.bedgraph' into merged_interval + path 'interval.all.bedgraph', emit: merged_interval + + when: + params.bias != "" script: """ @@ -807,47 +794,40 @@ process merge_deviation { } process working_tracks { - label "footprints" - memory = '32 GB' - cpus = 1 - + memory '32 GB' + cpus 1 publishDir params.outdir input: - file merged_interval + path merged_interval output: - file 'interval.all.bedgraph' into bedgraph - file 'interval.all.bedgraph.starch' - file 'interval.all.bedgraph.gz' - file 'interval.all.bedgraph.gz.tbi' + path 'interval.all.bedgraph', emit: bedgraph + path 'interval.all.bedgraph.starch' + path 'interval.all.bedgraph.gz' + path 'interval.all.bedgraph.gz.tbi' script: """ - sort-bed "$merged_interval" | starch - > interval.all.bedgraph.starch - bgzip -c "$merged_interval" > interval.all.bedgraph.gz + sort-bed "${merged_interval}" | starch - > interval.all.bedgraph.starch + bgzip -c "${merged_interval}" > interval.all.bedgraph.gz tabix -0 -p bed interval.all.bedgraph.gz """.stripIndent() - } -thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) - process compute_footprints { - label "footprints" - memory = '8 GB' - cpus = 1 - + memory '8 GB' + cpus 1 publishDir params.outdir input: - set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) + tuple path(merged_interval), val(threshold) output: - file "interval.all.fps.${threshold}.bed.gz" - file "interval.all.fps.${threshold}.bed.gz.tbi" + path "interval.all.fps.${threshold}.bed.gz" + path "interval.all.fps.${threshold}.bed.gz.tbi" script: """ @@ -862,7 +842,6 @@ process compute_footprints { bgzip -c "\$output" > "\$output.gz" tabix -0 -p bed "\$output.gz" """.stripIndent() - } process plot_footprints { @@ -871,33 +850,31 @@ process plot_footprints { publishDir params.outdir input: - file model from to_plot - file plot from file("$baseDir/plot_footprints.py") + path model + path plot output: - file "dispersion.*pdf" + path "dispersion.*pdf" script: """ - "./$plot" "$model" + "./${plot}" "${model}" """ } process cram { publishDir params.outdir cpus params.cramthreads / 2 - - // TODO: put in config - module "samtools/1.12" + label 'samtools' input: - file bam from bams_to_cram.mix(bams_to_cram_marked) - file ref from file("${params.genome}.fa") - file fai from file("${params.genome}.fai") + path bam + path ref + path fai output: - file cramfile - file "${cramfile}.crai" + path cramfile + path "${cramfile}.crai" script: cramfile = bam.name.replace("bam", "cram") @@ -916,19 +893,176 @@ process starch_to_bigbed { label "modules" input: - file starch_in from onepercent_peaks - file chrom_sizes_bed from file(params.chrom_sizes) + path starch_in + path chrom_sizes_bed output: - file('peaks/nuclear.peaks.fdr0.001.bb') + path 'peaks/nuclear.peaks.fdr0.001.bb' script: outfile = starch_in.name.replace("starch", "bb") """ - cut -f1,3 "$chrom_sizes_bed" > chrom_sizes + cut -f1,3 "${chrom_sizes_bed}" > chrom_sizes mkdir -p peaks unstarch "${starch_in}" | cut -f1-4 > temp.bed - bedToBigBed temp.bed chrom_sizes "peaks/$outfile" + bedToBigBed temp.bed chrom_sizes "peaks/${outfile}" rm temp.bed """ } + +workflow { + dataDir = "${baseDir}/../../../data" + genome_name = file(params.genome).baseName + + // Create input channels + bams = Channel.from( + params.bams.tokenize(',') + ) + .map { + file(it) + } + .collect() + + // Main workflow execution + merge_bams(bams) + dups(merge_bams.out) + filter_bam(dups.out.marked_bam) + filter_nuclear(filter_bam.out.filtered_bam, file("${params.genome}.nuclear.txt")) + + // Conditional processes + if (params.peakcaller == "macs2") { + macs2(filter_nuclear.out.nuclear_bam) + } + + if (params.peakcaller == "hotspot2") { + hotspot2( + params.hotspot_id, + filter_nuclear.out.nuclear_bam, + file(params.mappable), + file(params.chrom_sizes), + file(params.centers), + ) + } + + // Analysis processes + spot_score( + filter_bam.out.filtered_bam, + file("${dataDir}/annotations/${genome_name}.K${params.readlength}.mappable_only.bed"), + file("${dataDir}/annotations/${genome_name}.chromInfo.bed"), + genome_name, + ) + + bam_counts(dups.out.marked_bam) + count_adapters(dups.out.marked_bam) + + if (!params.UMI) { + preseq(filter_nuclear.out.nuclear_bam) + } + + cutcounts( + file("${params.genome}.fai"), + filter_bam.out.filtered_bam, + ) + + density( + filter_bam.out.filtered_bam, + file(params.chrom_bucket), + file("${params.genome}.fai"), + ) + + multimapping_density( + dups.out.marked_bam, + file(params.chrom_bucket), + file("${params.genome}.fai"), + ) + + normalize_density( + density.out.to_normalize, + file("${params.genome}.fai"), + ) + + insert_sizes( + filter_bam.out.filtered_bam, + file("${params.genome}.nuclear.txt"), + ) + + // Conditional analysis processes + if (params.peakcaller == "hotspot2") { + if (params.domotifs) { + motif_matrix( + hotspot2.out.hotspot_calls, + file("${dataDir}/motifs/${genome_name}.fimo.starch"), + file("${dataDir}/motifs/${genome_name}.fimo.transfac.names.txt"), + ) + } + + if (params.dofeatures) { + closest_features( + hotspot2.out.hotspot_calls, + file("${dataDir}/features/${genome_name}.CombinedTxStarts.bed"), + "0 1000 2500 5000 10000", + ) + } + + if (params.hotspot_index != ".") { + differential_hotspots( + dups.out.marked_bam, + hotspot2.out.onepercent_peaks, + file(params.hotspot_index), + ) + } + + // Footprint analysis + if (params.bias != "") { + learn_dispersion( + file("${params.genome}.fa"), + filter_bam.out.filtered_bam, + hotspot2.out.hotspot_calls_for_bias, + file(params.bias), + ) + + make_intervals(hotspot2.out.hotspot_calls) + + intervals_combined = make_intervals.out.intervals + .flatten() + .combine(learn_dispersion.out.dispersion) + + compute_deviation( + intervals_combined, + file(params.bias), + file("${params.genome}.fa"), + ) + + merge_deviation( + compute_deviation.out.deviations.collect() + ) + + working_tracks( + merge_deviation.out.merged_interval + ) + + thresholds = Channel.from(0.2, 0.1, 0.05, 0.01, 0.001, 0.0001) + intervals_with_thresholds = merge_deviation.out.merged_interval.combine(thresholds) + + compute_footprints(intervals_with_thresholds) + + plot_footprints( + learn_dispersion.out.to_plot, + file("${baseDir}/plot_footprints.py"), + ) + } + + starch_to_bigbed( + hotspot2.out.onepercent_peaks, + file(params.chrom_sizes), + ) + } + + // CRAM conversion + bams_to_cram_combined = filter_bam.out.filtered_bam.mix(dups.out.bams_to_cram_marked) + cram( + bams_to_cram_combined, + file("${params.genome}.fa"), + file("${params.genome}.fai"), + ) +} diff --git a/processes/bwa/aggregate/nextflow.config b/processes/bwa/aggregate/nextflow.config index c4bdca3e..8f9489c1 100644 --- a/processes/bwa/aggregate/nextflow.config +++ b/processes/bwa/aggregate/nextflow.config @@ -26,8 +26,38 @@ profiles { } } + apptainer { + apptainer { + enabled = true + runOptions = [ + "--mount type=bind,src=${System.getenv('STAMPIPES')},dst=${System.getenv('STAMPIPES')}", + "--mount type=bind,src=${System.getenv('HOTSPOT_DIR')},dst=${System.getenv('HOTSPOT_DIR')}", + "--mount type=bind,src=/net/seq/data/genomes/ref_cache,dst=/net/seq/data/genomes/ref_cache", + ].join(" ") + } + env { + STAMPIPES = System.getenv("STAMPIPES") + HOTSPOT_DIR = System.getenv("HOTSPOT_DIR") + REF_PATH = System.getenv("REF_PATH") + } + process { + withLabel: 'samtools' { + container = "$baseDir/../../../containers/bwa/samtools-1.12.sif" + } + withLabel: "modules" { + container = "$baseDir/../../../containers/bwa/bwa-aggregate-base.sif" + } + withLabel: "macs2" { + container = "$baseDir/../../../containers/bwa/macs2.sif" + } + } + } + modules { process { + withName: 'spot_score' { + container = "$baseDir/../../../containers/bwa/spot_score.sif" + } withLabel: "modules" { module = "bedops/2.4.35-typical:samtools/1.3:modwt/1.0:kentutil/302:hotspot2/2.1.1:jdk/1.8.0_92:gcc/4.7.2:R/3.2.5:picard/2.8.1:git/2.3.3:coreutils/8.25:bedtools/2.25.0:python/3.5.1:pysam/0.9.0:htslib/1.6.0:numpy/1.11.0:atlas-lapack/3.10.2:scipy/1.0.0:scikit-learn/0.18.1:preseq/2.0.3:gsl/2.4" } @@ -44,10 +74,10 @@ profiles { withName: 'density|multimapping_density|cutcounts|insert_sizes' { // Density process sometimes reports OOM with a 255 exit code - errorStrategy = { task.exitStatus in [1, 137, 143, 255] ? 'retry' : 'terminate' } + errorStrategy = { task.exitStatus in [1, 137, 143, 255] ? 'retry' : 'ignore' } // Temporary, for some specific aggs - memory = { 96.GB * (2**(task.attempt - 1)) } + //memory = { 96.GB * (2**(task.attempt - 1)) } //queue = 'bigmem' } // process { @@ -77,16 +107,20 @@ profiles { } report { enabled = true + overwrite = true } trace { enabled = true + overwrite = true } timeline { enabled = true + overwrite = true } dag { enabled = true - file = "dag.html" + file = "dag.html" + overwrite = true } } diff --git a/processes/bwa/nextflow.config b/processes/bwa/nextflow.config index b08df629..d140cd3a 100644 --- a/processes/bwa/nextflow.config +++ b/processes/bwa/nextflow.config @@ -8,10 +8,11 @@ profiles { apptainer { apptainer { enabled = true - runOptions = "--mount type=bind,src=${System.getenv('STAMPIPES')},dst=${System.getenv('STAMPIPES')} --mount type=bind,src=${System.getenv('HOTSPOT_DIR')},dst=${System.getenv('HOTSPOT_DIR')}" + runOptions = [ + "--mount type=bind,src=${System.getenv('STAMPIPES')},dst=${System.getenv('STAMPIPES')}", + "--mount type=bind,src=${System.getenv('HOTSPOT_DIR')},dst=${System.getenv('HOTSPOT_DIR')}" + ].join(" ") } - // temporary - process.errorStrategy = "finish" env { STAMPIPES = System.getenv("STAMPIPES")