diff --git a/processes/build_references/build_reference.nf b/processes/build_references/build_reference.nf index 451d0dbe..1ad752c7 100755 --- a/processes/build_references/build_reference.nf +++ b/processes/build_references/build_reference.nf @@ -14,7 +14,7 @@ readlengths = params.readlength.tokenize(',') process bwa { - publishDir "${params.outdir}/bwa" + publishDir "${params.outdir}/bwa", mode: "link" input: file genome from file(params.genome) @@ -29,7 +29,7 @@ process bwa { } process samtools { - publishDir "${params.outdir}/samtools" + publishDir "${params.outdir}/samtools", mode: "link" input: file genome from file(params.genome) @@ -61,7 +61,7 @@ process bowtie_index { process mappability { - publishDir "${params.outdir}/annotations" + publishDir "${params.outdir}/annotations", mode: "link" input: file genome from file(params.genome) @@ -85,7 +85,7 @@ process mappability { process density { - publishDir "${params.outdir}/densities" + publishDir "${params.outdir}/densities", mode: "link" input: file fai @@ -104,7 +104,7 @@ process density { process chrom_sizes { - publishDir "${params.outdir}/hotspot2" + publishDir "${params.outdir}/hotspot2", mode: "link" input: file fai @@ -120,7 +120,7 @@ process chrom_sizes { process chrom_info { - publishDir "${params.outdir}/annotations" + publishDir "${params.outdir}/annotations", mode: "link" input: file chrom_sizes @@ -138,7 +138,7 @@ process hotspot2 { container "fwip/hotspot2:latest" - publishDir "${params.outdir}/hotspot2" + publishDir "${params.outdir}/hotspot2", mode: "link" input: file chrom_sizes @@ -161,7 +161,7 @@ process hotspot2 { } process nuclear_center_sites { - publishDir "${params.outdir}/hotspot2" + publishDir "${params.outdir}/hotspot2", mode: "link" input: file center_sites diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index c25443d7..bd36dfd1 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -48,7 +48,7 @@ process merge { 
output: file 'merged.bam' into merged - publishDir params.outdir + publishDir params.outdir, mode: "link" script: """ @@ -59,7 +59,7 @@ process merge { // TODO: single end process dups { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" label 'high_mem' input: @@ -97,7 +97,7 @@ marked_bam.into { bam_for_counts; bam_for_adapter_counts; bam_for_filter; bam_fo process filter { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file bam from bam_for_filter @@ -135,7 +135,7 @@ process filter_nuclear { process macs2 { label "macs2" - publishDir "${params.outdir}/peaks_macs2" + publishDir "${params.outdir}/peaks_macs2", mode: "link" scratch false when: @@ -161,7 +161,7 @@ process macs2 { process hotspot2 { label "modules" - publishDir "${params.outdir}" + publishDir "${params.outdir}", mode: "link" container "fwip/hotspot2:latest" when: @@ -215,7 +215,7 @@ process hotspot2 { process spot_score { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file(bam) from bam_for_spot_score @@ -256,7 +256,7 @@ process spot_score { process bam_counts { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file(bam) from bam_for_counts @@ -274,7 +274,7 @@ process bam_counts { process count_adapters { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file(bam) from bam_for_adapter_counts @@ -292,7 +292,7 @@ process count_adapters { process preseq { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file nuclear_bam @@ -318,7 +318,7 @@ process preseq { process cutcounts { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file(fai) from file("${params.genome}.fai") @@ -366,7 +366,7 @@ process cutcounts { process density { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" label 'high_mem' 
input: @@ -419,7 +419,7 @@ process density { process multimapping_density { - publishDir params.outdir + publishDir params.outdir, mode: "link" label 'modules' label 'high_mem' @@ -498,7 +498,7 @@ process multimapping_density { process normalize_density { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set(file(filtered_bam), file(density)) from to_normalize @@ -541,7 +541,7 @@ process normalize_density { process insert_sizes { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file nuclear_bam from bam_for_inserts @@ -571,7 +571,7 @@ process insert_sizes { process motif_matrix { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file hotspot_calls @@ -595,7 +595,7 @@ process motif_matrix { process closest_features { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file hotspot_calls @@ -639,7 +639,7 @@ process closest_features { process differential_hotspots { label "modules" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file bam from bam_for_diff_peaks @@ -683,7 +683,7 @@ process differential_hotspots { process learn_dispersion { label "footprints" - publishDir params.outdir + publishDir params.outdir, mode: "link" memory = '8 GB' cpus = 8 @@ -805,7 +805,7 @@ process working_tracks { memory = '32 GB' cpus = 1 - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file merged_interval @@ -833,7 +833,7 @@ process compute_footprints { memory = '8 GB' cpus = 1 - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set file(merged_interval), val(threshold) from merged_interval.combine(thresholds) @@ -861,7 +861,7 @@ process compute_footprints { process plot_footprints { label "footprints" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file model from to_plot diff --git 
a/processes/bwa/process_bwa_paired_trimmed.nf b/processes/bwa/process_bwa_paired_trimmed.nf index e3f48ce5..88c8f873 100755 --- a/processes/bwa/process_bwa_paired_trimmed.nf +++ b/processes/bwa/process_bwa_paired_trimmed.nf @@ -308,7 +308,7 @@ process mark_duplicates { label "high_mem" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file(merged_bam) from merged_bam @@ -352,7 +352,7 @@ if (params.UMI) process filter_bam_to_unique { - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file marked_bam @@ -396,7 +396,7 @@ process bam_counts { */ process insert_size { - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set file(bam), file(bai) from bam_for_insert @@ -429,7 +429,7 @@ process insert_size { process spot_score { - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set file(bam), file(bai) from bam_for_spot @@ -484,7 +484,7 @@ win = 75 bini = 20 process density_files { - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set file(bam), file(bai) from bam_for_density @@ -530,7 +530,7 @@ process density_files { */ process total_counts { - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file 'fastqcounts*' from fastq_counts.collect() diff --git a/processes/rna-star/aggregation/cufflinks_featurecounts.nf b/processes/rna-star/aggregation/cufflinks_featurecounts.nf index e44c8bd2..92fe071e 100644 --- a/processes/rna-star/aggregation/cufflinks_featurecounts.nf +++ b/processes/rna-star/aggregation/cufflinks_featurecounts.nf @@ -69,7 +69,7 @@ workflow RNA_AGG { process merge_transcriptome_bam { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "samtools/1.12" input: // Assume sorted by coord @@ -90,7 +90,7 @@ process merge_transcriptome_bam { } process merge_genome_bam { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "samtools/1.12" input: // Assume sorted by coord @@ -114,7 
+114,7 @@ process merge_genome_bam { process remove_duplicate_reads { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "jdk/2.8.1", "picard/2.8.1", "samtools/1.12" label 'high_mem' input: @@ -152,7 +152,7 @@ process remove_duplicate_reads { } process mark_duplicate_reads { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "jdk/2.8.1", "picard/2.8.1", "samtools/1.12" input: @@ -177,7 +177,7 @@ process mark_duplicate_reads { process bam_to_fastq { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "samtools/1.12" input: @@ -202,7 +202,7 @@ process bam_to_fastq { process density { module "STAR/2.4.2a", "bedops/2.4.19", "htslib/1.6.0" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: path(input_bam) path("ref/*") @@ -260,7 +260,7 @@ process density { process cufflinks { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "cufflinks/2.2.1", "R/3.2.5", "anaquin/2.0.1" input: @@ -300,7 +300,7 @@ process cufflinks { process stringtie { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "stringtie/1.3.4d" input: path(input_bam) @@ -321,7 +321,7 @@ process stringtie { process feature_counts { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "subread/1.5.1" input: @@ -343,7 +343,7 @@ process feature_counts { process kallisto { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "kallisto/0.43.1", "anaquin/2.0.1" input: @@ -369,7 +369,7 @@ process kallisto { process kallisto_advanced { - publishDir params.outdir + publishDir params.outdir, mode: "link" module "kallisto/0.43.1", "anaquin/2.0.1" input: @@ -397,7 +397,7 @@ process kallisto_advanced { process anaquin { module "samtools/1.12", "anaquin/2.0.1", "kallisto/0.43.1", "R/3.2.5" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: path input_bam @@ -457,7 +457,7 @@ process anaquin { process insert_sizes { 
module "jdk", "picard/2.9.0", "R/3.2.5" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: path input_bam @@ -474,7 +474,7 @@ process insert_sizes { process rna_metrics { module "jdk", "picard/2.9.0" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: path input_bam @@ -495,7 +495,7 @@ process rna_metrics { process adapter_count { module "bwa", "samtools", "jdk", "picard/2.9.0" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: path input_bam @@ -513,7 +513,7 @@ process adapter_count { process ribosomal_count { module "bowtie/1.0.0" - publishDir params.outdir + publishDir params.outdir, mode: "link" input: tuple path(r1_fq), path(r2_fq) diff --git a/processes/rna-star/modules/star.nf b/processes/rna-star/modules/star.nf index a88ba66c..81664dbb 100644 --- a/processes/rna-star/modules/star.nf +++ b/processes/rna-star/modules/star.nf @@ -8,7 +8,7 @@ process star { cpus params.star_threads module 'STAR/2.4.2a', 'samtools/1.7' - publishDir params.outdir, enabled: params.publish + publishDir params.outdir, enabled: params.publish, mode: "link" label 'high_mem' diff --git a/processes/rna-star/rna_star.nf b/processes/rna-star/rna_star.nf index a12dd234..cab6e12d 100755 --- a/processes/rna-star/rna_star.nf +++ b/processes/rna-star/rna_star.nf @@ -49,7 +49,7 @@ process star { cpus params.star_threads module 'STAR', 'samtools/1.7', 'gcc/4.7.2' - publishDir params.outdir + publishDir params.outdir, mode: "link" input: set file(r1), file(r2) from trimmed @@ -102,7 +102,7 @@ process star_bedgraph { module 'STAR' - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file coordBam from coordinateBam @@ -128,7 +128,7 @@ process rsem { module 'perl/5.16.3', 'RSEM/1.2.30' - publishDir params.outdir + publishDir params.outdir, mode: "link" input: file transcriptBam @@ -158,7 +158,7 @@ process rsem_plot { module 'perl/5.16.3', 'RSEM/1.2.30', 'R/3.2.5' - publishDir params.outdir + publishDir 
params.outdir, mode: "link" input: file "Quant.stat/*" from rsem_quant diff --git a/processes/rna-star/star_only.nf b/processes/rna-star/star_only.nf index 9c6b7cbc..bd098ebf 100755 --- a/processes/rna-star/star_only.nf +++ b/processes/rna-star/star_only.nf @@ -49,7 +49,7 @@ process star { cpus params.star_threads module 'STAR', 'samtools/1.7' - publishDir params.outdir + publishDir params.outdir, mode: "link" label 'high_mem'
diff --git a/scripts/utility/estimate_nf_clean.sh b/scripts/utility/estimate_nf_clean.sh
new file mode 100755
index 00000000..3381e4a6
--- /dev/null
+++ b/scripts/utility/estimate_nf_clean.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Usage: estimate_nf_clean.sh dir [dirs...]
+#
+# For every given directory that contains a work/ subdirectory, print the total
+# apparent size in bytes (du -b) of the regular files under work/ whose path
+# does not contain the resolved target of any symlink found under the
+# directory's output* entries, followed by the directory's absolute path.
+# Directories without a work/ subdirectory are skipped silently.
+
+for dir in "$@" ; do
+  [[ -d "$dir/work" ]] || continue
+  dir=$(readlink -f "$dir")
+  # File names are passed NUL-delimited and du is run exactly once: with
+  # "xargs du -c | tail -n1" a long list is split over several du runs and
+  # only the total of the last batch would be reported.
+  find "$dir/work" -type f -print0 \
+    | grep -a -z -v -F -f <( \
+      find "$dir"/output* -type l -print0 \
+        | xargs --no-run-if-empty -0 readlink -f \
+    ) \
+    | du -scb --files0-from=- | tail -n1 | sed "s!total!$dir!"
+done
diff --git a/scripts/utility/nextflow_clean.sh b/scripts/utility/nextflow_clean.sh
new file mode 100755
index 00000000..d648e297
--- /dev/null
+++ b/scripts/utility/nextflow_clean.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+
+
+# 1 visit directory containing work/ dir
+# 2 find all symlinks under the top-level output* directories (work/ itself is never searched).
+# 3 for each symlink found:
+# 3A get the target of the symlink, resolving any symlink chains (e.g: the symlink may point to another symlink, etc)
+# 3B if the symlink does not point into a work/ directory, abort with an error.
+# 3C make a copy of the link for backup purposes in case we get interrupted (e.g: mysymlink.bak)
+# 3D replace the symlink with a hard link to the real target
+# 3E remove the symlink backup
+# 4 repeat step 2, exiting with error if any symlinks remain.
+# 5 remove work directory
+# 6 set LIMS status (lock the aggregation; only done for aggregation directories)
+
+
+set -euo pipefail
+
+AGG_DIR_REGEX=/net/seq/data/aggregations/LN[0-9]+/aggregation-[0-9]+
+
+#########################
+
+function usage(){  # Print the usage line to stderr and exit with status 2
+  echo "Usage: $0 [-f] dir [dirs...]" >&2
+  exit 2
+}
+
+# Parse arguments: -f really performs the changes (default is a dry-run); -h or an unknown flag prints usage
+FORCE=
+while getopts ":fh" arg; do
+  case $arg in
+    h)
+      usage
+      ;;
+    f)
+      FORCE=TRUE
+      ;;
+    *)
+      usage
+      ;;
+
+  esac
+done
+shift $((OPTIND -1))
+[[ -z "$*" ]] && usage
+
+# Print "ERROR: " plus a printf-style message to stdout (so the tee in process_dir logs it) and exit 1
+DIE_F(){
+  printf "ERROR: "
+  # shellcheck disable=SC2059
+  printf "$@"
+  printf "\n"
+  exit 1
+}
+INFO_F(){  # Print "INFO: " plus a printf-style message to stdout
+  printf "INFO: "
+  # shellcheck disable=SC2059
+  printf "$@"
+  printf "\n"
+}
+
+RUN_IF_FORCE(){  # Run "$@" when FORCE is TRUE; otherwise only print the shell-quoted command
+  if [[ "$FORCE" == TRUE ]] ; then
+    printf "INFO: Running: '"
+    printf "%q " "$@"
+    printf "'\n"
+    "$@"
+  else
+    printf "INFO: Dry-run, would have run: '"
+    printf "%q " "$@"
+    printf "'\n"
+  fi
+}
+
+lims_patch_by_url() {  # $1: full URL, $2: request data; sends an HTTP PATCH authorized with $LIMS_API_TOKEN
+  /usr/bin/curl \
+    --request PATCH \
+    --data "$2" \
+    "$1" \
+    -H "Authorization: Token $LIMS_API_TOKEN" \
+    -k \
+    2>/dev/null
+}  # NOTE(review): -k skips TLS verification and, without --fail, curl exits 0 on HTTP error responses
+
+# $1: url (appended to "$LIMS_API_URL/"), $2: field=val
+lims_patch () {
+  lims_patch_by_url "$LIMS_API_URL/$1" "$2"
+}
+
+lock_aggregation() {  # $1: directory; PATCHes locked=True for the aggregation id parsed from its resolved path
+  local aggdir
+  local aggid
+  aggdir=$(readlink -f "$1")
+  if [[ ! "$aggdir" =~ $AGG_DIR_REGEX ]] ; then
+    INFO_F "Not an aggregation directory, not locking"
+    return
+  fi
+
+  # Get aggregation ID from directory name (everything after the last "aggregation-")
+  aggid=${aggdir/*aggregation-/}
+  if [[ ! $aggid =~ ^[0-9]+$ ]] ; then
+    DIE_F "Couldn't parse aggregation directory '%s' ('%s') into aggid (got '%s')" "$1" "$aggdir" "$aggid"
+  fi
+
+  lims_patch "aggregation/$aggid/" 'locked=True'
+}
+
+# Find output symlinks in a directory ($1), printing one path per line
+# Expects them to be in subdirectories starting with 'output'
+find_output_symlinks(){
+  # Only top-level directories named output* are searched, so work/ is never descended into.
+  find "$1" -maxdepth 1 -name 'output*' -type d -print0 \
+    | xargs --no-run-if-empty -0 -I '{}' find '{}' -type l
+}
+
+has_broken_symlink(){  # Succeeds if a symlink under $1/output* has no existing target
+  local broken_symlink
+  broken_symlink=$(find "$1" -maxdepth 1 -name 'output*' -type d -print0 \
+    | xargs --no-run-if-empty -0 -I '{}' find '{}' -type l -xtype l -print -quit)
+  [[ -n "$broken_symlink" ]]
+}
+
+# Following symlinks, succeed if the two arguments refer to the same file (bash -ef test).
+is_same_inode(){
+  [[ "$1" -ef "$2" ]]
+}
+
+process_dir() {  # $1: directory holding work/; hard-links output symlinks, removes work/, locks in LIMS
+  local dir=$1
+  # Run in a subshell so the cd below does not leak to the caller
+  # The subshell's stdout is tee'd into "$dir"/freezing.log
+  (
+    INFO_F "Processing dir '%s'" "$dir"
+    # Make sure dir exists
+    [[ -d "$dir" ]] || DIE_F "Directory does not exist: '%s'" "$dir"
+    # It should also contain a workdir
+    [[ -d "$dir/work" ]] || DIE_F "Work directory does not exist: '%s/work'" "$dir"
+
+    set -euo pipefail # This is inherited, but in case somebody changes the top declaration.
+
+    # 1) visit directory containing work/ dir
+    cd "$dir"
+
+    # 1.5) Check for broken symlinks
+    has_broken_symlink . &&
+      DIE_F "Broken symlink detected. dir=%s" "$dir"
+
+    # 2) find symlinks
+    local symlinks=()
+    readarray -t symlinks < <( find_output_symlinks . )
+
+    # Check whether the array is empty in a way that is safe under set -u
+    # https://stackoverflow.com/questions/7577052/bash-empty-array-expansion-with-set-u
+    # shellcheck disable=SC2199
+    if [[ -z ${symlinks[@]+"${symlinks[@]}"} ]] ; then
+      INFO_F "No output symlinks found. dir=%s" "$dir"
+      symlinks=()
+    else
+
+      # 3)
+      for symlink in "${symlinks[@]}" ; do
+        # 3A) Get symlink target
+        local target
+        target=$(readlink -f "$symlink")
+        # 3B) Die if not in work directory
+        [[ "$target" =~ /work/ ]] ||
+          DIE_F "Target not in workdir. dir='%s';symlink='%s';target='%s'" "$dir" "$symlink" "$target"
+
+        # 3?) Make sure symlink isn't broken
+        [[ -e "$target" ]] ||
+          DIE_F "Target not found. dir='%s';symlink='%s';target='%s'" "$dir" "$symlink" "$target"
+
+        is_same_inode "$target" "$symlink" ||
+          DIE_F "Somehow, target & symlink inodes differ. dir='%s';symlink='%s';target='%s'" "$dir" "$symlink" "$target"
+
+        # 3C) make a copy of the link for backup purposes in case we get interrupted (e.g: mysymlink.bak)
+        local symlink_bak=$symlink.bak
+        RUN_IF_FORCE cp --no-dereference "$symlink" "$symlink_bak"
+
+        # 3D) Create hard link to real target, replacing soft link
+        local hardlink=$symlink
+        RUN_IF_FORCE ln -f --logical "$target" "$hardlink"
+
+        # Triple-check, make sure the hard link and backup symlink point to the same place
+        if [[ "$FORCE" == TRUE ]] ; then
+          is_same_inode "$symlink_bak" "$hardlink" ||
+            DIE_F "Somehow, the symlink and hard link point to different files. hardlink='%s', target='%s'" "$hardlink" "$target"
+        fi
+
+        # 3E) remove backup symlink
+        RUN_IF_FORCE rm "$symlink_bak"
+      done
+    fi
+
+    # Only check for leftover symlinks if they were actually replaced (i.e. not in a dry-run)
+    if [[ $FORCE == TRUE ]] ; then
+      # 4)
+      local remaining_symlinks
+      remaining_symlinks=$(find_output_symlinks .)
+      [[ -z "$remaining_symlinks" ]] ||
+        DIE_F "Some symlinks remain, not removing workdir. dir='%s';remaining='%q'" "$dir" "${remaining_symlinks[*]}"
+    else
+      INFO_F "Dry-run, skipping the check for remaining symlinks"
+    fi
+
+    RUN_IF_FORCE rm -r --one-file-system --preserve-root "work"
+
+    RUN_IF_FORCE lock_aggregation .
+
+    INFO_F "Done. dir='%s'" "$dir"
+
+  ) | tee "$dir"/freezing.log
+}
+
+for directory in "$@" ; do  # Directories without a work/ subdirectory are reported and skipped
+  if [[ -d "$directory/work" ]] ; then
+    process_dir "$(readlink -f "$directory")"
+  else
+    INFO_F "Dir '%s' does not contain a workdir" "$directory"
+  fi
+done