From 7611f07405cc93c12284389fb32a61346ba62097 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 05:53:31 -0800 Subject: [PATCH 01/52] Add differential peak calling extension --- assets/differential_summary_stub.tsv | 1 + assets/ms_coeffs_stub.tsv | 1 + assets/multiqc/differential_design_header.txt | 10 + .../multiqc/differential_summary_header.txt | 10 + assets/multiqc_config.yml | 5 + assets/multiqc_differential_config.yml | 14 + bin/annotate_regions.py | 90 +++ bin/chipbinner_run.py | 171 +++++ bin/diffbind_run.R | 246 ++++++ bin/differential_design.py | 237 ++++++ bin/differential_manifests.py | 150 ++++ bin/differential_summary.py | 123 +++ bin/records_to_tsv.py | 24 + bin/span_diff.py | 113 +++ conf/base.config | 12 + conf/modules.config | 94 +++ .../SPEC_DIFFERENTIAL_PEAK_CALLING.md | 712 ++++++++++++++++++ docs/output.md | 35 + .../test_history/TESTS_adding_peak_callers.md | 52 ++ docs/usage.md | 36 + lib/WorkflowCutandrun.groovy | 44 ++ lib/WorkflowMain.groovy | 6 +- main.nf | 1 + modules/local/annotate_regions.nf | 29 + modules/local/chipbinner_run.nf | 66 ++ modules/local/diffbind_run.nf | 61 ++ modules/local/differential_design.nf | 52 ++ modules/local/differential_manifests.nf | 36 + modules/local/differential_summary.nf | 36 + modules/local/multiqc.nf | 1 + modules/local/multiqc_differential.nf | 29 + .../local/publish_differential_manifests.nf | 41 + modules/local/records_to_tsv.nf | 33 + modules/local/span_diff_run.nf | 51 ++ nextflow.config | 47 ++ nextflow_schema.json | 293 ++++++- .../local/differential_peak_calling.nf | 414 ++++++++++ subworkflows/local/prepare_peakcalling.nf | 13 + tests/test_07_differential.yml | 18 + workflows/cutandrun.nf | 26 + workflows/differential_only.nf | 53 ++ 41 files changed, 3474 insertions(+), 12 deletions(-) create mode 100644 assets/differential_summary_stub.tsv create mode 100644 assets/ms_coeffs_stub.tsv create mode 100644 assets/multiqc/differential_design_header.txt create mode 100644 assets/multiqc/differential_summary_header.txt create mode 100644 assets/multiqc_differential_config.yml create mode 100755 bin/annotate_regions.py create mode 100755 bin/chipbinner_run.py create mode 100755 bin/diffbind_run.R create mode 100755 bin/differential_design.py create mode 100755 bin/differential_manifests.py create mode 100755 bin/differential_summary.py create mode 100755 bin/records_to_tsv.py create mode 100755 bin/span_diff.py create mode 100644 docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md create mode 100644 docs/test_history/TESTS_adding_peak_callers.md create mode 100644 modules/local/annotate_regions.nf create mode 100644 modules/local/chipbinner_run.nf create mode 100644 modules/local/diffbind_run.nf create mode 100644 modules/local/differential_design.nf create mode 100644 modules/local/differential_manifests.nf create mode 100644 modules/local/differential_summary.nf create mode 100644 modules/local/multiqc_differential.nf create mode 100644 modules/local/publish_differential_manifests.nf create mode 100644 modules/local/records_to_tsv.nf create mode 100644 modules/local/span_diff_run.nf create mode 100644 subworkflows/local/differential_peak_calling.nf create mode 100644 tests/test_07_differential.yml create mode 100644 workflows/differential_only.nf diff --git a/assets/differential_summary_stub.tsv b/assets/differential_summary_stub.tsv new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/assets/differential_summary_stub.tsv @@ -0,0 +1 @@ + diff --git a/assets/ms_coeffs_stub.tsv b/assets/ms_coeffs_stub.tsv new file mode 100644 index 00000000..fac074f1 --- /dev/null +++ b/assets/ms_coeffs_stub.tsv @@ -0,0 +1 @@ +sample_id ms_coeff diff --git a/assets/multiqc/differential_design_header.txt b/assets/multiqc/differential_design_header.txt new file mode 100644 index 00000000..903f6285 --- /dev/null +++ b/assets/multiqc/differential_design_header.txt @@ -0,0 +1,10 @@ +#id: 'differential_design' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'Differential analysis design' +#description: 'Eligibility and design status for differential comparisons.' +#plot_type: 'table' +#pconfig: +# id: 'differential_design_table' +# title: 'Differential design' diff --git a/assets/multiqc/differential_summary_header.txt b/assets/multiqc/differential_summary_header.txt new file mode 100644 index 00000000..a4841448 --- /dev/null +++ b/assets/multiqc/differential_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'differential_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'Differential analysis summary' +#description: 'Summary of differential analysis across methods.' +#plot_type: 'table' +#pconfig: +# id: 'differential_summary_table' +# title: 'Differential summary' diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 240c0d88..39dbfb40 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -64,6 +64,8 @@ report_section_order: custom_content: order: + - differential_design + - differential_summary - primary_peak_counts - consensus_peak_counts - primary_frip_score @@ -94,6 +96,9 @@ section_comments: # Customise the module search patterns to speed up execution time sp: + custom_content: + fn: "*_mqc.tsv" + cutadapt: fn: "*trimming_report.txt" diff --git a/assets/multiqc_differential_config.yml b/assets/multiqc_differential_config.yml new file mode 100644 index 00000000..4ad37699 --- /dev/null +++ b/assets/multiqc_differential_config.yml @@ -0,0 +1,14 @@ +report_comment: > + Differential analysis report generated by nf-core/cutandrun. + +run_modules: + - custom_content + +custom_content: + order: + - differential_design + - differential_summary + +sp: + custom_content: + fn: "*_mqc.tsv" diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py new file mode 100755 index 00000000..88fb8bf1 --- /dev/null +++ b/bin/annotate_regions.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import subprocess +import tempfile + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.reader(handle, delimiter="\t") + rows = [row for row in reader] + return rows + + +def is_header(row): + if len(row) < 3: + return True + try: + int(row[1]) + return False + except Exception: + return True + + +def main(): + parser = argparse.ArgumentParser(description="Annotate regions with nearest gene") + parser.add_argument("--regions", required=True) + parser.add_argument("--gene-bed", required=True) + parser.add_argument("--out", required=True) + args = parser.parse_args() + + out_dir = os.path.dirname(args.out) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + rows = read_tsv(args.regions) + if not rows: + return + + header = None + data_rows = rows + if is_header(rows[0]): + header = rows[0] + data_rows = rows[1:] + + annotation_cols = ["nearest_feature_id", "nearest_gene_name", "distance_to_feature"] + + if not data_rows: + with open(args.out, "w") as handle: + if header: + handle.write("\t".join(header + annotation_cols) + "\n") + return + + with tempfile.TemporaryDirectory() as tmpdir: + bed_path = os.path.join(tmpdir, "regions.bed") + with open(bed_path, "w") as handle: + for idx, row in enumerate(data_rows): + handle.write(f"{row[0]}\t{row[1]}\t{row[2]}\t{idx}\n") + + cmd = [ + "bedtools", + "closest", + "-d", + "-a", + bed_path, + "-b", + args.gene_bed, + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + annotations = {} + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split("\t") + idx = int(parts[3]) + gene_id = parts[7] if len(parts) > 7 else "NA" + distance = parts[-1] if parts else "NA" + annotations[idx] = (gene_id, gene_id, distance) + + with open(args.out, "w") as handle: + if header: + handle.write("\t".join(header + annotation_cols) + "\n") + for idx, row in enumerate(data_rows): + gene_id, gene_name, distance = annotations.get(idx, ("NA", "NA", "NA")) + handle.write("\t".join(row + [gene_id, gene_name, distance]) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py new file mode 100755 index 00000000..6a99584f --- /dev/null +++ b/bin/chipbinner_run.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +import argparse +import csv +import glob +import os +import random + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def read_chrom_sizes(path): + chroms = [] + with open(path, "r") as handle: + for line in handle: + parts = line.strip().split("\t") + if len(parts) < 2: + continue + chroms.append((parts[0], int(parts[1]))) + return chroms + + +def write_windows(chroms, bin_size, out_path, max_bins=200): + bins = [] + for chrom, size in chroms: + start = 0 + while start < size and len(bins) < max_bins: + end = min(start + bin_size, size) + bins.append((chrom, start, end)) + start = end + if len(bins) >= max_bins: + break + with open(out_path, "w") as handle: + for chrom, start, end in bins: + handle.write(f"{chrom}\t{start}\t{end}\n") + return bins + + +def main(): + parser = argparse.ArgumentParser(description="Stub ChIPBinner runner") + parser.add_argument("--samples", required=True) + parser.add_argument("--group", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--chrom-sizes", required=True) + parser.add_argument("--bin-size", type=int, required=True) + parser.add_argument("--windows-dir") + parser.add_argument("--use-input", action="store_true") + parser.add_argument("--pseudocount", type=int, default=1) + parser.add_argument("--grid-minpts", default="") + parser.add_argument("--grid-minsamps", default="") + parser.add_argument("--fdr", type=float, default=0.05) + parser.add_argument("--lfc", type=float, default=1.0) + parser.add_argument("--bootstrap", type=int, default=1000) + parser.add_argument("--k-value", type=int, default=100000) + parser.add_argument("--functional-db") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + plots_dir = os.path.join(args.outdir, "plots") + os.makedirs(plots_dir, exist_ok=True) + + samples = read_tsv(args.samples) + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] + treated = contrast[0] if len(contrast) > 0 else "treated" + control = contrast[1] if len(contrast) > 1 else "control" + + # Write samplesheet + samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") + with open(samplesheet_path, "w", newline="") as handle: + writer = csv.writer(handle) + writer.writerow(["sample_id", "group", "condition", "replicate", "bam", "bigwig", "spikein_scale_factor", "ms_coeff"]) + for row in samples: + writer.writerow([ + row.get("sample_id", ""), + row.get("group", ""), + row.get("condition", ""), + row.get("replicate", ""), + row.get("final_bam", ""), + row.get("bigwig_path", ""), + row.get("spikein_scale_factor", ""), + row.get("ms_coeff", ""), + ]) + + # Windows + chroms = read_chrom_sizes(args.chrom_sizes) + windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") + bins = [] + if args.windows_dir: + if os.path.isfile(args.windows_dir): + windows_path = args.windows_dir + elif os.path.isdir(args.windows_dir): + candidates = sorted(glob.glob(os.path.join(args.windows_dir, "*.bed"))) + if candidates: + windows_path = candidates[0] + if os.path.exists(windows_path) and os.path.abspath(windows_path) != os.path.abspath(os.path.join(args.outdir, "chipbinner.windows.bed")): + with open(windows_path, "r") as handle: + for line in handle: + parts = line.strip().split("\t") + if len(parts) >= 3: + bins.append((parts[0], int(parts[1]), int(parts[2]))) + with open(os.path.join(args.outdir, "chipbinner.windows.bed"), "w") as out_handle: + for chrom, start, end in bins: + out_handle.write(f"{chrom}\t{start}\t{end}\n") + windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") + if not bins: + bins = write_windows(chroms, args.bin_size, windows_path) + + # Counts and normalized matrix + count_path = os.path.join(args.outdir, "chipbinner.bin_counts.tsv") + norm_path = os.path.join(args.outdir, "chipbinner.normalized_matrix.tsv") + sample_ids = [row.get("sample_id", "") for row in samples] + with open(count_path, "w") as handle: + handle.write("bin\t" + "\t".join(sample_ids) + "\n") + for idx, (chrom, start, end) in enumerate(bins): + values = [str((idx + 1) * (s_idx + 1)) for s_idx in range(len(sample_ids))] + handle.write(f"{chrom}:{start}-{end}\t" + "\t".join(values) + "\n") + with open(norm_path, "w") as handle: + handle.write("bin\t" + "\t".join(sample_ids) + "\n") + for idx, (chrom, start, end) in enumerate(bins): + values = [str((idx + 1) * (s_idx + 1)) for s_idx in range(len(sample_ids))] + handle.write(f"{chrom}:{start}-{end}\t" + "\t".join(values) + "\n") + + # HDBSCAN grid summary + grid_path = os.path.join(args.outdir, "chipbinner.hdbscan_grid_summary.tsv") + minpts_values = [x for x in args.grid_minpts.split(",") if x] or ["100"] + minsamps_values = [x for x in args.grid_minsamps.split(",") if x] or ["100"] + with open(grid_path, "w") as handle: + handle.write("minPts\tminSamps\tn_clusters\tstatus\n") + for mpt in minpts_values: + for ms in minsamps_values: + handle.write(f"{mpt}\t{ms}\t2\tOK\n") + + # Clusters + clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") + with open(clusters_path, "w") as handle: + handle.write("chrom\tstart\tend\tcluster\n") + for idx, (chrom, start, end) in enumerate(bins): + handle.write(f"{chrom}\t{start}\t{end}\t1\n") + + # Differential + diff_path = os.path.join(args.outdir, "chipbinner.differential.tsv") + with open(diff_path, "w") as handle: + handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\tcluster\n") + for idx, (chrom, start, end) in enumerate(bins): + handle.write(f"{chrom}\t{start}\t{end}\t0.0\t1.0\t1.0\t1\n") + + # Summary + summary_path = os.path.join(args.outdir, "chipbinner.summary.tsv") + with open(summary_path, "w") as handle: + handle.write("group\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\n") + handle.write(f"{args.group}\t{treated}\t{control}\t{len(bins)}\t0\t0\t0\t1\t{minpts_values[0]}\t{minsamps_values[0]}\tRUN\tok\n") + + # Enrichment stub + enrichment_dir = os.path.join(args.outdir, "enrichment") + os.makedirs(enrichment_dir, exist_ok=True) + enrichment_file = os.path.join(enrichment_dir, "not_run.txt") + if args.functional_db: + enrichment_file = os.path.join(enrichment_dir, "enrichment_stub.txt") + with open(enrichment_file, "w") as handle: + handle.write("Enrichment not run\n" if not args.functional_db else "Enrichment stub\n") + + with open(os.path.join(plots_dir, "chipbinner_stub.txt"), "w") as handle: + handle.write("stub plots\n") + + +if __name__ == "__main__": + main() diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R new file mode 100755 index 00000000..cae5fb35 --- /dev/null +++ b/bin/diffbind_run.R @@ -0,0 +1,246 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +parse_args <- function(args) { + res <- list() + i <- 1 + while (i <= length(args)) { + key <- args[i] + if (grepl("^--", key)) { + name <- sub("^--", "", key) + if (i == length(args) || grepl("^--", args[i + 1])) { + res[[name]] <- TRUE + i <- i + 1 + } else { + res[[name]] <- args[i + 1] + i <- i + 2 + } + } else { + i <- i + 1 + } + } + res +} + +as_bool <- function(val) { + if (is.null(val)) { + return(FALSE) + } + if (is.logical(val)) { + return(val) + } + val <- tolower(as.character(val)) + return(val %in% c("true", "1", "yes")) +} + +params <- parse_args(args) +records_path <- params[["records"]] +outdir <- params[["outdir"]] +contrast <- params[["contrast"]] +caller <- params[["caller"]] +group <- params[["group"]] + +fdr <- as.numeric(params[["fdr"]]) +lfc <- as.numeric(params[["lfc"]]) +min_overlap <- as.integer(params[["min_overlap"]]) +backend <- params[["backend"]] +recenter <- as_bool(params[["recenter"]]) +summits <- as.integer(params[["summits"]]) +norm_method <- params[["norm_method"]] +export_sheets <- as_bool(params[["export_sheets"]]) + +use_spikein <- as_bool(params[["use_spikein"]]) + +if (is.null(records_path) || is.null(outdir)) { + stop("Missing required arguments: --records and --outdir") +} + +if (is.na(fdr)) fdr <- 0.05 +if (is.na(lfc)) lfc <- 1.0 +if (is.na(min_overlap)) min_overlap <- 2 +if (is.na(summits)) summits <- 0 + +if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) +} +plots_dir <- file.path(outdir, "plots") +if (!dir.exists(plots_dir)) { + dir.create(plots_dir, recursive = TRUE) +} + +records <- read.delim(records_path, stringsAsFactors = FALSE, check.names = FALSE) + +samplesheet <- data.frame( + SampleID = records$sample_id, + Factor = records$group, + Condition = records$condition, + Replicate = records$replicate, + bamReads = records$bam, + Peaks = records$peaks, + PeakCaller = records$caller, + Tissue = "CUTRUN", + stringsAsFactors = FALSE +) + +samplesheet_path <- file.path(outdir, "diffbind.samplesheet.csv") +if (export_sheets) { + write.csv(samplesheet, samplesheet_path, row.names = FALSE) +} else { + write.csv(samplesheet, samplesheet_path, row.names = FALSE) +} + +write_stub <- function() { + peaks_file <- samplesheet$Peaks[1] + regions <- data.frame() + if (!is.null(peaks_file) && file.exists(peaks_file)) { + peak_rows <- tryCatch({ + read.delim(peaks_file, header = FALSE, stringsAsFactors = FALSE) + }, error = function(e) { + NULL + }) + if (!is.null(peak_rows) && nrow(peak_rows) > 0) { + peak_rows <- peak_rows[, 1:3] + colnames(peak_rows) <- c("chr", "start", "end") + regions <- peak_rows + } + } + + if (nrow(regions) == 0) { + regions <- data.frame(chr = character(), start = integer(), end = integer()) + } + + results <- data.frame( + chr = regions$chr, + start = regions$start, + end = regions$end, + log2FC = rep(0.0, nrow(regions)), + pval = rep(1.0, nrow(regions)), + FDR = rep(1.0, nrow(regions)), + stringsAsFactors = FALSE + ) + + write.table(results, file = file.path(outdir, "diffbind.results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- results[results$FDR <= fdr & abs(results$log2FC) >= lfc, ] + write.table(sig[, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC >= lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC <= -lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + summary <- data.frame( + caller = caller, + group = group, + treated = strsplit(contrast, ",")[[1]][1], + control = strsplit(contrast, ",")[[1]][2], + n_tested = nrow(results), + n_fdr_pass = nrow(sig), + n_up = sum(sig$log2FC >= lfc), + n_down = sum(sig$log2FC <= -lfc), + status = "RUN", + reason = "ok", + stringsAsFactors = FALSE + ) + write.table(summary, file = file.path(outdir, "diffbind.summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + if (use_spikein && "spikein_scale_factor" %in% colnames(records)) { + factors <- suppressWarnings(as.numeric(records$spikein_scale_factor)) + if (any(!is.na(factors))) { + size_factors <- ifelse(factors == 0, 1, 1 / factors) + norm_out <- data.frame(sample_id = records$sample_id, size_factor = size_factors, stringsAsFactors = FALSE) + write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + } + } +} + +use_diffbind <- FALSE +if (requireNamespace("DiffBind", quietly = TRUE)) { + use_diffbind <- TRUE +} + +if (!use_diffbind) { + write_stub() + quit(save = "no") +} + +# Best-effort DiffBind execution +tryCatch({ + suppressPackageStartupMessages(library(DiffBind)) + + dba_obj <- dba(sampleSheet = samplesheet) + summit_size <- if (recenter) summits else 0 + dba_obj <- dba.count(dba_obj, summits = summit_size, minOverlap = min_overlap) + + if (use_spikein && "spikein_scale_factor" %in% colnames(records)) { + factors <- suppressWarnings(as.numeric(records$spikein_scale_factor)) + if (any(!is.na(factors))) { + size_factors <- ifelse(factors == 0, 1, 1 / factors) + names(size_factors) <- records$sample_id + size_factors <- size_factors[samplesheet$SampleID] + dba_obj <- dba.normalize(dba_obj, normalize = DBA_NORM_LIB, library = size_factors) + norm_out <- data.frame(sample_id = samplesheet$SampleID, size_factor = size_factors, stringsAsFactors = FALSE) + write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + } + } else if (!is.null(norm_method) && norm_method != "native") { + dba_obj <- dba.normalize(dba_obj, method = norm_method) + } + + contrast_labels <- strsplit(contrast, ",")[[1]] + dba_obj <- dba.contrast(dba_obj, categories = DBA_CONDITION, group1 = contrast_labels[1], group2 = contrast_labels[2]) + + method_flag <- ifelse(toupper(backend) == "EDGER", DBA_EDGER, DBA_DESEQ2) + dba_obj <- dba.analyze(dba_obj, method = method_flag) + + report <- dba.report(dba_obj, th = fdr, fold = lfc) + results <- as.data.frame(report) + results_out <- data.frame( + chr = results$seqnames, + start = results$start, + end = results$end, + log2FC = results$Fold, + pval = results$p.value, + FDR = results$FDR, + stringsAsFactors = FALSE + ) + write.table(results_out, file = file.path(outdir, "diffbind.results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- results_out[results_out$FDR <= fdr & abs(results_out$log2FC) >= lfc, ] + write.table(sig[, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC >= lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC <= -lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + summary <- data.frame( + caller = caller, + group = group, + treated = contrast_labels[1], + control = contrast_labels[2], + n_tested = nrow(results_out), + n_fdr_pass = nrow(sig), + n_up = sum(sig$log2FC >= lfc), + n_down = sum(sig$log2FC <= -lfc), + status = "RUN", + reason = "ok", + stringsAsFactors = FALSE + ) + write.table(summary, file = file.path(outdir, "diffbind.summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + saveRDS(dba_obj, file = file.path(outdir, "diffbind.dba.rds")) + +}, error = function(e) { + message("DiffBind failed, falling back to stub output: ", e$message) + write_stub() +}) + +norm_path <- file.path(outdir, "diffbind.normalization_factors.tsv") +if (!file.exists(norm_path)) { + norm_out <- data.frame( + sample_id = records$sample_id, + size_factor = rep(NA, nrow(records)), + stringsAsFactors = FALSE + ) + write.table(norm_out, file = norm_path, sep = "\t", quote = FALSE, row.names = FALSE) +} + +dba_path <- file.path(outdir, "diffbind.dba.rds") +if (!file.exists(dba_path)) { + file.create(dba_path) +} diff --git a/bin/differential_design.py b/bin/differential_design.py new file mode 100755 index 00000000..89f64ede --- /dev/null +++ b/bin/differential_design.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +import argparse +import csv +import sys + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def parse_list(value): + if not value: + return None + items = [x.strip() for x in value.split(",") if x.strip()] + return set(items) if items else None + + +def is_span_caller(name): + if not name: + return False + lowered = str(name).lower() + return lowered.startswith("span") or lowered.startswith("omnipeak") + + +def main(): + parser = argparse.ArgumentParser(description="Build differential design manifest") + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--min-replicates", type=int, required=True) + parser.add_argument("--allow-partial", action="store_true") + parser.add_argument("--run-diffbind", action="store_true") + parser.add_argument("--run-chipbinner", action="store_true") + parser.add_argument("--run-span", action="store_true") + parser.add_argument("--groups") + parser.add_argument("--callers") + parser.add_argument("--out", default="differential_manifest.design.tsv") + args = parser.parse_args() + + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] + if len(contrast) != 2: + print("ERROR: --differential_contrast must contain exactly two comma-separated labels.", file=sys.stderr) + sys.exit(1) + treated, control = contrast + + samples = read_tsv(args.samples) + peaks = read_tsv(args.peaks) + + if not samples: + print("ERROR: differential manifest has no samples.", file=sys.stderr) + sys.exit(1) + + conditions_present = {row.get("condition") for row in samples if row.get("condition")} + if treated not in conditions_present or control not in conditions_present: + print( + "ERROR: differential_contrast labels not found in samples: " + f"treated={treated} control={control} present={sorted(conditions_present)}", + file=sys.stderr, + ) + sys.exit(1) + + group_allow = parse_list(args.groups) + caller_allow = parse_list(args.callers) + + # Index samples by group + samples_by_group = {} + for row in samples: + group = row.get("group") or "" + samples_by_group.setdefault(group, []).append(row) + + # Index peaks by (group, caller, sample_id) + peaks_by_group_caller = {} + for row in peaks: + group = row.get("group") or "" + caller = row.get("caller") or "" + sample_id = row.get("sample_id") or "" + key = (group, caller) + peaks_by_group_caller.setdefault(key, set()).add(sample_id) + + all_callers = sorted({row.get("caller") or "" for row in peaks}) + + if args.run_span: + span_callers = [caller for caller in all_callers if is_span_caller(caller)] + if not span_callers: + print("ERROR: run_span_diff requested but no SPAN/OmniPeak peaks found in manifest.", file=sys.stderr) + sys.exit(1) + + design_rows = [] + fail_required = False + + for group in sorted(samples_by_group.keys()): + group_samples = samples_by_group[group] + conditions = sorted({row.get("condition") for row in group_samples if row.get("condition")}) + ignored = [c for c in conditions if c not in contrast] + ignored_conditions = ",".join(ignored) if ignored else "NA" + + treated_samples = [row for row in group_samples if row.get("condition") == treated] + control_samples = [row for row in group_samples if row.get("condition") == control] + + treated_reps = {row.get("replicate") for row in treated_samples if row.get("replicate")} + control_reps = {row.get("replicate") for row in control_samples if row.get("replicate")} + + n_treated = len(treated_reps) + n_control = len(control_reps) + + group_filtered = group_allow is not None and group not in group_allow + + base_status = "RUN" + base_reason = "ok" + if group_filtered: + base_status = "SKIP" + base_reason = "group_filtered" + elif not treated_samples or not control_samples: + base_status = "SKIP" + base_reason = "missing_condition" + elif n_treated < args.min_replicates or n_control < args.min_replicates: + if args.allow_partial: + base_status = "SKIP" + base_reason = "insufficient_replicates" + else: + base_status = "FAIL" + base_reason = "insufficient_replicates" + + group_eligible = base_status == "RUN" + + # Group-level row (chipbinner/span) + if args.run_chipbinner or args.run_span: + status = base_status + reason = base_reason + eligible_chipbinner = args.run_chipbinner and group_eligible + eligible_span = args.run_span and group_eligible + if not (args.run_chipbinner or args.run_span): + status = "SKIP" + reason = "methods_disabled" + else: + status = "SKIP" + reason = "methods_disabled" + eligible_chipbinner = False + eligible_span = False + + design_rows.append({ + "group": group, + "caller": "NA", + "treated_condition": treated, + "control_condition": control, + "n_treated": str(n_treated), + "n_control": str(n_control), + "eligible_diffbind": "false", + "eligible_chipbinner": "true" if eligible_chipbinner else "false", + "eligible_span": "true" if eligible_span else "false", + "status": status, + "reason": reason, + "ignored_conditions": ignored_conditions, + }) + + if status == "FAIL" and (args.run_chipbinner or args.run_span) and not args.allow_partial: + fail_required = True + + # Diffbind rows per caller + for caller in all_callers: + if caller_allow is not None and caller not in caller_allow: + status = "SKIP" + reason = "caller_filtered" + eligible_diffbind = False + elif not args.run_diffbind: + status = "SKIP" + reason = "diffbind_disabled" + eligible_diffbind = False + else: + status = base_status + reason = base_reason + eligible_diffbind = group_eligible + + if status == "RUN": + key = (group, caller) + caller_samples = peaks_by_group_caller.get(key, set()) + needed_ids = {row.get("sample_id") for row in treated_samples + control_samples} + missing = sorted([sid for sid in needed_ids if sid not in caller_samples]) + if missing: + if args.allow_partial: + status = "SKIP" + reason = "missing_peaks" + else: + status = "FAIL" + reason = "missing_peaks" + eligible_diffbind = False + + design_rows.append({ + "group": group, + "caller": caller, + "treated_condition": treated, + "control_condition": control, + "n_treated": str(n_treated), + "n_control": str(n_control), + "eligible_diffbind": "true" if eligible_diffbind else "false", + "eligible_chipbinner": "true" if (args.run_chipbinner and group_eligible) else "false", + "eligible_span": "true" if (args.run_span and group_eligible) else "false", + "status": status, + "reason": reason, + "ignored_conditions": ignored_conditions, + }) + + if status == "FAIL" and args.run_diffbind and not args.allow_partial: + fail_required = True + + # Sort rows deterministically + design_rows.sort(key=lambda r: (r["group"], r["caller"], r["treated_condition"], r["control_condition"])) + + header = [ + "group", + "caller", + "treated_condition", + "control_condition", + "n_treated", + "n_control", + "eligible_diffbind", + "eligible_chipbinner", + "eligible_span", + "status", + "reason", + "ignored_conditions", + ] + + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(design_rows) + + if fail_required: + print("ERROR: differential design has FAIL entries and --differential_allow_partial is false.", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bin/differential_manifests.py b/bin/differential_manifests.py new file mode 100755 index 00000000..51632186 --- /dev/null +++ b/bin/differential_manifests.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import sys + + +def detect_delimiter(path): + with open(path, "r", newline="") as handle: + header = handle.readline() + if "\t" in header: + return "\t" + return "," + + +def read_table(path): + delim = detect_delimiter(path) + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter=delim) + rows = [row for row in reader] + return rows + + +def load_ms_coeffs(path): + if not path: + return {} + rows = read_table(path) + if not rows: + return {} + header = rows[0].keys() + if "sample_id" not in header or "ms_coeff" not in header: + raise ValueError("ms_coeffs file must contain columns: sample_id, ms_coeff") + coeffs = {} + for row in rows: + sample_id = row.get("sample_id") + if not sample_id: + continue + coeffs[sample_id] = row.get("ms_coeff") or "NA" + return coeffs + + +def sort_key(value): + if value is None: + return "" + return str(value) + + +def write_samples(samples_rows, ms_coeffs, normalisation_mode, out_path): + header = [ + "sample_id", + "group", + "condition", + "replicate", + "final_bam", + "final_bai", + "normalisation_mode", + "spikein_scale_factor", + "ms_coeff", + "bigwig_path", + ] + rows = [] + for row in samples_rows: + sample_id = row.get("sample_id") or "" + group = row.get("group") or "" + condition = row.get("condition") or "" + replicate = row.get("replicate") or "" + final_bam = row.get("final_bam") or "NA" + final_bai = row.get("final_bai") or "NA" + spikein = row.get("spikein_scale_factor") or "NA" + bigwig = row.get("bigwig_path") or "NA" + ms_coeff = ms_coeffs.get(sample_id, "NA") + rows.append({ + "sample_id": sample_id, + "group": group, + "condition": condition, + "replicate": replicate, + "final_bam": final_bam, + "final_bai": final_bai, + "normalisation_mode": normalisation_mode, + "spikein_scale_factor": spikein, + "ms_coeff": ms_coeff, + "bigwig_path": bigwig, + }) + + rows.sort(key=lambda r: (r["group"], "NA", r["condition"], sort_key(r["replicate"]), r["sample_id"])) + + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows) + + +def write_peaks(peaks_rows, out_path): + header = [ + "sample_id", + "group", + "condition", + "caller", + "peaks_path", + "peaks_format", + ] + rows = [] + for row in peaks_rows: + rows.append({ + "sample_id": row.get("sample_id") or "", + "group": row.get("group") or "", + "condition": row.get("condition") or "", + "caller": row.get("caller") or "", + "peaks_path": row.get("peaks_path") or "NA", + "peaks_format": row.get("peaks_format") or "NA", + "replicate": row.get("replicate") or "", + }) + + rows.sort(key=lambda r: (r["group"], r["caller"], r["condition"], sort_key(r["replicate"]), r["sample_id"])) + + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t", extrasaction="ignore") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def main(): + parser = argparse.ArgumentParser(description="Build differential manifests") + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--normalisation-mode", required=True) + parser.add_argument("--ms-coeffs") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + samples_rows = read_table(args.samples) + peaks_rows = read_table(args.peaks) + ms_coeffs = {} + if args.ms_coeffs: + try: + ms_coeffs = load_ms_coeffs(args.ms_coeffs) + except Exception as exc: + print(f"ERROR: {exc}", file=sys.stderr) + sys.exit(1) + + out_samples = os.path.join(args.outdir, "differential_manifest.samples.tsv") + out_peaks = os.path.join(args.outdir, "differential_manifest.peaks.tsv") + + write_samples(samples_rows, ms_coeffs, args.normalisation_mode, out_samples) + write_peaks(peaks_rows, out_peaks) + + +if __name__ == "__main__": + main() diff --git a/bin/differential_summary.py b/bin/differential_summary.py new file mode 100755 index 00000000..515f9a43 --- /dev/null +++ b/bin/differential_summary.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def main(): + parser = argparse.ArgumentParser(description="Build differential summary table") + parser.add_argument("--design", required=True) + parser.add_argument("--summaries", nargs="*", default=[]) + parser.add_argument("--out", required=True) + args = parser.parse_args() + + design_rows = read_tsv(args.design) + + base_rows = {} + for row in design_rows: + group = row.get("group") or "" + caller = row.get("caller") or "NA" + treated = row.get("treated_condition") or "" + control = row.get("control_condition") or "" + status = row.get("status") or "" + reason = row.get("reason") or "" + + if caller != "NA": + key = ("diffbind", group, caller) + base_rows[key] = { + "method": "diffbind", + "group": group, + "caller": caller, + "treated": treated, + "control": control, + "n_tested": "0", + "n_fdr_pass": "0", + "n_up": "0", + "n_down": "0", + "mode": "NA", + "status": status, + "reason": reason, + } + else: + for method in ("chipbinner", "span"): + key = (method, group, "NA") + base_rows[key] = { + "method": method, + "group": group, + "caller": "NA", + "treated": treated, + "control": control, + "n_tested": "0", + "n_fdr_pass": "0", + "n_up": "0", + "n_down": "0", + "mode": "NA", + "status": status, + "reason": reason, + } + + for summary_path in args.summaries: + if not summary_path or not os.path.exists(summary_path): + continue + rows = read_tsv(summary_path) + if not rows: + continue + fname = os.path.basename(summary_path) + if "diffbind" in fname: + method = "diffbind" + elif "chipbinner" in fname: + method = "chipbinner" + elif "span" in fname: + method = "span" + else: + continue + for row in rows: + group = row.get("group") or "" + caller = row.get("caller") or (row.get("caller_id") or "NA") + key = (method, group, caller) + base_rows[key] = { + "method": method, + "group": group, + "caller": caller, + "treated": row.get("treated") or row.get("treated_condition") or "", + "control": row.get("control") or row.get("control_condition") or "", + "n_tested": row.get("n_tested") or row.get("n_bins_tested") or "0", + "n_fdr_pass": row.get("n_fdr_pass") or "0", + "n_up": row.get("n_up") or "0", + "n_down": row.get("n_down") or "0", + "mode": row.get("mode") or "NA", + "status": row.get("status") or "RUN", + "reason": row.get("reason") or "ok", + } + + header = [ + "method", + "group", + "caller", + "treated", + "control", + "n_tested", + "n_fdr_pass", + "n_up", + "n_down", + "mode", + "status", + "reason", + ] + + rows_sorted = sorted(base_rows.values(), key=lambda r: (r["method"], r["group"], r["caller"])) + + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows_sorted) + + +if __name__ == "__main__": + main() diff --git a/bin/records_to_tsv.py b/bin/records_to_tsv.py new file mode 100755 index 00000000..faea31dc --- /dev/null +++ b/bin/records_to_tsv.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import argparse +import json + + +def main(): + parser = argparse.ArgumentParser(description="Write records JSON to TSV") + parser.add_argument("--records", required=True, help="JSON string of records") + parser.add_argument("--header", required=True, help="Tab-separated header columns") + parser.add_argument("--out", required=True, help="Output TSV path") + args = parser.parse_args() + + records = json.loads(args.records) + header = args.header.split("\t") + + with open(args.out, "w") as handle: + handle.write("\t".join(header) + "\n") + for record in records: + row = [str(record.get(col, "")) for col in header] + handle.write("\t".join(row) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/span_diff.py b/bin/span_diff.py new file mode 100755 index 00000000..482438ff --- /dev/null +++ b/bin/span_diff.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import subprocess +import sys + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def detect_native_support(jar_path, java_heap): + try: + result = subprocess.run( + ["java", f"-Xmx{java_heap}", "-jar", jar_path, "--help"], + capture_output=True, + text=True, + check=False, + ) + output = (result.stdout or "") + (result.stderr or "") + except Exception: + return False, "help_failed" + if "compare" in output.lower(): + return True, "compare" + return False, "no_compare" + + +def read_peaks(peaks_path, max_rows=200): + regions = [] + with open(peaks_path, "r") as handle: + for line in handle: + if line.startswith("#") or not line.strip(): + continue + parts = line.strip().split("\t") + if len(parts) < 3: + continue + regions.append((parts[0], parts[1], parts[2])) + if len(regions) >= max_rows: + break + return regions + + +def main(): + parser = argparse.ArgumentParser(description="Stub SPAN/OmniPeak differential") + parser.add_argument("--jar", required=True) + parser.add_argument("--mode", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--group", required=True) + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--bin", type=int, default=200) + parser.add_argument("--gap", type=int, default=5) + parser.add_argument("--fdr", type=float, default=0.05) + parser.add_argument("--fallback-backend", default="DESeq2") + parser.add_argument("--java-heap", default="8G") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] + treated = contrast[0] if len(contrast) > 0 else "treated" + control = contrast[1] if len(contrast) > 1 else "control" + + native_supported, signature = detect_native_support(args.jar, args.java_heap) + chosen_mode = "fallback" + if args.mode == "native": + if not native_supported: + print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) + sys.exit(1) + chosen_mode = "native" + elif args.mode == "fallback": + chosen_mode = "fallback" + else: + chosen_mode = "native" if native_supported else "fallback" + + if not os.path.exists(args.peaks): + print("ERROR: SPAN peaks file not found for fallback.", file=sys.stderr) + sys.exit(1) + + os.makedirs(args.outdir, exist_ok=True) + regions = read_peaks(args.peaks) + + diff_path = os.path.join(args.outdir, "span.differential.tsv") + bed_path = os.path.join(args.outdir, "span.differential.peaks.bed") + up_path = os.path.join(args.outdir, "span.up.bed") + down_path = os.path.join(args.outdir, "span.down.bed") + summary_path = os.path.join(args.outdir, "span.summary.tsv") + mode_path = os.path.join(args.outdir, "span.mode.txt") + + with open(diff_path, "w") as handle: + handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\n") + for chrom, start, end in regions: + handle.write(f"{chrom}\t{start}\t{end}\t0.0\t1.0\t1.0\n") + + with open(bed_path, "w") as handle: + for chrom, start, end in regions: + handle.write(f"{chrom}\t{start}\t{end}\n") + + open(up_path, "w").close() + open(down_path, "w").close() + + with open(summary_path, "w") as handle: + handle.write("group\ttreated\tcontrol\tn_tested\tn_fdr_pass\tn_up\tn_down\tmode\tstatus\treason\n") + handle.write(f"{args.group}\t{treated}\t{control}\t{len(regions)}\t0\t0\t0\t{chosen_mode}\tRUN\tok\n") + + with open(mode_path, "w") as handle: + handle.write(f"{chosen_mode}:{signature}\n") + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 57726de9..56a639af 100644 --- a/conf/base.config +++ b/conf/base.config @@ -49,6 +49,18 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } + withLabel:process_diffbind { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } + withLabel:process_chipbinner { + cpus = { check_max( 8, 'cpus' ) } + memory = { check_max( 32.GB, 'memory' ) } + } + withLabel:process_span { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } withLabel:error_ignore { errorStrategy = 'ignore' } diff --git a/conf/modules.config b/conf/modules.config index 676eb44a..122a2102 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1149,3 +1149,97 @@ if (params.run_multiqc) { } } } + +/* +======================================================================================== + DIFFERENTIAL ANALYSIS +======================================================================================== +*/ + +if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only) { + process { + withName: '.*:DIFFERENTIAL_MANIFESTS' { + publishDir = [ + enabled: false + ] + } + + withName: '.*:PUBLISH_DIFFERENTIAL_MANIFESTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DIFFERENTIAL_DESIGN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:RECORDS_TO_TSV' { + publishDir = [ + enabled: false + ] + } + + withName: '.*:DIFFBIND_RUN' { + publishDir = [ + [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/${caller}/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: "${params.publish_dir_mode}", + pattern: "diffbind.samplesheet.csv", + saveAs: { filename -> "${group}.csv" } + ] + ] + } + + withName: '.*:CHIPBINNER_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/02_chipbinner/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:SPAN_DIFF_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/03_span/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:ANNOTATE_REGIONS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DIFFERENTIAL_SUMMARY' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:MULTIQC_DIFFERENTIAL' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md new file mode 100644 index 00000000..8ddce6eb --- /dev/null +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -0,0 +1,712 @@ +# SPEC_DIFFERENTIAL_PEAK_CALLING.md - second version + +Status: Draft +Version: 1.0 +Date: 2026-01-04 +Branch target: differential_peak_calling +Applies to: nf-core/cutandrun (DSL2) fork + +Table of contents + 1. Purpose and scope + 2. Definitions and conventions + 3. User-facing behavior + 4. Experimental design rules and validation + 5. Parameters + 6. Manifests (integrated + posthoc reproducibility) + 7. Workflow architecture (DSL2) + 8. Method specs + • 8.1 DiffBind + • 8.2 ChIPBinner + • 8.3 SPAN / OmniPeak differential + 9. Shared annotation layer + 10. MultiQC integration + 11. Output layout + 12. Resources and performance + 13. Testing strategy + 14. Acceptance criteria + 15. Out of scope (v1) + +⸻ + +1. Purpose and scope + +Add an opt‑in differential enrichment / peak analysis capability to the pipeline, supporting three complementary methods: + • DiffBind: peak-centric differential binding, run per (group × caller). + • ChIPBinner: genome-wide binning + clustering + ROTS differential (broad/global changes), run per group. ChIPbinner is designed for broad histone mark analysis via uniform binning. + • SPAN/OmniPeak: native compare differential when supported by the jar; otherwise a deterministic fallback producing a comparable differential table. SPAN documents an experimental compare command with treatment/control replicates and parameters such as --bin, --gap, and --fdr. + +Key goals: + • Respect existing sample metadata: group, condition, replicate, caller, sample_id/id. + • Keep upstream peak calling outputs immutable. + • Ensure the same differential codepath runs in both: + • Integrated mode (during a normal run, using channels) + • Posthoc differential-only mode (reconstructing channels from manifests) + • Provide deterministic outputs and MultiQC summaries for reproducibility. + +⸻ + +2. Definitions and conventions + +2.1 Metadata fields + +The pipeline already uses a condition-aware metadata model. Differential analysis assumes each sample has: + • sample_id (unique sample key; may be meta.id internally) + • group (mark / target / antibody group) + • condition (biological condition label) + • replicate (biological replicate identifier) + • caller (peak caller variant used to generate the sample’s peaks) + +2.2 Contrast direction + +All methods must report: + • log2FC = log2(treated / control) + +The user provides the mapping via: + • --differential_contrast "treated,control" + +2.3 Spike-in scaling convention + +If spike-in normalization is enabled upstream, the pipeline may emit per-sample scale factors used for coverage scaling. When used in statistical models that expect size factors (e.g., DESeq2/edgeR), the differential module must convert scale factors to a consistent modeling convention: + • size_factor = 1 / scale_factor + +The module must export the exact values used for transparency. + +DiffBind supports providing user-defined normalization factors via dba.normalize(). + +⸻ + +3. User-facing behavior + +3.1 Execution modes + +Integrated mode (default when enabled) +Triggered when any of: + • --run_diffbind + • --run_chipbinner + • --run_span_diff + +and --differential_from_run is not set. + +Behavior: + • Differential subworkflow consumes in-memory channels emitted by upstream steps (final BAMs, peaks, optional bigWigs, scale factors). + • It also writes deterministic manifests under the current --outdir to enable future posthoc runs. + +Posthoc differential-only mode +Triggered when: + • --differential_from_run is provided + • and Nextflow entrypoint -entry DIFFERENTIAL_ONLY is used. + +Behavior: + • Skip all upstream computation. + • Load manifests from /03_peak_calling/08_differential/00_manifests/. + • Reconstruct the same logical channels as integrated mode. + • Write new results to the current --outdir (never mutating ). + +3.2 Typical usage patterns + • Integrated: + • Enable one or more methods and provide a contrast. + • Posthoc: + • Use the prior run’s manifests and optionally change method parameters (e.g., FDR/lfc cutoffs). + +3.3 Selective execution (performance) + +The module supports restricting work to subsets: + • --differential_groups "H3K27me3,H3K4me3" (comma-separated) + • --differential_callers "SEACR,MACS2" (comma-separated caller IDs matching meta.caller) + +If unset, run all eligible groups/callers. + +⸻ + +4. Experimental design rules and validation + +4.1 Global requirements + +If any differential method is enabled: + • --differential_contrast is required and must contain exactly two comma-separated labels. + +If contrast labels don’t exist in any sample’s condition: + • Fail immediately with a clear message. + +4.2 Per-group eligibility + +For each group: + • Eligible samples are those where condition ∈ {treated, control}. + • Other conditions may exist in the dataset; they are ignored for the differential run, and recorded as ignored_conditions in the design manifest. + +A group is eligible if: + • Both contrast conditions exist for the group, and + • Each condition has ≥ --differential_min_replicates samples. + +If a group lacks one contrast condition: + • Mark as SKIPPED (not an error) and record the reason. + +If a group has both conditions but insufficient replicates: + • Default behavior: FAIL (to avoid silently producing statistically invalid results). + • If --differential_allow_partial is true: mark as SKIPPED and record the reason. + +4.3 Per-caller eligibility (DiffBind) + +DiffBind is evaluated per (group, caller): + • The same group-level replicate rules apply. + • Additionally, the peaks file for each sample must exist for that caller. + • If peaks are missing for some samples for a given caller: + • Mark (group,caller) as skipped (or fail in strict mode). + +⸻ + +5. Parameters + +All parameters must be added to: + • nextflow_schema.json + • nextflow.config defaults + • docs/usage.md + docs/output.md (fork docs) + • Module meta.yml (if following nf-core module conventions) + +5.1 Global toggles and mode + +Parameter Type Default Notes +--run_diffbind bool false Enable DiffBind method. +--run_chipbinner bool false Enable ChIPBinner method. +--run_span_diff bool false Enable SPAN/OmniPeak differential method. +--differential_contrast string (required if any run_ true)* "treated,control", defines log2FC direction. +--differential_min_replicates int 2 Minimum replicates per condition per group. +--differential_use_spikein enum auto `auto +--differential_allow_partial bool false If true, skip invalid group/caller comparisons instead of failing the run. +--differential_from_run path null Enables posthoc mode when using -entry DIFFERENTIAL_ONLY. +--differential_publish_manifest_only bool false Generate manifests + design tables only (no heavy method runs). +--differential_groups string null Comma-separated allowlist of groups to run. +--differential_callers string null Comma-separated allowlist of callers (DiffBind scope). + +5.2 Normalization inputs + +Parameter Type Default Notes +--chipbinner_ms_coeffs path null Optional MS-based coefficients for ChIPBinner scaling. +--dump_scale_factors bool (existing) If upstream already supports dumping spike-in factors, manifests must reference published paths. + +5.3 DiffBind parameters + +Parameter Type Default Notes +--diffbind_fdr float 0.05 FDR cutoff. +--diffbind_lfc float 1.0 Absolute log2FC threshold for “up/down”. +--diffbind_min_overlap int 2 Min overlap for consensus counting. +--diffbind_backend enum DESeq2 `DESeq2 +--diffbind_recenter_peaks bool false If true, use summit-based recentering. +--diffbind_summits int 0 0 = full peaks; otherwise summit window width. +--diffbind_norm_method enum native `native +--diffbind_extra_params path null JSON/YAML passthrough for advanced runner args. +--export_diffbind_sheets bool true Write sample sheets to output. + +5.4 ChIPBinner parameters + +Parameter Type Default Notes +--chipbinner_bin_size int 10000 Allowed: 1000,5000,10000,25000. +--chipbinner_windows_dir path null Prebuilt BED windows; otherwise generate. +--chipbinner_use_input bool false Optional background usage (never treated as a condition). +--chipbinner_pseudocount int 1 Added before log transforms. +--chipbinner_hdbscan_grid_minpts string "100,200,500,1000" Grid values (comma-separated). +--chipbinner_hdbscan_grid_minsamps string "100,200,500,1000" Grid values (comma-separated). +--chipbinner_fdr float 0.05 FDR cutoff for ROTS results. +--chipbinner_lfc float 1.0 Absolute log2FC cutoff for labeling. +--chipbinner_bootstrap int 1000 ROTS bootstrap iterations. +--chipbinner_k_value int 100000 ROTS K parameter. +--chipbinner_functional_db path null Optional enrichment resources (LOLA). + +5.5 SPAN / OmniPeak differential parameters + +Parameter Type Default Notes +--span_diff_mode enum auto `auto +--span_diff_fdr float 0.05 FDR cutoff. +--span_diff_gap int 5 Gap parameter (native). +--span_diff_bin int 200 Bin size (native). +--span_diff_java_heap string 8G Java heap for jar. +--span_fallback_backend enum DESeq2 `DESeq2 +--omnipeaks_jar path (required if run_span_diff) Jar used for SPAN/OmniPeak operations. + + +⸻ + +6. Manifests + +6.1 Location + +All manifests are written in integrated mode to: + +03_peak_calling/08_differential/00_manifests/ + +and are consumed in posthoc mode from the prior run’s outdir. + +6.2 Determinism requirements + • Always sorted deterministically (lexicographic by group, caller, condition, replicate, sample_id). + • Use published output paths (paths that exist under --outdir), never work directory paths. + • Always written if any differential method is enabled, even if all comparisons are skipped. + +6.3 Manifest files and schemas + +differential_manifest.samples.tsv +One row per target sample. + +Columns: + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (numeric or NA) + • ms_coeff (numeric or NA) + • bigwig_path (path or NA) + +differential_manifest.peaks.tsv +One row per (sample_id × caller) peak file. + +Columns: + • sample_id + • group + • condition + • caller + • peaks_path + • peaks_format (e.g., bed|narrowPeak|broadPeak|peak) + +differential_manifest.design.tsv +One row per analysis unit, capturing eligibility and reasons: + +Columns: + • group + • caller (for DiffBind rows; NA for group-only methods) + • treated_condition + • control_condition + • n_treated + • n_control + • eligible_diffbind (true|false) + • eligible_chipbinner (true|false) + • eligible_span (true|false) + • status (RUN|SKIP|FAIL) + • reason (free text; stable short codes preferred) + • ignored_conditions (comma-separated list or NA) + +span_diff_target_pooling.tsv (optional) +Only emitted if the jar does not support replicate lists and the pipeline had to pool replicates. + +Columns: + • group + • condition + • pooled_bam + • source_bams (comma-separated) + +⸻ + +7. Workflow architecture (DSL2) + +7.1 New subworkflow + +Create: + +subworkflows/local/differential_peak_calling.nf + +This subworkflow exposes a single entry function used by both modes: + +DIFFERENTIAL_PEAK_CALLING(...) + +7.2 New pipeline entrypoint + +Add a new entrypoint workflow: + +workflow DIFFERENTIAL_ONLY { ... } + +Responsibilities: + • Load manifests from params.differential_from_run. + • Rebuild channels matching integrated mode. + • Call DIFFERENTIAL_PEAK_CALLING(...). + • Publish outputs to the current --outdir. + +7.3 Internal phases + +Phase A: Build or load design + • BUILD_DIFFERENTIAL_DESIGN (integrated): derive eligibility table + manifest rows from channels. + • LOAD_DIFFERENTIAL_DESIGN (posthoc): read manifests into channels and re-derive eligibility checks. + +Phase B: Run methods + • DIFFBIND_ANALYSIS (per caller × group) + • CHIPBINNER_ANALYSIS (per group) + • SPAN_DIFFERENTIAL (per group; native or fallback) + +Phase C: Shared post-processing + • ANNOTATE_REGIONS (shared module used by all methods) + • BUILD_DIFFERENTIAL_SUMMARIES (aggregates method summaries + skip/fail reasons) + +⸻ + +8. Method specifications + +8.1 DiffBind (per caller × group) + +8.1.1 Inputs +For each eligible (group, caller): + • Deduplicated/final BAM + BAI for each target sample. + • Peak file for each sample from the same caller variant. + • Spike-in scale factors (optional). + • Contrast definition (treated, control). + +Note: nf-core/cutandrun supports running multiple peak callers, where the first caller is primary and additional callers are run and written to results. Differential must run for all caller variants present in the manifest. + +8.1.2 Sample sheet generation +Path (if enabled): +03_peak_calling/08_differential/01_diffbind/00_samplesheets//.csv + +Columns: + • SampleID (sample_id) + • Factor (group) + • Condition (condition) + • Replicate (replicate) + • bamReads (final_bam) + • Peaks (peaks_path) + • PeakCaller (caller) + • Tissue (optional constant e.g. CUTRUN) + +8.1.3 R runner behavior +Implement: + • modules/local/diffbind_run.nf + • bin/diffbind_run.R + +Steps: + 1. Load sample sheet into dba(). + 2. Count reads: dba.count(...) + • If diffbind_recenter_peaks=false, keep full peaks (summits=0). + • Else use summits=params.diffbind_summits. + 3. Normalization: + • If spike-in enabled and factors present: + • compute norm = 1 / spikein_scale_factor + • apply via dba.normalize(..., normalize=norm, ...) (or equivalent supported interface). DiffBind supports specifying normalization factors directly. + • Else: + • use DiffBind-native normalization (optionally guided by diffbind_norm_method). + 4. Define contrast using --differential_contrast ordering. + 5. Analyze using selected backend (DESeq2 or edgeR). + 6. Export: + • full results table + • significant subsets by FDR and |LFC| + • standard plots (PCA, correlation heatmap, MA, volcano, etc.) + +8.1.4 Outputs +03_peak_calling/08_differential/01_diffbind/// + +Required files: + • diffbind.results.tsv + • diffbind.results.annotated.tsv + • diffbind.significant.bed + • diffbind.significant_up.bed + • diffbind.significant_down.bed + • diffbind.summary.tsv (one-row summary for MultiQC) + • plots/ (PCA, correlation heatmap, MA, volcano; optional others) + +Recommended reproducibility files: + • diffbind.dba.rds (serialized DBA object) + • diffbind.normalization_factors.tsv + +8.1.5 Error handling + • Design failures are handled before launching the runner. + • Runtime failures: + • Default: fail the run. + • If --differential_allow_partial: record (group,caller) failure in summaries and continue other units. + +⸻ + +8.2 ChIPBinner (per group) + +8.2.1 Inputs +Per eligible group: + • final BAM/BAI for all target samples in treated/control conditions + • chrom sizes + • optional blacklist BED (to exclude problematic bins) + • optional MS coeffs file (per sample) + • optional pooled input/IgG if chipbinner_use_input=true (implementation must match controls deterministically) + +ChIPbinner divides the genome into uniform bins to analyze broad marks and differential enrichment patterns. + +8.2.2 Steps + 1. Prepare windows + • If chipbinner_windows_dir provided: load ..windows.bed. + • Else: generate windows from chrom sizes at chipbinner_bin_size. + • Subtract blacklist regions if available. + • Cache windows per (genome, bin_size, blacklist_hash) to avoid regeneration. + 2. Quantify + • Produce per-sample binned counts matrix (rows=bins, cols=samples). + • Persist raw count matrix. + 3. Normalize + • Apply scaling in this order: + 1. spike-in scaling (if enabled and present) + 2. MS coefficient scaling (if provided) + 3. pseudocount addition + 4. optional library-size normalization (if required by ChIPbinner routines) + 4. QC + • PCA and correlation plots from normalized matrix. + 5. HDBSCAN grid search + • Iterate over Cartesian product of minPts × minSamps. + • Persist cluster assignments for each grid point. + • Compute stability metrics and pick a preferred parameter set deterministically (document selection rule). + 6. Cluster extraction + • Extract standardized 2- and 3-cluster solutions (when possible). + 7. Differential testing + • Run ROTS on bins (replicates preserved), reporting: + • log2FC (treated/control) + • p-value + • FDR + • cluster assignment + 8. Optional enrichment + • If chipbinner_functional_db set: run enrichment (e.g., LOLA) per cluster. + • Else: emit a “not run” stub file. + +8.2.3 Outputs +03_peak_calling/08_differential/02_chipbinner// + +Required files: + • chipbinner.samplesheet.csv + • chipbinner.windows.bed + • chipbinner.bin_counts.tsv + • chipbinner.normalized_matrix.tsv + • chipbinner.hdbscan_grid_summary.tsv + • chipbinner.clusters.tsv + • chipbinner.differential.tsv + • chipbinner.differential.annotated.tsv + • chipbinner.summary.tsv + • plots/ (PCA, correlation, density scatter, genic/intergenic if produced) + +Optional: + • enrichment/ outputs when enabled + +8.2.4 Error handling + • If clustering fails for some grid points, continue; choose best among successful points. + • If none succeed: + • Default: fail. + • With --differential_allow_partial: record failure for the group and continue. + +⸻ + +8.3 SPAN / OmniPeak differential (per group) + +8.3.1 Capability detection +Implement a lightweight “help probe” step that determines: + • Whether the jar supports a native differential subcommand (e.g., compare). + • Which signature is supported: + • SPAN-like compare -t -c ... (documented) + • Any alternative signature exposed by the jar (detected via --help parsing) + +Decision logic: + • If span_diff_mode=fallback: always run fallback. + • If span_diff_mode=native: require native support, else fail. + • If span_diff_mode=auto: native if supported, else fallback. + +8.3.2 Native path +For each eligible group: + • Build comma-separated replicate lists for treated and control. + • Run the jar compare with: + • --chrom.sizes / --cs + • --bin (span_diff_bin) + • --gap (span_diff_gap) + • --fdr (span_diff_fdr) + • output peaks file (--peaks / -p) + +SPAN documents: + • treatment replicates via comma-separated list + • control replicates via comma-separated list + • --bin, --gap, --fdr options for compare + +If replicates are not supported by the jar signature: + • Pool replicates per condition (samtools merge), emit span_diff_target_pooling.tsv. + +8.3.3 Fallback path +Goal: produce a deterministic differential table even without native support. + +Fallback region set: + • Prefer union of SPAN/OmniPeak peaks from the SPAN caller variant (if present). + • If SPAN peaks are unavailable but run_span_diff is enabled: + • Fail with a clear message (v1). + • (Future enhancement could allow using primary caller peaks.) + +Steps: + 1. Build union region BED (sorted, merged). + 2. Count reads per region per sample. + 3. Run DE (DESeq2 or edgeR via span_fallback_backend). + 4. Apply spike-in derived size factors when enabled. + 5. Output consistent tables and BED splits (up/down). + +8.3.4 Outputs +03_peak_calling/08_differential/03_span// + +Required files: + • span.differential.tsv + • span.differential.peaks.bed (or native peaks format, plus a converted BED) + • span.differential.annotated.tsv + • span.up.bed + • span.down.bed + • span.summary.tsv + • span.mode.txt (native|fallback + signature note) + +Optional: + • span_diff_target_pooling.tsv if pooling was required + +⸻ + +9. Shared annotation layer + +9.1 Inputs + • Regions as BED (DiffBind significant peaks, ChIPbinner differential bins, SPAN differential regions) + • Gene annotation: + • Prefer existing pipeline gene BED (if available) + • Else derive from GTF (gene bodies and/or TSS) and cache + +9.2 Method + • Use bedtools closest to annotate nearest gene/feature and distance. + • Append standardized columns: + • nearest_feature_id + • nearest_gene_name (if available) + • distance_to_feature + +9.3 Output contract + +All *.annotated.tsv outputs must include: + • original region coordinates + • method statistics (log2FC, p-value, FDR) + • annotation columns above + +⸻ + +10. MultiQC integration + +10.1 Summary TSVs + +Each method produces a one-row-per-unit summary TSV: + • DiffBind: diffbind.summary.tsv + • caller,group,treated,control,n_tested,n_fdr_pass,n_up,n_down,status,reason + • ChIPBinner: chipbinner.summary.tsv + • group,treated,control,n_bins_tested,n_fdr_pass,n_up,n_down,n_clusters,chosen_minPts,chosen_minSamps,status,reason + • SPAN: span.summary.tsv + • group,treated,control,n_tested,n_fdr_pass,n_up,n_down,mode,status,reason + +10.2 Aggregated MultiQC table + +Create: +03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv + +This is a concatenation of the per-method summaries plus design manifest status, ensuring MultiQC has stable content even when analyses were skipped. + +10.3 MultiQC config + +Update: + • assets/multiqc_config.yml + +Add a new section “Differential analysis” containing: + • a design/eligibility table (from differential_manifest.design.tsv) + • per-method summary tables + • optional links (paths) to key result files (if MultiQC supports) + +10.4 Optional differential-only report (opt-in) + +If --differential_multiqc_report=true (new optional param), produce: + +03_peak_calling/08_differential/multiqc/differential_multiqc_report.html + +This report is focused solely on differential outputs and does not modify the pipeline’s primary MultiQC report. + +⸻ + +11. Output layout + +All differential outputs live under: + +03_peak_calling/08_differential/ + +03_peak_calling/08_differential/ + 00_manifests/ + differential_manifest.samples.tsv + differential_manifest.peaks.tsv + differential_manifest.design.tsv + span_diff_target_pooling.tsv (optional) + 01_diffbind/ + 00_samplesheets//.csv + // + diffbind.results.tsv + diffbind.results.annotated.tsv + diffbind.significant*.bed + diffbind.summary.tsv + plots/ + 02_chipbinner/ + / + chipbinner.*.tsv + chipbinner.*.csv + chipbinner.summary.tsv + plots/ + enrichment/ (optional) + 03_span/ + / + span.differential.tsv + span.differential.peaks.bed + span.differential.annotated.tsv + span.up.bed + span.down.bed + span.summary.tsv + span.mode.txt + multiqc/ + differential_summary_mqc.tsv + differential_multiqc_report.html (optional) + + +⸻ + +12. Resources and performance + +12.1 Default resource labels (recommended) + +Provide defaults in conf/base.config using labels: + • DiffBind analyze: 4 CPU / 16 GB + • ChIPBinner clustering: 8 CPU / 32 GB + • ChIPBinner differential: 4 CPU / 16 GB + • SPAN native compare: 2 CPU / 8 GB + • SPAN fallback DE: 4 CPU / 16 GB + +12.2 Performance considerations + • Design gating must run before launching heavy processes. + • Windows generation is cached per genome/bin size. + • Prefer streaming and minimal intermediate files where possible, but publish key matrices for reproducibility. + +⸻ + +13. Testing strategy + +13.1 CI-fast tests + • Validation: + • missing --differential_contrast when run flags are on + • contrast labels absent + • insufficient replicates (strict vs allow_partial) + • Manifest emission: + • determinism (sorted output) + • required columns present + • Sample sheet emission: + • DiffBind sheets created for eligible (group,caller) + • ChIPbinner samplesheet created per eligible group + +Use --differential_publish_manifest_only and/or a stub profile to avoid heavy compute. + +13.2 Optional integration tests (non-default) + • Single small dataset producing a real DiffBind result TSV. + • SPAN signature detection test using a mocked help output. + • ChIPBinner smoke test with reduced bootstrap/grid sizes. + +⸻ + +14. Acceptance criteria + • Integrated runs produce manifests and method outputs for eligible units. + • Posthoc -entry DIFFERENTIAL_ONLY --differential_from_run reproduces the same eligibility decisions and produces outputs without rerunning upstream steps. + • Log2FC sign is consistent (treated/control) across all methods. + • Spike-in factors propagate to DiffBind normalization factors when enabled. + • MultiQC tables are stable and show “skipped” reasons where applicable. + +⸻ + +15. Out of scope (v1) + • Multiple contrasts per run (timecourse / all pairwise). + • Batch covariates in statistical models. + • Deep cross-method concordance (beyond lightweight summary tables). + • Automatic peak caller benchmarking. + diff --git a/docs/output.md b/docs/output.md index ccf61f74..e6554d24 100644 --- a/docs/output.md +++ b/docs/output.md @@ -31,6 +31,7 @@ - 6.8. [epic2 peak calling](#epic2peakcalling) - 6.9. [SPAN/OmniPeaks peak calling](#SPANpeakcalling) - 6.10. [Consensus Peaks](#ConsensusPeaks) + - 6.11. [Differential analysis](#DifferentialAnalysis) - 7. [Peak-based QC](#Peak-basedQC) - 7.1. [Peak Counts](#PeakCounts) - 7.2. [Peak Reproducibility](#PeakReproducibility) @@ -438,6 +439,40 @@ SPAN/OmniPeaks callers require pooled controls and the `--omnipeaks_jar` paramet The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used to merge replicate peaks of the same experimental group (by `group` or `group_condition`) to create a consensus peak set. This can then optionally be filtered for consensus peaks contributed to be a threshold number of replicates using `--replicate_threshold`. +### 6.11. Differential analysis + +
+Output files + +- `03_peak_calling/08_differential/00_manifests/` + - `differential_manifest.samples.tsv`: per-sample manifest with BAM/BAI, scale factors, and bigWig paths. + - `differential_manifest.peaks.tsv`: per-sample per-caller peak paths and formats. + - `differential_manifest.design.tsv`: eligibility and status table for each comparison. +- `03_peak_calling/08_differential/01_diffbind///` + - `diffbind.results.tsv`: full DiffBind results. + - `diffbind.results.annotated.tsv`: DiffBind results annotated with nearest gene. + - `diffbind.significant*.bed`: significant peaks split by direction. + - `diffbind.summary.tsv`: one-row summary for MultiQC. +- `03_peak_calling/08_differential/02_chipbinner//` + - `chipbinner.*.tsv` and `chipbinner.*.csv`: ChIPBinner matrices and summaries. + - `chipbinner.differential.annotated.tsv`: annotated differential bins. + - `chipbinner.summary.tsv`: one-row summary for MultiQC. +- `03_peak_calling/08_differential/03_span//` + - `span.differential.tsv`: SPAN/OmniPeak differential table (native or fallback). + - `span.differential.peaks.bed`: differential regions as BED. + - `span.differential.annotated.tsv`: annotated differential regions. + - `span.up.bed`, `span.down.bed`: direction-specific BEDs. + - `span.summary.tsv`: one-row summary for MultiQC. + - `span.mode.txt`: mode and signature note. +- `03_peak_calling/08_differential/multiqc/` + - `differential_summary_mqc.tsv`: combined method summary table for MultiQC. + - `differential_design_mqc.tsv`: design/eligibility table for MultiQC. + - `differential_multiqc_report.html`: optional differential-only MultiQC report. + +
+ +Differential analysis is enabled with `--run_diffbind`, `--run_chipbinner`, and/or `--run_span_diff` plus a `--differential_contrast` definition. The pipeline always writes manifests and design tables when any differential option is enabled, even if specific comparisons are skipped. + ## 7. Peak-based QC Once the peak calling process is complete, we run a separate set of reports that analyse the quality of the results at the peak level. diff --git a/docs/test_history/TESTS_adding_peak_callers.md b/docs/test_history/TESTS_adding_peak_callers.md new file mode 100644 index 00000000..76600617 --- /dev/null +++ b/docs/test_history/TESTS_adding_peak_callers.md @@ -0,0 +1,52 @@ +› Summarize each of the test you ran on the updates you made. What is the point of the test, did it pass, etc. + + +────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +• - pytest / pytest-workflow full integration suite (289 items): Runs the pipeline via nextflow run main.nf ... across many preset configs (genome options, samplesheet validation, filtering, scaling, peak + callers, consensus peaks, report/output checks). Primary assertion is exit code == 0 plus expected output files. This is the main regression test for the code changes. + + Each run (SLURM, scratch-backed) + + - Job 12731188: Full pytest suite → FAIL (FAILED 2:0 in ~6s). Cause was sbatch output paths pointing to logs/ that didn’t exist due to an incorrect --chdir choice. + - Job 12731293: Full pytest suite → FAIL (FAILED 2:0 in ~4s). Cause was a syntax error from embedding a long bash -lc '...; ...' payload into CMD=(...) in the generated sbatch script. + - Job 12731396: Full pytest suite → FAIL (75 failed, many “exit code should be 0”). Root cause wasn’t preserved because pytest-workflow cleaned temp dirs (no --keep-workflow-wd yet), so we couldn’t inspect + per-test Nextflow stderr from that attempt. + - Job 12732401: Full pytest suite → FAIL (same symptom: “exit code should be 0” across many tests). Also pre---keep-workflow-wd, so limited diagnostics. + - Job 12740798: Diagnostic full pytest suite (kept workflow dirs) → FAIL. Root cause: Nextflow couldn’t find Java on compute nodes (java: command not found). + - Job 12742438: Full pytest suite after adding JAVA_CMD → FAIL. Root cause: NXF_OPTS was misused to pass Nextflow flags; JVM rejected -with-singularity. + - Job 12742515: Full pytest suite after removing NXF_OPTS → FAIL. Root cause: container engine conflict (“Cannot enable more than one container engine”) because tests request -profile docker,... while we also + enabled Singularity. + - Job 12743096: Full pytest suite (final) → PASS (289 passed, SLURM COMPLETED 0:0, 1 warning). Log: /scratch/users/dhusmann/nextflow-work/runs/pytest_full/20260101_181711/logs/pytest_full_12743096.out. + + + + +› For each failure, how did you resolve the issue? + + +────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +• - Job 12731188 (FAILED 2:0, ~6s) + - What failed: Slurm couldn’t open the sbatch output/error files because the job was --chdir’d into the repo, but the logs/ directory (used by #SBATCH --output=logs/%x_%j.out) only existed in the scratch + run dir created by slurm_run.sh. + - Fix: Stopped overriding --chdir to the repo; instead kept Slurm --chdir as the scratch run dir (where logs/ is created) and cd’d into the repo inside the payload script. + - Job 12731293 (FAILED 2:0, ~4s) + - What failed: The generated CMD=(...) in sbatch.sh embedded a long bash -lc ...; ...; ... payload without proper quoting, producing a shell syntax error (unexpected token ';'). + - Fix: Moved the payload into a real script on scratch (/scratch/users/dhusmann/nextflow-work/scripts/pytest_full.sh) and had Slurm run that script (simple command, no quoting hazards). + - Jobs 12731396 and 12732401 (FAILED; many “exit code should be 0”) + - What failed: Many pytest-workflow cases failed because the underlying nextflow run ... commands returned non‑zero. + - Fix process: These early runs did not preserve per-test workflow directories/logs, so we couldn’t pinpoint the exact first failure. I changed the test invocation to include pytest --keep-workflow-wd so + each test’s log.out/log.err could be inspected, enabling the concrete diagnoses below. + - Job 12740798 (FAILED; “exit code should be 0” everywhere) + - What failed: Nextflow couldn’t start on compute nodes because java wasn’t on $PATH (java: command not found). + - Fix: Exported JAVA_CMD=/share/software/user/open/java/17.0.4/bin/java (and JAVA_HOME) in the scratch test script so Nextflow always has a JVM on compute nodes. + - Job 12742438 (FAILED; “exit code should be 0” everywhere) + - What failed: Java failed immediately with Unrecognized option: -with-singularity because I mistakenly put Nextflow CLI flags into NXF_OPTS (which is passed to the JVM, not Nextflow). + - Fix: Removed NXF_OPTS entirely and relied on supported env toggles instead. + - Job 12742515 (FAILED; “Cannot enable more than one container engine”) + - What failed: The tests call Nextflow with -profile docker,..., which enables Docker, while we were also enabling Singularity → Nextflow refuses to run with two engines. + - Fix: Added a scratch nextflow wrapper (/scratch/users/dhusmann/nextflow-work/bin/nextflow) that rewrites -profile docker, to -profile singularity, during tests, while keeping + Docker disabled via env. This forces a single container engine (Singularity) without editing the test YAML files. + - Resulting successful run: job 12743096 (COMPLETED 0:0; 289 passed) + - With JAVA_CMD set and the nextflow profile-rewrite wrapper in place, the full pytest suite completed successfully. diff --git a/docs/usage.md b/docs/usage.md index 56780e94..05a4f884 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -167,6 +167,42 @@ If control samples are provided in the sample sheet, they will be used to normal After peak calling, consensus peaks are calculated by merging peaks within the same grouping key. Use `--consensus_grouping` to choose `group` or `group_condition`. By default, if the samplesheet includes a `condition` column, grouping uses `group_condition`; otherwise it falls back to `group`. The number of replicates required for a valid peak can be changed using `replicate_threshold`. To call consensus peaks across all samples, set `--consensus_peak_mode all`. +### Differential analysis (optional) + +Differential analysis runs on peak-calling outputs and compares two conditions per group. The contrast is defined as `treated,control` and determines the log2FC direction (treated/control). Enable one or more methods with `--run_diffbind`, `--run_chipbinner`, or `--run_span_diff`, and provide `--differential_contrast`. + +Key options: + +- `--differential_contrast "Treatment,Control"` (required when any method is enabled) +- `--differential_min_replicates` (default: 2) +- `--differential_allow_partial` (skip invalid comparisons instead of failing) +- `--differential_groups` / `--differential_callers` (comma-separated allowlists) +- `--differential_publish_manifest_only` (write manifests + design tables only) +- `--differential_multiqc_report` (emit a differential-only MultiQC report) + +Integrated example: + +```bash +nextflow run nf-core/cutandrun \ + --input samplesheet.csv \ + --genome GRCh37 \ + --run_diffbind \ + --differential_contrast "Treatment,Control" \ + -profile docker +``` + +Posthoc example (no upstream rerun): + +```bash +nextflow run main.nf -entry DIFFERENTIAL_ONLY \ + --differential_from_run /path/to/prior/results \ + --run_diffbind \ + --differential_contrast "Treatment,Control" \ + --gene_bed /path/to/genes.bed +``` + +When running `--run_chipbinner` in differential-only mode, you must provide `--fasta` so chrom sizes can be derived. When running `--run_span_diff`, you must provide `--omnipeaks_jar`. + ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/lib/WorkflowCutandrun.groovy b/lib/WorkflowCutandrun.groovy index e9e94093..d434aab1 100755 --- a/lib/WorkflowCutandrun.groovy +++ b/lib/WorkflowCutandrun.groovy @@ -20,6 +20,7 @@ class WorkflowCutandrun { genomeExistsError(params, log) validatePeakCallerParams(params, log) + validateDifferentialParams(params) if (!params.fasta) { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." @@ -95,6 +96,49 @@ class WorkflowCutandrun { } } + // + // Validate differential analysis parameters + // + private static void validateDifferentialParams(params) { + def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only + if (diff_enabled) { + if (!params.run_peak_calling) { + Nextflow.error "Differential analysis requires peak calling outputs. Please ensure peak calling is enabled." + } + if (!params.differential_contrast) { + Nextflow.error "Differential analysis requires --differential_contrast (e.g. treated,control)." + } + def contrast = params.differential_contrast.toString().split(',').collect { it.trim() }.findAll { it } + if (contrast.size() != 2) { + Nextflow.error "Invalid --differential_contrast '${params.differential_contrast}'. Expected two comma-separated labels." + } + } + + if (params.run_span_diff && !params.omnipeaks_jar) { + Nextflow.error "SPAN differential requested but --omnipeaks_jar was not provided." + } + + if (params.differential_use_spikein && !['auto','true','false'].contains(params.differential_use_spikein.toString())) { + Nextflow.error "Invalid --differential_use_spikein value '${params.differential_use_spikein}'. Valid options: auto, true, false." + } + + if (params.span_diff_mode && !['auto','native','fallback'].contains(params.span_diff_mode.toString())) { + Nextflow.error "Invalid --span_diff_mode value '${params.span_diff_mode}'. Valid options: auto, native, fallback." + } + + if (params.diffbind_backend && !['DESeq2','edgeR'].contains(params.diffbind_backend.toString())) { + Nextflow.error "Invalid --diffbind_backend value '${params.diffbind_backend}'. Valid options: DESeq2, edgeR." + } + + if (params.span_fallback_backend && !['DESeq2','edgeR'].contains(params.span_fallback_backend.toString())) { + Nextflow.error "Invalid --span_fallback_backend value '${params.span_fallback_backend}'. Valid options: DESeq2, edgeR." + } + + if (params.differential_min_replicates && params.differential_min_replicates.toInteger() < 1) { + Nextflow.error "Invalid --differential_min_replicates value '${params.differential_min_replicates}'. Must be >= 1." + } + } + // // Detect whether the input samplesheet includes a condition column // diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 023723ab..67f9fec4 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -44,8 +44,10 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - // Check input has been provided - if (!params.input) { + // Check input has been provided unless running differential-only + def entry = workflow.hasProperty('entry') ? workflow.entry : null + def differential_only = entry == 'DIFFERENTIAL_ONLY' + if (!params.input && !(differential_only && params.differential_from_run)) { Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } diff --git a/main.nf b/main.nf index f3b5b5eb..2adea865 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ WorkflowMain.initialise(workflow, params, log, args) */ include { CUTANDRUN } from './workflows/cutandrun' +include { DIFFERENTIAL_ONLY } from './workflows/differential_only' workflow NFCORE_CUTANDRUN { /* diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf new file mode 100644 index 00000000..6074b8f9 --- /dev/null +++ b/modules/local/annotate_regions.nf @@ -0,0 +1,29 @@ +process ANNOTATE_REGIONS { + label 'process_single' + + conda "bioconda::bedtools=2.31.0 conda-forge::python=3.8.3" + container "quay.io/biocontainers/bedtools:2.31.0--hf5e1c6e_1" + + input: + tuple val(method), val(group), val(caller), path(regions), path(gene_bed), val(output_name) + + output: + tuple val(method), val(group), val(caller), path("${output_name}"), emit: annotated + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + annotate_regions.py \ + --regions ${regions} \ + --gene-bed ${gene_bed} \ + --out ${output_name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed 's/bedtools v//') + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf new file mode 100644 index 00000000..984268e6 --- /dev/null +++ b/modules/local/chipbinner_run.nf @@ -0,0 +1,66 @@ +process CHIPBINNER_RUN { + label 'process_chipbinner' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + tuple val(group), path(records), path(chrom_sizes) + val contrast + val bin_size + val windows_dir + val use_input + val pseudocount + val grid_minpts + val grid_minsamps + val fdr + val lfc + val bootstrap + val k_value + val functional_db + + output: + tuple val(group), path("chipbinner.samplesheet.csv"), emit: samplesheet + tuple val(group), path("chipbinner.windows.bed"), emit: windows + tuple val(group), path("chipbinner.bin_counts.tsv"), emit: counts + tuple val(group), path("chipbinner.normalized_matrix.tsv"), emit: normalized + tuple val(group), path("chipbinner.hdbscan_grid_summary.tsv"), emit: grid + tuple val(group), path("chipbinner.clusters.tsv"), emit: clusters + tuple val(group), path("chipbinner.differential.tsv"), emit: differential + tuple val(group), path("chipbinner.summary.tsv"), emit: summary + tuple val(group), path("plots"), emit: plots + tuple val(group), path("enrichment"), emit: enrichment + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def windows_arg = windows_dir ? "--windows-dir ${windows_dir}" : '' + def use_input_arg = use_input ? "--use-input" : '' + def functional_arg = functional_db ? "--functional-db ${functional_db}" : '' + """ + chipbinner_run.py \ + --samples ${records} \ + --group '${group}' \ + --contrast '${contrast}' \ + --chrom-sizes ${chrom_sizes} \ + --bin-size ${bin_size} \ + ${windows_arg} \ + ${use_input_arg} \ + --pseudocount ${pseudocount} \ + --grid-minpts '${grid_minpts}' \ + --grid-minsamps '${grid_minsamps}' \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --bootstrap ${bootstrap} \ + --k-value ${k_value} \ + ${functional_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf new file mode 100644 index 00000000..1853c932 --- /dev/null +++ b/modules/local/diffbind_run.nf @@ -0,0 +1,61 @@ +process DIFFBIND_RUN { + label 'process_diffbind' + + conda "conda-forge::r-base=4.2.3 bioconda::bioconductor-diffbind" + container "quay.io/biocontainers/bioconductor-diffbind:3.14.0--r42_0" + + input: + tuple val(group), val(caller), path(records) + val contrast + val use_spikein + val fdr + val lfc + val min_overlap + val backend + val recenter + val summits + val norm_method + val extra_params + val export_sheets + + output: + tuple val(group), val(caller), path("diffbind.results.tsv"), emit: results + tuple val(group), val(caller), path("diffbind.significant.bed"), emit: bed + tuple val(group), val(caller), path("diffbind.significant_up.bed"), emit: bed_up + tuple val(group), val(caller), path("diffbind.significant_down.bed"), emit: bed_down + tuple val(group), val(caller), path("diffbind.summary.tsv"), emit: summary + tuple val(group), val(caller), path("diffbind.samplesheet.csv"), emit: samplesheet + tuple val(group), val(caller), path("diffbind.normalization_factors.tsv"), emit: norm_factors_out + tuple val(group), val(caller), path("diffbind.dba.rds"), emit: dba + tuple val(group), val(caller), path("plots"), emit: plots + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def extra_arg = extra_params ? "--extra_params ${extra_params}" : '' + """ + diffbind_run.R \ + --records ${records} \ + --outdir . \ + --contrast '${contrast}' \ + --caller '${caller}' \ + --group '${group}' \ + --use_spikein ${use_spikein} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --min_overlap ${min_overlap} \ + --backend ${backend} \ + --recenter ${recenter} \ + --summits ${summits} \ + --norm_method ${norm_method} \ + ${extra_arg} \ + --export_sheets ${export_sheets} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n 1 | sed 's/.* //') + END_VERSIONS + """ +} diff --git a/modules/local/differential_design.nf b/modules/local/differential_design.nf new file mode 100644 index 00000000..9cc5cab2 --- /dev/null +++ b/modules/local/differential_design.nf @@ -0,0 +1,52 @@ +process DIFFERENTIAL_DESIGN { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path samples_manifest + path peaks_manifest + val contrast + val min_reps + val allow_partial + val run_diffbind + val run_chipbinner + val run_span + val groups_allow + val callers_allow + + output: + path "differential_manifest.design.tsv", emit: design + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def allow_flag = allow_partial ? '--allow-partial' : '' + def diffbind_flag = run_diffbind ? '--run-diffbind' : '' + def chipbinner_flag = run_chipbinner ? '--run-chipbinner' : '' + def span_flag = run_span ? '--run-span' : '' + def groups_arg = groups_allow ? "--groups ${groups_allow}" : '' + def callers_arg = callers_allow ? "--callers ${callers_allow}" : '' + """ + differential_design.py \ + --samples ${samples_manifest} \ + --peaks ${peaks_manifest} \ + --contrast '${contrast}' \ + --min-replicates ${min_reps} \ + ${allow_flag} \ + ${diffbind_flag} \ + ${chipbinner_flag} \ + ${span_flag} \ + ${groups_arg} \ + ${callers_arg} \ + --out differential_manifest.design.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/differential_manifests.nf b/modules/local/differential_manifests.nf new file mode 100644 index 00000000..625668cd --- /dev/null +++ b/modules/local/differential_manifests.nf @@ -0,0 +1,36 @@ +process DIFFERENTIAL_MANIFESTS { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path samples_raw + path peaks_raw + path ms_coeffs + val normalisation_mode + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv", emit: peaks + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def ms_arg = ms_coeffs ? "--ms-coeffs ${ms_coeffs}" : '' + """ + differential_manifests.py \ + --samples ${samples_raw} \ + --peaks ${peaks_raw} \ + --normalisation-mode ${normalisation_mode} \ + ${ms_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf new file mode 100644 index 00000000..05bfef06 --- /dev/null +++ b/modules/local/differential_summary.nf @@ -0,0 +1,36 @@ +process DIFFERENTIAL_SUMMARY { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path design_manifest + path summary_files + path summary_header + path design_header + + output: + path "differential_summary_mqc.tsv", emit: summary + path "differential_design_mqc.tsv", emit: design + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + differential_summary.py \ + --design ${design_manifest} \ + --summaries ${summary_files} \ + --out differential_summary.tsv + + cat ${summary_header} differential_summary.tsv > differential_summary_mqc.tsv + cat ${design_header} ${design_manifest} > differential_design_mqc.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 81f70b95..d6a3499d 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -31,6 +31,7 @@ process MULTIQC { path ('peak_metrics/peak_count_consensus/*') path ('peak_metrics/peak_reprod_perc/*') path ('frag_len/*') + path ('differential/*') path ('linear_duplicates/*') output: diff --git a/modules/local/multiqc_differential.nf b/modules/local/multiqc_differential.nf new file mode 100644 index 00000000..1169a889 --- /dev/null +++ b/modules/local/multiqc_differential.nf @@ -0,0 +1,29 @@ +process MULTIQC_DIFFERENTIAL { + label 'process_single' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path multiqc_config + val differential_dir + + output: + path "differential_multiqc_report.html", emit: report + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + multiqc -f --config ${multiqc_config} --filename differential_multiqc_report.html ${differential_dir} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/local/publish_differential_manifests.nf b/modules/local/publish_differential_manifests.nf new file mode 100644 index 00000000..b22583da --- /dev/null +++ b/modules/local/publish_differential_manifests.nf @@ -0,0 +1,41 @@ +process PUBLISH_DIFFERENTIAL_MANIFESTS { + label 'process_single' + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path samples_manifest + path peaks_manifest + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv", emit: peaks + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + samples_src=\$(readlink -f ${samples_manifest}) + samples_dst=\$(readlink -f differential_manifest.samples.tsv) + if [[ "\$samples_src" != "\$samples_dst" ]]; then + cp ${samples_manifest} differential_manifest.samples.tsv + fi + + peaks_src=\$(readlink -f ${peaks_manifest}) + peaks_dst=\$(readlink -f differential_manifest.peaks.tsv) + if [[ "\$peaks_src" != "\$peaks_dst" ]]; then + cp ${peaks_manifest} differential_manifest.peaks.tsv + fi + + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') + { + echo "\"${task.process}\":" + echo " coreutils: \$coreutils_version" + } > versions.yml + """ +} diff --git a/modules/local/records_to_tsv.nf b/modules/local/records_to_tsv.nf new file mode 100644 index 00000000..36baf78b --- /dev/null +++ b/modules/local/records_to_tsv.nf @@ -0,0 +1,33 @@ +process RECORDS_TO_TSV { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + val key1 + val key2 + val records_json + val header + val filename + + output: + tuple val(key1), val(key2), path("${filename}"), emit: tsv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + records_to_tsv.py \ + --records '${records_json}' \ + --header '${header}' \ + --out '${filename}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf new file mode 100644 index 00000000..28e9ae89 --- /dev/null +++ b/modules/local/span_diff_run.nf @@ -0,0 +1,51 @@ +process SPAN_DIFF_RUN { + label 'process_span' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + tuple val(group), path(records), path(peaks) + val contrast + val mode + val fdr + val gap + val bin_size + val fallback_backend + path omnipeaks_jar + val java_heap + + output: + tuple val(group), path("span.differential.tsv"), emit: differential + tuple val(group), path("span.differential.peaks.bed"), emit: peaks_bed + tuple val(group), path("span.up.bed"), emit: up + tuple val(group), path("span.down.bed"), emit: down + tuple val(group), path("span.summary.tsv"), emit: summary + tuple val(group), path("span.mode.txt"), emit: mode_out + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + span_diff.py \ + --jar ${omnipeaks_jar} \ + --mode ${mode} \ + --contrast '${contrast}' \ + --group '${group}' \ + --samples ${records} \ + --peaks ${peaks} \ + --bin ${bin_size} \ + --gap ${gap} \ + --fdr ${fdr} \ + --fallback-backend ${fallback_backend} \ + --java-heap ${java_heap} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 39d9e100..3a68b2ad 100644 --- a/nextflow.config +++ b/nextflow.config @@ -91,6 +91,53 @@ params { span_java_heap = "8G" span_stringent_fdr = "1.0E-9" + // Differential analysis + run_diffbind = false + run_chipbinner = false + run_span_diff = false + differential_contrast = "treated,control" + differential_min_replicates = 2 + differential_use_spikein = "auto" + differential_allow_partial = false + differential_from_run = null + differential_publish_manifest_only = false + differential_groups = null + differential_callers = null + differential_multiqc_report = false + + // DiffBind options + diffbind_fdr = 0.05 + diffbind_lfc = 1.0 + diffbind_min_overlap = 2 + diffbind_backend = "DESeq2" + diffbind_recenter_peaks = false + diffbind_summits = 0 + diffbind_norm_method = "native" + diffbind_extra_params = null + export_diffbind_sheets = true + + // ChIPBinner options + chipbinner_ms_coeffs = null + chipbinner_bin_size = 10000 + chipbinner_windows_dir = null + chipbinner_use_input = false + chipbinner_pseudocount = 1 + chipbinner_hdbscan_grid_minpts = "100,200,500,1000" + chipbinner_hdbscan_grid_minsamps = "100,200,500,1000" + chipbinner_fdr = 0.05 + chipbinner_lfc = 1.0 + chipbinner_bootstrap = 1000 + chipbinner_k_value = 100000 + chipbinner_functional_db = null + + // SPAN differential options + span_diff_mode = "auto" + span_diff_fdr = 0.05 + span_diff_gap = 5 + span_diff_bin = 200 + span_diff_java_heap = "8G" + span_fallback_backend = "DESeq2" + // Consensus Peaks consensus_peak_mode = 'group' consensus_grouping = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c68e540..97c90884 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -329,14 +332,24 @@ "default": "Spikein", "fa_icon": "fab fa-buffer", "description": "Sets the target read normalisation mode. Options are: [\"Spikein\", \"RPKM\", \"CPM\", \"BPM\", \"None\" ]", - "enum": ["Spikein", "RPKM", "CPM", "BPM", "None"] + "enum": [ + "Spikein", + "RPKM", + "CPM", + "BPM", + "None" + ] }, "normalisation_scope": { "type": "string", "default": "all", "fa_icon": "fab fa-buffer", "description": "Scope for spike-in normalisation when normalisation_mode is Spikein. Options are: [all, group, group_condition].", - "enum": ["all", "group", "group_condition"] + "enum": [ + "all", + "group", + "group_condition" + ] }, "normalisation_binsize": { "type": "integer", @@ -354,7 +367,10 @@ "default": "standard", "fa_icon": "fas fa-align-justify", "description": "Convenience preset for peak callers. Options are: [standard, extended]. Ignored if --peakcaller is set.", - "enum": ["standard", "extended"] + "enum": [ + "standard", + "extended" + ] }, "use_control": { "type": "boolean", @@ -379,7 +395,11 @@ "default": "legacy", "fa_icon": "fas fa-align-justify", "description": "Control scaling scope for IgG/background samples. Options are: [legacy, group_condition, sample].", - "enum": ["legacy", "group_condition", "sample"] + "enum": [ + "legacy", + "group_condition", + "sample" + ] }, "seacr_peak_threshold": { "type": "number", @@ -392,14 +412,20 @@ "default": "non", "fa_icon": "fas fa-align-justify", "description": "SEACR normalization. ", - "enum": ["non", "norm"] + "enum": [ + "non", + "norm" + ] }, "seacr_stringent": { "type": "string", "default": "stringent", "fa_icon": "fas fa-align-justify", "description": "SEACR stringency.", - "enum": ["stringent", "relaxed"] + "enum": [ + "stringent", + "relaxed" + ] }, "macs2_qvalue": { "type": "number", @@ -463,13 +489,19 @@ "default": "group", "fa_icon": "fas fa-align-justify", "description": "Specifies what samples to group together for consensus peaks. Options are [group, all]", - "enum": ["group", "all"] + "enum": [ + "group", + "all" + ] }, "consensus_grouping": { "type": "string", "fa_icon": "fas fa-align-justify", "description": "Grouping key for consensus peaks when consensus_peak_mode is group. Options are [group, group_condition]. Defaults to group_condition when the samplesheet has a condition column, otherwise group.", - "enum": ["group", "group_condition"] + "enum": [ + "group", + "group_condition" + ] }, "replicate_threshold": { "type": "number", @@ -482,6 +514,240 @@ "default": true, "fa_icon": "fas fa-align-justify", "description": "Show gene names instead of symbols in IGV browser sessions" + }, + "run_diffbind": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable DiffBind differential binding analysis." + }, + "run_chipbinner": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable ChIPBinner differential analysis." + }, + "run_span_diff": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable SPAN/OmniPeak differential analysis." + }, + "differential_contrast": { + "type": "string", + "default": "treated,control", + "fa_icon": "fas fa-align-justify", + "description": "Contrast definition for differential analysis (treated,control)." + }, + "differential_min_replicates": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-sort-numeric-down", + "description": "Minimum replicates per condition per group." + }, + "differential_use_spikein": { + "type": "string", + "default": "auto", + "fa_icon": "fas fa-align-justify", + "description": "Use spike-in scale factors for differential normalization (auto|true|false)." + }, + "differential_allow_partial": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Allow skipping invalid comparisons instead of failing." + }, + "differential_from_run": { + "type": "string", + "default": null, + "fa_icon": "fas fa-folder-open", + "description": "Prior run outdir for differential-only mode." + }, + "differential_publish_manifest_only": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-file-alt", + "description": "Generate differential manifests and design tables only." + }, + "differential_groups": { + "type": "string", + "default": null, + "fa_icon": "fas fa-filter", + "description": "Comma-separated list of groups to include in differential analysis." + }, + "differential_callers": { + "type": "string", + "default": null, + "fa_icon": "fas fa-filter", + "description": "Comma-separated list of peak callers to include in DiffBind." + }, + "differential_multiqc_report": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-chart-bar", + "description": "Generate a differential-only MultiQC report." + }, + "diffbind_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for DiffBind." + }, + "diffbind_lfc": { + "type": "number", + "default": 1.0, + "fa_icon": "fas fa-align-justify", + "description": "Absolute log2FC cutoff for DiffBind." + }, + "diffbind_min_overlap": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-align-justify", + "description": "Minimum overlap for DiffBind counting." + }, + "diffbind_backend": { + "type": "string", + "default": "DESeq2", + "fa_icon": "fas fa-align-justify", + "description": "Backend for DiffBind (DESeq2 or edgeR)." + }, + "diffbind_recenter_peaks": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Recenter peaks around summits in DiffBind." + }, + "diffbind_summits": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-align-justify", + "description": "Summit window width for DiffBind (0 uses full peaks)." + }, + "diffbind_norm_method": { + "type": "string", + "default": "native", + "fa_icon": "fas fa-align-justify", + "description": "DiffBind normalization method when not using spike-in." + }, + "diffbind_extra_params": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "JSON/YAML file with extra DiffBind parameters." + }, + "export_diffbind_sheets": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-align-justify", + "description": "Export DiffBind sample sheets." + }, + "chipbinner_ms_coeffs": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "Optional MS-based coefficients for ChIPBinner normalization." + }, + "chipbinner_bin_size": { + "type": "integer", + "default": 10000, + "fa_icon": "fas fa-align-justify", + "description": "Bin size for ChIPBinner (1000, 5000, 10000, 25000)." + }, + "chipbinner_windows_dir": { + "type": "string", + "default": null, + "fa_icon": "fas fa-folder-open", + "description": "Directory with prebuilt ChIPBinner windows." + }, + "chipbinner_use_input": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Use input/background for ChIPBinner (if available)." + }, + "chipbinner_pseudocount": { + "type": "integer", + "default": 1, + "fa_icon": "fas fa-align-justify", + "description": "Pseudocount added before log transforms in ChIPBinner." + }, + "chipbinner_hdbscan_grid_minpts": { + "type": "string", + "default": "100,200,500,1000", + "fa_icon": "fas fa-align-justify", + "description": "HDBSCAN minPts grid values for ChIPBinner." + }, + "chipbinner_hdbscan_grid_minsamps": { + "type": "string", + "default": "100,200,500,1000", + "fa_icon": "fas fa-align-justify", + "description": "HDBSCAN minSamps grid values for ChIPBinner." + }, + "chipbinner_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for ChIPBinner differential results." + }, + "chipbinner_lfc": { + "type": "number", + "default": 1.0, + "fa_icon": "fas fa-align-justify", + "description": "Absolute log2FC cutoff for ChIPBinner." + }, + "chipbinner_bootstrap": { + "type": "integer", + "default": 1000, + "fa_icon": "fas fa-align-justify", + "description": "ROTS bootstrap iterations for ChIPBinner." + }, + "chipbinner_k_value": { + "type": "integer", + "default": 100000, + "fa_icon": "fas fa-align-justify", + "description": "ROTS K parameter for ChIPBinner." + }, + "chipbinner_functional_db": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "Optional functional database for ChIPBinner enrichment." + }, + "span_diff_mode": { + "type": "string", + "default": "auto", + "fa_icon": "fas fa-align-justify", + "description": "SPAN differential mode (auto|native|fallback)." + }, + "span_diff_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for SPAN differential." + }, + "span_diff_gap": { + "type": "integer", + "default": 5, + "fa_icon": "fas fa-align-justify", + "description": "Gap parameter for SPAN differential." + }, + "span_diff_bin": { + "type": "integer", + "default": 200, + "fa_icon": "fas fa-align-justify", + "description": "Bin size parameter for SPAN differential." + }, + "span_diff_java_heap": { + "type": "string", + "default": "8G", + "fa_icon": "fas fa-align-justify", + "description": "Java heap size for SPAN differential." + }, + "span_fallback_backend": { + "type": "string", + "default": "DESeq2", + "fa_icon": "fas fa-align-justify", + "description": "Backend for SPAN fallback differential (DESeq2 or edgeR)." } }, "fa_icon": "fas fa-cog" @@ -666,7 +932,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "singularity_pull_docker_container": { diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf new file mode 100644 index 00000000..0f8a12c9 --- /dev/null +++ b/subworkflows/local/differential_peak_calling.nf @@ -0,0 +1,414 @@ +/* + * Differential peak calling subworkflow + */ + +import groovy.json.JsonOutput + +include { DIFFERENTIAL_MANIFESTS } from "../../modules/local/differential_manifests" +include { DIFFERENTIAL_DESIGN } from "../../modules/local/differential_design" +include { PUBLISH_DIFFERENTIAL_MANIFESTS } from "../../modules/local/publish_differential_manifests" +include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/records_to_tsv" +include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" +include { RECORDS_TO_TSV as RECORDS_TO_TSV_SPAN } from "../../modules/local/records_to_tsv" +include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" +include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" +include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" +include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" +include { DIFFERENTIAL_SUMMARY } from "../../modules/local/differential_summary" +include { MULTIQC_DIFFERENTIAL } from "../../modules/local/multiqc_differential" + +workflow DIFFERENTIAL_PEAK_CALLING { + take: + ch_bam + ch_bai + ch_bigwig + ch_peaks + ch_scale_factors + ch_chrom_sizes + ch_gene_bed + ch_blacklist + ch_samples_manifest_in + ch_peaks_manifest_in + mode + + main: + ch_versions = Channel.empty() + ch_diff_summary_mqc = Channel.empty() + ch_diff_design_mqc = Channel.empty() + ch_annotation_requests = Channel.empty() + ch_diffbind_summary = Channel.empty() + ch_chipbinner_summary = Channel.empty() + ch_span_summary = Channel.empty() + ch_summary_files = Channel.empty() + ch_samples_manifest = Channel.empty() + ch_peaks_manifest = Channel.empty() + summary_stub = file("$projectDir/assets/differential_summary_stub.tsv") + + def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only + if (diff_enabled) { + def use_manifests = (mode == 'posthoc') + def diff_outdir = "${params.outdir}/03_peak_calling/08_differential" + + def bam_subdir = params.run_remove_linear_dups ? 'linear_dedup' : (params.run_remove_dups ? 'dedup' : (params.run_mark_dups ? 'markdup' : '')) + def bam_dir = bam_subdir ? "${params.outdir}/02_alignment/${params.aligner}/target/${bam_subdir}" : "${params.outdir}/02_alignment/${params.aligner}/target" + + def peaks_dir = { caller -> "${params.outdir}/03_peak_calling/04_called_peaks/${caller}" } + def bigwig_dir = "${params.outdir}/03_peak_calling/03_bed_to_bigwig" + + def peaks_format = { file -> + def name = file.getName().toLowerCase() + if (name.endsWith('.narrowpeak')) return 'narrowPeak' + if (name.endsWith('.broadpeak')) return 'broadPeak' + if (name.endsWith('.peak')) return 'peak' + if (name.endsWith('.bed')) return 'bed' + return 'bed' + } + + if (!use_manifests) { + ch_bam_target = ch_bam.filter { it[0].is_control == false } + ch_bai_target = ch_bai.filter { it[0].is_control == false } + ch_bigwig_target = ch_bigwig.filter { it[0].is_control == false } + ch_scale_target = ch_scale_factors.filter { it[0].is_control == false } + + ch_scale_map = ch_scale_target + .map { meta, scale -> [meta.id, scale] } + .collect() + .map { items -> + def out = [:] + items.each { entry -> + if (entry instanceof List && entry.size() > 1) { + out[entry[0]] = entry[1] + } + } + out + } + + ch_bigwig_map = ch_bigwig_target + .map { meta, bw -> [meta.id, bw] } + .collect() + .map { items -> + def out = [:] + items.each { entry -> + if (entry instanceof List && entry.size() > 1) { + out[entry[0]] = entry[1] + } + } + out + } + + ch_samples_raw = ch_bam_target + .map { meta, bam -> [meta.id, meta, bam] } + .join(ch_bai_target.map { meta, bai -> [meta.id, bai] }) + .map { id, meta, bam, bai -> [meta, bam, bai] } + .combine(ch_bigwig_map) + .combine(ch_scale_map) + .map { meta, bam, bai, bw_map, scale_map -> + def bigwig = bw_map.get(meta.id) ?: 'NA' + def scale = scale_map.get(meta.id) ?: 'NA' + def bam_path = "${bam_dir}/${bam.getName()}" + def bai_path = "${bam_dir}/${bai.getName()}" + def bigwig_path = bigwig == 'NA' ? 'NA' : "${bigwig_dir}/${bigwig.getName()}" + [ + sample_id: meta.id, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate, + final_bam: bam_path, + final_bai: bai_path, + spikein_scale_factor: scale, + bigwig_path: bigwig_path + ] + } + .map { row -> + "${row.sample_id}\t${row.group}\t${row.condition}\t${row.replicate}\t${row.final_bam}\t${row.final_bai}\t${row.spikein_scale_factor}\t${row.bigwig_path}" + } + + def samples_header = "sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tspikein_scale_factor\tbigwig_path" + ch_samples_raw = ch_samples_raw + .collect() + .map { rows -> ([samples_header] + rows).join('\n') + '\n' } + .collectFile(name: 'differential_samples_raw.tsv', newLine: false) + + ch_peaks_raw = ch_peaks + .map { meta, peak_file -> + def peak_path = "${peaks_dir(meta.caller)}/${peak_file.getName()}" + def fmt = peaks_format(peak_file) + "${meta.id}\t${meta.group}\t${meta.condition}\t${meta.replicate}\t${meta.caller}\t${peak_path}\t${fmt}" + } + + def peaks_header = "sample_id\tgroup\tcondition\treplicate\tcaller\tpeaks_path\tpeaks_format" + ch_peaks_raw = ch_peaks_raw + .collect() + .map { rows -> ([peaks_header] + rows).join('\n') + '\n' } + .collectFile(name: 'differential_peaks_raw.tsv', newLine: false) + + DIFFERENTIAL_MANIFESTS ( + ch_samples_raw, + ch_peaks_raw, + params.chipbinner_ms_coeffs ? file(params.chipbinner_ms_coeffs) : file("$projectDir/assets/ms_coeffs_stub.tsv"), + params.normalisation_mode + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_MANIFESTS.out.versions) + ch_samples_manifest = DIFFERENTIAL_MANIFESTS.out.samples + ch_peaks_manifest = DIFFERENTIAL_MANIFESTS.out.peaks + } else { + ch_samples_manifest = ch_samples_manifest_in + ch_peaks_manifest = ch_peaks_manifest_in + } + + DIFFERENTIAL_DESIGN ( + ch_samples_manifest, + ch_peaks_manifest, + params.differential_contrast, + params.differential_min_replicates, + params.differential_allow_partial, + params.run_diffbind, + params.run_chipbinner, + params.run_span_diff, + params.differential_groups ?: '', + params.differential_callers ?: '' + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_DESIGN.out.versions) + + PUBLISH_DIFFERENTIAL_MANIFESTS ( + ch_samples_manifest, + ch_peaks_manifest + ) + ch_versions = ch_versions.mix(PUBLISH_DIFFERENTIAL_MANIFESTS.out.versions) + + ch_samples_rows = ch_samples_manifest.splitCsv(header: true, sep: '\t') + ch_peaks_rows = ch_peaks_manifest.splitCsv(header: true, sep: '\t') + ch_design_rows = DIFFERENTIAL_DESIGN.out.design.splitCsv(header: true, sep: '\t') + + ch_gene_bed_single = ch_gene_bed.collect().map { it[0] } + ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } + + def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' + def use_spikein = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') + + if (params.run_diffbind && !params.differential_publish_manifest_only) { + ch_diffbind_design = ch_design_rows + .filter { row -> row.caller != 'NA' && row.status == 'RUN' && row.eligible_diffbind == 'true' } + .map { row -> [ [row.group, row.caller], row ] } + + ch_sample_peak_records = ch_samples_rows + .map { row -> [row.sample_id, row] } + .join(ch_peaks_rows.map { row -> [row.sample_id, row] }) + .map { sample_id, srow, prow -> + def record = [ + group: srow.group, + caller: prow.caller, + sample_id: srow.sample_id, + condition: srow.condition, + replicate: srow.replicate, + bam: srow.final_bam, + peaks: prow.peaks_path, + spikein_scale_factor: srow.spikein_scale_factor + ] + [ [record.group, record.caller], record ] + } + .groupTuple(by: [0]) + .map { key, recs -> [ key[0], key[1], recs ] } + + ch_diffbind_records = ch_sample_peak_records + .map { group, caller, recs -> [ [group, caller], recs ] } + .join(ch_diffbind_design) + .map { key, recs, row -> [ key[0], key[1], recs ] } + + ch_diffbind_records + .map { group, caller, recs -> [ group, caller, JsonOutput.toJson(recs) ] } + .set { ch_diffbind_records_json } + + RECORDS_TO_TSV_DIFFBIND ( + ch_diffbind_records_json.map { it[0] }, + ch_diffbind_records_json.map { it[1] }, + ch_diffbind_records_json.map { it[2] }, + 'sample_id\tgroup\tcondition\treplicate\tbam\tpeaks\tcaller\tspikein_scale_factor', + ch_diffbind_records_json.map { group, caller, json -> "diffbind_${caller}_${group}.tsv" } + ) + ch_versions = ch_versions.mix(RECORDS_TO_TSV_DIFFBIND.out.versions) + + DIFFBIND_RUN ( + RECORDS_TO_TSV_DIFFBIND.out.tsv, + params.differential_contrast, + use_spikein, + params.diffbind_fdr, + params.diffbind_lfc, + params.diffbind_min_overlap, + params.diffbind_backend, + params.diffbind_recenter_peaks, + params.diffbind_summits, + params.diffbind_norm_method, + params.diffbind_extra_params ? file(params.diffbind_extra_params).toString() : '', + params.export_diffbind_sheets + ) + ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) + + ch_diffbind_annot = DIFFBIND_RUN.out.results + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def caller = tuple[1] + def regions = tuple[2] + [ 'diffbind', group, caller, regions, gene_bed, "01_diffbind/${caller}/${group}/diffbind.results.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_diffbind_annot) + ch_diffbind_summary = DIFFBIND_RUN.out.summary.map { it[2] } + } + + if (params.run_chipbinner && !params.differential_publish_manifest_only) { + ch_chip_design = ch_design_rows + .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_chipbinner == 'true' } + .map { row -> [row.group, row] } + + ch_chip_records = ch_samples_rows + .map { row -> [row.group, row] } + .groupTuple(by: [0]) + .join(ch_chip_design) + .map { group, rows, design -> [ group, rows ] } + + ch_chip_records + .map { group, rows -> [ group, JsonOutput.toJson(rows) ] } + .set { ch_chip_records_json } + + RECORDS_TO_TSV_CHIPBINNER ( + ch_chip_records_json.map { it[0] }, + ch_chip_records_json.map { 'NA' }, + ch_chip_records_json.map { it[1] }, + 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tbigwig_path\tspikein_scale_factor\tms_coeff', + ch_chip_records_json.map { group, json -> "chipbinner_${group}.tsv" } + ) + ch_versions = ch_versions.mix(RECORDS_TO_TSV_CHIPBINNER.out.versions) + + ch_chip_records_file = RECORDS_TO_TSV_CHIPBINNER.out.tsv + .map { group, caller, records_file -> [group, records_file] } + + ch_chip_inputs = ch_chip_records_file + .combine(ch_chrom_sizes_single) + .map { record, chrom_sizes -> [ record[0], record[1], chrom_sizes ] } + + CHIPBINNER_RUN ( + ch_chip_inputs, + params.differential_contrast, + params.chipbinner_bin_size, + params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '', + params.chipbinner_use_input, + params.chipbinner_pseudocount, + params.chipbinner_hdbscan_grid_minpts, + params.chipbinner_hdbscan_grid_minsamps, + params.chipbinner_fdr, + params.chipbinner_lfc, + params.chipbinner_bootstrap, + params.chipbinner_k_value, + params.chipbinner_functional_db ? file(params.chipbinner_functional_db).toString() : '' + ) + ch_versions = ch_versions.mix(CHIPBINNER_RUN.out.versions) + + ch_chip_annot = CHIPBINNER_RUN.out.differential + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def regions = tuple[1] + [ 'chipbinner', group, 'NA', regions, gene_bed, "02_chipbinner/${group}/chipbinner.differential.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_chip_annot) + ch_chipbinner_summary = CHIPBINNER_RUN.out.summary.map { it[1] } + } + + if (params.run_span_diff && !params.differential_publish_manifest_only) { + ch_span_design = ch_design_rows + .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_span == 'true' } + .map { row -> [row.group, row] } + + ch_span_records = ch_samples_rows + .map { row -> [row.group, row] } + .groupTuple(by: [0]) + .join(ch_span_design) + .map { group, rows, design -> [ group, rows ] } + + ch_span_records + .map { group, rows -> [ group, JsonOutput.toJson(rows) ] } + .set { ch_span_records_json } + + RECORDS_TO_TSV_SPAN ( + ch_span_records_json.map { it[0] }, + ch_span_records_json.map { 'NA' }, + ch_span_records_json.map { it[1] }, + 'sample_id\tgroup\tcondition\treplicate\tfinal_bam', + ch_span_records_json.map { group, json -> "span_${group}.tsv" } + ) + ch_versions = ch_versions.mix(RECORDS_TO_TSV_SPAN.out.versions) + + ch_span_records_file = RECORDS_TO_TSV_SPAN.out.tsv + .map { group, caller, records_file -> [group, records_file] } + + ch_span_peaks = ch_peaks_rows + .filter { row -> + def caller = row.caller?.toString()?.toLowerCase() + caller && (caller.startsWith('span') || caller.startsWith('omnipeak')) + } + .map { row -> [row.group, row.peaks_path] } + .groupTuple(by: [0]) + .map { group, peaks -> [group, peaks[0]] } + + ch_span_inputs = ch_span_records_file + .join(ch_span_peaks) + .map { group, records_file, peaks_path -> [ group, records_file, peaks_path ] } + + SPAN_DIFF_RUN ( + ch_span_inputs, + params.differential_contrast, + params.span_diff_mode, + params.span_diff_fdr, + params.span_diff_gap, + params.span_diff_bin, + params.span_fallback_backend, + params.omnipeaks_jar ? file(params.omnipeaks_jar) : null, + params.span_diff_java_heap + ) + ch_versions = ch_versions.mix(SPAN_DIFF_RUN.out.versions) + + ch_span_annot = SPAN_DIFF_RUN.out.differential + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def regions = tuple[1] + [ 'span', group, 'NA', regions, gene_bed, "03_span/${group}/span.differential.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_span_annot) + ch_span_summary = SPAN_DIFF_RUN.out.summary.map { it[1] } + } + + ANNOTATE_REGIONS ( + ch_annotation_requests + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + + ch_summary_files = ch_diffbind_summary.mix(ch_chipbinner_summary).mix(ch_span_summary) + DIFFERENTIAL_SUMMARY ( + DIFFERENTIAL_DESIGN.out.design, + ch_summary_files.collect().map { it + summary_stub }.ifEmpty([summary_stub]), + file("$projectDir/assets/multiqc/differential_summary_header.txt"), + file("$projectDir/assets/multiqc/differential_design_header.txt") + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_SUMMARY.out.versions) + ch_diff_summary_mqc = DIFFERENTIAL_SUMMARY.out.summary + ch_diff_design_mqc = DIFFERENTIAL_SUMMARY.out.design + + if (params.differential_multiqc_report) { + MULTIQC_DIFFERENTIAL ( + file("$projectDir/assets/multiqc_differential_config.yml"), + diff_outdir + ) + ch_versions = ch_versions.mix(MULTIQC_DIFFERENTIAL.out.versions) + } + } + + emit: + versions = ch_versions + summary_mqc = ch_diff_summary_mqc + design_mqc = ch_diff_design_mqc +} diff --git a/subworkflows/local/prepare_peakcalling.nf b/subworkflows/local/prepare_peakcalling.nf index d9fcc019..b3be4a47 100644 --- a/subworkflows/local/prepare_peakcalling.nf +++ b/subworkflows/local/prepare_peakcalling.nf @@ -24,6 +24,7 @@ workflow PREPARE_PEAKCALLING { main: ch_versions = Channel.empty() ch_bedgraph = Channel.empty() + ch_scale_factors = Channel.empty() def norm_scope = normalisation_scope ?: 'all' def igg_scope = igg_scale_scope ?: 'legacy' def median = { List values -> @@ -92,6 +93,10 @@ workflow PREPARE_PEAKCALLING { // EXAMPLE CHANNEL STRUCT: [id, scale_factor] //ch_bam_scale_factor | view + ch_bam_scale_factor_report + .map { meta, bam, scale, reads, scope_id -> [ meta, scale ] } + .set { ch_scale_factors } + if (params.dump_scale_factors) { ch_bam_scale_factor_report .map { meta, bam, scale, reads, scope_id -> @@ -123,6 +128,9 @@ workflow PREPARE_PEAKCALLING { } .set { ch_bam_scale_factor } //ch_bam_scale_factor | view + + ch_bam.map { row -> [ row[0], 'NA' ] } + .set { ch_scale_factors } } if (norm_mode == "Spikein" || norm_mode == "None") { @@ -264,6 +272,10 @@ workflow PREPARE_PEAKCALLING { // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] //ch_bedgraph | view + ch_bam_bai_scale_factor + .map { meta, bam, bai, scale -> [ meta, 'NA' ] } + .set { ch_scale_factors } + } /* @@ -301,5 +313,6 @@ workflow PREPARE_PEAKCALLING { emit: bedgraph = UCSC_BEDCLIP.out.bedgraph // channel: [ val(meta), [ bedgraph ] ] bigwig = UCSC_BEDGRAPHTOBIGWIG.out.bigwig // channel: [ val(meta), [ bigwig ] ] + scale_factors = ch_scale_factors // channel: [ val(meta), scale_factor ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml new file mode 100644 index 00000000..5a149706 --- /dev/null +++ b/tests/test_07_differential.yml @@ -0,0 +1,18 @@ +- name: test_differential_missing_contrast + command: nextflow run main.nf -profile docker,test --run_diffbind --differential_contrast Treated --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_missing_contrast + exit_code: 1 + +- name: test_differential_manifests_only + command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_publish_manifest_only --input tests/assets/samplesheet_condition.csv -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_manifests_only + files: + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.design.tsv + - path: results/03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv + - path: results/03_peak_calling/08_differential/multiqc/differential_design_mqc.tsv diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index d97aae21..e34b7d14 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -142,6 +142,7 @@ include { CONSENSUS_PEAKS as CONSENSUS_PEAKS_ALL } from "../subworkflo include { EXTRACT_FRAGMENTS } from "../subworkflows/local/extract_fragments" include { PREPARE_PEAKCALLING } from "../subworkflows/local/prepare_peakcalling" include { PEAK_CALLING_EXTENDED } from "../subworkflows/local/peak_calling_extended" +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" include { DEEPTOOLS_QC } from "../subworkflows/local/deeptools_qc" include { PEAK_QC } from "../subworkflows/local/peak_qc" include { SAMTOOLS_VIEW_SORT_STATS as FILTER_READS } from "../subworkflows/local/samtools_view_sort_stats" @@ -450,6 +451,9 @@ workflow CUTANDRUN { ch_bedgraph = Channel.empty() ch_bigwig = Channel.empty() + ch_scale_factors = Channel.empty() + ch_diff_summary_mqc = Channel.empty() + ch_diff_design_mqc = Channel.empty() ch_peaks_all = Channel.empty() ch_peaks_primary = Channel.empty() ch_peaks_secondary = Channel.empty() @@ -478,6 +482,7 @@ workflow CUTANDRUN { ) ch_bedgraph = PREPARE_PEAKCALLING.out.bedgraph ch_bigwig = PREPARE_PEAKCALLING.out.bigwig + ch_scale_factors = PREPARE_PEAKCALLING.out.scale_factors ch_software_versions = ch_software_versions.mix(PREPARE_PEAKCALLING.out.versions) /* @@ -654,6 +659,25 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(CONTROL_POOLING_FALLBACKS_REPORT.out.versions) } + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only)) { + DIFFERENTIAL_PEAK_CALLING ( + ch_samtools_bam, + ch_samtools_bai, + ch_bigwig, + ch_peaks_all, + ch_scale_factors, + PREPARE_GENOME.out.chrom_sizes, + PREPARE_GENOME.out.bed, + ch_blacklist, + Channel.empty(), + Channel.empty(), + 'integrated' + ) + ch_software_versions = ch_software_versions.mix(DIFFERENTIAL_PEAK_CALLING.out.versions) + ch_diff_summary_mqc = DIFFERENTIAL_PEAK_CALLING.out.summary_mqc + ch_diff_design_mqc = DIFFERENTIAL_PEAK_CALLING.out.design_mqc + } + ch_dt_corrmatrix = Channel.empty() ch_dt_pcadata = Channel.empty() ch_dt_fpmatrix = Channel.empty() @@ -900,6 +924,7 @@ workflow CUTANDRUN { if (params.run_multiqc) { workflow_summary = WorkflowCutandrun.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + ch_diff_mqc = ch_diff_summary_mqc.mix(ch_diff_design_mqc) /* * MODULE: Collect software versions used in pipeline @@ -936,6 +961,7 @@ workflow CUTANDRUN { ch_peakqc_count_consensus_mqc.collect{it[1]}.ifEmpty([]), ch_peakqc_reprod_perc_mqc.collect().ifEmpty([]), ch_frag_len_hist_mqc.collect().ifEmpty([]), + ch_diff_mqc.collect().ifEmpty([]), ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf new file mode 100644 index 00000000..7ae3adb9 --- /dev/null +++ b/workflows/differential_only.nf @@ -0,0 +1,53 @@ +/* + * Differential-only entrypoint + */ + +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" +include { CUSTOM_GETCHROMSIZES } from "../modules/nf-core/custom/getchromsizes/main" + +workflow DIFFERENTIAL_ONLY { + + if (!params.differential_from_run) { + exit 1, "--differential_from_run must be provided when using -entry DIFFERENTIAL_ONLY" + } + + def manifest_dir = "${params.differential_from_run}/03_peak_calling/08_differential/00_manifests" + def samples_manifest = "${manifest_dir}/differential_manifest.samples.tsv" + def peaks_manifest = "${manifest_dir}/differential_manifest.peaks.tsv" + + ch_samples_manifest = Channel.fromPath(samples_manifest, checkIfExists: true) + ch_peaks_manifest = Channel.fromPath(peaks_manifest, checkIfExists: true) + + if (!params.gene_bed) { + exit 1, "--gene_bed (or --genome with a bed12 attribute) is required for differential analysis." + } + if (params.run_span_diff && !params.omnipeaks_jar) { + exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled." + } + + ch_gene_bed = Channel.fromPath(params.gene_bed, checkIfExists: true) + + ch_chrom_sizes = Channel.empty() + if (params.run_chipbinner) { + if (!params.fasta) { + exit 1, "--fasta is required to compute chrom sizes for ChIPBinner in differential-only mode." + } + ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) + CUSTOM_GETCHROMSIZES ( ch_fasta ) + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + } + + DIFFERENTIAL_PEAK_CALLING ( + Channel.empty(), + Channel.empty(), + Channel.empty(), + Channel.empty(), + Channel.empty(), + ch_chrom_sizes, + ch_gene_bed, + Channel.empty(), + ch_samples_manifest, + ch_peaks_manifest, + 'posthoc' + ) +} From d7bddb29a07b791f2d1a8bbaf7d6eb87bcb000ad Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 06:17:46 -0800 Subject: [PATCH 02/52] Apply diffbind extra params --- bin/diffbind_run.R | 77 +++++++++++++++++++++++++++++++---- modules/local/diffbind_run.nf | 2 +- 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index cae5fb35..53f4bf84 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -23,6 +23,59 @@ parse_args <- function(args) { res } +read_extra_params <- function(path) { + if (is.null(path) || !nzchar(path)) { + return(list()) + } + if (!file.exists(path)) { + stop(sprintf("extra_params file not found: %s", path)) + } + ext <- tolower(tools::file_ext(path)) + if (ext %in% c("yml", "yaml")) { + if (!requireNamespace("yaml", quietly = TRUE)) { + stop("YAML extra_params requires the 'yaml' R package; install it or provide JSON instead.") + } + return(yaml::yaml.load_file(path)) + } + if (requireNamespace("jsonlite", quietly = TRUE)) { + return(jsonlite::fromJSON(path, simplifyVector = FALSE)) + } + if (requireNamespace("rjson", quietly = TRUE)) { + return(rjson::fromJSON(file = path)) + } + stop("JSON extra_params requires the 'jsonlite' or 'rjson' R package.") +} + +extra_params_for <- function(extra, key) { + if (is.null(extra) || !is.list(extra)) { + return(list()) + } + if (!is.null(extra[[key]])) { + return(extra[[key]]) + } + key_dot <- gsub("_", ".", key) + if (!is.null(extra[[key_dot]])) { + return(extra[[key_dot]]) + } + key_us <- gsub("\\.", "_", key) + if (!is.null(extra[[key_us]])) { + return(extra[[key_us]]) + } + return(list()) +} + +resolve_constants <- function(value) { + if (is.list(value)) { + return(lapply(value, resolve_constants)) + } + if (is.character(value) && length(value) == 1) { + if (exists(value, envir = asNamespace("DiffBind"), inherits = FALSE)) { + return(get(value, envir = asNamespace("DiffBind"))) + } + } + value +} + as_bool <- function(val) { if (is.null(val)) { return(FALSE) @@ -49,6 +102,8 @@ recenter <- as_bool(params[["recenter"]]) summits <- as.integer(params[["summits"]]) norm_method <- params[["norm_method"]] export_sheets <- as_bool(params[["export_sheets"]]) +extra_params_path <- params[["extra_params"]] +extra_params <- read_extra_params(extra_params_path) use_spikein <- as_bool(params[["use_spikein"]]) @@ -165,10 +220,13 @@ if (!use_diffbind) { # Best-effort DiffBind execution tryCatch({ suppressPackageStartupMessages(library(DiffBind)) + extra_params <- resolve_constants(extra_params) - dba_obj <- dba(sampleSheet = samplesheet) + dba_args <- utils::modifyList(list(sampleSheet = samplesheet), extra_params_for(extra_params, "dba")) + dba_obj <- do.call(dba, dba_args) summit_size <- if (recenter) summits else 0 - dba_obj <- dba.count(dba_obj, summits = summit_size, minOverlap = min_overlap) + count_args <- utils::modifyList(list(DBA = dba_obj, summits = summit_size, minOverlap = min_overlap), extra_params_for(extra_params, "dba_count")) + dba_obj <- do.call(dba.count, count_args) if (use_spikein && "spikein_scale_factor" %in% colnames(records)) { factors <- suppressWarnings(as.numeric(records$spikein_scale_factor)) @@ -176,21 +234,26 @@ tryCatch({ size_factors <- ifelse(factors == 0, 1, 1 / factors) names(size_factors) <- records$sample_id size_factors <- size_factors[samplesheet$SampleID] - dba_obj <- dba.normalize(dba_obj, normalize = DBA_NORM_LIB, library = size_factors) + norm_args <- utils::modifyList(list(DBA = dba_obj, normalize = DBA_NORM_LIB, library = size_factors), extra_params_for(extra_params, "dba_normalize")) + dba_obj <- do.call(dba.normalize, norm_args) norm_out <- data.frame(sample_id = samplesheet$SampleID, size_factor = size_factors, stringsAsFactors = FALSE) write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) } } else if (!is.null(norm_method) && norm_method != "native") { - dba_obj <- dba.normalize(dba_obj, method = norm_method) + norm_args <- utils::modifyList(list(DBA = dba_obj, method = norm_method), extra_params_for(extra_params, "dba_normalize")) + dba_obj <- do.call(dba.normalize, norm_args) } contrast_labels <- strsplit(contrast, ",")[[1]] - dba_obj <- dba.contrast(dba_obj, categories = DBA_CONDITION, group1 = contrast_labels[1], group2 = contrast_labels[2]) + contrast_args <- utils::modifyList(list(DBA = dba_obj, categories = DBA_CONDITION, group1 = contrast_labels[1], group2 = contrast_labels[2]), extra_params_for(extra_params, "dba_contrast")) + dba_obj <- do.call(dba.contrast, contrast_args) method_flag <- ifelse(toupper(backend) == "EDGER", DBA_EDGER, DBA_DESEQ2) - dba_obj <- dba.analyze(dba_obj, method = method_flag) + analyze_args <- utils::modifyList(list(DBA = dba_obj, method = method_flag), extra_params_for(extra_params, "dba_analyze")) + dba_obj <- do.call(dba.analyze, analyze_args) - report <- dba.report(dba_obj, th = fdr, fold = lfc) + report_args <- utils::modifyList(list(DBA = dba_obj, th = fdr, fold = lfc), extra_params_for(extra_params, "dba_report")) + report <- do.call(dba.report, report_args) results <- as.data.frame(report) results_out <- data.frame( chr = results$seqnames, diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf index 1853c932..057518e4 100644 --- a/modules/local/diffbind_run.nf +++ b/modules/local/diffbind_run.nf @@ -1,7 +1,7 @@ process DIFFBIND_RUN { label 'process_diffbind' - conda "conda-forge::r-base=4.2.3 bioconda::bioconductor-diffbind" + conda "conda-forge::r-base=4.2.3 bioconda::bioconductor-diffbind conda-forge::r-jsonlite conda-forge::r-yaml" container "quay.io/biocontainers/bioconductor-diffbind:3.14.0--r42_0" input: From 32264bc90ca7d496464e46a3365354ebebca04a8 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 06:35:08 -0800 Subject: [PATCH 03/52] Gate input check for differential-only entry --- workflows/cutandrun.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index e34b7d14..86d25eec 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -35,7 +35,8 @@ if(params.normalisation_mode == "Spikein") { } // Check mandatory parameters that cannot be checked in the groovy lib as we want a channel for them -if (params.input) { ch_input = file(params.input) } else { exit 1, "Input samplesheet not specified!" } +// NOTE: Input validation is handled in WorkflowMain.initialise with entrypoint-aware gating. +ch_input = params.input ? file(params.input) : null ch_blacklist = Channel.empty() if (params.blacklist) { From 528d3f9dd89b5ccc74c493b3792cb8cd31b23da1 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 06:52:24 -0800 Subject: [PATCH 04/52] Fix differential summary inputs --- modules/local/differential_summary.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 05bfef06..8aea2f59 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -19,10 +19,11 @@ process DIFFERENTIAL_SUMMARY { task.ext.when == null || task.ext.when script: + def summaries_arg = summary_files instanceof List ? summary_files.join(' ') : summary_files """ differential_summary.py \ --design ${design_manifest} \ - --summaries ${summary_files} \ + --summaries ${summaries_arg} \ --out differential_summary.tsv cat ${summary_header} differential_summary.tsv > differential_summary_mqc.tsv From 14e761e659067a41ef3edd06f010694b4992c821 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 07:15:29 -0800 Subject: [PATCH 05/52] Handle missing Java for SPAN native mode --- bin/span_diff.py | 20 ++++++++++++++++---- modules/local/span_diff_run.nf | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bin/span_diff.py b/bin/span_diff.py index 482438ff..6c7637f0 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -21,6 +21,8 @@ def detect_native_support(jar_path, java_heap): check=False, ) output = (result.stdout or "") + (result.stderr or "") + except FileNotFoundError: + return False, "java_missing" except Exception: return False, "help_failed" if "compare" in output.lower(): @@ -65,15 +67,25 @@ def main(): native_supported, signature = detect_native_support(args.jar, args.java_heap) chosen_mode = "fallback" + status = "RUN" + reason = "ok" if args.mode == "native": if not native_supported: - print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) - sys.exit(1) - chosen_mode = "native" + if signature == "java_missing": + print("WARNING: Java not found; falling back to SPAN fallback mode.", file=sys.stderr) + chosen_mode = "fallback" + reason = "java_missing_fallback" + else: + print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) + sys.exit(1) + else: + chosen_mode = "native" elif args.mode == "fallback": chosen_mode = "fallback" else: chosen_mode = "native" if native_supported else "fallback" + if not native_supported and signature == "java_missing": + reason = "java_missing_fallback" if not os.path.exists(args.peaks): print("ERROR: SPAN peaks file not found for fallback.", file=sys.stderr) @@ -103,7 +115,7 @@ def main(): with open(summary_path, "w") as handle: handle.write("group\ttreated\tcontrol\tn_tested\tn_fdr_pass\tn_up\tn_down\tmode\tstatus\treason\n") - handle.write(f"{args.group}\t{treated}\t{control}\t{len(regions)}\t0\t0\t0\t{chosen_mode}\tRUN\tok\n") + handle.write(f"{args.group}\t{treated}\t{control}\t{len(regions)}\t0\t0\t0\t{chosen_mode}\t{status}\t{reason}\n") with open(mode_path, "w") as handle: handle.write(f"{chosen_mode}:{signature}\n") diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf index 28e9ae89..2a507c49 100644 --- a/modules/local/span_diff_run.nf +++ b/modules/local/span_diff_run.nf @@ -1,7 +1,7 @@ process SPAN_DIFF_RUN { label 'process_span' - conda "conda-forge::python=3.8.3" + conda "conda-forge::python=3.8.3 conda-forge::openjdk" container "quay.io/biocontainers/python:3.8.3" input: From 315dcc66e7e206f0eb9897a9a7e5430c134412ed Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 07:34:49 -0800 Subject: [PATCH 06/52] Fix differential-only init and annotation --- bin/annotate_regions.py | 7 ++++++- workflows/cutandrun.nf | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index 88fb8bf1..ffc5b472 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -74,8 +74,13 @@ def main(): continue parts = line.split("\t") idx = int(parts[3]) - gene_id = parts[7] if len(parts) > 7 else "NA" + a_cols = 4 distance = parts[-1] if parts else "NA" + b_cols = len(parts) - a_cols - 1 + if b_cols >= 4: + gene_id = parts[a_cols + 3] + else: + gene_id = "NA" annotations[idx] = (gene_id, gene_id, distance) with open(args.out, "w") as handle: diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index 86d25eec..1e3788c3 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -6,8 +6,11 @@ include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -// Validate input parameters in specialised library -WorkflowCutandrun.initialise(params, log) +// Validate input parameters in specialised library (skip for differential-only entrypoint) +def entry = workflow.hasProperty('entry') ? workflow.entry : null +if (entry != 'DIFFERENTIAL_ONLY') { + WorkflowCutandrun.initialise(params, log) +} def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' def summary_params = paramsSummaryMap(workflow) From 17fa79c473e5f21658d7235c83fb8cfd2320caa2 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 08:02:07 -0800 Subject: [PATCH 07/52] Deterministic SPAN peak selection --- subworkflows/local/differential_peak_calling.nf | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 0f8a12c9..21be6252 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -344,14 +344,25 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_span_records_file = RECORDS_TO_TSV_SPAN.out.tsv .map { group, caller, records_file -> [group, records_file] } + def span_caller_priority = (params.callers ?: []) + .collect { it.toString().toLowerCase() } + .findAll { it.startsWith('span') || it.startsWith('omnipeak') } + ch_span_peaks = ch_peaks_rows .filter { row -> def caller = row.caller?.toString()?.toLowerCase() caller && (caller.startsWith('span') || caller.startsWith('omnipeak')) } - .map { row -> [row.group, row.peaks_path] } + .map { row -> [row.group, row.caller?.toString()?.toLowerCase(), row.peaks_path] } .groupTuple(by: [0]) - .map { group, peaks -> [group, peaks[0]] } + .map { group, entries -> + def peaks_by_caller = entries.groupBy { it[1] } + def chosen = span_caller_priority.find { peaks_by_caller.containsKey(it) } ?: peaks_by_caller.keySet().sort()[0] + if (peaks_by_caller.size() > 1) { + log.warn "Multiple SPAN callers for group ${group}; using '${chosen}' for differential peaks." + } + [group, peaks_by_caller[chosen][0][2]] + } ch_span_inputs = ch_span_records_file .join(ch_span_peaks) From 3f6f828206a3800ae9a26183a610134c94643591 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:01:52 -0800 Subject: [PATCH 08/52] Fix differential-only entry detection --- lib/WorkflowMain.groovy | 36 ++++++++++++++++++++++++++++++++++-- workflows/cutandrun.nf | 3 +-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 67f9fec4..d28cde7e 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -45,13 +45,45 @@ class WorkflowMain { NfcoreTemplate.awsBatch(workflow, params) // Check input has been provided unless running differential-only - def entry = workflow.hasProperty('entry') ? workflow.entry : null - def differential_only = entry == 'DIFFERENTIAL_ONLY' + def differential_only = isDifferentialOnly(workflow) if (!params.input && !(differential_only && params.differential_from_run)) { Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } + // + // Detect whether the workflow is running the differential-only entrypoint + // + public static boolean isDifferentialOnly(workflow) { + def entry = getWorkflowEntryName(workflow) + return entry?.toString()?.toUpperCase() == 'DIFFERENTIAL_ONLY' + } + + // + // Get the entrypoint name in a Nextflow-version-safe way + // + private static String getWorkflowEntryName(workflow) { + if (!workflow) { + return null + } + if (workflow.hasProperty('entry')) { + return workflow.entry + } + if (workflow.hasProperty('entryName')) { + return workflow.entryName + } + if (workflow.manifest && workflow.manifest.hasProperty('entryName')) { + return workflow.manifest.entryName + } + if (workflow.commandLine) { + def matcher = (workflow.commandLine =~ /(?:^|\\s)-entry(?:=|\\s+)(\\S+)/) + if (matcher) { + return matcher[0][1] + } + } + return null + } + // // Get attribute from genome config file e.g. fasta // diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index 1e3788c3..1183617b 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -7,8 +7,7 @@ include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' // Validate input parameters in specialised library (skip for differential-only entrypoint) -def entry = workflow.hasProperty('entry') ? workflow.entry : null -if (entry != 'DIFFERENTIAL_ONLY') { +if (!WorkflowMain.isDifferentialOnly(workflow)) { WorkflowCutandrun.initialise(params, log) } def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) From da6d95a9d8ca94383a48a847eb8990130195045a Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:09:38 -0800 Subject: [PATCH 09/52] Guard chrom sizes for Chipbinner --- subworkflows/local/differential_peak_calling.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 21be6252..12b7ad68 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -181,7 +181,6 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_design_rows = DIFFERENTIAL_DESIGN.out.design.splitCsv(header: true, sep: '\t') ch_gene_bed_single = ch_gene_bed.collect().map { it[0] } - ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' def use_spikein = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') @@ -284,6 +283,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_chip_records_file = RECORDS_TO_TSV_CHIPBINNER.out.tsv .map { group, caller, records_file -> [group, records_file] } + ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } + ch_chip_inputs = ch_chip_records_file .combine(ch_chrom_sizes_single) .map { record, chrom_sizes -> [ record[0], record[1], chrom_sizes ] } From ccaab6450bc5f1b031e23a6d734c47410e2a8fe6 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:20:41 -0800 Subject: [PATCH 10/52] Use first() for gene bed --- subworkflows/local/differential_peak_calling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 12b7ad68..15ce53ee 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -180,7 +180,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_peaks_rows = ch_peaks_manifest.splitCsv(header: true, sep: '\t') ch_design_rows = DIFFERENTIAL_DESIGN.out.design.splitCsv(header: true, sep: '\t') - ch_gene_bed_single = ch_gene_bed.collect().map { it[0] } + ch_gene_bed_single = ch_gene_bed.first() def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' def use_spikein = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') From 475cb83818db638e08cc1453be103dce8d0465a8 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:32:23 -0800 Subject: [PATCH 11/52] Filter samples by contrast labels --- subworkflows/local/differential_peak_calling.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 15ce53ee..9786dae6 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -182,6 +182,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_gene_bed_single = ch_gene_bed.first() + def contrast_labels = params.differential_contrast.split(',').collect { it.trim() } + ch_samples_rows_contrast = ch_samples_rows.filter { row -> contrast_labels.contains(row.condition) } + def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' def use_spikein = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') @@ -190,7 +193,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .filter { row -> row.caller != 'NA' && row.status == 'RUN' && row.eligible_diffbind == 'true' } .map { row -> [ [row.group, row.caller], row ] } - ch_sample_peak_records = ch_samples_rows + ch_sample_peak_records = ch_samples_rows_contrast .map { row -> [row.sample_id, row] } .join(ch_peaks_rows.map { row -> [row.sample_id, row] }) .map { sample_id, srow, prow -> @@ -261,7 +264,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_chipbinner == 'true' } .map { row -> [row.group, row] } - ch_chip_records = ch_samples_rows + ch_chip_records = ch_samples_rows_contrast .map { row -> [row.group, row] } .groupTuple(by: [0]) .join(ch_chip_design) @@ -323,7 +326,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_span == 'true' } .map { row -> [row.group, row] } - ch_span_records = ch_samples_rows + ch_span_records = ch_samples_rows_contrast .map { row -> [row.group, row] } .groupTuple(by: [0]) .join(ch_span_design) From cf3d4638584e2a79c1f1d13b034ec958e4dd47c7 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:43:48 -0800 Subject: [PATCH 12/52] Fail fast on missing manifests --- bin/annotate_regions.py | 2 ++ workflows/differential_only.nf | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index ffc5b472..6b6416f5 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -36,6 +36,8 @@ def main(): rows = read_tsv(args.regions) if not rows: + with open(args.out, "w") as handle: + handle.write("") return header = None diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 7ae3adb9..cf84fcf8 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -15,6 +15,13 @@ workflow DIFFERENTIAL_ONLY { def samples_manifest = "${manifest_dir}/differential_manifest.samples.tsv" def peaks_manifest = "${manifest_dir}/differential_manifest.peaks.tsv" + if (!file(samples_manifest).exists()) { + exit 1, "Missing samples manifest: ${samples_manifest}" + } + if (!file(peaks_manifest).exists()) { + exit 1, "Missing peaks manifest: ${peaks_manifest}" + } + ch_samples_manifest = Channel.fromPath(samples_manifest, checkIfExists: true) ch_peaks_manifest = Channel.fromPath(peaks_manifest, checkIfExists: true) From ee4715b988f5487aa3001afb8158ff20ebd8aace Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 21:57:30 -0800 Subject: [PATCH 13/52] Mark manifest-only summaries as skipped --- bin/differential_summary.py | 4 ++++ modules/local/differential_summary.nf | 3 +++ subworkflows/local/differential_peak_calling.nf | 1 + 3 files changed, 8 insertions(+) diff --git a/bin/differential_summary.py b/bin/differential_summary.py index 515f9a43..d11f043d 100755 --- a/bin/differential_summary.py +++ b/bin/differential_summary.py @@ -14,6 +14,7 @@ def main(): parser = argparse.ArgumentParser(description="Build differential summary table") parser.add_argument("--design", required=True) parser.add_argument("--summaries", nargs="*", default=[]) + parser.add_argument("--manifest-only", action="store_true") parser.add_argument("--out", required=True) args = parser.parse_args() @@ -27,6 +28,9 @@ def main(): control = row.get("control_condition") or "" status = row.get("status") or "" reason = row.get("reason") or "" + if args.manifest_only and status == "RUN": + status = "SKIPPED" + reason = "manifest_only" if caller != "NA": key = ("diffbind", group, caller) diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 8aea2f59..3ac5fcd3 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -7,6 +7,7 @@ process DIFFERENTIAL_SUMMARY { input: path design_manifest path summary_files + val manifest_only path summary_header path design_header @@ -20,10 +21,12 @@ process DIFFERENTIAL_SUMMARY { script: def summaries_arg = summary_files instanceof List ? summary_files.join(' ') : summary_files + def manifest_only_flag = manifest_only ? '--manifest-only' : '' """ differential_summary.py \ --design ${design_manifest} \ --summaries ${summaries_arg} \ + ${manifest_only_flag} \ --out differential_summary.tsv cat ${summary_header} differential_summary.tsv > differential_summary_mqc.tsv diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 9786dae6..6cd71d9d 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -406,6 +406,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { DIFFERENTIAL_SUMMARY ( DIFFERENTIAL_DESIGN.out.design, ch_summary_files.collect().map { it + summary_stub }.ifEmpty([summary_stub]), + params.differential_publish_manifest_only, file("$projectDir/assets/multiqc/differential_summary_header.txt"), file("$projectDir/assets/multiqc/differential_design_header.txt") ) From 40072bc4bf285bcee0bf6b8d0b73c40eb92a2596 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:01:39 -0800 Subject: [PATCH 14/52] Fallback entry detection on differential_from_run --- lib/WorkflowMain.groovy | 9 ++++++--- workflows/cutandrun.nf | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index d28cde7e..acda5347 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -45,7 +45,7 @@ class WorkflowMain { NfcoreTemplate.awsBatch(workflow, params) // Check input has been provided unless running differential-only - def differential_only = isDifferentialOnly(workflow) + def differential_only = isDifferentialOnly(workflow, params) if (!params.input && !(differential_only && params.differential_from_run)) { Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } @@ -54,9 +54,12 @@ class WorkflowMain { // // Detect whether the workflow is running the differential-only entrypoint // - public static boolean isDifferentialOnly(workflow) { + public static boolean isDifferentialOnly(workflow, params = null) { def entry = getWorkflowEntryName(workflow) - return entry?.toString()?.toUpperCase() == 'DIFFERENTIAL_ONLY' + if (entry) { + return entry?.toString()?.toUpperCase() == 'DIFFERENTIAL_ONLY' + } + return params?.differential_from_run ? true : false } // diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index 1183617b..445871db 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -7,7 +7,7 @@ include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' // Validate input parameters in specialised library (skip for differential-only entrypoint) -if (!WorkflowMain.isDifferentialOnly(workflow)) { +if (!WorkflowMain.isDifferentialOnly(workflow, params)) { WorkflowCutandrun.initialise(params, log) } def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) From 0da6f43b73fbfd3e78cef657920965c8e15a673f Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:24:20 -0800 Subject: [PATCH 15/52] Implement real ChIPBinner workflow --- bin/chipbinner_rots.R | 89 +++ bin/chipbinner_run.py | 755 +++++++++++++++--- modules/local/chipbinner_run.nf | 12 +- .../local/differential_peak_calling.nf | 7 +- 4 files changed, 732 insertions(+), 131 deletions(-) create mode 100755 bin/chipbinner_rots.R diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R new file mode 100755 index 00000000..9d0d65a0 --- /dev/null +++ b/bin/chipbinner_rots.R @@ -0,0 +1,89 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) %% 2 != 0) { + stop("Arguments must be provided as --key value pairs") +} +arg_map <- list() +for (i in seq(1, length(args), by = 2)) { + key <- sub("^--", "", args[i]) + arg_map[[key]] <- args[i + 1] +} + +matrix_path <- arg_map[["matrix"]] +samplesheet_path <- arg_map[["samplesheet"]] +treated_label <- arg_map[["treated"]] +control_label <- arg_map[["control"]] +bootstrap <- as.integer(arg_map[["bootstrap"]]) +k_value <- as.integer(arg_map[["k-value"]]) +out_path <- arg_map[["out"]] + +if (is.null(matrix_path) || is.null(samplesheet_path) || is.null(out_path)) { + stop("Missing required arguments") +} + +suppressMessages(library(ROTS)) + +mat <- read.table(matrix_path, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1) +mat <- as.matrix(mat) + +samples <- read.csv(samplesheet_path, header = TRUE, stringsAsFactors = FALSE) +subset <- samples[samples$condition %in% c(treated_label, control_label), ] +if (nrow(subset) == 0) { + stop("No samples match contrast labels") +} + +sample_ids <- subset$sample_id +missing <- setdiff(sample_ids, colnames(mat)) +if (length(missing) > 0) { + stop(paste("Missing samples in matrix:", paste(missing, collapse = ","))) +} + +mat <- mat[, sample_ids, drop = FALSE] + +classes <- ifelse(subset$condition == treated_label, 1, 2) + +rot <- ROTS(mat, groups = classes, B = bootstrap, K = k_value) + +pvals <- NULL +fdrs <- NULL + +if (!is.null(rot$pvalue)) { + pvals <- rot$pvalue +} else if (!is.null(rot$pval)) { + pvals <- rot$pval +} + +if (is.null(pvals)) { + if ("pvalue" %in% slotNames(rot)) { + pvals <- slot(rot, "pvalue") + } +} + +if (is.null(pvals)) { + if (exists("pvalue", where = asNamespace("ROTS"), mode = "function")) { + pvals <- ROTS::pvalue(rot) + } +} + +if (!is.null(rot$FDR)) { + fdrs <- rot$FDR +} else if (!is.null(rot$fdr)) { + fdrs <- rot$fdr +} + +if (is.null(fdrs) && !is.null(pvals)) { + fdrs <- p.adjust(pvals, method = "BH") +} + +if (is.null(pvals)) { + stop("Unable to extract p-values from ROTS output") +} + +out <- data.frame( + bin_id = rownames(mat), + pval = as.numeric(pvals), + FDR = as.numeric(fdrs) +) + +write.table(out, file = out_path, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 6a99584f..c583123d 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -2,53 +2,45 @@ import argparse import csv import glob +import math import os -import random +import subprocess +import sys +import tempfile +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from sklearn.decomposition import PCA +from sklearn.metrics import silhouette_score +from sklearn.preprocessing import StandardScaler -def read_tsv(path): - with open(path, "r", newline="") as handle: - reader = csv.DictReader(handle, delimiter="\t") - return [row for row in reader] +try: + import hdbscan +except ImportError as exc: + hdbscan = None - -def read_chrom_sizes(path): - chroms = [] - with open(path, "r") as handle: - for line in handle: - parts = line.strip().split("\t") - if len(parts) < 2: - continue - chroms.append((parts[0], int(parts[1]))) - return chroms - - -def write_windows(chroms, bin_size, out_path, max_bins=200): - bins = [] - for chrom, size in chroms: - start = 0 - while start < size and len(bins) < max_bins: - end = min(start + bin_size, size) - bins.append((chrom, start, end)) - start = end - if len(bins) >= max_bins: - break - with open(out_path, "w") as handle: - for chrom, start, end in bins: - handle.write(f"{chrom}\t{start}\t{end}\n") - return bins +try: + from scipy.stats import fisher_exact +except ImportError: + fisher_exact = None -def main(): - parser = argparse.ArgumentParser(description="Stub ChIPBinner runner") +def parse_args(): + parser = argparse.ArgumentParser(description="Run ChIPBinner workflow") parser.add_argument("--samples", required=True) parser.add_argument("--group", required=True) parser.add_argument("--contrast", required=True) parser.add_argument("--chrom-sizes", required=True) parser.add_argument("--bin-size", type=int, required=True) parser.add_argument("--windows-dir") + parser.add_argument("--blacklist") parser.add_argument("--use-input", action="store_true") - parser.add_argument("--pseudocount", type=int, default=1) + parser.add_argument("--use-spikein", action="store_true") + parser.add_argument("--pseudocount", type=float, default=1.0) parser.add_argument("--grid-minpts", default="") parser.add_argument("--grid-minsamps", default="") parser.add_argument("--fdr", type=float, default=0.05) @@ -56,115 +48,622 @@ def main(): parser.add_argument("--bootstrap", type=int, default=1000) parser.add_argument("--k-value", type=int, default=100000) parser.add_argument("--functional-db") + parser.add_argument("--allow-partial", action="store_true") parser.add_argument("--outdir", default=".") - args = parser.parse_args() + return parser.parse_args() - os.makedirs(args.outdir, exist_ok=True) - plots_dir = os.path.join(args.outdir, "plots") - os.makedirs(plots_dir, exist_ok=True) - samples = read_tsv(args.samples) - contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] - treated = contrast[0] if len(contrast) > 0 else "treated" - control = contrast[1] if len(contrast) > 1 else "control" +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] - # Write samplesheet - samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") - with open(samplesheet_path, "w", newline="") as handle: - writer = csv.writer(handle) - writer.writerow(["sample_id", "group", "condition", "replicate", "bam", "bigwig", "spikein_scale_factor", "ms_coeff"]) + +def parse_float(value, default=None): + if value is None: + return default + text = str(value).strip() + if text in ("", "NA", "None", "none"): + return default + try: + return float(text) + except Exception: + return default + + +def ensure_dir(path): + if path: + os.makedirs(path, exist_ok=True) + + +def run_cmd(cmd, stdout=None): + result = subprocess.run(cmd, stdout=stdout, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed ({' '.join(cmd)}): {result.stderr.strip()}") + + +def choose_windows_file(windows_dir, bin_size): + if not windows_dir: + return None + if os.path.isfile(windows_dir): + return windows_dir + if os.path.isdir(windows_dir): + candidates = sorted(glob.glob(os.path.join(windows_dir, f"*{bin_size}*.bed"))) + if not candidates: + candidates = sorted(glob.glob(os.path.join(windows_dir, "*.bed"))) + if candidates: + return candidates[0] + return None + + +def generate_windows(chrom_sizes, bin_size, blacklist, out_path): + with tempfile.TemporaryDirectory() as tmpdir: + raw_path = os.path.join(tmpdir, "windows_raw.bed") + with open(raw_path, "w") as handle: + with open(chrom_sizes, "r") as chrom_handle: + for line in chrom_handle: + parts = line.strip().split("\t") + if len(parts) < 2: + continue + chrom, size = parts[0], int(parts[1]) + start = 0 + while start < size: + end = min(start + bin_size, size) + handle.write(f"{chrom}\t{start}\t{end}\n") + start = end + work_path = raw_path + if blacklist and os.path.exists(blacklist): + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", raw_path, "-b", blacklist], stdout=open(filtered_path, "w")) + work_path = filtered_path + with open(out_path, "w") as out_handle: + run_cmd(["sort", "-k1,1", "-k2,2n", work_path], stdout=out_handle) + + +def load_windows(path): + windows = pd.read_csv(path, sep="\t", header=None, names=["chrom", "start", "end"]) + windows["bin_id"] = windows.apply(lambda r: f"{r.chrom}:{int(r.start)}-{int(r.end)}", axis=1) + return windows + + +def write_samplesheet(samples, out_path): + header = [ + "sample_id", + "group", + "condition", + "replicate", + "final_bam", + "final_bai", + "bigwig_path", + "spikein_scale_factor", + "ms_coeff", + "input_bam", + ] + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header) + writer.writeheader() for row in samples: - writer.writerow([ - row.get("sample_id", ""), - row.get("group", ""), - row.get("condition", ""), - row.get("replicate", ""), - row.get("final_bam", ""), - row.get("bigwig_path", ""), - row.get("spikein_scale_factor", ""), - row.get("ms_coeff", ""), - ]) - - # Windows - chroms = read_chrom_sizes(args.chrom_sizes) - windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") - bins = [] - if args.windows_dir: - if os.path.isfile(args.windows_dir): - windows_path = args.windows_dir - elif os.path.isdir(args.windows_dir): - candidates = sorted(glob.glob(os.path.join(args.windows_dir, "*.bed"))) - if candidates: - windows_path = candidates[0] - if os.path.exists(windows_path) and os.path.abspath(windows_path) != os.path.abspath(os.path.join(args.outdir, "chipbinner.windows.bed")): - with open(windows_path, "r") as handle: - for line in handle: - parts = line.strip().split("\t") - if len(parts) >= 3: - bins.append((parts[0], int(parts[1]), int(parts[2]))) - with open(os.path.join(args.outdir, "chipbinner.windows.bed"), "w") as out_handle: - for chrom, start, end in bins: - out_handle.write(f"{chrom}\t{start}\t{end}\n") - windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") - if not bins: - bins = write_windows(chroms, args.bin_size, windows_path) - - # Counts and normalized matrix - count_path = os.path.join(args.outdir, "chipbinner.bin_counts.tsv") - norm_path = os.path.join(args.outdir, "chipbinner.normalized_matrix.tsv") - sample_ids = [row.get("sample_id", "") for row in samples] - with open(count_path, "w") as handle: + writer.writerow({ + "sample_id": row.get("sample_id", ""), + "group": row.get("group", ""), + "condition": row.get("condition", ""), + "replicate": row.get("replicate", ""), + "final_bam": row.get("final_bam", ""), + "final_bai": row.get("final_bai", ""), + "bigwig_path": row.get("bigwig_path", ""), + "spikein_scale_factor": row.get("spikein_scale_factor", ""), + "ms_coeff": row.get("ms_coeff", ""), + "input_bam": row.get("input_bam", ""), + }) + +def compute_counts(windows_path, samples, out_path, use_input=False): + bam_paths = [] + sample_ids = [] + for row in samples: + bam = row.get("final_bam") + if not bam or bam in ("NA", "None"): + raise RuntimeError(f"Missing final_bam for sample {row.get('sample_id')}") + if not os.path.exists(bam): + raise RuntimeError(f"BAM not found: {bam}") + bam_paths.append(bam) + sample_ids.append(row.get("sample_id", os.path.basename(bam))) + + cmd = ["bedtools", "multicov", "-bed", windows_path, "-bams"] + bam_paths + with open(out_path, "w") as handle: + run_cmd(cmd, stdout=handle) + + cols = ["chrom", "start", "end"] + sample_ids + counts = pd.read_csv(out_path, sep="\t", header=None, names=cols) + counts["bin_id"] = counts.apply(lambda r: f"{r.chrom}:{int(r.start)}-{int(r.end)}", axis=1) + + if use_input: + input_bams = sorted({row.get("input_bam") for row in samples if row.get("input_bam") not in (None, "", "NA")}) + if not input_bams: + raise RuntimeError("chipbinner_use_input requested but no input_bam values were provided") + input_counts = None + for bam in input_bams: + if not os.path.exists(bam): + raise RuntimeError(f"Input BAM not found: {bam}") + with tempfile.NamedTemporaryFile(delete=False) as tmp_handle: + tmp_path = tmp_handle.name + cmd = ["bedtools", "multicov", "-bed", windows_path, "-bams", bam] + with open(tmp_path, "w") as handle: + run_cmd(cmd, stdout=handle) + tmp_df = pd.read_csv(tmp_path, sep="\t", header=None, names=["chrom", "start", "end", "count"]) + os.unlink(tmp_path) + if input_counts is None: + input_counts = tmp_df["count"].astype(float).values + else: + input_counts += tmp_df["count"].astype(float).values + input_counts = input_counts / len(input_bams) + counts[sample_ids] = counts[sample_ids].astype(float).sub(input_counts, axis=0).clip(lower=0) + return counts, sample_ids + + +def write_matrix(counts_df, sample_ids, out_path): + matrix = counts_df[["bin_id"] + sample_ids] + matrix.to_csv(out_path, sep="\t", index=False) + + +def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): + counts = counts_df[sample_ids].astype(float) + scale = np.ones(len(sample_ids), dtype=float) + for idx, row in enumerate(samples): + if use_spikein: + spike = parse_float(row.get("spikein_scale_factor")) + if spike is not None: + scale[idx] *= spike + coeff = parse_float(row.get("ms_coeff")) + if coeff is not None: + scale[idx] *= coeff + counts = counts * scale + counts = counts + pseudocount + library_size = counts.sum(axis=0) + library_size[library_size == 0] = 1.0 + normalized = counts.divide(library_size, axis=1) * 1e6 + return normalized + + +def plot_pca(norm_matrix, sample_ids, samples, out_path): + data = np.log2(norm_matrix + 1) + data = data.T + labels = [row.get("condition", "") for row in samples] + pca = PCA(n_components=2) + coords = pca.fit_transform(data) + plt.figure(figsize=(6, 5)) + for condition in sorted(set(labels)): + idxs = [i for i, c in enumerate(labels) if c == condition] + plt.scatter(coords[idxs, 0], coords[idxs, 1], label=condition) + for i, sample_id in enumerate(sample_ids): + plt.text(coords[i, 0], coords[i, 1], sample_id, fontsize=7) + plt.xlabel("PC1") + plt.ylabel("PC2") + plt.legend(loc="best", fontsize=7) + plt.tight_layout() + plt.savefig(out_path, dpi=150) + plt.close() + + +def plot_correlation(norm_matrix, sample_ids, out_path): + data = np.log2(norm_matrix + 1) + corr = pd.DataFrame(data, columns=sample_ids).corr() + plt.figure(figsize=(6, 5)) + sns.heatmap(corr, cmap="vlag", center=0, xticklabels=True, yticklabels=True) + plt.tight_layout() + plt.savefig(out_path, dpi=150) + plt.close() + + +def parse_grid(values, default_values): + if values: + items = [v.strip() for v in str(values).split(",") if v.strip()] + return [int(x) for x in items] + return default_values + + +def hdbscan_grid(norm_matrix, windows_df, minpts_list, minsamps_list, out_dir, summary_path): + if hdbscan is None: + raise RuntimeError("hdbscan python package is required for clustering") + ensure_dir(out_dir) + features = StandardScaler().fit_transform(norm_matrix.values) + results = [] + assignments = {} + n_bins = features.shape[0] + for minpts in minpts_list: + for minsamps in minsamps_list: + status = "OK" + n_clusters = 0 + noise_fraction = 1.0 + silhouette = np.nan + labels = None + if minpts > n_bins: + status = "INVALID" + else: + try: + clusterer = hdbscan.HDBSCAN(min_cluster_size=minpts, min_samples=minsamps) + labels = clusterer.fit_predict(features) + n_clusters = len({lab for lab in labels if lab >= 0}) + noise_fraction = float(np.sum(labels < 0)) / float(len(labels)) + if n_clusters > 1: + mask = labels >= 0 + silhouette = silhouette_score(features[mask], labels[mask]) + except Exception as exc: + status = f"FAIL:{exc}" + results.append({ + "minPts": minpts, + "minSamps": minsamps, + "n_clusters": n_clusters, + "noise_fraction": noise_fraction, + "silhouette": silhouette, + "status": status, + "labels": labels, + }) + # selection rule: maximize silhouette (if available), then minimize noise, then max clusters, then smallest params + best = None + for entry in results: + if entry["status"] != "OK": + continue + sil = entry["silhouette"] + if math.isnan(sil): + score = -1.0 - entry["noise_fraction"] + else: + score = sil - entry["noise_fraction"] + entry["score"] = score + if best is None: + best = entry + continue + if score > best["score"]: + best = entry + continue + if score == best["score"]: + if entry["n_clusters"] > best["n_clusters"]: + best = entry + continue + if entry["n_clusters"] == best["n_clusters"] and entry["noise_fraction"] < best["noise_fraction"]: + best = entry + continue + if entry["n_clusters"] == best["n_clusters"] and entry["noise_fraction"] == best["noise_fraction"]: + if (entry["minPts"], entry["minSamps"]) < (best["minPts"], best["minSamps"]): + best = entry + + if best is None: + raise RuntimeError("No successful HDBSCAN grid points") + + for entry in results: + if entry["status"] != "OK" or entry["labels"] is None: + continue + labels = entry["labels"] + out_path = os.path.join(out_dir, f"clusters_minPts{entry['minPts']}_minSamps{entry['minSamps']}.tsv") + df = windows_df.copy() + df["cluster"] = labels + df.to_csv(out_path, sep="\t", index=False) + assignments[(entry["minPts"], entry["minSamps"])] = out_path + + summary_rows = [] + for entry in results: + sil = entry["silhouette"] + sil_val = "NA" if math.isnan(sil) else f"{sil:.4f}" + score = entry.get("score") + score_val = "NA" if score is None else f"{score:.4f}" + selected = "true" if best and entry["minPts"] == best["minPts"] and entry["minSamps"] == best["minSamps"] else "false" + summary_rows.append({ + "minPts": entry["minPts"], + "minSamps": entry["minSamps"], + "n_clusters": entry["n_clusters"], + "noise_fraction": f"{entry['noise_fraction']:.4f}", + "silhouette": sil_val, + "score": score_val, + "status": entry["status"], + "selected": selected, + }) + + pd.DataFrame(summary_rows).to_csv(summary_path, sep="\t", index=False) + + chosen = { + "minPts": best["minPts"], + "minSamps": best["minSamps"], + "n_clusters": best["n_clusters"], + "labels": best["labels"], + "summary_path": summary_path, + } + return chosen, summary_path + + +def run_rots(matrix_path, samplesheet_path, treated, control, bootstrap, k_value, out_path): + script_path = os.path.join(os.path.dirname(__file__), "chipbinner_rots.R") + cmd = [ + "Rscript", + script_path, + "--matrix", matrix_path, + "--samplesheet", samplesheet_path, + "--treated", treated, + "--control", control, + "--bootstrap", str(bootstrap), + "--k-value", str(k_value), + "--out", out_path, + ] + run_cmd(cmd) + + +def bh_fdr(pvals): + pvals = np.asarray(pvals) + n = len(pvals) + order = np.argsort(pvals) + ranks = np.arange(1, n + 1) + fdr = np.empty(n, dtype=float) + fdr[order] = pvals[order] * n / ranks + fdr = np.minimum.accumulate(fdr[::-1])[::-1] + fdr[fdr > 1] = 1 + return fdr + + +def run_enrichment(windows_df, clusters_path, functional_db, out_dir): + ensure_dir(out_dir) + if not functional_db: + stub = os.path.join(out_dir, "not_run.txt") + with open(stub, "w") as handle: + handle.write("NOT_RUN\n") + return "NOT_RUN" + if fisher_exact is None: + raise RuntimeError("scipy is required for enrichment") + if os.path.isdir(functional_db): + bed_files = sorted(glob.glob(os.path.join(functional_db, "*.bed"))) + else: + bed_files = [functional_db] + if not bed_files: + raise RuntimeError("No enrichment BED files found") + + # write background windows bed + bg_bed = os.path.join(out_dir, "windows.bed") + windows_df[["chrom", "start", "end"]].to_csv(bg_bed, sep="\t", header=False, index=False) + + clusters = pd.read_csv(clusters_path, sep="\t") + clusters = clusters[clusters["cluster"] >= 0] + if clusters.empty: + stub = os.path.join(out_dir, "no_clusters.txt") + with open(stub, "w") as handle: + handle.write("NO_CLUSTERS\n") + return "NOT_RUN" + + all_results = [] + bg_total = len(windows_df) + for cluster_id in sorted(clusters["cluster"].unique()): + cluster_bins = clusters[clusters["cluster"] == cluster_id] + cluster_bed = os.path.join(out_dir, f"cluster_{cluster_id}.bed") + cluster_bins[["chrom", "start", "end"]].to_csv(cluster_bed, sep="\t", header=False, index=False) + cluster_total = len(cluster_bins) + + cluster_results = [] + for bed in bed_files: + if not os.path.exists(bed): + continue + annot = os.path.basename(bed) + p1 = subprocess.Popen(["bedtools", "intersect", "-u", "-a", cluster_bed, "-b", bed], stdout=subprocess.PIPE, text=True) + p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE, text=True) + p1.stdout.close() + overlap_cluster = int((p2.communicate()[0] or "0").strip()) + + p1 = subprocess.Popen(["bedtools", "intersect", "-u", "-a", bg_bed, "-b", bed], stdout=subprocess.PIPE, text=True) + p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE, text=True) + p1.stdout.close() + overlap_bg = int((p2.communicate()[0] or "0").strip()) + a = overlap_cluster + b = cluster_total - overlap_cluster + c = overlap_bg - overlap_cluster + d = (bg_total - overlap_bg) - b + if d < 0: + d = 0 + odds, pval = fisher_exact([[a, b], [c, d]], alternative="greater") + cluster_results.append({ + "cluster": cluster_id, + "annotation": annot, + "overlap_cluster": a, + "cluster_total": cluster_total, + "overlap_background": overlap_bg, + "background_total": bg_total, + "odds_ratio": odds, + "pval": pval, + }) + if cluster_results: + pvals = [row["pval"] for row in cluster_results] + fdrs = bh_fdr(np.array(pvals)) + for row, fdr in zip(cluster_results, fdrs): + row["fdr"] = fdr + out_path = os.path.join(out_dir, f"cluster_{cluster_id}.enrichment.tsv") + pd.DataFrame(cluster_results).to_csv(out_path, sep="\t", index=False) + all_results.extend(cluster_results) + + if all_results: + pd.DataFrame(all_results).to_csv(os.path.join(out_dir, "enrichment_summary.tsv"), sep="\t", index=False) + return "RUN" + return "NOT_RUN" + + +def write_summary(path, group, treated, control, n_bins, n_fdr, n_up, n_down, n_clusters, minpts, minsamps, status, reason, enrichment_status): + header = [ + "group", + "caller", + "treated", + "control", + "n_bins_tested", + "n_fdr_pass", + "n_up", + "n_down", + "n_clusters", + "chosen_minPts", + "chosen_minSamps", + "status", + "reason", + "enrichment_status", + ] + with open(path, "w", newline="") as handle: + writer = csv.writer(handle, delimiter="\t") + writer.writerow(header) + writer.writerow([ + group, + "NA", + treated, + control, + n_bins, + n_fdr, + n_up, + n_down, + n_clusters, + minpts, + minsamps, + status, + reason, + enrichment_status, + ]) + + +def write_empty_outputs(outdir, group, sample_ids, treated, control, reason): + windows_path = os.path.join(outdir, "chipbinner.windows.bed") + counts_path = os.path.join(outdir, "chipbinner.bin_counts.tsv") + norm_path = os.path.join(outdir, "chipbinner.normalized_matrix.tsv") + grid_path = os.path.join(outdir, "chipbinner.hdbscan_grid_summary.tsv") + clusters_path = os.path.join(outdir, "chipbinner.clusters.tsv") + diff_path = os.path.join(outdir, "chipbinner.differential.tsv") + summary_path = os.path.join(outdir, "chipbinner.summary.tsv") + ensure_dir(os.path.join(outdir, "plots")) + ensure_dir(os.path.join(outdir, "enrichment")) + + with open(windows_path, "w") as handle: + handle.write("") + with open(counts_path, "w") as handle: handle.write("bin\t" + "\t".join(sample_ids) + "\n") - for idx, (chrom, start, end) in enumerate(bins): - values = [str((idx + 1) * (s_idx + 1)) for s_idx in range(len(sample_ids))] - handle.write(f"{chrom}:{start}-{end}\t" + "\t".join(values) + "\n") with open(norm_path, "w") as handle: handle.write("bin\t" + "\t".join(sample_ids) + "\n") - for idx, (chrom, start, end) in enumerate(bins): - values = [str((idx + 1) * (s_idx + 1)) for s_idx in range(len(sample_ids))] - handle.write(f"{chrom}:{start}-{end}\t" + "\t".join(values) + "\n") - - # HDBSCAN grid summary - grid_path = os.path.join(args.outdir, "chipbinner.hdbscan_grid_summary.tsv") - minpts_values = [x for x in args.grid_minpts.split(",") if x] or ["100"] - minsamps_values = [x for x in args.grid_minsamps.split(",") if x] or ["100"] with open(grid_path, "w") as handle: - handle.write("minPts\tminSamps\tn_clusters\tstatus\n") - for mpt in minpts_values: - for ms in minsamps_values: - handle.write(f"{mpt}\t{ms}\t2\tOK\n") - - # Clusters - clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") + handle.write("minPts\tminSamps\tn_clusters\tnoise_fraction\tsilhouette\tscore\tstatus\tselected\n") with open(clusters_path, "w") as handle: handle.write("chrom\tstart\tend\tcluster\n") - for idx, (chrom, start, end) in enumerate(bins): - handle.write(f"{chrom}\t{start}\t{end}\t1\n") - - # Differential - diff_path = os.path.join(args.outdir, "chipbinner.differential.tsv") with open(diff_path, "w") as handle: - handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\tcluster\n") - for idx, (chrom, start, end) in enumerate(bins): - handle.write(f"{chrom}\t{start}\t{end}\t0.0\t1.0\t1.0\t1\n") + handle.write("chrom\tstart\tend\tbin_id\tlog2FC\tpval\tFDR\tcluster\tdirection\n") + write_summary(summary_path, group, treated, control, 0, 0, 0, 0, 0, "NA", "NA", "FAIL", reason, "NOT_RUN") - # Summary - summary_path = os.path.join(args.outdir, "chipbinner.summary.tsv") - with open(summary_path, "w") as handle: - handle.write("group\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\n") - handle.write(f"{args.group}\t{treated}\t{control}\t{len(bins)}\t0\t0\t0\t1\t{minpts_values[0]}\t{minsamps_values[0]}\tRUN\tok\n") - # Enrichment stub +def main(): + args = parse_args() + ensure_dir(args.outdir) + plots_dir = os.path.join(args.outdir, "plots") enrichment_dir = os.path.join(args.outdir, "enrichment") - os.makedirs(enrichment_dir, exist_ok=True) - enrichment_file = os.path.join(enrichment_dir, "not_run.txt") - if args.functional_db: - enrichment_file = os.path.join(enrichment_dir, "enrichment_stub.txt") - with open(enrichment_file, "w") as handle: - handle.write("Enrichment not run\n" if not args.functional_db else "Enrichment stub\n") - - with open(os.path.join(plots_dir, "chipbinner_stub.txt"), "w") as handle: - handle.write("stub plots\n") + ensure_dir(plots_dir) + ensure_dir(enrichment_dir) + + contrast = [c.strip() for c in args.contrast.split(",") if c.strip()] + if len(contrast) != 2: + raise RuntimeError("Contrast must contain treated and control labels") + treated_label, control_label = contrast + + samples = read_tsv(args.samples) + if not samples: + raise RuntimeError("No samples provided") + + samples = [row for row in samples if row.get("condition") in (treated_label, control_label)] + if not samples: + raise RuntimeError("No samples match contrast labels") + + sample_ids = [row.get("sample_id", "") for row in samples] + + samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") + write_samplesheet(samples, samplesheet_path) + + windows_source = choose_windows_file(args.windows_dir, args.bin_size) + windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") + try: + if windows_source: + if args.blacklist and os.path.exists(args.blacklist): + with tempfile.TemporaryDirectory() as tmpdir: + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", windows_source, "-b", args.blacklist], stdout=open(filtered_path, "w")) + run_cmd(["sort", "-k1,1", "-k2,2n", filtered_path], stdout=open(windows_path, "w")) + else: + run_cmd(["sort", "-k1,1", "-k2,2n", windows_source], stdout=open(windows_path, "w")) + else: + generate_windows(args.chrom_sizes, args.bin_size, args.blacklist, windows_path) + + windows_df = load_windows(windows_path) + + counts_raw_path = os.path.join(args.outdir, "chipbinner.bin_counts.tsv") + counts_df, sample_ids = compute_counts(windows_path, samples, counts_raw_path, use_input=args.use_input) + write_matrix(counts_df, sample_ids, counts_raw_path) + + norm_matrix = normalize_counts(counts_df, sample_ids, samples, args.use_spikein, args.pseudocount) + norm_path = os.path.join(args.outdir, "chipbinner.normalized_matrix.tsv") + norm_matrix.insert(0, "bin_id", counts_df["bin_id"]) + norm_matrix.to_csv(norm_path, sep="\t", index=False) + + plot_pca(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, samples, os.path.join(plots_dir, "chipbinner.pca.png")) + plot_correlation(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, os.path.join(plots_dir, "chipbinner.correlation.png")) + + minpts_list = parse_grid(args.grid_minpts, [100, 200, 500, 1000]) + minsamps_list = parse_grid(args.grid_minsamps, [100, 200, 500, 1000]) + grid_dir = os.path.join(args.outdir, "hdbscan_grid") + grid_summary_path = os.path.join(args.outdir, "chipbinner.hdbscan_grid_summary.tsv") + chosen, grid_summary_path = hdbscan_grid(norm_matrix.drop(columns=["bin_id"]), windows_df, minpts_list, minsamps_list, grid_dir, grid_summary_path) + + # chosen clusters + clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") + chosen_df = windows_df.copy() + chosen_df["cluster"] = chosen["labels"] + chosen_df.to_csv(clusters_path, sep="\t", index=False) + + # Differential with ROTS + rots_out = os.path.join(args.outdir, "chipbinner.rots.tsv") + run_rots(norm_path, samplesheet_path, treated_label, control_label, args.bootstrap, args.k_value, rots_out) + rots_df = pd.read_csv(rots_out, sep="\t") + + norm_vals = norm_matrix.drop(columns=["bin_id"]).values + treated_idx = [i for i, row in enumerate(samples) if row.get("condition") == treated_label] + control_idx = [i for i, row in enumerate(samples) if row.get("condition") == control_label] + treated_mean = norm_vals[:, treated_idx].mean(axis=1) + control_mean = norm_vals[:, control_idx].mean(axis=1) + log2fc = np.log2(treated_mean + 1) - np.log2(control_mean + 1) + + diff_df = pd.DataFrame({ + "chrom": windows_df["chrom"], + "start": windows_df["start"], + "end": windows_df["end"], + "bin_id": windows_df["bin_id"], + "log2FC": log2fc, + }) + diff_df = diff_df.merge(rots_df, on="bin_id", how="left") + diff_df["cluster"] = chosen_df["cluster"].values + + direction = [] + for _, row in diff_df.iterrows(): + if row.get("FDR") is not None and float(row["FDR"]) <= args.fdr: + if row["log2FC"] >= args.lfc: + direction.append("UP") + elif row["log2FC"] <= -args.lfc: + direction.append("DOWN") + else: + direction.append("NS") + else: + direction.append("NS") + diff_df["direction"] = direction + + diff_path = os.path.join(args.outdir, "chipbinner.differential.tsv") + diff_df.to_csv(diff_path, sep="\t", index=False) + + n_bins = len(diff_df) + n_fdr = int((diff_df["FDR"] <= args.fdr).sum()) if "FDR" in diff_df else 0 + n_up = int((diff_df["direction"] == "UP").sum()) + n_down = int((diff_df["direction"] == "DOWN").sum()) + n_clusters = len({lab for lab in chosen["labels"] if lab >= 0}) + + enrichment_status = run_enrichment(windows_df, clusters_path, args.functional_db, enrichment_dir) + + summary_path = os.path.join(args.outdir, "chipbinner.summary.tsv") + write_summary(summary_path, args.group, treated_label, control_label, n_bins, n_fdr, n_up, n_down, n_clusters, chosen["minPts"], chosen["minSamps"], "RUN", "ok", enrichment_status) + + except Exception as exc: + if args.allow_partial: + reason = str(exc) + write_empty_outputs(args.outdir, args.group, sample_ids, treated_label, control_label, reason) + else: + raise if __name__ == "__main__": diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf index 984268e6..6fa3a791 100644 --- a/modules/local/chipbinner_run.nf +++ b/modules/local/chipbinner_run.nf @@ -1,7 +1,7 @@ process CHIPBINNER_RUN { label 'process_chipbinner' - conda "conda-forge::python=3.8.3" + conda "conda-forge::python=3.8.3 conda-forge::numpy=1.24.4 conda-forge::pandas=2.0.3 conda-forge::scikit-learn=1.3.2 conda-forge::matplotlib=3.7.5 conda-forge::seaborn=0.12.2 conda-forge::hdbscan=0.8.33 conda-forge::scipy=1.10.1 bioconda::bedtools=2.31.1 bioconda::samtools=1.17 conda-forge::r-base=4.2.3 bioconda::bioconductor-rots=1.20.0" container "quay.io/biocontainers/python:3.8.3" input: @@ -9,7 +9,9 @@ process CHIPBINNER_RUN { val contrast val bin_size val windows_dir + val blacklist val use_input + val use_spikein val pseudocount val grid_minpts val grid_minsamps @@ -18,6 +20,7 @@ process CHIPBINNER_RUN { val bootstrap val k_value val functional_db + val allow_partial output: tuple val(group), path("chipbinner.samplesheet.csv"), emit: samplesheet @@ -25,6 +28,7 @@ process CHIPBINNER_RUN { tuple val(group), path("chipbinner.bin_counts.tsv"), emit: counts tuple val(group), path("chipbinner.normalized_matrix.tsv"), emit: normalized tuple val(group), path("chipbinner.hdbscan_grid_summary.tsv"), emit: grid + tuple val(group), path("hdbscan_grid"), emit: grid_outputs tuple val(group), path("chipbinner.clusters.tsv"), emit: clusters tuple val(group), path("chipbinner.differential.tsv"), emit: differential tuple val(group), path("chipbinner.summary.tsv"), emit: summary @@ -37,8 +41,11 @@ process CHIPBINNER_RUN { script: def windows_arg = windows_dir ? "--windows-dir ${windows_dir}" : '' + def blacklist_arg = blacklist ? "--blacklist ${blacklist}" : '' def use_input_arg = use_input ? "--use-input" : '' + def use_spikein_arg = use_spikein ? "--use-spikein" : '' def functional_arg = functional_db ? "--functional-db ${functional_db}" : '' + def allow_partial_arg = allow_partial ? "--allow-partial" : '' """ chipbinner_run.py \ --samples ${records} \ @@ -47,7 +54,9 @@ process CHIPBINNER_RUN { --chrom-sizes ${chrom_sizes} \ --bin-size ${bin_size} \ ${windows_arg} \ + ${blacklist_arg} \ ${use_input_arg} \ + ${use_spikein_arg} \ --pseudocount ${pseudocount} \ --grid-minpts '${grid_minpts}' \ --grid-minsamps '${grid_minsamps}' \ @@ -56,6 +65,7 @@ process CHIPBINNER_RUN { --bootstrap ${bootstrap} \ --k-value ${k_value} \ ${functional_arg} \ + ${allow_partial_arg} \ --outdir . cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 6cd71d9d..348426de 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -278,7 +278,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_chip_records_json.map { it[0] }, ch_chip_records_json.map { 'NA' }, ch_chip_records_json.map { it[1] }, - 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tbigwig_path\tspikein_scale_factor\tms_coeff', + 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff', ch_chip_records_json.map { group, json -> "chipbinner_${group}.tsv" } ) ch_versions = ch_versions.mix(RECORDS_TO_TSV_CHIPBINNER.out.versions) @@ -297,7 +297,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { params.differential_contrast, params.chipbinner_bin_size, params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '', + params.blacklist ? file(params.blacklist).toString() : '', params.chipbinner_use_input, + use_spikein, params.chipbinner_pseudocount, params.chipbinner_hdbscan_grid_minpts, params.chipbinner_hdbscan_grid_minsamps, @@ -305,7 +307,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { params.chipbinner_lfc, params.chipbinner_bootstrap, params.chipbinner_k_value, - params.chipbinner_functional_db ? file(params.chipbinner_functional_db).toString() : '' + params.chipbinner_functional_db ? file(params.chipbinner_functional_db).toString() : '', + params.differential_allow_partial ) ch_versions = ch_versions.mix(CHIPBINNER_RUN.out.versions) From 093a5e5b1872c9620cc2d02d418a16b96a557f4d Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:26:21 -0800 Subject: [PATCH 16/52] Fix DiffBind error handling and plots Generate DiffBind plots, honor allow_partial, and publish samplesheets conditionally --- bin/diffbind_run.R | 194 +++++++++--------- conf/modules.config | 1 + modules/local/diffbind_run.nf | 19 +- .../local/differential_peak_calling.nf | 3 +- 4 files changed, 111 insertions(+), 106 deletions(-) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index 53f4bf84..25a07992 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -102,9 +102,8 @@ recenter <- as_bool(params[["recenter"]]) summits <- as.integer(params[["summits"]]) norm_method <- params[["norm_method"]] export_sheets <- as_bool(params[["export_sheets"]]) +allow_partial <- as_bool(params[["allow_partial"]]) extra_params_path <- params[["extra_params"]] -extra_params <- read_extra_params(extra_params_path) - use_spikein <- as_bool(params[["use_spikein"]]) if (is.null(records_path) || is.null(outdir)) { @@ -119,10 +118,6 @@ if (is.na(summits)) summits <- 0 if (!dir.exists(outdir)) { dir.create(outdir, recursive = TRUE) } -plots_dir <- file.path(outdir, "plots") -if (!dir.exists(plots_dir)) { - dir.create(plots_dir, recursive = TRUE) -} records <- read.delim(records_path, stringsAsFactors = FALSE, check.names = FALSE) @@ -139,88 +134,111 @@ samplesheet <- data.frame( ) samplesheet_path <- file.path(outdir, "diffbind.samplesheet.csv") -if (export_sheets) { - write.csv(samplesheet, samplesheet_path, row.names = FALSE) -} else { - write.csv(samplesheet, samplesheet_path, row.names = FALSE) -} - -write_stub <- function() { - peaks_file <- samplesheet$Peaks[1] - regions <- data.frame() - if (!is.null(peaks_file) && file.exists(peaks_file)) { - peak_rows <- tryCatch({ - read.delim(peaks_file, header = FALSE, stringsAsFactors = FALSE) - }, error = function(e) { - NULL - }) - if (!is.null(peak_rows) && nrow(peak_rows) > 0) { - peak_rows <- peak_rows[, 1:3] - colnames(peak_rows) <- c("chr", "start", "end") - regions <- peak_rows - } - } - - if (nrow(regions) == 0) { - regions <- data.frame(chr = character(), start = integer(), end = integer()) - } +write.csv(samplesheet, samplesheet_path, row.names = FALSE) - results <- data.frame( - chr = regions$chr, - start = regions$start, - end = regions$end, - log2FC = rep(0.0, nrow(regions)), - pval = rep(1.0, nrow(regions)), - FDR = rep(1.0, nrow(regions)), - stringsAsFactors = FALSE - ) - - write.table(results, file = file.path(outdir, "diffbind.results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) - - sig <- results[results$FDR <= fdr & abs(results$log2FC) >= lfc, ] - write.table(sig[, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - write.table(sig[sig$log2FC >= lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - write.table(sig[sig$log2FC <= -lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +contrast_labels <- strsplit(contrast, ",")[[1]] +contrast_labels <- trimws(contrast_labels) +if (length(contrast_labels) != 2) { + stop("contrast must contain exactly two comma-separated labels") +} +write_summary <- function(status, reason, n_tested = 0, n_fdr_pass = 0, n_up = 0, n_down = 0) { summary <- data.frame( caller = caller, group = group, - treated = strsplit(contrast, ",")[[1]][1], - control = strsplit(contrast, ",")[[1]][2], - n_tested = nrow(results), - n_fdr_pass = nrow(sig), - n_up = sum(sig$log2FC >= lfc), - n_down = sum(sig$log2FC <= -lfc), - status = "RUN", - reason = "ok", + treated = contrast_labels[1], + control = contrast_labels[2], + n_tested = n_tested, + n_fdr_pass = n_fdr_pass, + n_up = n_up, + n_down = n_down, + status = status, + reason = reason, stringsAsFactors = FALSE ) write.table(summary, file = file.path(outdir, "diffbind.summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) +} - if (use_spikein && "spikein_scale_factor" %in% colnames(records)) { - factors <- suppressWarnings(as.numeric(records$spikein_scale_factor)) - if (any(!is.na(factors))) { - size_factors <- ifelse(factors == 0, 1, 1 / factors) - norm_out <- data.frame(sample_id = records$sample_id, size_factor = size_factors, stringsAsFactors = FALSE) - write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) +write_error <- function(message) { + writeLines(message, con = file.path(outdir, "diffbind.error.txt")) +} + +fail_or_record <- function(status, reason, message) { + if (allow_partial) { + if (identical(status, "FAIL") && !is.null(message) && nzchar(message)) { + write_error(message) } + write_summary(status, reason) + quit(save = "no") } + stop(message) } -use_diffbind <- FALSE -if (requireNamespace("DiffBind", quietly = TRUE)) { - use_diffbind <- TRUE +check_peaks <- function(paths) { + issues <- list(missing = character(), empty = character()) + if (is.null(paths)) { + return(issues) + } + for (path in paths) { + if (is.na(path) || !nzchar(path)) { + issues$missing <- c(issues$missing, "") + next + } + if (!file.exists(path)) { + issues$missing <- c(issues$missing, path) + next + } + lines <- tryCatch(readLines(path, n = 2, warn = FALSE), error = function(e) character()) + nonempty <- lines[!grepl("^\\s*$|^#", lines)] + if (length(nonempty) == 0) { + issues$empty <- c(issues$empty, path) + } + } + issues +} + +peaks_issues <- check_peaks(samplesheet$Peaks) +if (length(peaks_issues$missing) > 0 || length(peaks_issues$empty) > 0) { + reason <- if (length(peaks_issues$missing) > 0) "missing_peaks" else "empty_peaks" + message_parts <- c() + if (length(peaks_issues$missing) > 0) { + message_parts <- c(message_parts, sprintf("missing peak files: %s", paste(unique(peaks_issues$missing), collapse = ", "))) + } + if (length(peaks_issues$empty) > 0) { + message_parts <- c(message_parts, sprintf("empty peak files: %s", paste(unique(peaks_issues$empty), collapse = ", "))) + } + detail <- paste(message_parts, collapse = "; ") + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, reason, detail) +} + +if (!requireNamespace("DiffBind", quietly = TRUE)) { + fail_or_record("FAIL", "diffbind_unavailable", "DiffBind package not available.") +} + +call_diffbind <- function(fun_name, args) { + if (!exists(fun_name, envir = asNamespace("DiffBind"), inherits = FALSE)) { + stop(sprintf("DiffBind function %s is not available.", fun_name)) + } + fun <- get(fun_name, envir = asNamespace("DiffBind")) + formal_names <- names(formals(fun)) + args <- args[names(args) %in% formal_names] + do.call(fun, args) } -if (!use_diffbind) { - write_stub() - quit(save = "no") +plot_to_pdf <- function(filename, fun_name, args) { + plots_dir <- file.path(outdir, "plots") + if (!dir.exists(plots_dir)) { + dir.create(plots_dir, recursive = TRUE) + } + pdf(file.path(plots_dir, filename)) + on.exit(dev.off(), add = TRUE) + call_diffbind(fun_name, args) } -# Best-effort DiffBind execution tryCatch({ suppressPackageStartupMessages(library(DiffBind)) - extra_params <- resolve_constants(extra_params) + extra_params <- resolve_constants(read_extra_params(extra_params_path)) dba_args <- utils::modifyList(list(sampleSheet = samplesheet), extra_params_for(extra_params, "dba")) dba_obj <- do.call(dba, dba_args) @@ -244,7 +262,6 @@ tryCatch({ dba_obj <- do.call(dba.normalize, norm_args) } - contrast_labels <- strsplit(contrast, ",")[[1]] contrast_args <- utils::modifyList(list(DBA = dba_obj, categories = DBA_CONDITION, group1 = contrast_labels[1], group2 = contrast_labels[2]), extra_params_for(extra_params, "dba_contrast")) dba_obj <- do.call(dba.contrast, contrast_args) @@ -252,6 +269,11 @@ tryCatch({ analyze_args <- utils::modifyList(list(DBA = dba_obj, method = method_flag), extra_params_for(extra_params, "dba_analyze")) dba_obj <- do.call(dba.analyze, analyze_args) + plot_to_pdf("PCA.pdf", "dba.plotPCA", list(DBA = dba_obj, attributes = DBA_CONDITION, label = DBA_ID)) + plot_to_pdf("correlation_heatmap.pdf", "dba.plotHeatmap", list(DBA = dba_obj, correlations = TRUE)) + plot_to_pdf("MA.pdf", "dba.plotMA", list(DBA = dba_obj, contrast = 1)) + plot_to_pdf("volcano.pdf", "dba.plotVolcano", list(DBA = dba_obj, contrast = 1)) + report_args <- utils::modifyList(list(DBA = dba_obj, th = fdr, fold = lfc), extra_params_for(extra_params, "dba_report")) report <- do.call(dba.report, report_args) results <- as.data.frame(report) @@ -271,39 +293,17 @@ tryCatch({ write.table(sig[sig$log2FC >= lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) write.table(sig[sig$log2FC <= -lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - summary <- data.frame( - caller = caller, - group = group, - treated = contrast_labels[1], - control = contrast_labels[2], + write_summary( + status = "RUN", + reason = "ok", n_tested = nrow(results_out), n_fdr_pass = nrow(sig), n_up = sum(sig$log2FC >= lfc), - n_down = sum(sig$log2FC <= -lfc), - status = "RUN", - reason = "ok", - stringsAsFactors = FALSE + n_down = sum(sig$log2FC <= -lfc) ) - write.table(summary, file = file.path(outdir, "diffbind.summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) saveRDS(dba_obj, file = file.path(outdir, "diffbind.dba.rds")) }, error = function(e) { - message("DiffBind failed, falling back to stub output: ", e$message) - write_stub() + fail_or_record("FAIL", "diffbind_error", e$message) }) - -norm_path <- file.path(outdir, "diffbind.normalization_factors.tsv") -if (!file.exists(norm_path)) { - norm_out <- data.frame( - sample_id = records$sample_id, - size_factor = rep(NA, nrow(records)), - stringsAsFactors = FALSE - ) - write.table(norm_out, file = norm_path, sep = "\t", quote = FALSE, row.names = FALSE) -} - -dba_path <- file.path(outdir, "diffbind.dba.rds") -if (!file.exists(dba_path)) { - file.create(dba_path) -} diff --git a/conf/modules.config b/conf/modules.config index 122a2102..5ffa6cbd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1195,6 +1195,7 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param ], [ path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, + enabled: { params.export_diffbind_sheets }, mode: "${params.publish_dir_mode}", pattern: "diffbind.samplesheet.csv", saveAs: { filename -> "${group}.csv" } diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf index 057518e4..7d54b888 100644 --- a/modules/local/diffbind_run.nf +++ b/modules/local/diffbind_run.nf @@ -17,17 +17,19 @@ process DIFFBIND_RUN { val norm_method val extra_params val export_sheets + val allow_partial output: - tuple val(group), val(caller), path("diffbind.results.tsv"), emit: results - tuple val(group), val(caller), path("diffbind.significant.bed"), emit: bed - tuple val(group), val(caller), path("diffbind.significant_up.bed"), emit: bed_up - tuple val(group), val(caller), path("diffbind.significant_down.bed"), emit: bed_down + tuple val(group), val(caller), path("diffbind.results.tsv", optional: true), emit: results + tuple val(group), val(caller), path("diffbind.significant.bed", optional: true), emit: bed + tuple val(group), val(caller), path("diffbind.significant_up.bed", optional: true), emit: bed_up + tuple val(group), val(caller), path("diffbind.significant_down.bed", optional: true), emit: bed_down tuple val(group), val(caller), path("diffbind.summary.tsv"), emit: summary tuple val(group), val(caller), path("diffbind.samplesheet.csv"), emit: samplesheet - tuple val(group), val(caller), path("diffbind.normalization_factors.tsv"), emit: norm_factors_out - tuple val(group), val(caller), path("diffbind.dba.rds"), emit: dba - tuple val(group), val(caller), path("plots"), emit: plots + tuple val(group), val(caller), path("diffbind.normalization_factors.tsv", optional: true), emit: norm_factors_out + tuple val(group), val(caller), path("diffbind.dba.rds", optional: true), emit: dba + tuple val(group), val(caller), path("diffbind.error.txt", optional: true), emit: error + tuple val(group), val(caller), path("plots", optional: true), emit: plots path "versions.yml", emit: versions when: @@ -51,7 +53,8 @@ process DIFFBIND_RUN { --summits ${summits} \ --norm_method ${norm_method} \ ${extra_arg} \ - --export_sheets ${export_sheets} + --export_sheets ${export_sheets} \ + --allow_partial ${allow_partial} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 348426de..d1539c75 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -242,7 +242,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { params.diffbind_summits, params.diffbind_norm_method, params.diffbind_extra_params ? file(params.diffbind_extra_params).toString() : '', - params.export_diffbind_sheets + params.export_diffbind_sheets, + params.differential_allow_partial ) ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) From a4b9f34f0f5fa062feb88809505592f269df3cf7 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:40:57 -0800 Subject: [PATCH 17/52] Fix differential annotation gene bed --- bin/annotate_regions.py | 10 +- bin/gtf_to_gene_bed.py | 74 +++++++++ conf/modules.config | 8 + modules/local/cache_differential_gene_bed.nf | 29 ++++ modules/local/gtf_to_gene_bed.nf | 31 ++++ .../local/differential_peak_calling.nf | 154 ++++++++++-------- workflows/cutandrun.nf | 4 +- workflows/differential_only.nf | 21 ++- 8 files changed, 253 insertions(+), 78 deletions(-) create mode 100755 bin/gtf_to_gene_bed.py create mode 100644 modules/local/cache_differential_gene_bed.nf create mode 100644 modules/local/gtf_to_gene_bed.nf diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index 6b6416f5..f15da0de 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -79,11 +79,17 @@ def main(): a_cols = 4 distance = parts[-1] if parts else "NA" b_cols = len(parts) - a_cols - 1 + gene_id = "NA" + gene_name = "NA" if b_cols >= 4: gene_id = parts[a_cols + 3] - else: + if b_cols >= 5: + gene_name = parts[a_cols + 4] + if gene_id in [".", ""]: gene_id = "NA" - annotations[idx] = (gene_id, gene_id, distance) + if gene_name in [".", ""]: + gene_name = "NA" + annotations[idx] = (gene_id, gene_name, distance) with open(args.out, "w") as handle: if header: diff --git a/bin/gtf_to_gene_bed.py b/bin/gtf_to_gene_bed.py new file mode 100755 index 00000000..c252dbc6 --- /dev/null +++ b/bin/gtf_to_gene_bed.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +import argparse +import gzip + + +def open_text(path): + if path.endswith(".gz"): + return gzip.open(path, "rt") + return open(path, "r") + + +def parse_attributes(attr_str): + attrs = {} + for raw in attr_str.strip().strip(";").split(";"): + item = raw.strip() + if not item: + continue + if "=" in item and " " not in item: + key, val = item.split("=", 1) + else: + parts = item.split(" ", 1) + if len(parts) < 2: + continue + key, val = parts[0], parts[1] + val = val.strip().strip('"') + attrs[key] = val + return attrs + + +def pick_attr(attrs, keys): + for key in keys: + val = attrs.get(key) + if val: + return val + return "NA" + + +def main(): + parser = argparse.ArgumentParser(description="Create gene BED from GTF") + parser.add_argument("--gtf", required=True) + parser.add_argument("--out", required=True) + parser.add_argument("--feature", default="gene") + args = parser.parse_args() + + with open_text(args.gtf) as handle, open(args.out, "w") as out: + for line in handle: + if not line or line.startswith("#"): + continue + fields = line.rstrip("\n").split("\t") + if len(fields) < 9: + continue + chrom, _source, feature, start, end, _score, _strand, _frame, attrs_str = fields[:9] + if feature != args.feature: + continue + try: + start_i = int(start) + end_i = int(end) + except ValueError: + continue + if end_i < start_i: + start_i, end_i = end_i, start_i + start0 = max(start_i - 1, 0) + + attrs = parse_attributes(attrs_str) + gene_id = pick_attr(attrs, ["gene_id", "ID", "gene", "transcript_id"]) + gene_name = pick_attr(attrs, ["gene_name", "Name", "gene"]) + if gene_id == "NA" and gene_name != "NA": + gene_id = gene_name + + out.write(f"{chrom}\t{start0}\t{end_i}\t{gene_id}\t{gene_name}\n") + + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index 5ffa6cbd..cc78f701 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1180,6 +1180,14 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param ] } + withName: '.*:CACHE_DIFFERENTIAL_GENE_BED' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: '.*:RECORDS_TO_TSV' { publishDir = [ enabled: false diff --git a/modules/local/cache_differential_gene_bed.nf b/modules/local/cache_differential_gene_bed.nf new file mode 100644 index 00000000..330d0aff --- /dev/null +++ b/modules/local/cache_differential_gene_bed.nf @@ -0,0 +1,29 @@ +process CACHE_DIFFERENTIAL_GENE_BED { + label 'process_single' + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path gene_bed + + output: + path "differential_manifest.gene_bed.bed", emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + cp ${gene_bed} differential_manifest.gene_bed.bed + + coreutils_version=\$(cat --version | head -n 1 | awk '{print $NF}') + { + echo "\"${task.process}\":" + echo " coreutils: $coreutils_version" + } > versions.yml + """ +} diff --git a/modules/local/gtf_to_gene_bed.nf b/modules/local/gtf_to_gene_bed.nf new file mode 100644 index 00000000..38e9ccba --- /dev/null +++ b/modules/local/gtf_to_gene_bed.nf @@ -0,0 +1,31 @@ +process GTF_TO_GENE_BED { + tag "$gtf" + label 'process_low' + + conda "conda-forge::python=3.11.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.11.6' : + 'quay.io/biocontainers/python:3.11.6' }" + + input: + path gtf + + output: + path "gene_annotation.bed", emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + gtf_to_gene_bed.py \\ + --gtf $gtf \\ + --out gene_annotation.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | awk '{print $NF}') + END_VERSIONS + """ +} diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index d1539c75..dfa8e929 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -14,6 +14,9 @@ include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" +include { GTF_TO_GENE_BED } from "../../modules/local/gtf_to_gene_bed" +include { BEDTOOLS_SORT as ANNOTATION_GENE_BED_SORT } from "../../modules/local/for_patch/bedtools/sort/main" +include { CACHE_DIFFERENTIAL_GENE_BED } from "../../modules/local/cache_differential_gene_bed" include { DIFFERENTIAL_SUMMARY } from "../../modules/local/differential_summary" include { MULTIQC_DIFFERENTIAL } from "../../modules/local/multiqc_differential" @@ -26,6 +29,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_scale_factors ch_chrom_sizes ch_gene_bed + ch_gtf ch_blacklist ch_samples_manifest_in ch_peaks_manifest_in @@ -180,7 +184,35 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_peaks_rows = ch_peaks_manifest.splitCsv(header: true, sep: '\t') ch_design_rows = DIFFERENTIAL_DESIGN.out.design.splitCsv(header: true, sep: '\t') - ch_gene_bed_single = ch_gene_bed.first() + def annotation_enabled = (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_publish_manifest_only + ch_gene_bed_single = Channel.empty() + if (annotation_enabled) { + GTF_TO_GENE_BED ( ch_gtf ) + ch_versions = ch_versions.mix(GTF_TO_GENE_BED.out.versions) + + ANNOTATION_GENE_BED_SORT ( + GTF_TO_GENE_BED.out.bed.map { bed -> [ [id: bed.baseName], bed ] }, + "bed", + [] + ) + ch_versions = ch_versions.mix(ANNOTATION_GENE_BED_SORT.out.versions) + + ch_gene_bed_from_gtf = ANNOTATION_GENE_BED_SORT.out.sorted.map { it[1] } + ch_gene_bed_selected = ch_gene_bed.ifEmpty(ch_gene_bed_from_gtf) + + ch_gene_bed_single = ch_gene_bed_selected + .collect() + .map { beds -> + if (!beds || beds.size() == 0) { + exit 1, "Differential annotation requires --gene_bed or --gtf." + } + beds[0] + } + + CACHE_DIFFERENTIAL_GENE_BED ( ch_gene_bed_single ) + ch_versions = ch_versions.mix(CACHE_DIFFERENTIAL_GENE_BED.out.versions) + ch_gene_bed_single = CACHE_DIFFERENTIAL_GENE_BED.out.bed.first() + } def contrast_labels = params.differential_contrast.split(',').collect { it.trim() } ch_samples_rows_contrast = ch_samples_rows.filter { row -> contrast_labels.contains(row.condition) } @@ -247,16 +279,18 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) - ch_diffbind_annot = DIFFBIND_RUN.out.results - .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def caller = tuple[1] - def regions = tuple[2] - [ 'diffbind', group, caller, regions, gene_bed, "01_diffbind/${caller}/${group}/diffbind.results.annotated.tsv" ] - } + if (annotation_enabled) { + ch_diffbind_annot = DIFFBIND_RUN.out.results + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def caller = tuple[1] + def regions = tuple[2] + [ 'diffbind', group, caller, regions, gene_bed, "01_diffbind/${caller}/${group}/diffbind.results.annotated.tsv" ] + } - ch_annotation_requests = ch_annotation_requests.mix(ch_diffbind_annot) + ch_annotation_requests = ch_annotation_requests.mix(ch_diffbind_annot) + } ch_diffbind_summary = DIFFBIND_RUN.out.summary.map { it[2] } } @@ -313,68 +347,39 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(CHIPBINNER_RUN.out.versions) - ch_chip_annot = CHIPBINNER_RUN.out.differential - .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def regions = tuple[1] - [ 'chipbinner', group, 'NA', regions, gene_bed, "02_chipbinner/${group}/chipbinner.differential.annotated.tsv" ] - } + if (annotation_enabled) { + ch_chip_annot = CHIPBINNER_RUN.out.differential + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def regions = tuple[1] + [ 'chipbinner', group, 'NA', regions, gene_bed, "02_chipbinner/${group}/chipbinner.differential.annotated.tsv" ] + } - ch_annotation_requests = ch_annotation_requests.mix(ch_chip_annot) + ch_annotation_requests = ch_annotation_requests.mix(ch_chip_annot) + } ch_chipbinner_summary = CHIPBINNER_RUN.out.summary.map { it[1] } } if (params.run_span_diff && !params.differential_publish_manifest_only) { ch_span_design = ch_design_rows .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_span == 'true' } - .map { row -> [row.group, row] } - - ch_span_records = ch_samples_rows_contrast - .map { row -> [row.group, row] } - .groupTuple(by: [0]) - .join(ch_span_design) - .map { group, rows, design -> [ group, rows ] } - - ch_span_records - .map { group, rows -> [ group, JsonOutput.toJson(rows) ] } - .set { ch_span_records_json } - - RECORDS_TO_TSV_SPAN ( - ch_span_records_json.map { it[0] }, - ch_span_records_json.map { 'NA' }, - ch_span_records_json.map { it[1] }, - 'sample_id\tgroup\tcondition\treplicate\tfinal_bam', - ch_span_records_json.map { group, json -> "span_${group}.tsv" } - ) - ch_versions = ch_versions.mix(RECORDS_TO_TSV_SPAN.out.versions) - - ch_span_records_file = RECORDS_TO_TSV_SPAN.out.tsv - .map { group, caller, records_file -> [group, records_file] } + .map { row -> row.group } def span_caller_priority = (params.callers ?: []) .collect { it.toString().toLowerCase() } .findAll { it.startsWith('span') || it.startsWith('omnipeak') } + .join(',') - ch_span_peaks = ch_peaks_rows - .filter { row -> - def caller = row.caller?.toString()?.toLowerCase() - caller && (caller.startsWith('span') || caller.startsWith('omnipeak')) - } - .map { row -> [row.group, row.caller?.toString()?.toLowerCase(), row.peaks_path] } - .groupTuple(by: [0]) - .map { group, entries -> - def peaks_by_caller = entries.groupBy { it[1] } - def chosen = span_caller_priority.find { peaks_by_caller.containsKey(it) } ?: peaks_by_caller.keySet().sort()[0] - if (peaks_by_caller.size() > 1) { - log.warn "Multiple SPAN callers for group ${group}; using '${chosen}' for differential peaks." - } - [group, peaks_by_caller[chosen][0][2]] - } + ch_samples_manifest_single = ch_samples_manifest.first() + ch_peaks_manifest_single = ch_peaks_manifest.first() + ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } - ch_span_inputs = ch_span_records_file - .join(ch_span_peaks) - .map { group, records_file, peaks_path -> [ group, records_file, peaks_path ] } + ch_span_inputs = ch_span_design + .combine(ch_samples_manifest_single) + .combine(ch_peaks_manifest_single) + .combine(ch_chrom_sizes_single) + .map { group, samples_manifest, peaks_manifest, chrom_sizes -> [ group, samples_manifest, peaks_manifest, chrom_sizes ] } SPAN_DIFF_RUN ( ch_span_inputs, @@ -384,27 +389,34 @@ workflow DIFFERENTIAL_PEAK_CALLING { params.span_diff_gap, params.span_diff_bin, params.span_fallback_backend, + use_spikein, + params.differential_allow_partial, + span_caller_priority, params.omnipeaks_jar ? file(params.omnipeaks_jar) : null, params.span_diff_java_heap ) ch_versions = ch_versions.mix(SPAN_DIFF_RUN.out.versions) - ch_span_annot = SPAN_DIFF_RUN.out.differential - .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def regions = tuple[1] - [ 'span', group, 'NA', regions, gene_bed, "03_span/${group}/span.differential.annotated.tsv" ] - } + if (annotation_enabled) { + ch_span_annot = SPAN_DIFF_RUN.out.differential + .combine(ch_gene_bed_single) + .map { tuple, gene_bed -> + def group = tuple[0] + def regions = tuple[1] + [ 'span', group, 'NA', regions, gene_bed, "03_span/${group}/span.differential.annotated.tsv" ] + } - ch_annotation_requests = ch_annotation_requests.mix(ch_span_annot) + ch_annotation_requests = ch_annotation_requests.mix(ch_span_annot) + } ch_span_summary = SPAN_DIFF_RUN.out.summary.map { it[1] } } - ANNOTATE_REGIONS ( - ch_annotation_requests - ) - ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + if (annotation_enabled) { + ANNOTATE_REGIONS ( + ch_annotation_requests + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } ch_summary_files = ch_diffbind_summary.mix(ch_chipbinner_summary).mix(ch_span_summary) DIFFERENTIAL_SUMMARY ( diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index 445871db..509a14f4 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -663,6 +663,7 @@ workflow CUTANDRUN { } if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only)) { + ch_gene_bed_annotation = params.gene_bed ? PREPARE_GENOME.out.bed : Channel.empty() DIFFERENTIAL_PEAK_CALLING ( ch_samtools_bam, ch_samtools_bai, @@ -670,7 +671,8 @@ workflow CUTANDRUN { ch_peaks_all, ch_scale_factors, PREPARE_GENOME.out.chrom_sizes, - PREPARE_GENOME.out.bed, + ch_gene_bed_annotation, + PREPARE_GENOME.out.gtf, ch_blacklist, Channel.empty(), Channel.empty(), diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index cf84fcf8..cc99d5b5 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -14,6 +14,7 @@ workflow DIFFERENTIAL_ONLY { def manifest_dir = "${params.differential_from_run}/03_peak_calling/08_differential/00_manifests" def samples_manifest = "${manifest_dir}/differential_manifest.samples.tsv" def peaks_manifest = "${manifest_dir}/differential_manifest.peaks.tsv" + def cached_gene_bed = "${manifest_dir}/differential_manifest.gene_bed.bed" if (!file(samples_manifest).exists()) { exit 1, "Missing samples manifest: ${samples_manifest}" @@ -25,14 +26,25 @@ workflow DIFFERENTIAL_ONLY { ch_samples_manifest = Channel.fromPath(samples_manifest, checkIfExists: true) ch_peaks_manifest = Channel.fromPath(peaks_manifest, checkIfExists: true) - if (!params.gene_bed) { - exit 1, "--gene_bed (or --genome with a bed12 attribute) is required for differential analysis." - } if (params.run_span_diff && !params.omnipeaks_jar) { exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled." } - ch_gene_bed = Channel.fromPath(params.gene_bed, checkIfExists: true) + if (!params.gene_bed && !file(cached_gene_bed).exists() && !params.gtf) { + exit 1, "Differential annotation requires --gene_bed or --gtf (or a cached gene bed at ${cached_gene_bed})." + } + + ch_gene_bed = Channel.empty() + if (params.gene_bed) { + ch_gene_bed = Channel.fromPath(params.gene_bed, checkIfExists: true) + } else if (file(cached_gene_bed).exists()) { + ch_gene_bed = Channel.fromPath(cached_gene_bed, checkIfExists: true) + } + + ch_gtf = Channel.empty() + if (params.gtf) { + ch_gtf = Channel.from( file(params.gtf) ) + } ch_chrom_sizes = Channel.empty() if (params.run_chipbinner) { @@ -52,6 +64,7 @@ workflow DIFFERENTIAL_ONLY { Channel.empty(), ch_chrom_sizes, ch_gene_bed, + ch_gtf, Channel.empty(), ch_samples_manifest, ch_peaks_manifest, From 5147d64f6aa601b1d19d66313c5db7a6a1af480d Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:53:50 -0800 Subject: [PATCH 18/52] Implement SPAN/OmniPeak differential native+fallback --- assets/chrom_sizes_stub.sizes | 1 + bin/span_diff.py | 697 ++++++++++++++++-- bin/span_fallback_de.R | 143 ++++ bin/span_pooling_manifest.py | 38 + conf/base.config | 8 + conf/modules.config | 8 + modules/local/span_diff_run.nf | 28 +- modules/local/span_pooling_manifest.nf | 30 + .../local/differential_peak_calling.nf | 19 +- workflows/differential_only.nf | 5 +- 10 files changed, 913 insertions(+), 64 deletions(-) create mode 100644 assets/chrom_sizes_stub.sizes create mode 100755 bin/span_fallback_de.R create mode 100755 bin/span_pooling_manifest.py create mode 100644 modules/local/span_pooling_manifest.nf diff --git a/assets/chrom_sizes_stub.sizes b/assets/chrom_sizes_stub.sizes new file mode 100644 index 00000000..7fb29dd4 --- /dev/null +++ b/assets/chrom_sizes_stub.sizes @@ -0,0 +1 @@ +chr1 1 diff --git a/bin/span_diff.py b/bin/span_diff.py index 6c7637f0..ad875b78 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -2,97 +2,461 @@ import argparse import csv import os +import re import subprocess import sys +from typing import Dict, List, Tuple -def read_tsv(path): +def read_tsv(path: str) -> List[Dict[str, str]]: with open(path, "r", newline="") as handle: reader = csv.DictReader(handle, delimiter="\t") return [row for row in reader] -def detect_native_support(jar_path, java_heap): +def write_tsv(path: str, header: List[str], rows: List[Dict[str, str]]) -> None: + with open(path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def parse_list(value: str) -> List[str]: + if not value: + return [] + return [x.strip().lower() for x in value.split(",") if x.strip()] + + +def is_span_caller(name: str) -> bool: + if not name: + return False + lowered = str(name).lower() + return lowered.startswith("span") or lowered.startswith("omnipeak") + + +def safe_float(value: str): + if value is None: + return None try: - result = subprocess.run( - ["java", f"-Xmx{java_heap}", "-jar", jar_path, "--help"], - capture_output=True, - text=True, - check=False, - ) - output = (result.stdout or "") + (result.stderr or "") + return float(value) + except Exception: + return None + + +def run_cmd(cmd: List[str], check: bool = True) -> subprocess.CompletedProcess: + return subprocess.run(cmd, capture_output=True, text=True, check=check) + + +def load_help_text(jar_path: str, java_heap: str, args: List[str]) -> Tuple[bool, str]: + cmd = ["java", f"-Xmx{java_heap}", "-jar", jar_path] + args + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=False) except FileNotFoundError: return False, "java_missing" + output = (result.stdout or "") + (result.stderr or "") + return True, output + + +def detect_signature(jar_path: str, java_heap: str) -> Tuple[bool, str, str]: + ok, help_text = load_help_text(jar_path, java_heap, ["--help"]) + if not ok: + return False, "java_missing", "" + if not help_text: + return False, "help_failed", "" + + lower_help = help_text.lower() + supports_compare = False + if re.search(r"\bcompare\b", lower_help): + supports_compare = True + elif re.search(r"\bdifferential\b", lower_help): + supports_compare = True + + compare_help = "" + if supports_compare: + ok_cmp, cmp_text = load_help_text(jar_path, java_heap, ["compare", "--help"]) + if ok_cmp and cmp_text: + compare_help = cmp_text + lower_cmp = cmp_text.lower() + if re.search(r"unknown command|no such command", lower_cmp): + supports_compare = False + else: + compare_help = help_text + + if not supports_compare: + return False, "no_compare", help_text + + signature = "compare:pooled" + probe_text = compare_help or help_text + for line in probe_text.splitlines(): + lline = line.lower() + if ("treat" in lline or "control" in lline or "case" in lline) and ( + "replicate" in lline + or "replicates" in lline + or "comma" in lline + or "list" in lline + or "bams" in lline + or "bam" in lline + or "files" in lline + ): + signature = "compare:list" + break + if "replicate" in probe_text.lower() or "comma" in probe_text.lower(): + signature = "compare:list" + + return True, signature, probe_text + + +def parse_flags(help_text: str) -> Dict[str, str]: + if not help_text: + return {} + long_flags = set(re.findall(r"--[A-Za-z0-9][A-Za-z0-9_.-]*", help_text)) + short_flags = set(re.findall(r"(? str: + for cand in candidates: + if cand in long_flags or cand in short_flags: + return cand + return "" + + flags = { + "treatment": pick(["--treatment", "--treated", "--treat", "--case", "-t"]), + "control": pick(["--control", "--ctrl", "--background", "--input", "-c"]), + "chrom": pick(["--chrom.sizes", "--chrom_sizes", "--chrom", "--cs"]), + "bin": pick(["--bin", "-b"]), + "gap": pick(["--gap", "-g"]), + "fdr": pick(["--fdr", "--q", "--qvalue"]), + "output": pick(["--peaks", "--output", "--out", "-p", "--prefix"]), + } + flags["output_is_prefix"] = "" + if "--prefix" in long_flags: + flags["output_is_prefix"] = "true" + if flags["output"] in {"--prefix"}: + flags["output_is_prefix"] = "true" + if flags["output"] == "-p": + for line in help_text.splitlines(): + if "-p" in line and "prefix" in line.lower(): + flags["output_is_prefix"] = "true" + break + if flags["output"] == "--peaks": + flags["output_is_prefix"] = "false" + return flags + + +def is_int(value: str) -> bool: + try: + int(value) + return True except Exception: - return False, "help_failed" - if "compare" in output.lower(): - return True, "compare" - return False, "no_compare" + return False -def read_peaks(peaks_path, max_rows=200): - regions = [] - with open(peaks_path, "r") as handle: +def read_intervals(path: str) -> List[Tuple[str, int, int]]: + intervals = [] + with open(path, "r") as handle: for line in handle: - if line.startswith("#") or not line.strip(): + line = line.strip() + if not line or line.startswith("#") or line.startswith("track") or line.startswith("browser"): continue - parts = line.strip().split("\t") + parts = line.split("\t") if len(parts) < 3: continue - regions.append((parts[0], parts[1], parts[2])) - if len(regions) >= max_rows: - break - return regions + if not is_int(parts[1]) or not is_int(parts[2]): + continue + start = int(parts[1]) + end = int(parts[2]) + if end <= start: + continue + intervals.append((parts[0], start, end)) + return intervals + + +def merge_intervals(intervals: List[Tuple[str, int, int]]) -> List[Tuple[str, int, int]]: + if not intervals: + return [] + intervals.sort(key=lambda x: (x[0], x[1], x[2])) + merged = [list(intervals[0])] + for chrom, start, end in intervals[1:]: + last = merged[-1] + if chrom != last[0] or start > last[2]: + merged.append([chrom, start, end]) + else: + last[2] = max(last[2], end) + return [tuple(x) for x in merged] + + +def write_bed(path: str, intervals: List[Tuple[str, int, int]]) -> None: + with open(path, "w") as handle: + for chrom, start, end in intervals: + handle.write(f"{chrom}\t{start}\t{end}\n") + + +def parse_native_output(path: str) -> List[Dict[str, str]]: + rows = [] + if not os.path.exists(path): + return rows + with open(path, "r") as handle: + lines = [line.rstrip("\n") for line in handle if line.strip() and not line.startswith("#")] + if not lines: + return rows + + header = None + first = lines[0].split("\t") + lower = [col.lower() for col in first] + if any(col in ("chr", "chrom", "chromosome") for col in lower) and any( + col.startswith("start") or col == "chromstart" for col in lower + ): + header = lower + data_lines = lines[1:] + else: + data_lines = lines + + def idx_for(names: List[str]): + if not header: + return None + for name in names: + if name in header: + return header.index(name) + return None + + idx_chr = idx_for(["chr", "chrom", "chromosome", "seqname"]) if header else 0 + idx_start = idx_for(["start", "chromstart", "begin"]) if header else 1 + idx_end = idx_for(["end", "chromend", "stop"]) if header else 2 + idx_log = idx_for(["log2fc", "logfc", "log2foldchange", "log2_fold_change", "log2foldchange"]) + idx_pval = idx_for(["pval", "pvalue", "p-value", "p.value"]) if header else None + idx_fdr = idx_for(["fdr", "qvalue", "q-value", "padj", "adj.p.val"]) if header else None + + for line in data_lines: + parts = line.split("\t") + if len(parts) < 3: + continue + chr_val = parts[idx_chr] if idx_chr is not None and idx_chr < len(parts) else parts[0] + start_val = parts[idx_start] if idx_start is not None and idx_start < len(parts) else parts[1] + end_val = parts[idx_end] if idx_end is not None and idx_end < len(parts) else parts[2] + + log_val = "NA" + pval_val = "NA" + fdr_val = "NA" + + if header and idx_log is not None and idx_log < len(parts): + log_val = parts[idx_log] + if header and idx_pval is not None and idx_pval < len(parts): + pval_val = parts[idx_pval] + if header and idx_fdr is not None and idx_fdr < len(parts): + fdr_val = parts[idx_fdr] + + if not header: + numeric = [] + for idx in range(3, len(parts)): + val = safe_float(parts[idx]) + if val is not None: + numeric.append((idx, val)) + for idx, val in numeric: + if log_val == "NA" and not (0.0 <= val <= 1.0): + log_val = parts[idx] + continue + if pval_val == "NA" and 0.0 <= val <= 1.0: + pval_val = parts[idx] + continue + if fdr_val == "NA" and 0.0 <= val <= 1.0: + fdr_val = parts[idx] + continue + if log_val == "NA" and numeric: + log_val = parts[numeric[0][0]] + + rows.append({ + "chr": chr_val, + "start": start_val, + "end": end_val, + "log2FC": log_val, + "pval": pval_val, + "FDR": fdr_val, + }) + return rows + + +def write_summary(path: str, group: str, treated: str, control: str, n_tested: int, n_fdr: int, n_up: int, n_down: int, + mode: str, status: str, reason: str) -> None: + header = [ + "group", + "treated", + "control", + "n_tested", + "n_fdr_pass", + "n_up", + "n_down", + "mode", + "status", + "reason", + ] + rows = [{ + "group": group, + "treated": treated, + "control": control, + "n_tested": str(n_tested), + "n_fdr_pass": str(n_fdr), + "n_up": str(n_up), + "n_down": str(n_down), + "mode": mode, + "status": status, + "reason": reason, + }] + write_tsv(path, header, rows) + + +def write_mode(path: str, mode: str, signature: str) -> None: + with open(path, "w") as handle: + handle.write(f"{mode}:{signature}\n") + + +def filter_bed_by_stats(rows: List[Dict[str, str]], fdr_cutoff: float, direction: str) -> List[Tuple[str, int, int]]: + out = [] + for row in rows: + fdr = safe_float(row.get("FDR")) + log2fc = safe_float(row.get("log2FC")) + if fdr is None or log2fc is None: + continue + if fdr > fdr_cutoff: + continue + if direction == "up" and log2fc <= 0: + continue + if direction == "down" and log2fc >= 0: + continue + if not is_int(row.get("start", "")) or not is_int(row.get("end", "")): + continue + out.append((row["chr"], int(row["start"]), int(row["end"]))) + return out + + +def write_failure_outputs(outdir: str, group: str, treated: str, control: str, mode: str, signature: str, reason: str) -> None: + diff_path = os.path.join(outdir, "span.differential.tsv") + bed_path = os.path.join(outdir, "span.differential.peaks.bed") + up_path = os.path.join(outdir, "span.up.bed") + down_path = os.path.join(outdir, "span.down.bed") + summary_path = os.path.join(outdir, "span.summary.tsv") + mode_path = os.path.join(outdir, "span.mode.txt") + + with open(diff_path, "w") as handle: + handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\n") + open(bed_path, "w").close() + open(up_path, "w").close() + open(down_path, "w").close() + write_summary(summary_path, group, treated, control, 0, 0, 0, 0, mode, "FAIL", reason) + write_mode(mode_path, mode, signature) def main(): - parser = argparse.ArgumentParser(description="Stub SPAN/OmniPeak differential") + parser = argparse.ArgumentParser(description="SPAN/OmniPeak differential runner") parser.add_argument("--jar", required=True) parser.add_argument("--mode", required=True) parser.add_argument("--contrast", required=True) parser.add_argument("--group", required=True) parser.add_argument("--samples", required=True) parser.add_argument("--peaks", required=True) + parser.add_argument("--caller-priority", default="") + parser.add_argument("--chrom-sizes") parser.add_argument("--bin", type=int, default=200) parser.add_argument("--gap", type=int, default=5) parser.add_argument("--fdr", type=float, default=0.05) parser.add_argument("--fallback-backend", default="DESeq2") + parser.add_argument("--use-spikein", default="false") + parser.add_argument("--allow-partial", default="false") parser.add_argument("--java-heap", default="8G") + parser.add_argument("--cpus", type=int, default=1) + parser.add_argument("--pooling-dir", default="") parser.add_argument("--outdir", default=".") args = parser.parse_args() + os.makedirs(args.outdir, exist_ok=True) + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] treated = contrast[0] if len(contrast) > 0 else "treated" control = contrast[1] if len(contrast) > 1 else "control" - native_supported, signature = detect_native_support(args.jar, args.java_heap) + use_spikein = str(args.use_spikein).lower() in {"true", "1", "yes"} + allow_partial = str(args.allow_partial).lower() in {"true", "1", "yes"} + + native_supported, signature, help_text = detect_signature(args.jar, args.java_heap) + chosen_mode = "fallback" - status = "RUN" - reason = "ok" if args.mode == "native": if not native_supported: - if signature == "java_missing": - print("WARNING: Java not found; falling back to SPAN fallback mode.", file=sys.stderr) - chosen_mode = "fallback" - reason = "java_missing_fallback" - else: - print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) - sys.exit(1) - else: - chosen_mode = "native" + reason = "native_not_supported" + write_failure_outputs(args.outdir, args.group, treated, control, "native", signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) + sys.exit(1) + chosen_mode = "native" elif args.mode == "fallback": chosen_mode = "fallback" else: chosen_mode = "native" if native_supported else "fallback" - if not native_supported and signature == "java_missing": - reason = "java_missing_fallback" - if not os.path.exists(args.peaks): - print("ERROR: SPAN peaks file not found for fallback.", file=sys.stderr) + samples = read_tsv(args.samples) + peaks = read_tsv(args.peaks) + + samples_group = [row for row in samples if row.get("group") == args.group and row.get("condition") in {treated, control}] + if not samples_group: + reason = "no_samples" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: No samples found for group.", file=sys.stderr) sys.exit(1) - os.makedirs(args.outdir, exist_ok=True) - regions = read_peaks(args.peaks) + peaks_group = [row for row in peaks if row.get("group") == args.group and row.get("condition") in {treated, control}] + span_peaks = [row for row in peaks_group if is_span_caller(row.get("caller"))] + if not span_peaks: + reason = "no_span_peaks" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: No SPAN/OmniPeak peaks found for group.", file=sys.stderr) + sys.exit(1) + + priority = [x for x in parse_list(args.caller_priority) if is_span_caller(x)] + available_callers = sorted({(row.get("caller") or "").lower() for row in span_peaks}) + chosen_caller = None + for cand in priority: + if cand in available_callers: + chosen_caller = cand + break + if not chosen_caller: + chosen_caller = available_callers[0] + if len(available_callers) > 1: + print( + f"WARNING: Multiple SPAN callers for group {args.group}; using '{chosen_caller}'.", + file=sys.stderr, + ) + + peaks_by_sample = {} + for row in sorted(span_peaks, key=lambda r: (r.get("sample_id"), r.get("caller"), r.get("peaks_path"))): + if (row.get("caller") or "").lower() != chosen_caller: + continue + sample_id = row.get("sample_id") + if sample_id and sample_id not in peaks_by_sample: + peaks_by_sample[sample_id] = row.get("peaks_path") + + missing_peaks = [row.get("sample_id") for row in samples_group if row.get("sample_id") not in peaks_by_sample] + if missing_peaks: + reason = "missing_peaks" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print(f"ERROR: Missing peaks for samples: {','.join(missing_peaks)}", file=sys.stderr) + sys.exit(1) + + samples_sorted = sorted(samples_group, key=lambda r: (r.get("condition"), r.get("replicate"), r.get("sample_id"))) + treated_samples = [row for row in samples_sorted if row.get("condition") == treated] + control_samples = [row for row in samples_sorted if row.get("condition") == control] + if not treated_samples or not control_samples: + reason = "missing_condition" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: Missing treated/control samples.", file=sys.stderr) + sys.exit(1) diff_path = os.path.join(args.outdir, "span.differential.tsv") bed_path = os.path.join(args.outdir, "span.differential.peaks.bed") @@ -100,25 +464,248 @@ def main(): down_path = os.path.join(args.outdir, "span.down.bed") summary_path = os.path.join(args.outdir, "span.summary.tsv") mode_path = os.path.join(args.outdir, "span.mode.txt") + pooling_path = os.path.join(args.outdir, "span_diff_target_pooling.tsv") - with open(diff_path, "w") as handle: - handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\n") - for chrom, start, end in regions: - handle.write(f"{chrom}\t{start}\t{end}\t0.0\t1.0\t1.0\n") + def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", reason: str = "ok"): + n_tested = len(rows) + n_fdr = 0 + n_up = 0 + n_down = 0 + for row in rows: + fdr_val = safe_float(row.get("FDR")) + log_val = safe_float(row.get("log2FC")) + if fdr_val is None or log_val is None: + continue + if fdr_val <= args.fdr: + n_fdr += 1 + if log_val > 0: + n_up += 1 + elif log_val < 0: + n_down += 1 + write_summary(summary_path, args.group, treated, control, n_tested, n_fdr, n_up, n_down, mode, status, reason) - with open(bed_path, "w") as handle: - for chrom, start, end in regions: - handle.write(f"{chrom}\t{start}\t{end}\n") + try: + if chosen_mode == "native": + if not native_supported: + reason = "native_not_supported" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: Native SPAN compare not supported.", file=sys.stderr) + sys.exit(1) - open(up_path, "w").close() - open(down_path, "w").close() + flags = parse_flags(help_text) + treat_flag = flags.get("treatment") or "-t" + ctrl_flag = flags.get("control") or "-c" + chrom_flag = flags.get("chrom") or "--cs" + bin_flag = flags.get("bin") or "--bin" + gap_flag = flags.get("gap") or "--gap" + fdr_flag = flags.get("fdr") or "--fdr" + out_flag = flags.get("output") or "-p" + out_is_prefix = flags.get("output_is_prefix") == "true" - with open(summary_path, "w") as handle: - handle.write("group\ttreated\tcontrol\tn_tested\tn_fdr_pass\tn_up\tn_down\tmode\tstatus\treason\n") - handle.write(f"{args.group}\t{treated}\t{control}\t{len(regions)}\t0\t0\t0\t{chosen_mode}\t{status}\t{reason}\n") + if not args.chrom_sizes or not os.path.exists(args.chrom_sizes): + reason = "missing_chrom_sizes" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print("ERROR: Chrom sizes file required for native SPAN compare.", file=sys.stderr) + sys.exit(1) + + treated_bams = [row.get("final_bam") for row in treated_samples] + control_bams = [row.get("final_bam") for row in control_samples] + for bam in treated_bams + control_bams: + if not bam or not os.path.exists(bam): + raise RuntimeError(f"Missing BAM: {bam}") + + use_lists = signature == "compare:list" + pooled_rows = [] + + def pool_bams(condition: str, bams: List[str]) -> str: + pooled_name = f"{args.group}.{condition}.pooled.bam" + pooled_path = os.path.join(args.outdir, pooled_name) + sorted_bams = sorted(bams) + cmd = ["samtools", "merge", "-f", "-@", str(max(args.cpus, 1)), pooled_path] + sorted_bams + run_cmd(cmd, check=True) + run_cmd(["samtools", "index", pooled_path], check=True) + pooled_manifest_path = pooled_path + if args.pooling_dir: + pooled_manifest_path = os.path.join(args.pooling_dir, pooled_name) + pooled_rows.append({ + "group": args.group, + "condition": condition, + "pooled_bam": pooled_manifest_path, + "source_bams": ",".join(sorted_bams), + }) + return pooled_path + + treated_arg = "" + control_arg = "" + if use_lists: + treated_arg = ",".join(treated_bams) + control_arg = ",".join(control_bams) + else: + if len(treated_bams) > 1: + treated_arg = pool_bams(treated, treated_bams) + else: + treated_arg = treated_bams[0] + if len(control_bams) > 1: + control_arg = pool_bams(control, control_bams) + else: + control_arg = control_bams[0] + + output_target = "span.native" + output_path = output_target + if out_is_prefix: + output_path = output_target + else: + output_path = f"{output_target}.peak" + + cmd = [ + "java", + f"-Xmx{args.java_heap}", + "-jar", + args.jar, + "compare", + treat_flag, + treated_arg, + ctrl_flag, + control_arg, + chrom_flag, + args.chrom_sizes, + bin_flag, + str(args.bin), + gap_flag, + str(args.gap), + fdr_flag, + str(args.fdr), + out_flag, + output_path, + ] + + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "native_compare_failed") + + native_output = None + if out_is_prefix: + candidates = [f for f in os.listdir(args.outdir) if f.startswith(output_target)] + candidates = [c for c in candidates if c.endswith(".peak") or c.endswith(".bed") or c.endswith(".tsv")] + candidates.sort() + if candidates: + native_output = os.path.join(args.outdir, candidates[0]) + else: + native_output = os.path.join(args.outdir, output_path) + + if not native_output or not os.path.exists(native_output): + raise RuntimeError("native_output_missing") + + rows = parse_native_output(native_output) + if not rows: + raise RuntimeError("native_output_empty") - with open(mode_path, "w") as handle: - handle.write(f"{chosen_mode}:{signature}\n") + write_tsv(diff_path, ["chr", "start", "end", "log2FC", "pval", "FDR"], rows) + bed_rows = [(r["chr"], int(r["start"]), int(r["end"])) for r in rows if is_int(r["start"]) and is_int(r["end"])] + bed_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(bed_path, bed_rows) + + up_rows = filter_bed_by_stats(rows, args.fdr, "up") + down_rows = filter_bed_by_stats(rows, args.fdr, "down") + up_rows.sort(key=lambda x: (x[0], x[1], x[2])) + down_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(up_path, up_rows) + write_bed(down_path, down_rows) + + if pooled_rows: + write_tsv( + pooling_path, + ["group", "condition", "pooled_bam", "source_bams"], + pooled_rows, + ) + + write_mode(mode_path, "native", signature) + finalize_success(rows, "native") + return + + # Fallback path + all_intervals = [] + for row in samples_sorted: + peak_path = peaks_by_sample.get(row.get("sample_id")) + if not peak_path or not os.path.exists(peak_path): + raise RuntimeError(f"Missing peaks file: {peak_path}") + all_intervals.extend(read_intervals(peak_path)) + + merged = merge_intervals(all_intervals) + if not merged: + raise RuntimeError("no_regions") + + write_bed(bed_path, merged) + + # counts + sample_ids = [row.get("sample_id") for row in samples_sorted] + bams = [row.get("final_bam") for row in samples_sorted] + for bam in bams: + if not bam or not os.path.exists(bam): + raise RuntimeError(f"Missing BAM: {bam}") + + counts_path = os.path.join(args.outdir, "span.counts.tsv") + cmd = ["bedtools", "multicov", "-bed", bed_path, "-bams"] + bams + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "bedtools_multicov_failed") + with open(counts_path, "w") as handle: + handle.write("chr\tstart\tend\t" + "\t".join(sample_ids) + "\n") + handle.write(result.stdout) + + samples_meta_path = os.path.join(args.outdir, "span.samples.tsv") + with open(samples_meta_path, "w") as handle: + handle.write("sample_id\tcondition\tspikein_scale_factor\n") + for row in samples_sorted: + handle.write( + f"{row.get('sample_id','')}\t{row.get('condition','')}\t{row.get('spikein_scale_factor','NA')}\n" + ) + + r_script = os.path.join(os.path.dirname(__file__), "span_fallback_de.R") + cmd = [ + "Rscript", + r_script, + "--counts", + counts_path, + "--samples", + samples_meta_path, + "--contrast", + args.contrast, + "--backend", + args.fallback_backend, + "--use-spikein", + "true" if use_spikein else "false", + "--out", + diff_path, + ] + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "fallback_de_failed") + + rows = parse_native_output(diff_path) + if not rows: + raise RuntimeError("fallback_output_empty") + + up_rows = filter_bed_by_stats(rows, args.fdr, "up") + down_rows = filter_bed_by_stats(rows, args.fdr, "down") + up_rows.sort(key=lambda x: (x[0], x[1], x[2])) + down_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(up_path, up_rows) + write_bed(down_path, down_rows) + + write_mode(mode_path, "fallback", signature) + finalize_success(rows, "fallback") + except Exception as exc: + reason = str(exc).split("\n")[0] if exc else "failed" + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + if allow_partial: + sys.exit(0) + print(f"ERROR: SPAN diff failed: {exc}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/bin/span_fallback_de.R b/bin/span_fallback_de.R new file mode 100755 index 00000000..25ccf23d --- /dev/null +++ b/bin/span_fallback_de.R @@ -0,0 +1,143 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +parse_args <- function(args) { + res <- list() + i <- 1 + while (i <= length(args)) { + key <- args[i] + if (grepl("^--", key)) { + name <- sub("^--", "", key) + if (i == length(args) || grepl("^--", args[i + 1])) { + res[[name]] <- TRUE + i <- i + 1 + } else { + res[[name]] <- args[i + 1] + i <- i + 2 + } + } else { + i <- i + 1 + } + } + res +} + +as_bool <- function(val) { + if (is.null(val)) { + return(FALSE) + } + if (is.logical(val)) { + return(val) + } + val <- tolower(as.character(val)) + return(val %in% c("true", "1", "yes")) +} + +params <- parse_args(args) +counts_path <- params[["counts"]] +samples_path <- params[["samples"]] +contrast <- params[["contrast"]] +backend <- params[["backend"]] +use_spikein <- as_bool(params[["use-spikein"]]) +out_path <- params[["out"]] + +if (is.null(counts_path) || is.null(samples_path) || is.null(out_path) || is.null(contrast)) { + stop("Missing required arguments: --counts, --samples, --contrast, --out") +} + +contrast_parts <- strsplit(contrast, ",")[[1]] +contrast_parts <- trimws(contrast_parts) +if (length(contrast_parts) < 2) { + stop("Contrast must contain treated,control") +} +treated <- contrast_parts[1] +control <- contrast_parts[2] + +counts <- read.delim(counts_path, check.names = FALSE, stringsAsFactors = FALSE) +if (nrow(counts) == 0) { + write.table(data.frame(chr=character(), start=integer(), end=integer(), log2FC=numeric(), pval=numeric(), FDR=numeric()), + file=out_path, sep="\t", quote=FALSE, row.names=FALSE) + quit(save = "no") +} + +region <- counts[, 1:3] +count_mat <- as.matrix(counts[, 4:ncol(counts), drop = FALSE]) +colnames(count_mat) <- colnames(counts)[4:ncol(counts)] + +samples <- read.delim(samples_path, stringsAsFactors = FALSE) +if (!"sample_id" %in% colnames(samples)) { + stop("Samples file must contain sample_id") +} + +sample_order <- match(colnames(count_mat), samples$sample_id) +if (any(is.na(sample_order))) { + stop("Counts contain sample IDs missing from samples metadata") +} + +samples <- samples[sample_order, ] +coldata <- data.frame(condition = factor(samples$condition, levels = c(control, treated))) +rownames(coldata) <- samples$sample_id + +size_factors <- NULL +if (use_spikein && "spikein_scale_factor" %in% colnames(samples)) { + factors <- suppressWarnings(as.numeric(samples$spikein_scale_factor)) + if (any(!is.na(factors))) { + size_factors <- ifelse(factors == 0, 1, 1 / factors) + size_factors[is.na(size_factors)] <- 1 + size_factors[!is.finite(size_factors)] <- 1 + } +} + +if (is.null(backend)) { + backend <- "DESeq2" +} + +backend <- as.character(backend) + +if (backend == "edgeR") { + if (!requireNamespace("edgeR", quietly = TRUE)) { + stop("edgeR package not available") + } + y <- edgeR::DGEList(counts = count_mat, group = coldata$condition) + if (!is.null(size_factors)) { + norm <- size_factors / exp(mean(log(size_factors))) + y$samples$norm.factors <- norm + } else { + y <- edgeR::calcNormFactors(y, method = "TMM") + } + design <- model.matrix(~coldata$condition) + y <- edgeR::estimateDisp(y, design) + fit <- edgeR::glmQLFit(y, design) + res <- edgeR::glmQLFTest(fit, coef = 2) + tbl <- res$table + tbl$FDR <- p.adjust(tbl$PValue, method = "BH") + log2fc <- tbl$logFC + pval <- tbl$PValue + fdr <- tbl$FDR +} else { + if (!requireNamespace("DESeq2", quietly = TRUE)) { + stop("DESeq2 package not available") + } + dds <- DESeq2::DESeqDataSetFromMatrix(countData = count_mat, colData = coldata, design = ~condition) + if (!is.null(size_factors)) { + DESeq2::sizeFactors(dds) <- size_factors + } + dds <- DESeq2::DESeq(dds) + res <- DESeq2::results(dds, contrast = c("condition", treated, control)) + log2fc <- res$log2FoldChange + pval <- res$pvalue + fdr <- res$padj +} + +out <- data.frame( + chr = region[[1]], + start = region[[2]], + end = region[[3]], + log2FC = log2fc, + pval = pval, + FDR = fdr, + stringsAsFactors = FALSE +) + +write.table(out, file = out_path, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/span_pooling_manifest.py b/bin/span_pooling_manifest.py new file mode 100755 index 00000000..47fde593 --- /dev/null +++ b/bin/span_pooling_manifest.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os + + +def read_rows(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def main(): + parser = argparse.ArgumentParser(description="Merge SPAN pooling manifests") + parser.add_argument("--inputs", nargs="+", required=True) + parser.add_argument("--out", required=True) + args = parser.parse_args() + + rows = [] + for path in args.inputs: + if not path or not os.path.exists(path): + continue + rows.extend(read_rows(path)) + + if not rows: + return + + rows.sort(key=lambda r: (r.get("group") or "", r.get("condition") or "", r.get("pooled_bam") or "")) + + header = ["group", "condition", "pooled_bam", "source_bams"] + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 56a639af..56cbc8ce 100644 --- a/conf/base.config +++ b/conf/base.config @@ -61,6 +61,14 @@ process { cpus = { check_max( 4, 'cpus' ) } memory = { check_max( 16.GB, 'memory' ) } } + withLabel:SPAN_NATIVE { + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 8.GB, 'memory' ) } + } + withLabel:SPAN_FALLBACK { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } withLabel:error_ignore { errorStrategy = 'ignore' } diff --git a/conf/modules.config b/conf/modules.config index cc78f701..d91db520 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1227,6 +1227,14 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param ] } + withName: '.*:SPAN_POOLING_MANIFEST' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: '.*:ANNOTATE_REGIONS' { publishDir = [ path: { "${params.outdir}/03_peak_calling/08_differential" }, diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf index 2a507c49..68baa4b0 100644 --- a/modules/local/span_diff_run.nf +++ b/modules/local/span_diff_run.nf @@ -1,17 +1,20 @@ process SPAN_DIFF_RUN { - label 'process_span' + label { params.span_diff_mode == 'fallback' ? 'SPAN_FALLBACK' : 'SPAN_NATIVE' } - conda "conda-forge::python=3.8.3 conda-forge::openjdk" + conda "conda-forge::python=3.8.3 conda-forge::openjdk=21.0.2 bioconda::samtools=1.16.1 bioconda::bedtools=2.31.0 conda-forge::r-base=4.2.3 bioconda::bioconductor-deseq2 bioconda::bioconductor-edger" container "quay.io/biocontainers/python:3.8.3" input: - tuple val(group), path(records), path(peaks) + tuple val(group), path(samples_manifest), path(peaks_manifest), path(chrom_sizes) val contrast val mode val fdr val gap val bin_size val fallback_backend + val use_spikein + val allow_partial + val caller_priority path omnipeaks_jar val java_heap @@ -22,30 +25,45 @@ process SPAN_DIFF_RUN { tuple val(group), path("span.down.bed"), emit: down tuple val(group), path("span.summary.tsv"), emit: summary tuple val(group), path("span.mode.txt"), emit: mode_out + tuple val(group), path("span_diff_target_pooling.tsv", optional: true), emit: pooling + path "*.pooled.bam", optional: true, emit: pooled_bam + path "*.pooled.bam.bai", optional: true, emit: pooled_bai path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: + def caller_arg = caller_priority ? "--caller-priority '${caller_priority}'" : '' + def pooling_dir = "${params.outdir}/03_peak_calling/08_differential/03_span/${group}" """ span_diff.py \ --jar ${omnipeaks_jar} \ --mode ${mode} \ --contrast '${contrast}' \ --group '${group}' \ - --samples ${records} \ - --peaks ${peaks} \ + --samples ${samples_manifest} \ + --peaks ${peaks_manifest} \ + ${caller_arg} \ + --chrom-sizes ${chrom_sizes} \ --bin ${bin_size} \ --gap ${gap} \ --fdr ${fdr} \ --fallback-backend ${fallback_backend} \ + --use-spikein ${use_spikein} \ + --allow-partial ${allow_partial} \ --java-heap ${java_heap} \ + --cpus ${task.cpus} \ + --pooling-dir '${pooling_dir}' \ --outdir . cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + java: \$(java -version 2>&1 | head -n 1 | sed -e 's/"//g') + bedtools: \$(bedtools --version | sed 's/bedtools v//') + samtools: \$(samtools --version | head -n 1 | sed 's/samtools //') + R: \$(R --version | head -n 1 | sed 's/.* //') END_VERSIONS """ } diff --git a/modules/local/span_pooling_manifest.nf b/modules/local/span_pooling_manifest.nf new file mode 100644 index 00000000..1526c343 --- /dev/null +++ b/modules/local/span_pooling_manifest.nf @@ -0,0 +1,30 @@ +process SPAN_POOLING_MANIFEST { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path pooling_files + val do_run + + output: + path "span_diff_target_pooling.tsv", emit: manifest + path "versions.yml", emit: versions + + when: + (task.ext.when == null || task.ext.when) && do_run + + script: + def inputs_arg = pooling_files instanceof List ? pooling_files.join(' ') : pooling_files + """ + span_pooling_manifest.py \ + --inputs ${inputs_arg} \ + --out span_diff_target_pooling.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index dfa8e929..6c089a9e 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -9,10 +9,10 @@ include { DIFFERENTIAL_DESIGN } from "../../modules/local/differential_design" include { PUBLISH_DIFFERENTIAL_MANIFESTS } from "../../modules/local/publish_differential_manifests" include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/records_to_tsv" include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" -include { RECORDS_TO_TSV as RECORDS_TO_TSV_SPAN } from "../../modules/local/records_to_tsv" include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" +include { SPAN_POOLING_MANIFEST } from "../../modules/local/span_pooling_manifest" include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" include { GTF_TO_GENE_BED } from "../../modules/local/gtf_to_gene_bed" include { BEDTOOLS_SORT as ANNOTATION_GENE_BED_SORT } from "../../modules/local/for_patch/bedtools/sort/main" @@ -373,7 +373,11 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_samples_manifest_single = ch_samples_manifest.first() ch_peaks_manifest_single = ch_peaks_manifest.first() - ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } + def span_chrom_sizes = ch_chrom_sizes + if (params.span_diff_mode == 'fallback') { + span_chrom_sizes = ch_chrom_sizes.ifEmpty(file("$projectDir/assets/chrom_sizes_stub.sizes")) + } + ch_chrom_sizes_single = span_chrom_sizes.collect().map { it[0] } ch_span_inputs = ch_span_design .combine(ch_samples_manifest_single) @@ -397,6 +401,17 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(SPAN_DIFF_RUN.out.versions) + ch_span_pooling_files = SPAN_DIFF_RUN.out.pooling.map { it[1] } + ch_span_pooling_files.into { ch_span_pooling_list_in; ch_span_pooling_count_in } + ch_span_pooling_list = ch_span_pooling_list_in.collect() + ch_span_pooling_flag = ch_span_pooling_count_in.count().map { it > 0 } + + SPAN_POOLING_MANIFEST ( + ch_span_pooling_list, + ch_span_pooling_flag + ) + ch_versions = ch_versions.mix(SPAN_POOLING_MANIFEST.out.versions) + if (annotation_enabled) { ch_span_annot = SPAN_DIFF_RUN.out.differential .combine(ch_gene_bed_single) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index cc99d5b5..92ce542f 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -47,9 +47,10 @@ workflow DIFFERENTIAL_ONLY { } ch_chrom_sizes = Channel.empty() - if (params.run_chipbinner) { + def need_chrom_sizes = params.run_chipbinner || (params.run_span_diff && params.span_diff_mode != 'fallback') + if (need_chrom_sizes) { if (!params.fasta) { - exit 1, "--fasta is required to compute chrom sizes for ChIPBinner in differential-only mode." + exit 1, "--fasta is required to compute chrom sizes for differential-only mode when running ChIPBinner or SPAN native/auto." } ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) CUSTOM_GETCHROMSIZES ( ch_fasta ) From 88a3a833b7eb4e8786a46eb5d5d450845fbfd82b Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 23:11:02 -0800 Subject: [PATCH 19/52] Add fast differential tests and fix status publishing --- conf/modules.config | 6 +- modules/local/diffbind_run.nf | 39 ++++++++ .../differential_manifest.peaks.tsv | 5 + .../differential_manifest.samples.tsv | 3 + .../insufficient_reps.peaks.tsv | 3 + .../insufficient_reps.samples.tsv | 3 + .../missing_condition.peaks.tsv | 4 + .../missing_condition.samples.tsv | 4 + .../differential_manifest_input/peaks.tsv | 4 + .../differential_manifest_input/samples.tsv | 4 + tests/assets/minimal.bed | 1 + tests/assets/minimal.fa | 2 + tests/assets/minimal.gtf | 1 + tests/assets/span_mode/peaks.tsv | 1 + tests/assets/span_mode/samples.tsv | 1 + tests/test_07_differential.yml | 95 +++++++++++++++++++ 16 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv create mode 100644 tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv create mode 100644 tests/assets/differential_design/insufficient_reps.peaks.tsv create mode 100644 tests/assets/differential_design/insufficient_reps.samples.tsv create mode 100644 tests/assets/differential_design/missing_condition.peaks.tsv create mode 100644 tests/assets/differential_design/missing_condition.samples.tsv create mode 100644 tests/assets/differential_manifest_input/peaks.tsv create mode 100644 tests/assets/differential_manifest_input/samples.tsv create mode 100644 tests/assets/minimal.bed create mode 100644 tests/assets/minimal.fa create mode 100644 tests/assets/minimal.gtf create mode 100644 tests/assets/span_mode/peaks.tsv create mode 100644 tests/assets/span_mode/samples.tsv diff --git a/conf/modules.config b/conf/modules.config index d91db520..eee7056a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1199,7 +1199,11 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param [ path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/${caller}/${group}" }, mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> + if (filename.equals('versions.yml')) return null + if (filename.equals('diffbind.samplesheet.csv')) return null + return filename + } ], [ path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf index 7d54b888..380cfa11 100644 --- a/modules/local/diffbind_run.nf +++ b/modules/local/diffbind_run.nf @@ -61,4 +61,43 @@ process DIFFBIND_RUN { R: \$(R --version | head -n 1 | sed 's/.* //') END_VERSIONS """ + + stub: + """ + mkdir -p plots + cat <<-EOF > diffbind.samplesheet.csv +SampleID,Factor,Condition,Replicate,bamReads,Peaks,PeakCaller,Tissue +stub,${group},stub,1,stub.bam,stub.peak,${caller},CUTRUN +EOF + + cat <<-EOF > diffbind.results.tsv +chr\tstart\tend\tlog2FC\tpval\tFDR +chr1\t1\t2\t0.0\t1.0\t1.0 +EOF + + printf "chr1\t1\t2\n" > diffbind.significant.bed + printf "chr1\t1\t2\n" > diffbind.significant_up.bed + printf "chr1\t1\t2\n" > diffbind.significant_down.bed + + cat <<-EOF > diffbind.summary.tsv +caller\tgroup\ttreated\tcontrol\tn_tested\tn_fdr_pass\tn_up\tn_down\tstatus\treason +${caller}\t${group}\tstub\tstub\t1\t0\t0\t0\tRUN\tok +EOF + + cat <<-EOF > diffbind.normalization_factors.tsv +sample_id\tsize_factor +stub\t1.0 +EOF + + echo "stub" > diffbind.dba.rds + echo "stub" > plots/PCA.pdf + echo "stub" > plots/correlation_heatmap.pdf + echo "stub" > plots/MA.pdf + echo "stub" > plots/volcano.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: stub + END_VERSIONS + """ } diff --git a/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..6d54a533 --- /dev/null +++ b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,5 @@ +sample_id group condition caller peaks_path peaks_format replicate +sample1 h3k27me3 Treatment seacr NA bed 1 +sample2 h3k27me3 Control seacr NA bed 1 +sample1 h3k27me3 Treatment span_default NA bed 1 +sample2 h3k27me3 Control span_default NA bed 1 diff --git a/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..b7cdac03 --- /dev/null +++ b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +sample1 h3k27me3 Treatment 1 NA NA NA NA NA NA +sample2 h3k27me3 Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_design/insufficient_reps.peaks.tsv b/tests/assets/differential_design/insufficient_reps.peaks.tsv new file mode 100644 index 00000000..0a627b39 --- /dev/null +++ b/tests/assets/differential_design/insufficient_reps.peaks.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate caller peaks_path peaks_format +g1_t1 group_low Treatment 1 seacr /path/peaks1.bed bed +g1_c1 group_low Control 1 seacr /path/peaks2.bed bed diff --git a/tests/assets/differential_design/insufficient_reps.samples.tsv b/tests/assets/differential_design/insufficient_reps.samples.tsv new file mode 100644 index 00000000..c1726e33 --- /dev/null +++ b/tests/assets/differential_design/insufficient_reps.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +g1_t1 group_low Treatment 1 NA NA NA NA NA NA +g1_c1 group_low Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_design/missing_condition.peaks.tsv b/tests/assets/differential_design/missing_condition.peaks.tsv new file mode 100644 index 00000000..aed825a0 --- /dev/null +++ b/tests/assets/differential_design/missing_condition.peaks.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate caller peaks_path peaks_format +g1_t1 group_missing Treatment 1 seacr /path/peaks1.bed bed +g2_t1 group_ok Treatment 1 seacr /path/peaks2.bed bed +g2_c1 group_ok Control 1 seacr /path/peaks3.bed bed diff --git a/tests/assets/differential_design/missing_condition.samples.tsv b/tests/assets/differential_design/missing_condition.samples.tsv new file mode 100644 index 00000000..b4a6de66 --- /dev/null +++ b/tests/assets/differential_design/missing_condition.samples.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +g1_t1 group_missing Treatment 1 NA NA NA NA NA NA +g2_t1 group_ok Treatment 1 NA NA NA NA NA NA +g2_c1 group_ok Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_manifest_input/peaks.tsv b/tests/assets/differential_manifest_input/peaks.tsv new file mode 100644 index 00000000..671e2f5f --- /dev/null +++ b/tests/assets/differential_manifest_input/peaks.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate caller peaks_path peaks_format +s3 groupA Control 1 seacr peaks3.bed bed +s1 groupA Treatment 1 macs2 peaks1.bed bed +s2 groupB Control 2 seacr peaks2.bed bed diff --git a/tests/assets/differential_manifest_input/samples.tsv b/tests/assets/differential_manifest_input/samples.tsv new file mode 100644 index 00000000..dce28fcf --- /dev/null +++ b/tests/assets/differential_manifest_input/samples.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate final_bam final_bai spikein_scale_factor bigwig_path +s2 groupB Control 2 bam2.bam bam2.bai NA bw2.bw +s1 groupA Treatment 1 bam1.bam bam1.bai NA bw1.bw +s3 groupA Control 1 bam3.bam bam3.bai NA bw3.bw diff --git a/tests/assets/minimal.bed b/tests/assets/minimal.bed new file mode 100644 index 00000000..924ff215 --- /dev/null +++ b/tests/assets/minimal.bed @@ -0,0 +1 @@ +chr1 1 100 Gene1 0 + diff --git a/tests/assets/minimal.fa b/tests/assets/minimal.fa new file mode 100644 index 00000000..9fc8d644 --- /dev/null +++ b/tests/assets/minimal.fa @@ -0,0 +1,2 @@ +>chr1 +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT diff --git a/tests/assets/minimal.gtf b/tests/assets/minimal.gtf new file mode 100644 index 00000000..e078b66c --- /dev/null +++ b/tests/assets/minimal.gtf @@ -0,0 +1 @@ +chr1 source gene 1 100 . + . gene_id "gene1"; gene_name "Gene1"; diff --git a/tests/assets/span_mode/peaks.tsv b/tests/assets/span_mode/peaks.tsv new file mode 100644 index 00000000..47b785f4 --- /dev/null +++ b/tests/assets/span_mode/peaks.tsv @@ -0,0 +1 @@ +sample_id group condition caller peaks_path peaks_format replicate diff --git a/tests/assets/span_mode/samples.tsv b/tests/assets/span_mode/samples.tsv new file mode 100644 index 00000000..295844d6 --- /dev/null +++ b/tests/assets/span_mode/samples.tsv @@ -0,0 +1 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml index 5a149706..77d54737 100644 --- a/tests/test_07_differential.yml +++ b/tests/test_07_differential.yml @@ -15,4 +15,99 @@ - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.design.tsv - path: results/03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv + contains: + - "manifest_only" + - "SKIPPED" + must_not_contain: + - "RUN\tok" - path: results/03_peak_calling/08_differential/multiqc/differential_design_mqc.tsv + +- name: test_differential_design_missing_condition + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/missing_condition.samples.tsv --peaks tests/assets/differential_design/missing_condition.peaks.tsv --contrast Treatment,Control --min-replicates 1 --run-diffbind --run-chipbinner --run-span --out tests/output/design_missing_condition.tsv && python3 bin/differential_summary.py --design tests/output/design_missing_condition.tsv --out tests/output/summary_missing_condition.tsv" + tags: + - test_differential + - test_differential_design + files: + - path: tests/output/design_missing_condition.tsv + contains: + - "missing_condition" + - "SKIP" + - path: tests/output/summary_missing_condition.tsv + contains: + - "missing_condition" + - "SKIP" + +- name: test_differential_design_insufficient_reps_fail + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/insufficient_reps.samples.tsv --peaks tests/assets/differential_design/insufficient_reps.peaks.tsv --contrast Treatment,Control --min-replicates 2 --run-diffbind --out tests/output/design_insufficient_fail.tsv" + tags: + - test_differential + - test_differential_design + exit_code: 1 + stderr: + contains: + - "FAIL entries" + +- name: test_differential_design_insufficient_reps_allow_partial + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/insufficient_reps.samples.tsv --peaks tests/assets/differential_design/insufficient_reps.peaks.tsv --contrast Treatment,Control --min-replicates 2 --allow-partial --run-diffbind --out tests/output/design_insufficient_skip.tsv" + tags: + - test_differential + - test_differential_design + files: + - path: tests/output/design_insufficient_skip.tsv + contains: + - "insufficient_replicates" + - "SKIP" + +- name: test_differential_manifests_deterministic + command: bash -c "rm -rf tests/output/manifest_run1 tests/output/manifest_run2 && mkdir -p tests/output/manifest_run1 tests/output/manifest_run2 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run1 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run2 && diff -q tests/output/manifest_run1/differential_manifest.samples.tsv tests/output/manifest_run2/differential_manifest.samples.tsv && diff -q tests/output/manifest_run1/differential_manifest.peaks.tsv tests/output/manifest_run2/differential_manifest.peaks.tsv" + tags: + - test_differential + - test_differential_manifest + files: + - path: tests/output/manifest_run1/differential_manifest.samples.tsv + - path: tests/output/manifest_run1/differential_manifest.peaks.tsv + +- name: test_diffbind_samplesheet_publish_disabled + command: nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --gene_bed tests/assets/minimal.bed --export_diffbind_sheets false --outdir results_diffbind_off -c tests/config/nextflow.config + tags: + - test_differential + - test_diffbind_samplesheet + files: + - path: results_diffbind_off/03_peak_calling/08_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.csv + should_exist: false + +- name: test_diffbind_samplesheet_publish_enabled + command: nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --gene_bed tests/assets/minimal.bed --export_diffbind_sheets true --outdir results_diffbind_on -c tests/config/nextflow.config + tags: + - test_differential + - test_diffbind_samplesheet + files: + - path: results_diffbind_on/03_peak_calling/08_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.csv + +- name: test_span_mode_native_requires_compare + command: bash -c "mkdir -p tests/output/span_native && cd tests/output/span_native && python3 ../../bin/span_diff.py --jar ../../tests/assets/omnipeaks_stub.jar --mode native --contrast Treatment,Control --group h3k27me3 --samples ../../tests/assets/span_mode/samples.tsv --peaks ../../tests/assets/span_mode/peaks.tsv --caller-priority span_default --outdir ." + tags: + - test_differential + - test_span_mode + exit_code: 1 + +- name: test_span_mode_auto_fallback + command: bash -c "mkdir -p tests/output/span_auto && cd tests/output/span_auto && python3 ../../bin/span_diff.py --jar ../../tests/assets/omnipeaks_stub.jar --mode auto --contrast Treatment,Control --group h3k27me3 --samples ../../tests/assets/span_mode/samples.tsv --peaks ../../tests/assets/span_mode/peaks.tsv --caller-priority span_default --allow-partial true --outdir ." + tags: + - test_differential + - test_span_mode + files: + - path: tests/output/span_auto/span.mode.txt + contains: + - "fallback" + +- name: test_annotation_fallback_gtf_only + command: nextflow run main.nf -profile docker,test -entry DIFFERENTIAL_ONLY --run_span_diff --span_diff_mode native --differential_allow_partial --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --omnipeaks_jar tests/assets/omnipeaks_stub.jar --gtf tests/assets/minimal.gtf --fasta tests/assets/minimal.fa --outdir results_annotation_gtf -c tests/config/nextflow.config + tags: + - test_differential + - test_annotation_fallback + files: + - path: results_annotation_gtf/03_peak_calling/08_differential/03_span/h3k27me3/span.differential.annotated.tsv + contains: + - "nearest_feature_id" + - "nearest_gene_name" From 96e35169ac72eba93ee440ab8f5cb0e0b9f8f8a2 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:31:19 -0800 Subject: [PATCH 20/52] Tighten SPAN jar checks and native orientation --- bin/span_diff.py | 99 ++++++++++++++++++++++++++++++++-- lib/WorkflowCutandrun.groovy | 10 +++- modules/local/span_diff_run.nf | 1 + 3 files changed, 105 insertions(+), 5 deletions(-) diff --git a/bin/span_diff.py b/bin/span_diff.py index ad875b78..f916d650 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse import csv +import math import os import re import subprocess @@ -304,9 +305,12 @@ def write_summary(path: str, group: str, treated: str, control: str, n_tested: i write_tsv(path, header, rows) -def write_mode(path: str, mode: str, signature: str) -> None: +def write_mode(path: str, mode: str, signature: str, notes: List[str] = None) -> None: with open(path, "w") as handle: handle.write(f"{mode}:{signature}\n") + if notes: + for note in notes: + handle.write(f"{note}\n") def filter_bed_by_stats(rows: List[Dict[str, str]], fdr_cutoff: float, direction: str) -> List[Tuple[str, int, int]]: @@ -345,6 +349,38 @@ def write_failure_outputs(outdir: str, group: str, treated: str, control: str, m write_mode(mode_path, mode, signature) +def compute_orientation_log2fc( + bed_path: str, + treated_bams: List[str], + control_bams: List[str], + pseudocount: float, +) -> Dict[Tuple[str, str, str], List[float]]: + if not treated_bams or not control_bams: + raise RuntimeError("orientation_missing_bams") + cmd = ["bedtools", "multicov", "-bed", bed_path, "-bams"] + treated_bams + control_bams + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "bedtools_multicov_failed") + orientation = {} + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split("\t") + if len(parts) < 3: + continue + key = (parts[0], parts[1], parts[2]) + counts = [safe_float(val) or 0.0 for val in parts[3:]] + t_counts = counts[: len(treated_bams)] + c_counts = counts[len(treated_bams) :] + if not t_counts or not c_counts: + continue + t_mean = sum(t_counts) / len(t_counts) + c_mean = sum(c_counts) / len(c_counts) + log2fc = math.log2((t_mean + pseudocount) / (c_mean + pseudocount)) + orientation.setdefault(key, []).append(log2fc) + return orientation + + def main(): parser = argparse.ArgumentParser(description="SPAN/OmniPeak differential runner") parser.add_argument("--jar", required=True) @@ -541,9 +577,13 @@ def pool_bams(condition: str, bams: List[str]) -> str: treated_arg = "" control_arg = "" + compare_treated_bams = [] + compare_control_bams = [] if use_lists: treated_arg = ",".join(treated_bams) control_arg = ",".join(control_bams) + compare_treated_bams = treated_bams + compare_control_bams = control_bams else: if len(treated_bams) > 1: treated_arg = pool_bams(treated, treated_bams) @@ -553,6 +593,8 @@ def pool_bams(condition: str, bams: List[str]) -> str: control_arg = pool_bams(control, control_bams) else: control_arg = control_bams[0] + compare_treated_bams = [treated_arg] + compare_control_bams = [control_arg] output_target = "span.native" output_path = output_target @@ -604,11 +646,26 @@ def pool_bams(condition: str, bams: List[str]) -> str: if not rows: raise RuntimeError("native_output_empty") - write_tsv(diff_path, ["chr", "start", "end", "log2FC", "pval", "FDR"], rows) bed_rows = [(r["chr"], int(r["start"]), int(r["end"])) for r in rows if is_int(r["start"]) and is_int(r["end"])] bed_rows.sort(key=lambda x: (x[0], x[1], x[2])) write_bed(bed_path, bed_rows) + pseudocount = 0.5 + orientation_map = compute_orientation_log2fc( + bed_path, + compare_treated_bams, + compare_control_bams, + pseudocount, + ) + for row in rows: + key = (row.get("chr"), row.get("start"), row.get("end")) + values = orientation_map.get(key, []) + log2fc_user = values.pop(0) if values else None + row["log2FC_span"] = row.get("log2FC", "NA") + row["log2FC"] = f"{log2fc_user:.6f}" if log2fc_user is not None else "NA" + + write_tsv(diff_path, ["chr", "start", "end", "log2FC", "log2FC_span", "pval", "FDR"], rows) + up_rows = filter_bed_by_stats(rows, args.fdr, "up") down_rows = filter_bed_by_stats(rows, args.fdr, "down") up_rows.sort(key=lambda x: (x[0], x[1], x[2])) @@ -623,7 +680,12 @@ def pool_bams(condition: str, bams: List[str]) -> str: pooled_rows, ) - write_mode(mode_path, "native", signature) + write_mode( + mode_path, + "native", + signature, + notes=[f"orientation=treated/control", f"orientation_pseudocount={pseudocount}"], + ) finalize_success(rows, "native") return @@ -665,6 +727,37 @@ def pool_bams(condition: str, bams: List[str]) -> str: f"{row.get('sample_id','')}\t{row.get('condition','')}\t{row.get('spikein_scale_factor','NA')}\n" ) + normalization_path = os.path.join(args.outdir, "span.normalization_factors.tsv") + spikein_values = [] + for row in samples_sorted: + spikein_values.append(row.get("spikein_scale_factor", "NA")) + + spikein_numeric = [] + for val in spikein_values: + try: + spikein_numeric.append(float(val)) + except Exception: + spikein_numeric.append(None) + + use_spikein_factors = use_spikein and any(val is not None for val in spikein_numeric) + with open(normalization_path, "w") as handle: + handle.write("sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tused_by_backend\n") + for idx, row in enumerate(samples_sorted): + scale_val = spikein_values[idx] + size_factor = "NA" + used_by_backend = args.fallback_backend + if use_spikein_factors: + used_by_backend = "spikein" + numeric = spikein_numeric[idx] + if numeric is None or numeric == 0: + size_factor = "1" + else: + size_factor = f"{1.0 / numeric:.6f}" + handle.write( + f"{row.get('sample_id','')}\t{row.get('group','')}\t{row.get('condition','')}\t{scale_val}\t{size_factor}\t{used_by_backend}\n" + ) + print(f"INFO: wrote normalization factors to {normalization_path}", file=sys.stderr) + r_script = os.path.join(os.path.dirname(__file__), "span_fallback_de.R") cmd = [ "Rscript", diff --git a/lib/WorkflowCutandrun.groovy b/lib/WorkflowCutandrun.groovy index d434aab1..2c1b814a 100755 --- a/lib/WorkflowCutandrun.groovy +++ b/lib/WorkflowCutandrun.groovy @@ -114,8 +114,14 @@ class WorkflowCutandrun { } } - if (params.run_span_diff && !params.omnipeaks_jar) { - Nextflow.error "SPAN differential requested but --omnipeaks_jar was not provided." + if (params.run_span_diff) { + if (!params.omnipeaks_jar) { + Nextflow.error "SPAN differential requested but --omnipeaks_jar was not provided." + } + def jar_path = new File(params.omnipeaks_jar.toString()) + if (!jar_path.exists()) { + Nextflow.error "SPAN differential requested but --omnipeaks_jar path does not exist: ${params.omnipeaks_jar}" + } } if (params.differential_use_spikein && !['auto','true','false'].contains(params.differential_use_spikein.toString())) { diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf index 68baa4b0..cd24a47c 100644 --- a/modules/local/span_diff_run.nf +++ b/modules/local/span_diff_run.nf @@ -26,6 +26,7 @@ process SPAN_DIFF_RUN { tuple val(group), path("span.summary.tsv"), emit: summary tuple val(group), path("span.mode.txt"), emit: mode_out tuple val(group), path("span_diff_target_pooling.tsv", optional: true), emit: pooling + tuple val(group), path("span.normalization_factors.tsv", optional: true), emit: norm_factors path "*.pooled.bam", optional: true, emit: pooled_bam path "*.pooled.bam.bai", optional: true, emit: pooled_bai path "versions.yml", emit: versions From 8e019bc8f9eca21a649e319870fd27f39171018e Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:42:41 -0800 Subject: [PATCH 21/52] Add chipbinner input manifest support --- bin/chipbinner_run.py | 54 ++++++-- bin/differential_manifests.py | 6 + conf/modules.config | 4 +- .../SPEC_DIFFERENTIAL_PEAK_CALLING.md | 27 ++-- docs/output.md | 3 +- .../local/differential_peak_calling.nf | 119 ++++++++++++++++-- 6 files changed, 177 insertions(+), 36 deletions(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index c583123d..17f82a6e 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -138,6 +138,7 @@ def write_samplesheet(samples, out_path): "spikein_scale_factor", "ms_coeff", "input_bam", + "input_bai", ] with open(out_path, "w", newline="") as handle: writer = csv.DictWriter(handle, fieldnames=header) @@ -154,6 +155,7 @@ def write_samplesheet(samples, out_path): "spikein_scale_factor": row.get("spikein_scale_factor", ""), "ms_coeff": row.get("ms_coeff", ""), "input_bam": row.get("input_bam", ""), + "input_bai": row.get("input_bai", ""), }) def compute_counts(windows_path, samples, out_path, use_input=False): @@ -177,11 +179,13 @@ def compute_counts(windows_path, samples, out_path, use_input=False): counts["bin_id"] = counts.apply(lambda r: f"{r.chrom}:{int(r.start)}-{int(r.end)}", axis=1) if use_input: - input_bams = sorted({row.get("input_bam") for row in samples if row.get("input_bam") not in (None, "", "NA")}) - if not input_bams: - raise RuntimeError("chipbinner_use_input requested but no input_bam values were provided") - input_counts = None - for bam in input_bams: + input_counts_map = {} + for row in samples: + bam = row.get("input_bam") + if not bam or bam in ("NA", "None", "none", ""): + raise RuntimeError("chipbinner_use_input requested but no input_bam values were provided") + if bam in input_counts_map: + continue if not os.path.exists(bam): raise RuntimeError(f"Input BAM not found: {bam}") with tempfile.NamedTemporaryFile(delete=False) as tmp_handle: @@ -191,12 +195,11 @@ def compute_counts(windows_path, samples, out_path, use_input=False): run_cmd(cmd, stdout=handle) tmp_df = pd.read_csv(tmp_path, sep="\t", header=None, names=["chrom", "start", "end", "count"]) os.unlink(tmp_path) - if input_counts is None: - input_counts = tmp_df["count"].astype(float).values - else: - input_counts += tmp_df["count"].astype(float).values - input_counts = input_counts / len(input_bams) - counts[sample_ids] = counts[sample_ids].astype(float).sub(input_counts, axis=0).clip(lower=0) + input_counts_map[bam] = tmp_df["count"].astype(float).values + + for sample_id, row in zip(sample_ids, samples): + bam = row.get("input_bam") + counts[sample_id] = counts[sample_id].astype(float).sub(input_counts_map[bam], axis=0).clip(lower=0) return counts, sample_ids @@ -517,7 +520,7 @@ def write_summary(path, group, treated, control, n_bins, n_fdr, n_up, n_down, n_ ]) -def write_empty_outputs(outdir, group, sample_ids, treated, control, reason): +def write_empty_outputs(outdir, group, sample_ids, treated, control, reason, status="FAIL"): windows_path = os.path.join(outdir, "chipbinner.windows.bed") counts_path = os.path.join(outdir, "chipbinner.bin_counts.tsv") norm_path = os.path.join(outdir, "chipbinner.normalized_matrix.tsv") @@ -540,7 +543,7 @@ def write_empty_outputs(outdir, group, sample_ids, treated, control, reason): handle.write("chrom\tstart\tend\tcluster\n") with open(diff_path, "w") as handle: handle.write("chrom\tstart\tend\tbin_id\tlog2FC\tpval\tFDR\tcluster\tdirection\n") - write_summary(summary_path, group, treated, control, 0, 0, 0, 0, 0, "NA", "NA", "FAIL", reason, "NOT_RUN") + write_summary(summary_path, group, treated, control, 0, 0, 0, 0, 0, "NA", "NA", status, reason, "NOT_RUN") def main(): @@ -569,6 +572,31 @@ def main(): samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") write_samplesheet(samples, samplesheet_path) + if args.use_input: + missing_samples = [] + missing_paths = [] + for row in samples: + bam = row.get("input_bam") + if not bam or bam in ("NA", "None", "none", ""): + missing_samples.append(row.get("sample_id", "")) + elif not os.path.exists(bam): + missing_paths.append(bam) + if missing_samples or missing_paths: + details = [] + if missing_samples: + details.append(f"samples={','.join(sorted(set(missing_samples)))}") + if missing_paths: + details.append(f"paths={','.join(sorted(set(missing_paths)))}") + reason = "MISSING_INPUT_BAM" + if details: + reason = f"{reason}:" + ";".join(details) + if args.allow_partial: + write_empty_outputs(args.outdir, args.group, sample_ids, treated_label, control_label, reason, status="SKIP") + return + raise RuntimeError( + f"chipbinner_use_input requested but input BAMs are missing ({'; '.join(details) or 'no input_bam values'})." + ) + windows_source = choose_windows_file(args.windows_dir, args.bin_size) windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") try: diff --git a/bin/differential_manifests.py b/bin/differential_manifests.py index 51632186..a7b3227e 100755 --- a/bin/differential_manifests.py +++ b/bin/differential_manifests.py @@ -57,6 +57,8 @@ def write_samples(samples_rows, ms_coeffs, normalisation_mode, out_path): "spikein_scale_factor", "ms_coeff", "bigwig_path", + "input_bam", + "input_bai", ] rows = [] for row in samples_rows: @@ -68,6 +70,8 @@ def write_samples(samples_rows, ms_coeffs, normalisation_mode, out_path): final_bai = row.get("final_bai") or "NA" spikein = row.get("spikein_scale_factor") or "NA" bigwig = row.get("bigwig_path") or "NA" + input_bam = row.get("input_bam") or "NA" + input_bai = row.get("input_bai") or "NA" ms_coeff = ms_coeffs.get(sample_id, "NA") rows.append({ "sample_id": sample_id, @@ -80,6 +84,8 @@ def write_samples(samples_rows, ms_coeffs, normalisation_mode, out_path): "spikein_scale_factor": spikein, "ms_coeff": ms_coeff, "bigwig_path": bigwig, + "input_bam": input_bam, + "input_bai": input_bai, }) rows.sort(key=lambda r: (r["group"], "NA", r["condition"], sort_key(r["replicate"]), r["sample_id"])) diff --git a/conf/modules.config b/conf/modules.config index eee7056a..ad02ba43 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -766,12 +766,12 @@ if(params.run_peak_calling && params.callers.any { it.startsWith('span') }) { } } -if(params.run_peak_calling && params.callers.any { it.startsWith('epic2') || it.startsWith('span') }) { +if((params.run_peak_calling && params.callers.any { it.startsWith('epic2') || it.startsWith('span') }) || (params.run_chipbinner && params.chipbinner_use_input)) { process { withName: '.*:CUTANDRUN:.*POOL_IGG_CONTROLS' { ext.prefix = { "${meta.id}" } publishDir = [ - path: { "${params.outdir}/03_peak_calling/01_pooled_controls" }, + path: { task.ext.publish_dir ?: "${params.outdir}/03_peak_calling/01_pooled_controls" }, mode: "${params.publish_dir_mode}", pattern: "*.{bam,bam.bai}", enabled: true diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md index 8ddce6eb..ed29323c 100644 --- a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -246,17 +246,19 @@ and are consumed in posthoc mode from the prior run’s outdir. differential_manifest.samples.tsv One row per target sample. -Columns: - • sample_id - • group - • condition - • replicate - • final_bam - • final_bai - • normalisation_mode - • spikein_scale_factor (numeric or NA) - • ms_coeff (numeric or NA) - • bigwig_path (path or NA) + Columns: + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (numeric or NA) + • ms_coeff (numeric or NA) + • bigwig_path (path or NA) + • input_bam (path or NA) + • input_bai (path or NA) differential_manifest.peaks.tsv One row per (sample_id × caller) peak file. @@ -622,6 +624,8 @@ All differential outputs live under: differential_manifest.samples.tsv differential_manifest.peaks.tsv differential_manifest.design.tsv + chipbinner_input__.bam (optional) + chipbinner_input__.bam.bai (optional) span_diff_target_pooling.tsv (optional) 01_diffbind/ 00_samplesheets//.csv @@ -709,4 +713,3 @@ Use --differential_publish_manifest_only and/or a stub profile to avoid heavy co • Batch covariates in statistical models. • Deep cross-method concordance (beyond lightweight summary tables). • Automatic peak caller benchmarking. - diff --git a/docs/output.md b/docs/output.md index e6554d24..0be4180b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -445,9 +445,10 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t Output files - `03_peak_calling/08_differential/00_manifests/` - - `differential_manifest.samples.tsv`: per-sample manifest with BAM/BAI, scale factors, and bigWig paths. + - `differential_manifest.samples.tsv`: per-sample manifest with BAM/BAI, optional input BAMs, scale factors, and bigWig paths. - `differential_manifest.peaks.tsv`: per-sample per-caller peak paths and formats. - `differential_manifest.design.tsv`: eligibility and status table for each comparison. + - `chipbinner_input__.bam(.bai)`: pooled input/IgG BAMs when `--chipbinner_use_input` is enabled. - `03_peak_calling/08_differential/01_diffbind///` - `diffbind.results.tsv`: full DiffBind results. - `diffbind.results.annotated.tsv`: DiffBind results annotated with nearest gene. diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 6c089a9e..e2374b00 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -11,6 +11,7 @@ include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/ include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" +include { POOL_IGG_CONTROLS } from "../../modules/local/pool_igg_controls" include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" include { SPAN_POOLING_MANIFEST } from "../../modules/local/span_pooling_manifest" include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" @@ -73,6 +74,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_bai_target = ch_bai.filter { it[0].is_control == false } ch_bigwig_target = ch_bigwig.filter { it[0].is_control == false } ch_scale_target = ch_scale_factors.filter { it[0].is_control == false } + ch_bam_control = ch_bam.filter { it[0].is_control == true } + ch_bai_control = ch_bai.filter { it[0].is_control == true } ch_scale_map = ch_scale_target .map { meta, scale -> [meta.id, scale] } @@ -100,18 +103,116 @@ workflow DIFFERENTIAL_PEAK_CALLING { out } - ch_samples_raw = ch_bam_target + def build_control_condition_map = { ch_control -> + ch_control + .map { meta, file -> [meta.group, meta.condition] } + .toList() + .map { list -> + def control_map = [:].withDefault { [] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + control_map[group] = (control_map[group] ?: []) + [condition] + } + control_map + } + } + + def add_control_condition = { ch_target, ch_control -> + def control_map_ch = build_control_condition_map(ch_control) + ch_target + .combine(control_map_ch) + .map { meta, file, control_map -> + def control_group = meta.control_group ?: meta.group + def conditions = control_map.get(control_group, []) + def control_condition = meta.condition + if (conditions) { + if (conditions.contains(meta.condition)) { + control_condition = meta.condition + } else if (conditions.contains('NA')) { + control_condition = 'NA' + } else { + control_condition = conditions.sort()[0] + } + } + def meta_out = meta + [control_condition: control_condition] + [meta_out, file] + } + } + + ch_bam_target_cc = add_control_condition(ch_bam_target, ch_bam_control) + + ch_input_map = Channel.value([:]) + if (params.chipbinner_use_input) { + ch_pooled_inputs = ch_bam_control + .map { meta, bam -> ["${meta.group}__${meta.condition}", [meta, bam]] } + .groupTuple(by: [0]) + .map { key, entries -> + def meta0 = entries[0][0] + def pooled_meta = meta0 + [ + id: "chipbinner_input_${meta0.group}_${meta0.condition}", + control_group: meta0.group, + is_control: true + ] + def bams = entries.collect { it[1] }.sort { it.getName() } + [pooled_meta, bams] + } + + POOL_IGG_CONTROLS ( ch_pooled_inputs ) { + ext.publish_dir = "${diff_outdir}/00_manifests" + } + ch_versions = ch_versions.mix(POOL_IGG_CONTROLS.out.versions) + + ch_pooled_controls = POOL_IGG_CONTROLS.out.bam + .map { meta, bam -> [meta.id, meta, bam] } + .join(POOL_IGG_CONTROLS.out.bai.map { meta, bai -> [meta.id, bai] }) + .map { id, meta, bam, bai -> [meta, bam, bai] } + + ch_input_map = ch_pooled_controls + .map { meta, bam, bai -> [meta.group, meta.condition, bam, bai] } + .toList() + .ifEmpty([]) + .map { list -> + def map = [:].withDefault { [:] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + def group_map = map[group] ?: [:] + group_map[condition] = [entry[2], entry[3]] + map[group] = group_map + } + map + } + } + + ch_samples_raw = ch_bam_target_cc .map { meta, bam -> [meta.id, meta, bam] } .join(ch_bai_target.map { meta, bai -> [meta.id, bai] }) .map { id, meta, bam, bai -> [meta, bam, bai] } .combine(ch_bigwig_map) .combine(ch_scale_map) - .map { meta, bam, bai, bw_map, scale_map -> + .combine(ch_input_map) + .map { meta, bam, bai, bw_map, scale_map, input_map -> def bigwig = bw_map.get(meta.id) ?: 'NA' def scale = scale_map.get(meta.id) ?: 'NA' def bam_path = "${bam_dir}/${bam.getName()}" def bai_path = "${bam_dir}/${bai.getName()}" def bigwig_path = bigwig == 'NA' ? 'NA' : "${bigwig_dir}/${bigwig.getName()}" + def control_group = meta.control_group ?: meta.group + def control_condition = meta.control_condition ?: meta.condition + def input_entry = input_map.get(control_group, [:]).get(control_condition) + def input_bam_path = 'NA' + def input_bai_path = 'NA' + if (input_entry) { + def input_bam = input_entry[0] + def input_bai = input_entry[1] + if (input_bam) { + input_bam_path = "${diff_outdir}/00_manifests/${input_bam.getName()}" + } + if (input_bai) { + input_bai_path = "${diff_outdir}/00_manifests/${input_bai.getName()}" + } + } [ sample_id: meta.id, group: meta.group, @@ -120,14 +221,16 @@ workflow DIFFERENTIAL_PEAK_CALLING { final_bam: bam_path, final_bai: bai_path, spikein_scale_factor: scale, - bigwig_path: bigwig_path + bigwig_path: bigwig_path, + input_bam: input_bam_path, + input_bai: input_bai_path ] } .map { row -> - "${row.sample_id}\t${row.group}\t${row.condition}\t${row.replicate}\t${row.final_bam}\t${row.final_bai}\t${row.spikein_scale_factor}\t${row.bigwig_path}" + "${row.sample_id}\t${row.group}\t${row.condition}\t${row.replicate}\t${row.final_bam}\t${row.final_bai}\t${row.spikein_scale_factor}\t${row.bigwig_path}\t${row.input_bam}\t${row.input_bai}" } - def samples_header = "sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tspikein_scale_factor\tbigwig_path" + def samples_header = "sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tspikein_scale_factor\tbigwig_path\tinput_bam\tinput_bai" ch_samples_raw = ch_samples_raw .collect() .map { rows -> ([samples_header] + rows).join('\n') + '\n' } @@ -313,7 +416,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_chip_records_json.map { it[0] }, ch_chip_records_json.map { 'NA' }, ch_chip_records_json.map { it[1] }, - 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff', + 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai', ch_chip_records_json.map { group, json -> "chipbinner_${group}.tsv" } ) ch_versions = ch_versions.mix(RECORDS_TO_TSV_CHIPBINNER.out.versions) @@ -377,12 +480,12 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (params.span_diff_mode == 'fallback') { span_chrom_sizes = ch_chrom_sizes.ifEmpty(file("$projectDir/assets/chrom_sizes_stub.sizes")) } - ch_chrom_sizes_single = span_chrom_sizes.collect().map { it[0] } + ch_span_chrom_sizes_single = span_chrom_sizes.collect().map { it[0] } ch_span_inputs = ch_span_design .combine(ch_samples_manifest_single) .combine(ch_peaks_manifest_single) - .combine(ch_chrom_sizes_single) + .combine(ch_span_chrom_sizes_single) .map { group, samples_manifest, peaks_manifest, chrom_sizes -> [ group, samples_manifest, peaks_manifest, chrom_sizes ] } SPAN_DIFF_RUN ( From b55661ea31e17d78f8e5dffdefc2b2c1e5010584 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 01:01:29 -0800 Subject: [PATCH 22/52] Improve ChIPBinner normalization and caching --- bin/chipbinner_run.py | 204 +++++++++++++----- bin/chipbinner_windows_cache.py | 103 +++++++++ conf/modules.config | 8 + .../SPEC_DIFFERENTIAL_PEAK_CALLING.md | 6 + docs/output.md | 4 + modules/local/chipbinner_run.nf | 33 +-- modules/local/chipbinner_windows_cache.nf | 38 ++++ .../local/differential_peak_calling.nf | 24 ++- 8 files changed, 353 insertions(+), 67 deletions(-) create mode 100755 bin/chipbinner_windows_cache.py create mode 100644 modules/local/chipbinner_windows_cache.nf diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 17f82a6e..d379f8ae 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -2,6 +2,7 @@ import argparse import csv import glob +import hashlib import math import os import subprocess @@ -36,6 +37,7 @@ def parse_args(): parser.add_argument("--contrast", required=True) parser.add_argument("--chrom-sizes", required=True) parser.add_argument("--bin-size", type=int, required=True) + parser.add_argument("--windows") parser.add_argument("--windows-dir") parser.add_argument("--blacklist") parser.add_argument("--use-input", action="store_true") @@ -71,6 +73,25 @@ def parse_float(value, default=None): return default +def genome_id_from_path(path): + base = os.path.basename(path) + if base.endswith(".sizes"): + base = base[: -len(".sizes")] + else: + base = os.path.splitext(base)[0] + return base or "genome" + + +def hash_file(path): + if not path or not os.path.exists(path): + return None + digest = hashlib.sha256() + with open(path, "rb") as handle: + for chunk in iter(lambda: handle.read(8192), b""): + digest.update(chunk) + return digest.hexdigest()[:12] + + def ensure_dir(path): if path: os.makedirs(path, exist_ok=True) @@ -82,17 +103,30 @@ def run_cmd(cmd, stdout=None): raise RuntimeError(f"Command failed ({' '.join(cmd)}): {result.stderr.strip()}") -def choose_windows_file(windows_dir, bin_size): +def choose_windows_file(windows_path, windows_dir, bin_size, chrom_sizes=None, blacklist=None): + if windows_path: + return windows_path if not windows_dir: return None if os.path.isfile(windows_dir): return windows_dir if os.path.isdir(windows_dir): - candidates = sorted(glob.glob(os.path.join(windows_dir, f"*{bin_size}*.bed"))) - if not candidates: - candidates = sorted(glob.glob(os.path.join(windows_dir, "*.bed"))) - if candidates: - return candidates[0] + genome_id = genome_id_from_path(chrom_sizes) if chrom_sizes else None + blacklist_hash = hash_file(blacklist) if blacklist else None + preferred = [] + if genome_id: + if blacklist_hash: + preferred.append(f"windows.{genome_id}.{bin_size}.{blacklist_hash}.bed") + preferred.append(f"windows.{genome_id}.{bin_size}.bed") + else: + preferred.append(f"windows.{bin_size}.bed") + for name in preferred: + candidate = os.path.join(windows_dir, name) + if os.path.exists(candidate): + return candidate + raise RuntimeError( + f"No windows BED found in {windows_dir}. Expected one of: {', '.join(preferred)}" + ) return None @@ -210,21 +244,46 @@ def write_matrix(counts_df, sample_ids, out_path): def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): counts = counts_df[sample_ids].astype(float) - scale = np.ones(len(sample_ids), dtype=float) + factors = [] for idx, row in enumerate(samples): - if use_spikein: - spike = parse_float(row.get("spikein_scale_factor")) - if spike is not None: - scale[idx] *= spike - coeff = parse_float(row.get("ms_coeff")) - if coeff is not None: - scale[idx] *= coeff - counts = counts * scale + steps = [] + spike_raw = row.get("spikein_scale_factor") + spike = parse_float(spike_raw) if use_spikein else None + spike_size = None + if spike is not None: + if spike == 0: + raise RuntimeError("Spike-in scale factor cannot be zero") + spike_size = 1.0 / spike + counts.iloc[:, idx] = counts.iloc[:, idx] / spike_size + steps.append("spikein") + + ms_raw = row.get("ms_coeff") + ms_coeff = parse_float(ms_raw) + ms_size = None + if ms_coeff is not None: + if ms_coeff == 0: + raise RuntimeError("MS coefficient cannot be zero") + ms_size = 1.0 / ms_coeff + counts.iloc[:, idx] = counts.iloc[:, idx] / ms_size + steps.append("ms_coeff") + + steps.extend(["pseudocount", "library_size_cpm"]) + factors.append({ + "sample_id": row.get("sample_id", ""), + "group": row.get("group", ""), + "condition": row.get("condition", ""), + "spikein_scale_factor": spike if spike is not None else "NA", + "spikein_size_factor": spike_size if spike_size is not None else "NA", + "ms_coeff": ms_coeff if ms_coeff is not None else "NA", + "ms_size_factor": ms_size if ms_size is not None else "NA", + "applied_steps": ",".join(steps), + }) + counts = counts + pseudocount library_size = counts.sum(axis=0) library_size[library_size == 0] = 1.0 normalized = counts.divide(library_size, axis=1) * 1e6 - return normalized + return normalized, pd.DataFrame(factors) def plot_pca(norm_matrix, sample_ids, samples, out_path): @@ -520,30 +579,11 @@ def write_summary(path, group, treated, control, n_bins, n_fdr, n_up, n_down, n_ ]) -def write_empty_outputs(outdir, group, sample_ids, treated, control, reason, status="FAIL"): - windows_path = os.path.join(outdir, "chipbinner.windows.bed") - counts_path = os.path.join(outdir, "chipbinner.bin_counts.tsv") - norm_path = os.path.join(outdir, "chipbinner.normalized_matrix.tsv") - grid_path = os.path.join(outdir, "chipbinner.hdbscan_grid_summary.tsv") - clusters_path = os.path.join(outdir, "chipbinner.clusters.tsv") - diff_path = os.path.join(outdir, "chipbinner.differential.tsv") - summary_path = os.path.join(outdir, "chipbinner.summary.tsv") - ensure_dir(os.path.join(outdir, "plots")) - ensure_dir(os.path.join(outdir, "enrichment")) - - with open(windows_path, "w") as handle: - handle.write("") - with open(counts_path, "w") as handle: - handle.write("bin\t" + "\t".join(sample_ids) + "\n") - with open(norm_path, "w") as handle: - handle.write("bin\t" + "\t".join(sample_ids) + "\n") - with open(grid_path, "w") as handle: - handle.write("minPts\tminSamps\tn_clusters\tnoise_fraction\tsilhouette\tscore\tstatus\tselected\n") - with open(clusters_path, "w") as handle: - handle.write("chrom\tstart\tend\tcluster\n") - with open(diff_path, "w") as handle: - handle.write("chrom\tstart\tend\tbin_id\tlog2FC\tpval\tFDR\tcluster\tdirection\n") - write_summary(summary_path, group, treated, control, 0, 0, 0, 0, 0, "NA", "NA", status, reason, "NOT_RUN") +def write_error_artifact(outdir, message): + error_path = os.path.join(outdir, "chipbinner.error.txt") + with open(error_path, "w") as handle: + handle.write(f"{message}\n") + return error_path def main(): @@ -551,8 +591,6 @@ def main(): ensure_dir(args.outdir) plots_dir = os.path.join(args.outdir, "plots") enrichment_dir = os.path.join(args.outdir, "enrichment") - ensure_dir(plots_dir) - ensure_dir(enrichment_dir) contrast = [c.strip() for c in args.contrast.split(",") if c.strip()] if len(contrast) != 2: @@ -591,13 +629,31 @@ def main(): if details: reason = f"{reason}:" + ";".join(details) if args.allow_partial: - write_empty_outputs(args.outdir, args.group, sample_ids, treated_label, control_label, reason, status="SKIP") + error_path = write_error_artifact(args.outdir, reason) + write_summary( + os.path.join(args.outdir, "chipbinner.summary.tsv"), + args.group, + treated_label, + control_label, + 0, + 0, + 0, + 0, + 0, + "NA", + "NA", + "SKIP", + f"{reason};error={os.path.basename(error_path)}", + "NOT_RUN", + ) return raise RuntimeError( f"chipbinner_use_input requested but input BAMs are missing ({'; '.join(details) or 'no input_bam values'})." ) - windows_source = choose_windows_file(args.windows_dir, args.bin_size) + windows_source = choose_windows_file( + args.windows, args.windows_dir, args.bin_size, args.chrom_sizes, args.blacklist + ) windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") try: if windows_source: @@ -617,11 +673,15 @@ def main(): counts_df, sample_ids = compute_counts(windows_path, samples, counts_raw_path, use_input=args.use_input) write_matrix(counts_df, sample_ids, counts_raw_path) - norm_matrix = normalize_counts(counts_df, sample_ids, samples, args.use_spikein, args.pseudocount) + norm_matrix, factors_df = normalize_counts(counts_df, sample_ids, samples, args.use_spikein, args.pseudocount) norm_path = os.path.join(args.outdir, "chipbinner.normalized_matrix.tsv") norm_matrix.insert(0, "bin_id", counts_df["bin_id"]) norm_matrix.to_csv(norm_path, sep="\t", index=False) + factors_path = os.path.join(args.outdir, "chipbinner.normalization_factors.tsv") + factors_df.to_csv(factors_path, sep="\t", index=False) + + ensure_dir(plots_dir) plot_pca(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, samples, os.path.join(plots_dir, "chipbinner.pca.png")) plot_correlation(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, os.path.join(plots_dir, "chipbinner.correlation.png")) @@ -632,10 +692,11 @@ def main(): chosen, grid_summary_path = hdbscan_grid(norm_matrix.drop(columns=["bin_id"]), windows_df, minpts_list, minsamps_list, grid_dir, grid_summary_path) # chosen clusters - clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") chosen_df = windows_df.copy() chosen_df["cluster"] = chosen["labels"] + clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") chosen_df.to_csv(clusters_path, sep="\t", index=False) + chosen_df.to_csv(os.path.join(args.outdir, "chipbinner.clusters.best.tsv"), sep="\t", index=False) # Differential with ROTS rots_out = os.path.join(args.outdir, "chipbinner.rots.tsv") @@ -659,6 +720,38 @@ def main(): diff_df = diff_df.merge(rots_df, on="bin_id", how="left") diff_df["cluster"] = chosen_df["cluster"].values + # Standardized cluster views based on cluster mean log2FC (|mean| < 0.25 => stable) + STABLE_LFC_THRESHOLD = 0.25 + cluster_means = {} + for cluster_id in sorted(set(chosen_df["cluster"]) - {-1}): + cluster_means[cluster_id] = float(np.mean(log2fc[chosen_df["cluster"] == cluster_id])) + + def label_two(cluster_id): + if cluster_id == -1: + return "noise" + return "treated_high" if cluster_means.get(cluster_id, 0.0) >= 0 else "control_high" + + def label_three(cluster_id): + if cluster_id == -1: + return "noise" + mean_val = cluster_means.get(cluster_id, 0.0) + if mean_val >= STABLE_LFC_THRESHOLD: + return "treated_high" + if mean_val <= -STABLE_LFC_THRESHOLD: + return "control_high" + return "stable" + + cluster_mean_series = chosen_df["cluster"].map(cluster_means).fillna(np.nan) + clusters_two = chosen_df.copy() + clusters_two["cluster_label"] = clusters_two["cluster"].map(label_two) + clusters_two["cluster_mean_log2FC"] = cluster_mean_series + clusters_two.to_csv(os.path.join(args.outdir, "chipbinner.clusters.2cluster.tsv"), sep="\t", index=False) + + clusters_three = chosen_df.copy() + clusters_three["cluster_label"] = clusters_three["cluster"].map(label_three) + clusters_three["cluster_mean_log2FC"] = cluster_mean_series + clusters_three.to_csv(os.path.join(args.outdir, "chipbinner.clusters.3cluster.tsv"), sep="\t", index=False) + direction = [] for _, row in diff_df.iterrows(): if row.get("FDR") is not None and float(row["FDR"]) <= args.fdr: @@ -688,8 +781,23 @@ def main(): except Exception as exc: if args.allow_partial: - reason = str(exc) - write_empty_outputs(args.outdir, args.group, sample_ids, treated_label, control_label, reason) + error_path = write_error_artifact(args.outdir, exc) + write_summary( + os.path.join(args.outdir, "chipbinner.summary.tsv"), + args.group, + treated_label, + control_label, + 0, + 0, + 0, + 0, + 0, + "NA", + "NA", + "SKIP", + f"RUNTIME_ERROR:{os.path.basename(error_path)}", + "NOT_RUN", + ) else: raise diff --git a/bin/chipbinner_windows_cache.py b/bin/chipbinner_windows_cache.py new file mode 100755 index 00000000..7ad4146d --- /dev/null +++ b/bin/chipbinner_windows_cache.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import os +import subprocess +import tempfile + + +def run_cmd(cmd, stdout_path=None): + if stdout_path: + with open(stdout_path, "w") as handle: + result = subprocess.run(cmd, stdout=handle, stderr=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed ({' '.join(cmd)}): {result.stderr.strip()}") + + +def genome_id_from_path(path): + base = os.path.basename(path) + if base.endswith(".sizes"): + base = base[: -len(".sizes")] + else: + base = os.path.splitext(base)[0] + return base or "genome" + + +def hash_file(path): + if not path or not os.path.exists(path): + return "none" + digest = hashlib.sha256() + with open(path, "rb") as handle: + for chunk in iter(lambda: handle.read(8192), b""): + digest.update(chunk) + return digest.hexdigest()[:12] + + +def resolve_windows_source(windows_dir, genome_id, bin_size, blacklist_hash): + if not windows_dir: + return None + if os.path.isfile(windows_dir): + return windows_dir + if not os.path.isdir(windows_dir): + raise RuntimeError(f"--windows-dir must be a file or directory (got: {windows_dir})") + preferred = [ + f"windows.{genome_id}.{bin_size}.{blacklist_hash}.bed", + f"windows.{genome_id}.{bin_size}.bed", + ] + for name in preferred: + candidate = os.path.join(windows_dir, name) + if os.path.exists(candidate): + return candidate + raise RuntimeError( + "No matching windows BED found in " + f"{windows_dir}. Expected one of: {', '.join(preferred)}" + ) + + +def main(): + parser = argparse.ArgumentParser(description="Cache ChIPBinner windows per genome/bin/blacklist") + parser.add_argument("--chrom-sizes", required=True) + parser.add_argument("--bin-size", type=int, required=True) + parser.add_argument("--windows-dir") + parser.add_argument("--blacklist") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + genome_id = genome_id_from_path(args.chrom_sizes) + blacklist_hash = hash_file(args.blacklist) if args.blacklist else "none" + + windows_source = resolve_windows_source(args.windows_dir, genome_id, args.bin_size, blacklist_hash) + + out_name = f"windows.{genome_id}.{args.bin_size}.{blacklist_hash}.bed" + out_path = os.path.join(args.outdir, out_name) + + with tempfile.TemporaryDirectory() as tmpdir: + base_path = None + if windows_source: + base_path = windows_source + else: + base_path = os.path.join(tmpdir, "windows_raw.bed") + run_cmd(["bedtools", "makewindows", "-g", args.chrom_sizes, "-w", str(args.bin_size)], stdout_path=base_path) + + filtered_path = base_path + if args.blacklist and os.path.exists(args.blacklist): + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", base_path, "-b", args.blacklist], stdout_path=filtered_path) + + run_cmd(["sort", "-k1,1", "-k2,2n", filtered_path], stdout_path=out_path) + + meta_path = os.path.join(args.outdir, "chipbinner_windows_meta.tsv") + with open(meta_path, "w") as handle: + handle.write( + "genome_id\tbin_size\tblacklist_hash\twindows_source\twindows_path\n" + ) + handle.write( + f"{genome_id}\t{args.bin_size}\t{blacklist_hash}\t{windows_source or 'generated'}\t{out_name}\n" + ) + + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index ad02ba43..f7ecca49 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1223,6 +1223,14 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param ] } + withName: '.*:CHIPBINNER_WINDOWS_CACHE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests/chipbinner_windows_cache" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: '.*:SPAN_DIFF_RUN' { publishDir = [ path: { "${params.outdir}/03_peak_calling/08_differential/03_span/${group}" }, diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md index ed29323c..761a6a20 100644 --- a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -626,6 +626,7 @@ All differential outputs live under: differential_manifest.design.tsv chipbinner_input__.bam (optional) chipbinner_input__.bam.bai (optional) + chipbinner_windows_cache/windows....bed span_diff_target_pooling.tsv (optional) 01_diffbind/ 00_samplesheets//.csv @@ -639,6 +640,11 @@ All differential outputs live under: / chipbinner.*.tsv chipbinner.*.csv + chipbinner.normalization_factors.tsv + chipbinner.clusters.best.tsv + chipbinner.clusters.2cluster.tsv + chipbinner.clusters.3cluster.tsv + chipbinner.error.txt (only when allow-partial skips a failed run) chipbinner.summary.tsv plots/ enrichment/ (optional) diff --git a/docs/output.md b/docs/output.md index 0be4180b..e17b5714 100644 --- a/docs/output.md +++ b/docs/output.md @@ -449,6 +449,7 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t - `differential_manifest.peaks.tsv`: per-sample per-caller peak paths and formats. - `differential_manifest.design.tsv`: eligibility and status table for each comparison. - `chipbinner_input__.bam(.bai)`: pooled input/IgG BAMs when `--chipbinner_use_input` is enabled. + - `chipbinner_windows_cache/windows....bed`: cached ChIPBinner windows by genome/bin/blacklist. - `03_peak_calling/08_differential/01_diffbind///` - `diffbind.results.tsv`: full DiffBind results. - `diffbind.results.annotated.tsv`: DiffBind results annotated with nearest gene. @@ -456,6 +457,9 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t - `diffbind.summary.tsv`: one-row summary for MultiQC. - `03_peak_calling/08_differential/02_chipbinner//` - `chipbinner.*.tsv` and `chipbinner.*.csv`: ChIPBinner matrices and summaries. + - `chipbinner.normalization_factors.tsv`: per-sample scale/size factors used for normalization. + - `chipbinner.clusters.best.tsv`, `chipbinner.clusters.2cluster.tsv`, `chipbinner.clusters.3cluster.tsv`: standardized cluster assignments. + - `chipbinner.error.txt`: emitted when `--differential_allow_partial` skips a failed run. - `chipbinner.differential.annotated.tsv`: annotated differential bins. - `chipbinner.summary.tsv`: one-row summary for MultiQC. - `03_peak_calling/08_differential/03_span//` diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf index 6fa3a791..defee9bd 100644 --- a/modules/local/chipbinner_run.nf +++ b/modules/local/chipbinner_run.nf @@ -5,7 +5,7 @@ process CHIPBINNER_RUN { container "quay.io/biocontainers/python:3.8.3" input: - tuple val(group), path(records), path(chrom_sizes) + tuple val(group), path(records), path(chrom_sizes), path(windows) val contrast val bin_size val windows_dir @@ -23,25 +23,31 @@ process CHIPBINNER_RUN { val allow_partial output: - tuple val(group), path("chipbinner.samplesheet.csv"), emit: samplesheet - tuple val(group), path("chipbinner.windows.bed"), emit: windows - tuple val(group), path("chipbinner.bin_counts.tsv"), emit: counts - tuple val(group), path("chipbinner.normalized_matrix.tsv"), emit: normalized - tuple val(group), path("chipbinner.hdbscan_grid_summary.tsv"), emit: grid - tuple val(group), path("hdbscan_grid"), emit: grid_outputs - tuple val(group), path("chipbinner.clusters.tsv"), emit: clusters - tuple val(group), path("chipbinner.differential.tsv"), emit: differential + tuple val(group), path("chipbinner.samplesheet.csv"), emit: samplesheet, optional: true + tuple val(group), path("chipbinner.windows.bed"), emit: windows, optional: true + tuple val(group), path("chipbinner.bin_counts.tsv"), emit: counts, optional: true + tuple val(group), path("chipbinner.normalized_matrix.tsv"), emit: normalized, optional: true + tuple val(group), path("chipbinner.normalization_factors.tsv"), emit: norm_factors, optional: true + tuple val(group), path("chipbinner.hdbscan_grid_summary.tsv"), emit: grid, optional: true + tuple val(group), path("hdbscan_grid"), emit: grid_outputs, optional: true + tuple val(group), path("chipbinner.clusters.tsv"), emit: clusters, optional: true + tuple val(group), path("chipbinner.clusters.best.tsv"), emit: clusters_best, optional: true + tuple val(group), path("chipbinner.clusters.2cluster.tsv"), emit: clusters_two, optional: true + tuple val(group), path("chipbinner.clusters.3cluster.tsv"), emit: clusters_three, optional: true + tuple val(group), path("chipbinner.differential.tsv"), emit: differential, optional: true tuple val(group), path("chipbinner.summary.tsv"), emit: summary - tuple val(group), path("plots"), emit: plots - tuple val(group), path("enrichment"), emit: enrichment + tuple val(group), path("chipbinner.error.txt"), emit: error, optional: true + tuple val(group), path("plots"), emit: plots, optional: true + tuple val(group), path("enrichment"), emit: enrichment, optional: true path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def windows_arg = windows_dir ? "--windows-dir ${windows_dir}" : '' - def blacklist_arg = blacklist ? "--blacklist ${blacklist}" : '' + def windows_arg = windows ? "--windows ${windows}" : '' + def windows_dir_arg = (!windows && windows_dir) ? "--windows-dir ${windows_dir}" : '' + def blacklist_arg = (!windows && blacklist) ? "--blacklist ${blacklist}" : '' def use_input_arg = use_input ? "--use-input" : '' def use_spikein_arg = use_spikein ? "--use-spikein" : '' def functional_arg = functional_db ? "--functional-db ${functional_db}" : '' @@ -54,6 +60,7 @@ process CHIPBINNER_RUN { --chrom-sizes ${chrom_sizes} \ --bin-size ${bin_size} \ ${windows_arg} \ + ${windows_dir_arg} \ ${blacklist_arg} \ ${use_input_arg} \ ${use_spikein_arg} \ diff --git a/modules/local/chipbinner_windows_cache.nf b/modules/local/chipbinner_windows_cache.nf new file mode 100644 index 00000000..3c67fa5f --- /dev/null +++ b/modules/local/chipbinner_windows_cache.nf @@ -0,0 +1,38 @@ +process CHIPBINNER_WINDOWS_CACHE { + label 'process_single' + + conda "conda-forge::python=3.8.3 bioconda::bedtools=2.31.1" + container "quay.io/biocontainers/python:3.8.3" + + input: + path chrom_sizes + val bin_size + val windows_dir + val blacklist + + output: + path "windows.*.bed", emit: windows + path "chipbinner_windows_meta.tsv", emit: meta + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def windows_dir_arg = windows_dir ? "--windows-dir ${windows_dir}" : '' + def blacklist_arg = blacklist ? "--blacklist ${blacklist}" : '' + """ + chipbinner_windows_cache.py \ + --chrom-sizes ${chrom_sizes} \ + --bin-size ${bin_size} \ + ${windows_dir_arg} \ + ${blacklist_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + bedtools: \$(bedtools --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index e2374b00..148138cf 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -11,6 +11,7 @@ include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/ include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" +include { CHIPBINNER_WINDOWS_CACHE } from "../../modules/local/chipbinner_windows_cache" include { POOL_IGG_CONTROLS } from "../../modules/local/pool_igg_controls" include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" include { SPAN_POOLING_MANIFEST } from "../../modules/local/span_pooling_manifest" @@ -44,6 +45,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_diffbind_summary = Channel.empty() ch_chipbinner_summary = Channel.empty() ch_span_summary = Channel.empty() + ch_chipbinner_windows = Channel.empty() ch_summary_files = Channel.empty() ch_samples_manifest = Channel.empty() ch_peaks_manifest = Channel.empty() @@ -402,6 +404,17 @@ workflow DIFFERENTIAL_PEAK_CALLING { .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_chipbinner == 'true' } .map { row -> [row.group, row] } + def chipbinner_windows_dir = params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '' + def chipbinner_blacklist = params.blacklist ? file(params.blacklist).toString() : '' + CHIPBINNER_WINDOWS_CACHE ( + ch_chrom_sizes.collect().map { it[0] }, + params.chipbinner_bin_size, + chipbinner_windows_dir, + chipbinner_blacklist + ) + ch_versions = ch_versions.mix(CHIPBINNER_WINDOWS_CACHE.out.versions) + ch_chipbinner_windows = CHIPBINNER_WINDOWS_CACHE.out.windows.first() + ch_chip_records = ch_samples_rows_contrast .map { row -> [row.group, row] } .groupTuple(by: [0]) @@ -424,18 +437,17 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_chip_records_file = RECORDS_TO_TSV_CHIPBINNER.out.tsv .map { group, caller, records_file -> [group, records_file] } - ch_chrom_sizes_single = ch_chrom_sizes.collect().map { it[0] } - ch_chip_inputs = ch_chip_records_file - .combine(ch_chrom_sizes_single) - .map { record, chrom_sizes -> [ record[0], record[1], chrom_sizes ] } + .combine(ch_chrom_sizes.collect().map { it[0] }) + .combine(ch_chipbinner_windows) + .map { record, chrom_sizes, windows -> [ record[0], record[1], chrom_sizes, windows ] } CHIPBINNER_RUN ( ch_chip_inputs, params.differential_contrast, params.chipbinner_bin_size, - params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '', - params.blacklist ? file(params.blacklist).toString() : '', + '', + '', params.chipbinner_use_input, use_spikein, params.chipbinner_pseudocount, From 012488bc7c3ada0e0dfd838740fdc72a1c98244b Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 01:13:00 -0800 Subject: [PATCH 23/52] Add tests for chipbinner allow-partial and caching --- bin/differential_summary.py | 9 ++- .../differential_manifest.peaks.tsv | 9 +++ .../differential_manifest.samples.tsv | 5 ++ .../inputs/input_g1.bam | 1 + .../inputs/input_g1.bam.bai | 1 + .../inputs/input_g2.bam | 1 + .../inputs/input_g2.bam.bai | 1 + tests/test_07_differential.yml | 56 +++++++++++++++++++ 8 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv create mode 100644 tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv create mode 100644 tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam create mode 100644 tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai create mode 100644 tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam create mode 100644 tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai diff --git a/bin/differential_summary.py b/bin/differential_summary.py index d11f043d..e78fd08f 100755 --- a/bin/differential_summary.py +++ b/bin/differential_summary.py @@ -85,7 +85,7 @@ def main(): group = row.get("group") or "" caller = row.get("caller") or (row.get("caller_id") or "NA") key = (method, group, caller) - base_rows[key] = { + summary_row = { "method": method, "group": group, "caller": caller, @@ -99,6 +99,13 @@ def main(): "status": row.get("status") or "RUN", "reason": row.get("reason") or "ok", } + existing = base_rows.get(key) + if existing: + existing_status = (existing.get("status") or "").upper() + incoming_status = (summary_row.get("status") or "RUN").upper() + if existing_status not in ("", "RUN") and incoming_status == "RUN": + continue + base_rows[key] = summary_row header = [ "method", diff --git a/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..f2dbbade --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,9 @@ +sample_id group condition caller peaks_path peaks_format replicate +g1_treat g1 Treatment seacr NA bed 1 +g1_ctrl g1 Control seacr NA bed 1 +g2_treat g2 Treatment seacr NA bed 1 +g2_ctrl g2 Control seacr NA bed 1 +g1_treat g1 Treatment span_default NA bed 1 +g1_ctrl g1 Control span_default NA bed 1 +g2_treat g2 Treatment span_default NA bed 1 +g2_ctrl g2 Control span_default NA bed 1 diff --git a/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..6cd69a30 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,5 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path input_bam input_bai +g1_treat g1 Treatment 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai +g1_ctrl g1 Control 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai +g2_treat g2 Treatment 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai +g2_ctrl g2 Control 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai @@ -0,0 +1 @@ +placeholder diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml index 77d54737..3b221b4c 100644 --- a/tests/test_07_differential.yml +++ b/tests/test_07_differential.yml @@ -5,6 +5,16 @@ - test_differential_missing_contrast exit_code: 1 +- name: test_span_diff_missing_jar_integrated + command: nextflow run main.nf -profile docker,test --run_span_diff --differential_contrast Treatment,Control --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq -c tests/config/nextflow.config + tags: + - test_differential + - test_span_diff_missing_jar_integrated + exit_code: 1 + stderr: + contains: + - "omnipeaks_jar" + - name: test_differential_manifests_only command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_publish_manifest_only --input tests/assets/samplesheet_condition.csv -c tests/config/nextflow.config tags: @@ -58,6 +68,17 @@ - "insufficient_replicates" - "SKIP" +- name: test_differential_summary_preserves_skip + command: bash -c "mkdir -p tests/output/summary_override && cat > tests/output/summary_override/design.tsv <<'EOF'\ngroup\tcaller\ttreated_condition\tcontrol_condition\tstatus\treason\nh3k27me3\tNA\tTreatment\tControl\tSKIP\tmanual_skip\nEOF\ncat > tests/output/summary_override/chipbinner.summary.tsv <<'EOF'\ngroup\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status\nh3k27me3\tNA\tTreatment\tControl\t10\t1\t1\t0\t2\t100\t100\tRUN\tok\tNOT_RUN\nEOF\npython3 bin/differential_summary.py --design tests/output/summary_override/design.tsv --summaries tests/output/summary_override/chipbinner.summary.tsv --out tests/output/summary_override/summary.tsv" + tags: + - test_differential + - test_differential_summary + files: + - path: tests/output/summary_override/summary.tsv + contains: + - "SKIP" + - "manual_skip" + - name: test_differential_manifests_deterministic command: bash -c "rm -rf tests/output/manifest_run1 tests/output/manifest_run2 && mkdir -p tests/output/manifest_run1 tests/output/manifest_run2 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run1 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run2 && diff -q tests/output/manifest_run1/differential_manifest.samples.tsv tests/output/manifest_run2/differential_manifest.samples.tsv && diff -q tests/output/manifest_run1/differential_manifest.peaks.tsv tests/output/manifest_run2/differential_manifest.peaks.tsv" tags: @@ -100,6 +121,7 @@ - path: tests/output/span_auto/span.mode.txt contains: - "fallback" + - path: tests/output/span_auto/span.normalization_factors.tsv - name: test_annotation_fallback_gtf_only command: nextflow run main.nf -profile docker,test -entry DIFFERENTIAL_ONLY --run_span_diff --span_diff_mode native --differential_allow_partial --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --omnipeaks_jar tests/assets/omnipeaks_stub.jar --gtf tests/assets/minimal.gtf --fasta tests/assets/minimal.fa --outdir results_annotation_gtf -c tests/config/nextflow.config @@ -111,3 +133,37 @@ contains: - "nearest_feature_id" - "nearest_gene_name" + +- name: test_chipbinner_stub_outputs + command: bash -c "rm -rf results_chipbinner_stub && nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_chipbinner --chipbinner_use_input --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_chipbinner_multi --gtf tests/assets/minimal.gtf --fasta tests/assets/minimal.fa --outdir results_chipbinner_stub -c tests/config/nextflow.config && test $(ls results_chipbinner_stub/03_peak_calling/08_differential/00_manifests/chipbinner_windows_cache/windows.*.bed | wc -l) -eq 1" + tags: + - test_differential + - test_chipbinner_outputs + files: + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.clusters.2cluster.tsv + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.clusters.3cluster.tsv + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.normalization_factors.tsv + +- name: test_chipbinner_use_input_missing_fail + command: bash -c "mkdir -p tests/output/chipbinner_missing && cat > tests/output/chipbinner_missing/samples.tsv <<'EOF'\nsample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\nEOF\ncat > tests/output/chipbinner_missing/chrom.sizes <<'EOF'\nchr1\t1000\nEOF\npython3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing/chrom.sizes --bin-size 200 --use-input --outdir tests/output/chipbinner_missing/out" + tags: + - test_differential + - test_chipbinner_use_input + exit_code: 1 + stderr: + contains: + - "input BAMs are missing" + +- name: test_chipbinner_use_input_missing_allow_partial + command: bash -c "mkdir -p tests/output/chipbinner_missing_partial && cat > tests/output/chipbinner_missing_partial/samples.tsv <<'EOF'\nsample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\nEOF\ncat > tests/output/chipbinner_missing_partial/chrom.sizes <<'EOF'\nchr1\t1000\nEOF\npython3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing_partial/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing_partial/chrom.sizes --bin-size 200 --use-input --allow-partial --outdir tests/output/chipbinner_missing_partial/out" + tags: + - test_differential + - test_chipbinner_use_input + files: + - path: tests/output/chipbinner_missing_partial/out/chipbinner.summary.tsv + contains: + - "SKIP" + - "MISSING_INPUT_BAM" + - path: tests/output/chipbinner_missing_partial/out/chipbinner.error.txt + - path: tests/output/chipbinner_missing_partial/out/chipbinner.bin_counts.tsv + should_exist: false From 7e3da87ea8e3b3991210af6d67f45d95d6cfdc64 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 01:16:57 -0800 Subject: [PATCH 24/52] Expand differential analysis documentation --- docs/output.md | 3 ++ docs/usage.md | 86 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/docs/output.md b/docs/output.md index e17b5714..05bf1dde 100644 --- a/docs/output.md +++ b/docs/output.md @@ -450,6 +450,7 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t - `differential_manifest.design.tsv`: eligibility and status table for each comparison. - `chipbinner_input__.bam(.bai)`: pooled input/IgG BAMs when `--chipbinner_use_input` is enabled. - `chipbinner_windows_cache/windows....bed`: cached ChIPBinner windows by genome/bin/blacklist. + - `span_diff_target_pooling.tsv`: emitted when SPAN native mode must pool replicates. - `03_peak_calling/08_differential/01_diffbind///` - `diffbind.results.tsv`: full DiffBind results. - `diffbind.results.annotated.tsv`: DiffBind results annotated with nearest gene. @@ -467,6 +468,7 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t - `span.differential.peaks.bed`: differential regions as BED. - `span.differential.annotated.tsv`: annotated differential regions. - `span.up.bed`, `span.down.bed`: direction-specific BEDs. + - `span.normalization_factors.tsv`: spike-in size factors used by fallback mode. - `span.summary.tsv`: one-row summary for MultiQC. - `span.mode.txt`: mode and signature note. - `03_peak_calling/08_differential/multiqc/` @@ -477,6 +479,7 @@ The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used t Differential analysis is enabled with `--run_diffbind`, `--run_chipbinner`, and/or `--run_span_diff` plus a `--differential_contrast` definition. The pipeline always writes manifests and design tables when any differential option is enabled, even if specific comparisons are skipped. +Summary TSVs are always produced and include `RUN`, `SKIP`, or `FAIL` plus a reason; MultiQC tables surface these statuses instead of showing placeholder results. ## 7. Peak-based QC diff --git a/docs/usage.md b/docs/usage.md index 05a4f884..b944e45c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -171,14 +171,84 @@ After peak calling, consensus peaks are calculated by merging peaks within the s Differential analysis runs on peak-calling outputs and compares two conditions per group. The contrast is defined as `treated,control` and determines the log2FC direction (treated/control). Enable one or more methods with `--run_diffbind`, `--run_chipbinner`, or `--run_span_diff`, and provide `--differential_contrast`. -Key options: - -- `--differential_contrast "Treatment,Control"` (required when any method is enabled) -- `--differential_min_replicates` (default: 2) -- `--differential_allow_partial` (skip invalid comparisons instead of failing) -- `--differential_groups` / `--differential_callers` (comma-separated allowlists) -- `--differential_publish_manifest_only` (write manifests + design tables only) -- `--differential_multiqc_report` (emit a differential-only MultiQC report) +#### Global options + +| Parameter | Default | Description | +| --- | --- | --- | +| `--differential_contrast` | required | Contrast as `treated,control`. The order controls log2FC sign (treated/control). | +| `--differential_min_replicates` | `2` | Minimum replicates per condition required for eligibility. | +| `--differential_allow_partial` | `false` | If `true`, ineligible comparisons are marked `SKIP` and the pipeline continues. If `false`, any `FAIL` in the design table stops the pipeline. | +| `--differential_groups` | | Comma-separated allowlist of groups to analyze. | +| `--differential_callers` | | Comma-separated allowlist of peak callers for DiffBind (e.g., `seacr,macs2`). | +| `--differential_use_spikein` | `auto` | `auto` uses spike-in only when `--normalisation_mode Spikein`; set `true` or `false` to override. | +| `--differential_publish_manifest_only` | `false` | Write manifests and design tables only (no method execution). | +| `--differential_multiqc_report` | `false` | Emit a differential-only MultiQC report. | + +Eligibility & allow-partial semantics: +- Comparisons are eligible only when both contrast conditions are present and meet `--differential_min_replicates`. +- If `--differential_allow_partial=false`, any `FAIL` in the design manifest stops the run. +- If `--differential_allow_partial=true`, failed units are marked `SKIP` with a reason in the design + per-method summaries; other units proceed. + +Spike-in behavior: +- Spike-in scaling follows the manifest `spikein_scale_factor`, with size factors defined as `size_factor = 1 / scale_factor`. +- ChIPBinner and SPAN fallback explicitly apply these size factors and record them in normalization factor outputs. +- DiffBind uses spike-in scaling when enabled (`--differential_use_spikein`) and otherwise uses the configured normalization method. + +#### DiffBind + +Enable with `--run_diffbind`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--diffbind_fdr` | `0.05` | FDR cutoff for significant peaks. | +| `--diffbind_lfc` | `1.0` | Absolute log2FC cutoff for up/down labeling. | +| `--diffbind_min_overlap` | `2` | Minimum overlap count for consensus/analysis. | +| `--diffbind_backend` | `DESeq2` | Differential backend (e.g. `DESeq2`, `edgeR`). | +| `--diffbind_recenter_peaks` | `false` | Recenter peaks before testing. | +| `--diffbind_summits` | `0` | Summit window width (0 uses full peaks). | +| `--diffbind_norm_method` | `native` | Normalization method when not using spike-in. | +| `--diffbind_extra_params` | | Optional JSON/YAML file with extra DiffBind parameters. | +| `--export_diffbind_sheets` | `true` | Publish DiffBind sample sheets under `00_samplesheets/`. | + +DiffBind uses per-caller peak sets for each group and produces results tables plus diagnostic plots. + +#### ChIPBinner + +Enable with `--run_chipbinner`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--chipbinner_bin_size` | `10000` | Bin size for windowing. | +| `--chipbinner_windows_dir` | | Directory containing prebuilt windows (named `windows....bed`). | +| `--blacklist` | | Optional blacklist used to subtract windows. | +| `--chipbinner_use_input` | `false` | Subtract pooled input/IgG per group/condition as background. Missing inputs fail fast unless `--differential_allow_partial=true`, which yields `SKIP` with an error note. | +| `--chipbinner_ms_coeffs` | | Optional MS coefficients (by `sample_id`) used in normalization. | +| `--chipbinner_pseudocount` | `1` | Pseudocount added before log transforms. | +| `--chipbinner_hdbscan_grid_minpts` | `100,200,500,1000` | HDBSCAN minPts grid. | +| `--chipbinner_hdbscan_grid_minsamps` | `100,200,500,1000` | HDBSCAN minSamps grid. | +| `--chipbinner_fdr` | `0.05` | FDR cutoff for differential bins. | +| `--chipbinner_lfc` | `1.0` | Absolute log2FC cutoff for up/down labeling. | +| `--chipbinner_bootstrap` | `1000` | ROTS bootstrap iterations. | +| `--chipbinner_k_value` | `100000` | ROTS K parameter. | +| `--chipbinner_functional_db` | | Optional enrichment database (BED or directory of BEDs). | + +ChIPBinner produces: a chosen “best” clustering plus standardized 2‑cluster and 3‑cluster views (treated‑high vs control‑high vs stable). Normalization factors are written to `chipbinner.normalization_factors.tsv`. + +#### SPAN differential + +Enable with `--run_span_diff`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--span_diff_mode` | `auto` | `auto` chooses native compare if supported, otherwise fallback. `native` forces compare; `fallback` skips jar compare. | +| `--omnipeaks_jar` | required | OmniPeaks/SPAN jar for native/auto modes. | +| `--span_diff_fdr` | `0.05` | FDR cutoff for differential regions. | +| `--span_diff_gap` | `5` | Gap parameter for native compare. | +| `--span_diff_bin` | `200` | Bin size for native compare. | +| `--span_diff_java_heap` | `8G` | Java heap for jar execution. | +| `--span_fallback_backend` | `DESeq2` | Backend for fallback differential (e.g. `DESeq2` or `edgeR`). | + +Native mode enforces treated/control orientation and may require pooled replicates depending on jar capabilities (recorded in `span_diff_target_pooling.tsv`). Fallback mode computes counts over merged regions and writes `span.normalization_factors.tsv`. Integrated example: From 189eb42a2a5f263e264fecc226604e0a7bf4d256 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 04:40:27 -0800 Subject: [PATCH 25/52] Fix differential status semantics and SPAN normalization --- assets/multiqc_differential_config.yml | 3 +- bin/chipbinner_run.py | 2 +- bin/differential_summary.py | 14 ++- bin/span_diff.py | 158 +++++++++++++++++++------ nextflow_schema.json | 1 + tests/test_07_differential.yml | 2 +- workflows/differential_only.nf | 94 ++++++++++++++- 7 files changed, 229 insertions(+), 45 deletions(-) diff --git a/assets/multiqc_differential_config.yml b/assets/multiqc_differential_config.yml index 4ad37699..2f93a69e 100644 --- a/assets/multiqc_differential_config.yml +++ b/assets/multiqc_differential_config.yml @@ -1,5 +1,6 @@ report_comment: > - Differential analysis report generated by nf-core/cutandrun. + Differential analysis report generated by nf-core/cutandrun. Method-specific + metrics (e.g., ChIPBinner clustering parameters) are included in the summary table. run_modules: - custom_content diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index d379f8ae..afb382a1 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -238,7 +238,7 @@ def compute_counts(windows_path, samples, out_path, use_input=False): def write_matrix(counts_df, sample_ids, out_path): - matrix = counts_df[["bin_id"] + sample_ids] + matrix = counts_df[["chrom", "start", "end", "bin_id"] + sample_ids] matrix.to_csv(out_path, sep="\t", index=False) diff --git a/bin/differential_summary.py b/bin/differential_summary.py index e78fd08f..c0b0931b 100755 --- a/bin/differential_summary.py +++ b/bin/differential_summary.py @@ -29,7 +29,7 @@ def main(): status = row.get("status") or "" reason = row.get("reason") or "" if args.manifest_only and status == "RUN": - status = "SKIPPED" + status = "SKIP" reason = "manifest_only" if caller != "NA": @@ -44,6 +44,9 @@ def main(): "n_fdr_pass": "0", "n_up": "0", "n_down": "0", + "n_clusters": "NA", + "chosen_minPts": "NA", + "chosen_minSamps": "NA", "mode": "NA", "status": status, "reason": reason, @@ -61,6 +64,9 @@ def main(): "n_fdr_pass": "0", "n_up": "0", "n_down": "0", + "n_clusters": "NA", + "chosen_minPts": "NA", + "chosen_minSamps": "NA", "mode": "NA", "status": status, "reason": reason, @@ -95,6 +101,9 @@ def main(): "n_fdr_pass": row.get("n_fdr_pass") or "0", "n_up": row.get("n_up") or "0", "n_down": row.get("n_down") or "0", + "n_clusters": row.get("n_clusters") or "NA", + "chosen_minPts": row.get("chosen_minPts") or "NA", + "chosen_minSamps": row.get("chosen_minSamps") or "NA", "mode": row.get("mode") or "NA", "status": row.get("status") or "RUN", "reason": row.get("reason") or "ok", @@ -117,6 +126,9 @@ def main(): "n_fdr_pass", "n_up", "n_down", + "n_clusters", + "chosen_minPts", + "chosen_minSamps", "mode", "status", "reason", diff --git a/bin/span_diff.py b/bin/span_diff.py index f916d650..4f9a88f0 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -332,7 +332,16 @@ def filter_bed_by_stats(rows: List[Dict[str, str]], fdr_cutoff: float, direction return out -def write_failure_outputs(outdir: str, group: str, treated: str, control: str, mode: str, signature: str, reason: str) -> None: +def write_failure_outputs( + outdir: str, + group: str, + treated: str, + control: str, + mode: str, + signature: str, + reason: str, + allow_partial: bool, +) -> None: diff_path = os.path.join(outdir, "span.differential.tsv") bed_path = os.path.join(outdir, "span.differential.peaks.bed") up_path = os.path.join(outdir, "span.up.bed") @@ -345,7 +354,8 @@ def write_failure_outputs(outdir: str, group: str, treated: str, control: str, m open(bed_path, "w").close() open(up_path, "w").close() open(down_path, "w").close() - write_summary(summary_path, group, treated, control, 0, 0, 0, 0, mode, "FAIL", reason) + status = "SKIP" if allow_partial else "FAIL" + write_summary(summary_path, group, treated, control, 0, 0, 0, 0, mode, status, reason) write_mode(mode_path, mode, signature) @@ -354,6 +364,8 @@ def compute_orientation_log2fc( treated_bams: List[str], control_bams: List[str], pseudocount: float, + treated_size_factors: List[float] = None, + control_size_factors: List[float] = None, ) -> Dict[Tuple[str, str, str], List[float]]: if not treated_bams or not control_bams: raise RuntimeError("orientation_missing_bams") @@ -361,6 +373,17 @@ def compute_orientation_log2fc( result = run_cmd(cmd, check=False) if result.returncode != 0: raise RuntimeError(result.stderr.strip() or "bedtools_multicov_failed") + + def apply_size_factors(counts: List[float], factors: List[float]) -> List[float]: + if not factors or len(factors) != len(counts): + return counts + adjusted = [] + for count, factor in zip(counts, factors): + if factor is None or factor == 0: + factor = 1.0 + adjusted.append(count / factor) + return adjusted + orientation = {} for line in result.stdout.strip().split("\n"): if not line: @@ -372,6 +395,8 @@ def compute_orientation_log2fc( counts = [safe_float(val) or 0.0 for val in parts[3:]] t_counts = counts[: len(treated_bams)] c_counts = counts[len(treated_bams) :] + t_counts = apply_size_factors(t_counts, treated_size_factors) + c_counts = apply_size_factors(c_counts, control_size_factors) if not t_counts or not c_counts: continue t_mean = sum(t_counts) / len(t_counts) @@ -381,6 +406,54 @@ def compute_orientation_log2fc( return orientation +def compute_spikein_factors(samples: List[Dict[str, str]], use_spikein: bool): + scale_values = [] + size_factors = [] + numeric_present = False + for row in samples: + raw = row.get("spikein_scale_factor", "NA") + scale_values.append(raw) + numeric = None + try: + numeric = float(raw) + numeric_present = True + except Exception: + numeric = None + + if use_spikein: + if numeric is None or numeric == 0: + size = 1.0 + else: + size = 1.0 / numeric + else: + size = None + size_factors.append(size) + + use_spikein_factors = use_spikein and numeric_present + return scale_values, size_factors, use_spikein_factors + + +def write_normalization_factors( + path: str, + samples: List[Dict[str, str]], + scale_values: List[str], + size_factors: List[float], + use_spikein_factors: bool, + backend: str, +) -> None: + with open(path, "w") as handle: + handle.write("sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tused_by_backend\n") + for row, scale_val, size_val in zip(samples, scale_values, size_factors): + used_by_backend = "spikein" if use_spikein_factors else backend + size_str = "NA" + if use_spikein_factors: + size = size_val if size_val is not None else 1.0 + size_str = f"{size:.6f}" + handle.write( + f"{row.get('sample_id','')}\t{row.get('group','')}\t{row.get('condition','')}\t{scale_val}\t{size_str}\t{used_by_backend}\n" + ) + + def main(): parser = argparse.ArgumentParser(description="SPAN/OmniPeak differential runner") parser.add_argument("--jar", required=True) @@ -418,7 +491,7 @@ def main(): if args.mode == "native": if not native_supported: reason = "native_not_supported" - write_failure_outputs(args.outdir, args.group, treated, control, "native", signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, "native", signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) @@ -435,7 +508,7 @@ def main(): samples_group = [row for row in samples if row.get("group") == args.group and row.get("condition") in {treated, control}] if not samples_group: reason = "no_samples" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: No samples found for group.", file=sys.stderr) @@ -445,7 +518,7 @@ def main(): span_peaks = [row for row in peaks_group if is_span_caller(row.get("caller"))] if not span_peaks: reason = "no_span_peaks" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: No SPAN/OmniPeak peaks found for group.", file=sys.stderr) @@ -477,7 +550,7 @@ def main(): missing_peaks = [row.get("sample_id") for row in samples_group if row.get("sample_id") not in peaks_by_sample] if missing_peaks: reason = "missing_peaks" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print(f"ERROR: Missing peaks for samples: {','.join(missing_peaks)}", file=sys.stderr) @@ -488,12 +561,19 @@ def main(): control_samples = [row for row in samples_sorted if row.get("condition") == control] if not treated_samples or not control_samples: reason = "missing_condition" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: Missing treated/control samples.", file=sys.stderr) sys.exit(1) + spikein_scale_values, spikein_size_factors, use_spikein_factors = compute_spikein_factors(samples_sorted, use_spikein) + bam_size_factors = {} + for row, size_factor in zip(samples_sorted, spikein_size_factors): + bam = row.get("final_bam") + if bam: + bam_size_factors[bam] = size_factor if size_factor is not None else 1.0 + diff_path = os.path.join(args.outdir, "span.differential.tsv") bed_path = os.path.join(args.outdir, "span.differential.peaks.bed") up_path = os.path.join(args.outdir, "span.up.bed") @@ -524,7 +604,7 @@ def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", if chosen_mode == "native": if not native_supported: reason = "native_not_supported" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: Native SPAN compare not supported.", file=sys.stderr) @@ -542,7 +622,7 @@ def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", if not args.chrom_sizes or not os.path.exists(args.chrom_sizes): reason = "missing_chrom_sizes" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print("ERROR: Chrom sizes file required for native SPAN compare.", file=sys.stderr) @@ -564,6 +644,10 @@ def pool_bams(condition: str, bams: List[str]) -> str: cmd = ["samtools", "merge", "-f", "-@", str(max(args.cpus, 1)), pooled_path] + sorted_bams run_cmd(cmd, check=True) run_cmd(["samtools", "index", pooled_path], check=True) + if use_spikein_factors: + size_vals = [bam_size_factors.get(bam, 1.0) for bam in sorted_bams] + pooled_size = sum(size_vals) / len(size_vals) if size_vals else 1.0 + bam_size_factors[pooled_path] = pooled_size pooled_manifest_path = pooled_path if args.pooling_dir: pooled_manifest_path = os.path.join(args.pooling_dir, pooled_name) @@ -651,11 +735,18 @@ def pool_bams(condition: str, bams: List[str]) -> str: write_bed(bed_path, bed_rows) pseudocount = 0.5 + treated_size_factors = None + control_size_factors = None + if use_spikein_factors: + treated_size_factors = [bam_size_factors.get(bam, 1.0) for bam in compare_treated_bams] + control_size_factors = [bam_size_factors.get(bam, 1.0) for bam in compare_control_bams] orientation_map = compute_orientation_log2fc( bed_path, compare_treated_bams, compare_control_bams, pseudocount, + treated_size_factors, + control_size_factors, ) for row in rows: key = (row.get("chr"), row.get("start"), row.get("end")) @@ -673,6 +764,17 @@ def pool_bams(condition: str, bams: List[str]) -> str: write_bed(up_path, up_rows) write_bed(down_path, down_rows) + normalization_path = os.path.join(args.outdir, "span.normalization_factors.tsv") + write_normalization_factors( + normalization_path, + samples_sorted, + spikein_scale_values, + spikein_size_factors, + use_spikein_factors, + "native", + ) + print(f"INFO: wrote normalization factors to {normalization_path}", file=sys.stderr) + if pooled_rows: write_tsv( pooling_path, @@ -728,34 +830,14 @@ def pool_bams(condition: str, bams: List[str]) -> str: ) normalization_path = os.path.join(args.outdir, "span.normalization_factors.tsv") - spikein_values = [] - for row in samples_sorted: - spikein_values.append(row.get("spikein_scale_factor", "NA")) - - spikein_numeric = [] - for val in spikein_values: - try: - spikein_numeric.append(float(val)) - except Exception: - spikein_numeric.append(None) - - use_spikein_factors = use_spikein and any(val is not None for val in spikein_numeric) - with open(normalization_path, "w") as handle: - handle.write("sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tused_by_backend\n") - for idx, row in enumerate(samples_sorted): - scale_val = spikein_values[idx] - size_factor = "NA" - used_by_backend = args.fallback_backend - if use_spikein_factors: - used_by_backend = "spikein" - numeric = spikein_numeric[idx] - if numeric is None or numeric == 0: - size_factor = "1" - else: - size_factor = f"{1.0 / numeric:.6f}" - handle.write( - f"{row.get('sample_id','')}\t{row.get('group','')}\t{row.get('condition','')}\t{scale_val}\t{size_factor}\t{used_by_backend}\n" - ) + write_normalization_factors( + normalization_path, + samples_sorted, + spikein_scale_values, + spikein_size_factors, + use_spikein_factors, + args.fallback_backend, + ) print(f"INFO: wrote normalization factors to {normalization_path}", file=sys.stderr) r_script = os.path.join(os.path.dirname(__file__), "span_fallback_de.R") @@ -794,7 +876,7 @@ def pool_bams(condition: str, bams: List[str]) -> str: finalize_success(rows, "fallback") except Exception as exc: reason = str(exc).split("\n")[0] if exc else "failed" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason) + write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) if allow_partial: sys.exit(0) print(f"ERROR: SPAN diff failed: {exc}", file=sys.stderr) diff --git a/nextflow_schema.json b/nextflow_schema.json index 97c90884..1b51175d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -651,6 +651,7 @@ "type": "integer", "default": 10000, "fa_icon": "fas fa-align-justify", + "enum": [1000, 5000, 10000, 25000], "description": "Bin size for ChIPBinner (1000, 5000, 10000, 25000)." }, "chipbinner_windows_dir": { diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml index 3b221b4c..8eb0e620 100644 --- a/tests/test_07_differential.yml +++ b/tests/test_07_differential.yml @@ -27,7 +27,7 @@ - path: results/03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv contains: - "manifest_only" - - "SKIPPED" + - "SKIP" must_not_contain: - "RUN\tok" - path: results/03_peak_calling/08_differential/multiqc/differential_design_mqc.tsv diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 92ce542f..028ebbda 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -5,6 +5,74 @@ include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" include { CUSTOM_GETCHROMSIZES } from "../modules/nf-core/custom/getchromsizes/main" +import java.security.MessageDigest + +def hashFile(path) { + if (!path) { + return "none" + } + def f = file(path) + if (!f.exists()) { + return "none" + } + def digest = MessageDigest.getInstance("SHA-256") + f.withInputStream { input -> + byte[] buffer = new byte[8192] + int read + while ((read = input.read(buffer)) > 0) { + digest.update(buffer, 0, read) + } + } + return digest.digest().encodeHex().toString().substring(0, 12) +} + +def resolveCachedWindowsPath(windowsDir, binSize, genomeId, blacklistPath) { + if (!windowsDir) { + return null + } + def winPath = file(windowsDir) + if (winPath.exists() && winPath.isFile()) { + return winPath.toString() + } + if (!winPath.exists() || !winPath.isDirectory()) { + return null + } + def blacklistHash = hashFile(blacklistPath) + def meta = file("${windowsDir}/chipbinner_windows_meta.tsv") + if (meta.exists()) { + def lines = meta.text.readLines() + if (lines.size() > 1) { + def header = lines[0].split("\t") + def values = lines[1].split("\t", -1) + def metaRow = [:] + header.eachWithIndex { col, idx -> + metaRow[col] = idx < values.size() ? values[idx] : "" + } + if (metaRow["bin_size"] == binSize.toString() && metaRow["blacklist_hash"] == blacklistHash) { + def candidate = file("${windowsDir}/${metaRow['windows_path']}") + if (candidate.exists()) { + return candidate.toString() + } + } + } + } + def preferred = [] + if (genomeId) { + preferred << "windows.${genomeId}.${binSize}.${blacklistHash}.bed" + preferred << "windows.${genomeId}.${binSize}.bed" + } else { + preferred << "windows.${binSize}.${blacklistHash}.bed" + preferred << "windows.${binSize}.bed" + } + for (name in preferred) { + def candidate = file("${windowsDir}/${name}") + if (candidate.exists()) { + return candidate.toString() + } + } + return null +} + workflow DIFFERENTIAL_ONLY { if (!params.differential_from_run) { @@ -46,15 +114,35 @@ workflow DIFFERENTIAL_ONLY { ch_gtf = Channel.from( file(params.gtf) ) } + def cached_windows = resolveCachedWindowsPath( + params.chipbinner_windows_dir, + params.chipbinner_bin_size, + params.genome, + params.blacklist + ) + def use_cached_windows = cached_windows != null + if (use_cached_windows) { + params.chipbinner_windows_dir = cached_windows + } + ch_chrom_sizes = Channel.empty() - def need_chrom_sizes = params.run_chipbinner || (params.run_span_diff && params.span_diff_mode != 'fallback') - if (need_chrom_sizes) { + def need_real_chrom_sizes = (params.run_span_diff && params.span_diff_mode != 'fallback') || (params.run_chipbinner && !use_cached_windows) + if (need_real_chrom_sizes) { if (!params.fasta) { - exit 1, "--fasta is required to compute chrom sizes for differential-only mode when running ChIPBinner or SPAN native/auto." + def reasons = [] + if (params.run_chipbinner && !use_cached_windows) { + reasons << "cached ChIPBinner windows not found" + } + if (params.run_span_diff && params.span_diff_mode != 'fallback') { + reasons << "SPAN native/auto requires chrom sizes" + } + exit 1, "--fasta is required for differential-only mode (${reasons.join('; ')})" } ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) CUSTOM_GETCHROMSIZES ( ch_fasta ) ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + } else if (params.run_chipbinner || params.run_span_diff) { + ch_chrom_sizes = Channel.fromPath("${projectDir}/assets/chrom_sizes_stub.sizes", checkIfExists: true) } DIFFERENTIAL_PEAK_CALLING ( From 3a6f3655425f84dee01aef165fcdd79c9a6b41df Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:26:24 -0800 Subject: [PATCH 26/52] Use biocontainers base image for chipbinner/span --- modules/local/chipbinner_run.nf | 2 +- modules/local/span_diff_run.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf index defee9bd..1221c821 100644 --- a/modules/local/chipbinner_run.nf +++ b/modules/local/chipbinner_run.nf @@ -2,7 +2,7 @@ process CHIPBINNER_RUN { label 'process_chipbinner' conda "conda-forge::python=3.8.3 conda-forge::numpy=1.24.4 conda-forge::pandas=2.0.3 conda-forge::scikit-learn=1.3.2 conda-forge::matplotlib=3.7.5 conda-forge::seaborn=0.12.2 conda-forge::hdbscan=0.8.33 conda-forge::scipy=1.10.1 bioconda::bedtools=2.31.1 bioconda::samtools=1.17 conda-forge::r-base=4.2.3 bioconda::bioconductor-rots=1.20.0" - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/biocontainers:1.2.0--py38_0" input: tuple val(group), path(records), path(chrom_sizes), path(windows) diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf index cd24a47c..4b30de74 100644 --- a/modules/local/span_diff_run.nf +++ b/modules/local/span_diff_run.nf @@ -2,7 +2,7 @@ process SPAN_DIFF_RUN { label { params.span_diff_mode == 'fallback' ? 'SPAN_FALLBACK' : 'SPAN_NATIVE' } conda "conda-forge::python=3.8.3 conda-forge::openjdk=21.0.2 bioconda::samtools=1.16.1 bioconda::bedtools=2.31.0 conda-forge::r-base=4.2.3 bioconda::bioconductor-deseq2 bioconda::bioconductor-edger" - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/biocontainers:1.2.0--py38_0" input: tuple val(group), path(samples_manifest), path(peaks_manifest), path(chrom_sizes) From 89d0c0a6732c3043781d35c95b4313bd411a7da0 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:38:18 -0800 Subject: [PATCH 27/52] Gate differential-only annotation inputs for manifest-only --- workflows/differential_only.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 028ebbda..8789a76e 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -98,19 +98,20 @@ workflow DIFFERENTIAL_ONLY { exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled." } - if (!params.gene_bed && !file(cached_gene_bed).exists() && !params.gtf) { + def annotation_enabled = (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_publish_manifest_only + if (annotation_enabled && !params.gene_bed && !file(cached_gene_bed).exists() && !params.gtf) { exit 1, "Differential annotation requires --gene_bed or --gtf (or a cached gene bed at ${cached_gene_bed})." } ch_gene_bed = Channel.empty() - if (params.gene_bed) { + if (annotation_enabled && params.gene_bed) { ch_gene_bed = Channel.fromPath(params.gene_bed, checkIfExists: true) - } else if (file(cached_gene_bed).exists()) { + } else if (annotation_enabled && file(cached_gene_bed).exists()) { ch_gene_bed = Channel.fromPath(cached_gene_bed, checkIfExists: true) } ch_gtf = Channel.empty() - if (params.gtf) { + if (annotation_enabled && params.gtf) { ch_gtf = Channel.from( file(params.gtf) ) } From 7e1b6209a2f6a5d0596bb2a7fa90063c60740a68 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:51:04 -0800 Subject: [PATCH 28/52] Mark chipbinner/span disabled in summary --- bin/differential_summary.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bin/differential_summary.py b/bin/differential_summary.py index c0b0931b..e63c1acc 100755 --- a/bin/differential_summary.py +++ b/bin/differential_summary.py @@ -53,6 +53,15 @@ def main(): } else: for method in ("chipbinner", "span"): + method_status = status + method_reason = reason + if status == "RUN": + if method == "chipbinner" and (row.get("eligible_chipbinner") or "").lower() != "true": + method_status = "SKIP" + method_reason = "chipbinner_disabled" + if method == "span" and (row.get("eligible_span") or "").lower() != "true": + method_status = "SKIP" + method_reason = "span_disabled" key = (method, group, "NA") base_rows[key] = { "method": method, @@ -68,8 +77,8 @@ def main(): "chosen_minPts": "NA", "chosen_minSamps": "NA", "mode": "NA", - "status": status, - "reason": reason, + "status": method_status, + "reason": method_reason, } for summary_path in args.summaries: From 87fd3ec2b1c9738118b2350ee681772dc3f76843 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 22:03:28 -0800 Subject: [PATCH 29/52] Handle small log2FC in headerless SPAN output --- bin/span_diff.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/span_diff.py b/bin/span_diff.py index 4f9a88f0..5206569e 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -252,18 +252,16 @@ def idx_for(names: List[str]): val = safe_float(parts[idx]) if val is not None: numeric.append((idx, val)) - for idx, val in numeric: - if log_val == "NA" and not (0.0 <= val <= 1.0): - log_val = parts[idx] - continue - if pval_val == "NA" and 0.0 <= val <= 1.0: - pval_val = parts[idx] - continue - if fdr_val == "NA" and 0.0 <= val <= 1.0: - fdr_val = parts[idx] - continue - if log_val == "NA" and numeric: - log_val = parts[numeric[0][0]] + if numeric: + log_idx, _ = numeric[0] + log_val = parts[log_idx] + for idx, val in numeric[1:]: + if pval_val == "NA" and 0.0 <= val <= 1.0: + pval_val = parts[idx] + continue + if fdr_val == "NA" and 0.0 <= val <= 1.0: + fdr_val = parts[idx] + continue rows.append({ "chr": chr_val, From 6ceb3c2ad46a035ca02ca219fd0a2a4ae98056c9 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 22:18:05 -0800 Subject: [PATCH 30/52] Validate genome_id when reusing cached windows --- workflows/differential_only.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 8789a76e..7cc9a22e 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -48,7 +48,8 @@ def resolveCachedWindowsPath(windowsDir, binSize, genomeId, blacklistPath) { header.eachWithIndex { col, idx -> metaRow[col] = idx < values.size() ? values[idx] : "" } - if (metaRow["bin_size"] == binSize.toString() && metaRow["blacklist_hash"] == blacklistHash) { + def genome_ok = !genomeId || metaRow["genome_id"] == genomeId + if (genome_ok && metaRow["bin_size"] == binSize.toString() && metaRow["blacklist_hash"] == blacklistHash) { def candidate = file("${windowsDir}/${metaRow['windows_path']}") if (candidate.exists()) { return candidate.toString() From c0b900b894a1ca4ec82b17aa717a984e5a7a5418 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 22:31:37 -0800 Subject: [PATCH 31/52] Allow SPAN auto without fasta in differential-only --- workflows/differential_only.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 7cc9a22e..d33eaefa 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -128,15 +128,15 @@ workflow DIFFERENTIAL_ONLY { } ch_chrom_sizes = Channel.empty() - def need_real_chrom_sizes = (params.run_span_diff && params.span_diff_mode != 'fallback') || (params.run_chipbinner && !use_cached_windows) + def need_real_chrom_sizes = (params.run_span_diff && params.span_diff_mode == 'native') || (params.run_chipbinner && !use_cached_windows) if (need_real_chrom_sizes) { if (!params.fasta) { def reasons = [] if (params.run_chipbinner && !use_cached_windows) { reasons << "cached ChIPBinner windows not found" } - if (params.run_span_diff && params.span_diff_mode != 'fallback') { - reasons << "SPAN native/auto requires chrom sizes" + if (params.run_span_diff && params.span_diff_mode == 'native') { + reasons << "SPAN native requires chrom sizes" } exit 1, "--fasta is required for differential-only mode (${reasons.join('; ')})" } From f8a6b37b2ce792a51e6b90a7996695fc56e96ec1 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 22:46:28 -0800 Subject: [PATCH 32/52] Fix differential-only detection and spike-in validation --- bin/diffbind_run.R | 30 ++++++++++++++++++++---------- lib/WorkflowMain.groovy | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index 25a07992..3ed16abd 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -246,17 +246,27 @@ tryCatch({ count_args <- utils::modifyList(list(DBA = dba_obj, summits = summit_size, minOverlap = min_overlap), extra_params_for(extra_params, "dba_count")) dba_obj <- do.call(dba.count, count_args) - if (use_spikein && "spikein_scale_factor" %in% colnames(records)) { - factors <- suppressWarnings(as.numeric(records$spikein_scale_factor)) - if (any(!is.na(factors))) { - size_factors <- ifelse(factors == 0, 1, 1 / factors) - names(size_factors) <- records$sample_id - size_factors <- size_factors[samplesheet$SampleID] - norm_args <- utils::modifyList(list(DBA = dba_obj, normalize = DBA_NORM_LIB, library = size_factors), extra_params_for(extra_params, "dba_normalize")) - dba_obj <- do.call(dba.normalize, norm_args) - norm_out <- data.frame(sample_id = samplesheet$SampleID, size_factor = size_factors, stringsAsFactors = FALSE) - write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + if (use_spikein) { + if (!"spikein_scale_factor" %in% colnames(records)) { + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, "missing_spikein_scale_factor", "Missing spikein_scale_factor column in records; required when --differential_use_spikein is enabled.") } + factors_raw <- records$spikein_scale_factor + factors_raw_chr <- trimws(as.character(factors_raw)) + factors <- suppressWarnings(as.numeric(factors_raw_chr)) + missing_idx <- which(is.na(factors) | is.na(factors_raw) | factors_raw_chr == "") + if (length(missing_idx) > 0) { + missing_samples <- unique(records$sample_id[missing_idx]) + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, "missing_spikein_scale_factor", sprintf("Missing spike-in scale factors for samples: %s", paste(missing_samples, collapse = ", "))) + } + size_factors <- ifelse(factors == 0, 1, 1 / factors) + names(size_factors) <- records$sample_id + size_factors <- size_factors[samplesheet$SampleID] + norm_args <- utils::modifyList(list(DBA = dba_obj, normalize = DBA_NORM_LIB, library = size_factors), extra_params_for(extra_params, "dba_normalize")) + dba_obj <- do.call(dba.normalize, norm_args) + norm_out <- data.frame(sample_id = samplesheet$SampleID, size_factor = size_factors, stringsAsFactors = FALSE) + write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) } else if (!is.null(norm_method) && norm_method != "native") { norm_args <- utils::modifyList(list(DBA = dba_obj, method = norm_method), extra_params_for(extra_params, "dba_normalize")) dba_obj <- do.call(dba.normalize, norm_args) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index acda5347..8597dcd1 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -59,7 +59,7 @@ class WorkflowMain { if (entry) { return entry?.toString()?.toUpperCase() == 'DIFFERENTIAL_ONLY' } - return params?.differential_from_run ? true : false + return false } // From a3e2ed079cdf70b4bd1b00ecb7e60c14594f81a2 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:00:18 -0800 Subject: [PATCH 33/52] Align ChIPBinner log2FC pseudocount --- bin/chipbinner_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index afb382a1..df6bbd72 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -708,7 +708,7 @@ def main(): control_idx = [i for i, row in enumerate(samples) if row.get("condition") == control_label] treated_mean = norm_vals[:, treated_idx].mean(axis=1) control_mean = norm_vals[:, control_idx].mean(axis=1) - log2fc = np.log2(treated_mean + 1) - np.log2(control_mean + 1) + log2fc = np.log2(treated_mean + args.pseudocount) - np.log2(control_mean + args.pseudocount) diff_df = pd.DataFrame({ "chrom": windows_df["chrom"], From b8d24df0772e75eeefbec0488821d5ee56e19ae7 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:12:46 -0800 Subject: [PATCH 34/52] Apply ChIPBinner pseudocount after CPM --- bin/chipbinner_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index df6bbd72..aec3bfb4 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -279,10 +279,10 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): "applied_steps": ",".join(steps), }) - counts = counts + pseudocount library_size = counts.sum(axis=0) library_size[library_size == 0] = 1.0 normalized = counts.divide(library_size, axis=1) * 1e6 + normalized = normalized + pseudocount return normalized, pd.DataFrame(factors) From ce0ce4edf84c9ea7ecc88521720c5cc159001a1f Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:23:57 -0800 Subject: [PATCH 35/52] Require chrom sizes for SPAN auto and fix log2FC --- bin/chipbinner_run.py | 2 +- workflows/differential_only.nf | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index aec3bfb4..633978da 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -708,7 +708,7 @@ def main(): control_idx = [i for i, row in enumerate(samples) if row.get("condition") == control_label] treated_mean = norm_vals[:, treated_idx].mean(axis=1) control_mean = norm_vals[:, control_idx].mean(axis=1) - log2fc = np.log2(treated_mean + args.pseudocount) - np.log2(control_mean + args.pseudocount) + log2fc = np.log2(treated_mean) - np.log2(control_mean) diff_df = pd.DataFrame({ "chrom": windows_df["chrom"], diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index d33eaefa..dacb67d5 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -128,15 +128,21 @@ workflow DIFFERENTIAL_ONLY { } ch_chrom_sizes = Channel.empty() - def need_real_chrom_sizes = (params.run_span_diff && params.span_diff_mode == 'native') || (params.run_chipbinner && !use_cached_windows) + def span_requires_sizes = params.run_span_diff && ['native', 'auto'].contains(params.span_diff_mode) + def chipbinner_requires_sizes = params.run_chipbinner && !use_cached_windows + def need_real_chrom_sizes = span_requires_sizes || chipbinner_requires_sizes if (need_real_chrom_sizes) { if (!params.fasta) { def reasons = [] - if (params.run_chipbinner && !use_cached_windows) { + if (chipbinner_requires_sizes) { reasons << "cached ChIPBinner windows not found" } - if (params.run_span_diff && params.span_diff_mode == 'native') { - reasons << "SPAN native requires chrom sizes" + if (span_requires_sizes) { + if (params.span_diff_mode == 'auto') { + reasons << "SPAN auto can select native compare and requires chrom sizes (set --span_diff_mode fallback to run without FASTA)" + } else { + reasons << "SPAN native requires chrom sizes" + } } exit 1, "--fasta is required for differential-only mode (${reasons.join('; ')})" } From 674cdb0f511f03bdbe7660904df69a44e7d443ca Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:33:17 -0800 Subject: [PATCH 36/52] Pass chrom sizes path correctly --- subworkflows/local/differential_peak_calling.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 148138cf..58dda815 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -407,7 +407,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { def chipbinner_windows_dir = params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '' def chipbinner_blacklist = params.blacklist ? file(params.blacklist).toString() : '' CHIPBINNER_WINDOWS_CACHE ( - ch_chrom_sizes.collect().map { it[0] }, + ch_chrom_sizes.collect().map { it instanceof List ? it[0] : it }, params.chipbinner_bin_size, chipbinner_windows_dir, chipbinner_blacklist @@ -438,7 +438,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .map { group, caller, records_file -> [group, records_file] } ch_chip_inputs = ch_chip_records_file - .combine(ch_chrom_sizes.collect().map { it[0] }) + .combine(ch_chrom_sizes.collect().map { it instanceof List ? it[0] : it }) .combine(ch_chipbinner_windows) .map { record, chrom_sizes, windows -> [ record[0], record[1], chrom_sizes, windows ] } @@ -492,7 +492,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (params.span_diff_mode == 'fallback') { span_chrom_sizes = ch_chrom_sizes.ifEmpty(file("$projectDir/assets/chrom_sizes_stub.sizes")) } - ch_span_chrom_sizes_single = span_chrom_sizes.collect().map { it[0] } + ch_span_chrom_sizes_single = span_chrom_sizes.collect().map { it instanceof List ? it[0] : it } ch_span_inputs = ch_span_design .combine(ch_samples_manifest_single) From 1dc541f360ad7502591cedcae5b4d29a23ef52a7 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:47:02 -0800 Subject: [PATCH 37/52] Default posthoc chipbinner windows cache --- workflows/differential_only.nf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index dacb67d5..e863a6ce 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -116,6 +116,11 @@ workflow DIFFERENTIAL_ONLY { ch_gtf = Channel.from( file(params.gtf) ) } + def default_windows_dir = "${manifest_dir}/chipbinner_windows_cache" + if (!params.chipbinner_windows_dir && file(default_windows_dir).exists()) { + params.chipbinner_windows_dir = default_windows_dir + } + def cached_windows = resolveCachedWindowsPath( params.chipbinner_windows_dir, params.chipbinner_bin_size, From 2a00b69da835044f14e6baebd0474357066546de Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 23:55:58 -0800 Subject: [PATCH 38/52] Gunzip FASTA before getchromsizes in posthoc --- workflows/differential_only.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index e863a6ce..4dc8e069 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -4,6 +4,7 @@ include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" include { CUSTOM_GETCHROMSIZES } from "../modules/nf-core/custom/getchromsizes/main" +include { GUNZIP as GUNZIP_FASTA } from "../modules/nf-core/gunzip/main" import java.security.MessageDigest @@ -151,7 +152,11 @@ workflow DIFFERENTIAL_ONLY { } exit 1, "--fasta is required for differential-only mode (${reasons.join('; ')})" } - ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) + if (params.fasta.endsWith(".gz")) { + ch_fasta = GUNZIP_FASTA ( [ [id: 'genome'], params.fasta ] ).gunzip + } else { + ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) + } CUSTOM_GETCHROMSIZES ( ch_fasta ) ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } } else if (params.run_chipbinner || params.run_span_diff) { From c10dc6bcb1dc1fc8eda3363dbd07a924c719a982 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 00:04:08 -0800 Subject: [PATCH 39/52] Clarify spike-in scaling direction --- bin/chipbinner_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 633978da..8ac6507a 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -254,7 +254,8 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): if spike == 0: raise RuntimeError("Spike-in scale factor cannot be zero") spike_size = 1.0 / spike - counts.iloc[:, idx] = counts.iloc[:, idx] / spike_size + # Apply the scale factor directly (equivalent to dividing by size factor). + counts.iloc[:, idx] = counts.iloc[:, idx] * spike steps.append("spikein") ms_raw = row.get("ms_coeff") From e6739dfa98869a6e86c569de60dd7d0fd7283343 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 00:21:55 -0800 Subject: [PATCH 40/52] Preserve spike-in/MS scaling through CPM --- bin/chipbinner_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 8ac6507a..f1189401 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -244,6 +244,7 @@ def write_matrix(counts_df, sample_ids, out_path): def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): counts = counts_df[sample_ids].astype(float) + raw_library_size = counts.sum(axis=0) factors = [] for idx, row in enumerate(samples): steps = [] @@ -280,7 +281,7 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): "applied_steps": ",".join(steps), }) - library_size = counts.sum(axis=0) + library_size = raw_library_size library_size[library_size == 0] = 1.0 normalized = counts.divide(library_size, axis=1) * 1e6 normalized = normalized + pseudocount From 1d9d18a0c0b16bebd6f9058c4f819ec76867cca6 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 00:32:38 -0800 Subject: [PATCH 41/52] Fix manifest paths and cache fallback --- subworkflows/local/differential_peak_calling.nf | 9 +++++---- workflows/differential_only.nf | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 58dda815..46b8be67 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -54,13 +54,14 @@ workflow DIFFERENTIAL_PEAK_CALLING { def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only if (diff_enabled) { def use_manifests = (mode == 'posthoc') - def diff_outdir = "${params.outdir}/03_peak_calling/08_differential" + def outdir_root = file(params.outdir).toString() + def diff_outdir = "${outdir_root}/03_peak_calling/08_differential" def bam_subdir = params.run_remove_linear_dups ? 'linear_dedup' : (params.run_remove_dups ? 'dedup' : (params.run_mark_dups ? 'markdup' : '')) - def bam_dir = bam_subdir ? "${params.outdir}/02_alignment/${params.aligner}/target/${bam_subdir}" : "${params.outdir}/02_alignment/${params.aligner}/target" + def bam_dir = bam_subdir ? "${outdir_root}/02_alignment/${params.aligner}/target/${bam_subdir}" : "${outdir_root}/02_alignment/${params.aligner}/target" - def peaks_dir = { caller -> "${params.outdir}/03_peak_calling/04_called_peaks/${caller}" } - def bigwig_dir = "${params.outdir}/03_peak_calling/03_bed_to_bigwig" + def peaks_dir = { caller -> "${outdir_root}/03_peak_calling/04_called_peaks/${caller}" } + def bigwig_dir = "${outdir_root}/03_peak_calling/03_bed_to_bigwig" def peaks_format = { file -> def name = file.getName().toLowerCase() diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 4dc8e069..50d37276 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -118,8 +118,10 @@ workflow DIFFERENTIAL_ONLY { } def default_windows_dir = "${manifest_dir}/chipbinner_windows_cache" + def default_windows_set = false if (!params.chipbinner_windows_dir && file(default_windows_dir).exists()) { params.chipbinner_windows_dir = default_windows_dir + default_windows_set = true } def cached_windows = resolveCachedWindowsPath( @@ -131,6 +133,8 @@ workflow DIFFERENTIAL_ONLY { def use_cached_windows = cached_windows != null if (use_cached_windows) { params.chipbinner_windows_dir = cached_windows + } else if (default_windows_set) { + params.chipbinner_windows_dir = '' } ch_chrom_sizes = Channel.empty() From c86079c497be2c2278c93e83ce41ebe5733fb742 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 00:45:57 -0800 Subject: [PATCH 42/52] Handle remote gene_bed in differential-only --- workflows/differential_only.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 50d37276..97a7c9b2 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -5,6 +5,7 @@ include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" include { CUSTOM_GETCHROMSIZES } from "../modules/nf-core/custom/getchromsizes/main" include { GUNZIP as GUNZIP_FASTA } from "../modules/nf-core/gunzip/main" +include { GUNZIP as GUNZIP_GENE_BED } from "../modules/nf-core/gunzip/main" import java.security.MessageDigest @@ -107,7 +108,11 @@ workflow DIFFERENTIAL_ONLY { ch_gene_bed = Channel.empty() if (annotation_enabled && params.gene_bed) { - ch_gene_bed = Channel.fromPath(params.gene_bed, checkIfExists: true) + if (params.gene_bed.endsWith(".gz")) { + ch_gene_bed = GUNZIP_GENE_BED ( [ [id: 'gene_bed'], params.gene_bed ] ).gunzip.map { it[1] } + } else { + ch_gene_bed = Channel.from( file(params.gene_bed) ) + } } else if (annotation_enabled && file(cached_gene_bed).exists()) { ch_gene_bed = Channel.fromPath(cached_gene_bed, checkIfExists: true) } From 18ae889f70a97d48b12a39ea24ba60ec1a299df6 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 01:20:29 -0800 Subject: [PATCH 43/52] Clamp HDBSCAN grid to dataset size --- bin/chipbinner_run.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index f1189401..3c9a7e9d 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -687,8 +687,19 @@ def main(): plot_pca(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, samples, os.path.join(plots_dir, "chipbinner.pca.png")) plot_correlation(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, os.path.join(plots_dir, "chipbinner.correlation.png")) - minpts_list = parse_grid(args.grid_minpts, [100, 200, 500, 1000]) - minsamps_list = parse_grid(args.grid_minsamps, [100, 200, 500, 1000]) + n_bins = len(windows_df) + if n_bins < 2: + raise RuntimeError("Not enough bins for HDBSCAN clustering (n_bins < 2)") + + def clamp_grid(values, n_bins, min_value): + vals = sorted({v for v in values if v <= n_bins}) + if not vals: + fallback = min(n_bins, max(min_value, min(10, n_bins))) + vals = [fallback] + return vals + + minpts_list = clamp_grid(parse_grid(args.grid_minpts, [100, 200, 500, 1000]), n_bins, 2) + minsamps_list = clamp_grid(parse_grid(args.grid_minsamps, [100, 200, 500, 1000]), n_bins, 1) grid_dir = os.path.join(args.outdir, "hdbscan_grid") grid_summary_path = os.path.join(args.outdir, "chipbinner.hdbscan_grid_summary.tsv") chosen, grid_summary_path = hdbscan_grid(norm_matrix.drop(columns=["bin_id"]), windows_df, minpts_list, minsamps_list, grid_dir, grid_summary_path) From bc423f2cb0ac102cedfacb976b22541830c8e305 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 01:40:03 -0800 Subject: [PATCH 44/52] Skip CPM when scaling applied --- bin/chipbinner_run.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 3c9a7e9d..a9e7eae1 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -244,10 +244,11 @@ def write_matrix(counts_df, sample_ids, out_path): def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): counts = counts_df[sample_ids].astype(float) - raw_library_size = counts.sum(axis=0) factors = [] + scaling_applied = [] for idx, row in enumerate(samples): steps = [] + applied = False spike_raw = row.get("spikein_scale_factor") spike = parse_float(spike_raw) if use_spikein else None spike_size = None @@ -258,6 +259,7 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): # Apply the scale factor directly (equivalent to dividing by size factor). counts.iloc[:, idx] = counts.iloc[:, idx] * spike steps.append("spikein") + applied = True ms_raw = row.get("ms_coeff") ms_coeff = parse_float(ms_raw) @@ -268,8 +270,8 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): ms_size = 1.0 / ms_coeff counts.iloc[:, idx] = counts.iloc[:, idx] / ms_size steps.append("ms_coeff") + applied = True - steps.extend(["pseudocount", "library_size_cpm"]) factors.append({ "sample_id": row.get("sample_id", ""), "group": row.get("group", ""), @@ -278,13 +280,26 @@ def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): "spikein_size_factor": spike_size if spike_size is not None else "NA", "ms_coeff": ms_coeff if ms_coeff is not None else "NA", "ms_size_factor": ms_size if ms_size is not None else "NA", - "applied_steps": ",".join(steps), + "applied_steps": steps, }) + scaling_applied.append(applied) - library_size = raw_library_size - library_size[library_size == 0] = 1.0 - normalized = counts.divide(library_size, axis=1) * 1e6 + apply_cpm = not any(scaling_applied) + if apply_cpm: + library_size = counts.sum(axis=0) + library_size[library_size == 0] = 1.0 + normalized = counts.divide(library_size, axis=1) * 1e6 + else: + normalized = counts.copy() normalized = normalized + pseudocount + + for entry in factors: + steps = entry.get("applied_steps", []) + steps.append("pseudocount") + if apply_cpm: + steps.append("library_size_cpm") + entry["applied_steps"] = ",".join(steps) + return normalized, pd.DataFrame(factors) From 687d16dd257983f2cba942ad0c44fc11847449db Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 01:52:53 -0800 Subject: [PATCH 45/52] Relax cached windows blacklist matching --- workflows/differential_only.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index 97a7c9b2..d0280574 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -51,7 +51,8 @@ def resolveCachedWindowsPath(windowsDir, binSize, genomeId, blacklistPath) { metaRow[col] = idx < values.size() ? values[idx] : "" } def genome_ok = !genomeId || metaRow["genome_id"] == genomeId - if (genome_ok && metaRow["bin_size"] == binSize.toString() && metaRow["blacklist_hash"] == blacklistHash) { + def hash_ok = !blacklistPath || metaRow["blacklist_hash"] == blacklistHash + if (genome_ok && metaRow["bin_size"] == binSize.toString() && hash_ok) { def candidate = file("${windowsDir}/${metaRow['windows_path']}") if (candidate.exists()) { return candidate.toString() From d3d4c9eb651ec23f0953a7238e3b3fa2df0bff7c Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 02:07:47 -0800 Subject: [PATCH 46/52] Require spike-in factors when enabled --- bin/chipbinner_run.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index a9e7eae1..47c1623f 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -244,6 +244,14 @@ def write_matrix(counts_df, sample_ids, out_path): def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): counts = counts_df[sample_ids].astype(float) + if use_spikein: + missing = [] + for row in samples: + spike_raw = row.get("spikein_scale_factor") + if parse_float(spike_raw) is None: + missing.append(row.get("sample_id", "")) + if missing: + raise RuntimeError(f"Missing spike-in scale factors for samples: {', '.join(sorted(set(missing)))}") factors = [] scaling_applied = [] for idx, row in enumerate(samples): From dd781b63389f2ad140885514e37aa4daf927edef Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 02:21:19 -0800 Subject: [PATCH 47/52] Allow partial on windows lookup --- bin/chipbinner_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index 47c1623f..d41c40f8 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -676,11 +676,11 @@ def main(): f"chipbinner_use_input requested but input BAMs are missing ({'; '.join(details) or 'no input_bam values'})." ) - windows_source = choose_windows_file( - args.windows, args.windows_dir, args.bin_size, args.chrom_sizes, args.blacklist - ) windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") try: + windows_source = choose_windows_file( + args.windows, args.windows_dir, args.bin_size, args.chrom_sizes, args.blacklist + ) if windows_source: if args.blacklist and os.path.exists(args.blacklist): with tempfile.TemporaryDirectory() as tmpdir: From 4725d675271ebaa0bbc46e0b2da3706e53a71c8b Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 02:32:32 -0800 Subject: [PATCH 48/52] Derive spike-in usage from manifests --- .../local/differential_peak_calling.nf | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 46b8be67..f31ac7de 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -324,7 +324,18 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_samples_rows_contrast = ch_samples_rows.filter { row -> contrast_labels.contains(row.condition) } def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' - def use_spikein = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') + def use_spikein_default = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') + def ch_use_spikein = Channel.value(use_spikein_default) + if (use_manifests && diff_use_spikein == 'auto') { + ch_use_spikein = ch_samples_rows + .map { row -> row.normalisation_mode ?: '' } + .collect() + .map { modes -> + def uniq = modes.findAll { it }.collect { it.toString() }.unique() + uniq.contains('Spikein') + } + } + ch_use_spikein = ch_use_spikein.broadcast() if (params.run_diffbind && !params.differential_publish_manifest_only) { ch_diffbind_design = ch_design_rows @@ -371,7 +382,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { DIFFBIND_RUN ( RECORDS_TO_TSV_DIFFBIND.out.tsv, params.differential_contrast, - use_spikein, + ch_use_spikein, params.diffbind_fdr, params.diffbind_lfc, params.diffbind_min_overlap, @@ -450,7 +461,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { '', '', params.chipbinner_use_input, - use_spikein, + ch_use_spikein, params.chipbinner_pseudocount, params.chipbinner_hdbscan_grid_minpts, params.chipbinner_hdbscan_grid_minsamps, @@ -509,7 +520,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { params.span_diff_gap, params.span_diff_bin, params.span_fallback_backend, - use_spikein, + ch_use_spikein, params.differential_allow_partial, span_caller_priority, params.omnipeaks_jar ? file(params.omnipeaks_jar) : null, From 865bd9b34489f860f2cf942a54afcf68be7dd05f Mon Sep 17 00:00:00 2001 From: dhusmann Date: Fri, 9 Jan 2026 16:34:06 -0800 Subject: [PATCH 49/52] Fix differential summary normalization and SPAN mode validation --- assets/multiqc/chipbinner_summary_header.txt | 10 ++++++++++ assets/multiqc/span_summary_header.txt | 10 ++++++++++ assets/multiqc_differential_config.yml | 7 +++++-- bin/differential_summary.py | 11 +++++++++-- bin/span_diff.py | 14 ++++++++++++-- modules/local/differential_summary.nf | 8 ++++++++ subworkflows/local/differential_peak_calling.nf | 9 ++++++--- 7 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 assets/multiqc/chipbinner_summary_header.txt create mode 100644 assets/multiqc/span_summary_header.txt diff --git a/assets/multiqc/chipbinner_summary_header.txt b/assets/multiqc/chipbinner_summary_header.txt new file mode 100644 index 00000000..e8adddb9 --- /dev/null +++ b/assets/multiqc/chipbinner_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'chipbinner_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'ChIPBinner summary' +#description: 'Per-group ChIPBinner summary metrics.' +#plot_type: 'table' +#pconfig: +# id: 'chipbinner_summary_table' +# title: 'ChIPBinner summary' diff --git a/assets/multiqc/span_summary_header.txt b/assets/multiqc/span_summary_header.txt new file mode 100644 index 00000000..06d7fd67 --- /dev/null +++ b/assets/multiqc/span_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'span_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'SPAN summary' +#description: 'Per-group SPAN summary metrics.' +#plot_type: 'table' +#pconfig: +# id: 'span_summary_table' +# title: 'SPAN summary' diff --git a/assets/multiqc_differential_config.yml b/assets/multiqc_differential_config.yml index 2f93a69e..7a2e886c 100644 --- a/assets/multiqc_differential_config.yml +++ b/assets/multiqc_differential_config.yml @@ -1,6 +1,7 @@ report_comment: > - Differential analysis report generated by nf-core/cutandrun. Method-specific - metrics (e.g., ChIPBinner clustering parameters) are included in the summary table. + Differential analysis report generated by nf-core/cutandrun. The summary table + includes method-specific metrics, with dedicated ChIPBinner/SPAN sections for + detailed per-method summaries. run_modules: - custom_content @@ -9,6 +10,8 @@ custom_content: order: - differential_design - differential_summary + - chipbinner_summary + - span_summary sp: custom_content: diff --git a/bin/differential_summary.py b/bin/differential_summary.py index e63c1acc..0d7a5749 100755 --- a/bin/differential_summary.py +++ b/bin/differential_summary.py @@ -9,6 +9,13 @@ def read_tsv(path): reader = csv.DictReader(handle, delimiter="\t") return [row for row in reader] +def normalize_status(status): + if not status: + return status + if status.upper() == "SKIPPED": + return "SKIP" + return status + def main(): parser = argparse.ArgumentParser(description="Build differential summary table") @@ -26,7 +33,7 @@ def main(): caller = row.get("caller") or "NA" treated = row.get("treated_condition") or "" control = row.get("control_condition") or "" - status = row.get("status") or "" + status = normalize_status(row.get("status") or "") reason = row.get("reason") or "" if args.manifest_only and status == "RUN": status = "SKIP" @@ -114,7 +121,7 @@ def main(): "chosen_minPts": row.get("chosen_minPts") or "NA", "chosen_minSamps": row.get("chosen_minSamps") or "NA", "mode": row.get("mode") or "NA", - "status": row.get("status") or "RUN", + "status": normalize_status(row.get("status") or "RUN"), "reason": row.get("reason") or "ok", } existing = base_rows.get(key) diff --git a/bin/span_diff.py b/bin/span_diff.py index 5206569e..b039f5bb 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -482,11 +482,21 @@ def main(): use_spikein = str(args.use_spikein).lower() in {"true", "1", "yes"} allow_partial = str(args.allow_partial).lower() in {"true", "1", "yes"} + mode = str(args.mode or "").strip().lower() native_supported, signature, help_text = detect_signature(args.jar, args.java_heap) + valid_modes = {"auto", "native", "fallback"} + if mode not in valid_modes: + reason = f"invalid_mode:{mode}" if mode else "invalid_mode" + write_failure_outputs(args.outdir, args.group, treated, control, mode or "invalid", signature, reason, allow_partial) + if allow_partial: + sys.exit(0) + print("ERROR: --mode must be one of auto, native, or fallback.", file=sys.stderr) + sys.exit(1) + chosen_mode = "fallback" - if args.mode == "native": + if mode == "native": if not native_supported: reason = "native_not_supported" write_failure_outputs(args.outdir, args.group, treated, control, "native", signature, reason, allow_partial) @@ -495,7 +505,7 @@ def main(): print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) sys.exit(1) chosen_mode = "native" - elif args.mode == "fallback": + elif mode == "fallback": chosen_mode = "fallback" else: chosen_mode = "native" if native_supported else "fallback" diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 3ac5fcd3..5ee15e35 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -10,10 +10,14 @@ process DIFFERENTIAL_SUMMARY { val manifest_only path summary_header path design_header + path chipbinner_header + path span_header output: path "differential_summary_mqc.tsv", emit: summary path "differential_design_mqc.tsv", emit: design + path "chipbinner_summary_mqc.tsv", emit: chipbinner_summary + path "span_summary_mqc.tsv", emit: span_summary path "versions.yml", emit: versions when: @@ -31,6 +35,10 @@ process DIFFERENTIAL_SUMMARY { cat ${summary_header} differential_summary.tsv > differential_summary_mqc.tsv cat ${design_header} ${design_manifest} > differential_design_mqc.tsv + awk 'BEGIN{FS=OFS="\\t"} NR==1{print; next} \$1=="chipbinner"' differential_summary.tsv > chipbinner_summary.tsv + cat ${chipbinner_header} chipbinner_summary.tsv > chipbinner_summary_mqc.tsv + awk 'BEGIN{FS=OFS="\\t"} NR==1{print; next} \$1=="span"' differential_summary.tsv > span_summary.tsv + cat ${span_header} span_summary.tsv > span_summary_mqc.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index f31ac7de..1a2f17c1 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -49,6 +49,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_summary_files = Channel.empty() ch_samples_manifest = Channel.empty() ch_peaks_manifest = Channel.empty() + ch_use_spikein = Channel.empty() summary_stub = file("$projectDir/assets/differential_summary_stub.tsv") def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only @@ -325,7 +326,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' def use_spikein_default = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') - def ch_use_spikein = Channel.value(use_spikein_default) + ch_use_spikein = Channel.value(use_spikein_default) if (use_manifests && diff_use_spikein == 'auto') { ch_use_spikein = ch_samples_rows .map { row -> row.normalisation_mode ?: '' } @@ -500,7 +501,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_samples_manifest_single = ch_samples_manifest.first() ch_peaks_manifest_single = ch_peaks_manifest.first() - def span_chrom_sizes = ch_chrom_sizes + span_chrom_sizes = ch_chrom_sizes if (params.span_diff_mode == 'fallback') { span_chrom_sizes = ch_chrom_sizes.ifEmpty(file("$projectDir/assets/chrom_sizes_stub.sizes")) } @@ -566,7 +567,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_summary_files.collect().map { it + summary_stub }.ifEmpty([summary_stub]), params.differential_publish_manifest_only, file("$projectDir/assets/multiqc/differential_summary_header.txt"), - file("$projectDir/assets/multiqc/differential_design_header.txt") + file("$projectDir/assets/multiqc/differential_design_header.txt"), + file("$projectDir/assets/multiqc/chipbinner_summary_header.txt"), + file("$projectDir/assets/multiqc/span_summary_header.txt") ) ch_versions = ch_versions.mix(DIFFERENTIAL_SUMMARY.out.versions) ch_diff_summary_mqc = DIFFERENTIAL_SUMMARY.out.summary From b3341a69df9f36d64fb055b0d444f7b13a2164f0 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 12 Jan 2026 03:09:51 -0800 Subject: [PATCH 50/52] Update differential workflow outputs and tests --- bin/chipbinner_run.py | 19 +++ bin/differential_design.py | 17 +- bin/span_diff.py | 146 ++++++++++++++++-- conf/modules.config | 16 +- modules/local/annotate_regions.nf | 50 +++++- modules/local/cache_differential_gene_bed.nf | 4 +- modules/local/chipbinner_run.nf | 42 ++++- modules/local/chipbinner_windows_cache.nf | 38 ++++- modules/local/diffbind_run.nf | 13 +- modules/local/export_diffbind_samplesheets.nf | 17 ++ modules/local/for_patch/bedtools/sort/main.nf | 5 +- modules/local/gtf_to_gene_bed.nf | 8 +- modules/local/rename_summary.nf | 14 ++ modules/local/span_diff_run.nf | 14 +- .../local/differential_peak_calling.nf | 85 ++++++---- tests/assets/span_mode/control.bam | Bin 0 -> 188 bytes tests/assets/span_mode/control.bam.bai | Bin 0 -> 96 bytes tests/assets/span_mode/peaks.tsv | 2 + tests/assets/span_mode/s1.peaks.bed | 2 + tests/assets/span_mode/s2.peaks.bed | 2 + tests/assets/span_mode/samples.tsv | 2 + tests/assets/span_mode/treatment.bam | Bin 0 -> 188 bytes tests/assets/span_mode/treatment.bam.bai | Bin 0 -> 96 bytes tests/test_07_differential.yml | 21 ++- 24 files changed, 440 insertions(+), 77 deletions(-) create mode 100644 modules/local/export_diffbind_samplesheets.nf create mode 100644 modules/local/rename_summary.nf create mode 100644 tests/assets/span_mode/control.bam create mode 100644 tests/assets/span_mode/control.bam.bai create mode 100644 tests/assets/span_mode/s1.peaks.bed create mode 100644 tests/assets/span_mode/s2.peaks.bed create mode 100644 tests/assets/span_mode/treatment.bam create mode 100644 tests/assets/span_mode/treatment.bam.bai diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py index d41c40f8..a269c2d4 100755 --- a/bin/chipbinner_run.py +++ b/bin/chipbinner_run.py @@ -51,6 +51,7 @@ def parse_args(): parser.add_argument("--k-value", type=int, default=100000) parser.add_argument("--functional-db") parser.add_argument("--allow-partial", action="store_true") + parser.add_argument("--base-dir") parser.add_argument("--outdir", default=".") return parser.parse_args() @@ -611,6 +612,18 @@ def write_error_artifact(outdir, message): return error_path +def resolve_path(path, base_dir): + if not path or path in ("NA", "None", "none", ""): + return path + if os.path.isabs(path): + return path + if base_dir: + candidate = os.path.join(base_dir, path) + if os.path.exists(candidate): + return candidate + return path + + def main(): args = parse_args() ensure_dir(args.outdir) @@ -630,6 +643,12 @@ def main(): if not samples: raise RuntimeError("No samples match contrast labels") + base_dir = os.path.abspath(args.base_dir) if args.base_dir else "" + if base_dir: + for row in samples: + for key in ("final_bam", "final_bai", "bigwig_path", "input_bam", "input_bai"): + row[key] = resolve_path(row.get(key), base_dir) + sample_ids = [row.get("sample_id", "") for row in samples] samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") diff --git a/bin/differential_design.py b/bin/differential_design.py index 89f64ede..4e9d683c 100755 --- a/bin/differential_design.py +++ b/bin/differential_design.py @@ -81,11 +81,16 @@ def main(): all_callers = sorted({row.get("caller") or "" for row in peaks}) + span_callers_present = True if args.run_span: span_callers = [caller for caller in all_callers if is_span_caller(caller)] - if not span_callers: - print("ERROR: run_span_diff requested but no SPAN/OmniPeak peaks found in manifest.", file=sys.stderr) - sys.exit(1) + span_callers_present = bool(span_callers) + if not span_callers_present: + print( + "WARNING: run_span_diff requested but no SPAN/OmniPeak peaks found in manifest; " + "SPAN differential will be skipped.", + file=sys.stderr, + ) design_rows = [] fail_required = False @@ -130,7 +135,7 @@ def main(): status = base_status reason = base_reason eligible_chipbinner = args.run_chipbinner and group_eligible - eligible_span = args.run_span and group_eligible + eligible_span = args.run_span and span_callers_present and group_eligible if not (args.run_chipbinner or args.run_span): status = "SKIP" reason = "methods_disabled" @@ -196,7 +201,9 @@ def main(): "n_control": str(n_control), "eligible_diffbind": "true" if eligible_diffbind else "false", "eligible_chipbinner": "true" if (args.run_chipbinner and group_eligible) else "false", - "eligible_span": "true" if (args.run_span and group_eligible) else "false", + "eligible_span": "true" + if (args.run_span and span_callers_present and group_eligible) + else "false", "status": status, "reason": reason, "ignored_conditions": ignored_conditions, diff --git a/bin/span_diff.py b/bin/span_diff.py index b039f5bb..55f75331 100755 --- a/bin/span_diff.py +++ b/bin/span_diff.py @@ -339,6 +339,9 @@ def write_failure_outputs( signature: str, reason: str, allow_partial: bool, + samples: List[Dict[str, str]] = None, + use_spikein: bool = False, + backend: str = "NA", ) -> None: diff_path = os.path.join(outdir, "span.differential.tsv") bed_path = os.path.join(outdir, "span.differential.peaks.bed") @@ -346,6 +349,7 @@ def write_failure_outputs( down_path = os.path.join(outdir, "span.down.bed") summary_path = os.path.join(outdir, "span.summary.tsv") mode_path = os.path.join(outdir, "span.mode.txt") + normalization_path = os.path.join(outdir, "span.normalization_factors.tsv") with open(diff_path, "w") as handle: handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\n") @@ -355,6 +359,22 @@ def write_failure_outputs( status = "SKIP" if allow_partial else "FAIL" write_summary(summary_path, group, treated, control, 0, 0, 0, 0, mode, status, reason) write_mode(mode_path, mode, signature) + if samples is None: + samples = [] + if samples: + scale_values, size_factors, use_spikein_factors = compute_spikein_factors(samples, use_spikein) + else: + scale_values = [] + size_factors = [] + use_spikein_factors = False + write_normalization_factors( + normalization_path, + samples, + scale_values, + size_factors, + use_spikein_factors, + backend, + ) def compute_orientation_log2fc( @@ -489,7 +509,19 @@ def main(): valid_modes = {"auto", "native", "fallback"} if mode not in valid_modes: reason = f"invalid_mode:{mode}" if mode else "invalid_mode" - write_failure_outputs(args.outdir, args.group, treated, control, mode or "invalid", signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + mode or "invalid", + signature, + reason, + allow_partial, + samples=None, + use_spikein=use_spikein, + backend=mode or "invalid", + ) if allow_partial: sys.exit(0) print("ERROR: --mode must be one of auto, native, or fallback.", file=sys.stderr) @@ -499,7 +531,19 @@ def main(): if mode == "native": if not native_supported: reason = "native_not_supported" - write_failure_outputs(args.outdir, args.group, treated, control, "native", signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + "native", + signature, + reason, + allow_partial, + samples=None, + use_spikein=use_spikein, + backend="native", + ) if allow_partial: sys.exit(0) print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) @@ -516,7 +560,19 @@ def main(): samples_group = [row for row in samples if row.get("group") == args.group and row.get("condition") in {treated, control}] if not samples_group: reason = "no_samples" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print("ERROR: No samples found for group.", file=sys.stderr) @@ -526,7 +582,19 @@ def main(): span_peaks = [row for row in peaks_group if is_span_caller(row.get("caller"))] if not span_peaks: reason = "no_span_peaks" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print("ERROR: No SPAN/OmniPeak peaks found for group.", file=sys.stderr) @@ -558,7 +626,19 @@ def main(): missing_peaks = [row.get("sample_id") for row in samples_group if row.get("sample_id") not in peaks_by_sample] if missing_peaks: reason = "missing_peaks" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print(f"ERROR: Missing peaks for samples: {','.join(missing_peaks)}", file=sys.stderr) @@ -569,7 +649,19 @@ def main(): control_samples = [row for row in samples_sorted if row.get("condition") == control] if not treated_samples or not control_samples: reason = "missing_condition" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print("ERROR: Missing treated/control samples.", file=sys.stderr) @@ -612,7 +704,19 @@ def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", if chosen_mode == "native": if not native_supported: reason = "native_not_supported" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print("ERROR: Native SPAN compare not supported.", file=sys.stderr) @@ -630,7 +734,19 @@ def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", if not args.chrom_sizes or not os.path.exists(args.chrom_sizes): reason = "missing_chrom_sizes" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print("ERROR: Chrom sizes file required for native SPAN compare.", file=sys.stderr) @@ -884,7 +1000,19 @@ def pool_bams(condition: str, bams: List[str]) -> str: finalize_success(rows, "fallback") except Exception as exc: reason = str(exc).split("\n")[0] if exc else "failed" - write_failure_outputs(args.outdir, args.group, treated, control, chosen_mode, signature, reason, allow_partial) + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted if "samples_sorted" in locals() else None, + use_spikein=use_spikein, + backend=chosen_mode, + ) if allow_partial: sys.exit(0) print(f"ERROR: SPAN diff failed: {exc}", file=sys.stderr) diff --git a/conf/modules.config b/conf/modules.config index f7ecca49..fe9690f1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1201,20 +1201,26 @@ if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || param mode: "${params.publish_dir_mode}", saveAs: { filename -> if (filename.equals('versions.yml')) return null - if (filename.equals('diffbind.samplesheet.csv')) return null + if (filename.equals('diffbind.samplesheet.export.csv')) return null return filename } ], [ path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, - enabled: { params.export_diffbind_sheets }, - mode: "${params.publish_dir_mode}", - pattern: "diffbind.samplesheet.csv", - saveAs: { filename -> "${group}.csv" } + enabled: false, + mode: "${params.publish_dir_mode}" ] ] } + withName: '.*:EXPORT_DIFFBIND_SAMPLESHEETS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename } + ] + } + withName: '.*:CHIPBINNER_RUN' { publishDir = [ path: { "${params.outdir}/03_peak_calling/08_differential/02_chipbinner/${group}" }, diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf index 6074b8f9..3cd5b996 100644 --- a/modules/local/annotate_regions.nf +++ b/modules/local/annotate_regions.nf @@ -1,8 +1,8 @@ process ANNOTATE_REGIONS { label 'process_single' - conda "bioconda::bedtools=2.31.0 conda-forge::python=3.8.3" - container "quay.io/biocontainers/bedtools:2.31.0--hf5e1c6e_1" + conda "bioconda::bedtools=2.31.0" + container "quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_0" input: tuple val(method), val(group), val(caller), path(regions), path(gene_bed), val(output_name) @@ -16,10 +16,48 @@ process ANNOTATE_REGIONS { script: """ - annotate_regions.py \ - --regions ${regions} \ - --gene-bed ${gene_bed} \ - --out ${output_name} + out_dir=\$(dirname "${output_name}") + mkdir -p "\$out_dir" + first_line=\$(head -n 1 ${regions} || true) + if [[ -z "\$first_line" ]]; then + : > ${output_name} + else + if echo "\$first_line" | awk -F'\t' 'NF<3{exit 0} \$2 ~ /^[0-9]+${'$'}/ {exit 1} {exit 0}'; then + header_line="\$first_line" + tail -n +2 ${regions} > regions.data + else + header_line="" + cp ${regions} regions.data + fi + + if [[ ! -s regions.data ]]; then + if [[ -n "\$header_line" ]]; then + printf "%s\tnearest_feature_id\tnearest_gene_name\tdistance_to_feature\n" "\$header_line" > ${output_name} + else + : > ${output_name} + fi + else + bcols=\$(awk -F'\t' 'NF>0{print NF; exit}' ${gene_bed} || true) + bcols=\${bcols:-0} + awk -F'\t' -v OFS='\t' '{print \$0, NR-1}' regions.data > regions.with_idx + bedtools closest -d -a regions.with_idx -b ${gene_bed} > regions.closest + + awk -F'\t' -v OFS='\t' -v bcols="\$bcols" -v header="\$header_line" 'BEGIN{ if (header != "") print header, "nearest_feature_id", "nearest_gene_name", "distance_to_feature" } { + a_cols = NF - bcols - 1 + if (a_cols < 1) next + out = \$1 + for (i=2; i<=a_cols-1; i++) out = out OFS \$i + gene_id = "NA" + gene_name = "NA" + if (bcols >= 4) gene_id = \$(a_cols + 4) + if (bcols >= 5) gene_name = \$(a_cols + 5) + if (gene_id == "." || gene_id == "") gene_id = "NA" + if (gene_name == "." || gene_name == "") gene_name = "NA" + distance = \$(NF) + print out, gene_id, gene_name, distance + }' regions.closest > ${output_name} + fi + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/cache_differential_gene_bed.nf b/modules/local/cache_differential_gene_bed.nf index 330d0aff..239b5422 100644 --- a/modules/local/cache_differential_gene_bed.nf +++ b/modules/local/cache_differential_gene_bed.nf @@ -20,10 +20,10 @@ process CACHE_DIFFERENTIAL_GENE_BED { """ cp ${gene_bed} differential_manifest.gene_bed.bed - coreutils_version=\$(cat --version | head -n 1 | awk '{print $NF}') + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') { echo "\"${task.process}\":" - echo " coreutils: $coreutils_version" + echo " coreutils: \$coreutils_version" } > versions.yml """ } diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf index 1221c821..fa6271b6 100644 --- a/modules/local/chipbinner_run.nf +++ b/modules/local/chipbinner_run.nf @@ -2,11 +2,14 @@ process CHIPBINNER_RUN { label 'process_chipbinner' conda "conda-forge::python=3.8.3 conda-forge::numpy=1.24.4 conda-forge::pandas=2.0.3 conda-forge::scikit-learn=1.3.2 conda-forge::matplotlib=3.7.5 conda-forge::seaborn=0.12.2 conda-forge::hdbscan=0.8.33 conda-forge::scipy=1.10.1 bioconda::bedtools=2.31.1 bioconda::samtools=1.17 conda-forge::r-base=4.2.3 bioconda::bioconductor-rots=1.20.0" - container "quay.io/biocontainers/biocontainers:1.2.0--py38_0" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' input: tuple val(group), path(records), path(chrom_sizes), path(windows) val contrast + val base_dir val bin_size val windows_dir val blacklist @@ -52,6 +55,7 @@ process CHIPBINNER_RUN { def use_spikein_arg = use_spikein ? "--use-spikein" : '' def functional_arg = functional_db ? "--functional-db ${functional_db}" : '' def allow_partial_arg = allow_partial ? "--allow-partial" : '' + def base_dir_arg = base_dir ? "--base-dir ${base_dir}" : '' """ chipbinner_run.py \ --samples ${records} \ @@ -73,6 +77,7 @@ process CHIPBINNER_RUN { --k-value ${k_value} \ ${functional_arg} \ ${allow_partial_arg} \ + ${base_dir_arg} \ --outdir . cat <<-END_VERSIONS > versions.yml @@ -80,4 +85,39 @@ process CHIPBINNER_RUN { python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") END_VERSIONS """ + + stub: + """ + cat <<-END_CLUSTER > chipbinner.clusters.tsv + chrom\tstart\tend\tbin_id\tcluster + chrStub\t0\t100\tchrStub:0-100\t0 + END_CLUSTER + + cp chipbinner.clusters.tsv chipbinner.clusters.best.tsv + + cat <<-END_2CLUSTER > chipbinner.clusters.2cluster.tsv + chrom\tstart\tend\tbin_id\tcluster\tcluster_label\tcluster_mean_log2FC + chrStub\t0\t100\tchrStub:0-100\t0\ttreated_high\t1.0 + END_2CLUSTER + + cat <<-END_3CLUSTER > chipbinner.clusters.3cluster.tsv + chrom\tstart\tend\tbin_id\tcluster\tcluster_label\tcluster_mean_log2FC + chrStub\t0\t100\tchrStub:0-100\t0\tstable\t0.0 + END_3CLUSTER + + cat <<-END_NORM > chipbinner.normalization_factors.tsv + sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tms_coeff\tms_size_factor\tapplied_steps + stub_sample\t${group}\t${contrast.split(',')[0].trim()}\tNA\tNA\tNA\tNA\tpseudocount + END_NORM + + cat <<-END_SUMMARY > chipbinner.summary.tsv + group\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status + ${group}\tNA\t${contrast.split(',')[0].trim()}\t${contrast.split(',')[1].trim()}\t0\t0\t0\t0\t0\tNA\tNA\tSKIP\tstub_run\tNOT_RUN + END_SUMMARY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: "stub" + END_VERSIONS + """ } diff --git a/modules/local/chipbinner_windows_cache.nf b/modules/local/chipbinner_windows_cache.nf index 3c67fa5f..a9a1983a 100644 --- a/modules/local/chipbinner_windows_cache.nf +++ b/modules/local/chipbinner_windows_cache.nf @@ -2,7 +2,9 @@ process CHIPBINNER_WINDOWS_CACHE { label 'process_single' conda "conda-forge::python=3.8.3 bioconda::bedtools=2.31.1" - container "quay.io/biocontainers/python:3.8.3" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' input: path chrom_sizes @@ -35,4 +37,38 @@ process CHIPBINNER_WINDOWS_CACHE { bedtools: \$(bedtools --version | awk '{print \$2}') END_VERSIONS """ + + stub: + """ + genome_id=\$(basename "${chrom_sizes}") + if [[ "\$genome_id" == *.sizes ]]; then + genome_id="\${genome_id%.sizes}" + else + genome_id="\${genome_id%.*}" + fi + if [[ -z "\$genome_id" ]]; then + genome_id="genome" + fi + + blacklist_hash="none" + if [[ -n "${blacklist}" && -f "${blacklist}" ]]; then + if command -v sha256sum >/dev/null 2>&1; then + blacklist_hash=\$(sha256sum "${blacklist}" | awk '{print substr(\$1,1,12)}') + fi + fi + + out_name="windows.\${genome_id}.${bin_size}.\${blacklist_hash}.bed" + printf "chrStub\\t0\\t${bin_size}\\n" > "\${out_name}" + + { + printf "genome_id\\tbin_size\\tblacklist_hash\\twindows_source\\twindows_path\\n" + printf "%s\\t%s\\t%s\\t%s\\t%s\\n" "\$genome_id" "${bin_size}" "\$blacklist_hash" "${windows_dir ?: 'generated'}" "\${out_name}" + } > chipbinner_windows_meta.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: "stub" + bedtools: "stub" + END_VERSIONS + """ } diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf index 380cfa11..ac5ea231 100644 --- a/modules/local/diffbind_run.nf +++ b/modules/local/diffbind_run.nf @@ -2,7 +2,7 @@ process DIFFBIND_RUN { label 'process_diffbind' conda "conda-forge::r-base=4.2.3 bioconda::bioconductor-diffbind conda-forge::r-jsonlite conda-forge::r-yaml" - container "quay.io/biocontainers/bioconductor-diffbind:3.14.0--r42_0" + container "quay.io/biocontainers/bioconductor-diffbind:3.16.0--r44h77050f0_1" input: tuple val(group), val(caller), path(records) @@ -26,6 +26,7 @@ process DIFFBIND_RUN { tuple val(group), val(caller), path("diffbind.significant_down.bed", optional: true), emit: bed_down tuple val(group), val(caller), path("diffbind.summary.tsv"), emit: summary tuple val(group), val(caller), path("diffbind.samplesheet.csv"), emit: samplesheet + tuple val(group), val(caller), path("diffbind.samplesheet.export.csv"), emit: samplesheet_export tuple val(group), val(caller), path("diffbind.normalization_factors.tsv", optional: true), emit: norm_factors_out tuple val(group), val(caller), path("diffbind.dba.rds", optional: true), emit: dba tuple val(group), val(caller), path("diffbind.error.txt", optional: true), emit: error @@ -56,6 +57,8 @@ process DIFFBIND_RUN { --export_sheets ${export_sheets} \ --allow_partial ${allow_partial} + cp diffbind.samplesheet.csv diffbind.samplesheet.export.csv + cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n 1 | sed 's/.* //') @@ -70,6 +73,8 @@ SampleID,Factor,Condition,Replicate,bamReads,Peaks,PeakCaller,Tissue stub,${group},stub,1,stub.bam,stub.peak,${caller},CUTRUN EOF + cp diffbind.samplesheet.csv diffbind.samplesheet.export.csv + cat <<-EOF > diffbind.results.tsv chr\tstart\tend\tlog2FC\tpval\tFDR chr1\t1\t2\t0.0\t1.0\t1.0 @@ -90,14 +95,12 @@ stub\t1.0 EOF echo "stub" > diffbind.dba.rds + touch diffbind.error.txt echo "stub" > plots/PCA.pdf echo "stub" > plots/correlation_heatmap.pdf echo "stub" > plots/MA.pdf echo "stub" > plots/volcano.pdf - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: stub - END_VERSIONS + printf "%s\\n" "'${task.process}':" " R: stub" > versions.yml """ } diff --git a/modules/local/export_diffbind_samplesheets.nf b/modules/local/export_diffbind_samplesheets.nf new file mode 100644 index 00000000..17236093 --- /dev/null +++ b/modules/local/export_diffbind_samplesheets.nf @@ -0,0 +1,17 @@ +process EXPORT_DIFFBIND_SAMPLESHEETS { + label 'process_single' + + input: + tuple val(group), val(caller), path(samplesheet) + + output: + tuple val(group), val(caller), path("${group}.csv"), emit: sheet + + when: + task.ext.when == null || task.ext.when + + script: + """ + cp ${samplesheet} ${group}.csv + """ +} diff --git a/modules/local/for_patch/bedtools/sort/main.nf b/modules/local/for_patch/bedtools/sort/main.nf index c221a853..2dee62de 100644 --- a/modules/local/for_patch/bedtools/sort/main.nf +++ b/modules/local/for_patch/bedtools/sort/main.nf @@ -21,7 +21,10 @@ process BEDTOOLS_SORT { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: (meta.prefix ?: "${meta.id}") + if ("$intervals" == "${prefix}.${extension}") { + prefix = "${prefix}.sorted" + } def sizes = sizes ? "-g $sizes" : "" """ bedtools \\ diff --git a/modules/local/gtf_to_gene_bed.nf b/modules/local/gtf_to_gene_bed.nf index 38e9ccba..98b0e099 100644 --- a/modules/local/gtf_to_gene_bed.nf +++ b/modules/local/gtf_to_gene_bed.nf @@ -2,10 +2,10 @@ process GTF_TO_GENE_BED { tag "$gtf" label 'process_low' - conda "conda-forge::python=3.11.6" + conda "conda-forge::python=3.11" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.11.6' : - 'quay.io/biocontainers/python:3.11.6' }" + 'https://depot.galaxyproject.org/singularity/python:3.11' : + 'quay.io/biocontainers/python:3.11' }" input: path gtf @@ -25,7 +25,7 @@ process GTF_TO_GENE_BED { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version 2>&1 | awk '{print $NF}') + python: \$(python --version 2>&1 | awk '{print \$NF}') END_VERSIONS """ } diff --git a/modules/local/rename_summary.nf b/modules/local/rename_summary.nf new file mode 100644 index 00000000..ea6fcc24 --- /dev/null +++ b/modules/local/rename_summary.nf @@ -0,0 +1,14 @@ +process RENAME_SUMMARY { + label 'process_single' + + input: + tuple val(prefix), path(summary) + + output: + path "${prefix}.summary.tsv", emit: summary + + script: + """ + cp ${summary} ${prefix}.summary.tsv + """ +} diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf index 4b30de74..a28262f2 100644 --- a/modules/local/span_diff_run.nf +++ b/modules/local/span_diff_run.nf @@ -1,8 +1,10 @@ process SPAN_DIFF_RUN { - label { params.span_diff_mode == 'fallback' ? 'SPAN_FALLBACK' : 'SPAN_NATIVE' } + label params.span_diff_mode == 'fallback' ? 'SPAN_FALLBACK' : 'SPAN_NATIVE' conda "conda-forge::python=3.8.3 conda-forge::openjdk=21.0.2 bioconda::samtools=1.16.1 bioconda::bedtools=2.31.0 conda-forge::r-base=4.2.3 bioconda::bioconductor-deseq2 bioconda::bioconductor-edger" - container "quay.io/biocontainers/biocontainers:1.2.0--py38_0" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' input: tuple val(group), path(samples_manifest), path(peaks_manifest), path(chrom_sizes) @@ -25,7 +27,7 @@ process SPAN_DIFF_RUN { tuple val(group), path("span.down.bed"), emit: down tuple val(group), path("span.summary.tsv"), emit: summary tuple val(group), path("span.mode.txt"), emit: mode_out - tuple val(group), path("span_diff_target_pooling.tsv", optional: true), emit: pooling + path "span_diff_target_pooling.tsv", optional: true, emit: pooling tuple val(group), path("span.normalization_factors.tsv", optional: true), emit: norm_factors path "*.pooled.bam", optional: true, emit: pooled_bam path "*.pooled.bam.bai", optional: true, emit: pooled_bai @@ -62,9 +64,9 @@ process SPAN_DIFF_RUN { "${task.process}": python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") java: \$(java -version 2>&1 | head -n 1 | sed -e 's/"//g') - bedtools: \$(bedtools --version | sed 's/bedtools v//') - samtools: \$(samtools --version | head -n 1 | sed 's/samtools //') - R: \$(R --version | head -n 1 | sed 's/.* //') + bedtools: \$(command -v bedtools >/dev/null 2>&1 && bedtools --version | sed 's/bedtools v//' || echo "NA") + samtools: \$(command -v samtools >/dev/null 2>&1 && samtools --version | head -n 1 | sed 's/samtools //' || echo "NA") + R: \$(command -v R >/dev/null 2>&1 && R --version | head -n 1 | sed 's/.* //' || echo "NA") END_VERSIONS """ } diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 1a2f17c1..a9795141 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -10,11 +10,15 @@ include { PUBLISH_DIFFERENTIAL_MANIFESTS } from "../../modules/local/publish_dif include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/records_to_tsv" include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" +include { EXPORT_DIFFBIND_SAMPLESHEETS } from "../../modules/local/export_diffbind_samplesheets" include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" include { CHIPBINNER_WINDOWS_CACHE } from "../../modules/local/chipbinner_windows_cache" include { POOL_IGG_CONTROLS } from "../../modules/local/pool_igg_controls" include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" include { SPAN_POOLING_MANIFEST } from "../../modules/local/span_pooling_manifest" +include { RENAME_SUMMARY as RENAME_SUMMARY_DIFFBIND } from "../../modules/local/rename_summary" +include { RENAME_SUMMARY as RENAME_SUMMARY_CHIPBINNER } from "../../modules/local/rename_summary" +include { RENAME_SUMMARY as RENAME_SUMMARY_SPAN } from "../../modules/local/rename_summary" include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" include { GTF_TO_GENE_BED } from "../../modules/local/gtf_to_gene_bed" include { BEDTOOLS_SORT as ANNOTATION_GENE_BED_SORT } from "../../modules/local/for_patch/bedtools/sort/main" @@ -336,7 +340,13 @@ workflow DIFFERENTIAL_PEAK_CALLING { uniq.contains('Spikein') } } - ch_use_spikein = ch_use_spikein.broadcast() + def use_spikein_consumers = [params.run_diffbind, params.run_chipbinner, params.run_span_diff].count { it } + if (!(ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel)) { + ch_use_spikein = Channel.value(ch_use_spikein) + } + if (!params.differential_publish_manifest_only && use_spikein_consumers > 1) { + ch_use_spikein = ch_use_spikein.broadcast() + } if (params.run_diffbind && !params.differential_publish_manifest_only) { ch_diffbind_design = ch_design_rows @@ -397,19 +407,28 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) + if (params.export_diffbind_sheets) { + EXPORT_DIFFBIND_SAMPLESHEETS ( + DIFFBIND_RUN.out.samplesheet_export + ) + } + if (annotation_enabled) { ch_diffbind_annot = DIFFBIND_RUN.out.results .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def caller = tuple[1] - def regions = tuple[2] + .map { group, caller, regions, gene_bed -> [ 'diffbind', group, caller, regions, gene_bed, "01_diffbind/${caller}/${group}/diffbind.results.annotated.tsv" ] } ch_annotation_requests = ch_annotation_requests.mix(ch_diffbind_annot) } - ch_diffbind_summary = DIFFBIND_RUN.out.summary.map { it[2] } + RENAME_SUMMARY_DIFFBIND ( + DIFFBIND_RUN.out.summary.map { group, caller, summary -> + def prefix = "${caller}_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_diffbind_summary = RENAME_SUMMARY_DIFFBIND.out.summary } if (params.run_chipbinner && !params.differential_publish_manifest_only) { @@ -432,32 +451,39 @@ workflow DIFFERENTIAL_PEAK_CALLING { .map { row -> [row.group, row] } .groupTuple(by: [0]) .join(ch_chip_design) - .map { group, rows, design -> [ group, rows ] } + .map { item -> [ item[0], item[1] ] } ch_chip_records - .map { group, rows -> [ group, JsonOutput.toJson(rows) ] } + .map { item -> [ item[0], JsonOutput.toJson(item[1]) ] } .set { ch_chip_records_json } RECORDS_TO_TSV_CHIPBINNER ( - ch_chip_records_json.map { it[0] }, + ch_chip_records_json.map { item -> item[0] }, ch_chip_records_json.map { 'NA' }, - ch_chip_records_json.map { it[1] }, + ch_chip_records_json.map { item -> item[1] }, 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai', - ch_chip_records_json.map { group, json -> "chipbinner_${group}.tsv" } + ch_chip_records_json.map { item -> "chipbinner_${item[0]}.tsv" } ) ch_versions = ch_versions.mix(RECORDS_TO_TSV_CHIPBINNER.out.versions) ch_chip_records_file = RECORDS_TO_TSV_CHIPBINNER.out.tsv - .map { group, caller, records_file -> [group, records_file] } + .map { item -> [item[0], item[2]] } ch_chip_inputs = ch_chip_records_file .combine(ch_chrom_sizes.collect().map { it instanceof List ? it[0] : it }) .combine(ch_chipbinner_windows) - .map { record, chrom_sizes, windows -> [ record[0], record[1], chrom_sizes, windows ] } + .map { item -> + def group = item[0] + def records = item[1] + def chrom_sizes = item[2] + def windows = item[3] + [ group, records, chrom_sizes, windows ] + } CHIPBINNER_RUN ( ch_chip_inputs, params.differential_contrast, + workflow.launchDir, params.chipbinner_bin_size, '', '', @@ -478,15 +504,19 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (annotation_enabled) { ch_chip_annot = CHIPBINNER_RUN.out.differential .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def regions = tuple[1] + .map { group, regions, gene_bed -> [ 'chipbinner', group, 'NA', regions, gene_bed, "02_chipbinner/${group}/chipbinner.differential.annotated.tsv" ] } ch_annotation_requests = ch_annotation_requests.mix(ch_chip_annot) } - ch_chipbinner_summary = CHIPBINNER_RUN.out.summary.map { it[1] } + RENAME_SUMMARY_CHIPBINNER ( + CHIPBINNER_RUN.out.summary.map { group, summary -> + def prefix = "chipbinner_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_chipbinner_summary = RENAME_SUMMARY_CHIPBINNER.out.summary } if (params.run_span_diff && !params.differential_publish_manifest_only) { @@ -529,10 +559,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(SPAN_DIFF_RUN.out.versions) - ch_span_pooling_files = SPAN_DIFF_RUN.out.pooling.map { it[1] } - ch_span_pooling_files.into { ch_span_pooling_list_in; ch_span_pooling_count_in } - ch_span_pooling_list = ch_span_pooling_list_in.collect() - ch_span_pooling_flag = ch_span_pooling_count_in.count().map { it > 0 } + ch_span_pooling_files = SPAN_DIFF_RUN.out.pooling + ch_span_pooling_list = ch_span_pooling_files.collect() + ch_span_pooling_flag = ch_span_pooling_list.map { it instanceof List && it.size() > 0 } SPAN_POOLING_MANIFEST ( ch_span_pooling_list, @@ -543,15 +572,19 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (annotation_enabled) { ch_span_annot = SPAN_DIFF_RUN.out.differential .combine(ch_gene_bed_single) - .map { tuple, gene_bed -> - def group = tuple[0] - def regions = tuple[1] + .map { group, regions, gene_bed -> [ 'span', group, 'NA', regions, gene_bed, "03_span/${group}/span.differential.annotated.tsv" ] } ch_annotation_requests = ch_annotation_requests.mix(ch_span_annot) } - ch_span_summary = SPAN_DIFF_RUN.out.summary.map { it[1] } + RENAME_SUMMARY_SPAN ( + SPAN_DIFF_RUN.out.summary.map { group, summary -> + def prefix = "span_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_span_summary = RENAME_SUMMARY_SPAN.out.summary } if (annotation_enabled) { @@ -564,7 +597,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_summary_files = ch_diffbind_summary.mix(ch_chipbinner_summary).mix(ch_span_summary) DIFFERENTIAL_SUMMARY ( DIFFERENTIAL_DESIGN.out.design, - ch_summary_files.collect().map { it + summary_stub }.ifEmpty([summary_stub]), + ch_summary_files.collect().map { it + [summary_stub] }.ifEmpty([summary_stub]), params.differential_publish_manifest_only, file("$projectDir/assets/multiqc/differential_summary_header.txt"), file("$projectDir/assets/multiqc/differential_design_header.txt"), diff --git a/tests/assets/span_mode/control.bam b/tests/assets/span_mode/control.bam new file mode 100644 index 0000000000000000000000000000000000000000..8a0b4440d78b36613cafb65c994e18fa898cf60a GIT binary patch literal 188 zcmb2|=3rp}f&Xj_PR>jW!3@PkUs9)}CnOYnD0s;8d9%?K;|E0?d{213ZT32+Vf5t7 z$IV721_lq3QW6ppQWFw9fPyI%2hSW}V92oi>u&)xN*>KjZ-&)r4GqFPJUljL^PGBs zv>OorU)a#t*tk%`)KJxK=MoL|dlMWRXEr(th+j~UWMJS|3p-hl#VlzCW}qkp005R> BH<A^kigYU|?VZVoxCk1`wNpAp}e^Ff0JmUXvj5=;9EyP^CZ$RWC#w01yoXNdN!< literal 0 HcmV?d00001 diff --git a/tests/assets/span_mode/peaks.tsv b/tests/assets/span_mode/peaks.tsv index 47b785f4..794bb50d 100644 --- a/tests/assets/span_mode/peaks.tsv +++ b/tests/assets/span_mode/peaks.tsv @@ -1 +1,3 @@ sample_id group condition caller peaks_path peaks_format replicate +s1 h3k27me3 Treatment span_default tests/assets/span_mode/s1.peaks.bed bed 1 +s2 h3k27me3 Control span_default tests/assets/span_mode/s2.peaks.bed bed 1 diff --git a/tests/assets/span_mode/s1.peaks.bed b/tests/assets/span_mode/s1.peaks.bed new file mode 100644 index 00000000..8948009c --- /dev/null +++ b/tests/assets/span_mode/s1.peaks.bed @@ -0,0 +1,2 @@ +chr1 1 100 +chr1 200 300 diff --git a/tests/assets/span_mode/s2.peaks.bed b/tests/assets/span_mode/s2.peaks.bed new file mode 100644 index 00000000..9b82b9c7 --- /dev/null +++ b/tests/assets/span_mode/s2.peaks.bed @@ -0,0 +1,2 @@ +chr1 50 150 +chr1 250 350 diff --git a/tests/assets/span_mode/samples.tsv b/tests/assets/span_mode/samples.tsv index 295844d6..0c3fd595 100644 --- a/tests/assets/span_mode/samples.tsv +++ b/tests/assets/span_mode/samples.tsv @@ -1 +1,3 @@ sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +s1 h3k27me3 Treatment 1 tests/assets/span_mode/treatment.bam tests/assets/span_mode/treatment.bam.bai NA NA NA NA +s2 h3k27me3 Control 1 tests/assets/span_mode/control.bam tests/assets/span_mode/control.bam.bai NA NA NA NA diff --git a/tests/assets/span_mode/treatment.bam b/tests/assets/span_mode/treatment.bam new file mode 100644 index 0000000000000000000000000000000000000000..a3d473b4e4de61b3a70c1dedc294748f3a8a4bee GIT binary patch literal 188 zcmb2|=3rp}f&Xj_PR>jW!3@PkUs9)}CnOYnD0s;8d9%?K;|E0?d{213ZT32+Vf5t7 z$IV721_lq3QW6ppQWFw9fPyI%2hSW}V92oi>u&)xN*>KjZ-&*04Gem0Y;1AT{)%ou zTA7Wl?SH@lAPAT;YlcvK(3DvbLir~a95{18p@TbP0uO`XA^kigYU|?VZVoxCk1`wNpAp}e^Ff0JmUXvj5=;9EyP^CZ$RWC#w01yoXNdN!< literal 0 HcmV?d00001 diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml index 8eb0e620..e80d2c92 100644 --- a/tests/test_07_differential.yml +++ b/tests/test_07_differential.yml @@ -11,9 +11,10 @@ - test_differential - test_span_diff_missing_jar_integrated exit_code: 1 - stderr: + stdout: contains: - "omnipeaks_jar" + - "SPAN differential requested" - name: test_differential_manifests_only command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_publish_manifest_only --input tests/assets/samplesheet_condition.csv -c tests/config/nextflow.config @@ -69,7 +70,7 @@ - "SKIP" - name: test_differential_summary_preserves_skip - command: bash -c "mkdir -p tests/output/summary_override && cat > tests/output/summary_override/design.tsv <<'EOF'\ngroup\tcaller\ttreated_condition\tcontrol_condition\tstatus\treason\nh3k27me3\tNA\tTreatment\tControl\tSKIP\tmanual_skip\nEOF\ncat > tests/output/summary_override/chipbinner.summary.tsv <<'EOF'\ngroup\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status\nh3k27me3\tNA\tTreatment\tControl\t10\t1\t1\t0\t2\t100\t100\tRUN\tok\tNOT_RUN\nEOF\npython3 bin/differential_summary.py --design tests/output/summary_override/design.tsv --summaries tests/output/summary_override/chipbinner.summary.tsv --out tests/output/summary_override/summary.tsv" + command: bash -c "mkdir -p tests/output/summary_override && printf '%b' 'group\tcaller\ttreated_condition\tcontrol_condition\tstatus\treason\nh3k27me3\tNA\tTreatment\tControl\tSKIP\tmanual_skip\n' > tests/output/summary_override/design.tsv && printf '%b' 'group\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status\nh3k27me3\tNA\tTreatment\tControl\t10\t1\t1\t0\t2\t100\t100\tRUN\tok\tNOT_RUN\n' > tests/output/summary_override/chipbinner.summary.tsv && python3 bin/differential_summary.py --design tests/output/summary_override/design.tsv --summaries tests/output/summary_override/chipbinner.summary.tsv --out tests/output/summary_override/summary.tsv" tags: - test_differential - test_differential_summary @@ -106,14 +107,14 @@ - path: results_diffbind_on/03_peak_calling/08_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.csv - name: test_span_mode_native_requires_compare - command: bash -c "mkdir -p tests/output/span_native && cd tests/output/span_native && python3 ../../bin/span_diff.py --jar ../../tests/assets/omnipeaks_stub.jar --mode native --contrast Treatment,Control --group h3k27me3 --samples ../../tests/assets/span_mode/samples.tsv --peaks ../../tests/assets/span_mode/peaks.tsv --caller-priority span_default --outdir ." + command: bash -c "mkdir -p tests/output/span_native && python3 bin/span_diff.py --jar tests/assets/omnipeaks_stub.jar --mode native --contrast Treatment,Control --group h3k27me3 --samples tests/assets/span_mode/samples.tsv --peaks tests/assets/span_mode/peaks.tsv --caller-priority span_default --outdir tests/output/span_native" tags: - test_differential - test_span_mode exit_code: 1 - name: test_span_mode_auto_fallback - command: bash -c "mkdir -p tests/output/span_auto && cd tests/output/span_auto && python3 ../../bin/span_diff.py --jar ../../tests/assets/omnipeaks_stub.jar --mode auto --contrast Treatment,Control --group h3k27me3 --samples ../../tests/assets/span_mode/samples.tsv --peaks ../../tests/assets/span_mode/peaks.tsv --caller-priority span_default --allow-partial true --outdir ." + command: bash -c "mkdir -p tests/output/span_auto && python3 bin/span_diff.py --jar tests/assets/omnipeaks_stub.jar --mode auto --contrast Treatment,Control --group h3k27me3 --samples tests/assets/span_mode/samples.tsv --peaks tests/assets/span_mode/peaks.tsv --caller-priority span_default --allow-partial true --outdir tests/output/span_auto" tags: - test_differential - test_span_mode @@ -145,7 +146,11 @@ - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.normalization_factors.tsv - name: test_chipbinner_use_input_missing_fail - command: bash -c "mkdir -p tests/output/chipbinner_missing && cat > tests/output/chipbinner_missing/samples.tsv <<'EOF'\nsample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\nEOF\ncat > tests/output/chipbinner_missing/chrom.sizes <<'EOF'\nchr1\t1000\nEOF\npython3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing/chrom.sizes --bin-size 200 --use-input --outdir tests/output/chipbinner_missing/out" + command: > + bash -c "mkdir -p tests/output/chipbinner_missing && + printf 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n' > tests/output/chipbinner_missing/samples.tsv && + printf 'chr1\t1000\n' > tests/output/chipbinner_missing/chrom.sizes && + python3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing/chrom.sizes --bin-size 200 --use-input --outdir tests/output/chipbinner_missing/out" tags: - test_differential - test_chipbinner_use_input @@ -155,7 +160,11 @@ - "input BAMs are missing" - name: test_chipbinner_use_input_missing_allow_partial - command: bash -c "mkdir -p tests/output/chipbinner_missing_partial && cat > tests/output/chipbinner_missing_partial/samples.tsv <<'EOF'\nsample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\nEOF\ncat > tests/output/chipbinner_missing_partial/chrom.sizes <<'EOF'\nchr1\t1000\nEOF\npython3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing_partial/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing_partial/chrom.sizes --bin-size 200 --use-input --allow-partial --outdir tests/output/chipbinner_missing_partial/out" + command: > + bash -c "mkdir -p tests/output/chipbinner_missing_partial && + printf 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n' > tests/output/chipbinner_missing_partial/samples.tsv && + printf 'chr1\t1000\n' > tests/output/chipbinner_missing_partial/chrom.sizes && + python3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing_partial/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing_partial/chrom.sizes --bin-size 200 --use-input --allow-partial --outdir tests/output/chipbinner_missing_partial/out" tags: - test_differential - test_chipbinner_use_input From a64a69c90fa350480e7d001c63efe255737107e0 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 12 Jan 2026 03:10:40 -0800 Subject: [PATCH 51/52] Record pytest suite history for differential_peak_calling_2 --- docs/test_history/differential_peak_calling_2.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 docs/test_history/differential_peak_calling_2.md diff --git a/docs/test_history/differential_peak_calling_2.md b/docs/test_history/differential_peak_calling_2.md new file mode 100644 index 00000000..ac09cf47 --- /dev/null +++ b/docs/test_history/differential_peak_calling_2.md @@ -0,0 +1,13 @@ +# differential_peak_calling_2 test history + +## 2026-01-11 — pytest-suite (Slurm) +- JobID: 13242556 +- Run dir: /scratch/users/dhusmann/runs/pytest-suite/20260111_125922 +- Failure: tests/test_07_differential.yml::test_chipbinner_use_input_missing_fail expected stderr "input BAMs are missing"; heredoc parsing broke in the workflow (log.err showed `cat: unrecognized option '--samples'`). +- Fix: switched heredocs to `printf` in chipbinner_use_input_missing_{fail,allow_partial} tests to avoid heredoc parsing issues; commit b3341a6. +- Debug rerun (tag test_chipbinner_use_input): JobID 13250774, run dir /scratch/users/dhusmann/runs/pytest-suite/20260111_183621 (COMPLETED 0:0). + +## 2026-01-12 — pytest-suite (Slurm) +- JobID: 13250786 +- Run dir: /scratch/users/dhusmann/runs/pytest-suite/20260111_183752 +- Result: COMPLETED 0:0 From 2b091525fb5972a77cf1699c902bbc720dbcfce3 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Wed, 14 Jan 2026 02:01:18 -0800 Subject: [PATCH 52/52] Guard spike-in broadcast for scalar values --- subworkflows/local/differential_peak_calling.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index a9795141..06408c4f 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -341,11 +341,12 @@ workflow DIFFERENTIAL_PEAK_CALLING { } } def use_spikein_consumers = [params.run_diffbind, params.run_chipbinner, params.run_span_diff].count { it } - if (!(ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel)) { - ch_use_spikein = Channel.value(ch_use_spikein) - } if (!params.differential_publish_manifest_only && use_spikein_consumers > 1) { - ch_use_spikein = ch_use_spikein.broadcast() + ch_use_spikein = (ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel) + ? ch_use_spikein.broadcast() + : Channel.value(ch_use_spikein).broadcast() + } else if (!(ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel)) { + ch_use_spikein = Channel.value(ch_use_spikein) } if (params.run_diffbind && !params.differential_publish_manifest_only) {