diff --git a/assets/multiqc/differential_skipped_header.txt b/assets/multiqc/differential_skipped_header.txt new file mode 100644 index 00000000..7f2c3224 --- /dev/null +++ b/assets/multiqc/differential_skipped_header.txt @@ -0,0 +1,11 @@ +#id: 'differential_skipped' +#parent_id: 'peak_qc' +#parent_name: 'Peak QC' +#parent_description: 'Differential peak calling skipped comparisons' +#section_name: 'Differential Peak Calling Skipped' +#description: 'Comparisons skipped due to missing conditions or replicate constraints' +#plot_type: 'table' +#anchor: 'differential_skipped' +#pconfig: +# id: 'differential_skipped_table' +# title: 'Differential Peak Calling Skipped' diff --git a/assets/multiqc/differential_summary_header.txt b/assets/multiqc/differential_summary_header.txt new file mode 100644 index 00000000..b3f53395 --- /dev/null +++ b/assets/multiqc/differential_summary_header.txt @@ -0,0 +1,11 @@ +#id: 'differential_summary' +#parent_id: 'peak_qc' +#parent_name: 'Peak QC' +#parent_description: 'Differential peak calling summary' +#section_name: 'Differential Peak Calling Summary' +#description: 'Counts of significant differential regions per method, group, and caller' +#plot_type: 'table' +#anchor: 'differential_summary' +#pconfig: +# id: 'differential_summary_table' +# title: 'Differential Peak Calling Summary' diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 240c0d88..08dccd35 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -68,6 +68,8 @@ custom_content: - consensus_peak_counts - primary_frip_score - peak_reprod_perc + - differential_summary + - differential_skipped - software-versions-by-process - software-versions-unique diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py new file mode 100755 index 00000000..672b1e35 --- /dev/null +++ b/bin/annotate_regions.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import subprocess +import sys +from pathlib import Path + + +def detect_cols(header): + header_lower = [h.lower() for h in header] + def find(names): + for name in names: + if name in header_lower: + return header[header_lower.index(name)] + return None + chr_col = find(["chr", "chrom", "seqnames", "seqname"]) + start_col = find(["start"]) + end_col = find(["end"]) + return chr_col, start_col, end_col + + +def parse_gtf_attributes(attr_str): + attrs = {} + for item in attr_str.strip().split(";"): + item = item.strip() + if not item: + continue + if " " not in item: + continue + key, value = item.split(" ", 1) + value = value.strip().strip('"') + attrs[key] = value + return attrs + + +def gtf_to_tss_bed(gtf_path, out_path): + with gtf_path.open() as in_handle, out_path.open("w") as out_handle: + for line in in_handle: + if not line.strip() or line.startswith("#"): + continue + fields = line.rstrip().split("\t") + if len(fields) < 9: + continue + chrom, _, _, start, end, _, strand, _, attrs = fields + try: + start_i = int(start) - 1 + end_i = int(end) + except ValueError: + continue + if start_i < 0: + start_i = 0 + if strand == "+": + tss_start = start_i + elif strand == "-": + tss_start = max(end_i - 1, 0) + else: + continue + tss_end = tss_start + 1 + attr_map = parse_gtf_attributes(attrs) + name = ( + attr_map.get("gene_name") + or attr_map.get("gene_id") + or attr_map.get("transcript_id") + or attr_map.get("ID") + or "NA" + ) + out_handle.write("\t".join([chrom, str(tss_start), str(tss_end), name]) + "\n") + + +def run(cmd, stdout=None): + result = subprocess.run(cmd, stdout=stdout, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{result.stderr}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--features", default=None) + parser.add_argument("--gtf", default=None) + args = parser.parse_args() + + input_path = Path(args.input) + output_path = Path(args.output) + + with input_path.open() as handle: + reader = csv.reader(handle, delimiter='\t') + header = next(reader) + + chr_col, start_col, end_col = detect_cols(header) + if not chr_col or not start_col or not end_col: + raise SystemExit("Could not detect chr/start/end columns for annotation") + + features_bed = None + if args.features and Path(args.features).exists(): + features_bed = Path(args.features) + elif args.gtf and Path(args.gtf).exists(): + tss_bed = Path("features.tss.bed") + gtf_to_tss_bed(Path(args.gtf), tss_bed) + features_bed = tss_bed + + if not features_bed or not features_bed.exists(): + # no annotation possible; copy input to output + output_path.write_text(input_path.read_text()) + return + + features_has_name = False + feature_name_idx = None + with features_bed.open() as handle: + for line in handle: + if not line.strip() or line.startswith("#"): + continue + features_cols = len(line.rstrip().split("\t")) + features_has_name = features_cols >= 4 + if features_has_name: + feature_name_idx = 4 + 3 + break + + regions_bed = Path("regions.bed") + with input_path.open() as handle, regions_bed.open("w") as out_handle: + reader = csv.DictReader(handle, delimiter='\t') + for idx, row in enumerate(reader): + out_handle.write("\t".join([ + row[chr_col], + str(row[start_col]), + str(row[end_col]), + str(idx) + ]) + "\n") + + closest_out = Path("closest.tsv") + run(["bedtools", "closest", "-d", "-a", str(regions_bed), "-b", str(features_bed)], stdout=closest_out.open("w")) + + annotations = {} + with closest_out.open() as handle: + for line in handle: + if not line.strip(): + continue + fields = line.rstrip().split("\t") + if len(fields) < 5: + continue + row_id = int(fields[3]) + feature_id = "NA" + if feature_name_idx is not None and len(fields) > feature_name_idx: + feature_id = fields[feature_name_idx] + distance = fields[-1] + annotations[row_id] = (feature_id, distance) + + with input_path.open() as handle, output_path.open("w") as out_handle: + reader = csv.DictReader(handle, delimiter='\t') + fieldnames = reader.fieldnames + ["nearest_feature_id", "distance_to_feature"] + writer = csv.DictWriter(out_handle, delimiter='\t', fieldnames=fieldnames, lineterminator='\n') + writer.writeheader() + for idx, row in enumerate(reader): + feature_id, distance = annotations.get(idx, ("NA", "NA")) + row["nearest_feature_id"] = feature_id + row["distance_to_feature"] = distance + writer.writerow(row) + + +if __name__ == "__main__": + main() diff --git a/bin/chipbinner_lola.R b/bin/chipbinner_lola.R new file mode 100644 index 00000000..931bbce4 --- /dev/null +++ b/bin/chipbinner_lola.R @@ -0,0 +1,68 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) + library(LOLA) + library(GenomicRanges) + library(rtracklayer) + library(ggplot2) +}) + +option_list <- list( + make_option(c("--bed"), type = "character"), + make_option(c("--universe"), type = "character"), + make_option(c("--db"), type = "character"), + make_option(c("--label"), type = "character"), + make_option(c("--out_tsv"), type = "character"), + make_option(c("--out_pdf"), type = "character", default = "") +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +if (!dir.exists(opt$db)) { + stop(paste("LOLA database not found:", opt$db)) +} + +bed_lines <- readLines(opt$bed, warn = FALSE) +bed_lines <- bed_lines[!grepl("^#", bed_lines)] +if (length(bed_lines) == 0) { + write.table(data.frame(), opt$out_tsv, sep = "\t", quote = FALSE, row.names = FALSE) + if (opt$out_pdf != "") { + pdf(opt$out_pdf) + plot.new() + text(0.5, 0.5, "No regions available for enrichment") + dev.off() + } + quit(status = 0) +} + +user_set <- rtracklayer::import(opt$bed, format = "bed") +universe <- rtracklayer::import(opt$universe, format = "bed") +region_db <- LOLA::loadRegionDB(opt$db) + +user_sets <- GRangesList() +user_sets[[opt$label]] <- user_set + +res <- LOLA::runLOLA(user_sets, universe, region_db) +if (!is.null(res) && nrow(res) > 0) { + res <- res[res$userSet == opt$label, , drop = FALSE] +} + +write.table(res, opt$out_tsv, sep = "\t", quote = FALSE, row.names = FALSE) + +if (opt$out_pdf != "" && !is.null(res) && nrow(res) > 0) { + top <- res[order(res$pValue), , drop = FALSE] + if (nrow(top) > 20) { + top <- top[1:20, , drop = FALSE] + } + top$neglog10p <- -log10(top$pValue) + top$label <- if ("description" %in% colnames(top)) top$description else top$filename + pdf(opt$out_pdf) + ggplot(top, aes(x = reorder(label, neglog10p), y = neglog10p)) + + geom_col(fill = "#4C72B0") + + coord_flip() + + xlab("Region set") + + ylab("-log10(p-value)") + + ggtitle(paste("LOLA enrichment:", opt$label)) + dev.off() +} diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R new file mode 100755 index 00000000..be96b791 --- /dev/null +++ b/bin/chipbinner_rots.R @@ -0,0 +1,335 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) + library(jsonlite) +}) + +option_list <- list( + make_option(c("--matrix"), type = "character"), + make_option(c("--clusters"), type = "character"), + make_option(c("--clusters_2"), type = "character", default = ""), + make_option(c("--clusters_3"), type = "character", default = ""), + make_option(c("--grid_summary"), type = "character"), + make_option(c("--norm_info"), type = "character"), + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--lfc"), type = "double", default = 1.0), + make_option(c("--bootstrap"), type = "integer", default = 1000), + make_option(c("--k_value"), type = "integer", default = 100000), + make_option(c("--lola_run"), type = "character", default = "false") +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +matrix_df <- read.table(opt$matrix, header = TRUE, sep = "\t", check.names = FALSE) +samples_df <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) + +sample_ids <- samples_df$sample_id +conditions <- samples_df$condition +treated_idx <- conditions == opt$treated +control_idx <- conditions == opt$control + +if (!all(sample_ids %in% colnames(matrix_df))) { + stop("Matrix columns do not include all sample IDs") +} + +counts <- as.matrix(matrix_df[, sample_ids, drop = FALSE]) +rownames(counts) <- paste(matrix_df$chr, matrix_df$start, matrix_df$end, sep = ":") + +# Differential testing +log2fc <- log2(rowMeans(counts[, treated_idx, drop = FALSE]) / + rowMeans(counts[, control_idx, drop = FALSE])) + +use_rots <- TRUE +tryCatch({ + suppressPackageStartupMessages(library(ROTS)) +}, error = function(e) { + use_rots <<- FALSE +}) + +if (use_rots && (sum(treated_idx) < 2 || sum(control_idx) < 2)) { + warning("ROTS requires >=2 samples per condition; falling back to simple tests") + use_rots <- FALSE +} + +if (use_rots) { + groups <- ifelse(conditions == opt$treated, 1, 0) + rots_res <- ROTS::ROTS(counts, groups = groups, B = opt$bootstrap, K = opt$k_value) + pvals <- rots_res$pvalue +} else { + if (sum(treated_idx) < 2 || sum(control_idx) < 2) { + warning("ROTS unavailable and <2 samples per condition; assigning p-values of 1") + pvals <- rep(1, nrow(counts)) + } else { + pvals <- apply(counts, 1, function(x) { + t.test(x[treated_idx], x[control_idx])$p.value + }) + } +} + +fdr <- p.adjust(pvals, method = "BH") + +results <- data.frame( + chr = matrix_df$chr, + start = matrix_df$start, + end = matrix_df$end, + log2FC = log2fc, + pvalue = pvals, + FDR = fdr +) + +read_clusters <- function(path) { + if (is.null(path) || path == "" || !file.exists(path)) { + return(NULL) + } + df <- tryCatch(read.table(path, header = TRUE, sep = "\t", check.names = FALSE), + error = function(e) NULL) + if (is.null(df) || nrow(df) == 0) { + return(NULL) + } + if (!all(c("chr", "start", "end", "cluster") %in% colnames(df))) { + return(NULL) + } + df +} + +cluster_ids_from_df <- function(clusters_df, results_df) { + if (is.null(clusters_df)) { + return(NULL) + } + clusters_key <- paste(clusters_df$chr, clusters_df$start, clusters_df$end, sep = ":") + cluster_ids <- clusters_df$cluster[match(paste(results_df$chr, results_df$start, results_df$end, sep = ":"), clusters_key)] + cluster_ids +} + +compute_cluster_labels <- function(cluster_ids, counts_mat, conds, treated, control, lfc) { + if (is.null(cluster_ids) || length(cluster_ids) == 0) { + return(NULL) + } + labels <- rep("stable", length(cluster_ids)) + if (all(is.na(cluster_ids))) { + labels[] <- "NA" + return(labels) + } + labels[is.na(cluster_ids)] <- "NA" + treated_idx <- conds == treated + control_idx <- conds == control + for (cid in unique(cluster_ids)) { + if (is.na(cid)) { + next + } + if (cid == -1) { + labels[cluster_ids == cid] <- "noise" + next + } + cluster_rows <- cluster_ids == cid + treated_mean <- mean(rowMeans(counts_mat[cluster_rows, treated_idx, drop = FALSE])) + control_mean <- mean(rowMeans(counts_mat[cluster_rows, control_idx, drop = FALSE])) + if (is.nan(treated_mean)) treated_mean <- 0 + if (is.nan(control_mean)) control_mean <- 0 + log2fc_cluster <- log2((treated_mean + 1e-6) / (control_mean + 1e-6)) + if (abs(log2fc_cluster) < lfc) { + label <- "stable" + } else if (log2fc_cluster > 0) { + label <- "treated_enriched" + } else { + label <- "control_enriched" + } + labels[cluster_ids == cid] <- label + } + labels +} + +write_cluster_beds <- function(results_df, labels, suffix, model, manifest_df) { + if (is.null(labels)) { + return(manifest_df) + } + suffix_tag <- if (suffix == "") "" else paste0(".", suffix) + out_files <- list( + control_enriched = paste0("chipbinner.control_enriched", suffix_tag, ".bed"), + treated_enriched = paste0("chipbinner.treated_enriched", suffix_tag, ".bed"), + stable = paste0("chipbinner.stable", suffix_tag, ".bed"), + noise = paste0("chipbinner.noise", suffix_tag, ".bed") + ) + write.table(results_df[labels == "control_enriched", c("chr", "start", "end")], + out_files$control_enriched, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "treated_enriched", c("chr", "start", "end")], + out_files$treated_enriched, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "stable", c("chr", "start", "end")], + out_files$stable, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "noise", c("chr", "start", "end")], + out_files$noise, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + for (label in names(out_files)) { + bed_file <- out_files[[label]] + if (file.exists(bed_file)) { + manifest_df <- rbind( + manifest_df, + data.frame( + model = model, + label = label, + bed = bed_file, + stringsAsFactors = FALSE + ) + ) + } + } + manifest_df +} + +clusters_best <- read_clusters(opt$clusters) +clusters_2 <- read_clusters(opt$clusters_2) +clusters_3 <- read_clusters(opt$clusters_3) + +cluster_ids_best <- cluster_ids_from_df(clusters_best, results) +cluster_labels_best <- compute_cluster_labels(cluster_ids_best, counts, conditions, opt$treated, opt$control, opt$lfc) +if (is.null(cluster_ids_best)) { + cluster_ids_best <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_best)) { + cluster_labels_best <- rep("NA", nrow(results)) +} + +cluster_ids_2_raw <- cluster_ids_from_df(clusters_2, results) +cluster_labels_2_raw <- compute_cluster_labels(cluster_ids_2_raw, counts, conditions, opt$treated, opt$control, opt$lfc) +cluster_ids_2 <- cluster_ids_2_raw +cluster_labels_2 <- cluster_labels_2_raw +if (is.null(cluster_ids_2)) { + cluster_ids_2 <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_2)) { + cluster_labels_2 <- rep("NA", nrow(results)) +} + +cluster_ids_3_raw <- cluster_ids_from_df(clusters_3, results) +cluster_labels_3_raw <- compute_cluster_labels(cluster_ids_3_raw, counts, conditions, opt$treated, opt$control, opt$lfc) +cluster_ids_3 <- cluster_ids_3_raw +cluster_labels_3 <- cluster_labels_3_raw +if (is.null(cluster_ids_3)) { + cluster_ids_3 <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_3)) { + cluster_labels_3 <- rep("NA", nrow(results)) +} + +results$cluster_id <- cluster_ids_best +results$cluster_label <- cluster_labels_best +results$cluster_id_2clusters <- cluster_ids_2 +results$cluster_label_2clusters <- cluster_labels_2 +results$cluster_id_3clusters <- cluster_ids_3 +results$cluster_label_3clusters <- cluster_labels_3 + +write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) + +sig <- results[ + !is.na(results$FDR) & + !is.na(results$log2FC) & + results$FDR <= opt$fdr & + abs(results$log2FC) >= opt$lfc, + , + drop = FALSE +] +up <- sig[sig$log2FC > 0, , drop = FALSE] +down <- sig[sig$log2FC < 0, , drop = FALSE] + +write.table(up[, c("chr", "start", "end")], "chipbinner.significant_up.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(down[, c("chr", "start", "end")], "chipbinner.significant_down.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +bed_manifest <- data.frame( + model = character(), + label = character(), + bed = character(), + stringsAsFactors = FALSE +) +bed_manifest <- write_cluster_beds(results, cluster_labels_best, "", "best", bed_manifest) + +if (!is.null(cluster_labels_2_raw)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_2_raw, "2clusters", "2clusters", bed_manifest) +} + +if (!is.null(cluster_labels_3_raw)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_3_raw, "3clusters", "3clusters", bed_manifest) +} + +write.table(bed_manifest, "chipbinner.cluster_beds.tsv", sep = "\t", quote = FALSE, row.names = FALSE) + +selected_row <- NULL +if (!is.null(opt$grid_summary) && file.exists(opt$grid_summary)) { + grid <- read.table(opt$grid_summary, header = TRUE, sep = "\t", check.names = FALSE) + if ("selected" %in% colnames(grid)) { + selected_row <- grid[grid$selected == TRUE | grid$selected == "True" | grid$selected == "true", , drop = FALSE] + if (nrow(selected_row) > 0) { + selected_row <- selected_row[1, , drop = FALSE] + } + } +} + +norm_info <- list(use_input = FALSE, use_ms_scaling = FALSE) +if (!is.null(opt$norm_info) && file.exists(opt$norm_info)) { + try({ + norm_info <- jsonlite::fromJSON(opt$norm_info) + }, silent = TRUE) +} + +use_spikein_flag <- ifelse(is.null(norm_info$use_spikein_scaling), "false", + ifelse(norm_info$use_spikein_scaling, "true", "false")) + +n_clusters <- length(unique(cluster_ids_best[!is.na(cluster_ids_best) & cluster_ids_best != -1])) +min_cluster_size <- if (!is.null(selected_row) && "min_cluster_size" %in% colnames(selected_row)) selected_row$min_cluster_size else NA +min_samples <- if (!is.null(selected_row) && "min_samples" %in% colnames(selected_row)) selected_row$min_samples else NA + +summary <- data.frame( + method = "chipbinner", + group = opt$group, + caller = "NA", + treated_condition = opt$treated, + control_condition = opt$control, + n_bins_tested = nrow(results), + n_tested = nrow(results), + n_fdr_pass = nrow(sig), + n_up = nrow(up), + n_down = nrow(down), + n_clusters = n_clusters, + min_cluster_size = min_cluster_size, + min_samples = min_samples, + input_normalization_used = ifelse(is.null(norm_info$use_input), FALSE, norm_info$use_input), + ms_scaling_used = ifelse(is.null(norm_info$use_ms_scaling), FALSE, norm_info$use_ms_scaling), + use_spikein = use_spikein_flag, + span_mode_used = "NA", + lola_run = tolower(opt$lola_run) %in% c("true", "1", "yes") +) +write.table(summary, "chipbinner.summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE) + +# Plots +# counts already include pseudocount from normalization +counts_log <- log2(counts) +try({ + pca <- prcomp(t(counts_log), scale. = TRUE) + pdf("plots/PCA.pdf") + plot(pca$x[,1], pca$x[,2], col = as.factor(conditions), pch = 19, + xlab = "PC1", ylab = "PC2", main = "PCA") + legend("topright", legend = levels(as.factor(conditions)), col = 1:length(unique(conditions)), pch = 19) + dev.off() +}, silent = TRUE) + +try({ + pdf("plots/correlation_heatmap.pdf") + heatmap(cor(counts_log), main = "Correlation") + dev.off() +}, silent = TRUE) + +try({ + treated_means <- rowMeans(counts[, conditions == opt$treated, drop = FALSE]) + control_means <- rowMeans(counts[, conditions == opt$control, drop = FALSE]) + pdf("plots/density_scatter.pdf") + cluster_factor <- as.factor(cluster_labels_best) + cols <- as.numeric(cluster_factor) + plot(control_means, treated_means, pch = 16, cex = 0.5, col = cols, + xlab = "Control mean", ylab = "Treated mean", main = "Density scatter") + legend("topleft", legend = levels(cluster_factor), col = seq_along(levels(cluster_factor)), pch = 16, cex = 0.6) + dev.off() +}, silent = TRUE) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R new file mode 100755 index 00000000..f0080146 --- /dev/null +++ b/bin/diffbind_run.R @@ -0,0 +1,380 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list <- list( + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--caller"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--lfc"), type = "double", default = 1.0), + make_option(c("--min_overlap"), type = "integer", default = 2), + make_option(c("--summits"), type = "integer", default = 0), + make_option(c("--backend"), type = "character", default = "DESeq2"), + make_option(c("--use_spikein"), type = "character", default = "false"), + make_option(c("--summary_method"), type = "character", default = "diffbind"), + make_option(c("--span_mode_used"), type = "character", default = "NA"), + make_option(c("--prefix"), type = "character", default = "diffbind"), + make_option(c("--extra_params"), type = "character", default = NULL) +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +suppressPackageStartupMessages({ + library(DiffBind) +}) + +use_spikein <- tolower(opt$use_spikein) %in% c("true", "1", "yes") + +samplesheet <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) +required_cols <- c("SampleID", "Condition", "bamReads", "Peaks", "Replicate") +missing <- setdiff(required_cols, colnames(samplesheet)) +if (length(missing) > 0) { + stop("Missing required columns in samplesheet: ", paste(missing, collapse = ", ")) +} + +samplesheet <- samplesheet[samplesheet$Condition %in% c(opt$treated, opt$control), , drop = FALSE] +if (nrow(samplesheet) == 0) { + stop("No samples remain after filtering to contrast conditions") +} + +sample_ids <- samplesheet$SampleID +scale_factors <- NULL +if ("SpikeinScaleFactor" %in% colnames(samplesheet)) { + scale_factors <- suppressWarnings(as.numeric(samplesheet$SpikeinScaleFactor)) +} + +if (use_spikein && (is.null(scale_factors) || any(is.na(scale_factors)))) { + message("Spike-in scaling requested but scale factors missing; falling back to default normalization") + use_spikein <- FALSE +} + +extra <- list() +if (!is.null(opt$extra_params) && file.exists(opt$extra_params)) { + suppressPackageStartupMessages({ + library(jsonlite) + library(yaml) + }) + if (grepl("\\.ya?ml$", opt$extra_params, ignore.case = TRUE)) { + extra <- yaml::read_yaml(opt$extra_params) + } else { + extra <- jsonlite::fromJSON(opt$extra_params) + } +} + +backend <- toupper(opt$backend) +backend_method <- if (backend == "EDGER") DBA_EDGER else DBA_DESEQ2 + +summits_value <- if (opt$summits > 0) opt$summits else FALSE + +count_args <- list(summits = summits_value, minOverlap = opt$min_overlap) +if (!is.null(extra$dba_count) && is.list(extra$dba_count)) { + count_args <- modifyList(count_args, extra$dba_count) +} + +analyze_args <- list(method = backend_method) +if (!is.null(extra$dba_analyze) && is.list(extra$dba_analyze)) { + analyze_args <- modifyList(analyze_args, extra$dba_analyze) +} + +contrast_args <- list(categories = DBA_CONDITION, group1 = opt$treated, group2 = opt$control) +if (!is.null(extra$dba_contrast) && is.list(extra$dba_contrast)) { + contrast_args <- modifyList(contrast_args, extra$dba_contrast) +} + +extract_counts <- function(dba_obj, sample_ids) { + df <- NULL + counts <- NULL + peaks_df <- NULL + try({ + df <- dba.peakset(dba_obj, bRetrieve = TRUE, DataType = DBA_DATA_FRAME) + }, silent = TRUE) + if (!is.null(df)) { + count_cols <- match(sample_ids, colnames(df)) + if (all(!is.na(count_cols))) { + counts <- as.matrix(df[, count_cols, drop = FALSE]) + } + coord_cols <- list(chr = NULL, start = NULL, end = NULL) + for (col in colnames(df)) { + if (tolower(col) %in% c("chr", "chrom", "seqnames", "seqname")) coord_cols$chr <- col + if (tolower(col) %in% c("start")) coord_cols$start <- col + if (tolower(col) %in% c("end")) coord_cols$end <- col + } + if (!is.null(coord_cols$chr) && !is.null(coord_cols$start) && !is.null(coord_cols$end)) { + peaks_df <- df[, c(coord_cols$chr, coord_cols$start, coord_cols$end), drop = FALSE] + colnames(peaks_df) <- c("chr", "start", "end") + } + } + if (is.null(counts)) { + try({ + gr <- dba.peakset(dba_obj, bRetrieve = TRUE) + df_gr <- as.data.frame(gr) + count_cols <- match(sample_ids, colnames(df_gr)) + if (all(!is.na(count_cols))) { + counts <- as.matrix(df_gr[, count_cols, drop = FALSE]) + } + if (all(c("seqnames", "start", "end") %in% colnames(df_gr))) { + peaks_df <- df_gr[, c("seqnames", "start", "end")] + colnames(peaks_df) <- c("chr", "start", "end") + } + }, silent = TRUE) + } + list(counts = counts, peaks = peaks_df) +} + +run_deseq2 <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(DESeq2)) + cond <- factor(conditions, levels = c(control, treated)) + dds <- DESeqDataSetFromMatrix(countData = counts, colData = data.frame(condition = cond), design = ~condition) + if (!is.null(scale_factors)) { + sizeFactors(dds) <- 1 / scale_factors + } + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treated, control)) + data.frame(log2FC = res$log2FoldChange, pvalue = res$pvalue, FDR = res$padj) +} + +run_edger <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(edgeR)) + cond <- factor(conditions, levels = c(control, treated)) + y <- DGEList(counts = counts, group = cond) + if (!is.null(scale_factors)) { + y$samples$norm.factors <- 1 / scale_factors + } else { + y <- calcNormFactors(y) + } + design <- model.matrix(~cond) + y <- estimateDisp(y, design) + fit <- glmQLFit(y, design) + qlf <- glmQLFTest(fit, coef = 2) + data.frame(log2FC = qlf$table$logFC, pvalue = qlf$table$PValue, FDR = p.adjust(qlf$table$PValue, method = "BH")) +} + +write_plot_placeholder <- function(path, title) { + pdf(path) + plot.new() + title(main = title) + dev.off() +} + +samplesheet$Condition <- as.character(samplesheet$Condition) + +if (use_spikein) { + dba_obj <- dba(sampleSheet = samplesheet) + dba_obj <- do.call(dba.count, c(list(dba_obj), count_args)) + dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) + + extracted <- extract_counts(dba_obj, sample_ids) + if (is.null(extracted$counts) || is.null(extracted$peaks)) { + message("Unable to extract counts/peaks for spike-in normalization; falling back to DiffBind default") + use_spikein <- FALSE + } else { + counts <- extracted$counts + if (backend == "EDGER") { + res_stats <- run_edger(counts, samplesheet$Condition, scale_factors, opt$treated, opt$control) + } else { + res_stats <- run_deseq2(counts, samplesheet$Condition, scale_factors, opt$treated, opt$control) + } + res_df <- cbind(extracted$peaks, res_stats) + res_df$log2FC[is.na(res_df$log2FC)] <- 0 + res_df$FDR[is.na(res_df$FDR)] <- 1 + res_df$pvalue[is.na(res_df$pvalue)] <- 1 + + write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- res_df[res_df$FDR <= opt$fdr & abs(res_df$log2FC) >= opt$lfc, , drop = FALSE] + write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + up <- sig[sig$log2FC > 0, , drop = FALSE] + down <- sig[sig$log2FC < 0, , drop = FALSE] + write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + n_tested <- nrow(res_df) + n_fdr <- nrow(sig) + n_up <- nrow(up) + n_down <- nrow(down) + use_spikein_flag <- ifelse(use_spikein, "true", "false") + summary <- data.frame( + method = opt$summary_method, + group = opt$group, + caller = opt$caller, + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = n_tested, + n_fdr_pass = n_fdr, + n_up = n_up, + n_down = n_down, + use_spikein = use_spikein_flag, + span_mode_used = opt$span_mode_used + ) + write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + # plots + counts_log <- log2(counts + 1) + try({ + pca <- prcomp(t(counts_log), scale. = TRUE) + pdf("plots/PCA.pdf") + plot(pca$x[,1], pca$x[,2], col = as.factor(samplesheet$Condition), pch = 19, + xlab = "PC1", ylab = "PC2", main = "PCA") + legend("topright", legend = levels(as.factor(samplesheet$Condition)), col = 1:length(unique(samplesheet$Condition)), pch = 19) + dev.off() + }, silent = TRUE) + try({ + pdf("plots/correlation_heatmap.pdf") + heatmap(cor(counts_log), main = "Correlation") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/MA.pdf") + plot(rowMeans(counts_log), res_df$log2FC, pch = 16, cex = 0.5, main = "MA", xlab = "Mean", ylab = "log2FC") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/volcano.pdf") + plot(res_df$log2FC, -log10(res_df$FDR), pch = 16, cex = 0.5, main = "Volcano", xlab = "log2FC", ylab = "-log10(FDR)") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/heatmap.pdf") + heatmap(counts_log, Rowv = NA, Colv = NA, scale = "row", main = "Binding heatmap") + dev.off() + }, silent = TRUE) + } +} + +if (!use_spikein) { + dba_obj <- dba(sampleSheet = samplesheet) + dba_obj <- do.call(dba.count, c(list(dba_obj), count_args)) + dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) + dba_obj <- do.call(dba.analyze, c(list(dba_obj), analyze_args)) + + res <- tryCatch(dba.report(dba_obj, th = 1, fold = 0), error = function(e) NULL) + if (is.null(res)) { + res_df <- data.frame() + } else { + res_df <- as.data.frame(res) + } + if (nrow(res_df) == 0 && ncol(res_df) == 0) { + res_df <- data.frame( + chr = character(), + start = integer(), + end = integer(), + log2FC = numeric(), + FDR = numeric(), + stringsAsFactors = FALSE + ) + } + + coord_cols <- list(chr = NULL, start = NULL, end = NULL) + for (col in colnames(res_df)) { + if (tolower(col) %in% c("chr", "chrom", "seqnames", "seqname")) coord_cols$chr <- col + if (tolower(col) %in% c("start")) coord_cols$start <- col + if (tolower(col) %in% c("end")) coord_cols$end <- col + } + if (!is.null(coord_cols$chr)) { + res_df$chr <- res_df[[coord_cols$chr]] + } + if (!is.null(coord_cols$start)) { + res_df$start <- res_df[[coord_cols$start]] + } + if (!is.null(coord_cols$end)) { + res_df$end <- res_df[[coord_cols$end]] + } + if (!"chr" %in% colnames(res_df)) { + res_df$chr <- NA + } + if (!"start" %in% colnames(res_df)) { + res_df$start <- NA + } + if (!"end" %in% colnames(res_df)) { + res_df$end <- NA + } + + if (!"log2FC" %in% colnames(res_df)) { + if ("Fold" %in% colnames(res_df)) { + res_df$log2FC <- res_df$Fold + } else if ("fold" %in% colnames(res_df)) { + res_df$log2FC <- res_df$fold + } else { + res_df$log2FC <- NA_real_ + } + } + if (!"FDR" %in% colnames(res_df)) { + res_df$FDR <- NA_real_ + } + + write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- res_df[!is.na(res_df$FDR) & res_df$FDR <= opt$fdr & abs(res_df$log2FC) >= opt$lfc, , drop = FALSE] + write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + up <- sig[sig$log2FC > 0, , drop = FALSE] + down <- sig[sig$log2FC < 0, , drop = FALSE] + write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + n_tested <- nrow(res_df) + n_fdr <- nrow(sig) + n_up <- nrow(up) + n_down <- nrow(down) + use_spikein_flag <- ifelse(use_spikein, "true", "false") + summary <- data.frame( + method = opt$summary_method, + group = opt$group, + caller = opt$caller, + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = n_tested, + n_fdr_pass = n_fdr, + n_up = n_up, + n_down = n_down, + use_spikein = use_spikein_flag, + span_mode_used = opt$span_mode_used + ) + write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + # plots + try({ + pdf("plots/PCA.pdf") + dba.plotPCA(dba_obj, DBA_CONDITION, label = DBA_ID) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/correlation_heatmap.pdf") + dba.plotHeatmap(dba_obj, correlations = TRUE) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/MA.pdf") + dba.plotMA(dba_obj) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/volcano.pdf") + dba.plotVolcano(dba_obj) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/heatmap.pdf") + dba.plotHeatmap(dba_obj) + dev.off() + }, silent = TRUE) +} + +if (!file.exists(paste0(opt$prefix, ".results.tsv"))) { + stop("DiffBind failed to produce results") +} + +plots <- c("plots/PCA.pdf", "plots/correlation_heatmap.pdf", "plots/MA.pdf", "plots/volcano.pdf", "plots/heatmap.pdf") +for (plot_file in plots) { + if (!file.exists(plot_file)) { + write_plot_placeholder(plot_file, basename(plot_file)) + } +} diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py new file mode 100755 index 00000000..a304e1bd --- /dev/null +++ b/bin/hdbscan_grid.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +import argparse +import csv +import itertools +import sys + +import numpy as np + + +def parse_list(value): + return [int(x.strip()) for x in str(value).split(',') if x.strip()] + + +def load_matrix(path): + with open(path, newline='') as handle: + reader = csv.reader(handle, delimiter='\t') + header = next(reader, []) + rows = [row for row in reader if row] + if len(header) <= 3: + raise SystemExit("Matrix missing sample columns") + coords = [row[:3] for row in rows] + data = [] + for row in rows: + data.append([float(x) for x in row[3:]]) + matrix = np.array(data) if data else np.empty((0, len(header) - 3)) + return header[:3], header[3:], coords, matrix + + +def load_samplesheet(path, sample_ids): + if not path: + return [] + conditions = {} + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + for row in reader: + sample_id = row.get("sample_id") + condition = row.get("condition") + if not sample_id: + continue + conditions[sample_id] = condition or "" + missing = [sid for sid in sample_ids if sid not in conditions] + if missing: + raise SystemExit(f"Samplesheet missing condition for: {', '.join(missing)}") + return [conditions[sid] for sid in sample_ids] + + +def replicate_consistency(matrix, labels, conditions): + if matrix.size == 0 or not conditions: + return 0.0 + scores = [] + unique_conditions = sorted(set(conditions)) + for cluster_id in sorted(set(labels)): + if cluster_id == -1: + continue + cluster_rows = matrix[labels == cluster_id, :] + if cluster_rows.shape[0] < 2: + continue + for condition in unique_conditions: + idx = [i for i, cond in enumerate(conditions) if cond == condition] + if len(idx) < 2: + continue + data = cluster_rows[:, idx] + try: + corr = np.corrcoef(data, rowvar=False) + except Exception: + continue + if corr.size <= 1: + continue + tri = corr[np.triu_indices_from(corr, k=1)] + tri = tri[~np.isnan(tri)] + if tri.size: + scores.append(float(np.mean(tri))) + if not scores: + return 0.0 + return float(np.mean(scores)) + + +def write_summary(path, rows): + fieldnames = [ + "min_cluster_size", + "min_samples", + "n_clusters", + "frac_assigned_non_noise", + "cluster_stability_metric", + "replicate_consistency_metric", + "score_total", + "selected", + "selected_2clusters", + "selected_3clusters", + ] + with open(path, "w", newline="") as handle: + writer = csv.DictWriter(handle, delimiter="\t", fieldnames=fieldnames, lineterminator="\n") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def write_clusters(path, coord_header, coords, labels): + with open(path, "w", newline="") as handle: + writer = csv.writer(handle, delimiter="\t", lineterminator="\n") + writer.writerow(coord_header + ["cluster"]) + for row, label in zip(coords, labels): + writer.writerow(row + [label]) + + +def write_not_found(path, coord_header, message): + with open(path, "w", newline="") as handle: + handle.write(f"# {message}\n") + writer = csv.writer(handle, delimiter="\t", lineterminator="\n") + writer.writerow(coord_header + ["cluster"]) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--matrix", required=True) + parser.add_argument("--min_cluster_size", required=True) + parser.add_argument("--min_samples", required=True) + parser.add_argument("--summary", required=True) + parser.add_argument("--clusters_best", required=True) + parser.add_argument("--clusters_2", required=True) + parser.add_argument("--clusters_3", required=True) + parser.add_argument("--samplesheet", default="") + args = parser.parse_args() + + coord_header, sample_ids, coords, matrix = load_matrix(args.matrix) + if matrix.size: + matrix = np.log2(np.maximum(matrix, 1e-6)) + conditions = load_samplesheet(args.samplesheet, sample_ids) if args.samplesheet else [] + + try: + import hdbscan + except Exception: + # fallback: single cluster + summary_rows = [ + { + "min_cluster_size": 0, + "min_samples": 0, + "n_clusters": 1, + "frac_assigned_non_noise": 1.0, + "cluster_stability_metric": 1.0, + "replicate_consistency_metric": 0.0, + "score_total": 1.0, + "selected": True, + "selected_2clusters": False, + "selected_3clusters": False, + } + ] + write_summary(args.summary, summary_rows) + labels = [0 for _ in coords] + write_clusters(args.clusters_best, coord_header, coords, labels) + write_not_found(args.clusters_2, coord_header, "No 2-cluster solution found (HDBSCAN unavailable)") + write_not_found(args.clusters_3, coord_header, "No 3-cluster solution found (HDBSCAN unavailable)") + return + + min_cluster_sizes = parse_list(args.min_cluster_size) + min_samples_list = parse_list(args.min_samples) + + results = [] + labels_by_params = {} + + for mcs, ms in itertools.product(min_cluster_sizes, min_samples_list): + clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms) + labels = clusterer.fit_predict(matrix) if matrix.size else np.array([]) + n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + fraction_assigned = float(np.sum(labels >= 0)) / float(len(labels)) if len(labels) else 0.0 + cluster_stability = float(np.mean(clusterer.cluster_persistence_)) if getattr(clusterer, 'cluster_persistence_', None) is not None and len(clusterer.cluster_persistence_) > 0 else 0.0 + rep_consistency = replicate_consistency(matrix, labels, conditions) if len(labels) else 0.0 + score_total = float(fraction_assigned + cluster_stability + rep_consistency) + labels_by_params[(mcs, ms)] = labels + results.append({ + "min_cluster_size": mcs, + "min_samples": ms, + "n_clusters": n_clusters, + "frac_assigned_non_noise": round(fraction_assigned, 4), + "cluster_stability_metric": round(cluster_stability, 4), + "replicate_consistency_metric": round(rep_consistency, 4), + "score_total": round(score_total, 4), + "selected": False, + "selected_2clusters": False, + "selected_3clusters": False, + }) + + # Rank by overall score first, then cluster count, fraction assigned, and stability. + rank_key = lambda row: (row["score_total"], row["n_clusters"], row["frac_assigned_non_noise"], row["cluster_stability_metric"]) + best = max(results, key=rank_key) + best_2 = None + best_3 = None + candidates_2 = [row for row in results if row["n_clusters"] == 2] + candidates_3 = [row for row in results if row["n_clusters"] == 3] + if candidates_2: + best_2 = max(candidates_2, key=rank_key) + if candidates_3: + best_3 = max(candidates_3, key=rank_key) + + for row in results: + if row is best: + row["selected"] = True + if best_2 is not None and row is best_2: + row["selected_2clusters"] = True + if best_3 is not None and row is best_3: + row["selected_3clusters"] = True + + results_sorted = sorted(results, key=rank_key, reverse=True) + write_summary(args.summary, results_sorted) + + labels = list(labels_by_params.get((best["min_cluster_size"], best["min_samples"]), [])) if coords else [] + if coords: + write_clusters(args.clusters_best, coord_header, coords, labels) + else: + write_clusters(args.clusters_best, coord_header, coords, []) + + if best_2 is not None: + labels_2 = list(labels_by_params.get((best_2["min_cluster_size"], best_2["min_samples"]), [])) + write_clusters(args.clusters_2, coord_header, coords, labels_2) + else: + sys.stderr.write("No 2-cluster solution found for HDBSCAN grid search\n") + write_not_found(args.clusters_2, coord_header, "No 2-cluster solution found") + + if best_3 is not None: + labels_3 = list(labels_by_params.get((best_3["min_cluster_size"], best_3["min_samples"]), [])) + write_clusters(args.clusters_3, coord_header, coords, labels_3) + else: + sys.stderr.write("No 3-cluster solution found for HDBSCAN grid search\n") + write_not_found(args.clusters_3, coord_header, "No 3-cluster solution found") + + +if __name__ == "__main__": + main() diff --git a/bin/span_fallback_de.R b/bin/span_fallback_de.R new file mode 100755 index 00000000..09b437fb --- /dev/null +++ b/bin/span_fallback_de.R @@ -0,0 +1,193 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list <- list( + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--backend"), type = "character", default = "DESeq2"), + make_option(c("--use_spikein"), type = "character", default = "false"), + make_option(c("--prefix"), type = "character", default = "span_fallback") +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +suppressPackageStartupMessages({ + library(GenomicRanges) + library(GenomicAlignments) + library(Rsamtools) +}) + +use_spikein <- tolower(opt$use_spikein) %in% c("true", "1", "yes") + +samplesheet <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) +required_cols <- c("SampleID", "Condition", "bamReads", "Peaks") +missing <- setdiff(required_cols, colnames(samplesheet)) +if (length(missing) > 0) { + stop("Missing required columns in samplesheet: ", paste(missing, collapse = ", ")) +} + +samplesheet <- samplesheet[samplesheet$Condition %in% c(opt$treated, opt$control), , drop = FALSE] +if (nrow(samplesheet) == 0) { + stop("No samples remain after filtering to contrast conditions") +} + +read_peaks <- function(path) { + df <- tryCatch({ + read.table(path, sep = "\t", header = FALSE, comment.char = "", quote = "", stringsAsFactors = FALSE) + }, error = function(e) { + stop("Failed to read peaks file: ", path) + }) + if (ncol(df) < 3) { + stop("Peaks file has fewer than 3 columns: ", path) + } + df <- df[, 1:3] + suppressWarnings({ + df <- df[!is.na(as.numeric(df[, 2])) & !is.na(as.numeric(df[, 3])), , drop = FALSE] + }) + colnames(df) <- c("chr", "start", "end") + df$start <- as.integer(df$start) + df$end <- as.integer(df$end) + df +} + +if (any(is.na(samplesheet$Peaks)) || any(samplesheet$Peaks == "")) { + stop("Missing SPAN peak files in samplesheet for fallback differential.") +} + +peak_list <- lapply(samplesheet$Peaks, read_peaks) +peaks_list_path <- paste0(opt$prefix, ".peaks.list") +writeLines(paste(samplesheet$Peaks, collapse = ";"), peaks_list_path) +gr_list <- lapply(peak_list, function(df) { + GRanges(seqnames = df$chr, ranges = IRanges(start = df$start + 1L, end = df$end)) +}) + +union_gr <- reduce(do.call(c, gr_list)) + +union_df <- data.frame( + chr = as.character(seqnames(union_gr)), + start = start(union_gr) - 1L, + end = end(union_gr), + stringsAsFactors = FALSE +) + +union_bed <- paste0(opt$prefix, ".union.bed") +write.table(union_df, union_bed, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +bam_files <- samplesheet$bamReads +sample_ids <- samplesheet$SampleID +names(bam_files) <- sample_ids + +detect_paired <- function(bam) { + bf <- BamFile(bam, yieldSize = 1000) + open(bf) + flags <- tryCatch({ + scanBam(bf, param = ScanBamParam(what = "flag"))[[1]]$flag + }, error = function(e) { + integer() + }) + close(bf) + if (length(flags) == 0) { + return(FALSE) + } + any(bitwAnd(flags, 1L) != 0) +} + +paired_flags <- vapply(bam_files, detect_paired, logical(1)) +single_end <- !any(paired_flags) + +bam_list <- BamFileList(bam_files, yieldSize = 1000000) +se <- summarizeOverlaps( + features = union_gr, + reads = bam_list, + mode = "Union", + singleEnd = single_end, + ignore.strand = TRUE +) + +counts <- assay(se) +colnames(counts) <- sample_ids + +counts_df <- cbind(union_df, counts) +counts_path <- paste0(opt$prefix, ".counts.tsv") +write.table(counts_df, counts_path, sep = "\t", quote = FALSE, row.names = FALSE) + +scale_factors <- NULL +if ("SpikeinScaleFactor" %in% colnames(samplesheet)) { + scale_factors <- suppressWarnings(as.numeric(samplesheet$SpikeinScaleFactor)) +} + +if (use_spikein && (is.null(scale_factors) || any(is.na(scale_factors)))) { + stop("Spike-in scaling requested but spike-in scale factors are missing from the samplesheet.") +} + +backend <- toupper(opt$backend) + +run_deseq2 <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(DESeq2)) + cond <- factor(conditions, levels = c(control, treated)) + dds <- DESeqDataSetFromMatrix(countData = counts, colData = data.frame(condition = cond), design = ~condition) + if (!is.null(scale_factors)) { + sizeFactors(dds) <- 1 / scale_factors + } + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treated, control)) + data.frame(log2FC = res$log2FoldChange, pvalue = res$pvalue, FDR = res$padj) +} + +run_edger <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(edgeR)) + cond <- factor(conditions, levels = c(control, treated)) + y <- DGEList(counts = counts, group = cond) + if (!is.null(scale_factors)) { + y$samples$norm.factors <- 1 / scale_factors + } else { + y <- calcNormFactors(y) + } + design <- model.matrix(~cond) + y <- estimateDisp(y, design) + fit <- glmQLFit(y, design) + qlf <- glmQLFTest(fit, coef = 2) + data.frame(log2FC = qlf$table$logFC, pvalue = qlf$table$PValue, FDR = p.adjust(qlf$table$PValue, method = "BH")) +} + +conditions <- samplesheet$Condition +if (backend == "EDGER") { + res_stats <- run_edger(counts, conditions, if (use_spikein) scale_factors else NULL, opt$treated, opt$control) +} else { + res_stats <- run_deseq2(counts, conditions, if (use_spikein) scale_factors else NULL, opt$treated, opt$control) +} + +res_stats$log2FC[is.na(res_stats$log2FC)] <- 0 +res_stats$pvalue[is.na(res_stats$pvalue)] <- 1 +res_stats$FDR[is.na(res_stats$FDR)] <- 1 + +res_df <- cbind(union_df, res_stats) +write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + +sig <- res_df[res_df$FDR <= opt$fdr, , drop = FALSE] +write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +up <- sig[sig$log2FC > 0, , drop = FALSE] +down <- sig[sig$log2FC < 0, , drop = FALSE] +write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +summary <- data.frame( + method = "span", + group = opt$group, + caller = "NA", + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = nrow(res_df), + n_fdr_pass = nrow(sig), + n_up = nrow(up), + n_down = nrow(down), + use_spikein = if (use_spikein) "true" else "false", + span_mode_used = "fallback" +) +write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/conf/flowswitch.config b/conf/flowswitch.config index a79202c8..07afeb04 100644 --- a/conf/flowswitch.config +++ b/conf/flowswitch.config @@ -135,3 +135,23 @@ if(params.only_peak_calling) { } if(params.skip_multiqc) { params.run_multiqc = false } + +if(params.differential_from_run) { + params.run_genome_prep = false + params.run_input_check = false + params.run_cat_fastq = false + params.run_trim_galore_fastqc = false + params.run_alignment = false + params.run_read_filter = false + params.run_preseq = false + params.run_mark_dups = false + params.run_remove_dups = false + params.run_remove_linear_dups = false + params.run_peak_calling = false + params.run_reporting = false + params.run_deeptools_heatmaps = false + params.run_deeptools_qc = false + params.run_peak_qc = false + params.run_multiqc = false + params.run_igv = false +} diff --git a/conf/modules.config b/conf/modules.config index 676eb44a..e95da593 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1149,3 +1149,121 @@ if (params.run_multiqc) { } } } + +/* +======================================================================================== + DIFFERENTIAL PEAK CALLING +======================================================================================== +*/ + +process { + withName: '.*WRITE_DIFFERENTIAL_MANIFESTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*MAKE_DIFFBIND_SAMPLESHEET' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*DIFFBIND_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_BINS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/bins" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_COUNTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/counts" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + container = '' + } + + withName: '.*CHIPBINNER_HDBSCAN_GRID' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/clustering" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + container = '' + } + + withName: '.*CHIPBINNER_ROTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> + if (filename == 'versions.yml') { return null } + if (filename == 'plots') { return 'plots' } + if (filename.startsWith('plots')) { return "plots/${filename}" } + if (filename.endsWith('.bed')) { return "bed/${filename}" } + if (filename.contains('differential')) { return "differential/${filename}" } + return filename + } + ] + container = '' + } + + withName: '.*SPAN_CAPABILITY_PROBE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SPAN_COMPARE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SPAN_FALLBACK_DIFF' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*ANNOTATE_REGIONS' { + container = '' + } + + withName: '.*DIFFERENTIAL_OVERLAP' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/04_cross_comparison" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*DIFFERENTIAL_SUMMARY_MERGE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} diff --git a/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md b/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md new file mode 100644 index 00000000..b4a7a403 --- /dev/null +++ b/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md @@ -0,0 +1,657 @@ +# Differential Peak-Calling Specification + +**Status**: TODO #4 from `docs/extension_specs/TODO.md` +**Version**: 1.0 +**Date**: 2026-01-04 + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Parameters](#parameters) +4. [DiffBind Implementation](#diffbind-implementation) +5. [ChIPBinner Implementation](#chipbinner-implementation) +6. [SPAN Differential Implementation](#span-differential-implementation) +7. [Output Directory Structure](#output-directory-structure) +8. [MultiQC Integration](#multiqc-integration) +9. [Execution Examples](#execution-examples) +10. [Implementation Notes](#implementation-notes) + +--- + +## Overview + +This specification defines three complementary differential analysis approaches for the nf-core/cutandrun pipeline: + +| Approach | Best For | Method | Outputs | +|----------|----------|--------|---------| +| **DiffBind** | Traditional peak-centric differential binding | Counts reads in peak regions; DESeq2/edgeR statistics | Differential binding tables, MA/volcano plots | +| **ChIPBinner** | Broad marks with global changes (e.g., H3K36me2 after NSD1 KO) | Genome-wide binning + HDBSCAN clustering | Clustered regions, scatterplots, LOLA enrichment | +| **SPAN Differential** | Semi-supervised broad peak comparison | SPAN's native 'compare' mode | Differential peak regions | + +### When to Use Each + +- **DiffBind**: Standard choice for most CUT&RUN experiments with focal/punctate marks (H3K4me3, H3K27ac at promoters, CTCF). Works with any peak caller output. + +- **ChIPBinner**: Specifically designed for experiments where you expect **global redistribution** of signal (e.g., writer/eraser knockouts affecting H3K36me2/me3, H3K27me3). Requires spike-in or MS normalization for reliable results. + +- **SPAN Differential**: Best for broad histone marks analyzed with SPAN peak calling. Leverages SPAN's internal model for semi-supervised comparison. + +### Contrast Model + +All three approaches compare **conditions within the same group** (i.e., within the same histone mark/target): + +``` +H3K36me2_sgNSD3 vs H3K36me2_sgControl ✓ Valid comparison +H3K27me3_Treatment vs H3K27me3_Control ✓ Valid comparison +H3K36me2_sgNSD3 vs H3K27me3_sgNSD3 ✗ Not supported (cross-group) +``` + +Groups lacking a second condition are **skipped with a warning** (not an error). + +--- + +## Architecture + +### Subworkflow Structure + +``` +DIFFERENTIAL_ANALYSIS (new subworkflow) +├── DIFFBIND_ANALYSIS +│ ├── GENERATE_DIFFBIND_SAMPLESHEET +│ ├── DIFFBIND_COUNT +│ ├── DIFFBIND_ANALYZE +│ └── DIFFBIND_REPORT +├── CHIPBINNER_ANALYSIS +│ ├── CHIPBINNER_BINNING +│ ├── CHIPBINNER_NORMALIZE +│ ├── CHIPBINNER_CLUSTER (HDBSCAN grid search) +│ ├── CHIPBINNER_ANNOTATE +│ ├── CHIPBINNER_DIFFERENTIAL (ROTS) +│ ├── CHIPBINNER_ENRICHMENT (LOLA) +│ └── CHIPBINNER_PLOTS +└── SPAN_DIFFERENTIAL + ├── SPAN_COMPARE + └── SPAN_DIFF_REPORT +``` + +### Data Flow + +``` + Peak Calls (per-caller) + │ + ┌──────────────────┼──────────────────┐ + │ │ │ + ▼ ▼ ▼ + [DiffBind] [ChIPBinner] [SPAN Diff] + │ │ │ + │ BAMs + spike-in factors │ + │ │ │ + │ ▼ │ + │ Binned signal matrix │ + │ │ │ + │ ▼ │ + │ HDBSCAN clusters │ + │ │ │ + ▼ ▼ ▼ + Diff tables Cluster BEDs Diff peaks + + plots + LOLA results + stats + │ │ │ + └────────────┼────────────────────────┘ + ▼ + 08_differential/ +``` + +--- + +## Parameters + +### Global Differential Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--run_diffbind` | boolean | `false` | Enable DiffBind differential analysis | +| `--run_chipbinner` | boolean | `false` | Enable ChIPBinner differential analysis | +| `--run_span_diff` | boolean | `false` | Enable SPAN native differential comparison | + +### DiffBind Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--diffbind_fdr` | float | `0.05` | FDR threshold for significant differential peaks | +| `--diffbind_lfc` | float | `1.0` | Minimum absolute log2 fold-change threshold | +| `--diffbind_method` | string | `DESeq2` | Analysis method: `DESeq2` or `edgeR` | +| `--diffbind_norm` | string | `lib` | Normalization: `lib` (library size), `RLE`, `TMM`, `native` | +| `--diffbind_summits` | integer | `200` | Extend peaks to this size around summit (0 = use full peak) | +| `--export_diffbind_sheets` | boolean | `true` | Export DiffBind-compatible samplesheets | + +### ChIPBinner Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--chipbinner_bin_size` | integer | `10000` | Bin size in bp (1000, 5000, 10000, 25000) | +| `--chipbinner_use_control` | boolean | `true` | Use IgG/input control for normalization | +| `--chipbinner_fdr` | float | `0.05` | FDR threshold for ROTS differential analysis | +| `--chipbinner_lfc` | float | `1.0` | Minimum absolute log2 fold-change | +| `--chipbinner_bootstrap` | integer | `1000` | ROTS bootstrap iterations | +| `--chipbinner_k_value` | integer | `100000` | ROTS top list size (K parameter) | +| `--hdbscan_min_range` | string | `100,400,700,1000` | min_cluster_size values for grid search | +| `--hdbscan_max_range` | string | `100,400,700,1000` | min_samples values for grid search | + +### SPAN Differential Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--span_diff_gap` | integer | `5` | Gap parameter for SPAN compare mode | +| `--span_diff_fdr` | float | `0.05` | FDR threshold for SPAN differential peaks | +| `--span_diff_bin` | integer | `200` | Bin size for SPAN comparison | + +--- + +## DiffBind Implementation + +### Overview + +DiffBind analyzes each peak caller's output independently, running separate differential analyses per caller. This allows comparison of how different peak callers affect downstream differential results. + +### Input Requirements + +For each `(group, caller)` combination where the group has 2+ conditions: + +| Input | Source | Format | +|-------|--------|--------| +| Peak files | Peak calling subworkflow | BED/narrowPeak/broadPeak | +| BAM files | Alignment output | Sorted, deduplicated BAM + BAI | +| Sample metadata | Samplesheet | group, condition, replicate | + +### Samplesheet Generation + +The pipeline automatically generates DiffBind-compatible samplesheets: + +```csv +SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks,PeakCaller +H3K36me2_sgControl_rep1,H3K36me2,H3K36me2,sgControl,1,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgControl_rep2,H3K36me2,H3K36me2,sgControl,2,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgNSD3_rep1,H3K36me2,H3K36me2,sgNSD3,1,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgNSD3_rep2,H3K36me2,H3K36me2,sgNSD3,2,/path/to/bam,/path/to/peaks.bed,macs2 +``` + +**Mapping from pipeline metadata:** +- `SampleID` = `meta.sample_id` +- `Tissue` = `meta.group` +- `Factor` = `meta.group` +- `Condition` = `meta.condition` +- `Replicate` = `meta.replicate` +- `bamReads` = Deduplicated BAM path +- `Peaks` = Peak file path +- `PeakCaller` = Caller name (for metadata; DiffBind uses peak file directly) + +### DiffBind v3+ Workflow + +```r +# 1. Create DBA object from samplesheet +dba_obj <- dba(sampleSheet = "samplesheet.csv") + +# 2. Count reads in peaks +dba_obj <- dba.count(dba_obj, summits = params$diffbind_summits) + +# 3. Normalize +dba_obj <- dba.normalize(dba_obj, method = DBA_ALL_METHODS, normalize = params$diffbind_norm) + +# 4. Set up contrast (condition1 vs condition2) +dba_obj <- dba.contrast(dba_obj, categories = DBA_CONDITION, minMembers = 2) + +# 5. Run differential analysis +dba_obj <- dba.analyze(dba_obj, method = params$diffbind_method) + +# 6. Extract results +results <- dba.report(dba_obj, th = params$diffbind_fdr, fold = params$diffbind_lfc) +``` + +### Output Visualizations + +| Plot | File | Description | +|------|------|-------------| +| MA plot | `{group}_{caller}_MA.pdf` | Log fold-change vs mean concentration | +| Volcano plot | `{group}_{caller}_volcano.pdf` | -log10(p-value) vs log2FC | +| PCA plot | `{group}_{caller}_PCA.pdf` | Sample clustering by binding affinity | +| Correlation heatmap | `{group}_{caller}_correlation.pdf` | Sample-sample correlation matrix | +| Binding heatmap | `{group}_{caller}_heatmap.pdf` | Binding affinity at differential sites | +| Venn diagram | `{group}_{caller}_venn.pdf` | Overlap of peaks across conditions | +| Concentration plot | `{group}_{caller}_concentration.pdf` | Read concentration at binding sites | + +### Output Tables + +| File | Format | Content | +|------|--------|---------| +| `{group}_{caller}_differential.tsv` | TSV | Full results with coordinates, FC, FDR, concentrations | +| `{group}_{caller}_significant_up.bed` | BED | Peaks significantly UP in treatment | +| `{group}_{caller}_significant_down.bed` | BED | Peaks significantly DOWN in treatment | +| `{group}_{caller}_samplesheet.csv` | CSV | DiffBind-ready samplesheet for external use | + +--- + +## ChIPBinner Implementation + +### Overview + +ChIPBinner is implemented as a self-contained module that handles the complete workflow from BAM files to differential clusters. It leverages the pipeline's existing spike-in normalization factors. + +### Module Architecture + +``` +CHIPBINNER_ANALYSIS +│ +├─ Input: BAMs + spike-in scale factors + (optional) pooled IgG +│ +├─ Step 1: CHIPBINNER_BINNING +│ └─ BAM → BED → binned BED (bedtools intersect with reference windows) +│ +├─ Step 2: CHIPBINNER_NORMALIZE +│ └─ Apply spike-in factors via norm_bw() +│ └─ Output: normalized bigWigs per sample +│ +├─ Step 3: CHIPBINNER_PRECLUST +│ └─ Generate signal matrix + pooled BED coordinates +│ +├─ Step 4: CHIPBINNER_CLUSTER (grid search) +│ └─ Run HDBSCAN with parameter combinations +│ └─ Output: cluster assignments for each parameter set +│ +├─ Step 5: CHIPBINNER_ANNOTATE +│ └─ Extract 2-cluster and 3-cluster solutions +│ └─ Output: annotated_clusters.rda (GRanges per cluster) +│ +├─ Step 6: CHIPBINNER_DIFFERENTIAL +│ └─ ROTS analysis per cluster +│ └─ Output: logFC, FDR per bin per cluster +│ +├─ Step 7: CHIPBINNER_ENRICHMENT +│ └─ LOLA enrichment with Ensembl, cCRE, RepeatMasker databases +│ └─ Output: enrichment tables and plots per cluster +│ +└─ Step 8: CHIPBINNER_PLOTS + └─ Generate all visualizations +``` + +### Normalization Flow + +ChIPBinner uses the pipeline's existing spike-in infrastructure: + +``` +Pipeline spike-in calculation (PREPARE_PEAKCALLING) + │ + ▼ + Scale factors (from --normalisation_mode Spikein) + │ + ▼ + ChIPBinner norm_bw(scaling_factor = pipeline_scale_factor) + │ + ▼ + If --chipbinner_use_control: + norm_bw(use_input = TRUE, input_binned_file = pooled_IgG.binned.bed) + Else: + norm_bw(use_input = FALSE) +``` + +### Binning Reference + +Reference window BED files are required for binning. The pipeline expects: + +``` +# From ChIPBinner_database repository +hg38.10kb.windows.bed # For --chipbinner_bin_size 10000 +hg38.5kb.windows.bed # For --chipbinner_bin_size 5000 +hg38.1kb.windows.bed # For --chipbinner_bin_size 1000 +mm10.10kb.windows.bed # For mouse +``` + +**Parameter**: `--chipbinner_windows_dir` (path to directory containing window BED files) + +### HDBSCAN Grid Search + +The pipeline runs HDBSCAN clustering with a grid of parameters: + +```python +# Grid search combinations +for min_cluster_size in [100, 400, 700, 1000]: + for min_samples in [100, 400, 700, 1000]: + clusterer = hdbscan.HDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=min_samples + ).fit(signal_matrix) + save(f"clus.{comparison}.{min_cluster_size}.{min_samples}.txt") +``` + +**Total combinations**: 16 (4 × 4 grid) + +### Cluster Extraction + +From the grid search results, extract **fixed 2-cluster and 3-cluster solutions**: + +```r +# For each parameter combination, extract: +# - 2clusters.annotated_clusters.rda (Cluster A, Cluster B) +# - 3clusters.annotated_clusters.rda (Cluster A, Cluster B, Cluster C) + +# Clusters are labeled by behavior: +# A = bins with signal in both conditions (stable) +# B = bins with signal preferentially in WT/control +# C = bins with signal preferentially in treatment/KO +``` + +### LOLA Enrichment + +Automatic enrichment analysis using ChIPBinner's bundled databases: + +| Database | Source | Content | +|----------|--------|---------| +| Ensembl | ensembl.org | Gene annotations, promoters, exons, introns | +| cCRE | ENCODE | Candidate cis-regulatory elements | +| RepeatMasker | repeatmasker.org | Repeat element annotations | + +**Output per cluster:** +- `{cluster}_ensembl_enrichment.tsv` +- `{cluster}_ccre_enrichment.tsv` +- `{cluster}_repeats_enrichment.tsv` +- `{cluster}_enrichment_plot.pdf` + +### Visualizations + +| Plot | File | Description | +|------|------|-------------| +| Genic/intergenic scatterplot | `{comparison}_genic_intergenic.pdf` | Signal stratified by genomic context | +| Density scatterplot | `{comparison}_density_scatter.pdf` | WT vs treatment with cluster coloring | +| PCA | `{comparison}_PCA.pdf` | Sample clustering by bin signal | +| Correlation | `{comparison}_correlation.pdf` | Sample-sample correlations | + +### ChIPBinner Output Files + +``` +08_differential/chipbinner/{group}/ +├── binned_beds/ +│ ├── {sample}.{bin_size}kb.binned.bed +│ └── ... +├── normalized_bigwigs/ +│ ├── {sample}.{bin_size}kb.norm.bw +│ └── ... +├── clustering/ +│ ├── {comparison}_preclust_mat.csv +│ ├── {comparison}_preclust_pooled.bed +│ ├── clus.{comparison}.{minpts}.{minsamps}.txt (16 files) +│ ├── {comparison}.2clusters.annotated_clusters.rda +│ └── {comparison}.3clusters.annotated_clusters.rda +├── differential/ +│ ├── {comparison}_rots_results.csv +│ └── {comparison}_stats_per_cluster.csv +├── enrichment/ +│ ├── {comparison}_{cluster}_ensembl_enrichment.tsv +│ ├── {comparison}_{cluster}_ccre_enrichment.tsv +│ └── {comparison}_{cluster}_enrichment_plot.pdf +└── plots/ + ├── {comparison}_genic_intergenic.pdf + ├── {comparison}_density_scatter.pdf + ├── {comparison}_PCA.pdf + └── {comparison}_correlation.pdf +``` + +--- + +## SPAN Differential Implementation + +### Overview + +SPAN provides native differential peak calling via its 'compare' mode, which directly compares two conditions using SPAN's semi-supervised model. + +### Input Requirements + +| Input | Source | Description | +|-------|--------|-------------| +| Treatment BAMs | Alignment output | Condition 1 replicates (pooled or individual) | +| Control BAMs | Alignment output | Condition 2 replicates (pooled or individual) | +| IgG BAMs | Pooled controls | Background control for both conditions | +| Chrom sizes | Reference | Chromosome sizes file | +| omnipeaks.jar | `--omnipeaks_jar` | SPAN executable | + +### SPAN Compare Command + +```bash +java -Xmx${java_heap} -jar omnipeaks.jar compare \ + -t1 condition1_rep1.bam,condition1_rep2.bam \ + -t2 condition2_rep1.bam,condition2_rep2.bam \ + -c1 condition1_IgG.bam \ + -c2 condition2_IgG.bam \ + --cs chrom.sizes \ + --bin ${span_diff_bin} \ + --gap ${span_diff_gap} \ + --fdr ${span_diff_fdr} \ + -o ${output_prefix} +``` + +### Output Files + +``` +08_differential/span_diff/{group}/ +├── {comparison}_span_diff.peak # Differential peaks +├── {comparison}_span_diff_up.bed # Peaks higher in condition1 +├── {comparison}_span_diff_down.bed # Peaks higher in condition2 +└── {comparison}_span_diff_stats.tsv # Statistics per peak +``` + +--- + +## Output Directory Structure + +All differential analysis outputs are organized under `03_peak_calling/08_differential/`: + +``` +03_peak_calling/ +└── 08_differential/ + ├── diffbind/ + │ ├── samplesheets/ + │ │ ├── {group}_{caller}_samplesheet.csv + │ │ └── ... + │ ├── {group}_{caller}/ + │ │ ├── {group}_{caller}_differential.tsv + │ │ ├── {group}_{caller}_significant_up.bed + │ │ ├── {group}_{caller}_significant_down.bed + │ │ ├── {group}_{caller}_MA.pdf + │ │ ├── {group}_{caller}_volcano.pdf + │ │ ├── {group}_{caller}_PCA.pdf + │ │ ├── {group}_{caller}_correlation.pdf + │ │ ├── {group}_{caller}_heatmap.pdf + │ │ └── {group}_{caller}_venn.pdf + │ └── ... + ├── chipbinner/ + │ ├── {group}/ + │ │ ├── binned_beds/ + │ │ ├── normalized_bigwigs/ + │ │ ├── clustering/ + │ │ ├── differential/ + │ │ ├── enrichment/ + │ │ └── plots/ + │ └── ... + ├── span_diff/ + │ ├── {group}/ + │ │ ├── {comparison}_span_diff.peak + │ │ ├── {comparison}_span_diff_up.bed + │ │ └── {comparison}_span_diff_down.bed + │ └── ... + └── multiqc/ + ├── differential_summary_mqc.tsv + └── differential_multiqc_report.html +``` + +--- + +## MultiQC Integration + +### Integrated Summary (Main Report) + +Add to the existing MultiQC report: + +| Section | Content | +|---------|---------| +| Differential Summary | Table of differential peak counts per group/caller/method | +| Method Comparison | Overlap statistics between DiffBind/ChIPBinner/SPAN results | + +**Header file**: `assets/multiqc/differential_summary_header.txt` + +### Separate Differential Report + +Generate `differential_multiqc_report.html` with detailed sections: + +| Section | Content | +|---------|---------| +| DiffBind Results | Per-group tables, embedded MA/volcano plots | +| ChIPBinner Results | Cluster statistics, enrichment summaries | +| SPAN Results | Differential peak statistics | +| Cross-Method Comparison | Venn diagrams, overlap metrics | + +--- + +## Execution Examples + +### Basic Usage + +```bash +# Run DiffBind only +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --peakcaller seacr,macs2 + +# Run ChIPBinner for broad marks +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_chipbinner \ + --normalisation_mode Spikein \ + --chipbinner_bin_size 10000 + +# Run all three methods +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --run_chipbinner \ + --run_span_diff \ + --peakcaller span_default +``` + +### Customized Thresholds + +```bash +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --run_chipbinner \ + --diffbind_fdr 0.01 \ + --diffbind_lfc 1.5 \ + --chipbinner_fdr 0.05 \ + --chipbinner_lfc 1.0 \ + --chipbinner_use_control false +``` + +### Using Params File + +```json +{ + "input": "samplesheet.csv", + "outdir": "results", + "normalisation_mode": "Spikein", + "peakcaller": "seacr,macs2,span_default", + + "run_diffbind": true, + "run_chipbinner": true, + "run_span_diff": true, + + "diffbind_fdr": 0.05, + "diffbind_lfc": 1.0, + "diffbind_method": "DESeq2", + + "chipbinner_bin_size": 10000, + "chipbinner_use_control": true, + "chipbinner_fdr": 0.05, + + "span_diff_gap": 5, + "span_diff_fdr": 0.05 +} +``` + +--- + +## Implementation Notes + +### Container Requirements + +**DiffBind Module:** +- R 4.2+ +- Bioconductor 3.16+ +- DiffBind 3.8+ +- DESeq2, edgeR + +**ChIPBinner Module:** +- R 4.2+ +- ChIPBinner (installed at runtime from GitHub) +- Python 3.9+ (for HDBSCAN via reticulate) +- LOLA, GenomicRanges, rtracklayer + +**SPAN Module:** +- Java 21 JRE +- omnipeaks.jar (provided via `--omnipeaks_jar`) + +### Condition Detection + +Groups with valid comparisons are detected automatically: + +```groovy +// In DIFFERENTIAL_ANALYSIS subworkflow +ch_valid_groups = ch_samples + .map { meta, files -> [meta.group, meta.condition] } + .groupTuple() + .filter { group, conditions -> conditions.unique().size() >= 2 } + .map { group, conditions -> group } + +// Groups with only one condition are logged as warnings +ch_skipped_groups = ch_samples + .map { meta, files -> [meta.group, meta.condition] } + .groupTuple() + .filter { group, conditions -> conditions.unique().size() < 2 } + .subscribe { group, conditions -> + log.warn "Skipping differential analysis for group '${group}': only one condition found (${conditions.unique()})" + } +``` + +### Resource Requirements + +| Process | Memory | CPUs | Time Estimate | +|---------|--------|------|---------------| +| DIFFBIND_ANALYZE | 16 GB | 4 | 30 min/group | +| CHIPBINNER_CLUSTER | 32 GB | 8 | 2-4 hrs/comparison | +| CHIPBINNER_DIFFERENTIAL | 16 GB | 4 | 1 hr/comparison | +| SPAN_COMPARE | 8 GB | 2 | 30 min/comparison | + +### Error Handling + +| Scenario | Behavior | +|----------|----------| +| Group has < 2 conditions | Skip with warning | +| Group has < 2 replicates per condition | Skip with warning (DiffBind requires >= 2) | +| Peak caller produced no peaks | Skip DiffBind for that caller with warning | +| ChIPBinner clustering fails | Retry with lower min_cluster_size | +| SPAN compare fails | Log error, continue with other groups | + +--- + +## References + +- **DiffBind**: Stark R, Brown G (2011). DiffBind: differential binding analysis of ChIP-Seq peak data. Bioconductor package. +- **ChIPBinner**: Padilla R, Bareke E, Hu B, Majewski J (2024). ChIPbinner: An R package for analyzing broad histone marks binned in uniform windows. bioRxiv. +- **SPAN**: Shpynov O et al. (2021). Semi-supervised peak calling with SPAN. Bioinformatics. +- **ROTS**: Suomi T et al. (2017). ROTS: An R package for reproducibility-optimized statistical testing. PLoS Comput Biol. +- **LOLA**: Sheffield NC, Bock C (2016). LOLA: enrichment analysis for genomic region sets. Bioinformatics. diff --git a/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md b/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md new file mode 100644 index 00000000..ed39c0ff --- /dev/null +++ b/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md @@ -0,0 +1,582 @@ +# 5.2 Differential Peak Calling Spec + +Summary + +This SPEC defines how to implement **differential peak calling / differential enrichment** in the `dhusmann/cutandrun` fork (branch `differential_peak_calling`) using three complementary approaches: + 1. DiffBind (using peaks produced by *existing* peak callers in the pipeline) + 2. ChIPBinner (bin-based differential + clustering for broad marks; include Snakemake-parity notes for easier porting) + 3. SPAN/OmniPeaks (a SPAN-native differential workflow, if supported by the OmniPeaks JAR / SPAN ecosystem) + +This SPEC is written to be compatible with the nf-core/cutandrun architecture and conventions (DSL2, modular processes, schema-driven params, MultiQC reporting), and to build directly on the already-implemented condition-aware data model (`group`, `condition`, `replicate`) and multi-peakcaller outputs. + +⸻ + +Context (what exists already in this fork) + + • Samplesheet supports `condition` and constructs unique `sample_id` and `group_condition`. + • Peak calling supports multiple callers / variants (comma-separated `--peakcaller`, case-insensitive). + • First caller is “primary” for downstream consensus/QC; other callers are output-only **today**. + • Spike-in scaling can be computed per scope and recorded as TSVs when `--dump_scale_factors true`. + • Some callers (epic2, SPAN) require pooled controls (IgG) per `(control_group, condition)` for peak calling. + +Differential peak calling should treat differential analysis as a **separate layer** that: + • does not mutate existing peak calling outputs, + • can be run during a standard pipeline execution, or posthoc from an existing `--outdir`, + • and (per requirements) runs differential analyses for **every caller** used in `--peakcaller`, not only the primary. + +⸻ + +Requirements captured from user interview (implementation must match these) + +Invocation / modes + • Differential analysis must support: + • Integrated mode: run as part of a normal pipeline run (opt-in). + • Posthoc mode: run differential-only against an existing pipeline `--outdir` without re-running upstream. + +Caller coverage + • Differential analysis must run for **every caller variant** in `--peakcaller` (primary + secondary), producing parallel outputs per caller. + +Peak set strategy + • For DiffBind: use per-sample called peaks; DiffBind handles its own consensus/union internally. + • No peak recentering by default. + • Set a reasonable default for `minOverlap`, but allow the user to pass “full DiffBind params” if desired. + +Normalization / controls + • If `--normalisation_mode Spikein`, incorporate the pipeline’s spike-in scale factors into differential analyses (see details below). + • IgG / control samples are **not used** in the differential stage by default (IgG affects peak calling; differential compares WT vs KO targets only). + +Experimental design constraints (differential stage) + • There should be **exactly 2 biological conditions** for any differential comparison. + • For each `group` (mark), require **≥ 2 biological replicates** per condition; fail if fewer. + • Batch effects are out-of-scope for v1 (no batch covariate in the model by default). + +Outputs + • For each method (DiffBind / ChIPBinner / SPAN-diff), produce: + • Differential result tables (TSV) + BED-like outputs for significant features + • Diagnostic plots (e.g., MA/volcano/heatmap/PCA as applicable) + • Gene annotation (e.g., nearest gene/TSS from a BED/GTF-derived reference) + • MultiQC integration stubs (at minimum: summary tables consumable by MultiQC custom content) + +⸻ + +Goals + +Functional goals + 1. Add opt-in differential analysis to the pipeline (DiffBind, ChIPBinner, SPAN-diff). + 2. Run differential **per mark (`group`)** and **per caller** (for DiffBind and any caller-derived method). + 3. Support two execution modes: + • Integrated: differential runs immediately after peak calling outputs exist. + • Posthoc: differential can be executed later from an existing `--outdir`. + 4. Use spike-in scale factors (when present) as external normalization input for differential analysis. + 5. Produce a predictable output tree with enough metadata/manifesting to make downstream usage repeatable. + +Non-goals (explicitly out of scope for this SPEC) + • `--compare_norm_methods` and `--compare_peak_callers` modes (TODO #5–6). + • General downstream enrichment characterization (ChromHMM, cCRE, broad LOLA/GSEA library generation) beyond what is needed for ChIPBinner parity stubs; that work is TODO #7. + • Automatic batch covariate modeling (can be added later once the data model supports it cleanly). + +⸻ + +Definitions / data model (reused from existing fork) + +Key fields (already exist in `meta`) + • group: biological target (mark/factor) + • condition: biological condition label (WT vs KO; control vs treated) + • replicate: integer replicate index + • sample_id / meta.id: unique sample name (already collision-safe across conditions) + • caller: peak caller id (e.g., `seacr`, `macs2_narrow`, `span_default`) + +Differential comparison terminology + • “control condition”: the baseline / reference in the contrast (WT / sgControl) + • “treated condition”: the perturbed condition (KO / sgKO) + • log2FC sign convention: **log2(treated / control)** (must be enforced consistently across methods) + +NOTE: The pipeline cannot reliably infer treated vs control purely from condition names across all datasets. Therefore, the differential layer must expose an explicit contrast selection interface (see Parameters). + +⸻ + +User-facing interface (new parameters) + +Top-level toggles (v1) + • `--run_diffbind` (boolean; default false) + • `--run_chipbinner` (boolean; default false) + • `--run_span_diff` (boolean; default false) + +Execution mode (integrated vs posthoc) + • `--differential_from_run ` (string path; default null) + • If set: pipeline runs *differential-only* and reads required inputs from the referenced `--outdir`. + • If unset: differential runs integrated (if any `--run_*` toggles are true). + +Contrast selection (must exist for all differential methods) + • `--differential_contrast ","` + • Example: `--differential_contrast "sgNSD3,sgControl"` + • This defines the log2FC direction: **treated vs control**. + • Validation: both conditions must exist for each affected `group`. + +Minimum replicate enforcement + • `--differential_min_replicates 2` (integer; default 2) + • Enforced per `group`×`condition`. + +DiffBind parameters (v1 defaults + “full params” escape hatch) + • `--diffbind_fdr 0.05` (default 0.05) + • `--diffbind_min_overlap 2` (default 2; see notes below) + • `--diffbind_recenter_peaks false` (default false; do not summit-recenter) + • `--diffbind_backend DESeq2|edgeR` (default DESeq2) + • `--diffbind_extra_params ` (optional) + • If set: pass through to the DiffBind R runner to override any internal defaults and/or to supply additional DiffBind `dba.*` arguments. + • Implementation detail: prefer a JSON file (schema described below) to avoid brittle string parsing. + +Spike-in integration switches + • `--differential_use_spikein true|false` (default: auto) + • Auto behavior: + • If `--normalisation_mode Spikein`: true (use scale factors as external normalization) + • Else: false + +ChIPBinner parameters (v1) + • `--chipbinner_bin_size 10000` (default 10000; 10kb) + • `--chipbinner_use_input false` (default false; aligns with “no IgG in differential”) + • `--chipbinner_pseudocount 1` (default 1) + • `--chipbinner_hdbscan_grid_minpts "100,200,500,1000"` (default as string list) + • `--chipbinner_hdbscan_grid_minsamps "100,200,500,1000"` (default as string list) + • `--chipbinner_ms_coeffs ` (optional; Snakemake-parity support) + • `--chipbinner_functional_db ` (optional; LOLA/enrichment stubs) + +SPAN/OmniPeaks differential parameters (v1; some may be “TBD” pending OmniPeaks CLI capabilities) + • `--span_diff_mode native` (v1: native) + • `--span_diff_fdr 0.05` (default 0.05) + • `--span_diff_gap 5` (default 5) + • `--span_diff_java_heap 8G` (default 8G) + • `--omnipeaks_jar ` (already exists for peak calling; reused here) + +Schema + docs requirements + • All new params must be added to: + • `nextflow.config` defaults + • `nextflow_schema.json` + • `docs/usage.md` and `docs/output.md` + +⸻ + +High-level workflow design + +Add a new subworkflow layer that runs after peak calling outputs exist: + + • `subworkflows/local/differential_peak_calling.nf` (new) + • inputs: channels of target BAMs, per-sample per-caller peaks, metadata, reference features + • implements: DIFFBIND, CHIPBINNER, SPAN_DIFF (each gated by `--run_*`) + • emits: differential result artifacts + summary tables for MultiQC + +Integrated execution (normal pipeline run) + • `workflows/cutandrun.nf` calls `DIFFERENTIAL_PEAK_CALLING` after peak calling is complete. + • The differential layer consumes in-memory channels, avoiding filesystem re-discovery. + +Posthoc execution (differential-only from an existing run) + Two acceptable patterns; pick one and implement fully (recommended: A). + +Pattern A (recommended): dedicated entrypoint + • Add `workflow DIFFERENTIAL_ONLY` (new DSL2 entry) that: + • reads a “design manifest” produced by the main pipeline run (see next section), + • builds channels equivalent to integrated mode, and + • runs `DIFFERENTIAL_PEAK_CALLING` only. + • Invoke via Nextflow: + • `nextflow run dhusmann/cutandrun -r differential_peak_calling -entry DIFFERENTIAL_ONLY -profile singularity --differential_from_run --outdir --differential_contrast "KO,WT" ...` + +Pattern B: flowswitch-driven partial execution + • Add a flowswitch rule such that `--differential_from_run` disables upstream steps. + • Requires careful branching in the main workflow so channels are either “computed” or “loaded”. + +⸻ + +Make posthoc reliable: write a design manifest (new output artifact) + +Posthoc differential should not rely on fragile filename parsing. Add a small, explicit manifest to the normal pipeline output. + +New files (generated during integrated runs) + + • `03_peak_calling/06_differential/00_manifests/` + • `differential_manifest.samples.tsv` + • One row per *target* sample. + • Columns (minimum): + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (or NA) + • `differential_manifest.peaks.tsv` + • One row per (sample_id, caller). + • Columns: + • sample_id + • caller + • peaks_path + • peaks_format (bed / narrowPeak / broadPeak / etc.) + • `differential_manifest.bigwig.tsv` (optional, but recommended) + • One row per sample for the target bigWig used in visualization layers. + +Manifest generation rules + • Emit manifests only when peak calling ran successfully (or emit partial manifests + a failure summary). + • Manifests must be deterministic (sorted stable order) to support `-resume`. + • Manifests must only include *published* paths under `--outdir` so posthoc runs don’t need `work/`. + +⸻ + +Method 1: DiffBind (caller-derived differential peak calling) + +1.1 Inputs + • Per sample: + • final target BAM + BAI (post-processed BAM saved by the pipeline) + • called peaks for the selected caller variant + • Metadata: + • group (mark), condition, replicate + • Reference: + • Gene BED (or GTF-derived features) for annotation (see “Annotation” section) + • Spike-in: + • Per-sample spike-in scale factor (when `--differential_use_spikein` is true) + +IgG/controls + • Do not use IgG BAMs as `bamControl` in DiffBind by default (per requirement #5a). + • Keep `bamControl`/`ControlID` blank or `NA` in the DiffBind sample sheet. + +1.2 Grouping strategy + • Run DiffBind separately for each: + • `group` (mark), and + • `caller` (every caller in `params.callers`) + +In pseudocode: + • for caller in params.callers: + • for group in unique(meta.group): + • run DiffBind on samples where meta.group == group + +1.3 DiffBind sample sheet generation (per group×caller) + +Write one CSV per `(caller, group)`: + + `03_peak_calling/06_differential/01_diffbind/00_samplesheets//.diffbind.csv` + +Required columns (DiffBind-compatible minimum) + • SampleID: `meta.id` + • Factor: `meta.group` (mark) + • Condition: `meta.condition` + • Replicate: `meta.replicate` + • bamReads: path to final BAM + • Peaks: path to peak file for this caller+sample + +Optional columns (recommended) + • Tissue: a constant placeholder like `CUTRUN` + • PeakCaller: `caller` (or a short descriptor) + • Caller: same as `PeakCaller` (for easier downstream parsing) + +Validation + • Fail if: + • more or less than 2 unique conditions exist for the selected group (in v1). + • either condition has fewer than `--differential_min_replicates`. + • any referenced BAM/BAI/Peaks file is missing. + +1.4 DiffBind execution (R runner module) + +Implement as a Nextflow module that executes an R script: + • `modules/local/diffbind_run.nf` (new) + • `bin/diffbind_run.R` (new) + +Required behaviors + • Accept: + • sample sheet CSV + • contrast definition: treated vs control + • default parameters (minOverlap, FDR, backend, no recentering) + • optional `diffbind_extra_params` file to override defaults + • optional spike-in scale factor TSV for those samples (or embedded column) + +Defaults (hard defaults, but overridable) + • minOverlap: 2 + • summits/recenter: disabled + • FDR threshold: 0.05 + • backend: DESeq2 (allow edgeR) + +Spike-in integration (required when enabled) + +Goal: incorporate per-sample external scaling so that differential log2FC reflects spike-in normalization. + +Design constraint: DiffBind internally uses DESeq2/edgeR normalization assumptions (“most features unchanged”), which can be violated in global-change settings. When `--normalisation_mode Spikein`, we have external scale factors and should prefer them. + +Implementation strategy (v1; robust and explicit): + 1. Use DiffBind primarily to: + • import peaks/BAMs, + • build a consensus peakset and counts matrix (per DiffBind conventions). + 2. Run the differential test using DESeq2/edgeR **with user-provided size factors derived from spike-in**: + • Let `sf = 1 / scale_factor` per sample (so normalized counts = counts / sf = counts * scale_factor). + • Disable DiffBind’s internal normalization where possible, or bypass it by extracting the count matrix and running the backend directly. + +Notes: + • The exact DiffBind API details must be validated during implementation; the R runner should be structured so we can: + • fall back to standard DiffBind analysis if external scaling can’t be injected safely, + • while still meeting the “use spike-in scaling” requirement when possible. + +1.5 Outputs (DiffBind) + +Per `(caller, group)` write to: + + `03_peak_calling/06_differential/01_diffbind///` + +Artifacts (minimum) + • `diffbind.results.tsv` + • One row per tested region with: + • chrom, start, end + • log2FC (treated/control) + • pvalue, FDR + • concentration / counts fields if available + • `diffbind.significant.bed` + • BED-like output for peaks with FDR ≤ `--diffbind_fdr` + • Plots: + • `PCA.pdf` + • `correlation_heatmap.pdf` + • `MA.pdf` + • `volcano.pdf` + +Annotation (required) + • `diffbind.results.annotated.tsv` + • Add at minimum: + • nearest_gene (or nearest_feature) + • distance_to_feature + • feature_id (optional) + +MultiQC stubs (required) + • `diffbind.summary.tsv` + • Columns: + • caller, group, treated_condition, control_condition + • n_tested + • n_fdr_pass + • n_up (log2FC>0 & FDR pass) + • n_down (log2FC<0 & FDR pass) + +1.6 Error handling (DiffBind) + • Fail fast per group/caller if: + • replicate constraints are violated, + • input files missing, + • R script errors. + • Optionally support “continue on failure” later (not required for v1); for v1, prefer clear failure to prevent silent partial analyses. + +⸻ + +Method 2: ChIPBinner (bin-based differential + clustering; broad mark emphasis) + +2.1 Purpose and positioning + +ChIPBinner is best for broad marks and global changes. It does not produce traditional “peaks” first; instead it bins the genome (typically 10kb), quantifies signal per bin, clusters bins by behavior, and tests bins for differential enrichment (via ROTS). + +This method complements peak-caller-derived methods and should be implemented as a separate differential workflow (not as a peakcaller option). + +2.2 Inputs + • Per target sample: + • final target BAM + BAI (required) + • Reference: + • chrom sizes (required) + • binning windows BED (required; can be generated) + • Optional: + • MS coefficients YAML (Snakemake-parity) + • functional DB (for enrichment stubs) + +Controls (IgG/input) + • Default: `chipbinner_use_input=false` (aligns with the requirement of “no controls in differential stage” and with the Snakemake workflow note that controls are disabled by default). + • However, the implementation should be designed so that enabling input later is straightforward: + • if enabled, control bigWigs/BAMs should be matched by `(control_group, condition)` similarly to existing pooling logic. + +2.3 Workflow outline (per group) + +For each `group` (mark): + 1. Build (or load) reference bins: + • Default: generate bins from chrom sizes using `bedtools makewindows -w chipbinner_bin_size` + • Optional: subtract blacklist regions if available + 2. Quantify signal per bin per sample (BAM → bin counts) + 3. Normalize per sample: + • If `--differential_use_spikein` is true: + • Apply external scaling factor per sample (spike-in or MS) + • Optionally apply depth normalization (library size), but beware global-change assumptions + 4. QC: + • PCA + • sample correlation + 5. Clustering: + • Run HDBSCAN over bins using a grid search of `minPts` / `minSamps` (Snakemake-parity) + 6. Differential testing: + • Run ROTS-based differential per bin using replicates kept separate + 7. Annotation: + • Annotate bins to nearest genes / features (same annotation machinery as DiffBind) + 8. Enrichment stubs: + • If a functional DB is provided, run (or stub) LOLA/enrich_clust outputs + +2.4 Snakemake-parity notes (include these to ease porting) + +The `snakemake_chipbinner` repo adds automation beyond a minimal ChIPBinner run. To make porting easier, design the Nextflow implementation around the same conceptual interfaces: + + • CSV-based sample sheet management + • Implement a ChIPBinner-specific sample sheet CSV (one per group) written to: + • `03_peak_calling/06_differential/02_chipbinner/00_samplesheets/.chipbinner.csv` + • Config-driven MS normalization + • Support `--chipbinner_ms_coeffs `: + • v1: load the file and validate structure (even if only partially used initially). + • v2: use MS coefficients as scaling factors, analogous to spike-in scale factors. + • Batch comparisons + • v1: not supported in the statistical model. + • v2: add a “comparisons file” that enumerates multiple (treated, control) contrasts. + • Clustering parameter grid search + • Implement a grid over: + • minPts in [100..1000] + • minSamps in [100..1000] + • Output a summary table ranking parameter choices by cluster stability / replicate consistency metrics. + • Controls disabled by default + • Match snakemake defaults: `chipbinner_use_input=false`. + +2.5 Outputs (ChIPBinner) + +Write to: + `03_peak_calling/06_differential/02_chipbinner//` + +Artifacts (minimum) + • `chipbinner.bin_counts.tsv` (or CSV) + • `chipbinner.normalized_matrix.tsv` (or CSV) + • `chipbinner.clusters.tsv` (bin → cluster assignment) + • `chipbinner.differential.tsv` (bin-level logFC, FDR, cluster) + • Plots: + • `PCA.pdf` + • `correlation.pdf` + • Annotation: + • `chipbinner.differential.annotated.tsv` + • Grid search summary: + • `chipbinner.hdbscan_grid_summary.tsv` + +MultiQC stubs + • `chipbinner.summary.tsv` + • group, treated_condition, control_condition + • n_bins_tested, n_fdr_pass, n_up, n_down + • n_clusters, chosen_minPts, chosen_minSamps + +⸻ + +Method 3: SPAN/OmniPeaks differential (SPAN-native) + +3.1 Clarify tool capability (required discovery step) + +The current pipeline uses OmniPeaks via: + • `java -jar omnipeaks.jar analyze -t -c ...` + +This SPEC requires a SPAN-native differential workflow (user requirement #11b). Implementation must therefore: + 1. Inspect the OmniPeaks CLI help (`java -jar omnipeaks.jar --help`) and/or upstream documentation to determine whether a native differential subcommand exists (e.g., `diff`, `compare`, `differential`, etc.). + 2. If a native differential mode exists: + • implement it directly as a new module (recommended). + 3. If a native differential mode does *not* exist: + • document this clearly and implement the closest SPAN-consistent differential alternative as a separate workflow stage (see 3.4 fallback plan). + +3.2 Inputs (expected) +At minimum: + • final target BAMs (replicates per condition) + • chrom sizes + • OmniPeaks JAR + +Controls (IgG/input) + • Differential stage does not use IgG for modeling (per requirement #5a). + • If the OmniPeaks differential mode requires controls, use the already-produced pooled controls per condition as *technical inputs* to the SPAN command, but do not treat IgG as a “condition” in differential interpretation. + +3.3 Replicates + • Enforce `--differential_min_replicates` per condition (≥2). + • If OmniPeaks differential does not accept replicates directly, pool target BAMs per `(group, condition)` for the SPAN differential step, and record pooling explicitly: + • `03_peak_calling/06_differential/03_span/00_manifests/span_diff_target_pooling.tsv` + +3.4 Outputs (SPAN-diff) + +Write to: + `03_peak_calling/06_differential/03_span//` + +Artifacts (minimum) + • `span.differential.peaks.bed` (or native SPAN output) + • `span.differential.tsv` (log2FC, FDR, peak coords) + • Plots (if SPAN produces them; otherwise create minimal diagnostics): + • `signal_distributions.pdf` (optional) + • Annotation: + • `span.differential.annotated.tsv` + • MultiQC stubs: + • `span.summary.tsv` + +3.5 Fallback plan (if no native SPAN differential exists) + +If OmniPeaks cannot do native differential: + • Do not silently “pretend” it can. + • Implement a transparent SPAN-consistent fallback: + • Use SPAN’s own peak outputs per replicate (already generated), + • build a region set (union across samples), + • count reads per region from the final BAMs, + • run DESeq2/edgeR with spike-in size factors (same external normalization as DiffBind), + • label outputs clearly as “SPAN peaks + DESeq2 differential”. + +This fallback still satisfies “SPAN/OmniPeaks differential” in spirit while being explicit about the statistical engine used. + +⸻ + +Shared component: Gene/feature annotation (required for all methods) + +Because the pipeline already supports a gene BED / GTF for QC, prefer a minimal, tool-agnostic annotation approach: + • Convert differential regions (peaks/bins) to BED + • Annotate with nearest gene/feature using: + • `bedtools closest` against `--gene_bed` (if provided), or + • a GTF-derived BED of TSS/gene bodies (if `--gtf` exists) + +Outputs should include: + • nearest_feature_id / gene_name (if available) + • distance_to_feature + +⸻ + +MultiQC integration stubs (required) + +v1 should not attempt full MultiQC module development unless it’s low effort. Instead: + • Write small summary TSVs for each method. + • Add them as MultiQC custom content tables via `assets/multiqc_config.yml` (custom content). + +Minimum: include per-method summary tables described above. + +⸻ + +Acceptance criteria (what “done” means for implementation) + +Functional + • With a condition-aware samplesheet containing exactly 2 conditions and ≥2 reps/condition: + • `--run_diffbind` produces DiffBind results for **each caller** in `--peakcaller` for each `group`. + • `--run_chipbinner` produces ChIPBinner outputs per `group` including differential tables and plots. + • `--run_span_diff` produces SPAN differential outputs per `group` (native if supported, otherwise explicit fallback). + • With `--normalisation_mode Spikein`, differential results incorporate spike-in scaling (documented and testable). + • With `--differential_from_run `, differential-only runs succeed without recomputing upstream steps. + +Validation / errors + • Fail clearly if: + • contrast not provided or invalid, + • not exactly 2 conditions, + • replicate constraints violated. + +Outputs + • Output directories and filenames match this SPEC (or the implemented naming is documented and stable). + • All three methods produce: + • a result TSV + • plots + • annotated TSV + • MultiQC summary TSV stubs + +⸻ + +Testing strategy (pytest-workflow friendly) + +Given runtime constraints, implement tests in layers: + +1) Unit-ish / fast integration tests (recommended for CI) + • Validate parameter failures: + • no contrast → fails + • 1 condition → fails + • <2 reps/condition → fails + • Validate manifest generation: + • manifests exist and contain expected columns + • Validate that enabling `--run_diffbind` triggers creation of DiffBind sample sheets (even if analysis is stubbed in test mode) + +2) Full integration tests (optional; may be heavy) + • Run a small dataset with `--run_diffbind` and confirm: + • result TSV exists for at least one group and one caller + • For ChIPBinner and SPAN-diff, consider “smoke tests” that run the module scripts with tiny synthetic inputs. + +To keep CI feasible, allow a `test` profile behavior where heavy differential steps are skipped but sample sheet/manifest generation is exercised. diff --git a/docs/extension_specs/EXTENSIONS_ADDED_20260104.md b/docs/extension_specs/EXTENSIONS_ADDED_20260104.md new file mode 100644 index 00000000..1849106b --- /dev/null +++ b/docs/extension_specs/EXTENSIONS_ADDED_20260104.md @@ -0,0 +1,111 @@ +# What we added to the fork (branch adding_new_peak_callers) + + - Samplesheet now supports condition (new header: group,condition,replicate,fastq_1,fastq_2,control). Legacy + sheets without condition still work (condition defaults to NA). This enables group × condition-aware + behavior throughout. + - New grouping metadata: every sample gets group_condition = "${group}_${condition}" and filenames/IDs avoid + collisions across conditions. + - Group/condition-aware normalisation controls + - --normalisation_scope {all,group,group_condition}: for --normalisation_mode Spikein, spike-in scale + factors can be computed globally (legacy) or separately per group / per group+condition. + - --dump_scale_factors true: writes spike-in scale-factor TSVs under + 03_peak_calling/00_normalisation_factors/. + - --igg_scale_scope {legacy,group_condition,sample}: for IgG/control scaling when using read-count + normalisation modes (e.g. BPM/CPM/RPKM). + - Condition-aware pooled IgG controls for callers that require it: + - Pooled BAMs per (control_group, condition) under 03_peak_calling/01_pooled_controls/. + - Fallbacks (if no exact condition-match control exists) are recorded in 03_peak_calling/07_qc_tables/ + control_pooling_fallbacks.tsv. + - New peak callers + variants (in --peakcaller, comma-separated, case-insensitive): + - MACS2_NARROW, MACS2_BROAD (no-control variants) + - GOPEAKS_NARROW, GOPEAKS_BROAD (emits *_gopeaks.json for MultiQC) + - EPIC2_200BP, EPIC2_150BP, EPIC2_25BP (requires pooled controls; needs --epic2_genome if not inferable) + - SPAN_DEFAULT, SPAN_STRINGENT (requires pooled controls; requires --omnipeaks_jar) + - Plus --peakcaller_preset {standard,extended} (extended = all new callers/variants) + - Consensus peaks can be condition-aware + - --consensus_grouping {group,group_condition} (defaults to group_condition when condition exists in the + samplesheet). + - Primary-caller semantics are enforced + - The first entry in --peakcaller (or the first in the preset list) is the “primary” used for downstream + consensus/QC; others are output-only. + - EPIC2_* and SPAN_* error early if --use_control false. + +# DONE (from docs/extension_specs/TODO.md) + + - #1 Add condition to samplesheet: Implemented (condition column supported; legacy sheets still work with + default condition=NA; metadata now includes group_condition). + - #2 Group-specific handling (Normalization + Peak-calling): Implemented (new scoping knobs like + --normalisation_scope and --igg_scale_scope, condition-aware control pooling for control-required callers, + and condition-aware consensus via --consensus_grouping). + - #3 Support additional peakcallers: Implemented (new caller variants incl. GoPeaks, epic2, SPAN/OMNIPEAKS, + MACS2 narrow/broad; plus --peakcaller_preset extended and primary-caller semantics). + +# Peak Caller Variants integrated into our peak-calling pipeline; + - seacr (SEACR; supports --use_control true/false) + - macs2 (legacy MACS2; supports --use_control true/false, with broad/narrow controlled by existing MACS2 + params) + - macs2_narrow (MACS2 narrow, no control) + - macs2_broad (MACS2 broad, no control) + - gopeaks_narrow (GoPeaks narrow; emits peaks + *_gopeaks.json for MultiQC) + - gopeaks_broad (GoPeaks broad; emits peaks + *_gopeaks.json for MultiQC) + - epic2_200bp, epic2_150bp, epic2_25bp (epic2 with different binning; requires pooled controls → + --use_control true) + - span_default, span_stringent (SPAN/OMNIPEAKS; requires pooled controls → --use_control true) + +# How to run your modified pipeline (modeled on your 20251120_CNR scripts) + + 1. Make a condition-aware samplesheet (this is what actually “turns on” the condition/group additions). + - Your current samplesheet.csv encodes condition inside group (e.g. H3K4me3_sgControl). Convert to + something like: + + group,condition,replicate,fastq_1,fastq_2,control + IgG,sgControl,1,fastq/.../C1_1_..._1.fq.gz,fastq/.../C1_1_..._2.fq.gz, + IgG,sgControl,2,fastq/.../C2_1_..._1.fq.gz,fastq/.../C2_1_..._2.fq.gz, + IgG,sgNSD3,1,fastq/.../S1_1_..._1.fq.gz,fastq/.../S1_1_..._2.fq.gz, + IgG,sgNSD3,2,fastq/.../S2_1_..._1.fq.gz,fastq/.../S2_1_..._2.fq.gz, + H3K4me3,sgControl,1,fastq/.../C1_2_..._1.fq.gz,fastq/.../C1_2_..._2.fq.gz,IgG + H3K4me3,sgControl,2,fastq/.../C2_2_..._1.fq.gz,fastq/.../C2_2_..._2.fq.gz,IgG + H3K4me3,sgNSD3,1,fastq/.../S1_2_..._1.fq.gz,fastq/.../S1_2_..._2.fq.gz,IgG + ... + + 2. Create a new params JSON (copy your existing nf-params.narrow.json / nf-params.spikein.json and add the + new knobs). Examples: + + - BPM run using condition-aware consensus + condition-aware IgG scaling + extended callers + - Set at least: + - "input": "samplesheet.with_condition.csv" + - "consensus_grouping": "group_condition" (or omit; defaults this way when condition exists) + - "igg_scale_scope": "group_condition" (new) + - Either "peakcaller_preset": "extended" or explicitly choose/order callers via "peakcaller": "..." + - If you include SPAN/epic2, add: + - "omnipeaks_jar": "/path/to/omnipeaks.jar" + - "epic2_genome": "mm39" (important for your GRCm39 resources unless you set genome in a way that + can be inferred) + - Spike-in run using group-scoped spike-in scaling + scale-factor TSVs + - Add: + - "normalisation_mode": "Spikein" + - "normalisation_scope": "group" (or "group_condition") + - "dump_scale_factors": true + + 3. Run your fork instead of nf-core/cutandrun + + - Replace: + - nextflow run nf-core/cutandrun -r 3.2.1 ... + - With either: + - nextflow run dhusmann/cutandrun -r adding_new_peak_callers -profile singularity -work-dir + -params-file -resume + - or run the local clone directly: + - nextflow run /home/users/dhusmann/.nextflow/assets/nf-core/cutandrun_dev/cutandrun -profile + singularity -work-dir -params-file -resume + + That will produce the new outputs under 03_peak_calling/ (pooled controls, per-caller peak folders, + per-(group,condition) consensus peaks, optional normalisation-factor TSVs, and control fallback QC). + +# REMAINS TODO + - 4 Differential peak calling subworkflow (DiffBind samplesheets + standalone ChIPBinner + standalone + SPAN): Not implemented yet (only design/docs scaffolding exists). + - 5 --compare_norm_methods mode: Not implemented. + - 6 --compare_peak_callers mode: Not implemented. + - 7 Differential peak/enrichment characterization (ChromHMM, cCRE, LOLA/GSEA sets from group×cond, etc.): + Not implemented. + diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md new file mode 100644 index 00000000..7c422fe2 --- /dev/null +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -0,0 +1,732 @@ +# SPEC_DIFFERENTIAL_PEAK_CALLING.md +⸻ + +Differential Peak Calling Specification + +Status: Draft (implementation-ready) +Version: 1.0 +Date: 2026-01-04 +Target: dhusmann/cutandrun (branch: differential_peak_calling) + +⸻ + +Table of Contents + 1. Overview + 2. Goals and Non-goals + 3. Design Principles + 4. Pipeline Architecture + 1. Integrated vs posthoc execution + 2. Design manifest + 3. Subworkflow layout + 4. Data model and contrast semantics + 5. Validation and skip/strict behavior + 5. Shared Components + 1. Peak set and region annotation + 2. Summary + MultiQC inputs + 3. Cross-method and cross-caller overlaps + 6. Method 1: DiffBind + 7. Method 2: ChIPBinner + 8. Method 3: SPAN / OmniPeaks differential + 9. Output Structure + 10. User-facing Parameters + 11. Implementation Plan + 12. Testing Strategy and Acceptance Criteria + 13. Resource Labels and Containers + +⸻ + +Overview + +This spec adds differential peak calling / differential enrichment as a first-class (opt-in) feature layer for the pipeline, using three complementary approaches: + +Method Best for Core idea Primary outputs +DiffBind Peak-centric differential binding for punctate/focal marks and TFs Counts reads in a peak consensus/union and tests with DESeq2/edgeR Differential region TSV + up/down BED + diagnostic plots +ChIPBinner Broad marks and global redistribution / genome-wide shifts Genome-wide fixed binning + clustering + bin-level differential testing Bin-level differential TSV + cluster BEDs + diagnostic plots (+ optional enrichment) +SPAN / OmniPeaks differential SPAN users who want SPAN-consistent comparison Use OmniPeaks differential mode if supported, otherwise explicit fallback Differential regions BED/TSV (+ optional pooling manifest) + +Key requirement: differential analysis must run for every peak caller variant specified in --peakcaller (primary + secondary), producing parallel DiffBind outputs per caller. ChIPBinner and SPAN-diff run per group (mark), not per caller (unless SPAN fallback explicitly uses SPAN-derived peaks). + +⸻ + +Goals and Non-goals + +Goals + 1. Add opt-in differential analysis to the pipeline: + • DiffBind (per caller × group) + • ChIPBinner (per group) + • SPAN/OmniPeaks differential (per group; native if possible, explicit fallback otherwise) + 2. Support two execution modes: + • Integrated mode (normal run): differential executes after peak calling outputs exist. + • Posthoc mode: run differential-only from an existing --outdir without recomputing upstream steps. + 3. Require an explicit contrast definition to ensure consistent log2FC direction: + • log2FC = log2(treated / control) across all methods. + 4. If spike-in normalization is present and enabled, incorporate external scale factors into differential analysis. + 5. Emit stable, structured outputs including: + • raw result tables (TSV) + • annotated result tables (TSV) + • BED-like significant sets (up/down) + • diagnostic plots (PCA/correlation + method-specific) + • MultiQC-consumable summary tables + +Non-goals (v1) + • Automatic modeling of batch covariates (batch-aware design matrices). + • “Compare normalization methods” / “compare peak callers” decision frameworks (beyond overlap metrics and summary tables). + • A full custom MultiQC plugin/module (v1 uses MultiQC custom-content tables and plot discovery). + +⸻ + +Design Principles + 1. Layered architecture: differential analysis is a separate layer that: + • does not mutate upstream peak calling outputs, + • can run integrated or posthoc, + • consumes published artifacts only in posthoc mode (no work/ assumptions). + 2. Per-caller parity: any caller listed in --peakcaller is treated as a first-class source for DiffBind. + 3. Deterministic + resumable: manifests and samplesheets must be stable-sorted to support -resume and reproducibility. + 4. Explicit contrast direction: do not guess treated vs control from labels. + 5. Fail/skip is user-controlled: default behavior is to skip invalid comparisons with clear warnings, but a strict mode is provided to fail fast. + 6. Performance-aware design: avoid repeated expensive operations across methods; keep per-group computation parallelizable. + +⸻ + +Pipeline Architecture + +Integrated vs posthoc execution + +Integrated mode + • Differential runs when any --run_diffbind, --run_chipbinner, or --run_span_diff is enabled. + • The workflow consumes in-memory channels produced by upstream alignment + peak calling. + +Posthoc mode + • Set --differential_from_run to run differential-only: + • A dedicated entry workflow DIFFERENTIAL_ONLY reads manifest(s) from , + • reconstructs the necessary channels (BAM/BAI, per-caller peaks, optional scale factors), + • runs the same DIFFERENTIAL_PEAK_CALLING subworkflow. + +Design manifest + +To make posthoc differential reliable, the main pipeline writes deterministic manifests that contain published paths under --outdir. + +Emit location (new): +03_peak_calling/06_differential/00_manifests/ + +Files (new): + 1. differential_manifest.samples.tsv +One row per target sample (not IgG), minimum columns: + + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (or NA) + • bigwig (optional; NA if absent) + + 2. differential_manifest.peaks.tsv +One row per (sample_id, caller): + + • sample_id + • caller + • peaks_path + • peaks_format (bed/narrowPeak/broadPeak/other) + • caller_role (primary|secondary; optional) + + 3. differential_manifest.run_meta.json +Captures: + + • pipeline version/hash + • reference genome id (if any) + • the caller list used + • the default contrast if provided (optional) + +Manifest rules + • Sorted by stable keys (group, condition, replicate, sample_id, caller) before writing. + • Contains only published paths under --outdir. + +Subworkflow layout + +Add a new subworkflow: + +subworkflows/local/differential_peak_calling.nf + +It contains three gated method blocks and shared utilities: + +DIFFERENTIAL_PEAK_CALLING +├── WRITE_DIFFERENTIAL_MANIFESTS (integrated mode only) +├── (optional) VALIDATE_DIFFERENTIAL_DESIGN +├── DIFFBIND (per group × caller) +│ ├── MAKE_DIFFBIND_SAMPLESHEETS +│ ├── RUN_DIFFBIND_R +│ ├── ANNOTATE_REGIONS (shared) +│ └── DIFFBIND_SUMMARY +├── CHIPBINNER (per group) +│ ├── MAKE_BINS (or LOAD_BINS) +│ ├── BIN_COUNTS +│ ├── CHIPBINNER_CLUSTER_GRID (HDBSCAN grid search) +│ ├── CHIPBINNER_SELECT_MODELS +│ ├── CHIPBINNER_DIFFERENTIAL (ROTS) +│ ├── ANNOTATE_REGIONS (shared) +│ ├── (optional) CHIPBINNER_ENRICHMENT (LOLA) +│ └── CHIPBINNER_SUMMARY +├── SPAN_DIFF (per group) +│ ├── OMNIPEAKS_CAPABILITY_PROBE +│ ├── SPAN_COMPARE (native) OR SPAN_FALLBACK_DIFF +│ ├── ANNOTATE_REGIONS (shared) +│ └── SPAN_SUMMARY +└── CROSS_COMPARISON (optional) + ├── OVERLAP_CALLERS (DiffBind across callers) + └── OVERLAP_METHODS (DiffBind vs SPAN vs ChIPBinner) + +Add a new DSL2 entry workflow: + • workflow DIFFERENTIAL_ONLY (new entrypoint) + • Reads manifests from --differential_from_run + • Runs DIFFERENTIAL_PEAK_CALLING only + +Data model and contrast semantics + +Existing metadata (already in fork per existing work): + • meta.group (mark/target) + • meta.condition + • meta.replicate (int) + • meta.sample_id / meta.id (unique) + • meta.caller (peak caller id) + +Contrast definition (required for all methods): + • --differential_contrast "TREATED,CONTROL" + • log2FC sign convention: log2(TREATED / CONTROL) + +Filtering rule (v1): + • For each group, differential compares only the two conditions in --differential_contrast. + • If additional conditions exist for that group: + • default: ignore them with a warning + • strict mode: error + +Validation and skip/strict behavior + +Validation is performed per group (and per caller where applicable): + +Checks: + • both contrast conditions exist for the group + • each condition has ≥ --differential_min_replicates + • required inputs exist on disk (BAM/BAI + peak files) + +Behavior: + • default: invalid comparisons are skipped with a clear warning and included in a skipped.tsv report + • --differential_strict true: any invalid comparison causes the workflow to fail + +⸻ + +Shared Components + +Peak set and region annotation + +All methods must produce a BED-like region set that can be annotated consistently. + +Shared annotation module: + • modules/local/annotate_regions.nf + • Uses bedtools closest against one of: + • --gene_bed if present, else + • a derived TSS BED from --gtf if present, else + • skip annotation with warning (still emit unannotated outputs) + +Annotation fields appended to result TSV (minimum): + • nearest_feature_id (or gene id/name if available) + • distance_to_feature + +Summary + MultiQC inputs + +Each method writes a small summary TSV (one row per group/caller comparison) with standard columns: + +Standard summary columns: + • method (diffbind|chipbinner|span) + • group + • caller (diffbind only; else NA) + • treated_condition + • control_condition + • n_tested + • n_fdr_pass + • n_up + • n_down + +Additional method-specific columns may be appended (e.g., chosen HDBSCAN parameters). + +MultiQC integration (v1): + • MultiQC custom content tables discover *.summary.tsv under differential output tree. + • No custom plugin required for v1. + +Cross-method and cross-caller overlaps + +To provide immediate utility and “best-of-both” value, emit overlap metrics: + 1. Across peak callers (DiffBind significant sets) + + • For each group: + • compute pairwise overlap counts and Jaccard between caller outputs + + 2. Across methods + + • For each group: + • overlap DiffBind significant BED (per caller) with: + • SPAN significant BED (if present) + • ChIPBinner cluster-derived differential BEDs (if present) + • Emit both summary TSV and optional plots (UpSet if feasible; v1 can be summary-only) + +Implementation: + • bedtools-based overlap (fast, deterministic) + • 03_peak_calling/06_differential/04_cross_comparison/ + +⸻ + +Method 1: DiffBind + +Purpose + +Run classic differential binding using called peaks as inputs. Runs separately for each group × caller. + +Inputs + +Per sample: + • final BAM + BAI + • called peaks for the selected caller variant + +Per comparison: + • explicit contrast (treated/control) + • optional spike-in scale factor per sample (if enabled) + +Samplesheet generation + +Write one DiffBind samplesheet CSV per (group, caller): + +Path: +03_peak_calling/06_differential/01_diffbind/00_samplesheets//.diffbind.csv + +Columns (minimum DiffBind-compatible): + • SampleID = sample id + • Factor = group + • Condition = condition + • Replicate = replicate + • bamReads = final bam path + • Peaks = peak path + +Recommended extra columns: + • Tissue (constant, e.g. CUTRUN) + • Caller / PeakCaller (for provenance) + • SpikeinScaleFactor (optional; used by runner if enabled) + +Execution + +Create a new module: + • modules/local/diffbind_run.nf + • bin/diffbind_run.R + +Runner responsibilities: + 1. Validate design for the specific group×caller subset. + 2. Construct DiffBind object from samplesheet. + 3. Count reads in peaks: + • default: no summit recentering (use full peak widths) + • optional: if --diffbind_summits > 0, use summit-centered widths + 4. Normalization: + • if external scaling enabled and scale factors exist: + • inject user-provided scaling into the downstream DE test (preferred) + • else: + • use DiffBind’s standard normalization/backend + 5. Contrast: + • enforce treated vs control and consistent sign convention + 6. Report: + • full TSV + • significant BED sets: + • up (log2FC > 0 & FDR pass) + • down (log2FC < 0 & FDR pass) + 7. Plots (minimum): + • PCA + • correlation heatmap + • MA + • volcano + • heatmap (binding affinity) + • (optional) Venn/upset across conditions (if supported) + +Spike-in scaling integration (v1 implementation contract) + +If --differential_use_spikein resolves to true: + • Runner must attempt to use external sample scaling so differential log2FC reflects spike-in normalization. + • Implementation must be explicit and testable: + • either by setting normalization factors in DiffBind/DESeq2/edgeR, or + • by extracting the count matrix and running DESeq2/edgeR with user-provided size factors. + +Outputs + +Per (caller, group): + +03_peak_calling/06_differential/01_diffbind/// + +Minimum artifacts: + • diffbind.results.tsv + • diffbind.results.annotated.tsv (if annotation available) + • diffbind.significant.bed + • diffbind.significant_up.bed + • diffbind.significant_down.bed + • plots/PCA.pdf + • plots/correlation_heatmap.pdf + • plots/MA.pdf + • plots/volcano.pdf + • plots/heatmap.pdf + • diffbind.summary.tsv + +⸻ + +Method 2: ChIPBinner + +Purpose + +Broad-mark differential enrichment with robustness to global changes: + • fixed genomic bins (default 10kb) + • normalization (external scaling when available) + • clustering (HDBSCAN grid) + • bin-level differential testing (ROTS) + • optional enrichment (LOLA) + +Inputs + +Per sample: + • final target BAM + BAI + • group / condition / replicate metadata + • optional external scale factors: + • spike-in scale factors (preferred when present) + • MS coefficients YAML (optional) + +Reference: + • chrom sizes (*.sizes) + • optional blacklist BED + +Workflow (per group) + 1. Bin definition + • default: generate bins from chrom sizes + • optional: load precomputed windows from --chipbinner_windows_dir + • optional: subtract blacklist regions + 2. Quantification + • compute bin-level counts matrix across samples + 3. Normalization + • apply external scaling factors if enabled + • add pseudocount (default 1) before log transforms for clustering/plots + 4. QC + • PCA plot + • correlation heatmap + 5. Clustering (grid search) + • run HDBSCAN over bins using a parameter grid: + • min_cluster_size values + • min_samples/minPts values + • emit: + • chipbinner.hdbscan_grid_summary.tsv ranking solutions by: + • number of clusters (excluding noise) + • fraction of bins assigned (non-noise) + • mean cluster stability/persistence (if available) + • replicate consistency proxy (optional metric) + 6. Model selection + • choose: + • best overall solution + • best solution yielding ~2 clusters (if any) + • best solution yielding ~3 clusters (if any) + 7. Differential testing + • run ROTS on bins (replicates separate) + • produce bin-level log2FC/FDR with consistent sign + 8. Cluster labeling + • label clusters by behavior based on per-cluster mean signals: + • stable (similar signal) + • control_enriched + • treated_enriched + • noise (if applicable) + 9. Annotation + • annotate bins and/or cluster BEDs to nearest features (shared module) + 10. (Optional) Enrichment + + • if a LOLA DB is supplied, run enrichment per cluster + +Outputs (per group) + +03_peak_calling/06_differential/02_chipbinner// + +Minimum artifacts: + • bins/..windows.bed + • counts/chipbinner.bin_counts.tsv + • counts/chipbinner.normalized_matrix.tsv + • clustering/chipbinner.hdbscan_grid_summary.tsv + • clustering/chipbinner.clusters.best.tsv + • differential/chipbinner.differential.tsv + • differential/chipbinner.differential.annotated.tsv (if annotation available) + • bed/chipbinner.significant_up.bed + • bed/chipbinner.significant_down.bed + • plots/PCA.pdf + • plots/correlation_heatmap.pdf + • plots/density_scatter.pdf (treated vs control means, colored by cluster) + • chipbinner.summary.tsv +Optional enrichment (if enabled): + • enrichment/_lola.tsv + • enrichment/_lola_plot.pdf + +⸻ + +Method 3: SPAN / OmniPeaks differential + +Purpose + +Provide a SPAN-consistent differential workflow: + • Prefer OmniPeaks native differential mode if supported. + • Otherwise, run an explicit fallback and label it clearly. + +Capability detection + +Add a probe step: + • OMNIPEAKS_CAPABILITY_PROBE + • Runs java -jar omnipeaks.jar --help (or equivalent) and parses supported subcommands. + • Emits omnipeaks_capabilities.json for audit. + +Native mode (if supported) + +If a native differential subcommand exists (e.g., compare): + • Run it per group using the two contrast conditions. + • Replicate handling: + • if tool supports multiple BAMs per condition: pass them + • else: pool BAMs per condition and write pooling manifest + +Fallback mode (explicit) + +If no native differential exists: + • Do not silently pretend it does. + • Implement “SPAN peaks + DESeq2/edgeR” fallback: + 1. Require SPAN peak outputs to exist (SPAN caller present in --peakcaller or peaks provided in manifests). + 2. Build union peak set across samples in the contrast. + 3. Count reads per union region from final BAMs. + 4. Run DESeq2/edgeR with external scaling if enabled. + 5. Emit outputs under span_fallback/ with clear naming. + +Outputs (per group) + +03_peak_calling/06_differential/03_span// + +Native: + • span.differential.tsv + • span.differential.bed + • span.differential.annotated.tsv (if annotation available) + • span.significant_up.bed + • span.significant_down.bed + • span.summary.tsv + • 00_manifests/span_pooling.tsv (only if pooling used) + +Fallback: + • same outputs but with span_fallback.* prefix and a span_fallback.readme.txt describing the method. + +⸻ + +Output Structure + +All outputs live under: + +03_peak_calling/06_differential/ + +03_peak_calling/06_differential/ +├── 00_manifests/ +│ ├── differential_manifest.samples.tsv +│ ├── differential_manifest.peaks.tsv +│ └── differential_manifest.run_meta.json +├── 01_diffbind/ +│ ├── 00_samplesheets//.diffbind.csv +│ ├── // +│ │ ├── diffbind.results.tsv +│ │ ├── diffbind.results.annotated.tsv +│ │ ├── diffbind.significant.bed +│ │ ├── diffbind.significant_up.bed +│ │ ├── diffbind.significant_down.bed +│ │ ├── diffbind.summary.tsv +│ │ └── plots/*.pdf +├── 02_chipbinner/ +│ └── / +│ ├── bins/*.bed +│ ├── counts/*.tsv +│ ├── clustering/*.tsv +│ ├── differential/*.tsv +│ ├── bed/*.bed +│ ├── plots/*.pdf +│ ├── enrichment/ (optional) +│ └── chipbinner.summary.tsv +├── 03_span/ +│ └── / +│ ├── span.differential.tsv +│ ├── span.differential.bed +│ ├── span.differential.annotated.tsv +│ ├── span.significant_up.bed +│ ├── span.significant_down.bed +│ ├── span.summary.tsv +│ └── 00_manifests/ (optional) +├── 04_cross_comparison/ (optional) +│ ├── overlap_callers.tsv +│ ├── overlap_methods.tsv +│ └── plots/ (optional) +└── multiqc/ + ├── differential.summary.all_methods.tsv + ├── differential.skipped.tsv + └── (optional) differential_multiqc_report.html + + +⸻ + +User-facing Parameters + +Top-level toggles + • --run_diffbind (bool; default: false) + • --run_chipbinner (bool; default: false) + • --run_span_diff (bool; default: false) + +Execution mode + • --differential_from_run (string; default: null) + • if set: run differential-only using manifests from that outdir + +Contrast and validation behavior + • --differential_contrast "TREATED,CONTROL" (string; required if any method enabled) + • --differential_min_replicates (int; default: 2) + • --differential_strict (bool; default: false) + • --differential_use_spikein (string; default: "auto") + • allowed: auto|true|false + • auto resolves to true iff pipeline normalization mode is spike-in and scale factors exist + +Shared outputs / QC + • --differential_annotate (bool; default: true) + • --differential_cross_compare (bool; default: true) + • --differential_run_multiqc (bool; default: true) + +DiffBind parameters + • --diffbind_fdr (float; default: 0.05) + • --diffbind_lfc (float; default: 1.0) + • --diffbind_backend (string; default: DESeq2) + • allowed: DESeq2|edgeR + • --diffbind_min_overlap (int; default: 2) + • --diffbind_summits (int; default: 0) + • 0 = do not recenter; >0 = summit-based fixed width + • --diffbind_extra_params (path; default: null) + • JSON/YAML override file for advanced DiffBind runner options + +ChIPBinner parameters + • --chipbinner_bin_size (int; default: 10000) + • --chipbinner_windows_dir (path; default: null) + • if set, load bins from this dir; else generate + • --chipbinner_blacklist (path; default: null) + • --chipbinner_use_input (bool; default: false) + • --chipbinner_pseudocount (int; default: 1) + • --chipbinner_fdr (float; default: 0.05) + • --chipbinner_lfc (float; default: 1.0) + • --chipbinner_bootstrap (int; default: 1000) + • --chipbinner_k_value (int; default: 100000) + • --chipbinner_hdbscan_grid_min_cluster_size (string; default: "100,200,500,1000") + • --chipbinner_hdbscan_grid_min_samples (string; default: "100,200,500,1000") + • --chipbinner_ms_coeffs (path; default: null) + • --chipbinner_lola_db (path; default: null) + • --chipbinner_run_lola (bool; default: false) + +SPAN / OmniPeaks parameters + • --omnipeaks_jar (path; required for native SPAN diff; optional for fallback) + • --span_diff_mode (string; default: auto) + • auto|native|fallback + • --span_diff_fdr (float; default: 0.05) + • --span_diff_gap (int; default: 5) + • --span_diff_bin (int; default: 200) + • --span_diff_java_heap (string; default: 8G) + +⸻ + +Implementation Plan + +New files + +Subworkflows: + • subworkflows/local/differential_peak_calling.nf + • workflows/differential_only.nf (or add new entrypoint in existing workflow file) + +Modules: + • modules/local/write_differential_manifests.nf + • modules/local/diffbind_run.nf + • modules/local/chipbinner_bins.nf + • modules/local/chipbinner_counts.nf + • modules/local/chipbinner_hdbscan_grid.nf + • modules/local/chipbinner_rots.nf + • modules/local/span_capability_probe.nf + • modules/local/span_compare.nf + • modules/local/span_fallback_diff.nf + • modules/local/annotate_regions.nf + • modules/local/differential_overlap.nf + +Scripts: + • bin/diffbind_run.R + • bin/chipbinner_rots.R + • bin/chipbinner_select_models.R (or embedded in rots runner) + • bin/span_fallback_deseq2.R (or shared DE runner) + • bin/hdbscan_grid.py (if clustering done in python) + • bin/annotate_regions.sh (optional wrapper around bedtools) + +Modified files + • workflows/cutandrun.nf (call DIFFERENTIAL_PEAK_CALLING in integrated mode) + • nextflow.config (params defaults) + • nextflow_schema.json (new params + help text) + • conf/base.config (resource labels for new processes) + • assets/multiqc_config.yml (custom content tables ordering + file cleanup) + • docs/usage.md (new params & examples) + • docs/output.md (new output tree section) + • tests/ (new minimal tests and/or profile behavior) + +⸻ + +Testing Strategy and Acceptance Criteria + +Minimal CI-friendly tests (recommended) + 1. Parameter validation tests: + + • missing --differential_contrast when methods enabled → fails + • only one condition present → skipped (default) or fails (strict) + • <2 replicates/condition → skipped (default) or fails (strict) + + 2. Manifest generation test: + + • after a small peak-calling test run, confirm: + • differential_manifest.samples.tsv exists with expected columns + • differential_manifest.peaks.tsv exists and includes all callers + + 3. DiffBind samplesheet generation test: + + • enabling --run_diffbind should create per-caller per-group samplesheets + +Optional heavier integration tests + • Run a small dataset with 2×2 design and confirm: + • DiffBind outputs exist for at least one caller and group + • ChIPBinner outputs exist for one group + • SPAN diff produces either native or fallback outputs (depending on tool availability) + +Acceptance criteria + • DiffBind runs per group × caller and respects log2FC direction. + • Posthoc differential-only mode runs without upstream recompute (using manifests). + • Spike-in scaling is applied when enabled and scale factors exist. + • Each method emits: result TSV, annotated TSV (if possible), significant BEDs, plots, summary TSV. + • MultiQC custom content tables can load the summaries. + +⸻ + +Resource Labels and Containers + +Resource labels (conf/base.config) + +Add/extend labels: + • process_single (default) + • process_medium (DiffBind, annotation) + • process_high (ChIPBinner binning/clustering, DE tests) + • process_long (SPAN compare, heavy clustering) + +Containers / environments + +DiffBind: + • R (>=4.2), Bioconductor DiffBind, DESeq2, edgeR, ggplot2, rtracklayer + +ChIPBinner: + • R (>=4.2), ROTS, (optional) LOLA, plus clustering dependencies + • If python used: python3 + hdbscan + numpy/pandas + +SPAN: + • Java runtime and omnipeaks.jar + +All modules must emit versions.yml for nf-core provenance. diff --git a/docs/output.md b/docs/output.md index ccf61f74..22d65384 100644 --- a/docs/output.md +++ b/docs/output.md @@ -30,7 +30,8 @@ - 6.7. [GoPeaks peak calling](#GoPeakspeakcalling) - 6.8. [epic2 peak calling](#epic2peakcalling) - 6.9. [SPAN/OmniPeaks peak calling](#SPANpeakcalling) - - 6.10. [Consensus Peaks](#ConsensusPeaks) + - 6.10. [Differential peak calling](#DifferentialPeakCalling) + - 6.11. [Consensus Peaks](#ConsensusPeaks) - 7. [Peak-based QC](#Peak-basedQC) - 7.1. [Peak Counts](#PeakCounts) - 7.2. [Peak Reproducibility](#PeakReproducibility) @@ -425,7 +426,52 @@ epic2 callers require pooled controls; see the pooled controls section above. SPAN/OmniPeaks callers require pooled controls and the `--omnipeaks_jar` parameter. -### 6.10. Consensus Peaks +### 6.10. Differential peak calling + +
+Output files + +- `03_peak_calling/06_differential/00_manifests/` + - `differential_manifest.samples.tsv`: per-sample metadata for posthoc differential runs. + - `differential_manifest.peaks.tsv`: per-sample peak paths for each caller. + - `differential_manifest.run_meta.json`: run metadata (callers, reference, default contrast). +- `03_peak_calling/06_differential/01_diffbind/00_samplesheets//` + - `.diffbind.csv`: DiffBind samplesheet for each group/caller. +- `03_peak_calling/06_differential/01_diffbind///` + - `diffbind.results.tsv`: differential result table. + - `diffbind.results.annotated.tsv`: annotated differential results. + - `diffbind.significant.bed`: significant regions. + - `diffbind.significant_up.bed`: regions enriched in treated. + - `diffbind.significant_down.bed`: regions enriched in control. + - `diffbind.summary.tsv`: per-group summary for MultiQC. + - `plots/*.pdf`: diagnostic plots (PCA/correlation). +- `03_peak_calling/06_differential/02_chipbinner//` + - `bins/*.bed`: genome-wide bins. + - `counts/*.tsv`: bin count matrices. + - `clustering/*.tsv`: clustering diagnostics. + - `differential/*.tsv`: differential results. + - `bed/*.bed`: significant differential bins. + - `plots/*.pdf`: diagnostic plots. + - `chipbinner.summary.tsv`: per-group summary for MultiQC. +- `03_peak_calling/06_differential/03_span//` + - `span.differential.tsv`: differential results. + - `span.differential.bed`: significant regions. + - `span.differential.annotated.tsv`: annotated differential results. + - `span.significant_up.bed`: regions enriched in treated. + - `span.significant_down.bed`: regions enriched in control. + - `span.summary.tsv`: per-group summary for MultiQC. +- `03_peak_calling/06_differential/04_cross_comparison/` + - `overlap_callers.tsv`: overlap between DiffBind results across callers. + - `overlap_methods.tsv`: overlap between DiffBind/ChIPBinner/SPAN results. +- `03_peak_calling/06_differential/multiqc/` + - `differential.summary.all_methods.tsv`: merged summary table. + - `differential.skipped.tsv`: skipped groups/callers with reasons. + +
+ +Differential peak calling is enabled with `--run_diffbind`, `--run_chipbinner`, and/or `--run_span_diff`. A contrast definition (`--differential_contrast "TREATED,CONTROL"`) is required to set the log2FC direction. The pipeline can also run in posthoc mode via `--differential_from_run` to reuse an existing run directory. + +### 6.11. Consensus Peaks
Output files diff --git a/docs/test_histor/differential_peak_calling.md b/docs/test_histor/differential_peak_calling.md new file mode 100644 index 00000000..c091e664 --- /dev/null +++ b/docs/test_histor/differential_peak_calling.md @@ -0,0 +1,15 @@ +# Differential peak calling test history (local notes) + +## 2026-01-10 — pytest failures resolved (current context) + +- **ROTS crash with <2 replicates** (`rowSums(is.na(...))` error in `test_differential_chipbinner`). + **Fix:** disable ROTS when a condition has <2 samples and fall back to simple tests (or p=1) in `bin/chipbinner_rots.R`. + +- **`DIFFERENTIAL_SUMMARY_MERGE` unbound variable** due to `$` in staged summary filename. + **Fix:** stage summaries with `stageAs: 'summaries/*'` and build `summary_files.list` via `find summaries -type f` in `modules/local/differential_summary.nf`. + +- **`ANNOTATE_REGIONS` missing python in container** (command not found). + **Fix:** disable container for `ANNOTATE_REGIONS` so conda provides python (`conf/modules.config`). + +- **`ANNOTATE_REGIONS` publishDir error** (`Unexpected path value` from publishDir closure). + **Fix:** replace publishDir closure with a static map + `enabled` closure in `modules/local/annotate_regions.nf`. diff --git a/docs/test_history/differential_peak_calling.md b/docs/test_history/differential_peak_calling.md new file mode 100644 index 00000000..1b8ac4d6 --- /dev/null +++ b/docs/test_history/differential_peak_calling.md @@ -0,0 +1,36 @@ +# Differential peak calling test history + +## 2026-01-09 — full pytest suite (Slurm) + +- **Failure:** `LexerNoViableAltException` / Groovy parsing errors during `span_compare` (seen in Nextflow stderr). + **Cause:** unescaped `$` tokens in AWK and regex anchors inside `modules/local/span_compare.nf`. + **Fix:** escape `$` in AWK fields and regex anchors (e.g., `\$2`, `\$i`, `/pattern\\$/`). + +- **Failure:** Nextflow could not start because `java` was not found in the pytest job environment. + **Fix:** set `JAVA_HOME`, `NXF_JAVA_HOME`, and `JAVA_CMD` to `/share/software/user/open/java/17.0.4` in the pytest script. + +- **Failure:** pytest workflow missing `PROFILE` environment variable, causing Nextflow config/profile resolution to fail in tests. + **Fix:** export `PROFILE=singularity` in the pytest script. + +- **Failure:** Nextflow picked up `~/.nextflow/config` with `process.executor=slurm`, leading to nested Slurm execution during pytest runs. + **Fix:** set `NXF_HOME` to a scratch directory and `NXF_EXECUTOR=local` in the pytest script to isolate from user config and force local executor. + +- **Failure:** `CHIPBINNER_COUNTS` input file name collision (same BAM/BAI filenames for target and control inputs) in `test_differential_chipbinner`. + **Fix:** stage control BAM/BAI into a dedicated subdirectory via `stageAs: { "controls/${it.getName()}" }` in `modules/local/chipbinner_counts.nf`. + +## 2026-01-10 — differential tag + full pytest suite (Slurm) + +- **Failure:** `ROTS` error (`rowSums(is.na(...))` expects matrix) when a condition has <2 replicates in `test_differential_chipbinner` (job `13175581`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_210345`). + **Fix:** disable ROTS when <2 samples per condition and fall back to simple tests (or p=1 if <2) in `bin/chipbinner_rots.R`. + +- **Failure:** `DIFFERENTIAL_SUMMARY_MERGE` unbound variable due to `$` in staged summary filename (job `13175959`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_212945`). + **Fix:** stage summaries with `stageAs: 'summaries/*'` and build `summary_files.list` via `find summaries -type f` in `modules/local/differential_summary.nf`. + +- **Failure:** `ANNOTATE_REGIONS` container missing `python` (job `13177002`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_222413`). + **Fix:** disable container for `ANNOTATE_REGIONS` so conda provides python (`conf/modules.config`). + +- **Failure:** `ANNOTATE_REGIONS` publishDir error (`Unexpected path value` from publishDir closure) (job `13178611`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_225233`). + **Fix:** replace publishDir closure with a static map + `enabled` closure in `modules/local/annotate_regions.nf`. + +- **Pass:** `test_differential_chipbinner` tag rerun succeeded (job `13179209`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_231631`). +- **Pass:** full pytest suite succeeded (job `13179771`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_234549`). diff --git a/docs/usage.md b/docs/usage.md index 56780e94..d606d195 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -167,6 +167,58 @@ If control samples are provided in the sample sheet, they will be used to normal After peak calling, consensus peaks are calculated by merging peaks within the same grouping key. Use `--consensus_grouping` to choose `group` or `group_condition`. By default, if the samplesheet includes a `condition` column, grouping uses `group_condition`; otherwise it falls back to `group`. The number of replicates required for a valid peak can be changed using `replicate_threshold`. To call consensus peaks across all samples, set `--consensus_peak_mode all`. +### Differential Peak Calling + +Differential peak calling is an optional analysis layer that can be enabled after peak calling. It supports three complementary methods: + +- **DiffBind** (per caller × group) +- **ChIPBinner** (per group) +- **SPAN/OmniPeaks differential** (per group) + +To enable differential analysis, set one or more `--run_*` toggles and provide an explicit contrast: + +``` +--run_diffbind \ +--differential_contrast "Treated,Control" +``` + +Key parameters: + +- `--run_diffbind`, `--run_chipbinner`, `--run_span_diff` +- `--differential_contrast "TREATED,CONTROL"` (required when any method is enabled) +- `--differential_min_replicates` (default: 2) +- `--differential_strict` (fail on invalid comparisons instead of skipping) +- `--differential_use_spikein` (auto|true|false) +- `--differential_annotate` (annotate differential regions) +- `--differential_cross_compare` (cross-method/caller overlap tables) +- `--differential_run_multiqc` (generate summary tables for MultiQC) + +DiffBind options: + +- `--diffbind_fdr`, `--diffbind_lfc`, `--diffbind_backend`, `--diffbind_min_overlap`, `--diffbind_summits` +- `--diffbind_extra_params` (JSON/YAML overrides for advanced DiffBind settings) + +ChIPBinner options: + +- `--chipbinner_bin_size`, `--chipbinner_windows_dir`, `--chipbinner_blacklist` +- `--chipbinner_pseudocount`, `--chipbinner_fdr`, `--chipbinner_lfc` +- `--chipbinner_hdbscan_grid_min_cluster_size`, `--chipbinner_hdbscan_grid_min_samples` +- `--chipbinner_bootstrap`, `--chipbinner_k_value` + +SPAN differential options: + +- `--omnipeaks_jar` (required for native SPAN differential; auto mode falls back if missing) +- `--span_diff_mode` (auto|native|fallback), `--span_diff_fdr`, `--span_diff_gap`, `--span_diff_bin` +- `--span_diff_java_heap` + +Posthoc differential-only mode: + +``` +nextflow run . -entry DIFFERENTIAL_ONLY \ + --differential_from_run \ + --differential_contrast "Treated,Control" +``` + ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/main.nf b/main.nf index f3b5b5eb..2adea865 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ WorkflowMain.initialise(workflow, params, log, args) */ include { CUTANDRUN } from './workflows/cutandrun' +include { DIFFERENTIAL_ONLY } from './workflows/differential_only' workflow NFCORE_CUTANDRUN { /* diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf new file mode 100644 index 00000000..ca944ea7 --- /dev/null +++ b/modules/local/annotate_regions.nf @@ -0,0 +1,46 @@ +process ANNOTATE_REGIONS { + tag "${results_tsv.simpleName}" + label 'process_medium' + + publishDir = [ + path: { meta?.publish_dir ?: task.ext.publish_dir }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: { meta?.publish_dir ?: task.ext.publish_dir ? true : false } + ] + + conda "bioconda::bedtools=2.31.1 bioconda::bedops=2.4.41 conda-forge::python=3.11 conda-forge::perl=5.26.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(results_tsv) + val gene_bed + val gtf + + output: + path "*.annotated.tsv", emit: annotated + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: results_tsv.baseName + def gene_arg = gene_bed ? "--features ${gene_bed}" : '' + def gtf_arg = (!gene_bed && gtf) ? "--gtf ${gtf}" : '' + """ + python ${projectDir}/bin/annotate_regions.py \ + --input ${results_tsv} \ + --output ${prefix}.annotated.tsv \ + ${gene_arg} \ + ${gtf_arg} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_bins.nf b/modules/local/chipbinner_bins.nf new file mode 100644 index 00000000..d92a0f4d --- /dev/null +++ b/modules/local/chipbinner_bins.nf @@ -0,0 +1,57 @@ +process CHIPBINNER_BINS { + tag "${group}.${bin_size}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/bins" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + path chrom_sizes + val bin_size + val group + val windows_dir + val blacklist + + output: + tuple val(group), path("*.windows.bed"), emit: bins + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${group}.${bin_size}" + def windows_arg = windows_dir ? "${windows_dir}" : '' + def blacklist_arg = blacklist ? "${blacklist}" : '' + """ + set -euo pipefail + if [ -n "${windows_arg}" ]; then + windows_file=\$(ls ${windows_arg}/*${bin_size}*.bed 2>/dev/null | head -n 1 || true) + if [ -z "\$windows_file" ]; then + echo "No window file found in ${windows_arg} for bin size ${bin_size}" >&2 + exit 1 + fi + cp "\$windows_file" ${prefix}.windows.bed + else + bedtools makewindows -g ${chrom_sizes} -w ${bin_size} > ${prefix}.windows.bed + fi + + if [ -n "${blacklist_arg}" ]; then + bedtools subtract -a ${prefix}.windows.bed -b ${blacklist_arg} > ${prefix}.windows.filtered.bed + mv ${prefix}.windows.filtered.bed ${prefix}.windows.bed + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf new file mode 100644 index 00000000..1e32496e --- /dev/null +++ b/modules/local/chipbinner_counts.nf @@ -0,0 +1,256 @@ +process CHIPBINNER_COUNTS { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/counts" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11 conda-forge::pyyaml" + container 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' + + input: + path bins + val samples_json + path bams + path bais + path control_bams + path control_bais + val control_conditions_json + val use_input + val group + val use_spikein + val pseudocount + val ms_coeffs + + output: + path "chipbinner.bin_counts.tsv" , emit: counts + path "chipbinner.normalized_matrix.tsv", emit: normalized + path "chipbinner.normalization.json" , emit: norm_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + printf "%s\\n" ${bams} > bam_paths.txt + printf "%s\\n" ${bais} > bai_paths.txt + printf "%s\\n" ${control_bams} > control_bam_paths.txt + printf "%s\\n" ${control_bais} > control_bai_paths.txt + + python3 - <<'PY' + import json + import subprocess + import sys + from pathlib import Path + + samples = json.loads(r'''${samples_json}''') + with open('bam_paths.txt') as handle: + bam_paths = [line.strip() for line in handle if line.strip()] + with open('bai_paths.txt') as handle: + bai_paths = [line.strip() for line in handle if line.strip()] + sample_ids = [s['sample_id'] for s in samples] + scale_factors = [s.get('spikein_scale_factor') for s in samples] + ms_coeffs_path = r'''${ms_coeffs}''' + control_conditions = json.loads(r'''${control_conditions_json}''') + use_input = str('${use_input}').lower() in ("true", "1", "yes") + use_spikein = str('${use_spikein}').lower() in ("true", "1", "yes") + + if len(bam_paths) != len(sample_ids): + sys.stderr.write(f"Expected {len(sample_ids)} BAMs but found {len(bam_paths)}\\n") + sys.exit(1) + + def ensure_bai_for_bams(bams, bais, label): + missing = [] + for bam in bams: + bam_path = Path(bam) + candidates = [ + Path(f"{bam}.bai"), + bam_path.with_suffix(".bai"), + ] + if any(candidate.exists() for candidate in candidates): + continue + missing.append(str(bam_path)) + if missing: + sys.stderr.write(f"Missing BAM index for {label}: {', '.join(missing)}\\n") + sys.stderr.write(f"Available BAI files: {', '.join(bais) if bais else 'none'}\\n") + sys.exit(1) + + ensure_bai_for_bams(bam_paths, bai_paths, "chipbinner samples") + + cmd = ["bedtools", "multicov", "-bams"] + bam_paths + ["-bed", "${bins}"] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + sys.stderr.write(result.stderr) + sys.exit(result.returncode) + + lines = result.stdout.strip().splitlines() + with open('chipbinner.bin_counts.tsv', 'w') as handle: + handle.write("\\t".join(["chr", "start", "end"] + sample_ids) + "\\n") + for line in lines: + if not line.strip(): + continue + handle.write(line + "\\n") + + def parse_ms_coeffs_file(path): + try: + import yaml + except Exception as exc: + raise ValueError(f"PyYAML is required to parse MS coefficients: {exc}") + with open(path) as handle: + data = yaml.safe_load(handle) + if data is None: + return {} + + def parse_entries(entries): + coeffs = {} + if not isinstance(entries, list): + raise ValueError("MS coefficients 'samples' must be a list") + for entry in entries: + if not isinstance(entry, dict): + raise ValueError("MS coefficients entry must be a mapping") + sample_id = entry.get("sample_id") or entry.get("sample") + if not sample_id: + raise ValueError("MS coefficients entry missing sample_id") + value = entry.get("ms_coeff") or entry.get("coefficient") or entry.get("scale_factor") or entry.get("scale") + if value is None or value == "": + raise ValueError(f"MS coefficients entry missing coefficient for {sample_id}") + coeffs[str(sample_id)] = float(value) + return coeffs + + if isinstance(data, list): + return parse_entries(data) + if isinstance(data, dict): + if "samples" in data: + samples = data["samples"] + if isinstance(samples, list): + return parse_entries(samples) + if isinstance(samples, dict): + return {str(key): float(value) for key, value in samples.items()} + raise ValueError("MS coefficients 'samples' must be a list or mapping") + return {str(key): float(value) for key, value in data.items()} + raise ValueError("Unsupported MS coefficients YAML structure") + + ms_coeffs = {} + if ms_coeffs_path: + try: + ms_coeffs = parse_ms_coeffs_file(ms_coeffs_path) + except Exception as exc: + sys.stderr.write(f"Failed to parse MS coefficients YAML: {exc}\\n") + sys.exit(1) + if not ms_coeffs: + sys.stderr.write("MS coefficients YAML contained no entries\\n") + sys.exit(1) + missing = [sid for sid in sample_ids if sid not in ms_coeffs] + if missing: + sys.stderr.write(f"MS coefficients missing entries for samples: {', '.join(missing)}\\n") + sys.exit(1) + + def counts_from_lines(lines): + counts = [] + for line in lines: + if not line.strip(): + continue + parts = line.split("\\t") + counts.append([float(x) for x in parts[3:]]) + return counts + + counts = counts_from_lines(lines) + if not counts: + counts = [] + + scaling_source = "library_size" + scale_values = None + if ms_coeffs: + scaling_source = "ms" + scale_values = [float(ms_coeffs[sid]) for sid in sample_ids] + elif use_spikein and all(sf not in (None, "NA", "") for sf in scale_factors): + scaling_source = "spikein" + scale_values = [float(sf) for sf in scale_factors] + else: + if counts: + lib_sizes = [0.0 for _ in sample_ids] + for row in counts: + for idx, val in enumerate(row): + lib_sizes[idx] += val + lib_sorted = sorted(lib_sizes) + median_lib = lib_sorted[len(lib_sorted) // 2] if lib_sorted else 1.0 + scale_values = [float(median_lib / ls) if ls > 0 else 1.0 for ls in lib_sizes] + else: + scale_values = [1.0 for _ in sample_ids] + + scaled_counts = [] + for row in counts: + scaled_counts.append([val * scale_values[idx] for idx, val in enumerate(row)]) + + if use_input and control_conditions: + with open('control_bam_paths.txt') as handle: + control_bam_paths = [line.strip() for line in handle if line.strip()] + with open('control_bai_paths.txt') as handle: + control_bai_paths = [line.strip() for line in handle if line.strip()] + ensure_bai_for_bams(control_bam_paths, control_bai_paths, "chipbinner controls") + if len(control_bam_paths) != len(control_conditions): + sys.stderr.write("Control BAMs and conditions mismatch\\n") + sys.exit(1) + control_cmd = ["bedtools", "multicov", "-bams"] + control_bam_paths + ["-bed", "${bins}"] + control_res = subprocess.run(control_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if control_res.returncode != 0: + sys.stderr.write(control_res.stderr) + sys.exit(control_res.returncode) + control_lines = control_res.stdout.strip().splitlines() + control_counts = counts_from_lines(control_lines) + if control_counts and len(control_counts) != len(scaled_counts): + sys.stderr.write("Control counts row mismatch with target counts\\n") + sys.exit(1) + + cond_to_idx = {cond: idx for idx, cond in enumerate(control_conditions)} + for idx, sample in enumerate(samples): + cond = sample.get('condition') + if cond not in cond_to_idx: + sys.stderr.write(f"Missing control counts for condition {cond}\\n") + sys.exit(1) + ctrl_idx = cond_to_idx[cond] + for row_idx, row in enumerate(scaled_counts): + row[idx] = row[idx] - (control_counts[row_idx][ctrl_idx] * scale_values[idx]) + for row in scaled_counts: + for idx, val in enumerate(row): + row[idx] = val if val > 0 else 0.0 + + pseudocount = float('${pseudocount}') + for row in scaled_counts: + for idx, val in enumerate(row): + row[idx] = val + pseudocount + + with open('chipbinner.normalized_matrix.tsv', 'w') as handle: + handle.write("\\t".join(["chr", "start", "end"] + sample_ids) + "\\n") + for idx, line in enumerate(lines): + if not line.strip(): + continue + parts = line.split("\\t") + coords = parts[:3] + row = scaled_counts[idx] if scaled_counts else [] + handle.write("\\t".join(coords + [f"{c:.6f}" for c in row]) + "\\n") + + norm_info = { + "use_input": bool(use_input and control_conditions), + "use_ms_scaling": scaling_source == "ms", + "use_spikein_scaling": scaling_source == "spikein", + "use_library_size": scaling_source == "library_size", + "scaling_source": scaling_source, + "control_conditions": control_conditions if use_input and control_conditions else [], + } + with open('chipbinner.normalization.json', 'w') as handle: + json.dump(norm_info, handle, indent=2) + handle.write("\\n") + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + python: \$(python3 --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf new file mode 100644 index 00000000..2b9abd89 --- /dev/null +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -0,0 +1,60 @@ +process CHIPBINNER_HDBSCAN_GRID { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/clustering" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::pandas conda-forge::scikit-learn conda-forge::hdbscan" + container 'biocontainers/hdbscan:0.8.33--pyhdfd78af_0' + + input: + path matrix + val samples_json + val group + val min_cluster_size + val min_samples + + output: + path "chipbinner.hdbscan_grid_summary.tsv", emit: grid_summary + path "chipbinner.clusters.best.tsv" , emit: clusters + path "chipbinner.clusters.2clusters.tsv" , emit: clusters_2 + path "chipbinner.clusters.3clusters.tsv" , emit: clusters_3 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import json + import csv + + samples = json.loads(r'''${samples_json}''') + with open('chipbinner.samplesheet.csv', 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(['sample_id', 'condition']) + for row in samples: + writer.writerow([row['sample_id'], row['condition']]) + PY + + python ${projectDir}/bin/hdbscan_grid.py \ + --matrix ${matrix} \ + --samplesheet chipbinner.samplesheet.csv \ + --min_cluster_size ${min_cluster_size} \ + --min_samples ${min_samples} \ + --summary chipbinner.hdbscan_grid_summary.tsv \ + --clusters_best chipbinner.clusters.best.tsv \ + --clusters_2 chipbinner.clusters.2clusters.tsv \ + --clusters_3 chipbinner.clusters.3clusters.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/chipbinner_lola.nf b/modules/local/chipbinner_lola.nf new file mode 100644 index 00000000..2b761470 --- /dev/null +++ b/modules/local/chipbinner_lola.nf @@ -0,0 +1,44 @@ +process CHIPBINNER_LOLA { + tag "${group}.${model}.${label}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/enrichment${model && model != 'best' ? '/' + model : ''}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-lola=1.28.0 conda-forge::r-ggplot2 conda-forge::r-optparse" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-lola:1.28.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-lola:1.28.0--r42hdfd78af_0' }" + + input: + tuple val(group), val(label), val(model), path(bed) + path universe + path lola_db + + output: + tuple val(group), val(label), val(model), path("${label}_lola.tsv"), emit: tsv + tuple val(group), val(label), val(model), path("${label}_lola_plot.pdf"), optional: true, emit: plot + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript ${projectDir}/bin/chipbinner_lola.R \ + --bed ${bed} \ + --universe ${universe} \ + --db ${lola_db} \ + --label ${label} \ + --out_tsv ${label}_lola.tsv \ + --out_pdf ${label}_lola_plot.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf new file mode 100644 index 00000000..9afaf89d --- /dev/null +++ b/modules/local/chipbinner_rots.nf @@ -0,0 +1,96 @@ +process CHIPBINNER_ROTS { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename == 'versions.yml') { return null } + if (filename == 'plots') { return 'plots' } + if (filename.startsWith('plots')) { return "plots/${filename}" } + if (filename.endsWith('.bed')) { return "bed/${filename}" } + if (filename.contains('differential')) { return "differential/${filename}" } + return filename + } + ] + + conda "bioconda::bioconductor-rots conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-rots:1.19.0--r42hdfd78af_0' : + 'biocontainers/r-rots:1.19.0--r42hdfd78af_0' }" + + input: + path matrix + path clusters + path clusters_2 + path clusters_3 + path grid_summary + path norm_info + val samples_json + val treated + val control + val group + val fdr + val lfc + val bootstrap + val k_value + val lola_run + + output: + tuple val(group), path("chipbinner.differential.tsv") , emit: results + tuple val(group), path("chipbinner.significant.bed") , emit: significant + tuple val(group), path("chipbinner.significant_up.bed") , emit: up + tuple val(group), path("chipbinner.significant_down.bed") , emit: down + tuple val(group), path("chipbinner.control_enriched.bed") , emit: control_enriched + tuple val(group), path("chipbinner.treated_enriched.bed") , emit: treated_enriched + tuple val(group), path("chipbinner.stable.bed") , emit: stable + tuple val(group), path("chipbinner.noise.bed") , emit: noise + path "chipbinner.*clusters*.bed" , optional: true, emit: cluster_beds_extra + tuple val(group), path("chipbinner.cluster_beds.tsv") , emit: cluster_beds + tuple val(group), path("chipbinner.summary.tsv") , emit: summary + tuple val(group), path("plots") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + mkdir -p plots + python3 - <<'PY' + import json + import csv + samples = json.loads(r'''${samples_json}''') + with open('chipbinner.samplesheet.csv', 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(['sample_id', 'condition']) + for row in samples: + writer.writerow([row['sample_id'], row['condition']]) + PY + + Rscript ${projectDir}/bin/chipbinner_rots.R \ + --matrix ${matrix} \ + --clusters ${clusters} \ + --clusters_2 ${clusters_2} \ + --clusters_3 ${clusters_3} \ + --grid_summary ${grid_summary} \ + --norm_info ${norm_info} \ + --samplesheet chipbinner.samplesheet.csv \ + --treated ${treated} \ + --control ${control} \ + --group ${group} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --bootstrap ${bootstrap} \ + --k_value ${k_value} \ + --lola_run ${lola_run} + + cat chipbinner.significant_up.bed chipbinner.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > chipbinner.significant.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf new file mode 100644 index 00000000..2a7d27f3 --- /dev/null +++ b/modules/local/diffbind_run.nf @@ -0,0 +1,68 @@ +process DIFFBIND_RUN { + tag "${group}.${caller}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::r-yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-diffbind:3.14.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-diffbind:3.14.0--r42hdfd78af_0' }" + + input: + path samplesheet + val treated + val control + val group + val caller + val use_spikein + val fdr + val lfc + val min_overlap + val summits + val backend + val extra_params + val output_prefix + + output: + tuple val(group), val(caller), path("${output_prefix}.results.tsv") , emit: results + tuple val(group), val(caller), path("${output_prefix}.significant.bed") , emit: significant + tuple val(group), val(caller), path("${output_prefix}.significant_up.bed") , emit: significant_up + tuple val(group), val(caller), path("${output_prefix}.significant_down.bed") , emit: significant_down + tuple val(group), val(caller), path("${output_prefix}.summary.tsv") , emit: summary + tuple val(group), val(caller), path("plots") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def extra_arg = extra_params ? "--extra_params ${extra_params}" : '' + """ + mkdir -p plots + Rscript ${projectDir}/bin/diffbind_run.R \ + --samplesheet ${samplesheet} \ + --treated ${treated} \ + --control ${control} \ + --group ${group} \ + --caller ${caller} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --min_overlap ${min_overlap} \ + --summits ${summits} \ + --backend ${backend} \ + --use_spikein ${use_spikein} \ + --prefix ${output_prefix} \ + ${extra_arg} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diffbind: \$(Rscript -e 'packageVersion("DiffBind")' 2>/dev/null | tr -d '[]') + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/diffbind_samplesheet.nf b/modules/local/diffbind_samplesheet.nf new file mode 100644 index 00000000..21a790f3 --- /dev/null +++ b/modules/local/diffbind_samplesheet.nf @@ -0,0 +1,61 @@ +process MAKE_DIFFBIND_SAMPLESHEET { + tag "${group}.${caller}" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + val samples_json + val group + val caller + + output: + tuple val(group), val(caller), path("*.diffbind.csv"), emit: samplesheet + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${group}.diffbind" + """\ + python - <<'PY' + import csv + import json + + samples = json.loads(r'''${samples_json}''') + out = f"${prefix}.csv" + + headers = [ + 'SampleID','Tissue','Factor','Condition','Replicate','bamReads','Peaks','PeakCaller','SpikeinScaleFactor' + ] + with open(out, 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(headers) + for sample in samples: + writer.writerow([ + sample['sample_id'], + 'CUTRUN', + sample['group'], + sample['condition'], + sample['replicate'], + sample['bam'], + sample['peaks'], + sample['caller'], + sample.get('spikein_scale_factor', 'NA'), + ]) + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/differential_overlap.nf b/modules/local/differential_overlap.nf new file mode 100644 index 00000000..5ef01d11 --- /dev/null +++ b/modules/local/differential_overlap.nf @@ -0,0 +1,107 @@ +process DIFFERENTIAL_OVERLAP { + tag "overlap" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/04_cross_comparison" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + path callers_tsv + path methods_tsv + + output: + path "overlap_callers.tsv" , emit: callers + path "overlap_methods.tsv" , emit: methods + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + set -euo pipefail + + declare -A bp_cache + + calc_bp() { + local bed="\$1" + if [[ ! -s "\$bed" ]]; then + echo 0 + return + fi + bedtools sort -i "\$bed" | bedtools merge -i - | awk '{sum += (\$3 - \$2)} END {print sum + 0}' + } + + get_bp() { + local bed="\$1" + if [[ -n "\${bp_cache[\$bed]+x}" ]]; then + echo "\${bp_cache[\$bed]}" + return + fi + local value + value=\$(calc_bp "\$bed") + bp_cache["\$bed"]="\$value" + echo "\$value" + } + + printf "group\tcaller_a\tcaller_b\tintersection\tunion\tjaccard\n" > overlap_callers.tsv + if [[ -s "${callers_tsv}" ]]; then + awk -F'\\t' 'BEGIN{OFS="\\t"} { g=\$1; c=\$2; p=\$3; n[g]++; callers[g,n[g]]=c; paths[g,n[g]]=p } + END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { print g, callers[g,i], paths[g,i], callers[g,j], paths[g,j] } } } }' "${callers_tsv}" \ + | while IFS=\$'\\t' read -r group caller_a path_a caller_b path_b; do + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$1"\\t"\$2"\\t"\$3}') + if [[ -z "\$jaccard_line" ]]; then + inter=0 + union=0 + jacc=0 + else + IFS=\$'\\t' read -r inter union jacc <<< "\$jaccard_line" + fi + printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "\$group" "\$caller_a" "\$caller_b" "\$inter" "\$union" "\$jacc" >> overlap_callers.tsv + done + fi + + printf "group\tcaller\tmethod_a\tmethod_b\tn_a\tn_b\tn_overlap\tjaccard\tspan_mode_used\n" > overlap_methods.tsv + if [[ -s "${methods_tsv}" ]]; then + awk -F'\\t' 'BEGIN{OFS="\\t"} { g=\$1; m=\$2; c=\$3; p=\$4; s=\$5; n[g]++; methods[g,n[g]]=m; callers[g,n[g]]=c; paths[g,n[g]]=p; spans[g,n[g]]=s } + END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { if (methods[g,i] == methods[g,j]) { next } print g, methods[g,i], callers[g,i], paths[g,i], spans[g,i], methods[g,j], callers[g,j], paths[g,j], spans[g,j] } } } }' "${methods_tsv}" \ + | while IFS=\$'\\t' read -r group method_a caller_a path_a span_a method_b caller_b path_b span_b; do + n_a=\$(get_bp "\$path_a") + n_b=\$(get_bp "\$path_b") + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$1"\\t"\$3}') + if [[ -z "\$jaccard_line" ]]; then + n_overlap=0 + jacc=0 + else + IFS=\$'\\t' read -r n_overlap jacc <<< "\$jaccard_line" + fi + caller="NA" + if [[ "\$method_a" == "diffbind" ]]; then + caller="\$caller_a" + elif [[ "\$method_b" == "diffbind" ]]; then + caller="\$caller_b" + fi + span_mode="NA" + if [[ "\$method_a" == "span" ]]; then + span_mode="\$span_a" + elif [[ "\$method_b" == "span" ]]; then + span_mode="\$span_b" + fi + printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "\$group" "\$caller" "\$method_a" "\$method_b" "\$n_a" "\$n_b" "\$n_overlap" "\$jacc" "\$span_mode" >> overlap_methods.tsv + done + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf new file mode 100644 index 00000000..35b84ffb --- /dev/null +++ b/modules/local/differential_summary.nf @@ -0,0 +1,104 @@ +process DIFFERENTIAL_SUMMARY_MERGE { + tag "differential_summary" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + path summary_files, stageAs: 'summaries/*' + val skipped_json + path summary_header + path skipped_header + + output: + path "differential.summary.all_methods.tsv", emit: summary + path "differential.skipped.tsv" , emit: skipped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + mkdir -p summaries + : > summary_files.list + find summaries -type f -name "*.tsv" -print | sort > summary_files.list + + python - <<'PY' + import csv + import json + import sys + from pathlib import Path + + summary_files = [] + with open('summary_files.list') as handle: + for line in handle: + line = line.strip() + if line: + summary_files.append(Path(line)) + summary_files = sorted(summary_files) + + with open('${summary_header}', 'r') as header_handle: + header_lines = header_handle.read().rstrip('\\n') + + rows = [] + for path in summary_files: + with path.open() as handle: + reader = csv.DictReader(handle, delimiter='\\t') + for row in reader: + rows.append(row) + + base_fields = [ + 'method','group','caller','treated_condition','control_condition','n_tested','n_fdr_pass','n_up','n_down','use_spikein','span_mode_used' + ] + if rows: + fieldnames = list(base_fields) + for row in rows: + for key in row.keys(): + if key not in fieldnames: + fieldnames.append(key) + else: + fieldnames = list(base_fields) + + with open('differential.summary.all_methods.tsv', 'w', newline='') as out_handle: + if header_lines: + out_handle.write(header_lines + '\\n') + writer = csv.DictWriter(out_handle, delimiter='\\t', fieldnames=fieldnames, lineterminator='\\n') + writer.writeheader() + for row in rows: + writer.writerow(row) + + skipped = json.loads(r'''${skipped_json}''') if '${skipped_json}' else [] + + with open('${skipped_header}', 'r') as header_handle: + skipped_header = header_handle.read().rstrip('\\n') + + with open('differential.skipped.tsv', 'w', newline='') as out_handle: + if skipped_header: + out_handle.write(skipped_header + '\\n') + fieldnames = ['method','group','caller','reason','details'] + writer = csv.DictWriter(out_handle, delimiter='\\t', fieldnames=fieldnames, lineterminator='\\n') + writer.writeheader() + for row in skipped: + writer.writerow({ + 'method': row.get('method', 'NA'), + 'group': row.get('group', 'NA'), + 'caller': row.get('caller', 'NA'), + 'reason': row.get('reason', 'NA'), + 'details': row.get('details', '') + }) + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 81f70b95..bde1405a 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -32,6 +32,8 @@ process MULTIQC { path ('peak_metrics/peak_reprod_perc/*') path ('frag_len/*') path ('linear_duplicates/*') + path ('differential/*') + path ('differential/*') output: path "*multiqc_report.html", emit: report diff --git a/modules/local/samtools_merge_bams.nf b/modules/local/samtools_merge_bams.nf new file mode 100644 index 00000000..caf9ae51 --- /dev/null +++ b/modules/local/samtools_merge_bams.nf @@ -0,0 +1,31 @@ +process SAMTOOLS_MERGE_BAMS { + tag "${meta.group}.${meta.role}" + label 'process_medium' + + conda "bioconda::samtools=1.19.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(bams) + + output: + tuple val(meta), path("${meta.group}.${meta.role}.merged.bam"), path("${meta.group}.${meta.role}.merged.bam.bai"), emit: merged + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.group}.${meta.role}.merged" + """ + samtools merge -f ${prefix}.bam ${bams} + samtools index ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(samtools --version 2>&1 | head -n 1 | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/span_capability_probe.nf b/modules/local/span_capability_probe.nf new file mode 100644 index 00000000..c0b6cd3b --- /dev/null +++ b/modules/local/span_capability_probe.nf @@ -0,0 +1,108 @@ +process SPAN_CAPABILITY_PROBE { + tag "omnipeaks" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::openjdk=21.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://eclipse-temurin:21-jre' : + 'eclipse-temurin:21-jre' }" + + input: + path omnipeaks_jar + + output: + path "omnipeaks_capabilities.json", emit: capabilities + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + java -jar ${omnipeaks_jar} --help > omnipeaks_help.txt + commands=\$(awk ' + { + line=\$0 + sub(/^[ \\t]+/, "", line) + if (line == "") next + if (match(line, /^([A-Za-z][A-Za-z0-9_-]*)[ \\t]+/, m)) { + cmd=m[1] + lc=tolower(cmd) + if (lc != "usage" && lc != "options" && lc != "commands") print cmd + } + }' omnipeaks_help.txt | sort -u) + + has_compare=false + compare_cmd="" + for cmd in \${commands}; do + lc=\$(printf '%s' "\${cmd}" | tr '[:upper:]' '[:lower:]') + if [ "\${lc}" = "compare" ] || [ "\${lc}" = "diff" ] || [ "\${lc}" = "differential" ]; then + has_compare=true + fi + if [ -z "\${compare_cmd}" ] && [ "\${lc}" = "compare" ]; then + compare_cmd="\${cmd}" + fi + done + if [ -z "\${compare_cmd}" ]; then + for cmd in \${commands}; do + lc=\$(printf '%s' "\${cmd}" | tr '[:upper:]' '[:lower:]') + if [ "\${lc}" = "diff" ] || [ "\${lc}" = "differential" ]; then + compare_cmd="\${cmd}" + break + fi + done + fi + + compare_multibam=false + compare_multibam_mode="single" + if [ -n "\${compare_cmd}" ]; then + java -jar ${omnipeaks_jar} "\${compare_cmd}" --help > omnipeaks_compare_help.txt 2>/dev/null || true + if [ -s omnipeaks_compare_help.txt ]; then + if grep -Eiq 'comma[- ]?separated|comma separated|list|multiple|replicate|replicates|bams?' omnipeaks_compare_help.txt; then + compare_multibam=true + fi + if grep -Eiq 'comma[- ]?separated|bam[^ ]*,|bam\[,?bam' omnipeaks_compare_help.txt; then + compare_multibam=true + compare_multibam_mode="comma" + fi + if grep -Eiq 'repeat|multiple[[:space:]]+times|use[[:space:]]+multiple' omnipeaks_compare_help.txt; then + if [ "\${compare_multibam_mode}" = "single" ]; then + compare_multibam_mode="repeat" + fi + compare_multibam=true + fi + if [ "\${compare_multibam}" = true ] && [ "\${compare_multibam_mode}" = "single" ]; then + compare_multibam_mode="comma" + fi + fi + fi + + if [ -n "\${commands}" ]; then + commands_json=\$(printf '%s\n' "\${commands}" | awk 'BEGIN{first=1} {gsub(/\"/,"\\\\\""); if (!first) printf ", "; printf "\"%s\"", \$0; first=0} END{print ""}') + commands_json="[ \${commands_json} ]" + else + commands_json="[]" + fi + + cat <<-END_JSON > omnipeaks_capabilities.json + { + "commands": \${commands_json}, + "has_compare": \${has_compare}, + "compare_command": "\${compare_cmd}", + "compare_multi_bam": \${compare_multibam}, + "compare_multi_bam_mode": "\${compare_multibam_mode}" + } + END_JSON + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + java: \$(java -version 2>&1 | head -n 1 | sed -e 's/"//g') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf new file mode 100644 index 00000000..19b0b308 --- /dev/null +++ b/modules/local/span_compare.nf @@ -0,0 +1,218 @@ +process SPAN_COMPARE { + tag "${meta.id}" + label 'process_long' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::openjdk=21.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://eclipse-temurin:21-jre' : + 'eclipse-temurin:21-jre' }" + + input: + tuple val(meta), path(treated_bams), path(control_bams) + path chrom_sizes + path omnipeaks_jar + val gap + val bin + val fdr + val java_heap + val compare_cmd + val multibam_mode + + output: + tuple val(meta), path("span.differential.tsv") , emit: tsv + tuple val(meta), path("span.differential.bed") , emit: bed + tuple val(meta), path("span.significant.bed") , emit: significant + tuple val(meta), path("span.significant_up.bed") , emit: up + tuple val(meta), path("span.significant_down.bed") , emit: down + tuple val(meta), path("span.summary.tsv") , emit: summary + path "00_manifests/span_compare_inputs.tsv" , emit: manifest + path "span.readme.txt" , optional: true, emit: readme + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}.span_compare" + def fdr_arg = fdr ? "--fdr ${fdr}" : '' + def bin_arg = bin ? "--bin ${bin}" : '' + def treated_manifest = meta.treated_bams instanceof List ? meta.treated_bams.join(';') : (meta.treated_bams ?: '') + def control_manifest = meta.control_bams instanceof List ? meta.control_bams.join(';') : (meta.control_bams ?: '') + def input_mode = meta.input_mode ?: 'native' + def compare_mode = multibam_mode ?: 'single' + def compare_command = compare_cmd ?: 'compare' + """ + java_cmd=\${JAVA_CMD:-java} + compare_mode="${compare_mode}" + compare_command="${compare_command}" + + treated_list=( ${treated_bams} ) + control_list=( ${control_bams} ) + if [ "\${compare_mode}" = "repeat" ]; then + treated_args="" + for bam in "\${treated_list[@]}"; do + treated_args="\${treated_args} -t \${bam}" + done + control_args="" + for bam in "\${control_list[@]}"; do + control_args="\${control_args} -c \${bam}" + done + else + treated_joined=\$(IFS=,; echo "\${treated_list[*]}") + control_joined=\$(IFS=,; echo "\${control_list[*]}") + treated_args="-t \${treated_joined}" + control_args="-c \${control_joined}" + fi + + "\$java_cmd" -Xmx${java_heap} -jar ${omnipeaks_jar} \${compare_command} \ + \${treated_args} \ + \${control_args} \ + --cs ${chrom_sizes} \ + --gap ${gap} \ + ${bin_arg} \ + ${fdr_arg} \ + -p ${prefix} + + diff_file=\$(ls ${prefix}*.peak 2>/dev/null | head -n 1 || true) + if [ -z "\$diff_file" ]; then + diff_file=\$(ls ${prefix}*.bed 2>/dev/null | head -n 1 || true) + fi + if [ -z "\$diff_file" ]; then + echo "SPAN compare produced no output" >&2 + exit 1 + fi + + cp "\$diff_file" span.differential.tsv + + cat <<'AWK' > span_compare_bed.awk + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } + BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3 } + /^#/ || /^track/ || /^browser/ { next } + !header_done { + if (!isnum(\$2)) { + for (i=1; i<=NF; i++) { + col=tolower(\$i) + if (col ~ /^(chr|chrom|seqnames|seqname)\$/) chr_idx=i + if (col == "start") start_idx=i + if (col == "end") end_idx=i + } + header_done=1 + next + } else { + header_done=1 + } + } + { print \$chr_idx,\$start_idx,\$end_idx } + AWK + awk -f span_compare_bed.awk "\$diff_file" > span.differential.bed + + : > span.significant_up.bed + : > span.significant_down.bed + cat <<'AWK' > span_compare_filter.awk + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } + function calc_fdr(val) { + if (!isnum(val)) return 1 + if (val <= 1) return val + 0 + return 10^(-val) + } + function calc_log2(val) { + if (!isnum(val)) return "" + if (val < 0) return val + 0 + if (val == 0) return 0 + return log(val) / log(2) + } + BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3; log2_idx=0; fdr_idx=0; qval_idx=0; fold_idx=0; proxy_log2=0 } + /^#/ || /^track/ || /^browser/ { next } + !header_done { + if (!isnum(\$2)) { + for (i=1; i<=NF; i++) { + col=tolower(\$i) + if (col ~ /^(chr|chrom|seqnames|seqname)\$/) chr_idx=i + if (col == "start") start_idx=i + if (col == "end") end_idx=i + if (col ~ /log2fc|log2foldchange|log2_fold_change|log2fold|log_fc|log2ratio|log2_ratio/) log2_idx=i + if (col ~ /^(fdr|false.discovery|padj|adj_p|adjp)\$/) fdr_idx=i + if (col ~ /qvalue|qval|q-value/) qval_idx=i + if ((col ~ /fold|fc/) && log2_idx==0) fold_idx=i + } + header_done=1 + next + } else { + chr_idx=1; start_idx=2; end_idx=3; log2_idx=7; qval_idx=9 + proxy_log2=1 + header_done=1 + } + } + { + log2_val="" + if (log2_idx>0 && isnum(\$log2_idx)) log2_val=\$log2_idx+0 + if (log2_val=="" && fold_idx>0 && isnum(\$fold_idx)) { log2_val=calc_log2(\$fold_idx); proxy_log2=1 } + if (log2_val=="" && log2_idx==7 && isnum(\$7)) { log2_val=calc_log2(\$7); proxy_log2=1 } + fdr_val="" + if (fdr_idx>0) fdr_val=\$fdr_idx + else if (qval_idx>0) fdr_val=\$qval_idx + else if (NF>=9) fdr_val=\$9 + fdr_val=calc_fdr(fdr_val) + if (fdr_val <= fdr_thresh) { + if (log2_val > 0) print \$chr_idx,\$start_idx,\$end_idx >> up_file + else if (log2_val < 0) print \$chr_idx,\$start_idx,\$end_idx >> down_file + } + } + END { + if (proxy_log2) print "proxy_log2" > proxy_file + } + AWK + + awk -v fdr_thresh="${fdr}" -v up_file="span.significant_up.bed" -v down_file="span.significant_down.bed" -v proxy_file="span.proxy_log2.txt" -f span_compare_filter.awk "\$diff_file" + + cat span.significant_up.bed span.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > span.significant.bed + + total=\$(awk 'function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } /^#/ || /^track/ || /^browser/ { next } !header_seen { if (!isnum(\$2)) { header_seen=1; next } header_seen=1 } { n++ } END { print n+0 }' "\$diff_file") + n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') + n_down=\$(wc -l < span.significant_down.bed | awk '{print \$1}') + n_fdr_pass=\$((n_up + n_down)) + cat <<-END_SUMMARY > span.summary.tsv + method\tgroup\tcaller\ttreated_condition\tcontrol_condition\tn_tested\tn_fdr_pass\tn_up\tn_down\tuse_spikein\tspan_mode_used + span\t${meta.group}\tNA\t${meta.treated ?: 'NA'}\t${meta.control ?: 'NA'}\t\${total}\t\${n_fdr_pass}\t\${n_up}\t\${n_down}\tNA\tnative + END_SUMMARY + + mkdir -p 00_manifests + if [ -z "${treated_manifest}" ]; then + treated_manifest=\$(IFS=';'; echo "\${treated_list[*]}") + else + treated_manifest="${treated_manifest}" + fi + if [ -z "${control_manifest}" ]; then + control_manifest=\$(IFS=';'; echo "\${control_list[*]}") + else + control_manifest="${control_manifest}" + fi + { + printf "group\\tcondition\\tinput_bams\\tinput_mode\\n" + printf "%s\\t%s\\t%s\\t%s\\n" "${meta.group}" "${meta.treated ?: 'NA'}" "\${treated_manifest}" "${input_mode}" + printf "%s\\t%s\\t%s\\t%s\\n" "${meta.group}" "${meta.control ?: 'NA'}" "\${control_manifest}" "${input_mode}" + } > 00_manifests/span_compare_inputs.tsv + + if [ -f span.proxy_log2.txt ]; then + cat <<-END_README > span.readme.txt + SPAN native differential parsing notes: + - No explicit log2FC column detected; log2FC was derived from the signal/fold-change column. + - For positive values, log2FC = log2(value); negative values are treated as already log2-scaled. + - FDR was computed from the Q-value column (column 9) assuming -log10 scale when values > 1. + - Direction follows treated (${meta.treated ?: 'NA'}) vs control (${meta.control ?: 'NA'}). + END_README + fi + rm -f span_compare_filter.awk span_compare_bed.awk span.proxy_log2.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + java: \$(\${java_cmd} -version 2>&1 | head -n 1 | sed -e 's/\"//g') + END_VERSIONS + """ +} diff --git a/modules/local/span_fallback_diff.nf b/modules/local/span_fallback_diff.nf new file mode 100644 index 00000000..2dbe6547 --- /dev/null +++ b/modules/local/span_fallback_diff.nf @@ -0,0 +1,76 @@ +process SPAN_FALLBACK_DIFF { + tag "${group}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges bioconda::bioconductor-genomicalignments bioconda::bioconductor-rsamtools conda-forge::r-base=4.2.3 conda-forge::r-optparse" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-diffbind:3.14.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-diffbind:3.14.0--r42hdfd78af_0' }" + + input: + path samplesheet + val treated + val control + val group + val use_spikein + val fdr + val backend + + output: + tuple val(group), path("span_fallback.differential.tsv") , emit: results + tuple val(group), path("span_fallback.differential.bed") , emit: bed + tuple val(group), path("span_fallback.significant.bed") , emit: significant + tuple val(group), path("span_fallback.significant_up.bed") , emit: up + tuple val(group), path("span_fallback.significant_down.bed") , emit: down + tuple val(group), path("span_fallback.summary.tsv") , emit: summary + tuple val(group), path("span_fallback.readme.txt") , emit: readme + tuple val(group), path("span_fallback.union.bed") , emit: union + tuple val(group), path("span_fallback.counts.tsv") , emit: counts + path "00_manifests/span_fallback_union_manifest.tsv" , emit: manifest + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript ${projectDir}/bin/span_fallback_de.R \\ + --samplesheet ${samplesheet} \\ + --treated ${treated} \\ + --control ${control} \\ + --group ${group} \\ + --fdr ${fdr} \\ + --backend ${backend} \\ + --use_spikein ${use_spikein} \\ + --prefix span_fallback + + mv span_fallback.results.tsv span_fallback.differential.tsv + awk 'BEGIN{OFS="\\t"} NR>1 {print \$1,\$2,\$3}' span_fallback.differential.tsv > span_fallback.differential.bed + + mkdir -p 00_manifests + peaks_list=\$(cat span_fallback.peaks.list 2>/dev/null || true) + cat <<-END_MANIFEST > 00_manifests/span_fallback_union_manifest.tsv + group\tunion_bed\tcount_matrix\tsource_peaks + ${group}\t${params.outdir}/03_peak_calling/06_differential/03_span/${group}/span_fallback.union.bed\t${params.outdir}/03_peak_calling/06_differential/03_span/${group}/span_fallback.counts.tsv\t\${peaks_list} + END_MANIFEST + rm -f span_fallback.peaks.list + + cat <<-END_README > span_fallback.readme.txt + SPAN fallback differential: SPAN peaks -> union peaks -> per-sample counts -> ${backend} differential. + Log2FC is treated (${treated}) vs control (${control}). FDR threshold: ${fdr}. + Spike-in scaling applied: ${use_spikein}. + END_README + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + deseq2: \$(Rscript -e 'packageVersion("DESeq2")' 2>/dev/null | tr -d '[]') + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/span_pooling_manifest.nf b/modules/local/span_pooling_manifest.nf new file mode 100644 index 00000000..a7974e31 --- /dev/null +++ b/modules/local/span_pooling_manifest.nf @@ -0,0 +1,52 @@ +process SPAN_POOLING_MANIFEST { + tag "${group}" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + val group + val records + + output: + tuple val(group), path("span_pooling.tsv"), emit: manifest + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def header = "group\tcondition\tpooled_bam\tpooled_bai\tinput_bams\tpooling_reason\tcreated_by" + def lines = records ? records.collect { record -> + [ + record.group, + record.condition, + record.pooled_bam ?: 'NA', + record.pooled_bai ?: 'NA', + record.input_bams ?: '', + record.pooling_reason ?: '', + record.created_by ?: 'native' + ].join('\t') + }.join('\n') : '' + """ + printf "%s\\n" "${header}" > span_pooling.tsv + if [ -n "${lines}" ]; then + printf "%s\\n" "${lines}" >> span_pooling.tsv + fi + + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') + { + echo "\\"${task.process}\\":" + echo " coreutils: \$coreutils_version" + } > versions.yml + """ +} diff --git a/modules/local/write_differential_manifests.nf b/modules/local/write_differential_manifests.nf new file mode 100644 index 00000000..2a03b360 --- /dev/null +++ b/modules/local/write_differential_manifests.nf @@ -0,0 +1,138 @@ +process WRITE_DIFFERENTIAL_MANIFESTS { + tag "differential_manifests" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + val samples_json + val peaks_json + val run_meta_json + val outdir + val chrom_sizes + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv" , emit: peaks + path "differential_manifest.run_meta.json", emit: run_meta + path "chrom_sizes.sizes" , optional: true, emit: chrom_sizes + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import json + import sys + import shutil + from pathlib import Path + + samples = json.loads(r'''${samples_json}''') + peaks = json.loads(r'''${peaks_json}''') + run_meta = json.loads(r'''${run_meta_json}''') + outdir = Path(r'''${outdir}''') + + def resolve_published(path_str): + if not path_str or path_str == 'NA': + return path_str + path = Path(path_str) + if path.exists() and outdir in path.parents: + return str(path) + if not outdir.exists(): + return path_str + matches = list(outdir.rglob(path.name)) + if matches: + return str(matches[0]) + return path_str + + samples_sorted = sorted( + samples, + key=lambda x: ( + str(x.get('group', '')), + str(x.get('condition', '')), + int(x.get('replicate', 0)), + str(x.get('sample_id', '')), + ), + ) + + peaks_sorted = sorted( + peaks, + key=lambda x: ( + str(x.get('group', '')), + str(x.get('condition', '')), + int(x.get('replicate', 0)), + str(x.get('sample_id', '')), + str(x.get('caller', '')), + ), + ) + + with open('differential_manifest.samples.tsv', 'w') as handle: + handle.write("\\t".join([ + 'sample_id', + 'group', + 'condition', + 'replicate', + 'control_group', + 'control_condition', + 'final_bam', + 'final_bai', + 'normalisation_mode', + 'spikein_scale_factor', + 'bigwig', + ]) + "\\n") + for row in samples_sorted: + handle.write("\\t".join([ + str(row.get('sample_id', '')), + str(row.get('group', '')), + str(row.get('condition', '')), + str(row.get('replicate', '')), + str(row.get('control_group', 'NA')), + str(row.get('control_condition', 'NA')), + resolve_published(str(row.get('final_bam', ''))), + resolve_published(str(row.get('final_bai', ''))), + str(row.get('normalisation_mode', '')), + str(row.get('spikein_scale_factor', 'NA')), + resolve_published(str(row.get('bigwig', 'NA'))), + ]) + "\\n") + + with open('differential_manifest.peaks.tsv', 'w') as handle: + handle.write("\\t".join([ + 'sample_id', + 'caller', + 'peaks_path', + 'peaks_format', + 'caller_role', + ]) + "\\n") + for row in peaks_sorted: + handle.write("\\t".join([ + str(row.get('sample_id', '')), + str(row.get('caller', '')), + resolve_published(str(row.get('peaks_path', ''))), + str(row.get('peaks_format', 'other')), + str(row.get('caller_role', '')), + ]) + "\\n") + + with open('differential_manifest.run_meta.json', 'w') as handle: + json.dump(run_meta, handle, indent=2, sort_keys=True) + handle.write("\\n") + + chrom = '${chrom_sizes}' + if chrom and Path(chrom).exists(): + shutil.copy(chrom, 'chrom_sizes.sizes') + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/nextflow.config b/nextflow.config index 39d9e100..999fff73 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,6 +96,50 @@ params { consensus_grouping = null replicate_threshold = 1.0 + // Differential peak calling + run_diffbind = false + run_chipbinner = false + run_span_diff = false + differential_from_run = null + differential_contrast = null + differential_min_replicates = 2 + differential_strict = false + differential_use_spikein = "auto" + differential_annotate = true + differential_cross_compare = true + differential_run_multiqc = true + + // DiffBind options + diffbind_fdr = 0.05 + diffbind_lfc = 1.0 + diffbind_backend = "DESeq2" + diffbind_min_overlap = 2 + diffbind_summits = 0 + diffbind_extra_params = null + + // ChIPBinner options + chipbinner_bin_size = 10000 + chipbinner_windows_dir = null + chipbinner_blacklist = null + chipbinner_use_input = false + chipbinner_pseudocount = 1 + chipbinner_fdr = 0.05 + chipbinner_lfc = 1.0 + chipbinner_bootstrap = 1000 + chipbinner_k_value = 100000 + chipbinner_hdbscan_grid_min_cluster_size = "100,200,500,1000" + chipbinner_hdbscan_grid_min_samples = "100,200,500,1000" + chipbinner_ms_coeffs = null + chipbinner_lola_db = null + chipbinner_run_lola = false + + // SPAN differential options + span_diff_mode = "auto" + span_diff_fdr = 0.05 + span_diff_gap = 5 + span_diff_bin = 200 + span_diff_java_heap = "8G" + // Reporting and Visualisation skip_reporting = false skip_igv = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c68e540..5d33059d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -329,14 +332,24 @@ "default": "Spikein", "fa_icon": "fab fa-buffer", "description": "Sets the target read normalisation mode. Options are: [\"Spikein\", \"RPKM\", \"CPM\", \"BPM\", \"None\" ]", - "enum": ["Spikein", "RPKM", "CPM", "BPM", "None"] + "enum": [ + "Spikein", + "RPKM", + "CPM", + "BPM", + "None" + ] }, "normalisation_scope": { "type": "string", "default": "all", "fa_icon": "fab fa-buffer", "description": "Scope for spike-in normalisation when normalisation_mode is Spikein. Options are: [all, group, group_condition].", - "enum": ["all", "group", "group_condition"] + "enum": [ + "all", + "group", + "group_condition" + ] }, "normalisation_binsize": { "type": "integer", @@ -354,7 +367,10 @@ "default": "standard", "fa_icon": "fas fa-align-justify", "description": "Convenience preset for peak callers. Options are: [standard, extended]. Ignored if --peakcaller is set.", - "enum": ["standard", "extended"] + "enum": [ + "standard", + "extended" + ] }, "use_control": { "type": "boolean", @@ -379,7 +395,11 @@ "default": "legacy", "fa_icon": "fas fa-align-justify", "description": "Control scaling scope for IgG/background samples. Options are: [legacy, group_condition, sample].", - "enum": ["legacy", "group_condition", "sample"] + "enum": [ + "legacy", + "group_condition", + "sample" + ] }, "seacr_peak_threshold": { "type": "number", @@ -392,14 +412,20 @@ "default": "non", "fa_icon": "fas fa-align-justify", "description": "SEACR normalization. ", - "enum": ["non", "norm"] + "enum": [ + "non", + "norm" + ] }, "seacr_stringent": { "type": "string", "default": "stringent", "fa_icon": "fas fa-align-justify", "description": "SEACR stringency.", - "enum": ["stringent", "relaxed"] + "enum": [ + "stringent", + "relaxed" + ] }, "macs2_qvalue": { "type": "number", @@ -463,13 +489,19 @@ "default": "group", "fa_icon": "fas fa-align-justify", "description": "Specifies what samples to group together for consensus peaks. Options are [group, all]", - "enum": ["group", "all"] + "enum": [ + "group", + "all" + ] }, "consensus_grouping": { "type": "string", "fa_icon": "fas fa-align-justify", "description": "Grouping key for consensus peaks when consensus_peak_mode is group. Options are [group, group_condition]. Defaults to group_condition when the samplesheet has a condition column, otherwise group.", - "enum": ["group", "group_condition"] + "enum": [ + "group", + "group_condition" + ] }, "replicate_threshold": { "type": "number", @@ -666,7 +698,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "singularity_pull_docker_container": { @@ -755,6 +794,210 @@ "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } + }, + "differential_options": { + "title": "Differential peak calling", + "type": "object", + "description": "Options for differential peak calling / enrichment.", + "default": "", + "fa_icon": "fas fa-balance-scale", + "properties": { + "run_diffbind": { + "type": "boolean", + "default": false, + "description": "Enable DiffBind differential analysis." + }, + "run_chipbinner": { + "type": "boolean", + "default": false, + "description": "Enable ChIPBinner differential analysis." + }, + "run_span_diff": { + "type": "boolean", + "default": false, + "description": "Enable SPAN/OmniPeaks differential analysis." + }, + "differential_from_run": { + "type": "string", + "default": null, + "description": "Run differential-only using manifests from an existing output directory.", + "fa_icon": "fas fa-folder-open" + }, + "differential_contrast": { + "type": "string", + "default": null, + "description": "Contrast definition as 'TREATED,CONTROL'." + }, + "differential_min_replicates": { + "type": "integer", + "default": 2, + "description": "Minimum replicates per condition for differential analysis." + }, + "differential_strict": { + "type": "boolean", + "default": false, + "description": "Fail on invalid differential comparisons instead of skipping." + }, + "differential_use_spikein": { + "type": "string", + "default": "auto", + "description": "Use spike-in scale factors for differential analysis (auto|true|false).", + "enum": [ + "auto", + "true", + "false" + ] + }, + "differential_annotate": { + "type": "boolean", + "default": true, + "description": "Annotate differential regions to nearest features." + }, + "differential_cross_compare": { + "type": "boolean", + "default": true, + "description": "Compute cross-caller and cross-method overlaps." + }, + "differential_run_multiqc": { + "type": "boolean", + "default": true, + "description": "Generate differential MultiQC summary tables." + }, + "diffbind_fdr": { + "type": "number", + "default": 0.05, + "description": "DiffBind FDR threshold." + }, + "diffbind_lfc": { + "type": "number", + "default": 1.0, + "description": "DiffBind minimum log2 fold-change." + }, + "diffbind_backend": { + "type": "string", + "default": "DESeq2", + "description": "DiffBind backend (DESeq2 or edgeR).", + "enum": [ + "DESeq2", + "edgeR" + ] + }, + "diffbind_min_overlap": { + "type": "integer", + "default": 2, + "description": "DiffBind minimum overlap for consensus peakset." + }, + "diffbind_summits": { + "type": "integer", + "default": 0, + "description": "Summit-centered peak width for DiffBind (0 = use full peaks)." + }, + "diffbind_extra_params": { + "type": "string", + "default": null, + "description": "JSON/YAML file with extra DiffBind options." + }, + "chipbinner_bin_size": { + "type": "integer", + "default": 10000, + "description": "Bin size in bp for ChIPBinner." + }, + "chipbinner_windows_dir": { + "type": "string", + "default": null, + "description": "Directory with precomputed bin windows for ChIPBinner." + }, + "chipbinner_blacklist": { + "type": "string", + "default": null, + "description": "Blacklist BED to exclude from ChIPBinner bins." + }, + "chipbinner_use_input": { + "type": "boolean", + "default": false, + "description": "Use control/input samples in ChIPBinner (if available)." + }, + "chipbinner_pseudocount": { + "type": "integer", + "default": 1, + "description": "Pseudocount added before log transforms." + }, + "chipbinner_fdr": { + "type": "number", + "default": 0.05, + "description": "ChIPBinner FDR threshold." + }, + "chipbinner_lfc": { + "type": "number", + "default": 1.0, + "description": "ChIPBinner minimum log2 fold-change." + }, + "chipbinner_bootstrap": { + "type": "integer", + "default": 1000, + "description": "ROTS bootstrap iterations for ChIPBinner." + }, + "chipbinner_k_value": { + "type": "integer", + "default": 100000, + "description": "ROTS K value (top list size)." + }, + "chipbinner_hdbscan_grid_min_cluster_size": { + "type": "string", + "default": "100,200,500,1000", + "description": "HDBSCAN min_cluster_size grid (comma-separated)." + }, + "chipbinner_hdbscan_grid_min_samples": { + "type": "string", + "default": "100,200,500,1000", + "description": "HDBSCAN min_samples grid (comma-separated)." + }, + "chipbinner_ms_coeffs": { + "type": "string", + "default": null, + "description": "MS coefficients YAML for ChIPBinner normalization." + }, + "chipbinner_lola_db": { + "type": "string", + "default": null, + "description": "LOLA database path for ChIPBinner enrichment." + }, + "chipbinner_run_lola": { + "type": "boolean", + "default": false, + "description": "Run LOLA enrichment for ChIPBinner clusters." + }, + "span_diff_mode": { + "type": "string", + "default": "auto", + "description": "SPAN differential mode (auto|native|fallback).", + "enum": [ + "auto", + "native", + "fallback" + ] + }, + "span_diff_fdr": { + "type": "number", + "default": 0.05, + "description": "SPAN differential FDR threshold." + }, + "span_diff_gap": { + "type": "integer", + "default": 5, + "description": "SPAN compare gap parameter." + }, + "span_diff_bin": { + "type": "integer", + "default": 200, + "description": "SPAN compare bin size." + }, + "span_diff_java_heap": { + "type": "string", + "default": "8G", + "description": "Java heap for SPAN/OmniPeaks differential." + } + } } }, "allOf": [ @@ -773,6 +1016,9 @@ { "$ref": "#/definitions/pipeline_options" }, + { + "$ref": "#/definitions/differential_options" + }, { "$ref": "#/definitions/reporting_options" }, @@ -786,4 +1032,4 @@ "$ref": "#/definitions/generic_options" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf new file mode 100644 index 00000000..e390b39d --- /dev/null +++ b/subworkflows/local/differential_peak_calling.nf @@ -0,0 +1,847 @@ +/* + * Differential peak calling / enrichment + */ + +include { WRITE_DIFFERENTIAL_MANIFESTS } from '../../modules/local/write_differential_manifests' +include { MAKE_DIFFBIND_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' +include { MAKE_DIFFBIND_SAMPLESHEET as MAKE_SPAN_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' +include { DIFFBIND_RUN } from '../../modules/local/diffbind_run' +include { CHIPBINNER_BINS } from '../../modules/local/chipbinner_bins' +include { CHIPBINNER_COUNTS } from '../../modules/local/chipbinner_counts' +include { CHIPBINNER_HDBSCAN_GRID } from '../../modules/local/chipbinner_hdbscan_grid' +include { CHIPBINNER_ROTS } from '../../modules/local/chipbinner_rots' +include { CHIPBINNER_LOLA } from '../../modules/local/chipbinner_lola' +include { SAMTOOLS_MERGE_BAMS } from '../../modules/local/samtools_merge_bams' +include { SPAN_CAPABILITY_PROBE } from '../../modules/local/span_capability_probe' +include { SPAN_COMPARE } from '../../modules/local/span_compare' +include { SPAN_FALLBACK_DIFF } from '../../modules/local/span_fallback_diff' +include { SPAN_POOLING_MANIFEST } from '../../modules/local/span_pooling_manifest' +include { ANNOTATE_REGIONS } from '../../modules/local/annotate_regions' +include { DIFFERENTIAL_OVERLAP } from '../../modules/local/differential_overlap' +include { DIFFERENTIAL_SUMMARY_MERGE } from '../../modules/local/differential_summary' + +workflow DIFFERENTIAL_PEAK_CALLING { + take: + ch_bam_bai // tuple [meta, bam, bai] + ch_control_bam_bai // tuple [meta, bam, bai] (controls) + ch_peaks // tuple [meta, peaks] + ch_bigwig // tuple [meta, bigwig] + ch_spikein_scale // tuple [meta, scale] + ch_chrom_sizes // path + callers // list of callers + + main: + ch_versions = Channel.empty() + ch_skipped_records = Channel.empty() + ch_summary_files = Channel.empty() + ch_diffbind_significant = Channel.empty() + ch_chipbinner_up = Channel.empty() + ch_chipbinner_down = Channel.empty() + ch_span_native_significant = Channel.empty() + ch_span_fallback_significant = Channel.empty() + + ch_chrom_sizes_single = ch_chrom_sizes + .map { it instanceof List ? it[0] : it } + .ifEmpty('') + ch_gene_bed = Channel.value(params.gene_bed ? file(params.gene_bed) : '') + ch_gtf = Channel.value(params.gtf ? file(params.gtf) : '') + + def contrast_parts = params.differential_contrast?.split(',')?.collect { it.trim() }?.findAll { it } + if (!contrast_parts || contrast_parts.size() != 2) { + exit 1, "Invalid --differential_contrast value. Expected 'TREATED,CONTROL'." + } + def treated = contrast_parts[0] + def control = contrast_parts[1] + + def use_spikein_param = params.differential_use_spikein?.toString()?.toLowerCase() + + ch_bam_bai_target = ch_bam_bai.filter { meta, bam, bai -> meta.is_control == false } + ch_bigwig_target = ch_bigwig.filter { meta, bw -> meta.is_control == false } + ch_scale_target = ch_spikein_scale.filter { meta, scale -> meta.is_control == false } + + ch_bigwig_map = ch_bigwig_target + .map { meta, bw -> [meta.sample_id ?: meta.id, bw] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_scale_map = ch_scale_target + .map { meta, scale -> [meta.sample_id ?: meta.id, scale] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_samples = ch_bam_bai_target + .combine(ch_bigwig_map) + .combine(ch_scale_map) + .map { meta, bam, bai, bw_map, scale_map -> + def sample_id = meta.sample_id ?: meta.id + def scale = scale_map.get(sample_id, 'NA') + def bigwig = bw_map.get(sample_id, 'NA') + [ + sample_id: sample_id, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate, + control_group: meta.control_group ?: meta.group, + control_condition: meta.control_condition ?: 'NA', + bam: bam, + bai: bai, + bam_path: bam.toString(), + bai_path: bai.toString(), + normalisation_mode: meta.normalisation_mode ?: params.normalisation_mode, + spikein_scale_factor: scale, + bigwig_path: bigwig == null ? 'NA' : bigwig.toString(), + meta: meta + ] + } + + def resolve_auto_spikein = { List records -> + def modes = records.collect { it.normalisation_mode ?: '' } + .findAll { it != null && it.toString().trim() } + .collect { it.toString() } + .unique() + def mode_is_spikein = modes.size() == 1 && modes[0].equalsIgnoreCase('Spikein') + def scales = records.collect { it.spikein_scale_factor } + def scales_present = scales && scales.every { it != null && it.toString() != 'NA' && it.toString() != '' } + return (mode_is_spikein && scales_present) ? 'true' : 'false' + } + + ch_use_spikein = Channel.value('false') + if (use_spikein_param == 'true') { + ch_use_spikein = Channel.value('true') + } else if (use_spikein_param == 'auto') { + ch_use_spikein = ch_samples + .toList() + .map { records -> resolve_auto_spikein(records) } + } + + // Write manifests in integrated mode + if (!params.differential_from_run) { + ch_samples_manifest = ch_samples + .map { rec -> + [ + sample_id: rec.sample_id, + group: rec.group, + condition: rec.condition, + replicate: rec.replicate, + control_group: rec.control_group ?: 'NA', + control_condition: rec.control_condition ?: 'NA', + final_bam: rec.bam_path, + final_bai: rec.bai_path, + normalisation_mode: rec.normalisation_mode, + spikein_scale_factor: rec.spikein_scale_factor ?: 'NA', + bigwig: rec.bigwig_path ?: 'NA' + ] + } + .toList() + .map { list -> groovy.json.JsonOutput.toJson(list) } + + ch_peaks_manifest = ch_peaks + .filter { meta, peaks -> meta.is_control == false } + .map { meta, peaks -> + def sample_id = meta.sample_id ?: meta.id + def caller_role = callers && callers[0] == meta.caller ? 'primary' : 'secondary' + def peaks_format = 'other' + def name = peaks.getName() + if (name.endsWith('.narrowPeak')) { + peaks_format = 'narrowPeak' + } else if (name.endsWith('.broadPeak')) { + peaks_format = 'broadPeak' + } else if (name.endsWith('.bed')) { + peaks_format = 'bed' + } + [ + sample_id: sample_id, + caller: meta.caller, + peaks_path: peaks.toString(), + peaks_format: peaks_format, + caller_role: caller_role, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate + ] + } + .toList() + .map { list -> groovy.json.JsonOutput.toJson(list) } + + def run_meta = [ + pipeline: workflow.manifest.name, + pipeline_version: workflow.manifest.version, + pipeline_revision: workflow.revision ?: 'NA', + genome: params.genome, + callers: callers, + default_contrast: params.differential_contrast, + chrom_sizes: "${params.outdir}/03_peak_calling/06_differential/00_manifests/chrom_sizes.sizes" + ] + + ch_run_meta = Channel.value(groovy.json.JsonOutput.toJson(run_meta)) + ch_outdir = Channel.value(params.outdir) + + WRITE_DIFFERENTIAL_MANIFESTS( + ch_samples_manifest, + ch_peaks_manifest, + ch_run_meta, + ch_outdir, + ch_chrom_sizes_single + ) + ch_versions = ch_versions.mix(WRITE_DIFFERENTIAL_MANIFESTS.out.versions) + } + + // Group-level validation + ch_group_samples = ch_samples + .map { rec -> [rec.group, rec] } + .groupTuple() + + ch_group_eval = ch_group_samples + .map { group, records -> + def treated_records = records.findAll { it.condition == treated } + def control_records = records.findAll { it.condition == control } + def conditions = records.collect { it.condition }.unique() + def other_conditions = conditions.findAll { !(it in [treated, control]) } + def reasons = [] + if (!treated_records) { + reasons << "missing_treated" + } + if (!control_records) { + reasons << "missing_control" + } + if (treated_records.size() < params.differential_min_replicates) { + reasons << "treated_replicates_lt_${params.differential_min_replicates}" + } + if (control_records.size() < params.differential_min_replicates) { + reasons << "control_replicates_lt_${params.differential_min_replicates}" + } + if (other_conditions) { + if (params.differential_strict) { + exit 1, "Group '${group}' has additional conditions (${other_conditions.join(',')}) but strict mode is enabled." + } + log.warn "Group '${group}' has additional conditions (${other_conditions.join(',')}); ignoring for differential analysis." + } + def valid = reasons.isEmpty() + if (!valid && params.differential_strict) { + exit 1, "Group '${group}' failed differential validation: ${reasons.join(';')}" + } + def filtered = records.findAll { it.condition in [treated, control] } + [group: group, records: filtered, valid: valid, reason: reasons.join(';'), details: other_conditions ? "ignored_conditions=${other_conditions.join(',')}" : ""] + } + + ch_group_eval.branch { valid: it.valid; invalid: !it.valid }.set { ch_group_split } + ch_valid_groups = ch_group_split.valid + ch_invalid_groups = ch_group_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_invalid_groups.map { rec -> [method: 'all', group: rec.group, caller: 'NA', reason: rec.reason, details: rec.details] } + ) + + // DiffBind per caller x group + if (params.run_diffbind) { + ch_peaks_map = ch_peaks + .filter { meta, peaks -> meta.is_control == false } + .map { meta, peaks -> ["${meta.group}__${meta.caller}", [meta, peaks]] } + .groupTuple() + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_diffbind_inputs = ch_valid_groups + .flatMap { rec -> + callers.collect { caller -> [group: rec.group, caller: caller, records: rec.records] } + } + .combine(ch_peaks_map) + .map { rec, peaks_map -> + def key = "${rec.group}__${rec.caller}" + def peaks_list = peaks_map.get(key, []) + def peaks_by_sample = peaks_list.collectEntries { [(it[0].sample_id ?: it[0].id): it[1]] } + def missing = rec.records.findAll { !peaks_by_sample.containsKey(it.sample_id) }.collect { it.sample_id } + def valid = missing.isEmpty() + def samples = rec.records.collect { row -> + [ + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate, + bam: row.bam_path, + peaks: peaks_by_sample.get(row.sample_id)?.toString(), + caller: rec.caller, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + [group: rec.group, caller: rec.caller, samples: samples, valid: valid, missing: missing] + } + + ch_diffbind_inputs.branch { valid: it.valid; invalid: !it.valid }.set { ch_diffbind_split } + ch_diffbind_valid = ch_diffbind_split.valid + ch_diffbind_invalid = ch_diffbind_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_diffbind_invalid.map { rec -> [method: 'diffbind', group: rec.group, caller: rec.caller, reason: 'missing_peaks', details: rec.missing.join(',')] } + ) + + ch_diffbind_valid + .map { rec -> [groovy.json.JsonOutput.toJson(rec.samples), rec.group, rec.caller] } + .set { ch_diffbind_samples } + + MAKE_DIFFBIND_SAMPLESHEET( + ch_diffbind_samples.map { samples_json, group, caller -> samples_json }, + ch_diffbind_samples.map { samples_json, group, caller -> group }, + ch_diffbind_samples.map { samples_json, group, caller -> caller } + ) + ch_versions = ch_versions.mix(MAKE_DIFFBIND_SAMPLESHEET.out.versions) + + DIFFBIND_RUN( + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> sheet }, + Channel.value(treated), + Channel.value(control), + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> group }, + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> caller }, + ch_use_spikein, + Channel.value(params.diffbind_fdr), + Channel.value(params.diffbind_lfc), + Channel.value(params.diffbind_min_overlap), + Channel.value(params.diffbind_summits), + Channel.value(params.diffbind_backend), + Channel.value(params.diffbind_extra_params ? file(params.diffbind_extra_params) : ''), + Channel.value('diffbind') + ) + ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) + ch_summary_files = ch_summary_files.mix(DIFFBIND_RUN.out.summary.map { group, caller, file -> file }) + ch_diffbind_significant = DIFFBIND_RUN.out.significant + + if (params.differential_annotate) { + ch_diffbind_annotate = DIFFBIND_RUN.out.results.map { group, caller, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_diffbind_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + // ChIPBinner per group + if (params.run_chipbinner) { + CHIPBINNER_BINS( + ch_chrom_sizes_single, + Channel.value(params.chipbinner_bin_size), + ch_valid_groups.map { rec -> rec.group }, + Channel.value(params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir) : ''), + Channel.value(params.chipbinner_blacklist ? file(params.chipbinner_blacklist) : '') + ) + ch_versions = ch_versions.mix(CHIPBINNER_BINS.out.versions) + + ch_control_map = Channel.value([:]) + if (params.chipbinner_use_input) { + ch_control_pool_inputs = ch_control_bam_bai + .map { meta, bam, bai -> + def control_group = meta.control_group ?: meta.group + ["${control_group}__${meta.condition}", [meta, bam]] + } + .groupTuple(by: [0]) + .map { key, entries -> + def meta0 = entries[0][0] + def control_group = meta0.control_group ?: meta0.group + def meta_out = [ + id: "${control_group}.${meta0.condition}.control", + group: control_group, + condition: meta0.condition, + role: "control_${meta0.condition}" + ] + [meta_out, entries.collect { it[1] }] + } + + SAMTOOLS_MERGE_BAMS(ch_control_pool_inputs) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE_BAMS.out.versions) + + ch_control_map = SAMTOOLS_MERGE_BAMS.out.merged + .map { meta, bam, bai -> [meta.group, meta.condition, bam, bai] } + .toList() + .map { list -> + def map = [:].withDefault { [:] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + map[group][condition] = [entry[2], entry[3]] + } + map + } + .ifEmpty([:]) + } + + ch_chipbinner_samples = ch_valid_groups.combine(ch_control_map).map { rec, control_map -> + def ordered = rec.records.sort { a, b -> + a.condition <=> b.condition ?: a.replicate <=> b.replicate ?: a.sample_id <=> b.sample_id + } + def samples = ordered.collect { row -> + [ + sample_id: row.sample_id, + condition: row.condition, + bam: row.bam_path, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + def bams = ordered.collect { row -> row.bam } + def bais = ordered.collect { row -> row.bai } + + def control_group = ordered ? (ordered[0].meta?.control_group ?: rec.group) : rec.group + def control_by_group = control_map.get(control_group, [:]) ?: [:] + def required_conditions = [treated, control] + def missing_controls = required_conditions.findAll { !control_by_group.containsKey(it) } + def use_input_group = params.chipbinner_use_input && missing_controls.isEmpty() + if (params.chipbinner_use_input && !use_input_group) { + def message = "ChIPBinner input normalization requested but controls missing for group '${rec.group}' conditions: ${missing_controls.join(',')}" + if (params.differential_strict) { + exit 1, message + } else { + log.warn message + } + } + + def control_bams = use_input_group ? required_conditions.collect { control_by_group[it][0] } : [] + def control_bais = use_input_group ? required_conditions.collect { control_by_group[it][1] } : [] + def control_conditions_json = use_input_group ? groovy.json.JsonOutput.toJson(required_conditions) : groovy.json.JsonOutput.toJson([]) + + [rec.group, groovy.json.JsonOutput.toJson(samples), bams, bais, control_bams, control_bais, control_conditions_json, use_input_group] + } + + ch_chipbinner_inputs = CHIPBINNER_BINS.out.bins.join(ch_chipbinner_samples) + .map { group, bins_file, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group -> + [bins_file, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group] + } + + CHIPBINNER_COUNTS( + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bins }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bams }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bais }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_bams }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_bais }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_conditions_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> use_input_group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, + ch_use_spikein, + Channel.value(params.chipbinner_pseudocount), + Channel.value(params.chipbinner_ms_coeffs ? file(params.chipbinner_ms_coeffs).toString() : '') + ) + ch_versions = ch_versions.mix(CHIPBINNER_COUNTS.out.versions) + + CHIPBINNER_HDBSCAN_GRID( + CHIPBINNER_COUNTS.out.normalized, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, + Channel.value(params.chipbinner_hdbscan_grid_min_cluster_size), + Channel.value(params.chipbinner_hdbscan_grid_min_samples) + ) + ch_versions = ch_versions.mix(CHIPBINNER_HDBSCAN_GRID.out.versions) + + def lola_db_path = params.chipbinner_lola_db ? file(params.chipbinner_lola_db) : null + def lola_db_exists = lola_db_path && lola_db_path.exists() + def lola_should_run = params.chipbinner_run_lola && lola_db_exists + if (params.chipbinner_run_lola && !lola_db_exists) { + def message = "ChIPBinner LOLA requested but --chipbinner_lola_db not found: ${params.chipbinner_lola_db}" + if (params.differential_strict) { + exit 1, message + } else { + log.warn message + } + lola_should_run = false + } + + CHIPBINNER_ROTS( + CHIPBINNER_COUNTS.out.normalized, + CHIPBINNER_HDBSCAN_GRID.out.clusters, + CHIPBINNER_HDBSCAN_GRID.out.clusters_2, + CHIPBINNER_HDBSCAN_GRID.out.clusters_3, + CHIPBINNER_HDBSCAN_GRID.out.grid_summary, + CHIPBINNER_COUNTS.out.norm_info, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, + Channel.value(treated), + Channel.value(control), + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, + Channel.value(params.chipbinner_fdr), + Channel.value(params.chipbinner_lfc), + Channel.value(params.chipbinner_bootstrap), + Channel.value(params.chipbinner_k_value), + Channel.value(lola_should_run) + ) + ch_versions = ch_versions.mix(CHIPBINNER_ROTS.out.versions) + ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) + ch_chipbinner_up = CHIPBINNER_ROTS.out.up + ch_chipbinner_down = CHIPBINNER_ROTS.out.down + + if (lola_should_run) { + ch_lola_beds = CHIPBINNER_ROTS.out.cluster_beds + .flatMap { group, manifest -> + def lines = manifest.text.readLines() + if (lines.size() <= 1) { + return [] + } + lines.drop(1).collect { line -> + def parts = line.split('\\t', -1) + if (parts.size() < 3) { + return null + } + def model = parts[0] + def label = parts[1] + def bed_path = parts[2] + if (label == 'noise') { + return null + } + def bed_file = manifest.parent ? new File(manifest.parent, bed_path) : new File(bed_path) + [group, label, model, file(bed_file.toString())] + }.findAll { it != null } + } + + ch_lola_inputs = CHIPBINNER_BINS.out.bins.join(ch_lola_beds) + .map { group, bins_file, label, model, bed_file -> [group, label, model, bed_file, bins_file] } + + CHIPBINNER_LOLA( + ch_lola_inputs.map { group, label, model, bed_file, bins_file -> [group, label, model, bed_file] }, + ch_lola_inputs.map { group, label, model, bed_file, bins_file -> bins_file }, + Channel.value(lola_db_path) + ) + ch_versions = ch_versions.mix(CHIPBINNER_LOLA.out.versions) + } + + if (params.differential_annotate) { + ch_chipbinner_annotate = CHIPBINNER_ROTS.out.results.map { group, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/differential"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_chipbinner_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + // SPAN differential per group + if (params.run_span_diff) { + def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' + def span_has_jar = params.omnipeaks_jar ? true : false + + if (span_mode == 'native' && !span_has_jar) { + exit 1, "--omnipeaks_jar is required for --span_diff_mode native" + } + if (span_mode == 'auto' && !span_has_jar) { + log.warn "SPAN diff auto mode: --omnipeaks_jar not provided; using explicit fallback when SPAN peaks are available." + } + + def span_use_native = span_mode == 'native' || (span_mode == 'auto' && span_has_jar) + def span_use_fallback = span_mode == 'fallback' || span_mode == 'auto' + + ch_span_has_compare = Channel.value(false) + ch_span_multibam = Channel.value(false) + ch_span_multibam_mode = Channel.value('single') + ch_span_compare_cmd = Channel.value('compare') + + if (span_has_jar) { + SPAN_CAPABILITY_PROBE(Channel.value(file(params.omnipeaks_jar))) + ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) + + ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities + .map { file -> new groovy.json.JsonSlurper().parse(file) } + .first() + ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } + ch_span_multibam = ch_span_caps.map { it.compare_multi_bam ?: false } + ch_span_multibam_mode = ch_span_caps.map { it.compare_multi_bam_mode ?: 'single' } + ch_span_compare_cmd = ch_span_caps.map { it.compare_command ?: 'compare' } + } + + if (span_use_native && span_has_jar) { + ch_span_native_groups = ch_valid_groups + if (span_mode == 'auto') { + ch_span_native_groups = ch_valid_groups.combine(ch_span_has_compare) + .filter { rec, has_compare -> has_compare } + .map { rec, has_compare -> rec } + } + + ch_span_direct_groups = ch_span_native_groups.combine(ch_span_multibam) + .filter { rec, multibam -> multibam } + .map { rec, multibam -> rec } + + ch_span_pooled_groups = ch_span_native_groups.combine(ch_span_multibam) + .filter { rec, multibam -> !multibam } + .map { rec, multibam -> rec } + + ch_span_compare_direct = ch_span_direct_groups.map { rec -> + def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } + def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } + def meta = [ + id: rec.group, + group: rec.group, + treated: treated, + control: control, + input_mode: 'direct', + treated_bams: treated_bams.collect { it.toString() }, + control_bams: control_bams.collect { it.toString() } + ] + [meta, treated_bams, control_bams] + } + + ch_span_pool_inputs = ch_span_pooled_groups.flatMap { rec -> + def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } + def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } + [ + [key: "${rec.group}__treated", group: rec.group, role: 'treated', condition: treated, bams: treated_bams], + [key: "${rec.group}__control", group: rec.group, role: 'control', condition: control, bams: control_bams] + ] + } + + ch_span_merge_inputs = ch_span_pool_inputs.map { rec -> + [[group: rec.group, role: rec.role, id: "${rec.group}.${rec.role}"], rec.bams] + } + SAMTOOLS_MERGE_BAMS(ch_span_merge_inputs) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE_BAMS.out.versions) + + ch_span_pool_inputs_keyed = ch_span_pool_inputs.map { rec -> [rec.key, rec] } + ch_span_pooled_outputs = SAMTOOLS_MERGE_BAMS.out.merged.map { meta, bam, bai -> + ["${meta.group}__${meta.role}", meta, bam, bai] + } + ch_span_pool_manifest_records = ch_span_pooled_outputs.join(ch_span_pool_inputs_keyed) + .map { key, meta, bam, bai, rec -> + [ + group: meta.group, + condition: rec.condition, + pooled_bam: bam.toString(), + pooled_bai: bai.toString(), + input_bams: rec.bams.collect { it.toString() }.join(';'), + pooling_reason: 'omnipeaks_no_multibam_support', + created_by: 'native' + ] + } + + ch_span_pool_manifest_grouped = ch_span_pool_manifest_records + .map { rec -> [rec.group, rec] } + .groupTuple() + .map { group, recs -> [group, recs] } + + SPAN_POOLING_MANIFEST( + ch_span_pool_manifest_grouped.map { group, recs -> group }, + ch_span_pool_manifest_grouped.map { group, recs -> recs } + ) + ch_versions = ch_versions.mix(SPAN_POOLING_MANIFEST.out.versions) + + ch_span_merged_treated = SAMTOOLS_MERGE_BAMS.out.merged + .filter { meta, bam, bai -> meta.role == 'treated' } + .map { meta, bam, bai -> [meta.group, bam] } + + ch_span_merged_control = SAMTOOLS_MERGE_BAMS.out.merged + .filter { meta, bam, bai -> meta.role == 'control' } + .map { meta, bam, bai -> [meta.group, bam] } + + ch_span_compare_pooled = ch_span_merged_treated.join(ch_span_merged_control) + .map { group, treated_bam, control_bam -> + def meta = [ + id: group, + group: group, + treated: treated, + control: control, + input_mode: 'pooled', + treated_bams: [treated_bam.toString()], + control_bams: [control_bam.toString()] + ] + [meta, treated_bam, control_bam] + } + + ch_span_compare_inputs = ch_span_compare_direct.mix(ch_span_compare_pooled) + + SPAN_COMPARE( + ch_span_compare_inputs, + ch_chrom_sizes_single, + Channel.value(file(params.omnipeaks_jar)), + Channel.value(params.span_diff_gap), + Channel.value(params.span_diff_bin), + Channel.value(params.span_diff_fdr), + Channel.value(params.span_diff_java_heap), + ch_span_compare_cmd, + ch_span_multibam_mode + ) + ch_versions = ch_versions.mix(SPAN_COMPARE.out.versions) + ch_summary_files = ch_summary_files.mix(SPAN_COMPARE.out.summary.map { meta, file -> file }) + ch_span_native_significant = ch_span_native_significant.mix( + SPAN_COMPARE.out.significant.map { meta, file -> [group: meta.group, path: file.toString(), span_mode: 'native'] } + ) + + if (params.differential_annotate) { + ch_span_annotate = SPAN_COMPARE.out.tsv.map { meta, file -> + def publish_meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}"] + [publish_meta, file] + } + ANNOTATE_REGIONS( + ch_span_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + ch_span_fallback_groups = Channel.empty() + if (span_mode == 'fallback') { + ch_span_fallback_groups = ch_valid_groups + } else if (span_mode == 'auto') { + if (!span_has_jar) { + ch_span_fallback_groups = ch_valid_groups + } else { + ch_span_fallback_groups = ch_valid_groups.combine(ch_span_has_compare) + .filter { rec, has_compare -> !has_compare } + .map { rec, has_compare -> rec } + } + } + + if (span_use_fallback) { + def span_callers = callers.findAll { it.startsWith('span_') } + if (!span_callers) { + def details = span_has_jar ? + 'SPAN caller not in --peakcaller' : + 'SPAN caller not in --peakcaller; --omnipeaks_jar missing so native compare is unavailable' + if (params.differential_strict) { + exit 1, "SPAN differential unavailable: ${details}" + } + log.warn "SPAN differential skipped: ${details}" + ch_skipped_records = ch_skipped_records.mix( + ch_span_fallback_groups.map { rec -> [method: 'span', group: rec.group, caller: 'NA', reason: 'no_span_peaks', details: details] } + ) + } else { + def span_caller = span_callers[0] + ch_span_peaks = ch_peaks + .filter { meta, peaks -> meta.is_control == false && meta.caller == span_caller } + .map { meta, peaks -> [meta.sample_id ?: meta.id, peaks] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_span_samples = ch_span_fallback_groups.combine(ch_span_peaks) + .map { rec, peaks_map -> + def missing = rec.records.findAll { !peaks_map.containsKey(it.sample_id) }.collect { it.sample_id } + def samples = rec.records.collect { row -> + [ + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate, + bam: row.bam_path, + peaks: peaks_map.get(row.sample_id)?.toString(), + caller: span_caller, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + [group: rec.group, samples: samples, missing: missing] + } + + ch_span_samples.branch { valid: it.missing.isEmpty(); invalid: !it.missing.isEmpty() }.set { ch_span_split } + ch_span_valid = ch_span_split.valid + ch_span_invalid = ch_span_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_span_invalid.map { rec -> [method: 'span', group: rec.group, caller: span_caller, reason: 'missing_peaks', details: rec.missing.join(',')] } + ) + + MAKE_SPAN_SAMPLESHEET( + ch_span_valid.map { rec -> groovy.json.JsonOutput.toJson(rec.samples) }, + ch_span_valid.map { rec -> rec.group }, + Channel.value('span_fallback') + ) + ch_versions = ch_versions.mix(MAKE_SPAN_SAMPLESHEET.out.versions) + + SPAN_FALLBACK_DIFF( + MAKE_SPAN_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> sheet }, + Channel.value(treated), + Channel.value(control), + MAKE_SPAN_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> group }, + ch_use_spikein, + Channel.value(params.span_diff_fdr), + Channel.value(params.diffbind_backend) + ) + ch_versions = ch_versions.mix(SPAN_FALLBACK_DIFF.out.versions) + ch_summary_files = ch_summary_files.mix(SPAN_FALLBACK_DIFF.out.summary.map { group, file -> file }) + ch_span_fallback_significant = ch_span_fallback_significant.mix( + SPAN_FALLBACK_DIFF.out.significant.map { group, file -> [group: group, path: file.toString(), span_mode: 'fallback'] } + ) + + if (params.differential_annotate) { + ch_span_fallback_annotate = SPAN_FALLBACK_DIFF.out.results.map { group, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/03_span/${group}"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_span_fallback_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + } + } + + if (params.differential_cross_compare) { + diffbind_beds = ch_diffbind_significant.map { group, caller, file -> + [group: group, caller: caller, path: file.toString()] + } + ch_span_effective = ch_span_native_significant + .mix(ch_span_fallback_significant) + .map { rec -> [rec.group, rec] } + .groupTuple() + .map { group, recs -> + def native_record = recs.find { it.span_mode == 'native' } + native_record ?: recs[0] + } + + method_beds = Channel.empty() + method_beds = method_beds.mix( + ch_diffbind_significant + .map { group, caller, file -> + [group: group, method: 'diffbind', caller: caller, path: file.toString(), span_mode_used: 'NA'] + } + ) + method_beds = method_beds.mix( + ch_chipbinner_up.map { group, file -> + [group: group, method: 'chipbinner', caller: 'treated_enriched', path: file.toString(), span_mode_used: 'NA'] + }, + ch_chipbinner_down.map { group, file -> + [group: group, method: 'chipbinner', caller: 'control_enriched', path: file.toString(), span_mode_used: 'NA'] + } + ) + method_beds = method_beds.mix( + ch_span_effective.map { rec -> + [group: rec.group, method: 'span', caller: 'NA', path: rec.path, span_mode_used: rec.span_mode] + } + ) + + callers_tsv = diffbind_beds + .map { rec -> [rec.group, rec.caller, rec.path].join('\t') } + .toSortedList() + .map { lines -> lines.join('\n') + '\n' } + .collectFile(name: 'overlap_callers.tsv') + methods_tsv = method_beds + .map { rec -> [rec.group, rec.method, rec.caller ?: 'NA', rec.path, rec.span_mode_used ?: 'NA'].join('\t') } + .toSortedList() + .map { lines -> lines.join('\n') + '\n' } + .collectFile(name: 'overlap_methods.tsv') + + DIFFERENTIAL_OVERLAP(callers_tsv, methods_tsv) + ch_versions = ch_versions.mix(DIFFERENTIAL_OVERLAP.out.versions) + } + + ch_summary_out = Channel.empty() + ch_skipped_out = Channel.empty() + if (params.differential_run_multiqc) { + DIFFERENTIAL_SUMMARY_MERGE( + ch_summary_files.collect().ifEmpty([]), + ch_skipped_records.toList().map { list -> groovy.json.JsonOutput.toJson(list) }, + Channel.value(file("${projectDir}/assets/multiqc/differential_summary_header.txt")), + Channel.value(file("${projectDir}/assets/multiqc/differential_skipped_header.txt")) + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_SUMMARY_MERGE.out.versions) + ch_summary_out = DIFFERENTIAL_SUMMARY_MERGE.out.summary + ch_skipped_out = DIFFERENTIAL_SUMMARY_MERGE.out.skipped + } + + emit: + summary = ch_summary_out + skipped = ch_skipped_out + versions = ch_versions +} diff --git a/subworkflows/local/prepare_peakcalling.nf b/subworkflows/local/prepare_peakcalling.nf index d9fcc019..baef316c 100644 --- a/subworkflows/local/prepare_peakcalling.nf +++ b/subworkflows/local/prepare_peakcalling.nf @@ -24,6 +24,7 @@ workflow PREPARE_PEAKCALLING { main: ch_versions = Channel.empty() ch_bedgraph = Channel.empty() + ch_spikein_scale_factors = Channel.empty() def norm_scope = normalisation_scope ?: 'all' def igg_scope = igg_scale_scope ?: 'legacy' def median = { List values -> @@ -113,6 +114,10 @@ workflow PREPARE_PEAKCALLING { NORMALISATION_FACTORS_REPORT ( ch_norm_factors ) ch_versions = ch_versions.mix(NORMALISATION_FACTORS_REPORT.out.versions) } + + ch_bam_scale_factor + .map { meta, bam, scale -> [ meta, scale ] } + .set { ch_spikein_scale_factors } } else if (norm_mode == "None") { /* @@ -302,4 +307,5 @@ workflow PREPARE_PEAKCALLING { bedgraph = UCSC_BEDCLIP.out.bedgraph // channel: [ val(meta), [ bedgraph ] ] bigwig = UCSC_BEDGRAPHTOBIGWIG.out.bigwig // channel: [ val(meta), [ bigwig ] ] versions = ch_versions // channel: [ versions.yml ] + spikein_scale_factors = ch_spikein_scale_factors // channel: [ val(meta), scale_factor ] } diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..30a54bad --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,3 @@ +sample_id caller peaks_path peaks_format caller_role +h3k27me3_Control_rep1 seacr tests/assets/differential_mock/data/h3k27me3_control.peaks.bed bed primary +h3k27me3_Treatment_rep1 seacr tests/assets/differential_mock/data/h3k27me3_treatment.peaks.bed bed primary diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json new file mode 100644 index 00000000..5088659e --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json @@ -0,0 +1,4 @@ +{ + "callers": ["seacr"], + "default_contrast": "Treatment,Control" +} diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..890cbebf --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor bigwig +h3k27me3_Control_rep1 h3k27me3 Control 1 tests/assets/differential_mock/data/h3k27me3_control.bam tests/assets/differential_mock/data/h3k27me3_control.bam.bai None NA NA +h3k27me3_Treatment_rep1 h3k27me3 Treatment 1 tests/assets/differential_mock/data/h3k27me3_treatment.bam tests/assets/differential_mock/data/h3k27me3_treatment.bam.bai None NA NA diff --git a/tests/config/differential.config b/tests/config/differential.config new file mode 100644 index 00000000..67ed1990 --- /dev/null +++ b/tests/config/differential.config @@ -0,0 +1,22 @@ +params { + outdir = "results/" + publish_dir_mode = "copy" + singularity_pull_docker_container = false +} + +process { + cpus = 2 + memory = 6.GB + time = 6.h +} + +singularity.enabled = true +singularity.autoMounts = true +conda.enabled = true +conda.channels = ['conda-forge', 'bioconda', 'defaults'] + +process { + withName: '.*DIFFBIND_RUN' { + ext.when = false + } +} diff --git a/tests/test_07_differential_peak_calling.yml b/tests/test_07_differential_peak_calling.yml new file mode 100644 index 00000000..2efa34f7 --- /dev/null +++ b/tests/test_07_differential_peak_calling.yml @@ -0,0 +1,43 @@ +- name: test_differential_missing_contrast + command: nextflow run main.nf -profile docker,test --run_diffbind --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_missing_contrast + exit_code: 1 + +- name: test_differential_manifests_samplesheet + command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc --run_diffbind --differential_contrast "Treatment,Control" --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv -c tests/config/differential.config + tags: + - test_differential + - test_differential_manifests + files: + - path: results/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv + - path: results/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv + - path: results/03_peak_calling/06_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.diffbind.csv + +- name: test_differential_chipbinner + command: nextflow run main.nf -profile docker,test --run_chipbinner --differential_contrast "Treatment,Control" --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc -c tests/config/differential.config + tags: + - test_differential + - test_differential_chipbinner + files: + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.hdbscan_grid_summary.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.best.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.2clusters.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.3clusters.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/differential/chipbinner.differential.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.control_enriched.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.treated_enriched.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.stable.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/chipbinner.summary.tsv + +- name: test_differential_only_skipped_invalid + command: nextflow run main.nf -entry DIFFERENTIAL_ONLY -profile docker,test --run_diffbind --differential_contrast "Treatment,Control" --differential_from_run tests/assets/differential_mock --skip_multiqc -c tests/config/differential.config + tags: + - test_differential + - test_differential_only + files: + - path: results/03_peak_calling/06_differential/multiqc/differential.skipped.tsv + contains: + - "treated_replicates_lt_2" + - "control_replicates_lt_2" diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index d97aae21..b8081990 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -146,6 +146,7 @@ include { DEEPTOOLS_QC } from "../subworkflo include { PEAK_QC } from "../subworkflows/local/peak_qc" include { SAMTOOLS_VIEW_SORT_STATS as FILTER_READS } from "../subworkflows/local/samtools_view_sort_stats" include { DEDUPLICATE_LINEAR } from "../subworkflows/local/deduplicate_linear" +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" /* ======================================================================================== @@ -187,6 +188,9 @@ workflow CUTANDRUN { // Init ch_software_versions = Channel.empty() + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_contrast) { + exit 1, "Differential analysis requested but --differential_contrast was not provided (expected \"TREATED,CONTROL\")." + } /* * SUBWORKFLOW: Uncompress and prepare reference genome files @@ -450,6 +454,9 @@ workflow CUTANDRUN { ch_bedgraph = Channel.empty() ch_bigwig = Channel.empty() + ch_spikein_scale_factors = Channel.empty() + ch_bam_target = Channel.empty() + ch_bam_control = Channel.empty() ch_peaks_all = Channel.empty() ch_peaks_primary = Channel.empty() ch_peaks_secondary = Channel.empty() @@ -478,6 +485,7 @@ workflow CUTANDRUN { ) ch_bedgraph = PREPARE_PEAKCALLING.out.bedgraph ch_bigwig = PREPARE_PEAKCALLING.out.bigwig + ch_spikein_scale_factors = PREPARE_PEAKCALLING.out.spikein_scale_factors ch_software_versions = ch_software_versions.mix(PREPARE_PEAKCALLING.out.versions) /* @@ -654,6 +662,59 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(CONTROL_POOLING_FALLBACKS_REPORT.out.versions) } + /* + * CHANNEL: Filter bais for target/control + */ + ch_samtools_bai.filter { it -> it[0].is_control == false } + .set { ch_bai_target } + ch_samtools_bai.filter { it -> it[0].is_control == true } + .set { ch_bai_control } + //ch_bai_target | view + //ch_bai_control | view + + /* + * CHANNEL: Combine bam and bai files on id + */ + ch_bam_target.map { row -> [row[0].id, row ].flatten()} + .join ( ch_bai_target.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + .set { ch_bam_bai } + // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] + //ch_bam_bai | view + + /* + * CHANNEL: Combine control bam and bai files on id + */ + ch_bam_control.map { row -> [row[0].id, row ].flatten()} + .join ( ch_bai_control.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + .set { ch_control_bam_bai } + //ch_control_bam_bai | view + + ch_differential_summary = Channel.empty() + ch_differential_skipped = Channel.empty() + ch_differential_versions = Channel.empty() + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff)) { + DIFFERENTIAL_PEAK_CALLING ( + ch_bam_bai, + ch_control_bam_bai, + ch_peaks_all, + ch_bigwig, + ch_spikein_scale_factors, + PREPARE_GENOME.out.chrom_sizes.collect(), + callers + ) + ch_differential_summary = DIFFERENTIAL_PEAK_CALLING.out.summary + ch_differential_skipped = DIFFERENTIAL_PEAK_CALLING.out.skipped + ch_differential_versions = DIFFERENTIAL_PEAK_CALLING.out.versions + ch_software_versions = ch_software_versions.mix(ch_differential_versions) + if (!params.run_multiqc) { + ch_differential_summary.subscribe { } + ch_differential_skipped.subscribe { } + ch_differential_versions.subscribe { } + } + } + ch_dt_corrmatrix = Channel.empty() ch_dt_pcadata = Channel.empty() ch_dt_fpmatrix = Channel.empty() @@ -791,13 +852,6 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(DEEPTOOLS_QC.out.versions) } - /* - * CHANNEL: Filter bais for target only - */ - ch_samtools_bai.filter { it -> it[0].is_control == false } - .set { ch_bai_target } - //ch_bai_target | view - if (params.run_peak_qc && params.run_peak_calling) { /* * CHANNEL: Filter flagstat for target only @@ -850,17 +904,6 @@ workflow CUTANDRUN { } //ch_peakqc_reprod_perc_mqc | view - /* - * CHANNEL: Combine bam and bai files on id - */ - - ch_bam_target.map { row -> [row[0].id, row ].flatten()} - .join ( ch_bai_target.map { row -> [row[0].id, row ].flatten()} ) - .map { row -> [row[1], row[2], row[4]] } - .set { ch_bam_bai } - // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] - //ch_bam_bai | view - /* * MODULE: Calculate fragment lengths */ @@ -936,7 +979,9 @@ workflow CUTANDRUN { ch_peakqc_count_consensus_mqc.collect{it[1]}.ifEmpty([]), ch_peakqc_reprod_perc_mqc.collect().ifEmpty([]), ch_frag_len_hist_mqc.collect().ifEmpty([]), - ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]) + ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]), + ch_differential_summary.collect().ifEmpty([]), + ch_differential_skipped.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() } diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf new file mode 100644 index 00000000..4177c42b --- /dev/null +++ b/workflows/differential_only.nf @@ -0,0 +1,170 @@ +/* + * Differential-only entrypoint (posthoc) + */ + +import groovy.json.JsonSlurper + +include { DIFFERENTIAL_PEAK_CALLING } from '../subworkflows/local/differential_peak_calling' + +workflow DIFFERENTIAL_ONLY { + main: + if (!params.differential_from_run) { + exit 1, "--differential_from_run must be provided for DIFFERENTIAL_ONLY" + } + + def run_dir = params.differential_from_run + def samples_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv") + def peaks_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv") + def run_meta_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json") + def chrom_sizes_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/chrom_sizes.sizes") + + if (!samples_path.exists()) { + exit 1, "Missing differential manifest samples.tsv under ${run_dir}" + } + if (!peaks_path.exists()) { + exit 1, "Missing differential manifest peaks.tsv under ${run_dir}" + } + if (run_meta_path.exists() && !params.differential_contrast) { + def meta = new JsonSlurper().parse(run_meta_path) + if (meta?.default_contrast) { + params.differential_contrast = meta.default_contrast + } + } + + def callers_list = [] + if (run_meta_path.exists()) { + def meta = new JsonSlurper().parse(run_meta_path) + callers_list = meta?.callers ?: [] + } + def known_conditions = [] + samples_path.eachLine { line, idx -> + if (idx == 1) { + def headers = line.split('\t', -1) + def cond_idx = headers.findIndexOf { it == 'condition' } + if (cond_idx >= 0) { + samples_path.eachLine { row, row_idx -> + if (row_idx == 1) return + if (!row.trim()) return + def fields = row.split('\t', -1) + if (fields.size() > cond_idx) { + def value = fields[cond_idx] + if (value && value != 'NA') { + known_conditions << value + } + } + } + } + } + } + known_conditions = known_conditions.unique() + + ch_samples = Channel.fromPath(samples_path).splitCsv(header: true, sep: '\t') + .map { row -> + def control_group = (row.control_group && row.control_group != 'NA') ? row.control_group : row.group + def meta = [ + id: row.sample_id, + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate.toInteger(), + control_group: control_group, + control_condition: row.control_condition ?: 'NA', + is_control: false, + normalisation_mode: row.normalisation_mode ?: params.normalisation_mode + ] + def bam = file(row.final_bam) + def bai = file(row.final_bai) + [meta, bam, bai, row.spikein_scale_factor, row.bigwig] + } + + ch_bam_bai = ch_samples.map { meta, bam, bai, scale, bigwig -> [meta, bam, bai] } + ch_spikein_scale = ch_samples + .filter { meta, bam, bai, scale, bigwig -> scale && scale != 'NA' } + .map { meta, bam, bai, scale, bigwig -> [meta, scale] } + + ch_bigwig = ch_samples + .filter { meta, bam, bai, scale, bigwig -> bigwig && bigwig != 'NA' } + .map { meta, bam, bai, scale, bigwig -> [meta, file(bigwig)] } + + // Optional pooled controls (used for ChIPBinner input normalization) + def pooled_controls_dir = file("${run_dir}/03_peak_calling/01_pooled_controls") + ch_control_bam_bai = Channel.empty() + if (pooled_controls_dir.exists()) { + ch_control_bam_bai = Channel.fromPath("${pooled_controls_dir}/*.bam") + .map { bam -> + def base = bam.baseName + def matched = known_conditions.findAll { base.endsWith("_${it}") } + def condition = matched ? matched.max { it.size() } : null + def group = base + if (condition) { + group = base[0..-(condition.size() + 2)] + } else { + def parts = base.tokenize('_') + condition = parts.size() > 1 ? parts[-1] : 'NA' + group = parts.size() > 1 ? parts[0..-2].join('_') : base + } + def bai = file("${bam}.bai") + if (!bai.exists()) { + bai = file("${bam.parent}/${bam.baseName}.bai") + } + if (!bai.exists()) { + log.warn "Missing BAI for pooled control ${bam}" + return null + } + def meta = [ + id: base, + sample_id: base, + group: group, + condition: condition, + replicate: 1, + is_control: true + ] + [meta, bam, bai] + } + .filter { it != null } + } + + ch_samples_map = ch_samples + .map { meta, bam, bai, scale, bigwig -> [meta.sample_id, meta] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_peaks = Channel.fromPath(peaks_path).splitCsv(header: true, sep: '\t') + .combine(ch_samples_map) + .map { row, samples_map -> + def meta = samples_map[row.sample_id] + if (!meta) { + return null + } + def meta_out = meta + [caller: row.caller] + [meta_out, file(row.peaks_path)] + } + .filter { it != null } + + if (!callers_list) { + callers_list = peaks_path.text.readLines() + .drop(1) + .collect { it.split('\\t')[1] } + .unique() + } + + ch_chrom_sizes = chrom_sizes_path.exists() ? Channel.value(chrom_sizes_path) : Channel.empty() + + DIFFERENTIAL_PEAK_CALLING( + ch_bam_bai, + ch_control_bam_bai, + ch_peaks, + ch_bigwig, + ch_spikein_scale, + ch_chrom_sizes, + callers_list + ) + ch_diff_summary = DIFFERENTIAL_PEAK_CALLING.out.summary + ch_diff_skipped = DIFFERENTIAL_PEAK_CALLING.out.skipped + ch_diff_versions = DIFFERENTIAL_PEAK_CALLING.out.versions + if (!params.run_multiqc) { + ch_diff_summary.subscribe { } + ch_diff_skipped.subscribe { } + ch_diff_versions.subscribe { } + } +}