diff --git a/assets/chrom_sizes_stub.sizes b/assets/chrom_sizes_stub.sizes new file mode 100644 index 00000000..7fb29dd4 --- /dev/null +++ b/assets/chrom_sizes_stub.sizes @@ -0,0 +1 @@ +chr1 1 diff --git a/assets/differential_summary_stub.tsv b/assets/differential_summary_stub.tsv new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/assets/differential_summary_stub.tsv @@ -0,0 +1 @@ + diff --git a/assets/ms_coeffs_stub.tsv b/assets/ms_coeffs_stub.tsv new file mode 100644 index 00000000..fac074f1 --- /dev/null +++ b/assets/ms_coeffs_stub.tsv @@ -0,0 +1 @@ +sample_id ms_coeff diff --git a/assets/multiqc/chipbinner_summary_header.txt b/assets/multiqc/chipbinner_summary_header.txt new file mode 100644 index 00000000..e8adddb9 --- /dev/null +++ b/assets/multiqc/chipbinner_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'chipbinner_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'ChIPBinner summary' +#description: 'Per-group ChIPBinner summary metrics.' +#plot_type: 'table' +#pconfig: +# id: 'chipbinner_summary_table' +# title: 'ChIPBinner summary' diff --git a/assets/multiqc/differential_design_header.txt b/assets/multiqc/differential_design_header.txt new file mode 100644 index 00000000..903f6285 --- /dev/null +++ b/assets/multiqc/differential_design_header.txt @@ -0,0 +1,10 @@ +#id: 'differential_design' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'Differential analysis design' +#description: 'Eligibility and design status for differential comparisons.' +#plot_type: 'table' +#pconfig: +# id: 'differential_design_table' +# title: 'Differential design' diff --git a/assets/multiqc/differential_summary_header.txt b/assets/multiqc/differential_summary_header.txt new file mode 100644 index 00000000..a4841448 --- /dev/null +++ b/assets/multiqc/differential_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'differential_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'Differential analysis summary' +#description: 'Summary of differential analysis across methods.' +#plot_type: 'table' +#pconfig: +# id: 'differential_summary_table' +# title: 'Differential summary' diff --git a/assets/multiqc/span_summary_header.txt b/assets/multiqc/span_summary_header.txt new file mode 100644 index 00000000..06d7fd67 --- /dev/null +++ b/assets/multiqc/span_summary_header.txt @@ -0,0 +1,10 @@ +#id: 'span_summary' +#parent_id: 'differential_analysis' +#parent_name: 'Differential analysis' +#parent_description: 'Differential peak calling outputs' +#section_name: 'SPAN summary' +#description: 'Per-group SPAN summary metrics.' +#plot_type: 'table' +#pconfig: +# id: 'span_summary_table' +# title: 'SPAN summary' diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 240c0d88..39dbfb40 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -64,6 +64,8 @@ report_section_order: custom_content: order: + - differential_design + - differential_summary - primary_peak_counts - consensus_peak_counts - primary_frip_score @@ -94,6 +96,9 @@ section_comments: # Customise the module search patterns to speed up execution time sp: + custom_content: + fn: "*_mqc.tsv" + cutadapt: fn: "*trimming_report.txt" diff --git a/assets/multiqc_differential_config.yml b/assets/multiqc_differential_config.yml new file mode 100644 index 00000000..7a2e886c --- /dev/null +++ b/assets/multiqc_differential_config.yml @@ -0,0 +1,18 @@ +report_comment: > + Differential analysis report generated by nf-core/cutandrun. The summary table + includes method-specific metrics, with dedicated ChIPBinner/SPAN sections for + detailed per-method summaries. + +run_modules: + - custom_content + +custom_content: + order: + - differential_design + - differential_summary + - chipbinner_summary + - span_summary + +sp: + custom_content: + fn: "*_mqc.tsv" diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py new file mode 100755 index 00000000..f15da0de --- /dev/null +++ b/bin/annotate_regions.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import subprocess +import tempfile + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.reader(handle, delimiter="\t") + rows = [row for row in reader] + return rows + + +def is_header(row): + if len(row) < 3: + return True + try: + int(row[1]) + return False + except Exception: + return True + + +def main(): + parser = argparse.ArgumentParser(description="Annotate regions with nearest gene") + parser.add_argument("--regions", required=True) + parser.add_argument("--gene-bed", required=True) + parser.add_argument("--out", required=True) + args = parser.parse_args() + + out_dir = os.path.dirname(args.out) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + rows = read_tsv(args.regions) + if not rows: + with open(args.out, "w") as handle: + handle.write("") + return + + header = None + data_rows = rows + if is_header(rows[0]): + header = rows[0] + data_rows = rows[1:] + + annotation_cols = ["nearest_feature_id", "nearest_gene_name", "distance_to_feature"] + + if not data_rows: + with open(args.out, "w") as handle: + if header: + handle.write("\t".join(header + annotation_cols) + "\n") + return + + with tempfile.TemporaryDirectory() as tmpdir: + bed_path = os.path.join(tmpdir, "regions.bed") + with open(bed_path, "w") as handle: + for idx, row in enumerate(data_rows): + handle.write(f"{row[0]}\t{row[1]}\t{row[2]}\t{idx}\n") + + cmd = [ + "bedtools", + "closest", + "-d", + "-a", + bed_path, + "-b", + args.gene_bed, + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + annotations = {} + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split("\t") + idx = int(parts[3]) + a_cols = 4 + distance = parts[-1] if parts else "NA" + b_cols = len(parts) - a_cols - 1 + gene_id = "NA" + gene_name = "NA" + if b_cols >= 4: + gene_id = parts[a_cols + 3] + if b_cols >= 5: + gene_name = parts[a_cols + 4] + if gene_id in [".", ""]: + gene_id = "NA" + if gene_name in [".", ""]: + gene_name = "NA" + annotations[idx] = (gene_id, gene_name, distance) + + with open(args.out, "w") as handle: + if header: + handle.write("\t".join(header + annotation_cols) + "\n") + for idx, row in enumerate(data_rows): + gene_id, gene_name, distance = annotations.get(idx, ("NA", "NA", "NA")) + handle.write("\t".join(row + [gene_id, gene_name, distance]) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R new file mode 100755 index 00000000..9d0d65a0 --- /dev/null +++ b/bin/chipbinner_rots.R @@ -0,0 +1,89 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) %% 2 != 0) { + stop("Arguments must be provided as --key value pairs") +} +arg_map <- list() +for (i in seq(1, length(args), by = 2)) { + key <- sub("^--", "", args[i]) + arg_map[[key]] <- args[i + 1] +} + +matrix_path <- arg_map[["matrix"]] +samplesheet_path <- arg_map[["samplesheet"]] +treated_label <- arg_map[["treated"]] +control_label <- arg_map[["control"]] +bootstrap <- as.integer(arg_map[["bootstrap"]]) +k_value <- as.integer(arg_map[["k-value"]]) +out_path <- arg_map[["out"]] + +if (is.null(matrix_path) || is.null(samplesheet_path) || is.null(out_path)) { + stop("Missing required arguments") +} + +suppressMessages(library(ROTS)) + +mat <- read.table(matrix_path, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1) +mat <- as.matrix(mat) + +samples <- read.csv(samplesheet_path, header = TRUE, stringsAsFactors = FALSE) +subset <- samples[samples$condition %in% c(treated_label, control_label), ] +if (nrow(subset) == 0) { + stop("No samples match contrast labels") +} + +sample_ids <- subset$sample_id +missing <- setdiff(sample_ids, colnames(mat)) +if (length(missing) > 0) { + stop(paste("Missing samples in matrix:", paste(missing, collapse = ","))) +} + +mat <- mat[, sample_ids, drop = FALSE] + +classes <- ifelse(subset$condition == treated_label, 1, 2) + +rot <- ROTS(mat, groups = classes, B = bootstrap, K = k_value) + +pvals <- NULL +fdrs <- NULL + +if (!is.null(rot$pvalue)) { + pvals <- rot$pvalue +} else if (!is.null(rot$pval)) { + pvals <- rot$pval +} + +if (is.null(pvals)) { + if ("pvalue" %in% slotNames(rot)) { + pvals <- slot(rot, "pvalue") + } +} + +if (is.null(pvals)) { + if (exists("pvalue", where = asNamespace("ROTS"), mode = "function")) { + pvals <- ROTS::pvalue(rot) + } +} + +if (!is.null(rot$FDR)) { + fdrs <- rot$FDR +} else if (!is.null(rot$fdr)) { + fdrs <- rot$fdr +} + +if (is.null(fdrs) && !is.null(pvals)) { + fdrs <- p.adjust(pvals, method = "BH") +} + +if (is.null(pvals)) { + stop("Unable to extract p-values from ROTS output") +} + +out <- data.frame( + bin_id = rownames(mat), + pval = as.numeric(pvals), + FDR = as.numeric(fdrs) +) + +write.table(out, file = out_path, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/chipbinner_run.py b/bin/chipbinner_run.py new file mode 100755 index 00000000..a269c2d4 --- /dev/null +++ b/bin/chipbinner_run.py @@ -0,0 +1,861 @@ +#!/usr/bin/env python3 +import argparse +import csv +import glob +import hashlib +import math +import os +import subprocess +import sys +import tempfile + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from sklearn.decomposition import PCA +from sklearn.metrics import silhouette_score +from sklearn.preprocessing import StandardScaler + +try: + import hdbscan +except ImportError as exc: + hdbscan = None + +try: + from scipy.stats import fisher_exact +except ImportError: + fisher_exact = None + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run ChIPBinner workflow") + parser.add_argument("--samples", required=True) + parser.add_argument("--group", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--chrom-sizes", required=True) + parser.add_argument("--bin-size", type=int, required=True) + parser.add_argument("--windows") + parser.add_argument("--windows-dir") + parser.add_argument("--blacklist") + parser.add_argument("--use-input", action="store_true") + parser.add_argument("--use-spikein", action="store_true") + parser.add_argument("--pseudocount", type=float, default=1.0) + parser.add_argument("--grid-minpts", default="") + parser.add_argument("--grid-minsamps", default="") + parser.add_argument("--fdr", type=float, default=0.05) + parser.add_argument("--lfc", type=float, default=1.0) + parser.add_argument("--bootstrap", type=int, default=1000) + parser.add_argument("--k-value", type=int, default=100000) + parser.add_argument("--functional-db") + parser.add_argument("--allow-partial", action="store_true") + parser.add_argument("--base-dir") + parser.add_argument("--outdir", default=".") + return parser.parse_args() + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def parse_float(value, default=None): + if value is None: + return default + text = str(value).strip() + if text in ("", "NA", "None", "none"): + return default + try: + return float(text) + except Exception: + return default + + +def genome_id_from_path(path): + base = os.path.basename(path) + if base.endswith(".sizes"): + base = base[: -len(".sizes")] + else: + base = os.path.splitext(base)[0] + return base or "genome" + + +def hash_file(path): + if not path or not os.path.exists(path): + return None + digest = hashlib.sha256() + with open(path, "rb") as handle: + for chunk in iter(lambda: handle.read(8192), b""): + digest.update(chunk) + return digest.hexdigest()[:12] + + +def ensure_dir(path): + if path: + os.makedirs(path, exist_ok=True) + + +def run_cmd(cmd, stdout=None): + result = subprocess.run(cmd, stdout=stdout, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed ({' '.join(cmd)}): {result.stderr.strip()}") + + +def choose_windows_file(windows_path, windows_dir, bin_size, chrom_sizes=None, blacklist=None): + if windows_path: + return windows_path + if not windows_dir: + return None + if os.path.isfile(windows_dir): + return windows_dir + if os.path.isdir(windows_dir): + genome_id = genome_id_from_path(chrom_sizes) if chrom_sizes else None + blacklist_hash = hash_file(blacklist) if blacklist else None + preferred = [] + if genome_id: + if blacklist_hash: + preferred.append(f"windows.{genome_id}.{bin_size}.{blacklist_hash}.bed") + preferred.append(f"windows.{genome_id}.{bin_size}.bed") + else: + preferred.append(f"windows.{bin_size}.bed") + for name in preferred: + candidate = os.path.join(windows_dir, name) + if os.path.exists(candidate): + return candidate + raise RuntimeError( + f"No windows BED found in {windows_dir}. Expected one of: {', '.join(preferred)}" + ) + return None + + +def generate_windows(chrom_sizes, bin_size, blacklist, out_path): + with tempfile.TemporaryDirectory() as tmpdir: + raw_path = os.path.join(tmpdir, "windows_raw.bed") + with open(raw_path, "w") as handle: + with open(chrom_sizes, "r") as chrom_handle: + for line in chrom_handle: + parts = line.strip().split("\t") + if len(parts) < 2: + continue + chrom, size = parts[0], int(parts[1]) + start = 0 + while start < size: + end = min(start + bin_size, size) + handle.write(f"{chrom}\t{start}\t{end}\n") + start = end + work_path = raw_path + if blacklist and os.path.exists(blacklist): + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", raw_path, "-b", blacklist], stdout=open(filtered_path, "w")) + work_path = filtered_path + with open(out_path, "w") as out_handle: + run_cmd(["sort", "-k1,1", "-k2,2n", work_path], stdout=out_handle) + + +def load_windows(path): + windows = pd.read_csv(path, sep="\t", header=None, names=["chrom", "start", "end"]) + windows["bin_id"] = windows.apply(lambda r: f"{r.chrom}:{int(r.start)}-{int(r.end)}", axis=1) + return windows + + +def write_samplesheet(samples, out_path): + header = [ + "sample_id", + "group", + "condition", + "replicate", + "final_bam", + "final_bai", + "bigwig_path", + "spikein_scale_factor", + "ms_coeff", + "input_bam", + "input_bai", + ] + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header) + writer.writeheader() + for row in samples: + writer.writerow({ + "sample_id": row.get("sample_id", ""), + "group": row.get("group", ""), + "condition": row.get("condition", ""), + "replicate": row.get("replicate", ""), + "final_bam": row.get("final_bam", ""), + "final_bai": row.get("final_bai", ""), + "bigwig_path": row.get("bigwig_path", ""), + "spikein_scale_factor": row.get("spikein_scale_factor", ""), + "ms_coeff": row.get("ms_coeff", ""), + "input_bam": row.get("input_bam", ""), + "input_bai": row.get("input_bai", ""), + }) + +def compute_counts(windows_path, samples, out_path, use_input=False): + bam_paths = [] + sample_ids = [] + for row in samples: + bam = row.get("final_bam") + if not bam or bam in ("NA", "None"): + raise RuntimeError(f"Missing final_bam for sample {row.get('sample_id')}") + if not os.path.exists(bam): + raise RuntimeError(f"BAM not found: {bam}") + bam_paths.append(bam) + sample_ids.append(row.get("sample_id", os.path.basename(bam))) + + cmd = ["bedtools", "multicov", "-bed", windows_path, "-bams"] + bam_paths + with open(out_path, "w") as handle: + run_cmd(cmd, stdout=handle) + + cols = ["chrom", "start", "end"] + sample_ids + counts = pd.read_csv(out_path, sep="\t", header=None, names=cols) + counts["bin_id"] = counts.apply(lambda r: f"{r.chrom}:{int(r.start)}-{int(r.end)}", axis=1) + + if use_input: + input_counts_map = {} + for row in samples: + bam = row.get("input_bam") + if not bam or bam in ("NA", "None", "none", ""): + raise RuntimeError("chipbinner_use_input requested but no input_bam values were provided") + if bam in input_counts_map: + continue + if not os.path.exists(bam): + raise RuntimeError(f"Input BAM not found: {bam}") + with tempfile.NamedTemporaryFile(delete=False) as tmp_handle: + tmp_path = tmp_handle.name + cmd = ["bedtools", "multicov", "-bed", windows_path, "-bams", bam] + with open(tmp_path, "w") as handle: + run_cmd(cmd, stdout=handle) + tmp_df = pd.read_csv(tmp_path, sep="\t", header=None, names=["chrom", "start", "end", "count"]) + os.unlink(tmp_path) + input_counts_map[bam] = tmp_df["count"].astype(float).values + + for sample_id, row in zip(sample_ids, samples): + bam = row.get("input_bam") + counts[sample_id] = counts[sample_id].astype(float).sub(input_counts_map[bam], axis=0).clip(lower=0) + return counts, sample_ids + + +def write_matrix(counts_df, sample_ids, out_path): + matrix = counts_df[["chrom", "start", "end", "bin_id"] + sample_ids] + matrix.to_csv(out_path, sep="\t", index=False) + + +def normalize_counts(counts_df, sample_ids, samples, use_spikein, pseudocount): + counts = counts_df[sample_ids].astype(float) + if use_spikein: + missing = [] + for row in samples: + spike_raw = row.get("spikein_scale_factor") + if parse_float(spike_raw) is None: + missing.append(row.get("sample_id", "")) + if missing: + raise RuntimeError(f"Missing spike-in scale factors for samples: {', '.join(sorted(set(missing)))}") + factors = [] + scaling_applied = [] + for idx, row in enumerate(samples): + steps = [] + applied = False + spike_raw = row.get("spikein_scale_factor") + spike = parse_float(spike_raw) if use_spikein else None + spike_size = None + if spike is not None: + if spike == 0: + raise RuntimeError("Spike-in scale factor cannot be zero") + spike_size = 1.0 / spike + # Apply the scale factor directly (equivalent to dividing by size factor). + counts.iloc[:, idx] = counts.iloc[:, idx] * spike + steps.append("spikein") + applied = True + + ms_raw = row.get("ms_coeff") + ms_coeff = parse_float(ms_raw) + ms_size = None + if ms_coeff is not None: + if ms_coeff == 0: + raise RuntimeError("MS coefficient cannot be zero") + ms_size = 1.0 / ms_coeff + counts.iloc[:, idx] = counts.iloc[:, idx] / ms_size + steps.append("ms_coeff") + applied = True + + factors.append({ + "sample_id": row.get("sample_id", ""), + "group": row.get("group", ""), + "condition": row.get("condition", ""), + "spikein_scale_factor": spike if spike is not None else "NA", + "spikein_size_factor": spike_size if spike_size is not None else "NA", + "ms_coeff": ms_coeff if ms_coeff is not None else "NA", + "ms_size_factor": ms_size if ms_size is not None else "NA", + "applied_steps": steps, + }) + scaling_applied.append(applied) + + apply_cpm = not any(scaling_applied) + if apply_cpm: + library_size = counts.sum(axis=0) + library_size[library_size == 0] = 1.0 + normalized = counts.divide(library_size, axis=1) * 1e6 + else: + normalized = counts.copy() + normalized = normalized + pseudocount + + for entry in factors: + steps = entry.get("applied_steps", []) + steps.append("pseudocount") + if apply_cpm: + steps.append("library_size_cpm") + entry["applied_steps"] = ",".join(steps) + + return normalized, pd.DataFrame(factors) + + +def plot_pca(norm_matrix, sample_ids, samples, out_path): + data = np.log2(norm_matrix + 1) + data = data.T + labels = [row.get("condition", "") for row in samples] + pca = PCA(n_components=2) + coords = pca.fit_transform(data) + plt.figure(figsize=(6, 5)) + for condition in sorted(set(labels)): + idxs = [i for i, c in enumerate(labels) if c == condition] + plt.scatter(coords[idxs, 0], coords[idxs, 1], label=condition) + for i, sample_id in enumerate(sample_ids): + plt.text(coords[i, 0], coords[i, 1], sample_id, fontsize=7) + plt.xlabel("PC1") + plt.ylabel("PC2") + plt.legend(loc="best", fontsize=7) + plt.tight_layout() + plt.savefig(out_path, dpi=150) + plt.close() + + +def plot_correlation(norm_matrix, sample_ids, out_path): + data = np.log2(norm_matrix + 1) + corr = pd.DataFrame(data, columns=sample_ids).corr() + plt.figure(figsize=(6, 5)) + sns.heatmap(corr, cmap="vlag", center=0, xticklabels=True, yticklabels=True) + plt.tight_layout() + plt.savefig(out_path, dpi=150) + plt.close() + + +def parse_grid(values, default_values): + if values: + items = [v.strip() for v in str(values).split(",") if v.strip()] + return [int(x) for x in items] + return default_values + + +def hdbscan_grid(norm_matrix, windows_df, minpts_list, minsamps_list, out_dir, summary_path): + if hdbscan is None: + raise RuntimeError("hdbscan python package is required for clustering") + ensure_dir(out_dir) + features = StandardScaler().fit_transform(norm_matrix.values) + results = [] + assignments = {} + n_bins = features.shape[0] + for minpts in minpts_list: + for minsamps in minsamps_list: + status = "OK" + n_clusters = 0 + noise_fraction = 1.0 + silhouette = np.nan + labels = None + if minpts > n_bins: + status = "INVALID" + else: + try: + clusterer = hdbscan.HDBSCAN(min_cluster_size=minpts, min_samples=minsamps) + labels = clusterer.fit_predict(features) + n_clusters = len({lab for lab in labels if lab >= 0}) + noise_fraction = float(np.sum(labels < 0)) / float(len(labels)) + if n_clusters > 1: + mask = labels >= 0 + silhouette = silhouette_score(features[mask], labels[mask]) + except Exception as exc: + status = f"FAIL:{exc}" + results.append({ + "minPts": minpts, + "minSamps": minsamps, + "n_clusters": n_clusters, + "noise_fraction": noise_fraction, + "silhouette": silhouette, + "status": status, + "labels": labels, + }) + # selection rule: maximize silhouette (if available), then minimize noise, then max clusters, then smallest params + best = None + for entry in results: + if entry["status"] != "OK": + continue + sil = entry["silhouette"] + if math.isnan(sil): + score = -1.0 - entry["noise_fraction"] + else: + score = sil - entry["noise_fraction"] + entry["score"] = score + if best is None: + best = entry + continue + if score > best["score"]: + best = entry + continue + if score == best["score"]: + if entry["n_clusters"] > best["n_clusters"]: + best = entry + continue + if entry["n_clusters"] == best["n_clusters"] and entry["noise_fraction"] < best["noise_fraction"]: + best = entry + continue + if entry["n_clusters"] == best["n_clusters"] and entry["noise_fraction"] == best["noise_fraction"]: + if (entry["minPts"], entry["minSamps"]) < (best["minPts"], best["minSamps"]): + best = entry + + if best is None: + raise RuntimeError("No successful HDBSCAN grid points") + + for entry in results: + if entry["status"] != "OK" or entry["labels"] is None: + continue + labels = entry["labels"] + out_path = os.path.join(out_dir, f"clusters_minPts{entry['minPts']}_minSamps{entry['minSamps']}.tsv") + df = windows_df.copy() + df["cluster"] = labels + df.to_csv(out_path, sep="\t", index=False) + assignments[(entry["minPts"], entry["minSamps"])] = out_path + + summary_rows = [] + for entry in results: + sil = entry["silhouette"] + sil_val = "NA" if math.isnan(sil) else f"{sil:.4f}" + score = entry.get("score") + score_val = "NA" if score is None else f"{score:.4f}" + selected = "true" if best and entry["minPts"] == best["minPts"] and entry["minSamps"] == best["minSamps"] else "false" + summary_rows.append({ + "minPts": entry["minPts"], + "minSamps": entry["minSamps"], + "n_clusters": entry["n_clusters"], + "noise_fraction": f"{entry['noise_fraction']:.4f}", + "silhouette": sil_val, + "score": score_val, + "status": entry["status"], + "selected": selected, + }) + + pd.DataFrame(summary_rows).to_csv(summary_path, sep="\t", index=False) + + chosen = { + "minPts": best["minPts"], + "minSamps": best["minSamps"], + "n_clusters": best["n_clusters"], + "labels": best["labels"], + "summary_path": summary_path, + } + return chosen, summary_path + + +def run_rots(matrix_path, samplesheet_path, treated, control, bootstrap, k_value, out_path): + script_path = os.path.join(os.path.dirname(__file__), "chipbinner_rots.R") + cmd = [ + "Rscript", + script_path, + "--matrix", matrix_path, + "--samplesheet", samplesheet_path, + "--treated", treated, + "--control", control, + "--bootstrap", str(bootstrap), + "--k-value", str(k_value), + "--out", out_path, + ] + run_cmd(cmd) + + +def bh_fdr(pvals): + pvals = np.asarray(pvals) + n = len(pvals) + order = np.argsort(pvals) + ranks = np.arange(1, n + 1) + fdr = np.empty(n, dtype=float) + fdr[order] = pvals[order] * n / ranks + fdr = np.minimum.accumulate(fdr[::-1])[::-1] + fdr[fdr > 1] = 1 + return fdr + + +def run_enrichment(windows_df, clusters_path, functional_db, out_dir): + ensure_dir(out_dir) + if not functional_db: + stub = os.path.join(out_dir, "not_run.txt") + with open(stub, "w") as handle: + handle.write("NOT_RUN\n") + return "NOT_RUN" + if fisher_exact is None: + raise RuntimeError("scipy is required for enrichment") + if os.path.isdir(functional_db): + bed_files = sorted(glob.glob(os.path.join(functional_db, "*.bed"))) + else: + bed_files = [functional_db] + if not bed_files: + raise RuntimeError("No enrichment BED files found") + + # write background windows bed + bg_bed = os.path.join(out_dir, "windows.bed") + windows_df[["chrom", "start", "end"]].to_csv(bg_bed, sep="\t", header=False, index=False) + + clusters = pd.read_csv(clusters_path, sep="\t") + clusters = clusters[clusters["cluster"] >= 0] + if clusters.empty: + stub = os.path.join(out_dir, "no_clusters.txt") + with open(stub, "w") as handle: + handle.write("NO_CLUSTERS\n") + return "NOT_RUN" + + all_results = [] + bg_total = len(windows_df) + for cluster_id in sorted(clusters["cluster"].unique()): + cluster_bins = clusters[clusters["cluster"] == cluster_id] + cluster_bed = os.path.join(out_dir, f"cluster_{cluster_id}.bed") + cluster_bins[["chrom", "start", "end"]].to_csv(cluster_bed, sep="\t", header=False, index=False) + cluster_total = len(cluster_bins) + + cluster_results = [] + for bed in bed_files: + if not os.path.exists(bed): + continue + annot = os.path.basename(bed) + p1 = subprocess.Popen(["bedtools", "intersect", "-u", "-a", cluster_bed, "-b", bed], stdout=subprocess.PIPE, text=True) + p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE, text=True) + p1.stdout.close() + overlap_cluster = int((p2.communicate()[0] or "0").strip()) + + p1 = subprocess.Popen(["bedtools", "intersect", "-u", "-a", bg_bed, "-b", bed], stdout=subprocess.PIPE, text=True) + p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE, text=True) + p1.stdout.close() + overlap_bg = int((p2.communicate()[0] or "0").strip()) + a = overlap_cluster + b = cluster_total - overlap_cluster + c = overlap_bg - overlap_cluster + d = (bg_total - overlap_bg) - b + if d < 0: + d = 0 + odds, pval = fisher_exact([[a, b], [c, d]], alternative="greater") + cluster_results.append({ + "cluster": cluster_id, + "annotation": annot, + "overlap_cluster": a, + "cluster_total": cluster_total, + "overlap_background": overlap_bg, + "background_total": bg_total, + "odds_ratio": odds, + "pval": pval, + }) + if cluster_results: + pvals = [row["pval"] for row in cluster_results] + fdrs = bh_fdr(np.array(pvals)) + for row, fdr in zip(cluster_results, fdrs): + row["fdr"] = fdr + out_path = os.path.join(out_dir, f"cluster_{cluster_id}.enrichment.tsv") + pd.DataFrame(cluster_results).to_csv(out_path, sep="\t", index=False) + all_results.extend(cluster_results) + + if all_results: + pd.DataFrame(all_results).to_csv(os.path.join(out_dir, "enrichment_summary.tsv"), sep="\t", index=False) + return "RUN" + return "NOT_RUN" + + +def write_summary(path, group, treated, control, n_bins, n_fdr, n_up, n_down, n_clusters, minpts, minsamps, status, reason, enrichment_status): + header = [ + "group", + "caller", + "treated", + "control", + "n_bins_tested", + "n_fdr_pass", + "n_up", + "n_down", + "n_clusters", + "chosen_minPts", + "chosen_minSamps", + "status", + "reason", + "enrichment_status", + ] + with open(path, "w", newline="") as handle: + writer = csv.writer(handle, delimiter="\t") + writer.writerow(header) + writer.writerow([ + group, + "NA", + treated, + control, + n_bins, + n_fdr, + n_up, + n_down, + n_clusters, + minpts, + minsamps, + status, + reason, + enrichment_status, + ]) + + +def write_error_artifact(outdir, message): + error_path = os.path.join(outdir, "chipbinner.error.txt") + with open(error_path, "w") as handle: + handle.write(f"{message}\n") + return error_path + + +def resolve_path(path, base_dir): + if not path or path in ("NA", "None", "none", ""): + return path + if os.path.isabs(path): + return path + if base_dir: + candidate = os.path.join(base_dir, path) + if os.path.exists(candidate): + return candidate + return path + + +def main(): + args = parse_args() + ensure_dir(args.outdir) + plots_dir = os.path.join(args.outdir, "plots") + enrichment_dir = os.path.join(args.outdir, "enrichment") + + contrast = [c.strip() for c in args.contrast.split(",") if c.strip()] + if len(contrast) != 2: + raise RuntimeError("Contrast must contain treated and control labels") + treated_label, control_label = contrast + + samples = read_tsv(args.samples) + if not samples: + raise RuntimeError("No samples provided") + + samples = [row for row in samples if row.get("condition") in (treated_label, control_label)] + if not samples: + raise RuntimeError("No samples match contrast labels") + + base_dir = os.path.abspath(args.base_dir) if args.base_dir else "" + if base_dir: + for row in samples: + for key in ("final_bam", "final_bai", "bigwig_path", "input_bam", "input_bai"): + row[key] = resolve_path(row.get(key), base_dir) + + sample_ids = [row.get("sample_id", "") for row in samples] + + samplesheet_path = os.path.join(args.outdir, "chipbinner.samplesheet.csv") + write_samplesheet(samples, samplesheet_path) + + if args.use_input: + missing_samples = [] + missing_paths = [] + for row in samples: + bam = row.get("input_bam") + if not bam or bam in ("NA", "None", "none", ""): + missing_samples.append(row.get("sample_id", "")) + elif not os.path.exists(bam): + missing_paths.append(bam) + if missing_samples or missing_paths: + details = [] + if missing_samples: + details.append(f"samples={','.join(sorted(set(missing_samples)))}") + if missing_paths: + details.append(f"paths={','.join(sorted(set(missing_paths)))}") + reason = "MISSING_INPUT_BAM" + if details: + reason = f"{reason}:" + ";".join(details) + if args.allow_partial: + error_path = write_error_artifact(args.outdir, reason) + write_summary( + os.path.join(args.outdir, "chipbinner.summary.tsv"), + args.group, + treated_label, + control_label, + 0, + 0, + 0, + 0, + 0, + "NA", + "NA", + "SKIP", + f"{reason};error={os.path.basename(error_path)}", + "NOT_RUN", + ) + return + raise RuntimeError( + f"chipbinner_use_input requested but input BAMs are missing ({'; '.join(details) or 'no input_bam values'})." + ) + + windows_path = os.path.join(args.outdir, "chipbinner.windows.bed") + try: + windows_source = choose_windows_file( + args.windows, args.windows_dir, args.bin_size, args.chrom_sizes, args.blacklist + ) + if windows_source: + if args.blacklist and os.path.exists(args.blacklist): + with tempfile.TemporaryDirectory() as tmpdir: + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", windows_source, "-b", args.blacklist], stdout=open(filtered_path, "w")) + run_cmd(["sort", "-k1,1", "-k2,2n", filtered_path], stdout=open(windows_path, "w")) + else: + run_cmd(["sort", "-k1,1", "-k2,2n", windows_source], stdout=open(windows_path, "w")) + else: + generate_windows(args.chrom_sizes, args.bin_size, args.blacklist, windows_path) + + windows_df = load_windows(windows_path) + + counts_raw_path = os.path.join(args.outdir, "chipbinner.bin_counts.tsv") + counts_df, sample_ids = compute_counts(windows_path, samples, counts_raw_path, use_input=args.use_input) + write_matrix(counts_df, sample_ids, counts_raw_path) + + norm_matrix, factors_df = normalize_counts(counts_df, sample_ids, samples, args.use_spikein, args.pseudocount) + norm_path = os.path.join(args.outdir, "chipbinner.normalized_matrix.tsv") + norm_matrix.insert(0, "bin_id", counts_df["bin_id"]) + norm_matrix.to_csv(norm_path, sep="\t", index=False) + + factors_path = os.path.join(args.outdir, "chipbinner.normalization_factors.tsv") + factors_df.to_csv(factors_path, sep="\t", index=False) + + ensure_dir(plots_dir) + plot_pca(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, samples, os.path.join(plots_dir, "chipbinner.pca.png")) + plot_correlation(norm_matrix.drop(columns=["bin_id"]).values, sample_ids, os.path.join(plots_dir, "chipbinner.correlation.png")) + + n_bins = len(windows_df) + if n_bins < 2: + raise RuntimeError("Not enough bins for HDBSCAN clustering (n_bins < 2)") + + def clamp_grid(values, n_bins, min_value): + vals = sorted({v for v in values if v <= n_bins}) + if not vals: + fallback = min(n_bins, max(min_value, min(10, n_bins))) + vals = [fallback] + return vals + + minpts_list = clamp_grid(parse_grid(args.grid_minpts, [100, 200, 500, 1000]), n_bins, 2) + minsamps_list = clamp_grid(parse_grid(args.grid_minsamps, [100, 200, 500, 1000]), n_bins, 1) + grid_dir = os.path.join(args.outdir, "hdbscan_grid") + grid_summary_path = os.path.join(args.outdir, "chipbinner.hdbscan_grid_summary.tsv") + chosen, grid_summary_path = hdbscan_grid(norm_matrix.drop(columns=["bin_id"]), windows_df, minpts_list, minsamps_list, grid_dir, grid_summary_path) + + # chosen clusters + chosen_df = windows_df.copy() + chosen_df["cluster"] = chosen["labels"] + clusters_path = os.path.join(args.outdir, "chipbinner.clusters.tsv") + chosen_df.to_csv(clusters_path, sep="\t", index=False) + chosen_df.to_csv(os.path.join(args.outdir, "chipbinner.clusters.best.tsv"), sep="\t", index=False) + + # Differential with ROTS + rots_out = os.path.join(args.outdir, "chipbinner.rots.tsv") + run_rots(norm_path, samplesheet_path, treated_label, control_label, args.bootstrap, args.k_value, rots_out) + rots_df = pd.read_csv(rots_out, sep="\t") + + norm_vals = norm_matrix.drop(columns=["bin_id"]).values + treated_idx = [i for i, row in enumerate(samples) if row.get("condition") == treated_label] + control_idx = [i for i, row in enumerate(samples) if row.get("condition") == control_label] + treated_mean = norm_vals[:, treated_idx].mean(axis=1) + control_mean = norm_vals[:, control_idx].mean(axis=1) + log2fc = np.log2(treated_mean) - np.log2(control_mean) + + diff_df = pd.DataFrame({ + "chrom": windows_df["chrom"], + "start": windows_df["start"], + "end": windows_df["end"], + "bin_id": windows_df["bin_id"], + "log2FC": log2fc, + }) + diff_df = diff_df.merge(rots_df, on="bin_id", how="left") + diff_df["cluster"] = chosen_df["cluster"].values + + # Standardized cluster views based on cluster mean log2FC (|mean| < 0.25 => stable) + STABLE_LFC_THRESHOLD = 0.25 + cluster_means = {} + for cluster_id in sorted(set(chosen_df["cluster"]) - {-1}): + cluster_means[cluster_id] = float(np.mean(log2fc[chosen_df["cluster"] == cluster_id])) + + def label_two(cluster_id): + if cluster_id == -1: + return "noise" + return "treated_high" if cluster_means.get(cluster_id, 0.0) >= 0 else "control_high" + + def label_three(cluster_id): + if cluster_id == -1: + return "noise" + mean_val = cluster_means.get(cluster_id, 0.0) + if mean_val >= STABLE_LFC_THRESHOLD: + return "treated_high" + if mean_val <= -STABLE_LFC_THRESHOLD: + return "control_high" + return "stable" + + cluster_mean_series = chosen_df["cluster"].map(cluster_means).fillna(np.nan) + clusters_two = chosen_df.copy() + clusters_two["cluster_label"] = clusters_two["cluster"].map(label_two) + clusters_two["cluster_mean_log2FC"] = cluster_mean_series + clusters_two.to_csv(os.path.join(args.outdir, "chipbinner.clusters.2cluster.tsv"), sep="\t", index=False) + + clusters_three = chosen_df.copy() + clusters_three["cluster_label"] = clusters_three["cluster"].map(label_three) + clusters_three["cluster_mean_log2FC"] = cluster_mean_series + clusters_three.to_csv(os.path.join(args.outdir, "chipbinner.clusters.3cluster.tsv"), sep="\t", index=False) + + direction = [] + for _, row in diff_df.iterrows(): + if row.get("FDR") is not None and float(row["FDR"]) <= args.fdr: + if row["log2FC"] >= args.lfc: + direction.append("UP") + elif row["log2FC"] <= -args.lfc: + direction.append("DOWN") + else: + direction.append("NS") + else: + direction.append("NS") + diff_df["direction"] = direction + + diff_path = os.path.join(args.outdir, "chipbinner.differential.tsv") + diff_df.to_csv(diff_path, sep="\t", index=False) + + n_bins = len(diff_df) + n_fdr = int((diff_df["FDR"] <= args.fdr).sum()) if "FDR" in diff_df else 0 + n_up = int((diff_df["direction"] == "UP").sum()) + n_down = int((diff_df["direction"] == "DOWN").sum()) + n_clusters = len({lab for lab in chosen["labels"] if lab >= 0}) + + enrichment_status = run_enrichment(windows_df, clusters_path, args.functional_db, enrichment_dir) + + summary_path = os.path.join(args.outdir, "chipbinner.summary.tsv") + write_summary(summary_path, args.group, treated_label, control_label, n_bins, n_fdr, n_up, n_down, n_clusters, chosen["minPts"], chosen["minSamps"], "RUN", "ok", enrichment_status) + + except Exception as exc: + if args.allow_partial: + error_path = write_error_artifact(args.outdir, exc) + write_summary( + os.path.join(args.outdir, "chipbinner.summary.tsv"), + args.group, + treated_label, + control_label, + 0, + 0, + 0, + 0, + 0, + "NA", + "NA", + "SKIP", + f"RUNTIME_ERROR:{os.path.basename(error_path)}", + "NOT_RUN", + ) + else: + raise + + +if __name__ == "__main__": + main() diff --git a/bin/chipbinner_windows_cache.py b/bin/chipbinner_windows_cache.py new file mode 100755 index 00000000..7ad4146d --- /dev/null +++ b/bin/chipbinner_windows_cache.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import os +import subprocess +import tempfile + + +def run_cmd(cmd, stdout_path=None): + if stdout_path: + with open(stdout_path, "w") as handle: + result = subprocess.run(cmd, stdout=handle, stderr=subprocess.PIPE, text=True) + else: + result = subprocess.run(cmd, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed ({' '.join(cmd)}): {result.stderr.strip()}") + + +def genome_id_from_path(path): + base = os.path.basename(path) + if base.endswith(".sizes"): + base = base[: -len(".sizes")] + else: + base = os.path.splitext(base)[0] + return base or "genome" + + +def hash_file(path): + if not path or not os.path.exists(path): + return "none" + digest = hashlib.sha256() + with open(path, "rb") as handle: + for chunk in iter(lambda: handle.read(8192), b""): + digest.update(chunk) + return digest.hexdigest()[:12] + + +def resolve_windows_source(windows_dir, genome_id, bin_size, blacklist_hash): + if not windows_dir: + return None + if os.path.isfile(windows_dir): + return windows_dir + if not os.path.isdir(windows_dir): + raise RuntimeError(f"--windows-dir must be a file or directory (got: {windows_dir})") + preferred = [ + f"windows.{genome_id}.{bin_size}.{blacklist_hash}.bed", + f"windows.{genome_id}.{bin_size}.bed", + ] + for name in preferred: + candidate = os.path.join(windows_dir, name) + if os.path.exists(candidate): + return candidate + raise RuntimeError( + "No matching windows BED found in " + f"{windows_dir}. Expected one of: {', '.join(preferred)}" + ) + + +def main(): + parser = argparse.ArgumentParser(description="Cache ChIPBinner windows per genome/bin/blacklist") + parser.add_argument("--chrom-sizes", required=True) + parser.add_argument("--bin-size", type=int, required=True) + parser.add_argument("--windows-dir") + parser.add_argument("--blacklist") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + genome_id = genome_id_from_path(args.chrom_sizes) + blacklist_hash = hash_file(args.blacklist) if args.blacklist else "none" + + windows_source = resolve_windows_source(args.windows_dir, genome_id, args.bin_size, blacklist_hash) + + out_name = f"windows.{genome_id}.{args.bin_size}.{blacklist_hash}.bed" + out_path = os.path.join(args.outdir, out_name) + + with tempfile.TemporaryDirectory() as tmpdir: + base_path = None + if windows_source: + base_path = windows_source + else: + base_path = os.path.join(tmpdir, "windows_raw.bed") + run_cmd(["bedtools", "makewindows", "-g", args.chrom_sizes, "-w", str(args.bin_size)], stdout_path=base_path) + + filtered_path = base_path + if args.blacklist and os.path.exists(args.blacklist): + filtered_path = os.path.join(tmpdir, "windows_filtered.bed") + run_cmd(["bedtools", "subtract", "-a", base_path, "-b", args.blacklist], stdout_path=filtered_path) + + run_cmd(["sort", "-k1,1", "-k2,2n", filtered_path], stdout_path=out_path) + + meta_path = os.path.join(args.outdir, "chipbinner_windows_meta.tsv") + with open(meta_path, "w") as handle: + handle.write( + "genome_id\tbin_size\tblacklist_hash\twindows_source\twindows_path\n" + ) + handle.write( + f"{genome_id}\t{args.bin_size}\t{blacklist_hash}\t{windows_source or 'generated'}\t{out_name}\n" + ) + + +if __name__ == "__main__": + main() diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R new file mode 100755 index 00000000..3ed16abd --- /dev/null +++ b/bin/diffbind_run.R @@ -0,0 +1,319 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +parse_args <- function(args) { + res <- list() + i <- 1 + while (i <= length(args)) { + key <- args[i] + if (grepl("^--", key)) { + name <- sub("^--", "", key) + if (i == length(args) || grepl("^--", args[i + 1])) { + res[[name]] <- TRUE + i <- i + 1 + } else { + res[[name]] <- args[i + 1] + i <- i + 2 + } + } else { + i <- i + 1 + } + } + res +} + +read_extra_params <- function(path) { + if (is.null(path) || !nzchar(path)) { + return(list()) + } + if (!file.exists(path)) { + stop(sprintf("extra_params file not found: %s", path)) + } + ext <- tolower(tools::file_ext(path)) + if (ext %in% c("yml", "yaml")) { + if (!requireNamespace("yaml", quietly = TRUE)) { + stop("YAML extra_params requires the 'yaml' R package; install it or provide JSON instead.") + } + return(yaml::yaml.load_file(path)) + } + if (requireNamespace("jsonlite", quietly = TRUE)) { + return(jsonlite::fromJSON(path, simplifyVector = FALSE)) + } + if (requireNamespace("rjson", quietly = TRUE)) { + return(rjson::fromJSON(file = path)) + } + stop("JSON extra_params requires the 'jsonlite' or 'rjson' R package.") +} + +extra_params_for <- function(extra, key) { + if (is.null(extra) || !is.list(extra)) { + return(list()) + } + if (!is.null(extra[[key]])) { + return(extra[[key]]) + } + key_dot <- gsub("_", ".", key) + if (!is.null(extra[[key_dot]])) { + return(extra[[key_dot]]) + } + key_us <- gsub("\\.", "_", key) + if (!is.null(extra[[key_us]])) { + return(extra[[key_us]]) + } + return(list()) +} + +resolve_constants <- function(value) { + if (is.list(value)) { + return(lapply(value, resolve_constants)) + } + if (is.character(value) && length(value) == 1) { + if (exists(value, envir = asNamespace("DiffBind"), inherits = FALSE)) { + return(get(value, envir = asNamespace("DiffBind"))) + } + } + value +} + +as_bool <- function(val) { + if (is.null(val)) { + return(FALSE) + } + if (is.logical(val)) { + return(val) + } + val <- tolower(as.character(val)) + return(val %in% c("true", "1", "yes")) +} + +params <- parse_args(args) +records_path <- params[["records"]] +outdir <- params[["outdir"]] +contrast <- params[["contrast"]] +caller <- params[["caller"]] +group <- params[["group"]] + +fdr <- as.numeric(params[["fdr"]]) +lfc <- as.numeric(params[["lfc"]]) +min_overlap <- as.integer(params[["min_overlap"]]) +backend <- params[["backend"]] +recenter <- as_bool(params[["recenter"]]) +summits <- as.integer(params[["summits"]]) +norm_method <- params[["norm_method"]] +export_sheets <- as_bool(params[["export_sheets"]]) +allow_partial <- as_bool(params[["allow_partial"]]) +extra_params_path <- params[["extra_params"]] +use_spikein <- as_bool(params[["use_spikein"]]) + +if (is.null(records_path) || is.null(outdir)) { + stop("Missing required arguments: --records and --outdir") +} + +if (is.na(fdr)) fdr <- 0.05 +if (is.na(lfc)) lfc <- 1.0 +if (is.na(min_overlap)) min_overlap <- 2 +if (is.na(summits)) summits <- 0 + +if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) +} + +records <- read.delim(records_path, stringsAsFactors = FALSE, check.names = FALSE) + +samplesheet <- data.frame( + SampleID = records$sample_id, + Factor = records$group, + Condition = records$condition, + Replicate = records$replicate, + bamReads = records$bam, + Peaks = records$peaks, + PeakCaller = records$caller, + Tissue = "CUTRUN", + stringsAsFactors = FALSE +) + +samplesheet_path <- file.path(outdir, "diffbind.samplesheet.csv") +write.csv(samplesheet, samplesheet_path, row.names = FALSE) + +contrast_labels <- strsplit(contrast, ",")[[1]] +contrast_labels <- trimws(contrast_labels) +if (length(contrast_labels) != 2) { + stop("contrast must contain exactly two comma-separated labels") +} + +write_summary <- function(status, reason, n_tested = 0, n_fdr_pass = 0, n_up = 0, n_down = 0) { + summary <- data.frame( + caller = caller, + group = group, + treated = contrast_labels[1], + control = contrast_labels[2], + n_tested = n_tested, + n_fdr_pass = n_fdr_pass, + n_up = n_up, + n_down = n_down, + status = status, + reason = reason, + stringsAsFactors = FALSE + ) + write.table(summary, file = file.path(outdir, "diffbind.summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) +} + +write_error <- function(message) { + writeLines(message, con = file.path(outdir, "diffbind.error.txt")) +} + +fail_or_record <- function(status, reason, message) { + if (allow_partial) { + if (identical(status, "FAIL") && !is.null(message) && nzchar(message)) { + write_error(message) + } + write_summary(status, reason) + quit(save = "no") + } + stop(message) +} + +check_peaks <- function(paths) { + issues <- list(missing = character(), empty = character()) + if (is.null(paths)) { + return(issues) + } + for (path in paths) { + if (is.na(path) || !nzchar(path)) { + issues$missing <- c(issues$missing, "") + next + } + if (!file.exists(path)) { + issues$missing <- c(issues$missing, path) + next + } + lines <- tryCatch(readLines(path, n = 2, warn = FALSE), error = function(e) character()) + nonempty <- lines[!grepl("^\\s*$|^#", lines)] + if (length(nonempty) == 0) { + issues$empty <- c(issues$empty, path) + } + } + issues +} + +peaks_issues <- check_peaks(samplesheet$Peaks) +if (length(peaks_issues$missing) > 0 || length(peaks_issues$empty) > 0) { + reason <- if (length(peaks_issues$missing) > 0) "missing_peaks" else "empty_peaks" + message_parts <- c() + if (length(peaks_issues$missing) > 0) { + message_parts <- c(message_parts, sprintf("missing peak files: %s", paste(unique(peaks_issues$missing), collapse = ", "))) + } + if (length(peaks_issues$empty) > 0) { + message_parts <- c(message_parts, sprintf("empty peak files: %s", paste(unique(peaks_issues$empty), collapse = ", "))) + } + detail <- paste(message_parts, collapse = "; ") + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, reason, detail) +} + +if (!requireNamespace("DiffBind", quietly = TRUE)) { + fail_or_record("FAIL", "diffbind_unavailable", "DiffBind package not available.") +} + +call_diffbind <- function(fun_name, args) { + if (!exists(fun_name, envir = asNamespace("DiffBind"), inherits = FALSE)) { + stop(sprintf("DiffBind function %s is not available.", fun_name)) + } + fun <- get(fun_name, envir = asNamespace("DiffBind")) + formal_names <- names(formals(fun)) + args <- args[names(args) %in% formal_names] + do.call(fun, args) +} + +plot_to_pdf <- function(filename, fun_name, args) { + plots_dir <- file.path(outdir, "plots") + if (!dir.exists(plots_dir)) { + dir.create(plots_dir, recursive = TRUE) + } + pdf(file.path(plots_dir, filename)) + on.exit(dev.off(), add = TRUE) + call_diffbind(fun_name, args) +} + +tryCatch({ + suppressPackageStartupMessages(library(DiffBind)) + extra_params <- resolve_constants(read_extra_params(extra_params_path)) + + dba_args <- utils::modifyList(list(sampleSheet = samplesheet), extra_params_for(extra_params, "dba")) + dba_obj <- do.call(dba, dba_args) + summit_size <- if (recenter) summits else 0 + count_args <- utils::modifyList(list(DBA = dba_obj, summits = summit_size, minOverlap = min_overlap), extra_params_for(extra_params, "dba_count")) + dba_obj <- do.call(dba.count, count_args) + + if (use_spikein) { + if (!"spikein_scale_factor" %in% colnames(records)) { + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, "missing_spikein_scale_factor", "Missing spikein_scale_factor column in records; required when --differential_use_spikein is enabled.") + } + factors_raw <- records$spikein_scale_factor + factors_raw_chr <- trimws(as.character(factors_raw)) + factors <- suppressWarnings(as.numeric(factors_raw_chr)) + missing_idx <- which(is.na(factors) | is.na(factors_raw) | factors_raw_chr == "") + if (length(missing_idx) > 0) { + missing_samples <- unique(records$sample_id[missing_idx]) + status <- if (allow_partial) "SKIP" else "FAIL" + fail_or_record(status, "missing_spikein_scale_factor", sprintf("Missing spike-in scale factors for samples: %s", paste(missing_samples, collapse = ", "))) + } + size_factors <- ifelse(factors == 0, 1, 1 / factors) + names(size_factors) <- records$sample_id + size_factors <- size_factors[samplesheet$SampleID] + norm_args <- utils::modifyList(list(DBA = dba_obj, normalize = DBA_NORM_LIB, library = size_factors), extra_params_for(extra_params, "dba_normalize")) + dba_obj <- do.call(dba.normalize, norm_args) + norm_out <- data.frame(sample_id = samplesheet$SampleID, size_factor = size_factors, stringsAsFactors = FALSE) + write.table(norm_out, file = file.path(outdir, "diffbind.normalization_factors.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + } else if (!is.null(norm_method) && norm_method != "native") { + norm_args <- utils::modifyList(list(DBA = dba_obj, method = norm_method), extra_params_for(extra_params, "dba_normalize")) + dba_obj <- do.call(dba.normalize, norm_args) + } + + contrast_args <- utils::modifyList(list(DBA = dba_obj, categories = DBA_CONDITION, group1 = contrast_labels[1], group2 = contrast_labels[2]), extra_params_for(extra_params, "dba_contrast")) + dba_obj <- do.call(dba.contrast, contrast_args) + + method_flag <- ifelse(toupper(backend) == "EDGER", DBA_EDGER, DBA_DESEQ2) + analyze_args <- utils::modifyList(list(DBA = dba_obj, method = method_flag), extra_params_for(extra_params, "dba_analyze")) + dba_obj <- do.call(dba.analyze, analyze_args) + + plot_to_pdf("PCA.pdf", "dba.plotPCA", list(DBA = dba_obj, attributes = DBA_CONDITION, label = DBA_ID)) + plot_to_pdf("correlation_heatmap.pdf", "dba.plotHeatmap", list(DBA = dba_obj, correlations = TRUE)) + plot_to_pdf("MA.pdf", "dba.plotMA", list(DBA = dba_obj, contrast = 1)) + plot_to_pdf("volcano.pdf", "dba.plotVolcano", list(DBA = dba_obj, contrast = 1)) + + report_args <- utils::modifyList(list(DBA = dba_obj, th = fdr, fold = lfc), extra_params_for(extra_params, "dba_report")) + report <- do.call(dba.report, report_args) + results <- as.data.frame(report) + results_out <- data.frame( + chr = results$seqnames, + start = results$start, + end = results$end, + log2FC = results$Fold, + pval = results$p.value, + FDR = results$FDR, + stringsAsFactors = FALSE + ) + write.table(results_out, file = file.path(outdir, "diffbind.results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- results_out[results_out$FDR <= fdr & abs(results_out$log2FC) >= lfc, ] + write.table(sig[, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC >= lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(sig[sig$log2FC <= -lfc, c("chr", "start", "end")], file = file.path(outdir, "diffbind.significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + write_summary( + status = "RUN", + reason = "ok", + n_tested = nrow(results_out), + n_fdr_pass = nrow(sig), + n_up = sum(sig$log2FC >= lfc), + n_down = sum(sig$log2FC <= -lfc) + ) + + saveRDS(dba_obj, file = file.path(outdir, "diffbind.dba.rds")) + +}, error = function(e) { + fail_or_record("FAIL", "diffbind_error", e$message) +}) diff --git a/bin/differential_design.py b/bin/differential_design.py new file mode 100755 index 00000000..4e9d683c --- /dev/null +++ b/bin/differential_design.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +import argparse +import csv +import sys + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def parse_list(value): + if not value: + return None + items = [x.strip() for x in value.split(",") if x.strip()] + return set(items) if items else None + + +def is_span_caller(name): + if not name: + return False + lowered = str(name).lower() + return lowered.startswith("span") or lowered.startswith("omnipeak") + + +def main(): + parser = argparse.ArgumentParser(description="Build differential design manifest") + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--min-replicates", type=int, required=True) + parser.add_argument("--allow-partial", action="store_true") + parser.add_argument("--run-diffbind", action="store_true") + parser.add_argument("--run-chipbinner", action="store_true") + parser.add_argument("--run-span", action="store_true") + parser.add_argument("--groups") + parser.add_argument("--callers") + parser.add_argument("--out", default="differential_manifest.design.tsv") + args = parser.parse_args() + + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] + if len(contrast) != 2: + print("ERROR: --differential_contrast must contain exactly two comma-separated labels.", file=sys.stderr) + sys.exit(1) + treated, control = contrast + + samples = read_tsv(args.samples) + peaks = read_tsv(args.peaks) + + if not samples: + print("ERROR: differential manifest has no samples.", file=sys.stderr) + sys.exit(1) + + conditions_present = {row.get("condition") for row in samples if row.get("condition")} + if treated not in conditions_present or control not in conditions_present: + print( + "ERROR: differential_contrast labels not found in samples: " + f"treated={treated} control={control} present={sorted(conditions_present)}", + file=sys.stderr, + ) + sys.exit(1) + + group_allow = parse_list(args.groups) + caller_allow = parse_list(args.callers) + + # Index samples by group + samples_by_group = {} + for row in samples: + group = row.get("group") or "" + samples_by_group.setdefault(group, []).append(row) + + # Index peaks by (group, caller, sample_id) + peaks_by_group_caller = {} + for row in peaks: + group = row.get("group") or "" + caller = row.get("caller") or "" + sample_id = row.get("sample_id") or "" + key = (group, caller) + peaks_by_group_caller.setdefault(key, set()).add(sample_id) + + all_callers = sorted({row.get("caller") or "" for row in peaks}) + + span_callers_present = True + if args.run_span: + span_callers = [caller for caller in all_callers if is_span_caller(caller)] + span_callers_present = bool(span_callers) + if not span_callers_present: + print( + "WARNING: run_span_diff requested but no SPAN/OmniPeak peaks found in manifest; " + "SPAN differential will be skipped.", + file=sys.stderr, + ) + + design_rows = [] + fail_required = False + + for group in sorted(samples_by_group.keys()): + group_samples = samples_by_group[group] + conditions = sorted({row.get("condition") for row in group_samples if row.get("condition")}) + ignored = [c for c in conditions if c not in contrast] + ignored_conditions = ",".join(ignored) if ignored else "NA" + + treated_samples = [row for row in group_samples if row.get("condition") == treated] + control_samples = [row for row in group_samples if row.get("condition") == control] + + treated_reps = {row.get("replicate") for row in treated_samples if row.get("replicate")} + control_reps = {row.get("replicate") for row in control_samples if row.get("replicate")} + + n_treated = len(treated_reps) + n_control = len(control_reps) + + group_filtered = group_allow is not None and group not in group_allow + + base_status = "RUN" + base_reason = "ok" + if group_filtered: + base_status = "SKIP" + base_reason = "group_filtered" + elif not treated_samples or not control_samples: + base_status = "SKIP" + base_reason = "missing_condition" + elif n_treated < args.min_replicates or n_control < args.min_replicates: + if args.allow_partial: + base_status = "SKIP" + base_reason = "insufficient_replicates" + else: + base_status = "FAIL" + base_reason = "insufficient_replicates" + + group_eligible = base_status == "RUN" + + # Group-level row (chipbinner/span) + if args.run_chipbinner or args.run_span: + status = base_status + reason = base_reason + eligible_chipbinner = args.run_chipbinner and group_eligible + eligible_span = args.run_span and span_callers_present and group_eligible + if not (args.run_chipbinner or args.run_span): + status = "SKIP" + reason = "methods_disabled" + else: + status = "SKIP" + reason = "methods_disabled" + eligible_chipbinner = False + eligible_span = False + + design_rows.append({ + "group": group, + "caller": "NA", + "treated_condition": treated, + "control_condition": control, + "n_treated": str(n_treated), + "n_control": str(n_control), + "eligible_diffbind": "false", + "eligible_chipbinner": "true" if eligible_chipbinner else "false", + "eligible_span": "true" if eligible_span else "false", + "status": status, + "reason": reason, + "ignored_conditions": ignored_conditions, + }) + + if status == "FAIL" and (args.run_chipbinner or args.run_span) and not args.allow_partial: + fail_required = True + + # Diffbind rows per caller + for caller in all_callers: + if caller_allow is not None and caller not in caller_allow: + status = "SKIP" + reason = "caller_filtered" + eligible_diffbind = False + elif not args.run_diffbind: + status = "SKIP" + reason = "diffbind_disabled" + eligible_diffbind = False + else: + status = base_status + reason = base_reason + eligible_diffbind = group_eligible + + if status == "RUN": + key = (group, caller) + caller_samples = peaks_by_group_caller.get(key, set()) + needed_ids = {row.get("sample_id") for row in treated_samples + control_samples} + missing = sorted([sid for sid in needed_ids if sid not in caller_samples]) + if missing: + if args.allow_partial: + status = "SKIP" + reason = "missing_peaks" + else: + status = "FAIL" + reason = "missing_peaks" + eligible_diffbind = False + + design_rows.append({ + "group": group, + "caller": caller, + "treated_condition": treated, + "control_condition": control, + "n_treated": str(n_treated), + "n_control": str(n_control), + "eligible_diffbind": "true" if eligible_diffbind else "false", + "eligible_chipbinner": "true" if (args.run_chipbinner and group_eligible) else "false", + "eligible_span": "true" + if (args.run_span and span_callers_present and group_eligible) + else "false", + "status": status, + "reason": reason, + "ignored_conditions": ignored_conditions, + }) + + if status == "FAIL" and args.run_diffbind and not args.allow_partial: + fail_required = True + + # Sort rows deterministically + design_rows.sort(key=lambda r: (r["group"], r["caller"], r["treated_condition"], r["control_condition"])) + + header = [ + "group", + "caller", + "treated_condition", + "control_condition", + "n_treated", + "n_control", + "eligible_diffbind", + "eligible_chipbinner", + "eligible_span", + "status", + "reason", + "ignored_conditions", + ] + + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(design_rows) + + if fail_required: + print("ERROR: differential design has FAIL entries and --differential_allow_partial is false.", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bin/differential_manifests.py b/bin/differential_manifests.py new file mode 100755 index 00000000..a7b3227e --- /dev/null +++ b/bin/differential_manifests.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import sys + + +def detect_delimiter(path): + with open(path, "r", newline="") as handle: + header = handle.readline() + if "\t" in header: + return "\t" + return "," + + +def read_table(path): + delim = detect_delimiter(path) + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter=delim) + rows = [row for row in reader] + return rows + + +def load_ms_coeffs(path): + if not path: + return {} + rows = read_table(path) + if not rows: + return {} + header = rows[0].keys() + if "sample_id" not in header or "ms_coeff" not in header: + raise ValueError("ms_coeffs file must contain columns: sample_id, ms_coeff") + coeffs = {} + for row in rows: + sample_id = row.get("sample_id") + if not sample_id: + continue + coeffs[sample_id] = row.get("ms_coeff") or "NA" + return coeffs + + +def sort_key(value): + if value is None: + return "" + return str(value) + + +def write_samples(samples_rows, ms_coeffs, normalisation_mode, out_path): + header = [ + "sample_id", + "group", + "condition", + "replicate", + "final_bam", + "final_bai", + "normalisation_mode", + "spikein_scale_factor", + "ms_coeff", + "bigwig_path", + "input_bam", + "input_bai", + ] + rows = [] + for row in samples_rows: + sample_id = row.get("sample_id") or "" + group = row.get("group") or "" + condition = row.get("condition") or "" + replicate = row.get("replicate") or "" + final_bam = row.get("final_bam") or "NA" + final_bai = row.get("final_bai") or "NA" + spikein = row.get("spikein_scale_factor") or "NA" + bigwig = row.get("bigwig_path") or "NA" + input_bam = row.get("input_bam") or "NA" + input_bai = row.get("input_bai") or "NA" + ms_coeff = ms_coeffs.get(sample_id, "NA") + rows.append({ + "sample_id": sample_id, + "group": group, + "condition": condition, + "replicate": replicate, + "final_bam": final_bam, + "final_bai": final_bai, + "normalisation_mode": normalisation_mode, + "spikein_scale_factor": spikein, + "ms_coeff": ms_coeff, + "bigwig_path": bigwig, + "input_bam": input_bam, + "input_bai": input_bai, + }) + + rows.sort(key=lambda r: (r["group"], "NA", r["condition"], sort_key(r["replicate"]), r["sample_id"])) + + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows) + + +def write_peaks(peaks_rows, out_path): + header = [ + "sample_id", + "group", + "condition", + "caller", + "peaks_path", + "peaks_format", + ] + rows = [] + for row in peaks_rows: + rows.append({ + "sample_id": row.get("sample_id") or "", + "group": row.get("group") or "", + "condition": row.get("condition") or "", + "caller": row.get("caller") or "", + "peaks_path": row.get("peaks_path") or "NA", + "peaks_format": row.get("peaks_format") or "NA", + "replicate": row.get("replicate") or "", + }) + + rows.sort(key=lambda r: (r["group"], r["caller"], r["condition"], sort_key(r["replicate"]), r["sample_id"])) + + with open(out_path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t", extrasaction="ignore") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def main(): + parser = argparse.ArgumentParser(description="Build differential manifests") + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--normalisation-mode", required=True) + parser.add_argument("--ms-coeffs") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + samples_rows = read_table(args.samples) + peaks_rows = read_table(args.peaks) + ms_coeffs = {} + if args.ms_coeffs: + try: + ms_coeffs = load_ms_coeffs(args.ms_coeffs) + except Exception as exc: + print(f"ERROR: {exc}", file=sys.stderr) + sys.exit(1) + + out_samples = os.path.join(args.outdir, "differential_manifest.samples.tsv") + out_peaks = os.path.join(args.outdir, "differential_manifest.peaks.tsv") + + write_samples(samples_rows, ms_coeffs, args.normalisation_mode, out_samples) + write_peaks(peaks_rows, out_peaks) + + +if __name__ == "__main__": + main() diff --git a/bin/differential_summary.py b/bin/differential_summary.py new file mode 100755 index 00000000..0d7a5749 --- /dev/null +++ b/bin/differential_summary.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os + + +def read_tsv(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + +def normalize_status(status): + if not status: + return status + if status.upper() == "SKIPPED": + return "SKIP" + return status + + +def main(): + parser = argparse.ArgumentParser(description="Build differential summary table") + parser.add_argument("--design", required=True) + parser.add_argument("--summaries", nargs="*", default=[]) + parser.add_argument("--manifest-only", action="store_true") + parser.add_argument("--out", required=True) + args = parser.parse_args() + + design_rows = read_tsv(args.design) + + base_rows = {} + for row in design_rows: + group = row.get("group") or "" + caller = row.get("caller") or "NA" + treated = row.get("treated_condition") or "" + control = row.get("control_condition") or "" + status = normalize_status(row.get("status") or "") + reason = row.get("reason") or "" + if args.manifest_only and status == "RUN": + status = "SKIP" + reason = "manifest_only" + + if caller != "NA": + key = ("diffbind", group, caller) + base_rows[key] = { + "method": "diffbind", + "group": group, + "caller": caller, + "treated": treated, + "control": control, + "n_tested": "0", + "n_fdr_pass": "0", + "n_up": "0", + "n_down": "0", + "n_clusters": "NA", + "chosen_minPts": "NA", + "chosen_minSamps": "NA", + "mode": "NA", + "status": status, + "reason": reason, + } + else: + for method in ("chipbinner", "span"): + method_status = status + method_reason = reason + if status == "RUN": + if method == "chipbinner" and (row.get("eligible_chipbinner") or "").lower() != "true": + method_status = "SKIP" + method_reason = "chipbinner_disabled" + if method == "span" and (row.get("eligible_span") or "").lower() != "true": + method_status = "SKIP" + method_reason = "span_disabled" + key = (method, group, "NA") + base_rows[key] = { + "method": method, + "group": group, + "caller": "NA", + "treated": treated, + "control": control, + "n_tested": "0", + "n_fdr_pass": "0", + "n_up": "0", + "n_down": "0", + "n_clusters": "NA", + "chosen_minPts": "NA", + "chosen_minSamps": "NA", + "mode": "NA", + "status": method_status, + "reason": method_reason, + } + + for summary_path in args.summaries: + if not summary_path or not os.path.exists(summary_path): + continue + rows = read_tsv(summary_path) + if not rows: + continue + fname = os.path.basename(summary_path) + if "diffbind" in fname: + method = "diffbind" + elif "chipbinner" in fname: + method = "chipbinner" + elif "span" in fname: + method = "span" + else: + continue + for row in rows: + group = row.get("group") or "" + caller = row.get("caller") or (row.get("caller_id") or "NA") + key = (method, group, caller) + summary_row = { + "method": method, + "group": group, + "caller": caller, + "treated": row.get("treated") or row.get("treated_condition") or "", + "control": row.get("control") or row.get("control_condition") or "", + "n_tested": row.get("n_tested") or row.get("n_bins_tested") or "0", + "n_fdr_pass": row.get("n_fdr_pass") or "0", + "n_up": row.get("n_up") or "0", + "n_down": row.get("n_down") or "0", + "n_clusters": row.get("n_clusters") or "NA", + "chosen_minPts": row.get("chosen_minPts") or "NA", + "chosen_minSamps": row.get("chosen_minSamps") or "NA", + "mode": row.get("mode") or "NA", + "status": normalize_status(row.get("status") or "RUN"), + "reason": row.get("reason") or "ok", + } + existing = base_rows.get(key) + if existing: + existing_status = (existing.get("status") or "").upper() + incoming_status = (summary_row.get("status") or "RUN").upper() + if existing_status not in ("", "RUN") and incoming_status == "RUN": + continue + base_rows[key] = summary_row + + header = [ + "method", + "group", + "caller", + "treated", + "control", + "n_tested", + "n_fdr_pass", + "n_up", + "n_down", + "n_clusters", + "chosen_minPts", + "chosen_minSamps", + "mode", + "status", + "reason", + ] + + rows_sorted = sorted(base_rows.values(), key=lambda r: (r["method"], r["group"], r["caller"])) + + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows_sorted) + + +if __name__ == "__main__": + main() diff --git a/bin/gtf_to_gene_bed.py b/bin/gtf_to_gene_bed.py new file mode 100755 index 00000000..c252dbc6 --- /dev/null +++ b/bin/gtf_to_gene_bed.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +import argparse +import gzip + + +def open_text(path): + if path.endswith(".gz"): + return gzip.open(path, "rt") + return open(path, "r") + + +def parse_attributes(attr_str): + attrs = {} + for raw in attr_str.strip().strip(";").split(";"): + item = raw.strip() + if not item: + continue + if "=" in item and " " not in item: + key, val = item.split("=", 1) + else: + parts = item.split(" ", 1) + if len(parts) < 2: + continue + key, val = parts[0], parts[1] + val = val.strip().strip('"') + attrs[key] = val + return attrs + + +def pick_attr(attrs, keys): + for key in keys: + val = attrs.get(key) + if val: + return val + return "NA" + + +def main(): + parser = argparse.ArgumentParser(description="Create gene BED from GTF") + parser.add_argument("--gtf", required=True) + parser.add_argument("--out", required=True) + parser.add_argument("--feature", default="gene") + args = parser.parse_args() + + with open_text(args.gtf) as handle, open(args.out, "w") as out: + for line in handle: + if not line or line.startswith("#"): + continue + fields = line.rstrip("\n").split("\t") + if len(fields) < 9: + continue + chrom, _source, feature, start, end, _score, _strand, _frame, attrs_str = fields[:9] + if feature != args.feature: + continue + try: + start_i = int(start) + end_i = int(end) + except ValueError: + continue + if end_i < start_i: + start_i, end_i = end_i, start_i + start0 = max(start_i - 1, 0) + + attrs = parse_attributes(attrs_str) + gene_id = pick_attr(attrs, ["gene_id", "ID", "gene", "transcript_id"]) + gene_name = pick_attr(attrs, ["gene_name", "Name", "gene"]) + if gene_id == "NA" and gene_name != "NA": + gene_id = gene_name + + out.write(f"{chrom}\t{start0}\t{end_i}\t{gene_id}\t{gene_name}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/records_to_tsv.py b/bin/records_to_tsv.py new file mode 100755 index 00000000..faea31dc --- /dev/null +++ b/bin/records_to_tsv.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import argparse +import json + + +def main(): + parser = argparse.ArgumentParser(description="Write records JSON to TSV") + parser.add_argument("--records", required=True, help="JSON string of records") + parser.add_argument("--header", required=True, help="Tab-separated header columns") + parser.add_argument("--out", required=True, help="Output TSV path") + args = parser.parse_args() + + records = json.loads(args.records) + header = args.header.split("\t") + + with open(args.out, "w") as handle: + handle.write("\t".join(header) + "\n") + for record in records: + row = [str(record.get(col, "")) for col in header] + handle.write("\t".join(row) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/span_diff.py b/bin/span_diff.py new file mode 100755 index 00000000..55f75331 --- /dev/null +++ b/bin/span_diff.py @@ -0,0 +1,1023 @@ +#!/usr/bin/env python3 +import argparse +import csv +import math +import os +import re +import subprocess +import sys +from typing import Dict, List, Tuple + + +def read_tsv(path: str) -> List[Dict[str, str]]: + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def write_tsv(path: str, header: List[str], rows: List[Dict[str, str]]) -> None: + with open(path, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def parse_list(value: str) -> List[str]: + if not value: + return [] + return [x.strip().lower() for x in value.split(",") if x.strip()] + + +def is_span_caller(name: str) -> bool: + if not name: + return False + lowered = str(name).lower() + return lowered.startswith("span") or lowered.startswith("omnipeak") + + +def safe_float(value: str): + if value is None: + return None + try: + return float(value) + except Exception: + return None + + +def run_cmd(cmd: List[str], check: bool = True) -> subprocess.CompletedProcess: + return subprocess.run(cmd, capture_output=True, text=True, check=check) + + +def load_help_text(jar_path: str, java_heap: str, args: List[str]) -> Tuple[bool, str]: + cmd = ["java", f"-Xmx{java_heap}", "-jar", jar_path] + args + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + except FileNotFoundError: + return False, "java_missing" + output = (result.stdout or "") + (result.stderr or "") + return True, output + + +def detect_signature(jar_path: str, java_heap: str) -> Tuple[bool, str, str]: + ok, help_text = load_help_text(jar_path, java_heap, ["--help"]) + if not ok: + return False, "java_missing", "" + if not help_text: + return False, "help_failed", "" + + lower_help = help_text.lower() + supports_compare = False + if re.search(r"\bcompare\b", lower_help): + supports_compare = True + elif re.search(r"\bdifferential\b", lower_help): + supports_compare = True + + compare_help = "" + if supports_compare: + ok_cmp, cmp_text = load_help_text(jar_path, java_heap, ["compare", "--help"]) + if ok_cmp and cmp_text: + compare_help = cmp_text + lower_cmp = cmp_text.lower() + if re.search(r"unknown command|no such command", lower_cmp): + supports_compare = False + else: + compare_help = help_text + + if not supports_compare: + return False, "no_compare", help_text + + signature = "compare:pooled" + probe_text = compare_help or help_text + for line in probe_text.splitlines(): + lline = line.lower() + if ("treat" in lline or "control" in lline or "case" in lline) and ( + "replicate" in lline + or "replicates" in lline + or "comma" in lline + or "list" in lline + or "bams" in lline + or "bam" in lline + or "files" in lline + ): + signature = "compare:list" + break + if "replicate" in probe_text.lower() or "comma" in probe_text.lower(): + signature = "compare:list" + + return True, signature, probe_text + + +def parse_flags(help_text: str) -> Dict[str, str]: + if not help_text: + return {} + long_flags = set(re.findall(r"--[A-Za-z0-9][A-Za-z0-9_.-]*", help_text)) + short_flags = set(re.findall(r"(? str: + for cand in candidates: + if cand in long_flags or cand in short_flags: + return cand + return "" + + flags = { + "treatment": pick(["--treatment", "--treated", "--treat", "--case", "-t"]), + "control": pick(["--control", "--ctrl", "--background", "--input", "-c"]), + "chrom": pick(["--chrom.sizes", "--chrom_sizes", "--chrom", "--cs"]), + "bin": pick(["--bin", "-b"]), + "gap": pick(["--gap", "-g"]), + "fdr": pick(["--fdr", "--q", "--qvalue"]), + "output": pick(["--peaks", "--output", "--out", "-p", "--prefix"]), + } + flags["output_is_prefix"] = "" + if "--prefix" in long_flags: + flags["output_is_prefix"] = "true" + if flags["output"] in {"--prefix"}: + flags["output_is_prefix"] = "true" + if flags["output"] == "-p": + for line in help_text.splitlines(): + if "-p" in line and "prefix" in line.lower(): + flags["output_is_prefix"] = "true" + break + if flags["output"] == "--peaks": + flags["output_is_prefix"] = "false" + return flags + + +def is_int(value: str) -> bool: + try: + int(value) + return True + except Exception: + return False + + +def read_intervals(path: str) -> List[Tuple[str, int, int]]: + intervals = [] + with open(path, "r") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#") or line.startswith("track") or line.startswith("browser"): + continue + parts = line.split("\t") + if len(parts) < 3: + continue + if not is_int(parts[1]) or not is_int(parts[2]): + continue + start = int(parts[1]) + end = int(parts[2]) + if end <= start: + continue + intervals.append((parts[0], start, end)) + return intervals + + +def merge_intervals(intervals: List[Tuple[str, int, int]]) -> List[Tuple[str, int, int]]: + if not intervals: + return [] + intervals.sort(key=lambda x: (x[0], x[1], x[2])) + merged = [list(intervals[0])] + for chrom, start, end in intervals[1:]: + last = merged[-1] + if chrom != last[0] or start > last[2]: + merged.append([chrom, start, end]) + else: + last[2] = max(last[2], end) + return [tuple(x) for x in merged] + + +def write_bed(path: str, intervals: List[Tuple[str, int, int]]) -> None: + with open(path, "w") as handle: + for chrom, start, end in intervals: + handle.write(f"{chrom}\t{start}\t{end}\n") + + +def parse_native_output(path: str) -> List[Dict[str, str]]: + rows = [] + if not os.path.exists(path): + return rows + with open(path, "r") as handle: + lines = [line.rstrip("\n") for line in handle if line.strip() and not line.startswith("#")] + if not lines: + return rows + + header = None + first = lines[0].split("\t") + lower = [col.lower() for col in first] + if any(col in ("chr", "chrom", "chromosome") for col in lower) and any( + col.startswith("start") or col == "chromstart" for col in lower + ): + header = lower + data_lines = lines[1:] + else: + data_lines = lines + + def idx_for(names: List[str]): + if not header: + return None + for name in names: + if name in header: + return header.index(name) + return None + + idx_chr = idx_for(["chr", "chrom", "chromosome", "seqname"]) if header else 0 + idx_start = idx_for(["start", "chromstart", "begin"]) if header else 1 + idx_end = idx_for(["end", "chromend", "stop"]) if header else 2 + idx_log = idx_for(["log2fc", "logfc", "log2foldchange", "log2_fold_change", "log2foldchange"]) + idx_pval = idx_for(["pval", "pvalue", "p-value", "p.value"]) if header else None + idx_fdr = idx_for(["fdr", "qvalue", "q-value", "padj", "adj.p.val"]) if header else None + + for line in data_lines: + parts = line.split("\t") + if len(parts) < 3: + continue + chr_val = parts[idx_chr] if idx_chr is not None and idx_chr < len(parts) else parts[0] + start_val = parts[idx_start] if idx_start is not None and idx_start < len(parts) else parts[1] + end_val = parts[idx_end] if idx_end is not None and idx_end < len(parts) else parts[2] + + log_val = "NA" + pval_val = "NA" + fdr_val = "NA" + + if header and idx_log is not None and idx_log < len(parts): + log_val = parts[idx_log] + if header and idx_pval is not None and idx_pval < len(parts): + pval_val = parts[idx_pval] + if header and idx_fdr is not None and idx_fdr < len(parts): + fdr_val = parts[idx_fdr] + + if not header: + numeric = [] + for idx in range(3, len(parts)): + val = safe_float(parts[idx]) + if val is not None: + numeric.append((idx, val)) + if numeric: + log_idx, _ = numeric[0] + log_val = parts[log_idx] + for idx, val in numeric[1:]: + if pval_val == "NA" and 0.0 <= val <= 1.0: + pval_val = parts[idx] + continue + if fdr_val == "NA" and 0.0 <= val <= 1.0: + fdr_val = parts[idx] + continue + + rows.append({ + "chr": chr_val, + "start": start_val, + "end": end_val, + "log2FC": log_val, + "pval": pval_val, + "FDR": fdr_val, + }) + return rows + + +def write_summary(path: str, group: str, treated: str, control: str, n_tested: int, n_fdr: int, n_up: int, n_down: int, + mode: str, status: str, reason: str) -> None: + header = [ + "group", + "treated", + "control", + "n_tested", + "n_fdr_pass", + "n_up", + "n_down", + "mode", + "status", + "reason", + ] + rows = [{ + "group": group, + "treated": treated, + "control": control, + "n_tested": str(n_tested), + "n_fdr_pass": str(n_fdr), + "n_up": str(n_up), + "n_down": str(n_down), + "mode": mode, + "status": status, + "reason": reason, + }] + write_tsv(path, header, rows) + + +def write_mode(path: str, mode: str, signature: str, notes: List[str] = None) -> None: + with open(path, "w") as handle: + handle.write(f"{mode}:{signature}\n") + if notes: + for note in notes: + handle.write(f"{note}\n") + + +def filter_bed_by_stats(rows: List[Dict[str, str]], fdr_cutoff: float, direction: str) -> List[Tuple[str, int, int]]: + out = [] + for row in rows: + fdr = safe_float(row.get("FDR")) + log2fc = safe_float(row.get("log2FC")) + if fdr is None or log2fc is None: + continue + if fdr > fdr_cutoff: + continue + if direction == "up" and log2fc <= 0: + continue + if direction == "down" and log2fc >= 0: + continue + if not is_int(row.get("start", "")) or not is_int(row.get("end", "")): + continue + out.append((row["chr"], int(row["start"]), int(row["end"]))) + return out + + +def write_failure_outputs( + outdir: str, + group: str, + treated: str, + control: str, + mode: str, + signature: str, + reason: str, + allow_partial: bool, + samples: List[Dict[str, str]] = None, + use_spikein: bool = False, + backend: str = "NA", +) -> None: + diff_path = os.path.join(outdir, "span.differential.tsv") + bed_path = os.path.join(outdir, "span.differential.peaks.bed") + up_path = os.path.join(outdir, "span.up.bed") + down_path = os.path.join(outdir, "span.down.bed") + summary_path = os.path.join(outdir, "span.summary.tsv") + mode_path = os.path.join(outdir, "span.mode.txt") + normalization_path = os.path.join(outdir, "span.normalization_factors.tsv") + + with open(diff_path, "w") as handle: + handle.write("chr\tstart\tend\tlog2FC\tpval\tFDR\n") + open(bed_path, "w").close() + open(up_path, "w").close() + open(down_path, "w").close() + status = "SKIP" if allow_partial else "FAIL" + write_summary(summary_path, group, treated, control, 0, 0, 0, 0, mode, status, reason) + write_mode(mode_path, mode, signature) + if samples is None: + samples = [] + if samples: + scale_values, size_factors, use_spikein_factors = compute_spikein_factors(samples, use_spikein) + else: + scale_values = [] + size_factors = [] + use_spikein_factors = False + write_normalization_factors( + normalization_path, + samples, + scale_values, + size_factors, + use_spikein_factors, + backend, + ) + + +def compute_orientation_log2fc( + bed_path: str, + treated_bams: List[str], + control_bams: List[str], + pseudocount: float, + treated_size_factors: List[float] = None, + control_size_factors: List[float] = None, +) -> Dict[Tuple[str, str, str], List[float]]: + if not treated_bams or not control_bams: + raise RuntimeError("orientation_missing_bams") + cmd = ["bedtools", "multicov", "-bed", bed_path, "-bams"] + treated_bams + control_bams + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "bedtools_multicov_failed") + + def apply_size_factors(counts: List[float], factors: List[float]) -> List[float]: + if not factors or len(factors) != len(counts): + return counts + adjusted = [] + for count, factor in zip(counts, factors): + if factor is None or factor == 0: + factor = 1.0 + adjusted.append(count / factor) + return adjusted + + orientation = {} + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split("\t") + if len(parts) < 3: + continue + key = (parts[0], parts[1], parts[2]) + counts = [safe_float(val) or 0.0 for val in parts[3:]] + t_counts = counts[: len(treated_bams)] + c_counts = counts[len(treated_bams) :] + t_counts = apply_size_factors(t_counts, treated_size_factors) + c_counts = apply_size_factors(c_counts, control_size_factors) + if not t_counts or not c_counts: + continue + t_mean = sum(t_counts) / len(t_counts) + c_mean = sum(c_counts) / len(c_counts) + log2fc = math.log2((t_mean + pseudocount) / (c_mean + pseudocount)) + orientation.setdefault(key, []).append(log2fc) + return orientation + + +def compute_spikein_factors(samples: List[Dict[str, str]], use_spikein: bool): + scale_values = [] + size_factors = [] + numeric_present = False + for row in samples: + raw = row.get("spikein_scale_factor", "NA") + scale_values.append(raw) + numeric = None + try: + numeric = float(raw) + numeric_present = True + except Exception: + numeric = None + + if use_spikein: + if numeric is None or numeric == 0: + size = 1.0 + else: + size = 1.0 / numeric + else: + size = None + size_factors.append(size) + + use_spikein_factors = use_spikein and numeric_present + return scale_values, size_factors, use_spikein_factors + + +def write_normalization_factors( + path: str, + samples: List[Dict[str, str]], + scale_values: List[str], + size_factors: List[float], + use_spikein_factors: bool, + backend: str, +) -> None: + with open(path, "w") as handle: + handle.write("sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tused_by_backend\n") + for row, scale_val, size_val in zip(samples, scale_values, size_factors): + used_by_backend = "spikein" if use_spikein_factors else backend + size_str = "NA" + if use_spikein_factors: + size = size_val if size_val is not None else 1.0 + size_str = f"{size:.6f}" + handle.write( + f"{row.get('sample_id','')}\t{row.get('group','')}\t{row.get('condition','')}\t{scale_val}\t{size_str}\t{used_by_backend}\n" + ) + + +def main(): + parser = argparse.ArgumentParser(description="SPAN/OmniPeak differential runner") + parser.add_argument("--jar", required=True) + parser.add_argument("--mode", required=True) + parser.add_argument("--contrast", required=True) + parser.add_argument("--group", required=True) + parser.add_argument("--samples", required=True) + parser.add_argument("--peaks", required=True) + parser.add_argument("--caller-priority", default="") + parser.add_argument("--chrom-sizes") + parser.add_argument("--bin", type=int, default=200) + parser.add_argument("--gap", type=int, default=5) + parser.add_argument("--fdr", type=float, default=0.05) + parser.add_argument("--fallback-backend", default="DESeq2") + parser.add_argument("--use-spikein", default="false") + parser.add_argument("--allow-partial", default="false") + parser.add_argument("--java-heap", default="8G") + parser.add_argument("--cpus", type=int, default=1) + parser.add_argument("--pooling-dir", default="") + parser.add_argument("--outdir", default=".") + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + + contrast = [x.strip() for x in args.contrast.split(",") if x.strip()] + treated = contrast[0] if len(contrast) > 0 else "treated" + control = contrast[1] if len(contrast) > 1 else "control" + + use_spikein = str(args.use_spikein).lower() in {"true", "1", "yes"} + allow_partial = str(args.allow_partial).lower() in {"true", "1", "yes"} + mode = str(args.mode or "").strip().lower() + + native_supported, signature, help_text = detect_signature(args.jar, args.java_heap) + + valid_modes = {"auto", "native", "fallback"} + if mode not in valid_modes: + reason = f"invalid_mode:{mode}" if mode else "invalid_mode" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + mode or "invalid", + signature, + reason, + allow_partial, + samples=None, + use_spikein=use_spikein, + backend=mode or "invalid", + ) + if allow_partial: + sys.exit(0) + print("ERROR: --mode must be one of auto, native, or fallback.", file=sys.stderr) + sys.exit(1) + + chosen_mode = "fallback" + if mode == "native": + if not native_supported: + reason = "native_not_supported" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + "native", + signature, + reason, + allow_partial, + samples=None, + use_spikein=use_spikein, + backend="native", + ) + if allow_partial: + sys.exit(0) + print("ERROR: SPAN jar does not support native compare.", file=sys.stderr) + sys.exit(1) + chosen_mode = "native" + elif mode == "fallback": + chosen_mode = "fallback" + else: + chosen_mode = "native" if native_supported else "fallback" + + samples = read_tsv(args.samples) + peaks = read_tsv(args.peaks) + + samples_group = [row for row in samples if row.get("group") == args.group and row.get("condition") in {treated, control}] + if not samples_group: + reason = "no_samples" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print("ERROR: No samples found for group.", file=sys.stderr) + sys.exit(1) + + peaks_group = [row for row in peaks if row.get("group") == args.group and row.get("condition") in {treated, control}] + span_peaks = [row for row in peaks_group if is_span_caller(row.get("caller"))] + if not span_peaks: + reason = "no_span_peaks" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print("ERROR: No SPAN/OmniPeak peaks found for group.", file=sys.stderr) + sys.exit(1) + + priority = [x for x in parse_list(args.caller_priority) if is_span_caller(x)] + available_callers = sorted({(row.get("caller") or "").lower() for row in span_peaks}) + chosen_caller = None + for cand in priority: + if cand in available_callers: + chosen_caller = cand + break + if not chosen_caller: + chosen_caller = available_callers[0] + if len(available_callers) > 1: + print( + f"WARNING: Multiple SPAN callers for group {args.group}; using '{chosen_caller}'.", + file=sys.stderr, + ) + + peaks_by_sample = {} + for row in sorted(span_peaks, key=lambda r: (r.get("sample_id"), r.get("caller"), r.get("peaks_path"))): + if (row.get("caller") or "").lower() != chosen_caller: + continue + sample_id = row.get("sample_id") + if sample_id and sample_id not in peaks_by_sample: + peaks_by_sample[sample_id] = row.get("peaks_path") + + missing_peaks = [row.get("sample_id") for row in samples_group if row.get("sample_id") not in peaks_by_sample] + if missing_peaks: + reason = "missing_peaks" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_group, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print(f"ERROR: Missing peaks for samples: {','.join(missing_peaks)}", file=sys.stderr) + sys.exit(1) + + samples_sorted = sorted(samples_group, key=lambda r: (r.get("condition"), r.get("replicate"), r.get("sample_id"))) + treated_samples = [row for row in samples_sorted if row.get("condition") == treated] + control_samples = [row for row in samples_sorted if row.get("condition") == control] + if not treated_samples or not control_samples: + reason = "missing_condition" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print("ERROR: Missing treated/control samples.", file=sys.stderr) + sys.exit(1) + + spikein_scale_values, spikein_size_factors, use_spikein_factors = compute_spikein_factors(samples_sorted, use_spikein) + bam_size_factors = {} + for row, size_factor in zip(samples_sorted, spikein_size_factors): + bam = row.get("final_bam") + if bam: + bam_size_factors[bam] = size_factor if size_factor is not None else 1.0 + + diff_path = os.path.join(args.outdir, "span.differential.tsv") + bed_path = os.path.join(args.outdir, "span.differential.peaks.bed") + up_path = os.path.join(args.outdir, "span.up.bed") + down_path = os.path.join(args.outdir, "span.down.bed") + summary_path = os.path.join(args.outdir, "span.summary.tsv") + mode_path = os.path.join(args.outdir, "span.mode.txt") + pooling_path = os.path.join(args.outdir, "span_diff_target_pooling.tsv") + + def finalize_success(rows: List[Dict[str, str]], mode: str, status: str = "RUN", reason: str = "ok"): + n_tested = len(rows) + n_fdr = 0 + n_up = 0 + n_down = 0 + for row in rows: + fdr_val = safe_float(row.get("FDR")) + log_val = safe_float(row.get("log2FC")) + if fdr_val is None or log_val is None: + continue + if fdr_val <= args.fdr: + n_fdr += 1 + if log_val > 0: + n_up += 1 + elif log_val < 0: + n_down += 1 + write_summary(summary_path, args.group, treated, control, n_tested, n_fdr, n_up, n_down, mode, status, reason) + + try: + if chosen_mode == "native": + if not native_supported: + reason = "native_not_supported" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print("ERROR: Native SPAN compare not supported.", file=sys.stderr) + sys.exit(1) + + flags = parse_flags(help_text) + treat_flag = flags.get("treatment") or "-t" + ctrl_flag = flags.get("control") or "-c" + chrom_flag = flags.get("chrom") or "--cs" + bin_flag = flags.get("bin") or "--bin" + gap_flag = flags.get("gap") or "--gap" + fdr_flag = flags.get("fdr") or "--fdr" + out_flag = flags.get("output") or "-p" + out_is_prefix = flags.get("output_is_prefix") == "true" + + if not args.chrom_sizes or not os.path.exists(args.chrom_sizes): + reason = "missing_chrom_sizes" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print("ERROR: Chrom sizes file required for native SPAN compare.", file=sys.stderr) + sys.exit(1) + + treated_bams = [row.get("final_bam") for row in treated_samples] + control_bams = [row.get("final_bam") for row in control_samples] + for bam in treated_bams + control_bams: + if not bam or not os.path.exists(bam): + raise RuntimeError(f"Missing BAM: {bam}") + + use_lists = signature == "compare:list" + pooled_rows = [] + + def pool_bams(condition: str, bams: List[str]) -> str: + pooled_name = f"{args.group}.{condition}.pooled.bam" + pooled_path = os.path.join(args.outdir, pooled_name) + sorted_bams = sorted(bams) + cmd = ["samtools", "merge", "-f", "-@", str(max(args.cpus, 1)), pooled_path] + sorted_bams + run_cmd(cmd, check=True) + run_cmd(["samtools", "index", pooled_path], check=True) + if use_spikein_factors: + size_vals = [bam_size_factors.get(bam, 1.0) for bam in sorted_bams] + pooled_size = sum(size_vals) / len(size_vals) if size_vals else 1.0 + bam_size_factors[pooled_path] = pooled_size + pooled_manifest_path = pooled_path + if args.pooling_dir: + pooled_manifest_path = os.path.join(args.pooling_dir, pooled_name) + pooled_rows.append({ + "group": args.group, + "condition": condition, + "pooled_bam": pooled_manifest_path, + "source_bams": ",".join(sorted_bams), + }) + return pooled_path + + treated_arg = "" + control_arg = "" + compare_treated_bams = [] + compare_control_bams = [] + if use_lists: + treated_arg = ",".join(treated_bams) + control_arg = ",".join(control_bams) + compare_treated_bams = treated_bams + compare_control_bams = control_bams + else: + if len(treated_bams) > 1: + treated_arg = pool_bams(treated, treated_bams) + else: + treated_arg = treated_bams[0] + if len(control_bams) > 1: + control_arg = pool_bams(control, control_bams) + else: + control_arg = control_bams[0] + compare_treated_bams = [treated_arg] + compare_control_bams = [control_arg] + + output_target = "span.native" + output_path = output_target + if out_is_prefix: + output_path = output_target + else: + output_path = f"{output_target}.peak" + + cmd = [ + "java", + f"-Xmx{args.java_heap}", + "-jar", + args.jar, + "compare", + treat_flag, + treated_arg, + ctrl_flag, + control_arg, + chrom_flag, + args.chrom_sizes, + bin_flag, + str(args.bin), + gap_flag, + str(args.gap), + fdr_flag, + str(args.fdr), + out_flag, + output_path, + ] + + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "native_compare_failed") + + native_output = None + if out_is_prefix: + candidates = [f for f in os.listdir(args.outdir) if f.startswith(output_target)] + candidates = [c for c in candidates if c.endswith(".peak") or c.endswith(".bed") or c.endswith(".tsv")] + candidates.sort() + if candidates: + native_output = os.path.join(args.outdir, candidates[0]) + else: + native_output = os.path.join(args.outdir, output_path) + + if not native_output or not os.path.exists(native_output): + raise RuntimeError("native_output_missing") + + rows = parse_native_output(native_output) + if not rows: + raise RuntimeError("native_output_empty") + + bed_rows = [(r["chr"], int(r["start"]), int(r["end"])) for r in rows if is_int(r["start"]) and is_int(r["end"])] + bed_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(bed_path, bed_rows) + + pseudocount = 0.5 + treated_size_factors = None + control_size_factors = None + if use_spikein_factors: + treated_size_factors = [bam_size_factors.get(bam, 1.0) for bam in compare_treated_bams] + control_size_factors = [bam_size_factors.get(bam, 1.0) for bam in compare_control_bams] + orientation_map = compute_orientation_log2fc( + bed_path, + compare_treated_bams, + compare_control_bams, + pseudocount, + treated_size_factors, + control_size_factors, + ) + for row in rows: + key = (row.get("chr"), row.get("start"), row.get("end")) + values = orientation_map.get(key, []) + log2fc_user = values.pop(0) if values else None + row["log2FC_span"] = row.get("log2FC", "NA") + row["log2FC"] = f"{log2fc_user:.6f}" if log2fc_user is not None else "NA" + + write_tsv(diff_path, ["chr", "start", "end", "log2FC", "log2FC_span", "pval", "FDR"], rows) + + up_rows = filter_bed_by_stats(rows, args.fdr, "up") + down_rows = filter_bed_by_stats(rows, args.fdr, "down") + up_rows.sort(key=lambda x: (x[0], x[1], x[2])) + down_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(up_path, up_rows) + write_bed(down_path, down_rows) + + normalization_path = os.path.join(args.outdir, "span.normalization_factors.tsv") + write_normalization_factors( + normalization_path, + samples_sorted, + spikein_scale_values, + spikein_size_factors, + use_spikein_factors, + "native", + ) + print(f"INFO: wrote normalization factors to {normalization_path}", file=sys.stderr) + + if pooled_rows: + write_tsv( + pooling_path, + ["group", "condition", "pooled_bam", "source_bams"], + pooled_rows, + ) + + write_mode( + mode_path, + "native", + signature, + notes=[f"orientation=treated/control", f"orientation_pseudocount={pseudocount}"], + ) + finalize_success(rows, "native") + return + + # Fallback path + all_intervals = [] + for row in samples_sorted: + peak_path = peaks_by_sample.get(row.get("sample_id")) + if not peak_path or not os.path.exists(peak_path): + raise RuntimeError(f"Missing peaks file: {peak_path}") + all_intervals.extend(read_intervals(peak_path)) + + merged = merge_intervals(all_intervals) + if not merged: + raise RuntimeError("no_regions") + + write_bed(bed_path, merged) + + # counts + sample_ids = [row.get("sample_id") for row in samples_sorted] + bams = [row.get("final_bam") for row in samples_sorted] + for bam in bams: + if not bam or not os.path.exists(bam): + raise RuntimeError(f"Missing BAM: {bam}") + + counts_path = os.path.join(args.outdir, "span.counts.tsv") + cmd = ["bedtools", "multicov", "-bed", bed_path, "-bams"] + bams + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "bedtools_multicov_failed") + with open(counts_path, "w") as handle: + handle.write("chr\tstart\tend\t" + "\t".join(sample_ids) + "\n") + handle.write(result.stdout) + + samples_meta_path = os.path.join(args.outdir, "span.samples.tsv") + with open(samples_meta_path, "w") as handle: + handle.write("sample_id\tcondition\tspikein_scale_factor\n") + for row in samples_sorted: + handle.write( + f"{row.get('sample_id','')}\t{row.get('condition','')}\t{row.get('spikein_scale_factor','NA')}\n" + ) + + normalization_path = os.path.join(args.outdir, "span.normalization_factors.tsv") + write_normalization_factors( + normalization_path, + samples_sorted, + spikein_scale_values, + spikein_size_factors, + use_spikein_factors, + args.fallback_backend, + ) + print(f"INFO: wrote normalization factors to {normalization_path}", file=sys.stderr) + + r_script = os.path.join(os.path.dirname(__file__), "span_fallback_de.R") + cmd = [ + "Rscript", + r_script, + "--counts", + counts_path, + "--samples", + samples_meta_path, + "--contrast", + args.contrast, + "--backend", + args.fallback_backend, + "--use-spikein", + "true" if use_spikein else "false", + "--out", + diff_path, + ] + result = run_cmd(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "fallback_de_failed") + + rows = parse_native_output(diff_path) + if not rows: + raise RuntimeError("fallback_output_empty") + + up_rows = filter_bed_by_stats(rows, args.fdr, "up") + down_rows = filter_bed_by_stats(rows, args.fdr, "down") + up_rows.sort(key=lambda x: (x[0], x[1], x[2])) + down_rows.sort(key=lambda x: (x[0], x[1], x[2])) + write_bed(up_path, up_rows) + write_bed(down_path, down_rows) + + write_mode(mode_path, "fallback", signature) + finalize_success(rows, "fallback") + except Exception as exc: + reason = str(exc).split("\n")[0] if exc else "failed" + write_failure_outputs( + args.outdir, + args.group, + treated, + control, + chosen_mode, + signature, + reason, + allow_partial, + samples=samples_sorted if "samples_sorted" in locals() else None, + use_spikein=use_spikein, + backend=chosen_mode, + ) + if allow_partial: + sys.exit(0) + print(f"ERROR: SPAN diff failed: {exc}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bin/span_fallback_de.R b/bin/span_fallback_de.R new file mode 100755 index 00000000..25ccf23d --- /dev/null +++ b/bin/span_fallback_de.R @@ -0,0 +1,143 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +parse_args <- function(args) { + res <- list() + i <- 1 + while (i <= length(args)) { + key <- args[i] + if (grepl("^--", key)) { + name <- sub("^--", "", key) + if (i == length(args) || grepl("^--", args[i + 1])) { + res[[name]] <- TRUE + i <- i + 1 + } else { + res[[name]] <- args[i + 1] + i <- i + 2 + } + } else { + i <- i + 1 + } + } + res +} + +as_bool <- function(val) { + if (is.null(val)) { + return(FALSE) + } + if (is.logical(val)) { + return(val) + } + val <- tolower(as.character(val)) + return(val %in% c("true", "1", "yes")) +} + +params <- parse_args(args) +counts_path <- params[["counts"]] +samples_path <- params[["samples"]] +contrast <- params[["contrast"]] +backend <- params[["backend"]] +use_spikein <- as_bool(params[["use-spikein"]]) +out_path <- params[["out"]] + +if (is.null(counts_path) || is.null(samples_path) || is.null(out_path) || is.null(contrast)) { + stop("Missing required arguments: --counts, --samples, --contrast, --out") +} + +contrast_parts <- strsplit(contrast, ",")[[1]] +contrast_parts <- trimws(contrast_parts) +if (length(contrast_parts) < 2) { + stop("Contrast must contain treated,control") +} +treated <- contrast_parts[1] +control <- contrast_parts[2] + +counts <- read.delim(counts_path, check.names = FALSE, stringsAsFactors = FALSE) +if (nrow(counts) == 0) { + write.table(data.frame(chr=character(), start=integer(), end=integer(), log2FC=numeric(), pval=numeric(), FDR=numeric()), + file=out_path, sep="\t", quote=FALSE, row.names=FALSE) + quit(save = "no") +} + +region <- counts[, 1:3] +count_mat <- as.matrix(counts[, 4:ncol(counts), drop = FALSE]) +colnames(count_mat) <- colnames(counts)[4:ncol(counts)] + +samples <- read.delim(samples_path, stringsAsFactors = FALSE) +if (!"sample_id" %in% colnames(samples)) { + stop("Samples file must contain sample_id") +} + +sample_order <- match(colnames(count_mat), samples$sample_id) +if (any(is.na(sample_order))) { + stop("Counts contain sample IDs missing from samples metadata") +} + +samples <- samples[sample_order, ] +coldata <- data.frame(condition = factor(samples$condition, levels = c(control, treated))) +rownames(coldata) <- samples$sample_id + +size_factors <- NULL +if (use_spikein && "spikein_scale_factor" %in% colnames(samples)) { + factors <- suppressWarnings(as.numeric(samples$spikein_scale_factor)) + if (any(!is.na(factors))) { + size_factors <- ifelse(factors == 0, 1, 1 / factors) + size_factors[is.na(size_factors)] <- 1 + size_factors[!is.finite(size_factors)] <- 1 + } +} + +if (is.null(backend)) { + backend <- "DESeq2" +} + +backend <- as.character(backend) + +if (backend == "edgeR") { + if (!requireNamespace("edgeR", quietly = TRUE)) { + stop("edgeR package not available") + } + y <- edgeR::DGEList(counts = count_mat, group = coldata$condition) + if (!is.null(size_factors)) { + norm <- size_factors / exp(mean(log(size_factors))) + y$samples$norm.factors <- norm + } else { + y <- edgeR::calcNormFactors(y, method = "TMM") + } + design <- model.matrix(~coldata$condition) + y <- edgeR::estimateDisp(y, design) + fit <- edgeR::glmQLFit(y, design) + res <- edgeR::glmQLFTest(fit, coef = 2) + tbl <- res$table + tbl$FDR <- p.adjust(tbl$PValue, method = "BH") + log2fc <- tbl$logFC + pval <- tbl$PValue + fdr <- tbl$FDR +} else { + if (!requireNamespace("DESeq2", quietly = TRUE)) { + stop("DESeq2 package not available") + } + dds <- DESeq2::DESeqDataSetFromMatrix(countData = count_mat, colData = coldata, design = ~condition) + if (!is.null(size_factors)) { + DESeq2::sizeFactors(dds) <- size_factors + } + dds <- DESeq2::DESeq(dds) + res <- DESeq2::results(dds, contrast = c("condition", treated, control)) + log2fc <- res$log2FoldChange + pval <- res$pvalue + fdr <- res$padj +} + +out <- data.frame( + chr = region[[1]], + start = region[[2]], + end = region[[3]], + log2FC = log2fc, + pval = pval, + FDR = fdr, + stringsAsFactors = FALSE +) + +write.table(out, file = out_path, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/span_pooling_manifest.py b/bin/span_pooling_manifest.py new file mode 100755 index 00000000..47fde593 --- /dev/null +++ b/bin/span_pooling_manifest.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os + + +def read_rows(path): + with open(path, "r", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + return [row for row in reader] + + +def main(): + parser = argparse.ArgumentParser(description="Merge SPAN pooling manifests") + parser.add_argument("--inputs", nargs="+", required=True) + parser.add_argument("--out", required=True) + args = parser.parse_args() + + rows = [] + for path in args.inputs: + if not path or not os.path.exists(path): + continue + rows.extend(read_rows(path)) + + if not rows: + return + + rows.sort(key=lambda r: (r.get("group") or "", r.get("condition") or "", r.get("pooled_bam") or "")) + + header = ["group", "condition", "pooled_bam", "source_bams"] + with open(args.out, "w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=header, delimiter="\t") + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 57726de9..56cbc8ce 100644 --- a/conf/base.config +++ b/conf/base.config @@ -49,6 +49,26 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } + withLabel:process_diffbind { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } + withLabel:process_chipbinner { + cpus = { check_max( 8, 'cpus' ) } + memory = { check_max( 32.GB, 'memory' ) } + } + withLabel:process_span { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } + withLabel:SPAN_NATIVE { + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 8.GB, 'memory' ) } + } + withLabel:SPAN_FALLBACK { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 16.GB, 'memory' ) } + } withLabel:error_ignore { errorStrategy = 'ignore' } diff --git a/conf/modules.config b/conf/modules.config index 676eb44a..fe9690f1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -766,12 +766,12 @@ if(params.run_peak_calling && params.callers.any { it.startsWith('span') }) { } } -if(params.run_peak_calling && params.callers.any { it.startsWith('epic2') || it.startsWith('span') }) { +if((params.run_peak_calling && params.callers.any { it.startsWith('epic2') || it.startsWith('span') }) || (params.run_chipbinner && params.chipbinner_use_input)) { process { withName: '.*:CUTANDRUN:.*POOL_IGG_CONTROLS' { ext.prefix = { "${meta.id}" } publishDir = [ - path: { "${params.outdir}/03_peak_calling/01_pooled_controls" }, + path: { task.ext.publish_dir ?: "${params.outdir}/03_peak_calling/01_pooled_controls" }, mode: "${params.publish_dir_mode}", pattern: "*.{bam,bam.bai}", enabled: true @@ -1149,3 +1149,132 @@ if (params.run_multiqc) { } } } + +/* +======================================================================================== + DIFFERENTIAL ANALYSIS +======================================================================================== +*/ + +if(params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only) { + process { + withName: '.*:DIFFERENTIAL_MANIFESTS' { + publishDir = [ + enabled: false + ] + } + + withName: '.*:PUBLISH_DIFFERENTIAL_MANIFESTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DIFFERENTIAL_DESIGN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:CACHE_DIFFERENTIAL_GENE_BED' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:RECORDS_TO_TSV' { + publishDir = [ + enabled: false + ] + } + + withName: '.*:DIFFBIND_RUN' { + publishDir = [ + [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/${caller}/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> + if (filename.equals('versions.yml')) return null + if (filename.equals('diffbind.samplesheet.export.csv')) return null + return filename + } + ], + [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, + enabled: false, + mode: "${params.publish_dir_mode}" + ] + ] + } + + withName: '.*:EXPORT_DIFFBIND_SAMPLESHEETS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename } + ] + } + + withName: '.*:CHIPBINNER_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/02_chipbinner/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:CHIPBINNER_WINDOWS_CACHE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests/chipbinner_windows_cache" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:SPAN_DIFF_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/03_span/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:SPAN_POOLING_MANIFEST' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:ANNOTATE_REGIONS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DIFFERENTIAL_SUMMARY' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:MULTIQC_DIFFERENTIAL' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/08_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md new file mode 100644 index 00000000..761a6a20 --- /dev/null +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -0,0 +1,721 @@ +# SPEC_DIFFERENTIAL_PEAK_CALLING.md - second version + +Status: Draft +Version: 1.0 +Date: 2026-01-04 +Branch target: differential_peak_calling +Applies to: nf-core/cutandrun (DSL2) fork + +Table of contents + 1. Purpose and scope + 2. Definitions and conventions + 3. User-facing behavior + 4. Experimental design rules and validation + 5. Parameters + 6. Manifests (integrated + posthoc reproducibility) + 7. Workflow architecture (DSL2) + 8. Method specs + • 8.1 DiffBind + • 8.2 ChIPBinner + • 8.3 SPAN / OmniPeak differential + 9. Shared annotation layer + 10. MultiQC integration + 11. Output layout + 12. Resources and performance + 13. Testing strategy + 14. Acceptance criteria + 15. Out of scope (v1) + +⸻ + +1. Purpose and scope + +Add an opt‑in differential enrichment / peak analysis capability to the pipeline, supporting three complementary methods: + • DiffBind: peak-centric differential binding, run per (group × caller). + • ChIPBinner: genome-wide binning + clustering + ROTS differential (broad/global changes), run per group. ChIPbinner is designed for broad histone mark analysis via uniform binning. + • SPAN/OmniPeak: native compare differential when supported by the jar; otherwise a deterministic fallback producing a comparable differential table. SPAN documents an experimental compare command with treatment/control replicates and parameters such as --bin, --gap, and --fdr. + +Key goals: + • Respect existing sample metadata: group, condition, replicate, caller, sample_id/id. + • Keep upstream peak calling outputs immutable. + • Ensure the same differential codepath runs in both: + • Integrated mode (during a normal run, using channels) + • Posthoc differential-only mode (reconstructing channels from manifests) + • Provide deterministic outputs and MultiQC summaries for reproducibility. + +⸻ + +2. Definitions and conventions + +2.1 Metadata fields + +The pipeline already uses a condition-aware metadata model. Differential analysis assumes each sample has: + • sample_id (unique sample key; may be meta.id internally) + • group (mark / target / antibody group) + • condition (biological condition label) + • replicate (biological replicate identifier) + • caller (peak caller variant used to generate the sample’s peaks) + +2.2 Contrast direction + +All methods must report: + • log2FC = log2(treated / control) + +The user provides the mapping via: + • --differential_contrast "treated,control" + +2.3 Spike-in scaling convention + +If spike-in normalization is enabled upstream, the pipeline may emit per-sample scale factors used for coverage scaling. When used in statistical models that expect size factors (e.g., DESeq2/edgeR), the differential module must convert scale factors to a consistent modeling convention: + • size_factor = 1 / scale_factor + +The module must export the exact values used for transparency. + +DiffBind supports providing user-defined normalization factors via dba.normalize(). + +⸻ + +3. User-facing behavior + +3.1 Execution modes + +Integrated mode (default when enabled) +Triggered when any of: + • --run_diffbind + • --run_chipbinner + • --run_span_diff + +and --differential_from_run is not set. + +Behavior: + • Differential subworkflow consumes in-memory channels emitted by upstream steps (final BAMs, peaks, optional bigWigs, scale factors). + • It also writes deterministic manifests under the current --outdir to enable future posthoc runs. + +Posthoc differential-only mode +Triggered when: + • --differential_from_run is provided + • and Nextflow entrypoint -entry DIFFERENTIAL_ONLY is used. + +Behavior: + • Skip all upstream computation. + • Load manifests from /03_peak_calling/08_differential/00_manifests/. + • Reconstruct the same logical channels as integrated mode. + • Write new results to the current --outdir (never mutating ). + +3.2 Typical usage patterns + • Integrated: + • Enable one or more methods and provide a contrast. + • Posthoc: + • Use the prior run’s manifests and optionally change method parameters (e.g., FDR/lfc cutoffs). + +3.3 Selective execution (performance) + +The module supports restricting work to subsets: + • --differential_groups "H3K27me3,H3K4me3" (comma-separated) + • --differential_callers "SEACR,MACS2" (comma-separated caller IDs matching meta.caller) + +If unset, run all eligible groups/callers. + +⸻ + +4. Experimental design rules and validation + +4.1 Global requirements + +If any differential method is enabled: + • --differential_contrast is required and must contain exactly two comma-separated labels. + +If contrast labels don’t exist in any sample’s condition: + • Fail immediately with a clear message. + +4.2 Per-group eligibility + +For each group: + • Eligible samples are those where condition ∈ {treated, control}. + • Other conditions may exist in the dataset; they are ignored for the differential run, and recorded as ignored_conditions in the design manifest. + +A group is eligible if: + • Both contrast conditions exist for the group, and + • Each condition has ≥ --differential_min_replicates samples. + +If a group lacks one contrast condition: + • Mark as SKIPPED (not an error) and record the reason. + +If a group has both conditions but insufficient replicates: + • Default behavior: FAIL (to avoid silently producing statistically invalid results). + • If --differential_allow_partial is true: mark as SKIPPED and record the reason. + +4.3 Per-caller eligibility (DiffBind) + +DiffBind is evaluated per (group, caller): + • The same group-level replicate rules apply. + • Additionally, the peaks file for each sample must exist for that caller. + • If peaks are missing for some samples for a given caller: + • Mark (group,caller) as skipped (or fail in strict mode). + +⸻ + +5. Parameters + +All parameters must be added to: + • nextflow_schema.json + • nextflow.config defaults + • docs/usage.md + docs/output.md (fork docs) + • Module meta.yml (if following nf-core module conventions) + +5.1 Global toggles and mode + +Parameter Type Default Notes +--run_diffbind bool false Enable DiffBind method. +--run_chipbinner bool false Enable ChIPBinner method. +--run_span_diff bool false Enable SPAN/OmniPeak differential method. +--differential_contrast string (required if any run_ true)* "treated,control", defines log2FC direction. +--differential_min_replicates int 2 Minimum replicates per condition per group. +--differential_use_spikein enum auto `auto +--differential_allow_partial bool false If true, skip invalid group/caller comparisons instead of failing the run. +--differential_from_run path null Enables posthoc mode when using -entry DIFFERENTIAL_ONLY. +--differential_publish_manifest_only bool false Generate manifests + design tables only (no heavy method runs). +--differential_groups string null Comma-separated allowlist of groups to run. +--differential_callers string null Comma-separated allowlist of callers (DiffBind scope). + +5.2 Normalization inputs + +Parameter Type Default Notes +--chipbinner_ms_coeffs path null Optional MS-based coefficients for ChIPBinner scaling. +--dump_scale_factors bool (existing) If upstream already supports dumping spike-in factors, manifests must reference published paths. + +5.3 DiffBind parameters + +Parameter Type Default Notes +--diffbind_fdr float 0.05 FDR cutoff. +--diffbind_lfc float 1.0 Absolute log2FC threshold for “up/down”. +--diffbind_min_overlap int 2 Min overlap for consensus counting. +--diffbind_backend enum DESeq2 `DESeq2 +--diffbind_recenter_peaks bool false If true, use summit-based recentering. +--diffbind_summits int 0 0 = full peaks; otherwise summit window width. +--diffbind_norm_method enum native `native +--diffbind_extra_params path null JSON/YAML passthrough for advanced runner args. +--export_diffbind_sheets bool true Write sample sheets to output. + +5.4 ChIPBinner parameters + +Parameter Type Default Notes +--chipbinner_bin_size int 10000 Allowed: 1000,5000,10000,25000. +--chipbinner_windows_dir path null Prebuilt BED windows; otherwise generate. +--chipbinner_use_input bool false Optional background usage (never treated as a condition). +--chipbinner_pseudocount int 1 Added before log transforms. +--chipbinner_hdbscan_grid_minpts string "100,200,500,1000" Grid values (comma-separated). +--chipbinner_hdbscan_grid_minsamps string "100,200,500,1000" Grid values (comma-separated). +--chipbinner_fdr float 0.05 FDR cutoff for ROTS results. +--chipbinner_lfc float 1.0 Absolute log2FC cutoff for labeling. +--chipbinner_bootstrap int 1000 ROTS bootstrap iterations. +--chipbinner_k_value int 100000 ROTS K parameter. +--chipbinner_functional_db path null Optional enrichment resources (LOLA). + +5.5 SPAN / OmniPeak differential parameters + +Parameter Type Default Notes +--span_diff_mode enum auto `auto +--span_diff_fdr float 0.05 FDR cutoff. +--span_diff_gap int 5 Gap parameter (native). +--span_diff_bin int 200 Bin size (native). +--span_diff_java_heap string 8G Java heap for jar. +--span_fallback_backend enum DESeq2 `DESeq2 +--omnipeaks_jar path (required if run_span_diff) Jar used for SPAN/OmniPeak operations. + + +⸻ + +6. Manifests + +6.1 Location + +All manifests are written in integrated mode to: + +03_peak_calling/08_differential/00_manifests/ + +and are consumed in posthoc mode from the prior run’s outdir. + +6.2 Determinism requirements + • Always sorted deterministically (lexicographic by group, caller, condition, replicate, sample_id). + • Use published output paths (paths that exist under --outdir), never work directory paths. + • Always written if any differential method is enabled, even if all comparisons are skipped. + +6.3 Manifest files and schemas + +differential_manifest.samples.tsv +One row per target sample. + + Columns: + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (numeric or NA) + • ms_coeff (numeric or NA) + • bigwig_path (path or NA) + • input_bam (path or NA) + • input_bai (path or NA) + +differential_manifest.peaks.tsv +One row per (sample_id × caller) peak file. + +Columns: + • sample_id + • group + • condition + • caller + • peaks_path + • peaks_format (e.g., bed|narrowPeak|broadPeak|peak) + +differential_manifest.design.tsv +One row per analysis unit, capturing eligibility and reasons: + +Columns: + • group + • caller (for DiffBind rows; NA for group-only methods) + • treated_condition + • control_condition + • n_treated + • n_control + • eligible_diffbind (true|false) + • eligible_chipbinner (true|false) + • eligible_span (true|false) + • status (RUN|SKIP|FAIL) + • reason (free text; stable short codes preferred) + • ignored_conditions (comma-separated list or NA) + +span_diff_target_pooling.tsv (optional) +Only emitted if the jar does not support replicate lists and the pipeline had to pool replicates. + +Columns: + • group + • condition + • pooled_bam + • source_bams (comma-separated) + +⸻ + +7. Workflow architecture (DSL2) + +7.1 New subworkflow + +Create: + +subworkflows/local/differential_peak_calling.nf + +This subworkflow exposes a single entry function used by both modes: + +DIFFERENTIAL_PEAK_CALLING(...) + +7.2 New pipeline entrypoint + +Add a new entrypoint workflow: + +workflow DIFFERENTIAL_ONLY { ... } + +Responsibilities: + • Load manifests from params.differential_from_run. + • Rebuild channels matching integrated mode. + • Call DIFFERENTIAL_PEAK_CALLING(...). + • Publish outputs to the current --outdir. + +7.3 Internal phases + +Phase A: Build or load design + • BUILD_DIFFERENTIAL_DESIGN (integrated): derive eligibility table + manifest rows from channels. + • LOAD_DIFFERENTIAL_DESIGN (posthoc): read manifests into channels and re-derive eligibility checks. + +Phase B: Run methods + • DIFFBIND_ANALYSIS (per caller × group) + • CHIPBINNER_ANALYSIS (per group) + • SPAN_DIFFERENTIAL (per group; native or fallback) + +Phase C: Shared post-processing + • ANNOTATE_REGIONS (shared module used by all methods) + • BUILD_DIFFERENTIAL_SUMMARIES (aggregates method summaries + skip/fail reasons) + +⸻ + +8. Method specifications + +8.1 DiffBind (per caller × group) + +8.1.1 Inputs +For each eligible (group, caller): + • Deduplicated/final BAM + BAI for each target sample. + • Peak file for each sample from the same caller variant. + • Spike-in scale factors (optional). + • Contrast definition (treated, control). + +Note: nf-core/cutandrun supports running multiple peak callers, where the first caller is primary and additional callers are run and written to results. Differential must run for all caller variants present in the manifest. + +8.1.2 Sample sheet generation +Path (if enabled): +03_peak_calling/08_differential/01_diffbind/00_samplesheets//.csv + +Columns: + • SampleID (sample_id) + • Factor (group) + • Condition (condition) + • Replicate (replicate) + • bamReads (final_bam) + • Peaks (peaks_path) + • PeakCaller (caller) + • Tissue (optional constant e.g. CUTRUN) + +8.1.3 R runner behavior +Implement: + • modules/local/diffbind_run.nf + • bin/diffbind_run.R + +Steps: + 1. Load sample sheet into dba(). + 2. Count reads: dba.count(...) + • If diffbind_recenter_peaks=false, keep full peaks (summits=0). + • Else use summits=params.diffbind_summits. + 3. Normalization: + • If spike-in enabled and factors present: + • compute norm = 1 / spikein_scale_factor + • apply via dba.normalize(..., normalize=norm, ...) (or equivalent supported interface). DiffBind supports specifying normalization factors directly. + • Else: + • use DiffBind-native normalization (optionally guided by diffbind_norm_method). + 4. Define contrast using --differential_contrast ordering. + 5. Analyze using selected backend (DESeq2 or edgeR). + 6. Export: + • full results table + • significant subsets by FDR and |LFC| + • standard plots (PCA, correlation heatmap, MA, volcano, etc.) + +8.1.4 Outputs +03_peak_calling/08_differential/01_diffbind/// + +Required files: + • diffbind.results.tsv + • diffbind.results.annotated.tsv + • diffbind.significant.bed + • diffbind.significant_up.bed + • diffbind.significant_down.bed + • diffbind.summary.tsv (one-row summary for MultiQC) + • plots/ (PCA, correlation heatmap, MA, volcano; optional others) + +Recommended reproducibility files: + • diffbind.dba.rds (serialized DBA object) + • diffbind.normalization_factors.tsv + +8.1.5 Error handling + • Design failures are handled before launching the runner. + • Runtime failures: + • Default: fail the run. + • If --differential_allow_partial: record (group,caller) failure in summaries and continue other units. + +⸻ + +8.2 ChIPBinner (per group) + +8.2.1 Inputs +Per eligible group: + • final BAM/BAI for all target samples in treated/control conditions + • chrom sizes + • optional blacklist BED (to exclude problematic bins) + • optional MS coeffs file (per sample) + • optional pooled input/IgG if chipbinner_use_input=true (implementation must match controls deterministically) + +ChIPbinner divides the genome into uniform bins to analyze broad marks and differential enrichment patterns. + +8.2.2 Steps + 1. Prepare windows + • If chipbinner_windows_dir provided: load ..windows.bed. + • Else: generate windows from chrom sizes at chipbinner_bin_size. + • Subtract blacklist regions if available. + • Cache windows per (genome, bin_size, blacklist_hash) to avoid regeneration. + 2. Quantify + • Produce per-sample binned counts matrix (rows=bins, cols=samples). + • Persist raw count matrix. + 3. Normalize + • Apply scaling in this order: + 1. spike-in scaling (if enabled and present) + 2. MS coefficient scaling (if provided) + 3. pseudocount addition + 4. optional library-size normalization (if required by ChIPbinner routines) + 4. QC + • PCA and correlation plots from normalized matrix. + 5. HDBSCAN grid search + • Iterate over Cartesian product of minPts × minSamps. + • Persist cluster assignments for each grid point. + • Compute stability metrics and pick a preferred parameter set deterministically (document selection rule). + 6. Cluster extraction + • Extract standardized 2- and 3-cluster solutions (when possible). + 7. Differential testing + • Run ROTS on bins (replicates preserved), reporting: + • log2FC (treated/control) + • p-value + • FDR + • cluster assignment + 8. Optional enrichment + • If chipbinner_functional_db set: run enrichment (e.g., LOLA) per cluster. + • Else: emit a “not run” stub file. + +8.2.3 Outputs +03_peak_calling/08_differential/02_chipbinner// + +Required files: + • chipbinner.samplesheet.csv + • chipbinner.windows.bed + • chipbinner.bin_counts.tsv + • chipbinner.normalized_matrix.tsv + • chipbinner.hdbscan_grid_summary.tsv + • chipbinner.clusters.tsv + • chipbinner.differential.tsv + • chipbinner.differential.annotated.tsv + • chipbinner.summary.tsv + • plots/ (PCA, correlation, density scatter, genic/intergenic if produced) + +Optional: + • enrichment/ outputs when enabled + +8.2.4 Error handling + • If clustering fails for some grid points, continue; choose best among successful points. + • If none succeed: + • Default: fail. + • With --differential_allow_partial: record failure for the group and continue. + +⸻ + +8.3 SPAN / OmniPeak differential (per group) + +8.3.1 Capability detection +Implement a lightweight “help probe” step that determines: + • Whether the jar supports a native differential subcommand (e.g., compare). + • Which signature is supported: + • SPAN-like compare -t -c ... (documented) + • Any alternative signature exposed by the jar (detected via --help parsing) + +Decision logic: + • If span_diff_mode=fallback: always run fallback. + • If span_diff_mode=native: require native support, else fail. + • If span_diff_mode=auto: native if supported, else fallback. + +8.3.2 Native path +For each eligible group: + • Build comma-separated replicate lists for treated and control. + • Run the jar compare with: + • --chrom.sizes / --cs + • --bin (span_diff_bin) + • --gap (span_diff_gap) + • --fdr (span_diff_fdr) + • output peaks file (--peaks / -p) + +SPAN documents: + • treatment replicates via comma-separated list + • control replicates via comma-separated list + • --bin, --gap, --fdr options for compare + +If replicates are not supported by the jar signature: + • Pool replicates per condition (samtools merge), emit span_diff_target_pooling.tsv. + +8.3.3 Fallback path +Goal: produce a deterministic differential table even without native support. + +Fallback region set: + • Prefer union of SPAN/OmniPeak peaks from the SPAN caller variant (if present). + • If SPAN peaks are unavailable but run_span_diff is enabled: + • Fail with a clear message (v1). + • (Future enhancement could allow using primary caller peaks.) + +Steps: + 1. Build union region BED (sorted, merged). + 2. Count reads per region per sample. + 3. Run DE (DESeq2 or edgeR via span_fallback_backend). + 4. Apply spike-in derived size factors when enabled. + 5. Output consistent tables and BED splits (up/down). + +8.3.4 Outputs +03_peak_calling/08_differential/03_span// + +Required files: + • span.differential.tsv + • span.differential.peaks.bed (or native peaks format, plus a converted BED) + • span.differential.annotated.tsv + • span.up.bed + • span.down.bed + • span.summary.tsv + • span.mode.txt (native|fallback + signature note) + +Optional: + • span_diff_target_pooling.tsv if pooling was required + +⸻ + +9. Shared annotation layer + +9.1 Inputs + • Regions as BED (DiffBind significant peaks, ChIPbinner differential bins, SPAN differential regions) + • Gene annotation: + • Prefer existing pipeline gene BED (if available) + • Else derive from GTF (gene bodies and/or TSS) and cache + +9.2 Method + • Use bedtools closest to annotate nearest gene/feature and distance. + • Append standardized columns: + • nearest_feature_id + • nearest_gene_name (if available) + • distance_to_feature + +9.3 Output contract + +All *.annotated.tsv outputs must include: + • original region coordinates + • method statistics (log2FC, p-value, FDR) + • annotation columns above + +⸻ + +10. MultiQC integration + +10.1 Summary TSVs + +Each method produces a one-row-per-unit summary TSV: + • DiffBind: diffbind.summary.tsv + • caller,group,treated,control,n_tested,n_fdr_pass,n_up,n_down,status,reason + • ChIPBinner: chipbinner.summary.tsv + • group,treated,control,n_bins_tested,n_fdr_pass,n_up,n_down,n_clusters,chosen_minPts,chosen_minSamps,status,reason + • SPAN: span.summary.tsv + • group,treated,control,n_tested,n_fdr_pass,n_up,n_down,mode,status,reason + +10.2 Aggregated MultiQC table + +Create: +03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv + +This is a concatenation of the per-method summaries plus design manifest status, ensuring MultiQC has stable content even when analyses were skipped. + +10.3 MultiQC config + +Update: + • assets/multiqc_config.yml + +Add a new section “Differential analysis” containing: + • a design/eligibility table (from differential_manifest.design.tsv) + • per-method summary tables + • optional links (paths) to key result files (if MultiQC supports) + +10.4 Optional differential-only report (opt-in) + +If --differential_multiqc_report=true (new optional param), produce: + +03_peak_calling/08_differential/multiqc/differential_multiqc_report.html + +This report is focused solely on differential outputs and does not modify the pipeline’s primary MultiQC report. + +⸻ + +11. Output layout + +All differential outputs live under: + +03_peak_calling/08_differential/ + +03_peak_calling/08_differential/ + 00_manifests/ + differential_manifest.samples.tsv + differential_manifest.peaks.tsv + differential_manifest.design.tsv + chipbinner_input__.bam (optional) + chipbinner_input__.bam.bai (optional) + chipbinner_windows_cache/windows....bed + span_diff_target_pooling.tsv (optional) + 01_diffbind/ + 00_samplesheets//.csv + // + diffbind.results.tsv + diffbind.results.annotated.tsv + diffbind.significant*.bed + diffbind.summary.tsv + plots/ + 02_chipbinner/ + / + chipbinner.*.tsv + chipbinner.*.csv + chipbinner.normalization_factors.tsv + chipbinner.clusters.best.tsv + chipbinner.clusters.2cluster.tsv + chipbinner.clusters.3cluster.tsv + chipbinner.error.txt (only when allow-partial skips a failed run) + chipbinner.summary.tsv + plots/ + enrichment/ (optional) + 03_span/ + / + span.differential.tsv + span.differential.peaks.bed + span.differential.annotated.tsv + span.up.bed + span.down.bed + span.summary.tsv + span.mode.txt + multiqc/ + differential_summary_mqc.tsv + differential_multiqc_report.html (optional) + + +⸻ + +12. Resources and performance + +12.1 Default resource labels (recommended) + +Provide defaults in conf/base.config using labels: + • DiffBind analyze: 4 CPU / 16 GB + • ChIPBinner clustering: 8 CPU / 32 GB + • ChIPBinner differential: 4 CPU / 16 GB + • SPAN native compare: 2 CPU / 8 GB + • SPAN fallback DE: 4 CPU / 16 GB + +12.2 Performance considerations + • Design gating must run before launching heavy processes. + • Windows generation is cached per genome/bin size. + • Prefer streaming and minimal intermediate files where possible, but publish key matrices for reproducibility. + +⸻ + +13. Testing strategy + +13.1 CI-fast tests + • Validation: + • missing --differential_contrast when run flags are on + • contrast labels absent + • insufficient replicates (strict vs allow_partial) + • Manifest emission: + • determinism (sorted output) + • required columns present + • Sample sheet emission: + • DiffBind sheets created for eligible (group,caller) + • ChIPbinner samplesheet created per eligible group + +Use --differential_publish_manifest_only and/or a stub profile to avoid heavy compute. + +13.2 Optional integration tests (non-default) + • Single small dataset producing a real DiffBind result TSV. + • SPAN signature detection test using a mocked help output. + • ChIPBinner smoke test with reduced bootstrap/grid sizes. + +⸻ + +14. Acceptance criteria + • Integrated runs produce manifests and method outputs for eligible units. + • Posthoc -entry DIFFERENTIAL_ONLY --differential_from_run reproduces the same eligibility decisions and produces outputs without rerunning upstream steps. + • Log2FC sign is consistent (treated/control) across all methods. + • Spike-in factors propagate to DiffBind normalization factors when enabled. + • MultiQC tables are stable and show “skipped” reasons where applicable. + +⸻ + +15. Out of scope (v1) + • Multiple contrasts per run (timecourse / all pairwise). + • Batch covariates in statistical models. + • Deep cross-method concordance (beyond lightweight summary tables). + • Automatic peak caller benchmarking. diff --git a/docs/output.md b/docs/output.md index ccf61f74..05bf1dde 100644 --- a/docs/output.md +++ b/docs/output.md @@ -31,6 +31,7 @@ - 6.8. [epic2 peak calling](#epic2peakcalling) - 6.9. [SPAN/OmniPeaks peak calling](#SPANpeakcalling) - 6.10. [Consensus Peaks](#ConsensusPeaks) + - 6.11. [Differential analysis](#DifferentialAnalysis) - 7. [Peak-based QC](#Peak-basedQC) - 7.1. [Peak Counts](#PeakCounts) - 7.2. [Peak Reproducibility](#PeakReproducibility) @@ -438,6 +439,48 @@ SPAN/OmniPeaks callers require pooled controls and the `--omnipeaks_jar` paramet The merge function from [BEDtools](https://github.com/arq5x/bedtools2) is used to merge replicate peaks of the same experimental group (by `group` or `group_condition`) to create a consensus peak set. This can then optionally be filtered for consensus peaks contributed to be a threshold number of replicates using `--replicate_threshold`. +### 6.11. Differential analysis + +
+Output files + +- `03_peak_calling/08_differential/00_manifests/` + - `differential_manifest.samples.tsv`: per-sample manifest with BAM/BAI, optional input BAMs, scale factors, and bigWig paths. + - `differential_manifest.peaks.tsv`: per-sample per-caller peak paths and formats. + - `differential_manifest.design.tsv`: eligibility and status table for each comparison. + - `chipbinner_input__.bam(.bai)`: pooled input/IgG BAMs when `--chipbinner_use_input` is enabled. + - `chipbinner_windows_cache/windows....bed`: cached ChIPBinner windows by genome/bin/blacklist. + - `span_diff_target_pooling.tsv`: emitted when SPAN native mode must pool replicates. +- `03_peak_calling/08_differential/01_diffbind///` + - `diffbind.results.tsv`: full DiffBind results. + - `diffbind.results.annotated.tsv`: DiffBind results annotated with nearest gene. + - `diffbind.significant*.bed`: significant peaks split by direction. + - `diffbind.summary.tsv`: one-row summary for MultiQC. +- `03_peak_calling/08_differential/02_chipbinner//` + - `chipbinner.*.tsv` and `chipbinner.*.csv`: ChIPBinner matrices and summaries. + - `chipbinner.normalization_factors.tsv`: per-sample scale/size factors used for normalization. + - `chipbinner.clusters.best.tsv`, `chipbinner.clusters.2cluster.tsv`, `chipbinner.clusters.3cluster.tsv`: standardized cluster assignments. + - `chipbinner.error.txt`: emitted when `--differential_allow_partial` skips a failed run. + - `chipbinner.differential.annotated.tsv`: annotated differential bins. + - `chipbinner.summary.tsv`: one-row summary for MultiQC. +- `03_peak_calling/08_differential/03_span//` + - `span.differential.tsv`: SPAN/OmniPeak differential table (native or fallback). + - `span.differential.peaks.bed`: differential regions as BED. + - `span.differential.annotated.tsv`: annotated differential regions. + - `span.up.bed`, `span.down.bed`: direction-specific BEDs. + - `span.normalization_factors.tsv`: spike-in size factors used by fallback mode. + - `span.summary.tsv`: one-row summary for MultiQC. + - `span.mode.txt`: mode and signature note. +- `03_peak_calling/08_differential/multiqc/` + - `differential_summary_mqc.tsv`: combined method summary table for MultiQC. + - `differential_design_mqc.tsv`: design/eligibility table for MultiQC. + - `differential_multiqc_report.html`: optional differential-only MultiQC report. + +
+ +Differential analysis is enabled with `--run_diffbind`, `--run_chipbinner`, and/or `--run_span_diff` plus a `--differential_contrast` definition. The pipeline always writes manifests and design tables when any differential option is enabled, even if specific comparisons are skipped. +Summary TSVs are always produced and include `RUN`, `SKIP`, or `FAIL` plus a reason; MultiQC tables surface these statuses instead of showing placeholder results. + ## 7. Peak-based QC Once the peak calling process is complete, we run a separate set of reports that analyse the quality of the results at the peak level. diff --git a/docs/test_history/TESTS_adding_peak_callers.md b/docs/test_history/TESTS_adding_peak_callers.md new file mode 100644 index 00000000..76600617 --- /dev/null +++ b/docs/test_history/TESTS_adding_peak_callers.md @@ -0,0 +1,52 @@ +› Summarize each of the test you ran on the updates you made. What is the point of the test, did it pass, etc. + + +────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +• - pytest / pytest-workflow full integration suite (289 items): Runs the pipeline via nextflow run main.nf ... across many preset configs (genome options, samplesheet validation, filtering, scaling, peak + callers, consensus peaks, report/output checks). Primary assertion is exit code == 0 plus expected output files. This is the main regression test for the code changes. + + Each run (SLURM, scratch-backed) + + - Job 12731188: Full pytest suite → FAIL (FAILED 2:0 in ~6s). Cause was sbatch output paths pointing to logs/ that didn’t exist due to an incorrect --chdir choice. + - Job 12731293: Full pytest suite → FAIL (FAILED 2:0 in ~4s). Cause was a syntax error from embedding a long bash -lc '...; ...' payload into CMD=(...) in the generated sbatch script. + - Job 12731396: Full pytest suite → FAIL (75 failed, many “exit code should be 0”). Root cause wasn’t preserved because pytest-workflow cleaned temp dirs (no --keep-workflow-wd yet), so we couldn’t inspect + per-test Nextflow stderr from that attempt. + - Job 12732401: Full pytest suite → FAIL (same symptom: “exit code should be 0” across many tests). Also pre---keep-workflow-wd, so limited diagnostics. + - Job 12740798: Diagnostic full pytest suite (kept workflow dirs) → FAIL. Root cause: Nextflow couldn’t find Java on compute nodes (java: command not found). + - Job 12742438: Full pytest suite after adding JAVA_CMD → FAIL. Root cause: NXF_OPTS was misused to pass Nextflow flags; JVM rejected -with-singularity. + - Job 12742515: Full pytest suite after removing NXF_OPTS → FAIL. Root cause: container engine conflict (“Cannot enable more than one container engine”) because tests request -profile docker,... while we also + enabled Singularity. + - Job 12743096: Full pytest suite (final) → PASS (289 passed, SLURM COMPLETED 0:0, 1 warning). Log: /scratch/users/dhusmann/nextflow-work/runs/pytest_full/20260101_181711/logs/pytest_full_12743096.out. + + + + +› For each failure, how did you resolve the issue? + + +────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +• - Job 12731188 (FAILED 2:0, ~6s) + - What failed: Slurm couldn’t open the sbatch output/error files because the job was --chdir’d into the repo, but the logs/ directory (used by #SBATCH --output=logs/%x_%j.out) only existed in the scratch + run dir created by slurm_run.sh. + - Fix: Stopped overriding --chdir to the repo; instead kept Slurm --chdir as the scratch run dir (where logs/ is created) and cd’d into the repo inside the payload script. + - Job 12731293 (FAILED 2:0, ~4s) + - What failed: The generated CMD=(...) in sbatch.sh embedded a long bash -lc ...; ...; ... payload without proper quoting, producing a shell syntax error (unexpected token ';'). + - Fix: Moved the payload into a real script on scratch (/scratch/users/dhusmann/nextflow-work/scripts/pytest_full.sh) and had Slurm run that script (simple command, no quoting hazards). + - Jobs 12731396 and 12732401 (FAILED; many “exit code should be 0”) + - What failed: Many pytest-workflow cases failed because the underlying nextflow run ... commands returned non‑zero. + - Fix process: These early runs did not preserve per-test workflow directories/logs, so we couldn’t pinpoint the exact first failure. I changed the test invocation to include pytest --keep-workflow-wd so + each test’s log.out/log.err could be inspected, enabling the concrete diagnoses below. + - Job 12740798 (FAILED; “exit code should be 0” everywhere) + - What failed: Nextflow couldn’t start on compute nodes because java wasn’t on $PATH (java: command not found). + - Fix: Exported JAVA_CMD=/share/software/user/open/java/17.0.4/bin/java (and JAVA_HOME) in the scratch test script so Nextflow always has a JVM on compute nodes. + - Job 12742438 (FAILED; “exit code should be 0” everywhere) + - What failed: Java failed immediately with Unrecognized option: -with-singularity because I mistakenly put Nextflow CLI flags into NXF_OPTS (which is passed to the JVM, not Nextflow). + - Fix: Removed NXF_OPTS entirely and relied on supported env toggles instead. + - Job 12742515 (FAILED; “Cannot enable more than one container engine”) + - What failed: The tests call Nextflow with -profile docker,..., which enables Docker, while we were also enabling Singularity → Nextflow refuses to run with two engines. + - Fix: Added a scratch nextflow wrapper (/scratch/users/dhusmann/nextflow-work/bin/nextflow) that rewrites -profile docker, to -profile singularity, during tests, while keeping + Docker disabled via env. This forces a single container engine (Singularity) without editing the test YAML files. + - Resulting successful run: job 12743096 (COMPLETED 0:0; 289 passed) + - With JAVA_CMD set and the nextflow profile-rewrite wrapper in place, the full pytest suite completed successfully. diff --git a/docs/test_history/differential_peak_calling_2.md b/docs/test_history/differential_peak_calling_2.md new file mode 100644 index 00000000..ac09cf47 --- /dev/null +++ b/docs/test_history/differential_peak_calling_2.md @@ -0,0 +1,13 @@ +# differential_peak_calling_2 test history + +## 2026-01-11 — pytest-suite (Slurm) +- JobID: 13242556 +- Run dir: /scratch/users/dhusmann/runs/pytest-suite/20260111_125922 +- Failure: tests/test_07_differential.yml::test_chipbinner_use_input_missing_fail expected stderr "input BAMs are missing"; heredoc parsing broke in the workflow (log.err showed `cat: unrecognized option '--samples'`). +- Fix: switched heredocs to `printf` in chipbinner_use_input_missing_{fail,allow_partial} tests to avoid heredoc parsing issues; commit b3341a6. +- Debug rerun (tag test_chipbinner_use_input): JobID 13250774, run dir /scratch/users/dhusmann/runs/pytest-suite/20260111_183621 (COMPLETED 0:0). + +## 2026-01-12 — pytest-suite (Slurm) +- JobID: 13250786 +- Run dir: /scratch/users/dhusmann/runs/pytest-suite/20260111_183752 +- Result: COMPLETED 0:0 diff --git a/docs/usage.md b/docs/usage.md index 56780e94..b944e45c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -167,6 +167,112 @@ If control samples are provided in the sample sheet, they will be used to normal After peak calling, consensus peaks are calculated by merging peaks within the same grouping key. Use `--consensus_grouping` to choose `group` or `group_condition`. By default, if the samplesheet includes a `condition` column, grouping uses `group_condition`; otherwise it falls back to `group`. The number of replicates required for a valid peak can be changed using `replicate_threshold`. To call consensus peaks across all samples, set `--consensus_peak_mode all`. +### Differential analysis (optional) + +Differential analysis runs on peak-calling outputs and compares two conditions per group. The contrast is defined as `treated,control` and determines the log2FC direction (treated/control). Enable one or more methods with `--run_diffbind`, `--run_chipbinner`, or `--run_span_diff`, and provide `--differential_contrast`. + +#### Global options + +| Parameter | Default | Description | +| --- | --- | --- | +| `--differential_contrast` | required | Contrast as `treated,control`. The order controls log2FC sign (treated/control). | +| `--differential_min_replicates` | `2` | Minimum replicates per condition required for eligibility. | +| `--differential_allow_partial` | `false` | If `true`, ineligible comparisons are marked `SKIP` and the pipeline continues. If `false`, any `FAIL` in the design table stops the pipeline. | +| `--differential_groups` | | Comma-separated allowlist of groups to analyze. | +| `--differential_callers` | | Comma-separated allowlist of peak callers for DiffBind (e.g., `seacr,macs2`). | +| `--differential_use_spikein` | `auto` | `auto` uses spike-in only when `--normalisation_mode Spikein`; set `true` or `false` to override. | +| `--differential_publish_manifest_only` | `false` | Write manifests and design tables only (no method execution). | +| `--differential_multiqc_report` | `false` | Emit a differential-only MultiQC report. | + +Eligibility & allow-partial semantics: +- Comparisons are eligible only when both contrast conditions are present and meet `--differential_min_replicates`. +- If `--differential_allow_partial=false`, any `FAIL` in the design manifest stops the run. +- If `--differential_allow_partial=true`, failed units are marked `SKIP` with a reason in the design + per-method summaries; other units proceed. + +Spike-in behavior: +- Spike-in scaling follows the manifest `spikein_scale_factor`, with size factors defined as `size_factor = 1 / scale_factor`. +- ChIPBinner and SPAN fallback explicitly apply these size factors and record them in normalization factor outputs. +- DiffBind uses spike-in scaling when enabled (`--differential_use_spikein`) and otherwise uses the configured normalization method. + +#### DiffBind + +Enable with `--run_diffbind`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--diffbind_fdr` | `0.05` | FDR cutoff for significant peaks. | +| `--diffbind_lfc` | `1.0` | Absolute log2FC cutoff for up/down labeling. | +| `--diffbind_min_overlap` | `2` | Minimum overlap count for consensus/analysis. | +| `--diffbind_backend` | `DESeq2` | Differential backend (e.g. `DESeq2`, `edgeR`). | +| `--diffbind_recenter_peaks` | `false` | Recenter peaks before testing. | +| `--diffbind_summits` | `0` | Summit window width (0 uses full peaks). | +| `--diffbind_norm_method` | `native` | Normalization method when not using spike-in. | +| `--diffbind_extra_params` | | Optional JSON/YAML file with extra DiffBind parameters. | +| `--export_diffbind_sheets` | `true` | Publish DiffBind sample sheets under `00_samplesheets/`. | + +DiffBind uses per-caller peak sets for each group and produces results tables plus diagnostic plots. + +#### ChIPBinner + +Enable with `--run_chipbinner`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--chipbinner_bin_size` | `10000` | Bin size for windowing. | +| `--chipbinner_windows_dir` | | Directory containing prebuilt windows (named `windows....bed`). | +| `--blacklist` | | Optional blacklist used to subtract windows. | +| `--chipbinner_use_input` | `false` | Subtract pooled input/IgG per group/condition as background. Missing inputs fail fast unless `--differential_allow_partial=true`, which yields `SKIP` with an error note. | +| `--chipbinner_ms_coeffs` | | Optional MS coefficients (by `sample_id`) used in normalization. | +| `--chipbinner_pseudocount` | `1` | Pseudocount added before log transforms. | +| `--chipbinner_hdbscan_grid_minpts` | `100,200,500,1000` | HDBSCAN minPts grid. | +| `--chipbinner_hdbscan_grid_minsamps` | `100,200,500,1000` | HDBSCAN minSamps grid. | +| `--chipbinner_fdr` | `0.05` | FDR cutoff for differential bins. | +| `--chipbinner_lfc` | `1.0` | Absolute log2FC cutoff for up/down labeling. | +| `--chipbinner_bootstrap` | `1000` | ROTS bootstrap iterations. | +| `--chipbinner_k_value` | `100000` | ROTS K parameter. | +| `--chipbinner_functional_db` | | Optional enrichment database (BED or directory of BEDs). | + +ChIPBinner produces: a chosen “best” clustering plus standardized 2‑cluster and 3‑cluster views (treated‑high vs control‑high vs stable). Normalization factors are written to `chipbinner.normalization_factors.tsv`. + +#### SPAN differential + +Enable with `--run_span_diff`. + +| Parameter | Default | Description | +| --- | --- | --- | +| `--span_diff_mode` | `auto` | `auto` chooses native compare if supported, otherwise fallback. `native` forces compare; `fallback` skips jar compare. | +| `--omnipeaks_jar` | required | OmniPeaks/SPAN jar for native/auto modes. | +| `--span_diff_fdr` | `0.05` | FDR cutoff for differential regions. | +| `--span_diff_gap` | `5` | Gap parameter for native compare. | +| `--span_diff_bin` | `200` | Bin size for native compare. | +| `--span_diff_java_heap` | `8G` | Java heap for jar execution. | +| `--span_fallback_backend` | `DESeq2` | Backend for fallback differential (e.g. `DESeq2` or `edgeR`). | + +Native mode enforces treated/control orientation and may require pooled replicates depending on jar capabilities (recorded in `span_diff_target_pooling.tsv`). Fallback mode computes counts over merged regions and writes `span.normalization_factors.tsv`. + +Integrated example: + +```bash +nextflow run nf-core/cutandrun \ + --input samplesheet.csv \ + --genome GRCh37 \ + --run_diffbind \ + --differential_contrast "Treatment,Control" \ + -profile docker +``` + +Posthoc example (no upstream rerun): + +```bash +nextflow run main.nf -entry DIFFERENTIAL_ONLY \ + --differential_from_run /path/to/prior/results \ + --run_diffbind \ + --differential_contrast "Treatment,Control" \ + --gene_bed /path/to/genes.bed +``` + +When running `--run_chipbinner` in differential-only mode, you must provide `--fasta` so chrom sizes can be derived. When running `--run_span_diff`, you must provide `--omnipeaks_jar`. + ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/lib/WorkflowCutandrun.groovy b/lib/WorkflowCutandrun.groovy index e9e94093..2c1b814a 100755 --- a/lib/WorkflowCutandrun.groovy +++ b/lib/WorkflowCutandrun.groovy @@ -20,6 +20,7 @@ class WorkflowCutandrun { genomeExistsError(params, log) validatePeakCallerParams(params, log) + validateDifferentialParams(params) if (!params.fasta) { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." @@ -95,6 +96,55 @@ class WorkflowCutandrun { } } + // + // Validate differential analysis parameters + // + private static void validateDifferentialParams(params) { + def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only + if (diff_enabled) { + if (!params.run_peak_calling) { + Nextflow.error "Differential analysis requires peak calling outputs. Please ensure peak calling is enabled." + } + if (!params.differential_contrast) { + Nextflow.error "Differential analysis requires --differential_contrast (e.g. treated,control)." + } + def contrast = params.differential_contrast.toString().split(',').collect { it.trim() }.findAll { it } + if (contrast.size() != 2) { + Nextflow.error "Invalid --differential_contrast '${params.differential_contrast}'. Expected two comma-separated labels." + } + } + + if (params.run_span_diff) { + if (!params.omnipeaks_jar) { + Nextflow.error "SPAN differential requested but --omnipeaks_jar was not provided." + } + def jar_path = new File(params.omnipeaks_jar.toString()) + if (!jar_path.exists()) { + Nextflow.error "SPAN differential requested but --omnipeaks_jar path does not exist: ${params.omnipeaks_jar}" + } + } + + if (params.differential_use_spikein && !['auto','true','false'].contains(params.differential_use_spikein.toString())) { + Nextflow.error "Invalid --differential_use_spikein value '${params.differential_use_spikein}'. Valid options: auto, true, false." + } + + if (params.span_diff_mode && !['auto','native','fallback'].contains(params.span_diff_mode.toString())) { + Nextflow.error "Invalid --span_diff_mode value '${params.span_diff_mode}'. Valid options: auto, native, fallback." + } + + if (params.diffbind_backend && !['DESeq2','edgeR'].contains(params.diffbind_backend.toString())) { + Nextflow.error "Invalid --diffbind_backend value '${params.diffbind_backend}'. Valid options: DESeq2, edgeR." + } + + if (params.span_fallback_backend && !['DESeq2','edgeR'].contains(params.span_fallback_backend.toString())) { + Nextflow.error "Invalid --span_fallback_backend value '${params.span_fallback_backend}'. Valid options: DESeq2, edgeR." + } + + if (params.differential_min_replicates && params.differential_min_replicates.toInteger() < 1) { + Nextflow.error "Invalid --differential_min_replicates value '${params.differential_min_replicates}'. Must be >= 1." + } + } + // // Detect whether the input samplesheet includes a condition column // diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 023723ab..8597dcd1 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -44,12 +44,49 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - // Check input has been provided - if (!params.input) { + // Check input has been provided unless running differential-only + def differential_only = isDifferentialOnly(workflow, params) + if (!params.input && !(differential_only && params.differential_from_run)) { Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } + // + // Detect whether the workflow is running the differential-only entrypoint + // + public static boolean isDifferentialOnly(workflow, params = null) { + def entry = getWorkflowEntryName(workflow) + if (entry) { + return entry?.toString()?.toUpperCase() == 'DIFFERENTIAL_ONLY' + } + return false + } + + // + // Get the entrypoint name in a Nextflow-version-safe way + // + private static String getWorkflowEntryName(workflow) { + if (!workflow) { + return null + } + if (workflow.hasProperty('entry')) { + return workflow.entry + } + if (workflow.hasProperty('entryName')) { + return workflow.entryName + } + if (workflow.manifest && workflow.manifest.hasProperty('entryName')) { + return workflow.manifest.entryName + } + if (workflow.commandLine) { + def matcher = (workflow.commandLine =~ /(?:^|\\s)-entry(?:=|\\s+)(\\S+)/) + if (matcher) { + return matcher[0][1] + } + } + return null + } + // // Get attribute from genome config file e.g. fasta // diff --git a/main.nf b/main.nf index f3b5b5eb..2adea865 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ WorkflowMain.initialise(workflow, params, log, args) */ include { CUTANDRUN } from './workflows/cutandrun' +include { DIFFERENTIAL_ONLY } from './workflows/differential_only' workflow NFCORE_CUTANDRUN { /* diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf new file mode 100644 index 00000000..3cd5b996 --- /dev/null +++ b/modules/local/annotate_regions.nf @@ -0,0 +1,67 @@ +process ANNOTATE_REGIONS { + label 'process_single' + + conda "bioconda::bedtools=2.31.0" + container "quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_0" + + input: + tuple val(method), val(group), val(caller), path(regions), path(gene_bed), val(output_name) + + output: + tuple val(method), val(group), val(caller), path("${output_name}"), emit: annotated + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + out_dir=\$(dirname "${output_name}") + mkdir -p "\$out_dir" + first_line=\$(head -n 1 ${regions} || true) + if [[ -z "\$first_line" ]]; then + : > ${output_name} + else + if echo "\$first_line" | awk -F'\t' 'NF<3{exit 0} \$2 ~ /^[0-9]+${'$'}/ {exit 1} {exit 0}'; then + header_line="\$first_line" + tail -n +2 ${regions} > regions.data + else + header_line="" + cp ${regions} regions.data + fi + + if [[ ! -s regions.data ]]; then + if [[ -n "\$header_line" ]]; then + printf "%s\tnearest_feature_id\tnearest_gene_name\tdistance_to_feature\n" "\$header_line" > ${output_name} + else + : > ${output_name} + fi + else + bcols=\$(awk -F'\t' 'NF>0{print NF; exit}' ${gene_bed} || true) + bcols=\${bcols:-0} + awk -F'\t' -v OFS='\t' '{print \$0, NR-1}' regions.data > regions.with_idx + bedtools closest -d -a regions.with_idx -b ${gene_bed} > regions.closest + + awk -F'\t' -v OFS='\t' -v bcols="\$bcols" -v header="\$header_line" 'BEGIN{ if (header != "") print header, "nearest_feature_id", "nearest_gene_name", "distance_to_feature" } { + a_cols = NF - bcols - 1 + if (a_cols < 1) next + out = \$1 + for (i=2; i<=a_cols-1; i++) out = out OFS \$i + gene_id = "NA" + gene_name = "NA" + if (bcols >= 4) gene_id = \$(a_cols + 4) + if (bcols >= 5) gene_name = \$(a_cols + 5) + if (gene_id == "." || gene_id == "") gene_id = "NA" + if (gene_name == "." || gene_name == "") gene_name = "NA" + distance = \$(NF) + print out, gene_id, gene_name, distance + }' regions.closest > ${output_name} + fi + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed 's/bedtools v//') + END_VERSIONS + """ +} diff --git a/modules/local/cache_differential_gene_bed.nf b/modules/local/cache_differential_gene_bed.nf new file mode 100644 index 00000000..239b5422 --- /dev/null +++ b/modules/local/cache_differential_gene_bed.nf @@ -0,0 +1,29 @@ +process CACHE_DIFFERENTIAL_GENE_BED { + label 'process_single' + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path gene_bed + + output: + path "differential_manifest.gene_bed.bed", emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + cp ${gene_bed} differential_manifest.gene_bed.bed + + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') + { + echo "\"${task.process}\":" + echo " coreutils: \$coreutils_version" + } > versions.yml + """ +} diff --git a/modules/local/chipbinner_run.nf b/modules/local/chipbinner_run.nf new file mode 100644 index 00000000..fa6271b6 --- /dev/null +++ b/modules/local/chipbinner_run.nf @@ -0,0 +1,123 @@ +process CHIPBINNER_RUN { + label 'process_chipbinner' + + conda "conda-forge::python=3.8.3 conda-forge::numpy=1.24.4 conda-forge::pandas=2.0.3 conda-forge::scikit-learn=1.3.2 conda-forge::matplotlib=3.7.5 conda-forge::seaborn=0.12.2 conda-forge::hdbscan=0.8.33 conda-forge::scipy=1.10.1 bioconda::bedtools=2.31.1 bioconda::samtools=1.17 conda-forge::r-base=4.2.3 bioconda::bioconductor-rots=1.20.0" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' + + input: + tuple val(group), path(records), path(chrom_sizes), path(windows) + val contrast + val base_dir + val bin_size + val windows_dir + val blacklist + val use_input + val use_spikein + val pseudocount + val grid_minpts + val grid_minsamps + val fdr + val lfc + val bootstrap + val k_value + val functional_db + val allow_partial + + output: + tuple val(group), path("chipbinner.samplesheet.csv"), emit: samplesheet, optional: true + tuple val(group), path("chipbinner.windows.bed"), emit: windows, optional: true + tuple val(group), path("chipbinner.bin_counts.tsv"), emit: counts, optional: true + tuple val(group), path("chipbinner.normalized_matrix.tsv"), emit: normalized, optional: true + tuple val(group), path("chipbinner.normalization_factors.tsv"), emit: norm_factors, optional: true + tuple val(group), path("chipbinner.hdbscan_grid_summary.tsv"), emit: grid, optional: true + tuple val(group), path("hdbscan_grid"), emit: grid_outputs, optional: true + tuple val(group), path("chipbinner.clusters.tsv"), emit: clusters, optional: true + tuple val(group), path("chipbinner.clusters.best.tsv"), emit: clusters_best, optional: true + tuple val(group), path("chipbinner.clusters.2cluster.tsv"), emit: clusters_two, optional: true + tuple val(group), path("chipbinner.clusters.3cluster.tsv"), emit: clusters_three, optional: true + tuple val(group), path("chipbinner.differential.tsv"), emit: differential, optional: true + tuple val(group), path("chipbinner.summary.tsv"), emit: summary + tuple val(group), path("chipbinner.error.txt"), emit: error, optional: true + tuple val(group), path("plots"), emit: plots, optional: true + tuple val(group), path("enrichment"), emit: enrichment, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def windows_arg = windows ? "--windows ${windows}" : '' + def windows_dir_arg = (!windows && windows_dir) ? "--windows-dir ${windows_dir}" : '' + def blacklist_arg = (!windows && blacklist) ? "--blacklist ${blacklist}" : '' + def use_input_arg = use_input ? "--use-input" : '' + def use_spikein_arg = use_spikein ? "--use-spikein" : '' + def functional_arg = functional_db ? "--functional-db ${functional_db}" : '' + def allow_partial_arg = allow_partial ? "--allow-partial" : '' + def base_dir_arg = base_dir ? "--base-dir ${base_dir}" : '' + """ + chipbinner_run.py \ + --samples ${records} \ + --group '${group}' \ + --contrast '${contrast}' \ + --chrom-sizes ${chrom_sizes} \ + --bin-size ${bin_size} \ + ${windows_arg} \ + ${windows_dir_arg} \ + ${blacklist_arg} \ + ${use_input_arg} \ + ${use_spikein_arg} \ + --pseudocount ${pseudocount} \ + --grid-minpts '${grid_minpts}' \ + --grid-minsamps '${grid_minsamps}' \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --bootstrap ${bootstrap} \ + --k-value ${k_value} \ + ${functional_arg} \ + ${allow_partial_arg} \ + ${base_dir_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ + + stub: + """ + cat <<-END_CLUSTER > chipbinner.clusters.tsv + chrom\tstart\tend\tbin_id\tcluster + chrStub\t0\t100\tchrStub:0-100\t0 + END_CLUSTER + + cp chipbinner.clusters.tsv chipbinner.clusters.best.tsv + + cat <<-END_2CLUSTER > chipbinner.clusters.2cluster.tsv + chrom\tstart\tend\tbin_id\tcluster\tcluster_label\tcluster_mean_log2FC + chrStub\t0\t100\tchrStub:0-100\t0\ttreated_high\t1.0 + END_2CLUSTER + + cat <<-END_3CLUSTER > chipbinner.clusters.3cluster.tsv + chrom\tstart\tend\tbin_id\tcluster\tcluster_label\tcluster_mean_log2FC + chrStub\t0\t100\tchrStub:0-100\t0\tstable\t0.0 + END_3CLUSTER + + cat <<-END_NORM > chipbinner.normalization_factors.tsv + sample_id\tgroup\tcondition\tspikein_scale_factor\tspikein_size_factor\tms_coeff\tms_size_factor\tapplied_steps + stub_sample\t${group}\t${contrast.split(',')[0].trim()}\tNA\tNA\tNA\tNA\tpseudocount + END_NORM + + cat <<-END_SUMMARY > chipbinner.summary.tsv + group\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status + ${group}\tNA\t${contrast.split(',')[0].trim()}\t${contrast.split(',')[1].trim()}\t0\t0\t0\t0\t0\tNA\tNA\tSKIP\tstub_run\tNOT_RUN + END_SUMMARY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: "stub" + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_windows_cache.nf b/modules/local/chipbinner_windows_cache.nf new file mode 100644 index 00000000..a9a1983a --- /dev/null +++ b/modules/local/chipbinner_windows_cache.nf @@ -0,0 +1,74 @@ +process CHIPBINNER_WINDOWS_CACHE { + label 'process_single' + + conda "conda-forge::python=3.8.3 bioconda::bedtools=2.31.1" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' + + input: + path chrom_sizes + val bin_size + val windows_dir + val blacklist + + output: + path "windows.*.bed", emit: windows + path "chipbinner_windows_meta.tsv", emit: meta + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def windows_dir_arg = windows_dir ? "--windows-dir ${windows_dir}" : '' + def blacklist_arg = blacklist ? "--blacklist ${blacklist}" : '' + """ + chipbinner_windows_cache.py \ + --chrom-sizes ${chrom_sizes} \ + --bin-size ${bin_size} \ + ${windows_dir_arg} \ + ${blacklist_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + bedtools: \$(bedtools --version | awk '{print \$2}') + END_VERSIONS + """ + + stub: + """ + genome_id=\$(basename "${chrom_sizes}") + if [[ "\$genome_id" == *.sizes ]]; then + genome_id="\${genome_id%.sizes}" + else + genome_id="\${genome_id%.*}" + fi + if [[ -z "\$genome_id" ]]; then + genome_id="genome" + fi + + blacklist_hash="none" + if [[ -n "${blacklist}" && -f "${blacklist}" ]]; then + if command -v sha256sum >/dev/null 2>&1; then + blacklist_hash=\$(sha256sum "${blacklist}" | awk '{print substr(\$1,1,12)}') + fi + fi + + out_name="windows.\${genome_id}.${bin_size}.\${blacklist_hash}.bed" + printf "chrStub\\t0\\t${bin_size}\\n" > "\${out_name}" + + { + printf "genome_id\\tbin_size\\tblacklist_hash\\twindows_source\\twindows_path\\n" + printf "%s\\t%s\\t%s\\t%s\\t%s\\n" "\$genome_id" "${bin_size}" "\$blacklist_hash" "${windows_dir ?: 'generated'}" "\${out_name}" + } > chipbinner_windows_meta.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: "stub" + bedtools: "stub" + END_VERSIONS + """ +} diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf new file mode 100644 index 00000000..ac5ea231 --- /dev/null +++ b/modules/local/diffbind_run.nf @@ -0,0 +1,106 @@ +process DIFFBIND_RUN { + label 'process_diffbind' + + conda "conda-forge::r-base=4.2.3 bioconda::bioconductor-diffbind conda-forge::r-jsonlite conda-forge::r-yaml" + container "quay.io/biocontainers/bioconductor-diffbind:3.16.0--r44h77050f0_1" + + input: + tuple val(group), val(caller), path(records) + val contrast + val use_spikein + val fdr + val lfc + val min_overlap + val backend + val recenter + val summits + val norm_method + val extra_params + val export_sheets + val allow_partial + + output: + tuple val(group), val(caller), path("diffbind.results.tsv", optional: true), emit: results + tuple val(group), val(caller), path("diffbind.significant.bed", optional: true), emit: bed + tuple val(group), val(caller), path("diffbind.significant_up.bed", optional: true), emit: bed_up + tuple val(group), val(caller), path("diffbind.significant_down.bed", optional: true), emit: bed_down + tuple val(group), val(caller), path("diffbind.summary.tsv"), emit: summary + tuple val(group), val(caller), path("diffbind.samplesheet.csv"), emit: samplesheet + tuple val(group), val(caller), path("diffbind.samplesheet.export.csv"), emit: samplesheet_export + tuple val(group), val(caller), path("diffbind.normalization_factors.tsv", optional: true), emit: norm_factors_out + tuple val(group), val(caller), path("diffbind.dba.rds", optional: true), emit: dba + tuple val(group), val(caller), path("diffbind.error.txt", optional: true), emit: error + tuple val(group), val(caller), path("plots", optional: true), emit: plots + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def extra_arg = extra_params ? "--extra_params ${extra_params}" : '' + """ + diffbind_run.R \ + --records ${records} \ + --outdir . \ + --contrast '${contrast}' \ + --caller '${caller}' \ + --group '${group}' \ + --use_spikein ${use_spikein} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --min_overlap ${min_overlap} \ + --backend ${backend} \ + --recenter ${recenter} \ + --summits ${summits} \ + --norm_method ${norm_method} \ + ${extra_arg} \ + --export_sheets ${export_sheets} \ + --allow_partial ${allow_partial} + + cp diffbind.samplesheet.csv diffbind.samplesheet.export.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n 1 | sed 's/.* //') + END_VERSIONS + """ + + stub: + """ + mkdir -p plots + cat <<-EOF > diffbind.samplesheet.csv +SampleID,Factor,Condition,Replicate,bamReads,Peaks,PeakCaller,Tissue +stub,${group},stub,1,stub.bam,stub.peak,${caller},CUTRUN +EOF + + cp diffbind.samplesheet.csv diffbind.samplesheet.export.csv + + cat <<-EOF > diffbind.results.tsv +chr\tstart\tend\tlog2FC\tpval\tFDR +chr1\t1\t2\t0.0\t1.0\t1.0 +EOF + + printf "chr1\t1\t2\n" > diffbind.significant.bed + printf "chr1\t1\t2\n" > diffbind.significant_up.bed + printf "chr1\t1\t2\n" > diffbind.significant_down.bed + + cat <<-EOF > diffbind.summary.tsv +caller\tgroup\ttreated\tcontrol\tn_tested\tn_fdr_pass\tn_up\tn_down\tstatus\treason +${caller}\t${group}\tstub\tstub\t1\t0\t0\t0\tRUN\tok +EOF + + cat <<-EOF > diffbind.normalization_factors.tsv +sample_id\tsize_factor +stub\t1.0 +EOF + + echo "stub" > diffbind.dba.rds + touch diffbind.error.txt + echo "stub" > plots/PCA.pdf + echo "stub" > plots/correlation_heatmap.pdf + echo "stub" > plots/MA.pdf + echo "stub" > plots/volcano.pdf + + printf "%s\\n" "'${task.process}':" " R: stub" > versions.yml + """ +} diff --git a/modules/local/differential_design.nf b/modules/local/differential_design.nf new file mode 100644 index 00000000..9cc5cab2 --- /dev/null +++ b/modules/local/differential_design.nf @@ -0,0 +1,52 @@ +process DIFFERENTIAL_DESIGN { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path samples_manifest + path peaks_manifest + val contrast + val min_reps + val allow_partial + val run_diffbind + val run_chipbinner + val run_span + val groups_allow + val callers_allow + + output: + path "differential_manifest.design.tsv", emit: design + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def allow_flag = allow_partial ? '--allow-partial' : '' + def diffbind_flag = run_diffbind ? '--run-diffbind' : '' + def chipbinner_flag = run_chipbinner ? '--run-chipbinner' : '' + def span_flag = run_span ? '--run-span' : '' + def groups_arg = groups_allow ? "--groups ${groups_allow}" : '' + def callers_arg = callers_allow ? "--callers ${callers_allow}" : '' + """ + differential_design.py \ + --samples ${samples_manifest} \ + --peaks ${peaks_manifest} \ + --contrast '${contrast}' \ + --min-replicates ${min_reps} \ + ${allow_flag} \ + ${diffbind_flag} \ + ${chipbinner_flag} \ + ${span_flag} \ + ${groups_arg} \ + ${callers_arg} \ + --out differential_manifest.design.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/differential_manifests.nf b/modules/local/differential_manifests.nf new file mode 100644 index 00000000..625668cd --- /dev/null +++ b/modules/local/differential_manifests.nf @@ -0,0 +1,36 @@ +process DIFFERENTIAL_MANIFESTS { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path samples_raw + path peaks_raw + path ms_coeffs + val normalisation_mode + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv", emit: peaks + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def ms_arg = ms_coeffs ? "--ms-coeffs ${ms_coeffs}" : '' + """ + differential_manifests.py \ + --samples ${samples_raw} \ + --peaks ${peaks_raw} \ + --normalisation-mode ${normalisation_mode} \ + ${ms_arg} \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf new file mode 100644 index 00000000..5ee15e35 --- /dev/null +++ b/modules/local/differential_summary.nf @@ -0,0 +1,48 @@ +process DIFFERENTIAL_SUMMARY { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path design_manifest + path summary_files + val manifest_only + path summary_header + path design_header + path chipbinner_header + path span_header + + output: + path "differential_summary_mqc.tsv", emit: summary + path "differential_design_mqc.tsv", emit: design + path "chipbinner_summary_mqc.tsv", emit: chipbinner_summary + path "span_summary_mqc.tsv", emit: span_summary + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def summaries_arg = summary_files instanceof List ? summary_files.join(' ') : summary_files + def manifest_only_flag = manifest_only ? '--manifest-only' : '' + """ + differential_summary.py \ + --design ${design_manifest} \ + --summaries ${summaries_arg} \ + ${manifest_only_flag} \ + --out differential_summary.tsv + + cat ${summary_header} differential_summary.tsv > differential_summary_mqc.tsv + cat ${design_header} ${design_manifest} > differential_design_mqc.tsv + awk 'BEGIN{FS=OFS="\\t"} NR==1{print; next} \$1=="chipbinner"' differential_summary.tsv > chipbinner_summary.tsv + cat ${chipbinner_header} chipbinner_summary.tsv > chipbinner_summary_mqc.tsv + awk 'BEGIN{FS=OFS="\\t"} NR==1{print; next} \$1=="span"' differential_summary.tsv > span_summary.tsv + cat ${span_header} span_summary.tsv > span_summary_mqc.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/export_diffbind_samplesheets.nf b/modules/local/export_diffbind_samplesheets.nf new file mode 100644 index 00000000..17236093 --- /dev/null +++ b/modules/local/export_diffbind_samplesheets.nf @@ -0,0 +1,17 @@ +process EXPORT_DIFFBIND_SAMPLESHEETS { + label 'process_single' + + input: + tuple val(group), val(caller), path(samplesheet) + + output: + tuple val(group), val(caller), path("${group}.csv"), emit: sheet + + when: + task.ext.when == null || task.ext.when + + script: + """ + cp ${samplesheet} ${group}.csv + """ +} diff --git a/modules/local/for_patch/bedtools/sort/main.nf b/modules/local/for_patch/bedtools/sort/main.nf index c221a853..2dee62de 100644 --- a/modules/local/for_patch/bedtools/sort/main.nf +++ b/modules/local/for_patch/bedtools/sort/main.nf @@ -21,7 +21,10 @@ process BEDTOOLS_SORT { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: (meta.prefix ?: "${meta.id}") + if ("$intervals" == "${prefix}.${extension}") { + prefix = "${prefix}.sorted" + } def sizes = sizes ? "-g $sizes" : "" """ bedtools \\ diff --git a/modules/local/gtf_to_gene_bed.nf b/modules/local/gtf_to_gene_bed.nf new file mode 100644 index 00000000..98b0e099 --- /dev/null +++ b/modules/local/gtf_to_gene_bed.nf @@ -0,0 +1,31 @@ +process GTF_TO_GENE_BED { + tag "$gtf" + label 'process_low' + + conda "conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.11' : + 'quay.io/biocontainers/python:3.11' }" + + input: + path gtf + + output: + path "gene_annotation.bed", emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + gtf_to_gene_bed.py \\ + --gtf $gtf \\ + --out gene_annotation.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | awk '{print \$NF}') + END_VERSIONS + """ +} diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 81f70b95..d6a3499d 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -31,6 +31,7 @@ process MULTIQC { path ('peak_metrics/peak_count_consensus/*') path ('peak_metrics/peak_reprod_perc/*') path ('frag_len/*') + path ('differential/*') path ('linear_duplicates/*') output: diff --git a/modules/local/multiqc_differential.nf b/modules/local/multiqc_differential.nf new file mode 100644 index 00000000..1169a889 --- /dev/null +++ b/modules/local/multiqc_differential.nf @@ -0,0 +1,29 @@ +process MULTIQC_DIFFERENTIAL { + label 'process_single' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path multiqc_config + val differential_dir + + output: + path "differential_multiqc_report.html", emit: report + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + multiqc -f --config ${multiqc_config} --filename differential_multiqc_report.html ${differential_dir} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/local/publish_differential_manifests.nf b/modules/local/publish_differential_manifests.nf new file mode 100644 index 00000000..b22583da --- /dev/null +++ b/modules/local/publish_differential_manifests.nf @@ -0,0 +1,41 @@ +process PUBLISH_DIFFERENTIAL_MANIFESTS { + label 'process_single' + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path samples_manifest + path peaks_manifest + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv", emit: peaks + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + samples_src=\$(readlink -f ${samples_manifest}) + samples_dst=\$(readlink -f differential_manifest.samples.tsv) + if [[ "\$samples_src" != "\$samples_dst" ]]; then + cp ${samples_manifest} differential_manifest.samples.tsv + fi + + peaks_src=\$(readlink -f ${peaks_manifest}) + peaks_dst=\$(readlink -f differential_manifest.peaks.tsv) + if [[ "\$peaks_src" != "\$peaks_dst" ]]; then + cp ${peaks_manifest} differential_manifest.peaks.tsv + fi + + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') + { + echo "\"${task.process}\":" + echo " coreutils: \$coreutils_version" + } > versions.yml + """ +} diff --git a/modules/local/records_to_tsv.nf b/modules/local/records_to_tsv.nf new file mode 100644 index 00000000..36baf78b --- /dev/null +++ b/modules/local/records_to_tsv.nf @@ -0,0 +1,33 @@ +process RECORDS_TO_TSV { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + val key1 + val key2 + val records_json + val header + val filename + + output: + tuple val(key1), val(key2), path("${filename}"), emit: tsv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + records_to_tsv.py \ + --records '${records_json}' \ + --header '${header}' \ + --out '${filename}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/modules/local/rename_summary.nf b/modules/local/rename_summary.nf new file mode 100644 index 00000000..ea6fcc24 --- /dev/null +++ b/modules/local/rename_summary.nf @@ -0,0 +1,14 @@ +process RENAME_SUMMARY { + label 'process_single' + + input: + tuple val(prefix), path(summary) + + output: + path "${prefix}.summary.tsv", emit: summary + + script: + """ + cp ${summary} ${prefix}.summary.tsv + """ +} diff --git a/modules/local/span_diff_run.nf b/modules/local/span_diff_run.nf new file mode 100644 index 00000000..a28262f2 --- /dev/null +++ b/modules/local/span_diff_run.nf @@ -0,0 +1,72 @@ +process SPAN_DIFF_RUN { + label params.span_diff_mode == 'fallback' ? 'SPAN_FALLBACK' : 'SPAN_NATIVE' + + conda "conda-forge::python=3.8.3 conda-forge::openjdk=21.0.2 bioconda::samtools=1.16.1 bioconda::bedtools=2.31.0 conda-forge::r-base=4.2.3 bioconda::bioconductor-deseq2 bioconda::bioconductor-edger" + container workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? + null : + 'quay.io/biocontainers/biocontainers:1.2.0--py38_0' + + input: + tuple val(group), path(samples_manifest), path(peaks_manifest), path(chrom_sizes) + val contrast + val mode + val fdr + val gap + val bin_size + val fallback_backend + val use_spikein + val allow_partial + val caller_priority + path omnipeaks_jar + val java_heap + + output: + tuple val(group), path("span.differential.tsv"), emit: differential + tuple val(group), path("span.differential.peaks.bed"), emit: peaks_bed + tuple val(group), path("span.up.bed"), emit: up + tuple val(group), path("span.down.bed"), emit: down + tuple val(group), path("span.summary.tsv"), emit: summary + tuple val(group), path("span.mode.txt"), emit: mode_out + path "span_diff_target_pooling.tsv", optional: true, emit: pooling + tuple val(group), path("span.normalization_factors.tsv", optional: true), emit: norm_factors + path "*.pooled.bam", optional: true, emit: pooled_bam + path "*.pooled.bam.bai", optional: true, emit: pooled_bai + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def caller_arg = caller_priority ? "--caller-priority '${caller_priority}'" : '' + def pooling_dir = "${params.outdir}/03_peak_calling/08_differential/03_span/${group}" + """ + span_diff.py \ + --jar ${omnipeaks_jar} \ + --mode ${mode} \ + --contrast '${contrast}' \ + --group '${group}' \ + --samples ${samples_manifest} \ + --peaks ${peaks_manifest} \ + ${caller_arg} \ + --chrom-sizes ${chrom_sizes} \ + --bin ${bin_size} \ + --gap ${gap} \ + --fdr ${fdr} \ + --fallback-backend ${fallback_backend} \ + --use-spikein ${use_spikein} \ + --allow-partial ${allow_partial} \ + --java-heap ${java_heap} \ + --cpus ${task.cpus} \ + --pooling-dir '${pooling_dir}' \ + --outdir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\.)+[0-9]{1,}") + java: \$(java -version 2>&1 | head -n 1 | sed -e 's/"//g') + bedtools: \$(command -v bedtools >/dev/null 2>&1 && bedtools --version | sed 's/bedtools v//' || echo "NA") + samtools: \$(command -v samtools >/dev/null 2>&1 && samtools --version | head -n 1 | sed 's/samtools //' || echo "NA") + R: \$(command -v R >/dev/null 2>&1 && R --version | head -n 1 | sed 's/.* //' || echo "NA") + END_VERSIONS + """ +} diff --git a/modules/local/span_pooling_manifest.nf b/modules/local/span_pooling_manifest.nf new file mode 100644 index 00000000..1526c343 --- /dev/null +++ b/modules/local/span_pooling_manifest.nf @@ -0,0 +1,30 @@ +process SPAN_POOLING_MANIFEST { + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "quay.io/biocontainers/python:3.8.3" + + input: + path pooling_files + val do_run + + output: + path "span_diff_target_pooling.tsv", emit: manifest + path "versions.yml", emit: versions + + when: + (task.ext.when == null || task.ext.when) && do_run + + script: + def inputs_arg = pooling_files instanceof List ? pooling_files.join(' ') : pooling_files + """ + span_pooling_manifest.py \ + --inputs ${inputs_arg} \ + --out span_diff_target_pooling.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | grep -E -o "([0-9]{1,}\\\\.)+[0-9]{1,}") + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 39d9e100..3a68b2ad 100644 --- a/nextflow.config +++ b/nextflow.config @@ -91,6 +91,53 @@ params { span_java_heap = "8G" span_stringent_fdr = "1.0E-9" + // Differential analysis + run_diffbind = false + run_chipbinner = false + run_span_diff = false + differential_contrast = "treated,control" + differential_min_replicates = 2 + differential_use_spikein = "auto" + differential_allow_partial = false + differential_from_run = null + differential_publish_manifest_only = false + differential_groups = null + differential_callers = null + differential_multiqc_report = false + + // DiffBind options + diffbind_fdr = 0.05 + diffbind_lfc = 1.0 + diffbind_min_overlap = 2 + diffbind_backend = "DESeq2" + diffbind_recenter_peaks = false + diffbind_summits = 0 + diffbind_norm_method = "native" + diffbind_extra_params = null + export_diffbind_sheets = true + + // ChIPBinner options + chipbinner_ms_coeffs = null + chipbinner_bin_size = 10000 + chipbinner_windows_dir = null + chipbinner_use_input = false + chipbinner_pseudocount = 1 + chipbinner_hdbscan_grid_minpts = "100,200,500,1000" + chipbinner_hdbscan_grid_minsamps = "100,200,500,1000" + chipbinner_fdr = 0.05 + chipbinner_lfc = 1.0 + chipbinner_bootstrap = 1000 + chipbinner_k_value = 100000 + chipbinner_functional_db = null + + // SPAN differential options + span_diff_mode = "auto" + span_diff_fdr = 0.05 + span_diff_gap = 5 + span_diff_bin = 200 + span_diff_java_heap = "8G" + span_fallback_backend = "DESeq2" + // Consensus Peaks consensus_peak_mode = 'group' consensus_grouping = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c68e540..1b51175d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -329,14 +332,24 @@ "default": "Spikein", "fa_icon": "fab fa-buffer", "description": "Sets the target read normalisation mode. Options are: [\"Spikein\", \"RPKM\", \"CPM\", \"BPM\", \"None\" ]", - "enum": ["Spikein", "RPKM", "CPM", "BPM", "None"] + "enum": [ + "Spikein", + "RPKM", + "CPM", + "BPM", + "None" + ] }, "normalisation_scope": { "type": "string", "default": "all", "fa_icon": "fab fa-buffer", "description": "Scope for spike-in normalisation when normalisation_mode is Spikein. Options are: [all, group, group_condition].", - "enum": ["all", "group", "group_condition"] + "enum": [ + "all", + "group", + "group_condition" + ] }, "normalisation_binsize": { "type": "integer", @@ -354,7 +367,10 @@ "default": "standard", "fa_icon": "fas fa-align-justify", "description": "Convenience preset for peak callers. Options are: [standard, extended]. Ignored if --peakcaller is set.", - "enum": ["standard", "extended"] + "enum": [ + "standard", + "extended" + ] }, "use_control": { "type": "boolean", @@ -379,7 +395,11 @@ "default": "legacy", "fa_icon": "fas fa-align-justify", "description": "Control scaling scope for IgG/background samples. Options are: [legacy, group_condition, sample].", - "enum": ["legacy", "group_condition", "sample"] + "enum": [ + "legacy", + "group_condition", + "sample" + ] }, "seacr_peak_threshold": { "type": "number", @@ -392,14 +412,20 @@ "default": "non", "fa_icon": "fas fa-align-justify", "description": "SEACR normalization. ", - "enum": ["non", "norm"] + "enum": [ + "non", + "norm" + ] }, "seacr_stringent": { "type": "string", "default": "stringent", "fa_icon": "fas fa-align-justify", "description": "SEACR stringency.", - "enum": ["stringent", "relaxed"] + "enum": [ + "stringent", + "relaxed" + ] }, "macs2_qvalue": { "type": "number", @@ -463,13 +489,19 @@ "default": "group", "fa_icon": "fas fa-align-justify", "description": "Specifies what samples to group together for consensus peaks. Options are [group, all]", - "enum": ["group", "all"] + "enum": [ + "group", + "all" + ] }, "consensus_grouping": { "type": "string", "fa_icon": "fas fa-align-justify", "description": "Grouping key for consensus peaks when consensus_peak_mode is group. Options are [group, group_condition]. Defaults to group_condition when the samplesheet has a condition column, otherwise group.", - "enum": ["group", "group_condition"] + "enum": [ + "group", + "group_condition" + ] }, "replicate_threshold": { "type": "number", @@ -482,6 +514,241 @@ "default": true, "fa_icon": "fas fa-align-justify", "description": "Show gene names instead of symbols in IGV browser sessions" + }, + "run_diffbind": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable DiffBind differential binding analysis." + }, + "run_chipbinner": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable ChIPBinner differential analysis." + }, + "run_span_diff": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-exchange-alt", + "description": "Enable SPAN/OmniPeak differential analysis." + }, + "differential_contrast": { + "type": "string", + "default": "treated,control", + "fa_icon": "fas fa-align-justify", + "description": "Contrast definition for differential analysis (treated,control)." + }, + "differential_min_replicates": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-sort-numeric-down", + "description": "Minimum replicates per condition per group." + }, + "differential_use_spikein": { + "type": "string", + "default": "auto", + "fa_icon": "fas fa-align-justify", + "description": "Use spike-in scale factors for differential normalization (auto|true|false)." + }, + "differential_allow_partial": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Allow skipping invalid comparisons instead of failing." + }, + "differential_from_run": { + "type": "string", + "default": null, + "fa_icon": "fas fa-folder-open", + "description": "Prior run outdir for differential-only mode." + }, + "differential_publish_manifest_only": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-file-alt", + "description": "Generate differential manifests and design tables only." + }, + "differential_groups": { + "type": "string", + "default": null, + "fa_icon": "fas fa-filter", + "description": "Comma-separated list of groups to include in differential analysis." + }, + "differential_callers": { + "type": "string", + "default": null, + "fa_icon": "fas fa-filter", + "description": "Comma-separated list of peak callers to include in DiffBind." + }, + "differential_multiqc_report": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-chart-bar", + "description": "Generate a differential-only MultiQC report." + }, + "diffbind_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for DiffBind." + }, + "diffbind_lfc": { + "type": "number", + "default": 1.0, + "fa_icon": "fas fa-align-justify", + "description": "Absolute log2FC cutoff for DiffBind." + }, + "diffbind_min_overlap": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-align-justify", + "description": "Minimum overlap for DiffBind counting." + }, + "diffbind_backend": { + "type": "string", + "default": "DESeq2", + "fa_icon": "fas fa-align-justify", + "description": "Backend for DiffBind (DESeq2 or edgeR)." + }, + "diffbind_recenter_peaks": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Recenter peaks around summits in DiffBind." + }, + "diffbind_summits": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-align-justify", + "description": "Summit window width for DiffBind (0 uses full peaks)." + }, + "diffbind_norm_method": { + "type": "string", + "default": "native", + "fa_icon": "fas fa-align-justify", + "description": "DiffBind normalization method when not using spike-in." + }, + "diffbind_extra_params": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "JSON/YAML file with extra DiffBind parameters." + }, + "export_diffbind_sheets": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-align-justify", + "description": "Export DiffBind sample sheets." + }, + "chipbinner_ms_coeffs": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "Optional MS-based coefficients for ChIPBinner normalization." + }, + "chipbinner_bin_size": { + "type": "integer", + "default": 10000, + "fa_icon": "fas fa-align-justify", + "enum": [1000, 5000, 10000, 25000], + "description": "Bin size for ChIPBinner (1000, 5000, 10000, 25000)." + }, + "chipbinner_windows_dir": { + "type": "string", + "default": null, + "fa_icon": "fas fa-folder-open", + "description": "Directory with prebuilt ChIPBinner windows." + }, + "chipbinner_use_input": { + "type": "boolean", + "default": false, + "fa_icon": "fas fa-align-justify", + "description": "Use input/background for ChIPBinner (if available)." + }, + "chipbinner_pseudocount": { + "type": "integer", + "default": 1, + "fa_icon": "fas fa-align-justify", + "description": "Pseudocount added before log transforms in ChIPBinner." + }, + "chipbinner_hdbscan_grid_minpts": { + "type": "string", + "default": "100,200,500,1000", + "fa_icon": "fas fa-align-justify", + "description": "HDBSCAN minPts grid values for ChIPBinner." + }, + "chipbinner_hdbscan_grid_minsamps": { + "type": "string", + "default": "100,200,500,1000", + "fa_icon": "fas fa-align-justify", + "description": "HDBSCAN minSamps grid values for ChIPBinner." + }, + "chipbinner_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for ChIPBinner differential results." + }, + "chipbinner_lfc": { + "type": "number", + "default": 1.0, + "fa_icon": "fas fa-align-justify", + "description": "Absolute log2FC cutoff for ChIPBinner." + }, + "chipbinner_bootstrap": { + "type": "integer", + "default": 1000, + "fa_icon": "fas fa-align-justify", + "description": "ROTS bootstrap iterations for ChIPBinner." + }, + "chipbinner_k_value": { + "type": "integer", + "default": 100000, + "fa_icon": "fas fa-align-justify", + "description": "ROTS K parameter for ChIPBinner." + }, + "chipbinner_functional_db": { + "type": "string", + "default": null, + "fa_icon": "fas fa-align-justify", + "description": "Optional functional database for ChIPBinner enrichment." + }, + "span_diff_mode": { + "type": "string", + "default": "auto", + "fa_icon": "fas fa-align-justify", + "description": "SPAN differential mode (auto|native|fallback)." + }, + "span_diff_fdr": { + "type": "number", + "default": 0.05, + "fa_icon": "fas fa-align-justify", + "description": "FDR cutoff for SPAN differential." + }, + "span_diff_gap": { + "type": "integer", + "default": 5, + "fa_icon": "fas fa-align-justify", + "description": "Gap parameter for SPAN differential." + }, + "span_diff_bin": { + "type": "integer", + "default": 200, + "fa_icon": "fas fa-align-justify", + "description": "Bin size parameter for SPAN differential." + }, + "span_diff_java_heap": { + "type": "string", + "default": "8G", + "fa_icon": "fas fa-align-justify", + "description": "Java heap size for SPAN differential." + }, + "span_fallback_backend": { + "type": "string", + "default": "DESeq2", + "fa_icon": "fas fa-align-justify", + "description": "Backend for SPAN fallback differential (DESeq2 or edgeR)." } }, "fa_icon": "fas fa-cog" @@ -666,7 +933,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "singularity_pull_docker_container": { diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf new file mode 100644 index 00000000..06408c4f --- /dev/null +++ b/subworkflows/local/differential_peak_calling.nf @@ -0,0 +1,625 @@ +/* + * Differential peak calling subworkflow + */ + +import groovy.json.JsonOutput + +include { DIFFERENTIAL_MANIFESTS } from "../../modules/local/differential_manifests" +include { DIFFERENTIAL_DESIGN } from "../../modules/local/differential_design" +include { PUBLISH_DIFFERENTIAL_MANIFESTS } from "../../modules/local/publish_differential_manifests" +include { RECORDS_TO_TSV as RECORDS_TO_TSV_DIFFBIND } from "../../modules/local/records_to_tsv" +include { RECORDS_TO_TSV as RECORDS_TO_TSV_CHIPBINNER } from "../../modules/local/records_to_tsv" +include { DIFFBIND_RUN } from "../../modules/local/diffbind_run" +include { EXPORT_DIFFBIND_SAMPLESHEETS } from "../../modules/local/export_diffbind_samplesheets" +include { CHIPBINNER_RUN } from "../../modules/local/chipbinner_run" +include { CHIPBINNER_WINDOWS_CACHE } from "../../modules/local/chipbinner_windows_cache" +include { POOL_IGG_CONTROLS } from "../../modules/local/pool_igg_controls" +include { SPAN_DIFF_RUN } from "../../modules/local/span_diff_run" +include { SPAN_POOLING_MANIFEST } from "../../modules/local/span_pooling_manifest" +include { RENAME_SUMMARY as RENAME_SUMMARY_DIFFBIND } from "../../modules/local/rename_summary" +include { RENAME_SUMMARY as RENAME_SUMMARY_CHIPBINNER } from "../../modules/local/rename_summary" +include { RENAME_SUMMARY as RENAME_SUMMARY_SPAN } from "../../modules/local/rename_summary" +include { ANNOTATE_REGIONS } from "../../modules/local/annotate_regions" +include { GTF_TO_GENE_BED } from "../../modules/local/gtf_to_gene_bed" +include { BEDTOOLS_SORT as ANNOTATION_GENE_BED_SORT } from "../../modules/local/for_patch/bedtools/sort/main" +include { CACHE_DIFFERENTIAL_GENE_BED } from "../../modules/local/cache_differential_gene_bed" +include { DIFFERENTIAL_SUMMARY } from "../../modules/local/differential_summary" +include { MULTIQC_DIFFERENTIAL } from "../../modules/local/multiqc_differential" + +workflow DIFFERENTIAL_PEAK_CALLING { + take: + ch_bam + ch_bai + ch_bigwig + ch_peaks + ch_scale_factors + ch_chrom_sizes + ch_gene_bed + ch_gtf + ch_blacklist + ch_samples_manifest_in + ch_peaks_manifest_in + mode + + main: + ch_versions = Channel.empty() + ch_diff_summary_mqc = Channel.empty() + ch_diff_design_mqc = Channel.empty() + ch_annotation_requests = Channel.empty() + ch_diffbind_summary = Channel.empty() + ch_chipbinner_summary = Channel.empty() + ch_span_summary = Channel.empty() + ch_chipbinner_windows = Channel.empty() + ch_summary_files = Channel.empty() + ch_samples_manifest = Channel.empty() + ch_peaks_manifest = Channel.empty() + ch_use_spikein = Channel.empty() + summary_stub = file("$projectDir/assets/differential_summary_stub.tsv") + + def diff_enabled = params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only + if (diff_enabled) { + def use_manifests = (mode == 'posthoc') + def outdir_root = file(params.outdir).toString() + def diff_outdir = "${outdir_root}/03_peak_calling/08_differential" + + def bam_subdir = params.run_remove_linear_dups ? 'linear_dedup' : (params.run_remove_dups ? 'dedup' : (params.run_mark_dups ? 'markdup' : '')) + def bam_dir = bam_subdir ? "${outdir_root}/02_alignment/${params.aligner}/target/${bam_subdir}" : "${outdir_root}/02_alignment/${params.aligner}/target" + + def peaks_dir = { caller -> "${outdir_root}/03_peak_calling/04_called_peaks/${caller}" } + def bigwig_dir = "${outdir_root}/03_peak_calling/03_bed_to_bigwig" + + def peaks_format = { file -> + def name = file.getName().toLowerCase() + if (name.endsWith('.narrowpeak')) return 'narrowPeak' + if (name.endsWith('.broadpeak')) return 'broadPeak' + if (name.endsWith('.peak')) return 'peak' + if (name.endsWith('.bed')) return 'bed' + return 'bed' + } + + if (!use_manifests) { + ch_bam_target = ch_bam.filter { it[0].is_control == false } + ch_bai_target = ch_bai.filter { it[0].is_control == false } + ch_bigwig_target = ch_bigwig.filter { it[0].is_control == false } + ch_scale_target = ch_scale_factors.filter { it[0].is_control == false } + ch_bam_control = ch_bam.filter { it[0].is_control == true } + ch_bai_control = ch_bai.filter { it[0].is_control == true } + + ch_scale_map = ch_scale_target + .map { meta, scale -> [meta.id, scale] } + .collect() + .map { items -> + def out = [:] + items.each { entry -> + if (entry instanceof List && entry.size() > 1) { + out[entry[0]] = entry[1] + } + } + out + } + + ch_bigwig_map = ch_bigwig_target + .map { meta, bw -> [meta.id, bw] } + .collect() + .map { items -> + def out = [:] + items.each { entry -> + if (entry instanceof List && entry.size() > 1) { + out[entry[0]] = entry[1] + } + } + out + } + + def build_control_condition_map = { ch_control -> + ch_control + .map { meta, file -> [meta.group, meta.condition] } + .toList() + .map { list -> + def control_map = [:].withDefault { [] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + control_map[group] = (control_map[group] ?: []) + [condition] + } + control_map + } + } + + def add_control_condition = { ch_target, ch_control -> + def control_map_ch = build_control_condition_map(ch_control) + ch_target + .combine(control_map_ch) + .map { meta, file, control_map -> + def control_group = meta.control_group ?: meta.group + def conditions = control_map.get(control_group, []) + def control_condition = meta.condition + if (conditions) { + if (conditions.contains(meta.condition)) { + control_condition = meta.condition + } else if (conditions.contains('NA')) { + control_condition = 'NA' + } else { + control_condition = conditions.sort()[0] + } + } + def meta_out = meta + [control_condition: control_condition] + [meta_out, file] + } + } + + ch_bam_target_cc = add_control_condition(ch_bam_target, ch_bam_control) + + ch_input_map = Channel.value([:]) + if (params.chipbinner_use_input) { + ch_pooled_inputs = ch_bam_control + .map { meta, bam -> ["${meta.group}__${meta.condition}", [meta, bam]] } + .groupTuple(by: [0]) + .map { key, entries -> + def meta0 = entries[0][0] + def pooled_meta = meta0 + [ + id: "chipbinner_input_${meta0.group}_${meta0.condition}", + control_group: meta0.group, + is_control: true + ] + def bams = entries.collect { it[1] }.sort { it.getName() } + [pooled_meta, bams] + } + + POOL_IGG_CONTROLS ( ch_pooled_inputs ) { + ext.publish_dir = "${diff_outdir}/00_manifests" + } + ch_versions = ch_versions.mix(POOL_IGG_CONTROLS.out.versions) + + ch_pooled_controls = POOL_IGG_CONTROLS.out.bam + .map { meta, bam -> [meta.id, meta, bam] } + .join(POOL_IGG_CONTROLS.out.bai.map { meta, bai -> [meta.id, bai] }) + .map { id, meta, bam, bai -> [meta, bam, bai] } + + ch_input_map = ch_pooled_controls + .map { meta, bam, bai -> [meta.group, meta.condition, bam, bai] } + .toList() + .ifEmpty([]) + .map { list -> + def map = [:].withDefault { [:] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + def group_map = map[group] ?: [:] + group_map[condition] = [entry[2], entry[3]] + map[group] = group_map + } + map + } + } + + ch_samples_raw = ch_bam_target_cc + .map { meta, bam -> [meta.id, meta, bam] } + .join(ch_bai_target.map { meta, bai -> [meta.id, bai] }) + .map { id, meta, bam, bai -> [meta, bam, bai] } + .combine(ch_bigwig_map) + .combine(ch_scale_map) + .combine(ch_input_map) + .map { meta, bam, bai, bw_map, scale_map, input_map -> + def bigwig = bw_map.get(meta.id) ?: 'NA' + def scale = scale_map.get(meta.id) ?: 'NA' + def bam_path = "${bam_dir}/${bam.getName()}" + def bai_path = "${bam_dir}/${bai.getName()}" + def bigwig_path = bigwig == 'NA' ? 'NA' : "${bigwig_dir}/${bigwig.getName()}" + def control_group = meta.control_group ?: meta.group + def control_condition = meta.control_condition ?: meta.condition + def input_entry = input_map.get(control_group, [:]).get(control_condition) + def input_bam_path = 'NA' + def input_bai_path = 'NA' + if (input_entry) { + def input_bam = input_entry[0] + def input_bai = input_entry[1] + if (input_bam) { + input_bam_path = "${diff_outdir}/00_manifests/${input_bam.getName()}" + } + if (input_bai) { + input_bai_path = "${diff_outdir}/00_manifests/${input_bai.getName()}" + } + } + [ + sample_id: meta.id, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate, + final_bam: bam_path, + final_bai: bai_path, + spikein_scale_factor: scale, + bigwig_path: bigwig_path, + input_bam: input_bam_path, + input_bai: input_bai_path + ] + } + .map { row -> + "${row.sample_id}\t${row.group}\t${row.condition}\t${row.replicate}\t${row.final_bam}\t${row.final_bai}\t${row.spikein_scale_factor}\t${row.bigwig_path}\t${row.input_bam}\t${row.input_bai}" + } + + def samples_header = "sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tspikein_scale_factor\tbigwig_path\tinput_bam\tinput_bai" + ch_samples_raw = ch_samples_raw + .collect() + .map { rows -> ([samples_header] + rows).join('\n') + '\n' } + .collectFile(name: 'differential_samples_raw.tsv', newLine: false) + + ch_peaks_raw = ch_peaks + .map { meta, peak_file -> + def peak_path = "${peaks_dir(meta.caller)}/${peak_file.getName()}" + def fmt = peaks_format(peak_file) + "${meta.id}\t${meta.group}\t${meta.condition}\t${meta.replicate}\t${meta.caller}\t${peak_path}\t${fmt}" + } + + def peaks_header = "sample_id\tgroup\tcondition\treplicate\tcaller\tpeaks_path\tpeaks_format" + ch_peaks_raw = ch_peaks_raw + .collect() + .map { rows -> ([peaks_header] + rows).join('\n') + '\n' } + .collectFile(name: 'differential_peaks_raw.tsv', newLine: false) + + DIFFERENTIAL_MANIFESTS ( + ch_samples_raw, + ch_peaks_raw, + params.chipbinner_ms_coeffs ? file(params.chipbinner_ms_coeffs) : file("$projectDir/assets/ms_coeffs_stub.tsv"), + params.normalisation_mode + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_MANIFESTS.out.versions) + ch_samples_manifest = DIFFERENTIAL_MANIFESTS.out.samples + ch_peaks_manifest = DIFFERENTIAL_MANIFESTS.out.peaks + } else { + ch_samples_manifest = ch_samples_manifest_in + ch_peaks_manifest = ch_peaks_manifest_in + } + + DIFFERENTIAL_DESIGN ( + ch_samples_manifest, + ch_peaks_manifest, + params.differential_contrast, + params.differential_min_replicates, + params.differential_allow_partial, + params.run_diffbind, + params.run_chipbinner, + params.run_span_diff, + params.differential_groups ?: '', + params.differential_callers ?: '' + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_DESIGN.out.versions) + + PUBLISH_DIFFERENTIAL_MANIFESTS ( + ch_samples_manifest, + ch_peaks_manifest + ) + ch_versions = ch_versions.mix(PUBLISH_DIFFERENTIAL_MANIFESTS.out.versions) + + ch_samples_rows = ch_samples_manifest.splitCsv(header: true, sep: '\t') + ch_peaks_rows = ch_peaks_manifest.splitCsv(header: true, sep: '\t') + ch_design_rows = DIFFERENTIAL_DESIGN.out.design.splitCsv(header: true, sep: '\t') + + def annotation_enabled = (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_publish_manifest_only + ch_gene_bed_single = Channel.empty() + if (annotation_enabled) { + GTF_TO_GENE_BED ( ch_gtf ) + ch_versions = ch_versions.mix(GTF_TO_GENE_BED.out.versions) + + ANNOTATION_GENE_BED_SORT ( + GTF_TO_GENE_BED.out.bed.map { bed -> [ [id: bed.baseName], bed ] }, + "bed", + [] + ) + ch_versions = ch_versions.mix(ANNOTATION_GENE_BED_SORT.out.versions) + + ch_gene_bed_from_gtf = ANNOTATION_GENE_BED_SORT.out.sorted.map { it[1] } + ch_gene_bed_selected = ch_gene_bed.ifEmpty(ch_gene_bed_from_gtf) + + ch_gene_bed_single = ch_gene_bed_selected + .collect() + .map { beds -> + if (!beds || beds.size() == 0) { + exit 1, "Differential annotation requires --gene_bed or --gtf." + } + beds[0] + } + + CACHE_DIFFERENTIAL_GENE_BED ( ch_gene_bed_single ) + ch_versions = ch_versions.mix(CACHE_DIFFERENTIAL_GENE_BED.out.versions) + ch_gene_bed_single = CACHE_DIFFERENTIAL_GENE_BED.out.bed.first() + } + + def contrast_labels = params.differential_contrast.split(',').collect { it.trim() } + ch_samples_rows_contrast = ch_samples_rows.filter { row -> contrast_labels.contains(row.condition) } + + def diff_use_spikein = params.differential_use_spikein ? params.differential_use_spikein.toString().toLowerCase() : 'auto' + def use_spikein_default = (diff_use_spikein == 'true') || (diff_use_spikein == 'auto' && params.normalisation_mode == 'Spikein') + ch_use_spikein = Channel.value(use_spikein_default) + if (use_manifests && diff_use_spikein == 'auto') { + ch_use_spikein = ch_samples_rows + .map { row -> row.normalisation_mode ?: '' } + .collect() + .map { modes -> + def uniq = modes.findAll { it }.collect { it.toString() }.unique() + uniq.contains('Spikein') + } + } + def use_spikein_consumers = [params.run_diffbind, params.run_chipbinner, params.run_span_diff].count { it } + if (!params.differential_publish_manifest_only && use_spikein_consumers > 1) { + ch_use_spikein = (ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel) + ? ch_use_spikein.broadcast() + : Channel.value(ch_use_spikein).broadcast() + } else if (!(ch_use_spikein instanceof groovyx.gpars.dataflow.DataflowReadChannel)) { + ch_use_spikein = Channel.value(ch_use_spikein) + } + + if (params.run_diffbind && !params.differential_publish_manifest_only) { + ch_diffbind_design = ch_design_rows + .filter { row -> row.caller != 'NA' && row.status == 'RUN' && row.eligible_diffbind == 'true' } + .map { row -> [ [row.group, row.caller], row ] } + + ch_sample_peak_records = ch_samples_rows_contrast + .map { row -> [row.sample_id, row] } + .join(ch_peaks_rows.map { row -> [row.sample_id, row] }) + .map { sample_id, srow, prow -> + def record = [ + group: srow.group, + caller: prow.caller, + sample_id: srow.sample_id, + condition: srow.condition, + replicate: srow.replicate, + bam: srow.final_bam, + peaks: prow.peaks_path, + spikein_scale_factor: srow.spikein_scale_factor + ] + [ [record.group, record.caller], record ] + } + .groupTuple(by: [0]) + .map { key, recs -> [ key[0], key[1], recs ] } + + ch_diffbind_records = ch_sample_peak_records + .map { group, caller, recs -> [ [group, caller], recs ] } + .join(ch_diffbind_design) + .map { key, recs, row -> [ key[0], key[1], recs ] } + + ch_diffbind_records + .map { group, caller, recs -> [ group, caller, JsonOutput.toJson(recs) ] } + .set { ch_diffbind_records_json } + + RECORDS_TO_TSV_DIFFBIND ( + ch_diffbind_records_json.map { it[0] }, + ch_diffbind_records_json.map { it[1] }, + ch_diffbind_records_json.map { it[2] }, + 'sample_id\tgroup\tcondition\treplicate\tbam\tpeaks\tcaller\tspikein_scale_factor', + ch_diffbind_records_json.map { group, caller, json -> "diffbind_${caller}_${group}.tsv" } + ) + ch_versions = ch_versions.mix(RECORDS_TO_TSV_DIFFBIND.out.versions) + + DIFFBIND_RUN ( + RECORDS_TO_TSV_DIFFBIND.out.tsv, + params.differential_contrast, + ch_use_spikein, + params.diffbind_fdr, + params.diffbind_lfc, + params.diffbind_min_overlap, + params.diffbind_backend, + params.diffbind_recenter_peaks, + params.diffbind_summits, + params.diffbind_norm_method, + params.diffbind_extra_params ? file(params.diffbind_extra_params).toString() : '', + params.export_diffbind_sheets, + params.differential_allow_partial + ) + ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) + + if (params.export_diffbind_sheets) { + EXPORT_DIFFBIND_SAMPLESHEETS ( + DIFFBIND_RUN.out.samplesheet_export + ) + } + + if (annotation_enabled) { + ch_diffbind_annot = DIFFBIND_RUN.out.results + .combine(ch_gene_bed_single) + .map { group, caller, regions, gene_bed -> + [ 'diffbind', group, caller, regions, gene_bed, "01_diffbind/${caller}/${group}/diffbind.results.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_diffbind_annot) + } + RENAME_SUMMARY_DIFFBIND ( + DIFFBIND_RUN.out.summary.map { group, caller, summary -> + def prefix = "${caller}_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_diffbind_summary = RENAME_SUMMARY_DIFFBIND.out.summary + } + + if (params.run_chipbinner && !params.differential_publish_manifest_only) { + ch_chip_design = ch_design_rows + .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_chipbinner == 'true' } + .map { row -> [row.group, row] } + + def chipbinner_windows_dir = params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir).toString() : '' + def chipbinner_blacklist = params.blacklist ? file(params.blacklist).toString() : '' + CHIPBINNER_WINDOWS_CACHE ( + ch_chrom_sizes.collect().map { it instanceof List ? it[0] : it }, + params.chipbinner_bin_size, + chipbinner_windows_dir, + chipbinner_blacklist + ) + ch_versions = ch_versions.mix(CHIPBINNER_WINDOWS_CACHE.out.versions) + ch_chipbinner_windows = CHIPBINNER_WINDOWS_CACHE.out.windows.first() + + ch_chip_records = ch_samples_rows_contrast + .map { row -> [row.group, row] } + .groupTuple(by: [0]) + .join(ch_chip_design) + .map { item -> [ item[0], item[1] ] } + + ch_chip_records + .map { item -> [ item[0], JsonOutput.toJson(item[1]) ] } + .set { ch_chip_records_json } + + RECORDS_TO_TSV_CHIPBINNER ( + ch_chip_records_json.map { item -> item[0] }, + ch_chip_records_json.map { 'NA' }, + ch_chip_records_json.map { item -> item[1] }, + 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai', + ch_chip_records_json.map { item -> "chipbinner_${item[0]}.tsv" } + ) + ch_versions = ch_versions.mix(RECORDS_TO_TSV_CHIPBINNER.out.versions) + + ch_chip_records_file = RECORDS_TO_TSV_CHIPBINNER.out.tsv + .map { item -> [item[0], item[2]] } + + ch_chip_inputs = ch_chip_records_file + .combine(ch_chrom_sizes.collect().map { it instanceof List ? it[0] : it }) + .combine(ch_chipbinner_windows) + .map { item -> + def group = item[0] + def records = item[1] + def chrom_sizes = item[2] + def windows = item[3] + [ group, records, chrom_sizes, windows ] + } + + CHIPBINNER_RUN ( + ch_chip_inputs, + params.differential_contrast, + workflow.launchDir, + params.chipbinner_bin_size, + '', + '', + params.chipbinner_use_input, + ch_use_spikein, + params.chipbinner_pseudocount, + params.chipbinner_hdbscan_grid_minpts, + params.chipbinner_hdbscan_grid_minsamps, + params.chipbinner_fdr, + params.chipbinner_lfc, + params.chipbinner_bootstrap, + params.chipbinner_k_value, + params.chipbinner_functional_db ? file(params.chipbinner_functional_db).toString() : '', + params.differential_allow_partial + ) + ch_versions = ch_versions.mix(CHIPBINNER_RUN.out.versions) + + if (annotation_enabled) { + ch_chip_annot = CHIPBINNER_RUN.out.differential + .combine(ch_gene_bed_single) + .map { group, regions, gene_bed -> + [ 'chipbinner', group, 'NA', regions, gene_bed, "02_chipbinner/${group}/chipbinner.differential.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_chip_annot) + } + RENAME_SUMMARY_CHIPBINNER ( + CHIPBINNER_RUN.out.summary.map { group, summary -> + def prefix = "chipbinner_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_chipbinner_summary = RENAME_SUMMARY_CHIPBINNER.out.summary + } + + if (params.run_span_diff && !params.differential_publish_manifest_only) { + ch_span_design = ch_design_rows + .filter { row -> row.caller == 'NA' && row.status == 'RUN' && row.eligible_span == 'true' } + .map { row -> row.group } + + def span_caller_priority = (params.callers ?: []) + .collect { it.toString().toLowerCase() } + .findAll { it.startsWith('span') || it.startsWith('omnipeak') } + .join(',') + + ch_samples_manifest_single = ch_samples_manifest.first() + ch_peaks_manifest_single = ch_peaks_manifest.first() + span_chrom_sizes = ch_chrom_sizes + if (params.span_diff_mode == 'fallback') { + span_chrom_sizes = ch_chrom_sizes.ifEmpty(file("$projectDir/assets/chrom_sizes_stub.sizes")) + } + ch_span_chrom_sizes_single = span_chrom_sizes.collect().map { it instanceof List ? it[0] : it } + + ch_span_inputs = ch_span_design + .combine(ch_samples_manifest_single) + .combine(ch_peaks_manifest_single) + .combine(ch_span_chrom_sizes_single) + .map { group, samples_manifest, peaks_manifest, chrom_sizes -> [ group, samples_manifest, peaks_manifest, chrom_sizes ] } + + SPAN_DIFF_RUN ( + ch_span_inputs, + params.differential_contrast, + params.span_diff_mode, + params.span_diff_fdr, + params.span_diff_gap, + params.span_diff_bin, + params.span_fallback_backend, + ch_use_spikein, + params.differential_allow_partial, + span_caller_priority, + params.omnipeaks_jar ? file(params.omnipeaks_jar) : null, + params.span_diff_java_heap + ) + ch_versions = ch_versions.mix(SPAN_DIFF_RUN.out.versions) + + ch_span_pooling_files = SPAN_DIFF_RUN.out.pooling + ch_span_pooling_list = ch_span_pooling_files.collect() + ch_span_pooling_flag = ch_span_pooling_list.map { it instanceof List && it.size() > 0 } + + SPAN_POOLING_MANIFEST ( + ch_span_pooling_list, + ch_span_pooling_flag + ) + ch_versions = ch_versions.mix(SPAN_POOLING_MANIFEST.out.versions) + + if (annotation_enabled) { + ch_span_annot = SPAN_DIFF_RUN.out.differential + .combine(ch_gene_bed_single) + .map { group, regions, gene_bed -> + [ 'span', group, 'NA', regions, gene_bed, "03_span/${group}/span.differential.annotated.tsv" ] + } + + ch_annotation_requests = ch_annotation_requests.mix(ch_span_annot) + } + RENAME_SUMMARY_SPAN ( + SPAN_DIFF_RUN.out.summary.map { group, summary -> + def prefix = "span_${group}".replaceAll(/[^A-Za-z0-9_.-]+/, "_") + [ prefix, summary ] + } + ) + ch_span_summary = RENAME_SUMMARY_SPAN.out.summary + } + + if (annotation_enabled) { + ANNOTATE_REGIONS ( + ch_annotation_requests + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + + ch_summary_files = ch_diffbind_summary.mix(ch_chipbinner_summary).mix(ch_span_summary) + DIFFERENTIAL_SUMMARY ( + DIFFERENTIAL_DESIGN.out.design, + ch_summary_files.collect().map { it + [summary_stub] }.ifEmpty([summary_stub]), + params.differential_publish_manifest_only, + file("$projectDir/assets/multiqc/differential_summary_header.txt"), + file("$projectDir/assets/multiqc/differential_design_header.txt"), + file("$projectDir/assets/multiqc/chipbinner_summary_header.txt"), + file("$projectDir/assets/multiqc/span_summary_header.txt") + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_SUMMARY.out.versions) + ch_diff_summary_mqc = DIFFERENTIAL_SUMMARY.out.summary + ch_diff_design_mqc = DIFFERENTIAL_SUMMARY.out.design + + if (params.differential_multiqc_report) { + MULTIQC_DIFFERENTIAL ( + file("$projectDir/assets/multiqc_differential_config.yml"), + diff_outdir + ) + ch_versions = ch_versions.mix(MULTIQC_DIFFERENTIAL.out.versions) + } + } + + emit: + versions = ch_versions + summary_mqc = ch_diff_summary_mqc + design_mqc = ch_diff_design_mqc +} diff --git a/subworkflows/local/prepare_peakcalling.nf b/subworkflows/local/prepare_peakcalling.nf index d9fcc019..b3be4a47 100644 --- a/subworkflows/local/prepare_peakcalling.nf +++ b/subworkflows/local/prepare_peakcalling.nf @@ -24,6 +24,7 @@ workflow PREPARE_PEAKCALLING { main: ch_versions = Channel.empty() ch_bedgraph = Channel.empty() + ch_scale_factors = Channel.empty() def norm_scope = normalisation_scope ?: 'all' def igg_scope = igg_scale_scope ?: 'legacy' def median = { List values -> @@ -92,6 +93,10 @@ workflow PREPARE_PEAKCALLING { // EXAMPLE CHANNEL STRUCT: [id, scale_factor] //ch_bam_scale_factor | view + ch_bam_scale_factor_report + .map { meta, bam, scale, reads, scope_id -> [ meta, scale ] } + .set { ch_scale_factors } + if (params.dump_scale_factors) { ch_bam_scale_factor_report .map { meta, bam, scale, reads, scope_id -> @@ -123,6 +128,9 @@ workflow PREPARE_PEAKCALLING { } .set { ch_bam_scale_factor } //ch_bam_scale_factor | view + + ch_bam.map { row -> [ row[0], 'NA' ] } + .set { ch_scale_factors } } if (norm_mode == "Spikein" || norm_mode == "None") { @@ -264,6 +272,10 @@ workflow PREPARE_PEAKCALLING { // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] //ch_bedgraph | view + ch_bam_bai_scale_factor + .map { meta, bam, bai, scale -> [ meta, 'NA' ] } + .set { ch_scale_factors } + } /* @@ -301,5 +313,6 @@ workflow PREPARE_PEAKCALLING { emit: bedgraph = UCSC_BEDCLIP.out.bedgraph // channel: [ val(meta), [ bedgraph ] ] bigwig = UCSC_BEDGRAPHTOBIGWIG.out.bigwig // channel: [ val(meta), [ bigwig ] ] + scale_factors = ch_scale_factors // channel: [ val(meta), scale_factor ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..f2dbbade --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,9 @@ +sample_id group condition caller peaks_path peaks_format replicate +g1_treat g1 Treatment seacr NA bed 1 +g1_ctrl g1 Control seacr NA bed 1 +g2_treat g2 Treatment seacr NA bed 1 +g2_ctrl g2 Control seacr NA bed 1 +g1_treat g1 Treatment span_default NA bed 1 +g1_ctrl g1 Control span_default NA bed 1 +g2_treat g2 Treatment span_default NA bed 1 +g2_ctrl g2 Control span_default NA bed 1 diff --git a/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..6cd69a30 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,5 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path input_bam input_bai +g1_treat g1 Treatment 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai +g1_ctrl g1 Control 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai +g2_treat g2 Treatment 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai +g2_ctrl g2 Control 1 NA NA NA NA NA NA tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g1.bam.bai @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/assets/diff_run_chipbinner_multi/inputs/input_g2.bam.bai @@ -0,0 +1 @@ +placeholder diff --git a/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..6d54a533 --- /dev/null +++ b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,5 @@ +sample_id group condition caller peaks_path peaks_format replicate +sample1 h3k27me3 Treatment seacr NA bed 1 +sample2 h3k27me3 Control seacr NA bed 1 +sample1 h3k27me3 Treatment span_default NA bed 1 +sample2 h3k27me3 Control span_default NA bed 1 diff --git a/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..b7cdac03 --- /dev/null +++ b/tests/assets/diff_run_small/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +sample1 h3k27me3 Treatment 1 NA NA NA NA NA NA +sample2 h3k27me3 Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_design/insufficient_reps.peaks.tsv b/tests/assets/differential_design/insufficient_reps.peaks.tsv new file mode 100644 index 00000000..0a627b39 --- /dev/null +++ b/tests/assets/differential_design/insufficient_reps.peaks.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate caller peaks_path peaks_format +g1_t1 group_low Treatment 1 seacr /path/peaks1.bed bed +g1_c1 group_low Control 1 seacr /path/peaks2.bed bed diff --git a/tests/assets/differential_design/insufficient_reps.samples.tsv b/tests/assets/differential_design/insufficient_reps.samples.tsv new file mode 100644 index 00000000..c1726e33 --- /dev/null +++ b/tests/assets/differential_design/insufficient_reps.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +g1_t1 group_low Treatment 1 NA NA NA NA NA NA +g1_c1 group_low Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_design/missing_condition.peaks.tsv b/tests/assets/differential_design/missing_condition.peaks.tsv new file mode 100644 index 00000000..aed825a0 --- /dev/null +++ b/tests/assets/differential_design/missing_condition.peaks.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate caller peaks_path peaks_format +g1_t1 group_missing Treatment 1 seacr /path/peaks1.bed bed +g2_t1 group_ok Treatment 1 seacr /path/peaks2.bed bed +g2_c1 group_ok Control 1 seacr /path/peaks3.bed bed diff --git a/tests/assets/differential_design/missing_condition.samples.tsv b/tests/assets/differential_design/missing_condition.samples.tsv new file mode 100644 index 00000000..b4a6de66 --- /dev/null +++ b/tests/assets/differential_design/missing_condition.samples.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +g1_t1 group_missing Treatment 1 NA NA NA NA NA NA +g2_t1 group_ok Treatment 1 NA NA NA NA NA NA +g2_c1 group_ok Control 1 NA NA NA NA NA NA diff --git a/tests/assets/differential_manifest_input/peaks.tsv b/tests/assets/differential_manifest_input/peaks.tsv new file mode 100644 index 00000000..671e2f5f --- /dev/null +++ b/tests/assets/differential_manifest_input/peaks.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate caller peaks_path peaks_format +s3 groupA Control 1 seacr peaks3.bed bed +s1 groupA Treatment 1 macs2 peaks1.bed bed +s2 groupB Control 2 seacr peaks2.bed bed diff --git a/tests/assets/differential_manifest_input/samples.tsv b/tests/assets/differential_manifest_input/samples.tsv new file mode 100644 index 00000000..dce28fcf --- /dev/null +++ b/tests/assets/differential_manifest_input/samples.tsv @@ -0,0 +1,4 @@ +sample_id group condition replicate final_bam final_bai spikein_scale_factor bigwig_path +s2 groupB Control 2 bam2.bam bam2.bai NA bw2.bw +s1 groupA Treatment 1 bam1.bam bam1.bai NA bw1.bw +s3 groupA Control 1 bam3.bam bam3.bai NA bw3.bw diff --git a/tests/assets/minimal.bed b/tests/assets/minimal.bed new file mode 100644 index 00000000..924ff215 --- /dev/null +++ b/tests/assets/minimal.bed @@ -0,0 +1 @@ +chr1 1 100 Gene1 0 + diff --git a/tests/assets/minimal.fa b/tests/assets/minimal.fa new file mode 100644 index 00000000..9fc8d644 --- /dev/null +++ b/tests/assets/minimal.fa @@ -0,0 +1,2 @@ +>chr1 +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT diff --git a/tests/assets/minimal.gtf b/tests/assets/minimal.gtf new file mode 100644 index 00000000..e078b66c --- /dev/null +++ b/tests/assets/minimal.gtf @@ -0,0 +1 @@ +chr1 source gene 1 100 . + . gene_id "gene1"; gene_name "Gene1"; diff --git a/tests/assets/span_mode/control.bam b/tests/assets/span_mode/control.bam new file mode 100644 index 00000000..8a0b4440 Binary files /dev/null and b/tests/assets/span_mode/control.bam differ diff --git a/tests/assets/span_mode/control.bam.bai b/tests/assets/span_mode/control.bam.bai new file mode 100644 index 00000000..c7a34619 Binary files /dev/null and b/tests/assets/span_mode/control.bam.bai differ diff --git a/tests/assets/span_mode/peaks.tsv b/tests/assets/span_mode/peaks.tsv new file mode 100644 index 00000000..794bb50d --- /dev/null +++ b/tests/assets/span_mode/peaks.tsv @@ -0,0 +1,3 @@ +sample_id group condition caller peaks_path peaks_format replicate +s1 h3k27me3 Treatment span_default tests/assets/span_mode/s1.peaks.bed bed 1 +s2 h3k27me3 Control span_default tests/assets/span_mode/s2.peaks.bed bed 1 diff --git a/tests/assets/span_mode/s1.peaks.bed b/tests/assets/span_mode/s1.peaks.bed new file mode 100644 index 00000000..8948009c --- /dev/null +++ b/tests/assets/span_mode/s1.peaks.bed @@ -0,0 +1,2 @@ +chr1 1 100 +chr1 200 300 diff --git a/tests/assets/span_mode/s2.peaks.bed b/tests/assets/span_mode/s2.peaks.bed new file mode 100644 index 00000000..9b82b9c7 --- /dev/null +++ b/tests/assets/span_mode/s2.peaks.bed @@ -0,0 +1,2 @@ +chr1 50 150 +chr1 250 350 diff --git a/tests/assets/span_mode/samples.tsv b/tests/assets/span_mode/samples.tsv new file mode 100644 index 00000000..0c3fd595 --- /dev/null +++ b/tests/assets/span_mode/samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor ms_coeff bigwig_path +s1 h3k27me3 Treatment 1 tests/assets/span_mode/treatment.bam tests/assets/span_mode/treatment.bam.bai NA NA NA NA +s2 h3k27me3 Control 1 tests/assets/span_mode/control.bam tests/assets/span_mode/control.bam.bai NA NA NA NA diff --git a/tests/assets/span_mode/treatment.bam b/tests/assets/span_mode/treatment.bam new file mode 100644 index 00000000..a3d473b4 Binary files /dev/null and b/tests/assets/span_mode/treatment.bam differ diff --git a/tests/assets/span_mode/treatment.bam.bai b/tests/assets/span_mode/treatment.bam.bai new file mode 100644 index 00000000..c7a34619 Binary files /dev/null and b/tests/assets/span_mode/treatment.bam.bai differ diff --git a/tests/test_07_differential.yml b/tests/test_07_differential.yml new file mode 100644 index 00000000..e80d2c92 --- /dev/null +++ b/tests/test_07_differential.yml @@ -0,0 +1,178 @@ +- name: test_differential_missing_contrast + command: nextflow run main.nf -profile docker,test --run_diffbind --differential_contrast Treated --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_missing_contrast + exit_code: 1 + +- name: test_span_diff_missing_jar_integrated + command: nextflow run main.nf -profile docker,test --run_span_diff --differential_contrast Treatment,Control --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq -c tests/config/nextflow.config + tags: + - test_differential + - test_span_diff_missing_jar_integrated + exit_code: 1 + stdout: + contains: + - "omnipeaks_jar" + - "SPAN differential requested" + +- name: test_differential_manifests_only + command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_multiqc --skip_preseq --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_publish_manifest_only --input tests/assets/samplesheet_condition.csv -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_manifests_only + files: + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.samples.tsv + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.peaks.tsv + - path: results/03_peak_calling/08_differential/00_manifests/differential_manifest.design.tsv + - path: results/03_peak_calling/08_differential/multiqc/differential_summary_mqc.tsv + contains: + - "manifest_only" + - "SKIP" + must_not_contain: + - "RUN\tok" + - path: results/03_peak_calling/08_differential/multiqc/differential_design_mqc.tsv + +- name: test_differential_design_missing_condition + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/missing_condition.samples.tsv --peaks tests/assets/differential_design/missing_condition.peaks.tsv --contrast Treatment,Control --min-replicates 1 --run-diffbind --run-chipbinner --run-span --out tests/output/design_missing_condition.tsv && python3 bin/differential_summary.py --design tests/output/design_missing_condition.tsv --out tests/output/summary_missing_condition.tsv" + tags: + - test_differential + - test_differential_design + files: + - path: tests/output/design_missing_condition.tsv + contains: + - "missing_condition" + - "SKIP" + - path: tests/output/summary_missing_condition.tsv + contains: + - "missing_condition" + - "SKIP" + +- name: test_differential_design_insufficient_reps_fail + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/insufficient_reps.samples.tsv --peaks tests/assets/differential_design/insufficient_reps.peaks.tsv --contrast Treatment,Control --min-replicates 2 --run-diffbind --out tests/output/design_insufficient_fail.tsv" + tags: + - test_differential + - test_differential_design + exit_code: 1 + stderr: + contains: + - "FAIL entries" + +- name: test_differential_design_insufficient_reps_allow_partial + command: bash -c "mkdir -p tests/output && python3 bin/differential_design.py --samples tests/assets/differential_design/insufficient_reps.samples.tsv --peaks tests/assets/differential_design/insufficient_reps.peaks.tsv --contrast Treatment,Control --min-replicates 2 --allow-partial --run-diffbind --out tests/output/design_insufficient_skip.tsv" + tags: + - test_differential + - test_differential_design + files: + - path: tests/output/design_insufficient_skip.tsv + contains: + - "insufficient_replicates" + - "SKIP" + +- name: test_differential_summary_preserves_skip + command: bash -c "mkdir -p tests/output/summary_override && printf '%b' 'group\tcaller\ttreated_condition\tcontrol_condition\tstatus\treason\nh3k27me3\tNA\tTreatment\tControl\tSKIP\tmanual_skip\n' > tests/output/summary_override/design.tsv && printf '%b' 'group\tcaller\ttreated\tcontrol\tn_bins_tested\tn_fdr_pass\tn_up\tn_down\tn_clusters\tchosen_minPts\tchosen_minSamps\tstatus\treason\tenrichment_status\nh3k27me3\tNA\tTreatment\tControl\t10\t1\t1\t0\t2\t100\t100\tRUN\tok\tNOT_RUN\n' > tests/output/summary_override/chipbinner.summary.tsv && python3 bin/differential_summary.py --design tests/output/summary_override/design.tsv --summaries tests/output/summary_override/chipbinner.summary.tsv --out tests/output/summary_override/summary.tsv" + tags: + - test_differential + - test_differential_summary + files: + - path: tests/output/summary_override/summary.tsv + contains: + - "SKIP" + - "manual_skip" + +- name: test_differential_manifests_deterministic + command: bash -c "rm -rf tests/output/manifest_run1 tests/output/manifest_run2 && mkdir -p tests/output/manifest_run1 tests/output/manifest_run2 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run1 && python3 bin/differential_manifests.py --samples tests/assets/differential_manifest_input/samples.tsv --peaks tests/assets/differential_manifest_input/peaks.tsv --normalisation-mode NA --outdir tests/output/manifest_run2 && diff -q tests/output/manifest_run1/differential_manifest.samples.tsv tests/output/manifest_run2/differential_manifest.samples.tsv && diff -q tests/output/manifest_run1/differential_manifest.peaks.tsv tests/output/manifest_run2/differential_manifest.peaks.tsv" + tags: + - test_differential + - test_differential_manifest + files: + - path: tests/output/manifest_run1/differential_manifest.samples.tsv + - path: tests/output/manifest_run1/differential_manifest.peaks.tsv + +- name: test_diffbind_samplesheet_publish_disabled + command: nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --gene_bed tests/assets/minimal.bed --export_diffbind_sheets false --outdir results_diffbind_off -c tests/config/nextflow.config + tags: + - test_differential + - test_diffbind_samplesheet + files: + - path: results_diffbind_off/03_peak_calling/08_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.csv + should_exist: false + +- name: test_diffbind_samplesheet_publish_enabled + command: nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_diffbind --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --gene_bed tests/assets/minimal.bed --export_diffbind_sheets true --outdir results_diffbind_on -c tests/config/nextflow.config + tags: + - test_differential + - test_diffbind_samplesheet + files: + - path: results_diffbind_on/03_peak_calling/08_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.csv + +- name: test_span_mode_native_requires_compare + command: bash -c "mkdir -p tests/output/span_native && python3 bin/span_diff.py --jar tests/assets/omnipeaks_stub.jar --mode native --contrast Treatment,Control --group h3k27me3 --samples tests/assets/span_mode/samples.tsv --peaks tests/assets/span_mode/peaks.tsv --caller-priority span_default --outdir tests/output/span_native" + tags: + - test_differential + - test_span_mode + exit_code: 1 + +- name: test_span_mode_auto_fallback + command: bash -c "mkdir -p tests/output/span_auto && python3 bin/span_diff.py --jar tests/assets/omnipeaks_stub.jar --mode auto --contrast Treatment,Control --group h3k27me3 --samples tests/assets/span_mode/samples.tsv --peaks tests/assets/span_mode/peaks.tsv --caller-priority span_default --allow-partial true --outdir tests/output/span_auto" + tags: + - test_differential + - test_span_mode + files: + - path: tests/output/span_auto/span.mode.txt + contains: + - "fallback" + - path: tests/output/span_auto/span.normalization_factors.tsv + +- name: test_annotation_fallback_gtf_only + command: nextflow run main.nf -profile docker,test -entry DIFFERENTIAL_ONLY --run_span_diff --span_diff_mode native --differential_allow_partial --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_small --omnipeaks_jar tests/assets/omnipeaks_stub.jar --gtf tests/assets/minimal.gtf --fasta tests/assets/minimal.fa --outdir results_annotation_gtf -c tests/config/nextflow.config + tags: + - test_differential + - test_annotation_fallback + files: + - path: results_annotation_gtf/03_peak_calling/08_differential/03_span/h3k27me3/span.differential.annotated.tsv + contains: + - "nearest_feature_id" + - "nearest_gene_name" + +- name: test_chipbinner_stub_outputs + command: bash -c "rm -rf results_chipbinner_stub && nextflow run main.nf -profile docker,test -stub-run -entry DIFFERENTIAL_ONLY --run_chipbinner --chipbinner_use_input --differential_contrast Treatment,Control --differential_min_replicates 1 --differential_from_run tests/assets/diff_run_chipbinner_multi --gtf tests/assets/minimal.gtf --fasta tests/assets/minimal.fa --outdir results_chipbinner_stub -c tests/config/nextflow.config && test $(ls results_chipbinner_stub/03_peak_calling/08_differential/00_manifests/chipbinner_windows_cache/windows.*.bed | wc -l) -eq 1" + tags: + - test_differential + - test_chipbinner_outputs + files: + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.clusters.2cluster.tsv + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.clusters.3cluster.tsv + - path: results_chipbinner_stub/03_peak_calling/08_differential/02_chipbinner/g1/chipbinner.normalization_factors.tsv + +- name: test_chipbinner_use_input_missing_fail + command: > + bash -c "mkdir -p tests/output/chipbinner_missing && + printf 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n' > tests/output/chipbinner_missing/samples.tsv && + printf 'chr1\t1000\n' > tests/output/chipbinner_missing/chrom.sizes && + python3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing/chrom.sizes --bin-size 200 --use-input --outdir tests/output/chipbinner_missing/out" + tags: + - test_differential + - test_chipbinner_use_input + exit_code: 1 + stderr: + contains: + - "input BAMs are missing" + +- name: test_chipbinner_use_input_missing_allow_partial + command: > + bash -c "mkdir -p tests/output/chipbinner_missing_partial && + printf 'sample_id\tgroup\tcondition\treplicate\tfinal_bam\tfinal_bai\tbigwig_path\tspikein_scale_factor\tms_coeff\tinput_bam\tinput_bai\ns1\tg1\tTreatment\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\ns2\tg1\tControl\t1\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n' > tests/output/chipbinner_missing_partial/samples.tsv && + printf 'chr1\t1000\n' > tests/output/chipbinner_missing_partial/chrom.sizes && + python3 bin/chipbinner_run.py --samples tests/output/chipbinner_missing_partial/samples.tsv --group g1 --contrast Treatment,Control --chrom-sizes tests/output/chipbinner_missing_partial/chrom.sizes --bin-size 200 --use-input --allow-partial --outdir tests/output/chipbinner_missing_partial/out" + tags: + - test_differential + - test_chipbinner_use_input + files: + - path: tests/output/chipbinner_missing_partial/out/chipbinner.summary.tsv + contains: + - "SKIP" + - "MISSING_INPUT_BAM" + - path: tests/output/chipbinner_missing_partial/out/chipbinner.error.txt + - path: tests/output/chipbinner_missing_partial/out/chipbinner.bin_counts.tsv + should_exist: false diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index d97aae21..509a14f4 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -6,8 +6,10 @@ include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -// Validate input parameters in specialised library -WorkflowCutandrun.initialise(params, log) +// Validate input parameters in specialised library (skip for differential-only entrypoint) +if (!WorkflowMain.isDifferentialOnly(workflow, params)) { + WorkflowCutandrun.initialise(params, log) +} def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' def summary_params = paramsSummaryMap(workflow) @@ -35,7 +37,8 @@ if(params.normalisation_mode == "Spikein") { } // Check mandatory parameters that cannot be checked in the groovy lib as we want a channel for them -if (params.input) { ch_input = file(params.input) } else { exit 1, "Input samplesheet not specified!" } +// NOTE: Input validation is handled in WorkflowMain.initialise with entrypoint-aware gating. +ch_input = params.input ? file(params.input) : null ch_blacklist = Channel.empty() if (params.blacklist) { @@ -142,6 +145,7 @@ include { CONSENSUS_PEAKS as CONSENSUS_PEAKS_ALL } from "../subworkflo include { EXTRACT_FRAGMENTS } from "../subworkflows/local/extract_fragments" include { PREPARE_PEAKCALLING } from "../subworkflows/local/prepare_peakcalling" include { PEAK_CALLING_EXTENDED } from "../subworkflows/local/peak_calling_extended" +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" include { DEEPTOOLS_QC } from "../subworkflows/local/deeptools_qc" include { PEAK_QC } from "../subworkflows/local/peak_qc" include { SAMTOOLS_VIEW_SORT_STATS as FILTER_READS } from "../subworkflows/local/samtools_view_sort_stats" @@ -450,6 +454,9 @@ workflow CUTANDRUN { ch_bedgraph = Channel.empty() ch_bigwig = Channel.empty() + ch_scale_factors = Channel.empty() + ch_diff_summary_mqc = Channel.empty() + ch_diff_design_mqc = Channel.empty() ch_peaks_all = Channel.empty() ch_peaks_primary = Channel.empty() ch_peaks_secondary = Channel.empty() @@ -478,6 +485,7 @@ workflow CUTANDRUN { ) ch_bedgraph = PREPARE_PEAKCALLING.out.bedgraph ch_bigwig = PREPARE_PEAKCALLING.out.bigwig + ch_scale_factors = PREPARE_PEAKCALLING.out.scale_factors ch_software_versions = ch_software_versions.mix(PREPARE_PEAKCALLING.out.versions) /* @@ -654,6 +662,27 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(CONTROL_POOLING_FALLBACKS_REPORT.out.versions) } + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff || params.differential_publish_manifest_only)) { + ch_gene_bed_annotation = params.gene_bed ? PREPARE_GENOME.out.bed : Channel.empty() + DIFFERENTIAL_PEAK_CALLING ( + ch_samtools_bam, + ch_samtools_bai, + ch_bigwig, + ch_peaks_all, + ch_scale_factors, + PREPARE_GENOME.out.chrom_sizes, + ch_gene_bed_annotation, + PREPARE_GENOME.out.gtf, + ch_blacklist, + Channel.empty(), + Channel.empty(), + 'integrated' + ) + ch_software_versions = ch_software_versions.mix(DIFFERENTIAL_PEAK_CALLING.out.versions) + ch_diff_summary_mqc = DIFFERENTIAL_PEAK_CALLING.out.summary_mqc + ch_diff_design_mqc = DIFFERENTIAL_PEAK_CALLING.out.design_mqc + } + ch_dt_corrmatrix = Channel.empty() ch_dt_pcadata = Channel.empty() ch_dt_fpmatrix = Channel.empty() @@ -900,6 +929,7 @@ workflow CUTANDRUN { if (params.run_multiqc) { workflow_summary = WorkflowCutandrun.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + ch_diff_mqc = ch_diff_summary_mqc.mix(ch_diff_design_mqc) /* * MODULE: Collect software versions used in pipeline @@ -936,6 +966,7 @@ workflow CUTANDRUN { ch_peakqc_count_consensus_mqc.collect{it[1]}.ifEmpty([]), ch_peakqc_reprod_perc_mqc.collect().ifEmpty([]), ch_frag_len_hist_mqc.collect().ifEmpty([]), + ch_diff_mqc.collect().ifEmpty([]), ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf new file mode 100644 index 00000000..d0280574 --- /dev/null +++ b/workflows/differential_only.nf @@ -0,0 +1,190 @@ +/* + * Differential-only entrypoint + */ + +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" +include { CUSTOM_GETCHROMSIZES } from "../modules/nf-core/custom/getchromsizes/main" +include { GUNZIP as GUNZIP_FASTA } from "../modules/nf-core/gunzip/main" +include { GUNZIP as GUNZIP_GENE_BED } from "../modules/nf-core/gunzip/main" + +import java.security.MessageDigest + +def hashFile(path) { + if (!path) { + return "none" + } + def f = file(path) + if (!f.exists()) { + return "none" + } + def digest = MessageDigest.getInstance("SHA-256") + f.withInputStream { input -> + byte[] buffer = new byte[8192] + int read + while ((read = input.read(buffer)) > 0) { + digest.update(buffer, 0, read) + } + } + return digest.digest().encodeHex().toString().substring(0, 12) +} + +def resolveCachedWindowsPath(windowsDir, binSize, genomeId, blacklistPath) { + if (!windowsDir) { + return null + } + def winPath = file(windowsDir) + if (winPath.exists() && winPath.isFile()) { + return winPath.toString() + } + if (!winPath.exists() || !winPath.isDirectory()) { + return null + } + def blacklistHash = hashFile(blacklistPath) + def meta = file("${windowsDir}/chipbinner_windows_meta.tsv") + if (meta.exists()) { + def lines = meta.text.readLines() + if (lines.size() > 1) { + def header = lines[0].split("\t") + def values = lines[1].split("\t", -1) + def metaRow = [:] + header.eachWithIndex { col, idx -> + metaRow[col] = idx < values.size() ? values[idx] : "" + } + def genome_ok = !genomeId || metaRow["genome_id"] == genomeId + def hash_ok = !blacklistPath || metaRow["blacklist_hash"] == blacklistHash + if (genome_ok && metaRow["bin_size"] == binSize.toString() && hash_ok) { + def candidate = file("${windowsDir}/${metaRow['windows_path']}") + if (candidate.exists()) { + return candidate.toString() + } + } + } + } + def preferred = [] + if (genomeId) { + preferred << "windows.${genomeId}.${binSize}.${blacklistHash}.bed" + preferred << "windows.${genomeId}.${binSize}.bed" + } else { + preferred << "windows.${binSize}.${blacklistHash}.bed" + preferred << "windows.${binSize}.bed" + } + for (name in preferred) { + def candidate = file("${windowsDir}/${name}") + if (candidate.exists()) { + return candidate.toString() + } + } + return null +} + +workflow DIFFERENTIAL_ONLY { + + if (!params.differential_from_run) { + exit 1, "--differential_from_run must be provided when using -entry DIFFERENTIAL_ONLY" + } + + def manifest_dir = "${params.differential_from_run}/03_peak_calling/08_differential/00_manifests" + def samples_manifest = "${manifest_dir}/differential_manifest.samples.tsv" + def peaks_manifest = "${manifest_dir}/differential_manifest.peaks.tsv" + def cached_gene_bed = "${manifest_dir}/differential_manifest.gene_bed.bed" + + if (!file(samples_manifest).exists()) { + exit 1, "Missing samples manifest: ${samples_manifest}" + } + if (!file(peaks_manifest).exists()) { + exit 1, "Missing peaks manifest: ${peaks_manifest}" + } + + ch_samples_manifest = Channel.fromPath(samples_manifest, checkIfExists: true) + ch_peaks_manifest = Channel.fromPath(peaks_manifest, checkIfExists: true) + + if (params.run_span_diff && !params.omnipeaks_jar) { + exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled." + } + + def annotation_enabled = (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_publish_manifest_only + if (annotation_enabled && !params.gene_bed && !file(cached_gene_bed).exists() && !params.gtf) { + exit 1, "Differential annotation requires --gene_bed or --gtf (or a cached gene bed at ${cached_gene_bed})." + } + + ch_gene_bed = Channel.empty() + if (annotation_enabled && params.gene_bed) { + if (params.gene_bed.endsWith(".gz")) { + ch_gene_bed = GUNZIP_GENE_BED ( [ [id: 'gene_bed'], params.gene_bed ] ).gunzip.map { it[1] } + } else { + ch_gene_bed = Channel.from( file(params.gene_bed) ) + } + } else if (annotation_enabled && file(cached_gene_bed).exists()) { + ch_gene_bed = Channel.fromPath(cached_gene_bed, checkIfExists: true) + } + + ch_gtf = Channel.empty() + if (annotation_enabled && params.gtf) { + ch_gtf = Channel.from( file(params.gtf) ) + } + + def default_windows_dir = "${manifest_dir}/chipbinner_windows_cache" + def default_windows_set = false + if (!params.chipbinner_windows_dir && file(default_windows_dir).exists()) { + params.chipbinner_windows_dir = default_windows_dir + default_windows_set = true + } + + def cached_windows = resolveCachedWindowsPath( + params.chipbinner_windows_dir, + params.chipbinner_bin_size, + params.genome, + params.blacklist + ) + def use_cached_windows = cached_windows != null + if (use_cached_windows) { + params.chipbinner_windows_dir = cached_windows + } else if (default_windows_set) { + params.chipbinner_windows_dir = '' + } + + ch_chrom_sizes = Channel.empty() + def span_requires_sizes = params.run_span_diff && ['native', 'auto'].contains(params.span_diff_mode) + def chipbinner_requires_sizes = params.run_chipbinner && !use_cached_windows + def need_real_chrom_sizes = span_requires_sizes || chipbinner_requires_sizes + if (need_real_chrom_sizes) { + if (!params.fasta) { + def reasons = [] + if (chipbinner_requires_sizes) { + reasons << "cached ChIPBinner windows not found" + } + if (span_requires_sizes) { + if (params.span_diff_mode == 'auto') { + reasons << "SPAN auto can select native compare and requires chrom sizes (set --span_diff_mode fallback to run without FASTA)" + } else { + reasons << "SPAN native requires chrom sizes" + } + } + exit 1, "--fasta is required for differential-only mode (${reasons.join('; ')})" + } + if (params.fasta.endsWith(".gz")) { + ch_fasta = GUNZIP_FASTA ( [ [id: 'genome'], params.fasta ] ).gunzip + } else { + ch_fasta = Channel.of([ [id: 'genome'], file(params.fasta) ]) + } + CUSTOM_GETCHROMSIZES ( ch_fasta ) + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + } else if (params.run_chipbinner || params.run_span_diff) { + ch_chrom_sizes = Channel.fromPath("${projectDir}/assets/chrom_sizes_stub.sizes", checkIfExists: true) + } + + DIFFERENTIAL_PEAK_CALLING ( + Channel.empty(), + Channel.empty(), + Channel.empty(), + Channel.empty(), + Channel.empty(), + ch_chrom_sizes, + ch_gene_bed, + ch_gtf, + Channel.empty(), + ch_samples_manifest, + ch_peaks_manifest, + 'posthoc' + ) +}