From 2aebdffe0d08a234cf49e4009a23f06835b3ee11 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 01:48:00 -0800 Subject: [PATCH 01/31] Add differential peak calling specifications --- .../4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md | 657 ++++++++++++++++++ .../5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md | 582 ++++++++++++++++ .../EXTENSIONS_ADDED_20260104.md | 111 +++ 3 files changed, 1350 insertions(+) create mode 100644 docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md create mode 100644 docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md create mode 100644 docs/extension_specs/EXTENSIONS_ADDED_20260104.md diff --git a/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md b/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md new file mode 100644 index 00000000..b4a7a403 --- /dev/null +++ b/docs/extension_specs/4.5_DIFFERENTIAL_PEAK_CALLING-SPEC.md @@ -0,0 +1,657 @@ +# Differential Peak-Calling Specification + +**Status**: TODO #4 from `docs/extension_specs/TODO.md` +**Version**: 1.0 +**Date**: 2026-01-04 + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Parameters](#parameters) +4. [DiffBind Implementation](#diffbind-implementation) +5. [ChIPBinner Implementation](#chipbinner-implementation) +6. [SPAN Differential Implementation](#span-differential-implementation) +7. [Output Directory Structure](#output-directory-structure) +8. [MultiQC Integration](#multiqc-integration) +9. [Execution Examples](#execution-examples) +10. [Implementation Notes](#implementation-notes) + +--- + +## Overview + +This specification defines three complementary differential analysis approaches for the nf-core/cutandrun pipeline: + +| Approach | Best For | Method | Outputs | +|----------|----------|--------|---------| +| **DiffBind** | Traditional peak-centric differential binding | Counts reads in peak regions; DESeq2/edgeR statistics | Differential binding tables, MA/volcano plots | +| **ChIPBinner** | Broad marks with global changes (e.g., H3K36me2 after NSD1 KO) | Genome-wide binning + HDBSCAN clustering | Clustered regions, scatterplots, LOLA enrichment | +| **SPAN Differential** | Semi-supervised broad peak comparison | SPAN's native 'compare' mode | Differential peak regions | + +### When to Use Each + +- **DiffBind**: Standard choice for most CUT&RUN experiments with focal/punctate marks (H3K4me3, H3K27ac at promoters, CTCF). Works with any peak caller output. + +- **ChIPBinner**: Specifically designed for experiments where you expect **global redistribution** of signal (e.g., writer/eraser knockouts affecting H3K36me2/me3, H3K27me3). Requires spike-in or MS normalization for reliable results. + +- **SPAN Differential**: Best for broad histone marks analyzed with SPAN peak calling. Leverages SPAN's internal model for semi-supervised comparison. + +### Contrast Model + +All three approaches compare **conditions within the same group** (i.e., within the same histone mark/target): + +``` +H3K36me2_sgNSD3 vs H3K36me2_sgControl ✓ Valid comparison +H3K27me3_Treatment vs H3K27me3_Control ✓ Valid comparison +H3K36me2_sgNSD3 vs H3K27me3_sgNSD3 ✗ Not supported (cross-group) +``` + +Groups lacking a second condition are **skipped with a warning** (not an error). + +--- + +## Architecture + +### Subworkflow Structure + +``` +DIFFERENTIAL_ANALYSIS (new subworkflow) +├── DIFFBIND_ANALYSIS +│ ├── GENERATE_DIFFBIND_SAMPLESHEET +│ ├── DIFFBIND_COUNT +│ ├── DIFFBIND_ANALYZE +│ └── DIFFBIND_REPORT +├── CHIPBINNER_ANALYSIS +│ ├── CHIPBINNER_BINNING +│ ├── CHIPBINNER_NORMALIZE +│ ├── CHIPBINNER_CLUSTER (HDBSCAN grid search) +│ ├── CHIPBINNER_ANNOTATE +│ ├── CHIPBINNER_DIFFERENTIAL (ROTS) +│ ├── CHIPBINNER_ENRICHMENT (LOLA) +│ └── CHIPBINNER_PLOTS +└── SPAN_DIFFERENTIAL + ├── SPAN_COMPARE + └── SPAN_DIFF_REPORT +``` + +### Data Flow + +``` + Peak Calls (per-caller) + │ + ┌──────────────────┼──────────────────┐ + │ │ │ + ▼ ▼ ▼ + [DiffBind] [ChIPBinner] [SPAN Diff] + │ │ │ + │ BAMs + spike-in factors │ + │ │ │ + │ ▼ │ + │ Binned signal matrix │ + │ │ │ + │ ▼ │ + │ HDBSCAN clusters │ + │ │ │ + ▼ ▼ ▼ + Diff tables Cluster BEDs Diff peaks + + plots + LOLA results + stats + │ │ │ + └────────────┼────────────────────────┘ + ▼ + 08_differential/ +``` + +--- + +## Parameters + +### Global Differential Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--run_diffbind` | boolean | `false` | Enable DiffBind differential analysis | +| `--run_chipbinner` | boolean | `false` | Enable ChIPBinner differential analysis | +| `--run_span_diff` | boolean | `false` | Enable SPAN native differential comparison | + +### DiffBind Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--diffbind_fdr` | float | `0.05` | FDR threshold for significant differential peaks | +| `--diffbind_lfc` | float | `1.0` | Minimum absolute log2 fold-change threshold | +| `--diffbind_method` | string | `DESeq2` | Analysis method: `DESeq2` or `edgeR` | +| `--diffbind_norm` | string | `lib` | Normalization: `lib` (library size), `RLE`, `TMM`, `native` | +| `--diffbind_summits` | integer | `200` | Extend peaks to this size around summit (0 = use full peak) | +| `--export_diffbind_sheets` | boolean | `true` | Export DiffBind-compatible samplesheets | + +### ChIPBinner Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--chipbinner_bin_size` | integer | `10000` | Bin size in bp (1000, 5000, 10000, 25000) | +| `--chipbinner_use_control` | boolean | `true` | Use IgG/input control for normalization | +| `--chipbinner_fdr` | float | `0.05` | FDR threshold for ROTS differential analysis | +| `--chipbinner_lfc` | float | `1.0` | Minimum absolute log2 fold-change | +| `--chipbinner_bootstrap` | integer | `1000` | ROTS bootstrap iterations | +| `--chipbinner_k_value` | integer | `100000` | ROTS top list size (K parameter) | +| `--hdbscan_min_range` | string | `100,400,700,1000` | min_cluster_size values for grid search | +| `--hdbscan_max_range` | string | `100,400,700,1000` | min_samples values for grid search | + +### SPAN Differential Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `--span_diff_gap` | integer | `5` | Gap parameter for SPAN compare mode | +| `--span_diff_fdr` | float | `0.05` | FDR threshold for SPAN differential peaks | +| `--span_diff_bin` | integer | `200` | Bin size for SPAN comparison | + +--- + +## DiffBind Implementation + +### Overview + +DiffBind analyzes each peak caller's output independently, running separate differential analyses per caller. This allows comparison of how different peak callers affect downstream differential results. + +### Input Requirements + +For each `(group, caller)` combination where the group has 2+ conditions: + +| Input | Source | Format | +|-------|--------|--------| +| Peak files | Peak calling subworkflow | BED/narrowPeak/broadPeak | +| BAM files | Alignment output | Sorted, deduplicated BAM + BAI | +| Sample metadata | Samplesheet | group, condition, replicate | + +### Samplesheet Generation + +The pipeline automatically generates DiffBind-compatible samplesheets: + +```csv +SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks,PeakCaller +H3K36me2_sgControl_rep1,H3K36me2,H3K36me2,sgControl,1,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgControl_rep2,H3K36me2,H3K36me2,sgControl,2,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgNSD3_rep1,H3K36me2,H3K36me2,sgNSD3,1,/path/to/bam,/path/to/peaks.bed,macs2 +H3K36me2_sgNSD3_rep2,H3K36me2,H3K36me2,sgNSD3,2,/path/to/bam,/path/to/peaks.bed,macs2 +``` + +**Mapping from pipeline metadata:** +- `SampleID` = `meta.sample_id` +- `Tissue` = `meta.group` +- `Factor` = `meta.group` +- `Condition` = `meta.condition` +- `Replicate` = `meta.replicate` +- `bamReads` = Deduplicated BAM path +- `Peaks` = Peak file path +- `PeakCaller` = Caller name (for metadata; DiffBind uses peak file directly) + +### DiffBind v3+ Workflow + +```r +# 1. Create DBA object from samplesheet +dba_obj <- dba(sampleSheet = "samplesheet.csv") + +# 2. Count reads in peaks +dba_obj <- dba.count(dba_obj, summits = params$diffbind_summits) + +# 3. Normalize +dba_obj <- dba.normalize(dba_obj, method = DBA_ALL_METHODS, normalize = params$diffbind_norm) + +# 4. Set up contrast (condition1 vs condition2) +dba_obj <- dba.contrast(dba_obj, categories = DBA_CONDITION, minMembers = 2) + +# 5. Run differential analysis +dba_obj <- dba.analyze(dba_obj, method = params$diffbind_method) + +# 6. Extract results +results <- dba.report(dba_obj, th = params$diffbind_fdr, fold = params$diffbind_lfc) +``` + +### Output Visualizations + +| Plot | File | Description | +|------|------|-------------| +| MA plot | `{group}_{caller}_MA.pdf` | Log fold-change vs mean concentration | +| Volcano plot | `{group}_{caller}_volcano.pdf` | -log10(p-value) vs log2FC | +| PCA plot | `{group}_{caller}_PCA.pdf` | Sample clustering by binding affinity | +| Correlation heatmap | `{group}_{caller}_correlation.pdf` | Sample-sample correlation matrix | +| Binding heatmap | `{group}_{caller}_heatmap.pdf` | Binding affinity at differential sites | +| Venn diagram | `{group}_{caller}_venn.pdf` | Overlap of peaks across conditions | +| Concentration plot | `{group}_{caller}_concentration.pdf` | Read concentration at binding sites | + +### Output Tables + +| File | Format | Content | +|------|--------|---------| +| `{group}_{caller}_differential.tsv` | TSV | Full results with coordinates, FC, FDR, concentrations | +| `{group}_{caller}_significant_up.bed` | BED | Peaks significantly UP in treatment | +| `{group}_{caller}_significant_down.bed` | BED | Peaks significantly DOWN in treatment | +| `{group}_{caller}_samplesheet.csv` | CSV | DiffBind-ready samplesheet for external use | + +--- + +## ChIPBinner Implementation + +### Overview + +ChIPBinner is implemented as a self-contained module that handles the complete workflow from BAM files to differential clusters. It leverages the pipeline's existing spike-in normalization factors. + +### Module Architecture + +``` +CHIPBINNER_ANALYSIS +│ +├─ Input: BAMs + spike-in scale factors + (optional) pooled IgG +│ +├─ Step 1: CHIPBINNER_BINNING +│ └─ BAM → BED → binned BED (bedtools intersect with reference windows) +│ +├─ Step 2: CHIPBINNER_NORMALIZE +│ └─ Apply spike-in factors via norm_bw() +│ └─ Output: normalized bigWigs per sample +│ +├─ Step 3: CHIPBINNER_PRECLUST +│ └─ Generate signal matrix + pooled BED coordinates +│ +├─ Step 4: CHIPBINNER_CLUSTER (grid search) +│ └─ Run HDBSCAN with parameter combinations +│ └─ Output: cluster assignments for each parameter set +│ +├─ Step 5: CHIPBINNER_ANNOTATE +│ └─ Extract 2-cluster and 3-cluster solutions +│ └─ Output: annotated_clusters.rda (GRanges per cluster) +│ +├─ Step 6: CHIPBINNER_DIFFERENTIAL +│ └─ ROTS analysis per cluster +│ └─ Output: logFC, FDR per bin per cluster +│ +├─ Step 7: CHIPBINNER_ENRICHMENT +│ └─ LOLA enrichment with Ensembl, cCRE, RepeatMasker databases +│ └─ Output: enrichment tables and plots per cluster +│ +└─ Step 8: CHIPBINNER_PLOTS + └─ Generate all visualizations +``` + +### Normalization Flow + +ChIPBinner uses the pipeline's existing spike-in infrastructure: + +``` +Pipeline spike-in calculation (PREPARE_PEAKCALLING) + │ + ▼ + Scale factors (from --normalisation_mode Spikein) + │ + ▼ + ChIPBinner norm_bw(scaling_factor = pipeline_scale_factor) + │ + ▼ + If --chipbinner_use_control: + norm_bw(use_input = TRUE, input_binned_file = pooled_IgG.binned.bed) + Else: + norm_bw(use_input = FALSE) +``` + +### Binning Reference + +Reference window BED files are required for binning. The pipeline expects: + +``` +# From ChIPBinner_database repository +hg38.10kb.windows.bed # For --chipbinner_bin_size 10000 +hg38.5kb.windows.bed # For --chipbinner_bin_size 5000 +hg38.1kb.windows.bed # For --chipbinner_bin_size 1000 +mm10.10kb.windows.bed # For mouse +``` + +**Parameter**: `--chipbinner_windows_dir` (path to directory containing window BED files) + +### HDBSCAN Grid Search + +The pipeline runs HDBSCAN clustering with a grid of parameters: + +```python +# Grid search combinations +for min_cluster_size in [100, 400, 700, 1000]: + for min_samples in [100, 400, 700, 1000]: + clusterer = hdbscan.HDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=min_samples + ).fit(signal_matrix) + save(f"clus.{comparison}.{min_cluster_size}.{min_samples}.txt") +``` + +**Total combinations**: 16 (4 × 4 grid) + +### Cluster Extraction + +From the grid search results, extract **fixed 2-cluster and 3-cluster solutions**: + +```r +# For each parameter combination, extract: +# - 2clusters.annotated_clusters.rda (Cluster A, Cluster B) +# - 3clusters.annotated_clusters.rda (Cluster A, Cluster B, Cluster C) + +# Clusters are labeled by behavior: +# A = bins with signal in both conditions (stable) +# B = bins with signal preferentially in WT/control +# C = bins with signal preferentially in treatment/KO +``` + +### LOLA Enrichment + +Automatic enrichment analysis using ChIPBinner's bundled databases: + +| Database | Source | Content | +|----------|--------|---------| +| Ensembl | ensembl.org | Gene annotations, promoters, exons, introns | +| cCRE | ENCODE | Candidate cis-regulatory elements | +| RepeatMasker | repeatmasker.org | Repeat element annotations | + +**Output per cluster:** +- `{cluster}_ensembl_enrichment.tsv` +- `{cluster}_ccre_enrichment.tsv` +- `{cluster}_repeats_enrichment.tsv` +- `{cluster}_enrichment_plot.pdf` + +### Visualizations + +| Plot | File | Description | +|------|------|-------------| +| Genic/intergenic scatterplot | `{comparison}_genic_intergenic.pdf` | Signal stratified by genomic context | +| Density scatterplot | `{comparison}_density_scatter.pdf` | WT vs treatment with cluster coloring | +| PCA | `{comparison}_PCA.pdf` | Sample clustering by bin signal | +| Correlation | `{comparison}_correlation.pdf` | Sample-sample correlations | + +### ChIPBinner Output Files + +``` +08_differential/chipbinner/{group}/ +├── binned_beds/ +│ ├── {sample}.{bin_size}kb.binned.bed +│ └── ... +├── normalized_bigwigs/ +│ ├── {sample}.{bin_size}kb.norm.bw +│ └── ... +├── clustering/ +│ ├── {comparison}_preclust_mat.csv +│ ├── {comparison}_preclust_pooled.bed +│ ├── clus.{comparison}.{minpts}.{minsamps}.txt (16 files) +│ ├── {comparison}.2clusters.annotated_clusters.rda +│ └── {comparison}.3clusters.annotated_clusters.rda +├── differential/ +│ ├── {comparison}_rots_results.csv +│ └── {comparison}_stats_per_cluster.csv +├── enrichment/ +│ ├── {comparison}_{cluster}_ensembl_enrichment.tsv +│ ├── {comparison}_{cluster}_ccre_enrichment.tsv +│ └── {comparison}_{cluster}_enrichment_plot.pdf +└── plots/ + ├── {comparison}_genic_intergenic.pdf + ├── {comparison}_density_scatter.pdf + ├── {comparison}_PCA.pdf + └── {comparison}_correlation.pdf +``` + +--- + +## SPAN Differential Implementation + +### Overview + +SPAN provides native differential peak calling via its 'compare' mode, which directly compares two conditions using SPAN's semi-supervised model. + +### Input Requirements + +| Input | Source | Description | +|-------|--------|-------------| +| Treatment BAMs | Alignment output | Condition 1 replicates (pooled or individual) | +| Control BAMs | Alignment output | Condition 2 replicates (pooled or individual) | +| IgG BAMs | Pooled controls | Background control for both conditions | +| Chrom sizes | Reference | Chromosome sizes file | +| omnipeaks.jar | `--omnipeaks_jar` | SPAN executable | + +### SPAN Compare Command + +```bash +java -Xmx${java_heap} -jar omnipeaks.jar compare \ + -t1 condition1_rep1.bam,condition1_rep2.bam \ + -t2 condition2_rep1.bam,condition2_rep2.bam \ + -c1 condition1_IgG.bam \ + -c2 condition2_IgG.bam \ + --cs chrom.sizes \ + --bin ${span_diff_bin} \ + --gap ${span_diff_gap} \ + --fdr ${span_diff_fdr} \ + -o ${output_prefix} +``` + +### Output Files + +``` +08_differential/span_diff/{group}/ +├── {comparison}_span_diff.peak # Differential peaks +├── {comparison}_span_diff_up.bed # Peaks higher in condition1 +├── {comparison}_span_diff_down.bed # Peaks higher in condition2 +└── {comparison}_span_diff_stats.tsv # Statistics per peak +``` + +--- + +## Output Directory Structure + +All differential analysis outputs are organized under `03_peak_calling/08_differential/`: + +``` +03_peak_calling/ +└── 08_differential/ + ├── diffbind/ + │ ├── samplesheets/ + │ │ ├── {group}_{caller}_samplesheet.csv + │ │ └── ... + │ ├── {group}_{caller}/ + │ │ ├── {group}_{caller}_differential.tsv + │ │ ├── {group}_{caller}_significant_up.bed + │ │ ├── {group}_{caller}_significant_down.bed + │ │ ├── {group}_{caller}_MA.pdf + │ │ ├── {group}_{caller}_volcano.pdf + │ │ ├── {group}_{caller}_PCA.pdf + │ │ ├── {group}_{caller}_correlation.pdf + │ │ ├── {group}_{caller}_heatmap.pdf + │ │ └── {group}_{caller}_venn.pdf + │ └── ... + ├── chipbinner/ + │ ├── {group}/ + │ │ ├── binned_beds/ + │ │ ├── normalized_bigwigs/ + │ │ ├── clustering/ + │ │ ├── differential/ + │ │ ├── enrichment/ + │ │ └── plots/ + │ └── ... + ├── span_diff/ + │ ├── {group}/ + │ │ ├── {comparison}_span_diff.peak + │ │ ├── {comparison}_span_diff_up.bed + │ │ └── {comparison}_span_diff_down.bed + │ └── ... + └── multiqc/ + ├── differential_summary_mqc.tsv + └── differential_multiqc_report.html +``` + +--- + +## MultiQC Integration + +### Integrated Summary (Main Report) + +Add to the existing MultiQC report: + +| Section | Content | +|---------|---------| +| Differential Summary | Table of differential peak counts per group/caller/method | +| Method Comparison | Overlap statistics between DiffBind/ChIPBinner/SPAN results | + +**Header file**: `assets/multiqc/differential_summary_header.txt` + +### Separate Differential Report + +Generate `differential_multiqc_report.html` with detailed sections: + +| Section | Content | +|---------|---------| +| DiffBind Results | Per-group tables, embedded MA/volcano plots | +| ChIPBinner Results | Cluster statistics, enrichment summaries | +| SPAN Results | Differential peak statistics | +| Cross-Method Comparison | Venn diagrams, overlap metrics | + +--- + +## Execution Examples + +### Basic Usage + +```bash +# Run DiffBind only +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --peakcaller seacr,macs2 + +# Run ChIPBinner for broad marks +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_chipbinner \ + --normalisation_mode Spikein \ + --chipbinner_bin_size 10000 + +# Run all three methods +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --run_chipbinner \ + --run_span_diff \ + --peakcaller span_default +``` + +### Customized Thresholds + +```bash +nextflow run dhusmann/cutandrun \ + -r adding_new_peak_callers \ + --input samplesheet.csv \ + --run_diffbind \ + --run_chipbinner \ + --diffbind_fdr 0.01 \ + --diffbind_lfc 1.5 \ + --chipbinner_fdr 0.05 \ + --chipbinner_lfc 1.0 \ + --chipbinner_use_control false +``` + +### Using Params File + +```json +{ + "input": "samplesheet.csv", + "outdir": "results", + "normalisation_mode": "Spikein", + "peakcaller": "seacr,macs2,span_default", + + "run_diffbind": true, + "run_chipbinner": true, + "run_span_diff": true, + + "diffbind_fdr": 0.05, + "diffbind_lfc": 1.0, + "diffbind_method": "DESeq2", + + "chipbinner_bin_size": 10000, + "chipbinner_use_control": true, + "chipbinner_fdr": 0.05, + + "span_diff_gap": 5, + "span_diff_fdr": 0.05 +} +``` + +--- + +## Implementation Notes + +### Container Requirements + +**DiffBind Module:** +- R 4.2+ +- Bioconductor 3.16+ +- DiffBind 3.8+ +- DESeq2, edgeR + +**ChIPBinner Module:** +- R 4.2+ +- ChIPBinner (installed at runtime from GitHub) +- Python 3.9+ (for HDBSCAN via reticulate) +- LOLA, GenomicRanges, rtracklayer + +**SPAN Module:** +- Java 21 JRE +- omnipeaks.jar (provided via `--omnipeaks_jar`) + +### Condition Detection + +Groups with valid comparisons are detected automatically: + +```groovy +// In DIFFERENTIAL_ANALYSIS subworkflow +ch_valid_groups = ch_samples + .map { meta, files -> [meta.group, meta.condition] } + .groupTuple() + .filter { group, conditions -> conditions.unique().size() >= 2 } + .map { group, conditions -> group } + +// Groups with only one condition are logged as warnings +ch_skipped_groups = ch_samples + .map { meta, files -> [meta.group, meta.condition] } + .groupTuple() + .filter { group, conditions -> conditions.unique().size() < 2 } + .subscribe { group, conditions -> + log.warn "Skipping differential analysis for group '${group}': only one condition found (${conditions.unique()})" + } +``` + +### Resource Requirements + +| Process | Memory | CPUs | Time Estimate | +|---------|--------|------|---------------| +| DIFFBIND_ANALYZE | 16 GB | 4 | 30 min/group | +| CHIPBINNER_CLUSTER | 32 GB | 8 | 2-4 hrs/comparison | +| CHIPBINNER_DIFFERENTIAL | 16 GB | 4 | 1 hr/comparison | +| SPAN_COMPARE | 8 GB | 2 | 30 min/comparison | + +### Error Handling + +| Scenario | Behavior | +|----------|----------| +| Group has < 2 conditions | Skip with warning | +| Group has < 2 replicates per condition | Skip with warning (DiffBind requires >= 2) | +| Peak caller produced no peaks | Skip DiffBind for that caller with warning | +| ChIPBinner clustering fails | Retry with lower min_cluster_size | +| SPAN compare fails | Log error, continue with other groups | + +--- + +## References + +- **DiffBind**: Stark R, Brown G (2011). DiffBind: differential binding analysis of ChIP-Seq peak data. Bioconductor package. +- **ChIPBinner**: Padilla R, Bareke E, Hu B, Majewski J (2024). ChIPbinner: An R package for analyzing broad histone marks binned in uniform windows. bioRxiv. +- **SPAN**: Shpynov O et al. (2021). Semi-supervised peak calling with SPAN. Bioinformatics. +- **ROTS**: Suomi T et al. (2017). ROTS: An R package for reproducibility-optimized statistical testing. PLoS Comput Biol. +- **LOLA**: Sheffield NC, Bock C (2016). LOLA: enrichment analysis for genomic region sets. Bioinformatics. diff --git a/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md b/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md new file mode 100644 index 00000000..ed39c0ff --- /dev/null +++ b/docs/extension_specs/5.2_DIFFERENTIAL_PEAK_CALLING-SPEC.md @@ -0,0 +1,582 @@ +# 5.2 Differential Peak Calling Spec + +Summary + +This SPEC defines how to implement **differential peak calling / differential enrichment** in the `dhusmann/cutandrun` fork (branch `differential_peak_calling`) using three complementary approaches: + 1. DiffBind (using peaks produced by *existing* peak callers in the pipeline) + 2. ChIPBinner (bin-based differential + clustering for broad marks; include Snakemake-parity notes for easier porting) + 3. SPAN/OmniPeaks (a SPAN-native differential workflow, if supported by the OmniPeaks JAR / SPAN ecosystem) + +This SPEC is written to be compatible with the nf-core/cutandrun architecture and conventions (DSL2, modular processes, schema-driven params, MultiQC reporting), and to build directly on the already-implemented condition-aware data model (`group`, `condition`, `replicate`) and multi-peakcaller outputs. + +⸻ + +Context (what exists already in this fork) + + • Samplesheet supports `condition` and constructs unique `sample_id` and `group_condition`. + • Peak calling supports multiple callers / variants (comma-separated `--peakcaller`, case-insensitive). + • First caller is “primary” for downstream consensus/QC; other callers are output-only **today**. + • Spike-in scaling can be computed per scope and recorded as TSVs when `--dump_scale_factors true`. + • Some callers (epic2, SPAN) require pooled controls (IgG) per `(control_group, condition)` for peak calling. + +Differential peak calling should treat differential analysis as a **separate layer** that: + • does not mutate existing peak calling outputs, + • can be run during a standard pipeline execution, or posthoc from an existing `--outdir`, + • and (per requirements) runs differential analyses for **every caller** used in `--peakcaller`, not only the primary. + +⸻ + +Requirements captured from user interview (implementation must match these) + +Invocation / modes + • Differential analysis must support: + • Integrated mode: run as part of a normal pipeline run (opt-in). + • Posthoc mode: run differential-only against an existing pipeline `--outdir` without re-running upstream. + +Caller coverage + • Differential analysis must run for **every caller variant** in `--peakcaller` (primary + secondary), producing parallel outputs per caller. + +Peak set strategy + • For DiffBind: use per-sample called peaks; DiffBind handles its own consensus/union internally. + • No peak recentering by default. + • Set a reasonable default for `minOverlap`, but allow the user to pass “full DiffBind params” if desired. + +Normalization / controls + • If `--normalisation_mode Spikein`, incorporate the pipeline’s spike-in scale factors into differential analyses (see details below). + • IgG / control samples are **not used** in the differential stage by default (IgG affects peak calling; differential compares WT vs KO targets only). + +Experimental design constraints (differential stage) + • There should be **exactly 2 biological conditions** for any differential comparison. + • For each `group` (mark), require **≥ 2 biological replicates** per condition; fail if fewer. + • Batch effects are out-of-scope for v1 (no batch covariate in the model by default). + +Outputs + • For each method (DiffBind / ChIPBinner / SPAN-diff), produce: + • Differential result tables (TSV) + BED-like outputs for significant features + • Diagnostic plots (e.g., MA/volcano/heatmap/PCA as applicable) + • Gene annotation (e.g., nearest gene/TSS from a BED/GTF-derived reference) + • MultiQC integration stubs (at minimum: summary tables consumable by MultiQC custom content) + +⸻ + +Goals + +Functional goals + 1. Add opt-in differential analysis to the pipeline (DiffBind, ChIPBinner, SPAN-diff). + 2. Run differential **per mark (`group`)** and **per caller** (for DiffBind and any caller-derived method). + 3. Support two execution modes: + • Integrated: differential runs immediately after peak calling outputs exist. + • Posthoc: differential can be executed later from an existing `--outdir`. + 4. Use spike-in scale factors (when present) as external normalization input for differential analysis. + 5. Produce a predictable output tree with enough metadata/manifesting to make downstream usage repeatable. + +Non-goals (explicitly out of scope for this SPEC) + • `--compare_norm_methods` and `--compare_peak_callers` modes (TODO #5–6). + • General downstream enrichment characterization (ChromHMM, cCRE, broad LOLA/GSEA library generation) beyond what is needed for ChIPBinner parity stubs; that work is TODO #7. + • Automatic batch covariate modeling (can be added later once the data model supports it cleanly). + +⸻ + +Definitions / data model (reused from existing fork) + +Key fields (already exist in `meta`) + • group: biological target (mark/factor) + • condition: biological condition label (WT vs KO; control vs treated) + • replicate: integer replicate index + • sample_id / meta.id: unique sample name (already collision-safe across conditions) + • caller: peak caller id (e.g., `seacr`, `macs2_narrow`, `span_default`) + +Differential comparison terminology + • “control condition”: the baseline / reference in the contrast (WT / sgControl) + • “treated condition”: the perturbed condition (KO / sgKO) + • log2FC sign convention: **log2(treated / control)** (must be enforced consistently across methods) + +NOTE: The pipeline cannot reliably infer treated vs control purely from condition names across all datasets. Therefore, the differential layer must expose an explicit contrast selection interface (see Parameters). + +⸻ + +User-facing interface (new parameters) + +Top-level toggles (v1) + • `--run_diffbind` (boolean; default false) + • `--run_chipbinner` (boolean; default false) + • `--run_span_diff` (boolean; default false) + +Execution mode (integrated vs posthoc) + • `--differential_from_run ` (string path; default null) + • If set: pipeline runs *differential-only* and reads required inputs from the referenced `--outdir`. + • If unset: differential runs integrated (if any `--run_*` toggles are true). + +Contrast selection (must exist for all differential methods) + • `--differential_contrast ","` + • Example: `--differential_contrast "sgNSD3,sgControl"` + • This defines the log2FC direction: **treated vs control**. + • Validation: both conditions must exist for each affected `group`. + +Minimum replicate enforcement + • `--differential_min_replicates 2` (integer; default 2) + • Enforced per `group`×`condition`. + +DiffBind parameters (v1 defaults + “full params” escape hatch) + • `--diffbind_fdr 0.05` (default 0.05) + • `--diffbind_min_overlap 2` (default 2; see notes below) + • `--diffbind_recenter_peaks false` (default false; do not summit-recenter) + • `--diffbind_backend DESeq2|edgeR` (default DESeq2) + • `--diffbind_extra_params ` (optional) + • If set: pass through to the DiffBind R runner to override any internal defaults and/or to supply additional DiffBind `dba.*` arguments. + • Implementation detail: prefer a JSON file (schema described below) to avoid brittle string parsing. + +Spike-in integration switches + • `--differential_use_spikein true|false` (default: auto) + • Auto behavior: + • If `--normalisation_mode Spikein`: true (use scale factors as external normalization) + • Else: false + +ChIPBinner parameters (v1) + • `--chipbinner_bin_size 10000` (default 10000; 10kb) + • `--chipbinner_use_input false` (default false; aligns with “no IgG in differential”) + • `--chipbinner_pseudocount 1` (default 1) + • `--chipbinner_hdbscan_grid_minpts "100,200,500,1000"` (default as string list) + • `--chipbinner_hdbscan_grid_minsamps "100,200,500,1000"` (default as string list) + • `--chipbinner_ms_coeffs ` (optional; Snakemake-parity support) + • `--chipbinner_functional_db ` (optional; LOLA/enrichment stubs) + +SPAN/OmniPeaks differential parameters (v1; some may be “TBD” pending OmniPeaks CLI capabilities) + • `--span_diff_mode native` (v1: native) + • `--span_diff_fdr 0.05` (default 0.05) + • `--span_diff_gap 5` (default 5) + • `--span_diff_java_heap 8G` (default 8G) + • `--omnipeaks_jar ` (already exists for peak calling; reused here) + +Schema + docs requirements + • All new params must be added to: + • `nextflow.config` defaults + • `nextflow_schema.json` + • `docs/usage.md` and `docs/output.md` + +⸻ + +High-level workflow design + +Add a new subworkflow layer that runs after peak calling outputs exist: + + • `subworkflows/local/differential_peak_calling.nf` (new) + • inputs: channels of target BAMs, per-sample per-caller peaks, metadata, reference features + • implements: DIFFBIND, CHIPBINNER, SPAN_DIFF (each gated by `--run_*`) + • emits: differential result artifacts + summary tables for MultiQC + +Integrated execution (normal pipeline run) + • `workflows/cutandrun.nf` calls `DIFFERENTIAL_PEAK_CALLING` after peak calling is complete. + • The differential layer consumes in-memory channels, avoiding filesystem re-discovery. + +Posthoc execution (differential-only from an existing run) + Two acceptable patterns; pick one and implement fully (recommended: A). + +Pattern A (recommended): dedicated entrypoint + • Add `workflow DIFFERENTIAL_ONLY` (new DSL2 entry) that: + • reads a “design manifest” produced by the main pipeline run (see next section), + • builds channels equivalent to integrated mode, and + • runs `DIFFERENTIAL_PEAK_CALLING` only. + • Invoke via Nextflow: + • `nextflow run dhusmann/cutandrun -r differential_peak_calling -entry DIFFERENTIAL_ONLY -profile singularity --differential_from_run --outdir --differential_contrast "KO,WT" ...` + +Pattern B: flowswitch-driven partial execution + • Add a flowswitch rule such that `--differential_from_run` disables upstream steps. + • Requires careful branching in the main workflow so channels are either “computed” or “loaded”. + +⸻ + +Make posthoc reliable: write a design manifest (new output artifact) + +Posthoc differential should not rely on fragile filename parsing. Add a small, explicit manifest to the normal pipeline output. + +New files (generated during integrated runs) + + • `03_peak_calling/06_differential/00_manifests/` + • `differential_manifest.samples.tsv` + • One row per *target* sample. + • Columns (minimum): + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (or NA) + • `differential_manifest.peaks.tsv` + • One row per (sample_id, caller). + • Columns: + • sample_id + • caller + • peaks_path + • peaks_format (bed / narrowPeak / broadPeak / etc.) + • `differential_manifest.bigwig.tsv` (optional, but recommended) + • One row per sample for the target bigWig used in visualization layers. + +Manifest generation rules + • Emit manifests only when peak calling ran successfully (or emit partial manifests + a failure summary). + • Manifests must be deterministic (sorted stable order) to support `-resume`. + • Manifests must only include *published* paths under `--outdir` so posthoc runs don’t need `work/`. + +⸻ + +Method 1: DiffBind (caller-derived differential peak calling) + +1.1 Inputs + • Per sample: + • final target BAM + BAI (post-processed BAM saved by the pipeline) + • called peaks for the selected caller variant + • Metadata: + • group (mark), condition, replicate + • Reference: + • Gene BED (or GTF-derived features) for annotation (see “Annotation” section) + • Spike-in: + • Per-sample spike-in scale factor (when `--differential_use_spikein` is true) + +IgG/controls + • Do not use IgG BAMs as `bamControl` in DiffBind by default (per requirement #5a). + • Keep `bamControl`/`ControlID` blank or `NA` in the DiffBind sample sheet. + +1.2 Grouping strategy + • Run DiffBind separately for each: + • `group` (mark), and + • `caller` (every caller in `params.callers`) + +In pseudocode: + • for caller in params.callers: + • for group in unique(meta.group): + • run DiffBind on samples where meta.group == group + +1.3 DiffBind sample sheet generation (per group×caller) + +Write one CSV per `(caller, group)`: + + `03_peak_calling/06_differential/01_diffbind/00_samplesheets//.diffbind.csv` + +Required columns (DiffBind-compatible minimum) + • SampleID: `meta.id` + • Factor: `meta.group` (mark) + • Condition: `meta.condition` + • Replicate: `meta.replicate` + • bamReads: path to final BAM + • Peaks: path to peak file for this caller+sample + +Optional columns (recommended) + • Tissue: a constant placeholder like `CUTRUN` + • PeakCaller: `caller` (or a short descriptor) + • Caller: same as `PeakCaller` (for easier downstream parsing) + +Validation + • Fail if: + • more or less than 2 unique conditions exist for the selected group (in v1). + • either condition has fewer than `--differential_min_replicates`. + • any referenced BAM/BAI/Peaks file is missing. + +1.4 DiffBind execution (R runner module) + +Implement as a Nextflow module that executes an R script: + • `modules/local/diffbind_run.nf` (new) + • `bin/diffbind_run.R` (new) + +Required behaviors + • Accept: + • sample sheet CSV + • contrast definition: treated vs control + • default parameters (minOverlap, FDR, backend, no recentering) + • optional `diffbind_extra_params` file to override defaults + • optional spike-in scale factor TSV for those samples (or embedded column) + +Defaults (hard defaults, but overridable) + • minOverlap: 2 + • summits/recenter: disabled + • FDR threshold: 0.05 + • backend: DESeq2 (allow edgeR) + +Spike-in integration (required when enabled) + +Goal: incorporate per-sample external scaling so that differential log2FC reflects spike-in normalization. + +Design constraint: DiffBind internally uses DESeq2/edgeR normalization assumptions (“most features unchanged”), which can be violated in global-change settings. When `--normalisation_mode Spikein`, we have external scale factors and should prefer them. + +Implementation strategy (v1; robust and explicit): + 1. Use DiffBind primarily to: + • import peaks/BAMs, + • build a consensus peakset and counts matrix (per DiffBind conventions). + 2. Run the differential test using DESeq2/edgeR **with user-provided size factors derived from spike-in**: + • Let `sf = 1 / scale_factor` per sample (so normalized counts = counts / sf = counts * scale_factor). + • Disable DiffBind’s internal normalization where possible, or bypass it by extracting the count matrix and running the backend directly. + +Notes: + • The exact DiffBind API details must be validated during implementation; the R runner should be structured so we can: + • fall back to standard DiffBind analysis if external scaling can’t be injected safely, + • while still meeting the “use spike-in scaling” requirement when possible. + +1.5 Outputs (DiffBind) + +Per `(caller, group)` write to: + + `03_peak_calling/06_differential/01_diffbind///` + +Artifacts (minimum) + • `diffbind.results.tsv` + • One row per tested region with: + • chrom, start, end + • log2FC (treated/control) + • pvalue, FDR + • concentration / counts fields if available + • `diffbind.significant.bed` + • BED-like output for peaks with FDR ≤ `--diffbind_fdr` + • Plots: + • `PCA.pdf` + • `correlation_heatmap.pdf` + • `MA.pdf` + • `volcano.pdf` + +Annotation (required) + • `diffbind.results.annotated.tsv` + • Add at minimum: + • nearest_gene (or nearest_feature) + • distance_to_feature + • feature_id (optional) + +MultiQC stubs (required) + • `diffbind.summary.tsv` + • Columns: + • caller, group, treated_condition, control_condition + • n_tested + • n_fdr_pass + • n_up (log2FC>0 & FDR pass) + • n_down (log2FC<0 & FDR pass) + +1.6 Error handling (DiffBind) + • Fail fast per group/caller if: + • replicate constraints are violated, + • input files missing, + • R script errors. + • Optionally support “continue on failure” later (not required for v1); for v1, prefer clear failure to prevent silent partial analyses. + +⸻ + +Method 2: ChIPBinner (bin-based differential + clustering; broad mark emphasis) + +2.1 Purpose and positioning + +ChIPBinner is best for broad marks and global changes. It does not produce traditional “peaks” first; instead it bins the genome (typically 10kb), quantifies signal per bin, clusters bins by behavior, and tests bins for differential enrichment (via ROTS). + +This method complements peak-caller-derived methods and should be implemented as a separate differential workflow (not as a peakcaller option). + +2.2 Inputs + • Per target sample: + • final target BAM + BAI (required) + • Reference: + • chrom sizes (required) + • binning windows BED (required; can be generated) + • Optional: + • MS coefficients YAML (Snakemake-parity) + • functional DB (for enrichment stubs) + +Controls (IgG/input) + • Default: `chipbinner_use_input=false` (aligns with the requirement of “no controls in differential stage” and with the Snakemake workflow note that controls are disabled by default). + • However, the implementation should be designed so that enabling input later is straightforward: + • if enabled, control bigWigs/BAMs should be matched by `(control_group, condition)` similarly to existing pooling logic. + +2.3 Workflow outline (per group) + +For each `group` (mark): + 1. Build (or load) reference bins: + • Default: generate bins from chrom sizes using `bedtools makewindows -w chipbinner_bin_size` + • Optional: subtract blacklist regions if available + 2. Quantify signal per bin per sample (BAM → bin counts) + 3. Normalize per sample: + • If `--differential_use_spikein` is true: + • Apply external scaling factor per sample (spike-in or MS) + • Optionally apply depth normalization (library size), but beware global-change assumptions + 4. QC: + • PCA + • sample correlation + 5. Clustering: + • Run HDBSCAN over bins using a grid search of `minPts` / `minSamps` (Snakemake-parity) + 6. Differential testing: + • Run ROTS-based differential per bin using replicates kept separate + 7. Annotation: + • Annotate bins to nearest genes / features (same annotation machinery as DiffBind) + 8. Enrichment stubs: + • If a functional DB is provided, run (or stub) LOLA/enrich_clust outputs + +2.4 Snakemake-parity notes (include these to ease porting) + +The `snakemake_chipbinner` repo adds automation beyond a minimal ChIPBinner run. To make porting easier, design the Nextflow implementation around the same conceptual interfaces: + + • CSV-based sample sheet management + • Implement a ChIPBinner-specific sample sheet CSV (one per group) written to: + • `03_peak_calling/06_differential/02_chipbinner/00_samplesheets/.chipbinner.csv` + • Config-driven MS normalization + • Support `--chipbinner_ms_coeffs `: + • v1: load the file and validate structure (even if only partially used initially). + • v2: use MS coefficients as scaling factors, analogous to spike-in scale factors. + • Batch comparisons + • v1: not supported in the statistical model. + • v2: add a “comparisons file” that enumerates multiple (treated, control) contrasts. + • Clustering parameter grid search + • Implement a grid over: + • minPts in [100..1000] + • minSamps in [100..1000] + • Output a summary table ranking parameter choices by cluster stability / replicate consistency metrics. + • Controls disabled by default + • Match snakemake defaults: `chipbinner_use_input=false`. + +2.5 Outputs (ChIPBinner) + +Write to: + `03_peak_calling/06_differential/02_chipbinner//` + +Artifacts (minimum) + • `chipbinner.bin_counts.tsv` (or CSV) + • `chipbinner.normalized_matrix.tsv` (or CSV) + • `chipbinner.clusters.tsv` (bin → cluster assignment) + • `chipbinner.differential.tsv` (bin-level logFC, FDR, cluster) + • Plots: + • `PCA.pdf` + • `correlation.pdf` + • Annotation: + • `chipbinner.differential.annotated.tsv` + • Grid search summary: + • `chipbinner.hdbscan_grid_summary.tsv` + +MultiQC stubs + • `chipbinner.summary.tsv` + • group, treated_condition, control_condition + • n_bins_tested, n_fdr_pass, n_up, n_down + • n_clusters, chosen_minPts, chosen_minSamps + +⸻ + +Method 3: SPAN/OmniPeaks differential (SPAN-native) + +3.1 Clarify tool capability (required discovery step) + +The current pipeline uses OmniPeaks via: + • `java -jar omnipeaks.jar analyze -t -c ...` + +This SPEC requires a SPAN-native differential workflow (user requirement #11b). Implementation must therefore: + 1. Inspect the OmniPeaks CLI help (`java -jar omnipeaks.jar --help`) and/or upstream documentation to determine whether a native differential subcommand exists (e.g., `diff`, `compare`, `differential`, etc.). + 2. If a native differential mode exists: + • implement it directly as a new module (recommended). + 3. If a native differential mode does *not* exist: + • document this clearly and implement the closest SPAN-consistent differential alternative as a separate workflow stage (see 3.4 fallback plan). + +3.2 Inputs (expected) +At minimum: + • final target BAMs (replicates per condition) + • chrom sizes + • OmniPeaks JAR + +Controls (IgG/input) + • Differential stage does not use IgG for modeling (per requirement #5a). + • If the OmniPeaks differential mode requires controls, use the already-produced pooled controls per condition as *technical inputs* to the SPAN command, but do not treat IgG as a “condition” in differential interpretation. + +3.3 Replicates + • Enforce `--differential_min_replicates` per condition (≥2). + • If OmniPeaks differential does not accept replicates directly, pool target BAMs per `(group, condition)` for the SPAN differential step, and record pooling explicitly: + • `03_peak_calling/06_differential/03_span/00_manifests/span_diff_target_pooling.tsv` + +3.4 Outputs (SPAN-diff) + +Write to: + `03_peak_calling/06_differential/03_span//` + +Artifacts (minimum) + • `span.differential.peaks.bed` (or native SPAN output) + • `span.differential.tsv` (log2FC, FDR, peak coords) + • Plots (if SPAN produces them; otherwise create minimal diagnostics): + • `signal_distributions.pdf` (optional) + • Annotation: + • `span.differential.annotated.tsv` + • MultiQC stubs: + • `span.summary.tsv` + +3.5 Fallback plan (if no native SPAN differential exists) + +If OmniPeaks cannot do native differential: + • Do not silently “pretend” it can. + • Implement a transparent SPAN-consistent fallback: + • Use SPAN’s own peak outputs per replicate (already generated), + • build a region set (union across samples), + • count reads per region from the final BAMs, + • run DESeq2/edgeR with spike-in size factors (same external normalization as DiffBind), + • label outputs clearly as “SPAN peaks + DESeq2 differential”. + +This fallback still satisfies “SPAN/OmniPeaks differential” in spirit while being explicit about the statistical engine used. + +⸻ + +Shared component: Gene/feature annotation (required for all methods) + +Because the pipeline already supports a gene BED / GTF for QC, prefer a minimal, tool-agnostic annotation approach: + • Convert differential regions (peaks/bins) to BED + • Annotate with nearest gene/feature using: + • `bedtools closest` against `--gene_bed` (if provided), or + • a GTF-derived BED of TSS/gene bodies (if `--gtf` exists) + +Outputs should include: + • nearest_feature_id / gene_name (if available) + • distance_to_feature + +⸻ + +MultiQC integration stubs (required) + +v1 should not attempt full MultiQC module development unless it’s low effort. Instead: + • Write small summary TSVs for each method. + • Add them as MultiQC custom content tables via `assets/multiqc_config.yml` (custom content). + +Minimum: include per-method summary tables described above. + +⸻ + +Acceptance criteria (what “done” means for implementation) + +Functional + • With a condition-aware samplesheet containing exactly 2 conditions and ≥2 reps/condition: + • `--run_diffbind` produces DiffBind results for **each caller** in `--peakcaller` for each `group`. + • `--run_chipbinner` produces ChIPBinner outputs per `group` including differential tables and plots. + • `--run_span_diff` produces SPAN differential outputs per `group` (native if supported, otherwise explicit fallback). + • With `--normalisation_mode Spikein`, differential results incorporate spike-in scaling (documented and testable). + • With `--differential_from_run `, differential-only runs succeed without recomputing upstream steps. + +Validation / errors + • Fail clearly if: + • contrast not provided or invalid, + • not exactly 2 conditions, + • replicate constraints violated. + +Outputs + • Output directories and filenames match this SPEC (or the implemented naming is documented and stable). + • All three methods produce: + • a result TSV + • plots + • annotated TSV + • MultiQC summary TSV stubs + +⸻ + +Testing strategy (pytest-workflow friendly) + +Given runtime constraints, implement tests in layers: + +1) Unit-ish / fast integration tests (recommended for CI) + • Validate parameter failures: + • no contrast → fails + • 1 condition → fails + • <2 reps/condition → fails + • Validate manifest generation: + • manifests exist and contain expected columns + • Validate that enabling `--run_diffbind` triggers creation of DiffBind sample sheets (even if analysis is stubbed in test mode) + +2) Full integration tests (optional; may be heavy) + • Run a small dataset with `--run_diffbind` and confirm: + • result TSV exists for at least one group and one caller + • For ChIPBinner and SPAN-diff, consider “smoke tests” that run the module scripts with tiny synthetic inputs. + +To keep CI feasible, allow a `test` profile behavior where heavy differential steps are skipped but sample sheet/manifest generation is exercised. diff --git a/docs/extension_specs/EXTENSIONS_ADDED_20260104.md b/docs/extension_specs/EXTENSIONS_ADDED_20260104.md new file mode 100644 index 00000000..1849106b --- /dev/null +++ b/docs/extension_specs/EXTENSIONS_ADDED_20260104.md @@ -0,0 +1,111 @@ +# What we added to the fork (branch adding_new_peak_callers) + + - Samplesheet now supports condition (new header: group,condition,replicate,fastq_1,fastq_2,control). Legacy + sheets without condition still work (condition defaults to NA). This enables group × condition-aware + behavior throughout. + - New grouping metadata: every sample gets group_condition = "${group}_${condition}" and filenames/IDs avoid + collisions across conditions. + - Group/condition-aware normalisation controls + - --normalisation_scope {all,group,group_condition}: for --normalisation_mode Spikein, spike-in scale + factors can be computed globally (legacy) or separately per group / per group+condition. + - --dump_scale_factors true: writes spike-in scale-factor TSVs under + 03_peak_calling/00_normalisation_factors/. + - --igg_scale_scope {legacy,group_condition,sample}: for IgG/control scaling when using read-count + normalisation modes (e.g. BPM/CPM/RPKM). + - Condition-aware pooled IgG controls for callers that require it: + - Pooled BAMs per (control_group, condition) under 03_peak_calling/01_pooled_controls/. + - Fallbacks (if no exact condition-match control exists) are recorded in 03_peak_calling/07_qc_tables/ + control_pooling_fallbacks.tsv. + - New peak callers + variants (in --peakcaller, comma-separated, case-insensitive): + - MACS2_NARROW, MACS2_BROAD (no-control variants) + - GOPEAKS_NARROW, GOPEAKS_BROAD (emits *_gopeaks.json for MultiQC) + - EPIC2_200BP, EPIC2_150BP, EPIC2_25BP (requires pooled controls; needs --epic2_genome if not inferable) + - SPAN_DEFAULT, SPAN_STRINGENT (requires pooled controls; requires --omnipeaks_jar) + - Plus --peakcaller_preset {standard,extended} (extended = all new callers/variants) + - Consensus peaks can be condition-aware + - --consensus_grouping {group,group_condition} (defaults to group_condition when condition exists in the + samplesheet). + - Primary-caller semantics are enforced + - The first entry in --peakcaller (or the first in the preset list) is the “primary” used for downstream + consensus/QC; others are output-only. + - EPIC2_* and SPAN_* error early if --use_control false. + +# DONE (from docs/extension_specs/TODO.md) + + - #1 Add condition to samplesheet: Implemented (condition column supported; legacy sheets still work with + default condition=NA; metadata now includes group_condition). + - #2 Group-specific handling (Normalization + Peak-calling): Implemented (new scoping knobs like + --normalisation_scope and --igg_scale_scope, condition-aware control pooling for control-required callers, + and condition-aware consensus via --consensus_grouping). + - #3 Support additional peakcallers: Implemented (new caller variants incl. GoPeaks, epic2, SPAN/OMNIPEAKS, + MACS2 narrow/broad; plus --peakcaller_preset extended and primary-caller semantics). + +# Peak Caller Variants integrated into our peak-calling pipeline; + - seacr (SEACR; supports --use_control true/false) + - macs2 (legacy MACS2; supports --use_control true/false, with broad/narrow controlled by existing MACS2 + params) + - macs2_narrow (MACS2 narrow, no control) + - macs2_broad (MACS2 broad, no control) + - gopeaks_narrow (GoPeaks narrow; emits peaks + *_gopeaks.json for MultiQC) + - gopeaks_broad (GoPeaks broad; emits peaks + *_gopeaks.json for MultiQC) + - epic2_200bp, epic2_150bp, epic2_25bp (epic2 with different binning; requires pooled controls → + --use_control true) + - span_default, span_stringent (SPAN/OMNIPEAKS; requires pooled controls → --use_control true) + +# How to run your modified pipeline (modeled on your 20251120_CNR scripts) + + 1. Make a condition-aware samplesheet (this is what actually “turns on” the condition/group additions). + - Your current samplesheet.csv encodes condition inside group (e.g. H3K4me3_sgControl). Convert to + something like: + + group,condition,replicate,fastq_1,fastq_2,control + IgG,sgControl,1,fastq/.../C1_1_..._1.fq.gz,fastq/.../C1_1_..._2.fq.gz, + IgG,sgControl,2,fastq/.../C2_1_..._1.fq.gz,fastq/.../C2_1_..._2.fq.gz, + IgG,sgNSD3,1,fastq/.../S1_1_..._1.fq.gz,fastq/.../S1_1_..._2.fq.gz, + IgG,sgNSD3,2,fastq/.../S2_1_..._1.fq.gz,fastq/.../S2_1_..._2.fq.gz, + H3K4me3,sgControl,1,fastq/.../C1_2_..._1.fq.gz,fastq/.../C1_2_..._2.fq.gz,IgG + H3K4me3,sgControl,2,fastq/.../C2_2_..._1.fq.gz,fastq/.../C2_2_..._2.fq.gz,IgG + H3K4me3,sgNSD3,1,fastq/.../S1_2_..._1.fq.gz,fastq/.../S1_2_..._2.fq.gz,IgG + ... + + 2. Create a new params JSON (copy your existing nf-params.narrow.json / nf-params.spikein.json and add the + new knobs). Examples: + + - BPM run using condition-aware consensus + condition-aware IgG scaling + extended callers + - Set at least: + - "input": "samplesheet.with_condition.csv" + - "consensus_grouping": "group_condition" (or omit; defaults this way when condition exists) + - "igg_scale_scope": "group_condition" (new) + - Either "peakcaller_preset": "extended" or explicitly choose/order callers via "peakcaller": "..." + - If you include SPAN/epic2, add: + - "omnipeaks_jar": "/path/to/omnipeaks.jar" + - "epic2_genome": "mm39" (important for your GRCm39 resources unless you set genome in a way that + can be inferred) + - Spike-in run using group-scoped spike-in scaling + scale-factor TSVs + - Add: + - "normalisation_mode": "Spikein" + - "normalisation_scope": "group" (or "group_condition") + - "dump_scale_factors": true + + 3. Run your fork instead of nf-core/cutandrun + + - Replace: + - nextflow run nf-core/cutandrun -r 3.2.1 ... + - With either: + - nextflow run dhusmann/cutandrun -r adding_new_peak_callers -profile singularity -work-dir + -params-file -resume + - or run the local clone directly: + - nextflow run /home/users/dhusmann/.nextflow/assets/nf-core/cutandrun_dev/cutandrun -profile + singularity -work-dir -params-file -resume + + That will produce the new outputs under 03_peak_calling/ (pooled controls, per-caller peak folders, + per-(group,condition) consensus peaks, optional normalisation-factor TSVs, and control fallback QC). + +# REMAINS TODO + - 4 Differential peak calling subworkflow (DiffBind samplesheets + standalone ChIPBinner + standalone + SPAN): Not implemented yet (only design/docs scaffolding exists). + - 5 --compare_norm_methods mode: Not implemented. + - 6 --compare_peak_callers mode: Not implemented. + - 7 Differential peak/enrichment characterization (ChromHMM, cCRE, LOLA/GSEA sets from group×cond, etc.): + Not implemented. + From f123e6e36276f048009e4d6b5fbf8ee3e0330140 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 17:03:54 -0800 Subject: [PATCH 02/31] Add differential peak calling layer --- .../multiqc/differential_skipped_header.txt | 11 + .../multiqc/differential_summary_header.txt | 11 + assets/multiqc_config.yml | 2 + bin/annotate_regions.py | 118 ++++ bin/chipbinner_rots.R | 120 ++++ bin/diffbind_run.R | 349 +++++++++++ bin/hdbscan_grid.py | 89 +++ conf/flowswitch.config | 20 + conf/modules.config | 111 ++++ docs/output.md | 50 +- docs/usage.md | 52 ++ main.nf | 1 + modules/local/annotate_regions.nf | 45 ++ modules/local/chipbinner_bins.nf | 57 ++ modules/local/chipbinner_counts.nf | 81 +++ modules/local/chipbinner_hdbscan_grid.nf | 58 ++ modules/local/chipbinner_rots.nf | 77 +++ modules/local/diffbind_run.nf | 68 +++ modules/local/diffbind_samplesheet.nf | 61 ++ modules/local/differential_overlap.nf | 77 +++ modules/local/differential_summary.nf | 88 +++ modules/local/multiqc.nf | 2 + modules/local/span_capability_probe.nf | 60 ++ modules/local/span_compare.nf | 97 +++ modules/local/span_fallback_diff.nf | 73 +++ modules/local/write_differential_manifests.nf | 134 ++++ nextflow.config | 44 ++ nextflow_schema.json | 268 +++++++- .../local/differential_peak_calling.nf | 576 ++++++++++++++++++ subworkflows/local/prepare_peakcalling.nf | 6 + .../differential_manifest.peaks.tsv | 3 + .../differential_manifest.run_meta.json | 4 + .../differential_manifest.samples.tsv | 3 + tests/config/differential.config | 20 + tests/test_07_differential_peak_calling.yml | 27 + workflows/cutandrun.nf | 70 ++- workflows/differential_only.nf | 106 ++++ 37 files changed, 3007 insertions(+), 32 deletions(-) create mode 100644 assets/multiqc/differential_skipped_header.txt create mode 100644 assets/multiqc/differential_summary_header.txt create mode 100755 bin/annotate_regions.py create mode 100755 bin/chipbinner_rots.R create mode 100755 bin/diffbind_run.R create mode 100755 bin/hdbscan_grid.py create mode 100644 modules/local/annotate_regions.nf create mode 100644 modules/local/chipbinner_bins.nf create mode 100644 modules/local/chipbinner_counts.nf create mode 100644 modules/local/chipbinner_hdbscan_grid.nf create mode 100644 modules/local/chipbinner_rots.nf create mode 100644 modules/local/diffbind_run.nf create mode 100644 modules/local/diffbind_samplesheet.nf create mode 100644 modules/local/differential_overlap.nf create mode 100644 modules/local/differential_summary.nf create mode 100644 modules/local/span_capability_probe.nf create mode 100644 modules/local/span_compare.nf create mode 100644 modules/local/span_fallback_diff.nf create mode 100644 modules/local/write_differential_manifests.nf create mode 100644 subworkflows/local/differential_peak_calling.nf create mode 100644 tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv create mode 100644 tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json create mode 100644 tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv create mode 100644 tests/config/differential.config create mode 100644 tests/test_07_differential_peak_calling.yml create mode 100644 workflows/differential_only.nf diff --git a/assets/multiqc/differential_skipped_header.txt b/assets/multiqc/differential_skipped_header.txt new file mode 100644 index 00000000..7f2c3224 --- /dev/null +++ b/assets/multiqc/differential_skipped_header.txt @@ -0,0 +1,11 @@ +#id: 'differential_skipped' +#parent_id: 'peak_qc' +#parent_name: 'Peak QC' +#parent_description: 'Differential peak calling skipped comparisons' +#section_name: 'Differential Peak Calling Skipped' +#description: 'Comparisons skipped due to missing conditions or replicate constraints' +#plot_type: 'table' +#anchor: 'differential_skipped' +#pconfig: +# id: 'differential_skipped_table' +# title: 'Differential Peak Calling Skipped' diff --git a/assets/multiqc/differential_summary_header.txt b/assets/multiqc/differential_summary_header.txt new file mode 100644 index 00000000..b3f53395 --- /dev/null +++ b/assets/multiqc/differential_summary_header.txt @@ -0,0 +1,11 @@ +#id: 'differential_summary' +#parent_id: 'peak_qc' +#parent_name: 'Peak QC' +#parent_description: 'Differential peak calling summary' +#section_name: 'Differential Peak Calling Summary' +#description: 'Counts of significant differential regions per method, group, and caller' +#plot_type: 'table' +#anchor: 'differential_summary' +#pconfig: +# id: 'differential_summary_table' +# title: 'Differential Peak Calling Summary' diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 240c0d88..08dccd35 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -68,6 +68,8 @@ custom_content: - consensus_peak_counts - primary_frip_score - peak_reprod_perc + - differential_summary + - differential_skipped - software-versions-by-process - software-versions-unique diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py new file mode 100755 index 00000000..c71e3522 --- /dev/null +++ b/bin/annotate_regions.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os +import subprocess +import sys +from pathlib import Path + + +def detect_cols(header): + header_lower = [h.lower() for h in header] + def find(names): + for name in names: + if name in header_lower: + return header[header_lower.index(name)] + return None + chr_col = find(["chr", "chrom", "seqnames", "seqname"]) + start_col = find(["start"]) + end_col = find(["end"]) + return chr_col, start_col, end_col + + +def run(cmd, stdout=None): + result = subprocess.run(cmd, stdout=stdout, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{result.stderr}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--features", default=None) + parser.add_argument("--gtf", default=None) + args = parser.parse_args() + + input_path = Path(args.input) + output_path = Path(args.output) + + with input_path.open() as handle: + reader = csv.reader(handle, delimiter='\t') + header = next(reader) + + chr_col, start_col, end_col = detect_cols(header) + if not chr_col or not start_col or not end_col: + raise SystemExit("Could not detect chr/start/end columns for annotation") + + features_bed = None + if args.features and Path(args.features).exists(): + features_bed = Path(args.features) + elif args.gtf and Path(args.gtf).exists(): + raw_bed = Path("features.raw.bed") + tss_bed = Path("features.tss.bed") + run(["gtf2bed", args.gtf], stdout=raw_bed.open("w")) + with raw_bed.open() as in_handle, tss_bed.open("w") as out_handle: + for line in in_handle: + if not line.strip(): + continue + fields = line.rstrip().split("\t") + if len(fields) < 6: + continue + chrom, start, end, name, score, strand = fields[:6] + start = int(start) + end = int(end) + if strand == "+": + tss_start = start + else: + tss_start = max(end - 1, 0) + tss_end = tss_start + 1 + out_handle.write("\t".join([chrom, str(tss_start), str(tss_end), name]) + "\n") + features_bed = tss_bed + + if not features_bed or not features_bed.exists(): + # no annotation possible; copy input to output + output_path.write_text(input_path.read_text()) + return + + regions_bed = Path("regions.bed") + with input_path.open() as handle, regions_bed.open("w") as out_handle: + reader = csv.DictReader(handle, delimiter='\t') + for idx, row in enumerate(reader): + out_handle.write("\t".join([ + row[chr_col], + str(row[start_col]), + str(row[end_col]), + str(idx) + ]) + "\n") + + closest_out = Path("closest.tsv") + run(["bedtools", "closest", "-d", "-a", str(regions_bed), "-b", str(features_bed)], stdout=closest_out.open("w")) + + annotations = {} + with closest_out.open() as handle: + for line in handle: + if not line.strip(): + continue + fields = line.rstrip().split("\t") + if len(fields) < 5: + continue + row_id = int(fields[3]) + feature_id = fields[7] if len(fields) > 7 else "NA" + distance = fields[-1] + annotations[row_id] = (feature_id, distance) + + with input_path.open() as handle, output_path.open("w") as out_handle: + reader = csv.DictReader(handle, delimiter='\t') + fieldnames = reader.fieldnames + ["nearest_feature_id", "distance_to_feature"] + writer = csv.DictWriter(out_handle, delimiter='\t', fieldnames=fieldnames, lineterminator='\n') + writer.writeheader() + for idx, row in enumerate(reader): + feature_id, distance = annotations.get(idx, ("NA", "NA")) + row["nearest_feature_id"] = feature_id + row["distance_to_feature"] = distance + writer.writerow(row) + + +if __name__ == "__main__": + main() diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R new file mode 100755 index 00000000..d6adbbbe --- /dev/null +++ b/bin/chipbinner_rots.R @@ -0,0 +1,120 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) + library(jsonlite) +}) + +option_list <- list( + make_option(c("--matrix"), type = "character"), + make_option(c("--clusters"), type = "character"), + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--lfc"), type = "double", default = 1.0), + make_option(c("--bootstrap"), type = "integer", default = 1000), + make_option(c("--k_value"), type = "integer", default = 100000) +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +matrix_df <- read.table(opt$matrix, header = TRUE, sep = "\t", check.names = FALSE) +clusters_df <- read.table(opt$clusters, header = TRUE, sep = "\t", check.names = FALSE) +samples_df <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) + +sample_ids <- samples_df$sample_id +conditions <- samples_df$condition + +if (!all(sample_ids %in% colnames(matrix_df))) { + stop("Matrix columns do not include all sample IDs") +} + +counts <- as.matrix(matrix_df[, sample_ids, drop = FALSE]) +rownames(counts) <- paste(matrix_df$chr, matrix_df$start, matrix_df$end, sep = ":") + +# Differential testing +log2fc <- log2(rowMeans(counts[, conditions == opt$treated, drop = FALSE]) / + rowMeans(counts[, conditions == opt$control, drop = FALSE])) + +use_rots <- TRUE +tryCatch({ + suppressPackageStartupMessages(library(ROTS)) +}, error = function(e) { + use_rots <<- FALSE +}) + +if (use_rots) { + groups <- ifelse(conditions == opt$treated, 1, 0) + rots_res <- ROTS::ROTS(counts, groups = groups, B = opt$bootstrap, K = opt$k_value) + pvals <- rots_res$pvalue +} else { + pvals <- apply(counts, 1, function(x) { + t.test(x[conditions == opt$treated], x[conditions == opt$control])$p.value + }) +} + +fdr <- p.adjust(pvals, method = "BH") + +results <- data.frame( + chr = matrix_df$chr, + start = matrix_df$start, + end = matrix_df$end, + log2FC = log2fc, + pvalue = pvals, + FDR = fdr +) + +if (all(c("chr", "start", "end", "cluster") %in% colnames(clusters_df))) { + clusters_key <- paste(clusters_df$chr, clusters_df$start, clusters_df$end, sep = ":") + results$cluster <- clusters_df$cluster[match(paste(results$chr, results$start, results$end, sep = ":"), clusters_key)] +} + +write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) + +sig <- results[results$FDR <= opt$fdr & abs(results$log2FC) >= opt$lfc, , drop = FALSE] +up <- sig[sig$log2FC > 0, , drop = FALSE] +down <- sig[sig$log2FC < 0, , drop = FALSE] + +write.table(up[, c("chr", "start", "end")], "chipbinner.significant_up.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(down[, c("chr", "start", "end")], "chipbinner.significant_down.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +summary <- data.frame( + method = "chipbinner", + group = opt$group, + caller = "NA", + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = nrow(results), + n_fdr_pass = nrow(sig), + n_up = nrow(up), + n_down = nrow(down) +) +write.table(summary, "chipbinner.summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE) + +# Plots +counts_log <- log2(counts + 1) +try({ + pca <- prcomp(t(counts_log), scale. = TRUE) + pdf("plots/PCA.pdf") + plot(pca$x[,1], pca$x[,2], col = as.factor(conditions), pch = 19, + xlab = "PC1", ylab = "PC2", main = "PCA") + legend("topright", legend = levels(as.factor(conditions)), col = 1:length(unique(conditions)), pch = 19) + dev.off() +}, silent = TRUE) + +try({ + pdf("plots/correlation_heatmap.pdf") + heatmap(cor(counts_log), main = "Correlation") + dev.off() +}, silent = TRUE) + +try({ + treated_means <- rowMeans(counts[, conditions == opt$treated, drop = FALSE]) + control_means <- rowMeans(counts[, conditions == opt$control, drop = FALSE]) + pdf("plots/density_scatter.pdf") + plot(control_means, treated_means, pch = 16, cex = 0.5, + xlab = "Control mean", ylab = "Treated mean", main = "Density scatter") + dev.off() +}, silent = TRUE) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R new file mode 100755 index 00000000..cfaec45b --- /dev/null +++ b/bin/diffbind_run.R @@ -0,0 +1,349 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list <- list( + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--caller"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--lfc"), type = "double", default = 1.0), + make_option(c("--min_overlap"), type = "integer", default = 2), + make_option(c("--summits"), type = "integer", default = 0), + make_option(c("--backend"), type = "character", default = "DESeq2"), + make_option(c("--use_spikein"), type = "character", default = "false"), + make_option(c("--prefix"), type = "character", default = "diffbind"), + make_option(c("--extra_params"), type = "character", default = NULL) +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +suppressPackageStartupMessages({ + library(DiffBind) +}) + +use_spikein <- tolower(opt$use_spikein) %in% c("true", "1", "yes") + +samplesheet <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) +required_cols <- c("SampleID", "Condition", "bamReads", "Peaks", "Replicate") +missing <- setdiff(required_cols, colnames(samplesheet)) +if (length(missing) > 0) { + stop("Missing required columns in samplesheet: ", paste(missing, collapse = ", ")) +} + +samplesheet <- samplesheet[samplesheet$Condition %in% c(opt$treated, opt$control), , drop = FALSE] +if (nrow(samplesheet) == 0) { + stop("No samples remain after filtering to contrast conditions") +} + +sample_ids <- samplesheet$SampleID +scale_factors <- NULL +if ("SpikeinScaleFactor" %in% colnames(samplesheet)) { + scale_factors <- suppressWarnings(as.numeric(samplesheet$SpikeinScaleFactor)) +} + +if (use_spikein && (is.null(scale_factors) || any(is.na(scale_factors)))) { + message("Spike-in scaling requested but scale factors missing; falling back to default normalization") + use_spikein <- FALSE +} + +extra <- list() +if (!is.null(opt$extra_params) && file.exists(opt$extra_params)) { + suppressPackageStartupMessages({ + library(jsonlite) + library(yaml) + }) + if (grepl("\\.ya?ml$", opt$extra_params, ignore.case = TRUE)) { + extra <- yaml::read_yaml(opt$extra_params) + } else { + extra <- jsonlite::fromJSON(opt$extra_params) + } +} + +backend <- toupper(opt$backend) +backend_method <- if (backend == "EDGER") DBA_EDGER else DBA_DESEQ2 + +summits_value <- if (opt$summits > 0) opt$summits else FALSE + +count_args <- list(summits = summits_value, minOverlap = opt$min_overlap) +if (!is.null(extra$dba_count) && is.list(extra$dba_count)) { + count_args <- modifyList(count_args, extra$dba_count) +} + +analyze_args <- list(method = backend_method) +if (!is.null(extra$dba_analyze) && is.list(extra$dba_analyze)) { + analyze_args <- modifyList(analyze_args, extra$dba_analyze) +} + +contrast_args <- list(categories = DBA_CONDITION, group1 = opt$treated, group2 = opt$control) +if (!is.null(extra$dba_contrast) && is.list(extra$dba_contrast)) { + contrast_args <- modifyList(contrast_args, extra$dba_contrast) +} + +extract_counts <- function(dba_obj, sample_ids) { + df <- NULL + counts <- NULL + peaks_df <- NULL + try({ + df <- dba.peakset(dba_obj, bRetrieve = TRUE, DataType = DBA_DATA_FRAME) + }, silent = TRUE) + if (!is.null(df)) { + count_cols <- match(sample_ids, colnames(df)) + if (all(!is.na(count_cols))) { + counts <- as.matrix(df[, count_cols, drop = FALSE]) + } + coord_cols <- list(chr = NULL, start = NULL, end = NULL) + for (col in colnames(df)) { + if (tolower(col) %in% c("chr", "chrom", "seqnames", "seqname")) coord_cols$chr <- col + if (tolower(col) %in% c("start")) coord_cols$start <- col + if (tolower(col) %in% c("end")) coord_cols$end <- col + } + if (!is.null(coord_cols$chr) && !is.null(coord_cols$start) && !is.null(coord_cols$end)) { + peaks_df <- df[, c(coord_cols$chr, coord_cols$start, coord_cols$end), drop = FALSE] + colnames(peaks_df) <- c("chr", "start", "end") + } + } + if (is.null(counts)) { + try({ + gr <- dba.peakset(dba_obj, bRetrieve = TRUE) + df_gr <- as.data.frame(gr) + count_cols <- match(sample_ids, colnames(df_gr)) + if (all(!is.na(count_cols))) { + counts <- as.matrix(df_gr[, count_cols, drop = FALSE]) + } + if (all(c("seqnames", "start", "end") %in% colnames(df_gr))) { + peaks_df <- df_gr[, c("seqnames", "start", "end")] + colnames(peaks_df) <- c("chr", "start", "end") + } + }, silent = TRUE) + } + list(counts = counts, peaks = peaks_df) +} + +run_deseq2 <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(DESeq2)) + cond <- factor(conditions, levels = c(control, treated)) + dds <- DESeqDataSetFromMatrix(countData = counts, colData = data.frame(condition = cond), design = ~condition) + if (!is.null(scale_factors)) { + sizeFactors(dds) <- 1 / scale_factors + } + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treated, control)) + data.frame(log2FC = res$log2FoldChange, pvalue = res$pvalue, FDR = res$padj) +} + +run_edger <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(edgeR)) + cond <- factor(conditions, levels = c(control, treated)) + y <- DGEList(counts = counts, group = cond) + if (!is.null(scale_factors)) { + y$samples$norm.factors <- 1 / scale_factors + } else { + y <- calcNormFactors(y) + } + design <- model.matrix(~cond) + y <- estimateDisp(y, design) + fit <- glmQLFit(y, design) + qlf <- glmQLFTest(fit, coef = 2) + data.frame(log2FC = qlf$table$logFC, pvalue = qlf$table$PValue, FDR = p.adjust(qlf$table$PValue, method = "BH")) +} + +write_plot_placeholder <- function(path, title) { + pdf(path) + plot.new() + title(main = title) + dev.off() +} + +samplesheet$Condition <- as.character(samplesheet$Condition) + +if (use_spikein) { + dba_obj <- dba(sampleSheet = samplesheet) + dba_obj <- do.call(dba.count, c(list(dba_obj), count_args)) + dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) + + extracted <- extract_counts(dba_obj, sample_ids) + if (is.null(extracted$counts) || is.null(extracted$peaks)) { + message("Unable to extract counts/peaks for spike-in normalization; falling back to DiffBind default") + use_spikein <- FALSE + } else { + counts <- extracted$counts + if (backend == "EDGER") { + res_stats <- run_edger(counts, samplesheet$Condition, scale_factors, opt$treated, opt$control) + } else { + res_stats <- run_deseq2(counts, samplesheet$Condition, scale_factors, opt$treated, opt$control) + } + res_df <- cbind(extracted$peaks, res_stats) + res_df$log2FC[is.na(res_df$log2FC)] <- 0 + res_df$FDR[is.na(res_df$FDR)] <- 1 + res_df$pvalue[is.na(res_df$pvalue)] <- 1 + + write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- res_df[res_df$FDR <= opt$fdr & abs(res_df$log2FC) >= opt$lfc, , drop = FALSE] + write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + up <- sig[sig$log2FC > 0, , drop = FALSE] + down <- sig[sig$log2FC < 0, , drop = FALSE] + write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + n_tested <- nrow(res_df) + n_fdr <- nrow(sig) + n_up <- nrow(up) + n_down <- nrow(down) + summary <- data.frame( + method = "diffbind", + group = opt$group, + caller = opt$caller, + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = n_tested, + n_fdr_pass = n_fdr, + n_up = n_up, + n_down = n_down + ) + write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + # plots + counts_log <- log2(counts + 1) + try({ + pca <- prcomp(t(counts_log), scale. = TRUE) + pdf("plots/PCA.pdf") + plot(pca$x[,1], pca$x[,2], col = as.factor(samplesheet$Condition), pch = 19, + xlab = "PC1", ylab = "PC2", main = "PCA") + legend("topright", legend = levels(as.factor(samplesheet$Condition)), col = 1:length(unique(samplesheet$Condition)), pch = 19) + dev.off() + }, silent = TRUE) + try({ + pdf("plots/correlation_heatmap.pdf") + heatmap(cor(counts_log), main = "Correlation") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/MA.pdf") + plot(rowMeans(counts_log), res_df$log2FC, pch = 16, cex = 0.5, main = "MA", xlab = "Mean", ylab = "log2FC") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/volcano.pdf") + plot(res_df$log2FC, -log10(res_df$FDR), pch = 16, cex = 0.5, main = "Volcano", xlab = "log2FC", ylab = "-log10(FDR)") + dev.off() + }, silent = TRUE) + try({ + pdf("plots/heatmap.pdf") + heatmap(counts_log, Rowv = NA, Colv = NA, scale = "row", main = "Binding heatmap") + dev.off() + }, silent = TRUE) + } +} + +if (!use_spikein) { + dba_obj <- dba(sampleSheet = samplesheet) + dba_obj <- do.call(dba.count, c(list(dba_obj), count_args)) + dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) + dba_obj <- do.call(dba.analyze, c(list(dba_obj), analyze_args)) + + res <- dba.report(dba_obj, th = opt$fdr, fold = opt$lfc) + res_df <- as.data.frame(res) + + coord_cols <- list(chr = NULL, start = NULL, end = NULL) + for (col in colnames(res_df)) { + if (tolower(col) %in% c("chr", "chrom", "seqnames", "seqname")) coord_cols$chr <- col + if (tolower(col) %in% c("start")) coord_cols$start <- col + if (tolower(col) %in% c("end")) coord_cols$end <- col + } + if (!is.null(coord_cols$chr)) { + res_df$chr <- res_df[[coord_cols$chr]] + } + if (!is.null(coord_cols$start)) { + res_df$start <- res_df[[coord_cols$start]] + } + if (!is.null(coord_cols$end)) { + res_df$end <- res_df[[coord_cols$end]] + } + + if (!"log2FC" %in% colnames(res_df)) { + if ("Fold" %in% colnames(res_df)) { + res_df$log2FC <- res_df$Fold + } else if ("fold" %in% colnames(res_df)) { + res_df$log2FC <- res_df$fold + } + } + if (!"FDR" %in% colnames(res_df)) { + if ("FDR" %in% colnames(res_df)) { + res_df$FDR <- res_df$FDR + } + } + + write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + sig <- res_df[!is.na(res_df$FDR) & res_df$FDR <= opt$fdr & abs(res_df$log2FC) >= opt$lfc, , drop = FALSE] + write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + up <- sig[sig$log2FC > 0, , drop = FALSE] + down <- sig[sig$log2FC < 0, , drop = FALSE] + write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + n_tested <- nrow(res_df) + n_fdr <- nrow(sig) + n_up <- nrow(up) + n_down <- nrow(down) + summary <- data.frame( + method = "diffbind", + group = opt$group, + caller = opt$caller, + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = n_tested, + n_fdr_pass = n_fdr, + n_up = n_up, + n_down = n_down + ) + write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + + # plots + try({ + pdf("plots/PCA.pdf") + dba.plotPCA(dba_obj, DBA_CONDITION, label = DBA_ID) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/correlation_heatmap.pdf") + dba.plotHeatmap(dba_obj, correlations = TRUE) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/MA.pdf") + dba.plotMA(dba_obj) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/volcano.pdf") + dba.plotVolcano(dba_obj) + dev.off() + }, silent = TRUE) + + try({ + pdf("plots/heatmap.pdf") + dba.plotHeatmap(dba_obj) + dev.off() + }, silent = TRUE) +} + +if (!file.exists(paste0(opt$prefix, ".results.tsv"))) { + stop("DiffBind failed to produce results") +} + +plots <- c("plots/PCA.pdf", "plots/correlation_heatmap.pdf", "plots/MA.pdf", "plots/volcano.pdf", "plots/heatmap.pdf") +for (plot_file in plots) { + if (!file.exists(plot_file)) { + write_plot_placeholder(plot_file, basename(plot_file)) + } +} diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py new file mode 100755 index 00000000..434ddded --- /dev/null +++ b/bin/hdbscan_grid.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +import argparse +import itertools +import sys + +import numpy as np +import pandas as pd + + +def parse_list(value): + return [int(x.strip()) for x in str(value).split(',') if x.strip()] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--matrix", required=True) + parser.add_argument("--min_cluster_size", required=True) + parser.add_argument("--min_samples", required=True) + parser.add_argument("--summary", required=True) + parser.add_argument("--clusters", required=True) + args = parser.parse_args() + + df = pd.read_csv(args.matrix, sep='\t') + if df.shape[1] <= 3: + raise SystemExit("Matrix missing sample columns") + + coords = df.iloc[:, :3] + matrix = df.iloc[:, 3:].values + + try: + import hdbscan + except Exception: + # fallback: single cluster + summary = pd.DataFrame([ + { + "min_cluster_size": 0, + "min_samples": 0, + "n_clusters": 1, + "fraction_assigned": 1.0, + "mean_persistence": 1.0, + "selected": True, + } + ]) + summary.to_csv(args.summary, sep='\t', index=False) + clusters = coords.copy() + clusters["cluster"] = 0 + clusters.to_csv(args.clusters, sep='\t', index=False) + return + + min_cluster_sizes = parse_list(args.min_cluster_size) + min_samples_list = parse_list(args.min_samples) + + results = [] + best = None + best_score = None + best_labels = None + + for mcs, ms in itertools.product(min_cluster_sizes, min_samples_list): + clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms) + labels = clusterer.fit_predict(matrix) + n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + fraction_assigned = float(np.sum(labels >= 0)) / float(len(labels)) if len(labels) else 0.0 + mean_persistence = float(np.mean(clusterer.cluster_persistence_)) if getattr(clusterer, 'cluster_persistence_', None) is not None and len(clusterer.cluster_persistence_) > 0 else 0.0 + score = (fraction_assigned, n_clusters, mean_persistence) + if best_score is None or score > best_score: + best_score = score + best = (mcs, ms) + best_labels = labels + results.append({ + "min_cluster_size": mcs, + "min_samples": ms, + "n_clusters": n_clusters, + "fraction_assigned": round(fraction_assigned, 4), + "mean_persistence": round(mean_persistence, 4), + "selected": False, + }) + + summary = pd.DataFrame(results) + if best is not None: + summary.loc[(summary.min_cluster_size == best[0]) & (summary.min_samples == best[1]), "selected"] = True + summary.to_csv(args.summary, sep='\t', index=False) + + clusters = coords.copy() + clusters["cluster"] = best_labels if best_labels is not None else 0 + clusters.to_csv(args.clusters, sep='\t', index=False) + + +if __name__ == "__main__": + main() diff --git a/conf/flowswitch.config b/conf/flowswitch.config index a79202c8..07afeb04 100644 --- a/conf/flowswitch.config +++ b/conf/flowswitch.config @@ -135,3 +135,23 @@ if(params.only_peak_calling) { } if(params.skip_multiqc) { params.run_multiqc = false } + +if(params.differential_from_run) { + params.run_genome_prep = false + params.run_input_check = false + params.run_cat_fastq = false + params.run_trim_galore_fastqc = false + params.run_alignment = false + params.run_read_filter = false + params.run_preseq = false + params.run_mark_dups = false + params.run_remove_dups = false + params.run_remove_linear_dups = false + params.run_peak_calling = false + params.run_reporting = false + params.run_deeptools_heatmaps = false + params.run_deeptools_qc = false + params.run_peak_qc = false + params.run_multiqc = false + params.run_igv = false +} diff --git a/conf/modules.config b/conf/modules.config index 676eb44a..e3421104 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1149,3 +1149,114 @@ if (params.run_multiqc) { } } } + +/* +======================================================================================== + DIFFERENTIAL PEAK CALLING +======================================================================================== +*/ + +process { + withName: '.*WRITE_DIFFERENTIAL_MANIFESTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*MAKE_DIFFBIND_SAMPLESHEET' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*DIFFBIND_RUN' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_BINS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/bins" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_COUNTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/counts" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_HDBSCAN_GRID' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/clustering" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*CHIPBINNER_ROTS' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> + if (filename == 'versions.yml') { return null } + if (filename == 'plots') { return 'plots' } + if (filename.startsWith('plots')) { return "plots/${filename}" } + if (filename.endsWith('.bed')) { return "bed/${filename}" } + if (filename.contains('differential')) { return "differential/${filename}" } + return filename + } + ] + } + + withName: '.*SPAN_CAPABILITY_PROBE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/00_manifests" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SPAN_COMPARE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SPAN_FALLBACK_DIFF' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*DIFFERENTIAL_OVERLAP' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/04_cross_comparison" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*DIFFERENTIAL_SUMMARY_MERGE' { + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/multiqc" }, + mode: "${params.publish_dir_mode}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} diff --git a/docs/output.md b/docs/output.md index ccf61f74..22d65384 100644 --- a/docs/output.md +++ b/docs/output.md @@ -30,7 +30,8 @@ - 6.7. [GoPeaks peak calling](#GoPeakspeakcalling) - 6.8. [epic2 peak calling](#epic2peakcalling) - 6.9. [SPAN/OmniPeaks peak calling](#SPANpeakcalling) - - 6.10. [Consensus Peaks](#ConsensusPeaks) + - 6.10. [Differential peak calling](#DifferentialPeakCalling) + - 6.11. [Consensus Peaks](#ConsensusPeaks) - 7. [Peak-based QC](#Peak-basedQC) - 7.1. [Peak Counts](#PeakCounts) - 7.2. [Peak Reproducibility](#PeakReproducibility) @@ -425,7 +426,52 @@ epic2 callers require pooled controls; see the pooled controls section above. SPAN/OmniPeaks callers require pooled controls and the `--omnipeaks_jar` parameter. -### 6.10. Consensus Peaks +### 6.10. Differential peak calling + +
+Output files + +- `03_peak_calling/06_differential/00_manifests/` + - `differential_manifest.samples.tsv`: per-sample metadata for posthoc differential runs. + - `differential_manifest.peaks.tsv`: per-sample peak paths for each caller. + - `differential_manifest.run_meta.json`: run metadata (callers, reference, default contrast). +- `03_peak_calling/06_differential/01_diffbind/00_samplesheets//` + - `.diffbind.csv`: DiffBind samplesheet for each group/caller. +- `03_peak_calling/06_differential/01_diffbind///` + - `diffbind.results.tsv`: differential result table. + - `diffbind.results.annotated.tsv`: annotated differential results. + - `diffbind.significant.bed`: significant regions. + - `diffbind.significant_up.bed`: regions enriched in treated. + - `diffbind.significant_down.bed`: regions enriched in control. + - `diffbind.summary.tsv`: per-group summary for MultiQC. + - `plots/*.pdf`: diagnostic plots (PCA/correlation). +- `03_peak_calling/06_differential/02_chipbinner//` + - `bins/*.bed`: genome-wide bins. + - `counts/*.tsv`: bin count matrices. + - `clustering/*.tsv`: clustering diagnostics. + - `differential/*.tsv`: differential results. + - `bed/*.bed`: significant differential bins. + - `plots/*.pdf`: diagnostic plots. + - `chipbinner.summary.tsv`: per-group summary for MultiQC. +- `03_peak_calling/06_differential/03_span//` + - `span.differential.tsv`: differential results. + - `span.differential.bed`: significant regions. + - `span.differential.annotated.tsv`: annotated differential results. + - `span.significant_up.bed`: regions enriched in treated. + - `span.significant_down.bed`: regions enriched in control. + - `span.summary.tsv`: per-group summary for MultiQC. +- `03_peak_calling/06_differential/04_cross_comparison/` + - `overlap_callers.tsv`: overlap between DiffBind results across callers. + - `overlap_methods.tsv`: overlap between DiffBind/ChIPBinner/SPAN results. +- `03_peak_calling/06_differential/multiqc/` + - `differential.summary.all_methods.tsv`: merged summary table. + - `differential.skipped.tsv`: skipped groups/callers with reasons. + +
+ +Differential peak calling is enabled with `--run_diffbind`, `--run_chipbinner`, and/or `--run_span_diff`. A contrast definition (`--differential_contrast "TREATED,CONTROL"`) is required to set the log2FC direction. The pipeline can also run in posthoc mode via `--differential_from_run` to reuse an existing run directory. + +### 6.11. Consensus Peaks
Output files diff --git a/docs/usage.md b/docs/usage.md index 56780e94..9f10a769 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -167,6 +167,58 @@ If control samples are provided in the sample sheet, they will be used to normal After peak calling, consensus peaks are calculated by merging peaks within the same grouping key. Use `--consensus_grouping` to choose `group` or `group_condition`. By default, if the samplesheet includes a `condition` column, grouping uses `group_condition`; otherwise it falls back to `group`. The number of replicates required for a valid peak can be changed using `replicate_threshold`. To call consensus peaks across all samples, set `--consensus_peak_mode all`. +### Differential Peak Calling + +Differential peak calling is an optional analysis layer that can be enabled after peak calling. It supports three complementary methods: + +- **DiffBind** (per caller × group) +- **ChIPBinner** (per group) +- **SPAN/OmniPeaks differential** (per group) + +To enable differential analysis, set one or more `--run_*` toggles and provide an explicit contrast: + +``` +--run_diffbind \ +--differential_contrast "Treated,Control" +``` + +Key parameters: + +- `--run_diffbind`, `--run_chipbinner`, `--run_span_diff` +- `--differential_contrast "TREATED,CONTROL"` (required when any method is enabled) +- `--differential_min_replicates` (default: 2) +- `--differential_strict` (fail on invalid comparisons instead of skipping) +- `--differential_use_spikein` (auto|true|false) +- `--differential_annotate` (annotate differential regions) +- `--differential_cross_compare` (cross-method/caller overlap tables) +- `--differential_run_multiqc` (generate summary tables for MultiQC) + +DiffBind options: + +- `--diffbind_fdr`, `--diffbind_lfc`, `--diffbind_backend`, `--diffbind_min_overlap`, `--diffbind_summits` +- `--diffbind_extra_params` (JSON/YAML overrides for advanced DiffBind settings) + +ChIPBinner options: + +- `--chipbinner_bin_size`, `--chipbinner_windows_dir`, `--chipbinner_blacklist` +- `--chipbinner_pseudocount`, `--chipbinner_fdr`, `--chipbinner_lfc` +- `--chipbinner_hdbscan_grid_min_cluster_size`, `--chipbinner_hdbscan_grid_min_samples` +- `--chipbinner_bootstrap`, `--chipbinner_k_value` + +SPAN differential options: + +- `--omnipeaks_jar` (required for SPAN differential) +- `--span_diff_mode` (auto|native|fallback), `--span_diff_fdr`, `--span_diff_gap`, `--span_diff_bin` +- `--span_diff_java_heap` + +Posthoc differential-only mode: + +``` +nextflow run . -entry DIFFERENTIAL_ONLY \ + --differential_from_run \ + --differential_contrast "Treated,Control" +``` + ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/main.nf b/main.nf index f3b5b5eb..2adea865 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ WorkflowMain.initialise(workflow, params, log, args) */ include { CUTANDRUN } from './workflows/cutandrun' +include { DIFFERENTIAL_ONLY } from './workflows/differential_only' workflow NFCORE_CUTANDRUN { /* diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf new file mode 100644 index 00000000..f131a9c8 --- /dev/null +++ b/modules/local/annotate_regions.nf @@ -0,0 +1,45 @@ +process ANNOTATE_REGIONS { + tag "${results_tsv.simpleName}" + label 'process_medium' + + publishDir = { (meta?.publish_dir ?: task.ext.publish_dir) ? [ + path: meta?.publish_dir ?: task.ext.publish_dir, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] : null } + + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11 conda-forge::perl=5.26.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(results_tsv) + val gene_bed + val gtf + + output: + path "*.annotated.tsv", emit: annotated + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: results_tsv.baseName + def gene_arg = gene_bed ? "--features ${gene_bed}" : '' + def gtf_arg = (!gene_bed && gtf) ? "--gtf ${gtf}" : '' + """ + python ${projectDir}/bin/annotate_regions.py \ + --input ${results_tsv} \ + --output ${prefix}.annotated.tsv \ + ${gene_arg} \ + ${gtf_arg} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_bins.nf b/modules/local/chipbinner_bins.nf new file mode 100644 index 00000000..d92a0f4d --- /dev/null +++ b/modules/local/chipbinner_bins.nf @@ -0,0 +1,57 @@ +process CHIPBINNER_BINS { + tag "${group}.${bin_size}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/bins" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + path chrom_sizes + val bin_size + val group + val windows_dir + val blacklist + + output: + tuple val(group), path("*.windows.bed"), emit: bins + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${group}.${bin_size}" + def windows_arg = windows_dir ? "${windows_dir}" : '' + def blacklist_arg = blacklist ? "${blacklist}" : '' + """ + set -euo pipefail + if [ -n "${windows_arg}" ]; then + windows_file=\$(ls ${windows_arg}/*${bin_size}*.bed 2>/dev/null | head -n 1 || true) + if [ -z "\$windows_file" ]; then + echo "No window file found in ${windows_arg} for bin size ${bin_size}" >&2 + exit 1 + fi + cp "\$windows_file" ${prefix}.windows.bed + else + bedtools makewindows -g ${chrom_sizes} -w ${bin_size} > ${prefix}.windows.bed + fi + + if [ -n "${blacklist_arg}" ]; then + bedtools subtract -a ${prefix}.windows.bed -b ${blacklist_arg} > ${prefix}.windows.filtered.bed + mv ${prefix}.windows.filtered.bed ${prefix}.windows.bed + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf new file mode 100644 index 00000000..e7280d9e --- /dev/null +++ b/modules/local/chipbinner_counts.nf @@ -0,0 +1,81 @@ +process CHIPBINNER_COUNTS { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/counts" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + path bins + val samples_json + val group + val use_spikein + val pseudocount + + output: + path "chipbinner.bin_counts.tsv" , emit: counts + path "chipbinner.normalized_matrix.tsv", emit: normalized + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import json + import subprocess + import sys + from pathlib import Path + + samples = json.loads(r'''${samples_json}''') + bam_paths = [s['bam'] for s in samples] + sample_ids = [s['sample_id'] for s in samples] + scale_factors = [s.get('spikein_scale_factor') for s in samples] + + cmd = ["bedtools", "multicov", "-bams"] + bam_paths + ["-bed", "${bins}"] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + sys.stderr.write(result.stderr) + sys.exit(result.returncode) + + lines = result.stdout.strip().splitlines() + with open('chipbinner.bin_counts.tsv', 'w') as handle: + handle.write("\\t".join(["chr", "start", "end"] + sample_ids) + "\\n") + for line in lines: + if not line.strip(): + continue + handle.write(line + "\\n") + + # normalized matrix + use_spikein = str('${use_spikein}').lower() in ("true", "1", "yes") + with open('chipbinner.normalized_matrix.tsv', 'w') as handle: + handle.write("\\t".join(["chr", "start", "end"] + sample_ids) + "\\n") + for line in lines: + if not line.strip(): + continue + parts = line.split("\\t") + coords = parts[:3] + counts = [float(x) for x in parts[3:]] + if use_spikein and all(sf not in (None, "NA", "") for sf in scale_factors): + factors = [float(sf) for sf in scale_factors] + counts = [c * f for c, f in zip(counts, factors)] + counts = [c + float('${pseudocount}') for c in counts] + handle.write("\\t".join(coords + [f"{c:.6f}" for c in counts]) + "\\n") + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf new file mode 100644 index 00000000..6e2ffed0 --- /dev/null +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -0,0 +1,58 @@ +process CHIPBINNER_HDBSCAN_GRID { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/clustering" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::pandas conda-forge::scikit-learn conda-forge::hdbscan" + container "quay.io/biocontainers/python:3.8.3" + + input: + path matrix + val group + val min_cluster_size + val min_samples + + output: + path "chipbinner.hdbscan_grid_summary.tsv", emit: grid_summary + path "chipbinner.clusters.best.tsv" , emit: clusters + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import importlib.util + import subprocess + import sys + + required = { + "numpy": "numpy", + "pandas": "pandas", + "sklearn": "scikit-learn", + "hdbscan": "hdbscan", + } + missing = [pkg for mod, pkg in required.items() if importlib.util.find_spec(mod) is None] + if missing: + subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir"] + missing) + PY + + python ${projectDir}/bin/hdbscan_grid.py \ + --matrix ${matrix} \ + --min_cluster_size ${min_cluster_size} \ + --min_samples ${min_samples} \ + --summary chipbinner.hdbscan_grid_summary.tsv \ + --clusters chipbinner.clusters.best.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf new file mode 100644 index 00000000..51ca0e6c --- /dev/null +++ b/modules/local/chipbinner_rots.nf @@ -0,0 +1,77 @@ +process CHIPBINNER_ROTS { + tag "${group}" + label 'process_high' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename == 'versions.yml') { return null } + if (filename == 'plots') { return 'plots' } + if (filename.startsWith('plots')) { return "plots/${filename}" } + if (filename.endsWith('.bed')) { return "bed/${filename}" } + if (filename.contains('differential')) { return "differential/${filename}" } + return filename + } + ] + + conda "bioconda::r-rots conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-rots:1.19.0--r42hdfd78af_0' : + 'biocontainers/r-rots:1.19.0--r42hdfd78af_0' }" + + input: + path matrix + path clusters + val samples_json + val treated + val control + val group + val fdr + val lfc + val bootstrap + val k_value + + output: + tuple val(group), path("chipbinner.differential.tsv") , emit: results + tuple val(group), path("chipbinner.significant_up.bed") , emit: up + tuple val(group), path("chipbinner.significant_down.bed") , emit: down + tuple val(group), path("chipbinner.summary.tsv") , emit: summary + tuple val(group), path("plots") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + mkdir -p plots + python - <<'PY' + import json + import csv + samples = json.loads(r'''${samples_json}''') + with open('chipbinner.samplesheet.csv', 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(['sample_id', 'condition']) + for row in samples: + writer.writerow([row['sample_id'], row['condition']]) + PY + + Rscript ${projectDir}/bin/chipbinner_rots.R \ + --matrix ${matrix} \ + --clusters ${clusters} \ + --samplesheet chipbinner.samplesheet.csv \ + --treated ${treated} \ + --control ${control} \ + --group ${group} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --bootstrap ${bootstrap} \ + --k_value ${k_value} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/diffbind_run.nf b/modules/local/diffbind_run.nf new file mode 100644 index 00000000..2a7d27f3 --- /dev/null +++ b/modules/local/diffbind_run.nf @@ -0,0 +1,68 @@ +process DIFFBIND_RUN { + tag "${group}.${caller}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::r-yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-diffbind:3.14.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-diffbind:3.14.0--r42hdfd78af_0' }" + + input: + path samplesheet + val treated + val control + val group + val caller + val use_spikein + val fdr + val lfc + val min_overlap + val summits + val backend + val extra_params + val output_prefix + + output: + tuple val(group), val(caller), path("${output_prefix}.results.tsv") , emit: results + tuple val(group), val(caller), path("${output_prefix}.significant.bed") , emit: significant + tuple val(group), val(caller), path("${output_prefix}.significant_up.bed") , emit: significant_up + tuple val(group), val(caller), path("${output_prefix}.significant_down.bed") , emit: significant_down + tuple val(group), val(caller), path("${output_prefix}.summary.tsv") , emit: summary + tuple val(group), val(caller), path("plots") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def extra_arg = extra_params ? "--extra_params ${extra_params}" : '' + """ + mkdir -p plots + Rscript ${projectDir}/bin/diffbind_run.R \ + --samplesheet ${samplesheet} \ + --treated ${treated} \ + --control ${control} \ + --group ${group} \ + --caller ${caller} \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --min_overlap ${min_overlap} \ + --summits ${summits} \ + --backend ${backend} \ + --use_spikein ${use_spikein} \ + --prefix ${output_prefix} \ + ${extra_arg} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diffbind: \$(Rscript -e 'packageVersion("DiffBind")' 2>/dev/null | tr -d '[]') + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/diffbind_samplesheet.nf b/modules/local/diffbind_samplesheet.nf new file mode 100644 index 00000000..21a790f3 --- /dev/null +++ b/modules/local/diffbind_samplesheet.nf @@ -0,0 +1,61 @@ +process MAKE_DIFFBIND_SAMPLESHEET { + tag "${group}.${caller}" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/01_diffbind/00_samplesheets/${caller}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + val samples_json + val group + val caller + + output: + tuple val(group), val(caller), path("*.diffbind.csv"), emit: samplesheet + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${group}.diffbind" + """\ + python - <<'PY' + import csv + import json + + samples = json.loads(r'''${samples_json}''') + out = f"${prefix}.csv" + + headers = [ + 'SampleID','Tissue','Factor','Condition','Replicate','bamReads','Peaks','PeakCaller','SpikeinScaleFactor' + ] + with open(out, 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(headers) + for sample in samples: + writer.writerow([ + sample['sample_id'], + 'CUTRUN', + sample['group'], + sample['condition'], + sample['replicate'], + sample['bam'], + sample['peaks'], + sample['caller'], + sample.get('spikein_scale_factor', 'NA'), + ]) + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/differential_overlap.nf b/modules/local/differential_overlap.nf new file mode 100644 index 00000000..779b4026 --- /dev/null +++ b/modules/local/differential_overlap.nf @@ -0,0 +1,77 @@ +process DIFFERENTIAL_OVERLAP { + tag "overlap" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/04_cross_comparison" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + path callers_tsv + path methods_tsv + + output: + path "overlap_callers.tsv" , emit: callers + path "overlap_methods.tsv" , emit: methods + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + set -euo pipefail + + printf "group\tcaller_a\tcaller_b\tintersection\tunion\tjaccard\n" > overlap_callers.tsv + if [[ -s "${callers_tsv}" ]]; then + awk -F'\t' 'BEGIN{OFS="\t"} { g=\$1; c=\$2; p=\$3; n[g]++; callers[g,n[g]]=c; paths[g,n[g]]=p } + END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { print g, callers[g,i], paths[g,i], callers[g,j], paths[g,j] } } } }' "${callers_tsv}" \ + | while IFS=\$'\\t' read -r group caller_a path_a caller_b path_b; do + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$2"\\t"\$3"\\t"\$4}') + if [[ -z "\$jaccard_line" ]]; then + inter=0 + union=0 + jacc=0 + else + IFS=\$'\\t' read -r inter union jacc <<< "\$jaccard_line" + fi + printf "%s\t%s\t%s\t%s\t%s\t%s\n" "\$group" "\$caller_a" "\$caller_b" "\$inter" "\$union" "\$jacc" >> overlap_callers.tsv + done + fi + + printf "group\tmethod_a\tmethod_b\tcaller\tintersection\tunion\tjaccard\n" > overlap_methods.tsv + if [[ -s "${methods_tsv}" ]]; then + awk -F'\t' 'BEGIN{OFS="\t"} { g=\$1; m=\$2; c=\$3; p=\$4; n[g]++; methods[g,n[g]]=m; callers[g,n[g]]=c; paths[g,n[g]]=p } + END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { print g, methods[g,i], callers[g,i], paths[g,i], methods[g,j], callers[g,j], paths[g,j] } } } }' "${methods_tsv}" \ + | while IFS=\$'\\t' read -r group method_a caller_a path_a method_b caller_b path_b; do + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$2"\\t"\$3"\\t"\$4}') + if [[ -z "\$jaccard_line" ]]; then + inter=0 + union=0 + jacc=0 + else + IFS=\$'\\t' read -r inter union jacc <<< "\$jaccard_line" + fi + caller="NA" + if [[ "\$method_a" == "diffbind" ]]; then + caller="\$caller_a" + elif [[ "\$method_b" == "diffbind" ]]; then + caller="\$caller_b" + fi + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "\$group" "\$method_a" "\$method_b" "\$caller" "\$inter" "\$union" "\$jacc" >> overlap_methods.tsv + done + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf new file mode 100644 index 00000000..dd39319f --- /dev/null +++ b/modules/local/differential_summary.nf @@ -0,0 +1,88 @@ +process DIFFERENTIAL_SUMMARY_MERGE { + tag "differential_summary" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + path summary_files + val skipped_json + path summary_header + path skipped_header + + output: + path "differential.summary.all_methods.tsv", emit: summary + path "differential.skipped.tsv" , emit: skipped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import csv + import json + from pathlib import Path + + summary_files = sorted(Path('.').glob('*.summary.tsv')) + + with open('${summary_header}', 'r') as header_handle: + header_lines = header_handle.read().rstrip('\\n') + + rows = [] + for path in summary_files: + with path.open() as handle: + reader = csv.DictReader(handle, delimiter='\\t') + for row in reader: + rows.append(row) + + if rows: + fieldnames = list(rows[0].keys()) + else: + fieldnames = [ + 'method','group','caller','treated_condition','control_condition','n_tested','n_fdr_pass','n_up','n_down' + ] + + with open('differential.summary.all_methods.tsv', 'w', newline='') as out_handle: + if header_lines: + out_handle.write(header_lines + '\\n') + writer = csv.DictWriter(out_handle, delimiter='\\t', fieldnames=fieldnames, lineterminator='\\n') + writer.writeheader() + for row in rows: + writer.writerow(row) + + skipped = json.loads(r'''${skipped_json}''') if '${skipped_json}' else [] + + with open('${skipped_header}', 'r') as header_handle: + skipped_header = header_handle.read().rstrip('\\n') + + with open('differential.skipped.tsv', 'w', newline='') as out_handle: + if skipped_header: + out_handle.write(skipped_header + '\\n') + fieldnames = ['method','group','caller','reason','details'] + writer = csv.DictWriter(out_handle, delimiter='\\t', fieldnames=fieldnames, lineterminator='\\n') + writer.writeheader() + for row in skipped: + writer.writerow({ + 'method': row.get('method', 'NA'), + 'group': row.get('group', 'NA'), + 'caller': row.get('caller', 'NA'), + 'reason': row.get('reason', 'NA'), + 'details': row.get('details', '') + }) + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 81f70b95..bde1405a 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -32,6 +32,8 @@ process MULTIQC { path ('peak_metrics/peak_reprod_perc/*') path ('frag_len/*') path ('linear_duplicates/*') + path ('differential/*') + path ('differential/*') output: path "*multiqc_report.html", emit: report diff --git a/modules/local/span_capability_probe.nf b/modules/local/span_capability_probe.nf new file mode 100644 index 00000000..2ef182b8 --- /dev/null +++ b/modules/local/span_capability_probe.nf @@ -0,0 +1,60 @@ +process SPAN_CAPABILITY_PROBE { + tag "omnipeaks" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::openjdk=21.0.2 conda-forge::python=3.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://eclipse-temurin:21-jre' : + 'eclipse-temurin:21-jre' }" + + input: + path omnipeaks_jar + + output: + path "omnipeaks_capabilities.json", emit: capabilities + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + java -jar ${omnipeaks_jar} --help > omnipeaks_help.txt + python - <<'PY' + import json + import re + + commands = set() + with open('omnipeaks_help.txt') as handle: + for line in handle: + line = line.strip() + if not line: + continue + m = re.match(r'^(?:[A-Z]+)?\s*([a-zA-Z][\w-]+)\b', line) + if m: + cmd = m.group(1) + if cmd not in ('usage', 'options'): + commands.add(cmd) + commands = sorted(commands) + has_compare = any(cmd in ('compare', 'diff', 'differential') for cmd in commands) + out = { + "commands": commands, + "has_compare": has_compare, + } + with open('omnipeaks_capabilities.json', 'w') as handle: + json.dump(out, handle, indent=2) + handle.write("\\n") + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + java: \$(java -version 2>&1 | head -n 1 | sed -e 's/"//g') + END_VERSIONS + """.stripIndent() +} diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf new file mode 100644 index 00000000..d143daa2 --- /dev/null +++ b/modules/local/span_compare.nf @@ -0,0 +1,97 @@ +process SPAN_COMPARE { + tag "${meta.id}" + label 'process_long' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::openjdk=21.0.2 bioconda::samtools=1.19.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://eclipse-temurin:21-jre' : + 'eclipse-temurin:21-jre' }" + + input: + tuple val(meta), path(treated_bams), path(control_bams) + path chrom_sizes + path omnipeaks_jar + val gap + val bin + val fdr + val java_heap + + output: + tuple val(meta), path("span.differential.tsv") , emit: tsv + tuple val(meta), path("span.differential.bed") , emit: bed + tuple val(meta), path("span.significant_up.bed") , emit: up + tuple val(meta), path("span.significant_down.bed") , emit: down + tuple val(meta), path("span.summary.tsv") , emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}.span_compare" + def fdr_arg = fdr ? "--fdr ${fdr}" : '' + def bin_arg = bin ? "--bin ${bin}" : '' + """ + treated_list=(${treated_bams}) + control_list=(${control_bams}) + + treated_input=${treated_bams} + control_input=${control_bams} + if [ \${#treated_list[@]} -gt 1 ]; then + samtools merge -f treated_pool.bam ${treated_bams} + samtools index treated_pool.bam + treated_input=treated_pool.bam + fi + if [ \${#control_list[@]} -gt 1 ]; then + samtools merge -f control_pool.bam ${control_bams} + samtools index control_pool.bam + control_input=control_pool.bam + fi + + java -Xmx${java_heap} -jar ${omnipeaks_jar} compare \ + -t ${treated_input} \ + -c ${control_input} \ + --cs ${chrom_sizes} \ + --gap ${gap} \ + ${bin_arg} \ + ${fdr_arg} \ + -p ${prefix} + + diff_file=\$(ls ${prefix}*.peak 2>/dev/null | head -n 1 || true) + if [ -z "\$diff_file" ]; then + diff_file=\$(ls ${prefix}*.bed 2>/dev/null | head -n 1 || true) + fi + if [ -z "\$diff_file" ]; then + echo "SPAN compare produced no output" >&2 + exit 1 + fi + + cp "\$diff_file" span.differential.tsv + awk 'BEGIN{OFS="\t"} {print \$1,\$2,\$3}' "\$diff_file" > span.differential.bed + + # Attempt to split by sign using column 5 if numeric + awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5>0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_up.bed + awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5<0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_down.bed + + total=\$(grep -v '^#' "\$diff_file" | wc -l | awk '{print \$1}') + n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') + n_down=\$(wc -l < span.significant_down.bed | awk '{print \$1}') + n_fdr_pass=\$((n_up + n_down)) + cat <<-END_SUMMARY > span.summary.tsv + method\tgroup\tcaller\ttreated_condition\tcontrol_condition\tn_tested\tn_fdr_pass\tn_up\tn_down + span\t${meta.group}\tNA\t${meta.treated ?: 'NA'}\t${meta.control ?: 'NA'}\t\${total}\t\${n_fdr_pass}\t\${n_up}\t\${n_down} + END_SUMMARY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + java: \$(java -version 2>&1 | head -n 1 | sed -e 's/\"//g') + samtools: \$(samtools --version 2>&1 | head -n 1 | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/span_fallback_diff.nf b/modules/local/span_fallback_diff.nf new file mode 100644 index 00000000..ce3f2e74 --- /dev/null +++ b/modules/local/span_fallback_diff.nf @@ -0,0 +1,73 @@ +process SPAN_FALLBACK_DIFF { + tag "${group}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::r-yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-diffbind:3.14.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-diffbind:3.14.0--r42hdfd78af_0' }" + + input: + path samplesheet + val treated + val control + val group + val use_spikein + val fdr + val lfc + val min_overlap + val summits + val backend + + output: + tuple val(group), path("span_fallback.differential.tsv") , emit: results + tuple val(group), path("span_fallback.differential.bed") , emit: bed + tuple val(group), path("span_fallback.significant_up.bed") , emit: up + tuple val(group), path("span_fallback.significant_down.bed") , emit: down + tuple val(group), path("span_fallback.summary.tsv") , emit: summary + tuple val(group), path("span_fallback.readme.txt") , emit: readme + tuple val(group), path("plots") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + mkdir -p plots + Rscript ${projectDir}/bin/diffbind_run.R \ + --samplesheet ${samplesheet} \ + --treated ${treated} \ + --control ${control} \ + --group ${group} \ + --caller span_fallback \ + --fdr ${fdr} \ + --lfc ${lfc} \ + --min_overlap ${min_overlap} \ + --summits ${summits} \ + --backend ${backend} \ + --use_spikein ${use_spikein} \ + --prefix span_fallback + + mv span_fallback.results.tsv span_fallback.differential.tsv + mv span_fallback.significant.bed span_fallback.differential.bed + + cat <<-END_README > span_fallback.readme.txt + SPAN fallback differential: SPAN peaks + DiffBind/DESeq2/edgeR analysis. + This fallback runs DiffBind on SPAN-derived peak sets to approximate differential enrichment + when native OmniPeaks differential mode is unavailable. + END_README + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diffbind: \$(Rscript -e 'packageVersion("DiffBind")' 2>/dev/null | tr -d '[]') + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/write_differential_manifests.nf b/modules/local/write_differential_manifests.nf new file mode 100644 index 00000000..b5586319 --- /dev/null +++ b/modules/local/write_differential_manifests.nf @@ -0,0 +1,134 @@ +process WRITE_DIFFERENTIAL_MANIFESTS { + tag "differential_manifests" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::python=3.11" + container "quay.io/biocontainers/python:3.8.3" + + input: + val samples_json + val peaks_json + val run_meta_json + val outdir + val chrom_sizes + + output: + path "differential_manifest.samples.tsv", emit: samples + path "differential_manifest.peaks.tsv" , emit: peaks + path "differential_manifest.run_meta.json", emit: run_meta + path "chrom_sizes.sizes" , optional: true, emit: chrom_sizes + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """\ + python - <<'PY' + import json + import sys + import shutil + from pathlib import Path + + samples = json.loads(r'''${samples_json}''') + peaks = json.loads(r'''${peaks_json}''') + run_meta = json.loads(r'''${run_meta_json}''') + outdir = Path(r'''${outdir}''') + + def resolve_published(path_str): + if not path_str or path_str == 'NA': + return path_str + path = Path(path_str) + if path.exists() and outdir in path.parents: + return str(path) + if not outdir.exists(): + return path_str + matches = list(outdir.rglob(path.name)) + if matches: + return str(matches[0]) + return path_str + + samples_sorted = sorted( + samples, + key=lambda x: ( + str(x.get('group', '')), + str(x.get('condition', '')), + int(x.get('replicate', 0)), + str(x.get('sample_id', '')), + ), + ) + + peaks_sorted = sorted( + peaks, + key=lambda x: ( + str(x.get('group', '')), + str(x.get('condition', '')), + int(x.get('replicate', 0)), + str(x.get('sample_id', '')), + str(x.get('caller', '')), + ), + ) + + with open('differential_manifest.samples.tsv', 'w') as handle: + handle.write("\\t".join([ + 'sample_id', + 'group', + 'condition', + 'replicate', + 'final_bam', + 'final_bai', + 'normalisation_mode', + 'spikein_scale_factor', + 'bigwig', + ]) + "\\n") + for row in samples_sorted: + handle.write("\\t".join([ + str(row.get('sample_id', '')), + str(row.get('group', '')), + str(row.get('condition', '')), + str(row.get('replicate', '')), + resolve_published(str(row.get('final_bam', ''))), + resolve_published(str(row.get('final_bai', ''))), + str(row.get('normalisation_mode', '')), + str(row.get('spikein_scale_factor', 'NA')), + resolve_published(str(row.get('bigwig', 'NA'))), + ]) + "\\n") + + with open('differential_manifest.peaks.tsv', 'w') as handle: + handle.write("\\t".join([ + 'sample_id', + 'caller', + 'peaks_path', + 'peaks_format', + 'caller_role', + ]) + "\\n") + for row in peaks_sorted: + handle.write("\\t".join([ + str(row.get('sample_id', '')), + str(row.get('caller', '')), + resolve_published(str(row.get('peaks_path', ''))), + str(row.get('peaks_format', 'other')), + str(row.get('caller_role', '')), + ]) + "\\n") + + with open('differential_manifest.run_meta.json', 'w') as handle: + json.dump(run_meta, handle, indent=2, sort_keys=True) + handle.write("\\n") + + chrom = '${chrom_sizes}' + if chrom and Path(chrom).exists(): + shutil.copy(chrom, 'chrom_sizes.sizes') + PY + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """.stripIndent() +} diff --git a/nextflow.config b/nextflow.config index 39d9e100..999fff73 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,6 +96,50 @@ params { consensus_grouping = null replicate_threshold = 1.0 + // Differential peak calling + run_diffbind = false + run_chipbinner = false + run_span_diff = false + differential_from_run = null + differential_contrast = null + differential_min_replicates = 2 + differential_strict = false + differential_use_spikein = "auto" + differential_annotate = true + differential_cross_compare = true + differential_run_multiqc = true + + // DiffBind options + diffbind_fdr = 0.05 + diffbind_lfc = 1.0 + diffbind_backend = "DESeq2" + diffbind_min_overlap = 2 + diffbind_summits = 0 + diffbind_extra_params = null + + // ChIPBinner options + chipbinner_bin_size = 10000 + chipbinner_windows_dir = null + chipbinner_blacklist = null + chipbinner_use_input = false + chipbinner_pseudocount = 1 + chipbinner_fdr = 0.05 + chipbinner_lfc = 1.0 + chipbinner_bootstrap = 1000 + chipbinner_k_value = 100000 + chipbinner_hdbscan_grid_min_cluster_size = "100,200,500,1000" + chipbinner_hdbscan_grid_min_samples = "100,200,500,1000" + chipbinner_ms_coeffs = null + chipbinner_lola_db = null + chipbinner_run_lola = false + + // SPAN differential options + span_diff_mode = "auto" + span_diff_fdr = 0.05 + span_diff_gap = 5 + span_diff_bin = 200 + span_diff_java_heap = "8G" + // Reporting and Visualisation skip_reporting = false skip_igv = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c68e540..5d33059d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -329,14 +332,24 @@ "default": "Spikein", "fa_icon": "fab fa-buffer", "description": "Sets the target read normalisation mode. Options are: [\"Spikein\", \"RPKM\", \"CPM\", \"BPM\", \"None\" ]", - "enum": ["Spikein", "RPKM", "CPM", "BPM", "None"] + "enum": [ + "Spikein", + "RPKM", + "CPM", + "BPM", + "None" + ] }, "normalisation_scope": { "type": "string", "default": "all", "fa_icon": "fab fa-buffer", "description": "Scope for spike-in normalisation when normalisation_mode is Spikein. Options are: [all, group, group_condition].", - "enum": ["all", "group", "group_condition"] + "enum": [ + "all", + "group", + "group_condition" + ] }, "normalisation_binsize": { "type": "integer", @@ -354,7 +367,10 @@ "default": "standard", "fa_icon": "fas fa-align-justify", "description": "Convenience preset for peak callers. Options are: [standard, extended]. Ignored if --peakcaller is set.", - "enum": ["standard", "extended"] + "enum": [ + "standard", + "extended" + ] }, "use_control": { "type": "boolean", @@ -379,7 +395,11 @@ "default": "legacy", "fa_icon": "fas fa-align-justify", "description": "Control scaling scope for IgG/background samples. Options are: [legacy, group_condition, sample].", - "enum": ["legacy", "group_condition", "sample"] + "enum": [ + "legacy", + "group_condition", + "sample" + ] }, "seacr_peak_threshold": { "type": "number", @@ -392,14 +412,20 @@ "default": "non", "fa_icon": "fas fa-align-justify", "description": "SEACR normalization. ", - "enum": ["non", "norm"] + "enum": [ + "non", + "norm" + ] }, "seacr_stringent": { "type": "string", "default": "stringent", "fa_icon": "fas fa-align-justify", "description": "SEACR stringency.", - "enum": ["stringent", "relaxed"] + "enum": [ + "stringent", + "relaxed" + ] }, "macs2_qvalue": { "type": "number", @@ -463,13 +489,19 @@ "default": "group", "fa_icon": "fas fa-align-justify", "description": "Specifies what samples to group together for consensus peaks. Options are [group, all]", - "enum": ["group", "all"] + "enum": [ + "group", + "all" + ] }, "consensus_grouping": { "type": "string", "fa_icon": "fas fa-align-justify", "description": "Grouping key for consensus peaks when consensus_peak_mode is group. Options are [group, group_condition]. Defaults to group_condition when the samplesheet has a condition column, otherwise group.", - "enum": ["group", "group_condition"] + "enum": [ + "group", + "group_condition" + ] }, "replicate_threshold": { "type": "number", @@ -666,7 +698,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "singularity_pull_docker_container": { @@ -755,6 +794,210 @@ "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } + }, + "differential_options": { + "title": "Differential peak calling", + "type": "object", + "description": "Options for differential peak calling / enrichment.", + "default": "", + "fa_icon": "fas fa-balance-scale", + "properties": { + "run_diffbind": { + "type": "boolean", + "default": false, + "description": "Enable DiffBind differential analysis." + }, + "run_chipbinner": { + "type": "boolean", + "default": false, + "description": "Enable ChIPBinner differential analysis." + }, + "run_span_diff": { + "type": "boolean", + "default": false, + "description": "Enable SPAN/OmniPeaks differential analysis." + }, + "differential_from_run": { + "type": "string", + "default": null, + "description": "Run differential-only using manifests from an existing output directory.", + "fa_icon": "fas fa-folder-open" + }, + "differential_contrast": { + "type": "string", + "default": null, + "description": "Contrast definition as 'TREATED,CONTROL'." + }, + "differential_min_replicates": { + "type": "integer", + "default": 2, + "description": "Minimum replicates per condition for differential analysis." + }, + "differential_strict": { + "type": "boolean", + "default": false, + "description": "Fail on invalid differential comparisons instead of skipping." + }, + "differential_use_spikein": { + "type": "string", + "default": "auto", + "description": "Use spike-in scale factors for differential analysis (auto|true|false).", + "enum": [ + "auto", + "true", + "false" + ] + }, + "differential_annotate": { + "type": "boolean", + "default": true, + "description": "Annotate differential regions to nearest features." + }, + "differential_cross_compare": { + "type": "boolean", + "default": true, + "description": "Compute cross-caller and cross-method overlaps." + }, + "differential_run_multiqc": { + "type": "boolean", + "default": true, + "description": "Generate differential MultiQC summary tables." + }, + "diffbind_fdr": { + "type": "number", + "default": 0.05, + "description": "DiffBind FDR threshold." + }, + "diffbind_lfc": { + "type": "number", + "default": 1.0, + "description": "DiffBind minimum log2 fold-change." + }, + "diffbind_backend": { + "type": "string", + "default": "DESeq2", + "description": "DiffBind backend (DESeq2 or edgeR).", + "enum": [ + "DESeq2", + "edgeR" + ] + }, + "diffbind_min_overlap": { + "type": "integer", + "default": 2, + "description": "DiffBind minimum overlap for consensus peakset." + }, + "diffbind_summits": { + "type": "integer", + "default": 0, + "description": "Summit-centered peak width for DiffBind (0 = use full peaks)." + }, + "diffbind_extra_params": { + "type": "string", + "default": null, + "description": "JSON/YAML file with extra DiffBind options." + }, + "chipbinner_bin_size": { + "type": "integer", + "default": 10000, + "description": "Bin size in bp for ChIPBinner." + }, + "chipbinner_windows_dir": { + "type": "string", + "default": null, + "description": "Directory with precomputed bin windows for ChIPBinner." + }, + "chipbinner_blacklist": { + "type": "string", + "default": null, + "description": "Blacklist BED to exclude from ChIPBinner bins." + }, + "chipbinner_use_input": { + "type": "boolean", + "default": false, + "description": "Use control/input samples in ChIPBinner (if available)." + }, + "chipbinner_pseudocount": { + "type": "integer", + "default": 1, + "description": "Pseudocount added before log transforms." + }, + "chipbinner_fdr": { + "type": "number", + "default": 0.05, + "description": "ChIPBinner FDR threshold." + }, + "chipbinner_lfc": { + "type": "number", + "default": 1.0, + "description": "ChIPBinner minimum log2 fold-change." + }, + "chipbinner_bootstrap": { + "type": "integer", + "default": 1000, + "description": "ROTS bootstrap iterations for ChIPBinner." + }, + "chipbinner_k_value": { + "type": "integer", + "default": 100000, + "description": "ROTS K value (top list size)." + }, + "chipbinner_hdbscan_grid_min_cluster_size": { + "type": "string", + "default": "100,200,500,1000", + "description": "HDBSCAN min_cluster_size grid (comma-separated)." + }, + "chipbinner_hdbscan_grid_min_samples": { + "type": "string", + "default": "100,200,500,1000", + "description": "HDBSCAN min_samples grid (comma-separated)." + }, + "chipbinner_ms_coeffs": { + "type": "string", + "default": null, + "description": "MS coefficients YAML for ChIPBinner normalization." + }, + "chipbinner_lola_db": { + "type": "string", + "default": null, + "description": "LOLA database path for ChIPBinner enrichment." + }, + "chipbinner_run_lola": { + "type": "boolean", + "default": false, + "description": "Run LOLA enrichment for ChIPBinner clusters." + }, + "span_diff_mode": { + "type": "string", + "default": "auto", + "description": "SPAN differential mode (auto|native|fallback).", + "enum": [ + "auto", + "native", + "fallback" + ] + }, + "span_diff_fdr": { + "type": "number", + "default": 0.05, + "description": "SPAN differential FDR threshold." + }, + "span_diff_gap": { + "type": "integer", + "default": 5, + "description": "SPAN compare gap parameter." + }, + "span_diff_bin": { + "type": "integer", + "default": 200, + "description": "SPAN compare bin size." + }, + "span_diff_java_heap": { + "type": "string", + "default": "8G", + "description": "Java heap for SPAN/OmniPeaks differential." + } + } } }, "allOf": [ @@ -773,6 +1016,9 @@ { "$ref": "#/definitions/pipeline_options" }, + { + "$ref": "#/definitions/differential_options" + }, { "$ref": "#/definitions/reporting_options" }, @@ -786,4 +1032,4 @@ "$ref": "#/definitions/generic_options" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf new file mode 100644 index 00000000..6e675108 --- /dev/null +++ b/subworkflows/local/differential_peak_calling.nf @@ -0,0 +1,576 @@ +/* + * Differential peak calling / enrichment + */ + +import groovy.json.JsonOutput +import groovy.json.JsonSlurper + +include { WRITE_DIFFERENTIAL_MANIFESTS } from '../../modules/local/write_differential_manifests' +include { MAKE_DIFFBIND_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' +include { MAKE_DIFFBIND_SAMPLESHEET as MAKE_SPAN_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' +include { DIFFBIND_RUN } from '../../modules/local/diffbind_run' +include { CHIPBINNER_BINS } from '../../modules/local/chipbinner_bins' +include { CHIPBINNER_COUNTS } from '../../modules/local/chipbinner_counts' +include { CHIPBINNER_HDBSCAN_GRID } from '../../modules/local/chipbinner_hdbscan_grid' +include { CHIPBINNER_ROTS } from '../../modules/local/chipbinner_rots' +include { SPAN_CAPABILITY_PROBE } from '../../modules/local/span_capability_probe' +include { SPAN_COMPARE } from '../../modules/local/span_compare' +include { SPAN_FALLBACK_DIFF } from '../../modules/local/span_fallback_diff' +include { ANNOTATE_REGIONS } from '../../modules/local/annotate_regions' +include { DIFFERENTIAL_OVERLAP } from '../../modules/local/differential_overlap' +include { DIFFERENTIAL_SUMMARY_MERGE } from '../../modules/local/differential_summary' + +workflow DIFFERENTIAL_PEAK_CALLING { + take: + ch_bam_bai // tuple [meta, bam, bai] + ch_peaks // tuple [meta, peaks] + ch_bigwig // tuple [meta, bigwig] + ch_spikein_scale // tuple [meta, scale] + ch_chrom_sizes // path + callers // list of callers + + main: + ch_versions = Channel.empty() + ch_skipped_records = Channel.empty() + ch_summary_files = Channel.empty() + ch_diffbind_significant = Channel.empty() + ch_chipbinner_significant = Channel.empty() + ch_span_significant = Channel.empty() + + ch_chrom_sizes_single = ch_chrom_sizes + .map { it instanceof List ? it[0] : it } + .ifEmpty('') + ch_gene_bed = Channel.value(params.gene_bed ? file(params.gene_bed) : '') + ch_gtf = Channel.value(params.gtf ? file(params.gtf) : '') + + def contrast_parts = params.differential_contrast?.split(',')?.collect { it.trim() }?.findAll { it } + if (!contrast_parts || contrast_parts.size() != 2) { + exit 1, "Invalid --differential_contrast value. Expected 'TREATED,CONTROL'." + } + def treated = contrast_parts[0] + def control = contrast_parts[1] + + def use_spikein_param = params.differential_use_spikein?.toString()?.toLowerCase() + ch_use_spikein = Channel.value('false') + if (use_spikein_param == 'true') { + ch_use_spikein = Channel.value('true') + } else if (use_spikein_param == 'auto') { + ch_use_spikein = ch_spikein_scale + .map { meta, scale -> scale } + .filter { it != null && it.toString() != 'NA' } + .count() + .map { it > 0 ? 'true' : 'false' } + } + + ch_bam_bai_target = ch_bam_bai.filter { meta, bam, bai -> meta.is_control == false } + ch_bigwig_target = ch_bigwig.filter { meta, bw -> meta.is_control == false } + ch_scale_target = ch_spikein_scale.filter { meta, scale -> meta.is_control == false } + + ch_bigwig_map = ch_bigwig_target + .map { meta, bw -> [meta.sample_id ?: meta.id, bw] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_scale_map = ch_scale_target + .map { meta, scale -> [meta.sample_id ?: meta.id, scale] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_samples = ch_bam_bai_target + .combine(ch_bigwig_map) + .combine(ch_scale_map) + .map { meta, bam, bai, bw_map, scale_map -> + def sample_id = meta.sample_id ?: meta.id + def scale = scale_map.get(sample_id, 'NA') + def bigwig = bw_map.get(sample_id, 'NA') + [ + sample_id: sample_id, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate, + bam: bam, + bai: bai, + bam_path: bam.toString(), + bai_path: bai.toString(), + normalisation_mode: params.normalisation_mode, + spikein_scale_factor: scale, + bigwig_path: bigwig == null ? 'NA' : bigwig.toString(), + meta: meta + ] + } + + // Write manifests in integrated mode + if (!params.differential_from_run) { + ch_samples_manifest = ch_samples + .map { rec -> + [ + sample_id: rec.sample_id, + group: rec.group, + condition: rec.condition, + replicate: rec.replicate, + final_bam: rec.bam_path, + final_bai: rec.bai_path, + normalisation_mode: rec.normalisation_mode, + spikein_scale_factor: rec.spikein_scale_factor ?: 'NA', + bigwig: rec.bigwig_path ?: 'NA' + ] + } + .toList() + .map { list -> JsonOutput.toJson(list) } + + ch_peaks_manifest = ch_peaks + .filter { meta, peaks -> meta.is_control == false } + .map { meta, peaks -> + def sample_id = meta.sample_id ?: meta.id + def caller_role = callers && callers[0] == meta.caller ? 'primary' : 'secondary' + def peaks_format = 'other' + def name = peaks.getName() + if (name.endsWith('.narrowPeak')) { + peaks_format = 'narrowPeak' + } else if (name.endsWith('.broadPeak')) { + peaks_format = 'broadPeak' + } else if (name.endsWith('.bed')) { + peaks_format = 'bed' + } + [ + sample_id: sample_id, + caller: meta.caller, + peaks_path: peaks.toString(), + peaks_format: peaks_format, + caller_role: caller_role, + group: meta.group, + condition: meta.condition, + replicate: meta.replicate + ] + } + .toList() + .map { list -> JsonOutput.toJson(list) } + + def run_meta = [ + pipeline: workflow.manifest.name, + pipeline_version: workflow.manifest.version, + pipeline_revision: workflow.revision ?: 'NA', + genome: params.genome, + callers: callers, + default_contrast: params.differential_contrast, + chrom_sizes: "${params.outdir}/03_peak_calling/06_differential/00_manifests/chrom_sizes.sizes" + ] + + ch_run_meta = Channel.value(JsonOutput.toJson(run_meta)) + ch_outdir = Channel.value(params.outdir) + + WRITE_DIFFERENTIAL_MANIFESTS( + ch_samples_manifest, + ch_peaks_manifest, + ch_run_meta, + ch_outdir, + ch_chrom_sizes_single + ) + ch_versions = ch_versions.mix(WRITE_DIFFERENTIAL_MANIFESTS.out.versions) + } + + // Group-level validation + ch_group_samples = ch_samples + .map { rec -> [rec.group, rec] } + .groupTuple() + + ch_group_eval = ch_group_samples + .map { group, records -> + def treated_records = records.findAll { it.condition == treated } + def control_records = records.findAll { it.condition == control } + def conditions = records.collect { it.condition }.unique() + def other_conditions = conditions.findAll { !(it in [treated, control]) } + def reasons = [] + if (!treated_records) { + reasons << "missing_treated" + } + if (!control_records) { + reasons << "missing_control" + } + if (treated_records.size() < params.differential_min_replicates) { + reasons << "treated_replicates_lt_${params.differential_min_replicates}" + } + if (control_records.size() < params.differential_min_replicates) { + reasons << "control_replicates_lt_${params.differential_min_replicates}" + } + if (other_conditions) { + if (params.differential_strict) { + exit 1, "Group '${group}' has additional conditions (${other_conditions.join(',')}) but strict mode is enabled." + } + log.warn "Group '${group}' has additional conditions (${other_conditions.join(',')}); ignoring for differential analysis." + } + def valid = reasons.isEmpty() + if (!valid && params.differential_strict) { + exit 1, "Group '${group}' failed differential validation: ${reasons.join(';')}" + } + def filtered = records.findAll { it.condition in [treated, control] } + [group: group, records: filtered, valid: valid, reason: reasons.join(';'), details: other_conditions ? "ignored_conditions=${other_conditions.join(',')}" : ""] + } + + ch_group_eval.branch { valid: it.valid; invalid: !it.valid }.set { ch_group_split } + ch_valid_groups = ch_group_split.valid + ch_invalid_groups = ch_group_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_invalid_groups.map { rec -> [method: 'all', group: rec.group, caller: 'NA', reason: rec.reason, details: rec.details] } + ) + + // DiffBind per caller x group + if (params.run_diffbind) { + ch_peaks_map = ch_peaks + .filter { meta, peaks -> meta.is_control == false } + .map { meta, peaks -> ["${meta.group}__${meta.caller}", [meta, peaks]] } + .groupTuple() + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_diffbind_inputs = ch_valid_groups + .flatMap { rec -> + callers.collect { caller -> [group: rec.group, caller: caller, records: rec.records] } + } + .combine(ch_peaks_map) + .map { rec, peaks_map -> + def key = "${rec.group}__${rec.caller}" + def peaks_list = peaks_map.get(key, []) + def peaks_by_sample = peaks_list.collectEntries { [(it[0].sample_id ?: it[0].id): it[1]] } + def missing = rec.records.findAll { !peaks_by_sample.containsKey(it.sample_id) }.collect { it.sample_id } + def valid = missing.isEmpty() + def samples = rec.records.collect { row -> + [ + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate, + bam: row.bam_path, + peaks: peaks_by_sample.get(row.sample_id)?.toString(), + caller: rec.caller, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + [group: rec.group, caller: rec.caller, samples: samples, valid: valid, missing: missing] + } + + ch_diffbind_inputs.branch { valid: it.valid; invalid: !it.valid }.set { ch_diffbind_split } + ch_diffbind_valid = ch_diffbind_split.valid + ch_diffbind_invalid = ch_diffbind_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_diffbind_invalid.map { rec -> [method: 'diffbind', group: rec.group, caller: rec.caller, reason: 'missing_peaks', details: rec.missing.join(',')] } + ) + + ch_diffbind_valid + .map { rec -> [JsonOutput.toJson(rec.samples), rec.group, rec.caller] } + .set { ch_diffbind_samples } + + MAKE_DIFFBIND_SAMPLESHEET( + ch_diffbind_samples.map { samples_json, group, caller -> samples_json }, + ch_diffbind_samples.map { samples_json, group, caller -> group }, + ch_diffbind_samples.map { samples_json, group, caller -> caller } + ) + ch_versions = ch_versions.mix(MAKE_DIFFBIND_SAMPLESHEET.out.versions) + + DIFFBIND_RUN( + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> sheet }, + Channel.value(treated), + Channel.value(control), + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> group }, + MAKE_DIFFBIND_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> caller }, + ch_use_spikein, + Channel.value(params.diffbind_fdr), + Channel.value(params.diffbind_lfc), + Channel.value(params.diffbind_min_overlap), + Channel.value(params.diffbind_summits), + Channel.value(params.diffbind_backend), + Channel.value(params.diffbind_extra_params ? file(params.diffbind_extra_params) : ''), + Channel.value('diffbind') + ) + ch_versions = ch_versions.mix(DIFFBIND_RUN.out.versions) + ch_summary_files = ch_summary_files.mix(DIFFBIND_RUN.out.summary.map { group, caller, file -> file }) + ch_diffbind_significant = DIFFBIND_RUN.out.significant + + if (params.differential_annotate) { + ch_diffbind_annotate = DIFFBIND_RUN.out.results.map { group, caller, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/01_diffbind/${caller}/${group}"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_diffbind_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + // ChIPBinner per group + if (params.run_chipbinner) { + CHIPBINNER_BINS( + ch_chrom_sizes_single, + Channel.value(params.chipbinner_bin_size), + ch_valid_groups.map { rec -> rec.group }, + Channel.value(params.chipbinner_windows_dir ? file(params.chipbinner_windows_dir) : ''), + Channel.value(params.chipbinner_blacklist ? file(params.chipbinner_blacklist) : '') + ) + ch_versions = ch_versions.mix(CHIPBINNER_BINS.out.versions) + + ch_chipbinner_samples = ch_valid_groups.map { rec -> + def ordered = rec.records.sort { a, b -> + a.condition <=> b.condition ?: a.replicate <=> b.replicate ?: a.sample_id <=> b.sample_id + } + def samples = ordered.collect { row -> + [ + sample_id: row.sample_id, + condition: row.condition, + bam: row.bam_path, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + [rec.group, JsonOutput.toJson(samples)] + } + + ch_chipbinner_inputs = CHIPBINNER_BINS.out.bins.join(ch_chipbinner_samples) + .map { group, bins_file, samples_json -> [bins_file, samples_json, group] } + + CHIPBINNER_COUNTS( + ch_chipbinner_inputs.map { bins, samples_json, group -> bins }, + ch_chipbinner_inputs.map { bins, samples_json, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + ch_use_spikein, + Channel.value(params.chipbinner_pseudocount) + ) + ch_versions = ch_versions.mix(CHIPBINNER_COUNTS.out.versions) + + CHIPBINNER_HDBSCAN_GRID( + CHIPBINNER_COUNTS.out.normalized, + ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + Channel.value(params.chipbinner_hdbscan_grid_min_cluster_size), + Channel.value(params.chipbinner_hdbscan_grid_min_samples) + ) + ch_versions = ch_versions.mix(CHIPBINNER_HDBSCAN_GRID.out.versions) + + CHIPBINNER_ROTS( + CHIPBINNER_COUNTS.out.normalized, + CHIPBINNER_HDBSCAN_GRID.out.clusters, + ch_chipbinner_inputs.map { bins, samples_json, group -> samples_json }, + Channel.value(treated), + Channel.value(control), + ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + Channel.value(params.chipbinner_fdr), + Channel.value(params.chipbinner_lfc), + Channel.value(params.chipbinner_bootstrap), + Channel.value(params.chipbinner_k_value) + ) + ch_versions = ch_versions.mix(CHIPBINNER_ROTS.out.versions) + ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) + ch_chipbinner_significant = CHIPBINNER_ROTS.out.up + + if (params.differential_annotate) { + ch_chipbinner_annotate = CHIPBINNER_ROTS.out.results.map { group, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/differential"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_chipbinner_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + // SPAN differential per group + if (params.run_span_diff) { + if (!params.omnipeaks_jar) { + exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled" + } + SPAN_CAPABILITY_PROBE(Channel.value(file(params.omnipeaks_jar))) + ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) + + ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities + .map { file -> new JsonSlurper().parse(file) } + ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } + + ch_span_groups = ch_valid_groups.map { rec -> + def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } + def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } + def meta = [id: rec.group, group: rec.group, treated: treated, control: control] + [meta, treated_bams, control_bams] + } + + def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' + + ch_span_native_groups = Channel.empty() + if (span_mode == 'native') { + ch_span_native_groups = ch_span_groups + } else if (span_mode == 'auto') { + ch_span_native_groups = ch_span_groups.combine(ch_span_has_compare) + .filter { meta, treated_bams, control_bams, has_compare -> has_compare } + .map { meta, treated_bams, control_bams, has_compare -> [meta, treated_bams, control_bams] } + } + + if (span_mode == 'native' || span_mode == 'auto') { + SPAN_COMPARE( + ch_span_native_groups, + ch_chrom_sizes_single, + Channel.value(file(params.omnipeaks_jar)), + Channel.value(params.span_diff_gap), + Channel.value(params.span_diff_bin), + Channel.value(params.span_diff_fdr), + Channel.value(params.span_diff_java_heap) + ) + ch_versions = ch_versions.mix(SPAN_COMPARE.out.versions) + ch_summary_files = ch_summary_files.mix(SPAN_COMPARE.out.summary.map { meta, file -> file }) + ch_span_significant = ch_span_significant.mix(SPAN_COMPARE.out.up.map { meta, file -> [meta.group, file] }) + + if (params.differential_annotate) { + ch_span_annotate = SPAN_COMPARE.out.tsv.map { meta, file -> + def publish_meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/03_span/${meta.group}"] + [publish_meta, file] + } + ANNOTATE_REGIONS( + ch_span_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + + ch_span_fallback_groups = Channel.empty() + if (span_mode == 'fallback') { + ch_span_fallback_groups = ch_valid_groups + } else if (span_mode == 'auto') { + ch_span_fallback_groups = ch_valid_groups.combine(ch_span_has_compare) + .filter { rec, has_compare -> !has_compare } + .map { rec, has_compare -> rec } + } + + if (span_mode == 'fallback' || span_mode == 'auto') { + def span_callers = callers.findAll { it.startsWith('span_') } + if (!span_callers) { + ch_skipped_records = ch_skipped_records.mix( + ch_span_fallback_groups.map { rec -> [method: 'span', group: rec.group, caller: 'NA', reason: 'no_span_peaks', details: 'SPAN caller not in --peakcaller'] } + ) + } else { + def span_caller = span_callers[0] + ch_span_peaks = ch_peaks + .filter { meta, peaks -> meta.is_control == false && meta.caller == span_caller } + .map { meta, peaks -> [meta.sample_id ?: meta.id, peaks] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_span_samples = ch_span_fallback_groups.combine(ch_span_peaks) + .map { rec, peaks_map -> + def missing = rec.records.findAll { !peaks_map.containsKey(it.sample_id) }.collect { it.sample_id } + def samples = rec.records.collect { row -> + [ + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate, + bam: row.bam_path, + peaks: peaks_map.get(row.sample_id)?.toString(), + caller: span_caller, + spikein_scale_factor: row.spikein_scale_factor ?: 'NA' + ] + } + [group: rec.group, samples: samples, missing: missing] + } + + ch_span_samples.branch { valid: it.missing.isEmpty(); invalid: !it.missing.isEmpty() }.set { ch_span_split } + ch_span_valid = ch_span_split.valid + ch_span_invalid = ch_span_split.invalid + + ch_skipped_records = ch_skipped_records.mix( + ch_span_invalid.map { rec -> [method: 'span', group: rec.group, caller: span_caller, reason: 'missing_peaks', details: rec.missing.join(',')] } + ) + + MAKE_SPAN_SAMPLESHEET( + ch_span_valid.map { rec -> JsonOutput.toJson(rec.samples) }, + ch_span_valid.map { rec -> rec.group }, + Channel.value('span_fallback') + ) + ch_versions = ch_versions.mix(MAKE_SPAN_SAMPLESHEET.out.versions) + + SPAN_FALLBACK_DIFF( + MAKE_SPAN_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> sheet }, + Channel.value(treated), + Channel.value(control), + MAKE_SPAN_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> group }, + ch_use_spikein, + Channel.value(params.span_diff_fdr), + Channel.value(params.diffbind_lfc), + Channel.value(params.diffbind_min_overlap), + Channel.value(params.diffbind_summits), + Channel.value(params.diffbind_backend) + ) + ch_versions = ch_versions.mix(SPAN_FALLBACK_DIFF.out.versions) + ch_summary_files = ch_summary_files.mix(SPAN_FALLBACK_DIFF.out.summary.map { group, file -> file }) + ch_span_significant = ch_span_significant.mix(SPAN_FALLBACK_DIFF.out.up.map { group, file -> [group, file] }) + + if (params.differential_annotate) { + ch_span_fallback_annotate = SPAN_FALLBACK_DIFF.out.results.map { group, file -> + def meta = [publish_dir: "${params.outdir}/03_peak_calling/06_differential/03_span/${group}"] + [meta, file] + } + ANNOTATE_REGIONS( + ch_span_fallback_annotate, + ch_gene_bed, + ch_gtf + ) + ch_versions = ch_versions.mix(ANNOTATE_REGIONS.out.versions) + } + } + } + } + + if (params.differential_cross_compare) { + def primary_caller = callers ? callers[0] : null + diffbind_beds = ch_diffbind_significant.map { group, caller, file -> + [group: group, caller: caller, path: file.toString()] + } + method_beds = Channel.empty() + if (primary_caller) { + method_beds = method_beds.mix( + ch_diffbind_significant + .filter { group, caller, file -> caller == primary_caller } + .map { group, caller, file -> [group: group, method: 'diffbind', caller: caller, path: file.toString()] } + ) + } + method_beds = method_beds.mix(ch_chipbinner_significant.map { group, file -> [group: group, method: 'chipbinner', caller: 'NA', path: file.toString()] }) + method_beds = method_beds.mix(ch_span_significant.map { group, file -> [group: group, method: 'span', caller: 'NA', path: file.toString()] }) + + callers_tsv = diffbind_beds + .map { rec -> [rec.group, rec.caller, rec.path].join('\t') } + .toSortedList() + .map { lines -> lines.join('\n') + '\n' } + .collectFile(name: 'overlap_callers.tsv') + methods_tsv = method_beds + .map { rec -> [rec.group, rec.method, rec.caller ?: 'NA', rec.path].join('\t') } + .toSortedList() + .map { lines -> lines.join('\n') + '\n' } + .collectFile(name: 'overlap_methods.tsv') + + DIFFERENTIAL_OVERLAP(callers_tsv, methods_tsv) + ch_versions = ch_versions.mix(DIFFERENTIAL_OVERLAP.out.versions) + } + + ch_summary_out = Channel.empty() + ch_skipped_out = Channel.empty() + if (params.differential_run_multiqc) { + DIFFERENTIAL_SUMMARY_MERGE( + ch_summary_files.collect().ifEmpty([]), + ch_skipped_records.toList().map { list -> JsonOutput.toJson(list) }, + Channel.value(file("${projectDir}/assets/multiqc/differential_summary_header.txt")), + Channel.value(file("${projectDir}/assets/multiqc/differential_skipped_header.txt")) + ) + ch_versions = ch_versions.mix(DIFFERENTIAL_SUMMARY_MERGE.out.versions) + ch_summary_out = DIFFERENTIAL_SUMMARY_MERGE.out.summary + ch_skipped_out = DIFFERENTIAL_SUMMARY_MERGE.out.skipped + } + + emit: + summary = ch_summary_out + skipped = ch_skipped_out + versions = ch_versions +} diff --git a/subworkflows/local/prepare_peakcalling.nf b/subworkflows/local/prepare_peakcalling.nf index d9fcc019..baef316c 100644 --- a/subworkflows/local/prepare_peakcalling.nf +++ b/subworkflows/local/prepare_peakcalling.nf @@ -24,6 +24,7 @@ workflow PREPARE_PEAKCALLING { main: ch_versions = Channel.empty() ch_bedgraph = Channel.empty() + ch_spikein_scale_factors = Channel.empty() def norm_scope = normalisation_scope ?: 'all' def igg_scope = igg_scale_scope ?: 'legacy' def median = { List values -> @@ -113,6 +114,10 @@ workflow PREPARE_PEAKCALLING { NORMALISATION_FACTORS_REPORT ( ch_norm_factors ) ch_versions = ch_versions.mix(NORMALISATION_FACTORS_REPORT.out.versions) } + + ch_bam_scale_factor + .map { meta, bam, scale -> [ meta, scale ] } + .set { ch_spikein_scale_factors } } else if (norm_mode == "None") { /* @@ -302,4 +307,5 @@ workflow PREPARE_PEAKCALLING { bedgraph = UCSC_BEDCLIP.out.bedgraph // channel: [ val(meta), [ bedgraph ] ] bigwig = UCSC_BEDGRAPHTOBIGWIG.out.bigwig // channel: [ val(meta), [ bigwig ] ] versions = ch_versions // channel: [ versions.yml ] + spikein_scale_factors = ch_spikein_scale_factors // channel: [ val(meta), scale_factor ] } diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv new file mode 100644 index 00000000..30a54bad --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv @@ -0,0 +1,3 @@ +sample_id caller peaks_path peaks_format caller_role +h3k27me3_Control_rep1 seacr tests/assets/differential_mock/data/h3k27me3_control.peaks.bed bed primary +h3k27me3_Treatment_rep1 seacr tests/assets/differential_mock/data/h3k27me3_treatment.peaks.bed bed primary diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json new file mode 100644 index 00000000..5088659e --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json @@ -0,0 +1,4 @@ +{ + "callers": ["seacr"], + "default_contrast": "Treatment,Control" +} diff --git a/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv new file mode 100644 index 00000000..890cbebf --- /dev/null +++ b/tests/assets/differential_mock/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv @@ -0,0 +1,3 @@ +sample_id group condition replicate final_bam final_bai normalisation_mode spikein_scale_factor bigwig +h3k27me3_Control_rep1 h3k27me3 Control 1 tests/assets/differential_mock/data/h3k27me3_control.bam tests/assets/differential_mock/data/h3k27me3_control.bam.bai None NA NA +h3k27me3_Treatment_rep1 h3k27me3 Treatment 1 tests/assets/differential_mock/data/h3k27me3_treatment.bam tests/assets/differential_mock/data/h3k27me3_treatment.bam.bai None NA NA diff --git a/tests/config/differential.config b/tests/config/differential.config new file mode 100644 index 00000000..79a0dec6 --- /dev/null +++ b/tests/config/differential.config @@ -0,0 +1,20 @@ +params { + outdir = "results/" + publish_dir_mode = "copy" + singularity_pull_docker_container = false +} + +process { + cpus = 2 + memory = 6.GB + time = 6.h +} + +singularity.enabled = true +singularity.autoMounts = true + +process { + withName: '.*DIFFBIND_RUN' { + ext.when = false + } +} diff --git a/tests/test_07_differential_peak_calling.yml b/tests/test_07_differential_peak_calling.yml new file mode 100644 index 00000000..5706c323 --- /dev/null +++ b/tests/test_07_differential_peak_calling.yml @@ -0,0 +1,27 @@ +- name: test_differential_missing_contrast + command: nextflow run main.nf -profile docker,test --run_diffbind --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc -c tests/config/nextflow.config + tags: + - test_differential + - test_differential_missing_contrast + exit_code: 1 + +- name: test_differential_manifests_samplesheet + command: nextflow run main.nf -profile docker,test --only_peak_calling --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc --run_diffbind --differential_contrast "Treatment,Control" --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv -c tests/config/differential.config + tags: + - test_differential + - test_differential_manifests + files: + - path: results/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv + - path: results/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv + - path: results/03_peak_calling/06_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.diffbind.csv + +- name: test_differential_only_skipped_invalid + command: nextflow run main.nf -entry DIFFERENTIAL_ONLY -profile docker,test --run_diffbind --differential_contrast "Treatment,Control" --differential_from_run tests/assets/differential_mock --skip_multiqc -c tests/config/differential.config + tags: + - test_differential + - test_differential_only + files: + - path: results/03_peak_calling/06_differential/multiqc/differential.skipped.tsv + contains: + - "treated_replicates_lt_2" + - "control_replicates_lt_2" diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index d97aae21..8a85b4cd 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -146,6 +146,7 @@ include { DEEPTOOLS_QC } from "../subworkflo include { PEAK_QC } from "../subworkflows/local/peak_qc" include { SAMTOOLS_VIEW_SORT_STATS as FILTER_READS } from "../subworkflows/local/samtools_view_sort_stats" include { DEDUPLICATE_LINEAR } from "../subworkflows/local/deduplicate_linear" +include { DIFFERENTIAL_PEAK_CALLING } from "../subworkflows/local/differential_peak_calling" /* ======================================================================================== @@ -187,6 +188,9 @@ workflow CUTANDRUN { // Init ch_software_versions = Channel.empty() + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff) && !params.differential_contrast) { + exit 1, "Differential analysis requested but --differential_contrast was not provided (expected \"TREATED,CONTROL\")." + } /* * SUBWORKFLOW: Uncompress and prepare reference genome files @@ -450,6 +454,9 @@ workflow CUTANDRUN { ch_bedgraph = Channel.empty() ch_bigwig = Channel.empty() + ch_spikein_scale_factors = Channel.empty() + ch_bam_target = Channel.empty() + ch_bam_control = Channel.empty() ch_peaks_all = Channel.empty() ch_peaks_primary = Channel.empty() ch_peaks_secondary = Channel.empty() @@ -478,6 +485,7 @@ workflow CUTANDRUN { ) ch_bedgraph = PREPARE_PEAKCALLING.out.bedgraph ch_bigwig = PREPARE_PEAKCALLING.out.bigwig + ch_spikein_scale_factors = PREPARE_PEAKCALLING.out.spikein_scale_factors ch_software_versions = ch_software_versions.mix(PREPARE_PEAKCALLING.out.versions) /* @@ -654,6 +662,46 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(CONTROL_POOLING_FALLBACKS_REPORT.out.versions) } + /* + * CHANNEL: Filter bais for target only + */ + ch_samtools_bai.filter { it -> it[0].is_control == false } + .set { ch_bai_target } + //ch_bai_target | view + + /* + * CHANNEL: Combine bam and bai files on id + */ + ch_bam_target.map { row -> [row[0].id, row ].flatten()} + .join ( ch_bai_target.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + .set { ch_bam_bai } + // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] + //ch_bam_bai | view + + ch_differential_summary = Channel.empty() + ch_differential_skipped = Channel.empty() + ch_differential_versions = Channel.empty() + if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff)) { + DIFFERENTIAL_PEAK_CALLING ( + ch_bam_bai, + ch_peaks_all, + ch_bigwig, + ch_spikein_scale_factors, + PREPARE_GENOME.out.chrom_sizes.collect(), + callers + ) + ch_differential_summary = DIFFERENTIAL_PEAK_CALLING.out.summary + ch_differential_skipped = DIFFERENTIAL_PEAK_CALLING.out.skipped + ch_differential_versions = DIFFERENTIAL_PEAK_CALLING.out.versions + ch_software_versions = ch_software_versions.mix(ch_differential_versions) + if (!params.run_multiqc) { + ch_differential_summary.subscribe { } + ch_differential_skipped.subscribe { } + ch_differential_versions.subscribe { } + } + } + ch_dt_corrmatrix = Channel.empty() ch_dt_pcadata = Channel.empty() ch_dt_fpmatrix = Channel.empty() @@ -791,13 +839,6 @@ workflow CUTANDRUN { ch_software_versions = ch_software_versions.mix(DEEPTOOLS_QC.out.versions) } - /* - * CHANNEL: Filter bais for target only - */ - ch_samtools_bai.filter { it -> it[0].is_control == false } - .set { ch_bai_target } - //ch_bai_target | view - if (params.run_peak_qc && params.run_peak_calling) { /* * CHANNEL: Filter flagstat for target only @@ -850,17 +891,6 @@ workflow CUTANDRUN { } //ch_peakqc_reprod_perc_mqc | view - /* - * CHANNEL: Combine bam and bai files on id - */ - - ch_bam_target.map { row -> [row[0].id, row ].flatten()} - .join ( ch_bai_target.map { row -> [row[0].id, row ].flatten()} ) - .map { row -> [row[1], row[2], row[4]] } - .set { ch_bam_bai } - // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] - //ch_bam_bai | view - /* * MODULE: Calculate fragment lengths */ @@ -936,7 +966,9 @@ workflow CUTANDRUN { ch_peakqc_count_consensus_mqc.collect{it[1]}.ifEmpty([]), ch_peakqc_reprod_perc_mqc.collect().ifEmpty([]), ch_frag_len_hist_mqc.collect().ifEmpty([]), - ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]) + ch_linear_duplication_mqc.collect{it[1]}.ifEmpty([]), + ch_differential_summary.collect().ifEmpty([]), + ch_differential_skipped.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() } diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf new file mode 100644 index 00000000..c44e3cd4 --- /dev/null +++ b/workflows/differential_only.nf @@ -0,0 +1,106 @@ +/* + * Differential-only entrypoint (posthoc) + */ + +import groovy.json.JsonSlurper + +include { DIFFERENTIAL_PEAK_CALLING } from '../subworkflows/local/differential_peak_calling' + +workflow DIFFERENTIAL_ONLY { + main: + if (!params.differential_from_run) { + exit 1, "--differential_from_run must be provided for DIFFERENTIAL_ONLY" + } + + def run_dir = params.differential_from_run + def samples_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.samples.tsv") + def peaks_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv") + def run_meta_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/differential_manifest.run_meta.json") + def chrom_sizes_path = file("${run_dir}/03_peak_calling/06_differential/00_manifests/chrom_sizes.sizes") + + if (!samples_path.exists()) { + exit 1, "Missing differential manifest samples.tsv under ${run_dir}" + } + if (!peaks_path.exists()) { + exit 1, "Missing differential manifest peaks.tsv under ${run_dir}" + } + if (run_meta_path.exists() && !params.differential_contrast) { + def meta = new JsonSlurper().parse(run_meta_path) + if (meta?.default_contrast) { + params.differential_contrast = meta.default_contrast + } + } + + def callers_list = [] + if (run_meta_path.exists()) { + def meta = new JsonSlurper().parse(run_meta_path) + callers_list = meta?.callers ?: [] + } + + ch_samples = Channel.fromPath(samples_path).splitCsv(header: true, sep: '\t') + .map { row -> + def meta = [ + id: row.sample_id, + sample_id: row.sample_id, + group: row.group, + condition: row.condition, + replicate: row.replicate.toInteger(), + is_control: false + ] + def bam = file(row.final_bam) + def bai = file(row.final_bai) + [meta, bam, bai, row.spikein_scale_factor, row.bigwig] + } + + ch_bam_bai = ch_samples.map { meta, bam, bai, scale, bigwig -> [meta, bam, bai] } + ch_spikein_scale = ch_samples + .filter { meta, bam, bai, scale, bigwig -> scale && scale != 'NA' } + .map { meta, bam, bai, scale, bigwig -> [meta, scale] } + + ch_bigwig = ch_samples + .filter { meta, bam, bai, scale, bigwig -> bigwig && bigwig != 'NA' } + .map { meta, bam, bai, scale, bigwig -> [meta, file(bigwig)] } + + ch_samples_map = ch_samples + .map { meta, bam, bai, scale, bigwig -> [meta.sample_id, meta] } + .toList() + .map { list -> list.collectEntries { [(it[0]): it[1]] } } + + ch_peaks = Channel.fromPath(peaks_path).splitCsv(header: true, sep: '\t') + .combine(ch_samples_map) + .map { row, samples_map -> + def meta = samples_map[row.sample_id] + if (!meta) { + return null + } + def meta_out = meta + [caller: row.caller] + [meta_out, file(row.peaks_path)] + } + .filter { it != null } + + if (!callers_list) { + callers_list = peaks_path.text.readLines() + .drop(1) + .collect { it.split('\\t')[1] } + .unique() + } + + ch_chrom_sizes = chrom_sizes_path.exists() ? Channel.value(chrom_sizes_path) : Channel.empty() + + DIFFERENTIAL_PEAK_CALLING( + ch_bam_bai, + ch_peaks, + ch_bigwig, + ch_spikein_scale, + ch_chrom_sizes, + callers_list + ) + ch_diff_summary = DIFFERENTIAL_PEAK_CALLING.out.summary + ch_diff_skipped = DIFFERENTIAL_PEAK_CALLING.out.skipped + ch_diff_versions = DIFFERENTIAL_PEAK_CALLING.out.versions + if (!params.run_multiqc) { + ch_diff_summary.subscribe { } + ch_diff_skipped.subscribe { } + ch_diff_versions.subscribe { } + } +} From dcff1ff7758a44ee02d192f0f4b64d338d4ab8dd Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 17:17:26 -0800 Subject: [PATCH 03/31] Avoid gtf2bed dependency in annotate_regions --- bin/annotate_regions.py | 67 ++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index c71e3522..1bff4c78 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -20,6 +20,54 @@ def find(names): return chr_col, start_col, end_col +def parse_gtf_attributes(attr_str): + attrs = {} + for item in attr_str.strip().split(";"): + item = item.strip() + if not item: + continue + if " " not in item: + continue + key, value = item.split(" ", 1) + value = value.strip().strip('"') + attrs[key] = value + return attrs + + +def gtf_to_tss_bed(gtf_path, out_path): + with gtf_path.open() as in_handle, out_path.open("w") as out_handle: + for line in in_handle: + if not line.strip() or line.startswith("#"): + continue + fields = line.rstrip().split("\t") + if len(fields) < 9: + continue + chrom, _, _, start, end, _, strand, _, attrs = fields + try: + start_i = int(start) - 1 + end_i = int(end) + except ValueError: + continue + if start_i < 0: + start_i = 0 + if strand == "+": + tss_start = start_i + elif strand == "-": + tss_start = max(end_i - 1, 0) + else: + continue + tss_end = tss_start + 1 + attr_map = parse_gtf_attributes(attrs) + name = ( + attr_map.get("gene_name") + or attr_map.get("gene_id") + or attr_map.get("transcript_id") + or attr_map.get("ID") + or "NA" + ) + out_handle.write("\t".join([chrom, str(tss_start), str(tss_end), name]) + "\n") + + def run(cmd, stdout=None): result = subprocess.run(cmd, stdout=stdout, stderr=subprocess.PIPE, text=True) if result.returncode != 0: @@ -49,25 +97,8 @@ def main(): if args.features and Path(args.features).exists(): features_bed = Path(args.features) elif args.gtf and Path(args.gtf).exists(): - raw_bed = Path("features.raw.bed") tss_bed = Path("features.tss.bed") - run(["gtf2bed", args.gtf], stdout=raw_bed.open("w")) - with raw_bed.open() as in_handle, tss_bed.open("w") as out_handle: - for line in in_handle: - if not line.strip(): - continue - fields = line.rstrip().split("\t") - if len(fields) < 6: - continue - chrom, start, end, name, score, strand = fields[:6] - start = int(start) - end = int(end) - if strand == "+": - tss_start = start - else: - tss_start = max(end - 1, 0) - tss_end = tss_start + 1 - out_handle.write("\t".join([chrom, str(tss_start), str(tss_end), name]) + "\n") + gtf_to_tss_bed(Path(args.gtf), tss_bed) features_bed = tss_bed if not features_bed or not features_bed.exists(): From 5db310993bc5e20b920fae24d9c33633f2b5e639 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 17:37:09 -0800 Subject: [PATCH 04/31] Refine chipbinner clustering inputs --- bin/hdbscan_grid.py | 72 +++++++++++++------ modules/local/chipbinner_hdbscan_grid.nf | 22 ++---- modules/local/chipbinner_rots.nf | 3 + .../local/differential_peak_calling.nf | 2 +- 4 files changed, 60 insertions(+), 39 deletions(-) diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py index 434ddded..b8917330 100755 --- a/bin/hdbscan_grid.py +++ b/bin/hdbscan_grid.py @@ -1,16 +1,55 @@ #!/usr/bin/env python3 import argparse +import csv import itertools import sys import numpy as np -import pandas as pd def parse_list(value): return [int(x.strip()) for x in str(value).split(',') if x.strip()] +def load_matrix(path): + with open(path, newline='') as handle: + reader = csv.reader(handle, delimiter='\t') + header = next(reader, []) + rows = [row for row in reader if row] + if len(header) <= 3: + raise SystemExit("Matrix missing sample columns") + coords = [row[:3] for row in rows] + data = [] + for row in rows: + data.append([float(x) for x in row[3:]]) + matrix = np.array(data) if data else np.empty((0, len(header) - 3)) + return header[:3], coords, matrix + + +def write_summary(path, rows): + fieldnames = [ + "min_cluster_size", + "min_samples", + "n_clusters", + "fraction_assigned", + "mean_persistence", + "selected", + ] + with open(path, "w", newline="") as handle: + writer = csv.DictWriter(handle, delimiter="\t", fieldnames=fieldnames, lineterminator="\n") + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def write_clusters(path, coord_header, coords, labels): + with open(path, "w", newline="") as handle: + writer = csv.writer(handle, delimiter="\t", lineterminator="\n") + writer.writerow(coord_header + ["cluster"]) + for row, label in zip(coords, labels): + writer.writerow(row + [label]) + + def main(): parser = argparse.ArgumentParser() parser.add_argument("--matrix", required=True) @@ -20,18 +59,13 @@ def main(): parser.add_argument("--clusters", required=True) args = parser.parse_args() - df = pd.read_csv(args.matrix, sep='\t') - if df.shape[1] <= 3: - raise SystemExit("Matrix missing sample columns") - - coords = df.iloc[:, :3] - matrix = df.iloc[:, 3:].values + coord_header, coords, matrix = load_matrix(args.matrix) try: import hdbscan except Exception: # fallback: single cluster - summary = pd.DataFrame([ + summary_rows = [ { "min_cluster_size": 0, "min_samples": 0, @@ -40,11 +74,10 @@ def main(): "mean_persistence": 1.0, "selected": True, } - ]) - summary.to_csv(args.summary, sep='\t', index=False) - clusters = coords.copy() - clusters["cluster"] = 0 - clusters.to_csv(args.clusters, sep='\t', index=False) + ] + write_summary(args.summary, summary_rows) + labels = [0 for _ in coords] + write_clusters(args.clusters, coord_header, coords, labels) return min_cluster_sizes = parse_list(args.min_cluster_size) @@ -75,14 +108,13 @@ def main(): "selected": False, }) - summary = pd.DataFrame(results) - if best is not None: - summary.loc[(summary.min_cluster_size == best[0]) & (summary.min_samples == best[1]), "selected"] = True - summary.to_csv(args.summary, sep='\t', index=False) + for row in results: + if best is not None and row["min_cluster_size"] == best[0] and row["min_samples"] == best[1]: + row["selected"] = True + write_summary(args.summary, results) - clusters = coords.copy() - clusters["cluster"] = best_labels if best_labels is not None else 0 - clusters.to_csv(args.clusters, sep='\t', index=False) + labels = list(best_labels) if best_labels is not None else [0 for _ in coords] + write_clusters(args.clusters, coord_header, coords, labels) if __name__ == "__main__": diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf index 6e2ffed0..ab4c8e66 100644 --- a/modules/local/chipbinner_hdbscan_grid.nf +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -8,8 +8,10 @@ process CHIPBINNER_HDBSCAN_GRID { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::pandas conda-forge::scikit-learn conda-forge::hdbscan" - container "quay.io/biocontainers/python:3.8.3" + conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::hdbscan" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hdbscan:0.8.33--pyhdfd78af_0' : + 'biocontainers/hdbscan:0.8.33--pyhdfd78af_0' }" input: path matrix @@ -27,22 +29,6 @@ process CHIPBINNER_HDBSCAN_GRID { script: """\ - python - <<'PY' - import importlib.util - import subprocess - import sys - - required = { - "numpy": "numpy", - "pandas": "pandas", - "sklearn": "scikit-learn", - "hdbscan": "hdbscan", - } - missing = [pkg for mod, pkg in required.items() if importlib.util.find_spec(mod) is None] - if missing: - subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir"] + missing) - PY - python ${projectDir}/bin/hdbscan_grid.py \ --matrix ${matrix} \ --min_cluster_size ${min_cluster_size} \ diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf index 51ca0e6c..72eb538a 100644 --- a/modules/local/chipbinner_rots.nf +++ b/modules/local/chipbinner_rots.nf @@ -34,6 +34,7 @@ process CHIPBINNER_ROTS { output: tuple val(group), path("chipbinner.differential.tsv") , emit: results + tuple val(group), path("chipbinner.significant.bed") , emit: significant tuple val(group), path("chipbinner.significant_up.bed") , emit: up tuple val(group), path("chipbinner.significant_down.bed") , emit: down tuple val(group), path("chipbinner.summary.tsv") , emit: summary @@ -69,6 +70,8 @@ process CHIPBINNER_ROTS { --bootstrap ${bootstrap} \ --k_value ${k_value} + cat chipbinner.significant_up.bed chipbinner.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > chipbinner.significant.bed + cat <<-END_VERSIONS > versions.yml "${task.process}": r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 6e675108..6aec96c7 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -362,7 +362,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(CHIPBINNER_ROTS.out.versions) ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) - ch_chipbinner_significant = CHIPBINNER_ROTS.out.up + ch_chipbinner_significant = CHIPBINNER_ROTS.out.significant if (params.differential_annotate) { ch_chipbinner_annotate = CHIPBINNER_ROTS.out.results.map { group, file -> From 20fd9f0d8b55c344007f1cc1c8bd4b4ba8502831 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 17:49:16 -0800 Subject: [PATCH 05/31] Handle empty DiffBind results --- bin/diffbind_run.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index cfaec45b..7f933928 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -265,18 +265,27 @@ if (!use_spikein) { if (!is.null(coord_cols$end)) { res_df$end <- res_df[[coord_cols$end]] } + if (!"chr" %in% colnames(res_df)) { + res_df$chr <- NA + } + if (!"start" %in% colnames(res_df)) { + res_df$start <- NA + } + if (!"end" %in% colnames(res_df)) { + res_df$end <- NA + } if (!"log2FC" %in% colnames(res_df)) { if ("Fold" %in% colnames(res_df)) { res_df$log2FC <- res_df$Fold } else if ("fold" %in% colnames(res_df)) { res_df$log2FC <- res_df$fold + } else { + res_df$log2FC <- NA_real_ } } if (!"FDR" %in% colnames(res_df)) { - if ("FDR" %in% colnames(res_df)) { - res_df$FDR <- res_df$FDR - } + res_df$FDR <- NA_real_ } write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) From e58d2a635bff40714f864a58995ce9d2f899cdec Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 18:00:10 -0800 Subject: [PATCH 06/31] Handle unnamed annotation BEDs --- bin/annotate_regions.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index 1bff4c78..e62f8919 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -106,6 +106,14 @@ def main(): output_path.write_text(input_path.read_text()) return + features_has_name = False + with features_bed.open() as handle: + for line in handle: + if not line.strip() or line.startswith("#"): + continue + features_has_name = len(line.rstrip().split("\t")) >= 4 + break + regions_bed = Path("regions.bed") with input_path.open() as handle, regions_bed.open("w") as out_handle: reader = csv.DictReader(handle, delimiter='\t') @@ -129,7 +137,9 @@ def main(): if len(fields) < 5: continue row_id = int(fields[3]) - feature_id = fields[7] if len(fields) > 7 else "NA" + feature_id = "NA" + if features_has_name and len(fields) > 7: + feature_id = fields[7] distance = fields[-1] annotations[row_id] = (feature_id, distance) From 853dd7d1582c37e4859fc7416f017ebe0f327c48 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 18:21:36 -0800 Subject: [PATCH 07/31] Ensure SPAN compare has samtools --- modules/local/span_compare.nf | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index d143daa2..847d2269 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -10,8 +10,8 @@ process SPAN_COMPARE { conda "conda-forge::openjdk=21.0.2 bioconda::samtools=1.19.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://eclipse-temurin:21-jre' : - 'eclipse-temurin:21-jre' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" input: tuple val(meta), path(treated_bams), path(control_bams) @@ -38,6 +38,8 @@ process SPAN_COMPARE { def fdr_arg = fdr ? "--fdr ${fdr}" : '' def bin_arg = bin ? "--bin ${bin}" : '' """ + java_cmd=\${JAVA_CMD:-java} + treated_list=(${treated_bams}) control_list=(${control_bams}) @@ -54,7 +56,7 @@ process SPAN_COMPARE { control_input=control_pool.bam fi - java -Xmx${java_heap} -jar ${omnipeaks_jar} compare \ + "\$java_cmd" -Xmx${java_heap} -jar ${omnipeaks_jar} compare \ -t ${treated_input} \ -c ${control_input} \ --cs ${chrom_sizes} \ @@ -90,7 +92,7 @@ process SPAN_COMPARE { cat <<-END_VERSIONS > versions.yml "${task.process}": - java: \$(java -version 2>&1 | head -n 1 | sed -e 's/\"//g') + java: \$(\${java_cmd} -version 2>&1 | head -n 1 | sed -e 's/\"//g') samtools: \$(samtools --version 2>&1 | head -n 1 | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ From f2b03382a5b7fb8c5fc2d81c22889ac7524d9848 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 18:37:11 -0800 Subject: [PATCH 08/31] Report full DiffBind results --- bin/diffbind_run.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index 7f933928..2fe7261f 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -247,7 +247,7 @@ if (!use_spikein) { dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) dba_obj <- do.call(dba.analyze, c(list(dba_obj), analyze_args)) - res <- dba.report(dba_obj, th = opt$fdr, fold = opt$lfc) + res <- dba.report(dba_obj, th = 1, fold = 0) res_df <- as.data.frame(res) coord_cols <- list(chr = NULL, start = NULL, end = NULL) From 790ee233d8578ba86d97b8ca82c00985036bf260 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 18:53:58 -0800 Subject: [PATCH 09/31] Prevent summary file collisions --- modules/local/differential_summary.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index dd39319f..73902309 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -30,9 +30,10 @@ process DIFFERENTIAL_SUMMARY_MERGE { python - <<'PY' import csv import json + import sys from pathlib import Path - summary_files = sorted(Path('.').glob('*.summary.tsv')) + summary_files = sorted(Path(p) for p in sys.argv[1:]) with open('${summary_header}', 'r') as header_handle: header_lines = header_handle.read().rstrip('\\n') @@ -79,6 +80,7 @@ process DIFFERENTIAL_SUMMARY_MERGE { 'details': row.get('details', '') }) PY + ${summary_files} cat <<-END_VERSIONS > versions.yml "${task.process}": From bc6ae2470252154b7608bb321967f1603570e1e9 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 19:09:44 -0800 Subject: [PATCH 10/31] Pass summary file list to merge --- modules/local/differential_summary.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 73902309..9bcaedbb 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -27,7 +27,7 @@ process DIFFERENTIAL_SUMMARY_MERGE { script: """\ - python - <<'PY' + python - ${summary_files} <<'PY' import csv import json import sys @@ -80,7 +80,6 @@ process DIFFERENTIAL_SUMMARY_MERGE { 'details': row.get('details', '') }) PY - ${summary_files} cat <<-END_VERSIONS > versions.yml "${task.process}": From a734875d1068291314e88c36fdfe921596ba3edf Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 19:28:37 -0800 Subject: [PATCH 11/31] Include SPAN down peaks in overlaps --- modules/local/span_compare.nf | 2 ++ modules/local/span_fallback_diff.nf | 3 ++- subworkflows/local/differential_peak_calling.nf | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index 847d2269..a1114d02 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -25,6 +25,7 @@ process SPAN_COMPARE { output: tuple val(meta), path("span.differential.tsv") , emit: tsv tuple val(meta), path("span.differential.bed") , emit: bed + tuple val(meta), path("span.significant.bed") , emit: significant tuple val(meta), path("span.significant_up.bed") , emit: up tuple val(meta), path("span.significant_down.bed") , emit: down tuple val(meta), path("span.summary.tsv") , emit: summary @@ -80,6 +81,7 @@ process SPAN_COMPARE { # Attempt to split by sign using column 5 if numeric awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5>0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_up.bed awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5<0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_down.bed + cat span.significant_up.bed span.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > span.significant.bed total=\$(grep -v '^#' "\$diff_file" | wc -l | awk '{print \$1}') n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') diff --git a/modules/local/span_fallback_diff.nf b/modules/local/span_fallback_diff.nf index ce3f2e74..2d0dcb41 100644 --- a/modules/local/span_fallback_diff.nf +++ b/modules/local/span_fallback_diff.nf @@ -28,6 +28,7 @@ process SPAN_FALLBACK_DIFF { output: tuple val(group), path("span_fallback.differential.tsv") , emit: results tuple val(group), path("span_fallback.differential.bed") , emit: bed + tuple val(group), path("span_fallback.significant.bed") , emit: significant tuple val(group), path("span_fallback.significant_up.bed") , emit: up tuple val(group), path("span_fallback.significant_down.bed") , emit: down tuple val(group), path("span_fallback.summary.tsv") , emit: summary @@ -56,7 +57,7 @@ process SPAN_FALLBACK_DIFF { --prefix span_fallback mv span_fallback.results.tsv span_fallback.differential.tsv - mv span_fallback.significant.bed span_fallback.differential.bed + cp span_fallback.significant.bed span_fallback.differential.bed cat <<-END_README > span_fallback.readme.txt SPAN fallback differential: SPAN peaks + DiffBind/DESeq2/edgeR analysis. diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 6aec96c7..e88e826b 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -420,7 +420,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(SPAN_COMPARE.out.versions) ch_summary_files = ch_summary_files.mix(SPAN_COMPARE.out.summary.map { meta, file -> file }) - ch_span_significant = ch_span_significant.mix(SPAN_COMPARE.out.up.map { meta, file -> [meta.group, file] }) + ch_span_significant = ch_span_significant.mix(SPAN_COMPARE.out.significant.map { meta, file -> [meta.group, file] }) if (params.differential_annotate) { ch_span_annotate = SPAN_COMPARE.out.tsv.map { meta, file -> @@ -506,7 +506,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(SPAN_FALLBACK_DIFF.out.versions) ch_summary_files = ch_summary_files.mix(SPAN_FALLBACK_DIFF.out.summary.map { group, file -> file }) - ch_span_significant = ch_span_significant.mix(SPAN_FALLBACK_DIFF.out.up.map { group, file -> [group, file] }) + ch_span_significant = ch_span_significant.mix(SPAN_FALLBACK_DIFF.out.significant.map { group, file -> [group, file] }) if (params.differential_annotate) { ch_span_fallback_annotate = SPAN_FALLBACK_DIFF.out.results.map { group, file -> From ac47e30f2169f046e5ce4b49d37d37094dc81286 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 19:45:57 -0800 Subject: [PATCH 12/31] Fix SPAN compare inputs and capability probe --- modules/local/samtools_merge_bams.nf | 31 +++++++++ modules/local/span_capability_probe.nf | 59 ++++++++++------- modules/local/span_compare.nf | 29 ++------ .../local/differential_peak_calling.nf | 66 ++++++++++++------- 4 files changed, 114 insertions(+), 71 deletions(-) create mode 100644 modules/local/samtools_merge_bams.nf diff --git a/modules/local/samtools_merge_bams.nf b/modules/local/samtools_merge_bams.nf new file mode 100644 index 00000000..caf9ae51 --- /dev/null +++ b/modules/local/samtools_merge_bams.nf @@ -0,0 +1,31 @@ +process SAMTOOLS_MERGE_BAMS { + tag "${meta.group}.${meta.role}" + label 'process_medium' + + conda "bioconda::samtools=1.19.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(bams) + + output: + tuple val(meta), path("${meta.group}.${meta.role}.merged.bam"), path("${meta.group}.${meta.role}.merged.bam.bai"), emit: merged + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.group}.${meta.role}.merged" + """ + samtools merge -f ${prefix}.bam ${bams} + samtools index ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(samtools --version 2>&1 | head -n 1 | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/span_capability_probe.nf b/modules/local/span_capability_probe.nf index 2ef182b8..6cb7de53 100644 --- a/modules/local/span_capability_probe.nf +++ b/modules/local/span_capability_probe.nf @@ -8,7 +8,7 @@ process SPAN_CAPABILITY_PROBE { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "conda-forge::openjdk=21.0.2 conda-forge::python=3.11" + conda "conda-forge::openjdk=21.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'docker://eclipse-temurin:21-jre' : 'eclipse-temurin:21-jre' }" @@ -26,31 +26,40 @@ process SPAN_CAPABILITY_PROBE { script: """\ java -jar ${omnipeaks_jar} --help > omnipeaks_help.txt - python - <<'PY' - import json - import re - - commands = set() - with open('omnipeaks_help.txt') as handle: - for line in handle: - line = line.strip() - if not line: - continue - m = re.match(r'^(?:[A-Z]+)?\s*([a-zA-Z][\w-]+)\b', line) - if m: - cmd = m.group(1) - if cmd not in ('usage', 'options'): - commands.add(cmd) - commands = sorted(commands) - has_compare = any(cmd in ('compare', 'diff', 'differential') for cmd in commands) - out = { - "commands": commands, - "has_compare": has_compare, + commands=\$(awk ' + { + line=\$0 + sub(/^[ \\t]+/, "", line) + if (line == "") next + if (match(line, /^([A-Za-z][A-Za-z0-9_-]*)[ \\t]+/, m)) { + cmd=m[1] + lc=tolower(cmd) + if (lc != "usage" && lc != "options" && lc != "commands") print cmd + } + }' omnipeaks_help.txt | sort -u) + + has_compare=false + for cmd in \${commands}; do + lc=\$(printf '%s' "\${cmd}" | tr '[:upper:]' '[:lower:]') + if [ "\${lc}" = "compare" ] || [ "\${lc}" = "diff" ] || [ "\${lc}" = "differential" ]; then + has_compare=true + break + fi + done + + if [ -n "\${commands}" ]; then + commands_json=\$(printf '%s\n' "\${commands}" | awk 'BEGIN{first=1} {gsub(/\"/,"\\\\\""); if (!first) printf ", "; printf "\"%s\"", \$0; first=0} END{print ""}') + commands_json="[ \${commands_json} ]" + else + commands_json="[]" + fi + + cat <<-END_JSON > omnipeaks_capabilities.json + { + "commands": \${commands_json}, + "has_compare": \${has_compare} } - with open('omnipeaks_capabilities.json', 'w') as handle: - json.dump(out, handle, indent=2) - handle.write("\\n") - PY + END_JSON cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index a1114d02..4840ffea 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -8,13 +8,13 @@ process SPAN_COMPARE { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "conda-forge::openjdk=21.0.2 bioconda::samtools=1.19.2" + conda "conda-forge::openjdk=21.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : - 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + 'docker://eclipse-temurin:21-jre' : + 'eclipse-temurin:21-jre' }" input: - tuple val(meta), path(treated_bams), path(control_bams) + tuple val(meta), path(treated_bam), path(control_bam) path chrom_sizes path omnipeaks_jar val gap @@ -41,25 +41,9 @@ process SPAN_COMPARE { """ java_cmd=\${JAVA_CMD:-java} - treated_list=(${treated_bams}) - control_list=(${control_bams}) - - treated_input=${treated_bams} - control_input=${control_bams} - if [ \${#treated_list[@]} -gt 1 ]; then - samtools merge -f treated_pool.bam ${treated_bams} - samtools index treated_pool.bam - treated_input=treated_pool.bam - fi - if [ \${#control_list[@]} -gt 1 ]; then - samtools merge -f control_pool.bam ${control_bams} - samtools index control_pool.bam - control_input=control_pool.bam - fi - "\$java_cmd" -Xmx${java_heap} -jar ${omnipeaks_jar} compare \ - -t ${treated_input} \ - -c ${control_input} \ + -t ${treated_bam} \ + -c ${control_bam} \ --cs ${chrom_sizes} \ --gap ${gap} \ ${bin_arg} \ @@ -95,7 +79,6 @@ process SPAN_COMPARE { cat <<-END_VERSIONS > versions.yml "${task.process}": java: \$(\${java_cmd} -version 2>&1 | head -n 1 | sed -e 's/\"//g') - samtools: \$(samtools --version 2>&1 | head -n 1 | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ } diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index e88e826b..1fc69ad2 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -13,6 +13,7 @@ include { CHIPBINNER_BINS } from '../../modules/local/chipbinner_bins' include { CHIPBINNER_COUNTS } from '../../modules/local/chipbinner_counts' include { CHIPBINNER_HDBSCAN_GRID } from '../../modules/local/chipbinner_hdbscan_grid' include { CHIPBINNER_ROTS } from '../../modules/local/chipbinner_rots' +include { SAMTOOLS_MERGE_BAMS } from '../../modules/local/samtools_merge_bams' include { SPAN_CAPABILITY_PROBE } from '../../modules/local/span_capability_probe' include { SPAN_COMPARE } from '../../modules/local/span_compare' include { SPAN_FALLBACK_DIFF } from '../../modules/local/span_fallback_diff' @@ -386,37 +387,56 @@ workflow DIFFERENTIAL_PEAK_CALLING { SPAN_CAPABILITY_PROBE(Channel.value(file(params.omnipeaks_jar))) ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) + def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' + def span_use_native = span_mode == 'native' || span_mode == 'auto' + def span_use_fallback = span_mode == 'fallback' || span_mode == 'auto' + ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities .map { file -> new JsonSlurper().parse(file) } ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } - ch_span_groups = ch_valid_groups.map { rec -> - def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } - def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } - def meta = [id: rec.group, group: rec.group, treated: treated, control: control] - [meta, treated_bams, control_bams] - } + if (span_use_native) { + ch_span_merge_inputs = ch_valid_groups.flatMap { rec -> + def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } + def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } + [ + [[group: rec.group, role: 'treated', id: "${rec.group}.treated"], treated_bams], + [[group: rec.group, role: 'control', id: "${rec.group}.control"], control_bams] + ] + } - def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' + SAMTOOLS_MERGE_BAMS(ch_span_merge_inputs) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE_BAMS.out.versions) - ch_span_native_groups = Channel.empty() - if (span_mode == 'native') { - ch_span_native_groups = ch_span_groups - } else if (span_mode == 'auto') { - ch_span_native_groups = ch_span_groups.combine(ch_span_has_compare) - .filter { meta, treated_bams, control_bams, has_compare -> has_compare } - .map { meta, treated_bams, control_bams, has_compare -> [meta, treated_bams, control_bams] } - } + ch_span_merged_treated = SAMTOOLS_MERGE_BAMS.out.merged + .filter { meta, bam, bai -> meta.role == 'treated' } + .map { meta, bam, bai -> [meta.group, bam] } + + ch_span_merged_control = SAMTOOLS_MERGE_BAMS.out.merged + .filter { meta, bam, bai -> meta.role == 'control' } + .map { meta, bam, bai -> [meta.group, bam] } + + ch_span_compare_inputs = ch_span_merged_treated.join(ch_span_merged_control) + .map { group, treated_bam, control_bam -> + def meta = [id: group, group: group, treated: treated, control: control] + [meta, treated_bam, control_bam] + } + + ch_span_native_groups = ch_span_compare_inputs + if (span_mode == 'auto') { + ch_span_native_groups = ch_span_compare_inputs.combine(ch_span_has_compare) + .filter { meta, treated_bam, control_bam, has_compare -> has_compare } + .map { meta, treated_bam, control_bam, has_compare -> [meta, treated_bam, control_bam] } + } - if (span_mode == 'native' || span_mode == 'auto') { SPAN_COMPARE( ch_span_native_groups, - ch_chrom_sizes_single, - Channel.value(file(params.omnipeaks_jar)), - Channel.value(params.span_diff_gap), - Channel.value(params.span_diff_bin), - Channel.value(params.span_diff_fdr), - Channel.value(params.span_diff_java_heap) + ch_chrom_sizes_single, + Channel.value(file(params.omnipeaks_jar)), + Channel.value(params.span_diff_gap), + Channel.value(params.span_diff_bin), + Channel.value(params.span_diff_fdr), + Channel.value(params.span_diff_java_heap) ) ch_versions = ch_versions.mix(SPAN_COMPARE.out.versions) ch_summary_files = ch_summary_files.mix(SPAN_COMPARE.out.summary.map { meta, file -> file }) @@ -445,7 +465,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .map { rec, has_compare -> rec } } - if (span_mode == 'fallback' || span_mode == 'auto') { + if (span_use_fallback) { def span_callers = callers.findAll { it.startsWith('span_') } if (!span_callers) { ch_skipped_records = ch_skipped_records.mix( From 17833314ed7ff67e806b5446041273e4aa5da11c Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 20:03:26 -0800 Subject: [PATCH 13/31] Stage BAM indexes for chipbinner counts --- modules/local/chipbinner_counts.nf | 10 +++++++++- .../local/differential_peak_calling.nf | 20 +++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf index e7280d9e..0eafdc57 100644 --- a/modules/local/chipbinner_counts.nf +++ b/modules/local/chipbinner_counts.nf @@ -16,6 +16,8 @@ process CHIPBINNER_COUNTS { input: path bins val samples_json + path bams + path bais val group val use_spikein val pseudocount @@ -30,6 +32,8 @@ process CHIPBINNER_COUNTS { script: """\ + printf "%s\n" ${bams} > bam_paths.txt + python - <<'PY' import json import subprocess @@ -37,9 +41,13 @@ process CHIPBINNER_COUNTS { from pathlib import Path samples = json.loads(r'''${samples_json}''') - bam_paths = [s['bam'] for s in samples] + with open('bam_paths.txt') as handle: + bam_paths = [line.strip() for line in handle if line.strip()] sample_ids = [s['sample_id'] for s in samples] scale_factors = [s.get('spikein_scale_factor') for s in samples] + if len(bam_paths) != len(sample_ids): + sys.stderr.write(f\"Expected {len(sample_ids)} BAMs but found {len(bam_paths)}\\n\") + sys.exit(1) cmd = ["bedtools", "multicov", "-bams"] + bam_paths + ["-bed", "${bins}"] result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 1fc69ad2..1a53bce4 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -326,16 +326,20 @@ workflow DIFFERENTIAL_PEAK_CALLING { spikein_scale_factor: row.spikein_scale_factor ?: 'NA' ] } - [rec.group, JsonOutput.toJson(samples)] + def bams = ordered.collect { row -> row.bam } + def bais = ordered.collect { row -> row.bai } + [rec.group, JsonOutput.toJson(samples), bams, bais] } ch_chipbinner_inputs = CHIPBINNER_BINS.out.bins.join(ch_chipbinner_samples) - .map { group, bins_file, samples_json -> [bins_file, samples_json, group] } + .map { group, bins_file, samples_json, bams, bais -> [bins_file, samples_json, bams, bais, group] } CHIPBINNER_COUNTS( - ch_chipbinner_inputs.map { bins, samples_json, group -> bins }, - ch_chipbinner_inputs.map { bins, samples_json, group -> samples_json }, - ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bins }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bams }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bais }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, ch_use_spikein, Channel.value(params.chipbinner_pseudocount) ) @@ -343,7 +347,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { CHIPBINNER_HDBSCAN_GRID( CHIPBINNER_COUNTS.out.normalized, - ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, Channel.value(params.chipbinner_hdbscan_grid_min_cluster_size), Channel.value(params.chipbinner_hdbscan_grid_min_samples) ) @@ -352,10 +356,10 @@ workflow DIFFERENTIAL_PEAK_CALLING { CHIPBINNER_ROTS( CHIPBINNER_COUNTS.out.normalized, CHIPBINNER_HDBSCAN_GRID.out.clusters, - ch_chipbinner_inputs.map { bins, samples_json, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> samples_json }, Channel.value(treated), Channel.value(control), - ch_chipbinner_inputs.map { bins, samples_json, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, Channel.value(params.chipbinner_fdr), Channel.value(params.chipbinner_lfc), Channel.value(params.chipbinner_bootstrap), From d045ec4b05fbc2e598720d095ec9cadf873d613a Mon Sep 17 00:00:00 2001 From: dhusmann <77034415+dhusmann@users.noreply.github.com> Date: Sun, 4 Jan 2026 21:09:49 -0800 Subject: [PATCH 14/31] Create differential peak calling specification document Add a detailed specification for differential peak calling, including methods, goals, design principles, and implementation plans. --- .../SPEC_DIFFERENTIAL_PEAK_CALLING.md | 732 ++++++++++++++++++ 1 file changed, 732 insertions(+) create mode 100644 docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md new file mode 100644 index 00000000..b2cd88fb --- /dev/null +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -0,0 +1,732 @@ +# SPEC_DIFFERENTIAL_PEAK_CALLING.md +⸻ + +Differential Peak Calling Specification + +Status: Draft (implementation-ready) +Version: 1.0 +Date: 2026-01-04 +Target: dhusmann/cutandrun (branch: differential_peak_calling) + +⸻ + +Table of Contents + 1. Overview + 2. Goals and Non-goals + 3. Design Principles + 4. Pipeline Architecture + 1. Integrated vs posthoc execution + 2. Design manifest + 3. Subworkflow layout + 4. Data model and contrast semantics + 5. Validation and skip/strict behavior + 5. Shared Components + 1. Peak set and region annotation + 2. Summary + MultiQC inputs + 3. Cross-method and cross-caller overlaps + 6. Method 1: DiffBind + 7. Method 2: ChIPBinner + 8. Method 3: SPAN / OmniPeaks differential + 9. Output Structure + 10. User-facing Parameters + 11. Implementation Plan + 12. Testing Strategy and Acceptance Criteria + 13. Resource Labels and Containers + +⸻ + +Overview + +This spec adds differential peak calling / differential enrichment as a first-class (opt-in) feature layer for the pipeline, using three complementary approaches: + +Method Best for Core idea Primary outputs +DiffBind Peak-centric differential binding for punctate/focal marks and TFs Counts reads in a peak consensus/union and tests with DESeq2/edgeR Differential region TSV + up/down BED + diagnostic plots +ChIPBinner Broad marks and global redistribution / genome-wide shifts Genome-wide fixed binning + clustering + bin-level differential testing Bin-level differential TSV + cluster BEDs + diagnostic plots (+ optional enrichment) +SPAN / OmniPeaks differential SPAN users who want SPAN-consistent comparison Use OmniPeaks differential mode if supported, otherwise explicit fallback Differential regions BED/TSV (+ optional pooling manifest) + +Key requirement: differential analysis must run for every peak caller variant specified in --peakcaller (primary + secondary), producing parallel DiffBind outputs per caller. ChIPBinner and SPAN-diff run per group (mark), not per caller (unless SPAN fallback explicitly uses SPAN-derived peaks). + +⸻ + +Goals and Non-goals + +Goals + 1. Add opt-in differential analysis to the pipeline: + • DiffBind (per caller × group) + • ChIPBinner (per group) + • SPAN/OmniPeaks differential (per group; native if possible, explicit fallback otherwise) + 2. Support two execution modes: + • Integrated mode (normal run): differential executes after peak calling outputs exist. + • Posthoc mode: run differential-only from an existing --outdir without recomputing upstream steps. + 3. Require an explicit contrast definition to ensure consistent log2FC direction: + • log2FC = log2(treated / control) across all methods. + 4. If spike-in normalization is present and enabled, incorporate external scale factors into differential analysis. + 5. Emit stable, structured outputs including: + • raw result tables (TSV) + • annotated result tables (TSV) + • BED-like significant sets (up/down) + • diagnostic plots (PCA/correlation + method-specific) + • MultiQC-consumable summary tables + +Non-goals (v1) + • Automatic modeling of batch covariates (batch-aware design matrices). + • “Compare normalization methods” / “compare peak callers” decision frameworks (beyond overlap metrics and summary tables). + • A full custom MultiQC plugin/module (v1 uses MultiQC custom-content tables and plot discovery). + +⸻ + +Design Principles + 1. Layered architecture: differential analysis is a separate layer that: + • does not mutate upstream peak calling outputs, + • can run integrated or posthoc, + • consumes published artifacts only in posthoc mode (no work/ assumptions). + 2. Per-caller parity: any caller listed in --peakcaller is treated as a first-class source for DiffBind. + 3. Deterministic + resumable: manifests and samplesheets must be stable-sorted to support -resume and reproducibility. + 4. Explicit contrast direction: do not guess treated vs control from labels. + 5. Fail/skip is user-controlled: default behavior is to skip invalid comparisons with clear warnings, but a strict mode is provided to fail fast. + 6. Performance-aware design: avoid repeated expensive operations across methods; keep per-group computation parallelizable. + +⸻ + +Pipeline Architecture + +Integrated vs posthoc execution + +Integrated mode + • Differential runs when any --run_diffbind, --run_chipbinner, or --run_span_diff is enabled. + • The workflow consumes in-memory channels produced by upstream alignment + peak calling. + +Posthoc mode + • Set --differential_from_run to run differential-only: + • A dedicated entry workflow DIFFERENTIAL_ONLY reads manifest(s) from , + • reconstructs the necessary channels (BAM/BAI, per-caller peaks, optional scale factors), + • runs the same DIFFERENTIAL_PEAK_CALLING subworkflow. + +Design manifest + +To make posthoc differential reliable, the main pipeline writes deterministic manifests that contain published paths under --outdir. + +Emit location (new): +03_peak_calling/06_differential/00_manifests/ + +Files (new): + 1. differential_manifest.samples.tsv +One row per target sample (not IgG), minimum columns: + + • sample_id + • group + • condition + • replicate + • final_bam + • final_bai + • normalisation_mode + • spikein_scale_factor (or NA) + • bigwig (optional; NA if absent) + + 2. differential_manifest.peaks.tsv +One row per (sample_id, caller): + + • sample_id + • caller + • peaks_path + • peaks_format (bed/narrowPeak/broadPeak/other) + • caller_role (primary|secondary; optional) + + 3. differential_manifest.run_meta.json +Captures: + + • pipeline version/hash + • reference genome id (if any) + • the caller list used + • the default contrast if provided (optional) + +Manifest rules + • Sorted by stable keys (group, condition, replicate, sample_id, caller) before writing. + • Contains only published paths under --outdir. + +Subworkflow layout + +Add a new subworkflow: + +subworkflows/local/differential_peak_calling.nf + +It contains three gated method blocks and shared utilities: + +DIFFERENTIAL_PEAK_CALLING +├── WRITE_DIFFERENTIAL_MANIFESTS (integrated mode only) +├── (optional) VALIDATE_DIFFERENTIAL_DESIGN +├── DIFFBIND (per group × caller) +│ ├── MAKE_DIFFBIND_SAMPLESHEETS +│ ├── RUN_DIFFBIND_R +│ ├── ANNOTATE_REGIONS (shared) +│ └── DIFFBIND_SUMMARY +├── CHIPBINNER (per group) +│ ├── MAKE_BINS (or LOAD_BINS) +│ ├── BIN_COUNTS +│ ├── CHIPBINNER_CLUSTER_GRID (HDBSCAN grid search) +│ ├── CHIPBINNER_SELECT_MODELS +│ ├── CHIPBINNER_DIFFERENTIAL (ROTS) +│ ├── ANNOTATE_REGIONS (shared) +│ ├── (optional) CHIPBINNER_ENRICHMENT (LOLA) +│ └── CHIPBINNER_SUMMARY +├── SPAN_DIFF (per group) +│ ├── OMNIPEAKS_CAPABILITY_PROBE +│ ├── SPAN_COMPARE (native) OR SPAN_FALLBACK_DIFF +│ ├── ANNOTATE_REGIONS (shared) +│ └── SPAN_SUMMARY +└── CROSS_COMPARISON (optional) + ├── OVERLAP_CALLERS (DiffBind across callers) + └── OVERLAP_METHODS (DiffBind vs SPAN vs ChIPBinner) + +Add a new DSL2 entry workflow: + • workflow DIFFERENTIAL_ONLY (new entrypoint) + • Reads manifests from --differential_from_run + • Runs DIFFERENTIAL_PEAK_CALLING only + +Data model and contrast semantics + +Existing metadata (already in fork per existing work): + • meta.group (mark/target) + • meta.condition + • meta.replicate (int) + • meta.sample_id / meta.id (unique) + • meta.caller (peak caller id) + +Contrast definition (required for all methods): + • --differential_contrast "TREATED,CONTROL" + • log2FC sign convention: log2(TREATED / CONTROL) + +Filtering rule (v1): + • For each group, differential compares only the two conditions in --differential_contrast. + • If additional conditions exist for that group: + • default: ignore them with a warning + • strict mode: error + +Validation and skip/strict behavior + +Validation is performed per group (and per caller where applicable): + +Checks: + • both contrast conditions exist for the group + • each condition has ≥ --differential_min_replicates + • required inputs exist on disk (BAM/BAI + peak files) + +Behavior: + • default: invalid comparisons are skipped with a clear warning and included in a skipped.tsv report + • --differential_strict true: any invalid comparison causes the workflow to fail + +⸻ + +Shared Components + +Peak set and region annotation + +All methods must produce a BED-like region set that can be annotated consistently. + +Shared annotation module: + • modules/local/annotate_regions.nf + • Uses bedtools closest against one of: + • --gene_bed if present, else + • a derived TSS BED from --gtf if present, else + • skip annotation with warning (still emit unannotated outputs) + +Annotation fields appended to result TSV (minimum): + • nearest_feature_id (or gene id/name if available) + • distance_to_feature + +Summary + MultiQC inputs + +Each method writes a small summary TSV (one row per group/caller comparison) with standard columns: + +Standard summary columns: + • method (diffbind|chipbinner|span) + • group + • caller (diffbind only; else NA) + • treated_condition + • control_condition + • n_tested + • n_fdr_pass + • n_up + • n_down + +Additional method-specific columns may be appended (e.g., chosen HDBSCAN parameters). + +MultiQC integration (v1): + • MultiQC custom content tables discover *.summary.tsv under differential output tree. + • No custom plugin required for v1. + +Cross-method and cross-caller overlaps + +To provide immediate utility and “best-of-both” value, emit overlap metrics: + 1. Across peak callers (DiffBind significant sets) + + • For each group: + • compute pairwise overlap counts and Jaccard between caller outputs + + 2. Across methods + + • For each group: + • overlap DiffBind significant BED (per caller) with: + • SPAN significant BED (if present) + • ChIPBinner cluster-derived differential BEDs (if present) + • Emit both summary TSV and optional plots (UpSet if feasible; v1 can be summary-only) + +Implementation: + • bedtools-based overlap (fast, deterministic) + • 03_peak_calling/06_differential/04_cross_comparison/ + +⸻ + +Method 1: DiffBind + +Purpose + +Run classic differential binding using called peaks as inputs. Runs separately for each group × caller. + +Inputs + +Per sample: + • final BAM + BAI + • called peaks for the selected caller variant + +Per comparison: + • explicit contrast (treated/control) + • optional spike-in scale factor per sample (if enabled) + +Samplesheet generation + +Write one DiffBind samplesheet CSV per (group, caller): + +Path: +03_peak_calling/06_differential/01_diffbind/00_samplesheets//.diffbind.csv + +Columns (minimum DiffBind-compatible): + • SampleID = sample id + • Factor = group + • Condition = condition + • Replicate = replicate + • bamReads = final bam path + • Peaks = peak path + +Recommended extra columns: + • Tissue (constant, e.g. CUTRUN) + • Caller / PeakCaller (for provenance) + • SpikeinScaleFactor (optional; used by runner if enabled) + +Execution + +Create a new module: + • modules/local/diffbind_run.nf + • bin/diffbind_run.R + +Runner responsibilities: + 1. Validate design for the specific group×caller subset. + 2. Construct DiffBind object from samplesheet. + 3. Count reads in peaks: + • default: no summit recentering (use full peak widths) + • optional: if --diffbind_summits > 0, use summit-centered widths + 4. Normalization: + • if external scaling enabled and scale factors exist: + • inject user-provided scaling into the downstream DE test (preferred) + • else: + • use DiffBind’s standard normalization/backend + 5. Contrast: + • enforce treated vs control and consistent sign convention + 6. Report: + • full TSV + • significant BED sets: + • up (log2FC > 0 & FDR pass) + • down (log2FC < 0 & FDR pass) + 7. Plots (minimum): + • PCA + • correlation heatmap + • MA + • volcano + • heatmap (binding affinity) + • (optional) Venn/upset across conditions (if supported) + +Spike-in scaling integration (v1 implementation contract) + +If --differential_use_spikein resolves to true: + • Runner must attempt to use external sample scaling so differential log2FC reflects spike-in normalization. + • Implementation must be explicit and testable: + • either by setting normalization factors in DiffBind/DESeq2/edgeR, or + • by extracting the count matrix and running DESeq2/edgeR with user-provided size factors. + +Outputs + +Per (caller, group): + +03_peak_calling/06_differential/01_diffbind/// + +Minimum artifacts: + • diffbind.results.tsv + • diffbind.results.annotated.tsv (if annotation available) + • diffbind.significant.bed + • diffbind.significant_up.bed + • diffbind.significant_down.bed + • plots/PCA.pdf + • plots/correlation_heatmap.pdf + • plots/MA.pdf + • plots/volcano.pdf + • plots/heatmap.pdf + • diffbind.summary.tsv + +⸻ + +Method 2: ChIPBinner + +Purpose + +Broad-mark differential enrichment with robustness to global changes: + • fixed genomic bins (default 10kb) + • normalization (external scaling when available) + • clustering (HDBSCAN grid) + • bin-level differential testing (ROTS) + • optional enrichment (LOLA) + +Inputs + +Per sample: + • final target BAM + BAI + • group / condition / replicate metadata + • optional external scale factors: + • spike-in scale factors (preferred when present) + • MS coefficients YAML (optional) + +Reference: + • chrom sizes (*.sizes) + • optional blacklist BED + +Workflow (per group) + 1. Bin definition + • default: generate bins from chrom sizes + • optional: load precomputed windows from --chipbinner_windows_dir + • optional: subtract blacklist regions + 2. Quantification + • compute bin-level counts matrix across samples + 3. Normalization + • apply external scaling factors if enabled + • add pseudocount (default 1) before log transforms for clustering/plots + 4. QC + • PCA plot + • correlation heatmap + 5. Clustering (grid search) + • run HDBSCAN over bins using a parameter grid: + • min_cluster_size values + • min_samples/minPts values + • emit: + • chipbinner.hdbscan_grid_summary.tsv ranking solutions by: + • number of clusters (excluding noise) + • fraction of bins assigned (non-noise) + • mean cluster stability/persistence (if available) + • replicate consistency proxy (optional metric) + 6. Model selection + • choose: + • best overall solution + • best solution yielding ~2 clusters (if any) + • best solution yielding ~3 clusters (if any) + 7. Differential testing + • run ROTS on bins (replicates separate) + • produce bin-level log2FC/FDR with consistent sign + 8. Cluster labeling + • label clusters by behavior based on per-cluster mean signals: + • stable (similar signal) + • control_enriched + • treated_enriched + • noise (if applicable) + 9. Annotation + • annotate bins and/or cluster BEDs to nearest features (shared module) + 10. (Optional) Enrichment + + • if a LOLA DB is supplied, run enrichment per cluster + +Outputs (per group) + +03_peak_calling/06_differential/02_chipbinner// + +Minimum artifacts: + • bins/..windows.bed + • counts/chipbinner.bin_counts.tsv + • counts/chipbinner.normalized_matrix.tsv + • clustering/chipbinner.hdbscan_grid_summary.tsv + • clustering/chipbinner.clusters.best.tsv + • differential/chipbinner.differential.tsv + • differential/chipbinner.differential.annotated.tsv (if annotation available) + • bed/chipbinner.significant_up.bed + • bed/chipbinner.significant_down.bed + • plots/PCA.pdf + • plots/correlation_heatmap.pdf + • plots/density_scatter.pdf (treated vs control means, colored by cluster) + • chipbinner.summary.tsv +Optional enrichment (if enabled): + • enrichment/_lola.tsv + • enrichment/_lola_plot.pdf + +⸻ + +Method 3: SPAN / OmniPeaks differential + +Purpose + +Provide a SPAN-consistent differential workflow: + • Prefer OmniPeaks native differential mode if supported. + • Otherwise, run an explicit fallback and label it clearly. + +Capability detection + +Add a probe step: + • OMNIPEAKS_CAPABILITY_PROBE + • Runs java -jar omnipeaks.jar --help (or equivalent) and parses supported subcommands. + • Emits omnipeaks_capabilities.json for audit. + +Native mode (if supported) + +If a native differential subcommand exists (e.g., compare): + • Run it per group using the two contrast conditions. + • Replicate handling: + • if tool supports multiple BAMs per condition: pass them + • else: pool BAMs per condition and write pooling manifest + +Fallback mode (explicit) + +If no native differential exists: + • Do not silently pretend it does. + • Implement “SPAN peaks + DESeq2/edgeR” fallback: + 1. Require SPAN peak outputs to exist (SPAN caller present in --peakcaller or peaks provided in manifests). + 2. Build union peak set across samples in the contrast. + 3. Count reads per union region from final BAMs. + 4. Run DESeq2/edgeR with external scaling if enabled. + 5. Emit outputs under span_fallback/ with clear naming. + +Outputs (per group) + +03_peak_calling/06_differential/03_span// + +Native: + • span.differential.tsv + • span.differential.bed + • span.differential.annotated.tsv (if annotation available) + • span.significant_up.bed + • span.significant_down.bed + • span.summary.tsv + • 00_manifests/span_pooling.tsv (only if pooling used) + +Fallback: + • same outputs but with span_fallback.* prefix and a span_fallback.readme.txt describing the method. + +⸻ + +Output Structure + +All outputs live under: + +03_peak_calling/06_differential/ + +03_peak_calling/06_differential/ +├── 00_manifests/ +│ ├── differential_manifest.samples.tsv +│ ├── differential_manifest.peaks.tsv +│ └── differential_manifest.run_meta.json +├── 01_diffbind/ +│ ├── 00_samplesheets//.diffbind.csv +│ ├── // +│ │ ├── diffbind.results.tsv +│ │ ├── diffbind.results.annotated.tsv +│ │ ├── diffbind.significant.bed +│ │ ├── diffbind.significant_up.bed +│ │ ├── diffbind.significant_down.bed +│ │ ├── diffbind.summary.tsv +│ │ └── plots/*.pdf +├── 02_chipbinner/ +│ └── / +│ ├── bins/*.bed +│ ├── counts/*.tsv +│ ├── clustering/*.tsv +│ ├── differential/*.tsv +│ ├── bed/*.bed +│ ├── plots/*.pdf +│ ├── enrichment/ (optional) +│ └── chipbinner.summary.tsv +├── 03_span/ +│ └── / +│ ├── span.differential.tsv +│ ├── span.differential.bed +│ ├── span.differential.annotated.tsv +│ ├── span.significant_up.bed +│ ├── span.significant_down.bed +│ ├── span.summary.tsv +│ └── 00_manifests/ (optional) +├── 04_cross_comparison/ (optional) +│ ├── overlap_callers.tsv +│ ├── overlap_methods.tsv +│ └── plots/ (optional) +└── multiqc/ + ├── differential.summary.all_methods.tsv + ├── differential.skipped.tsv + └── (optional) differential_multiqc_report.html + + +⸻ + +User-facing Parameters + +Top-level toggles + • --run_diffbind (bool; default: false) + • --run_chipbinner (bool; default: false) + • --run_span_diff (bool; default: false) + +Execution mode + • --differential_from_run (string; default: null) + • if set: run differential-only using manifests from that outdir + +Contrast and validation behavior + • --differential_contrast "TREATED,CONTROL" (string; required if any method enabled) + • --differential_min_replicates (int; default: 2) + • --differential_strict (bool; default: false) + • --differential_use_spikein (string; default: "auto") + • allowed: auto|true|false + • auto resolves to true iff pipeline normalization mode is spike-in and scale factors exist + +Shared outputs / QC + • --differential_annotate (bool; default: true) + • --differential_cross_compare (bool; default: true) + • --differential_run_multiqc (bool; default: true) + +DiffBind parameters + • --diffbind_fdr (float; default: 0.05) + • --diffbind_lfc (float; default: 1.0) + • --diffbind_backend (string; default: DESeq2) + • allowed: DESeq2|edgeR + • --diffbind_min_overlap (int; default: 2) + • --diffbind_summits (int; default: 0) + • 0 = do not recenter; >0 = summit-based fixed width + • --diffbind_extra_params (path; default: null) + • JSON/YAML override file for advanced DiffBind runner options + +ChIPBinner parameters + • --chipbinner_bin_size (int; default: 10000) + • --chipbinner_windows_dir (path; default: null) + • if set, load bins from this dir; else generate + • --chipbinner_blacklist (path; default: null) + • --chipbinner_use_input (bool; default: false) + • --chipbinner_pseudocount (int; default: 1) + • --chipbinner_fdr (float; default: 0.05) + • --chipbinner_lfc (float; default: 1.0) + • --chipbinner_bootstrap (int; default: 1000) + • --chipbinner_k_value (int; default: 100000) + • --chipbinner_hdbscan_grid_min_cluster_size (string; default: "100,200,500,1000") + • --chipbinner_hdbscan_grid_min_samples (string; default: "100,200,500,1000") + • --chipbinner_ms_coeffs (path; default: null) + • --chipbinner_lola_db (path; default: null) + • --chipbinner_run_lola (bool; default: false) + +SPAN / OmniPeaks parameters + • --omnipeaks_jar (path; required if --run_span_diff) + • --span_diff_mode (string; default: auto) + • auto|native|fallback + • --span_diff_fdr (float; default: 0.05) + • --span_diff_gap (int; default: 5) + • --span_diff_bin (int; default: 200) + • --span_diff_java_heap (string; default: 8G) + +⸻ + +Implementation Plan + +New files + +Subworkflows: + • subworkflows/local/differential_peak_calling.nf + • workflows/differential_only.nf (or add new entrypoint in existing workflow file) + +Modules: + • modules/local/write_differential_manifests.nf + • modules/local/diffbind_run.nf + • modules/local/chipbinner_bins.nf + • modules/local/chipbinner_counts.nf + • modules/local/chipbinner_hdbscan_grid.nf + • modules/local/chipbinner_rots.nf + • modules/local/span_capability_probe.nf + • modules/local/span_compare.nf + • modules/local/span_fallback_diff.nf + • modules/local/annotate_regions.nf + • modules/local/differential_overlap.nf + +Scripts: + • bin/diffbind_run.R + • bin/chipbinner_rots.R + • bin/chipbinner_select_models.R (or embedded in rots runner) + • bin/span_fallback_deseq2.R (or shared DE runner) + • bin/hdbscan_grid.py (if clustering done in python) + • bin/annotate_regions.sh (optional wrapper around bedtools) + +Modified files + • workflows/cutandrun.nf (call DIFFERENTIAL_PEAK_CALLING in integrated mode) + • nextflow.config (params defaults) + • nextflow_schema.json (new params + help text) + • conf/base.config (resource labels for new processes) + • assets/multiqc_config.yml (custom content tables ordering + file cleanup) + • docs/usage.md (new params & examples) + • docs/output.md (new output tree section) + • tests/ (new minimal tests and/or profile behavior) + +⸻ + +Testing Strategy and Acceptance Criteria + +Minimal CI-friendly tests (recommended) + 1. Parameter validation tests: + + • missing --differential_contrast when methods enabled → fails + • only one condition present → skipped (default) or fails (strict) + • <2 replicates/condition → skipped (default) or fails (strict) + + 2. Manifest generation test: + + • after a small peak-calling test run, confirm: + • differential_manifest.samples.tsv exists with expected columns + • differential_manifest.peaks.tsv exists and includes all callers + + 3. DiffBind samplesheet generation test: + + • enabling --run_diffbind should create per-caller per-group samplesheets + +Optional heavier integration tests + • Run a small dataset with 2×2 design and confirm: + • DiffBind outputs exist for at least one caller and group + • ChIPBinner outputs exist for one group + • SPAN diff produces either native or fallback outputs (depending on tool availability) + +Acceptance criteria + • DiffBind runs per group × caller and respects log2FC direction. + • Posthoc differential-only mode runs without upstream recompute (using manifests). + • Spike-in scaling is applied when enabled and scale factors exist. + • Each method emits: result TSV, annotated TSV (if possible), significant BEDs, plots, summary TSV. + • MultiQC custom content tables can load the summaries. + +⸻ + +Resource Labels and Containers + +Resource labels (conf/base.config) + +Add/extend labels: + • process_single (default) + • process_medium (DiffBind, annotation) + • process_high (ChIPBinner binning/clustering, DE tests) + • process_long (SPAN compare, heavy clustering) + +Containers / environments + +DiffBind: + • R (>=4.2), Bioconductor DiffBind, DESeq2, edgeR, ggplot2, rtracklayer + +ChIPBinner: + • R (>=4.2), ROTS, (optional) LOLA, plus clustering dependencies + • If python used: python3 + hdbscan + numpy/pandas + +SPAN: + • Java runtime and omnipeaks.jar + +All modules must emit versions.yml for nf-core provenance. From def547c9c67d0306d5b439999be67e02b33d5785 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 22:59:38 -0800 Subject: [PATCH 15/31] Fix SPAN differential native/fallback handling --- bin/span_fallback_de.R | 194 ++++++++++ docs/usage.md | 2 +- modules/local/span_capability_probe.nf | 43 ++- modules/local/span_compare.nf | 137 ++++++- modules/local/span_fallback_diff.nf | 46 +-- modules/local/span_pooling_manifest.nf | 52 +++ .../local/differential_peak_calling.nf | 365 ++++++++++++++---- 7 files changed, 733 insertions(+), 106 deletions(-) create mode 100755 bin/span_fallback_de.R create mode 100644 modules/local/span_pooling_manifest.nf diff --git a/bin/span_fallback_de.R b/bin/span_fallback_de.R new file mode 100755 index 00000000..82aceea5 --- /dev/null +++ b/bin/span_fallback_de.R @@ -0,0 +1,194 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list <- list( + make_option(c("--samplesheet"), type = "character"), + make_option(c("--treated"), type = "character"), + make_option(c("--control"), type = "character"), + make_option(c("--group"), type = "character", default = "NA"), + make_option(c("--fdr"), type = "double", default = 0.05), + make_option(c("--backend"), type = "character", default = "DESeq2"), + make_option(c("--use_spikein"), type = "character", default = "false"), + make_option(c("--prefix"), type = "character", default = "span_fallback") +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +suppressPackageStartupMessages({ + library(GenomicRanges) + library(GenomicAlignments) + library(Rsamtools) +}) + +use_spikein <- tolower(opt$use_spikein) %in% c("true", "1", "yes") + +samplesheet <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) +required_cols <- c("SampleID", "Condition", "bamReads", "Peaks") +missing <- setdiff(required_cols, colnames(samplesheet)) +if (length(missing) > 0) { + stop("Missing required columns in samplesheet: ", paste(missing, collapse = ", ")) +} + +samplesheet <- samplesheet[samplesheet$Condition %in% c(opt$treated, opt$control), , drop = FALSE] +if (nrow(samplesheet) == 0) { + stop("No samples remain after filtering to contrast conditions") +} + +read_peaks <- function(path) { + df <- tryCatch({ + read.table(path, sep = "\t", header = FALSE, comment.char = "", quote = "", stringsAsFactors = FALSE) + }, error = function(e) { + stop("Failed to read peaks file: ", path) + }) + if (ncol(df) < 3) { + stop("Peaks file has fewer than 3 columns: ", path) + } + df <- df[, 1:3] + suppressWarnings({ + df <- df[!is.na(as.numeric(df[, 2])) & !is.na(as.numeric(df[, 3])), , drop = FALSE] + }) + colnames(df) <- c("chr", "start", "end") + df$start <- as.integer(df$start) + df$end <- as.integer(df$end) + df +} + +if (any(is.na(samplesheet$Peaks)) || any(samplesheet$Peaks == "")) { + stop("Missing SPAN peak files in samplesheet for fallback differential.") +} + +peak_list <- lapply(samplesheet$Peaks, read_peaks) +peaks_list_path <- paste0(opt$prefix, ".peaks.list") +writeLines(paste(samplesheet$Peaks, collapse = ";"), peaks_list_path) +gr_list <- lapply(peak_list, function(df) { + GRanges(seqnames = df$chr, ranges = IRanges(start = df$start + 1L, end = df$end)) +}) + +union_gr <- reduce(do.call(c, gr_list)) + +union_df <- data.frame( + chr = as.character(seqnames(union_gr)), + start = start(union_gr) - 1L, + end = end(union_gr), + stringsAsFactors = FALSE +) + +union_bed <- paste0(opt$prefix, ".union.bed") +write.table(union_df, union_bed, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +bam_files <- samplesheet$bamReads +sample_ids <- samplesheet$SampleID +names(bam_files) <- sample_ids + +detect_paired <- function(bam) { + bf <- BamFile(bam, yieldSize = 1000) + open(bf) + flags <- tryCatch({ + scanBam(bf, param = ScanBamParam(what = "flag"))[[1]]$flag + }, error = function(e) { + integer() + }) + close(bf) + if (length(flags) == 0) { + return(FALSE) + } + any(bitwAnd(flags, 1L) != 0) +} + +paired_flags <- vapply(bam_files, detect_paired, logical(1)) +single_end <- !any(paired_flags) + +bam_list <- BamFileList(bam_files, yieldSize = 1000000) +se <- summarizeOverlaps( + features = union_gr, + reads = bam_list, + mode = "Union", + singleEnd = single_end, + ignore.strand = TRUE +) + +counts <- assay(se) +colnames(counts) <- sample_ids + +counts_df <- cbind(union_df, counts) +counts_path <- paste0(opt$prefix, ".counts.tsv") +write.table(counts_df, counts_path, sep = "\t", quote = FALSE, row.names = FALSE) + +scale_factors <- NULL +if ("SpikeinScaleFactor" %in% colnames(samplesheet)) { + scale_factors <- suppressWarnings(as.numeric(samplesheet$SpikeinScaleFactor)) +} + +if (use_spikein && (is.null(scale_factors) || any(is.na(scale_factors)))) { + message("Spike-in scaling requested but scale factors missing; falling back to default normalization") + use_spikein <- FALSE +} + +backend <- toupper(opt$backend) + +run_deseq2 <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(DESeq2)) + cond <- factor(conditions, levels = c(control, treated)) + dds <- DESeqDataSetFromMatrix(countData = counts, colData = data.frame(condition = cond), design = ~condition) + if (!is.null(scale_factors)) { + sizeFactors(dds) <- 1 / scale_factors + } + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treated, control)) + data.frame(log2FC = res$log2FoldChange, pvalue = res$pvalue, FDR = res$padj) +} + +run_edger <- function(counts, conditions, scale_factors, treated, control) { + suppressPackageStartupMessages(library(edgeR)) + cond <- factor(conditions, levels = c(control, treated)) + y <- DGEList(counts = counts, group = cond) + if (!is.null(scale_factors)) { + y$samples$norm.factors <- 1 / scale_factors + } else { + y <- calcNormFactors(y) + } + design <- model.matrix(~cond) + y <- estimateDisp(y, design) + fit <- glmQLFit(y, design) + qlf <- glmQLFTest(fit, coef = 2) + data.frame(log2FC = qlf$table$logFC, pvalue = qlf$table$PValue, FDR = p.adjust(qlf$table$PValue, method = "BH")) +} + +conditions <- samplesheet$Condition +if (backend == "EDGER") { + res_stats <- run_edger(counts, conditions, if (use_spikein) scale_factors else NULL, opt$treated, opt$control) +} else { + res_stats <- run_deseq2(counts, conditions, if (use_spikein) scale_factors else NULL, opt$treated, opt$control) +} + +res_stats$log2FC[is.na(res_stats$log2FC)] <- 0 +res_stats$pvalue[is.na(res_stats$pvalue)] <- 1 +res_stats$FDR[is.na(res_stats$FDR)] <- 1 + +res_df <- cbind(union_df, res_stats) +write.table(res_df, paste0(opt$prefix, ".results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) + +sig <- res_df[res_df$FDR <= opt$fdr, , drop = FALSE] +write.table(sig[, c("chr", "start", "end")], paste0(opt$prefix, ".significant.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +up <- sig[sig$log2FC > 0, , drop = FALSE] +down <- sig[sig$log2FC < 0, , drop = FALSE] +write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +summary <- data.frame( + method = "span_fallback", + group = opt$group, + caller = "NA", + treated_condition = opt$treated, + control_condition = opt$control, + n_tested = nrow(res_df), + n_fdr_pass = nrow(sig), + n_up = nrow(up), + n_down = nrow(down), + use_spikein = if (use_spikein) "true" else "false", + span_mode_used = "fallback" +) +write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/docs/usage.md b/docs/usage.md index 9f10a769..d606d195 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -207,7 +207,7 @@ ChIPBinner options: SPAN differential options: -- `--omnipeaks_jar` (required for SPAN differential) +- `--omnipeaks_jar` (required for native SPAN differential; auto mode falls back if missing) - `--span_diff_mode` (auto|native|fallback), `--span_diff_fdr`, `--span_diff_gap`, `--span_diff_bin` - `--span_diff_java_heap` diff --git a/modules/local/span_capability_probe.nf b/modules/local/span_capability_probe.nf index 6cb7de53..c0b6cd3b 100644 --- a/modules/local/span_capability_probe.nf +++ b/modules/local/span_capability_probe.nf @@ -39,13 +39,49 @@ process SPAN_CAPABILITY_PROBE { }' omnipeaks_help.txt | sort -u) has_compare=false + compare_cmd="" for cmd in \${commands}; do lc=\$(printf '%s' "\${cmd}" | tr '[:upper:]' '[:lower:]') if [ "\${lc}" = "compare" ] || [ "\${lc}" = "diff" ] || [ "\${lc}" = "differential" ]; then has_compare=true - break + fi + if [ -z "\${compare_cmd}" ] && [ "\${lc}" = "compare" ]; then + compare_cmd="\${cmd}" fi done + if [ -z "\${compare_cmd}" ]; then + for cmd in \${commands}; do + lc=\$(printf '%s' "\${cmd}" | tr '[:upper:]' '[:lower:]') + if [ "\${lc}" = "diff" ] || [ "\${lc}" = "differential" ]; then + compare_cmd="\${cmd}" + break + fi + done + fi + + compare_multibam=false + compare_multibam_mode="single" + if [ -n "\${compare_cmd}" ]; then + java -jar ${omnipeaks_jar} "\${compare_cmd}" --help > omnipeaks_compare_help.txt 2>/dev/null || true + if [ -s omnipeaks_compare_help.txt ]; then + if grep -Eiq 'comma[- ]?separated|comma separated|list|multiple|replicate|replicates|bams?' omnipeaks_compare_help.txt; then + compare_multibam=true + fi + if grep -Eiq 'comma[- ]?separated|bam[^ ]*,|bam\[,?bam' omnipeaks_compare_help.txt; then + compare_multibam=true + compare_multibam_mode="comma" + fi + if grep -Eiq 'repeat|multiple[[:space:]]+times|use[[:space:]]+multiple' omnipeaks_compare_help.txt; then + if [ "\${compare_multibam_mode}" = "single" ]; then + compare_multibam_mode="repeat" + fi + compare_multibam=true + fi + if [ "\${compare_multibam}" = true ] && [ "\${compare_multibam_mode}" = "single" ]; then + compare_multibam_mode="comma" + fi + fi + fi if [ -n "\${commands}" ]; then commands_json=\$(printf '%s\n' "\${commands}" | awk 'BEGIN{first=1} {gsub(/\"/,"\\\\\""); if (!first) printf ", "; printf "\"%s\"", \$0; first=0} END{print ""}') @@ -57,7 +93,10 @@ process SPAN_CAPABILITY_PROBE { cat <<-END_JSON > omnipeaks_capabilities.json { "commands": \${commands_json}, - "has_compare": \${has_compare} + "has_compare": \${has_compare}, + "compare_command": "\${compare_cmd}", + "compare_multi_bam": \${compare_multibam}, + "compare_multi_bam_mode": "\${compare_multibam_mode}" } END_JSON diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index 4840ffea..39d9580e 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -14,21 +14,25 @@ process SPAN_COMPARE { 'eclipse-temurin:21-jre' }" input: - tuple val(meta), path(treated_bam), path(control_bam) + tuple val(meta), path(treated_bams), path(control_bams) path chrom_sizes path omnipeaks_jar val gap val bin val fdr val java_heap + val compare_cmd + val multibam_mode output: tuple val(meta), path("span.differential.tsv") , emit: tsv tuple val(meta), path("span.differential.bed") , emit: bed - tuple val(meta), path("span.significant.bed") , emit: significant + tuple val(meta), path("span.significant.bed") , emit: significant tuple val(meta), path("span.significant_up.bed") , emit: up tuple val(meta), path("span.significant_down.bed") , emit: down tuple val(meta), path("span.summary.tsv") , emit: summary + path "00_manifests/span_compare_inputs.tsv" , emit: manifest + path "span.readme.txt" , optional: true, emit: readme path "versions.yml" , emit: versions when: @@ -38,12 +42,37 @@ process SPAN_COMPARE { def prefix = task.ext.prefix ?: "${meta.id}.span_compare" def fdr_arg = fdr ? "--fdr ${fdr}" : '' def bin_arg = bin ? "--bin ${bin}" : '' + def treated_manifest = meta.treated_bams instanceof List ? meta.treated_bams.join(';') : (meta.treated_bams ?: '') + def control_manifest = meta.control_bams instanceof List ? meta.control_bams.join(';') : (meta.control_bams ?: '') + def input_mode = meta.input_mode ?: 'native' + def compare_mode = multibam_mode ?: 'single' + def compare_command = compare_cmd ?: 'compare' """ java_cmd=\${JAVA_CMD:-java} + compare_mode="${compare_mode}" + compare_command="${compare_command}" - "\$java_cmd" -Xmx${java_heap} -jar ${omnipeaks_jar} compare \ - -t ${treated_bam} \ - -c ${control_bam} \ + treated_list=( ${treated_bams} ) + control_list=( ${control_bams} ) + if [ "\${compare_mode}" = "repeat" ]; then + treated_args="" + for bam in "\${treated_list[@]}"; do + treated_args="\${treated_args} -t \${bam}" + done + control_args="" + for bam in "\${control_list[@]}"; do + control_args="\${control_args} -c \${bam}" + done + else + treated_joined=\$(IFS=,; echo "\${treated_list[*]}") + control_joined=\$(IFS=,; echo "\${control_list[*]}") + treated_args="-t \${treated_joined}" + control_args="-c \${control_joined}" + fi + + "\$java_cmd" -Xmx${java_heap} -jar ${omnipeaks_jar} \${compare_command} \ + \${treated_args} \ + \${control_args} \ --cs ${chrom_sizes} \ --gap ${gap} \ ${bin_arg} \ @@ -60,22 +89,106 @@ process SPAN_COMPARE { fi cp "\$diff_file" span.differential.tsv - awk 'BEGIN{OFS="\t"} {print \$1,\$2,\$3}' "\$diff_file" > span.differential.bed - # Attempt to split by sign using column 5 if numeric - awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5>0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_up.bed - awk 'BEGIN{OFS="\t"} {if (\$5 ~ /^-?[0-9.]+\$/) {if (\$5<0) print \$1,\$2,\$3}}' "\$diff_file" > span.significant_down.bed + : > span.significant_up.bed + : > span.significant_down.bed + cat <<'AWK' > span_compare_filter.awk + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } + function calc_fdr(val) { + if (!isnum(val)) return 1 + if (val <= 1) return val + 0 + return 10^(-val) + } + function calc_log2(val) { + if (!isnum(val)) return "" + if (val < 0) return val + 0 + if (val == 0) return 0 + return log(val) / log(2) + } + BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3; log2_idx=0; fdr_idx=0; qval_idx=0; fold_idx=0; proxy_log2=0 } + /^#/ || /^track/ || /^browser/ { next } + !header_done { + if (!isnum($2)) { + for (i=1; i<=NF; i++) { + col=tolower($i) + if (col ~ /^(chr|chrom|seqnames|seqname)$/) chr_idx=i + if (col == "start") start_idx=i + if (col == "end") end_idx=i + if (col ~ /log2fc|log2foldchange|log2_fold_change|log2fold|log_fc|log2ratio|log2_ratio/) log2_idx=i + if (col ~ /^(fdr|false.discovery|padj|adj_p|adjp)$/) fdr_idx=i + if (col ~ /qvalue|qval|q-value/) qval_idx=i + if ((col ~ /fold|fc/) && log2_idx==0) fold_idx=i + } + header_done=1 + next + } else { + chr_idx=1; start_idx=2; end_idx=3; log2_idx=7; qval_idx=9 + proxy_log2=1 + header_done=1 + } + } + { + log2_val="" + if (log2_idx>0 && isnum($log2_idx)) log2_val=$log2_idx+0 + if (log2_val=="" && fold_idx>0 && isnum($fold_idx)) { log2_val=calc_log2($fold_idx); proxy_log2=1 } + if (log2_val=="" && log2_idx==7 && isnum($7)) { log2_val=calc_log2($7); proxy_log2=1 } + fdr_val="" + if (fdr_idx>0) fdr_val=$fdr_idx + else if (qval_idx>0) fdr_val=$qval_idx + else if (NF>=9) fdr_val=$9 + fdr_val=calc_fdr(fdr_val) + if (fdr_val <= fdr_thresh) { + if (log2_val > 0) print $chr_idx,$start_idx,$end_idx >> up_file + else if (log2_val < 0) print $chr_idx,$start_idx,$end_idx >> down_file + } + } + END { + if (proxy_log2) print "proxy_log2" > proxy_file + } + AWK + + awk -v fdr_thresh="${fdr}" -v up_file="span.significant_up.bed" -v down_file="span.significant_down.bed" -v proxy_file="span.proxy_log2.txt" -f span_compare_filter.awk "\$diff_file" + cat span.significant_up.bed span.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > span.significant.bed + cp span.significant.bed span.differential.bed - total=\$(grep -v '^#' "\$diff_file" | wc -l | awk '{print \$1}') + total=\$(awk 'function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } /^#/ || /^track/ || /^browser/ { next } !header_seen { if (!isnum(\$2)) { header_seen=1; next } header_seen=1 } { n++ } END { print n+0 }' "\$diff_file") n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') n_down=\$(wc -l < span.significant_down.bed | awk '{print \$1}') n_fdr_pass=\$((n_up + n_down)) cat <<-END_SUMMARY > span.summary.tsv - method\tgroup\tcaller\ttreated_condition\tcontrol_condition\tn_tested\tn_fdr_pass\tn_up\tn_down - span\t${meta.group}\tNA\t${meta.treated ?: 'NA'}\t${meta.control ?: 'NA'}\t\${total}\t\${n_fdr_pass}\t\${n_up}\t\${n_down} + method\tgroup\tcaller\ttreated_condition\tcontrol_condition\tn_tested\tn_fdr_pass\tn_up\tn_down\tuse_spikein\tspan_mode_used + span\t${meta.group}\tNA\t${meta.treated ?: 'NA'}\t${meta.control ?: 'NA'}\t\${total}\t\${n_fdr_pass}\t\${n_up}\t\${n_down}\tNA\tnative END_SUMMARY + mkdir -p 00_manifests + if [ -z "${treated_manifest}" ]; then + treated_manifest=\$(IFS=';'; echo "\${treated_list[*]}") + else + treated_manifest="${treated_manifest}" + fi + if [ -z "${control_manifest}" ]; then + control_manifest=\$(IFS=';'; echo "\${control_list[*]}") + else + control_manifest="${control_manifest}" + fi + { + printf "group\\tcondition\\tinput_bams\\tinput_mode\\n" + printf "%s\\t%s\\t%s\\t%s\\n" "${meta.group}" "${meta.treated ?: 'NA'}" "\${treated_manifest}" "${input_mode}" + printf "%s\\t%s\\t%s\\t%s\\n" "${meta.group}" "${meta.control ?: 'NA'}" "\${control_manifest}" "${input_mode}" + } > 00_manifests/span_compare_inputs.tsv + + if [ -f span.proxy_log2.txt ]; then + cat <<-END_README > span.readme.txt + SPAN native differential parsing notes: + - No explicit log2FC column detected; log2FC was derived from the signal/fold-change column. + - For positive values, log2FC = log2(value); negative values are treated as already log2-scaled. + - FDR was computed from the Q-value column (column 9) assuming -log10 scale when values > 1. + - Direction follows treated (${meta.treated ?: 'NA'}) vs control (${meta.control ?: 'NA'}). + END_README + fi + rm -f span_compare_filter.awk span.proxy_log2.txt + cat <<-END_VERSIONS > versions.yml "${task.process}": java: \$(\${java_cmd} -version 2>&1 | head -n 1 | sed -e 's/\"//g') diff --git a/modules/local/span_fallback_diff.nf b/modules/local/span_fallback_diff.nf index 2d0dcb41..b972e59d 100644 --- a/modules/local/span_fallback_diff.nf +++ b/modules/local/span_fallback_diff.nf @@ -8,7 +8,7 @@ process SPAN_FALLBACK_DIFF { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::r-yaml" + conda "bioconda::bioconductor-diffbind bioconda::bioconductor-deseq2 bioconda::bioconductor-edger bioconda::bioconductor-genomicranges bioconda::bioconductor-genomicalignments bioconda::bioconductor-rsamtools conda-forge::r-base=4.2.3 conda-forge::r-optparse" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioconductor-diffbind:3.14.0--r42hdfd78af_0' : 'biocontainers/bioconductor-diffbind:3.14.0--r42hdfd78af_0' }" @@ -20,9 +20,6 @@ process SPAN_FALLBACK_DIFF { val group val use_spikein val fdr - val lfc - val min_overlap - val summits val backend output: @@ -33,7 +30,9 @@ process SPAN_FALLBACK_DIFF { tuple val(group), path("span_fallback.significant_down.bed") , emit: down tuple val(group), path("span_fallback.summary.tsv") , emit: summary tuple val(group), path("span_fallback.readme.txt") , emit: readme - tuple val(group), path("plots") , emit: plots + tuple val(group), path("span_fallback.union.bed") , emit: union + tuple val(group), path("span_fallback.counts.tsv") , emit: counts + path "00_manifests/span_fallback_union_manifest.tsv" , emit: manifest path "versions.yml" , emit: versions when: @@ -41,33 +40,36 @@ process SPAN_FALLBACK_DIFF { script: """ - mkdir -p plots - Rscript ${projectDir}/bin/diffbind_run.R \ - --samplesheet ${samplesheet} \ - --treated ${treated} \ - --control ${control} \ - --group ${group} \ - --caller span_fallback \ - --fdr ${fdr} \ - --lfc ${lfc} \ - --min_overlap ${min_overlap} \ - --summits ${summits} \ - --backend ${backend} \ - --use_spikein ${use_spikein} \ + Rscript ${projectDir}/bin/span_fallback_de.R \\ + --samplesheet ${samplesheet} \\ + --treated ${treated} \\ + --control ${control} \\ + --group ${group} \\ + --fdr ${fdr} \\ + --backend ${backend} \\ + --use_spikein ${use_spikein} \\ --prefix span_fallback mv span_fallback.results.tsv span_fallback.differential.tsv cp span_fallback.significant.bed span_fallback.differential.bed + mkdir -p 00_manifests + peaks_list=\$(cat span_fallback.peaks.list 2>/dev/null || true) + cat <<-END_MANIFEST > 00_manifests/span_fallback_union_manifest.tsv + group\tunion_bed\tcount_matrix\tsource_peaks + ${group}\t${params.outdir}/03_peak_calling/06_differential/03_span/${group}/span_fallback.union.bed\t${params.outdir}/03_peak_calling/06_differential/03_span/${group}/span_fallback.counts.tsv\t\${peaks_list} + END_MANIFEST + rm -f span_fallback.peaks.list + cat <<-END_README > span_fallback.readme.txt - SPAN fallback differential: SPAN peaks + DiffBind/DESeq2/edgeR analysis. - This fallback runs DiffBind on SPAN-derived peak sets to approximate differential enrichment - when native OmniPeaks differential mode is unavailable. + SPAN fallback differential: SPAN peaks -> union peaks -> per-sample counts -> ${backend} differential. + Log2FC is treated (${treated}) vs control (${control}). FDR threshold: ${fdr}. + Spike-in scaling applied: ${use_spikein}. END_README cat <<-END_VERSIONS > versions.yml "${task.process}": - diffbind: \$(Rscript -e 'packageVersion("DiffBind")' 2>/dev/null | tr -d '[]') + deseq2: \$(Rscript -e 'packageVersion("DESeq2")' 2>/dev/null | tr -d '[]') r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') END_VERSIONS """ diff --git a/modules/local/span_pooling_manifest.nf b/modules/local/span_pooling_manifest.nf new file mode 100644 index 00000000..a7974e31 --- /dev/null +++ b/modules/local/span_pooling_manifest.nf @@ -0,0 +1,52 @@ +process SPAN_POOLING_MANIFEST { + tag "${group}" + label 'process_single' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/03_span/${group}/00_manifests" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "conda-forge::coreutils=9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + val group + val records + + output: + tuple val(group), path("span_pooling.tsv"), emit: manifest + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def header = "group\tcondition\tpooled_bam\tpooled_bai\tinput_bams\tpooling_reason\tcreated_by" + def lines = records ? records.collect { record -> + [ + record.group, + record.condition, + record.pooled_bam ?: 'NA', + record.pooled_bai ?: 'NA', + record.input_bams ?: '', + record.pooling_reason ?: '', + record.created_by ?: 'native' + ].join('\t') + }.join('\n') : '' + """ + printf "%s\\n" "${header}" > span_pooling.tsv + if [ -n "${lines}" ]; then + printf "%s\\n" "${lines}" >> span_pooling.tsv + fi + + coreutils_version=\$(cat --version | head -n 1 | awk '{print \$NF}') + { + echo "\\"${task.process}\\":" + echo " coreutils: \$coreutils_version" + } > versions.yml + """ +} diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 1a53bce4..bf7a912d 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -13,10 +13,12 @@ include { CHIPBINNER_BINS } from '../../modules/local/chipbinner_bins' include { CHIPBINNER_COUNTS } from '../../modules/local/chipbinner_counts' include { CHIPBINNER_HDBSCAN_GRID } from '../../modules/local/chipbinner_hdbscan_grid' include { CHIPBINNER_ROTS } from '../../modules/local/chipbinner_rots' +include { CHIPBINNER_LOLA } from '../../modules/local/chipbinner_lola' include { SAMTOOLS_MERGE_BAMS } from '../../modules/local/samtools_merge_bams' include { SPAN_CAPABILITY_PROBE } from '../../modules/local/span_capability_probe' include { SPAN_COMPARE } from '../../modules/local/span_compare' include { SPAN_FALLBACK_DIFF } from '../../modules/local/span_fallback_diff' +include { SPAN_POOLING_MANIFEST } from '../../modules/local/span_pooling_manifest' include { ANNOTATE_REGIONS } from '../../modules/local/annotate_regions' include { DIFFERENTIAL_OVERLAP } from '../../modules/local/differential_overlap' include { DIFFERENTIAL_SUMMARY_MERGE } from '../../modules/local/differential_summary' @@ -24,6 +26,7 @@ include { DIFFERENTIAL_SUMMARY_MERGE } from '../../modules/local/differential_su workflow DIFFERENTIAL_PEAK_CALLING { take: ch_bam_bai // tuple [meta, bam, bai] + ch_control_bam_bai // tuple [meta, bam, bai] (controls) ch_peaks // tuple [meta, peaks] ch_bigwig // tuple [meta, bigwig] ch_spikein_scale // tuple [meta, scale] @@ -35,8 +38,10 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_skipped_records = Channel.empty() ch_summary_files = Channel.empty() ch_diffbind_significant = Channel.empty() - ch_chipbinner_significant = Channel.empty() - ch_span_significant = Channel.empty() + ch_chipbinner_up = Channel.empty() + ch_chipbinner_down = Channel.empty() + ch_span_native_significant = Channel.empty() + ch_span_fallback_significant = Channel.empty() ch_chrom_sizes_single = ch_chrom_sizes .map { it instanceof List ? it[0] : it } @@ -52,16 +57,6 @@ workflow DIFFERENTIAL_PEAK_CALLING { def control = contrast_parts[1] def use_spikein_param = params.differential_use_spikein?.toString()?.toLowerCase() - ch_use_spikein = Channel.value('false') - if (use_spikein_param == 'true') { - ch_use_spikein = Channel.value('true') - } else if (use_spikein_param == 'auto') { - ch_use_spikein = ch_spikein_scale - .map { meta, scale -> scale } - .filter { it != null && it.toString() != 'NA' } - .count() - .map { it > 0 ? 'true' : 'false' } - } ch_bam_bai_target = ch_bam_bai.filter { meta, bam, bai -> meta.is_control == false } ch_bigwig_target = ch_bigwig.filter { meta, bw -> meta.is_control == false } @@ -89,17 +84,39 @@ workflow DIFFERENTIAL_PEAK_CALLING { group: meta.group, condition: meta.condition, replicate: meta.replicate, + control_group: meta.control_group ?: meta.group, + control_condition: meta.control_condition ?: 'NA', bam: bam, bai: bai, bam_path: bam.toString(), bai_path: bai.toString(), - normalisation_mode: params.normalisation_mode, + normalisation_mode: meta.normalisation_mode ?: params.normalisation_mode, spikein_scale_factor: scale, bigwig_path: bigwig == null ? 'NA' : bigwig.toString(), meta: meta ] } + def resolve_auto_spikein = { List records -> + def modes = records.collect { it.normalisation_mode ?: '' } + .findAll { it != null && it.toString().trim() } + .collect { it.toString() } + .unique() + def mode_is_spikein = modes.size() == 1 && modes[0].equalsIgnoreCase('Spikein') + def scales = records.collect { it.spikein_scale_factor } + def scales_present = scales && scales.every { it != null && it.toString() != 'NA' && it.toString() != '' } + return (mode_is_spikein && scales_present) ? 'true' : 'false' + } + + ch_use_spikein = Channel.value('false') + if (use_spikein_param == 'true') { + ch_use_spikein = Channel.value('true') + } else if (use_spikein_param == 'auto') { + ch_use_spikein = ch_samples + .toList() + .map { records -> resolve_auto_spikein(records) } + } + // Write manifests in integrated mode if (!params.differential_from_run) { ch_samples_manifest = ch_samples @@ -109,6 +126,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { group: rec.group, condition: rec.condition, replicate: rec.replicate, + control_group: rec.control_group ?: 'NA', + control_condition: rec.control_condition ?: 'NA', final_bam: rec.bam_path, final_bai: rec.bai_path, normalisation_mode: rec.normalisation_mode, @@ -314,7 +333,44 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_versions = ch_versions.mix(CHIPBINNER_BINS.out.versions) - ch_chipbinner_samples = ch_valid_groups.map { rec -> + ch_control_map = Channel.value([:]) + if (params.chipbinner_use_input) { + ch_control_pool_inputs = ch_control_bam_bai + .map { meta, bam, bai -> + def control_group = meta.control_group ?: meta.group + ["${control_group}__${meta.condition}", [meta, bam]] + } + .groupTuple(by: [0]) + .map { key, entries -> + def meta0 = entries[0][0] + def control_group = meta0.control_group ?: meta0.group + def meta_out = [ + id: "${control_group}.${meta0.condition}.control", + group: control_group, + condition: meta0.condition, + role: "control_${meta0.condition}" + ] + [meta_out, entries.collect { it[1] }] + } + + SAMTOOLS_MERGE_BAMS(ch_control_pool_inputs) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE_BAMS.out.versions) + + ch_control_map = SAMTOOLS_MERGE_BAMS.out.merged + .map { meta, bam, bai -> [meta.group, meta.condition, bam, bai] } + .toList() + .map { list -> + def map = [:].withDefault { [:] } + list.each { entry -> + def group = entry[0] + def condition = entry[1] + map[group][condition] = [entry[2], entry[3]] + } + map + } + } + + ch_chipbinner_samples = ch_valid_groups.combine(ch_control_map).map { rec, control_map -> def ordered = rec.records.sort { a, b -> a.condition <=> b.condition ?: a.replicate <=> b.replicate ?: a.sample_id <=> b.sample_id } @@ -328,46 +384,109 @@ workflow DIFFERENTIAL_PEAK_CALLING { } def bams = ordered.collect { row -> row.bam } def bais = ordered.collect { row -> row.bai } - [rec.group, JsonOutput.toJson(samples), bams, bais] + + def control_group = ordered ? (ordered[0].meta?.control_group ?: rec.group) : rec.group + def control_by_group = control_map.get(control_group, [:]) ?: [:] + def required_conditions = [treated, control] + def missing_controls = required_conditions.findAll { !control_by_group.containsKey(it) } + def use_input_group = params.chipbinner_use_input && missing_controls.isEmpty() + if (params.chipbinner_use_input && !use_input_group) { + def message = "ChIPBinner input normalization requested but controls missing for group '${rec.group}' conditions: ${missing_controls.join(',')}" + if (params.differential_strict) { + exit 1, message + } else { + log.warn message + } + } + + def control_bams = use_input_group ? required_conditions.collect { control_by_group[it][0] } : bams + def control_bais = use_input_group ? required_conditions.collect { control_by_group[it][1] } : bais + def control_conditions_json = use_input_group ? JsonOutput.toJson(required_conditions) : JsonOutput.toJson([]) + + [rec.group, JsonOutput.toJson(samples), bams, bais, control_bams, control_bais, control_conditions_json, use_input_group] } ch_chipbinner_inputs = CHIPBINNER_BINS.out.bins.join(ch_chipbinner_samples) - .map { group, bins_file, samples_json, bams, bais -> [bins_file, samples_json, bams, bais, group] } + .map { group, bins_file, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group -> + [bins_file, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group] + } CHIPBINNER_COUNTS( - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bins }, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> samples_json }, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bams }, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> bais }, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bins }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bams }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> bais }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_bams }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_bais }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> control_conditions_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> use_input_group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, ch_use_spikein, - Channel.value(params.chipbinner_pseudocount) + Channel.value(params.chipbinner_pseudocount), + Channel.value(params.chipbinner_ms_coeffs ? file(params.chipbinner_ms_coeffs).toString() : '') ) ch_versions = ch_versions.mix(CHIPBINNER_COUNTS.out.versions) CHIPBINNER_HDBSCAN_GRID( CHIPBINNER_COUNTS.out.normalized, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, Channel.value(params.chipbinner_hdbscan_grid_min_cluster_size), Channel.value(params.chipbinner_hdbscan_grid_min_samples) ) ch_versions = ch_versions.mix(CHIPBINNER_HDBSCAN_GRID.out.versions) + def lola_db_path = params.chipbinner_lola_db ? file(params.chipbinner_lola_db) : null + def lola_db_exists = lola_db_path && lola_db_path.exists() + def lola_should_run = params.chipbinner_run_lola && lola_db_exists + if (params.chipbinner_run_lola && !lola_db_exists) { + def message = "ChIPBinner LOLA requested but --chipbinner_lola_db not found: ${params.chipbinner_lola_db}" + if (params.differential_strict) { + exit 1, message + } else { + log.warn message + } + lola_should_run = false + } + CHIPBINNER_ROTS( CHIPBINNER_COUNTS.out.normalized, CHIPBINNER_HDBSCAN_GRID.out.clusters, - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> samples_json }, + CHIPBINNER_HDBSCAN_GRID.out.grid_summary, + CHIPBINNER_COUNTS.out.norm_info, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, Channel.value(treated), Channel.value(control), - ch_chipbinner_inputs.map { bins, samples_json, bams, bais, group -> group }, + ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> group }, Channel.value(params.chipbinner_fdr), Channel.value(params.chipbinner_lfc), Channel.value(params.chipbinner_bootstrap), - Channel.value(params.chipbinner_k_value) + Channel.value(params.chipbinner_k_value), + Channel.value(lola_should_run) ) ch_versions = ch_versions.mix(CHIPBINNER_ROTS.out.versions) ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) - ch_chipbinner_significant = CHIPBINNER_ROTS.out.significant + ch_chipbinner_up = CHIPBINNER_ROTS.out.up + ch_chipbinner_down = CHIPBINNER_ROTS.out.down + + if (lola_should_run) { + ch_lola_beds = Channel.empty() + ch_lola_beds = ch_lola_beds.mix( + CHIPBINNER_ROTS.out.control_enriched.map { group, file -> [group, 'control_enriched', file] }, + CHIPBINNER_ROTS.out.treated_enriched.map { group, file -> [group, 'treated_enriched', file] }, + CHIPBINNER_ROTS.out.stable.map { group, file -> [group, 'stable', file] } + ) + + ch_lola_inputs = CHIPBINNER_BINS.out.bins.join(ch_lola_beds) + .map { group, bins_file, label, bed_file -> [group, label, bed_file, bins_file] } + + CHIPBINNER_LOLA( + ch_lola_inputs.map { group, label, bed_file, bins_file -> [group, label, bed_file] }, + ch_lola_inputs.map { group, label, bed_file, bins_file -> bins_file }, + Channel.value(lola_db_path) + ) + ch_versions = ch_versions.mix(CHIPBINNER_LOLA.out.versions) + } if (params.differential_annotate) { ch_chipbinner_annotate = CHIPBINNER_ROTS.out.results.map { group, file -> @@ -385,33 +504,110 @@ workflow DIFFERENTIAL_PEAK_CALLING { // SPAN differential per group if (params.run_span_diff) { - if (!params.omnipeaks_jar) { - exit 1, "--omnipeaks_jar is required when --run_span_diff is enabled" + def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' + def span_has_jar = params.omnipeaks_jar ? true : false + + if (span_mode == 'native' && !span_has_jar) { + exit 1, "--omnipeaks_jar is required for --span_diff_mode native" + } + if (span_mode == 'auto' && !span_has_jar) { + log.warn "SPAN diff auto mode: --omnipeaks_jar not provided; using explicit fallback when SPAN peaks are available." } - SPAN_CAPABILITY_PROBE(Channel.value(file(params.omnipeaks_jar))) - ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) - def span_mode = params.span_diff_mode?.toString()?.toLowerCase() ?: 'auto' - def span_use_native = span_mode == 'native' || span_mode == 'auto' + def span_use_native = span_mode == 'native' || (span_mode == 'auto' && span_has_jar) def span_use_fallback = span_mode == 'fallback' || span_mode == 'auto' - ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities - .map { file -> new JsonSlurper().parse(file) } - ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } + ch_span_has_compare = Channel.value(false) + ch_span_multibam = Channel.value(false) + ch_span_multibam_mode = Channel.value('single') + ch_span_compare_cmd = Channel.value('compare') + + if (span_has_jar) { + SPAN_CAPABILITY_PROBE(Channel.value(file(params.omnipeaks_jar))) + ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) + + ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities + .map { file -> new JsonSlurper().parse(file) } + ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } + ch_span_multibam = ch_span_caps.map { it.compare_multi_bam ?: false } + ch_span_multibam_mode = ch_span_caps.map { it.compare_multi_bam_mode ?: 'single' } + ch_span_compare_cmd = ch_span_caps.map { it.compare_command ?: 'compare' } + } + + if (span_use_native && span_has_jar) { + ch_span_native_groups = ch_valid_groups + if (span_mode == 'auto') { + ch_span_native_groups = ch_valid_groups.combine(ch_span_has_compare) + .filter { rec, has_compare -> has_compare } + .map { rec, has_compare -> rec } + } - if (span_use_native) { - ch_span_merge_inputs = ch_valid_groups.flatMap { rec -> + ch_span_direct_groups = ch_span_native_groups.combine(ch_span_multibam) + .filter { rec, multibam -> multibam } + .map { rec, multibam -> rec } + + ch_span_pooled_groups = ch_span_native_groups.combine(ch_span_multibam) + .filter { rec, multibam -> !multibam } + .map { rec, multibam -> rec } + + ch_span_compare_direct = ch_span_direct_groups.map { rec -> + def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } + def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } + def meta = [ + id: rec.group, + group: rec.group, + treated: treated, + control: control, + input_mode: 'direct', + treated_bams: treated_bams.collect { it.toString() }, + control_bams: control_bams.collect { it.toString() } + ] + [meta, treated_bams, control_bams] + } + + ch_span_pool_inputs = ch_span_pooled_groups.flatMap { rec -> def treated_bams = rec.records.findAll { it.condition == treated }.collect { it.bam } def control_bams = rec.records.findAll { it.condition == control }.collect { it.bam } [ - [[group: rec.group, role: 'treated', id: "${rec.group}.treated"], treated_bams], - [[group: rec.group, role: 'control', id: "${rec.group}.control"], control_bams] + [key: "${rec.group}__treated", group: rec.group, role: 'treated', condition: treated, bams: treated_bams], + [key: "${rec.group}__control", group: rec.group, role: 'control', condition: control, bams: control_bams] ] } + ch_span_merge_inputs = ch_span_pool_inputs.map { rec -> + [[group: rec.group, role: rec.role, id: "${rec.group}.${rec.role}"], rec.bams] + } SAMTOOLS_MERGE_BAMS(ch_span_merge_inputs) ch_versions = ch_versions.mix(SAMTOOLS_MERGE_BAMS.out.versions) + ch_span_pool_inputs_keyed = ch_span_pool_inputs.map { rec -> [rec.key, rec] } + ch_span_pooled_outputs = SAMTOOLS_MERGE_BAMS.out.merged.map { meta, bam, bai -> + ["${meta.group}__${meta.role}", meta, bam, bai] + } + ch_span_pool_manifest_records = ch_span_pooled_outputs.join(ch_span_pool_inputs_keyed) + .map { key, meta, bam, bai, rec -> + [ + group: meta.group, + condition: rec.condition, + pooled_bam: bam.toString(), + pooled_bai: bai.toString(), + input_bams: rec.bams.collect { it.toString() }.join(';'), + pooling_reason: 'omnipeaks_no_multibam_support', + created_by: 'native' + ] + } + + ch_span_pool_manifest_grouped = ch_span_pool_manifest_records + .map { rec -> [rec.group, rec] } + .groupTuple() + .map { group, recs -> [group, recs] } + + SPAN_POOLING_MANIFEST( + ch_span_pool_manifest_grouped.map { group, recs -> group }, + ch_span_pool_manifest_grouped.map { group, recs -> recs } + ) + ch_versions = ch_versions.mix(SPAN_POOLING_MANIFEST.out.versions) + ch_span_merged_treated = SAMTOOLS_MERGE_BAMS.out.merged .filter { meta, bam, bai -> meta.role == 'treated' } .map { meta, bam, bai -> [meta.group, bam] } @@ -420,31 +616,38 @@ workflow DIFFERENTIAL_PEAK_CALLING { .filter { meta, bam, bai -> meta.role == 'control' } .map { meta, bam, bai -> [meta.group, bam] } - ch_span_compare_inputs = ch_span_merged_treated.join(ch_span_merged_control) + ch_span_compare_pooled = ch_span_merged_treated.join(ch_span_merged_control) .map { group, treated_bam, control_bam -> - def meta = [id: group, group: group, treated: treated, control: control] + def meta = [ + id: group, + group: group, + treated: treated, + control: control, + input_mode: 'pooled', + treated_bams: [treated_bam.toString()], + control_bams: [control_bam.toString()] + ] [meta, treated_bam, control_bam] } - ch_span_native_groups = ch_span_compare_inputs - if (span_mode == 'auto') { - ch_span_native_groups = ch_span_compare_inputs.combine(ch_span_has_compare) - .filter { meta, treated_bam, control_bam, has_compare -> has_compare } - .map { meta, treated_bam, control_bam, has_compare -> [meta, treated_bam, control_bam] } - } + ch_span_compare_inputs = ch_span_compare_direct.mix(ch_span_compare_pooled) SPAN_COMPARE( - ch_span_native_groups, + ch_span_compare_inputs, ch_chrom_sizes_single, Channel.value(file(params.omnipeaks_jar)), Channel.value(params.span_diff_gap), Channel.value(params.span_diff_bin), Channel.value(params.span_diff_fdr), - Channel.value(params.span_diff_java_heap) + Channel.value(params.span_diff_java_heap), + ch_span_compare_cmd, + ch_span_multibam_mode ) ch_versions = ch_versions.mix(SPAN_COMPARE.out.versions) ch_summary_files = ch_summary_files.mix(SPAN_COMPARE.out.summary.map { meta, file -> file }) - ch_span_significant = ch_span_significant.mix(SPAN_COMPARE.out.significant.map { meta, file -> [meta.group, file] }) + ch_span_native_significant = ch_span_native_significant.mix( + SPAN_COMPARE.out.significant.map { meta, file -> [group: meta.group, path: file.toString(), span_mode: 'native'] } + ) if (params.differential_annotate) { ch_span_annotate = SPAN_COMPARE.out.tsv.map { meta, file -> @@ -464,14 +667,21 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (span_mode == 'fallback') { ch_span_fallback_groups = ch_valid_groups } else if (span_mode == 'auto') { - ch_span_fallback_groups = ch_valid_groups.combine(ch_span_has_compare) - .filter { rec, has_compare -> !has_compare } - .map { rec, has_compare -> rec } + if (!span_has_jar) { + ch_span_fallback_groups = ch_valid_groups + } else { + ch_span_fallback_groups = ch_valid_groups.combine(ch_span_has_compare) + .filter { rec, has_compare -> !has_compare } + .map { rec, has_compare -> rec } + } } if (span_use_fallback) { def span_callers = callers.findAll { it.startsWith('span_') } if (!span_callers) { + if (span_mode == 'auto' && !span_has_jar) { + exit 1, "SPAN diff auto mode requested but --omnipeaks_jar missing and no SPAN peaks are available for fallback." + } ch_skipped_records = ch_skipped_records.mix( ch_span_fallback_groups.map { rec -> [method: 'span', group: rec.group, caller: 'NA', reason: 'no_span_peaks', details: 'SPAN caller not in --peakcaller'] } ) @@ -523,14 +733,13 @@ workflow DIFFERENTIAL_PEAK_CALLING { MAKE_SPAN_SAMPLESHEET.out.samplesheet.map { group, caller, sheet -> group }, ch_use_spikein, Channel.value(params.span_diff_fdr), - Channel.value(params.diffbind_lfc), - Channel.value(params.diffbind_min_overlap), - Channel.value(params.diffbind_summits), Channel.value(params.diffbind_backend) ) ch_versions = ch_versions.mix(SPAN_FALLBACK_DIFF.out.versions) ch_summary_files = ch_summary_files.mix(SPAN_FALLBACK_DIFF.out.summary.map { group, file -> file }) - ch_span_significant = ch_span_significant.mix(SPAN_FALLBACK_DIFF.out.significant.map { group, file -> [group, file] }) + ch_span_fallback_significant = ch_span_fallback_significant.mix( + SPAN_FALLBACK_DIFF.out.significant.map { group, file -> [group: group, path: file.toString(), span_mode: 'fallback'] } + ) if (params.differential_annotate) { ch_span_fallback_annotate = SPAN_FALLBACK_DIFF.out.results.map { group, file -> @@ -549,20 +758,38 @@ workflow DIFFERENTIAL_PEAK_CALLING { } if (params.differential_cross_compare) { - def primary_caller = callers ? callers[0] : null diffbind_beds = ch_diffbind_significant.map { group, caller, file -> [group: group, caller: caller, path: file.toString()] } + ch_span_effective = ch_span_native_significant + .mix(ch_span_fallback_significant) + .map { rec -> [rec.group, rec] } + .groupTuple() + .map { group, recs -> + def native = recs.find { it.span_mode == 'native' } + native ?: recs[0] + } + method_beds = Channel.empty() - if (primary_caller) { - method_beds = method_beds.mix( - ch_diffbind_significant - .filter { group, caller, file -> caller == primary_caller } - .map { group, caller, file -> [group: group, method: 'diffbind', caller: caller, path: file.toString()] } - ) - } - method_beds = method_beds.mix(ch_chipbinner_significant.map { group, file -> [group: group, method: 'chipbinner', caller: 'NA', path: file.toString()] }) - method_beds = method_beds.mix(ch_span_significant.map { group, file -> [group: group, method: 'span', caller: 'NA', path: file.toString()] }) + method_beds = method_beds.mix( + ch_diffbind_significant + .map { group, caller, file -> + [group: group, method: 'diffbind', caller: caller, path: file.toString(), span_mode_used: 'NA'] + } + ) + method_beds = method_beds.mix( + ch_chipbinner_up.map { group, file -> + [group: group, method: 'chipbinner_treated_enriched', caller: 'NA', path: file.toString(), span_mode_used: 'NA'] + }, + ch_chipbinner_down.map { group, file -> + [group: group, method: 'chipbinner_control_enriched', caller: 'NA', path: file.toString(), span_mode_used: 'NA'] + } + ) + method_beds = method_beds.mix( + ch_span_effective.map { rec -> + [group: rec.group, method: 'span', caller: 'NA', path: rec.path, span_mode_used: rec.span_mode] + } + ) callers_tsv = diffbind_beds .map { rec -> [rec.group, rec.caller, rec.path].join('\t') } @@ -570,7 +797,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { .map { lines -> lines.join('\n') + '\n' } .collectFile(name: 'overlap_callers.tsv') methods_tsv = method_beds - .map { rec -> [rec.group, rec.method, rec.caller ?: 'NA', rec.path].join('\t') } + .map { rec -> [rec.group, rec.method, rec.caller ?: 'NA', rec.path, rec.span_mode_used ?: 'NA'].join('\t') } .toSortedList() .map { lines -> lines.join('\n') + '\n' } .collectFile(name: 'overlap_methods.tsv') From 3463f2df5158297eaf883aa22a50f6212cf1380d Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sun, 4 Jan 2026 23:12:08 -0800 Subject: [PATCH 16/31] Fix differential overlap and spike-in reporting --- bin/chipbinner_lola.R | 68 +++++++ bin/chipbinner_rots.R | 89 ++++++++- bin/diffbind_run.R | 16 +- bin/hdbscan_grid.py | 145 ++++++++++++-- modules/local/chipbinner_counts.nf | 182 ++++++++++++++++-- modules/local/chipbinner_hdbscan_grid.nf | 20 +- modules/local/chipbinner_lola.nf | 44 +++++ modules/local/chipbinner_rots.nf | 26 ++- modules/local/differential_overlap.nf | 54 ++++-- modules/local/differential_summary.nf | 13 +- modules/local/write_differential_manifests.nf | 4 + workflows/cutandrun.nf | 15 +- workflows/differential_only.nf | 37 +++- 13 files changed, 646 insertions(+), 67 deletions(-) create mode 100644 bin/chipbinner_lola.R create mode 100644 modules/local/chipbinner_lola.nf diff --git a/bin/chipbinner_lola.R b/bin/chipbinner_lola.R new file mode 100644 index 00000000..931bbce4 --- /dev/null +++ b/bin/chipbinner_lola.R @@ -0,0 +1,68 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) + library(LOLA) + library(GenomicRanges) + library(rtracklayer) + library(ggplot2) +}) + +option_list <- list( + make_option(c("--bed"), type = "character"), + make_option(c("--universe"), type = "character"), + make_option(c("--db"), type = "character"), + make_option(c("--label"), type = "character"), + make_option(c("--out_tsv"), type = "character"), + make_option(c("--out_pdf"), type = "character", default = "") +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +if (!dir.exists(opt$db)) { + stop(paste("LOLA database not found:", opt$db)) +} + +bed_lines <- readLines(opt$bed, warn = FALSE) +bed_lines <- bed_lines[!grepl("^#", bed_lines)] +if (length(bed_lines) == 0) { + write.table(data.frame(), opt$out_tsv, sep = "\t", quote = FALSE, row.names = FALSE) + if (opt$out_pdf != "") { + pdf(opt$out_pdf) + plot.new() + text(0.5, 0.5, "No regions available for enrichment") + dev.off() + } + quit(status = 0) +} + +user_set <- rtracklayer::import(opt$bed, format = "bed") +universe <- rtracklayer::import(opt$universe, format = "bed") +region_db <- LOLA::loadRegionDB(opt$db) + +user_sets <- GRangesList() +user_sets[[opt$label]] <- user_set + +res <- LOLA::runLOLA(user_sets, universe, region_db) +if (!is.null(res) && nrow(res) > 0) { + res <- res[res$userSet == opt$label, , drop = FALSE] +} + +write.table(res, opt$out_tsv, sep = "\t", quote = FALSE, row.names = FALSE) + +if (opt$out_pdf != "" && !is.null(res) && nrow(res) > 0) { + top <- res[order(res$pValue), , drop = FALSE] + if (nrow(top) > 20) { + top <- top[1:20, , drop = FALSE] + } + top$neglog10p <- -log10(top$pValue) + top$label <- if ("description" %in% colnames(top)) top$description else top$filename + pdf(opt$out_pdf) + ggplot(top, aes(x = reorder(label, neglog10p), y = neglog10p)) + + geom_col(fill = "#4C72B0") + + coord_flip() + + xlab("Region set") + + ylab("-log10(p-value)") + + ggtitle(paste("LOLA enrichment:", opt$label)) + dev.off() +} diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index d6adbbbe..e5d5f1a5 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -8,6 +8,8 @@ suppressPackageStartupMessages({ option_list <- list( make_option(c("--matrix"), type = "character"), make_option(c("--clusters"), type = "character"), + make_option(c("--grid_summary"), type = "character"), + make_option(c("--norm_info"), type = "character"), make_option(c("--samplesheet"), type = "character"), make_option(c("--treated"), type = "character"), make_option(c("--control"), type = "character"), @@ -15,7 +17,8 @@ option_list <- list( make_option(c("--fdr"), type = "double", default = 0.05), make_option(c("--lfc"), type = "double", default = 1.0), make_option(c("--bootstrap"), type = "integer", default = 1000), - make_option(c("--k_value"), type = "integer", default = 100000) + make_option(c("--k_value"), type = "integer", default = 100000), + make_option(c("--lola_run"), type = "character", default = "false") ) opt <- parse_args(OptionParser(option_list = option_list)) @@ -66,10 +69,42 @@ results <- data.frame( FDR = fdr ) +cluster_ids <- rep(NA, nrow(results)) if (all(c("chr", "start", "end", "cluster") %in% colnames(clusters_df))) { clusters_key <- paste(clusters_df$chr, clusters_df$start, clusters_df$end, sep = ":") - results$cluster <- clusters_df$cluster[match(paste(results$chr, results$start, results$end, sep = ":"), clusters_key)] + cluster_ids <- clusters_df$cluster[match(paste(results$chr, results$start, results$end, sep = ":"), clusters_key)] } +results$cluster_id <- cluster_ids + +cluster_labels <- rep("stable", length(cluster_ids)) +if (!all(is.na(cluster_ids))) { + treated_idx <- conditions == opt$treated + control_idx <- conditions == opt$control + for (cid in unique(cluster_ids)) { + if (is.na(cid)) { + next + } + if (cid == -1) { + cluster_labels[cluster_ids == cid] <- "noise" + next + } + cluster_rows <- cluster_ids == cid + treated_mean <- mean(rowMeans(counts[cluster_rows, treated_idx, drop = FALSE])) + control_mean <- mean(rowMeans(counts[cluster_rows, control_idx, drop = FALSE])) + if (is.nan(treated_mean)) treated_mean <- 0 + if (is.nan(control_mean)) control_mean <- 0 + log2fc_cluster <- log2((treated_mean + 1e-6) / (control_mean + 1e-6)) + if (abs(log2fc_cluster) < opt$lfc) { + label <- "stable" + } else if (log2fc_cluster > 0) { + label <- "treated_enriched" + } else { + label <- "control_enriched" + } + cluster_labels[cluster_ids == cid] <- label + } +} +results$cluster_label <- cluster_labels write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) @@ -80,16 +115,59 @@ down <- sig[sig$log2FC < 0, , drop = FALSE] write.table(up[, c("chr", "start", "end")], "chipbinner.significant_up.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) write.table(down[, c("chr", "start", "end")], "chipbinner.significant_down.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(results[results$cluster_label == "control_enriched", c("chr", "start", "end")], + "chipbinner.control_enriched.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(results[results$cluster_label == "treated_enriched", c("chr", "start", "end")], + "chipbinner.treated_enriched.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(results[results$cluster_label == "stable", c("chr", "start", "end")], + "chipbinner.stable.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(results[results$cluster_label == "noise", c("chr", "start", "end")], + "chipbinner.noise.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + +selected_row <- NULL +if (!is.null(opt$grid_summary) && file.exists(opt$grid_summary)) { + grid <- read.table(opt$grid_summary, header = TRUE, sep = "\t", check.names = FALSE) + if ("selected" %in% colnames(grid)) { + selected_row <- grid[grid$selected == TRUE | grid$selected == "True" | grid$selected == "true", , drop = FALSE] + if (nrow(selected_row) > 0) { + selected_row <- selected_row[1, , drop = FALSE] + } + } +} + +norm_info <- list(use_input = FALSE, use_ms_scaling = FALSE) +if (!is.null(opt$norm_info) && file.exists(opt$norm_info)) { + try({ + norm_info <- jsonlite::fromJSON(opt$norm_info) + }, silent = TRUE) +} + +use_spikein_flag <- ifelse(is.null(norm_info$use_spikein_scaling), "false", + ifelse(norm_info$use_spikein_scaling, "true", "false")) + +n_clusters <- length(unique(cluster_ids[!is.na(cluster_ids) & cluster_ids != -1])) +min_cluster_size <- if (!is.null(selected_row) && "min_cluster_size" %in% colnames(selected_row)) selected_row$min_cluster_size else NA +min_samples <- if (!is.null(selected_row) && "min_samples" %in% colnames(selected_row)) selected_row$min_samples else NA + summary <- data.frame( method = "chipbinner", group = opt$group, caller = "NA", treated_condition = opt$treated, control_condition = opt$control, + n_bins_tested = nrow(results), n_tested = nrow(results), n_fdr_pass = nrow(sig), n_up = nrow(up), - n_down = nrow(down) + n_down = nrow(down), + n_clusters = n_clusters, + min_cluster_size = min_cluster_size, + min_samples = min_samples, + input_normalization_used = ifelse(is.null(norm_info$use_input), FALSE, norm_info$use_input), + ms_scaling_used = ifelse(is.null(norm_info$use_ms_scaling), FALSE, norm_info$use_ms_scaling), + use_spikein = use_spikein_flag, + span_mode_used = "NA", + lola_run = tolower(opt$lola_run) %in% c("true", "1", "yes") ) write.table(summary, "chipbinner.summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE) @@ -114,7 +192,10 @@ try({ treated_means <- rowMeans(counts[, conditions == opt$treated, drop = FALSE]) control_means <- rowMeans(counts[, conditions == opt$control, drop = FALSE]) pdf("plots/density_scatter.pdf") - plot(control_means, treated_means, pch = 16, cex = 0.5, + cluster_factor <- as.factor(cluster_labels) + cols <- as.numeric(cluster_factor) + plot(control_means, treated_means, pch = 16, cex = 0.5, col = cols, xlab = "Control mean", ylab = "Treated mean", main = "Density scatter") + legend("topleft", legend = levels(cluster_factor), col = seq_along(levels(cluster_factor)), pch = 16, cex = 0.6) dev.off() }, silent = TRUE) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index 2fe7261f..7d895735 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -16,6 +16,8 @@ option_list <- list( make_option(c("--summits"), type = "integer", default = 0), make_option(c("--backend"), type = "character", default = "DESeq2"), make_option(c("--use_spikein"), type = "character", default = "false"), + make_option(c("--summary_method"), type = "character", default = "diffbind"), + make_option(c("--span_mode_used"), type = "character", default = "NA"), make_option(c("--prefix"), type = "character", default = "diffbind"), make_option(c("--extra_params"), type = "character", default = NULL) ) @@ -195,8 +197,9 @@ if (use_spikein) { n_fdr <- nrow(sig) n_up <- nrow(up) n_down <- nrow(down) + use_spikein_flag <- ifelse(use_spikein, "true", "false") summary <- data.frame( - method = "diffbind", + method = opt$summary_method, group = opt$group, caller = opt$caller, treated_condition = opt$treated, @@ -204,7 +207,9 @@ if (use_spikein) { n_tested = n_tested, n_fdr_pass = n_fdr, n_up = n_up, - n_down = n_down + n_down = n_down, + use_spikein = use_spikein_flag, + span_mode_used = opt$span_mode_used ) write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) @@ -301,8 +306,9 @@ if (!use_spikein) { n_fdr <- nrow(sig) n_up <- nrow(up) n_down <- nrow(down) + use_spikein_flag <- ifelse(use_spikein, "true", "false") summary <- data.frame( - method = "diffbind", + method = opt$summary_method, group = opt$group, caller = opt$caller, treated_condition = opt$treated, @@ -310,7 +316,9 @@ if (!use_spikein) { n_tested = n_tested, n_fdr_pass = n_fdr, n_up = n_up, - n_down = n_down + n_down = n_down, + use_spikein = use_spikein_flag, + span_mode_used = opt$span_mode_used ) write.table(summary, paste0(opt$prefix, ".summary.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py index b8917330..af4e1126 100755 --- a/bin/hdbscan_grid.py +++ b/bin/hdbscan_grid.py @@ -23,7 +23,56 @@ def load_matrix(path): for row in rows: data.append([float(x) for x in row[3:]]) matrix = np.array(data) if data else np.empty((0, len(header) - 3)) - return header[:3], coords, matrix + return header[:3], header[3:], coords, matrix + + +def load_samplesheet(path, sample_ids): + if not path: + return [] + conditions = {} + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + for row in reader: + sample_id = row.get("sample_id") + condition = row.get("condition") + if not sample_id: + continue + conditions[sample_id] = condition or "" + missing = [sid for sid in sample_ids if sid not in conditions] + if missing: + raise SystemExit(f"Samplesheet missing condition for: {', '.join(missing)}") + return [conditions[sid] for sid in sample_ids] + + +def replicate_consistency(matrix, labels, conditions): + if matrix.size == 0 or not conditions: + return 0.0 + scores = [] + unique_conditions = sorted(set(conditions)) + for cluster_id in sorted(set(labels)): + if cluster_id == -1: + continue + cluster_rows = matrix[labels == cluster_id, :] + if cluster_rows.shape[0] < 2: + continue + for condition in unique_conditions: + idx = [i for i, cond in enumerate(conditions) if cond == condition] + if len(idx) < 2: + continue + data = cluster_rows[:, idx] + try: + corr = np.corrcoef(data, rowvar=False) + except Exception: + continue + if corr.size <= 1: + continue + tri = corr[np.triu_indices_from(corr, k=1)] + tri = tri[~np.isnan(tri)] + if tri.size: + scores.append(float(np.mean(tri))) + if not scores: + return 0.0 + return float(np.mean(scores)) def write_summary(path, rows): @@ -31,9 +80,13 @@ def write_summary(path, rows): "min_cluster_size", "min_samples", "n_clusters", - "fraction_assigned", + "frac_assigned_non_noise", "mean_persistence", + "replicate_consistency", + "score_total", "selected", + "selected_2clusters", + "selected_3clusters", ] with open(path, "w", newline="") as handle: writer = csv.DictWriter(handle, delimiter="\t", fieldnames=fieldnames, lineterminator="\n") @@ -50,16 +103,27 @@ def write_clusters(path, coord_header, coords, labels): writer.writerow(row + [label]) +def write_not_found(path, coord_header, message): + with open(path, "w", newline="") as handle: + handle.write(f"# {message}\n") + writer = csv.writer(handle, delimiter="\t", lineterminator="\n") + writer.writerow(coord_header + ["cluster"]) + + def main(): parser = argparse.ArgumentParser() parser.add_argument("--matrix", required=True) parser.add_argument("--min_cluster_size", required=True) parser.add_argument("--min_samples", required=True) parser.add_argument("--summary", required=True) - parser.add_argument("--clusters", required=True) + parser.add_argument("--clusters_best", required=True) + parser.add_argument("--clusters_2", required=True) + parser.add_argument("--clusters_3", required=True) + parser.add_argument("--samplesheet", default="") args = parser.parse_args() - coord_header, coords, matrix = load_matrix(args.matrix) + coord_header, sample_ids, coords, matrix = load_matrix(args.matrix) + conditions = load_samplesheet(args.samplesheet, sample_ids) if args.samplesheet else [] try: import hdbscan @@ -70,51 +134,90 @@ def main(): "min_cluster_size": 0, "min_samples": 0, "n_clusters": 1, - "fraction_assigned": 1.0, + "frac_assigned_non_noise": 1.0, "mean_persistence": 1.0, + "replicate_consistency": 0.0, + "score_total": 1.0, "selected": True, + "selected_2clusters": False, + "selected_3clusters": False, } ] write_summary(args.summary, summary_rows) labels = [0 for _ in coords] - write_clusters(args.clusters, coord_header, coords, labels) + write_clusters(args.clusters_best, coord_header, coords, labels) + write_not_found(args.clusters_2, coord_header, "No 2-cluster solution found (HDBSCAN unavailable)") + write_not_found(args.clusters_3, coord_header, "No 3-cluster solution found (HDBSCAN unavailable)") return min_cluster_sizes = parse_list(args.min_cluster_size) min_samples_list = parse_list(args.min_samples) results = [] - best = None - best_score = None - best_labels = None + labels_by_params = {} for mcs, ms in itertools.product(min_cluster_sizes, min_samples_list): clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms) - labels = clusterer.fit_predict(matrix) + labels = clusterer.fit_predict(matrix) if matrix.size else np.array([]) n_clusters = len(set(labels)) - (1 if -1 in labels else 0) fraction_assigned = float(np.sum(labels >= 0)) / float(len(labels)) if len(labels) else 0.0 mean_persistence = float(np.mean(clusterer.cluster_persistence_)) if getattr(clusterer, 'cluster_persistence_', None) is not None and len(clusterer.cluster_persistence_) > 0 else 0.0 - score = (fraction_assigned, n_clusters, mean_persistence) - if best_score is None or score > best_score: - best_score = score - best = (mcs, ms) - best_labels = labels + rep_consistency = replicate_consistency(matrix, labels, conditions) if len(labels) else 0.0 + score_total = float(fraction_assigned + mean_persistence + rep_consistency) + labels_by_params[(mcs, ms)] = labels results.append({ "min_cluster_size": mcs, "min_samples": ms, "n_clusters": n_clusters, - "fraction_assigned": round(fraction_assigned, 4), + "frac_assigned_non_noise": round(fraction_assigned, 4), "mean_persistence": round(mean_persistence, 4), + "replicate_consistency": round(rep_consistency, 4), + "score_total": round(score_total, 4), "selected": False, + "selected_2clusters": False, + "selected_3clusters": False, }) + best = max(results, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + best_2 = None + best_3 = None + candidates_2 = [row for row in results if row["n_clusters"] == 2] + candidates_3 = [row for row in results if row["n_clusters"] == 3] + if candidates_2: + best_2 = max(candidates_2, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + if candidates_3: + best_3 = max(candidates_3, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + for row in results: - if best is not None and row["min_cluster_size"] == best[0] and row["min_samples"] == best[1]: + if row is best: row["selected"] = True - write_summary(args.summary, results) - - labels = list(best_labels) if best_labels is not None else [0 for _ in coords] - write_clusters(args.clusters, coord_header, coords, labels) + if best_2 is not None and row is best_2: + row["selected_2clusters"] = True + if best_3 is not None and row is best_3: + row["selected_3clusters"] = True + + results_sorted = sorted(results, key=lambda row: row["score_total"], reverse=True) + write_summary(args.summary, results_sorted) + + labels = list(labels_by_params.get((best["min_cluster_size"], best["min_samples"]), [])) if coords else [] + if coords: + write_clusters(args.clusters_best, coord_header, coords, labels) + else: + write_clusters(args.clusters_best, coord_header, coords, []) + + if best_2 is not None: + labels_2 = list(labels_by_params.get((best_2["min_cluster_size"], best_2["min_samples"]), [])) + write_clusters(args.clusters_2, coord_header, coords, labels_2) + else: + sys.stderr.write("No 2-cluster solution found for HDBSCAN grid search\n") + write_not_found(args.clusters_2, coord_header, "No 2-cluster solution found") + + if best_3 is not None: + labels_3 = list(labels_by_params.get((best_3["min_cluster_size"], best_3["min_samples"]), [])) + write_clusters(args.clusters_3, coord_header, coords, labels_3) + else: + sys.stderr.write("No 3-cluster solution found for HDBSCAN grid search\n") + write_not_found(args.clusters_3, coord_header, "No 3-cluster solution found") if __name__ == "__main__": diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf index 0eafdc57..514a8be1 100644 --- a/modules/local/chipbinner_counts.nf +++ b/modules/local/chipbinner_counts.nf @@ -18,13 +18,19 @@ process CHIPBINNER_COUNTS { val samples_json path bams path bais + path control_bams + path control_bais + val control_conditions_json + val use_input val group val use_spikein val pseudocount + val ms_coeffs output: path "chipbinner.bin_counts.tsv" , emit: counts path "chipbinner.normalized_matrix.tsv", emit: normalized + path "chipbinner.normalization.json" , emit: norm_info path "versions.yml" , emit: versions when: @@ -33,20 +39,25 @@ process CHIPBINNER_COUNTS { script: """\ printf "%s\n" ${bams} > bam_paths.txt + printf "%s\n" ${control_bams} > control_bam_paths.txt python - <<'PY' import json import subprocess import sys - from pathlib import Path samples = json.loads(r'''${samples_json}''') with open('bam_paths.txt') as handle: bam_paths = [line.strip() for line in handle if line.strip()] sample_ids = [s['sample_id'] for s in samples] scale_factors = [s.get('spikein_scale_factor') for s in samples] + ms_coeffs_path = r'''${ms_coeffs}''' + control_conditions = json.loads(r'''${control_conditions_json}''') + use_input = str('${use_input}').lower() in ("true", "1", "yes") + use_spikein = str('${use_spikein}').lower() in ("true", "1", "yes") + if len(bam_paths) != len(sample_ids): - sys.stderr.write(f\"Expected {len(sample_ids)} BAMs but found {len(bam_paths)}\\n\") + sys.stderr.write(f"Expected {len(sample_ids)} BAMs but found {len(bam_paths)}\\n") sys.exit(1) cmd = ["bedtools", "multicov", "-bams"] + bam_paths + ["-bed", "${bins}"] @@ -63,21 +74,170 @@ process CHIPBINNER_COUNTS { continue handle.write(line + "\\n") - # normalized matrix - use_spikein = str('${use_spikein}').lower() in ("true", "1", "yes") + def parse_ms_coeffs_file(path): + with open(path) as handle: + raw_lines = handle.read().splitlines() + lines = [] + for raw in raw_lines: + line = raw.split("#", 1)[0].rstrip() + if line.strip(): + lines.append(line) + if not lines: + return {} + # Detect list of dicts + entries = [] + current = None + in_samples_block = False + coeffs_map = {} + for raw in lines: + stripped = raw.lstrip() + if stripped.startswith("samples:"): + in_samples_block = True + continue + if stripped.startswith("-"): + if current: + entries.append(current) + current = {} + item = stripped[1:].strip() + if item and ":" in item: + key, value = item.split(":", 1) + current[key.strip()] = value.strip() + continue + if ":" in stripped: + key, value = stripped.split(":", 1) + key = key.strip() + value = value.strip() + if in_samples_block or (current is not None and raw.startswith(" ")): + if current is None: + current = {} + current[key] = value + else: + coeffs_map[key] = value + if current: + entries.append(current) + if entries: + coeffs = {} + for entry in entries: + sample_id = entry.get("sample_id") or entry.get("sample") + if not sample_id: + raise ValueError("MS coefficients entry missing sample_id") + value = entry.get("ms_coeff") or entry.get("coefficient") or entry.get("scale_factor") or entry.get("scale") + if value is None or value == "": + raise ValueError(f"MS coefficients entry missing coefficient for {sample_id}") + coeffs[str(sample_id)] = float(value) + return coeffs + if coeffs_map: + return {str(key): float(value) for key, value in coeffs_map.items()} + raise ValueError("Unsupported MS coefficients YAML structure") + + ms_coeffs = {} + if ms_coeffs_path: + try: + ms_coeffs = parse_ms_coeffs_file(ms_coeffs_path) + except Exception as exc: + sys.stderr.write(f"Failed to parse MS coefficients YAML: {exc}\\n") + sys.exit(1) + if not ms_coeffs: + sys.stderr.write("MS coefficients YAML contained no entries\\n") + sys.exit(1) + missing = [sid for sid in sample_ids if sid not in ms_coeffs] + if missing: + sys.stderr.write(f"MS coefficients missing entries for samples: {', '.join(missing)}\\n") + sys.exit(1) + + def counts_from_lines(lines): + counts = [] + for line in lines: + if not line.strip(): + continue + parts = line.split("\\t") + counts.append([float(x) for x in parts[3:]]) + return counts + + counts = counts_from_lines(lines) + if not counts: + counts = [] + + scaling_source = "library_size" + scale_values = None + if ms_coeffs: + scaling_source = "ms" + scale_values = [float(ms_coeffs[sid]) for sid in sample_ids] + elif use_spikein and all(sf not in (None, "NA", "") for sf in scale_factors): + scaling_source = "spikein" + scale_values = [float(sf) for sf in scale_factors] + else: + if counts: + lib_sizes = [0.0 for _ in sample_ids] + for row in counts: + for idx, val in enumerate(row): + lib_sizes[idx] += val + lib_sorted = sorted(lib_sizes) + median_lib = lib_sorted[len(lib_sorted) // 2] if lib_sorted else 1.0 + scale_values = [float(median_lib / ls) if ls > 0 else 1.0 for ls in lib_sizes] + else: + scale_values = [1.0 for _ in sample_ids] + + scaled_counts = [] + for row in counts: + scaled_counts.append([val * scale_values[idx] for idx, val in enumerate(row)]) + + if use_input and control_conditions: + with open('control_bam_paths.txt') as handle: + control_bam_paths = [line.strip() for line in handle if line.strip()] + if len(control_bam_paths) != len(control_conditions): + sys.stderr.write("Control BAMs and conditions mismatch\\n") + sys.exit(1) + control_cmd = ["bedtools", "multicov", "-bams"] + control_bam_paths + ["-bed", "${bins}"] + control_res = subprocess.run(control_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if control_res.returncode != 0: + sys.stderr.write(control_res.stderr) + sys.exit(control_res.returncode) + control_lines = control_res.stdout.strip().splitlines() + control_counts = counts_from_lines(control_lines) + if control_counts and len(control_counts) != len(scaled_counts): + sys.stderr.write("Control counts row mismatch with target counts\\n") + sys.exit(1) + + cond_to_idx = {cond: idx for idx, cond in enumerate(control_conditions)} + for idx, sample in enumerate(samples): + cond = sample.get('condition') + if cond not in cond_to_idx: + sys.stderr.write(f"Missing control counts for condition {cond}\\n") + sys.exit(1) + ctrl_idx = cond_to_idx[cond] + for row_idx, row in enumerate(scaled_counts): + row[idx] = row[idx] - (control_counts[row_idx][ctrl_idx] * scale_values[idx]) + for row in scaled_counts: + for idx, val in enumerate(row): + row[idx] = val if val > 0 else 0.0 + + pseudocount = float('${pseudocount}') + for row in scaled_counts: + for idx, val in enumerate(row): + row[idx] = val + pseudocount + with open('chipbinner.normalized_matrix.tsv', 'w') as handle: handle.write("\\t".join(["chr", "start", "end"] + sample_ids) + "\\n") - for line in lines: + for idx, line in enumerate(lines): if not line.strip(): continue parts = line.split("\\t") coords = parts[:3] - counts = [float(x) for x in parts[3:]] - if use_spikein and all(sf not in (None, "NA", "") for sf in scale_factors): - factors = [float(sf) for sf in scale_factors] - counts = [c * f for c, f in zip(counts, factors)] - counts = [c + float('${pseudocount}') for c in counts] - handle.write("\\t".join(coords + [f"{c:.6f}" for c in counts]) + "\\n") + row = scaled_counts[idx] if scaled_counts else [] + handle.write("\\t".join(coords + [f"{c:.6f}" for c in row]) + "\\n") + + norm_info = { + "use_input": bool(use_input and control_conditions), + "use_ms_scaling": scaling_source == "ms", + "use_spikein_scaling": scaling_source == "spikein", + "use_library_size": scaling_source == "library_size", + "scaling_source": scaling_source, + "control_conditions": control_conditions if use_input and control_conditions else [], + } + with open('chipbinner.normalization.json', 'w') as handle: + json.dump(norm_info, handle, indent=2) + handle.write("\\n") PY cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf index ab4c8e66..06ad20b2 100644 --- a/modules/local/chipbinner_hdbscan_grid.nf +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -15,6 +15,7 @@ process CHIPBINNER_HDBSCAN_GRID { input: path matrix + val samples_json val group val min_cluster_size val min_samples @@ -22,6 +23,8 @@ process CHIPBINNER_HDBSCAN_GRID { output: path "chipbinner.hdbscan_grid_summary.tsv", emit: grid_summary path "chipbinner.clusters.best.tsv" , emit: clusters + path "chipbinner.clusters.2clusters.tsv" , emit: clusters_2 + path "chipbinner.clusters.3clusters.tsv" , emit: clusters_3 path "versions.yml" , emit: versions when: @@ -29,12 +32,27 @@ process CHIPBINNER_HDBSCAN_GRID { script: """\ + python - <<'PY' + import json + import csv + + samples = json.loads(r'''${samples_json}''') + with open('chipbinner.samplesheet.csv', 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(['sample_id', 'condition']) + for row in samples: + writer.writerow([row['sample_id'], row['condition']]) + PY + python ${projectDir}/bin/hdbscan_grid.py \ --matrix ${matrix} \ + --samplesheet chipbinner.samplesheet.csv \ --min_cluster_size ${min_cluster_size} \ --min_samples ${min_samples} \ --summary chipbinner.hdbscan_grid_summary.tsv \ - --clusters chipbinner.clusters.best.tsv + --clusters_best chipbinner.clusters.best.tsv \ + --clusters_2 chipbinner.clusters.2clusters.tsv \ + --clusters_3 chipbinner.clusters.3clusters.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/chipbinner_lola.nf b/modules/local/chipbinner_lola.nf new file mode 100644 index 00000000..58212ba2 --- /dev/null +++ b/modules/local/chipbinner_lola.nf @@ -0,0 +1,44 @@ +process CHIPBINNER_LOLA { + tag "${group}.${label}" + label 'process_medium' + + publishDir = [ + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/enrichment" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + conda "bioconda::bioconductor-lola=1.28.0 conda-forge::r-ggplot2 conda-forge::r-optparse" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-lola:1.28.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-lola:1.28.0--r42hdfd78af_0' }" + + input: + tuple val(group), val(label), path(bed) + path universe + path lola_db + + output: + tuple val(group), val(label), path("${label}.lola.tsv"), emit: tsv + tuple val(group), val(label), path("${label}.lola_plot.pdf"), optional: true, emit: plot + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript ${projectDir}/bin/chipbinner_lola.R \ + --bed ${bed} \ + --universe ${universe} \ + --db ${lola_db} \ + --label ${label} \ + --out_tsv ${label}.lola.tsv \ + --out_pdf ${label}.lola_plot.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(R --version 2>&1 | head -n 1 | sed -e 's/.*R version //; s/ .*//') + END_VERSIONS + """ +} diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf index 72eb538a..305aae3b 100644 --- a/modules/local/chipbinner_rots.nf +++ b/modules/local/chipbinner_rots.nf @@ -23,6 +23,8 @@ process CHIPBINNER_ROTS { input: path matrix path clusters + path grid_summary + path norm_info val samples_json val treated val control @@ -31,15 +33,20 @@ process CHIPBINNER_ROTS { val lfc val bootstrap val k_value + val lola_run output: - tuple val(group), path("chipbinner.differential.tsv") , emit: results - tuple val(group), path("chipbinner.significant.bed") , emit: significant - tuple val(group), path("chipbinner.significant_up.bed") , emit: up - tuple val(group), path("chipbinner.significant_down.bed") , emit: down - tuple val(group), path("chipbinner.summary.tsv") , emit: summary - tuple val(group), path("plots") , emit: plots - path "versions.yml" , emit: versions + tuple val(group), path("chipbinner.differential.tsv") , emit: results + tuple val(group), path("chipbinner.significant.bed") , emit: significant + tuple val(group), path("chipbinner.significant_up.bed") , emit: up + tuple val(group), path("chipbinner.significant_down.bed") , emit: down + tuple val(group), path("chipbinner.control_enriched.bed") , emit: control_enriched + tuple val(group), path("chipbinner.treated_enriched.bed") , emit: treated_enriched + tuple val(group), path("chipbinner.stable.bed") , emit: stable + tuple val(group), path("chipbinner.noise.bed") , emit: noise + tuple val(group), path("chipbinner.summary.tsv") , emit: summary + tuple val(group), path("plots") , emit: plots + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -61,6 +68,8 @@ process CHIPBINNER_ROTS { Rscript ${projectDir}/bin/chipbinner_rots.R \ --matrix ${matrix} \ --clusters ${clusters} \ + --grid_summary ${grid_summary} \ + --norm_info ${norm_info} \ --samplesheet chipbinner.samplesheet.csv \ --treated ${treated} \ --control ${control} \ @@ -68,7 +77,8 @@ process CHIPBINNER_ROTS { --fdr ${fdr} \ --lfc ${lfc} \ --bootstrap ${bootstrap} \ - --k_value ${k_value} + --k_value ${k_value} \ + --lola_run ${lola_run} cat chipbinner.significant_up.bed chipbinner.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > chipbinner.significant.bed diff --git a/modules/local/differential_overlap.nf b/modules/local/differential_overlap.nf index 779b4026..5ef01d11 100644 --- a/modules/local/differential_overlap.nf +++ b/modules/local/differential_overlap.nf @@ -29,12 +29,35 @@ process DIFFERENTIAL_OVERLAP { """\ set -euo pipefail + declare -A bp_cache + + calc_bp() { + local bed="\$1" + if [[ ! -s "\$bed" ]]; then + echo 0 + return + fi + bedtools sort -i "\$bed" | bedtools merge -i - | awk '{sum += (\$3 - \$2)} END {print sum + 0}' + } + + get_bp() { + local bed="\$1" + if [[ -n "\${bp_cache[\$bed]+x}" ]]; then + echo "\${bp_cache[\$bed]}" + return + fi + local value + value=\$(calc_bp "\$bed") + bp_cache["\$bed"]="\$value" + echo "\$value" + } + printf "group\tcaller_a\tcaller_b\tintersection\tunion\tjaccard\n" > overlap_callers.tsv if [[ -s "${callers_tsv}" ]]; then - awk -F'\t' 'BEGIN{OFS="\t"} { g=\$1; c=\$2; p=\$3; n[g]++; callers[g,n[g]]=c; paths[g,n[g]]=p } + awk -F'\\t' 'BEGIN{OFS="\\t"} { g=\$1; c=\$2; p=\$3; n[g]++; callers[g,n[g]]=c; paths[g,n[g]]=p } END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { print g, callers[g,i], paths[g,i], callers[g,j], paths[g,j] } } } }' "${callers_tsv}" \ | while IFS=\$'\\t' read -r group caller_a path_a caller_b path_b; do - jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$2"\\t"\$3"\\t"\$4}') + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$1"\\t"\$2"\\t"\$3}') if [[ -z "\$jaccard_line" ]]; then inter=0 union=0 @@ -42,22 +65,23 @@ process DIFFERENTIAL_OVERLAP { else IFS=\$'\\t' read -r inter union jacc <<< "\$jaccard_line" fi - printf "%s\t%s\t%s\t%s\t%s\t%s\n" "\$group" "\$caller_a" "\$caller_b" "\$inter" "\$union" "\$jacc" >> overlap_callers.tsv + printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "\$group" "\$caller_a" "\$caller_b" "\$inter" "\$union" "\$jacc" >> overlap_callers.tsv done fi - printf "group\tmethod_a\tmethod_b\tcaller\tintersection\tunion\tjaccard\n" > overlap_methods.tsv + printf "group\tcaller\tmethod_a\tmethod_b\tn_a\tn_b\tn_overlap\tjaccard\tspan_mode_used\n" > overlap_methods.tsv if [[ -s "${methods_tsv}" ]]; then - awk -F'\t' 'BEGIN{OFS="\t"} { g=\$1; m=\$2; c=\$3; p=\$4; n[g]++; methods[g,n[g]]=m; callers[g,n[g]]=c; paths[g,n[g]]=p } - END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { print g, methods[g,i], callers[g,i], paths[g,i], methods[g,j], callers[g,j], paths[g,j] } } } }' "${methods_tsv}" \ - | while IFS=\$'\\t' read -r group method_a caller_a path_a method_b caller_b path_b; do - jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$2"\\t"\$3"\\t"\$4}') + awk -F'\\t' 'BEGIN{OFS="\\t"} { g=\$1; m=\$2; c=\$3; p=\$4; s=\$5; n[g]++; methods[g,n[g]]=m; callers[g,n[g]]=c; paths[g,n[g]]=p; spans[g,n[g]]=s } + END { for (g in n) { for (i=1; i<=n[g]; i++) { for (j=i+1; j<=n[g]; j++) { if (methods[g,i] == methods[g,j]) { next } print g, methods[g,i], callers[g,i], paths[g,i], spans[g,i], methods[g,j], callers[g,j], paths[g,j], spans[g,j] } } } }' "${methods_tsv}" \ + | while IFS=\$'\\t' read -r group method_a caller_a path_a span_a method_b caller_b path_b span_b; do + n_a=\$(get_bp "\$path_a") + n_b=\$(get_bp "\$path_b") + jaccard_line=\$(bedtools jaccard -a "\$path_a" -b "\$path_b" | awk 'NR==2 {print \$1"\\t"\$3}') if [[ -z "\$jaccard_line" ]]; then - inter=0 - union=0 + n_overlap=0 jacc=0 else - IFS=\$'\\t' read -r inter union jacc <<< "\$jaccard_line" + IFS=\$'\\t' read -r n_overlap jacc <<< "\$jaccard_line" fi caller="NA" if [[ "\$method_a" == "diffbind" ]]; then @@ -65,7 +89,13 @@ process DIFFERENTIAL_OVERLAP { elif [[ "\$method_b" == "diffbind" ]]; then caller="\$caller_b" fi - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "\$group" "\$method_a" "\$method_b" "\$caller" "\$inter" "\$union" "\$jacc" >> overlap_methods.tsv + span_mode="NA" + if [[ "\$method_a" == "span" ]]; then + span_mode="\$span_a" + elif [[ "\$method_b" == "span" ]]; then + span_mode="\$span_b" + fi + printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" "\$group" "\$caller" "\$method_a" "\$method_b" "\$n_a" "\$n_b" "\$n_overlap" "\$jacc" "\$span_mode" >> overlap_methods.tsv done fi diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 9bcaedbb..68d4b34e 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -45,12 +45,17 @@ process DIFFERENTIAL_SUMMARY_MERGE { for row in reader: rows.append(row) + base_fields = [ + 'method','group','caller','treated_condition','control_condition','n_tested','n_fdr_pass','n_up','n_down','use_spikein','span_mode_used' + ] if rows: - fieldnames = list(rows[0].keys()) + fieldnames = list(base_fields) + for row in rows: + for key in row.keys(): + if key not in fieldnames: + fieldnames.append(key) else: - fieldnames = [ - 'method','group','caller','treated_condition','control_condition','n_tested','n_fdr_pass','n_up','n_down' - ] + fieldnames = list(base_fields) with open('differential.summary.all_methods.tsv', 'w', newline='') as out_handle: if header_lines: diff --git a/modules/local/write_differential_manifests.nf b/modules/local/write_differential_manifests.nf index b5586319..2a03b360 100644 --- a/modules/local/write_differential_manifests.nf +++ b/modules/local/write_differential_manifests.nf @@ -81,6 +81,8 @@ process WRITE_DIFFERENTIAL_MANIFESTS { 'group', 'condition', 'replicate', + 'control_group', + 'control_condition', 'final_bam', 'final_bai', 'normalisation_mode', @@ -93,6 +95,8 @@ process WRITE_DIFFERENTIAL_MANIFESTS { str(row.get('group', '')), str(row.get('condition', '')), str(row.get('replicate', '')), + str(row.get('control_group', 'NA')), + str(row.get('control_condition', 'NA')), resolve_published(str(row.get('final_bam', ''))), resolve_published(str(row.get('final_bai', ''))), str(row.get('normalisation_mode', '')), diff --git a/workflows/cutandrun.nf b/workflows/cutandrun.nf index 8a85b4cd..b8081990 100644 --- a/workflows/cutandrun.nf +++ b/workflows/cutandrun.nf @@ -663,11 +663,14 @@ workflow CUTANDRUN { } /* - * CHANNEL: Filter bais for target only + * CHANNEL: Filter bais for target/control */ ch_samtools_bai.filter { it -> it[0].is_control == false } .set { ch_bai_target } + ch_samtools_bai.filter { it -> it[0].is_control == true } + .set { ch_bai_control } //ch_bai_target | view + //ch_bai_control | view /* * CHANNEL: Combine bam and bai files on id @@ -679,12 +682,22 @@ workflow CUTANDRUN { // EXAMPLE CHANNEL STRUCT: [[META], BAM, BAI] //ch_bam_bai | view + /* + * CHANNEL: Combine control bam and bai files on id + */ + ch_bam_control.map { row -> [row[0].id, row ].flatten()} + .join ( ch_bai_control.map { row -> [row[0].id, row ].flatten()} ) + .map { row -> [row[1], row[2], row[4]] } + .set { ch_control_bam_bai } + //ch_control_bam_bai | view + ch_differential_summary = Channel.empty() ch_differential_skipped = Channel.empty() ch_differential_versions = Channel.empty() if (params.run_peak_calling && (params.run_diffbind || params.run_chipbinner || params.run_span_diff)) { DIFFERENTIAL_PEAK_CALLING ( ch_bam_bai, + ch_control_bam_bai, ch_peaks_all, ch_bigwig, ch_spikein_scale_factors, diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index c44e3cd4..bc49fc58 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -45,7 +45,10 @@ workflow DIFFERENTIAL_ONLY { group: row.group, condition: row.condition, replicate: row.replicate.toInteger(), - is_control: false + control_group: row.control_group ?: row.group, + control_condition: row.control_condition ?: 'NA', + is_control: false, + normalisation_mode: row.normalisation_mode ?: params.normalisation_mode ] def bam = file(row.final_bam) def bai = file(row.final_bai) @@ -61,6 +64,37 @@ workflow DIFFERENTIAL_ONLY { .filter { meta, bam, bai, scale, bigwig -> bigwig && bigwig != 'NA' } .map { meta, bam, bai, scale, bigwig -> [meta, file(bigwig)] } + // Optional pooled controls (used for ChIPBinner input normalization) + def pooled_controls_dir = file("${run_dir}/03_peak_calling/01_pooled_controls") + ch_control_bam_bai = Channel.empty() + if (pooled_controls_dir.exists()) { + ch_control_bam_bai = Channel.fromPath("${pooled_controls_dir}/*.bam") + .map { bam -> + def base = bam.baseName + def parts = base.tokenize('_') + def condition = parts.size() > 1 ? parts[-1] : 'NA' + def group = parts.size() > 1 ? parts[0..-2].join('_') : base + def bai = file("${bam}.bai") + if (!bai.exists()) { + bai = file("${bam.baseName}.bai") + } + if (!bai.exists()) { + log.warn "Missing BAI for pooled control ${bam}" + return null + } + def meta = [ + id: base, + sample_id: base, + group: group, + condition: condition, + replicate: 1, + is_control: true + ] + [meta, bam, bai] + } + .filter { it != null } + } + ch_samples_map = ch_samples .map { meta, bam, bai, scale, bigwig -> [meta.sample_id, meta] } .toList() @@ -89,6 +123,7 @@ workflow DIFFERENTIAL_ONLY { DIFFERENTIAL_PEAK_CALLING( ch_bam_bai, + ch_control_bam_bai, ch_peaks, ch_bigwig, ch_spikein_scale, From 194f2de7453b4859413e2b78b11f7dfe2a95fd06 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:10:02 -0800 Subject: [PATCH 17/31] Address SPAN diff review feedback --- bin/span_fallback_de.R | 5 ++-- .../SPEC_DIFFERENTIAL_PEAK_CALLING.md | 2 +- modules/local/span_compare.nf | 25 +++++++++++++++++-- modules/local/span_fallback_diff.nf | 2 +- .../local/differential_peak_calling.nf | 14 +++++++---- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/bin/span_fallback_de.R b/bin/span_fallback_de.R index 82aceea5..09b437fb 100755 --- a/bin/span_fallback_de.R +++ b/bin/span_fallback_de.R @@ -123,8 +123,7 @@ if ("SpikeinScaleFactor" %in% colnames(samplesheet)) { } if (use_spikein && (is.null(scale_factors) || any(is.na(scale_factors)))) { - message("Spike-in scaling requested but scale factors missing; falling back to default normalization") - use_spikein <- FALSE + stop("Spike-in scaling requested but spike-in scale factors are missing from the samplesheet.") } backend <- toupper(opt$backend) @@ -179,7 +178,7 @@ write.table(up[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_up. write.table(down[, c("chr", "start", "end")], paste0(opt$prefix, ".significant_down.bed"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) summary <- data.frame( - method = "span_fallback", + method = "span", group = opt$group, caller = "NA", treated_condition = opt$treated, diff --git a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md index b2cd88fb..7c422fe2 100644 --- a/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md +++ b/docs/extension_specs/SPEC_DIFFERENTIAL_PEAK_CALLING.md @@ -622,7 +622,7 @@ ChIPBinner parameters • --chipbinner_run_lola (bool; default: false) SPAN / OmniPeaks parameters - • --omnipeaks_jar (path; required if --run_span_diff) + • --omnipeaks_jar (path; required for native SPAN diff; optional for fallback) • --span_diff_mode (string; default: auto) • auto|native|fallback • --span_diff_fdr (float; default: 0.05) diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index 39d9580e..9717d9bd 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -90,6 +90,28 @@ process SPAN_COMPARE { cp "\$diff_file" span.differential.tsv + cat <<'AWK' > span_compare_bed.awk + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } + BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3 } + /^#/ || /^track/ || /^browser/ { next } + !header_done { + if (!isnum($2)) { + for (i=1; i<=NF; i++) { + col=tolower($i) + if (col ~ /^(chr|chrom|seqnames|seqname)$/) chr_idx=i + if (col == "start") start_idx=i + if (col == "end") end_idx=i + } + header_done=1 + next + } else { + header_done=1 + } + } + { print $chr_idx,$start_idx,$end_idx } + AWK + awk -f span_compare_bed.awk "\$diff_file" > span.differential.bed + : > span.significant_up.bed : > span.significant_down.bed cat <<'AWK' > span_compare_filter.awk @@ -150,7 +172,6 @@ process SPAN_COMPARE { awk -v fdr_thresh="${fdr}" -v up_file="span.significant_up.bed" -v down_file="span.significant_down.bed" -v proxy_file="span.proxy_log2.txt" -f span_compare_filter.awk "\$diff_file" cat span.significant_up.bed span.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > span.significant.bed - cp span.significant.bed span.differential.bed total=\$(awk 'function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } /^#/ || /^track/ || /^browser/ { next } !header_seen { if (!isnum(\$2)) { header_seen=1; next } header_seen=1 } { n++ } END { print n+0 }' "\$diff_file") n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') @@ -187,7 +208,7 @@ process SPAN_COMPARE { - Direction follows treated (${meta.treated ?: 'NA'}) vs control (${meta.control ?: 'NA'}). END_README fi - rm -f span_compare_filter.awk span.proxy_log2.txt + rm -f span_compare_filter.awk span_compare_bed.awk span.proxy_log2.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/span_fallback_diff.nf b/modules/local/span_fallback_diff.nf index b972e59d..2dbe6547 100644 --- a/modules/local/span_fallback_diff.nf +++ b/modules/local/span_fallback_diff.nf @@ -51,7 +51,7 @@ process SPAN_FALLBACK_DIFF { --prefix span_fallback mv span_fallback.results.tsv span_fallback.differential.tsv - cp span_fallback.significant.bed span_fallback.differential.bed + awk 'BEGIN{OFS="\\t"} NR>1 {print \$1,\$2,\$3}' span_fallback.differential.tsv > span_fallback.differential.bed mkdir -p 00_manifests peaks_list=\$(cat span_fallback.peaks.list 2>/dev/null || true) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index bf7a912d..a78fc3da 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -679,11 +679,15 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (span_use_fallback) { def span_callers = callers.findAll { it.startsWith('span_') } if (!span_callers) { - if (span_mode == 'auto' && !span_has_jar) { - exit 1, "SPAN diff auto mode requested but --omnipeaks_jar missing and no SPAN peaks are available for fallback." + def details = span_has_jar ? + 'SPAN caller not in --peakcaller' : + 'SPAN caller not in --peakcaller; --omnipeaks_jar missing so native compare is unavailable' + if (params.differential_strict) { + exit 1, "SPAN differential unavailable: ${details}" } + log.warn "SPAN differential skipped: ${details}" ch_skipped_records = ch_skipped_records.mix( - ch_span_fallback_groups.map { rec -> [method: 'span', group: rec.group, caller: 'NA', reason: 'no_span_peaks', details: 'SPAN caller not in --peakcaller'] } + ch_span_fallback_groups.map { rec -> [method: 'span', group: rec.group, caller: 'NA', reason: 'no_span_peaks', details: details] } ) } else { def span_caller = span_callers[0] @@ -779,10 +783,10 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) method_beds = method_beds.mix( ch_chipbinner_up.map { group, file -> - [group: group, method: 'chipbinner_treated_enriched', caller: 'NA', path: file.toString(), span_mode_used: 'NA'] + [group: group, method: 'chipbinner', caller: 'treated_enriched', path: file.toString(), span_mode_used: 'NA'] }, ch_chipbinner_down.map { group, file -> - [group: group, method: 'chipbinner_control_enriched', caller: 'NA', path: file.toString(), span_mode_used: 'NA'] + [group: group, method: 'chipbinner', caller: 'control_enriched', path: file.toString(), span_mode_used: 'NA'] } ) method_beds = method_beds.mix( From 59fce072a737f24a4d0cfcf064c215a27dbf552b Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:25:30 -0800 Subject: [PATCH 18/31] Address differential review feedback --- bin/annotate_regions.py | 10 ++++--- bin/diffbind_run.R | 18 +++++++++++-- modules/local/annotate_regions.nf | 2 +- modules/local/chipbinner_counts.nf | 26 +++++++++++++++++++ modules/local/chipbinner_hdbscan_grid.nf | 2 +- modules/local/differential_summary.nf | 17 +++++++++--- .../local/differential_peak_calling.nf | 5 ++++ 7 files changed, 70 insertions(+), 10 deletions(-) diff --git a/bin/annotate_regions.py b/bin/annotate_regions.py index e62f8919..672b1e35 100755 --- a/bin/annotate_regions.py +++ b/bin/annotate_regions.py @@ -107,11 +107,15 @@ def main(): return features_has_name = False + feature_name_idx = None with features_bed.open() as handle: for line in handle: if not line.strip() or line.startswith("#"): continue - features_has_name = len(line.rstrip().split("\t")) >= 4 + features_cols = len(line.rstrip().split("\t")) + features_has_name = features_cols >= 4 + if features_has_name: + feature_name_idx = 4 + 3 break regions_bed = Path("regions.bed") @@ -138,8 +142,8 @@ def main(): continue row_id = int(fields[3]) feature_id = "NA" - if features_has_name and len(fields) > 7: - feature_id = fields[7] + if feature_name_idx is not None and len(fields) > feature_name_idx: + feature_id = fields[feature_name_idx] distance = fields[-1] annotations[row_id] = (feature_id, distance) diff --git a/bin/diffbind_run.R b/bin/diffbind_run.R index 7d895735..f0080146 100755 --- a/bin/diffbind_run.R +++ b/bin/diffbind_run.R @@ -252,8 +252,22 @@ if (!use_spikein) { dba_obj <- do.call(dba.contrast, c(list(dba_obj), contrast_args)) dba_obj <- do.call(dba.analyze, c(list(dba_obj), analyze_args)) - res <- dba.report(dba_obj, th = 1, fold = 0) - res_df <- as.data.frame(res) + res <- tryCatch(dba.report(dba_obj, th = 1, fold = 0), error = function(e) NULL) + if (is.null(res)) { + res_df <- data.frame() + } else { + res_df <- as.data.frame(res) + } + if (nrow(res_df) == 0 && ncol(res_df) == 0) { + res_df <- data.frame( + chr = character(), + start = integer(), + end = integer(), + log2FC = numeric(), + FDR = numeric(), + stringsAsFactors = FALSE + ) + } coord_cols <- list(chr = NULL, start = NULL, end = NULL) for (col in colnames(res_df)) { diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf index f131a9c8..5bee90ed 100644 --- a/modules/local/annotate_regions.nf +++ b/modules/local/annotate_regions.nf @@ -8,7 +8,7 @@ process ANNOTATE_REGIONS { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] : null } - conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11 conda-forge::perl=5.26.2" + conda "bioconda::bedtools=2.31.1 bioconda::bedops=2.4.41 conda-forge::python=3.11 conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf index 514a8be1..795ffe6b 100644 --- a/modules/local/chipbinner_counts.nf +++ b/modules/local/chipbinner_counts.nf @@ -39,16 +39,21 @@ process CHIPBINNER_COUNTS { script: """\ printf "%s\n" ${bams} > bam_paths.txt + printf "%s\n" ${bais} > bai_paths.txt printf "%s\n" ${control_bams} > control_bam_paths.txt + printf "%s\n" ${control_bais} > control_bai_paths.txt python - <<'PY' import json import subprocess import sys + from pathlib import Path samples = json.loads(r'''${samples_json}''') with open('bam_paths.txt') as handle: bam_paths = [line.strip() for line in handle if line.strip()] + with open('bai_paths.txt') as handle: + bai_paths = [line.strip() for line in handle if line.strip()] sample_ids = [s['sample_id'] for s in samples] scale_factors = [s.get('spikein_scale_factor') for s in samples] ms_coeffs_path = r'''${ms_coeffs}''' @@ -60,6 +65,24 @@ process CHIPBINNER_COUNTS { sys.stderr.write(f"Expected {len(sample_ids)} BAMs but found {len(bam_paths)}\\n") sys.exit(1) + def ensure_bai_for_bams(bams, bais, label): + missing = [] + for bam in bams: + bam_path = Path(bam) + candidates = [ + Path(f"{bam}.bai"), + bam_path.with_suffix(".bai"), + ] + if any(candidate.exists() for candidate in candidates): + continue + missing.append(str(bam_path)) + if missing: + sys.stderr.write(f"Missing BAM index for {label}: {', '.join(missing)}\\n") + sys.stderr.write(f"Available BAI files: {', '.join(bais) if bais else 'none'}\\n") + sys.exit(1) + + ensure_bai_for_bams(bam_paths, bai_paths, "chipbinner samples") + cmd = ["bedtools", "multicov", "-bams"] + bam_paths + ["-bed", "${bins}"] result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if result.returncode != 0: @@ -185,6 +208,9 @@ process CHIPBINNER_COUNTS { if use_input and control_conditions: with open('control_bam_paths.txt') as handle: control_bam_paths = [line.strip() for line in handle if line.strip()] + with open('control_bai_paths.txt') as handle: + control_bai_paths = [line.strip() for line in handle if line.strip()] + ensure_bai_for_bams(control_bam_paths, control_bai_paths, "chipbinner controls") if len(control_bam_paths) != len(control_conditions): sys.stderr.write("Control BAMs and conditions mismatch\\n") sys.exit(1) diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf index 06ad20b2..ea9bfb6f 100644 --- a/modules/local/chipbinner_hdbscan_grid.nf +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -8,7 +8,7 @@ process CHIPBINNER_HDBSCAN_GRID { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::hdbscan" + conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::pandas conda-forge::scikit-learn conda-forge::hdbscan" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hdbscan:0.8.33--pyhdfd78af_0' : 'biocontainers/hdbscan:0.8.33--pyhdfd78af_0' }" diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 68d4b34e..33d2bd1b 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -12,7 +12,7 @@ process DIFFERENTIAL_SUMMARY_MERGE { container "quay.io/biocontainers/python:3.8.3" input: - path summary_files + path summary_files, stageAs: { file -> "summaries/${file.parent.name}-${file.name}" } val skipped_json path summary_header path skipped_header @@ -27,13 +27,24 @@ process DIFFERENTIAL_SUMMARY_MERGE { script: """\ - python - ${summary_files} <<'PY' + : > summary_files.list + for file in ${summary_files}; do + echo "\$file" >> summary_files.list + done + + python - <<'PY' import csv import json import sys from pathlib import Path - summary_files = sorted(Path(p) for p in sys.argv[1:]) + summary_files = [] + with open('summary_files.list') as handle: + for line in handle: + line = line.strip() + if line: + summary_files.append(Path(line)) + summary_files = sorted(summary_files) with open('${summary_header}', 'r') as header_handle: header_lines = header_handle.read().rstrip('\\n') diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index a78fc3da..b0b55f84 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -40,6 +40,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_diffbind_significant = Channel.empty() ch_chipbinner_up = Channel.empty() ch_chipbinner_down = Channel.empty() + ch_chipbinner_all = Channel.empty() ch_span_native_significant = Channel.empty() ch_span_fallback_significant = Channel.empty() @@ -468,6 +469,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) ch_chipbinner_up = CHIPBINNER_ROTS.out.up ch_chipbinner_down = CHIPBINNER_ROTS.out.down + ch_chipbinner_all = CHIPBINNER_ROTS.out.significant if (lola_should_run) { ch_lola_beds = Channel.empty() @@ -782,6 +784,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { } ) method_beds = method_beds.mix( + ch_chipbinner_all.map { group, file -> + [group: group, method: 'chipbinner', caller: 'significant', path: file.toString(), span_mode_used: 'NA'] + }, ch_chipbinner_up.map { group, file -> [group: group, method: 'chipbinner', caller: 'treated_enriched', path: file.toString(), span_mode_used: 'NA'] }, From 52e138e07d8ab54279782f6b2c2af916eab1ee80 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:38:36 -0800 Subject: [PATCH 19/31] Add chipbinner differential regression test --- tests/test_07_differential_peak_calling.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_07_differential_peak_calling.yml b/tests/test_07_differential_peak_calling.yml index 5706c323..2efa34f7 100644 --- a/tests/test_07_differential_peak_calling.yml +++ b/tests/test_07_differential_peak_calling.yml @@ -15,6 +15,22 @@ - path: results/03_peak_calling/06_differential/00_manifests/differential_manifest.peaks.tsv - path: results/03_peak_calling/06_differential/01_diffbind/00_samplesheets/seacr/h3k27me3.diffbind.csv +- name: test_differential_chipbinner + command: nextflow run main.nf -profile docker,test --run_chipbinner --differential_contrast "Treatment,Control" --differential_min_replicates 1 --input tests/assets/samplesheet_condition.csv --skip_fastqc --skip_removeduplicates --skip_preseq --skip_multiqc -c tests/config/differential.config + tags: + - test_differential + - test_differential_chipbinner + files: + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.hdbscan_grid_summary.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.best.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.2clusters.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/clustering/chipbinner.clusters.3clusters.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/differential/chipbinner.differential.tsv + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.control_enriched.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.treated_enriched.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/bed/chipbinner.stable.bed + - path: results/03_peak_calling/06_differential/02_chipbinner/h3k27me3/chipbinner.summary.tsv + - name: test_differential_only_skipped_invalid command: nextflow run main.nf -entry DIFFERENTIAL_ONLY -profile docker,test --run_diffbind --differential_contrast "Treatment,Control" --differential_from_run tests/assets/differential_mock --skip_multiqc -c tests/config/differential.config tags: From 8befbdfe597f8d4197d7350988276327ff589e10 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 00:41:10 -0800 Subject: [PATCH 20/31] Align chipbinner overlaps and hdbscan ranking --- bin/chipbinner_rots.R | 3 ++- bin/hdbscan_grid.py | 10 ++++++---- subworkflows/local/differential_peak_calling.nf | 5 ----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index e5d5f1a5..c2699b59 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -172,7 +172,8 @@ summary <- data.frame( write.table(summary, "chipbinner.summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE) # Plots -counts_log <- log2(counts + 1) +# counts already include pseudocount from normalization +counts_log <- log2(counts) try({ pca <- prcomp(t(counts_log), scale. = TRUE) pdf("plots/PCA.pdf") diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py index af4e1126..dcbb4fc4 100755 --- a/bin/hdbscan_grid.py +++ b/bin/hdbscan_grid.py @@ -178,15 +178,17 @@ def main(): "selected_3clusters": False, }) - best = max(results, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + # Rank by cluster count first, then overall score, fraction assigned, and persistence. + rank_key = lambda row: (row["n_clusters"], row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"]) + best = max(results, key=rank_key) best_2 = None best_3 = None candidates_2 = [row for row in results if row["n_clusters"] == 2] candidates_3 = [row for row in results if row["n_clusters"] == 3] if candidates_2: - best_2 = max(candidates_2, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + best_2 = max(candidates_2, key=rank_key) if candidates_3: - best_3 = max(candidates_3, key=lambda row: (row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"])) + best_3 = max(candidates_3, key=rank_key) for row in results: if row is best: @@ -196,7 +198,7 @@ def main(): if best_3 is not None and row is best_3: row["selected_3clusters"] = True - results_sorted = sorted(results, key=lambda row: row["score_total"], reverse=True) + results_sorted = sorted(results, key=rank_key, reverse=True) write_summary(args.summary, results_sorted) labels = list(labels_by_params.get((best["min_cluster_size"], best["min_samples"]), [])) if coords else [] diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index b0b55f84..a78fc3da 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -40,7 +40,6 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_diffbind_significant = Channel.empty() ch_chipbinner_up = Channel.empty() ch_chipbinner_down = Channel.empty() - ch_chipbinner_all = Channel.empty() ch_span_native_significant = Channel.empty() ch_span_fallback_significant = Channel.empty() @@ -469,7 +468,6 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_summary_files = ch_summary_files.mix(CHIPBINNER_ROTS.out.summary.map { group, file -> file }) ch_chipbinner_up = CHIPBINNER_ROTS.out.up ch_chipbinner_down = CHIPBINNER_ROTS.out.down - ch_chipbinner_all = CHIPBINNER_ROTS.out.significant if (lola_should_run) { ch_lola_beds = Channel.empty() @@ -784,9 +782,6 @@ workflow DIFFERENTIAL_PEAK_CALLING { } ) method_beds = method_beds.mix( - ch_chipbinner_all.map { group, file -> - [group: group, method: 'chipbinner', caller: 'significant', path: file.toString(), span_mode_used: 'NA'] - }, ch_chipbinner_up.map { group, file -> [group: group, method: 'chipbinner', caller: 'treated_enriched', path: file.toString(), span_mode_used: 'NA'] }, From 4119fdf0878f855d9b22a3e630df4b59fa36aa98 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 01:11:50 -0800 Subject: [PATCH 21/31] Complete chipbinner clustering outputs --- bin/chipbinner_rots.R | 139 ++++++++++++++---- bin/hdbscan_grid.py | 22 +-- modules/local/chipbinner_counts.nf | 67 ++++----- modules/local/chipbinner_rots.nf | 6 + .../local/differential_peak_calling.nf | 29 +++- 5 files changed, 180 insertions(+), 83 deletions(-) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index c2699b59..a56334ac 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -8,6 +8,8 @@ suppressPackageStartupMessages({ option_list <- list( make_option(c("--matrix"), type = "character"), make_option(c("--clusters"), type = "character"), + make_option(c("--clusters_2"), type = "character", default = ""), + make_option(c("--clusters_3"), type = "character", default = ""), make_option(c("--grid_summary"), type = "character"), make_option(c("--norm_info"), type = "character"), make_option(c("--samplesheet"), type = "character"), @@ -24,7 +26,6 @@ option_list <- list( opt <- parse_args(OptionParser(option_list = option_list)) matrix_df <- read.table(opt$matrix, header = TRUE, sep = "\t", check.names = FALSE) -clusters_df <- read.table(opt$clusters, header = TRUE, sep = "\t", check.names = FALSE) samples_df <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) sample_ids <- samples_df$sample_id @@ -69,42 +70,119 @@ results <- data.frame( FDR = fdr ) -cluster_ids <- rep(NA, nrow(results)) -if (all(c("chr", "start", "end", "cluster") %in% colnames(clusters_df))) { +read_clusters <- function(path) { + if (is.null(path) || path == "" || !file.exists(path)) { + return(NULL) + } + df <- tryCatch(read.table(path, header = TRUE, sep = "\t", check.names = FALSE), + error = function(e) NULL) + if (is.null(df) || nrow(df) == 0) { + return(NULL) + } + if (!all(c("chr", "start", "end", "cluster") %in% colnames(df))) { + return(NULL) + } + df +} + +cluster_ids_from_df <- function(clusters_df, results_df) { + if (is.null(clusters_df)) { + return(NULL) + } clusters_key <- paste(clusters_df$chr, clusters_df$start, clusters_df$end, sep = ":") - cluster_ids <- clusters_df$cluster[match(paste(results$chr, results$start, results$end, sep = ":"), clusters_key)] + cluster_ids <- clusters_df$cluster[match(paste(results_df$chr, results_df$start, results_df$end, sep = ":"), clusters_key)] + cluster_ids } -results$cluster_id <- cluster_ids -cluster_labels <- rep("stable", length(cluster_ids)) -if (!all(is.na(cluster_ids))) { - treated_idx <- conditions == opt$treated - control_idx <- conditions == opt$control +compute_cluster_labels <- function(cluster_ids, counts_mat, conds, treated, control, lfc) { + if (is.null(cluster_ids) || length(cluster_ids) == 0) { + return(NULL) + } + labels <- rep("stable", length(cluster_ids)) + if (all(is.na(cluster_ids))) { + labels[] <- "NA" + return(labels) + } + treated_idx <- conds == treated + control_idx <- conds == control for (cid in unique(cluster_ids)) { if (is.na(cid)) { next } if (cid == -1) { - cluster_labels[cluster_ids == cid] <- "noise" + labels[cluster_ids == cid] <- "noise" next } cluster_rows <- cluster_ids == cid - treated_mean <- mean(rowMeans(counts[cluster_rows, treated_idx, drop = FALSE])) - control_mean <- mean(rowMeans(counts[cluster_rows, control_idx, drop = FALSE])) + treated_mean <- mean(rowMeans(counts_mat[cluster_rows, treated_idx, drop = FALSE])) + control_mean <- mean(rowMeans(counts_mat[cluster_rows, control_idx, drop = FALSE])) if (is.nan(treated_mean)) treated_mean <- 0 if (is.nan(control_mean)) control_mean <- 0 log2fc_cluster <- log2((treated_mean + 1e-6) / (control_mean + 1e-6)) - if (abs(log2fc_cluster) < opt$lfc) { + if (abs(log2fc_cluster) < lfc) { label <- "stable" } else if (log2fc_cluster > 0) { label <- "treated_enriched" } else { label <- "control_enriched" } - cluster_labels[cluster_ids == cid] <- label + labels[cluster_ids == cid] <- label } + labels } -results$cluster_label <- cluster_labels + +write_cluster_beds <- function(results_df, labels, suffix, model, manifest_df) { + if (is.null(labels)) { + return(manifest_df) + } + suffix_tag <- if (suffix == "") "" else paste0(".", suffix) + out_files <- list( + control_enriched = paste0("chipbinner.control_enriched", suffix_tag, ".bed"), + treated_enriched = paste0("chipbinner.treated_enriched", suffix_tag, ".bed"), + stable = paste0("chipbinner.stable", suffix_tag, ".bed"), + noise = paste0("chipbinner.noise", suffix_tag, ".bed") + ) + write.table(results_df[labels == "control_enriched", c("chr", "start", "end")], + out_files$control_enriched, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "treated_enriched", c("chr", "start", "end")], + out_files$treated_enriched, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "stable", c("chr", "start", "end")], + out_files$stable, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + write.table(results_df[labels == "noise", c("chr", "start", "end")], + out_files$noise, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + + for (label in names(out_files)) { + bed_file <- out_files[[label]] + if (file.exists(bed_file)) { + manifest_df <- rbind( + manifest_df, + data.frame( + model = model, + label = label, + bed = normalizePath(bed_file, mustWork = FALSE), + stringsAsFactors = FALSE + ) + ) + } + } + manifest_df +} + +clusters_best <- read_clusters(opt$clusters) +clusters_2 <- read_clusters(opt$clusters_2) +clusters_3 <- read_clusters(opt$clusters_3) + +cluster_ids_best <- cluster_ids_from_df(clusters_best, results) +cluster_labels_best <- compute_cluster_labels(cluster_ids_best, counts, conditions, opt$treated, opt$control, opt$lfc) +if (is.null(cluster_ids_best)) { + cluster_ids_best <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_best)) { + cluster_labels_best <- rep("NA", nrow(results)) +} + +results$cluster_id <- cluster_ids_best +results$cluster_label <- cluster_labels_best write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) @@ -115,14 +193,25 @@ down <- sig[sig$log2FC < 0, , drop = FALSE] write.table(up[, c("chr", "start", "end")], "chipbinner.significant_up.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) write.table(down[, c("chr", "start", "end")], "chipbinner.significant_down.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) -write.table(results[results$cluster_label == "control_enriched", c("chr", "start", "end")], - "chipbinner.control_enriched.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) -write.table(results[results$cluster_label == "treated_enriched", c("chr", "start", "end")], - "chipbinner.treated_enriched.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) -write.table(results[results$cluster_label == "stable", c("chr", "start", "end")], - "chipbinner.stable.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) -write.table(results[results$cluster_label == "noise", c("chr", "start", "end")], - "chipbinner.noise.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) +bed_manifest <- data.frame( + model = character(), + label = character(), + bed = character(), + stringsAsFactors = FALSE +) +bed_manifest <- write_cluster_beds(results, cluster_labels_best, "", "best", bed_manifest) + +cluster_labels_2 <- compute_cluster_labels(cluster_ids_from_df(clusters_2, results), counts, conditions, opt$treated, opt$control, opt$lfc) +if (!is.null(cluster_labels_2)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_2, "2clusters", "2clusters", bed_manifest) +} + +cluster_labels_3 <- compute_cluster_labels(cluster_ids_from_df(clusters_3, results), counts, conditions, opt$treated, opt$control, opt$lfc) +if (!is.null(cluster_labels_3)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_3, "3clusters", "3clusters", bed_manifest) +} + +write.table(bed_manifest, "chipbinner.cluster_beds.tsv", sep = "\t", quote = FALSE, row.names = FALSE) selected_row <- NULL if (!is.null(opt$grid_summary) && file.exists(opt$grid_summary)) { @@ -145,7 +234,7 @@ if (!is.null(opt$norm_info) && file.exists(opt$norm_info)) { use_spikein_flag <- ifelse(is.null(norm_info$use_spikein_scaling), "false", ifelse(norm_info$use_spikein_scaling, "true", "false")) -n_clusters <- length(unique(cluster_ids[!is.na(cluster_ids) & cluster_ids != -1])) +n_clusters <- length(unique(cluster_ids_best[!is.na(cluster_ids_best) & cluster_ids_best != -1])) min_cluster_size <- if (!is.null(selected_row) && "min_cluster_size" %in% colnames(selected_row)) selected_row$min_cluster_size else NA min_samples <- if (!is.null(selected_row) && "min_samples" %in% colnames(selected_row)) selected_row$min_samples else NA @@ -193,7 +282,7 @@ try({ treated_means <- rowMeans(counts[, conditions == opt$treated, drop = FALSE]) control_means <- rowMeans(counts[, conditions == opt$control, drop = FALSE]) pdf("plots/density_scatter.pdf") - cluster_factor <- as.factor(cluster_labels) + cluster_factor <- as.factor(cluster_labels_best) cols <- as.numeric(cluster_factor) plot(control_means, treated_means, pch = 16, cex = 0.5, col = cols, xlab = "Control mean", ylab = "Treated mean", main = "Density scatter") diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py index dcbb4fc4..6d936cf1 100755 --- a/bin/hdbscan_grid.py +++ b/bin/hdbscan_grid.py @@ -81,8 +81,8 @@ def write_summary(path, rows): "min_samples", "n_clusters", "frac_assigned_non_noise", - "mean_persistence", - "replicate_consistency", + "cluster_stability_metric", + "replicate_consistency_metric", "score_total", "selected", "selected_2clusters", @@ -123,6 +123,8 @@ def main(): args = parser.parse_args() coord_header, sample_ids, coords, matrix = load_matrix(args.matrix) + if matrix.size: + matrix = np.log2(np.maximum(matrix, 1e-6)) conditions = load_samplesheet(args.samplesheet, sample_ids) if args.samplesheet else [] try: @@ -135,8 +137,8 @@ def main(): "min_samples": 0, "n_clusters": 1, "frac_assigned_non_noise": 1.0, - "mean_persistence": 1.0, - "replicate_consistency": 0.0, + "cluster_stability_metric": 1.0, + "replicate_consistency_metric": 0.0, "score_total": 1.0, "selected": True, "selected_2clusters": False, @@ -161,25 +163,25 @@ def main(): labels = clusterer.fit_predict(matrix) if matrix.size else np.array([]) n_clusters = len(set(labels)) - (1 if -1 in labels else 0) fraction_assigned = float(np.sum(labels >= 0)) / float(len(labels)) if len(labels) else 0.0 - mean_persistence = float(np.mean(clusterer.cluster_persistence_)) if getattr(clusterer, 'cluster_persistence_', None) is not None and len(clusterer.cluster_persistence_) > 0 else 0.0 + cluster_stability = float(np.mean(clusterer.cluster_persistence_)) if getattr(clusterer, 'cluster_persistence_', None) is not None and len(clusterer.cluster_persistence_) > 0 else 0.0 rep_consistency = replicate_consistency(matrix, labels, conditions) if len(labels) else 0.0 - score_total = float(fraction_assigned + mean_persistence + rep_consistency) + score_total = float(fraction_assigned + cluster_stability + rep_consistency) labels_by_params[(mcs, ms)] = labels results.append({ "min_cluster_size": mcs, "min_samples": ms, "n_clusters": n_clusters, "frac_assigned_non_noise": round(fraction_assigned, 4), - "mean_persistence": round(mean_persistence, 4), - "replicate_consistency": round(rep_consistency, 4), + "cluster_stability_metric": round(cluster_stability, 4), + "replicate_consistency_metric": round(rep_consistency, 4), "score_total": round(score_total, 4), "selected": False, "selected_2clusters": False, "selected_3clusters": False, }) - # Rank by cluster count first, then overall score, fraction assigned, and persistence. - rank_key = lambda row: (row["n_clusters"], row["score_total"], row["frac_assigned_non_noise"], row["mean_persistence"]) + # Rank by cluster count first, then overall score, fraction assigned, and stability. + rank_key = lambda row: (row["n_clusters"], row["score_total"], row["frac_assigned_non_noise"], row["cluster_stability_metric"]) best = max(results, key=rank_key) best_2 = None best_3 = None diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf index 795ffe6b..a120a9ae 100644 --- a/modules/local/chipbinner_counts.nf +++ b/modules/local/chipbinner_counts.nf @@ -8,7 +8,7 @@ process CHIPBINNER_COUNTS { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11" + conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11 conda-forge::pyyaml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" @@ -98,49 +98,22 @@ process CHIPBINNER_COUNTS { handle.write(line + "\\n") def parse_ms_coeffs_file(path): + try: + import yaml + except Exception as exc: + raise ValueError(f"PyYAML is required to parse MS coefficients: {exc}") with open(path) as handle: - raw_lines = handle.read().splitlines() - lines = [] - for raw in raw_lines: - line = raw.split("#", 1)[0].rstrip() - if line.strip(): - lines.append(line) - if not lines: + data = yaml.safe_load(handle) + if data is None: return {} - # Detect list of dicts - entries = [] - current = None - in_samples_block = False - coeffs_map = {} - for raw in lines: - stripped = raw.lstrip() - if stripped.startswith("samples:"): - in_samples_block = True - continue - if stripped.startswith("-"): - if current: - entries.append(current) - current = {} - item = stripped[1:].strip() - if item and ":" in item: - key, value = item.split(":", 1) - current[key.strip()] = value.strip() - continue - if ":" in stripped: - key, value = stripped.split(":", 1) - key = key.strip() - value = value.strip() - if in_samples_block or (current is not None and raw.startswith(" ")): - if current is None: - current = {} - current[key] = value - else: - coeffs_map[key] = value - if current: - entries.append(current) - if entries: + + def parse_entries(entries): coeffs = {} + if not isinstance(entries, list): + raise ValueError("MS coefficients 'samples' must be a list") for entry in entries: + if not isinstance(entry, dict): + raise ValueError("MS coefficients entry must be a mapping") sample_id = entry.get("sample_id") or entry.get("sample") if not sample_id: raise ValueError("MS coefficients entry missing sample_id") @@ -149,8 +122,18 @@ process CHIPBINNER_COUNTS { raise ValueError(f"MS coefficients entry missing coefficient for {sample_id}") coeffs[str(sample_id)] = float(value) return coeffs - if coeffs_map: - return {str(key): float(value) for key, value in coeffs_map.items()} + + if isinstance(data, list): + return parse_entries(data) + if isinstance(data, dict): + if "samples" in data: + samples = data["samples"] + if isinstance(samples, list): + return parse_entries(samples) + if isinstance(samples, dict): + return {str(key): float(value) for key, value in samples.items()} + raise ValueError("MS coefficients 'samples' must be a list or mapping") + return {str(key): float(value) for key, value in data.items()} raise ValueError("Unsupported MS coefficients YAML structure") ms_coeffs = {} diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf index 305aae3b..75c9edfc 100644 --- a/modules/local/chipbinner_rots.nf +++ b/modules/local/chipbinner_rots.nf @@ -23,6 +23,8 @@ process CHIPBINNER_ROTS { input: path matrix path clusters + path clusters_2 + path clusters_3 path grid_summary path norm_info val samples_json @@ -44,6 +46,8 @@ process CHIPBINNER_ROTS { tuple val(group), path("chipbinner.treated_enriched.bed") , emit: treated_enriched tuple val(group), path("chipbinner.stable.bed") , emit: stable tuple val(group), path("chipbinner.noise.bed") , emit: noise + path "chipbinner.*clusters*.bed" , optional: true, emit: cluster_beds_extra + tuple val(group), path("chipbinner.cluster_beds.tsv") , emit: cluster_beds tuple val(group), path("chipbinner.summary.tsv") , emit: summary tuple val(group), path("plots") , emit: plots path "versions.yml" , emit: versions @@ -68,6 +72,8 @@ process CHIPBINNER_ROTS { Rscript ${projectDir}/bin/chipbinner_rots.R \ --matrix ${matrix} \ --clusters ${clusters} \ + --clusters_2 ${clusters_2} \ + --clusters_3 ${clusters_3} \ --grid_summary ${grid_summary} \ --norm_info ${norm_info} \ --samplesheet chipbinner.samplesheet.csv \ diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index a78fc3da..dc6cd34a 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -452,6 +452,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { CHIPBINNER_ROTS( CHIPBINNER_COUNTS.out.normalized, CHIPBINNER_HDBSCAN_GRID.out.clusters, + CHIPBINNER_HDBSCAN_GRID.out.clusters_2, + CHIPBINNER_HDBSCAN_GRID.out.clusters_3, CHIPBINNER_HDBSCAN_GRID.out.grid_summary, CHIPBINNER_COUNTS.out.norm_info, ch_chipbinner_inputs.map { bins, samples_json, bams, bais, control_bams, control_bais, control_conditions_json, use_input_group, group -> samples_json }, @@ -470,12 +472,27 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_chipbinner_down = CHIPBINNER_ROTS.out.down if (lola_should_run) { - ch_lola_beds = Channel.empty() - ch_lola_beds = ch_lola_beds.mix( - CHIPBINNER_ROTS.out.control_enriched.map { group, file -> [group, 'control_enriched', file] }, - CHIPBINNER_ROTS.out.treated_enriched.map { group, file -> [group, 'treated_enriched', file] }, - CHIPBINNER_ROTS.out.stable.map { group, file -> [group, 'stable', file] } - ) + ch_lola_beds = CHIPBINNER_ROTS.out.cluster_beds + .flatMap { group, manifest -> + def lines = manifest.text.readLines() + if (lines.size() <= 1) { + return [] + } + lines.drop(1).collect { line -> + def parts = line.split('\\t', -1) + if (parts.size() < 3) { + return null + } + def model = parts[0] + def label = parts[1] + def bed_path = parts[2] + if (label == 'noise') { + return null + } + def lola_label = (model == 'best' || model == '') ? label : "${label}.${model}" + [group, lola_label, file(bed_path)] + }.findAll { it != null } + } ch_lola_inputs = CHIPBINNER_BINS.out.bins.join(ch_lola_beds) .map { group, bins_file, label, bed_file -> [group, label, bed_file, bins_file] } From 2766d08a54f772c2d3f93d49ff03245d1a64edec Mon Sep 17 00:00:00 2001 From: dhusmann Date: Mon, 5 Jan 2026 01:59:37 -0800 Subject: [PATCH 22/31] Refine chipbinner ranking and enrichment outputs --- bin/chipbinner_rots.R | 38 +++++++++++++++---- bin/hdbscan_grid.py | 4 +- modules/local/chipbinner_lola.nf | 14 +++---- .../local/differential_peak_calling.nf | 10 ++--- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index a56334ac..0bfd1b07 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -159,7 +159,7 @@ write_cluster_beds <- function(results_df, labels, suffix, model, manifest_df) { data.frame( model = model, label = label, - bed = normalizePath(bed_file, mustWork = FALSE), + bed = bed_file, stringsAsFactors = FALSE ) ) @@ -181,8 +181,34 @@ if (is.null(cluster_labels_best)) { cluster_labels_best <- rep("NA", nrow(results)) } +cluster_ids_2_raw <- cluster_ids_from_df(clusters_2, results) +cluster_labels_2_raw <- compute_cluster_labels(cluster_ids_2_raw, counts, conditions, opt$treated, opt$control, opt$lfc) +cluster_ids_2 <- cluster_ids_2_raw +cluster_labels_2 <- cluster_labels_2_raw +if (is.null(cluster_ids_2)) { + cluster_ids_2 <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_2)) { + cluster_labels_2 <- rep("NA", nrow(results)) +} + +cluster_ids_3_raw <- cluster_ids_from_df(clusters_3, results) +cluster_labels_3_raw <- compute_cluster_labels(cluster_ids_3_raw, counts, conditions, opt$treated, opt$control, opt$lfc) +cluster_ids_3 <- cluster_ids_3_raw +cluster_labels_3 <- cluster_labels_3_raw +if (is.null(cluster_ids_3)) { + cluster_ids_3 <- rep(NA, nrow(results)) +} +if (is.null(cluster_labels_3)) { + cluster_labels_3 <- rep("NA", nrow(results)) +} + results$cluster_id <- cluster_ids_best results$cluster_label <- cluster_labels_best +results$cluster_id_2clusters <- cluster_ids_2 +results$cluster_label_2clusters <- cluster_labels_2 +results$cluster_id_3clusters <- cluster_ids_3 +results$cluster_label_3clusters <- cluster_labels_3 write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) @@ -201,14 +227,12 @@ bed_manifest <- data.frame( ) bed_manifest <- write_cluster_beds(results, cluster_labels_best, "", "best", bed_manifest) -cluster_labels_2 <- compute_cluster_labels(cluster_ids_from_df(clusters_2, results), counts, conditions, opt$treated, opt$control, opt$lfc) -if (!is.null(cluster_labels_2)) { - bed_manifest <- write_cluster_beds(results, cluster_labels_2, "2clusters", "2clusters", bed_manifest) +if (!is.null(cluster_labels_2_raw)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_2_raw, "2clusters", "2clusters", bed_manifest) } -cluster_labels_3 <- compute_cluster_labels(cluster_ids_from_df(clusters_3, results), counts, conditions, opt$treated, opt$control, opt$lfc) -if (!is.null(cluster_labels_3)) { - bed_manifest <- write_cluster_beds(results, cluster_labels_3, "3clusters", "3clusters", bed_manifest) +if (!is.null(cluster_labels_3_raw)) { + bed_manifest <- write_cluster_beds(results, cluster_labels_3_raw, "3clusters", "3clusters", bed_manifest) } write.table(bed_manifest, "chipbinner.cluster_beds.tsv", sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/bin/hdbscan_grid.py b/bin/hdbscan_grid.py index 6d936cf1..a304e1bd 100755 --- a/bin/hdbscan_grid.py +++ b/bin/hdbscan_grid.py @@ -180,8 +180,8 @@ def main(): "selected_3clusters": False, }) - # Rank by cluster count first, then overall score, fraction assigned, and stability. - rank_key = lambda row: (row["n_clusters"], row["score_total"], row["frac_assigned_non_noise"], row["cluster_stability_metric"]) + # Rank by overall score first, then cluster count, fraction assigned, and stability. + rank_key = lambda row: (row["score_total"], row["n_clusters"], row["frac_assigned_non_noise"], row["cluster_stability_metric"]) best = max(results, key=rank_key) best_2 = None best_3 = None diff --git a/modules/local/chipbinner_lola.nf b/modules/local/chipbinner_lola.nf index 58212ba2..2b761470 100644 --- a/modules/local/chipbinner_lola.nf +++ b/modules/local/chipbinner_lola.nf @@ -1,9 +1,9 @@ process CHIPBINNER_LOLA { - tag "${group}.${label}" + tag "${group}.${model}.${label}" label 'process_medium' publishDir = [ - path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/enrichment" }, + path: { "${params.outdir}/03_peak_calling/06_differential/02_chipbinner/${group}/enrichment${model && model != 'best' ? '/' + model : ''}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -14,13 +14,13 @@ process CHIPBINNER_LOLA { 'biocontainers/bioconductor-lola:1.28.0--r42hdfd78af_0' }" input: - tuple val(group), val(label), path(bed) + tuple val(group), val(label), val(model), path(bed) path universe path lola_db output: - tuple val(group), val(label), path("${label}.lola.tsv"), emit: tsv - tuple val(group), val(label), path("${label}.lola_plot.pdf"), optional: true, emit: plot + tuple val(group), val(label), val(model), path("${label}_lola.tsv"), emit: tsv + tuple val(group), val(label), val(model), path("${label}_lola_plot.pdf"), optional: true, emit: plot path "versions.yml", emit: versions when: @@ -33,8 +33,8 @@ process CHIPBINNER_LOLA { --universe ${universe} \ --db ${lola_db} \ --label ${label} \ - --out_tsv ${label}.lola.tsv \ - --out_pdf ${label}.lola_plot.pdf + --out_tsv ${label}_lola.tsv \ + --out_pdf ${label}_lola_plot.pdf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index dc6cd34a..3af73e9f 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -489,17 +489,17 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (label == 'noise') { return null } - def lola_label = (model == 'best' || model == '') ? label : "${label}.${model}" - [group, lola_label, file(bed_path)] + def bed_file = manifest.parent ? new File(manifest.parent, bed_path) : new File(bed_path) + [group, label, model, file(bed_file.toString())] }.findAll { it != null } } ch_lola_inputs = CHIPBINNER_BINS.out.bins.join(ch_lola_beds) - .map { group, bins_file, label, bed_file -> [group, label, bed_file, bins_file] } + .map { group, bins_file, label, model, bed_file -> [group, label, model, bed_file, bins_file] } CHIPBINNER_LOLA( - ch_lola_inputs.map { group, label, bed_file, bins_file -> [group, label, bed_file] }, - ch_lola_inputs.map { group, label, bed_file, bins_file -> bins_file }, + ch_lola_inputs.map { group, label, model, bed_file, bins_file -> [group, label, model, bed_file] }, + ch_lola_inputs.map { group, label, model, bed_file, bins_file -> bins_file }, Channel.value(lola_db_path) ) ch_versions = ch_versions.mix(CHIPBINNER_LOLA.out.versions) From 19a7a1ea927366b41bfde07e857bd7d2772d6954 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:28:03 -0800 Subject: [PATCH 23/31] Fix pooled control BAI lookup and rots fallback --- bin/chipbinner_rots.R | 17 +++++++++---- modules/local/span_compare.nf | 6 ++--- .../local/differential_peak_calling.nf | 25 ++++++++----------- workflows/differential_only.nf | 2 +- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index 0bfd1b07..30ccae05 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -30,6 +30,8 @@ samples_df <- read.csv(opt$samplesheet, stringsAsFactors = FALSE) sample_ids <- samples_df$sample_id conditions <- samples_df$condition +treated_idx <- conditions == opt$treated +control_idx <- conditions == opt$control if (!all(sample_ids %in% colnames(matrix_df))) { stop("Matrix columns do not include all sample IDs") @@ -39,8 +41,8 @@ counts <- as.matrix(matrix_df[, sample_ids, drop = FALSE]) rownames(counts) <- paste(matrix_df$chr, matrix_df$start, matrix_df$end, sep = ":") # Differential testing -log2fc <- log2(rowMeans(counts[, conditions == opt$treated, drop = FALSE]) / - rowMeans(counts[, conditions == opt$control, drop = FALSE])) +log2fc <- log2(rowMeans(counts[, treated_idx, drop = FALSE]) / + rowMeans(counts[, control_idx, drop = FALSE])) use_rots <- TRUE tryCatch({ @@ -54,9 +56,14 @@ if (use_rots) { rots_res <- ROTS::ROTS(counts, groups = groups, B = opt$bootstrap, K = opt$k_value) pvals <- rots_res$pvalue } else { - pvals <- apply(counts, 1, function(x) { - t.test(x[conditions == opt$treated], x[conditions == opt$control])$p.value - }) + if (sum(treated_idx) < 2 || sum(control_idx) < 2) { + warning("ROTS unavailable and <2 samples per condition; assigning p-values of 1") + pvals <- rep(1, nrow(counts)) + } else { + pvals <- apply(counts, 1, function(x) { + t.test(x[treated_idx], x[control_idx])$p.value + }) + } } fdr <- p.adjust(pvals, method = "BH") diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index 9717d9bd..2dbcfdf7 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -91,7 +91,7 @@ process SPAN_COMPARE { cp "\$diff_file" span.differential.tsv cat <<'AWK' > span_compare_bed.awk - function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3 } /^#/ || /^track/ || /^browser/ { next } !header_done { @@ -115,7 +115,7 @@ process SPAN_COMPARE { : > span.significant_up.bed : > span.significant_down.bed cat <<'AWK' > span_compare_filter.awk - function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } + function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } function calc_fdr(val) { if (!isnum(val)) return 1 if (val <= 1) return val + 0 @@ -173,7 +173,7 @@ process SPAN_COMPARE { cat span.significant_up.bed span.significant_down.bed | awk 'NF' | sort -k1,1 -k2,2n | uniq > span.significant.bed - total=\$(awk 'function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?$/) } /^#/ || /^track/ || /^browser/ { next } !header_seen { if (!isnum(\$2)) { header_seen=1; next } header_seen=1 } { n++ } END { print n+0 }' "\$diff_file") + total=\$(awk 'function isnum(x) { return (x ~ /^-?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?\$/) } /^#/ || /^track/ || /^browser/ { next } !header_seen { if (!isnum(\$2)) { header_seen=1; next } header_seen=1 } { n++ } END { print n+0 }' "\$diff_file") n_up=\$(wc -l < span.significant_up.bed | awk '{print \$1}') n_down=\$(wc -l < span.significant_down.bed | awk '{print \$1}') n_fdr_pass=\$((n_up + n_down)) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 3af73e9f..b25da691 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -2,9 +2,6 @@ * Differential peak calling / enrichment */ -import groovy.json.JsonOutput -import groovy.json.JsonSlurper - include { WRITE_DIFFERENTIAL_MANIFESTS } from '../../modules/local/write_differential_manifests' include { MAKE_DIFFBIND_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' include { MAKE_DIFFBIND_SAMPLESHEET as MAKE_SPAN_SAMPLESHEET } from '../../modules/local/diffbind_samplesheet' @@ -136,7 +133,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ] } .toList() - .map { list -> JsonOutput.toJson(list) } + .map { list -> groovy.json.JsonOutput.toJson(list) } ch_peaks_manifest = ch_peaks .filter { meta, peaks -> meta.is_control == false } @@ -164,7 +161,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ] } .toList() - .map { list -> JsonOutput.toJson(list) } + .map { list -> groovy.json.JsonOutput.toJson(list) } def run_meta = [ pipeline: workflow.manifest.name, @@ -176,7 +173,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { chrom_sizes: "${params.outdir}/03_peak_calling/06_differential/00_manifests/chrom_sizes.sizes" ] - ch_run_meta = Channel.value(JsonOutput.toJson(run_meta)) + ch_run_meta = Channel.value(groovy.json.JsonOutput.toJson(run_meta)) ch_outdir = Channel.value(params.outdir) WRITE_DIFFERENTIAL_MANIFESTS( @@ -279,7 +276,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) ch_diffbind_valid - .map { rec -> [JsonOutput.toJson(rec.samples), rec.group, rec.caller] } + .map { rec -> [groovy.json.JsonOutput.toJson(rec.samples), rec.group, rec.caller] } .set { ch_diffbind_samples } MAKE_DIFFBIND_SAMPLESHEET( @@ -401,9 +398,9 @@ workflow DIFFERENTIAL_PEAK_CALLING { def control_bams = use_input_group ? required_conditions.collect { control_by_group[it][0] } : bams def control_bais = use_input_group ? required_conditions.collect { control_by_group[it][1] } : bais - def control_conditions_json = use_input_group ? JsonOutput.toJson(required_conditions) : JsonOutput.toJson([]) + def control_conditions_json = use_input_group ? groovy.json.JsonOutput.toJson(required_conditions) : groovy.json.JsonOutput.toJson([]) - [rec.group, JsonOutput.toJson(samples), bams, bais, control_bams, control_bais, control_conditions_json, use_input_group] + [rec.group, groovy.json.JsonOutput.toJson(samples), bams, bais, control_bams, control_bais, control_conditions_json, use_input_group] } ch_chipbinner_inputs = CHIPBINNER_BINS.out.bins.join(ch_chipbinner_samples) @@ -544,7 +541,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_versions = ch_versions.mix(SPAN_CAPABILITY_PROBE.out.versions) ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities - .map { file -> new JsonSlurper().parse(file) } + .map { file -> new groovy.json.JsonSlurper().parse(file) } ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } ch_span_multibam = ch_span_caps.map { it.compare_multi_bam ?: false } ch_span_multibam_mode = ch_span_caps.map { it.compare_multi_bam_mode ?: 'single' } @@ -741,7 +738,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ) MAKE_SPAN_SAMPLESHEET( - ch_span_valid.map { rec -> JsonOutput.toJson(rec.samples) }, + ch_span_valid.map { rec -> groovy.json.JsonOutput.toJson(rec.samples) }, ch_span_valid.map { rec -> rec.group }, Channel.value('span_fallback') ) @@ -787,8 +784,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { .map { rec -> [rec.group, rec] } .groupTuple() .map { group, recs -> - def native = recs.find { it.span_mode == 'native' } - native ?: recs[0] + def native_record = recs.find { it.span_mode == 'native' } + native_record ?: recs[0] } method_beds = Channel.empty() @@ -832,7 +829,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { if (params.differential_run_multiqc) { DIFFERENTIAL_SUMMARY_MERGE( ch_summary_files.collect().ifEmpty([]), - ch_skipped_records.toList().map { list -> JsonOutput.toJson(list) }, + ch_skipped_records.toList().map { list -> groovy.json.JsonOutput.toJson(list) }, Channel.value(file("${projectDir}/assets/multiqc/differential_summary_header.txt")), Channel.value(file("${projectDir}/assets/multiqc/differential_skipped_header.txt")) ) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index bc49fc58..b7d12278 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -76,7 +76,7 @@ workflow DIFFERENTIAL_ONLY { def group = parts.size() > 1 ? parts[0..-2].join('_') : base def bai = file("${bam}.bai") if (!bai.exists()) { - bai = file("${bam.baseName}.bai") + bai = file("${bam.parent}/${bam.baseName}.bai") } if (!bai.exists()) { log.warn "Missing BAI for pooled control ${bam}" From 88b127ff0e72b63d8b0d71fa6e6c265775e63cd9 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:36:05 -0800 Subject: [PATCH 24/31] Handle NA control_group in differential-only --- workflows/differential_only.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index b7d12278..b4954b95 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -39,13 +39,14 @@ workflow DIFFERENTIAL_ONLY { ch_samples = Channel.fromPath(samples_path).splitCsv(header: true, sep: '\t') .map { row -> + def control_group = (row.control_group && row.control_group != 'NA') ? row.control_group : row.group def meta = [ id: row.sample_id, sample_id: row.sample_id, group: row.group, condition: row.condition, replicate: row.replicate.toInteger(), - control_group: row.control_group ?: row.group, + control_group: control_group, control_condition: row.control_condition ?: 'NA', is_control: false, normalisation_mode: row.normalisation_mode ?: params.normalisation_mode From d3f1cddf66bef550af70cc939f1fff4142c98745 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 21:50:04 -0800 Subject: [PATCH 25/31] Use known conditions for pooled control parsing --- workflows/differential_only.nf | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/workflows/differential_only.nf b/workflows/differential_only.nf index b4954b95..4177c42b 100644 --- a/workflows/differential_only.nf +++ b/workflows/differential_only.nf @@ -36,6 +36,27 @@ workflow DIFFERENTIAL_ONLY { def meta = new JsonSlurper().parse(run_meta_path) callers_list = meta?.callers ?: [] } + def known_conditions = [] + samples_path.eachLine { line, idx -> + if (idx == 1) { + def headers = line.split('\t', -1) + def cond_idx = headers.findIndexOf { it == 'condition' } + if (cond_idx >= 0) { + samples_path.eachLine { row, row_idx -> + if (row_idx == 1) return + if (!row.trim()) return + def fields = row.split('\t', -1) + if (fields.size() > cond_idx) { + def value = fields[cond_idx] + if (value && value != 'NA') { + known_conditions << value + } + } + } + } + } + } + known_conditions = known_conditions.unique() ch_samples = Channel.fromPath(samples_path).splitCsv(header: true, sep: '\t') .map { row -> @@ -72,9 +93,16 @@ workflow DIFFERENTIAL_ONLY { ch_control_bam_bai = Channel.fromPath("${pooled_controls_dir}/*.bam") .map { bam -> def base = bam.baseName - def parts = base.tokenize('_') - def condition = parts.size() > 1 ? parts[-1] : 'NA' - def group = parts.size() > 1 ? parts[0..-2].join('_') : base + def matched = known_conditions.findAll { base.endsWith("_${it}") } + def condition = matched ? matched.max { it.size() } : null + def group = base + if (condition) { + group = base[0..-(condition.size() + 2)] + } else { + def parts = base.tokenize('_') + condition = parts.size() > 1 ? parts[-1] : 'NA' + group = parts.size() > 1 ? parts[0..-2].join('_') : base + } def bai = file("${bam}.bai") if (!bai.exists()) { bai = file("${bam.parent}/${bam.baseName}.bai") From ae4c3e517c9de5ee4671bb3dc031d21e1da293b5 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Thu, 8 Jan 2026 22:05:38 -0800 Subject: [PATCH 26/31] Disambiguate differential summary staging --- modules/local/differential_summary.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index 33d2bd1b..a84207fe 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -12,7 +12,12 @@ process DIFFERENTIAL_SUMMARY_MERGE { container "quay.io/biocontainers/python:3.8.3" input: - path summary_files, stageAs: { file -> "summaries/${file.parent.name}-${file.name}" } + path summary_files, stageAs: { file -> + def caller = file.parent?.parent?.name ?: file.parent?.name ?: 'summary' + def group = file.parent?.name ?: 'group' + def prefix = caller == group ? group : "${caller}-${group}" + "summaries/${prefix}-${file.name}" + } val skipped_json path summary_header path skipped_header From efbab9d3fe1e7e7721758eb5f6ed21226012f385 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Sat, 10 Jan 2026 02:52:14 -0800 Subject: [PATCH 27/31] Fix differential pytest failures and update history --- bin/chipbinner_rots.R | 5 +++ conf/modules.config | 7 ++++ docs/test_histor/differential_peak_calling.md | 15 ++++++++ .../test_history/differential_peak_calling.md | 36 +++++++++++++++++++ modules/local/annotate_regions.nf | 9 ++--- modules/local/chipbinner_counts.nf | 16 ++++----- modules/local/chipbinner_hdbscan_grid.nf | 4 +-- modules/local/chipbinner_rots.nf | 4 +-- modules/local/differential_summary.nf | 12 ++----- modules/local/span_compare.nf | 32 ++++++++--------- .../local/differential_peak_calling.nf | 4 +-- tests/config/differential.config | 2 ++ 12 files changed, 101 insertions(+), 45 deletions(-) create mode 100644 docs/test_histor/differential_peak_calling.md create mode 100644 docs/test_history/differential_peak_calling.md diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index 30ccae05..79e122bf 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -51,6 +51,11 @@ tryCatch({ use_rots <<- FALSE }) +if (use_rots && (sum(treated_idx) < 2 || sum(control_idx) < 2)) { + warning("ROTS requires >=2 samples per condition; falling back to simple tests") + use_rots <- FALSE +} + if (use_rots) { groups <- ifelse(conditions == opt$treated, 1, 0) rots_res <- ROTS::ROTS(counts, groups = groups, B = opt$bootstrap, K = opt$k_value) diff --git a/conf/modules.config b/conf/modules.config index e3421104..e95da593 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1195,6 +1195,7 @@ process { mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + container = '' } withName: '.*CHIPBINNER_HDBSCAN_GRID' { @@ -1203,6 +1204,7 @@ process { mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + container = '' } withName: '.*CHIPBINNER_ROTS' { @@ -1218,6 +1220,7 @@ process { return filename } ] + container = '' } withName: '.*SPAN_CAPABILITY_PROBE' { @@ -1244,6 +1247,10 @@ process { ] } + withName: '.*ANNOTATE_REGIONS' { + container = '' + } + withName: '.*DIFFERENTIAL_OVERLAP' { publishDir = [ path: { "${params.outdir}/03_peak_calling/06_differential/04_cross_comparison" }, diff --git a/docs/test_histor/differential_peak_calling.md b/docs/test_histor/differential_peak_calling.md new file mode 100644 index 00000000..c091e664 --- /dev/null +++ b/docs/test_histor/differential_peak_calling.md @@ -0,0 +1,15 @@ +# Differential peak calling test history (local notes) + +## 2026-01-10 — pytest failures resolved (current context) + +- **ROTS crash with <2 replicates** (`rowSums(is.na(...))` error in `test_differential_chipbinner`). + **Fix:** disable ROTS when a condition has <2 samples and fall back to simple tests (or p=1) in `bin/chipbinner_rots.R`. + +- **`DIFFERENTIAL_SUMMARY_MERGE` unbound variable** due to `$` in staged summary filename. + **Fix:** stage summaries with `stageAs: 'summaries/*'` and build `summary_files.list` via `find summaries -type f` in `modules/local/differential_summary.nf`. + +- **`ANNOTATE_REGIONS` missing python in container** (command not found). + **Fix:** disable container for `ANNOTATE_REGIONS` so conda provides python (`conf/modules.config`). + +- **`ANNOTATE_REGIONS` publishDir error** (`Unexpected path value` from publishDir closure). + **Fix:** replace publishDir closure with a static map + `enabled` closure in `modules/local/annotate_regions.nf`. diff --git a/docs/test_history/differential_peak_calling.md b/docs/test_history/differential_peak_calling.md new file mode 100644 index 00000000..1b8ac4d6 --- /dev/null +++ b/docs/test_history/differential_peak_calling.md @@ -0,0 +1,36 @@ +# Differential peak calling test history + +## 2026-01-09 — full pytest suite (Slurm) + +- **Failure:** `LexerNoViableAltException` / Groovy parsing errors during `span_compare` (seen in Nextflow stderr). + **Cause:** unescaped `$` tokens in AWK and regex anchors inside `modules/local/span_compare.nf`. + **Fix:** escape `$` in AWK fields and regex anchors (e.g., `\$2`, `\$i`, `/pattern\\$/`). + +- **Failure:** Nextflow could not start because `java` was not found in the pytest job environment. + **Fix:** set `JAVA_HOME`, `NXF_JAVA_HOME`, and `JAVA_CMD` to `/share/software/user/open/java/17.0.4` in the pytest script. + +- **Failure:** pytest workflow missing `PROFILE` environment variable, causing Nextflow config/profile resolution to fail in tests. + **Fix:** export `PROFILE=singularity` in the pytest script. + +- **Failure:** Nextflow picked up `~/.nextflow/config` with `process.executor=slurm`, leading to nested Slurm execution during pytest runs. + **Fix:** set `NXF_HOME` to a scratch directory and `NXF_EXECUTOR=local` in the pytest script to isolate from user config and force local executor. + +- **Failure:** `CHIPBINNER_COUNTS` input file name collision (same BAM/BAI filenames for target and control inputs) in `test_differential_chipbinner`. + **Fix:** stage control BAM/BAI into a dedicated subdirectory via `stageAs: { "controls/${it.getName()}" }` in `modules/local/chipbinner_counts.nf`. + +## 2026-01-10 — differential tag + full pytest suite (Slurm) + +- **Failure:** `ROTS` error (`rowSums(is.na(...))` expects matrix) when a condition has <2 replicates in `test_differential_chipbinner` (job `13175581`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_210345`). + **Fix:** disable ROTS when <2 samples per condition and fall back to simple tests (or p=1 if <2) in `bin/chipbinner_rots.R`. + +- **Failure:** `DIFFERENTIAL_SUMMARY_MERGE` unbound variable due to `$` in staged summary filename (job `13175959`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_212945`). + **Fix:** stage summaries with `stageAs: 'summaries/*'` and build `summary_files.list` via `find summaries -type f` in `modules/local/differential_summary.nf`. + +- **Failure:** `ANNOTATE_REGIONS` container missing `python` (job `13177002`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_222413`). + **Fix:** disable container for `ANNOTATE_REGIONS` so conda provides python (`conf/modules.config`). + +- **Failure:** `ANNOTATE_REGIONS` publishDir error (`Unexpected path value` from publishDir closure) (job `13178611`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_225233`). + **Fix:** replace publishDir closure with a static map + `enabled` closure in `modules/local/annotate_regions.nf`. + +- **Pass:** `test_differential_chipbinner` tag rerun succeeded (job `13179209`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_231631`). +- **Pass:** full pytest suite succeeded (job `13179771`, run dir `/scratch/users/dhusmann/runs/pytest-suite/20260109_234549`). diff --git a/modules/local/annotate_regions.nf b/modules/local/annotate_regions.nf index 5bee90ed..ca944ea7 100644 --- a/modules/local/annotate_regions.nf +++ b/modules/local/annotate_regions.nf @@ -2,11 +2,12 @@ process ANNOTATE_REGIONS { tag "${results_tsv.simpleName}" label 'process_medium' - publishDir = { (meta?.publish_dir ?: task.ext.publish_dir) ? [ - path: meta?.publish_dir ?: task.ext.publish_dir, + publishDir = [ + path: { meta?.publish_dir ?: task.ext.publish_dir }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] : null } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: { meta?.publish_dir ?: task.ext.publish_dir ? true : false } + ] conda "bioconda::bedtools=2.31.1 bioconda::bedops=2.4.41 conda-forge::python=3.11 conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/chipbinner_counts.nf b/modules/local/chipbinner_counts.nf index a120a9ae..1e32496e 100644 --- a/modules/local/chipbinner_counts.nf +++ b/modules/local/chipbinner_counts.nf @@ -9,9 +9,7 @@ process CHIPBINNER_COUNTS { ] conda "bioconda::bedtools=2.31.1 conda-forge::python=3.11 conda-forge::pyyaml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : - 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + container 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' input: path bins @@ -38,12 +36,12 @@ process CHIPBINNER_COUNTS { script: """\ - printf "%s\n" ${bams} > bam_paths.txt - printf "%s\n" ${bais} > bai_paths.txt - printf "%s\n" ${control_bams} > control_bam_paths.txt - printf "%s\n" ${control_bais} > control_bai_paths.txt + printf "%s\\n" ${bams} > bam_paths.txt + printf "%s\\n" ${bais} > bai_paths.txt + printf "%s\\n" ${control_bams} > control_bam_paths.txt + printf "%s\\n" ${control_bais} > control_bai_paths.txt - python - <<'PY' + python3 - <<'PY' import json import subprocess import sys @@ -252,7 +250,7 @@ process CHIPBINNER_COUNTS { cat <<-END_VERSIONS > versions.yml "${task.process}": bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") - python: \$(python --version | awk '{print \$2}') + python: \$(python3 --version | awk '{print \$2}') END_VERSIONS """.stripIndent() } diff --git a/modules/local/chipbinner_hdbscan_grid.nf b/modules/local/chipbinner_hdbscan_grid.nf index ea9bfb6f..2b9abd89 100644 --- a/modules/local/chipbinner_hdbscan_grid.nf +++ b/modules/local/chipbinner_hdbscan_grid.nf @@ -9,9 +9,7 @@ process CHIPBINNER_HDBSCAN_GRID { ] conda "conda-forge::python=3.11 conda-forge::numpy conda-forge::pandas conda-forge::scikit-learn conda-forge::hdbscan" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/hdbscan:0.8.33--pyhdfd78af_0' : - 'biocontainers/hdbscan:0.8.33--pyhdfd78af_0' }" + container 'biocontainers/hdbscan:0.8.33--pyhdfd78af_0' input: path matrix diff --git a/modules/local/chipbinner_rots.nf b/modules/local/chipbinner_rots.nf index 75c9edfc..9afaf89d 100644 --- a/modules/local/chipbinner_rots.nf +++ b/modules/local/chipbinner_rots.nf @@ -15,7 +15,7 @@ process CHIPBINNER_ROTS { } ] - conda "bioconda::r-rots conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite" + conda "bioconda::bioconductor-rots conda-forge::r-base=4.2.3 conda-forge::r-optparse conda-forge::r-ggplot2 conda-forge::r-jsonlite conda-forge::python=3.11" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-rots:1.19.0--r42hdfd78af_0' : 'biocontainers/r-rots:1.19.0--r42hdfd78af_0' }" @@ -58,7 +58,7 @@ process CHIPBINNER_ROTS { script: """\ mkdir -p plots - python - <<'PY' + python3 - <<'PY' import json import csv samples = json.loads(r'''${samples_json}''') diff --git a/modules/local/differential_summary.nf b/modules/local/differential_summary.nf index a84207fe..35b84ffb 100644 --- a/modules/local/differential_summary.nf +++ b/modules/local/differential_summary.nf @@ -12,12 +12,7 @@ process DIFFERENTIAL_SUMMARY_MERGE { container "quay.io/biocontainers/python:3.8.3" input: - path summary_files, stageAs: { file -> - def caller = file.parent?.parent?.name ?: file.parent?.name ?: 'summary' - def group = file.parent?.name ?: 'group' - def prefix = caller == group ? group : "${caller}-${group}" - "summaries/${prefix}-${file.name}" - } + path summary_files, stageAs: 'summaries/*' val skipped_json path summary_header path skipped_header @@ -32,10 +27,9 @@ process DIFFERENTIAL_SUMMARY_MERGE { script: """\ + mkdir -p summaries : > summary_files.list - for file in ${summary_files}; do - echo "\$file" >> summary_files.list - done + find summaries -type f -name "*.tsv" -print | sort > summary_files.list python - <<'PY' import csv diff --git a/modules/local/span_compare.nf b/modules/local/span_compare.nf index 2dbcfdf7..19b0b308 100644 --- a/modules/local/span_compare.nf +++ b/modules/local/span_compare.nf @@ -95,10 +95,10 @@ process SPAN_COMPARE { BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3 } /^#/ || /^track/ || /^browser/ { next } !header_done { - if (!isnum($2)) { + if (!isnum(\$2)) { for (i=1; i<=NF; i++) { - col=tolower($i) - if (col ~ /^(chr|chrom|seqnames|seqname)$/) chr_idx=i + col=tolower(\$i) + if (col ~ /^(chr|chrom|seqnames|seqname)\$/) chr_idx=i if (col == "start") start_idx=i if (col == "end") end_idx=i } @@ -108,7 +108,7 @@ process SPAN_COMPARE { header_done=1 } } - { print $chr_idx,$start_idx,$end_idx } + { print \$chr_idx,\$start_idx,\$end_idx } AWK awk -f span_compare_bed.awk "\$diff_file" > span.differential.bed @@ -130,14 +130,14 @@ process SPAN_COMPARE { BEGIN { OFS="\\t"; header_done=0; chr_idx=1; start_idx=2; end_idx=3; log2_idx=0; fdr_idx=0; qval_idx=0; fold_idx=0; proxy_log2=0 } /^#/ || /^track/ || /^browser/ { next } !header_done { - if (!isnum($2)) { + if (!isnum(\$2)) { for (i=1; i<=NF; i++) { - col=tolower($i) - if (col ~ /^(chr|chrom|seqnames|seqname)$/) chr_idx=i + col=tolower(\$i) + if (col ~ /^(chr|chrom|seqnames|seqname)\$/) chr_idx=i if (col == "start") start_idx=i if (col == "end") end_idx=i if (col ~ /log2fc|log2foldchange|log2_fold_change|log2fold|log_fc|log2ratio|log2_ratio/) log2_idx=i - if (col ~ /^(fdr|false.discovery|padj|adj_p|adjp)$/) fdr_idx=i + if (col ~ /^(fdr|false.discovery|padj|adj_p|adjp)\$/) fdr_idx=i if (col ~ /qvalue|qval|q-value/) qval_idx=i if ((col ~ /fold|fc/) && log2_idx==0) fold_idx=i } @@ -151,17 +151,17 @@ process SPAN_COMPARE { } { log2_val="" - if (log2_idx>0 && isnum($log2_idx)) log2_val=$log2_idx+0 - if (log2_val=="" && fold_idx>0 && isnum($fold_idx)) { log2_val=calc_log2($fold_idx); proxy_log2=1 } - if (log2_val=="" && log2_idx==7 && isnum($7)) { log2_val=calc_log2($7); proxy_log2=1 } + if (log2_idx>0 && isnum(\$log2_idx)) log2_val=\$log2_idx+0 + if (log2_val=="" && fold_idx>0 && isnum(\$fold_idx)) { log2_val=calc_log2(\$fold_idx); proxy_log2=1 } + if (log2_val=="" && log2_idx==7 && isnum(\$7)) { log2_val=calc_log2(\$7); proxy_log2=1 } fdr_val="" - if (fdr_idx>0) fdr_val=$fdr_idx - else if (qval_idx>0) fdr_val=$qval_idx - else if (NF>=9) fdr_val=$9 + if (fdr_idx>0) fdr_val=\$fdr_idx + else if (qval_idx>0) fdr_val=\$qval_idx + else if (NF>=9) fdr_val=\$9 fdr_val=calc_fdr(fdr_val) if (fdr_val <= fdr_thresh) { - if (log2_val > 0) print $chr_idx,$start_idx,$end_idx >> up_file - else if (log2_val < 0) print $chr_idx,$start_idx,$end_idx >> down_file + if (log2_val > 0) print \$chr_idx,\$start_idx,\$end_idx >> up_file + else if (log2_val < 0) print \$chr_idx,\$start_idx,\$end_idx >> down_file } } END { diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index b25da691..2b8b20a2 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -396,8 +396,8 @@ workflow DIFFERENTIAL_PEAK_CALLING { } } - def control_bams = use_input_group ? required_conditions.collect { control_by_group[it][0] } : bams - def control_bais = use_input_group ? required_conditions.collect { control_by_group[it][1] } : bais + def control_bams = use_input_group ? required_conditions.collect { control_by_group[it][0] } : [] + def control_bais = use_input_group ? required_conditions.collect { control_by_group[it][1] } : [] def control_conditions_json = use_input_group ? groovy.json.JsonOutput.toJson(required_conditions) : groovy.json.JsonOutput.toJson([]) [rec.group, groovy.json.JsonOutput.toJson(samples), bams, bais, control_bams, control_bais, control_conditions_json, use_input_group] diff --git a/tests/config/differential.config b/tests/config/differential.config index 79a0dec6..67ed1990 100644 --- a/tests/config/differential.config +++ b/tests/config/differential.config @@ -12,6 +12,8 @@ process { singularity.enabled = true singularity.autoMounts = true +conda.enabled = true +conda.channels = ['conda-forge', 'bioconda', 'defaults'] process { withName: '.*DIFFBIND_RUN' { From a2d3e85f3839119ea5aec00d0916d78b6a067fe2 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Tue, 13 Jan 2026 18:42:16 -0800 Subject: [PATCH 28/31] Filter NA bins in chipbinner significant beds --- bin/chipbinner_rots.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index 79e122bf..a232447a 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -224,7 +224,14 @@ results$cluster_label_3clusters <- cluster_labels_3 write.table(results, "chipbinner.differential.tsv", sep = "\t", quote = FALSE, row.names = FALSE) -sig <- results[results$FDR <= opt$fdr & abs(results$log2FC) >= opt$lfc, , drop = FALSE] +sig <- results[ + !is.na(results$FDR) & + !is.na(results$log2FC) & + results$FDR <= opt$fdr & + abs(results$log2FC) >= opt$lfc, + , + drop = FALSE +] up <- sig[sig$log2FC > 0, , drop = FALSE] down <- sig[sig$log2FC < 0, , drop = FALSE] From 6f3101c9a1bf1577b6dfca8fe7dfe0c41cf8c462 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Tue, 13 Jan 2026 18:53:14 -0800 Subject: [PATCH 29/31] Mark unassigned clusters as NA --- bin/chipbinner_rots.R | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/chipbinner_rots.R b/bin/chipbinner_rots.R index a232447a..be96b791 100755 --- a/bin/chipbinner_rots.R +++ b/bin/chipbinner_rots.R @@ -115,6 +115,7 @@ compute_cluster_labels <- function(cluster_ids, counts_mat, conds, treated, cont labels[] <- "NA" return(labels) } + labels[is.na(cluster_ids)] <- "NA" treated_idx <- conds == treated control_idx <- conds == control for (cid in unique(cluster_ids)) { From 090365811417f08c1206c5817235cf8566ee0337 Mon Sep 17 00:00:00 2001 From: dhusmann Date: Tue, 13 Jan 2026 19:02:53 -0800 Subject: [PATCH 30/31] Preserve empty control map for chipbinner --- subworkflows/local/differential_peak_calling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index 2b8b20a2..fddec5a7 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -365,6 +365,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { } map } + .ifEmpty([:]) } ch_chipbinner_samples = ch_valid_groups.combine(ch_control_map).map { rec, control_map -> From 1ef69b22ef6205dcd5de909bf8e49a521b53868f Mon Sep 17 00:00:00 2001 From: dhusmann Date: Tue, 13 Jan 2026 19:21:09 -0800 Subject: [PATCH 31/31] Broadcast SPAN capability values --- subworkflows/local/differential_peak_calling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/differential_peak_calling.nf b/subworkflows/local/differential_peak_calling.nf index fddec5a7..e390b39d 100644 --- a/subworkflows/local/differential_peak_calling.nf +++ b/subworkflows/local/differential_peak_calling.nf @@ -543,6 +543,7 @@ workflow DIFFERENTIAL_PEAK_CALLING { ch_span_caps = SPAN_CAPABILITY_PROBE.out.capabilities .map { file -> new groovy.json.JsonSlurper().parse(file) } + .first() ch_span_has_compare = ch_span_caps.map { it.has_compare ?: false } ch_span_multibam = ch_span_caps.map { it.compare_multi_bam ?: false } ch_span_multibam_mode = ch_span_caps.map { it.compare_multi_bam_mode ?: 'single' }