diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml index ad686e8..ea59134 100644 --- a/.github/actions/nf-test/action.yml +++ b/.github/actions/nf-test/action.yml @@ -56,6 +56,14 @@ runs: channel-priority: strict conda-remove-defaults: true + - name: Set dummy Nextflow secrets for stub tests + shell: bash + run: | + # The pipeline stub test (tests/default.nf.test) exercises the ESM + # embedding processes, which declare `secret 'HF_TOKEN'`. Nextflow + # requires the secret to exist even under -stub, so register a dummy. + nextflow secrets set HF_TOKEN "stub" || true + - name: Run nf-test shell: bash env: diff --git a/.gitignore b/.gitignore index 7e07892..d0ab513 100644 --- a/.gitignore +++ b/.gitignore @@ -208,8 +208,15 @@ other_scripts/* test-eval-env/* *.txt +# Tests +.nf-test/ + # Added by code-review-graph .code-review-graph/ # Claude .claude/ .mcp.json + +# nf-test fixtures (override the broad ignores above) +!tests/data/ +!tests/data/** diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_pool.py similarity index 60% rename from bin/build_ppi_negative_ddis.py rename to bin/build_ppi_negative_pool.py index 23051b8..b2d6b2a 100755 --- a/bin/build_ppi_negative_ddis.py +++ b/bin/build_ppi_negative_pool.py @@ -1,26 +1,43 @@ #!/usr/bin/env python3 """ -Build negative DDIs from a Y2H/MS PPI parquet and append them to the -domainsplit SQLite, restricted to Pfam domains already present in positive -DDIs. In degree_matched mode, selects pairs so each domain's negative -degree matches its positive degree. In frequency mode, takes the top-N -by PPI co-occurrence count, capped at (n_positive - n_negatome). +Build the candidate pool for negative DDIs from a Y2H/MS PPI parquet and dump it +to ``neg_pool.npz`` for the (seed-dependent) selection step. 
+ +This is the EXPENSIVE, DETERMINISTIC half of negative-DDI construction: it +streams the parquet, maps bait/prey genes to UniProt + Pfam via the UniProt REST +API, and assembles the pool of candidate Pfam pairs (restricted to Pfam domains +that already appear in a 3did positive DDI, and excluding pairs that already +exist as DDIs). It performs NO sampling and NO insertion -- selection fans out +into parallel per-seed jobs (``select_ppi_negative_dans.py``) that read the dump, +and the winning selection is inserted by ``insert_ppi_negative_selection.py``. + +The dump carries what both negative-construction methods need (uncapped DANS, +Cappelletti et al. vbae036): + * Method 1 "deletion" -- the candidate pool ``cand_a``/``cand_b``, the pool + domain universe ``pool_dom`` and the *reduced* positive degrees + ``pool_deg_r`` (3did positives restricted to pool domains), plus the reduced + positive-edge PA and target count. + * Method 2 "random_addition" -- the full positive edge endpoint multiset + ``pos_a``/``pos_b`` (DANS samples node-pairs proportional to degree from it), + the full positive degrees/PA/count, and the forbidden-pair set + ``forbidden_a``/``forbidden_b`` (all existing DDIs) that DANS must avoid. 
""" import argparse -import heapq import itertools import json -import random import sqlite3 import sys import time import math from collections import defaultdict +import numpy as np import pyarrow.parquet as pq import requests +from ddi_db_utils import pfam_sort_key + TAG = "[ppi_neg]" BATCH_SIZE = 500_000 @@ -37,14 +54,14 @@ def parse_args(): p.add_argument("--parquet", required=True) p.add_argument("--pfam-mapping-out", required=True, help="Output path for UniProt -> Pfam JSON mapping") + p.add_argument("--pool-out", required=True, + help="Output path for the candidate-pool .npz dump") p.add_argument("--min-n-tested", type=int, required=True) - p.add_argument("--source-label", required=True) p.add_argument( - "--sampling-strategy", - choices=["frequency", "degree_matched"], - default="degree_matched", - help="'frequency' = top-N by co-occurrence (old behavior). " - "'degree_matched' = sample to match positive degree distribution.", + "--no-self", + action="store_true", + help="Skip self-pairs (domain interacting with itself) " + "when self_interaction is disabled.", ) return p.parse_args() @@ -142,12 +159,17 @@ def fetch_gene_mappings(gene_names, batch_size=100): return gene_to_uniprot, uniprot_to_pfams -def load_positive_pfams(conn): +def load_3did_pfams(conn): + """Pfam IDs that appear in a 3did positive DDI. + + Negatives are inferred (and degree-matched) only over the 3did domain + universe, so single-domain / PPIDM positives never widen the candidate set. 
+ """ cur = conn.execute( "SELECT DISTINCT d.pfam_id " "FROM domain AS d JOIN domain_domain_interaction AS ddi " " ON d.id IN (ddi.domain_id_a, ddi.domain_id_b) " - "WHERE ddi.negative = 0" + "WHERE ddi.negative = 0 AND ddi.source = '3did'" ) return {row[0] for row in cur} @@ -159,9 +181,19 @@ def load_existing_pairs(conn): "JOIN domain AS da ON da.id = ddi.domain_id_a " "JOIN domain AS db ON db.id = ddi.domain_id_b" ) - return {tuple(sorted((a, b))) for a, b in cur} + return {tuple(sorted((a, b), key=pfam_sort_key)) for a, b in cur} +def load_positive_3did_edges(conn): + """The 3did positive DDIs as (pfam_a, pfam_b) pairs.""" + return conn.execute( + "SELECT da.pfam_id, db.pfam_id " + "FROM domain_domain_interaction AS ddi " + "JOIN domain AS da ON da.id = ddi.domain_id_a " + "JOIN domain AS db ON db.id = ddi.domain_id_b " + "WHERE ddi.negative = 0 AND ddi.source = '3did'" + ).fetchall() + def _validate_columns(parquet_schema): available = set(parquet_schema.names) @@ -198,84 +230,6 @@ def _collect_genes_and_pairs(parquet_path, min_n_tested): return n_input, unique_genes, baits, preys -def _compute_positive_degree(conn): - """Per-Pfam degree in the positive DDI set.""" - rows = conn.execute( - "SELECT da.pfam_id, db.pfam_id " - "FROM domain_domain_interaction AS ddi " - "JOIN domain AS da ON da.id = ddi.domain_id_a " - "JOIN domain AS db ON db.id = ddi.domain_id_b " - "WHERE ddi.negative = 0" - ).fetchall() - deg = defaultdict(int) - for a, b in rows: - deg[a] += 1 - deg[b] += 1 - return deg - - -def select_degree_matched(fresh_candidates, pos_degree, n_take): - """Select negatives so each domain's negative degree matches its positive degree. - - Uses a lazy-deletion max-heap scored by combined degree deficit of both - domains in each candidate pair. Candidates are shuffled for random - tiebreaking among equal-deficit pairs. 
- """ - if not fresh_candidates or n_take <= 0: - return [] - if not pos_degree: - return fresh_candidates[:n_take] - - target = dict(pos_degree) - current = defaultdict(int) - - candidates = list(fresh_candidates) - random.shuffle(candidates) - - remaining = set(range(len(candidates))) - - def deficit(pfam): - return max(0, target.get(pfam, 0) - current[pfam]) - - def score(i): - (pfam_a, pfam_b), _ = candidates[i] - return deficit(pfam_a) + deficit(pfam_b) - - heap = [(-score(i), i) for i in range(len(candidates))] - heapq.heapify(heap) - - chosen = [] - while len(chosen) < n_take and heap: - neg_s, i = heapq.heappop(heap) - if i not in remaining: - continue - - actual = score(i) - if actual != -neg_s: - if actual > 0: - heapq.heappush(heap, (-actual, i)) - else: - remaining.discard(i) - continue - - if actual <= 0: - break - - (pfam_a, pfam_b), count = candidates[i] - chosen.append(((pfam_a, pfam_b), count)) - remaining.discard(i) - current[pfam_a] += 1 - current[pfam_b] += 1 - - matched = sum(1 for p in target if current.get(p, 0) >= target[p]) - over = sum(1 for p in target if current.get(p, 0) > target[p]) - total_deficit = sum(max(0, target[p] - current.get(p, 0)) for p in target) - log(f"degree_matched: {matched}/{len(target)} domains reached target degree") - log(f"degree_matched: {over} domains exceeded target degree") - log(f"degree_matched: remaining total deficit = {total_deficit}") - return chosen - - def main(): args = parse_args() @@ -306,12 +260,9 @@ def main(): log(f"n_pfam_domains_for_input_proteins = {n_pfam_unique}") conn = sqlite3.connect(args.db) - conn.execute("PRAGMA foreign_keys=ON") - conn.execute("PRAGMA journal_mode=OFF") - conn.execute("PRAGMA synchronous=OFF") - pos_pfam = load_positive_pfams(conn) - log(f"n_positive_pfams = {len(pos_pfam)}") + pos_pfam = load_3did_pfams(conn) + log(f"n_3did_pfams = {len(pos_pfam)}") existing_pairs = load_existing_pairs(conn) log(f"n_existing_ddis = {len(existing_pairs)}") @@ -335,7 +286,9 @@ def 
row_pfams(gene): continue row_pairs = set() for a, b in itertools.product(bait_pfams, prey_pfams): - row_pairs.add(tuple(sorted((a, b)))) + if args.no_self and a == b: + continue + row_pairs.add(tuple(sorted((a, b), key=pfam_sort_key))) if not row_pairs: continue n_rows_with_pairs += 1 @@ -356,74 +309,96 @@ def row_pfams(gene): f"(observed in {most_common_count} PPI rows)" ) - n_positive = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative = 0" - ).fetchone()[0] - n_negatome = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction " - "WHERE negative = 1 AND source = 'negatome'" - ).fetchone()[0] + # Positive 3did graph statistics: degree (the per-domain cap), edge PA, and + # the target negative count. + pos_edges = load_positive_3did_edges(conn) + pos_degree = defaultdict(int) + for a, b in pos_edges: + pos_degree[a] += 1 + pos_degree[b] += 1 + n_positive = len(pos_edges) + n_positive_domains = len(pos_degree) + pos_edge_pa = np.array( + [pos_degree[a] * pos_degree[b] for a, b in pos_edges], dtype=np.int64 + ) log(f"n_positive_ddis_in_db = {n_positive}") - log(f"n_negatome_negatives_in_db = {n_negatome}") - if args.sampling_strategy == "degree_matched": - n_take = n_positive - log(f"n_take = {n_take} (degree_matched: matching positive count)") - else: - n_take = max(0, n_positive - n_negatome) - log(f"n_take (target for source='{args.source_label}') = {n_take}") - - fresh_candidates = [] - n_positive_ddis_in_negative_ppis = 0 + log(f"n_positive_domains = {n_positive_domains}") + log(f"positive mean PA = {float(pos_edge_pa.mean()):.1f}") - for key, count in candidate_counts.items(): + # Drop candidates that already exist as a DDI (positive or negative). 
+ fresh_pairs = [] + n_positive_ddis_in_negative_ppis = 0 + for key in candidate_counts: if key in existing_pairs: n_positive_ddis_in_negative_ppis += 1 else: - fresh_candidates.append((key, count)) + fresh_pairs.append(key) log(f"n_positive_ddis_in_negative_ppis = {n_positive_ddis_in_negative_ppis}") - - fresh_candidates.sort(key=lambda kv: kv[1], reverse=True) - log(f"n_fresh_candidates_after_dedup = {len(fresh_candidates)}") - - if args.sampling_strategy == "degree_matched": - log("using degree-matched sampling strategy") - pos_degree = _compute_positive_degree(conn) - chosen = select_degree_matched(fresh_candidates, pos_degree, n_take) - else: - log("using frequency-ranked sampling strategy") - chosen = fresh_candidates[:n_take] - log(f"n_chosen = {len(chosen)}") - - if chosen: - # Pre-load pfam_id -> domain.id mapping to avoid per-row subqueries - pfam_to_domain_ids = defaultdict(list) - for did, pfam in conn.execute("SELECT id, pfam_id FROM domain"): - pfam_to_domain_ids[pfam].append(did) - log(f"loaded {len(pfam_to_domain_ids)} pfam -> domain mappings") - - insert_rows = [] - for (pfam_a, pfam_b), _ in chosen: - for d_a in pfam_to_domain_ids.get(pfam_a, ()): - for d_b in pfam_to_domain_ids.get(pfam_b, ()): - insert_rows.append((d_a, d_b, True, args.source_label)) - - conn.executemany( - "INSERT OR IGNORE INTO domain_domain_interaction" - "(domain_id_a, domain_id_b, negative, source) " - "VALUES (?, ?, ?, ?)", - insert_rows, - ) - conn.commit() - log(f"batch-inserted {len(insert_rows)} rows") - - n_inserted = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", - (args.source_label,), - ).fetchone()[0] - log(f"n_inserted_for_source = {n_inserted}") + log(f"n_fresh_candidates_after_dedup = {len(fresh_pairs)}") conn.close() + cand_a = np.array([a for a, b in fresh_pairs], dtype=object) + cand_b = np.array([b for a, b in fresh_pairs], dtype=object) + + # ---- Method 1 ("deletion"): reduce the positives to the candidate-domain + # 
universe so positive and candidate domains coincide; DANS then draws + # degree-aware over the fixed candidate pool. ---- + pool_domains = {d for pair in fresh_pairs for d in pair} + pos_edges_r = [ + (a, b) for a, b in pos_edges if a in pool_domains and b in pool_domains + ] + pos_degree_r = defaultdict(int) + for a, b in pos_edges_r: + pos_degree_r[a] += 1 + pos_degree_r[b] += 1 + n_positive_r = len(pos_edges_r) + n_positive_domains_r = len(pos_degree_r) + pos_edge_pa_r = np.array( + [pos_degree_r[a] * pos_degree_r[b] for a, b in pos_edges_r], dtype=np.int64 + ) + # Every pool domain carries its reduced-positive degree (0 if it has no edge + # in the reduced positive graph); the selector turns these into PA weights. + pool_dom = np.array(sorted(pool_domains, key=pfam_sort_key), dtype=object) + pool_deg_r = np.array([pos_degree_r[d] for d in pool_dom], dtype=np.int64) + log(f"n_pool_domains = {len(pool_dom)}") + log(f"n_reduced_positive_ddis = {n_positive_r}") + log(f"n_reduced_positive_domains = {n_positive_domains_r}") + + # ---- Method 2 ("random_addition"): plain DANS over the full positive set. + # The selector samples node-pairs proportional to degree by drawing from + # the endpoint multiset of these edges and rejects existing pairs. 
---- + pos_a = np.array([a for a, b in pos_edges], dtype=object) + pos_b = np.array([b for a, b in pos_edges], dtype=object) + pos_dom = np.array(list(pos_degree.keys()), dtype=object) + pos_deg = np.array([pos_degree[d] for d in pos_dom], dtype=np.int64) + forbidden_a = np.array([a for a, b in existing_pairs], dtype=object) + forbidden_b = np.array([b for a, b in existing_pairs], dtype=object) + + log(f"writing candidate pool to {args.pool_out}") + np.savez( + args.pool_out, + # --- Method 1 (deletion) --- + cand_a=cand_a, + cand_b=cand_b, + pool_dom=pool_dom, + pool_deg_r=pool_deg_r, + pos_edge_pa_r=pos_edge_pa_r, + n_positive_r=np.int64(n_positive_r), + n_positive_domains_r=np.int64(n_positive_domains_r), + # --- Method 2 (random_addition) --- + pos_a=pos_a, + pos_b=pos_b, + pos_dom=pos_dom, + pos_deg=pos_deg, + pos_edge_pa=pos_edge_pa, + n_positive=np.int64(n_positive), + n_positive_domains=np.int64(n_positive_domains), + forbidden_a=forbidden_a, + forbidden_b=forbidden_b, + ) + log("done") + if __name__ == "__main__": main() diff --git a/bin/build_swissprot_pfam_map.py b/bin/build_swissprot_pfam_map.py new file mode 100755 index 0000000..7d923aa --- /dev/null +++ b/bin/build_swissprot_pfam_map.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +"""Build a reviewed-human UniProt -> Pfam map for single-domain detection. + +Downloads one UniProt stream (TSV, fields accession,id,gene_names,xref_pfam for +``reviewed:true AND organism_id:9606``) and emits ``swissprot_pfam_map.json``: + + { + "accession_to_pfams": {accession: [Pfam, ...]}, + "name_to_accession": {entry_name_or_gene: accession} + } + +``name_to_accession`` lets the single-domain step resolve HIPPIE identifiers that +are entry names (e.g. ``AL1A1_HUMAN``) or gene names; accessions resolve directly +against ``accession_to_pfams``. Gene names that map to more than one accession are +dropped as ambiguous; unique entry names always win. 
+""" + +import argparse +import gzip +import json +import os +import shutil +import ssl +import sys +import urllib.error +import urllib.request + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--url", required=True, help="UniProt stream URL or local TSV(.gz) file") + p.add_argument("--out", required=True, help="Output JSON path") + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def fetch(url, out_path): + def _download(ctx): + req = urllib.request.Request(url, headers={"User-Agent": "domainsplit-pipeline"}) + with urllib.request.urlopen(req, context=ctx, timeout=600) as resp, open(out_path, "wb") as fh: + while True: + chunk = resp.read(1024 * 1024) + if not chunk: + break + fh.write(chunk) + + if url.startswith(("http://", "https://", "ftp://", "file://")): + try: + _download(ssl.create_default_context()) + except (urllib.error.URLError, ssl.SSLError) as exc: + print(f"WARNING: SSL validation failed for {url} ({exc!r}); retrying unverified.", + file=sys.stderr, flush=True) + _download(ssl._create_unverified_context()) + elif os.path.exists(url): + shutil.copy(url, out_path) + else: + raise SystemExit(f"url_uniprot_swissprot_pfam '{url}' is neither a URL nor a local file") + + +def open_maybe_gzip(path): + with open(path, "rb") as fh: + magic = fh.read(2) + if magic == b"\x1f\x8b": + return gzip.open(path, "rt") + return open(path, "rt") + + +def main(): + args = parse_args() + + raw = "swissprot.tsv" + fetch(args.url, raw) + + accession_to_pfams = {} + gene_to_accs = {} # gene token -> set of accessions (for ambiguity check) + entry_name_to_acc = {} + + n_lines = 0 + with open_maybe_gzip(raw) as fh: + for i, line in enumerate(fh): + line = line.rstrip("\n") + if i == 0 and line.lower().startswith("entry"): + continue # header + if not line: + continue + cols = line.split("\t") + if len(cols) < 4: + cols += [""] * (4 - len(cols)) + accession, entry_name, gene_names, 
pfam_field = cols[0], cols[1], cols[2], cols[3] + if not accession: + continue + n_lines += 1 + + pfams = sorted({p for p in pfam_field.replace(",", ";").split(";") if p}) + accession_to_pfams[accession] = pfams + + if entry_name: + entry_name_to_acc[entry_name] = accession + for token in gene_names.split(): + gene_to_accs.setdefault(token, set()).add(accession) + + # entry names are unique and authoritative; add unambiguous gene names that + # do not collide with an entry name + name_to_accession = dict(entry_name_to_acc) + for token, accs in gene_to_accs.items(): + if token in name_to_accession: + continue + if len(accs) == 1: + name_to_accession[token] = next(iter(accs)) + + n_single = sum(1 for pfams in accession_to_pfams.values() if len(pfams) == 1) + print(f"[swissprot_map] proteins={n_lines} single_domain={n_single} " + f"names={len(name_to_accession)}", flush=True) + + with open(args.out, "w") as fh: + json.dump( + {"accession_to_pfams": accession_to_pfams, + "name_to_accession": name_to_accession}, + fh, + ) + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/ddi_db_utils.py b/bin/ddi_db_utils.py new file mode 100755 index 0000000..4d451c4 --- /dev/null +++ b/bin/ddi_db_utils.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +"""Shared helpers for inserting DDIs into the domainsplit SQLite. + +Every ``INSERT_`` step uses these so all sources are handled uniformly: +first bulk-create any missing ``domain`` rows for the Pfam IDs it references, +then insert its DDIs. Pairs are order-normalised by Pfam accession number (see +:func:`pfam_sort_key`) so the stored ``(domain_id_a, domain_id_b)`` order is +stable -- a pair is deduplicated regardless of the order it is supplied in and +regardless of which source inserted it first or in what order domains were +created. 
+ +The table's ``UNIQUE(domain_id_a, domain_id_b, source)`` lets the same pair be +stored under different sources. To keep the historical "earliest source wins" +behaviour for the canonical sources, :func:`insert_ddis` defaults to +``dedup_across_sources=True``, which skips any pair already present under another +source. The negative-DDI method copies (which intentionally duplicate a pair +under a new label) pass ``dedup_across_sources=False``. +""" + +import sqlite3 + + +def pfam_sort_key(pfam): + """Sort key for a Pfam accession by its numeric part (``PF00028`` -> ``28``). + + Used to canonicalise DDI pairs so the stored column order depends only on the + Pfam accessions, never on the internal ``domain.id`` insertion order. Strips + everything but digits; accessions without any digit fall back to a lexical key + that sorts deterministically after all numbered ones. + """ + digits = "".join(c for c in pfam if c.isdigit()) + return (0, int(digits)) if digits else (1, pfam) + + +def ensure_domains(conn, pfam_ids): + """Bulk ``INSERT OR IGNORE`` domain rows for ``pfam_ids`` (name left NULL). + + Returns the number of distinct non-empty Pfam IDs supplied. + """ + unique = {p for p in pfam_ids if p} + conn.executemany( + "INSERT OR IGNORE INTO domain(pfam_id) VALUES (?)", + [(p,) for p in unique], + ) + return len(unique) + + +def _pfam_to_id(conn): + return {pfam: did for did, pfam in conn.execute("SELECT id, pfam_id FROM domain")} + + +def insert_ddis(conn, pairs, negative, source, dedup_across_sources=True): + """Insert DDIs for ``(pfam_a, pfam_b)`` pairs. + + Domains must already exist (call :func:`ensure_domains` first); pairs whose + Pfam is missing from the ``domain`` table are skipped. Each pair is stored + in :func:`pfam_sort_key` order so swapped duplicates collapse onto one row. + + With ``dedup_across_sources=True`` (default) a pair already present under any + other source is skipped, preserving the "earliest source wins" semantics for + the canonical sources. 
Pass ``False`` to allow the pair to be duplicated + under this ``source`` (used by the negative-DDI method copies); same-source + duplicates are still collapsed by ``UNIQUE(domain_id_a, domain_id_b, source)``. + + Returns the number of rows offered to ``INSERT OR IGNORE`` (before dedup by + the DB). + """ + pfam_to_id = _pfam_to_id(conn) + neg = int(bool(negative)) + existing = set() + if dedup_across_sources: + existing = { + (a, b) for a, b in conn.execute( + "SELECT domain_id_a, domain_id_b FROM domain_domain_interaction" + ) + } + rows = [] + seen = set() + for a, b in pairs: + ia = pfam_to_id.get(a) + ib = pfam_to_id.get(b) + if ia is None or ib is None: + continue + key = (ia, ib) if pfam_sort_key(a) <= pfam_sort_key(b) else (ib, ia) + if key in seen or key in existing: + continue + seen.add(key) + rows.append((key[0], key[1], neg, source)) + + conn.executemany( + "INSERT OR IGNORE INTO domain_domain_interaction" + "(domain_id_a, domain_id_b, negative, source) VALUES (?, ?, ?, ?)", + rows, + ) + return len(rows) + + +def count_source(conn, source): + """Number of DDI rows currently tagged with ``source``.""" + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] diff --git a/bin/insert_3did.py b/bin/insert_3did.py new file mode 100755 index 0000000..33fec32 --- /dev/null +++ b/bin/insert_3did.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +"""Insert 3did positive DDIs into the domainsplit SQLite. + +3did is treated like every other source: read its DDI pairs, bulk-create any +missing ``domain`` rows for the referenced Pfam IDs, then insert the +interactions as ``negative=0, source='3did'``. 
+""" + +import argparse +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True, help="domainsplit SQLite (modified in place)") + p.add_argument("--sqlite-3did", required=True, help="3did SQLite from DOWNLOAD_3DID_SQLITE") + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def iter_3did_pairs(conn_3did): + """Yield (pfam_a, pfam_b) Pfam accessions (version stripped) for each 3did DDI.""" + cursor = conn_3did.execute( + "SELECT d1.Pfam_id, d2.Pfam_id " + "FROM DDI1, Domain AS d1, Domain AS d2 " + "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name" + ) + for id_1, id_2 in cursor: + yield id_1.split(".")[0], id_2.split(".")[0] + + +def main(): + args = parse_args() + + conn_3did = sqlite3.connect(args.sqlite_3did) + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + pairs = list(iter_3did_pairs(conn_3did)) + conn_3did.close() + print(f"[3did] read {len(pairs)} DDI pairs", flush=True) + + pfams = {p for pair in pairs for p in pair} + n_domains = ensure_domains(conn, pfams) + print(f"[3did] ensured {n_domains} domains", flush=True) + + insert_ddis(conn, pairs, negative=False, source="3did") + conn.commit() + print(f"[3did] n_ddis_source_3did = {count_source(conn, '3did')}", flush=True) + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_negatome.py b/bin/insert_negatome.py new file mode 100755 index 0000000..48b33d1 --- /dev/null +++ b/bin/insert_negatome.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +"""Insert Negatome negative DDIs into the 
domainsplit SQLite. + +Negatome ``combined_pfam.txt`` lists whitespace-separated Pfam pairs that do not +interact. Treated like every other source: bulk-create missing domains, then +insert as ``negative=1, source='negatome'``. +""" + +import argparse +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--negatome", required=True) + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def iter_negatome_pairs(path): + with open(path) as f: + for line in f: + tokens = line.split() + if len(tokens) < 2: + continue + yield tokens[0], tokens[1] + + +def main(): + args = parse_args() + + pairs = list(iter_negatome_pairs(args.negatome)) + print(f"[negatome] read {len(pairs)} pairs", flush=True) + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + pfams = {p for pair in pairs for p in pair} + ensure_domains(conn, pfams) + insert_ddis(conn, pairs, negative=True, source="negatome") + conn.commit() + print(f"[negatome] n_ddis_source_negatome = {count_source(conn, 'negatome')}", flush=True) + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_ppi_negative_selection.py b/bin/insert_ppi_negative_selection.py new file mode 100755 index 0000000..bfced58 --- /dev/null +++ b/bin/insert_ppi_negative_selection.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +""" +Insert the two negative-DDI construction methods into the domainsplit SQLite, +each under its own ``source`` labels so both can coexist in one database and be +selected independently by the 
downstream splits. + +Both methods use uncapped Degree-Aware Node Sampling (DANS); the per-seed pairs +are produced by ``select_ppi_negative_dans.py`` (one run per method, no +pick-best). This step copies the matching positives and inserts the negatives: + + * "deletion" -- positives = 3did restricted to the candidate-pool domain + universe (``3did_deletion``); negatives = + ``inferred_ppi_screen_negative_for_deletion``. + * "random_addition" -- positives = the full 3did set (``3did_random_addition``); + negatives = ``inferred_ppi_screen_negative_for_random_addition``. + +The four method labels are inserted with ``dedup_across_sources=False`` so they +may duplicate a pair already stored under the canonical ``3did`` source (the +table's ``UNIQUE(domain_id_a, domain_id_b, source)`` keeps the labels distinct). + +Prints a positive reference + the negative selection for each method and writes +the same data to a published scores TSV. +""" + +import argparse +import json +import sqlite3 + +import numpy as np + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +TAG = "[neg_insert]" + + +def log(msg): + print(f"{TAG} {msg}", flush=True) + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--pool", required=True, + help="candidate-pool .npz (for the pool-domain universe)") + p.add_argument("--pairs-deletion", required=True) + p.add_argument("--pairs-random-addition", required=True) + p.add_argument("--score-deletion", required=True) + p.add_argument("--score-random-addition", required=True) + p.add_argument("--scores-out", required=True, + help="output consolidated scores TSV path") + p.add_argument("--source-3did", default="3did") + p.add_argument("--label-pos-deletion", default="3did_deletion") + p.add_argument("--label-pos-random-addition", default="3did_random_addition") + p.add_argument("--label-neg-deletion", + default="inferred_ppi_screen_negative_for_deletion") + 
p.add_argument("--label-neg-random-addition", + default="inferred_ppi_screen_negative_for_random_addition") + return p.parse_args() + + +def read_pairs(path): + pairs = [] + with open(path) as fh: + for line in fh: + line = line.rstrip("\n") + if not line: + continue + a, b = line.split("\t") + pairs.append((a, b)) + return pairs + + +def load_positive_pairs(conn, source): + """Positive (pfam_a, pfam_b) pairs currently stored in the DB under ``source``.""" + return conn.execute( + "SELECT da.pfam_id, db.pfam_id " + "FROM domain_domain_interaction AS ddi " + "JOIN domain AS da ON da.id = ddi.domain_id_a " + "JOIN domain AS db ON db.id = ddi.domain_id_b " + "WHERE ddi.negative = 0 AND ddi.source = ?", + (source,), + ).fetchall() + + +def main(): + args = parse_args() + + pool_domains = set(np.load(args.pool, allow_pickle=True)["pool_dom"].tolist()) + log(f"pool-domain universe: {len(pool_domains)} domains") + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + # --- positives: full (random_addition) and pool-restricted (deletion) copies --- + pos_3did = load_positive_pairs(conn, args.source_3did) + pos_reduced = [ + (a, b) for a, b in pos_3did if a in pool_domains and b in pool_domains + ] + log(f"3did positives: {len(pos_3did)} total, {len(pos_reduced)} within pool") + + insert_ddis(conn, pos_3did, negative=False, + source=args.label_pos_random_addition, dedup_across_sources=False) + insert_ddis(conn, pos_reduced, negative=False, + source=args.label_pos_deletion, dedup_across_sources=False) + + # --- negatives: one DANS selection per method --- + pairs_del = read_pairs(args.pairs_deletion) + pairs_rand = read_pairs(args.pairs_random_addition) + + ensure_domains(conn, (p for pair in pairs_del for p in pair)) + ensure_domains(conn, (p for pair in pairs_rand for p in pair)) + insert_ddis(conn, pairs_del, negative=True, + source=args.label_neg_deletion, dedup_across_sources=False) 
+ insert_ddis(conn, pairs_rand, negative=True, + source=args.label_neg_random_addition, dedup_across_sources=False) + + conn.commit() + counts = { + args.label_pos_random_addition: count_source(conn, args.label_pos_random_addition), + args.label_pos_deletion: count_source(conn, args.label_pos_deletion), + args.label_neg_random_addition: count_source(conn, args.label_neg_random_addition), + args.label_neg_deletion: count_source(conn, args.label_neg_deletion), + } + conn.close() + + with open(args.score_deletion) as fh: + sc_del = json.load(fh) + with open(args.score_random_addition) as fh: + sc_rand = json.load(fh) + + for label, n in counts.items(): + log(f"inserted source={label} rows={n}") + for sc in (sc_del, sc_rand): + log(f"method={sc['method']} set=positive n_sel={sc['pos_n_sel']} " + f"n_dom={sc['pos_n_dom']} mean_pa={sc['pos_mean_pa']:.1f}") + log(f"method={sc['method']} set=negative seed={sc['seed']} J={sc['J']:.4f} " + f"pa={sc['pa']:.4f} deg={sc['deg']:.4f} cov={sc['cov']:.4f} " + f"n_sel={sc['n_sel']} n_dom={sc['n_dom']} mean_pa={sc['mean_pa']:.1f}") + + cols = ["set", "method", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom", + "mean_pa"] + with open(args.scores_out, "w") as fh: + fh.write("\t".join(cols) + "\n") + for sc in (sc_del, sc_rand): + fh.write("\t".join([ + "positive", sc["method"], "NA", "NA", "NA", "NA", "NA", + str(sc["pos_n_sel"]), str(sc["pos_n_dom"]), + f"{sc['pos_mean_pa']:.2f}", + ]) + "\n") + fh.write("\t".join([ + "negative", sc["method"], str(sc["seed"]), + f"{sc['J']:.6f}", f"{sc['pa']:.6f}", f"{sc['deg']:.6f}", + f"{sc['cov']:.6f}", str(sc["n_sel"]), str(sc["n_dom"]), + f"{sc['mean_pa']:.2f}", + ]) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_ppidm.py b/bin/insert_ppidm.py new file mode 100755 index 0000000..9ed2556 --- /dev/null +++ b/bin/insert_ppidm.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +"""Insert PPIDM predicted positive DDIs, keeping the class as the source. 
# highest confidence first
CLASS_ORDER = ["Gold", "Silver", "Bronze"]


def parse_args():
    """Parse the command-line options of the PPIDM insertion script."""
    p = argparse.ArgumentParser()
    p.add_argument("--db", required=True)
    p.add_argument("--ppidm", required=True)
    p.add_argument("--classes", required=True,
                   help="comma-separated classes to include, e.g. 'Bronze,Silver,Gold'")
    p.add_argument("--versions", required=True)
    p.add_argument("--process-name", required=True)
    return p.parse_args()


def extract_pfam(token):
    """Extract the Pfam accession from a PPIDM domain token.

    ``10114/PF00069`` or ``PF00069.3`` -> ``PF00069``. The part after the last
    slash is taken, a ``.version`` suffix is dropped and surrounding whitespace
    is stripped.

    Returns:
        The accession, or ``None`` when the result is not ``PF`` followed by
        ASCII digits (so tokens such as ``PFAM`` or a bare ``PF`` are rejected
        instead of being inserted as bogus domains).
    """
    pf = token.split("/")[-1].split(".")[0].strip()
    digits = pf[2:]
    if pf.startswith("PF") and digits.isascii() and digits.isdigit():
        return pf
    return None


def _read_ppidm_pairs(path, classes):
    """Parse the PPIDM TSV into per-class lists of Pfam pairs.

    Args:
        path: path of the ``domain_1 domain_2 class`` TSV.
        classes: class names to keep; rows of any other class are skipped.

    Returns:
        ``(pairs_by_class, n_rows, n_bad)`` where ``pairs_by_class`` maps each
        kept class to its ``(pfam_a, pfam_b)`` list, ``n_rows`` counts the kept
        pairs and ``n_bad`` counts rows of a kept class whose domain tokens
        could not be parsed.
    """
    pairs_by_class = {c: [] for c in classes}
    n_rows = n_bad = 0
    with open(path) as fh:
        for i, line in enumerate(fh):
            line = line.rstrip("\n")
            if not line:
                continue
            cols = line.split("\t")
            if len(cols) < 3:
                continue
            if i == 0 and cols[2].strip().lower() == "class":
                continue  # header
            cls = cols[2].strip().capitalize()
            if cls not in pairs_by_class:
                continue
            pfam_a = extract_pfam(cols[0])
            pfam_b = extract_pfam(cols[1])
            if pfam_a is None or pfam_b is None:
                n_bad += 1
                continue
            pairs_by_class[cls].append((pfam_a, pfam_b))
            n_rows += 1
    return pairs_by_class, n_rows, n_bad


def main():
    """Insert the requested PPIDM classes as positive DDIs.

    Each class is inserted under the source label ``PPIDM_<class>``, in
    ``CLASS_ORDER`` (Gold, then Silver, then Bronze), and a versions file is
    written at the end.
    """
    args = parse_args()

    allowed = {c.strip().capitalize() for c in args.classes.split(",") if c.strip()}
    unknown = sorted(allowed - set(CLASS_ORDER))
    if unknown:
        # A typo in --classes would otherwise silently drop a whole class.
        print(f"[ppidm] WARNING: ignoring unknown classes: {unknown}",
              file=sys.stderr, flush=True)
    classes = [c for c in CLASS_ORDER if c in allowed]
    print(f"[ppidm] including classes: {classes}", flush=True)

    pairs_by_class, n_rows, n_bad = _read_ppidm_pairs(args.ppidm, classes)
    print(f"[ppidm] parsed {n_rows} pairs ({n_bad} unparseable)", flush=True)

    conn = sqlite3.connect(args.db)
    try:
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA journal_mode=OFF")
        conn.execute("PRAGMA synchronous=OFF")

        all_pfams = {p for pairs in pairs_by_class.values() for pair in pairs for p in pair}
        ensure_domains(conn, all_pfams)

        for cls in classes:  # Gold first
            source = f"PPIDM_{cls}"
            insert_ddis(conn, pairs_by_class[cls], negative=False, source=source)
            conn.commit()
            print(f"[ppidm] n_ddis_source_{source} = {count_source(conn, source)}", flush=True)
    finally:
        # Release the database handle even when an insert fails.
        conn.close()

    with open(args.versions, "w") as f:
        f.write(f'"{args.process_name}":\n')
        f.write(f"    python: {sys.version.split()[0]}\n")
        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")


if __name__ == "__main__":
    main()
def parse_args():
    """Parse the command-line options of the single-domain PPI script."""
    p = argparse.ArgumentParser()
    p.add_argument("--db", required=True)
    p.add_argument("--hippie", required=True)
    p.add_argument("--swissprot-map", required=True)
    p.add_argument("--min-score", type=float, required=True)
    p.add_argument("--versions", required=True)
    p.add_argument("--process-name", required=True)
    return p.parse_args()


def _collect_single_domain_pairs(hippie_path, resolve_pfam, min_score):
    """Scan the HIPPIE TSV and collect the inferred Pfam pairs.

    Columns 0 and 2 are the interactor identifiers and column 4 the score.
    Rows with fewer than five columns are ignored entirely.

    Args:
        hippie_path: path of the HIPPIE TSV.
        resolve_pfam: callable mapping an identifier to its single Pfam, or
            ``None`` when it cannot be resolved.
        min_score: minimum score a row needs to be considered.

    Returns:
        ``(pairs, n_rows, n_unresolved, n_bad_score)``: the kept
        ``(pfam_a, pfam_b)`` tuples, the number of rows with at least five
        columns, the number of rows passing the score filter that had an
        unresolvable interactor, and the number of rows whose score is not a
        number.
    """
    pairs = []
    n_rows = n_unresolved = n_bad_score = 0
    with open(hippie_path) as fh:
        for line in fh:
            line = line.rstrip("\n")
            if not line:
                continue
            cols = line.split("\t")
            if len(cols) < 5:
                continue
            n_rows += 1
            try:
                score = float(cols[4])
            except ValueError:
                n_bad_score += 1
                continue
            # `not >=` rather than `<`: every comparison with NaN is False, so
            # `score < min_score` would let a NaN score through the filter.
            if not score >= min_score:
                continue
            pfam_a = resolve_pfam(cols[0])
            pfam_b = resolve_pfam(cols[2])
            if pfam_a is None or pfam_b is None:
                n_unresolved += 1
                continue
            pairs.append((pfam_a, pfam_b))
    return pairs, n_rows, n_unresolved, n_bad_score


def main():
    """Insert positive DDIs inferred from PPIs between single-domain proteins.

    A protein counts as single-domain when the SwissProt map lists exactly one
    Pfam for its accession. HIPPIE identifiers are looked up first as
    accessions and then as entry names.
    """
    args = parse_args()

    with open(args.swissprot_map) as fh:
        smap = json.load(fh)
    accession_to_pfams = smap["accession_to_pfams"]
    name_to_accession = smap["name_to_accession"]

    # single-domain proteins: accession -> its one Pfam
    single_domain = {
        acc: pfams[0]
        for acc, pfams in accession_to_pfams.items()
        if len(pfams) == 1
    }
    print(f"[single_domain_ppi] single-domain proteins: {len(single_domain)}", flush=True)

    def resolve_pfam(token):
        """Return the single Pfam of ``token`` (accession or name), else None."""
        acc = token if token in accession_to_pfams else name_to_accession.get(token)
        if acc is None:
            return None
        return single_domain.get(acc)

    pairs, n_rows, n_unresolved, n_bad_score = _collect_single_domain_pairs(
        args.hippie, resolve_pfam, args.min_score
    )
    n_kept = len(pairs)

    print(f"[single_domain_ppi] hippie_rows={n_rows} score>= {args.min_score}: "
          f"single_domain_pairs={n_kept} unresolved_or_multi={n_unresolved}", flush=True)
    if n_bad_score:
        # Previously these rows were dropped without any trace in the log.
        print(f"[single_domain_ppi] rows_with_unparseable_score={n_bad_score}",
              flush=True)

    conn = sqlite3.connect(args.db)
    try:
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA journal_mode=OFF")
        conn.execute("PRAGMA synchronous=OFF")

        pfams = {p for pair in pairs for p in pair}
        ensure_domains(conn, pfams)
        insert_ddis(conn, pairs, negative=False, source="single_domain_ppi")
        conn.commit()
        print(f"[single_domain_ppi] n_ddis_source = "
              f"{count_source(conn, 'single_domain_ppi')}", flush=True)
    finally:
        # Release the database handle even when an insert fails.
        conn.close()

    with open(args.versions, "w") as f:
        f.write(f'"{args.process_name}":\n')
        f.write(f"    python: {sys.version.split()[0]}\n")
        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")


if __name__ == "__main__":
    main()
TAG = "[neg_select]"


def log(msg):
    """Print ``msg`` prefixed with the script tag, flushing immediately."""
    print(f"{TAG} {msg}", flush=True)


def parse_args():
    """Parse the command-line options of the negative-selection script."""
    p = argparse.ArgumentParser()
    p.add_argument("--pool", required=True, help="candidate-pool .npz from BUILD step")
    p.add_argument("--method", required=True,
                   choices=["deletion", "random_addition"])
    p.add_argument("--seed", type=int, required=True)
    p.add_argument("--score-out", required=True, help="output score JSON path")
    p.add_argument("--pairs-out", required=True, help="output selected-pairs TSV path")
    p.add_argument("--w-pa", type=float, default=0.5)
    p.add_argument("--w-deg", type=float, default=0.3)
    p.add_argument("--w-cov", type=float, default=0.2)
    return p.parse_args()


def wasserstein1(x, y):
    """1-Wasserstein distance between two 1D empirical samples (numpy only).

    Integrates the absolute difference of the two empirical CDFs over the
    merged, sorted sample values.

    Returns:
        The distance as a float, or NaN when either sample is empty (the
        distance to an empty sample is undefined).
    """
    x = np.sort(np.asarray(x, dtype=float))
    y = np.sort(np.asarray(y, dtype=float))
    if x.size == 0 or y.size == 0:
        # Dividing the CDF counts by a zero size would emit 0/0 warnings and
        # could even yield a spurious 0.0 when the merged grid has one point.
        return float("nan")
    grid = np.concatenate([x, y])
    grid.sort()
    cx = np.searchsorted(x, grid, side="right") / x.size
    cy = np.searchsorted(y, grid, side="right") / y.size
    deltas = np.diff(grid)
    return float(np.sum(np.abs(cx[:-1] - cy[:-1]) * deltas))


def ks_statistic(x, y):
    """Two-sample Kolmogorov-Smirnov statistic (numpy only).

    Returns:
        The largest absolute difference between the two empirical CDFs, or NaN
        when either sample is empty.
    """
    x = np.sort(np.asarray(x, dtype=float))
    y = np.sort(np.asarray(y, dtype=float))
    if x.size == 0 or y.size == 0:
        # np.max over an empty grid raises; an undefined statistic is NaN.
        return float("nan")
    grid = np.concatenate([x, y])
    grid.sort()
    cx = np.searchsorted(x, grid, side="right") / x.size
    cy = np.searchsorted(y, grid, side="right") / y.size
    return float(np.max(np.abs(cx - cy)))


def select_deletion(cand_a, cand_b, pool_dom, pool_deg_r, target, seed):
    """DANS over the fixed candidate pool: draw `target` edges without
    replacement with probability proportional to the reduced PA. No cap.

    The weight of candidate ``i`` is
    ``pool_deg_r[index of cand_a[i]] * pool_deg_r[index of cand_b[i]]``.
    When the pool holds fewer than ``target`` candidates, all of them are taken.
    When fewer candidates have a positive weight than edges are requested,
    every positive-weight candidate is taken (in weighted random order) and the
    remainder is drawn uniformly from the zero-weight candidates.

    Returns (selected_indices, cand_ai, cand_bi) where cand_a*/cand_b* index
    into pool_dom.
    """
    rng = np.random.default_rng(seed)
    domain_index = {d: i for i, d in enumerate(pool_dom)}
    cand_ai = np.fromiter((domain_index[a] for a in cand_a), dtype=np.int64,
                          count=cand_a.size)
    cand_bi = np.fromiter((domain_index[b] for b in cand_b), dtype=np.int64,
                          count=cand_b.size)
    pa = (pool_deg_r[cand_ai] * pool_deg_r[cand_bi]).astype(float)
    n_cand = int(cand_a.size)
    k = int(min(target, n_cand))
    if k < target:
        log(f"deletion: candidate pool smaller than target "
            f"({n_cand} < {target}); taking all")
    total = pa.sum()
    if k <= 0:
        sel = np.empty(0, dtype=np.int64)
    elif total <= 0:
        # No candidate carries any weight: fall back to a uniform draw.
        sel = rng.choice(n_cand, size=k, replace=False)
    else:
        weighted = np.flatnonzero(pa > 0)
        if weighted.size >= k:
            sel = rng.choice(n_cand, size=k, replace=False, p=pa / total)
        else:
            # Generator.choice(replace=False, p=...) raises when p has fewer
            # non-zero entries than `size`, so split the draw in two parts.
            log(f"deletion: only {weighted.size} candidates have non-zero PA; "
                f"filling {k - weighted.size} uniformly from zero-PA candidates")
            head = rng.choice(weighted, size=weighted.size, replace=False,
                              p=pa[weighted] / total)
            tail = rng.choice(np.flatnonzero(pa <= 0), size=k - weighted.size,
                              replace=False)
            sel = np.concatenate([head, tail]).astype(np.int64)
    log(f"deletion: selected {sel.size}/{target} candidate edges")
    return sel, cand_ai, cand_bi


def select_random_addition(pos_a, pos_b, forbidden, target, seed):
    """Canonical DANS over the full positive set: sample node-pairs from the
    positive endpoint multiset (so endpoints are drawn proportional to degree),
    rejecting self-pairs, existing edges and duplicates. No cap.

    Each accepted pair is stored in canonical order (by ``pfam_sort_key``).
    Sampling stops once ``target`` pairs are accepted or the attempt budget
    ``200 * target + 1000`` is used up.

    Returns:
        ``(out_a, out_b)`` object arrays with the endpoints of the accepted
        pairs; they hold fewer than ``target`` entries when the budget ran out.
    """
    rng = np.random.default_rng(seed)
    endpoints = np.concatenate([pos_a, pos_b])
    m = int(endpoints.size)
    if m == 0:
        # rng.integers(0, 0, ...) would raise; nothing can be sampled anyway.
        log(f"random_addition: no positive endpoints to sample from "
            f"(0/{target} edges)")
        return np.array([], dtype=object), np.array([], dtype=object)
    picked = set()
    out_a = []
    out_b = []
    attempts = 0
    max_attempts = 200 * target + 1000
    while len(out_a) < target and attempts < max_attempts:
        need = target - len(out_a)
        # Oversample each batch because many draws are rejected below.
        batch = int(min(max(need * 2, 1024), 5_000_000))
        ui = rng.integers(0, m, size=batch)
        vi = rng.integers(0, m, size=batch)
        attempts += batch
        for iu, iv in zip(ui.tolist(), vi.tolist()):
            a = endpoints[iu]
            b = endpoints[iv]
            if a == b:
                continue
            key = (a, b) if pfam_sort_key(a) <= pfam_sort_key(b) else (b, a)
            if key in forbidden or key in picked:
                continue
            picked.add(key)
            out_a.append(key[0])
            out_b.append(key[1])
            if len(out_a) >= target:
                break
    if len(out_a) < target:
        log(f"random_addition: only {len(out_a)}/{target} edges after "
            f"{attempts} attempts (forbidden/duplicate saturation)")
    else:
        log(f"random_addition: selected {target}/{target} edges in "
            f"{attempts} attempts")
    return np.array(out_a, dtype=object), np.array(out_b, dtype=object)


def main():
    """Run one selection, score it, and write the score JSON and pairs TSV.

    The objective ``J = w_pa*pa + w_deg*deg + w_cov*cov`` is computed against
    the positives that belong to the chosen method and is only reported; it
    does not influence the selection.
    """
    args = parse_args()
    # NOTE(review): allow_pickle=True unpickles object arrays, which is only
    # safe for trusted files -- presumably the pool always comes from the
    # pipeline's own build step; confirm.
    data = np.load(args.pool, allow_pickle=True)

    if args.method == "deletion":
        cand_a = data["cand_a"]
        cand_b = data["cand_b"]
        pool_dom = data["pool_dom"]
        dom_deg = data["pool_deg_r"].astype(np.int64)
        pos_edge_pa = data["pos_edge_pa_r"].astype(np.int64)
        target = int(data["n_positive_r"])
        n_pos_domains = int(data["n_positive_domains_r"])

        log(f"pool: {cand_a.size} candidate edges, {pool_dom.size} pool domains, "
            f"target = {target}")
        sel, cand_ai, cand_bi = select_deletion(
            cand_a, cand_b, pool_dom, dom_deg, target, args.seed
        )
        neg_ai = cand_ai[sel]
        neg_bi = cand_bi[sel]
        out_a = cand_a[sel]
        out_b = cand_b[sel]
    else:
        pos_a = data["pos_a"]
        pos_b = data["pos_b"]
        pos_dom = data["pos_dom"]
        dom_deg = data["pos_deg"].astype(np.int64)
        pos_edge_pa = data["pos_edge_pa"].astype(np.int64)
        target = int(data["n_positive"])
        n_pos_domains = int(data["n_positive_domains"])
        forbidden = set(zip(data["forbidden_a"].tolist(),
                            data["forbidden_b"].tolist()))

        log(f"positives: {pos_a.size} edges, {pos_dom.size} domains, "
            f"{len(forbidden)} forbidden pairs, target = {target}")
        out_a, out_b = select_random_addition(
            pos_a, pos_b, forbidden, target, args.seed
        )
        # Map the selected endpoints back to indices into pos_dom / dom_deg.
        domain_index = {d: i for i, d in enumerate(pos_dom)}
        neg_ai = np.fromiter((domain_index[a] for a in out_a), dtype=np.int64,
                             count=out_a.size)
        neg_bi = np.fromiter((domain_index[b] for b in out_b), dtype=np.int64,
                             count=out_b.size)

    # --- selected-set statistics / objective (reporting only) ---
    n_sel = int(out_a.size)
    neg_pa = (dom_deg[neg_ai] * dom_deg[neg_bi]).astype(np.int64)
    mean_pa_neg = float(neg_pa.mean()) if n_sel else 0.0

    # Negative degree per domain; domains without a selected edge stay at 0.
    neg_deg = np.zeros(dom_deg.size, dtype=np.int64)
    np.add.at(neg_deg, neg_ai, 1)
    np.add.at(neg_deg, neg_bi, 1)
    n_dom = int(np.count_nonzero(neg_deg))

    pos_logpa = np.log1p(pos_edge_pa.astype(float))
    neg_logpa = np.log1p(neg_pa.astype(float))
    spread = float(pos_logpa.max() - pos_logpa.min()) if pos_logpa.size else 0.0
    pa_term = wasserstein1(neg_logpa, pos_logpa) / spread if spread > 0 else 0.0
    deg_term = ks_statistic(neg_deg, dom_deg)
    cov_term = 1.0 - (n_dom / n_pos_domains) if n_pos_domains else 0.0
    j = args.w_pa * pa_term + args.w_deg * deg_term + args.w_cov * cov_term
    pos_mean_pa = float(pos_edge_pa.mean()) if pos_edge_pa.size else 0.0

    log(f"method={args.method} set=positive n_sel={target} n_dom={n_pos_domains} "
        f"mean_pa={pos_mean_pa:.1f}")
    log(f"method={args.method} set=negative seed={args.seed} J={j:.4f} "
        f"pa={pa_term:.4f} deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} "
        f"n_dom={n_dom} mean_pa={mean_pa_neg:.1f}")

    score = {
        "method": args.method,
        "seed": int(args.seed),
        "J": j,
        "pa": pa_term,
        "deg": deg_term,
        "cov": cov_term,
        "n_sel": n_sel,
        "n_dom": n_dom,
        "mean_pa": mean_pa_neg,
        "pos_n_sel": target,
        "pos_n_dom": n_pos_domains,
        "pos_mean_pa": pos_mean_pa,
        "w_pa": args.w_pa,
        "w_deg": args.w_deg,
        "w_cov": args.w_cov,
    }
    with open(args.score_out, "w") as fh:
        json.dump(score, fh)

    with open(args.pairs_out, "w") as fh:
        for a, b in zip(out_a.tolist(), out_b.tolist()):
            fh.write(f"{a}\t{b}\n")


if __name__ == "__main__":
    main()
withName: 'EXTRACT_UNIQUE_DOMAINS' { diff --git a/conf/test.config b/conf/test.config index 536b85e..7a6b2a8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,26 @@ process { params { config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_description = 'Minimal stub test dataset, fully offline (run with -stub)' - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + // Every input below is a tiny local placeholder under tests/data/. The + // pipeline test runs in stub mode (`-stub`), so processes never read these + // files; they only need to exist so Nextflow stages local paths instead of + // downloading the real multi-GB sources or hitting REST APIs. + + // Required file params (no defaults in nextflow.config) + hippie_tsv = "${projectDir}/tests/data/hippie.tsv" + ppidm_tsv = "${projectDir}/tests/data/ppidm.tsv" + negative_ppi_parquet = "${projectDir}/tests/data/negative_ppi.parquet" + + // Source URLs -> local fixtures + url_3did = "${projectDir}/tests/data/3did.sql.gz" + url_negatome = "${projectDir}/tests/data/negatome.txt" + url_uniprot_swissprot_pfam = "${projectDir}/tests/data/swissprot_pfam.tsv" + url_uniprot_id_mapping = "${projectDir}/tests/data/uniprot_id_mapping.dat.gz" + url_uniprot_go_terms = "${projectDir}/tests/data/uniprot_go_terms.tsv" + url_uniprot_sequences = "${projectDir}/tests/data/uniprot_sequences.fasta.gz" + url_uniprot_prott5_embeddings = "${projectDir}/tests/data/prott5.h5" + url_string = "${projectDir}/tests/data/string.txt.gz" + url_pfam2go = "${projectDir}/tests/data/pfam2go.txt" } diff --git a/modules.json b/modules.json index 6e03911..5595224 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "mmseqs/easycluster": { "branch": 
"master", - "git_sha": "38697a933bef7041bb935c9b8374d9948ce6c794", + "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120", "installed_by": ["modules"] } } @@ -16,7 +16,7 @@ "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "git_sha": "1a545fcbd762911c21a64ced3dbef99b2b51ac75", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { @@ -26,7 +26,7 @@ }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "fdc08b8b1ae74f56686ce21f7ea11ad11990ce57", + "git_sha": "a7b27fd25bfa8dcc07d299e88bd790585901a436", "installed_by": ["subworkflows"] } } diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf index b9e3fc5..99873b6 100644 --- a/modules/local/3did/main.nf +++ b/modules/local/3did/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_3DID_SQLITE { tag "3did" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path mysql_gz_file @@ -27,4 +27,11 @@ process DOWNLOAD_3DID_SQLITE { sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ + + stub: + """ + touch 3did.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf index 2ab9179..706d3b4 100644 --- a/modules/local/analyze_ddi_bias/main.nf +++ b/modules/local/analyze_ddi_bias/main.nf @@ -2,7 +2,7 @@ process ANALYZE_DDI_BIAS { tag "bias_analysis" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" @@ -24,4 +24,11 @@ process ANALYZE_DDI_BIAS { matplotlib: \$(python3 -c 'import matplotlib; print(matplotlib.__version__)') END_VERSIONS """ + + stub: + """ + mkdir 
bias_analysis + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_ppi_negative_ddis/environment.yml b/modules/local/build_ppi_negative_pool/environment.yml similarity index 92% rename from modules/local/insert_ppi_negative_ddis/environment.yml rename to modules/local/build_ppi_negative_pool/environment.yml index 8b9582f..61b1f9f 100644 --- a/modules/local/insert_ppi_negative_ddis/environment.yml +++ b/modules/local/build_ppi_negative_pool/environment.yml @@ -6,4 +6,5 @@ dependencies: - sqlite - pandas - pyarrow + - numpy - requests diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/build_ppi_negative_pool/main.nf similarity index 60% rename from modules/local/insert_ppi_negative_ddis/main.nf rename to modules/local/build_ppi_negative_pool/main.nf index 345760e..6d647ec 100644 --- a/modules/local/insert_ppi_negative_ddis/main.nf +++ b/modules/local/build_ppi_negative_pool/main.nf @@ -1,38 +1,47 @@ -process INSERT_PPI_NEGATIVE_DDIS { - tag "insert_ppi_negative_ddis" +process BUILD_PPI_NEGATIVE_POOL { + tag "build_ppi_negative_pool" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' path negative_ppi_parquet val min_n_tested - val source_label - val sampling_strategy + val self_interaction output: path "domainsplit.sqlite3", emit: domainsplit_db + path "neg_pool.npz", emit: neg_pool path "uniprot_pfam_mapping.json", emit: pfam_mapping path "versions.yml", emit: versions script: + def no_self = self_interaction ? 
"" : "--no-self" """ cp "${domainsplit_db_in}" domainsplit.sqlite3 - build_ppi_negative_ddis.py \\ + build_ppi_negative_pool.py \\ --db domainsplit.sqlite3 \\ --parquet "${negative_ppi_parquet}" \\ --pfam-mapping-out uniprot_pfam_mapping.json \\ + --pool-out neg_pool.npz \\ --min-n-tested ${min_n_tested} \\ - --source-label "${source_label}" \\ - --sampling-strategy "${sampling_strategy}" + ${no_self} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python3 -c 'import sys; print(sys.version.split()[0])') pyarrow: \$(python3 -c 'import pyarrow; print(pyarrow.__version__)') + numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ + + stub: + """ + touch domainsplit.sqlite3 neg_pool.npz uniprot_pfam_mapping.json + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf index 86edded..856709c 100644 --- a/modules/local/curate_domains/extract_unique_domains/main.nf +++ b/modules/local/curate_domains/extract_unique_domains/main.nf @@ -2,7 +2,7 @@ process EXTRACT_UNIQUE_DOMAINS { tag { "${domainsplit_db.simpleName}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db @@ -32,4 +32,11 @@ process EXTRACT_UNIQUE_DOMAINS { sqlite3: \$(sqlite3 --version | awk '{print \$1}') END_VERSIONS """ + + stub: + """ + echo PF00001 > pfam_ids.txt + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf index 2eb9437..a57c5f5 100644 --- a/modules/local/enrich/insert_domain_go_terms/main.nf +++ 
b/modules/local/enrich/insert_domain_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_GO_TERMS { tag "insert_domain_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' @@ -20,4 +20,11 @@ process INSERT_DOMAIN_GO_TERMS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf index 7f7d00c..04d3d2e 100644 --- a/modules/local/enrich/insert_domain_protein_mapping/main.nf +++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_PROTEIN_MAPPING { tag "insert_domain_protein_mapping" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' @@ -22,4 +22,11 @@ process INSERT_DOMAIN_PROTEIN_MAPPING { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf index 9a27846..0ecc502 100644 --- a/modules/local/enrich/insert_ppi/main.nf +++ b/modules/local/enrich/insert_ppi/main.nf @@ -2,7 +2,7 @@ process INSERT_PPI { tag "insert_ppi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" 
input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' @@ -22,4 +22,11 @@ process INSERT_PPI { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf index 3148d9c..a77973f 100644 --- a/modules/local/enrich/insert_protein_go_terms/main.nf +++ b/modules/local/enrich/insert_protein_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEIN_GO_TERMS { tag "insert_protein_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' @@ -20,4 +20,11 @@ process INSERT_PROTEIN_GO_TERMS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf index 79201a8..6e296e7 100644 --- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf +++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS { tag "insert_proteins_with_embeddings" label 'process_high' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' @@ -27,4 +27,11 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo 
'"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf index 7f65c06..e8d9eeb 100644 --- a/modules/local/esm_embeddings/main.nf +++ b/modules/local/esm_embeddings/main.nf @@ -22,7 +22,7 @@ process FILTER_SEQUENCES { tag { "${protein_domain_map.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path protein_domain_map @@ -66,6 +66,15 @@ process FILTER_SEQUENCES { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" biopython: {Bio.__version__}\\n") """ + + stub: + protein_meta = [id: "protein_sequences"] + domain_meta = [id: "domain_sequences"] + """ + touch uniprot_filtered.fasta.gz domain_sequences.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // Per-residue protein embeddings. One task per FASTA shard. @@ -74,7 +83,7 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-gpu:1.0.0" + container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' @@ -108,6 +117,13 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK { --max-len ${params.esm_max_len} \\ --smoke-limit ${smoke} """ + + stub: + """ + touch ${input_fasta.simpleName}.esm.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // GPU-pooled domain embeddings. One task per FASTA shard. 
@@ -116,7 +132,7 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-gpu:1.0.0" + container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' @@ -150,6 +166,13 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK { --max-len ${params.esm_max_len} \\ --smoke-limit ${smoke} """ + + stub: + """ + touch ${input_fasta.simpleName}.esm.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } workflow generate_esm_embeddings { @@ -169,7 +192,18 @@ workflow generate_esm_embeddings { protein_embeddings = JOIN_PROTEIN_EMBEDDINGS('esm_protein_embeddings', protein_chunks.chunk.collect()).joined domain_embeddings = JOIN_DOMAIN_EMBEDDINGS('esm_domain_embeddings', domain_chunks.chunk.collect() ).joined + ch_versions = Channel.empty().mix( + FILTER_SEQUENCES.out.versions, + SHARD_PROTEIN_FASTA.out.versions, + SHARD_DOMAIN_FASTA.out.versions, + GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK.out.versions, + GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK.out.versions, + JOIN_PROTEIN_EMBEDDINGS.out.versions, + JOIN_DOMAIN_EMBEDDINGS.out.versions, + ) + emit: protein_embeddings domain_embeddings + versions = ch_versions } diff --git a/modules/local/insert_ddis/environment.yml b/modules/local/external_validation_split/environment.yml similarity index 100% rename from modules/local/insert_ddis/environment.yml rename to modules/local/external_validation_split/environment.yml diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf new file mode 100644 index 0000000..cbd11bd --- /dev/null +++ b/modules/local/external_validation_split/main.nf @@ -0,0 +1,112 @@ +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    SUBSET_DDIS_BY_SOURCE -- copy the input database, keep only DDIs whose
+    `source` is in `source_filter` (NULL sources are dropped), then prune orphans.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Used for the External-Validation test set, which is placed "as is" (no
+    leakage-aware partitioning) from the held-out sources.
+----------------------------------------------------------------------------*/
+
+process SUBSET_DDIS_BY_SOURCE {
+    tag "subset_${split_name}"
+    label 'process_medium'
+    conda "${moduleDir}/environment.yml"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path 'domainsplit.sqlite3'
+    val source_filter     // list of DDI source strings to keep; [] keeps no DDIs
+    val split_name        // output split name, e.g. 'test'
+
+    output:
+    path('*.sqlite3'), emit: split_dbs
+    val output_split_info, emit: split_info
+    path "versions.yml", emit: versions
+
+    script:
+    output_split_info = [["${split_name}.sqlite3", split_name]]
+    def src_list = source_filter.collect { "'${it}'" }.join(", ")
+    // src_list is interpolated into the Python script below as the items of a list literal.
+    """
+    #!/usr/bin/env python3
+    import os
+    os.environ["SQLITE_TMPDIR"] = os.getcwd()  # SQLite temporary files go to the current working directory
+
+    import sqlite3
+    import shutil
+    import sys
+
+    input_db_path = "domainsplit.sqlite3"
+    output_path = "${split_name}.sqlite3"
+    sources = tuple([${src_list}])  # () for an empty filter; the literal "(,)" would be a SyntaxError
+
+    shutil.copyfile(input_db_path, output_path)
+
+    conn = sqlite3.connect(output_path)
+    conn.executescript('''
+        PRAGMA foreign_keys=ON;
+        PRAGMA journal_mode=OFF;
+        PRAGMA synchronous=OFF;
+    ''')
+
+    placeholders = ",".join("?" for _ in sources)
+    n_keep = conn.execute(
+        f"SELECT COUNT(*) FROM domain_domain_interaction WHERE source IN ({placeholders})",
+        sources,
+    ).fetchone()[0]
+    print(f"Keeping {n_keep} DDIs with source in {sources}", flush=True)
+    # A NULL source makes NOT IN evaluate to NULL (row kept), so NULL sources are deleted explicitly.
+    conn.execute(
+        f"DELETE FROM domain_domain_interaction WHERE source IS NULL OR source NOT IN ({placeholders})",
+        sources,
+    )
+    # Drop domains that have no DDI left or no protein mapping.
+    conn.execute('''
+        DELETE FROM domain WHERE id IN (
+            SELECT d.id FROM domain d
+            LEFT JOIN domain_domain_interaction ddi
+                ON ddi.domain_id_a = d.id OR ddi.domain_id_b = d.id
+            LEFT JOIN domain_protein_map dpm
+                ON dpm.domain_id = d.id
+            WHERE ddi.id IS NULL OR dpm.domain_id IS NULL
+        )
+    ''')
+    # Drop proteins that have no domain_protein_map row.
+    conn.execute('''
+        DELETE FROM protein WHERE id IN (
+            SELECT p.id FROM protein p
+            LEFT JOIN domain_protein_map dpm
+                ON dpm.protein_id = p.id
+            WHERE dpm.domain_id IS NULL
+        )
+    ''')
+    # executescript() issues an implicit COMMIT for the pending DELETEs before it runs VACUUM.
+    conn.executescript('''
+        VACUUM;
+
+        CREATE INDEX IF NOT EXISTS idx_ddi_domain_a ON domain_domain_interaction(domain_id_a);
+        CREATE INDEX IF NOT EXISTS idx_ddi_domain_b ON domain_domain_interaction(domain_id_b);
+        CREATE INDEX IF NOT EXISTS idx_dpm_domain ON domain_protein_map(domain_id);
+        CREATE INDEX IF NOT EXISTS idx_dpm_protein ON domain_protein_map(protein_id);
+        CREATE INDEX IF NOT EXISTS idx_ppi_protein_a ON protein_protein_interaction(protein_id_a);
+        CREATE INDEX IF NOT EXISTS idx_ppi_protein_b ON protein_protein_interaction(protein_id_b);
+        CREATE INDEX IF NOT EXISTS idx_pgo_protein ON protein_go_terms(protein_id);
+    ''')
+
+    conn.close()
+    print(f" {output_path}: done", flush=True)
+
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write(f" python: {sys.version.split()[0]}\\n")
+        f.write(f" sqlite3: {sqlite3.sqlite_version}\\n")
+    """
+
+    stub:
+    output_split_info = [["${split_name}.sqlite3", split_name]]
+    """
+    touch ${split_name}.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo ' stub: "true"' >> versions.yml
+    """
+}
diff --git a/modules/local/init_domainsplit_db/main.nf
b/modules/local/init_domainsplit_db/main.nf index 95cb1c1..53bf98e 100644 --- a/modules/local/init_domainsplit_db/main.nf +++ b/modules/local/init_domainsplit_db/main.nf @@ -2,7 +2,7 @@ process INIT_DOMAINSPLIT_DB { tag "init_domainsplit_db" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" output: path "domainsplit.sqlite3", emit: domainsplit_db @@ -31,7 +31,7 @@ process INIT_DOMAINSPLIT_DB { source VARCHAR(255), FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, - UNIQUE(domain_id_a, domain_id_b) + UNIQUE(domain_id_a, domain_id_b, source) ); CREATE TABLE protein ( @@ -83,4 +83,11 @@ process INIT_DOMAINSPLIT_DB { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_3did/environment.yml b/modules/local/insert_3did/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_3did/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf new file mode 100644 index 0000000..5434f10 --- /dev/null +++ b/modules/local/insert_3did/main.nf @@ -0,0 +1,32 @@ +process INSERT_3DID { + tag "insert_3did" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path sqlite_3did + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + 
+ insert_3did.py \\ + --db domainsplit.sqlite3 \\ + --sqlite-3did ${sqlite_3did} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/insert_ddis/main.nf b/modules/local/insert_ddis/main.nf deleted file mode 100644 index 1acf264..0000000 --- a/modules/local/insert_ddis/main.nf +++ /dev/null @@ -1,99 +0,0 @@ -process INSERT_DDIS { - tag "insert_ddis" - label 'process_low' - conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" - - input: - path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' - path sqlite_3did - path negatome_txt - - output: - path "domainsplit.sqlite3", emit: domainsplit_db - path "versions.yml", emit: versions - - script: - """ - #!/usr/bin/env python3 - import shutil - import sqlite3 - import sys - - shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3") - - conn_3did = sqlite3.connect("${sqlite_3did}") - conn_domainsplit = sqlite3.connect("domainsplit.sqlite3") - conn_domainsplit.execute("PRAGMA foreign_keys=ON") - conn_domainsplit.execute("PRAGMA journal_mode=OFF") - conn_domainsplit.execute("PRAGMA synchronous=OFF") - - def iter_negatome_pairs(path): - with open(path) as f: - for line in f: - tokens = line.split() - if len(tokens) < 2: - continue - yield tokens[0], tokens[1] - - def negatome_pfam_ids(path): - ids = set() - for a, b in iter_negatome_pairs(path): - ids.add(a) - ids.add(b) - return ids - - # ---- domain rows: 3did Domain x domain_length + negatome pfam ids - print("Inserting domain information", flush=True) - cursor = conn_3did.execute( - "SELECT Name, Pfam_id, profile_length " - "FROM Domain, domain_length " - "WHERE domain_length.domain = Domain.Name" - ) - domain_rows_3did = ((name, pfam_id.split(".")[0]) for (name, pfam_id, _length) in cursor) - domain_rows_negatome = ((None, pfam_id) for 
pfam_id in negatome_pfam_ids("${negatome_txt}")) - - conn_domainsplit.executemany( - "INSERT OR IGNORE INTO domain(name, pfam_id) VALUES (?, ?);", - list(domain_rows_3did) + list(domain_rows_negatome), - ) - cursor.close() - conn_domainsplit.commit() - - # ---- positive DDIs from 3did - print("Inserting positive DDIs from 3did", flush=True) - cursor = conn_3did.execute( - "SELECT d1.Pfam_id, d2.Pfam_id " - "FROM DDI1, Domain AS d1, Domain AS d2 " - "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name;" - ) - pos_iter = ((id_1.split(".")[0], id_2.split(".")[0]) for (id_1, id_2) in cursor) - conn_domainsplit.executemany( - '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source) - SELECT d1.id, d2.id, FALSE, '3did' - FROM domain AS d1, domain AS d2 - WHERE d1.pfam_id = ? AND d2.pfam_id = ?;''', - pos_iter, - ) - cursor.close() - conn_domainsplit.commit() - - # ---- negative DDIs from negatome - print("Inserting negative DDIs from negatome", flush=True) - conn_domainsplit.executemany( - '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source) - SELECT d1.id, d2.id, TRUE, 'negatome' - FROM domain AS d1, domain AS d2 - WHERE d1.pfam_id = ? 
AND d2.pfam_id = ?;''', - iter_negatome_pairs("${negatome_txt}"), - ) - conn_domainsplit.commit() - conn_domainsplit.close() - conn_3did.close() - - with open("versions.yml", "w") as f: - f.write('"${task.process}":\\n') - f.write(f" python: {sys.version.split()[0]}\\n") - f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") - """ -} diff --git a/modules/local/insert_negatome/environment.yml b/modules/local/insert_negatome/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_negatome/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf new file mode 100644 index 0000000..5db9dc8 --- /dev/null +++ b/modules/local/insert_negatome/main.nf @@ -0,0 +1,32 @@ +process INSERT_NEGATOME { + tag "insert_negatome" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path negatome_txt + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_negatome.py \\ + --db domainsplit.sqlite3 \\ + --negatome ${negatome_txt} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/insert_ppi_negative_selection/environment.yml b/modules/local/insert_ppi_negative_selection/environment.yml new file mode 100644 index 0000000..ac84956 --- /dev/null +++ b/modules/local/insert_ppi_negative_selection/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.12 + - sqlite + - numpy diff --git 
a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
new file mode 100644
index 0000000..9d5daf9
--- /dev/null
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -0,0 +1,47 @@
+process INSERT_PPI_NEGATIVE_SELECTION {  // runs insert_ppi_negative_selection.py on a copy of the input DB
+    tag "insert_ppi_negative_selection"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
+    // The input DB is staged as input.domainsplit.sqlite3; the script section copies it to domainsplit.sqlite3.
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
+    path neg_pool               // passed to the script as --pool
+    path pairs_deletion         // passed as --pairs-deletion
+    path pairs_random_addition  // passed as --pairs-random-addition
+    path score_deletion         // passed as --score-deletion
+    path score_random_addition  // passed as --score-random-addition
+    // NOTE(review): pairs_*/score_* look like SELECT_PPI_NEGATIVE_DANS outputs (pairs_<method>.tsv, score_<method>.json) -- confirm in the workflow wiring.
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db          // the copy handed to the script via --db
+    path "negative_ppi_method_scores.tsv", emit: scores       // file named by --scores-out
+    path "versions.yml", emit: versions
+    // script: copy the DB, run the insertion script on the copy, then record python/numpy/sqlite3 versions.
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_ppi_negative_selection.py \\
+        --db domainsplit.sqlite3 \\
+        --pool "${neg_pool}" \\
+        --pairs-deletion "${pairs_deletion}" \\
+        --pairs-random-addition "${pairs_random_addition}" \\
+        --score-deletion "${score_deletion}" \\
+        --score-random-addition "${score_random_addition}" \\
+        --scores-out negative_ppi_method_scores.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python3 -c 'import sys; print(sys.version.split()[0])')
+        numpy: \$(python3 -c 'import numpy; print(numpy.__version__)')
+        sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
+    END_VERSIONS
+    """
+    // stub: create empty outputs and a placeholder versions.yml without running the insertion script.
+    stub:
+    """
+    touch domainsplit.sqlite3 negative_ppi_method_scores.tsv
+    echo '"${task.process}":' > versions.yml
+    echo ' stub: "true"' >> versions.yml
+    """
+}
diff --git a/modules/local/insert_ppidm/environment.yml b/modules/local/insert_ppidm/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/insert_ppidm/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git
a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf new file mode 100644 index 0000000..3c5244e --- /dev/null +++ b/modules/local/insert_ppidm/main.nf @@ -0,0 +1,34 @@ +process INSERT_PPIDM { + tag "insert_ppidm" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path ppidm_tsv + val classes + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_ppidm.py \\ + --db domainsplit.sqlite3 \\ + --ppidm ${ppidm_tsv} \\ + --classes "${classes}" \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/insert_single_domain_ppi/environment.yml b/modules/local/insert_single_domain_ppi/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_single_domain_ppi/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf new file mode 100644 index 0000000..3884ba6 --- /dev/null +++ b/modules/local/insert_single_domain_ppi/main.nf @@ -0,0 +1,36 @@ +process INSERT_SINGLE_DOMAIN_PPI { + tag "insert_single_domain_ppi" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path hippie_tsv + path swissprot_map + val min_score + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" 
domainsplit.sqlite3 + + insert_single_domain_ppi.py \\ + --db domainsplit.sqlite3 \\ + --hippie ${hippie_tsv} \\ + --swissprot-map ${swissprot_map} \\ + --min-score ${min_score} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf index b0a4099..f5130d9 100644 --- a/modules/local/minimal_leakage_split/main.nf +++ b/modules/local/minimal_leakage_split/main.nf @@ -2,7 +2,7 @@ process EXTRACT_DOMAIN_SEQUENCES { tag "domains" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" @@ -29,18 +29,26 @@ process EXTRACT_DOMAIN_SEQUENCES { sqlite3: \$(sqlite3 --version | awk '{print \$1}') END_VERSIONS """ + + stub: + """ + touch domain_sequences.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } process MINIMAL_LEAKAGE_SPLIT_DOMAIN { tag "minimal_leakage_domain" label 'process_high' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" val split_fractions // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)] path ("domain_clusters.tsv") + val source_filter // list of DDI source strings to include; [] = all sources output: path('*.sqlite3'), emit: split_dbs @@ -59,6 +67,9 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ") def split_fraction_dict_py = "{" + split_fraction_dict_str + "}" + def src_list = source_filter.collect { "'${it}'" }.join(", ") + def 
where_clause = source_filter ? "WHERE source IN (${src_list})" : "" + """ #!/usr/bin/env python3 \"\"\" @@ -112,7 +123,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { # ── Load DDI data ──────────────────────────────────────────────── conn = sqlite3.connect(input_db_path) ddi_rows = conn.execute( - "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction" + "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction ${where_clause}" ).fetchall() conn.close() print(f"Loaded {len(ddi_rows)} DDIs") @@ -393,4 +404,14 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { f.write(f" python: {_sys.version.split()[0]}\\n") f.write(f" numpy: {np.__version__}\\n") """ + + stub: + output_split_info = [] + split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] } + def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n ") + """ + ${touch_cmds} + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf index fae1152..a3eaf61 100644 --- a/modules/local/negatome/main.nf +++ b/modules/local/negatome/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_NEGATOME { tag "negatome" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val url @@ -53,4 +53,11 @@ process DOWNLOAD_NEGATOME { f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + """ + touch combined_pfam.txt + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf index b5507f4..b48cf04 100644 --- a/modules/local/pfam/main.nf +++ b/modules/local/pfam/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_PFAM_ALIGNMENTS_BATCH { tag { "batch_${pfam_ids_list.size()}" } label 'process_low' conda 
"${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" maxRetries 3 errorStrategy { task.attempt <= 3 ? 'retry' : 'ignore' } @@ -75,13 +75,20 @@ with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + """ + touch PF00001.alignment.full.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } process CREATE_PROTEIN_DOMAIN_MAPPING { tag { "${uniprot_map_file.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path uniprot_map_file @@ -169,4 +176,12 @@ with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + out_path = 'protein_domain_mapping.csv.gz' + """ + touch ${out_path} + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf index 5e67e95..e9a4564 100644 --- a/modules/local/random_ddi_split/main.nf +++ b/modules/local/random_ddi_split/main.nf @@ -2,11 +2,12 @@ process RANDOM_DDI_SPLIT { tag "random_ddi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path 'domainsplit.sqlite3' val split_fractions // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)] + val source_filter // list of DDI source strings to include; [] = all sources output: path('*.sqlite3'), emit: split_dbs @@ -25,6 +26,9 @@ process RANDOM_DDI_SPLIT { def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ") def 
split_fraction_dict_py = "{" + split_fraction_dict_str + "}" + def src_list = source_filter.collect { "'${it}'" }.join(", ") + def where_clause = source_filter ? "WHERE source IN (${src_list})" : "" + """ #!/usr/bin/env python3 @@ -39,7 +43,7 @@ process RANDOM_DDI_SPLIT { split_fractions = ${split_fraction_dict_py} conn = sqlite3.connect(input_db_path) - ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction")] + ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction ${where_clause}")] conn.close() random.shuffle(ddi_ids) @@ -122,4 +126,14 @@ process RANDOM_DDI_SPLIT { f.write('"${task.process}":\\n') f.write(f" python: {_sys.version.split()[0]}\\n") """ + + stub: + output_split_info = [] + split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] } + def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n ") + """ + ${touch_cmds} + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/remove_self_interactions/environment.yml b/modules/local/remove_self_interactions/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/remove_self_interactions/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf new file mode 100644 index 0000000..f453df5 --- /dev/null +++ b/modules/local/remove_self_interactions/main.nf @@ -0,0 +1,47 @@ +process REMOVE_SELF_INTERACTIONS { + tag "remove_self_interactions" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: 
+ """ + #!/usr/bin/env python3 + import shutil + import sqlite3 + import sys + + shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3") + + conn = sqlite3.connect("domainsplit.sqlite3") + conn.execute("PRAGMA foreign_keys=ON") + before = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + conn.execute( + "DELETE FROM domain_domain_interaction WHERE domain_id_a = domain_id_b" + ) + conn.commit() + after = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + conn.close() + print(f"[remove_self_interactions] removed {before - after} self-DDIs " + f"({before} -> {after})", flush=True) + + with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f" python: {sys.version.split()[0]}\\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") + """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/select_ppi_negative_dans/environment.yml b/modules/local/select_ppi_negative_dans/environment.yml new file mode 100644 index 0000000..22cb361 --- /dev/null +++ b/modules/local/select_ppi_negative_dans/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.12 + - numpy diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf new file mode 100644 index 0000000..d06c827 --- /dev/null +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -0,0 +1,39 @@ +process SELECT_PPI_NEGATIVE_DANS { + tag "select_ppi_negative_dans:${method}" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + val method + val seed + path neg_pool + + output: + path "score_${method}.json", emit: score + path "pairs_${method}.tsv", emit: pairs + path "versions.yml", emit: versions + + script: + """ + select_ppi_negative_dans.py \\ + --pool 
"${neg_pool}" \\ + --method ${method} \\ + --seed ${seed} \\ + --score-out score_${method}.json \\ + --pairs-out pairs_${method}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 -c 'import sys; print(sys.version.split()[0])') + numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') + END_VERSIONS + """ + + stub: + """ + touch score_${method}.json pairs_${method}.tsv + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/smoke_filter/main.nf b/modules/local/smoke_filter/main.nf index 3f60a3b..fb2232b 100644 --- a/modules/local/smoke_filter/main.nf +++ b/modules/local/smoke_filter/main.nf @@ -2,7 +2,7 @@ process SMOKE_FILTER { tag "smoke_filter" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db @@ -88,4 +88,11 @@ process SMOKE_FILTER { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + """ + touch domainsplit.smoke.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/swissprot_map/environment.yml b/modules/local/swissprot_map/environment.yml new file mode 100644 index 0000000..150b843 --- /dev/null +++ b/modules/local/swissprot_map/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf new file mode 100644 index 0000000..d79b51c --- /dev/null +++ b/modules/local/swissprot_map/main.nf @@ -0,0 +1,29 @@ +process BUILD_SWISSPROT_PFAM_MAP { + tag "swissprot_map" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" + + input: + val url + + output: + path 
"swissprot_pfam_map.json", emit: map + path "versions.yml", emit: versions + + script: + """ + build_swissprot_pfam_map.py \\ + --url "${url}" \\ + --out swissprot_pfam_map.json \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ + + stub: + """ + touch swissprot_pfam_map.json + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ +} diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf index 2d4d80c..4d57032 100644 --- a/modules/local/util/main.nf +++ b/modules/local/util/main.nf @@ -15,7 +15,7 @@ process SHARD_FASTA { tag { "${input_fasta.simpleName}:${num_shards}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: tuple val(meta), path(input_fasta) @@ -39,6 +39,13 @@ process SHARD_FASTA { print(f" biopython: {Bio.__version__}") PY """ + + stub: + """ + touch ${meta.id}_shard_0.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // Merge a collection of HDF5 chunks (plain or gzipped) into one HDF5 file. 
@@ -51,7 +58,7 @@ process JOIN_HDF_FILES { tag { output_name } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val output_name @@ -89,4 +96,11 @@ process JOIN_HDF_FILES { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" h5py: {h5py.__version__}\\n") """ + + stub: + """ + touch ${output_name}.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/nf-core/mmseqs/easycluster/main.nf b/modules/nf-core/mmseqs/easycluster/main.nf index b4686ab..ded1cb8 100644 --- a/modules/nf-core/mmseqs/easycluster/main.nf +++ b/modules/nf-core/mmseqs/easycluster/main.nf @@ -3,7 +3,7 @@ process MMSEQS_EASYCLUSTER { label 'process_medium' conda "${moduleDir}/environment.yml" - container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fe/fe49c17754753d6cd9a31e5894117edaf1c81e3d6053a12bf6dc8f3af1dffe23/data' : 'community.wave.seqera.io/library/mmseqs2:18.8cc5c--af05c9a98d9f6139'}" @@ -14,7 +14,7 @@ process MMSEQS_EASYCLUSTER { tuple val(meta), path("*rep_seq.fasta"), emit: representatives tuple val(meta), path("*all_seqs.fasta"), emit: fasta tuple val(meta), path("*.tsv"), emit: tsv - path "versions.yml", emit: versions + tuple val("${task.process}"), val('mmseqs'), eval('mmseqs version'), topic: versions, emit: versions_mmseqs when: task.ext.when == null || task.ext.when @@ -31,10 +31,6 @@ process MMSEQS_EASYCLUSTER { ${args} \\ --threads ${task.cpus} - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') - END_VERSIONS """ stub: @@ -47,9 +43,5 @@ process MMSEQS_EASYCLUSTER { touch ${prefix}_rep_seq.fasta touch ${prefix}_all_seqs.fasta - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') - END_VERSIONS """ } diff --git a/modules/nf-core/mmseqs/easycluster/meta.yml b/modules/nf-core/mmseqs/easycluster/meta.yml index 4451857..0b838ec 100644 --- a/modules/nf-core/mmseqs/easycluster/meta.yml +++ b/modules/nf-core/mmseqs/easycluster/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "mmseqs_easycluster" description: Cluster sequences using MMSeqs2 easy cluster. 
keywords: @@ -15,7 +14,8 @@ tools: documentation: "https://mmseqs.com/latest/userguide.pdf" tool_dev_url: "https://github.com/soedinglab/MMseqs2" doi: "10.1093/bioinformatics/btw006" - licence: ["GPL v3"] + licence: + - "GPL v3" identifier: biotools:mmseqs input: - - meta: @@ -62,13 +62,27 @@ output: description: an adjacency list file containing the clusters ontologies: - edam: http://edamontology.org/format_3475 # TSV + versions_mmseqs: + - - ${task.process}: + type: string + description: The name of the process + - mmseqs: + type: string + description: The name of the tool + - mmseqs version: + type: eval + description: The expression to obtain the version of the tool +topics: versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML + - - ${task.process}: + type: string + description: The name of the process + - mmseqs: + type: string + description: The name of the tool + - mmseqs version: + type: eval + description: The expression to obtain the version of the tool authors: - "@Joon-Klaps" maintainers: diff --git a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap index edb2c7e..5d27d6e 100644 --- a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap +++ b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap @@ -30,7 +30,11 @@ ] ], "3": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ], "fasta": [ [ @@ -59,16 +63,20 @@ "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "versions": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + "versions_mmseqs": [ + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ] } ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.10.0" + "nextflow": "25.10.4" }, - "timestamp": "2025-11-01T16:21:16.919838587" + "timestamp": 
"2026-02-12T11:27:50.850138372" }, "mmseqs/easycluster - sarscov2 - proteome": { "content": [ @@ -101,7 +109,11 @@ ] ], "3": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ], "fasta": [ [ @@ -130,15 +142,19 @@ "test_cluster.tsv:md5,1cad5ce35cf71f8c438fd3ec5a786946" ] ], - "versions": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + "versions_mmseqs": [ + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ] } ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.10.0" + "nextflow": "25.10.4" }, - "timestamp": "2025-11-01T16:21:12.483762944" + "timestamp": "2026-02-12T11:27:44.451570131" } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index e5693d9..6533539 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,11 +10,11 @@ params { // Source database URLs - url_3did = 'https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz' + url_3did = 'https://3did.irbbarcelona.org/download/current/3did.sql.gz' // URLs for uniprot data sources url_uniprot_id_mapping = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz' - url_uniprot_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5' + url_uniprot_prott5_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5' url_uniprot_go_terms = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Cgo_id&format=tsv&query=%28%28database%3AGO%29+AND+%28reviewed%3Atrue%29%29' url_uniprot_sequences = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz' @@ -25,11 +25,6 @@ params { url_pfam_template = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download' - // ProtT5 per-residue embeddings: local path to the EBI 
per-residue.h5 file. - // Download from https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/uniprot_sprot/per-residue.h5 - // When null or file missing, ProtT5 embeddings are skipped with a warning. - prott5_per_residue_h5 = null - // ESM embedding sharding + inference knobs. // The protein FASTA is split into `esm_protein_shards` shards that run in // parallel as GPU tasks (capped by cluster QoS via `maxForks` in slurm.config). @@ -62,19 +57,40 @@ params { // null (default) to disable the smoke filter entirely. smoke_test_n_ddis = null - // Optional second negative-DDI source derived from a Y2H/MS PPI parquet - // (columns: gene_name_bait, gene_name_prey, n_tested, ...). When the path - // is non-null the COLLECT_DDI_DATA subworkflow filters rows by `n_tested`, - // maps genes to UniProt via the Swiss-Prot flat file, looks up Pfam domains via - // the UniProt REST API, enumerates every Pfam-pair cross - // product restricted to Pfam IDs already present in positive DDIs, ranks - // pairs by co-occurrence frequency, and inserts the top-N as negatives so - // the total negative count matches the positive count. Leave the parquet - // null (default) to skip the whole step. + // Negative-DDI source derived from a Y2H/MS PPI parquet (columns: + // gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA filters + // rows by `n_tested`, maps genes to UniProt then Pfam (via the UniProt REST + // API), enumerates the Pfam-pair cross product restricted to Pfam IDs already + // present in positive DDIs, samples pairs by degree-aware node sampling + // (DANS; weight = preferential attachment in the positive graph), and + // inserts them as negatives so the total negative count matches the positive + // count. Required input (no default; must be supplied per run). 
negative_ppi_parquet = null negative_ppi_min_n_tested = 5 - negative_ppi_source_label = 'y2h_ms' - negative_sampling_strategy = 'degree_matched' // 'frequency' or 'degree_matched' + // Random seed for the (single-run, uncapped DANS) negative-DDI selection; + // both methods ("deletion" and "random_addition") use this seed. + negative_ppi_seed = 42 + + // Reviewed-human UniProt -> Pfam stream used to detect single-domain + // proteins (accession, entry name, gene names, Pfam xrefs). + url_uniprot_swissprot_pfam = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29' + + // Positive DDIs inferred from HIPPIE PPIs between two single-domain proteins. + // Required input (no default). Rows are kept when the HIPPIE confidence + // score (column 5) is >= hippie_min_score. + hippie_tsv = null + hippie_min_score = 0.63 + + // Positive DDIs from PPIDM predictions (predicted_ddi_ppi.tsv with columns + // domain_1, domain_2, class). Each row's source is 'PPIDM_' followed by its class, e.g. 'PPIDM_Gold'. + // Required input (no default). + ppidm_tsv = null + ppidm_classes = 'Bronze,Silver,Gold' + + // When false, all self-interactions (a domain interacting with itself) are + // removed after the positive/negatome sources are inserted, and the + // high-conf non-PPI negative builder skips self-pairs. + self_interaction = true // Boilerplate options input = null @@ -146,7 +162,10 @@ profiles { shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + // -e HOME=/tmp: the micromamba-based images run `micromamba run` as their + // entrypoint, which needs a writable HOME for its proc dir. With `-u uid:gid` + // the container has no home (HOME=/), so point it at world-writable /tmp. 
+ docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp' } arm64 { process.arch = 'arm64' @@ -160,7 +179,7 @@ profiles { wave.strategy = 'conda,container' } emulate_amd64 { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp --platform=linux/amd64' } singularity { singularity.enabled = true @@ -217,7 +236,7 @@ profiles { wave.strategy = 'conda,container' } gpu { - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp --gpus all' apptainer.runOptions = '--nv' singularity.runOptions = '--nv' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 7fbe6ef..bfb39b3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,6 +15,7 @@ "input": { "type": "string", "format": "file-path", + "mimetype": "text/csv", "description": "Unused samplesheet placeholder; this pipeline reads inputs from the url_* parameters instead.", "help_text": "Kept for nf-core template compatibility. 
Source databases are configured via the url_* parameters in nextflow.config.", "fa_icon": "fas fa-file-csv", @@ -36,15 +37,16 @@ } }, "source_database_options": { - "title": "Source database URLs", + "title": "Source databases", "type": "object", "fa_icon": "fas fa-database", - "description": "URLs of the public source databases used to assemble the domainsplit database.", + "description": "Public source databases (URLs and local file paths) used to assemble the domainsplit database.", + "required": ["hippie_tsv", "ppidm_tsv", "negative_ppi_parquet"], "properties": { "url_3did": { "type": "string", "description": "URL of the 3did SQL dump archive.", - "default": "https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz", + "default": "https://3did.irbbarcelona.org/download/current/3did.sql.gz", "fa_icon": "fas fa-link" }, "url_uniprot_id_mapping": { @@ -53,7 +55,7 @@ "default": "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", "fa_icon": "fas fa-link" }, - "url_uniprot_embeddings": { + "url_uniprot_prott5_embeddings": { "type": "string", "description": "URL of the per-residue ProtT5 UniProt embeddings HDF5.", "default": "https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5", @@ -95,12 +97,6 @@ "default": "https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download", "fa_icon": "fas fa-link" }, - "prott5_per_residue_h5": { - "type": "string", - "description": "Local path to the EBI ProtT5 per-residue HDF5 file. When null or file missing, ProtT5 embeddings are skipped with a warning.", - "default": null, - "fa_icon": "fas fa-file" - }, "esm_protein_shards": { "type": "integer", "description": "Number of FASTA shards for parallel per-residue ESM embedding generation over the protein sequences. 
Each shard runs as one GPU task; cluster QoS caps concurrency (see slurm.config).", @@ -156,10 +152,9 @@ "fa_icon": "fas fa-hashtag" }, "negative_ppi_parquet": { - "type": ["string", "null"], + "type": ["string"], "format": "file-path", - "description": "Optional path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). When set, COLLECT_DDI_DATA derives extra negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting the most-frequent Pfam-pair candidates restricted to domains already in positive DDIs. Null (default) disables this source.", - "default": null, + "description": "Required path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA derives negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting Pfam-pair candidates selected by degree-aware node sampling (DANS), restricted to domains already in positive DDIs.", "fa_icon": "fas fa-file-import" }, "negative_ppi_min_n_tested": { @@ -169,11 +164,11 @@ "minimum": 1, "fa_icon": "fas fa-filter" }, - "negative_ppi_source_label": { - "type": "string", - "description": "Value written to the `source` column of domain_domain_interaction for DDIs derived from negative_ppi_parquet.", - "default": "y2h_ms", - "fa_icon": "fas fa-tag" + "negative_ppi_seed": { + "type": "integer", + "description": "Random seed for the single-run, uncapped DANS negative-DDI selection (used by both the deletion and random_addition methods).", + "default": 42, + "fa_icon": "fas fa-dice" }, "pfam_download_batch_size": { "type": "integer", @@ -182,12 +177,43 @@ "minimum": 1, "fa_icon": "fas fa-layer-group" }, - "negative_sampling_strategy": { + "url_uniprot_swissprot_pfam": { "type": "string", - "description": "Strategy for sampling negative DDIs from the PPI parquet source.", - "default": "degree_matched", - "enum": ["frequency", "degree_matched"], - "fa_icon": "fas fa-random" + "description": 
"UniProt stream URL (or local TSV path) of reviewed human proteins with fields accession, entry name, gene names and Pfam xrefs. Used to detect single-domain proteins for the HIPPIE-derived positive DDIs.", + "default": "https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29", + "fa_icon": "fas fa-link" + }, + "hippie_tsv": { + "type": ["string"], + "format": "file-path", + "description": "Required path to a HIPPIE PPI TSV. COLLECT_DDI_DATA adds positive DDIs inferred from PPIs between two single-domain proteins.", + "fa_icon": "fas fa-file-import" + }, + "hippie_min_score": { + "type": "number", + "description": "Minimum HIPPIE confidence score (column 5) required to keep a PPI row when inferring single-domain positive DDIs.", + "default": 0.63, + "minimum": 0, + "maximum": 1, + "fa_icon": "fas fa-filter" + }, + "ppidm_tsv": { + "type": ["string"], + "format": "file-path", + "description": "Required path to a PPIDM predictions TSV (columns: domain_1, domain_2, class). COLLECT_DDI_DATA adds positive DDIs tagged with source 'PPIDM_'.", + "fa_icon": "fas fa-file-import" + }, + "ppidm_classes": { + "type": "string", + "description": "Comma-separated PPIDM confidence classes to include.", + "default": "Bronze,Silver,Gold", + "fa_icon": "fas fa-tags" + }, + "self_interaction": { + "type": "boolean", + "description": "When false, all self-interactions (a domain interacting with itself) are removed after the positive/negatome sources are inserted, and the high-confidence non-PPI negative builder skips self-pairs.", + "default": true, + "fa_icon": "fas fa-redo" } } }, diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 5d44942..fe21fea 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -4,17 +4,33 @@ pre-initialised Domainsplit SQLite. 
Downstream code consumes only the database; the 3did SQLite stays internal to this subworkflow. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Sources are inserted in a fixed order so that, on a duplicate domain pair, + the earlier source wins the label (INSERT OR IGNORE): + + 3did -> single-domain PPI -> PPIDM -> negatome + -> [optional self-interaction removal] + -> high-confidence non-PPI negatives (over 3did domains only) + Add a new DDI source by: - 1. Adding its download module (network fetch + format normalisation). - 2. Calling it here and routing the parsed output into INSERT_DDIS (or a - per-source INSERT_* module if its parsing differs). + 1. Adding its download/parse module. + 2. Slotting a per-source INSERT_* call into the chain below (each collects its + own unique Pfam IDs and bulk-creates missing domain rows via the shared + bin/ddi_db_utils.py helper). 3. Tagging its rows with a unique source string in domain_domain_interaction. ----------------------------------------------------------------------------*/ include { DOWNLOAD_3DID_SQLITE } from '../../../modules/local/3did/main.nf' include { DOWNLOAD_NEGATOME } from '../../../modules/local/negatome/main.nf' -include { INSERT_DDIS } from '../../../modules/local/insert_ddis/main.nf' -include { INSERT_PPI_NEGATIVE_DDIS } from '../../../modules/local/insert_ppi_negative_ddis/main.nf' +include { INSERT_3DID } from '../../../modules/local/insert_3did/main.nf' +include { BUILD_SWISSPROT_PFAM_MAP } from '../../../modules/local/swissprot_map/main.nf' +include { INSERT_SINGLE_DOMAIN_PPI } from '../../../modules/local/insert_single_domain_ppi/main.nf' +include { INSERT_PPIDM } from '../../../modules/local/insert_ppidm/main.nf' +include { INSERT_NEGATOME } from '../../../modules/local/insert_negatome/main.nf' +include { REMOVE_SELF_INTERACTIONS } from '../../../modules/local/remove_self_interactions/main.nf' +include { BUILD_PPI_NEGATIVE_POOL } from 
'../../../modules/local/build_ppi_negative_pool/main.nf' +include { SELECT_PPI_NEGATIVE_DANS as SELECT_DELETION } from '../../../modules/local/select_ppi_negative_dans/main.nf' +include { SELECT_PPI_NEGATIVE_DANS as SELECT_RANDOM_ADDITION } from '../../../modules/local/select_ppi_negative_dans/main.nf' +include { INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf' include { SMOKE_FILTER } from '../../../modules/local/smoke_filter/main.nf' workflow COLLECT_DDI_DATA { @@ -22,33 +38,103 @@ workflow COLLECT_DDI_DATA { domainsplit_db_in url_3did url_negatome + url_uniprot_swissprot_pfam + hippie_tsv + ppidm_tsv + negative_ppi_parquet main: + ch_versions = Channel.empty() + + if( !hippie_tsv || !ppidm_tsv || !negative_ppi_parquet ) { + log.error "Required inputs missing: hippie_tsv, ppidm_tsv, and negative_ppi_parquet must be provided" + exit 1 + } file_3did = file(url_3did) sqlite_3did = DOWNLOAD_3DID_SQLITE(file_3did).sqlite negatome_file = DOWNLOAD_NEGATOME(url_negatome).negatome - domainsplit_db = INSERT_DDIS(domainsplit_db_in, sqlite_3did, negatome_file).domainsplit_db + // 1. 3did positives + domainsplit_db = INSERT_3DID(domainsplit_db_in, sqlite_3did).domainsplit_db + + // 2-3. single-domain PPI positives (HIPPIE), using a reviewed-human SwissProt map + swissprot_map = BUILD_SWISSPROT_PFAM_MAP(url_uniprot_swissprot_pfam).map + domainsplit_db = INSERT_SINGLE_DOMAIN_PPI( + domainsplit_db, + file(hippie_tsv), + swissprot_map, + params.hippie_min_score, + ).domainsplit_db - pfam_mapping = Channel.empty() + // 4. 
PPIDM predicted positives (class kept as source) + domainsplit_db = INSERT_PPIDM( + domainsplit_db, + file(ppidm_tsv), + params.ppidm_classes, + ).domainsplit_db - if (params.negative_ppi_parquet != null) { - ppi_result = INSERT_PPI_NEGATIVE_DDIS( - domainsplit_db, - file(params.negative_ppi_parquet), - params.negative_ppi_min_n_tested, - params.negative_ppi_source_label, - params.negative_sampling_strategy, - ) - domainsplit_db = ppi_result.domainsplit_db - pfam_mapping = ppi_result.pfam_mapping + // 5. negatome negatives + domainsplit_db = INSERT_NEGATOME(domainsplit_db, negatome_file).domainsplit_db + + // 6. optional removal of all self-interactions + if (!params.self_interaction) { + domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db + ch_versions = ch_versions.mix(REMOVE_SELF_INTERACTIONS.out.versions) } + // 7. high-confidence non-PPI negatives via uncapped DANS (Cappelletti et al. + // vbae036), in two flavours that coexist under distinct source labels: + // * "deletion" -- DANS over the PPI candidate pool, with the + // positives reduced to the candidate-domain + // universe (labels 3did_deletion / + // inferred_ppi_screen_negative_for_deletion). + // * "random_addition" -- plain DANS over the full positive set (labels + // 3did_random_addition / + // inferred_ppi_screen_negative_for_random_addition). + // The expensive, deterministic UniProt fetch + candidate-pool build runs + // once; each method is a single deterministic selection (no pick-best). 
+ pool = BUILD_PPI_NEGATIVE_POOL( + domainsplit_db, + file(negative_ppi_parquet), + params.negative_ppi_min_n_tested, + params.self_interaction, + ) + + del = SELECT_DELETION('deletion', params.negative_ppi_seed, pool.neg_pool) + rand = SELECT_RANDOM_ADDITION('random_addition', params.negative_ppi_seed, pool.neg_pool) + + inserted = INSERT_PPI_NEGATIVE_SELECTION( + pool.domainsplit_db, + pool.neg_pool, + del.pairs, + rand.pairs, + del.score, + rand.score, + ) + domainsplit_db = inserted.domainsplit_db + pfam_mapping = pool.pfam_mapping + if (params.smoke_test_n_ddis != null) { domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db + ch_versions = ch_versions.mix(SMOKE_FILTER.out.versions) } + ch_versions = ch_versions.mix( + DOWNLOAD_3DID_SQLITE.out.versions, + DOWNLOAD_NEGATOME.out.versions, + INSERT_3DID.out.versions, + BUILD_SWISSPROT_PFAM_MAP.out.versions, + INSERT_SINGLE_DOMAIN_PPI.out.versions, + INSERT_PPIDM.out.versions, + INSERT_NEGATOME.out.versions, + BUILD_PPI_NEGATIVE_POOL.out.versions, + SELECT_DELETION.out.versions, + SELECT_RANDOM_ADDITION.out.versions, + INSERT_PPI_NEGATIVE_SELECTION.out.versions, + ) + emit: domainsplit_db pfam_mapping + versions = ch_versions } diff --git a/subworkflows/local/collect_ddi_data/meta.yml b/subworkflows/local/collect_ddi_data/meta.yml new file mode 100644 index 0000000..a158369 --- /dev/null +++ b/subworkflows/local/collect_ddi_data/meta.yml @@ -0,0 +1,61 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "collect_ddi_data" +description: Download and parse every DDI source into the pre-initialised Domainsplit SQLite, applying the fixed source-priority order and optional smoke filter. 
+keywords: + - ddi + - 3did + - negatome + - ppi + - database +components: + - download/3did/sqlite + - download/negatome + - insert/3did + - build/swissprot/pfam/map + - insert/single/domain/ppi + - insert/ppidm + - insert/negatome + - remove/self/interactions + - build/ppi/negative/pool + - select/ppi/negative/dans + - insert/ppi/negative/selection + - smoke/filter +input: + - domainsplit_db_in: + type: file + description: Pre-initialised empty Domainsplit SQLite database. + pattern: "*.sqlite3" + - url_3did: + type: string + description: URL of the 3did flat file dump. + - url_negatome: + type: string + description: URL of the Negatome combined dataset. + - url_uniprot_swissprot_pfam: + type: string + description: URL of the UniProt SwissProt-to-Pfam mapping used to build single-domain PPIs. + - hippie_tsv: + type: file + description: HIPPIE PPI table used to derive single-domain positive DDIs. + pattern: "*.{tsv,txt}" + - ppidm_tsv: + type: file + description: PPIDM inferred domain-domain interaction table. + pattern: "*.{tsv,txt}" + - negative_ppi_parquet: + type: file + description: Y2H/MS PPI screen parquet from which high-confidence non-PPI negative DDIs are derived. + pattern: "*.parquet" +output: + - domainsplit_db: + type: file + description: Domainsplit SQLite populated with positive and negative DDIs. + pattern: "*.sqlite3" + - pfam_mapping: + type: file + description: Pfam mapping emitted by BUILD_PPI_NEGATIVE_POOL while building the PPI-derived negative candidate pool. 
+ pattern: "*.{tsv,parquet}" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/curate_domains/main.nf b/subworkflows/local/curate_domains/main.nf index a97b28e..91ee944 100644 --- a/subworkflows/local/curate_domains/main.nf +++ b/subworkflows/local/curate_domains/main.nf @@ -28,6 +28,13 @@ workflow CURATE_DOMAINS { pfam_files.collect() ).mapping + ch_versions = Channel.empty().mix( + EXTRACT_UNIQUE_DOMAINS.out.versions, + DOWNLOAD_PFAM_ALIGNMENTS_BATCH.out.versions, + CREATE_PROTEIN_DOMAIN_MAPPING.out.versions, + ) + emit: protein_domain_map + versions = ch_versions } diff --git a/subworkflows/local/curate_domains/meta.yml b/subworkflows/local/curate_domains/meta.yml new file mode 100644 index 0000000..97c2e04 --- /dev/null +++ b/subworkflows/local/curate_domains/meta.yml @@ -0,0 +1,30 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "curate_domains" +description: Enumerate the unique Pfam domains referenced by the DDI set, download their Pfam alignments, and build a protein-to-domain map. +keywords: + - pfam + - domain + - alignment + - mapping +components: + - extract/unique/domains + - download/pfam/alignments/batch + - create/protein/domain/mapping +input: + - domainsplit_db: + type: file + description: Domainsplit SQLite after DDI collection and smoke filtering. + pattern: "*.sqlite3" + - input_uniprot_id_mapping: + type: file + description: UniProt ID mapping used to associate proteins with Pfam domains. + pattern: "*.{tsv,dat,gz}" +output: + - protein_domain_map: + type: file + description: Protein-to-Pfam-domain mapping derived from the Pfam alignments. 
+ pattern: "*.{tsv,parquet}" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/enrich_ddi_database/main.nf b/subworkflows/local/enrich_ddi_database/main.nf index da9eaeb..bddb71e 100644 --- a/subworkflows/local/enrich_ddi_database/main.nf +++ b/subworkflows/local/enrich_ddi_database/main.nf @@ -58,8 +58,17 @@ workflow ENRICH_DDI_DATABASE { db_after_ppi, protein_domain_map, esm_domain_embeddings - ).domainsplit_db.first() + ).domainsplit_db + + ch_versions = Channel.empty().mix( + INSERT_DOMAIN_GO_TERMS.out.versions, + INSERT_PROTEINS_WITH_EMBEDDINGS.out.versions, + INSERT_PROTEIN_GO_TERMS.out.versions, + INSERT_PPI.out.versions, + INSERT_DOMAIN_PROTEIN_MAPPING.out.versions, + ) emit: domainsplit_db + versions = ch_versions } diff --git a/subworkflows/local/enrich_ddi_database/meta.yml b/subworkflows/local/enrich_ddi_database/meta.yml new file mode 100644 index 0000000..fdae557 --- /dev/null +++ b/subworkflows/local/enrich_ddi_database/meta.yml @@ -0,0 +1,58 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "enrich_ddi_database" +description: Sequentially annotate the DDI database with domain GO terms, proteins plus per-residue embeddings, protein GO terms, STRING PPIs, and the per-domain protein/embedding map. +keywords: + - ddi + - go + - embeddings + - ppi + - annotation +components: + - insert/domain/go/terms + - insert/proteins/with/embeddings + - insert/protein/go/terms + - insert/ppi + - insert/domain/protein/mapping +input: + - domainsplit_db_in: + type: file + description: Domainsplit SQLite with curated domains. + pattern: "*.sqlite3" + - input_pfam2go: + type: file + description: pfam2go mapping of Pfam domains to GO terms. + - input_uniprot_sequences: + type: file + description: UniProt protein sequences. 
+ pattern: "*.{fasta,fa,gz}" + - protein_domain_map: + type: file + description: Protein-to-Pfam-domain mapping from CURATE_DOMAINS. + - prott5_embeddings: + type: file + description: ProtT5 per-residue protein embeddings. + pattern: "*.h5" + - input_uniprot_go_terms: + type: file + description: UniProt protein-to-GO-term annotations. + - input_string: + type: file + description: STRING protein-protein interaction table. + - input_uniprot_id_mapping: + type: file + description: UniProt ID mapping linking STRING IDs to UniProt accessions. + - esm_protein_embeddings: + type: file + description: ESM per-residue protein embeddings. + - esm_domain_embeddings: + type: file + description: ESM pooled per-domain embeddings. +output: + - domainsplit_db: + type: file + description: Fully enriched Domainsplit SQLite database. + pattern: "*.sqlite3" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/generate_embeddings/main.nf b/subworkflows/local/generate_embeddings/main.nf deleted file mode 100644 index 1abbeae..0000000 --- a/subworkflows/local/generate_embeddings/main.nf +++ /dev/null @@ -1,26 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENERATE_EMBEDDINGS -- run protein-level ESM (protein + domain) - embedding generation. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ESM path produces both per-residue protein embeddings and pooled domain - embeddings against the supplied protein <-> domain map. - - ProtT5 embeddings are supplied externally via params.prott5_per_residue_h5 - and resolved in the top-level workflow (domainsplit.nf). 
-----------------------------------------------------------------------------*/ - -include { generate_esm_embeddings } from '../../../modules/local/esm_embeddings/main.nf' - -workflow GENERATE_EMBEDDINGS { - take: - protein_domain_map - input_uniprot_sequences - - main: - generate_esm_embeddings(input_uniprot_sequences, protein_domain_map) - - emit: - esm_protein_embeddings = generate_esm_embeddings.out.protein_embeddings - esm_domain_embeddings = generate_esm_embeddings.out.domain_embeddings -} diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index b88ef74..4536cca 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -1,17 +1,35 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SPLIT_DOMAINSPLIT_DATABASE -- split the Domainsplit DB into train/opt/test - sets using random and minimal-leakage strategies. + SPLIT_DOMAINSPLIT_DATABASE -- produce the split strategies, each run ONCE + PER NEGATIVE-DDI METHOD ("deletion" and "random_addition") so the two + methods' core sources stay isolated. Strategies: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Extracts domain sequences, clusters with MMseqs2, then runs the - splitting strategies (random DDI as biased baseline, spectral - graph-partitioning minimal leakage on domains) producing per-split - SQLite databases directly. + * random_ddi biased baseline (random partition) + * minimal_leakage_domain leakage-aware spectral partition + * external_validation leakage-aware train/validation on the "core" + sources (3did copy + the method's PPI-screen + negatives), plus an as-is test set from the + held-out sources (single-domain PPI, PPIDM, + negatome). The held-out test set is + method-independent, so it is built once and routed + into both external_validation_* folders. 
+ + Each strategy therefore yields two method folders, e.g. + random_ddi_deletion / random_ddi_random_addition -> 6 method folders total. + + Domain sequences are extracted and clustered (MMseqs2) once; every + leakage-aware partition reuses the clusters. ----------------------------------------------------------------------------*/ -include { RANDOM_DDI_SPLIT } from '../../../modules/local/random_ddi_split/main' -include { EXTRACT_DOMAIN_SEQUENCES; MINIMAL_LEAKAGE_SPLIT_DOMAIN } from '../../../modules/local/minimal_leakage_split/main' -include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main' +include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_DEL } from '../../../modules/local/random_ddi_split/main' +include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_RAND } from '../../../modules/local/random_ddi_split/main' +include { EXTRACT_DOMAIN_SEQUENCES } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_DEL } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_RAND } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_DEL } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_RAND } from '../../../modules/local/minimal_leakage_split/main' +include { SUBSET_DDIS_BY_SOURCE } from '../../../modules/local/external_validation_split/main' +include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main' def map_split_dbs(split_info_ch, split_dbs_ch, method) { @@ -38,31 +56,83 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { } clusters = MMSEQS_EASYCLUSTER(cluster_input) + def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] } def splits = [ ["train", 0.6], ["optimization", 0.2], ["test", 0.2] ] + def trainval_splits = [ + ["train", 0.8], + ["validation", 0.2] + ] + + // Core sources 
per negative-DDI method: the method's 3did positive copy plus + // its PPI-screen negatives. Must stay in sync with the source labels written + // by bin/insert_ppi_negative_selection.py. + def core_deletion = [ + '3did_deletion', + 'inferred_ppi_screen_negative_for_deletion', + ] + def core_random_addition = [ + '3did_random_addition', + 'inferred_ppi_screen_negative_for_random_addition', + ] + + // External-validation test set: held-out sources placed as is + // (method-independent). + def test_sources = [ + 'single_domain_ppi', + 'PPIDM_Bronze', 'PPIDM_Silver', 'PPIDM_Gold', + 'negatome', + ] // Biased baseline: random DDI split (same proteins in train and test) - RANDOM_DDI_SPLIT( - domainsplit_db_ch, - Channel.of(splits) - ) + RANDOM_DDI_SPLIT_DEL(domainsplit_db_ch, splits, core_deletion) + RANDOM_DDI_SPLIT_RAND(domainsplit_db_ch, splits, core_random_addition) // Leakage-aware: spectral graph partitioning on domain clusters - MINIMAL_LEAKAGE_SPLIT_DOMAIN( + MLS_DOMAIN_DEL(domainsplit_db_ch, splits, clusters_tsv, core_deletion) + MLS_DOMAIN_RAND(domainsplit_db_ch, splits, clusters_tsv, core_random_addition) + + // External validation: leakage-free train/validation on core sources ... + MLS_TRAINVAL_DEL(domainsplit_db_ch, trainval_splits, clusters_tsv, core_deletion) + MLS_TRAINVAL_RAND(domainsplit_db_ch, trainval_splits, clusters_tsv, core_random_addition) + + // ... plus an as-is test set from the held-out sources (shared by both + // external_validation_* methods). 
+ SUBSET_DDIS_BY_SOURCE( domainsplit_db_ch, - splits, - clusters.tsv.filter { it[0].id == "domain" }.map { it[1] } + test_sources, + "test" ) split_ch = Channel.empty().mix( - map_split_dbs(RANDOM_DDI_SPLIT.out.split_info, RANDOM_DDI_SPLIT.out.split_dbs, "random_ddi"), - map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain") + map_split_dbs(RANDOM_DDI_SPLIT_DEL.out.split_info, RANDOM_DDI_SPLIT_DEL.out.split_dbs, "random_ddi_deletion"), + map_split_dbs(RANDOM_DDI_SPLIT_RAND.out.split_info, RANDOM_DDI_SPLIT_RAND.out.split_dbs, "random_ddi_random_addition"), + map_split_dbs(MLS_DOMAIN_DEL.out.split_info, MLS_DOMAIN_DEL.out.split_dbs, "minimal_leakage_domain_deletion"), + map_split_dbs(MLS_DOMAIN_RAND.out.split_info, MLS_DOMAIN_RAND.out.split_dbs, "minimal_leakage_domain_random_addition"), + map_split_dbs(MLS_TRAINVAL_DEL.out.split_info, MLS_TRAINVAL_DEL.out.split_dbs, "external_validation_deletion"), + map_split_dbs(MLS_TRAINVAL_RAND.out.split_info, MLS_TRAINVAL_RAND.out.split_dbs, "external_validation_random_addition"), + map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_deletion"), + map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_random_addition") + ) + + // NB: MMSEQS_EASYCLUSTER (nf-core) reports its version via the `versions` + // channel topic, not an `emit: versions` output, so it is not mixed here. 
+ ch_versions = Channel.empty().mix( + EXTRACT_DOMAIN_SEQUENCES.out.versions, + RANDOM_DDI_SPLIT_DEL.out.versions, + RANDOM_DDI_SPLIT_RAND.out.versions, + MLS_DOMAIN_DEL.out.versions, + MLS_DOMAIN_RAND.out.versions, + MLS_TRAINVAL_DEL.out.versions, + MLS_TRAINVAL_RAND.out.versions, + SUBSET_DDIS_BY_SOURCE.out.versions, ) emit: split_db = split_ch + versions = ch_versions } diff --git a/subworkflows/local/split_domainsplit_database/meta.yml b/subworkflows/local/split_domainsplit_database/meta.yml index f2212c8..7298266 100644 --- a/subworkflows/local/split_domainsplit_database/meta.yml +++ b/subworkflows/local/split_domainsplit_database/meta.yml @@ -7,10 +7,10 @@ keywords: - clustering components: - mmseqs/easycluster - - random_ddi_split - - random_denoise_split - - minimal_leakage_split - - split_database + - random/ddi/split + - extract/domain/sequences + - minimal/leakage/split/domain + - subset/ddis/by/source input: - domainsplit_db_ch: type: file diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf index 5ba5d41..c09a912 100644 --- a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf @@ -10,7 +10,6 @@ include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' @@ -69,7 +68,8 @@ workflow PIPELINE_INITIALISATION { show_hidden, before_text, after_text, - command + command, + null ) // @@ -137,7 +137,6 @@ workflow PIPELINE_COMPLETION { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this 
list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ @@ -149,7 +148,6 @@ def toolCitationText() { } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ @@ -182,7 +180,7 @@ def methodsDescriptionText(mqc_methods_yaml) { meta["tool_citations"] = "" meta["tool_bibliography"] = "" - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + // Uncomment below once logic in toolCitationText/toolBibliographyText has been filled. // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") // meta["tool_bibliography"] = toolBibliographyText() diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml new file mode 100644 index 0000000..5f3bc49 --- /dev/null +++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml @@ -0,0 +1,28 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "PIPELINE_INITIALISATION" +description: Subworkflow with functionality specific to the daisybio/domainsplit pipeline (initialisation and completion). 
+keywords: + - utility + - pipeline + - initialise + - completion +components: + - utils_nextflow_pipeline + - utils_nfcore_pipeline + - utils_nfschema_plugin + - completionemail + - completionsummary +input: + - nextflow_cli_args: + type: list + description: | + Nextflow CLI positional arguments +output: + - success: + type: boolean + description: | + Dummy output to indicate success +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index d6e593e..37939ac 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -73,11 +73,23 @@ def getWorkflowVersion() { def dumpParametersToJSON(outdir) { def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') def filename = "params_${timestamp}.json" - def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") - def jsonStr = groovy.json.JsonOutput.toJson(params) + def temp_pf = workflow.launchDir.resolve(".${filename}") + def jsonGenerator = new groovy.json.JsonGenerator.Options() + .excludeNulls() + .addConverter(Path) { Path path -> path.toUriString() } + .addConverter(Duration) { Duration duration -> duration.toMillis() } + .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() } + .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() } + .build() + def jsonStr = jsonGenerator.toJson(params) temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - - nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") + if (outdir instanceof Path) { + temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}")) + } else if (outdir instanceof String) { + temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json") + } else { + log.warn("Could not determine type of outdir, parameters 
JSON file will not be copied to output directory!") + } temp_pf.delete() } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 1df8b76..9ff0681 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -22,6 +22,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { before_text // string: text to show before the help message and parameters summary after_text // string: text to show after the help message and parameters summary command // string: an example command of the pipeline + cli_typecast // boolean: whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour main: @@ -34,11 +35,11 @@ workflow UTILS_NFSCHEMA_PLUGIN { fullHelp: help_full, ] if(parameters_schema) { - help_options << [parametersSchema: parameters_schema] + help_options << [parameters_schema: parameters_schema] } log.info paramsHelp( help_options, - (params.help instanceof String && params.help != "true") ? params.help : "", + (help instanceof String && help != "true") ? 
help : "", ) exit 0 } @@ -50,7 +51,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { summary_options = [:] if(parameters_schema) { - summary_options << [parametersSchema: parameters_schema] + summary_options << [parameters_schema: parameters_schema] } log.info before_text log.info paramsSummaryLog(summary_options, input_workflow) @@ -63,7 +64,10 @@ workflow UTILS_NFSCHEMA_PLUGIN { if(validate_params) { validateOptions = [:] if(parameters_schema) { - validateOptions << [parametersSchema: parameters_schema] + validateOptions << [parameters_schema: parameters_schema] + } + if(cli_typecast != null) { + validateOptions << [cast_cli_params: cli_typecast] } validateParameters(validateOptions) } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml index f7d9f02..1d8c75a 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml +++ b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml @@ -25,6 +25,30 @@ input: option. When this input is empty it will automatically use the configured schema or "${projectDir}/nextflow_schema.json" as default. The schema should not be given in this way for meta pipelines. + - help: + type: boolean, string + description: | + Show the help message and exit. When a parameter name is given, show the help message for that parameter instead of the general help message. + - help_full: + type: boolean + description: Show the full help message and exit. + - show_hidden: + type: boolean + description: Show hidden parameters in the help message. + - before_text: + type: string + description: Text to show before the parameters summary and help message. + - after_text: + type: string + description: Text to show after the parameters summary and help message. + - command: + type: string + description: An example command to run the pipeline, to show in the help message and the summary. 
+ - cli_typecast: + type: boolean + description: | + Whether to apply typecasting to the parameters given via the CLI before validation. + Set this to `null` to use the default behavior. output: - dummy_emit: type: boolean diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index c977917..1fd1eac 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -31,6 +31,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -63,6 +64,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -95,6 +97,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -127,6 +130,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -160,6 +164,7 @@ nextflow_workflow { input[6] = "Before" input[7] = "After" input[8] = "nextflow run test/test" + input[9] = null """ } } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index f6537cc..fd71cb8 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,5 +1,5 @@ plugins { - id "nf-schema@2.6.1" + id "nf-schema@2.7.2" } validation { diff --git a/tests/bin/mmseqs b/tests/bin/mmseqs new file mode 100755 index 0000000..a99de53 --- /dev/null +++ b/tests/bin/mmseqs @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Test-only stub of the `mmseqs` binary. +# +# The nf-test pipeline test (tests/default.nf.test) runs with `-stub` and the +# `test` profile, which enables no container engine — every process runs on the +# host. 
The nf-core MMSEQS_EASYCLUSTER module captures its version via an +# `eval('mmseqs version')` output directive, which Nextflow executes even in +# stub mode. Without mmseqs installed that fails with exit 127, so this shim +# provides a deterministic fake version. It is only ever on PATH for test runs +# (added via env.PATH in tests/nextflow.config); real runs use the container. +echo "stub" diff --git a/tests/data/3did.sql.gz b/tests/data/3did.sql.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/hippie.tsv b/tests/data/hippie.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/negative_ppi.parquet b/tests/data/negative_ppi.parquet new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/negatome.txt b/tests/data/negatome.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/pfam2go.txt b/tests/data/pfam2go.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/ppidm.tsv b/tests/data/ppidm.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/prott5.h5 b/tests/data/prott5.h5 new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/string.txt.gz b/tests/data/string.txt.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/swissprot_pfam.tsv b/tests/data/swissprot_pfam.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_go_terms.tsv b/tests/data/uniprot_go_terms.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_id_mapping.dat.gz b/tests/data/uniprot_id_mapping.dat.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_sequences.fasta.gz b/tests/data/uniprot_sequences.fasta.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/default.nf.test b/tests/default.nf.test index 86ea2e3..d929657 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -6,6 +6,8 @@ nextflow_pipeline { test("-profile test") { + options "-stub" + when { params { 
outdir = "$outputDir" @@ -13,20 +15,23 @@ nextflow_pipeline { } then { - // stable_path: All files + folders in ${params.outdir}/ with a stable path (including file name) - def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - // stable_content: All files in ${params.outdir}/ with stable content - def stable_content = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') - assert workflow.success + // This is a `-stub` wiring test: every process runs its stub block and + // emits empty placeholder files, so file *content* is meaningless (and + // empty .gz/.h5 stubs break content hashing). We therefore snapshot only + // the set of produced output paths -- this verifies the whole DAG wires + // together (channel topology, the split fan-out, publish paths) end to + // end without any downloads, GPU, or containers. Everything under + // pipeline_info/ is ignored because those filenames embed a run + // timestamp (e.g. params_<timestamp>.json), which is non-deterministic. + def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*']) + // Snapshot the collated software versions with the Nextflow version line + // stripped (so the assertion survives Nextflow upgrades). This also + // satisfies the nf-core `nf_test_content` lint rule, which requires a + // `versions.yml` to be snapshotted by every `*.nf.test`. 
+ def versions_yml = removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml") assertAll( - { assert snapshot( - // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions - removeNextflowVersion("$outputDir/pipeline_info/domainsplit_software_mqc_versions.yml"), - // All stable path name, with a relative path - stable_path, - // All files with stable contents - stable_content - ).match() } + { assert workflow.success }, + { assert snapshot(stable_path, versions_yml).match() } ) } } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..864ab56 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,192 @@ +{ + "-profile test": { + "content": [ + [ + "analyze", + "analyze/bias_analysis", + "create", + "create/protein_domain_mapping.csv.gz", + "databases", + "databases/external_validation_deletion", + "databases/external_validation_deletion/test.sqlite3", + "databases/external_validation_deletion/train.sqlite3", + "databases/external_validation_deletion/validation.sqlite3", + "databases/external_validation_random_addition", + "databases/external_validation_random_addition/test.sqlite3", + "databases/external_validation_random_addition/train.sqlite3", + "databases/external_validation_random_addition/validation.sqlite3", + "databases/minimal_leakage_domain_deletion", + "databases/minimal_leakage_domain_deletion/optimization.sqlite3", + "databases/minimal_leakage_domain_deletion/test.sqlite3", + "databases/minimal_leakage_domain_deletion/train.sqlite3", + "databases/minimal_leakage_domain_random_addition", + "databases/minimal_leakage_domain_random_addition/optimization.sqlite3", + "databases/minimal_leakage_domain_random_addition/test.sqlite3", + "databases/minimal_leakage_domain_random_addition/train.sqlite3", + "databases/random_ddi_deletion", + "databases/random_ddi_deletion/optimization.sqlite3", + 
"databases/random_ddi_deletion/test.sqlite3", + "databases/random_ddi_deletion/train.sqlite3", + "databases/random_ddi_random_addition", + "databases/random_ddi_random_addition/optimization.sqlite3", + "databases/random_ddi_random_addition/test.sqlite3", + "databases/random_ddi_random_addition/train.sqlite3", + "domainsplit.sqlite3", + "download", + "download/3did.sqlite3", + "download/PF00001.alignment.full.gz", + "download/combined_pfam.txt", + "extract", + "extract/domain_sequences.fasta.gz", + "filter", + "filter/domain_sequences.fasta.gz", + "filter/uniprot_filtered.fasta.gz", + "generate", + "generate/domain_sequences_shard_0.esm.h5", + "generate/protein_sequences_shard_0.esm.h5", + "join", + "join/esm_domain_embeddings.h5", + "join/esm_protein_embeddings.h5", + "mls", + "mls/optimization.sqlite3", + "mls/test.sqlite3", + "mls/train.sqlite3", + "mls/validation.sqlite3", + "mmseqs", + "mmseqs/domain.tsv", + "mmseqs/domain_all_seqs.fasta", + "mmseqs/domain_rep_seq.fasta", + "negative_ppi", + "negative_ppi/negative_ppi_method_scores.tsv", + "pipeline_info", + "random", + "random/optimization.sqlite3", + "random/test.sqlite3", + "random/train.sqlite3", + "shard", + "shard/domain_sequences_shard_0.fasta.gz", + "shard/protein_sequences_shard_0.fasta.gz", + "subset", + "subset/test.sqlite3" + ], + { + "ANALYZE_DDI_BIAS": { + "stub": "true" + }, + "BUILD_PPI_NEGATIVE_POOL": { + "stub": "true" + }, + "BUILD_SWISSPROT_PFAM_MAP": { + "stub": "true" + }, + "CREATE_PROTEIN_DOMAIN_MAPPING": { + "stub": "true" + }, + "DOWNLOAD_3DID_SQLITE": { + "stub": "true" + }, + "DOWNLOAD_NEGATOME": { + "stub": "true" + }, + "DOWNLOAD_PFAM_ALIGNMENTS_BATCH": { + "stub": "true" + }, + "EXTRACT_DOMAIN_SEQUENCES": { + "stub": "true" + }, + "EXTRACT_UNIQUE_DOMAINS": { + "stub": "true" + }, + "FILTER_SEQUENCES": { + "stub": "true" + }, + "GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK": { + "stub": "true" + }, + "GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK": { + "stub": "true" + }, + "INIT_DOMAINSPLIT_DB": 
{ + "stub": "true" + }, + "INSERT_3DID": { + "stub": "true" + }, + "INSERT_DOMAIN_GO_TERMS": { + "stub": "true" + }, + "INSERT_DOMAIN_PROTEIN_MAPPING": { + "stub": "true" + }, + "INSERT_NEGATOME": { + "stub": "true" + }, + "INSERT_PPI": { + "stub": "true" + }, + "INSERT_PPIDM": { + "stub": "true" + }, + "INSERT_PPI_NEGATIVE_SELECTION": { + "stub": "true" + }, + "INSERT_PROTEINS_WITH_EMBEDDINGS": { + "stub": "true" + }, + "INSERT_PROTEIN_GO_TERMS": { + "stub": "true" + }, + "INSERT_SINGLE_DOMAIN_PPI": { + "stub": "true" + }, + "JOIN_DOMAIN_EMBEDDINGS": { + "stub": "true" + }, + "JOIN_PROTEIN_EMBEDDINGS": { + "stub": "true" + }, + "MLS_DOMAIN_DEL": { + "stub": "true" + }, + "MLS_DOMAIN_RAND": { + "stub": "true" + }, + "MLS_TRAINVAL_DEL": { + "stub": "true" + }, + "MLS_TRAINVAL_RAND": { + "stub": "true" + }, + "RANDOM_DDI_SPLIT_DEL": { + "stub": "true" + }, + "RANDOM_DDI_SPLIT_RAND": { + "stub": "true" + }, + "SELECT_DELETION": { + "stub": "true" + }, + "SELECT_RANDOM_ADDITION": { + "stub": "true" + }, + "SHARD_DOMAIN_FASTA": { + "stub": "true" + }, + "SHARD_PROTEIN_FASTA": { + "stub": "true" + }, + "SUBSET_DDIS_BY_SOURCE": { + "stub": "true" + }, + "Workflow": { + "daisybio/domainsplit": "v1.0.0dev" + } + } + ], + "timestamp": "2026-06-12T11:10:24.222393192", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index 12b3258..ab87256 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -12,3 +12,12 @@ params { } aws.client.anonymous = true // fixes S3 access issues on self-hosted runners + +// The pipeline test (tests/default.nf.test) runs with `-stub` and no container +// engine, so every process executes on the host. The nf-core MMSEQS_EASYCLUSTER +// module captures its version with an `eval('mmseqs version')` output that runs +// even under -stub; prepend a test-only shim dir so that resolves without the +// real binary. 
Only applied to nf-test runs (this config is test-only). +env { + PATH = "${projectDir}/tests/bin:\$PATH" +} diff --git a/tests/python/test_insert_negatome.py b/tests/python/test_insert_negatome.py new file mode 100644 index 0000000..9691d7d --- /dev/null +++ b/tests/python/test_insert_negatome.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/insert_negatome.py (no Nextflow, no cluster). + +Builds a tiny empty Domainsplit SQLite and runs the Negatome inserter against a +small synthetic ``combined_pfam.txt``, asserting: + + * each whitespace-separated Pfam pair is stored as ``negative=1, + source='negatome'`` with its domains auto-created; + * lines without at least two tokens (blank / single-token) are skipped. + +Run directly (`python3 tests/python/test_insert_negatome.py`) or via pytest. +""" + +import os +import sqlite3 +import subprocess +import sys +import tempfile + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +INSERTER = os.path.join(BIN, "insert_negatome.py") + +SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + +NEGATOME_LINES = [ + "PF00001 PF00002", # kept + "PF00003\tPF00004", # kept (tab separated) + "PF00005 PF00006", # kept + "PF00007", # single token -> skipped + "", # blank -> skipped +] + + +def test_insert_negatome(): + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + negatome = os.path.join(tmp, "combined_pfam.txt") + with open(negatome, "w") as fh: + 
fh.write("\n".join(NEGATOME_LINES) + "\n") + + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--negatome", negatome, + "--versions", os.path.join(tmp, "versions.yml"), + "--process-name", "TEST:INSERT_NEGATOME"], + check=True, env=env, + ) + + conn = sqlite3.connect(db) + total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + assert total == 3, f"expected 3 negatome DDIs, got {total}" + + rows = conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction " + "WHERE source = 'negatome' AND negative != 0" + ).fetchone()[0] + assert rows == 3, "all negatome rows must be negative with source 'negatome'" + + n_domains = conn.execute("SELECT COUNT(*) FROM domain").fetchone()[0] + assert n_domains == 6, f"expected 6 auto-created domains, got {n_domains}" + conn.close() + + +if __name__ == "__main__": + test_insert_negatome() + print("OK: insert_negatome invariants hold") diff --git a/tests/python/test_insert_ppi_negative_selection.py b/tests/python/test_insert_ppi_negative_selection.py new file mode 100644 index 0000000..d39e10c --- /dev/null +++ b/tests/python/test_insert_ppi_negative_selection.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""Local integration check for the dual-source negative insertion (no cluster). + +Validates the schema change (UNIQUE(domain_id_a, domain_id_b, source)) and +bin/insert_ppi_negative_selection.py together: + + * the four method labels are inserted, with 3did_random_addition copying the + full 3did set and 3did_deletion only the pool-domain subset; + * a pair can coexist under '3did' and '3did_random_addition' (duplicate by + source); + * the canonical sources still dedup across each other (a PPIDM pair equal to a + 3did pair is dropped) because insert_ddis defaults to dedup_across_sources. + +Run directly or via pytest. 
+""" + +import json +import os +import sqlite3 +import subprocess +import sys +import tempfile + +import numpy as np + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +sys.path.insert(0, BIN) + +from ddi_db_utils import ensure_domains, insert_ddis # noqa: E402 + +INSERTER = os.path.join(BIN, "insert_ppi_negative_selection.py") + +SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + + +def pf(i): + return f"PF{i:05d}" + + +def count(conn, source): + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] + + +def write_score(path, method): + with open(path, "w") as fh: + json.dump({ + "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1, + "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0, + "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0, + }, fh) + + +def write_pairs(path, pairs): + with open(path, "w") as fh: + for a, b in pairs: + fh.write(f"{a}\t{b}\n") + + +def test_dual_source_insert(): + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + ensure_domains(conn, [pf(i) for i in range(1, 7)]) + + # 3did positives, then a PPIDM batch overlapping (1,2). 
+ insert_ddis(conn, [(pf(1), pf(2)), (pf(1), pf(3)), (pf(1), pf(4))], + negative=False, source="3did") + insert_ddis(conn, [(pf(1), pf(2)), (pf(5), pf(6))], + negative=False, source="PPIDM_Gold") + conn.commit() + assert count(conn, "3did") == 3 + assert count(conn, "PPIDM_Gold") == 1, "cross-source dedup broken" + conn.close() + + # Pool covers only domains 1,2,3 -> 3did_deletion keeps (1,2),(1,3). + pool = os.path.join(tmp, "neg_pool.npz") + np.savez(pool, pool_dom=np.array([pf(1), pf(2), pf(3)], dtype=object)) + + write_pairs(os.path.join(tmp, "pairs_deletion.tsv"), + [(pf(2), pf(5)), (pf(3), pf(6))]) + write_pairs(os.path.join(tmp, "pairs_random_addition.tsv"), + [(pf(1), pf(6)), (pf(4), pf(5))]) + write_score(os.path.join(tmp, "score_deletion.json"), "deletion") + write_score(os.path.join(tmp, "score_random_addition.json"), "random_addition") + + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--pool", pool, + "--pairs-deletion", os.path.join(tmp, "pairs_deletion.tsv"), + "--pairs-random-addition", os.path.join(tmp, "pairs_random_addition.tsv"), + "--score-deletion", os.path.join(tmp, "score_deletion.json"), + "--score-random-addition", os.path.join(tmp, "score_random_addition.json"), + "--scores-out", os.path.join(tmp, "scores.tsv")], + check=True, env=env, + ) + + conn = sqlite3.connect(db) + assert count(conn, "3did_random_addition") == 3, "full 3did copy wrong" + assert count(conn, "3did_deletion") == 2, "pool-restricted copy wrong" + assert count(conn, "inferred_ppi_screen_negative_for_deletion") == 2 + assert count(conn, "inferred_ppi_screen_negative_for_random_addition") == 2 + # original sources untouched + assert count(conn, "3did") == 3 + assert count(conn, "PPIDM_Gold") == 1 + + # The same pair (1,2) coexists under '3did' and '3did_random_addition'. 
+ n_dup = conn.execute( + "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi " + "JOIN domain da ON da.id = ddi.domain_id_a " + "JOIN domain db ON db.id = ddi.domain_id_b " + "WHERE da.pfam_id = ? AND db.pfam_id = ?", + (pf(1), pf(2)), + ).fetchone()[0] + assert n_dup >= 2, f"(1,2) should exist under >=2 sources, got {n_dup}" + conn.close() + + print("OK: dual-source insert + schema invariants hold") + + +if __name__ == "__main__": + test_dual_source_insert() diff --git a/tests/python/test_insert_ppidm.py b/tests/python/test_insert_ppidm.py new file mode 100644 index 0000000..a296053 --- /dev/null +++ b/tests/python/test_insert_ppidm.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/insert_ppidm.py (no Nextflow, no cluster). + +Builds a tiny empty Domainsplit SQLite and runs the PPIDM inserter against a +small synthetic ``predicted_ddi_ppi.tsv``, asserting: + + * domain tokens like ``10114/PF00069`` are parsed down to the Pfam accession; + * each kept row is stored as ``negative=0, source='PPIDM_'``; + * classes are processed Gold -> Silver -> Bronze, so a pair appearing under + two classes is kept only under the highest-confidence one (cross-source + dedup in insert_ddis); + * unparseable tokens are skipped, and ``--classes`` filters which classes are + inserted at all. + +Run directly (`python3 tests/python/test_insert_ppidm.py`) or via pytest. +""" + +import os +import sqlite3 +import subprocess +import sys +import tempfile + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +INSERTER = os.path.join(BIN, "insert_ppidm.py") + +# Matches the schema the pipeline's INIT_DOMAINSPLIT_DB creates for these tables +# (see tests/python/test_insert_ppi_negative_selection.py). 
+SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + + +def count(conn, source): + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] + + +def run_inserter(db, ppidm, classes, tmp): + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--ppidm", ppidm, + "--classes", classes, + "--versions", os.path.join(tmp, "versions.yml"), + "--process-name", "TEST:INSERT_PPIDM"], + check=True, env=env, + ) + + +# Tokens carry a leading numeric id before the slash, as in real PPIDM output. +PPIDM_ROWS = [ + "domain_1\tdomain_2\tclass", # header (skipped) + "10/PF00001\t20/PF00002\tGold", # kept -> PPIDM_Gold + "30/PF00003\t40/PF00004\tSilver", # kept -> PPIDM_Silver + "50/PF00005\t60/PF00006\tBronze", # kept -> PPIDM_Bronze + "10/PF00001\t20/PF00002\tSilver", # duplicate pair, lower class -> dropped + "junk\tnonsense\tGold", # unparseable -> skipped +] + + +def write_ppidm(path, rows): + with open(path, "w") as fh: + fh.write("\n".join(rows) + "\n") + + +def test_insert_ppidm_all_classes(): + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv") + write_ppidm(ppidm, PPIDM_ROWS) + + run_inserter(db, ppidm, "Bronze,Silver,Gold", tmp) + + conn = sqlite3.connect(db) + # One pair per class; the duplicate (PF00001, PF00002) is kept only under + # Gold (processed first) and dropped for Silver via 
cross-source dedup. + assert count(conn, "PPIDM_Gold") == 1, "Gold count wrong" + assert count(conn, "PPIDM_Silver") == 1, "Silver count wrong (dedup failed?)" + assert count(conn, "PPIDM_Bronze") == 1, "Bronze count wrong" + + # All kept rows are positives stored under a PPIDM_* source only. + total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + assert total == 3, f"expected 3 DDIs total, got {total}" + neg = conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative != 0" + ).fetchone()[0] + assert neg == 0, "PPIDM rows must be positives" + + # The duplicate pair exists only under Gold, not Silver. + n_sources = conn.execute( + "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi " + "JOIN domain da ON da.id = ddi.domain_id_a " + "JOIN domain db ON db.id = ddi.domain_id_b " + "WHERE da.pfam_id = ? AND db.pfam_id = ?", + ("PF00001", "PF00002"), + ).fetchone()[0] + assert n_sources == 1, f"(PF00001,PF00002) should be under 1 source, got {n_sources}" + conn.close() + + +def test_insert_ppidm_class_filter(): + """--classes restricts which classes are inserted at all.""" + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv") + write_ppidm(ppidm, PPIDM_ROWS) + + run_inserter(db, ppidm, "Gold", tmp) + + conn = sqlite3.connect(db) + assert count(conn, "PPIDM_Gold") == 1 + assert count(conn, "PPIDM_Silver") == 0, "Silver should be excluded" + assert count(conn, "PPIDM_Bronze") == 0, "Bronze should be excluded" + conn.close() + + +if __name__ == "__main__": + test_insert_ppidm_all_classes() + test_insert_ppidm_class_filter() + print("OK: insert_ppidm class handling + dedup invariants hold") diff --git a/tests/python/test_select_ppi_negative_dans.py b/tests/python/test_select_ppi_negative_dans.py new file mode 100644 index 
0000000..41bfba9 --- /dev/null +++ b/tests/python/test_select_ppi_negative_dans.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/select_ppi_negative_dans.py (no Nextflow, no cluster). + +Builds a tiny synthetic candidate pool and runs both DANS methods, asserting the +core invariants: + + * deletion -- draws exactly n_positive_r edges, all from the candidate + pool, all endpoints within the pool-domain universe. + * random_addition -- draws exactly n_positive edges, none a positive/forbidden + pair, no self-pairs, no duplicates, and reaches domains + that are absent from the candidate pool. + +Run directly (`python3 tests/python/test_select_ppi_negative_dans.py`) or via +pytest. +""" + +import os +import subprocess +import sys +import tempfile + +import numpy as np + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +SELECTOR = os.path.join(BIN, "select_ppi_negative_dans.py") + + +def pf(i): + return f"PF{i:05d}" + + +def build_pool(path): + # Full 3did positive edges (canonical ascending). + pos_edges = [(1, 2), (1, 3), (1, 4), (2, 3), (5, 6), + (7, 8), (9, 10), (1, 9), (2, 10)] + # Candidate pool: fresh (non-positive) pairs over domains 1..6. + cand = [(1, 5), (1, 6), (2, 5), (2, 6), (3, 5), + (3, 6), (4, 5), (4, 6), (3, 4)] + pool_domains = sorted({d for e in cand for d in e}) # 1..6 + + # Reduced positives = positive edges with both endpoints in the pool. + pos_r = [(a, b) for a, b in pos_edges + if a in pool_domains and b in pool_domains] + deg_r = {d: 0 for d in pool_domains} + for a, b in pos_r: + deg_r[a] += 1 + deg_r[b] += 1 + pool_dom = [pf(d) for d in pool_domains] + pool_deg_r = np.array([deg_r[d] for d in pool_domains], dtype=np.int64) + pos_edge_pa_r = np.array([deg_r[a] * deg_r[b] for a, b in pos_r], dtype=np.int64) + + # Full positive degrees over all 10 domains. 
+ all_dom = list(range(1, 11)) + deg = {d: 0 for d in all_dom} + for a, b in pos_edges: + deg[a] += 1 + deg[b] += 1 + pos_dom = [pf(d) for d in all_dom] + pos_deg = np.array([deg[d] for d in all_dom], dtype=np.int64) + pos_edge_pa = np.array([deg[a] * deg[b] for a, b in pos_edges], dtype=np.int64) + + np.savez( + path, + cand_a=np.array([pf(a) for a, b in cand], dtype=object), + cand_b=np.array([pf(b) for a, b in cand], dtype=object), + pool_dom=np.array(pool_dom, dtype=object), + pool_deg_r=pool_deg_r, + pos_edge_pa_r=pos_edge_pa_r, + n_positive_r=np.int64(len(pos_r)), + n_positive_domains_r=np.int64(sum(1 for d in pool_domains if deg_r[d])), + pos_a=np.array([pf(a) for a, b in pos_edges], dtype=object), + pos_b=np.array([pf(b) for a, b in pos_edges], dtype=object), + pos_dom=np.array(pos_dom, dtype=object), + pos_deg=pos_deg, + pos_edge_pa=pos_edge_pa, + n_positive=np.int64(len(pos_edges)), + n_positive_domains=np.int64(len(all_dom)), + forbidden_a=np.array([pf(a) for a, b in pos_edges], dtype=object), + forbidden_b=np.array([pf(b) for a, b in pos_edges], dtype=object), + ) + return { + "cand": {tuple(sorted((pf(a), pf(b)))) for a, b in cand}, + "pool_domains": {pf(d) for d in pool_domains}, + "n_positive_r": len(pos_r), + "n_positive": len(pos_edges), + "forbidden": {(pf(a), pf(b)) for a, b in pos_edges}, + "pool_only": {pf(d) for d in pool_domains}, + "extra_domains": {pf(9), pf(10)}, + } + + +def run_method(pool_path, method, workdir): + score = os.path.join(workdir, f"score_{method}.json") + pairs = os.path.join(workdir, f"pairs_{method}.tsv") + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, SELECTOR, "--pool", pool_path, "--method", method, + "--seed", "7", "--score-out", score, "--pairs-out", pairs], + check=True, env=env, + ) + out = [] + with open(pairs) as fh: + for line in fh: + line = line.rstrip("\n") + if line: + a, b = line.split("\t") + out.append((a, b)) + return out + + 
+def test_dans_methods(): + with tempfile.TemporaryDirectory() as tmp: + pool_path = os.path.join(tmp, "neg_pool.npz") + meta = build_pool(pool_path) + + # --- deletion --- + del_pairs = run_method(pool_path, "deletion", tmp) + assert len(del_pairs) == meta["n_positive_r"], \ + f"deletion count {len(del_pairs)} != {meta['n_positive_r']}" + for a, b in del_pairs: + assert tuple(sorted((a, b))) in meta["cand"], f"{a},{b} not in pool" + assert a in meta["pool_domains"] and b in meta["pool_domains"] + assert len({tuple(sorted(p)) for p in del_pairs}) == len(del_pairs), "dup in deletion" + + # --- random_addition --- + rand_pairs = run_method(pool_path, "random_addition", tmp) + assert len(rand_pairs) == meta["n_positive"], \ + f"random_addition count {len(rand_pairs)} != {meta['n_positive']}" + seen = set() + used_domains = set() + for a, b in rand_pairs: + assert a != b, f"self pair {a}" + key = (a, b) if a <= b else (b, a) + assert key not in meta["forbidden"], f"{key} is a positive/forbidden pair" + assert key not in seen, f"duplicate {key}" + seen.add(key) + used_domains.update((a, b)) + # DANS over the full positive set must be able to reach domains outside + # the candidate pool. 
+ assert used_domains & meta["extra_domains"], \ + "random_addition never reached the pool-absent domains" + + print("OK: both DANS methods satisfy invariants") + + +if __name__ == "__main__": + test_dans_methods() diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf index 877e784..4474982 100644 --- a/workflows/domainsplit.nf +++ b/workflows/domainsplit.nf @@ -9,7 +9,7 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore include { INIT_DOMAINSPLIT_DB } from '../modules/local/init_domainsplit_db/main.nf' include { COLLECT_DDI_DATA } from '../subworkflows/local/collect_ddi_data/main.nf' include { CURATE_DOMAINS } from '../subworkflows/local/curate_domains/main.nf' -include { GENERATE_EMBEDDINGS } from '../subworkflows/local/generate_embeddings/main.nf' +include { generate_esm_embeddings } from '../modules/local/esm_embeddings/main.nf' include { ENRICH_DDI_DATABASE } from '../subworkflows/local/enrich_ddi_database/main.nf' include { SPLIT_DOMAINSPLIT_DATABASE } from '../subworkflows/local/split_domainsplit_database/main.nf' include { ANALYZE_DDI_BIAS } from '../modules/local/analyze_ddi_bias/main.nf' @@ -22,24 +22,15 @@ include { ANALYZE_DDI_BIAS } from '../modules/local/analyze_ddi_bias/ workflow DOMAINSPLIT { main: + ch_versions = Channel.empty() + input_uniprot_id_mapping = file(params.url_uniprot_id_mapping) - input_uniprot_embeddings = file(params.url_uniprot_embeddings) input_uniprot_go_terms = file(params.url_uniprot_go_terms) input_uniprot_sequences = file(params.url_uniprot_sequences) input_string = file(params.url_string) input_pfam2go = file(params.url_pfam2go) - def prott5_file = [] - if (params.prott5_per_residue_h5) { - def f = file(params.prott5_per_residue_h5) - if (f.exists()) { - prott5_file = f - } else { - log.warn "ProtT5 HDF5 not found at '${params.prott5_per_residue_h5}' — skipping ProtT5 embeddings" - } - } else { - log.warn "params.prott5_per_residue_h5 not set — skipping ProtT5 embeddings" - } + def 
prott5_file = file(params.url_uniprot_prott5_embeddings) empty_db = INIT_DOMAINSPLIT_DB().domainsplit_db @@ -47,6 +38,10 @@ main: empty_db, params.url_3did, params.url_negatome, + params.url_uniprot_swissprot_pfam, + params.hippie_tsv, + params.ppidm_tsv, + params.negative_ppi_parquet, ) domainsplit_db_ddi = COLLECT_DDI_DATA.out.domainsplit_db @@ -58,9 +53,9 @@ main: protein_domain_map = CURATE_DOMAINS.out.protein_domain_map - GENERATE_EMBEDDINGS( - protein_domain_map, + generate_esm_embeddings( input_uniprot_sequences, + protein_domain_map, ) ENRICH_DDI_DATABASE( @@ -72,8 +67,8 @@ main: input_uniprot_go_terms, input_string, input_uniprot_id_mapping, - GENERATE_EMBEDDINGS.out.esm_protein_embeddings, - GENERATE_EMBEDDINGS.out.esm_domain_embeddings, + generate_esm_embeddings.out.protein_embeddings, + generate_esm_embeddings.out.domain_embeddings, ) ANALYZE_DDI_BIAS( @@ -84,6 +79,27 @@ main: ENRICH_DDI_DATABASE.out.domainsplit_db ) + // + // Collate and save software versions + // + ch_versions = ch_versions.mix( + INIT_DOMAINSPLIT_DB.out.versions, + COLLECT_DDI_DATA.out.versions, + CURATE_DOMAINS.out.versions, + generate_esm_embeddings.out.versions, + ENRICH_DDI_DATABASE.out.versions, + ANALYZE_DDI_BIAS.out.versions, + SPLIT_DOMAINSPLIT_DATABASE.out.versions, + ) + + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${params.outdir}/pipeline_info", + name: 'nf_core_' + 'pipeline_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true, + ) + emit: domainsplit_db = ENRICH_DDI_DATABASE.out.domainsplit_db split_db = SPLIT_DOMAINSPLIT_DATABASE.out.split_db