From 46e16a59e9e915903fa22edc0059d4768fbc8de4 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 18:30:15 +0200 Subject: [PATCH 01/16] added two more positive sources --- bin/build_ppi_negative_ddis.py | 44 ++++-- bin/build_swissprot_pfam_map.py | 129 ++++++++++++++++++ bin/ddi_db_utils.py | 84 ++++++++++++ bin/insert_3did.py | 65 +++++++++ bin/insert_negatome.py | 59 ++++++++ bin/insert_ppidm.py | 94 +++++++++++++ bin/insert_single_domain_ppi.py | 101 ++++++++++++++ conf/modules.config | 2 +- .../environment.yml | 0 .../local/external_validation_split/main.nf | 104 ++++++++++++++ modules/local/insert_3did/environment.yml | 6 + modules/local/insert_3did/main.nf | 25 ++++ modules/local/insert_ddis/main.nf | 99 -------------- modules/local/insert_negatome/environment.yml | 6 + modules/local/insert_negatome/main.nf | 25 ++++ .../local/insert_ppi_negative_ddis/main.nf | 7 +- modules/local/insert_ppidm/environment.yml | 6 + modules/local/insert_ppidm/main.nf | 27 ++++ .../insert_single_domain_ppi/environment.yml | 6 + .../local/insert_single_domain_ppi/main.nf | 29 ++++ modules/local/minimal_leakage_split/main.nf | 6 +- modules/local/random_ddi_split/main.nf | 6 +- .../remove_self_interactions/environment.yml | 6 + .../local/remove_self_interactions/main.nf | 40 ++++++ modules/local/swissprot_map/environment.yml | 5 + modules/local/swissprot_map/main.nf | 22 +++ nextflow.config | 44 ++++-- nextflow_schema.json | 55 ++++++-- subworkflows/local/collect_ddi_data/main.nf | 71 +++++++--- .../local/split_domainsplit_database/main.nf | 62 +++++++-- workflows/domainsplit.nf | 22 +-- 31 files changed, 1083 insertions(+), 174 deletions(-) create mode 100755 bin/build_swissprot_pfam_map.py create mode 100755 bin/ddi_db_utils.py create mode 100755 bin/insert_3did.py create mode 100755 bin/insert_negatome.py create mode 100755 bin/insert_ppidm.py create mode 100755 bin/insert_single_domain_ppi.py rename modules/local/{insert_ddis => 
external_validation_split}/environment.yml (100%) create mode 100644 modules/local/external_validation_split/main.nf create mode 100644 modules/local/insert_3did/environment.yml create mode 100644 modules/local/insert_3did/main.nf delete mode 100644 modules/local/insert_ddis/main.nf create mode 100644 modules/local/insert_negatome/environment.yml create mode 100644 modules/local/insert_negatome/main.nf create mode 100644 modules/local/insert_ppidm/environment.yml create mode 100644 modules/local/insert_ppidm/main.nf create mode 100644 modules/local/insert_single_domain_ppi/environment.yml create mode 100644 modules/local/insert_single_domain_ppi/main.nf create mode 100644 modules/local/remove_self_interactions/environment.yml create mode 100644 modules/local/remove_self_interactions/main.nf create mode 100644 modules/local/swissprot_map/environment.yml create mode 100644 modules/local/swissprot_map/main.nf diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_ddis.py index 23051b8..aae31d2 100755 --- a/bin/build_ppi_negative_ddis.py +++ b/bin/build_ppi_negative_ddis.py @@ -21,6 +21,8 @@ import pyarrow.parquet as pq import requests +from ddi_db_utils import pfam_sort_key + TAG = "[ppi_neg]" BATCH_SIZE = 500_000 @@ -38,7 +40,7 @@ def parse_args(): p.add_argument("--pfam-mapping-out", required=True, help="Output path for UniProt -> Pfam JSON mapping") p.add_argument("--min-n-tested", type=int, required=True) - p.add_argument("--source-label", required=True) + p.add_argument("--source-label", default="inferred_ppi_screen_negative") p.add_argument( "--sampling-strategy", choices=["frequency", "degree_matched"], @@ -46,6 +48,12 @@ def parse_args(): help="'frequency' = top-N by co-occurrence (old behavior). 
" "'degree_matched' = sample to match positive degree distribution.", ) + p.add_argument( + "--no-self", + action="store_true", + help="Skip self-pairs (domain interacting with itself) " + "when self_interaction is disabled.", + ) return p.parse_args() @@ -142,12 +150,17 @@ def fetch_gene_mappings(gene_names, batch_size=100): return gene_to_uniprot, uniprot_to_pfams -def load_positive_pfams(conn): +def load_3did_pfams(conn): + """Pfam IDs that appear in a 3did positive DDI. + + Negatives are inferred (and degree-matched) only over the 3did domain + universe, so single-domain / PPIDM positives never widen the candidate set. + """ cur = conn.execute( "SELECT DISTINCT d.pfam_id " "FROM domain AS d JOIN domain_domain_interaction AS ddi " " ON d.id IN (ddi.domain_id_a, ddi.domain_id_b) " - "WHERE ddi.negative = 0" + "WHERE ddi.negative = 0 AND ddi.source = '3did'" ) return {row[0] for row in cur} @@ -159,7 +172,7 @@ def load_existing_pairs(conn): "JOIN domain AS da ON da.id = ddi.domain_id_a " "JOIN domain AS db ON db.id = ddi.domain_id_b" ) - return {tuple(sorted((a, b))) for a, b in cur} + return {tuple(sorted((a, b), key=pfam_sort_key)) for a, b in cur} @@ -199,13 +212,13 @@ def _collect_genes_and_pairs(parquet_path, min_n_tested): def _compute_positive_degree(conn): - """Per-Pfam degree in the positive DDI set.""" + """Per-Pfam degree in the 3did positive DDI set.""" rows = conn.execute( "SELECT da.pfam_id, db.pfam_id " "FROM domain_domain_interaction AS ddi " "JOIN domain AS da ON da.id = ddi.domain_id_a " "JOIN domain AS db ON db.id = ddi.domain_id_b " - "WHERE ddi.negative = 0" + "WHERE ddi.negative = 0 AND ddi.source = '3did'" ).fetchall() deg = defaultdict(int) for a, b in rows: @@ -310,8 +323,8 @@ def main(): conn.execute("PRAGMA journal_mode=OFF") conn.execute("PRAGMA synchronous=OFF") - pos_pfam = load_positive_pfams(conn) - log(f"n_positive_pfams = {len(pos_pfam)}") + pos_pfam = load_3did_pfams(conn) + log(f"n_3did_pfams = {len(pos_pfam)}") existing_pairs = 
load_existing_pairs(conn) log(f"n_existing_ddis = {len(existing_pairs)}") @@ -335,7 +348,9 @@ def row_pfams(gene): continue row_pairs = set() for a, b in itertools.product(bait_pfams, prey_pfams): - row_pairs.add(tuple(sorted((a, b)))) + if args.no_self and a == b: + continue + row_pairs.add(tuple(sorted((a, b), key=pfam_sort_key))) if not row_pairs: continue n_rows_with_pairs += 1 @@ -357,7 +372,8 @@ def row_pfams(gene): ) n_positive = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative = 0" + "SELECT COUNT(*) FROM domain_domain_interaction " + "WHERE negative = 0 AND source = '3did'" ).fetchone()[0] n_negatome = conn.execute( "SELECT COUNT(*) FROM domain_domain_interaction " @@ -404,9 +420,15 @@ def row_pfams(gene): insert_rows = [] for (pfam_a, pfam_b), _ in chosen: + # normalise by Pfam accession number (matching ddi_db_utils.insert_ddis) + # so swapped pairs collapse and dedup consistently with the other sources for d_a in pfam_to_domain_ids.get(pfam_a, ()): for d_b in pfam_to_domain_ids.get(pfam_b, ()): - insert_rows.append((d_a, d_b, True, args.source_label)) + if pfam_sort_key(pfam_a) <= pfam_sort_key(pfam_b): + lo, hi = d_a, d_b + else: + lo, hi = d_b, d_a + insert_rows.append((lo, hi, True, args.source_label)) conn.executemany( "INSERT OR IGNORE INTO domain_domain_interaction" diff --git a/bin/build_swissprot_pfam_map.py b/bin/build_swissprot_pfam_map.py new file mode 100755 index 0000000..7d923aa --- /dev/null +++ b/bin/build_swissprot_pfam_map.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +"""Build a reviewed-human UniProt -> Pfam map for single-domain detection. 
+ +Downloads one UniProt stream (TSV, fields accession,id,gene_names,xref_pfam for +``reviewed:true AND organism_id:9606``) and emits ``swissprot_pfam_map.json``: + + { + "accession_to_pfams": {accession: [Pfam, ...]}, + "name_to_accession": {entry_name_or_gene: accession} + } + +``name_to_accession`` lets the single-domain step resolve HIPPIE identifiers that +are entry names (e.g. ``AL1A1_HUMAN``) or gene names; accessions resolve directly +against ``accession_to_pfams``. Gene names that map to more than one accession are +dropped as ambiguous; unique entry names always win. +""" + +import argparse +import gzip +import json +import os +import shutil +import ssl +import sys +import urllib.error +import urllib.request + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--url", required=True, help="UniProt stream URL or local TSV(.gz) file") + p.add_argument("--out", required=True, help="Output JSON path") + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def fetch(url, out_path): + def _download(ctx): + req = urllib.request.Request(url, headers={"User-Agent": "domainsplit-pipeline"}) + with urllib.request.urlopen(req, context=ctx, timeout=600) as resp, open(out_path, "wb") as fh: + while True: + chunk = resp.read(1024 * 1024) + if not chunk: + break + fh.write(chunk) + + if url.startswith(("http://", "https://", "ftp://", "file://")): + try: + _download(ssl.create_default_context()) + except (urllib.error.URLError, ssl.SSLError) as exc: + print(f"WARNING: SSL validation failed for {url} ({exc!r}); retrying unverified.", + file=sys.stderr, flush=True) + _download(ssl._create_unverified_context()) + elif os.path.exists(url): + shutil.copy(url, out_path) + else: + raise SystemExit(f"url_uniprot_swissprot_pfam '{url}' is neither a URL nor a local file") + + +def open_maybe_gzip(path): + with open(path, "rb") as fh: + magic = fh.read(2) + if magic == b"\x1f\x8b": + return 
gzip.open(path, "rt") + return open(path, "rt") + + +def main(): + args = parse_args() + + raw = "swissprot.tsv" + fetch(args.url, raw) + + accession_to_pfams = {} + gene_to_accs = {} # gene token -> set of accessions (for ambiguity check) + entry_name_to_acc = {} + + n_lines = 0 + with open_maybe_gzip(raw) as fh: + for i, line in enumerate(fh): + line = line.rstrip("\n") + if i == 0 and line.lower().startswith("entry"): + continue # header + if not line: + continue + cols = line.split("\t") + if len(cols) < 4: + cols += [""] * (4 - len(cols)) + accession, entry_name, gene_names, pfam_field = cols[0], cols[1], cols[2], cols[3] + if not accession: + continue + n_lines += 1 + + pfams = sorted({p for p in pfam_field.replace(",", ";").split(";") if p}) + accession_to_pfams[accession] = pfams + + if entry_name: + entry_name_to_acc[entry_name] = accession + for token in gene_names.split(): + gene_to_accs.setdefault(token, set()).add(accession) + + # entry names are unique and authoritative; add unambiguous gene names that + # do not collide with an entry name + name_to_accession = dict(entry_name_to_acc) + for token, accs in gene_to_accs.items(): + if token in name_to_accession: + continue + if len(accs) == 1: + name_to_accession[token] = next(iter(accs)) + + n_single = sum(1 for pfams in accession_to_pfams.values() if len(pfams) == 1) + print(f"[swissprot_map] proteins={n_lines} single_domain={n_single} " + f"names={len(name_to_accession)}", flush=True) + + with open(args.out, "w") as fh: + json.dump( + {"accession_to_pfams": accession_to_pfams, + "name_to_accession": name_to_accession}, + fh, + ) + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/ddi_db_utils.py b/bin/ddi_db_utils.py new file mode 100755 index 0000000..e2c6570 --- /dev/null +++ b/bin/ddi_db_utils.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""Shared helpers for 
inserting DDIs into the domainsplit SQLite. + +Every ``INSERT_*`` step uses these so all sources are handled uniformly: +first bulk-create any missing ``domain`` rows for the Pfam IDs it references, +then insert its DDIs. Pairs are order-normalised by Pfam accession number (see +:func:`pfam_sort_key`) so the stored ``(domain_id_a, domain_id_b)`` order is +stable -- a pair is deduplicated regardless of the order it is supplied in and +regardless of which source inserted it first or in what order domains were +created -- and ``INSERT OR IGNORE`` keeps the earliest (positive *or* negative) +row. +""" + +import sqlite3 + + +def pfam_sort_key(pfam): + """Sort key for a Pfam accession by its numeric part (``PF00028`` -> ``28``). + + Used to canonicalise DDI pairs so the stored column order depends only on the + Pfam accessions, never on the internal ``domain.id`` insertion order. Strips + everything but digits; accessions without any digit fall back to a lexical key + that sorts deterministically after all numbered ones. + """ + digits = "".join(c for c in pfam if c.isdigit()) + return (0, int(digits)) if digits else (1, pfam) + + +def ensure_domains(conn, pfam_ids): + """Bulk ``INSERT OR IGNORE`` domain rows for ``pfam_ids`` (name left NULL). + + Returns the number of distinct Pfam IDs supplied. + """ + unique = {p for p in pfam_ids if p} + conn.executemany( + "INSERT OR IGNORE INTO domain(pfam_id) VALUES (?)", + [(p,) for p in unique], + ) + return len(unique) + + +def _pfam_to_id(conn): + return {pfam: did for did, pfam in conn.execute("SELECT id, pfam_id FROM domain")} + + +def insert_ddis(conn, pairs, negative, source): + """Insert DDIs for ``(pfam_a, pfam_b)`` pairs. + + Domains must already exist (call :func:`ensure_domains` first); pairs whose + Pfam is missing from the ``domain`` table are skipped. Each pair is stored + in :func:`pfam_sort_key` order (lower Pfam accession number first, not lower ``domain.id``) so swapped duplicates collapse onto one row.
+ + Returns the number of rows offered to ``INSERT OR IGNORE`` (before dedup by + the DB). + """ + pfam_to_id = _pfam_to_id(conn) + neg = int(bool(negative)) + rows = [] + seen = set() + for a, b in pairs: + ia = pfam_to_id.get(a) + ib = pfam_to_id.get(b) + if ia is None or ib is None: + continue + key = (ia, ib) if pfam_sort_key(a) <= pfam_sort_key(b) else (ib, ia) + if key in seen: + continue + seen.add(key) + rows.append((key[0], key[1], neg, source)) + + conn.executemany( + "INSERT OR IGNORE INTO domain_domain_interaction" + "(domain_id_a, domain_id_b, negative, source) VALUES (?, ?, ?, ?)", + rows, + ) + return len(rows) + + +def count_source(conn, source): + """Number of DDI rows currently tagged with ``source``.""" + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] diff --git a/bin/insert_3did.py b/bin/insert_3did.py new file mode 100755 index 0000000..33fec32 --- /dev/null +++ b/bin/insert_3did.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +"""Insert 3did positive DDIs into the domainsplit SQLite. + +3did is treated like every other source: read its DDI pairs, bulk-create any +missing ``domain`` rows for the referenced Pfam IDs, then insert the +interactions as ``negative=0, source='3did'``. 
+""" + +import argparse +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True, help="domainsplit SQLite (modified in place)") + p.add_argument("--sqlite-3did", required=True, help="3did SQLite from DOWNLOAD_3DID_SQLITE") + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def iter_3did_pairs(conn_3did): + """Yield (pfam_a, pfam_b) Pfam accessions (version stripped) for each 3did DDI.""" + cursor = conn_3did.execute( + "SELECT d1.Pfam_id, d2.Pfam_id " + "FROM DDI1, Domain AS d1, Domain AS d2 " + "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name" + ) + for id_1, id_2 in cursor: + yield id_1.split(".")[0], id_2.split(".")[0] + + +def main(): + args = parse_args() + + conn_3did = sqlite3.connect(args.sqlite_3did) + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + pairs = list(iter_3did_pairs(conn_3did)) + conn_3did.close() + print(f"[3did] read {len(pairs)} DDI pairs", flush=True) + + pfams = {p for pair in pairs for p in pair} + n_domains = ensure_domains(conn, pfams) + print(f"[3did] ensured {n_domains} domains", flush=True) + + insert_ddis(conn, pairs, negative=False, source="3did") + conn.commit() + print(f"[3did] n_ddis_source_3did = {count_source(conn, '3did')}", flush=True) + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_negatome.py b/bin/insert_negatome.py new file mode 100755 index 0000000..48b33d1 --- /dev/null +++ b/bin/insert_negatome.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +"""Insert Negatome negative DDIs into the 
domainsplit SQLite. + +Negatome ``combined_pfam.txt`` lists whitespace-separated Pfam pairs that do not +interact. Treated like every other source: bulk-create missing domains, then +insert as ``negative=1, source='negatome'``. +""" + +import argparse +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--negatome", required=True) + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def iter_negatome_pairs(path): + with open(path) as f: + for line in f: + tokens = line.split() + if len(tokens) < 2: + continue + yield tokens[0], tokens[1] + + +def main(): + args = parse_args() + + pairs = list(iter_negatome_pairs(args.negatome)) + print(f"[negatome] read {len(pairs)} pairs", flush=True) + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + pfams = {p for pair in pairs for p in pair} + ensure_domains(conn, pfams) + insert_ddis(conn, pairs, negative=True, source="negatome") + conn.commit() + print(f"[negatome] n_ddis_source_negatome = {count_source(conn, 'negatome')}", flush=True) + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_ppidm.py b/bin/insert_ppidm.py new file mode 100755 index 0000000..9ed2556 --- /dev/null +++ b/bin/insert_ppidm.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +"""Insert PPIDM predicted positive DDIs, keeping the class as the source. + +Input ``predicted_ddi_ppi.tsv`` columns: ``domain_1 domain_2 class`` where each +domain token looks like ``10114/PF00069`` (the Pfam accession follows the slash). 
+Rows are inserted as ``negative=0, source='PPIDM_' + class`` (``PPIDM_Gold``, ``PPIDM_Silver`` or ``PPIDM_Bronze``) for the requested +classes. Classes are processed Gold -> Silver -> Bronze so that, on a duplicate +domain pair, the highest-confidence class wins (``INSERT OR IGNORE``). +""" + +import argparse +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + +# highest confidence first +CLASS_ORDER = ["Gold", "Silver", "Bronze"] + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--ppidm", required=True) + p.add_argument("--classes", required=True, + help="comma-separated classes to include, e.g. 'Bronze,Silver,Gold'") + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def extract_pfam(token): + """``10114/PF00069`` or ``PF00069.3`` -> ``PF00069`` (or None).""" + pf = token.split("/")[-1].split(".")[0].strip() + return pf if pf.startswith("PF") else None + + +def main(): + args = parse_args() + + allowed = {c.strip().capitalize() for c in args.classes.split(",") if c.strip()} + classes = [c for c in CLASS_ORDER if c in allowed] + print(f"[ppidm] including classes: {classes}", flush=True) + + # class -> list of (pfam_a, pfam_b) + pairs_by_class = {c: [] for c in classes} + n_rows = n_bad = 0 + with open(args.ppidm) as fh: + for i, line in enumerate(fh): + line = line.rstrip("\n") + if not line: + continue + cols = line.split("\t") + if len(cols) < 3: + continue + if i == 0 and cols[2].strip().lower() == "class": + continue # header + cls = cols[2].strip().capitalize() + if cls not in pairs_by_class: + continue + pfam_a = extract_pfam(cols[0]) + pfam_b = extract_pfam(cols[1]) + if pfam_a is None or pfam_b is None: + n_bad += 1 + continue + pairs_by_class[cls].append((pfam_a, pfam_b)) + n_rows += 1 + + print(f"[ppidm] parsed {n_rows} pairs ({n_bad} unparseable)", flush=True) + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA
foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + all_pfams = {p for pairs in pairs_by_class.values() for pair in pairs for p in pair} + ensure_domains(conn, all_pfams) + + for cls in classes: # Gold first + source = f"PPIDM_{cls}" + insert_ddis(conn, pairs_by_class[cls], negative=False, source=source) + conn.commit() + print(f"[ppidm] n_ddis_source_{source} = {count_source(conn, source)}", flush=True) + + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/insert_single_domain_ppi.py b/bin/insert_single_domain_ppi.py new file mode 100755 index 0000000..996ecdf --- /dev/null +++ b/bin/insert_single_domain_ppi.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Infer positive DDIs from HIPPIE PPIs between two single-domain proteins. + +A PPI contributes a positive DDI only when *both* interactors are reviewed human +proteins annotated with exactly one Pfam domain; the DDI is then the pair of +those two single domains. Identifiers in the HIPPIE columns may be UniProt +accessions or entry names (e.g. ``AL1A1_HUMAN``) -- both are resolved via the +SwissProt map. New domains are bulk-created so they get curated downstream. 
+""" + +import argparse +import json +import sqlite3 +import sys + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--hippie", required=True) + p.add_argument("--swissprot-map", required=True) + p.add_argument("--min-score", type=float, required=True) + p.add_argument("--versions", required=True) + p.add_argument("--process-name", required=True) + return p.parse_args() + + +def main(): + args = parse_args() + + with open(args.swissprot_map) as fh: + smap = json.load(fh) + accession_to_pfams = smap["accession_to_pfams"] + name_to_accession = smap["name_to_accession"] + + # single-domain proteins: accession -> its one Pfam + single_domain = { + acc: pfams[0] + for acc, pfams in accession_to_pfams.items() + if len(pfams) == 1 + } + print(f"[single_domain_ppi] single-domain proteins: {len(single_domain)}", flush=True) + + def resolve_pfam(token): + """Return the single Pfam of ``token`` (accession or name), else None.""" + acc = token if token in accession_to_pfams else name_to_accession.get(token) + if acc is None: + return None + return single_domain.get(acc) + + pairs = [] + n_rows = n_kept = n_unresolved = 0 + with open(args.hippie) as fh: + for line in fh: + line = line.rstrip("\n") + if not line: + continue + cols = line.split("\t") + if len(cols) < 5: + continue + n_rows += 1 + try: + score = float(cols[4]) + except ValueError: + continue + if score < args.min_score: + continue + pfam_a = resolve_pfam(cols[0]) + pfam_b = resolve_pfam(cols[2]) + if pfam_a is None or pfam_b is None: + n_unresolved += 1 + continue + pairs.append((pfam_a, pfam_b)) + n_kept += 1 + + print(f"[single_domain_ppi] hippie_rows={n_rows} score>= {args.min_score}: " + f"single_domain_pairs={n_kept} unresolved_or_multi={n_unresolved}", flush=True) + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA 
journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + pfams = {p for pair in pairs for p in pair} + ensure_domains(conn, pfams) + insert_ddis(conn, pairs, negative=False, source="single_domain_ppi") + conn.commit() + print(f"[single_domain_ppi] n_ddis_source = " + f"{count_source(conn, 'single_domain_ppi')}", flush=True) + conn.close() + + with open(args.versions, "w") as f: + f.write(f'"{args.process_name}":\n') + f.write(f" python: {sys.version.split()[0]}\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\n") + + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index 14931ed..a91b4f8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -29,7 +29,7 @@ process { // default publishDir copy each of them under `insert/` / `smoke/` / // `init/` would create stale duplicates and racy filename collisions. // Disable publishing for these intermediates explicitly. - withName: 'INIT_DOMAINSPLIT_DB|INSERT_DDIS|INSERT_NEGATIVE_DDIS|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' { + withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' { publishDir = [ enabled: false ] } diff --git a/modules/local/insert_ddis/environment.yml b/modules/local/external_validation_split/environment.yml similarity index 100% rename from modules/local/insert_ddis/environment.yml rename to modules/local/external_validation_split/environment.yml diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf new file mode 100644 index 0000000..e7fc98d --- /dev/null +++ b/modules/local/external_validation_split/main.nf @@ -0,0 +1,104 @@ 
+/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUBSET_DDIS_BY_SOURCE -- build a single split database keeping only DDIs + whose `source` is in the requested set, then prune orphan domains/proteins. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Used for the External-Validation test set, which is placed "as is" (no + leakage-aware partitioning) from the held-out sources. +----------------------------------------------------------------------------*/ + +process SUBSET_DDIS_BY_SOURCE { + tag "subset_${split_name}" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path 'domainsplit.sqlite3' + val source_filter // list of DDI source strings to keep + val split_name // output split name, e.g. 'test' + + output: + path('*.sqlite3'), emit: split_dbs + val output_split_info, emit: split_info + path "versions.yml", emit: versions + + script: + output_split_info = [["${split_name}.sqlite3", split_name]] + def src_list = source_filter.collect { "'${it}'" }.join(", ") + + """ + #!/usr/bin/env python3 + import os + os.environ["SQLITE_TMPDIR"] = os.getcwd() + + import sqlite3 + import shutil + import sys + + input_db_path = "domainsplit.sqlite3" + output_path = "${split_name}.sqlite3" + sources = (${src_list},) + + shutil.copyfile(input_db_path, output_path) + + conn = sqlite3.connect(output_path) + conn.executescript(''' + PRAGMA foreign_keys=ON; + PRAGMA journal_mode=OFF; + PRAGMA synchronous=OFF; + ''') + + placeholders = ",".join("?" 
for _ in sources) + n_keep = conn.execute( + f"SELECT COUNT(*) FROM domain_domain_interaction WHERE source IN ({placeholders})", + sources, + ).fetchone()[0] + print(f"Keeping {n_keep} DDIs with source in {sources}", flush=True) + + conn.execute( + f"DELETE FROM domain_domain_interaction WHERE source NOT IN ({placeholders})", + sources, + ) + + conn.execute(''' + DELETE FROM domain WHERE id IN ( + SELECT d.id FROM domain d + LEFT JOIN domain_domain_interaction ddi + ON ddi.domain_id_a = d.id OR ddi.domain_id_b = d.id + LEFT JOIN domain_protein_map dpm + ON dpm.domain_id = d.id + WHERE ddi.id IS NULL OR dpm.domain_id IS NULL + ) + ''') + + conn.execute(''' + DELETE FROM protein WHERE id IN ( + SELECT p.id FROM protein p + LEFT JOIN domain_protein_map dpm + ON dpm.protein_id = p.id + WHERE dpm.domain_id IS NULL + ) + ''') + + conn.executescript(''' + VACUUM; + + CREATE INDEX IF NOT EXISTS idx_ddi_domain_a ON domain_domain_interaction(domain_id_a); + CREATE INDEX IF NOT EXISTS idx_ddi_domain_b ON domain_domain_interaction(domain_id_b); + CREATE INDEX IF NOT EXISTS idx_dpm_domain ON domain_protein_map(domain_id); + CREATE INDEX IF NOT EXISTS idx_dpm_protein ON domain_protein_map(protein_id); + CREATE INDEX IF NOT EXISTS idx_ppi_protein_a ON protein_protein_interaction(protein_id_a); + CREATE INDEX IF NOT EXISTS idx_ppi_protein_b ON protein_protein_interaction(protein_id_b); + CREATE INDEX IF NOT EXISTS idx_pgo_protein ON protein_go_terms(protein_id); + ''') + + conn.close() + print(f" {output_path}: done", flush=True) + + with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f" python: {sys.version.split()[0]}\\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") + """ +} diff --git a/modules/local/insert_3did/environment.yml b/modules/local/insert_3did/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_3did/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda 
+dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf new file mode 100644 index 0000000..c2485a2 --- /dev/null +++ b/modules/local/insert_3did/main.nf @@ -0,0 +1,25 @@ +process INSERT_3DID { + tag "insert_3did" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path sqlite_3did + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_3did.py \\ + --db domainsplit.sqlite3 \\ + --sqlite-3did ${sqlite_3did} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ +} diff --git a/modules/local/insert_ddis/main.nf b/modules/local/insert_ddis/main.nf deleted file mode 100644 index 1acf264..0000000 --- a/modules/local/insert_ddis/main.nf +++ /dev/null @@ -1,99 +0,0 @@ -process INSERT_DDIS { - tag "insert_ddis" - label 'process_low' - conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" - - input: - path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' - path sqlite_3did - path negatome_txt - - output: - path "domainsplit.sqlite3", emit: domainsplit_db - path "versions.yml", emit: versions - - script: - """ - #!/usr/bin/env python3 - import shutil - import sqlite3 - import sys - - shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3") - - conn_3did = sqlite3.connect("${sqlite_3did}") - conn_domainsplit = sqlite3.connect("domainsplit.sqlite3") - conn_domainsplit.execute("PRAGMA foreign_keys=ON") - conn_domainsplit.execute("PRAGMA journal_mode=OFF") - conn_domainsplit.execute("PRAGMA synchronous=OFF") - - def iter_negatome_pairs(path): - with open(path) as f: - for line in f: - tokens = line.split() - if len(tokens) < 2: - continue - yield tokens[0], tokens[1] - - 
def negatome_pfam_ids(path): - ids = set() - for a, b in iter_negatome_pairs(path): - ids.add(a) - ids.add(b) - return ids - - # ---- domain rows: 3did Domain x domain_length + negatome pfam ids - print("Inserting domain information", flush=True) - cursor = conn_3did.execute( - "SELECT Name, Pfam_id, profile_length " - "FROM Domain, domain_length " - "WHERE domain_length.domain = Domain.Name" - ) - domain_rows_3did = ((name, pfam_id.split(".")[0]) for (name, pfam_id, _length) in cursor) - domain_rows_negatome = ((None, pfam_id) for pfam_id in negatome_pfam_ids("${negatome_txt}")) - - conn_domainsplit.executemany( - "INSERT OR IGNORE INTO domain(name, pfam_id) VALUES (?, ?);", - list(domain_rows_3did) + list(domain_rows_negatome), - ) - cursor.close() - conn_domainsplit.commit() - - # ---- positive DDIs from 3did - print("Inserting positive DDIs from 3did", flush=True) - cursor = conn_3did.execute( - "SELECT d1.Pfam_id, d2.Pfam_id " - "FROM DDI1, Domain AS d1, Domain AS d2 " - "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name;" - ) - pos_iter = ((id_1.split(".")[0], id_2.split(".")[0]) for (id_1, id_2) in cursor) - conn_domainsplit.executemany( - '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source) - SELECT d1.id, d2.id, FALSE, '3did' - FROM domain AS d1, domain AS d2 - WHERE d1.pfam_id = ? AND d2.pfam_id = ?;''', - pos_iter, - ) - cursor.close() - conn_domainsplit.commit() - - # ---- negative DDIs from negatome - print("Inserting negative DDIs from negatome", flush=True) - conn_domainsplit.executemany( - '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source) - SELECT d1.id, d2.id, TRUE, 'negatome' - FROM domain AS d1, domain AS d2 - WHERE d1.pfam_id = ? 
AND d2.pfam_id = ?;''', - iter_negatome_pairs("${negatome_txt}"), - ) - conn_domainsplit.commit() - conn_domainsplit.close() - conn_3did.close() - - with open("versions.yml", "w") as f: - f.write('"${task.process}":\\n') - f.write(f" python: {sys.version.split()[0]}\\n") - f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") - """ -} diff --git a/modules/local/insert_negatome/environment.yml b/modules/local/insert_negatome/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_negatome/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf new file mode 100644 index 0000000..454982d --- /dev/null +++ b/modules/local/insert_negatome/main.nf @@ -0,0 +1,25 @@ +process INSERT_NEGATOME { + tag "insert_negatome" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path negatome_txt + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_negatome.py \\ + --db domainsplit.sqlite3 \\ + --negatome ${negatome_txt} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ +} diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/insert_ppi_negative_ddis/main.nf index 345760e..ba4792c 100644 --- a/modules/local/insert_ppi_negative_ddis/main.nf +++ b/modules/local/insert_ppi_negative_ddis/main.nf @@ -8,8 +8,8 @@ process INSERT_PPI_NEGATIVE_DDIS { path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' path negative_ppi_parquet val min_n_tested - val source_label val sampling_strategy + val self_interaction output: path "domainsplit.sqlite3", emit: domainsplit_db @@ -17,6 +17,7 @@ 
process INSERT_PPI_NEGATIVE_DDIS { path "versions.yml", emit: versions script: + def no_self = self_interaction ? "" : "--no-self" """ cp "${domainsplit_db_in}" domainsplit.sqlite3 @@ -25,8 +26,8 @@ process INSERT_PPI_NEGATIVE_DDIS { --parquet "${negative_ppi_parquet}" \\ --pfam-mapping-out uniprot_pfam_mapping.json \\ --min-n-tested ${min_n_tested} \\ - --source-label "${source_label}" \\ - --sampling-strategy "${sampling_strategy}" + --sampling-strategy "${sampling_strategy}" \\ + ${no_self} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/insert_ppidm/environment.yml b/modules/local/insert_ppidm/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_ppidm/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf new file mode 100644 index 0000000..749b705 --- /dev/null +++ b/modules/local/insert_ppidm/main.nf @@ -0,0 +1,27 @@ +process INSERT_PPIDM { + tag "insert_ppidm" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path ppidm_tsv + val classes + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_ppidm.py \\ + --db domainsplit.sqlite3 \\ + --ppidm ${ppidm_tsv} \\ + --classes "${classes}" \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ +} diff --git a/modules/local/insert_single_domain_ppi/environment.yml b/modules/local/insert_single_domain_ppi/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/insert_single_domain_ppi/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + 
- python=3.11 + - sqlite diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf new file mode 100644 index 0000000..3a14070 --- /dev/null +++ b/modules/local/insert_single_domain_ppi/main.nf @@ -0,0 +1,29 @@ +process INSERT_SINGLE_DOMAIN_PPI { + tag "insert_single_domain_ppi" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path hippie_tsv + path swissprot_map + val min_score + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_single_domain_ppi.py \\ + --db domainsplit.sqlite3 \\ + --hippie ${hippie_tsv} \\ + --swissprot-map ${swissprot_map} \\ + --min-score ${min_score} \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ +} diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf index b0a4099..af546cb 100644 --- a/modules/local/minimal_leakage_split/main.nf +++ b/modules/local/minimal_leakage_split/main.nf @@ -41,6 +41,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { path "domainsplit.sqlite3" val split_fractions // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)] path ("domain_clusters.tsv") + val source_filter // list of DDI source strings to include; [] = all sources output: path('*.sqlite3'), emit: split_dbs @@ -59,6 +60,9 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ") def split_fraction_dict_py = "{" + split_fraction_dict_str + "}" + def src_list = source_filter.collect { "'${it}'" }.join(", ") + def where_clause = source_filter ? 
"WHERE source IN (${src_list})" : "" + """ #!/usr/bin/env python3 \"\"\" @@ -112,7 +116,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { # ── Load DDI data ──────────────────────────────────────────────── conn = sqlite3.connect(input_db_path) ddi_rows = conn.execute( - "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction" + "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction ${where_clause}" ).fetchall() conn.close() print(f"Loaded {len(ddi_rows)} DDIs") diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf index 5e67e95..1d20ff4 100644 --- a/modules/local/random_ddi_split/main.nf +++ b/modules/local/random_ddi_split/main.nf @@ -7,6 +7,7 @@ process RANDOM_DDI_SPLIT { input: path 'domainsplit.sqlite3' val split_fractions // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)] + val source_filter // list of DDI source strings to include; [] = all sources output: path('*.sqlite3'), emit: split_dbs @@ -25,6 +26,9 @@ process RANDOM_DDI_SPLIT { def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ") def split_fraction_dict_py = "{" + split_fraction_dict_str + "}" + def src_list = source_filter.collect { "'${it}'" }.join(", ") + def where_clause = source_filter ? 
"WHERE source IN (${src_list})" : "" + """ #!/usr/bin/env python3 @@ -39,7 +43,7 @@ process RANDOM_DDI_SPLIT { split_fractions = ${split_fraction_dict_py} conn = sqlite3.connect(input_db_path) - ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction")] + ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction ${where_clause}")] conn.close() random.shuffle(ddi_ids) diff --git a/modules/local/remove_self_interactions/environment.yml b/modules/local/remove_self_interactions/environment.yml new file mode 100644 index 0000000..514346d --- /dev/null +++ b/modules/local/remove_self_interactions/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 + - sqlite diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf new file mode 100644 index 0000000..c87e5a8 --- /dev/null +++ b/modules/local/remove_self_interactions/main.nf @@ -0,0 +1,40 @@ +process REMOVE_SELF_INTERACTIONS { + tag "remove_self_interactions" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "versions.yml", emit: versions + + script: + """ + #!/usr/bin/env python3 + import shutil + import sqlite3 + import sys + + shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3") + + conn = sqlite3.connect("domainsplit.sqlite3") + conn.execute("PRAGMA foreign_keys=ON") + before = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + conn.execute( + "DELETE FROM domain_domain_interaction WHERE domain_id_a = domain_id_b" + ) + conn.commit() + after = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + conn.close() + print(f"[remove_self_interactions] removed {before - after} self-DDIs " + 
f"({before} -> {after})", flush=True) + + with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f" python: {sys.version.split()[0]}\\n") + f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") + """ +} diff --git a/modules/local/swissprot_map/environment.yml b/modules/local/swissprot_map/environment.yml new file mode 100644 index 0000000..150b843 --- /dev/null +++ b/modules/local/swissprot_map/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11 diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf new file mode 100644 index 0000000..e28b2e2 --- /dev/null +++ b/modules/local/swissprot_map/main.nf @@ -0,0 +1,22 @@ +process BUILD_SWISSPROT_PFAM_MAP { + tag "swissprot_map" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + val url + + output: + path "swissprot_pfam_map.json", emit: map + path "versions.yml", emit: versions + + script: + """ + build_swissprot_pfam_map.py \\ + --url "${url}" \\ + --out swissprot_pfam_map.json \\ + --versions versions.yml \\ + --process-name "${task.process}" + """ +} diff --git a/nextflow.config b/nextflow.config index e5693d9..97a108c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,9 +25,9 @@ params { url_pfam_template = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download' - // ProtT5 per-residue embeddings: local path to the EBI per-residue.h5 file. - // Download from https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/uniprot_sprot/per-residue.h5 - // When null or file missing, ProtT5 embeddings are skipped with a warning. + // ProtT5 per-residue embeddings: optional local path to a pre-downloaded + // EBI per-residue.h5 file. When set and the file exists it is used; otherwise + // the embeddings are downloaded from url_uniprot_embeddings (ProtT5 always runs). 
prott5_per_residue_h5 = null // ESM embedding sharding + inference knobs. @@ -62,20 +62,38 @@ params { // null (default) to disable the smoke filter entirely. smoke_test_n_ddis = null - // Optional second negative-DDI source derived from a Y2H/MS PPI parquet - // (columns: gene_name_bait, gene_name_prey, n_tested, ...). When the path - // is non-null the COLLECT_DDI_DATA subworkflow filters rows by `n_tested`, - // maps genes to UniProt via the Swiss-Prot flat file, looks up Pfam domains via - // the UniProt REST API, enumerates every Pfam-pair cross - // product restricted to Pfam IDs already present in positive DDIs, ranks - // pairs by co-occurrence frequency, and inserts the top-N as negatives so - // the total negative count matches the positive count. Leave the parquet - // null (default) to skip the whole step. + // Negative-DDI source derived from a Y2H/MS PPI parquet (columns: + // gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA filters + // rows by `n_tested`, maps genes to UniProt then Pfam (via the UniProt REST + // API), enumerates the Pfam-pair cross product restricted to Pfam IDs already + // present in positive DDIs, samples pairs (frequency or degree-matched), and + // inserts them as negatives so the total negative count matches the positive + // count. Required input (no default; must be supplied per run). negative_ppi_parquet = null negative_ppi_min_n_tested = 5 - negative_ppi_source_label = 'y2h_ms' negative_sampling_strategy = 'degree_matched' // 'frequency' or 'degree_matched' + // Reviewed-human UniProt -> Pfam stream used to detect single-domain + // proteins (accession, entry name, gene names, Pfam xrefs). + url_uniprot_swissprot_pfam = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29' + + // Positive DDIs inferred from HIPPIE PPIs between two single-domain proteins. 
+ // Required input (no default). Rows are kept when the HIPPIE confidence + // score (column 5) is >= hippie_min_score. + hippie_tsv = null + hippie_min_score = 0.63 + + // Positive DDIs from PPIDM predictions (predicted_ddi_ppi.tsv with columns + // domain_1, domain_2, class). Class is kept as source 'PPIDM_<class>'. + // Required input (no default). + ppidm_tsv = null + ppidm_classes = 'Bronze,Silver,Gold' + + // When false, all self-interactions (a domain interacting with itself) are + // removed after the positive/negatome sources are inserted, and the + // high-conf non-PPI negative builder skips self-pairs. + self_interaction = true + // Boilerplate options input = null outdir = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 7fbe6ef..e74321b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -36,10 +36,11 @@ } }, "source_database_options": { - "title": "Source database URLs", + "title": "Source databases", "type": "object", "fa_icon": "fas fa-database", - "description": "URLs of the public source databases used to assemble the domainsplit database.", + "description": "Public source databases (URLs and local file paths) used to assemble the domainsplit database.", + "required": ["hippie_tsv", "ppidm_tsv", "negative_ppi_parquet"], "properties": { "url_3did": { "type": "string", @@ -97,7 +98,7 @@ }, "prott5_per_residue_h5": { "type": "string", - "description": "Local path to the EBI ProtT5 per-residue HDF5 file. When null or file missing, ProtT5 embeddings are skipped with a warning.", + "description": "Optional local path to a pre-downloaded ProtT5 per-residue HDF5 file. When set and present it is used; otherwise the file is downloaded from url_uniprot_embeddings.", "default": null, "fa_icon": "fas fa-file" }, @@ -158,7 +159,7 @@ "negative_ppi_parquet": { "type": ["string", "null"], "format": "file-path", - "description": "Optional path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). 
When set, COLLECT_DDI_DATA derives extra negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting the most-frequent Pfam-pair candidates restricted to domains already in positive DDIs. Null (default) disables this source.", + "description": "Required path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA derives negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting Pfam-pair candidates (degree-matched or by frequency) restricted to domains already in positive DDIs.", "default": null, "fa_icon": "fas fa-file-import" }, @@ -169,12 +170,6 @@ "minimum": 1, "fa_icon": "fas fa-filter" }, - "negative_ppi_source_label": { - "type": "string", - "description": "Value written to the `source` column of domain_domain_interaction for DDIs derived from negative_ppi_parquet.", - "default": "y2h_ms", - "fa_icon": "fas fa-tag" - }, "pfam_download_batch_size": { "type": "integer", "description": "Number of Pfam IDs grouped into a single download job to reduce scheduler overhead.", @@ -188,6 +183,46 @@ "default": "degree_matched", "enum": ["frequency", "degree_matched"], "fa_icon": "fas fa-random" + }, + "url_uniprot_swissprot_pfam": { + "type": "string", + "description": "UniProt stream URL (or local TSV path) of reviewed human proteins with fields accession, entry name, gene names and Pfam xrefs. Used to detect single-domain proteins for the HIPPIE-derived positive DDIs.", + "default": "https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29", + "fa_icon": "fas fa-link" + }, + "hippie_tsv": { + "type": ["string", "null"], + "format": "file-path", + "description": "Required path to a HIPPIE PPI TSV. 
COLLECT_DDI_DATA adds positive DDIs inferred from PPIs between two single-domain proteins.", + "default": null, + "fa_icon": "fas fa-file-import" + }, + "hippie_min_score": { + "type": "number", + "description": "Minimum HIPPIE confidence score (column 5) required to keep a PPI row when inferring single-domain positive DDIs.", + "default": 0.63, + "minimum": 0, + "maximum": 1, + "fa_icon": "fas fa-filter" + }, + "ppidm_tsv": { + "type": ["string", "null"], + "format": "file-path", + "description": "Required path to a PPIDM predictions TSV (columns: domain_1, domain_2, class). COLLECT_DDI_DATA adds positive DDIs tagged with source 'PPIDM_<class>'.", + "default": null, + "fa_icon": "fas fa-file-import" + }, + "ppidm_classes": { + "type": "string", + "description": "Comma-separated PPIDM confidence classes to include.", + "default": "Bronze,Silver,Gold", + "fa_icon": "fas fa-tags" + }, + "self_interaction": { + "type": "boolean", + "description": "When false, all self-interactions (a domain interacting with itself) are removed after the positive/negatome sources are inserted, and the high-confidence non-PPI negative builder skips self-pairs.", + "default": true, + "fa_icon": "fas fa-redo" } } }, diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 5d44942..6c3d9da 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -4,16 +4,29 @@ pre-initialised Domainsplit SQLite. Downstream code consumes only the database; the 3did SQLite stays internal to this subworkflow. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Sources are inserted in a fixed order so that, on a duplicate domain pair, + the earlier source wins the label (INSERT OR IGNORE): + + 3did -> single-domain PPI -> PPIDM -> negatome + -> [optional self-interaction removal] + -> high-confidence non-PPI negatives (over 3did domains only) + Add a new DDI source by: - 1. 
Adding its download module (network fetch + format normalisation). - 2. Calling it here and routing the parsed output into INSERT_DDIS (or a - per-source INSERT_* module if its parsing differs). + 1. Adding its download/parse module. + 2. Slotting an INSERT_<SOURCE> call into the chain below (each collects its + own unique Pfam IDs and bulk-creates missing domain rows via the shared + bin/ddi_db_utils.py helper). 3. Tagging its rows with a unique source string in domain_domain_interaction. ----------------------------------------------------------------------------*/ include { DOWNLOAD_3DID_SQLITE } from '../../../modules/local/3did/main.nf' include { DOWNLOAD_NEGATOME } from '../../../modules/local/negatome/main.nf' -include { INSERT_DDIS } from '../../../modules/local/insert_ddis/main.nf' +include { INSERT_3DID } from '../../../modules/local/insert_3did/main.nf' +include { BUILD_SWISSPROT_PFAM_MAP } from '../../../modules/local/swissprot_map/main.nf' +include { INSERT_SINGLE_DOMAIN_PPI } from '../../../modules/local/insert_single_domain_ppi/main.nf' +include { INSERT_PPIDM } from '../../../modules/local/insert_ppidm/main.nf' +include { INSERT_NEGATOME } from '../../../modules/local/insert_negatome/main.nf' +include { REMOVE_SELF_INTERACTIONS } from '../../../modules/local/remove_self_interactions/main.nf' include { INSERT_PPI_NEGATIVE_DDIS } from '../../../modules/local/insert_ppi_negative_ddis/main.nf' include { SMOKE_FILTER } from '../../../modules/local/smoke_filter/main.nf' @@ -22,28 +35,54 @@ workflow COLLECT_DDI_DATA { domainsplit_db_in url_3did url_negatome + url_uniprot_swissprot_pfam + hippie_tsv + ppidm_tsv + negative_ppi_parquet main: file_3did = file(url_3did) sqlite_3did = DOWNLOAD_3DID_SQLITE(file_3did).sqlite negatome_file = DOWNLOAD_NEGATOME(url_negatome).negatome - domainsplit_db = INSERT_DDIS(domainsplit_db_in, sqlite_3did, negatome_file).domainsplit_db + // 1. 
3did positives + domainsplit_db = INSERT_3DID(domainsplit_db_in, sqlite_3did).domainsplit_db + + // 2-3. single-domain PPI positives (HIPPIE), using a reviewed-human SwissProt map + swissprot_map = BUILD_SWISSPROT_PFAM_MAP(url_uniprot_swissprot_pfam).map + domainsplit_db = INSERT_SINGLE_DOMAIN_PPI( + domainsplit_db, + file(hippie_tsv), + swissprot_map, + params.hippie_min_score, + ).domainsplit_db - pfam_mapping = Channel.empty() + // 4. PPIDM predicted positives (class kept as source) + domainsplit_db = INSERT_PPIDM( + domainsplit_db, + file(ppidm_tsv), + params.ppidm_classes, + ).domainsplit_db - if (params.negative_ppi_parquet != null) { - ppi_result = INSERT_PPI_NEGATIVE_DDIS( - domainsplit_db, - file(params.negative_ppi_parquet), - params.negative_ppi_min_n_tested, - params.negative_ppi_source_label, - params.negative_sampling_strategy, - ) - domainsplit_db = ppi_result.domainsplit_db - pfam_mapping = ppi_result.pfam_mapping + // 5. negatome negatives + domainsplit_db = INSERT_NEGATOME(domainsplit_db, negatome_file).domainsplit_db + + // 6. optional removal of all self-interactions + if (!params.self_interaction) { + domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db } + // 7. 
high-confidence non-PPI negatives (inferred only over 3did domains) + ppi_result = INSERT_PPI_NEGATIVE_DDIS( + domainsplit_db, + file(negative_ppi_parquet), + params.negative_ppi_min_n_tested, + params.negative_sampling_strategy, + params.self_interaction, + ) + domainsplit_db = ppi_result.domainsplit_db + pfam_mapping = ppi_result.pfam_mapping + if (params.smoke_test_n_ddis != null) { domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db } diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index b88ef74..fc1ffb9 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -1,16 +1,23 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SPLIT_DOMAINSPLIT_DATABASE -- split the Domainsplit DB into train/opt/test - sets using random and minimal-leakage strategies. + SPLIT_DOMAINSPLIT_DATABASE -- produce three split strategies: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Extracts domain sequences, clusters with MMseqs2, then runs the - splitting strategies (random DDI as biased baseline, spectral - graph-partitioning minimal leakage on domains) producing per-split - SQLite databases directly. + * random_ddi biased baseline (random partition) + * minimal_leakage_domain leakage-aware spectral partition + * external_validation leakage-aware train/validation on the "core" + sources (3did + high-conf non-PPI negatives), + plus an as-is test set from the held-out sources + (single-domain PPI, PPIDM, negatome). + + Domain sequences are extracted and clustered (MMseqs2) once; both the + minimal-leakage and external-validation train/val partitions reuse the + clusters. 
----------------------------------------------------------------------------*/ include { RANDOM_DDI_SPLIT } from '../../../modules/local/random_ddi_split/main' include { EXTRACT_DOMAIN_SEQUENCES; MINIMAL_LEAKAGE_SPLIT_DOMAIN } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MINIMAL_LEAKAGE_SPLIT_TRAINVAL } from '../../../modules/local/minimal_leakage_split/main' +include { SUBSET_DDIS_BY_SOURCE } from '../../../modules/local/external_validation_split/main' include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main' @@ -38,6 +45,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { } clusters = MMSEQS_EASYCLUSTER(cluster_input) + def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }.first() def splits = [ ["train", 0.6], @@ -45,22 +53,58 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { ["test", 0.2] ] + // The two methods that mimic a within-distribution evaluation use only the + // "core" sources: 3did positives + high-confidence non-PPI negatives. + // 'inferred_ppi_screen_negative' must stay in sync with the --source-label + // default in bin/build_ppi_negative_ddis.py. + def core_sources = ['3did', 'inferred_ppi_screen_negative'] + + // External-validation test set: held-out sources placed as is. + def test_sources = [ + 'single_domain_ppi', + 'PPIDM_Bronze', 'PPIDM_Silver', 'PPIDM_Gold', + 'negatome', + ] + // Biased baseline: random DDI split (same proteins in train and test) RANDOM_DDI_SPLIT( domainsplit_db_ch, - Channel.of(splits) + Channel.of(splits), + core_sources ) // Leakage-aware: spectral graph partitioning on domain clusters MINIMAL_LEAKAGE_SPLIT_DOMAIN( domainsplit_db_ch, splits, - clusters.tsv.filter { it[0].id == "domain" }.map { it[1] } + clusters_tsv, + core_sources + ) + + // External validation: leakage-free train/validation on core sources ... 
+ def trainval_splits = [ + ["train", 0.8], + ["validation", 0.2] + ] + MINIMAL_LEAKAGE_SPLIT_TRAINVAL( + domainsplit_db_ch, + trainval_splits, + clusters_tsv, + core_sources + ) + + // ... plus an as-is test set from the held-out sources + SUBSET_DDIS_BY_SOURCE( + domainsplit_db_ch, + test_sources, + "test" ) split_ch = Channel.empty().mix( map_split_dbs(RANDOM_DDI_SPLIT.out.split_info, RANDOM_DDI_SPLIT.out.split_dbs, "random_ddi"), - map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain") + map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain"), + map_split_dbs(MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_info, MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_dbs, "external_validation"), + map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation") ) emit: diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf index 877e784..93f93f4 100644 --- a/workflows/domainsplit.nf +++ b/workflows/domainsplit.nf @@ -23,22 +23,20 @@ include { ANALYZE_DDI_BIAS } from '../modules/local/analyze_ddi_bias/ workflow DOMAINSPLIT { main: input_uniprot_id_mapping = file(params.url_uniprot_id_mapping) - input_uniprot_embeddings = file(params.url_uniprot_embeddings) input_uniprot_go_terms = file(params.url_uniprot_go_terms) input_uniprot_sequences = file(params.url_uniprot_sequences) input_string = file(params.url_string) input_pfam2go = file(params.url_pfam2go) - def prott5_file = [] - if (params.prott5_per_residue_h5) { - def f = file(params.prott5_per_residue_h5) - if (f.exists()) { - prott5_file = f - } else { - log.warn "ProtT5 HDF5 not found at '${params.prott5_per_residue_h5}' — skipping ProtT5 embeddings" - } + // ProtT5 per-residue embeddings: prefer a pre-downloaded local file when it + // exists, otherwise fall back to downloading url_uniprot_embeddings. 
Always + // populated, so ProtT5 embeddings are a compulsory step. + def prott5_file = file(params.url_uniprot_embeddings) + if (params.prott5_per_residue_h5 && file(params.prott5_per_residue_h5).exists()) { + prott5_file = file(params.prott5_per_residue_h5) + log.info "Using local ProtT5 HDF5 at '${params.prott5_per_residue_h5}'" } else { - log.warn "params.prott5_per_residue_h5 not set — skipping ProtT5 embeddings" + log.info "Using ProtT5 HDF5 from url_uniprot_embeddings" } empty_db = INIT_DOMAINSPLIT_DB().domainsplit_db @@ -47,6 +45,10 @@ main: empty_db, params.url_3did, params.url_negatome, + params.url_uniprot_swissprot_pfam, + params.hippie_tsv, + params.ppidm_tsv, + params.negative_ppi_parquet, ) domainsplit_db_ddi = COLLECT_DDI_DATA.out.domainsplit_db From 36b05f64992449e73bd9a5faace21fefa24eb2b7 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 19:16:31 +0200 Subject: [PATCH 02/16] updated mmseqs, made the negative sampling use dans --- bin/build_ppi_negative_ddis.py | 143 ++++++++---------- modules.json | 2 +- .../insert_ppi_negative_ddis/environment.yml | 1 + .../local/insert_ppi_negative_ddis/main.nf | 3 +- modules/nf-core/mmseqs/easycluster/main.nf | 12 +- modules/nf-core/mmseqs/easycluster/meta.yml | 30 +++- .../easycluster/tests/main.nf.test.snap | 36 +++-- nextflow.config | 4 +- nextflow_schema.json | 7 - subworkflows/local/collect_ddi_data/main.nf | 1 - 10 files changed, 114 insertions(+), 125 deletions(-) diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_ddis.py index aae31d2..366ae06 100755 --- a/bin/build_ppi_negative_ddis.py +++ b/bin/build_ppi_negative_ddis.py @@ -2,22 +2,27 @@ """ Build negative DDIs from a Y2H/MS PPI parquet and append them to the domainsplit SQLite, restricted to Pfam domains already present in positive -DDIs. In degree_matched mode, selects pairs so each domain's negative -degree matches its positive degree. 
In frequency mode, takes the top-N -by PPI co-occurrence count, capped at (n_positive - n_negatome). +DDIs. + +Selection uses degree-aware node sampling (DANS, Cappelletti et al. 2024, +Bioinformatics Advances vbae036) applied to the PPI-derived candidate pool: +each candidate Pfam pair is sampled (without replacement) with probability +proportional to the preferential attachment of its two domains in the positive +graph -- the product of their positive degrees. This makes the negative +degree / preferential-attachment distribution track the positives, avoiding the +inflated downstream evaluation that uniform negative sampling produces. """ import argparse -import heapq import itertools import json -import random import sqlite3 import sys import time import math from collections import defaultdict +import numpy as np import pyarrow.parquet as pq import requests @@ -42,11 +47,10 @@ def parse_args(): p.add_argument("--min-n-tested", type=int, required=True) p.add_argument("--source-label", default="inferred_ppi_screen_negative") p.add_argument( - "--sampling-strategy", - choices=["frequency", "degree_matched"], - default="degree_matched", - help="'frequency' = top-N by co-occurrence (old behavior). " - "'degree_matched' = sample to match positive degree distribution.", + "--seed", + type=int, + default=42, + help="Random seed for reproducible DANS negative sampling.", ) p.add_argument( "--no-self", @@ -227,65 +231,52 @@ def _compute_positive_degree(conn): return deg -def select_degree_matched(fresh_candidates, pos_degree, n_take): - """Select negatives so each domain's negative degree matches its positive degree. +def select_dans(fresh_candidates, pos_degree, n_take, seed=42): + """Degree-aware node sampling (DANS) over the PPI candidate pool. - Uses a lazy-deletion max-heap scored by combined degree deficit of both - domains in each candidate pair. Candidates are shuffled for random - tiebreaking among equal-deficit pairs. + Cappelletti et al. 
2024 (Bioinformatics Advances, vbae036) sample negative + edges so endpoint node-degrees track the positive distribution; the induced + edge probability is proportional to the preferential attachment of the two + endpoints, ``PA = deg(a) * deg(b)``. Here we apply that distribution to the + fixed pool of PPI-derived candidate Pfam pairs: each candidate is drawn + without replacement with probability proportional to the product of its two + domains' positive degrees, so the selected negatives mirror the positive + degree / PA distribution while staying biologically grounded in the PPI + screen. """ - if not fresh_candidates or n_take <= 0: + if n_take <= 0 or not fresh_candidates: return [] - if not pos_degree: - return fresh_candidates[:n_take] + if n_take >= len(fresh_candidates): + return list(fresh_candidates) - target = dict(pos_degree) - current = defaultdict(int) - - candidates = list(fresh_candidates) - random.shuffle(candidates) - - remaining = set(range(len(candidates))) - - def deficit(pfam): - return max(0, target.get(pfam, 0) - current[pfam]) - - def score(i): - (pfam_a, pfam_b), _ = candidates[i] - return deficit(pfam_a) + deficit(pfam_b) - - heap = [(-score(i), i) for i in range(len(candidates))] - heapq.heapify(heap) - - chosen = [] - while len(chosen) < n_take and heap: - neg_s, i = heapq.heappop(heap) - if i not in remaining: - continue - - actual = score(i) - if actual != -neg_s: - if actual > 0: - heapq.heappush(heap, (-actual, i)) - else: - remaining.discard(i) - continue - - if actual <= 0: - break - - (pfam_a, pfam_b), count = candidates[i] - chosen.append(((pfam_a, pfam_b), count)) - remaining.discard(i) - current[pfam_a] += 1 - current[pfam_b] += 1 - - matched = sum(1 for p in target if current.get(p, 0) >= target[p]) - over = sum(1 for p in target if current.get(p, 0) > target[p]) - total_deficit = sum(max(0, target[p] - current.get(p, 0)) for p in target) - log(f"degree_matched: {matched}/{len(target)} domains reached target degree") - 
log(f"degree_matched: {over} domains exceeded target degree") - log(f"degree_matched: remaining total deficit = {total_deficit}") + weights = np.array( + [pos_degree.get(a, 0) * pos_degree.get(b, 0) for (a, b), _ in fresh_candidates], + dtype=float, + ) + # Defensive fallback: if positive-degree info is missing/degenerate (or too + # few non-zero weights to draw n_take distinct pairs), sample uniformly. + if weights.sum() <= 0 or int((weights > 0).sum()) < n_take: + log("DANS: degenerate weights -> uniform fallback") + weights = np.ones(len(fresh_candidates), dtype=float) + + probs = weights / weights.sum() + rng = np.random.default_rng(seed) + idx = rng.choice(len(fresh_candidates), size=n_take, replace=False, p=probs) + + chosen = [fresh_candidates[i] for i in idx] + + def pa(a, b): + return pos_degree.get(a, 0) * pos_degree.get(b, 0) + + chosen_degree = defaultdict(int) + for (a, b), _ in chosen: + chosen_degree[a] += 1 + chosen_degree[b] += 1 + pool_pa = float(np.mean([pa(a, b) for (a, b), _ in fresh_candidates])) + sel_pa = float(np.mean([pa(a, b) for (a, b), _ in chosen])) + log(f"DANS: selected {len(chosen)} negatives from pool of {len(fresh_candidates)}") + log(f"DANS: {len(chosen_degree)} domains used ({len(pos_degree)} in the positive set); " + f"mean PA pool={pool_pa:.1f} selected={sel_pa:.1f}") return chosen @@ -375,18 +366,7 @@ def row_pfams(gene): "SELECT COUNT(*) FROM domain_domain_interaction " "WHERE negative = 0 AND source = '3did'" ).fetchone()[0] - n_negatome = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction " - "WHERE negative = 1 AND source = 'negatome'" - ).fetchone()[0] log(f"n_positive_ddis_in_db = {n_positive}") - log(f"n_negatome_negatives_in_db = {n_negatome}") - if args.sampling_strategy == "degree_matched": - n_take = n_positive - log(f"n_take = {n_take} (degree_matched: matching positive count)") - else: - n_take = max(0, n_positive - n_negatome) - log(f"n_take (target for source='{args.source_label}') = {n_take}") 
fresh_candidates = [] n_positive_ddis_in_negative_ppis = 0 @@ -399,16 +379,11 @@ def row_pfams(gene): log(f"n_positive_ddis_in_negative_ppis = {n_positive_ddis_in_negative_ppis}") - fresh_candidates.sort(key=lambda kv: kv[1], reverse=True) log(f"n_fresh_candidates_after_dedup = {len(fresh_candidates)}") - if args.sampling_strategy == "degree_matched": - log("using degree-matched sampling strategy") - pos_degree = _compute_positive_degree(conn) - chosen = select_degree_matched(fresh_candidates, pos_degree, n_take) - else: - log("using frequency-ranked sampling strategy") - chosen = fresh_candidates[:n_take] + log("selecting negatives via degree-aware node sampling (DANS)") + pos_degree = _compute_positive_degree(conn) + chosen = select_dans(fresh_candidates, pos_degree, n_positive, seed=args.seed) log(f"n_chosen = {len(chosen)}") if chosen: diff --git a/modules.json b/modules.json index 6e03911..7c9f7ff 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "mmseqs/easycluster": { "branch": "master", - "git_sha": "38697a933bef7041bb935c9b8374d9948ce6c794", + "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120", "installed_by": ["modules"] } } diff --git a/modules/local/insert_ppi_negative_ddis/environment.yml b/modules/local/insert_ppi_negative_ddis/environment.yml index 8b9582f..61b1f9f 100644 --- a/modules/local/insert_ppi_negative_ddis/environment.yml +++ b/modules/local/insert_ppi_negative_ddis/environment.yml @@ -6,4 +6,5 @@ dependencies: - sqlite - pandas - pyarrow + - numpy - requests diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/insert_ppi_negative_ddis/main.nf index ba4792c..3dc3998 100644 --- a/modules/local/insert_ppi_negative_ddis/main.nf +++ b/modules/local/insert_ppi_negative_ddis/main.nf @@ -8,7 +8,6 @@ process INSERT_PPI_NEGATIVE_DDIS { path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' path negative_ppi_parquet val min_n_tested - val sampling_strategy val self_interaction output: @@ -26,13 
+25,13 @@ process INSERT_PPI_NEGATIVE_DDIS { --parquet "${negative_ppi_parquet}" \\ --pfam-mapping-out uniprot_pfam_mapping.json \\ --min-n-tested ${min_n_tested} \\ - --sampling-strategy "${sampling_strategy}" \\ ${no_self} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python3 -c 'import sys; print(sys.version.split()[0])') pyarrow: \$(python3 -c 'import pyarrow; print(pyarrow.__version__)') + numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ diff --git a/modules/nf-core/mmseqs/easycluster/main.nf b/modules/nf-core/mmseqs/easycluster/main.nf index b4686ab..ded1cb8 100644 --- a/modules/nf-core/mmseqs/easycluster/main.nf +++ b/modules/nf-core/mmseqs/easycluster/main.nf @@ -3,7 +3,7 @@ process MMSEQS_EASYCLUSTER { label 'process_medium' conda "${moduleDir}/environment.yml" - container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fe/fe49c17754753d6cd9a31e5894117edaf1c81e3d6053a12bf6dc8f3af1dffe23/data' : 'community.wave.seqera.io/library/mmseqs2:18.8cc5c--af05c9a98d9f6139'}" @@ -14,7 +14,7 @@ process MMSEQS_EASYCLUSTER { tuple val(meta), path("*rep_seq.fasta"), emit: representatives tuple val(meta), path("*all_seqs.fasta"), emit: fasta tuple val(meta), path("*.tsv"), emit: tsv - path "versions.yml", emit: versions + tuple val("${task.process}"), val('mmseqs'), eval('mmseqs version'), topic: versions, emit: versions_mmseqs when: task.ext.when == null || task.ext.when @@ -31,10 +31,6 @@ process MMSEQS_EASYCLUSTER { ${args} \\ --threads ${task.cpus} - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') - END_VERSIONS """ stub: @@ -47,9 +43,5 @@ process MMSEQS_EASYCLUSTER { touch ${prefix}_rep_seq.fasta touch ${prefix}_all_seqs.fasta - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') - END_VERSIONS """ } diff --git a/modules/nf-core/mmseqs/easycluster/meta.yml b/modules/nf-core/mmseqs/easycluster/meta.yml index 4451857..0b838ec 100644 --- a/modules/nf-core/mmseqs/easycluster/meta.yml +++ b/modules/nf-core/mmseqs/easycluster/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "mmseqs_easycluster" description: Cluster sequences using MMSeqs2 easy cluster. 
keywords: @@ -15,7 +14,8 @@ tools: documentation: "https://mmseqs.com/latest/userguide.pdf" tool_dev_url: "https://github.com/soedinglab/MMseqs2" doi: "10.1093/bioinformatics/btw006" - licence: ["GPL v3"] + licence: + - "GPL v3" identifier: biotools:mmseqs input: - - meta: @@ -62,13 +62,27 @@ output: description: an adjacency list file containing the clusters ontologies: - edam: http://edamontology.org/format_3475 # TSV + versions_mmseqs: + - - ${task.process}: + type: string + description: The name of the process + - mmseqs: + type: string + description: The name of the tool + - mmseqs version: + type: eval + description: The expression to obtain the version of the tool +topics: versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML + - - ${task.process}: + type: string + description: The name of the process + - mmseqs: + type: string + description: The name of the tool + - mmseqs version: + type: eval + description: The expression to obtain the version of the tool authors: - "@Joon-Klaps" maintainers: diff --git a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap index edb2c7e..5d27d6e 100644 --- a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap +++ b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap @@ -30,7 +30,11 @@ ] ], "3": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ], "fasta": [ [ @@ -59,16 +63,20 @@ "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "versions": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + "versions_mmseqs": [ + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ] } ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.10.0" + "nextflow": "25.10.4" }, - "timestamp": "2025-11-01T16:21:16.919838587" + "timestamp": 
"2026-02-12T11:27:50.850138372" }, "mmseqs/easycluster - sarscov2 - proteome": { "content": [ @@ -101,7 +109,11 @@ ] ], "3": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ], "fasta": [ [ @@ -130,15 +142,19 @@ "test_cluster.tsv:md5,1cad5ce35cf71f8c438fd3ec5a786946" ] ], - "versions": [ - "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13" + "versions_mmseqs": [ + [ + "MMSEQS_EASYCLUSTER", + "mmseqs", + "18.8cc5c" + ] ] } ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.10.0" + "nextflow": "25.10.4" }, - "timestamp": "2025-11-01T16:21:12.483762944" + "timestamp": "2026-02-12T11:27:44.451570131" } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 97a108c..53ce85f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,12 +66,12 @@ params { // gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA filters // rows by `n_tested`, maps genes to UniProt then Pfam (via the UniProt REST // API), enumerates the Pfam-pair cross product restricted to Pfam IDs already - // present in positive DDIs, samples pairs (frequency or degree-matched), and + // present in positive DDIs, samples pairs by degree-aware node sampling + // (DANS; weight = preferential attachment in the positive graph), and // inserts them as negatives so the total negative count matches the positive // count. Required input (no default; must be supplied per run). negative_ppi_parquet = null negative_ppi_min_n_tested = 5 - negative_sampling_strategy = 'degree_matched' // 'frequency' or 'degree_matched' // Reviewed-human UniProt -> Pfam stream used to detect single-domain // proteins (accession, entry name, gene names, Pfam xrefs). 
diff --git a/nextflow_schema.json b/nextflow_schema.json index e74321b..ddf5853 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -177,13 +177,6 @@ "minimum": 1, "fa_icon": "fas fa-layer-group" }, - "negative_sampling_strategy": { - "type": "string", - "description": "Strategy for sampling negative DDIs from the PPI parquet source.", - "default": "degree_matched", - "enum": ["frequency", "degree_matched"], - "fa_icon": "fas fa-random" - }, "url_uniprot_swissprot_pfam": { "type": "string", "description": "UniProt stream URL (or local TSV path) of reviewed human proteins with fields accession, entry name, gene names and Pfam xrefs. Used to detect single-domain proteins for the HIPPIE-derived positive DDIs.", diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 6c3d9da..3c377f4 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -77,7 +77,6 @@ workflow COLLECT_DDI_DATA { domainsplit_db, file(negative_ppi_parquet), params.negative_ppi_min_n_tested, - params.negative_sampling_strategy, params.self_interaction, ) domainsplit_db = ppi_result.domainsplit_db From 33b58944ad02f05a2b2c90b8b8bf53032dc42e75 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 22:26:58 +0200 Subject: [PATCH 03/16] fixed warnings --- modules.json | 4 +- nextflow_schema.json | 1 + subworkflows/local/collect_ddi_data/meta.yml | 59 +++++++++++++++++++ subworkflows/local/curate_domains/meta.yml | 30 ++++++++++ .../local/enrich_ddi_database/meta.yml | 58 ++++++++++++++++++ .../local/generate_embeddings/meta.yml | 29 +++++++++ .../local/split_domainsplit_database/meta.yml | 8 +-- .../utils_nfcore_domainsplit_pipeline/main.nf | 5 +- .../meta.yml | 28 +++++++++ .../nf-core/utils_nextflow_pipeline/main.nf | 20 +++++-- .../nf-core/utils_nfschema_plugin/main.nf | 12 ++-- .../nf-core/utils_nfschema_plugin/meta.yml | 24 ++++++++ 
.../utils_nfschema_plugin/tests/main.nf.test | 5 ++ .../tests/nextflow.config | 2 +- 14 files changed, 266 insertions(+), 19 deletions(-) create mode 100644 subworkflows/local/collect_ddi_data/meta.yml create mode 100644 subworkflows/local/curate_domains/meta.yml create mode 100644 subworkflows/local/enrich_ddi_database/meta.yml create mode 100644 subworkflows/local/generate_embeddings/meta.yml create mode 100644 subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml diff --git a/modules.json b/modules.json index 7c9f7ff..5595224 100644 --- a/modules.json +++ b/modules.json @@ -16,7 +16,7 @@ "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "git_sha": "1a545fcbd762911c21a64ced3dbef99b2b51ac75", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { @@ -26,7 +26,7 @@ }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "fdc08b8b1ae74f56686ce21f7ea11ad11990ce57", + "git_sha": "a7b27fd25bfa8dcc07d299e88bd790585901a436", "installed_by": ["subworkflows"] } } diff --git a/nextflow_schema.json b/nextflow_schema.json index ddf5853..5dd15a9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,6 +15,7 @@ "input": { "type": "string", "format": "file-path", + "mimetype": "text/csv", "description": "Unused samplesheet placeholder; this pipeline reads inputs from the url_* parameters instead.", "help_text": "Kept for nf-core template compatibility. 
Source databases are configured via the url_* parameters in nextflow.config.", "fa_icon": "fas fa-file-csv", diff --git a/subworkflows/local/collect_ddi_data/meta.yml b/subworkflows/local/collect_ddi_data/meta.yml new file mode 100644 index 0000000..750ebd0 --- /dev/null +++ b/subworkflows/local/collect_ddi_data/meta.yml @@ -0,0 +1,59 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "collect_ddi_data" +description: Download and parse every DDI source into the pre-initialised Domainsplit SQLite, applying the fixed source-priority order and optional smoke filter. +keywords: + - ddi + - 3did + - negatome + - ppi + - database +components: + - download/3did/sqlite + - download/negatome + - insert/3did + - build/swissprot/pfam/map + - insert/single/domain/ppi + - insert/ppidm + - insert/negatome + - remove/self/interactions + - insert/ppi/negative/ddis + - smoke/filter +input: + - domainsplit_db_in: + type: file + description: Pre-initialised empty Domainsplit SQLite database. + pattern: "*.sqlite3" + - url_3did: + type: string + description: URL of the 3did flat file dump. + - url_negatome: + type: string + description: URL of the Negatome combined dataset. + - url_uniprot_swissprot_pfam: + type: string + description: URL of the UniProt SwissProt-to-Pfam mapping used to build single-domain PPIs. + - hippie_tsv: + type: file + description: HIPPIE PPI table used to derive single-domain positive DDIs. + pattern: "*.{tsv,txt}" + - ppidm_tsv: + type: file + description: PPIDM inferred domain-domain interaction table. + pattern: "*.{tsv,txt}" + - negative_ppi_parquet: + type: file + description: High-confidence non-PPI pairs used to derive negative DDIs. + pattern: "*.parquet" +output: + - domainsplit_db: + type: file + description: Domainsplit SQLite populated with positive and negative DDIs. 
+ pattern: "*.sqlite3" + - pfam_mapping: + type: file + description: SwissProt protein-to-Pfam domain mapping produced while building single-domain PPIs. + pattern: "*.{tsv,parquet}" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/curate_domains/meta.yml b/subworkflows/local/curate_domains/meta.yml new file mode 100644 index 0000000..97c2e04 --- /dev/null +++ b/subworkflows/local/curate_domains/meta.yml @@ -0,0 +1,30 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "curate_domains" +description: Enumerate the unique Pfam domains referenced by the DDI set, download their Pfam alignments, and build a protein-to-domain map. +keywords: + - pfam + - domain + - alignment + - mapping +components: + - extract/unique/domains + - download/pfam/alignments/batch + - create/protein/domain/mapping +input: + - domainsplit_db: + type: file + description: Domainsplit SQLite after DDI collection and smoke filtering. + pattern: "*.sqlite3" + - input_uniprot_id_mapping: + type: file + description: UniProt ID mapping used to associate proteins with Pfam domains. + pattern: "*.{tsv,dat,gz}" +output: + - protein_domain_map: + type: file + description: Protein-to-Pfam-domain mapping derived from the Pfam alignments. 
+ pattern: "*.{tsv,parquet}" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/enrich_ddi_database/meta.yml b/subworkflows/local/enrich_ddi_database/meta.yml new file mode 100644 index 0000000..fdae557 --- /dev/null +++ b/subworkflows/local/enrich_ddi_database/meta.yml @@ -0,0 +1,58 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "enrich_ddi_database" +description: Sequentially annotate the DDI database with domain GO terms, proteins plus per-residue embeddings, protein GO terms, STRING PPIs, and the per-domain protein/embedding map. +keywords: + - ddi + - go + - embeddings + - ppi + - annotation +components: + - insert/domain/go/terms + - insert/proteins/with/embeddings + - insert/protein/go/terms + - insert/ppi + - insert/domain/protein/mapping +input: + - domainsplit_db_in: + type: file + description: Domainsplit SQLite with curated domains. + pattern: "*.sqlite3" + - input_pfam2go: + type: file + description: pfam2go mapping of Pfam domains to GO terms. + - input_uniprot_sequences: + type: file + description: UniProt protein sequences. + pattern: "*.{fasta,fa,gz}" + - protein_domain_map: + type: file + description: Protein-to-Pfam-domain mapping from CURATE_DOMAINS. + - prott5_embeddings: + type: file + description: ProtT5 per-residue protein embeddings. + pattern: "*.h5" + - input_uniprot_go_terms: + type: file + description: UniProt protein-to-GO-term annotations. + - input_string: + type: file + description: STRING protein-protein interaction table. + - input_uniprot_id_mapping: + type: file + description: UniProt ID mapping linking STRING IDs to UniProt accessions. + - esm_protein_embeddings: + type: file + description: ESM per-residue protein embeddings. + - esm_domain_embeddings: + type: file + description: ESM pooled per-domain embeddings. 
+output: + - domainsplit_db: + type: file + description: Fully enriched Domainsplit SQLite database. + pattern: "*.sqlite3" +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/generate_embeddings/meta.yml b/subworkflows/local/generate_embeddings/meta.yml new file mode 100644 index 0000000..dfd0981 --- /dev/null +++ b/subworkflows/local/generate_embeddings/meta.yml @@ -0,0 +1,29 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "generate_embeddings" +description: Generate ESM per-residue protein embeddings and pooled per-domain embeddings against the supplied protein-to-domain map. +keywords: + - esm + - embeddings + - protein + - domain +components: + - generate/esm/embeddings +input: + - protein_domain_map: + type: file + description: Protein-to-Pfam-domain mapping used to pool per-domain embeddings. + - input_uniprot_sequences: + type: file + description: UniProt protein sequences to embed. + pattern: "*.{fasta,fa,gz}" +output: + - esm_protein_embeddings: + type: file + description: ESM per-residue protein embeddings. + - esm_domain_embeddings: + type: file + description: ESM pooled per-domain embeddings. 
+authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/local/split_domainsplit_database/meta.yml b/subworkflows/local/split_domainsplit_database/meta.yml index f2212c8..7298266 100644 --- a/subworkflows/local/split_domainsplit_database/meta.yml +++ b/subworkflows/local/split_domainsplit_database/meta.yml @@ -7,10 +7,10 @@ keywords: - clustering components: - mmseqs/easycluster - - random_ddi_split - - random_denoise_split - - minimal_leakage_split - - split_database + - random/ddi/split + - extract/domain/sequences + - minimal/leakage/split/domain + - subset/ddis/by/source input: - domainsplit_db_ch: type: file diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf index 5ba5d41..2f778e3 100644 --- a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf @@ -10,7 +10,6 @@ include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' @@ -137,7 +136,6 @@ workflow PIPELINE_COMPLETION { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ @@ -149,7 +147,6 @@ def toolCitationText() { } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. 
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ @@ -182,7 +179,7 @@ def methodsDescriptionText(mqc_methods_yaml) { meta["tool_citations"] = "" meta["tool_bibliography"] = "" - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + // Uncomment below once logic in toolCitationText/toolBibliographyText has been filled. // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") // meta["tool_bibliography"] = toolBibliographyText() diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml new file mode 100644 index 0000000..5f3bc49 --- /dev/null +++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml @@ -0,0 +1,28 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "PIPELINE_INITIALISATION" +description: Subworkflow with functionality specific to the daisybio/domainsplit pipeline (initialisation and completion). 
+keywords: + - utility + - pipeline + - initialise + - completion +components: + - utils_nextflow_pipeline + - utils_nfcore_pipeline + - utils_nfschema_plugin + - completionemail + - completionsummary +input: + - nextflow_cli_args: + type: list + description: | + Nextflow CLI positional arguments +output: + - success: + type: boolean + description: | + Dummy output to indicate success +authors: + - "@KonstantinPelz" +maintainers: + - "@KonstantinPelz" diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index d6e593e..37939ac 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -73,11 +73,23 @@ def getWorkflowVersion() { def dumpParametersToJSON(outdir) { def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') def filename = "params_${timestamp}.json" - def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") - def jsonStr = groovy.json.JsonOutput.toJson(params) + def temp_pf = workflow.launchDir.resolve(".${filename}") + def jsonGenerator = new groovy.json.JsonGenerator.Options() + .excludeNulls() + .addConverter(Path) { Path path -> path.toUriString() } + .addConverter(Duration) { Duration duration -> duration.toMillis() } + .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() } + .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() } + .build() + def jsonStr = jsonGenerator.toJson(params) temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - - nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") + if (outdir instanceof Path) { + temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}")) + } else if (outdir instanceof String) { + temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json") + } else { + log.warn("Could not determine type of outdir, parameters 
JSON file will not be copied to output directory!") + } temp_pf.delete() } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 1df8b76..9ff0681 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -22,6 +22,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { before_text // string: text to show before the help message and parameters summary after_text // string: text to show after the help message and parameters summary command // string: an example command of the pipeline + cli_typecast // boolean: whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour main: @@ -34,11 +35,11 @@ workflow UTILS_NFSCHEMA_PLUGIN { fullHelp: help_full, ] if(parameters_schema) { - help_options << [parametersSchema: parameters_schema] + help_options << [parameters_schema: parameters_schema] } log.info paramsHelp( help_options, - (params.help instanceof String && params.help != "true") ? params.help : "", + (help instanceof String && help != "true") ? 
help : "", ) exit 0 } @@ -50,7 +51,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { summary_options = [:] if(parameters_schema) { - summary_options << [parametersSchema: parameters_schema] + summary_options << [parameters_schema: parameters_schema] } log.info before_text log.info paramsSummaryLog(summary_options, input_workflow) @@ -63,7 +64,10 @@ workflow UTILS_NFSCHEMA_PLUGIN { if(validate_params) { validateOptions = [:] if(parameters_schema) { - validateOptions << [parametersSchema: parameters_schema] + validateOptions << [parameters_schema: parameters_schema] + } + if(cli_typecast != null) { + validateOptions << [cast_cli_params: cli_typecast] } validateParameters(validateOptions) } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml index f7d9f02..1d8c75a 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml +++ b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml @@ -25,6 +25,30 @@ input: option. When this input is empty it will automatically use the configured schema or "${projectDir}/nextflow_schema.json" as default. The schema should not be given in this way for meta pipelines. + - help: + type: boolean, string + description: | + Show the help message and exit. When a parameter name is given, show the help message for that parameter instead of the general help message. + - help_full: + type: boolean + description: Show the full help message and exit. + - show_hidden: + type: boolean + description: Show hidden parameters in the help message. + - before_text: + type: string + description: Text to show before the parameters summary and help message. + - after_text: + type: string + description: Text to show after the parameters summary and help message. + - command: + type: string + description: An example command to run the pipeline, to show in the help message and the summary. 
+ - cli_typecast: + type: boolean + description: | + Whether to apply typecasting to the parameters given via the CLI before validation. + Set this to `null` to use the default behavior. output: - dummy_emit: type: boolean diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index c977917..1fd1eac 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -31,6 +31,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -63,6 +64,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -95,6 +97,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -127,6 +130,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -160,6 +164,7 @@ nextflow_workflow { input[6] = "Before" input[7] = "After" input[8] = "nextflow run test/test" + input[9] = null """ } } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index f6537cc..fd71cb8 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,5 +1,5 @@ plugins { - id "nf-schema@2.6.1" + id "nf-schema@2.7.2" } validation { From d9575d31cecba169a04673d1d662453b14a5128c Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 22:42:34 +0200 Subject: [PATCH 04/16] changed something back written by claude --- .../nf-core/utils_nextflow_pipeline/main.nf | 19 +++---------------- .../nf-core/utils_nfschema_plugin/main.nf | 12 ++++-------- .../utils_nfschema_plugin/tests/main.nf.test | 5 ----- 3 files changed, 7 insertions(+), 29 deletions(-) diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf 
b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index 37939ac..207c487 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -73,23 +73,10 @@ def getWorkflowVersion() { def dumpParametersToJSON(outdir) { def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') def filename = "params_${timestamp}.json" - def temp_pf = workflow.launchDir.resolve(".${filename}") - def jsonGenerator = new groovy.json.JsonGenerator.Options() - .excludeNulls() - .addConverter(Path) { Path path -> path.toUriString() } - .addConverter(Duration) { Duration duration -> duration.toMillis() } - .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() } - .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() } - .build() - def jsonStr = jsonGenerator.toJson(params) + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = groovy.json.JsonOutput.toJson(params) temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - if (outdir instanceof Path) { - temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}")) - } else if (outdir instanceof String) { - temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json") - } else { - log.warn("Could not determine type of outdir, parameters JSON file will not be copied to output directory!") - } + nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") temp_pf.delete() } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 9ff0681..1df8b76 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -22,7 +22,6 @@ workflow UTILS_NFSCHEMA_PLUGIN { before_text // string: text to show before the help message and parameters summary after_text // string: text to show after the help 
message and parameters summary command // string: an example command of the pipeline - cli_typecast // boolean: whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour main: @@ -35,11 +34,11 @@ workflow UTILS_NFSCHEMA_PLUGIN { fullHelp: help_full, ] if(parameters_schema) { - help_options << [parameters_schema: parameters_schema] + help_options << [parametersSchema: parameters_schema] } log.info paramsHelp( help_options, - (help instanceof String && help != "true") ? help : "", + (params.help instanceof String && params.help != "true") ? params.help : "", ) exit 0 } @@ -51,7 +50,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { summary_options = [:] if(parameters_schema) { - summary_options << [parameters_schema: parameters_schema] + summary_options << [parametersSchema: parameters_schema] } log.info before_text log.info paramsSummaryLog(summary_options, input_workflow) @@ -64,10 +63,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { if(validate_params) { validateOptions = [:] if(parameters_schema) { - validateOptions << [parameters_schema: parameters_schema] - } - if(cli_typecast != null) { - validateOptions << [cast_cli_params: cli_typecast] + validateOptions << [parametersSchema: parameters_schema] } validateParameters(validateOptions) } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index 1fd1eac..c977917 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -31,7 +31,6 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" - input[9] = null """ } } @@ -64,7 +63,6 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" - input[9] = null """ } } @@ -97,7 +95,6 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" - input[9] = null """ } } @@ -130,7 +127,6 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" - input[9] = null 
""" } } @@ -164,7 +160,6 @@ nextflow_workflow { input[6] = "Before" input[7] = "After" input[8] = "nextflow run test/test" - input[9] = null """ } } From b790ec8a29312d1befb994474ac3082d73f2ed40 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 22:52:31 +0200 Subject: [PATCH 05/16] fixed errors --- .../utils_nfcore_domainsplit_pipeline/main.nf | 3 ++- .../nf-core/utils_nextflow_pipeline/main.nf | 19 ++++++++++++++++--- .../nf-core/utils_nfschema_plugin/main.nf | 12 ++++++++---- .../utils_nfschema_plugin/tests/main.nf.test | 5 +++++ 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf index 2f778e3..c09a912 100644 --- a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf @@ -68,7 +68,8 @@ workflow PIPELINE_INITIALISATION { show_hidden, before_text, after_text, - command + command, + null ) // diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index 207c487..37939ac 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -73,10 +73,23 @@ def getWorkflowVersion() { def dumpParametersToJSON(outdir) { def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') def filename = "params_${timestamp}.json" - def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") - def jsonStr = groovy.json.JsonOutput.toJson(params) + def temp_pf = workflow.launchDir.resolve(".${filename}") + def jsonGenerator = new groovy.json.JsonGenerator.Options() + .excludeNulls() + .addConverter(Path) { Path path -> path.toUriString() } + .addConverter(Duration) { Duration duration -> duration.toMillis() } + .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() } + 
.addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() } + .build() + def jsonStr = jsonGenerator.toJson(params) temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") + if (outdir instanceof Path) { + temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}")) + } else if (outdir instanceof String) { + temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json") + } else { + log.warn("Could not determine type of outdir, parameters JSON file will not be copied to output directory!") + } temp_pf.delete() } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 1df8b76..9ff0681 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -22,6 +22,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { before_text // string: text to show before the help message and parameters summary after_text // string: text to show after the help message and parameters summary command // string: an example command of the pipeline + cli_typecast // boolean: whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour main: @@ -34,11 +35,11 @@ workflow UTILS_NFSCHEMA_PLUGIN { fullHelp: help_full, ] if(parameters_schema) { - help_options << [parametersSchema: parameters_schema] + help_options << [parameters_schema: parameters_schema] } log.info paramsHelp( help_options, - (params.help instanceof String && params.help != "true") ? params.help : "", + (help instanceof String && help != "true") ? 
help : "", ) exit 0 } @@ -50,7 +51,7 @@ workflow UTILS_NFSCHEMA_PLUGIN { summary_options = [:] if(parameters_schema) { - summary_options << [parametersSchema: parameters_schema] + summary_options << [parameters_schema: parameters_schema] } log.info before_text log.info paramsSummaryLog(summary_options, input_workflow) @@ -63,7 +64,10 @@ workflow UTILS_NFSCHEMA_PLUGIN { if(validate_params) { validateOptions = [:] if(parameters_schema) { - validateOptions << [parametersSchema: parameters_schema] + validateOptions << [parameters_schema: parameters_schema] + } + if(cli_typecast != null) { + validateOptions << [cast_cli_params: cli_typecast] } validateParameters(validateOptions) } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index c977917..1fd1eac 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -31,6 +31,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -63,6 +64,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -95,6 +97,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -127,6 +130,7 @@ nextflow_workflow { input[6] = "" input[7] = "" input[8] = "" + input[9] = null """ } } @@ -160,6 +164,7 @@ nextflow_workflow { input[6] = "Before" input[7] = "After" input[8] = "nextflow run test/test" + input[9] = null """ } } From 9a4832c4bdf32ed77dee5dd5197af20b4dc0200e Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Tue, 9 Jun 2026 23:51:57 +0200 Subject: [PATCH 06/16] changed two parameters for prott5 into one --- nextflow.config | 9 ++------- nextflow_schema.json | 8 +------- subworkflows/local/enrich_ddi_database/main.nf | 2 +- subworkflows/local/generate_embeddings/main.nf | 2 +- subworkflows/local/split_domainsplit_database/main.nf | 2 +- 
workflows/domainsplit.nf | 11 +---------- 6 files changed, 7 insertions(+), 27 deletions(-) diff --git a/nextflow.config b/nextflow.config index 53ce85f..08bf661 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,11 +10,11 @@ params { // Source database URLs - url_3did = 'https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz' + url_3did = 'https://3did.irbbarcelona.org/download/current/3did.sql.gz' // URLs for uniprot data sources url_uniprot_id_mapping = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz' - url_uniprot_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5' + url_uniprot_prott5_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5' url_uniprot_go_terms = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Cgo_id&format=tsv&query=%28%28database%3AGO%29+AND+%28reviewed%3Atrue%29%29' url_uniprot_sequences = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz' @@ -25,11 +25,6 @@ params { url_pfam_template = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download' - // ProtT5 per-residue embeddings: optional local path to a pre-downloaded - // EBI per-residue.h5 file. When set and the file exists it is used; otherwise - // the embeddings are downloaded from url_uniprot_embeddings (ProtT5 always runs). - prott5_per_residue_h5 = null - // ESM embedding sharding + inference knobs. // The protein FASTA is split into `esm_protein_shards` shards that run in // parallel as GPU tasks (capped by cluster QoS via `maxForks` in slurm.config). 
diff --git a/nextflow_schema.json b/nextflow_schema.json index 5dd15a9..77796a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -55,7 +55,7 @@ "default": "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", "fa_icon": "fas fa-link" }, - "url_uniprot_embeddings": { + "url_uniprot_prott5_embeddings": { "type": "string", "description": "URL of the per-residue ProtT5 UniProt embeddings HDF5.", "default": "https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5", @@ -97,12 +97,6 @@ "default": "https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download", "fa_icon": "fas fa-link" }, - "prott5_per_residue_h5": { - "type": "string", - "description": "Optional local path to a pre-downloaded ProtT5 per-residue HDF5 file. When set and present it is used; otherwise the file is downloaded from url_uniprot_embeddings.", - "default": null, - "fa_icon": "fas fa-file" - }, "esm_protein_shards": { "type": "integer", "description": "Number of FASTA shards for parallel per-residue ESM embedding generation over the protein sequences. 
Each shard runs as one GPU task; cluster QoS caps concurrency (see slurm.config).", diff --git a/subworkflows/local/enrich_ddi_database/main.nf b/subworkflows/local/enrich_ddi_database/main.nf index da9eaeb..56df49b 100644 --- a/subworkflows/local/enrich_ddi_database/main.nf +++ b/subworkflows/local/enrich_ddi_database/main.nf @@ -58,7 +58,7 @@ workflow ENRICH_DDI_DATABASE { db_after_ppi, protein_domain_map, esm_domain_embeddings - ).domainsplit_db.first() + ).domainsplit_db emit: domainsplit_db diff --git a/subworkflows/local/generate_embeddings/main.nf b/subworkflows/local/generate_embeddings/main.nf index 1abbeae..c043aec 100644 --- a/subworkflows/local/generate_embeddings/main.nf +++ b/subworkflows/local/generate_embeddings/main.nf @@ -6,7 +6,7 @@ ESM path produces both per-residue protein embeddings and pooled domain embeddings against the supplied protein <-> domain map. - ProtT5 embeddings are supplied externally via params.prott5_per_residue_h5 + ProtT5 embeddings are supplied externally via params.url_uniprot_prott5_embeddings and resolved in the top-level workflow (domainsplit.nf). 
----------------------------------------------------------------------------*/ diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index fc1ffb9..972b410 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -45,7 +45,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { } clusters = MMSEQS_EASYCLUSTER(cluster_input) - def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }.first() + def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] } def splits = [ ["train", 0.6], diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf index 93f93f4..7f5fe85 100644 --- a/workflows/domainsplit.nf +++ b/workflows/domainsplit.nf @@ -28,16 +28,7 @@ main: input_string = file(params.url_string) input_pfam2go = file(params.url_pfam2go) - // ProtT5 per-residue embeddings: prefer a pre-downloaded local file when it - // exists, otherwise fall back to downloading url_uniprot_embeddings. Always - // populated, so ProtT5 embeddings are a compulsory step. 
- def prott5_file = file(params.url_uniprot_embeddings) - if (params.prott5_per_residue_h5 && file(params.prott5_per_residue_h5).exists()) { - prott5_file = file(params.prott5_per_residue_h5) - log.info "Using local ProtT5 HDF5 at '${params.prott5_per_residue_h5}'" - } else { - log.info "Using ProtT5 HDF5 from url_uniprot_embeddings" - } + def prott5_file = file(params.url_uniprot_prott5_embeddings) empty_db = INIT_DOMAINSPLIT_DB().domainsplit_db From c37ab02cf84a557071b07fadd376037297db00a6 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Wed, 10 Jun 2026 02:28:07 +0200 Subject: [PATCH 07/16] tried new improved method for sampling negative data --- ...ive_ddis.py => build_ppi_negative_pool.py} | 199 +++++----------- bin/insert_ppi_negative_selection.py | 116 +++++++++ bin/select_ppi_negative_dans.py | 225 ++++++++++++++++++ conf/modules.config | 15 +- .../environment.yml | 0 .../main.nf | 8 +- .../environment.yml | 7 + .../insert_ppi_negative_selection/main.nf | 31 +++ .../select_ppi_negative_dans/environment.yml | 6 + .../local/select_ppi_negative_dans/main.nf | 28 +++ nextflow.config | 3 + nextflow_schema.json | 6 + subworkflows/local/collect_ddi_data/main.nf | 33 ++- .../local/split_domainsplit_database/main.nf | 2 +- 14 files changed, 534 insertions(+), 145 deletions(-) rename bin/{build_ppi_negative_ddis.py => build_ppi_negative_pool.py} (64%) create mode 100755 bin/insert_ppi_negative_selection.py create mode 100755 bin/select_ppi_negative_dans.py rename modules/local/{insert_ppi_negative_ddis => build_ppi_negative_pool}/environment.yml (100%) rename modules/local/{insert_ppi_negative_ddis => build_ppi_negative_pool}/main.nf (86%) create mode 100644 modules/local/insert_ppi_negative_selection/environment.yml create mode 100644 modules/local/insert_ppi_negative_selection/main.nf create mode 100644 modules/local/select_ppi_negative_dans/environment.yml create mode 100644 modules/local/select_ppi_negative_dans/main.nf diff --git 
a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_pool.py similarity index 64% rename from bin/build_ppi_negative_ddis.py rename to bin/build_ppi_negative_pool.py index 366ae06..e44fa31 100755 --- a/bin/build_ppi_negative_ddis.py +++ b/bin/build_ppi_negative_pool.py @@ -1,16 +1,20 @@ #!/usr/bin/env python3 """ -Build negative DDIs from a Y2H/MS PPI parquet and append them to the -domainsplit SQLite, restricted to Pfam domains already present in positive -DDIs. - -Selection uses degree-aware node sampling (DANS, Cappelletti et al. 2024, -Bioinformatics Advances vbae036) applied to the PPI-derived candidate pool: -each candidate Pfam pair is sampled (without replacement) with probability -proportional to the preferential attachment of its two domains in the positive -graph -- the product of their positive degrees. This makes the negative -degree / preferential-attachment distribution track the positives, avoiding the -inflated downstream evaluation that uniform negative sampling produces. +Build the candidate pool for negative DDIs from a Y2H/MS PPI parquet and dump it +to ``neg_pool.npz`` for the (seed-dependent) selection step. + +This is the EXPENSIVE, DETERMINISTIC half of negative-DDI construction: it +streams the parquet, maps bait/prey genes to UniProt + Pfam via the UniProt REST +API, and assembles the pool of candidate Pfam pairs (restricted to Pfam domains +that already appear in a 3did positive DDI, and excluding pairs that already +exist as DDIs). It performs NO sampling and NO insertion -- selection fans out +into parallel per-seed jobs (``select_ppi_negative_dans.py``) that read the dump, +and the winning selection is inserted by ``insert_ppi_negative_selection.py``. 
+ +The dump also carries the positive-graph statistics the selector needs: +``pos_degree`` (per-domain positive degree, the cap), the positive-edge +preferential-attachment (PA = deg(a)*deg(b)) array, the target negative count +(``n_positive``) and the positive domain count (``n_positive_domains``). """ import argparse @@ -44,14 +48,9 @@ def parse_args(): p.add_argument("--parquet", required=True) p.add_argument("--pfam-mapping-out", required=True, help="Output path for UniProt -> Pfam JSON mapping") + p.add_argument("--pool-out", required=True, + help="Output path for the candidate-pool .npz dump") p.add_argument("--min-n-tested", type=int, required=True) - p.add_argument("--source-label", default="inferred_ppi_screen_negative") - p.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for reproducible DANS negative sampling.", - ) p.add_argument( "--no-self", action="store_true", @@ -179,6 +178,16 @@ def load_existing_pairs(conn): return {tuple(sorted((a, b), key=pfam_sort_key)) for a, b in cur} +def load_positive_3did_edges(conn): + """The 3did positive DDIs as (pfam_a, pfam_b) pairs.""" + return conn.execute( + "SELECT da.pfam_id, db.pfam_id " + "FROM domain_domain_interaction AS ddi " + "JOIN domain AS da ON da.id = ddi.domain_id_a " + "JOIN domain AS db ON db.id = ddi.domain_id_b " + "WHERE ddi.negative = 0 AND ddi.source = '3did'" + ).fetchall() + def _validate_columns(parquet_schema): available = set(parquet_schema.names) @@ -215,71 +224,6 @@ def _collect_genes_and_pairs(parquet_path, min_n_tested): return n_input, unique_genes, baits, preys -def _compute_positive_degree(conn): - """Per-Pfam degree in the 3did positive DDI set.""" - rows = conn.execute( - "SELECT da.pfam_id, db.pfam_id " - "FROM domain_domain_interaction AS ddi " - "JOIN domain AS da ON da.id = ddi.domain_id_a " - "JOIN domain AS db ON db.id = ddi.domain_id_b " - "WHERE ddi.negative = 0 AND ddi.source = '3did'" - ).fetchall() - deg = defaultdict(int) - for a, b in rows: - 
deg[a] += 1 - deg[b] += 1 - return deg - - -def select_dans(fresh_candidates, pos_degree, n_take, seed=42): - """Degree-aware node sampling (DANS) over the PPI candidate pool. - - Cappelletti et al. 2024 (Bioinformatics Advances, vbae036) sample negative - edges so endpoint node-degrees track the positive distribution; the induced - edge probability is proportional to the preferential attachment of the two - endpoints, ``PA = deg(a) * deg(b)``. Here we apply that distribution to the - fixed pool of PPI-derived candidate Pfam pairs: each candidate is drawn - without replacement with probability proportional to the product of its two - domains' positive degrees, so the selected negatives mirror the positive - degree / PA distribution while staying biologically grounded in the PPI - screen. - """ - if n_take <= 0 or not fresh_candidates: - return [] - if n_take >= len(fresh_candidates): - return list(fresh_candidates) - - weights = np.array( - [pos_degree.get(a, 0) * pos_degree.get(b, 0) for (a, b), _ in fresh_candidates], - dtype=float, - ) - # Defensive fallback: if positive-degree info is missing/degenerate (or too - # few non-zero weights to draw n_take distinct pairs), sample uniformly. 
- if weights.sum() <= 0 or int((weights > 0).sum()) < n_take: - log("DANS: degenerate weights -> uniform fallback") - weights = np.ones(len(fresh_candidates), dtype=float) - - probs = weights / weights.sum() - rng = np.random.default_rng(seed) - idx = rng.choice(len(fresh_candidates), size=n_take, replace=False, p=probs) - - chosen = [fresh_candidates[i] for i in idx] - - def pa(a, b): - return pos_degree.get(a, 0) * pos_degree.get(b, 0) - - chosen_degree = defaultdict(int) - for (a, b), _ in chosen: - chosen_degree[a] += 1 - chosen_degree[b] += 1 - pool_pa = float(np.mean([pa(a, b) for (a, b), _ in fresh_candidates])) - sel_pa = float(np.mean([pa(a, b) for (a, b), _ in chosen])) - log(f"DANS: selected {len(chosen)} negatives from pool of {len(fresh_candidates)}") - log(f"DANS: {len(chosen_degree)} domains used ({len(pos_degree)} in the positive set); " - f"mean PA pool={pool_pa:.1f} selected={sel_pa:.1f}") - return chosen - - def main(): args = parse_args() @@ -310,9 +254,6 @@ def main(): log(f"n_pfam_domains_for_input_proteins = {n_pfam_unique}") conn = sqlite3.connect(args.db) - conn.execute("PRAGMA foreign_keys=ON") - conn.execute("PRAGMA journal_mode=OFF") - conn.execute("PRAGMA synchronous=OFF") pos_pfam = load_3did_pfams(conn) log(f"n_3did_pfams = {len(pos_pfam)}") @@ -362,65 +303,53 @@ def row_pfams(gene): f"(observed in {most_common_count} PPI rows)" ) - n_positive = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction " - "WHERE negative = 0 AND source = '3did'" - ).fetchone()[0] + # Positive 3did graph statistics: degree (the per-domain cap), edge PA, and + # the target negative count. 
+ pos_edges = load_positive_3did_edges(conn) + pos_degree = defaultdict(int) + for a, b in pos_edges: + pos_degree[a] += 1 + pos_degree[b] += 1 + n_positive = len(pos_edges) + n_positive_domains = len(pos_degree) + pos_edge_pa = np.array( + [pos_degree[a] * pos_degree[b] for a, b in pos_edges], dtype=np.int64 + ) log(f"n_positive_ddis_in_db = {n_positive}") + log(f"n_positive_domains = {n_positive_domains}") + log(f"positive mean PA = {float(pos_edge_pa.mean()):.1f}") - fresh_candidates = [] + # Drop candidates that already exist as a DDI (positive or negative). + fresh_pairs = [] n_positive_ddis_in_negative_ppis = 0 - - for key, count in candidate_counts.items(): + for key in candidate_counts: if key in existing_pairs: n_positive_ddis_in_negative_ppis += 1 else: - fresh_candidates.append((key, count)) + fresh_pairs.append(key) log(f"n_positive_ddis_in_negative_ppis = {n_positive_ddis_in_negative_ppis}") - - log(f"n_fresh_candidates_after_dedup = {len(fresh_candidates)}") - - log("selecting negatives via degree-aware node sampling (DANS)") - pos_degree = _compute_positive_degree(conn) - chosen = select_dans(fresh_candidates, pos_degree, n_positive, seed=args.seed) - log(f"n_chosen = {len(chosen)}") - - if chosen: - # Pre-load pfam_id -> domain.id mapping to avoid per-row subqueries - pfam_to_domain_ids = defaultdict(list) - for did, pfam in conn.execute("SELECT id, pfam_id FROM domain"): - pfam_to_domain_ids[pfam].append(did) - log(f"loaded {len(pfam_to_domain_ids)} pfam -> domain mappings") - - insert_rows = [] - for (pfam_a, pfam_b), _ in chosen: - # normalise by Pfam accession number (matching ddi_db_utils.insert_ddis) - # so swapped pairs collapse and dedup consistently with the other sources - for d_a in pfam_to_domain_ids.get(pfam_a, ()): - for d_b in pfam_to_domain_ids.get(pfam_b, ()): - if pfam_sort_key(pfam_a) <= pfam_sort_key(pfam_b): - lo, hi = d_a, d_b - else: - lo, hi = d_b, d_a - insert_rows.append((lo, hi, True, args.source_label)) - - 
conn.executemany( - "INSERT OR IGNORE INTO domain_domain_interaction" - "(domain_id_a, domain_id_b, negative, source) " - "VALUES (?, ?, ?, ?)", - insert_rows, - ) - conn.commit() - log(f"batch-inserted {len(insert_rows)} rows") - - n_inserted = conn.execute( - "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", - (args.source_label,), - ).fetchone()[0] - log(f"n_inserted_for_source = {n_inserted}") + log(f"n_fresh_candidates_after_dedup = {len(fresh_pairs)}") conn.close() + cand_a = np.array([a for a, b in fresh_pairs], dtype=object) + cand_b = np.array([b for a, b in fresh_pairs], dtype=object) + pos_dom = np.array(list(pos_degree.keys()), dtype=object) + pos_deg = np.array([pos_degree[d] for d in pos_dom], dtype=np.int64) + + log(f"writing candidate pool to {args.pool_out}") + np.savez( + args.pool_out, + cand_a=cand_a, + cand_b=cand_b, + pos_dom=pos_dom, + pos_deg=pos_deg, + pos_edge_pa=pos_edge_pa, + n_positive=np.int64(n_positive), + n_positive_domains=np.int64(n_positive_domains), + ) + log("done") + if __name__ == "__main__": main() diff --git a/bin/insert_ppi_negative_selection.py b/bin/insert_ppi_negative_selection.py new file mode 100755 index 0000000..769f127 --- /dev/null +++ b/bin/insert_ppi_negative_selection.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Pick the best per-seed negative-DDI selection and insert it into the domainsplit +SQLite. + +Reads every ``score_*.json`` + ``pairs_*.tsv`` produced by the parallel +``select_ppi_negative_dans.py`` jobs, picks the selection with the lowest +objective ``J`` (ties broken by the smaller seed, so the result is fully +deterministic regardless of which SLURM task finished first), and inserts that +seed's Pfam pairs as negatives via the shared ``ddi_db_utils`` helpers. + +Prints a positive reference line (absolute baseline) followed by one line per +seed and a WINNER line, and writes the same data to a published scores TSV. 
+""" + +import argparse +import glob +import json +import sqlite3 + +from ddi_db_utils import count_source, ensure_domains, insert_ddis + + +TAG = "[neg_insert]" +J_ROUND = 12 + + +def log(msg): + print(f"{TAG} {msg}", flush=True) + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--db", required=True) + p.add_argument("--scores-out", required=True, + help="output consolidated scores TSV path") + p.add_argument("--score-glob", default="score_*.json") + p.add_argument("--pairs-template", default="pairs_{seed}.tsv") + p.add_argument("--source-label", default="inferred_ppi_screen_negative") + return p.parse_args() + + +def read_pairs(path): + pairs = [] + with open(path) as fh: + for line in fh: + line = line.rstrip("\n") + if not line: + continue + a, b = line.split("\t") + pairs.append((a, b)) + return pairs + + +def main(): + args = parse_args() + + score_files = sorted(glob.glob(args.score_glob)) + if not score_files: + raise SystemExit(f"{TAG} no score files matching {args.score_glob}") + + records = [] + for path in score_files: + with open(path) as fh: + records.append(json.load(fh)) + + # Deterministic winner: lowest J (rounded), ties broken by smaller seed. 
+ records.sort(key=lambda r: (round(r["J"], J_ROUND), r["seed"])) + winner = records[0] + winner_seed = winner["seed"] + + pairs_path = args.pairs_template.format(seed=winner_seed) + pairs = read_pairs(pairs_path) + + conn = sqlite3.connect(args.db) + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + + ensure_domains(conn, (p for pair in pairs for p in pair)) + insert_ddis(conn, pairs, negative=True, source=args.source_label) + conn.commit() + n_inserted = count_source(conn, args.source_label) + conn.close() + + # --- report: positive reference, then every seed, then the winner --- + ref = records[0] + log(f"set=positive n_sel={ref['pos_n_sel']} n_dom={ref['pos_n_dom']} " + f"mean_pa={ref['pos_mean_pa']:.1f}") + for r in records: + log(f"set=negative seed={r['seed']} J={r['J']:.4f} pa={r['pa']:.4f} " + f"deg={r['deg']:.4f} cov={r['cov']:.4f} n_sel={r['n_sel']} " + f"n_dom={r['n_dom']} mean_pa={r['mean_pa']:.1f}") + log(f"WINNER seed={winner_seed} J={winner['J']:.4f} n_inserted={n_inserted}") + + cols = ["set", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom", + "mean_pa", "winner"] + with open(args.scores_out, "w") as fh: + fh.write("\t".join(cols) + "\n") + fh.write("\t".join([ + "positive", "NA", "NA", "NA", "NA", "NA", + str(ref["pos_n_sel"]), str(ref["pos_n_dom"]), + f"{ref['pos_mean_pa']:.2f}", "NA", + ]) + "\n") + for r in records: + fh.write("\t".join([ + "negative", str(r["seed"]), + f"{r['J']:.6f}", f"{r['pa']:.6f}", f"{r['deg']:.6f}", + f"{r['cov']:.6f}", str(r["n_sel"]), str(r["n_dom"]), + f"{r['mean_pa']:.2f}", + "1" if r["seed"] == winner_seed else "0", + ]) + "\n") + + +if __name__ == "__main__": + main() diff --git a/bin/select_ppi_negative_dans.py b/bin/select_ppi_negative_dans.py new file mode 100755 index 0000000..c83b3d9 --- /dev/null +++ b/bin/select_ppi_negative_dans.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Select negative DDIs from a candidate pool 
(``neg_pool.npz``) for one random +seed, score the selection, and emit the chosen Pfam pairs + a score JSON. + +Degree-aware node sampling (DANS, Cappelletti et al. 2024, Bioinformatics +Advances vbae036) matched the negative degree distribution to the positives by +sampling edges with probability proportional to preferential attachment +(PA = deg(a)*deg(b)). Applied naively to a fixed candidate pool that already has +the positive mean PA, that overshoots: sampling proportional to PA draws edges +with mean PA = E[PA^2]/E[PA] >> pool mean, concentrating on a few hub domains. + +This selector keeps the PA-proportional draw but adds a hard per-domain CAP at +the positive degree, and refills in multiple passes until the target negative +count (== positive count) is reached: + + * pass 1 draws ``target`` candidates PA-weighted without replacement and adds + them while no endpoint exceeds its cap; + * each refill pass prunes already-picked candidates and any candidate touching + a saturated domain (it would only be skipped again), renormalises the PA + weights over the survivors, and draws the deficit (floored at ``min_batch``); + * the loop stops when ``target`` is hit, or early if the pruned pool can no + longer supply an eligible edge -- that shortfall is the true feasibility + ceiling of the fixed pool. + +Capping pins the negative degree sequence to the positive one (degree +distribution matched, hub-driven mean-PA blow-up impossible); the PA-weighted +draw fills high-degree domains toward their cap first, reproducing the positive +PA distribution; the refill drives the count to the positive total. 
+ +The selection is scored against the positives with a combined objective (lower is +better): J = w_pa*pa + w_deg*deg + w_cov*cov, where + pa = Wasserstein-1 between log1p(PA_neg) and log1p(PA_pos), normalised by the + spread of the positive log1p(PA); + deg = Kolmogorov-Smirnov statistic between the per-domain negative degree + distribution (0 for unused domains) and the positive degree distribution; + cov = 1 - domains_used_neg / domains_pos. +""" + +import argparse +import json + +import numpy as np + + +TAG = "[neg_select]" + + +def log(msg): + print(f"{TAG} {msg}", flush=True) + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--pool", required=True, help="candidate-pool .npz from BUILD step") + p.add_argument("--seed", type=int, required=True) + p.add_argument("--score-out", required=True, help="output score JSON path") + p.add_argument("--pairs-out", required=True, help="output selected-pairs TSV path") + p.add_argument("--w-pa", type=float, default=0.5) + p.add_argument("--w-deg", type=float, default=0.3) + p.add_argument("--w-cov", type=float, default=0.2) + p.add_argument( + "--min-batch", + type=int, + default=20, + help="Minimum candidates drawn per refill pass so the tail keeps progressing.", + ) + return p.parse_args() + + +def wasserstein1(x, y): + """1-Wasserstein distance between two 1D empirical samples (numpy only).""" + x = np.sort(np.asarray(x, dtype=float)) + y = np.sort(np.asarray(y, dtype=float)) + grid = np.concatenate([x, y]) + grid.sort() + cx = np.searchsorted(x, grid, side="right") / x.size + cy = np.searchsorted(y, grid, side="right") / y.size + deltas = np.diff(grid) + return float(np.sum(np.abs(cx[:-1] - cy[:-1]) * deltas)) + + +def ks_statistic(x, y): + """Two-sample Kolmogorov-Smirnov statistic (numpy only).""" + x = np.sort(np.asarray(x, dtype=float)) + y = np.sort(np.asarray(y, dtype=float)) + grid = np.concatenate([x, y]) + grid.sort() + cx = np.searchsorted(x, grid, side="right") / x.size + cy = 
np.searchsorted(y, grid, side="right") / y.size + return float(np.max(np.abs(cx - cy))) + + +def select(cand_ai, cand_bi, pa, cap, target, seed, min_batch): + """PA-weighted, degree-capped, multi-pass refill selection. + + Returns the array of selected candidate indices. + """ + rng = np.random.default_rng(seed) + n_cand = cand_ai.size + remaining = cap.copy() + picked = np.zeros(n_cand, dtype=bool) + selected = [] + + def draw_and_add(batch): + elig = np.flatnonzero( + (~picked) & (remaining[cand_ai] > 0) & (remaining[cand_bi] > 0) + ) + if elig.size == 0: + return 0 + w = pa[elig].astype(float) + total = w.sum() + p = (w / total) if total > 0 else None + k = int(min(batch, elig.size)) + chosen = rng.choice(elig, size=k, replace=False, p=p) + added = 0 + for ci in chosen: + a = cand_ai[ci] + b = cand_bi[ci] + if remaining[a] > 0 and remaining[b] > 0: + selected.append(int(ci)) + picked[ci] = True + remaining[a] -= 1 + remaining[b] -= 1 + added += 1 + if len(selected) >= target: + break + return added + + draw_and_add(target) + n_passes = 1 + while len(selected) < target: + missing = target - len(selected) + added = draw_and_add(max(missing, min_batch)) + n_passes += 1 + if added == 0: + log(f"pool exhausted after {n_passes} passes; " + f"selected {len(selected)}/{target}") + break + + log(f"selection done in {n_passes} passes: {len(selected)}/{target} edges") + return np.array(selected, dtype=np.int64) + + +def main(): + args = parse_args() + + data = np.load(args.pool, allow_pickle=True) + cand_a = data["cand_a"] + cand_b = data["cand_b"] + pos_dom = data["pos_dom"] + pos_deg = data["pos_deg"].astype(np.int64) + pos_edge_pa = data["pos_edge_pa"].astype(np.int64) + n_positive = int(data["n_positive"]) + n_positive_domains = int(data["n_positive_domains"]) + + log(f"pool: {cand_a.size} candidates, {pos_dom.size} positive domains, " + f"target = {n_positive}") + + # Map every domain to an integer index over the positive-domain universe. 
+ domain_index = {d: i for i, d in enumerate(pos_dom)} + cand_ai = np.fromiter((domain_index[a] for a in cand_a), dtype=np.int64, + count=cand_a.size) + cand_bi = np.fromiter((domain_index[b] for b in cand_b), dtype=np.int64, + count=cand_b.size) + cap = pos_deg.copy() + pa = pos_deg[cand_ai] * pos_deg[cand_bi] + + sel = select(cand_ai, cand_bi, pa, cap, n_positive, args.seed, args.min_batch) + + # --- selected-set statistics --- + sel_ai = cand_ai[sel] + sel_bi = cand_bi[sel] + neg_pa = pa[sel] + n_sel = int(sel.size) + mean_pa_neg = float(neg_pa.mean()) if n_sel else 0.0 + + neg_deg = np.zeros(pos_dom.size, dtype=np.int64) + np.add.at(neg_deg, sel_ai, 1) + np.add.at(neg_deg, sel_bi, 1) + n_dom = int(np.count_nonzero(neg_deg)) + + # --- objective --- + pos_logpa = np.log1p(pos_edge_pa.astype(float)) + neg_logpa = np.log1p(neg_pa.astype(float)) + spread = float(pos_logpa.max() - pos_logpa.min()) + pa_term = wasserstein1(neg_logpa, pos_logpa) / spread if spread > 0 else 0.0 + deg_term = ks_statistic(neg_deg, pos_deg) + cov_term = 1.0 - (n_dom / n_positive_domains) if n_positive_domains else 0.0 + j = args.w_pa * pa_term + args.w_deg * deg_term + args.w_cov * cov_term + + pos_mean_pa = float(pos_edge_pa.mean()) + + log(f"set=positive n_sel={n_positive} n_dom={n_positive_domains} " + f"mean_pa={pos_mean_pa:.1f}") + log(f"set=negative seed={args.seed} J={j:.4f} pa={pa_term:.4f} " + f"deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} n_dom={n_dom} " + f"mean_pa={mean_pa_neg:.1f}") + + score = { + "seed": int(args.seed), + "J": j, + "pa": pa_term, + "deg": deg_term, + "cov": cov_term, + "n_sel": n_sel, + "n_dom": n_dom, + "mean_pa": mean_pa_neg, + "pos_n_sel": n_positive, + "pos_n_dom": n_positive_domains, + "pos_mean_pa": pos_mean_pa, + "w_pa": args.w_pa, + "w_deg": args.w_deg, + "w_cov": args.w_cov, + } + with open(args.score_out, "w") as fh: + json.dump(score, fh) + + with open(args.pairs_out, "w") as fh: + for ci in sel: + fh.write(f"{cand_a[ci]}\t{cand_b[ci]}\n") 
+ + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index a91b4f8..aa3bdda 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -29,10 +29,23 @@ process { // default publishDir copy each of them under `insert/` / `smoke/` / // `init/` would create stale duplicates and racy filename collisions. // Disable publishing for these intermediates explicitly. - withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' { + withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|BUILD_PPI_NEGATIVE_POOL|SELECT_PPI_NEGATIVE_DANS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' { publishDir = [ enabled: false ] } + // INSERT_PPI_NEGATIVE_SELECTION's domainsplit.sqlite3 is an intermediate + // (published only via the workflow-level output block), but its + // negative_ppi_seed_scores.tsv diagnostic IS published. + withName: 'INSERT_PPI_NEGATIVE_SELECTION' { + publishDir = [ + path: { "${params.outdir}/negative_ppi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + (filename.equals('versions.yml') || filename.endsWith('.sqlite3')) ? null : filename + } + ] + } + // EXTRACT_UNIQUE_DOMAINS emits a transient pfam_ids.txt consumed only by // DOWNLOAD_PFAM_ALIGNMENTS_BATCH in the same subworkflow. 
withName: 'EXTRACT_UNIQUE_DOMAINS' { diff --git a/modules/local/insert_ppi_negative_ddis/environment.yml b/modules/local/build_ppi_negative_pool/environment.yml similarity index 100% rename from modules/local/insert_ppi_negative_ddis/environment.yml rename to modules/local/build_ppi_negative_pool/environment.yml diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/build_ppi_negative_pool/main.nf similarity index 86% rename from modules/local/insert_ppi_negative_ddis/main.nf rename to modules/local/build_ppi_negative_pool/main.nf index 3dc3998..3ab76ec 100644 --- a/modules/local/insert_ppi_negative_ddis/main.nf +++ b/modules/local/build_ppi_negative_pool/main.nf @@ -1,5 +1,5 @@ -process INSERT_PPI_NEGATIVE_DDIS { - tag "insert_ppi_negative_ddis" +process BUILD_PPI_NEGATIVE_POOL { + tag "build_ppi_negative_pool" label 'process_low' conda "${moduleDir}/environment.yml" container "docker://konstantinpelz/domainsplit-general:1.0.0" @@ -12,6 +12,7 @@ process INSERT_PPI_NEGATIVE_DDIS { output: path "domainsplit.sqlite3", emit: domainsplit_db + path "neg_pool.npz", emit: neg_pool path "uniprot_pfam_mapping.json", emit: pfam_mapping path "versions.yml", emit: versions @@ -20,10 +21,11 @@ process INSERT_PPI_NEGATIVE_DDIS { """ cp "${domainsplit_db_in}" domainsplit.sqlite3 - build_ppi_negative_ddis.py \\ + build_ppi_negative_pool.py \\ --db domainsplit.sqlite3 \\ --parquet "${negative_ppi_parquet}" \\ --pfam-mapping-out uniprot_pfam_mapping.json \\ + --pool-out neg_pool.npz \\ --min-n-tested ${min_n_tested} \\ ${no_self} diff --git a/modules/local/insert_ppi_negative_selection/environment.yml b/modules/local/insert_ppi_negative_selection/environment.yml new file mode 100644 index 0000000..ac84956 --- /dev/null +++ b/modules/local/insert_ppi_negative_selection/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.12 + - sqlite + - numpy diff --git a/modules/local/insert_ppi_negative_selection/main.nf 
b/modules/local/insert_ppi_negative_selection/main.nf new file mode 100644 index 0000000..f31467b --- /dev/null +++ b/modules/local/insert_ppi_negative_selection/main.nf @@ -0,0 +1,31 @@ +process INSERT_PPI_NEGATIVE_SELECTION { + tag "insert_ppi_negative_selection" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' + path score_jsons + path pairs_tsvs + + output: + path "domainsplit.sqlite3", emit: domainsplit_db + path "negative_ppi_seed_scores.tsv", emit: scores + path "versions.yml", emit: versions + + script: + """ + cp "${domainsplit_db_in}" domainsplit.sqlite3 + + insert_ppi_negative_selection.py \\ + --db domainsplit.sqlite3 \\ + --scores-out negative_ppi_seed_scores.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 -c 'import sys; print(sys.version.split()[0])') + sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') + END_VERSIONS + """ +} diff --git a/modules/local/select_ppi_negative_dans/environment.yml b/modules/local/select_ppi_negative_dans/environment.yml new file mode 100644 index 0000000..22cb361 --- /dev/null +++ b/modules/local/select_ppi_negative_dans/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.12 + - numpy diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf new file mode 100644 index 0000000..4163661 --- /dev/null +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -0,0 +1,28 @@ +process SELECT_PPI_NEGATIVE_DANS { + tag "select_ppi_negative_dans:seed=${seed}" + label 'process_low' + conda "${moduleDir}/environment.yml" + container "docker://konstantinpelz/domainsplit-general:1.0.0" + + input: + tuple val(seed), path(neg_pool) + + output: + tuple val(seed), path("score_${seed}.json"), path("pairs_${seed}.tsv"), emit: result + path 
"versions.yml", emit: versions + + script: + """ + select_ppi_negative_dans.py \\ + --pool "${neg_pool}" \\ + --seed ${seed} \\ + --score-out score_${seed}.json \\ + --pairs-out pairs_${seed}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 -c 'import sys; print(sys.version.split()[0])') + numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 08bf661..aa5af2a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,6 +67,9 @@ params { // count. Required input (no default; must be supplied per run). negative_ppi_parquet = null negative_ppi_min_n_tested = 5 + // Base seed for the multi-seed negative-DDI selection: 5 parallel SLURM + // jobs run with seeds base+1..+5 and the best-scoring selection is kept. + negative_ppi_seed = 42 // Reviewed-human UniProt -> Pfam stream used to detect single-domain // proteins (accession, entry name, gene names, Pfam xrefs). diff --git a/nextflow_schema.json b/nextflow_schema.json index 77796a4..ef83a4b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -165,6 +165,12 @@ "minimum": 1, "fa_icon": "fas fa-filter" }, + "negative_ppi_seed": { + "type": "integer", + "description": "Base random seed for the multi-seed negative-DDI selection. 
Five parallel jobs run the degree-capped PA-weighted sampler with seeds base+1..+5; the lowest-scoring (best degree/PA/coverage-matched) selection is inserted.", + "default": 42, + "fa_icon": "fas fa-dice" + }, "pfam_download_batch_size": { "type": "integer", "description": "Number of Pfam IDs grouped into a single download job to reduce scheduler overhead.", diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 3c377f4..9c32b68 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -27,7 +27,9 @@ include { INSERT_SINGLE_DOMAIN_PPI } from '../../../modules/local/insert_single include { INSERT_PPIDM } from '../../../modules/local/insert_ppidm/main.nf' include { INSERT_NEGATOME } from '../../../modules/local/insert_negatome/main.nf' include { REMOVE_SELF_INTERACTIONS } from '../../../modules/local/remove_self_interactions/main.nf' -include { INSERT_PPI_NEGATIVE_DDIS } from '../../../modules/local/insert_ppi_negative_ddis/main.nf' +include { BUILD_PPI_NEGATIVE_POOL } from '../../../modules/local/build_ppi_negative_pool/main.nf' +include { SELECT_PPI_NEGATIVE_DANS } from '../../../modules/local/select_ppi_negative_dans/main.nf' +include { INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf' include { SMOKE_FILTER } from '../../../modules/local/smoke_filter/main.nf' workflow COLLECT_DDI_DATA { @@ -72,15 +74,36 @@ workflow COLLECT_DDI_DATA { domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db } - // 7. high-confidence non-PPI negatives (inferred only over 3did domains) - ppi_result = INSERT_PPI_NEGATIVE_DDIS( + // 7. high-confidence non-PPI negatives (inferred only over 3did domains). 
+ // The expensive, deterministic UniProt fetch + candidate-pool build runs + // once; selection fans out over 5 seeds (base+1..+5) in parallel SLURM + // jobs, and the best-scoring (degree/PA-matched) selection is inserted. + pool = BUILD_PPI_NEGATIVE_POOL( domainsplit_db, file(negative_ppi_parquet), params.negative_ppi_min_n_tested, params.self_interaction, ) - domainsplit_db = ppi_result.domainsplit_db - pfam_mapping = ppi_result.pfam_mapping + + // Pair the single shared pool file with each seed (combine avoids the + // queue-exhaustion that mixing a one-shot channel with a 5-item queue causes). + seeds = Channel.of(1, 2, 3, 4, 5).map { params.negative_ppi_seed + it } + sel = SELECT_PPI_NEGATIVE_DANS(seeds.combine(pool.neg_pool)) + + sel.result + .multiMap { seed, score, pairs -> + scores: score + pairs: pairs + } + .set { selection } + + best = INSERT_PPI_NEGATIVE_SELECTION( + pool.domainsplit_db, + selection.scores.collect(), + selection.pairs.collect(), + ) + domainsplit_db = best.domainsplit_db + pfam_mapping = pool.pfam_mapping if (params.smoke_test_n_ddis != null) { domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index 972b410..4f6e3a7 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -56,7 +56,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { // The two methods that mimic a within-distribution evaluation use only the // "core" sources: 3did positives + high-confidence non-PPI negatives. // 'inferred_ppi_screen_negative' must stay in sync with the --source-label - // default in bin/build_ppi_negative_ddis.py. + // default in bin/insert_ppi_negative_selection.py. def core_sources = ['3did', 'inferred_ppi_screen_negative'] // External-validation test set: held-out sources placed as is. 
From 097bd3bf1f5fdcbffae54a16c237a8e00d496dc0 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Wed, 10 Jun 2026 18:16:21 +0200 Subject: [PATCH 08/16] reworked generating negatives --- bin/build_ppi_negative_pool.py | 57 +++- bin/ddi_db_utils.py | 27 +- bin/insert_ppi_negative_selection.py | 162 ++++++---- bin/select_ppi_negative_dans.py | 278 ++++++++++-------- conf/modules.config | 2 +- modules/local/init_domainsplit_db/main.nf | 2 +- .../insert_ppi_negative_selection/main.nf | 21 +- .../local/select_ppi_negative_dans/main.nf | 16 +- nextflow.config | 4 +- nextflow_schema.json | 2 +- subworkflows/local/collect_ddi_data/main.nf | 44 +-- .../local/split_domainsplit_database/main.nf | 102 ++++--- .../test_insert_ppi_negative_selection.py | 135 +++++++++ tests/python/test_select_ppi_negative_dans.py | 148 ++++++++++ 14 files changed, 725 insertions(+), 275 deletions(-) create mode 100644 tests/python/test_insert_ppi_negative_selection.py create mode 100644 tests/python/test_select_ppi_negative_dans.py diff --git a/bin/build_ppi_negative_pool.py b/bin/build_ppi_negative_pool.py index e44fa31..b2d6b2a 100755 --- a/bin/build_ppi_negative_pool.py +++ b/bin/build_ppi_negative_pool.py @@ -11,10 +11,16 @@ into parallel per-seed jobs (``select_ppi_negative_dans.py``) that read the dump, and the winning selection is inserted by ``insert_ppi_negative_selection.py``. -The dump also carries the positive-graph statistics the selector needs: -``pos_degree`` (per-domain positive degree, the cap), the positive-edge -preferential-attachment (PA = deg(a)*deg(b)) array, the target negative count -(``n_positive``) and the positive domain count (``n_positive_domains``). +The dump carries what both negative-construction methods need (uncapped DANS, +Cappelletti et al. 
vbae036): + * Method 1 "deletion" -- the candidate pool ``cand_a``/``cand_b``, the pool + domain universe ``pool_dom`` and the *reduced* positive degrees + ``pool_deg_r`` (3did positives restricted to pool domains), plus the reduced + positive-edge PA and target count. + * Method 2 "random_addition" -- the full positive edge endpoint multiset + ``pos_a``/``pos_b`` (DANS samples node-pairs proportional to degree from it), + the full positive degrees/PA/count, and the forbidden-pair set + ``forbidden_a``/``forbidden_b`` (all existing DDIs) that DANS must avoid. """ import argparse @@ -334,19 +340,62 @@ def row_pfams(gene): cand_a = np.array([a for a, b in fresh_pairs], dtype=object) cand_b = np.array([b for a, b in fresh_pairs], dtype=object) + + # ---- Method 1 ("deletion"): reduce the positives to the candidate-domain + # universe so positive and candidate domains coincide; DANS then draws + # degree-aware over the fixed candidate pool. ---- + pool_domains = {d for pair in fresh_pairs for d in pair} + pos_edges_r = [ + (a, b) for a, b in pos_edges if a in pool_domains and b in pool_domains + ] + pos_degree_r = defaultdict(int) + for a, b in pos_edges_r: + pos_degree_r[a] += 1 + pos_degree_r[b] += 1 + n_positive_r = len(pos_edges_r) + n_positive_domains_r = len(pos_degree_r) + pos_edge_pa_r = np.array( + [pos_degree_r[a] * pos_degree_r[b] for a, b in pos_edges_r], dtype=np.int64 + ) + # Every pool domain carries its reduced-positive degree (0 if it has no edge + # in the reduced positive graph); the selector turns these into PA weights. + pool_dom = np.array(sorted(pool_domains, key=pfam_sort_key), dtype=object) + pool_deg_r = np.array([pos_degree_r[d] for d in pool_dom], dtype=np.int64) + log(f"n_pool_domains = {len(pool_dom)}") + log(f"n_reduced_positive_ddis = {n_positive_r}") + log(f"n_reduced_positive_domains = {n_positive_domains_r}") + + # ---- Method 2 ("random_addition"): plain DANS over the full positive set. 
+ # The selector samples node-pairs proportional to degree by drawing from + # the endpoint multiset of these edges and rejects existing pairs. ---- + pos_a = np.array([a for a, b in pos_edges], dtype=object) + pos_b = np.array([b for a, b in pos_edges], dtype=object) pos_dom = np.array(list(pos_degree.keys()), dtype=object) pos_deg = np.array([pos_degree[d] for d in pos_dom], dtype=np.int64) + forbidden_a = np.array([a for a, b in existing_pairs], dtype=object) + forbidden_b = np.array([b for a, b in existing_pairs], dtype=object) log(f"writing candidate pool to {args.pool_out}") np.savez( args.pool_out, + # --- Method 1 (deletion) --- cand_a=cand_a, cand_b=cand_b, + pool_dom=pool_dom, + pool_deg_r=pool_deg_r, + pos_edge_pa_r=pos_edge_pa_r, + n_positive_r=np.int64(n_positive_r), + n_positive_domains_r=np.int64(n_positive_domains_r), + # --- Method 2 (random_addition) --- + pos_a=pos_a, + pos_b=pos_b, pos_dom=pos_dom, pos_deg=pos_deg, pos_edge_pa=pos_edge_pa, n_positive=np.int64(n_positive), n_positive_domains=np.int64(n_positive_domains), + forbidden_a=forbidden_a, + forbidden_b=forbidden_b, ) log("done") diff --git a/bin/ddi_db_utils.py b/bin/ddi_db_utils.py index e2c6570..4d451c4 100755 --- a/bin/ddi_db_utils.py +++ b/bin/ddi_db_utils.py @@ -7,8 +7,14 @@ :func:`pfam_sort_key`) so the stored ``(domain_id_a, domain_id_b)`` order is stable -- a pair is deduplicated regardless of the order it is supplied in and regardless of which source inserted it first or in what order domains were -created -- and ``INSERT OR IGNORE`` keeps the earliest (positive *or* negative) -row. +created. + +The table's ``UNIQUE(domain_id_a, domain_id_b, source)`` lets the same pair be +stored under different sources. To keep the historical "earliest source wins" +behaviour for the canonical sources, :func:`insert_ddis` defaults to +``dedup_across_sources=True``, which skips any pair already present under another +source. 
The negative-DDI method copies (which intentionally duplicate a pair +under a new label) pass ``dedup_across_sources=False``. """ import sqlite3 @@ -43,18 +49,31 @@ def _pfam_to_id(conn): return {pfam: did for did, pfam in conn.execute("SELECT id, pfam_id FROM domain")} -def insert_ddis(conn, pairs, negative, source): +def insert_ddis(conn, pairs, negative, source, dedup_across_sources=True): """Insert DDIs for ``(pfam_a, pfam_b)`` pairs. Domains must already exist (call :func:`ensure_domains` first); pairs whose Pfam is missing from the ``domain`` table are skipped. Each pair is stored as ``(min(id), max(id))`` so swapped duplicates collapse onto one row. + With ``dedup_across_sources=True`` (default) a pair already present under any + other source is skipped, preserving the "earliest source wins" semantics for + the canonical sources. Pass ``False`` to allow the pair to be duplicated + under this ``source`` (used by the negative-DDI method copies); same-source + duplicates are still collapsed by ``UNIQUE(domain_id_a, domain_id_b, source)``. + Returns the number of rows offered to ``INSERT OR IGNORE`` (before dedup by the DB). 
""" pfam_to_id = _pfam_to_id(conn) neg = int(bool(negative)) + existing = set() + if dedup_across_sources: + existing = { + (a, b) for a, b in conn.execute( + "SELECT domain_id_a, domain_id_b FROM domain_domain_interaction" + ) + } rows = [] seen = set() for a, b in pairs: @@ -63,7 +82,7 @@ def insert_ddis(conn, pairs, negative, source): if ia is None or ib is None: continue key = (ia, ib) if pfam_sort_key(a) <= pfam_sort_key(b) else (ib, ia) - if key in seen: + if key in seen or key in existing: continue seen.add(key) rows.append((key[0], key[1], neg, source)) diff --git a/bin/insert_ppi_negative_selection.py b/bin/insert_ppi_negative_selection.py index 769f127..bfced58 100755 --- a/bin/insert_ppi_negative_selection.py +++ b/bin/insert_ppi_negative_selection.py @@ -1,28 +1,37 @@ #!/usr/bin/env python3 """ -Pick the best per-seed negative-DDI selection and insert it into the domainsplit -SQLite. - -Reads every ``score_*.json`` + ``pairs_*.tsv`` produced by the parallel -``select_ppi_negative_dans.py`` jobs, picks the selection with the lowest -objective ``J`` (ties broken by the smaller seed, so the result is fully -deterministic regardless of which SLURM task finished first), and inserts that -seed's Pfam pairs as negatives via the shared ``ddi_db_utils`` helpers. - -Prints a positive reference line (absolute baseline) followed by one line per -seed and a WINNER line, and writes the same data to a published scores TSV. +Insert the two negative-DDI construction methods into the domainsplit SQLite, +each under its own ``source`` labels so both can coexist in one database and be +selected independently by the downstream splits. + +Both methods use uncapped Degree-Aware Node Sampling (DANS); the per-seed pairs +are produced by ``select_ppi_negative_dans.py`` (one run per method, no +pick-best). 
This step copies the matching positives and inserts the negatives: + + * "deletion" -- positives = 3did restricted to the candidate-pool domain + universe (``3did_deletion``); negatives = + ``inferred_ppi_screen_negative_for_deletion``. + * "random_addition" -- positives = the full 3did set (``3did_random_addition``); + negatives = ``inferred_ppi_screen_negative_for_random_addition``. + +The four method labels are inserted with ``dedup_across_sources=False`` so they +may duplicate a pair already stored under the canonical ``3did`` source (the +table's ``UNIQUE(domain_id_a, domain_id_b, source)`` keeps the labels distinct). + +Prints a positive reference + the negative selection for each method and writes +the same data to a published scores TSV. """ import argparse -import glob import json import sqlite3 +import numpy as np + from ddi_db_utils import count_source, ensure_domains, insert_ddis TAG = "[neg_insert]" -J_ROUND = 12 def log(msg): @@ -32,11 +41,21 @@ def log(msg): def parse_args(): p = argparse.ArgumentParser() p.add_argument("--db", required=True) + p.add_argument("--pool", required=True, + help="candidate-pool .npz (for the pool-domain universe)") + p.add_argument("--pairs-deletion", required=True) + p.add_argument("--pairs-random-addition", required=True) + p.add_argument("--score-deletion", required=True) + p.add_argument("--score-random-addition", required=True) p.add_argument("--scores-out", required=True, help="output consolidated scores TSV path") - p.add_argument("--score-glob", default="score_*.json") - p.add_argument("--pairs-template", default="pairs_{seed}.tsv") - p.add_argument("--source-label", default="inferred_ppi_screen_negative") + p.add_argument("--source-3did", default="3did") + p.add_argument("--label-pos-deletion", default="3did_deletion") + p.add_argument("--label-pos-random-addition", default="3did_random_addition") + p.add_argument("--label-neg-deletion", + default="inferred_ppi_screen_negative_for_deletion") + 
p.add_argument("--label-neg-random-addition", + default="inferred_ppi_screen_negative_for_random_addition") return p.parse_args() @@ -52,63 +71,90 @@ def read_pairs(path): return pairs -def main(): - args = parse_args() - - score_files = sorted(glob.glob(args.score_glob)) - if not score_files: - raise SystemExit(f"{TAG} no score files matching {args.score_glob}") +def load_positive_pairs(conn, source): + """3did positive (pfam_a, pfam_b) pairs currently in the DB.""" + return conn.execute( + "SELECT da.pfam_id, db.pfam_id " + "FROM domain_domain_interaction AS ddi " + "JOIN domain AS da ON da.id = ddi.domain_id_a " + "JOIN domain AS db ON db.id = ddi.domain_id_b " + "WHERE ddi.negative = 0 AND ddi.source = ?", + (source,), + ).fetchall() - records = [] - for path in score_files: - with open(path) as fh: - records.append(json.load(fh)) - # Deterministic winner: lowest J (rounded), ties broken by smaller seed. - records.sort(key=lambda r: (round(r["J"], J_ROUND), r["seed"])) - winner = records[0] - winner_seed = winner["seed"] +def main(): + args = parse_args() - pairs_path = args.pairs_template.format(seed=winner_seed) - pairs = read_pairs(pairs_path) + pool_domains = set(np.load(args.pool, allow_pickle=True)["pool_dom"].tolist()) + log(f"pool-domain universe: {len(pool_domains)} domains") conn = sqlite3.connect(args.db) conn.execute("PRAGMA foreign_keys=ON") conn.execute("PRAGMA journal_mode=OFF") conn.execute("PRAGMA synchronous=OFF") - ensure_domains(conn, (p for pair in pairs for p in pair)) - insert_ddis(conn, pairs, negative=True, source=args.source_label) + # --- positives: full (random_addition) and pool-restricted (deletion) copies --- + pos_3did = load_positive_pairs(conn, args.source_3did) + pos_reduced = [ + (a, b) for a, b in pos_3did if a in pool_domains and b in pool_domains + ] + log(f"3did positives: {len(pos_3did)} total, {len(pos_reduced)} within pool") + + insert_ddis(conn, pos_3did, negative=False, + source=args.label_pos_random_addition, 
dedup_across_sources=False) + insert_ddis(conn, pos_reduced, negative=False, + source=args.label_pos_deletion, dedup_across_sources=False) + + # --- negatives: one DANS selection per method --- + pairs_del = read_pairs(args.pairs_deletion) + pairs_rand = read_pairs(args.pairs_random_addition) + + ensure_domains(conn, (p for pair in pairs_del for p in pair)) + ensure_domains(conn, (p for pair in pairs_rand for p in pair)) + insert_ddis(conn, pairs_del, negative=True, + source=args.label_neg_deletion, dedup_across_sources=False) + insert_ddis(conn, pairs_rand, negative=True, + source=args.label_neg_random_addition, dedup_across_sources=False) + conn.commit() - n_inserted = count_source(conn, args.source_label) + counts = { + args.label_pos_random_addition: count_source(conn, args.label_pos_random_addition), + args.label_pos_deletion: count_source(conn, args.label_pos_deletion), + args.label_neg_random_addition: count_source(conn, args.label_neg_random_addition), + args.label_neg_deletion: count_source(conn, args.label_neg_deletion), + } conn.close() - # --- report: positive reference, then every seed, then the winner --- - ref = records[0] - log(f"set=positive n_sel={ref['pos_n_sel']} n_dom={ref['pos_n_dom']} " - f"mean_pa={ref['pos_mean_pa']:.1f}") - for r in records: - log(f"set=negative seed={r['seed']} J={r['J']:.4f} pa={r['pa']:.4f} " - f"deg={r['deg']:.4f} cov={r['cov']:.4f} n_sel={r['n_sel']} " - f"n_dom={r['n_dom']} mean_pa={r['mean_pa']:.1f}") - log(f"WINNER seed={winner_seed} J={winner['J']:.4f} n_inserted={n_inserted}") - - cols = ["set", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom", - "mean_pa", "winner"] + with open(args.score_deletion) as fh: + sc_del = json.load(fh) + with open(args.score_random_addition) as fh: + sc_rand = json.load(fh) + + for label, n in counts.items(): + log(f"inserted source={label} rows={n}") + for sc in (sc_del, sc_rand): + log(f"method={sc['method']} set=positive n_sel={sc['pos_n_sel']} " + f"n_dom={sc['pos_n_dom']} 
mean_pa={sc['pos_mean_pa']:.1f}") + log(f"method={sc['method']} set=negative seed={sc['seed']} J={sc['J']:.4f} " + f"pa={sc['pa']:.4f} deg={sc['deg']:.4f} cov={sc['cov']:.4f} " + f"n_sel={sc['n_sel']} n_dom={sc['n_dom']} mean_pa={sc['mean_pa']:.1f}") + + cols = ["set", "method", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom", + "mean_pa"] with open(args.scores_out, "w") as fh: fh.write("\t".join(cols) + "\n") - fh.write("\t".join([ - "positive", "NA", "NA", "NA", "NA", "NA", - str(ref["pos_n_sel"]), str(ref["pos_n_dom"]), - f"{ref['pos_mean_pa']:.2f}", "NA", - ]) + "\n") - for r in records: + for sc in (sc_del, sc_rand): + fh.write("\t".join([ + "positive", sc["method"], "NA", "NA", "NA", "NA", "NA", + str(sc["pos_n_sel"]), str(sc["pos_n_dom"]), + f"{sc['pos_mean_pa']:.2f}", + ]) + "\n") fh.write("\t".join([ - "negative", str(r["seed"]), - f"{r['J']:.6f}", f"{r['pa']:.6f}", f"{r['deg']:.6f}", - f"{r['cov']:.6f}", str(r["n_sel"]), str(r["n_dom"]), - f"{r['mean_pa']:.2f}", - "1" if r["seed"] == winner_seed else "0", + "negative", sc["method"], str(sc["seed"]), + f"{sc['J']:.6f}", f"{sc['pa']:.6f}", f"{sc['deg']:.6f}", + f"{sc['cov']:.6f}", str(sc["n_sel"]), str(sc["n_dom"]), + f"{sc['mean_pa']:.2f}", ]) + "\n") diff --git a/bin/select_ppi_negative_dans.py b/bin/select_ppi_negative_dans.py index c83b3d9..a81235d 100755 --- a/bin/select_ppi_negative_dans.py +++ b/bin/select_ppi_negative_dans.py @@ -1,35 +1,31 @@ #!/usr/bin/env python3 """ -Select negative DDIs from a candidate pool (``neg_pool.npz``) for one random -seed, score the selection, and emit the chosen Pfam pairs + a score JSON. - -Degree-aware node sampling (DANS, Cappelletti et al. 2024, Bioinformatics -Advances vbae036) matched the negative degree distribution to the positives by -sampling edges with probability proportional to preferential attachment -(PA = deg(a)*deg(b)). 
Applied naively to a fixed candidate pool that already has -the positive mean PA, that overshoots: sampling proportional to PA draws edges -with mean PA = E[PA^2]/E[PA] >> pool mean, concentrating on a few hub domains. - -This selector keeps the PA-proportional draw but adds a hard per-domain CAP at -the positive degree, and refills in multiple passes until the target negative -count (== positive count) is reached: - - * pass 1 draws ``target`` candidates PA-weighted without replacement and adds - them while no endpoint exceeds its cap; - * each refill pass prunes already-picked candidates and any candidate touching - a saturated domain (it would only be skipped again), renormalises the PA - weights over the survivors, and draws the deficit (floored at ``min_batch``); - * the loop stops when ``target`` is hit, or early if the pruned pool can no - longer supply an eligible edge -- that shortfall is the true feasibility - ceiling of the fixed pool. - -Capping pins the negative degree sequence to the positive one (degree -distribution matched, hub-driven mean-PA blow-up impossible); the PA-weighted -draw fills high-degree domains toward their cap first, reproducing the positive -PA distribution; the refill drives the count to the positive total. - -The selection is scored against the positives with a combined objective (lower is -better): J = w_pa*pa + w_deg*deg + w_cov*cov, where +Select negative DDIs by Degree-Aware Node Sampling (DANS, Cappelletti et al. +2024, Bioinformatics Advances vbae036) for one of two methods, and write the +chosen Pfam pairs plus a small score JSON (reporting only -- there is no +multi-seed pick-best). + +DANS draws a negative edge by sampling two endpoints proportional to node degree +(equivalently: take the source of one random positive edge and the destination +of another) and accepts the pair iff it is not an existing edge. 
It is +UNCAPPED: the negative degree *distribution* tracks the positive one without +pinning an exact per-node degree sequence. + + * method "deletion" -- DANS restricted to the PPI-derived candidate pool + (``cand_a``/``cand_b``), with the positive degrees first reduced to the + candidate-domain universe (``pool_deg_r``). Candidate edges are drawn + without replacement with probability proportional to the reduced preferential + attachment PA_r = deg_r(a)*deg_r(b); target = ``n_positive_r``. + + * method "random_addition" -- plain DANS over the *full* positive set: sample + node-pairs from the positive endpoint multiset (``pos_a``/``pos_b``), + rejecting self-pairs, existing edges (``forbidden_*``) and duplicates; + target = ``n_positive``. Domains absent from the candidate pool are reachable + here, so coverage and the degree distribution match the full positives. + +The selection is scored against the method-appropriate positives with a combined +objective (lower is better), reported for inspection only: + J = w_pa*pa + w_deg*deg + w_cov*cov, where pa = Wasserstein-1 between log1p(PA_neg) and log1p(PA_pos), normalised by the spread of the positive log1p(PA); deg = Kolmogorov-Smirnov statistic between the per-domain negative degree @@ -42,6 +38,8 @@ import numpy as np +from ddi_db_utils import pfam_sort_key + TAG = "[neg_select]" @@ -53,18 +51,14 @@ def log(msg): def parse_args(): p = argparse.ArgumentParser() p.add_argument("--pool", required=True, help="candidate-pool .npz from BUILD step") + p.add_argument("--method", required=True, + choices=["deletion", "random_addition"]) p.add_argument("--seed", type=int, required=True) p.add_argument("--score-out", required=True, help="output score JSON path") p.add_argument("--pairs-out", required=True, help="output selected-pairs TSV path") p.add_argument("--w-pa", type=float, default=0.5) p.add_argument("--w-deg", type=float, default=0.3) p.add_argument("--w-cov", type=float, default=0.2) - p.add_argument( - 
"--min-batch", - type=int, - default=20, - help="Minimum candidates drawn per refill pass so the tail keeps progressing.", - ) return p.parse_args() @@ -91,113 +85,145 @@ def ks_statistic(x, y): return float(np.max(np.abs(cx - cy))) -def select(cand_ai, cand_bi, pa, cap, target, seed, min_batch): - """PA-weighted, degree-capped, multi-pass refill selection. +def select_deletion(cand_a, cand_b, pool_dom, pool_deg_r, target, seed): + """DANS over the fixed candidate pool: draw `target` edges without + replacement with probability proportional to the reduced PA. No cap. - Returns the array of selected candidate indices. + Returns (selected_indices, cand_ai, cand_bi) where cand_a*/cand_b* index + into pool_dom. """ rng = np.random.default_rng(seed) - n_cand = cand_ai.size - remaining = cap.copy() - picked = np.zeros(n_cand, dtype=bool) - selected = [] - - def draw_and_add(batch): - elig = np.flatnonzero( - (~picked) & (remaining[cand_ai] > 0) & (remaining[cand_bi] > 0) - ) - if elig.size == 0: - return 0 - w = pa[elig].astype(float) - total = w.sum() - p = (w / total) if total > 0 else None - k = int(min(batch, elig.size)) - chosen = rng.choice(elig, size=k, replace=False, p=p) - added = 0 - for ci in chosen: - a = cand_ai[ci] - b = cand_bi[ci] - if remaining[a] > 0 and remaining[b] > 0: - selected.append(int(ci)) - picked[ci] = True - remaining[a] -= 1 - remaining[b] -= 1 - added += 1 - if len(selected) >= target: - break - return added - - draw_and_add(target) - n_passes = 1 - while len(selected) < target: - missing = target - len(selected) - added = draw_and_add(max(missing, min_batch)) - n_passes += 1 - if added == 0: - log(f"pool exhausted after {n_passes} passes; " - f"selected {len(selected)}/{target}") - break - - log(f"selection done in {n_passes} passes: {len(selected)}/{target} edges") - return np.array(selected, dtype=np.int64) - - -def main(): - args = parse_args() - - data = np.load(args.pool, allow_pickle=True) - cand_a = data["cand_a"] - cand_b = 
data["cand_b"] - pos_dom = data["pos_dom"] - pos_deg = data["pos_deg"].astype(np.int64) - pos_edge_pa = data["pos_edge_pa"].astype(np.int64) - n_positive = int(data["n_positive"]) - n_positive_domains = int(data["n_positive_domains"]) - - log(f"pool: {cand_a.size} candidates, {pos_dom.size} positive domains, " - f"target = {n_positive}") - - # Map every domain to an integer index over the positive-domain universe. - domain_index = {d: i for i, d in enumerate(pos_dom)} + domain_index = {d: i for i, d in enumerate(pool_dom)} cand_ai = np.fromiter((domain_index[a] for a in cand_a), dtype=np.int64, count=cand_a.size) cand_bi = np.fromiter((domain_index[b] for b in cand_b), dtype=np.int64, count=cand_b.size) - cap = pos_deg.copy() - pa = pos_deg[cand_ai] * pos_deg[cand_bi] + pa = (pool_deg_r[cand_ai] * pool_deg_r[cand_bi]).astype(float) + n_cand = int(cand_a.size) + k = int(min(target, n_cand)) + if k < target: + log(f"deletion: candidate pool smaller than target " + f"({n_cand} < {target}); taking all") + total = pa.sum() + p = (pa / total) if total > 0 else None + sel = (rng.choice(n_cand, size=k, replace=False, p=p) + if k > 0 else np.empty(0, dtype=np.int64)) + log(f"deletion: selected {sel.size}/{target} candidate edges") + return sel, cand_ai, cand_bi + + +def select_random_addition(pos_a, pos_b, forbidden, target, seed): + """Canonical DANS over the full positive set: sample node-pairs from the + positive endpoint multiset (so endpoints are drawn proportional to degree), + rejecting self-pairs, existing edges and duplicates. No cap. 
+ """ + rng = np.random.default_rng(seed) + endpoints = np.concatenate([pos_a, pos_b]) + m = int(endpoints.size) + picked = set() + out_a = [] + out_b = [] + attempts = 0 + max_attempts = 200 * target + 1000 + while len(out_a) < target and attempts < max_attempts: + need = target - len(out_a) + batch = int(min(max(need * 2, 1024), 5_000_000)) + ui = rng.integers(0, m, size=batch) + vi = rng.integers(0, m, size=batch) + attempts += batch + for iu, iv in zip(ui.tolist(), vi.tolist()): + a = endpoints[iu] + b = endpoints[iv] + if a == b: + continue + key = (a, b) if pfam_sort_key(a) <= pfam_sort_key(b) else (b, a) + if key in forbidden or key in picked: + continue + picked.add(key) + out_a.append(key[0]) + out_b.append(key[1]) + if len(out_a) >= target: + break + if len(out_a) < target: + log(f"random_addition: only {len(out_a)}/{target} edges after " + f"{attempts} attempts (forbidden/duplicate saturation)") + else: + log(f"random_addition: selected {target}/{target} edges in " + f"{attempts} attempts") + return np.array(out_a, dtype=object), np.array(out_b, dtype=object) + - sel = select(cand_ai, cand_bi, pa, cap, n_positive, args.seed, args.min_batch) +def main(): + args = parse_args() + data = np.load(args.pool, allow_pickle=True) - # --- selected-set statistics --- - sel_ai = cand_ai[sel] - sel_bi = cand_bi[sel] - neg_pa = pa[sel] - n_sel = int(sel.size) + if args.method == "deletion": + cand_a = data["cand_a"] + cand_b = data["cand_b"] + pool_dom = data["pool_dom"] + dom_deg = data["pool_deg_r"].astype(np.int64) + pos_edge_pa = data["pos_edge_pa_r"].astype(np.int64) + target = int(data["n_positive_r"]) + n_pos_domains = int(data["n_positive_domains_r"]) + + log(f"pool: {cand_a.size} candidate edges, {pool_dom.size} pool domains, " + f"target = {target}") + sel, cand_ai, cand_bi = select_deletion( + cand_a, cand_b, pool_dom, dom_deg, target, args.seed + ) + neg_ai = cand_ai[sel] + neg_bi = cand_bi[sel] + out_a = cand_a[sel] + out_b = cand_b[sel] + else: + pos_a = 
data["pos_a"] + pos_b = data["pos_b"] + pos_dom = data["pos_dom"] + dom_deg = data["pos_deg"].astype(np.int64) + pos_edge_pa = data["pos_edge_pa"].astype(np.int64) + target = int(data["n_positive"]) + n_pos_domains = int(data["n_positive_domains"]) + forbidden = set(zip(data["forbidden_a"].tolist(), + data["forbidden_b"].tolist())) + + log(f"positives: {pos_a.size} edges, {pos_dom.size} domains, " + f"{len(forbidden)} forbidden pairs, target = {target}") + out_a, out_b = select_random_addition( + pos_a, pos_b, forbidden, target, args.seed + ) + domain_index = {d: i for i, d in enumerate(pos_dom)} + neg_ai = np.fromiter((domain_index[a] for a in out_a), dtype=np.int64, + count=out_a.size) + neg_bi = np.fromiter((domain_index[b] for b in out_b), dtype=np.int64, + count=out_b.size) + + # --- selected-set statistics / objective (reporting only) --- + n_sel = int(out_a.size) + neg_pa = (dom_deg[neg_ai] * dom_deg[neg_bi]).astype(np.int64) mean_pa_neg = float(neg_pa.mean()) if n_sel else 0.0 - neg_deg = np.zeros(pos_dom.size, dtype=np.int64) - np.add.at(neg_deg, sel_ai, 1) - np.add.at(neg_deg, sel_bi, 1) + neg_deg = np.zeros(dom_deg.size, dtype=np.int64) + np.add.at(neg_deg, neg_ai, 1) + np.add.at(neg_deg, neg_bi, 1) n_dom = int(np.count_nonzero(neg_deg)) - # --- objective --- pos_logpa = np.log1p(pos_edge_pa.astype(float)) neg_logpa = np.log1p(neg_pa.astype(float)) - spread = float(pos_logpa.max() - pos_logpa.min()) + spread = float(pos_logpa.max() - pos_logpa.min()) if pos_logpa.size else 0.0 pa_term = wasserstein1(neg_logpa, pos_logpa) / spread if spread > 0 else 0.0 - deg_term = ks_statistic(neg_deg, pos_deg) - cov_term = 1.0 - (n_dom / n_positive_domains) if n_positive_domains else 0.0 + deg_term = ks_statistic(neg_deg, dom_deg) + cov_term = 1.0 - (n_dom / n_pos_domains) if n_pos_domains else 0.0 j = args.w_pa * pa_term + args.w_deg * deg_term + args.w_cov * cov_term + pos_mean_pa = float(pos_edge_pa.mean()) if pos_edge_pa.size else 0.0 - pos_mean_pa = 
float(pos_edge_pa.mean()) - - log(f"set=positive n_sel={n_positive} n_dom={n_positive_domains} " + log(f"method={args.method} set=positive n_sel={target} n_dom={n_pos_domains} " f"mean_pa={pos_mean_pa:.1f}") - log(f"set=negative seed={args.seed} J={j:.4f} pa={pa_term:.4f} " - f"deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} n_dom={n_dom} " - f"mean_pa={mean_pa_neg:.1f}") + log(f"method={args.method} set=negative seed={args.seed} J={j:.4f} " + f"pa={pa_term:.4f} deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} " + f"n_dom={n_dom} mean_pa={mean_pa_neg:.1f}") score = { + "method": args.method, "seed": int(args.seed), "J": j, "pa": pa_term, @@ -206,8 +232,8 @@ def main(): "n_sel": n_sel, "n_dom": n_dom, "mean_pa": mean_pa_neg, - "pos_n_sel": n_positive, - "pos_n_dom": n_positive_domains, + "pos_n_sel": target, + "pos_n_dom": n_pos_domains, "pos_mean_pa": pos_mean_pa, "w_pa": args.w_pa, "w_deg": args.w_deg, @@ -217,8 +243,8 @@ def main(): json.dump(score, fh) with open(args.pairs_out, "w") as fh: - for ci in sel: - fh.write(f"{cand_a[ci]}\t{cand_b[ci]}\n") + for a, b in zip(out_a.tolist(), out_b.tolist()): + fh.write(f"{a}\t{b}\n") if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index aa3bdda..7325cec 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,7 @@ process { // INSERT_PPI_NEGATIVE_SELECTION's domainsplit.sqlite3 is an intermediate // (published only via the workflow-level output block), but its - // negative_ppi_seed_scores.tsv diagnostic IS published. + // negative_ppi_method_scores.tsv diagnostic IS published. 
withName: 'INSERT_PPI_NEGATIVE_SELECTION' { publishDir = [ path: { "${params.outdir}/negative_ppi" }, diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf index 95cb1c1..5f70b1b 100644 --- a/modules/local/init_domainsplit_db/main.nf +++ b/modules/local/init_domainsplit_db/main.nf @@ -31,7 +31,7 @@ process INIT_DOMAINSPLIT_DB { source VARCHAR(255), FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, - UNIQUE(domain_id_a, domain_id_b) + UNIQUE(domain_id_a, domain_id_b, source) ); CREATE TABLE protein ( diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf index f31467b..652e64b 100644 --- a/modules/local/insert_ppi_negative_selection/main.nf +++ b/modules/local/insert_ppi_negative_selection/main.nf @@ -6,13 +6,16 @@ process INSERT_PPI_NEGATIVE_SELECTION { input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' - path score_jsons - path pairs_tsvs + path neg_pool + path pairs_deletion + path pairs_random_addition + path score_deletion + path score_random_addition output: - path "domainsplit.sqlite3", emit: domainsplit_db - path "negative_ppi_seed_scores.tsv", emit: scores - path "versions.yml", emit: versions + path "domainsplit.sqlite3", emit: domainsplit_db + path "negative_ppi_method_scores.tsv", emit: scores + path "versions.yml", emit: versions script: """ @@ -20,11 +23,17 @@ process INSERT_PPI_NEGATIVE_SELECTION { insert_ppi_negative_selection.py \\ --db domainsplit.sqlite3 \\ - --scores-out negative_ppi_seed_scores.tsv + --pool "${neg_pool}" \\ + --pairs-deletion "${pairs_deletion}" \\ + --pairs-random-addition "${pairs_random_addition}" \\ + --score-deletion "${score_deletion}" \\ + --score-random-addition "${score_random_addition}" \\ + --scores-out negative_ppi_method_scores.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python3 -c 'import sys; 
print(sys.version.split()[0])') + numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf index 4163661..fd1a631 100644 --- a/modules/local/select_ppi_negative_dans/main.nf +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -1,23 +1,27 @@ process SELECT_PPI_NEGATIVE_DANS { - tag "select_ppi_negative_dans:seed=${seed}" + tag "select_ppi_negative_dans:${method}" label 'process_low' conda "${moduleDir}/environment.yml" container "docker://konstantinpelz/domainsplit-general:1.0.0" input: - tuple val(seed), path(neg_pool) + val method + val seed + path neg_pool output: - tuple val(seed), path("score_${seed}.json"), path("pairs_${seed}.tsv"), emit: result - path "versions.yml", emit: versions + path "score_${method}.json", emit: score + path "pairs_${method}.tsv", emit: pairs + path "versions.yml", emit: versions script: """ select_ppi_negative_dans.py \\ --pool "${neg_pool}" \\ + --method ${method} \\ --seed ${seed} \\ - --score-out score_${seed}.json \\ - --pairs-out pairs_${seed}.tsv + --score-out score_${method}.json \\ + --pairs-out pairs_${method}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index aa5af2a..2f39696 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,8 +67,8 @@ params { // count. Required input (no default; must be supplied per run). negative_ppi_parquet = null negative_ppi_min_n_tested = 5 - // Base seed for the multi-seed negative-DDI selection: 5 parallel SLURM - // jobs run with seeds base+1..+5 and the best-scoring selection is kept. + // Random seed for the (single-run, uncapped DANS) negative-DDI selection; + // both methods ("deletion" and "random_addition") use this seed. 
negative_ppi_seed = 42 // Reviewed-human UniProt -> Pfam stream used to detect single-domain diff --git a/nextflow_schema.json b/nextflow_schema.json index ef83a4b..bb61aef 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -167,7 +167,7 @@ }, "negative_ppi_seed": { "type": "integer", - "description": "Base random seed for the multi-seed negative-DDI selection. Five parallel jobs run the degree-capped PA-weighted sampler with seeds base+1..+5; the lowest-scoring (best degree/PA/coverage-matched) selection is inserted.", + "description": "Random seed for the single-run, uncapped DANS negative-DDI selection (used by both the deletion and random_addition methods).", "default": 42, "fa_icon": "fas fa-dice" }, diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 9c32b68..d893eab 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -27,9 +27,10 @@ include { INSERT_SINGLE_DOMAIN_PPI } from '../../../modules/local/insert_single include { INSERT_PPIDM } from '../../../modules/local/insert_ppidm/main.nf' include { INSERT_NEGATOME } from '../../../modules/local/insert_negatome/main.nf' include { REMOVE_SELF_INTERACTIONS } from '../../../modules/local/remove_self_interactions/main.nf' -include { BUILD_PPI_NEGATIVE_POOL } from '../../../modules/local/build_ppi_negative_pool/main.nf' -include { SELECT_PPI_NEGATIVE_DANS } from '../../../modules/local/select_ppi_negative_dans/main.nf' -include { INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf' +include { BUILD_PPI_NEGATIVE_POOL } from '../../../modules/local/build_ppi_negative_pool/main.nf' +include { SELECT_PPI_NEGATIVE_DANS as SELECT_DELETION } from '../../../modules/local/select_ppi_negative_dans/main.nf' +include { SELECT_PPI_NEGATIVE_DANS as SELECT_RANDOM_ADDITION } from '../../../modules/local/select_ppi_negative_dans/main.nf' +include { 
INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf' include { SMOKE_FILTER } from '../../../modules/local/smoke_filter/main.nf' workflow COLLECT_DDI_DATA { @@ -74,10 +75,17 @@ workflow COLLECT_DDI_DATA { domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db } - // 7. high-confidence non-PPI negatives (inferred only over 3did domains). + // 7. high-confidence non-PPI negatives via uncapped DANS (Cappelletti et al. + // vbae036), in two flavours that coexist under distinct source labels: + // * "deletion" -- DANS over the PPI candidate pool, with the + // positives reduced to the candidate-domain + // universe (labels 3did_deletion / + // inferred_ppi_screen_negative_for_deletion). + // * "random_addition" -- plain DANS over the full positive set (labels + // 3did_random_addition / + // inferred_ppi_screen_negative_for_random_addition). // The expensive, deterministic UniProt fetch + candidate-pool build runs - // once; selection fans out over 5 seeds (base+1..+5) in parallel SLURM - // jobs, and the best-scoring (degree/PA-matched) selection is inserted. + // once; each method is a single deterministic selection (no pick-best). pool = BUILD_PPI_NEGATIVE_POOL( domainsplit_db, file(negative_ppi_parquet), @@ -85,24 +93,18 @@ workflow COLLECT_DDI_DATA { params.self_interaction, ) - // Pair the single shared pool file with each seed (combine avoids the - // queue-exhaustion that mixing a one-shot channel with a 5-item queue causes). 
- seeds = Channel.of(1, 2, 3, 4, 5).map { params.negative_ppi_seed + it } - sel = SELECT_PPI_NEGATIVE_DANS(seeds.combine(pool.neg_pool)) + del = SELECT_DELETION('deletion', params.negative_ppi_seed, pool.neg_pool) + rand = SELECT_RANDOM_ADDITION('random_addition', params.negative_ppi_seed, pool.neg_pool) - sel.result - .multiMap { seed, score, pairs -> - scores: score - pairs: pairs - } - .set { selection } - - best = INSERT_PPI_NEGATIVE_SELECTION( + inserted = INSERT_PPI_NEGATIVE_SELECTION( pool.domainsplit_db, - selection.scores.collect(), - selection.pairs.collect(), + pool.neg_pool, + del.pairs, + rand.pairs, + del.score, + rand.score, ) - domainsplit_db = best.domainsplit_db + domainsplit_db = inserted.domainsplit_db pfam_mapping = pool.pfam_mapping if (params.smoke_test_n_ddis != null) { diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index 4f6e3a7..1427c03 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -1,24 +1,35 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SPLIT_DOMAINSPLIT_DATABASE -- produce three split strategies: + SPLIT_DOMAINSPLIT_DATABASE -- produce the split strategies, each run ONCE + PER NEGATIVE-DDI METHOD ("deletion" and "random_addition") so the two + methods' core sources stay isolated. Strategies: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * random_ddi biased baseline (random partition) * minimal_leakage_domain leakage-aware spectral partition * external_validation leakage-aware train/validation on the "core" - sources (3did + high-conf non-PPI negatives), - plus an as-is test set from the held-out sources - (single-domain PPI, PPIDM, negatome). - - Domain sequences are extracted and clustered (MMseqs2) once; both the - minimal-leakage and external-validation train/val partitions reuse the - clusters. 
+ sources (3did copy + the method's PPI-screen + negatives), plus an as-is test set from the + held-out sources (single-domain PPI, PPIDM, + negatome). The held-out test set is + method-independent, so it is built once and routed + into both external_validation_* folders. + + Each strategy therefore yields two method folders, e.g. + random_ddi_deletion / random_ddi_random_addition -> 6 method folders total. + + Domain sequences are extracted and clustered (MMseqs2) once; every + leakage-aware partition reuses the clusters. ----------------------------------------------------------------------------*/ -include { RANDOM_DDI_SPLIT } from '../../../modules/local/random_ddi_split/main' -include { EXTRACT_DOMAIN_SEQUENCES; MINIMAL_LEAKAGE_SPLIT_DOMAIN } from '../../../modules/local/minimal_leakage_split/main' -include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MINIMAL_LEAKAGE_SPLIT_TRAINVAL } from '../../../modules/local/minimal_leakage_split/main' -include { SUBSET_DDIS_BY_SOURCE } from '../../../modules/local/external_validation_split/main' -include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main' +include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_DEL } from '../../../modules/local/random_ddi_split/main' +include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_RAND } from '../../../modules/local/random_ddi_split/main' +include { EXTRACT_DOMAIN_SEQUENCES } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_DEL } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_RAND } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_DEL } from '../../../modules/local/minimal_leakage_split/main' +include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_RAND } from '../../../modules/local/minimal_leakage_split/main' +include { SUBSET_DDIS_BY_SOURCE } from 
'../../../modules/local/external_validation_split/main' +include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main' def map_split_dbs(split_info_ch, split_dbs_ch, method) { @@ -52,14 +63,25 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { ["optimization", 0.2], ["test", 0.2] ] + def trainval_splits = [ + ["train", 0.8], + ["validation", 0.2] + ] - // The two methods that mimic a within-distribution evaluation use only the - // "core" sources: 3did positives + high-confidence non-PPI negatives. - // 'inferred_ppi_screen_negative' must stay in sync with the --source-label - // default in bin/insert_ppi_negative_selection.py. - def core_sources = ['3did', 'inferred_ppi_screen_negative'] + // Core sources per negative-DDI method: the method's 3did positive copy plus + // its PPI-screen negatives. Must stay in sync with the source labels written + // by bin/insert_ppi_negative_selection.py. + def core_deletion = [ + '3did_deletion', + 'inferred_ppi_screen_negative_for_deletion', + ] + def core_random_addition = [ + '3did_random_addition', + 'inferred_ppi_screen_negative_for_random_addition', + ] - // External-validation test set: held-out sources placed as is. + // External-validation test set: held-out sources placed as is + // (method-independent). 
def test_sources = [ 'single_domain_ppi', 'PPIDM_Bronze', 'PPIDM_Silver', 'PPIDM_Gold', @@ -67,33 +89,19 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { ] // Biased baseline: random DDI split (same proteins in train and test) - RANDOM_DDI_SPLIT( - domainsplit_db_ch, - Channel.of(splits), - core_sources - ) + RANDOM_DDI_SPLIT_DEL(domainsplit_db_ch, splits, core_deletion) + RANDOM_DDI_SPLIT_RAND(domainsplit_db_ch, splits, core_random_addition) // Leakage-aware: spectral graph partitioning on domain clusters - MINIMAL_LEAKAGE_SPLIT_DOMAIN( - domainsplit_db_ch, - splits, - clusters_tsv, - core_sources - ) + MLS_DOMAIN_DEL(domainsplit_db_ch, splits, clusters_tsv, core_deletion) + MLS_DOMAIN_RAND(domainsplit_db_ch, splits, clusters_tsv, core_random_addition) // External validation: leakage-free train/validation on core sources ... - def trainval_splits = [ - ["train", 0.8], - ["validation", 0.2] - ] - MINIMAL_LEAKAGE_SPLIT_TRAINVAL( - domainsplit_db_ch, - trainval_splits, - clusters_tsv, - core_sources - ) + MLS_TRAINVAL_DEL(domainsplit_db_ch, trainval_splits, clusters_tsv, core_deletion) + MLS_TRAINVAL_RAND(domainsplit_db_ch, trainval_splits, clusters_tsv, core_random_addition) - // ... plus an as-is test set from the held-out sources + // ... plus an as-is test set from the held-out sources (shared by both + // external_validation_* methods). 
SUBSET_DDIS_BY_SOURCE( domainsplit_db_ch, test_sources, @@ -101,10 +109,14 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { ) split_ch = Channel.empty().mix( - map_split_dbs(RANDOM_DDI_SPLIT.out.split_info, RANDOM_DDI_SPLIT.out.split_dbs, "random_ddi"), - map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain"), - map_split_dbs(MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_info, MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_dbs, "external_validation"), - map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation") + map_split_dbs(RANDOM_DDI_SPLIT_DEL.out.split_info, RANDOM_DDI_SPLIT_DEL.out.split_dbs, "random_ddi_deletion"), + map_split_dbs(RANDOM_DDI_SPLIT_RAND.out.split_info, RANDOM_DDI_SPLIT_RAND.out.split_dbs, "random_ddi_random_addition"), + map_split_dbs(MLS_DOMAIN_DEL.out.split_info, MLS_DOMAIN_DEL.out.split_dbs, "minimal_leakage_domain_deletion"), + map_split_dbs(MLS_DOMAIN_RAND.out.split_info, MLS_DOMAIN_RAND.out.split_dbs, "minimal_leakage_domain_random_addition"), + map_split_dbs(MLS_TRAINVAL_DEL.out.split_info, MLS_TRAINVAL_DEL.out.split_dbs, "external_validation_deletion"), + map_split_dbs(MLS_TRAINVAL_RAND.out.split_info, MLS_TRAINVAL_RAND.out.split_dbs, "external_validation_random_addition"), + map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_deletion"), + map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_random_addition") ) emit: diff --git a/tests/python/test_insert_ppi_negative_selection.py b/tests/python/test_insert_ppi_negative_selection.py new file mode 100644 index 0000000..d3c8add --- /dev/null +++ b/tests/python/test_insert_ppi_negative_selection.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +"""Local integration check for the dual-source negative insertion (no cluster). 
+ +Validates the schema change (UNIQUE(domain_id_a, domain_id_b, source)) and +bin/insert_ppi_negative_selection.py together: + + * the four method labels are inserted, with 3did_random_addition copying the + full 3did set and 3did_deletion only the pool-domain subset; + * a pair can coexist under '3did' and '3did_random_addition' (duplicate by + source); + * the canonical sources still dedup across each other (a PPIDM pair equal to a + 3did pair is dropped) because insert_ddis defaults to dedup_across_sources. + +Run directly or via pytest. +""" + +import json +import os +import sqlite3 +import subprocess +import sys +import tempfile + +import numpy as np + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +sys.path.insert(0, BIN) + +from ddi_db_utils import ensure_domains, insert_ddis # noqa: E402 + +INSERTER = os.path.join(BIN, "insert_ppi_negative_selection.py") + +SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + + +def pf(i): + return f"PF{i:05d}" + + +def count(conn, source): + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] + + +def write_score(path, method): + json.dump({ + "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1, + "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0, + "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0, + }, open(path, "w")) + + +def write_pairs(path, pairs): + with open(path, "w") as fh: + for a, b in pairs: + fh.write(f"{a}\t{b}\n") + + +def test_dual_source_insert(): + with tempfile.TemporaryDirectory() as tmp: + db = 
os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + ensure_domains(conn, [pf(i) for i in range(1, 7)]) + + # 3did positives, then a PPIDM batch overlapping (1,2). + insert_ddis(conn, [(pf(1), pf(2)), (pf(1), pf(3)), (pf(1), pf(4))], + negative=False, source="3did") + insert_ddis(conn, [(pf(1), pf(2)), (pf(5), pf(6))], + negative=False, source="PPIDM_Gold") + conn.commit() + assert count(conn, "3did") == 3 + assert count(conn, "PPIDM_Gold") == 1, "cross-source dedup broken" + conn.close() + + # Pool covers only domains 1,2,3 -> 3did_deletion keeps (1,2),(1,3). + pool = os.path.join(tmp, "neg_pool.npz") + np.savez(pool, pool_dom=np.array([pf(1), pf(2), pf(3)], dtype=object)) + + write_pairs(os.path.join(tmp, "pairs_deletion.tsv"), + [(pf(2), pf(5)), (pf(3), pf(6))]) + write_pairs(os.path.join(tmp, "pairs_random_addition.tsv"), + [(pf(1), pf(6)), (pf(4), pf(5))]) + write_score(os.path.join(tmp, "score_deletion.json"), "deletion") + write_score(os.path.join(tmp, "score_random_addition.json"), "random_addition") + + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--pool", pool, + "--pairs-deletion", os.path.join(tmp, "pairs_deletion.tsv"), + "--pairs-random-addition", os.path.join(tmp, "pairs_random_addition.tsv"), + "--score-deletion", os.path.join(tmp, "score_deletion.json"), + "--score-random-addition", os.path.join(tmp, "score_random_addition.json"), + "--scores-out", os.path.join(tmp, "scores.tsv")], + check=True, env=env, + ) + + conn = sqlite3.connect(db) + assert count(conn, "3did_random_addition") == 3, "full 3did copy wrong" + assert count(conn, "3did_deletion") == 2, "pool-restricted copy wrong" + assert count(conn, "inferred_ppi_screen_negative_for_deletion") == 2 + assert count(conn, "inferred_ppi_screen_negative_for_random_addition") == 2 + # original sources untouched + assert count(conn, "3did") == 3 + assert 
count(conn, "PPIDM_Gold") == 1 + + # The same pair (1,2) coexists under '3did' and '3did_random_addition'. + n_dup = conn.execute( + "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi " + "JOIN domain da ON da.id = ddi.domain_id_a " + "JOIN domain db ON db.id = ddi.domain_id_b " + "WHERE da.pfam_id = ? AND db.pfam_id = ?", + (pf(1), pf(2)), + ).fetchone()[0] + assert n_dup >= 2, f"(1,2) should exist under >=2 sources, got {n_dup}" + conn.close() + + print("OK: dual-source insert + schema invariants hold") + + +if __name__ == "__main__": + test_dual_source_insert() diff --git a/tests/python/test_select_ppi_negative_dans.py b/tests/python/test_select_ppi_negative_dans.py new file mode 100644 index 0000000..41bfba9 --- /dev/null +++ b/tests/python/test_select_ppi_negative_dans.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/select_ppi_negative_dans.py (no Nextflow, no cluster). + +Builds a tiny synthetic candidate pool and runs both DANS methods, asserting the +core invariants: + + * deletion -- draws exactly n_positive_r edges, all from the candidate + pool, all endpoints within the pool-domain universe. + * random_addition -- draws exactly n_positive edges, none a positive/forbidden + pair, no self-pairs, no duplicates, and reaches domains + that are absent from the candidate pool. + +Run directly (`python3 tests/python/test_select_ppi_negative_dans.py`) or via +pytest. +""" + +import os +import subprocess +import sys +import tempfile + +import numpy as np + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +SELECTOR = os.path.join(BIN, "select_ppi_negative_dans.py") + + +def pf(i): + return f"PF{i:05d}" + + +def build_pool(path): + # Full 3did positive edges (canonical ascending). + pos_edges = [(1, 2), (1, 3), (1, 4), (2, 3), (5, 6), + (7, 8), (9, 10), (1, 9), (2, 10)] + # Candidate pool: fresh (non-positive) pairs over domains 1..6. 
+ cand = [(1, 5), (1, 6), (2, 5), (2, 6), (3, 5), + (3, 6), (4, 5), (4, 6), (3, 4)] + pool_domains = sorted({d for e in cand for d in e}) # 1..6 + + # Reduced positives = positive edges with both endpoints in the pool. + pos_r = [(a, b) for a, b in pos_edges + if a in pool_domains and b in pool_domains] + deg_r = {d: 0 for d in pool_domains} + for a, b in pos_r: + deg_r[a] += 1 + deg_r[b] += 1 + pool_dom = [pf(d) for d in pool_domains] + pool_deg_r = np.array([deg_r[d] for d in pool_domains], dtype=np.int64) + pos_edge_pa_r = np.array([deg_r[a] * deg_r[b] for a, b in pos_r], dtype=np.int64) + + # Full positive degrees over all 10 domains. + all_dom = list(range(1, 11)) + deg = {d: 0 for d in all_dom} + for a, b in pos_edges: + deg[a] += 1 + deg[b] += 1 + pos_dom = [pf(d) for d in all_dom] + pos_deg = np.array([deg[d] for d in all_dom], dtype=np.int64) + pos_edge_pa = np.array([deg[a] * deg[b] for a, b in pos_edges], dtype=np.int64) + + np.savez( + path, + cand_a=np.array([pf(a) for a, b in cand], dtype=object), + cand_b=np.array([pf(b) for a, b in cand], dtype=object), + pool_dom=np.array(pool_dom, dtype=object), + pool_deg_r=pool_deg_r, + pos_edge_pa_r=pos_edge_pa_r, + n_positive_r=np.int64(len(pos_r)), + n_positive_domains_r=np.int64(sum(1 for d in pool_domains if deg_r[d])), + pos_a=np.array([pf(a) for a, b in pos_edges], dtype=object), + pos_b=np.array([pf(b) for a, b in pos_edges], dtype=object), + pos_dom=np.array(pos_dom, dtype=object), + pos_deg=pos_deg, + pos_edge_pa=pos_edge_pa, + n_positive=np.int64(len(pos_edges)), + n_positive_domains=np.int64(len(all_dom)), + forbidden_a=np.array([pf(a) for a, b in pos_edges], dtype=object), + forbidden_b=np.array([pf(b) for a, b in pos_edges], dtype=object), + ) + return { + "cand": {tuple(sorted((pf(a), pf(b)))) for a, b in cand}, + "pool_domains": {pf(d) for d in pool_domains}, + "n_positive_r": len(pos_r), + "n_positive": len(pos_edges), + "forbidden": {(pf(a), pf(b)) for a, b in pos_edges}, + "pool_only": {pf(d) 
for d in pool_domains}, + "extra_domains": {pf(9), pf(10)}, + } + + +def run_method(pool_path, method, workdir): + score = os.path.join(workdir, f"score_{method}.json") + pairs = os.path.join(workdir, f"pairs_{method}.tsv") + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, SELECTOR, "--pool", pool_path, "--method", method, + "--seed", "7", "--score-out", score, "--pairs-out", pairs], + check=True, env=env, + ) + out = [] + with open(pairs) as fh: + for line in fh: + line = line.rstrip("\n") + if line: + a, b = line.split("\t") + out.append((a, b)) + return out + + +def test_dans_methods(): + with tempfile.TemporaryDirectory() as tmp: + pool_path = os.path.join(tmp, "neg_pool.npz") + meta = build_pool(pool_path) + + # --- deletion --- + del_pairs = run_method(pool_path, "deletion", tmp) + assert len(del_pairs) == meta["n_positive_r"], \ + f"deletion count {len(del_pairs)} != {meta['n_positive_r']}" + for a, b in del_pairs: + assert tuple(sorted((a, b))) in meta["cand"], f"{a},{b} not in pool" + assert a in meta["pool_domains"] and b in meta["pool_domains"] + assert len({tuple(sorted(p)) for p in del_pairs}) == len(del_pairs), "dup in deletion" + + # --- random_addition --- + rand_pairs = run_method(pool_path, "random_addition", tmp) + assert len(rand_pairs) == meta["n_positive"], \ + f"random_addition count {len(rand_pairs)} != {meta['n_positive']}" + seen = set() + used_domains = set() + for a, b in rand_pairs: + assert a != b, f"self pair {a}" + key = (a, b) if a <= b else (b, a) + assert key not in meta["forbidden"], f"{key} is a positive/forbidden pair" + assert key not in seen, f"duplicate {key}" + seen.add(key) + used_domains.update((a, b)) + # DANS over the full positive set must be able to reach domains outside + # the candidate pool. 
+ assert used_domains & meta["extra_domains"], \ + "random_addition never reached the pool-absent domains" + + print("OK: both DANS methods satisfy invariants") + + +if __name__ == "__main__": + test_dans_methods() From 6be7b1a4bc72c06c940c36174368db620243d835 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Thu, 11 Jun 2026 14:48:50 +0200 Subject: [PATCH 09/16] updated schema --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index bb61aef..a4d72f8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -46,7 +46,7 @@ "url_3did": { "type": "string", "description": "URL of the 3did SQL dump archive.", - "default": "https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz", + "default": "https://3did.irbbarcelona.org/download/current/3did.sql.gz", "fa_icon": "fas fa-link" }, "url_uniprot_id_mapping": { From 77df26dab13636cf3810fcc774a509c06c729be4 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Thu, 11 Jun 2026 14:59:48 +0200 Subject: [PATCH 10/16] dont add nf-test to github --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 7e07892..c85aa62 100644 --- a/.gitignore +++ b/.gitignore @@ -208,6 +208,9 @@ other_scripts/* test-eval-env/* *.txt +# Tests +.nf-test/ + # Added by code-review-graph .code-review-graph/ # Claude From 64f915bd0610517b4fdee08b780bcfe13871628c Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Thu, 11 Jun 2026 15:14:06 +0200 Subject: [PATCH 11/16] marked required files as required, fixed json in test --- nextflow_schema.json | 9 +++------ subworkflows/local/collect_ddi_data/main.nf | 4 ++++ tests/python/test_insert_ppi_negative_selection.py | 11 ++++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a4d72f8..bfb39b3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -152,10 +152,9 @@ "fa_icon": "fas fa-hashtag" }, 
"negative_ppi_parquet": { - "type": ["string", "null"], + "type": ["string"], "format": "file-path", "description": "Required path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA derives negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting Pfam-pair candidates (degree-matched or by frequency) restricted to domains already in positive DDIs.", - "default": null, "fa_icon": "fas fa-file-import" }, "negative_ppi_min_n_tested": { @@ -185,10 +184,9 @@ "fa_icon": "fas fa-link" }, "hippie_tsv": { - "type": ["string", "null"], + "type": ["string"], "format": "file-path", "description": "Required path to a HIPPIE PPI TSV. COLLECT_DDI_DATA adds positive DDIs inferred from PPIs between two single-domain proteins.", - "default": null, "fa_icon": "fas fa-file-import" }, "hippie_min_score": { @@ -200,10 +198,9 @@ "fa_icon": "fas fa-filter" }, "ppidm_tsv": { - "type": ["string", "null"], + "type": ["string"], "format": "file-path", "description": "Required path to a PPIDM predictions TSV (columns: domain_1, domain_2, class). 
COLLECT_DDI_DATA adds positive DDIs tagged with source 'PPIDM_'.", - "default": null, "fa_icon": "fas fa-file-import" }, "ppidm_classes": { diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index d893eab..386d71b 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -44,6 +44,10 @@ workflow COLLECT_DDI_DATA { negative_ppi_parquet main: + if( !hippie_tsv || !ppidm_tsv || !negative_ppi_parquet ) { + log.error "Required inputs missing: hippie_tsv, ppidm_tsv, and negative_ppi_parquet must be provided" + exit 1 + } file_3did = file(url_3did) sqlite_3did = DOWNLOAD_3DID_SQLITE(file_3did).sqlite negatome_file = DOWNLOAD_NEGATOME(url_negatome).negatome diff --git a/tests/python/test_insert_ppi_negative_selection.py b/tests/python/test_insert_ppi_negative_selection.py index d3c8add..d39e10c 100644 --- a/tests/python/test_insert_ppi_negative_selection.py +++ b/tests/python/test_insert_ppi_negative_selection.py @@ -56,11 +56,12 @@ def count(conn, source): def write_score(path, method): - json.dump({ - "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1, - "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0, - "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0, - }, open(path, "w")) + with open(path, "w") as fh: + json.dump({ + "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1, + "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0, + "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0, + }, fh) def write_pairs(path, pairs): From bc681c57bff37d28118c49d463c504ca8a798767 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Fri, 12 Jun 2026 11:18:19 +0200 Subject: [PATCH 12/16] updated tests --- .github/actions/nf-test/action.yml | 8 + .gitignore | 4 + conf/test.config | 26 ++- modules/local/3did/main.nf | 7 + modules/local/analyze_ddi_bias/main.nf | 7 + modules/local/build_ppi_negative_pool/main.nf | 7 + .../extract_unique_domains/main.nf | 7 + 
.../enrich/insert_domain_go_terms/main.nf | 7 + .../insert_domain_protein_mapping/main.nf | 7 + modules/local/enrich/insert_ppi/main.nf | 7 + .../enrich/insert_protein_go_terms/main.nf | 7 + .../insert_proteins_with_embeddings/main.nf | 7 + modules/local/esm_embeddings/main.nf | 34 +++ .../local/external_validation_split/main.nf | 8 + modules/local/init_domainsplit_db/main.nf | 7 + modules/local/insert_3did/main.nf | 7 + modules/local/insert_negatome/main.nf | 7 + .../insert_ppi_negative_selection/main.nf | 7 + modules/local/insert_ppidm/main.nf | 7 + .../local/insert_single_domain_ppi/main.nf | 7 + modules/local/minimal_leakage_split/main.nf | 17 ++ modules/local/negatome/main.nf | 7 + modules/local/pfam/main.nf | 15 ++ modules/local/random_ddi_split/main.nf | 10 + .../local/remove_self_interactions/main.nf | 7 + .../local/select_ppi_negative_dans/main.nf | 7 + modules/local/smoke_filter/main.nf | 7 + modules/local/swissprot_map/main.nf | 7 + modules/local/util/main.nf | 14 ++ subworkflows/local/collect_ddi_data/main.nf | 19 ++ subworkflows/local/collect_ddi_data/meta.yml | 4 +- subworkflows/local/curate_domains/main.nf | 7 + .../local/enrich_ddi_database/main.nf | 9 + .../local/generate_embeddings/main.nf | 26 --- .../local/generate_embeddings/meta.yml | 29 --- .../local/split_domainsplit_database/main.nf | 14 ++ tests/bin/mmseqs | 11 + tests/data/3did.sql.gz | 0 tests/data/hippie.tsv | 0 tests/data/negative_ppi.parquet | 0 tests/data/negatome.txt | 0 tests/data/pfam2go.txt | 0 tests/data/ppidm.tsv | 0 tests/data/prott5.h5 | 0 tests/data/string.txt.gz | 0 tests/data/swissprot_pfam.tsv | 0 tests/data/uniprot_go_terms.tsv | 0 tests/data/uniprot_id_mapping.dat.gz | 0 tests/data/uniprot_sequences.fasta.gz | 0 tests/default.nf.test | 30 +-- tests/default.nf.test.snap | 194 ++++++++++++++++++ tests/nextflow.config | 9 + tests/python/test_insert_negatome.py | 82 ++++++++ tests/python/test_insert_ppidm.py | 141 +++++++++++++ workflows/domainsplit.nf | 33 ++- 55 files 
changed, 805 insertions(+), 79 deletions(-) delete mode 100644 subworkflows/local/generate_embeddings/main.nf delete mode 100644 subworkflows/local/generate_embeddings/meta.yml create mode 100755 tests/bin/mmseqs create mode 100644 tests/data/3did.sql.gz create mode 100644 tests/data/hippie.tsv create mode 100644 tests/data/negative_ppi.parquet create mode 100644 tests/data/negatome.txt create mode 100644 tests/data/pfam2go.txt create mode 100644 tests/data/ppidm.tsv create mode 100644 tests/data/prott5.h5 create mode 100644 tests/data/string.txt.gz create mode 100644 tests/data/swissprot_pfam.tsv create mode 100644 tests/data/uniprot_go_terms.tsv create mode 100644 tests/data/uniprot_id_mapping.dat.gz create mode 100644 tests/data/uniprot_sequences.fasta.gz create mode 100644 tests/default.nf.test.snap create mode 100644 tests/python/test_insert_negatome.py create mode 100644 tests/python/test_insert_ppidm.py diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml index ad686e8..ea59134 100644 --- a/.github/actions/nf-test/action.yml +++ b/.github/actions/nf-test/action.yml @@ -56,6 +56,14 @@ runs: channel-priority: strict conda-remove-defaults: true + - name: Set dummy Nextflow secrets for stub tests + shell: bash + run: | + # The pipeline stub test (tests/default.nf.test) exercises the ESM + # embedding processes, which declare `secret 'HF_TOKEN'`. Nextflow + # requires the secret to exist even under -stub, so register a dummy. 
+ nextflow secrets set HF_TOKEN "stub" || true + - name: Run nf-test shell: bash env: diff --git a/.gitignore b/.gitignore index c85aa62..d0ab513 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,7 @@ test-eval-env/* # Claude .claude/ .mcp.json + +# nf-test fixtures (override the broad ignores above) +!tests/data/ +!tests/data/** diff --git a/conf/test.config b/conf/test.config index 536b85e..7a6b2a8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,26 @@ process { params { config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_description = 'Minimal stub test dataset, fully offline (run with -stub)' - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + // Every input below is a tiny local placeholder under tests/data/. The + // pipeline test runs in stub mode (`-stub`), so processes never read these + // files; they only need to exist so Nextflow stages local paths instead of + // downloading the real multi-GB sources or hitting REST APIs. 
+ + // Required file params (no defaults in nextflow.config) + hippie_tsv = "${projectDir}/tests/data/hippie.tsv" + ppidm_tsv = "${projectDir}/tests/data/ppidm.tsv" + negative_ppi_parquet = "${projectDir}/tests/data/negative_ppi.parquet" + + // Source URLs -> local fixtures + url_3did = "${projectDir}/tests/data/3did.sql.gz" + url_negatome = "${projectDir}/tests/data/negatome.txt" + url_uniprot_swissprot_pfam = "${projectDir}/tests/data/swissprot_pfam.tsv" + url_uniprot_id_mapping = "${projectDir}/tests/data/uniprot_id_mapping.dat.gz" + url_uniprot_go_terms = "${projectDir}/tests/data/uniprot_go_terms.tsv" + url_uniprot_sequences = "${projectDir}/tests/data/uniprot_sequences.fasta.gz" + url_uniprot_prott5_embeddings = "${projectDir}/tests/data/prott5.h5" + url_string = "${projectDir}/tests/data/string.txt.gz" + url_pfam2go = "${projectDir}/tests/data/pfam2go.txt" } diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf index b9e3fc5..e483d7d 100644 --- a/modules/local/3did/main.nf +++ b/modules/local/3did/main.nf @@ -27,4 +27,11 @@ process DOWNLOAD_3DID_SQLITE { sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ + + stub: + """ + touch 3did.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf index 2ab9179..e0c6cc5 100644 --- a/modules/local/analyze_ddi_bias/main.nf +++ b/modules/local/analyze_ddi_bias/main.nf @@ -24,4 +24,11 @@ process ANALYZE_DDI_BIAS { matplotlib: \$(python3 -c 'import matplotlib; print(matplotlib.__version__)') END_VERSIONS """ + + stub: + """ + mkdir bias_analysis + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf index 3ab76ec..ec15f4c 100644 --- a/modules/local/build_ppi_negative_pool/main.nf +++ 
b/modules/local/build_ppi_negative_pool/main.nf @@ -37,4 +37,11 @@ process BUILD_PPI_NEGATIVE_POOL { sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ + + stub: + """ + touch domainsplit.sqlite3 neg_pool.npz uniprot_pfam_mapping.json + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf index 86edded..e59d20d 100644 --- a/modules/local/curate_domains/extract_unique_domains/main.nf +++ b/modules/local/curate_domains/extract_unique_domains/main.nf @@ -32,4 +32,11 @@ process EXTRACT_UNIQUE_DOMAINS { sqlite3: \$(sqlite3 --version | awk '{print \$1}') END_VERSIONS """ + + stub: + """ + echo PF00001 > pfam_ids.txt + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf index 2eb9437..3e6409a 100644 --- a/modules/local/enrich/insert_domain_go_terms/main.nf +++ b/modules/local/enrich/insert_domain_go_terms/main.nf @@ -20,4 +20,11 @@ process INSERT_DOMAIN_GO_TERMS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf index 7f7d00c..9fb4b44 100644 --- a/modules/local/enrich/insert_domain_protein_mapping/main.nf +++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf @@ -22,4 +22,11 @@ process INSERT_DOMAIN_PROTEIN_MAPPING { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git 
a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf index 9a27846..afd5f32 100644 --- a/modules/local/enrich/insert_ppi/main.nf +++ b/modules/local/enrich/insert_ppi/main.nf @@ -22,4 +22,11 @@ process INSERT_PPI { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf index 3148d9c..64110a0 100644 --- a/modules/local/enrich/insert_protein_go_terms/main.nf +++ b/modules/local/enrich/insert_protein_go_terms/main.nf @@ -20,4 +20,11 @@ process INSERT_PROTEIN_GO_TERMS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf index 79201a8..096318c 100644 --- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf +++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf @@ -27,4 +27,11 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf index 7f65c06..a96bd5d 100644 --- a/modules/local/esm_embeddings/main.nf +++ b/modules/local/esm_embeddings/main.nf @@ -66,6 +66,15 @@ process FILTER_SEQUENCES { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" biopython: {Bio.__version__}\\n") """ + + stub: + protein_meta = [id: "protein_sequences"] + domain_meta = [id: "domain_sequences"] + """ + touch 
uniprot_filtered.fasta.gz domain_sequences.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // Per-residue protein embeddings. One task per FASTA shard. @@ -108,6 +117,13 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK { --max-len ${params.esm_max_len} \\ --smoke-limit ${smoke} """ + + stub: + """ + touch ${input_fasta.simpleName}.esm.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // GPU-pooled domain embeddings. One task per FASTA shard. @@ -150,6 +166,13 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK { --max-len ${params.esm_max_len} \\ --smoke-limit ${smoke} """ + + stub: + """ + touch ${input_fasta.simpleName}.esm.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } workflow generate_esm_embeddings { @@ -169,7 +192,18 @@ workflow generate_esm_embeddings { protein_embeddings = JOIN_PROTEIN_EMBEDDINGS('esm_protein_embeddings', protein_chunks.chunk.collect()).joined domain_embeddings = JOIN_DOMAIN_EMBEDDINGS('esm_domain_embeddings', domain_chunks.chunk.collect() ).joined + ch_versions = Channel.empty().mix( + FILTER_SEQUENCES.out.versions, + SHARD_PROTEIN_FASTA.out.versions, + SHARD_DOMAIN_FASTA.out.versions, + GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK.out.versions, + GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK.out.versions, + JOIN_PROTEIN_EMBEDDINGS.out.versions, + JOIN_DOMAIN_EMBEDDINGS.out.versions, + ) + emit: protein_embeddings domain_embeddings + versions = ch_versions } diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf index e7fc98d..5760486 100644 --- a/modules/local/external_validation_split/main.nf +++ b/modules/local/external_validation_split/main.nf @@ -101,4 +101,12 @@ process SUBSET_DDIS_BY_SOURCE { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + output_split_info = [["${split_name}.sqlite3", 
split_name]] + """ + touch ${split_name}.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf index 5f70b1b..41dab03 100644 --- a/modules/local/init_domainsplit_db/main.nf +++ b/modules/local/init_domainsplit_db/main.nf @@ -83,4 +83,11 @@ process INIT_DOMAINSPLIT_DB { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf index c2485a2..021939a 100644 --- a/modules/local/insert_3did/main.nf +++ b/modules/local/insert_3did/main.nf @@ -22,4 +22,11 @@ process INSERT_3DID { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf index 454982d..429b495 100644 --- a/modules/local/insert_negatome/main.nf +++ b/modules/local/insert_negatome/main.nf @@ -22,4 +22,11 @@ process INSERT_NEGATOME { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf index 652e64b..b0380e8 100644 --- a/modules/local/insert_ppi_negative_selection/main.nf +++ b/modules/local/insert_ppi_negative_selection/main.nf @@ -37,4 +37,11 @@ process INSERT_PPI_NEGATIVE_SELECTION { sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)') END_VERSIONS """ + + stub: + """ + touch domainsplit.sqlite3 
negative_ppi_method_scores.tsv + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf index 749b705..553e79d 100644 --- a/modules/local/insert_ppidm/main.nf +++ b/modules/local/insert_ppidm/main.nf @@ -24,4 +24,11 @@ process INSERT_PPIDM { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf index 3a14070..57b30df 100644 --- a/modules/local/insert_single_domain_ppi/main.nf +++ b/modules/local/insert_single_domain_ppi/main.nf @@ -26,4 +26,11 @@ process INSERT_SINGLE_DOMAIN_PPI { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf index af546cb..a548b12 100644 --- a/modules/local/minimal_leakage_split/main.nf +++ b/modules/local/minimal_leakage_split/main.nf @@ -29,6 +29,13 @@ process EXTRACT_DOMAIN_SEQUENCES { sqlite3: \$(sqlite3 --version | awk '{print \$1}') END_VERSIONS """ + + stub: + """ + touch domain_sequences.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } process MINIMAL_LEAKAGE_SPLIT_DOMAIN { @@ -397,4 +404,14 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { f.write(f" python: {_sys.version.split()[0]}\\n") f.write(f" numpy: {np.__version__}\\n") """ + + stub: + output_split_info = [] + split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] } + def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n ") + """ + ${touch_cmds} + echo '"${task.process}":' > 
versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf index fae1152..79b8485 100644 --- a/modules/local/negatome/main.nf +++ b/modules/local/negatome/main.nf @@ -53,4 +53,11 @@ process DOWNLOAD_NEGATOME { f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + """ + touch combined_pfam.txt + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf index b5507f4..f853c2e 100644 --- a/modules/local/pfam/main.nf +++ b/modules/local/pfam/main.nf @@ -75,6 +75,13 @@ with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + """ + touch PF00001.alignment.full.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } process CREATE_PROTEIN_DOMAIN_MAPPING { @@ -169,4 +176,12 @@ with open("versions.yml", "w") as f: f.write('"${task.process}":\\n') f.write(f" python: {sys.version.split()[0]}\\n") """ + + stub: + out_path = 'protein_domain_mapping.csv.gz' + """ + touch ${out_path} + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf index 1d20ff4..79fee98 100644 --- a/modules/local/random_ddi_split/main.nf +++ b/modules/local/random_ddi_split/main.nf @@ -126,4 +126,14 @@ process RANDOM_DDI_SPLIT { f.write('"${task.process}":\\n') f.write(f" python: {_sys.version.split()[0]}\\n") """ + + stub: + output_split_info = [] + split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] } + def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n ") + """ + ${touch_cmds} + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git 
a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf index c87e5a8..668c375 100644 --- a/modules/local/remove_self_interactions/main.nf +++ b/modules/local/remove_self_interactions/main.nf @@ -37,4 +37,11 @@ process REMOVE_SELF_INTERACTIONS { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + """ + touch domainsplit.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf index fd1a631..cbd009a 100644 --- a/modules/local/select_ppi_negative_dans/main.nf +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -29,4 +29,11 @@ process SELECT_PPI_NEGATIVE_DANS { numpy: \$(python3 -c 'import numpy; print(numpy.__version__)') END_VERSIONS """ + + stub: + """ + touch score_${method}.json pairs_${method}.tsv + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/smoke_filter/main.nf b/modules/local/smoke_filter/main.nf index 3f60a3b..e884cb9 100644 --- a/modules/local/smoke_filter/main.nf +++ b/modules/local/smoke_filter/main.nf @@ -88,4 +88,11 @@ process SMOKE_FILTER { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" sqlite3: {sqlite3.sqlite_version}\\n") """ + + stub: + """ + touch domainsplit.smoke.sqlite3 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf index e28b2e2..3ad8330 100644 --- a/modules/local/swissprot_map/main.nf +++ b/modules/local/swissprot_map/main.nf @@ -19,4 +19,11 @@ process BUILD_SWISSPROT_PFAM_MAP { --versions versions.yml \\ --process-name "${task.process}" """ + + stub: + """ + touch swissprot_pfam_map.json + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } 
diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf index 2d4d80c..4be930d 100644 --- a/modules/local/util/main.nf +++ b/modules/local/util/main.nf @@ -39,6 +39,13 @@ process SHARD_FASTA { print(f" biopython: {Bio.__version__}") PY """ + + stub: + """ + touch ${meta.id}_shard_0.fasta.gz + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } // Merge a collection of HDF5 chunks (plain or gzipped) into one HDF5 file. @@ -89,4 +96,11 @@ process JOIN_HDF_FILES { f.write(f" python: {sys.version.split()[0]}\\n") f.write(f" h5py: {h5py.__version__}\\n") """ + + stub: + """ + touch ${output_name}.h5 + echo '"${task.process}":' > versions.yml + echo ' stub: "true"' >> versions.yml + """ } diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf index 386d71b..fe21fea 100644 --- a/subworkflows/local/collect_ddi_data/main.nf +++ b/subworkflows/local/collect_ddi_data/main.nf @@ -44,6 +44,8 @@ workflow COLLECT_DDI_DATA { negative_ppi_parquet main: + ch_versions = Channel.empty() + if( !hippie_tsv || !ppidm_tsv || !negative_ppi_parquet ) { log.error "Required inputs missing: hippie_tsv, ppidm_tsv, and negative_ppi_parquet must be provided" exit 1 @@ -77,6 +79,7 @@ workflow COLLECT_DDI_DATA { // 6. optional removal of all self-interactions if (!params.self_interaction) { domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db + ch_versions = ch_versions.mix(REMOVE_SELF_INTERACTIONS.out.versions) } // 7. high-confidence non-PPI negatives via uncapped DANS (Cappelletti et al. 
@@ -113,9 +116,25 @@ workflow COLLECT_DDI_DATA { if (params.smoke_test_n_ddis != null) { domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db + ch_versions = ch_versions.mix(SMOKE_FILTER.out.versions) } + ch_versions = ch_versions.mix( + DOWNLOAD_3DID_SQLITE.out.versions, + DOWNLOAD_NEGATOME.out.versions, + INSERT_3DID.out.versions, + BUILD_SWISSPROT_PFAM_MAP.out.versions, + INSERT_SINGLE_DOMAIN_PPI.out.versions, + INSERT_PPIDM.out.versions, + INSERT_NEGATOME.out.versions, + BUILD_PPI_NEGATIVE_POOL.out.versions, + SELECT_DELETION.out.versions, + SELECT_RANDOM_ADDITION.out.versions, + INSERT_PPI_NEGATIVE_SELECTION.out.versions, + ) + emit: domainsplit_db pfam_mapping + versions = ch_versions } diff --git a/subworkflows/local/collect_ddi_data/meta.yml b/subworkflows/local/collect_ddi_data/meta.yml index 750ebd0..a158369 100644 --- a/subworkflows/local/collect_ddi_data/meta.yml +++ b/subworkflows/local/collect_ddi_data/meta.yml @@ -16,7 +16,9 @@ components: - insert/ppidm - insert/negatome - remove/self/interactions - - insert/ppi/negative/ddis + - build/ppi/negative/pool + - select/ppi/negative/dans + - insert/ppi/negative/selection - smoke/filter input: - domainsplit_db_in: diff --git a/subworkflows/local/curate_domains/main.nf b/subworkflows/local/curate_domains/main.nf index a97b28e..91ee944 100644 --- a/subworkflows/local/curate_domains/main.nf +++ b/subworkflows/local/curate_domains/main.nf @@ -28,6 +28,13 @@ workflow CURATE_DOMAINS { pfam_files.collect() ).mapping + ch_versions = Channel.empty().mix( + EXTRACT_UNIQUE_DOMAINS.out.versions, + DOWNLOAD_PFAM_ALIGNMENTS_BATCH.out.versions, + CREATE_PROTEIN_DOMAIN_MAPPING.out.versions, + ) + emit: protein_domain_map + versions = ch_versions } diff --git a/subworkflows/local/enrich_ddi_database/main.nf b/subworkflows/local/enrich_ddi_database/main.nf index 56df49b..bddb71e 100644 --- a/subworkflows/local/enrich_ddi_database/main.nf +++ b/subworkflows/local/enrich_ddi_database/main.nf 
@@ -60,6 +60,15 @@ workflow ENRICH_DDI_DATABASE { esm_domain_embeddings ).domainsplit_db + ch_versions = Channel.empty().mix( + INSERT_DOMAIN_GO_TERMS.out.versions, + INSERT_PROTEINS_WITH_EMBEDDINGS.out.versions, + INSERT_PROTEIN_GO_TERMS.out.versions, + INSERT_PPI.out.versions, + INSERT_DOMAIN_PROTEIN_MAPPING.out.versions, + ) + emit: domainsplit_db + versions = ch_versions } diff --git a/subworkflows/local/generate_embeddings/main.nf b/subworkflows/local/generate_embeddings/main.nf deleted file mode 100644 index c043aec..0000000 --- a/subworkflows/local/generate_embeddings/main.nf +++ /dev/null @@ -1,26 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENERATE_EMBEDDINGS -- run protein-level ESM (protein + domain) - embedding generation. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ESM path produces both per-residue protein embeddings and pooled domain - embeddings against the supplied protein <-> domain map. - - ProtT5 embeddings are supplied externally via params.url_uniprot_prott5_embeddings - and resolved in the top-level workflow (domainsplit.nf). 
-----------------------------------------------------------------------------*/ - -include { generate_esm_embeddings } from '../../../modules/local/esm_embeddings/main.nf' - -workflow GENERATE_EMBEDDINGS { - take: - protein_domain_map - input_uniprot_sequences - - main: - generate_esm_embeddings(input_uniprot_sequences, protein_domain_map) - - emit: - esm_protein_embeddings = generate_esm_embeddings.out.protein_embeddings - esm_domain_embeddings = generate_esm_embeddings.out.domain_embeddings -} diff --git a/subworkflows/local/generate_embeddings/meta.yml b/subworkflows/local/generate_embeddings/meta.yml deleted file mode 100644 index dfd0981..0000000 --- a/subworkflows/local/generate_embeddings/meta.yml +++ /dev/null @@ -1,29 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "generate_embeddings" -description: Generate ESM per-residue protein embeddings and pooled per-domain embeddings against the supplied protein-to-domain map. -keywords: - - esm - - embeddings - - protein - - domain -components: - - generate/esm/embeddings -input: - - protein_domain_map: - type: file - description: Protein-to-Pfam-domain mapping used to pool per-domain embeddings. - - input_uniprot_sequences: - type: file - description: UniProt protein sequences to embed. - pattern: "*.{fasta,fa,gz}" -output: - - esm_protein_embeddings: - type: file - description: ESM per-residue protein embeddings. - - esm_domain_embeddings: - type: file - description: ESM pooled per-domain embeddings. 
-authors: - - "@KonstantinPelz" -maintainers: - - "@KonstantinPelz" diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf index 1427c03..4536cca 100644 --- a/subworkflows/local/split_domainsplit_database/main.nf +++ b/subworkflows/local/split_domainsplit_database/main.nf @@ -119,6 +119,20 @@ workflow SPLIT_DOMAINSPLIT_DATABASE { map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_random_addition") ) + // NB: MMSEQS_EASYCLUSTER (nf-core) reports its version via the `versions` + // channel topic, not an `emit: versions` output, so it is not mixed here. + ch_versions = Channel.empty().mix( + EXTRACT_DOMAIN_SEQUENCES.out.versions, + RANDOM_DDI_SPLIT_DEL.out.versions, + RANDOM_DDI_SPLIT_RAND.out.versions, + MLS_DOMAIN_DEL.out.versions, + MLS_DOMAIN_RAND.out.versions, + MLS_TRAINVAL_DEL.out.versions, + MLS_TRAINVAL_RAND.out.versions, + SUBSET_DDIS_BY_SOURCE.out.versions, + ) + emit: split_db = split_ch + versions = ch_versions } diff --git a/tests/bin/mmseqs b/tests/bin/mmseqs new file mode 100755 index 0000000..a99de53 --- /dev/null +++ b/tests/bin/mmseqs @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Test-only stub of the `mmseqs` binary. +# +# The nf-test pipeline test (tests/default.nf.test) runs with `-stub` and the +# `test` profile, which enables no container engine — every process runs on the +# host. The nf-core MMSEQS_EASYCLUSTER module captures its version via an +# `eval('mmseqs version')` output directive, which Nextflow executes even in +# stub mode. Without mmseqs installed that fails with exit 127, so this shim +# provides a deterministic fake version. It is only ever on PATH for test runs +# (added via env.PATH in tests/nextflow.config); real runs use the container. 
+echo "stub" diff --git a/tests/data/3did.sql.gz b/tests/data/3did.sql.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/hippie.tsv b/tests/data/hippie.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/negative_ppi.parquet b/tests/data/negative_ppi.parquet new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/negatome.txt b/tests/data/negatome.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/pfam2go.txt b/tests/data/pfam2go.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/ppidm.tsv b/tests/data/ppidm.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/prott5.h5 b/tests/data/prott5.h5 new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/string.txt.gz b/tests/data/string.txt.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/swissprot_pfam.tsv b/tests/data/swissprot_pfam.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_go_terms.tsv b/tests/data/uniprot_go_terms.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_id_mapping.dat.gz b/tests/data/uniprot_id_mapping.dat.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/uniprot_sequences.fasta.gz b/tests/data/uniprot_sequences.fasta.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/default.nf.test b/tests/default.nf.test index 86ea2e3..dde4cdc 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -6,6 +6,8 @@ nextflow_pipeline { test("-profile test") { + options "-stub" + when { params { outdir = "$outputDir" @@ -13,20 +15,22 @@ nextflow_pipeline { } then { - // stable_path: All files + folders in ${params.outdir}/ with a stable path (including file name) - def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - // stable_content: All files in ${params.outdir}/ with stable content - def 
stable_content = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') - assert workflow.success + // This is a `-stub` wiring test: every process runs its stub block and + // emits empty placeholder files, so file *content* is meaningless (and + // empty .gz/.h5 stubs break content hashing). We therefore snapshot only + // the set of produced output paths -- this verifies the whole DAG wires + // together (channel topology, the split fan-out, publish paths) end to + // end without any downloads, GPU, or containers. Only the timestamped + // pipeline_info/ reports (execution_*, pipeline_dag_*) are ignored. + def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/execution_*', 'pipeline_info/pipeline_dag_*']) + // Snapshot the collated software versions with the Nextflow version line + // stripped (so the assertion survives Nextflow upgrades). This also + // satisfies the nf-core `nf_test_content` lint rule, which requires a + // `versions.yml` to be snapshotted by every `*.nf.test`.
+ def versions_yml = removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml") assertAll( - { assert snapshot( - // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions - removeNextflowVersion("$outputDir/pipeline_info/domainsplit_software_mqc_versions.yml"), - // All stable path name, with a relative path - stable_path, - // All files with stable contents - stable_content - ).match() } + { assert workflow.success }, + { assert snapshot(stable_path, versions_yml).match() } ) } } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..483c015 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,194 @@ +{ + "-profile test": { + "content": [ + [ + "analyze", + "analyze/bias_analysis", + "create", + "create/protein_domain_mapping.csv.gz", + "databases", + "databases/external_validation_deletion", + "databases/external_validation_deletion/test.sqlite3", + "databases/external_validation_deletion/train.sqlite3", + "databases/external_validation_deletion/validation.sqlite3", + "databases/external_validation_random_addition", + "databases/external_validation_random_addition/test.sqlite3", + "databases/external_validation_random_addition/train.sqlite3", + "databases/external_validation_random_addition/validation.sqlite3", + "databases/minimal_leakage_domain_deletion", + "databases/minimal_leakage_domain_deletion/optimization.sqlite3", + "databases/minimal_leakage_domain_deletion/test.sqlite3", + "databases/minimal_leakage_domain_deletion/train.sqlite3", + "databases/minimal_leakage_domain_random_addition", + "databases/minimal_leakage_domain_random_addition/optimization.sqlite3", + "databases/minimal_leakage_domain_random_addition/test.sqlite3", + "databases/minimal_leakage_domain_random_addition/train.sqlite3", + "databases/random_ddi_deletion", + "databases/random_ddi_deletion/optimization.sqlite3", + 
"databases/random_ddi_deletion/test.sqlite3", + "databases/random_ddi_deletion/train.sqlite3", + "databases/random_ddi_random_addition", + "databases/random_ddi_random_addition/optimization.sqlite3", + "databases/random_ddi_random_addition/test.sqlite3", + "databases/random_ddi_random_addition/train.sqlite3", + "domainsplit.sqlite3", + "download", + "download/3did.sqlite3", + "download/PF00001.alignment.full.gz", + "download/combined_pfam.txt", + "extract", + "extract/domain_sequences.fasta.gz", + "filter", + "filter/domain_sequences.fasta.gz", + "filter/uniprot_filtered.fasta.gz", + "generate", + "generate/domain_sequences_shard_0.esm.h5", + "generate/protein_sequences_shard_0.esm.h5", + "join", + "join/esm_domain_embeddings.h5", + "join/esm_protein_embeddings.h5", + "mls", + "mls/optimization.sqlite3", + "mls/test.sqlite3", + "mls/train.sqlite3", + "mls/validation.sqlite3", + "mmseqs", + "mmseqs/domain.tsv", + "mmseqs/domain_all_seqs.fasta", + "mmseqs/domain_rep_seq.fasta", + "negative_ppi", + "negative_ppi/negative_ppi_method_scores.tsv", + "pipeline_info", + "pipeline_info/nf_core_pipeline_software_mqc_versions.yml", + "pipeline_info/params_2026-06-12_11-10-10.json", + "random", + "random/optimization.sqlite3", + "random/test.sqlite3", + "random/train.sqlite3", + "shard", + "shard/domain_sequences_shard_0.fasta.gz", + "shard/protein_sequences_shard_0.fasta.gz", + "subset", + "subset/test.sqlite3" + ], + { + "ANALYZE_DDI_BIAS": { + "stub": "true" + }, + "BUILD_PPI_NEGATIVE_POOL": { + "stub": "true" + }, + "BUILD_SWISSPROT_PFAM_MAP": { + "stub": "true" + }, + "CREATE_PROTEIN_DOMAIN_MAPPING": { + "stub": "true" + }, + "DOWNLOAD_3DID_SQLITE": { + "stub": "true" + }, + "DOWNLOAD_NEGATOME": { + "stub": "true" + }, + "DOWNLOAD_PFAM_ALIGNMENTS_BATCH": { + "stub": "true" + }, + "EXTRACT_DOMAIN_SEQUENCES": { + "stub": "true" + }, + "EXTRACT_UNIQUE_DOMAINS": { + "stub": "true" + }, + "FILTER_SEQUENCES": { + "stub": "true" + }, + "GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK": { + 
"stub": "true" + }, + "GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK": { + "stub": "true" + }, + "INIT_DOMAINSPLIT_DB": { + "stub": "true" + }, + "INSERT_3DID": { + "stub": "true" + }, + "INSERT_DOMAIN_GO_TERMS": { + "stub": "true" + }, + "INSERT_DOMAIN_PROTEIN_MAPPING": { + "stub": "true" + }, + "INSERT_NEGATOME": { + "stub": "true" + }, + "INSERT_PPI": { + "stub": "true" + }, + "INSERT_PPIDM": { + "stub": "true" + }, + "INSERT_PPI_NEGATIVE_SELECTION": { + "stub": "true" + }, + "INSERT_PROTEINS_WITH_EMBEDDINGS": { + "stub": "true" + }, + "INSERT_PROTEIN_GO_TERMS": { + "stub": "true" + }, + "INSERT_SINGLE_DOMAIN_PPI": { + "stub": "true" + }, + "JOIN_DOMAIN_EMBEDDINGS": { + "stub": "true" + }, + "JOIN_PROTEIN_EMBEDDINGS": { + "stub": "true" + }, + "MLS_DOMAIN_DEL": { + "stub": "true" + }, + "MLS_DOMAIN_RAND": { + "stub": "true" + }, + "MLS_TRAINVAL_DEL": { + "stub": "true" + }, + "MLS_TRAINVAL_RAND": { + "stub": "true" + }, + "RANDOM_DDI_SPLIT_DEL": { + "stub": "true" + }, + "RANDOM_DDI_SPLIT_RAND": { + "stub": "true" + }, + "SELECT_DELETION": { + "stub": "true" + }, + "SELECT_RANDOM_ADDITION": { + "stub": "true" + }, + "SHARD_DOMAIN_FASTA": { + "stub": "true" + }, + "SHARD_PROTEIN_FASTA": { + "stub": "true" + }, + "SUBSET_DDIS_BY_SOURCE": { + "stub": "true" + }, + "Workflow": { + "daisybio/domainsplit": "v1.0.0dev" + } + } + ], + "timestamp": "2026-06-12T11:10:24.222393192", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index 12b3258..ab87256 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -12,3 +12,12 @@ params { } aws.client.anonymous = true // fixes S3 access issues on self-hosted runners + +// The pipeline test (tests/default.nf.test) runs with `-stub` and no container +// engine, so every process executes on the host. 
The nf-core MMSEQS_EASYCLUSTER +// module captures its version with an `eval('mmseqs version')` output that runs +// even under -stub; prepend a test-only shim dir so that resolves without the +// real binary. Only applied to nf-test runs (this config is test-only). +env { + PATH = "${projectDir}/tests/bin:\$PATH" +} diff --git a/tests/python/test_insert_negatome.py b/tests/python/test_insert_negatome.py new file mode 100644 index 0000000..9691d7d --- /dev/null +++ b/tests/python/test_insert_negatome.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/insert_negatome.py (no Nextflow, no cluster). + +Builds a tiny empty Domainsplit SQLite and runs the Negatome inserter against a +small synthetic ``combined_pfam.txt``, asserting: + + * each whitespace-separated Pfam pair is stored as ``negative=1, + source='negatome'`` with its domains auto-created; + * lines without at least two tokens (blank / single-token) are skipped. + +Run directly (`python3 tests/python/test_insert_negatome.py`) or via pytest. 
+""" + +import os +import sqlite3 +import subprocess +import sys +import tempfile + +REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BIN = os.path.join(REPO, "bin") +INSERTER = os.path.join(BIN, "insert_negatome.py") + +SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + +NEGATOME_LINES = [ + "PF00001 PF00002", # kept + "PF00003\tPF00004", # kept (tab separated) + "PF00005 PF00006", # kept + "PF00007", # single token -> skipped + "", # blank -> skipped +] + + +def test_insert_negatome(): + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + negatome = os.path.join(tmp, "combined_pfam.txt") + with open(negatome, "w") as fh: + fh.write("\n".join(NEGATOME_LINES) + "\n") + + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--negatome", negatome, + "--versions", os.path.join(tmp, "versions.yml"), + "--process-name", "TEST:INSERT_NEGATOME"], + check=True, env=env, + ) + + conn = sqlite3.connect(db) + total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + assert total == 3, f"expected 3 negatome DDIs, got {total}" + + rows = conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction " + "WHERE source = 'negatome' AND negative != 0" + ).fetchone()[0] + assert rows == 3, "all negatome rows must be negative with source 'negatome'" + + n_domains = conn.execute("SELECT COUNT(*) FROM domain").fetchone()[0] + assert n_domains == 6, 
f"expected 6 auto-created domains, got {n_domains}" + conn.close() + + +if __name__ == "__main__": + test_insert_negatome() + print("OK: insert_negatome invariants hold") diff --git a/tests/python/test_insert_ppidm.py b/tests/python/test_insert_ppidm.py new file mode 100644 index 0000000..a296053 --- /dev/null +++ b/tests/python/test_insert_ppidm.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Local unit-check for bin/insert_ppidm.py (no Nextflow, no cluster). + +Builds a tiny empty Domainsplit SQLite and runs the PPIDM inserter against a +small synthetic ``predicted_ddi_ppi.tsv``, asserting: + + * domain tokens like ``10114/PF00069`` are parsed down to the Pfam accession; + * each kept row is stored as ``negative=0, source='PPIDM_<class>'``; + * classes are processed Gold -> Silver -> Bronze, so a pair appearing under + two classes is kept only under the highest-confidence one (cross-source + dedup in insert_ddis); + * unparseable tokens are skipped, and ``--classes`` filters which classes are + inserted at all. + +Run directly (`python3 tests/python/test_insert_ppidm.py`) or via pytest.
+SCHEMA = """ +CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id)); +CREATE TABLE domain_domain_interaction ( + id INTEGER PRIMARY KEY, + domain_id_a, domain_id_b, negative, + source VARCHAR(255), + FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE, + FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE, + UNIQUE(domain_id_a, domain_id_b, source) +); +""" + + +def count(conn, source): + return conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?", + (source,), + ).fetchone()[0] + + +def run_inserter(db, ppidm, classes, tmp): + env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", "")) + subprocess.run( + [sys.executable, INSERTER, "--db", db, "--ppidm", ppidm, + "--classes", classes, + "--versions", os.path.join(tmp, "versions.yml"), + "--process-name", "TEST:INSERT_PPIDM"], + check=True, env=env, + ) + + +# Tokens carry a leading numeric id before the slash, as in real PPIDM output. +PPIDM_ROWS = [ + "domain_1\tdomain_2\tclass", # header (skipped) + "10/PF00001\t20/PF00002\tGold", # kept -> PPIDM_Gold + "30/PF00003\t40/PF00004\tSilver", # kept -> PPIDM_Silver + "50/PF00005\t60/PF00006\tBronze", # kept -> PPIDM_Bronze + "10/PF00001\t20/PF00002\tSilver", # duplicate pair, lower class -> dropped + "junk\tnonsense\tGold", # unparseable -> skipped +] + + +def write_ppidm(path, rows): + with open(path, "w") as fh: + fh.write("\n".join(rows) + "\n") + + +def test_insert_ppidm_all_classes(): + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv") + write_ppidm(ppidm, PPIDM_ROWS) + + run_inserter(db, ppidm, "Bronze,Silver,Gold", tmp) + + conn = sqlite3.connect(db) + # One pair per class; the duplicate (PF00001, PF00002) is kept only under + # Gold (processed first) and dropped for Silver via 
cross-source dedup. + assert count(conn, "PPIDM_Gold") == 1, "Gold count wrong" + assert count(conn, "PPIDM_Silver") == 1, "Silver count wrong (dedup failed?)" + assert count(conn, "PPIDM_Bronze") == 1, "Bronze count wrong" + + # All kept rows are positives stored under a PPIDM_* source only. + total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0] + assert total == 3, f"expected 3 DDIs total, got {total}" + neg = conn.execute( + "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative != 0" + ).fetchone()[0] + assert neg == 0, "PPIDM rows must be positives" + + # The duplicate pair exists only under Gold, not Silver. + n_sources = conn.execute( + "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi " + "JOIN domain da ON da.id = ddi.domain_id_a " + "JOIN domain db ON db.id = ddi.domain_id_b " + "WHERE da.pfam_id = ? AND db.pfam_id = ?", + ("PF00001", "PF00002"), + ).fetchone()[0] + assert n_sources == 1, f"(PF00001,PF00002) should be under 1 source, got {n_sources}" + conn.close() + + +def test_insert_ppidm_class_filter(): + """--classes restricts which classes are inserted at all.""" + with tempfile.TemporaryDirectory() as tmp: + db = os.path.join(tmp, "domainsplit.sqlite3") + conn = sqlite3.connect(db) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + + ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv") + write_ppidm(ppidm, PPIDM_ROWS) + + run_inserter(db, ppidm, "Gold", tmp) + + conn = sqlite3.connect(db) + assert count(conn, "PPIDM_Gold") == 1 + assert count(conn, "PPIDM_Silver") == 0, "Silver should be excluded" + assert count(conn, "PPIDM_Bronze") == 0, "Bronze should be excluded" + conn.close() + + +if __name__ == "__main__": + test_insert_ppidm_all_classes() + test_insert_ppidm_class_filter() + print("OK: insert_ppidm class handling + dedup invariants hold") diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf index 7f5fe85..4474982 100644 --- a/workflows/domainsplit.nf +++ 
b/workflows/domainsplit.nf @@ -9,7 +9,7 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore include { INIT_DOMAINSPLIT_DB } from '../modules/local/init_domainsplit_db/main.nf' include { COLLECT_DDI_DATA } from '../subworkflows/local/collect_ddi_data/main.nf' include { CURATE_DOMAINS } from '../subworkflows/local/curate_domains/main.nf' -include { GENERATE_EMBEDDINGS } from '../subworkflows/local/generate_embeddings/main.nf' +include { generate_esm_embeddings } from '../modules/local/esm_embeddings/main.nf' include { ENRICH_DDI_DATABASE } from '../subworkflows/local/enrich_ddi_database/main.nf' include { SPLIT_DOMAINSPLIT_DATABASE } from '../subworkflows/local/split_domainsplit_database/main.nf' include { ANALYZE_DDI_BIAS } from '../modules/local/analyze_ddi_bias/main.nf' @@ -22,6 +22,8 @@ include { ANALYZE_DDI_BIAS } from '../modules/local/analyze_ddi_bias/ workflow DOMAINSPLIT { main: + ch_versions = Channel.empty() + input_uniprot_id_mapping = file(params.url_uniprot_id_mapping) input_uniprot_go_terms = file(params.url_uniprot_go_terms) input_uniprot_sequences = file(params.url_uniprot_sequences) @@ -51,9 +53,9 @@ main: protein_domain_map = CURATE_DOMAINS.out.protein_domain_map - GENERATE_EMBEDDINGS( - protein_domain_map, + generate_esm_embeddings( input_uniprot_sequences, + protein_domain_map, ) ENRICH_DDI_DATABASE( @@ -65,8 +67,8 @@ main: input_uniprot_go_terms, input_string, input_uniprot_id_mapping, - GENERATE_EMBEDDINGS.out.esm_protein_embeddings, - GENERATE_EMBEDDINGS.out.esm_domain_embeddings, + generate_esm_embeddings.out.protein_embeddings, + generate_esm_embeddings.out.domain_embeddings, ) ANALYZE_DDI_BIAS( @@ -77,6 +79,27 @@ main: ENRICH_DDI_DATABASE.out.domainsplit_db ) + // + // Collate and save software versions + // + ch_versions = ch_versions.mix( + INIT_DOMAINSPLIT_DB.out.versions, + COLLECT_DDI_DATA.out.versions, + CURATE_DOMAINS.out.versions, + generate_esm_embeddings.out.versions, + ENRICH_DDI_DATABASE.out.versions, 
+ ANALYZE_DDI_BIAS.out.versions, + SPLIT_DOMAINSPLIT_DATABASE.out.versions, + ) + + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${params.outdir}/pipeline_info", + name: 'nf_core_' + 'pipeline_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true, + ) + emit: domainsplit_db = ENRICH_DDI_DATABASE.out.domainsplit_db split_db = SPLIT_DOMAINSPLIT_DATABASE.out.split_db From 3450738ecf05b783164fabb700495829e22512d0 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Fri, 12 Jun 2026 12:24:44 +0200 Subject: [PATCH 13/16] removed docker:// prefix for containers --- modules/local/3did/main.nf | 2 +- modules/local/analyze_ddi_bias/main.nf | 2 +- modules/local/build_ppi_negative_pool/main.nf | 2 +- modules/local/curate_domains/extract_unique_domains/main.nf | 2 +- modules/local/enrich/insert_domain_go_terms/main.nf | 2 +- modules/local/enrich/insert_domain_protein_mapping/main.nf | 2 +- modules/local/enrich/insert_ppi/main.nf | 2 +- modules/local/enrich/insert_protein_go_terms/main.nf | 2 +- .../local/enrich/insert_proteins_with_embeddings/main.nf | 2 +- modules/local/esm_embeddings/main.nf | 6 +++--- modules/local/external_validation_split/main.nf | 2 +- modules/local/init_domainsplit_db/main.nf | 2 +- modules/local/insert_3did/main.nf | 2 +- modules/local/insert_negatome/main.nf | 2 +- modules/local/insert_ppi_negative_selection/main.nf | 2 +- modules/local/insert_ppidm/main.nf | 2 +- modules/local/insert_single_domain_ppi/main.nf | 2 +- modules/local/minimal_leakage_split/main.nf | 4 ++-- modules/local/negatome/main.nf | 2 +- modules/local/pfam/main.nf | 4 ++-- modules/local/random_ddi_split/main.nf | 2 +- modules/local/remove_self_interactions/main.nf | 2 +- modules/local/select_ppi_negative_dans/main.nf | 2 +- modules/local/smoke_filter/main.nf | 2 +- modules/local/swissprot_map/main.nf | 2 +- modules/local/util/main.nf | 4 ++-- 26 files changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/local/3did/main.nf 
b/modules/local/3did/main.nf index e483d7d..3f1cf0f 100644 --- a/modules/local/3did/main.nf +++ b/modules/local/3did/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_3DID_SQLITE { tag "3did" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path mysql_gz_file diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf index e0c6cc5..6f5f77e 100644 --- a/modules/local/analyze_ddi_bias/main.nf +++ b/modules/local/analyze_ddi_bias/main.nf @@ -2,7 +2,7 @@ process ANALYZE_DDI_BIAS { tag "bias_analysis" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf index ec15f4c..3ddf835 100644 --- a/modules/local/build_ppi_negative_pool/main.nf +++ b/modules/local/build_ppi_negative_pool/main.nf @@ -2,7 +2,7 @@ process BUILD_PPI_NEGATIVE_POOL { tag "build_ppi_negative_pool" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf index e59d20d..7a607e9 100644 --- a/modules/local/curate_domains/extract_unique_domains/main.nf +++ b/modules/local/curate_domains/extract_unique_domains/main.nf @@ -2,7 +2,7 @@ process EXTRACT_UNIQUE_DOMAINS { tag { "${domainsplit_db.simpleName}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" 
input: path domainsplit_db diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf index 3e6409a..3f8f013 100644 --- a/modules/local/enrich/insert_domain_go_terms/main.nf +++ b/modules/local/enrich/insert_domain_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_GO_TERMS { tag "insert_domain_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf index 9fb4b44..950840a 100644 --- a/modules/local/enrich/insert_domain_protein_mapping/main.nf +++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_PROTEIN_MAPPING { tag "insert_domain_protein_mapping" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf index afd5f32..d5c67c8 100644 --- a/modules/local/enrich/insert_ppi/main.nf +++ b/modules/local/enrich/insert_ppi/main.nf @@ -2,7 +2,7 @@ process INSERT_PPI { tag "insert_ppi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf index 64110a0..ec34f33 100644 --- a/modules/local/enrich/insert_protein_go_terms/main.nf +++ 
b/modules/local/enrich/insert_protein_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEIN_GO_TERMS { tag "insert_protein_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf index 096318c..ee2b6d2 100644 --- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf +++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS { tag "insert_proteins_with_embeddings" label 'process_high' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf index a96bd5d..00bac7d 100644 --- a/modules/local/esm_embeddings/main.nf +++ b/modules/local/esm_embeddings/main.nf @@ -22,7 +22,7 @@ process FILTER_SEQUENCES { tag { "${protein_domain_map.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path protein_domain_map @@ -83,7 +83,7 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-gpu:1.0.0" + container "konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
'--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' @@ -132,7 +132,7 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-gpu:1.0.0" + container "konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf index 5760486..92f1570 100644 --- a/modules/local/external_validation_split/main.nf +++ b/modules/local/external_validation_split/main.nf @@ -11,7 +11,7 @@ process SUBSET_DDIS_BY_SOURCE { tag "subset_${split_name}" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path 'domainsplit.sqlite3' diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf index 41dab03..548921c 100644 --- a/modules/local/init_domainsplit_db/main.nf +++ b/modules/local/init_domainsplit_db/main.nf @@ -2,7 +2,7 @@ process INIT_DOMAINSPLIT_DB { tag "init_domainsplit_db" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" output: path "domainsplit.sqlite3", emit: domainsplit_db diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf index 021939a..73e26bb 100644 --- a/modules/local/insert_3did/main.nf +++ b/modules/local/insert_3did/main.nf @@ -2,7 +2,7 @@ process INSERT_3DID { tag "insert_3did" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path 
domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf index 429b495..928f4d1 100644 --- a/modules/local/insert_negatome/main.nf +++ b/modules/local/insert_negatome/main.nf @@ -2,7 +2,7 @@ process INSERT_NEGATOME { tag "insert_negatome" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf index b0380e8..d6c199f 100644 --- a/modules/local/insert_ppi_negative_selection/main.nf +++ b/modules/local/insert_ppi_negative_selection/main.nf @@ -2,7 +2,7 @@ process INSERT_PPI_NEGATIVE_SELECTION { tag "insert_ppi_negative_selection" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf index 553e79d..804bd95 100644 --- a/modules/local/insert_ppidm/main.nf +++ b/modules/local/insert_ppidm/main.nf @@ -2,7 +2,7 @@ process INSERT_PPIDM { tag "insert_ppidm" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf index 57b30df..a2c4ea4 100644 --- a/modules/local/insert_single_domain_ppi/main.nf +++ b/modules/local/insert_single_domain_ppi/main.nf @@ -2,7 +2,7 @@ process INSERT_SINGLE_DOMAIN_PPI { tag "insert_single_domain_ppi" 
label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf index a548b12..be5dc15 100644 --- a/modules/local/minimal_leakage_split/main.nf +++ b/modules/local/minimal_leakage_split/main.nf @@ -2,7 +2,7 @@ process EXTRACT_DOMAIN_SEQUENCES { tag "domains" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" @@ -42,7 +42,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { tag "minimal_leakage_domain" label 'process_high' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf index 79b8485..c1e04fc 100644 --- a/modules/local/negatome/main.nf +++ b/modules/local/negatome/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_NEGATOME { tag "negatome" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: val url diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf index f853c2e..2025a10 100644 --- a/modules/local/pfam/main.nf +++ b/modules/local/pfam/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_PFAM_ALIGNMENTS_BATCH { tag { "batch_${pfam_ids_list.size()}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" maxRetries 3 errorStrategy { task.attempt <= 3 ? 
'retry' : 'ignore' } @@ -88,7 +88,7 @@ process CREATE_PROTEIN_DOMAIN_MAPPING { tag { "${uniprot_map_file.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path uniprot_map_file diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf index 79fee98..cbe6e3e 100644 --- a/modules/local/random_ddi_split/main.nf +++ b/modules/local/random_ddi_split/main.nf @@ -2,7 +2,7 @@ process RANDOM_DDI_SPLIT { tag "random_ddi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path 'domainsplit.sqlite3' diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf index 668c375..7b77ce4 100644 --- a/modules/local/remove_self_interactions/main.nf +++ b/modules/local/remove_self_interactions/main.nf @@ -2,7 +2,7 @@ process REMOVE_SELF_INTERACTIONS { tag "remove_self_interactions" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf index cbd009a..6163df8 100644 --- a/modules/local/select_ppi_negative_dans/main.nf +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -2,7 +2,7 @@ process SELECT_PPI_NEGATIVE_DANS { tag "select_ppi_negative_dans:${method}" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: val method diff --git a/modules/local/smoke_filter/main.nf 
b/modules/local/smoke_filter/main.nf index e884cb9..718258d 100644 --- a/modules/local/smoke_filter/main.nf +++ b/modules/local/smoke_filter/main.nf @@ -2,7 +2,7 @@ process SMOKE_FILTER { tag "smoke_filter" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf index 3ad8330..674cd01 100644 --- a/modules/local/swissprot_map/main.nf +++ b/modules/local/swissprot_map/main.nf @@ -2,7 +2,7 @@ process BUILD_SWISSPROT_PFAM_MAP { tag "swissprot_map" label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: val url diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf index 4be930d..c8328d1 100644 --- a/modules/local/util/main.nf +++ b/modules/local/util/main.nf @@ -15,7 +15,7 @@ process SHARD_FASTA { tag { "${input_fasta.simpleName}:${num_shards}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: tuple val(meta), path(input_fasta) @@ -58,7 +58,7 @@ process JOIN_HDF_FILES { tag { output_name } label 'process_low' conda "${moduleDir}/environment.yml" - container "docker://konstantinpelz/domainsplit-general:1.0.0" + container "konstantinpelz/domainsplit-general:1.0.0" input: val output_name From a40248a3db27d951f094051f7565a608eeb6a115 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Fri, 12 Jun 2026 12:31:29 +0200 Subject: [PATCH 14/16] fixed docker prefix --- modules/local/3did/main.nf | 2 +- modules/local/analyze_ddi_bias/main.nf | 2 +- modules/local/build_ppi_negative_pool/main.nf | 2 +- modules/local/curate_domains/extract_unique_domains/main.nf | 2 +- 
modules/local/enrich/insert_domain_go_terms/main.nf | 2 +- modules/local/enrich/insert_domain_protein_mapping/main.nf | 2 +- modules/local/enrich/insert_ppi/main.nf | 2 +- modules/local/enrich/insert_protein_go_terms/main.nf | 2 +- .../local/enrich/insert_proteins_with_embeddings/main.nf | 2 +- modules/local/esm_embeddings/main.nf | 6 +++--- modules/local/external_validation_split/main.nf | 2 +- modules/local/init_domainsplit_db/main.nf | 2 +- modules/local/insert_3did/main.nf | 2 +- modules/local/insert_negatome/main.nf | 2 +- modules/local/insert_ppi_negative_selection/main.nf | 2 +- modules/local/insert_ppidm/main.nf | 2 +- modules/local/insert_single_domain_ppi/main.nf | 2 +- modules/local/minimal_leakage_split/main.nf | 4 ++-- modules/local/negatome/main.nf | 2 +- modules/local/pfam/main.nf | 4 ++-- modules/local/random_ddi_split/main.nf | 2 +- modules/local/remove_self_interactions/main.nf | 2 +- modules/local/select_ppi_negative_dans/main.nf | 2 +- modules/local/smoke_filter/main.nf | 2 +- modules/local/swissprot_map/main.nf | 2 +- modules/local/util/main.nf | 4 ++-- 26 files changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf index 3f1cf0f..99873b6 100644 --- a/modules/local/3did/main.nf +++ b/modules/local/3did/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_3DID_SQLITE { tag "3did" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path mysql_gz_file diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf index 6f5f77e..706d3b4 100644 --- a/modules/local/analyze_ddi_bias/main.nf +++ b/modules/local/analyze_ddi_bias/main.nf @@ -2,7 +2,7 @@ process ANALYZE_DDI_BIAS { tag "bias_analysis" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container 
"docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf index 3ddf835..6d647ec 100644 --- a/modules/local/build_ppi_negative_pool/main.nf +++ b/modules/local/build_ppi_negative_pool/main.nf @@ -2,7 +2,7 @@ process BUILD_PPI_NEGATIVE_POOL { tag "build_ppi_negative_pool" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf index 7a607e9..856709c 100644 --- a/modules/local/curate_domains/extract_unique_domains/main.nf +++ b/modules/local/curate_domains/extract_unique_domains/main.nf @@ -2,7 +2,7 @@ process EXTRACT_UNIQUE_DOMAINS { tag { "${domainsplit_db.simpleName}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf index 3f8f013..a57c5f5 100644 --- a/modules/local/enrich/insert_domain_go_terms/main.nf +++ b/modules/local/enrich/insert_domain_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_GO_TERMS { tag "insert_domain_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf index 950840a..04d3d2e 100644 --- 
a/modules/local/enrich/insert_domain_protein_mapping/main.nf +++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf @@ -2,7 +2,7 @@ process INSERT_DOMAIN_PROTEIN_MAPPING { tag "insert_domain_protein_mapping" label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf index d5c67c8..0ecc502 100644 --- a/modules/local/enrich/insert_ppi/main.nf +++ b/modules/local/enrich/insert_ppi/main.nf @@ -2,7 +2,7 @@ process INSERT_PPI { tag "insert_ppi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf index ec34f33..a77973f 100644 --- a/modules/local/enrich/insert_protein_go_terms/main.nf +++ b/modules/local/enrich/insert_protein_go_terms/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEIN_GO_TERMS { tag "insert_protein_go_terms" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf index ee2b6d2..6e296e7 100644 --- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf +++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf @@ -2,7 +2,7 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS { tag "insert_proteins_with_embeddings" label 'process_high' 
conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf index 00bac7d..e8d9eeb 100644 --- a/modules/local/esm_embeddings/main.nf +++ b/modules/local/esm_embeddings/main.nf @@ -22,7 +22,7 @@ process FILTER_SEQUENCES { tag { "${protein_domain_map.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path protein_domain_map @@ -83,7 +83,7 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-gpu:1.0.0" + container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' @@ -132,7 +132,7 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK { label 'process_gpu_large' secret 'HF_TOKEN' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-gpu:1.0.0" + container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0" containerOptions { workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
'--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE' diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf index 92f1570..cbd11bd 100644 --- a/modules/local/external_validation_split/main.nf +++ b/modules/local/external_validation_split/main.nf @@ -11,7 +11,7 @@ process SUBSET_DDIS_BY_SOURCE { tag "subset_${split_name}" label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path 'domainsplit.sqlite3' diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf index 548921c..53bf98e 100644 --- a/modules/local/init_domainsplit_db/main.nf +++ b/modules/local/init_domainsplit_db/main.nf @@ -2,7 +2,7 @@ process INIT_DOMAINSPLIT_DB { tag "init_domainsplit_db" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" output: path "domainsplit.sqlite3", emit: domainsplit_db diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf index 73e26bb..5434f10 100644 --- a/modules/local/insert_3did/main.nf +++ b/modules/local/insert_3did/main.nf @@ -2,7 +2,7 @@ process INSERT_3DID { tag "insert_3did" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf index 928f4d1..5db9dc8 100644 --- a/modules/local/insert_negatome/main.nf +++ b/modules/local/insert_negatome/main.nf @@ -2,7 +2,7 @@ process INSERT_NEGATOME { tag "insert_negatome" label 'process_low' conda "${moduleDir}/environment.yml" - container 
"konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf index d6c199f..9d5daf9 100644 --- a/modules/local/insert_ppi_negative_selection/main.nf +++ b/modules/local/insert_ppi_negative_selection/main.nf @@ -2,7 +2,7 @@ process INSERT_PPI_NEGATIVE_SELECTION { tag "insert_ppi_negative_selection" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf index 804bd95..3c5244e 100644 --- a/modules/local/insert_ppidm/main.nf +++ b/modules/local/insert_ppidm/main.nf @@ -2,7 +2,7 @@ process INSERT_PPIDM { tag "insert_ppidm" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf index a2c4ea4..3884ba6 100644 --- a/modules/local/insert_single_domain_ppi/main.nf +++ b/modules/local/insert_single_domain_ppi/main.nf @@ -2,7 +2,7 @@ process INSERT_SINGLE_DOMAIN_PPI { tag "insert_single_domain_ppi" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf index be5dc15..f5130d9 100644 --- 
a/modules/local/minimal_leakage_split/main.nf +++ b/modules/local/minimal_leakage_split/main.nf @@ -2,7 +2,7 @@ process EXTRACT_DOMAIN_SEQUENCES { tag "domains" label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" @@ -42,7 +42,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN { tag "minimal_leakage_domain" label 'process_high' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path "domainsplit.sqlite3" diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf index c1e04fc..a3eaf61 100644 --- a/modules/local/negatome/main.nf +++ b/modules/local/negatome/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_NEGATOME { tag "negatome" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val url diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf index 2025a10..b48cf04 100644 --- a/modules/local/pfam/main.nf +++ b/modules/local/pfam/main.nf @@ -2,7 +2,7 @@ process DOWNLOAD_PFAM_ALIGNMENTS_BATCH { tag { "batch_${pfam_ids_list.size()}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" maxRetries 3 errorStrategy { task.attempt <= 3 ? 
'retry' : 'ignore' } @@ -88,7 +88,7 @@ process CREATE_PROTEIN_DOMAIN_MAPPING { tag { "${uniprot_map_file.simpleName}" } label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path uniprot_map_file diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf index cbe6e3e..e9a4564 100644 --- a/modules/local/random_ddi_split/main.nf +++ b/modules/local/random_ddi_split/main.nf @@ -2,7 +2,7 @@ process RANDOM_DDI_SPLIT { tag "random_ddi" label 'process_medium' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path 'domainsplit.sqlite3' diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf index 7b77ce4..f453df5 100644 --- a/modules/local/remove_self_interactions/main.nf +++ b/modules/local/remove_self_interactions/main.nf @@ -2,7 +2,7 @@ process REMOVE_SELF_INTERACTIONS { tag "remove_self_interactions" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3' diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf index 6163df8..d06c827 100644 --- a/modules/local/select_ppi_negative_dans/main.nf +++ b/modules/local/select_ppi_negative_dans/main.nf @@ -2,7 +2,7 @@ process SELECT_PPI_NEGATIVE_DANS { tag "select_ppi_negative_dans:${method}" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val method diff --git a/modules/local/smoke_filter/main.nf 
b/modules/local/smoke_filter/main.nf index 718258d..fb2232b 100644 --- a/modules/local/smoke_filter/main.nf +++ b/modules/local/smoke_filter/main.nf @@ -2,7 +2,7 @@ process SMOKE_FILTER { tag "smoke_filter" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: path domainsplit_db diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf index 674cd01..d79b51c 100644 --- a/modules/local/swissprot_map/main.nf +++ b/modules/local/swissprot_map/main.nf @@ -2,7 +2,7 @@ process BUILD_SWISSPROT_PFAM_MAP { tag "swissprot_map" label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val url diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf index c8328d1..4d57032 100644 --- a/modules/local/util/main.nf +++ b/modules/local/util/main.nf @@ -15,7 +15,7 @@ process SHARD_FASTA { tag { "${input_fasta.simpleName}:${num_shards}" } label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: tuple val(meta), path(input_fasta) @@ -58,7 +58,7 @@ process JOIN_HDF_FILES { tag { output_name } label 'process_low' conda "${moduleDir}/environment.yml" - container "konstantinpelz/domainsplit-general:1.0.0" + container "docker.io/konstantinpelz/domainsplit-general:1.0.0" input: val output_name From 2aca8c52017eab6e622a96532dd15ed69c00cc52 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Fri, 12 Jun 2026 12:38:19 +0200 Subject: [PATCH 15/16] added docker home directory --- nextflow.config | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index 2f39696..6533539 100644 --- a/nextflow.config +++ b/nextflow.config @@ 
-162,7 +162,10 @@ profiles { shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + // -e HOME=/tmp: the micromamba-based images run `micromamba run` as their + // entrypoint, which needs a writable HOME for its proc dir. With `-u uid:gid` + // the container has no home (HOME=/), so point it at world-writable /tmp. + docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp' } arm64 { process.arch = 'arm64' @@ -176,7 +179,7 @@ profiles { wave.strategy = 'conda,container' } emulate_amd64 { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp --platform=linux/amd64' } singularity { singularity.enabled = true @@ -233,7 +236,7 @@ profiles { wave.strategy = 'conda,container' } gpu { - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + docker.runOptions = '-u $(id -u):$(id -g) -e HOME=/tmp --gpus all' apptainer.runOptions = '--nv' singularity.runOptions = '--nv' } From a09986f7628033eff6b1dd523a5d89ff261867b0 Mon Sep 17 00:00:00 2001 From: Konstantin Pelz Date: Fri, 12 Jun 2026 12:49:23 +0200 Subject: [PATCH 16/16] removed stuff from snapshot --- tests/default.nf.test | 7 ++++--- tests/default.nf.test.snap | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/default.nf.test b/tests/default.nf.test index dde4cdc..d929657 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -20,9 +20,10 @@ nextflow_pipeline { // empty .gz/.h5 stubs break content hashing). We therefore snapshot only // the set of produced output paths -- this verifies the whole DAG wires // together (channel topology, the split fan-out, publish paths) end to - // end without any downloads, GPU, or containers. pipeline_info/ is - // ignored because its filenames embed a run timestamp. 
- def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/execution_*', 'pipeline_info/pipeline_dag_*']) + // end without any downloads, GPU, or containers. Everything under + // pipeline_info/ is ignored because those filenames embed a run + // timestamp (e.g. params_2026-06-12_11-10-10.json), which is non-deterministic. + def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*']) // Snapshot the collated software versions with the Nextflow version line // stripped (so the assertion survives Nextflow upgrades). This also // satisfies the nf-core `nf_test_content` lint rule, which requires a diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index 483c015..864ab56 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -59,8 +59,6 @@ "negative_ppi", "negative_ppi/negative_ppi_method_scores.tsv", "pipeline_info", - "pipeline_info/nf_core_pipeline_software_mqc_versions.yml", - "pipeline_info/params_2026-06-12_11-10-10.json", "random", "random/optimization.sqlite3", "random/test.sqlite3",