From 46e16a59e9e915903fa22edc0059d4768fbc8de4 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 18:30:15 +0200
Subject: [PATCH 01/16] added two more positive sources

---
 bin/build_ppi_negative_ddis.py                |  44 ++++--
 bin/build_swissprot_pfam_map.py               | 129 ++++++++++++++++++
 bin/ddi_db_utils.py                           |  84 ++++++++++++
 bin/insert_3did.py                            |  65 +++++++++
 bin/insert_negatome.py                        |  59 ++++++++
 bin/insert_ppidm.py                           |  94 +++++++++++++
 bin/insert_single_domain_ppi.py               | 101 ++++++++++++++
 conf/modules.config                           |   2 +-
 .../environment.yml                           |   0
 .../local/external_validation_split/main.nf   | 104 ++++++++++++++
 modules/local/insert_3did/environment.yml     |   6 +
 modules/local/insert_3did/main.nf             |  25 ++++
 modules/local/insert_ddis/main.nf             |  99 --------------
 modules/local/insert_negatome/environment.yml |   6 +
 modules/local/insert_negatome/main.nf         |  25 ++++
 .../local/insert_ppi_negative_ddis/main.nf    |   7 +-
 modules/local/insert_ppidm/environment.yml    |   6 +
 modules/local/insert_ppidm/main.nf            |  27 ++++
 .../insert_single_domain_ppi/environment.yml  |   6 +
 .../local/insert_single_domain_ppi/main.nf    |  29 ++++
 modules/local/minimal_leakage_split/main.nf   |   6 +-
 modules/local/random_ddi_split/main.nf        |   6 +-
 .../remove_self_interactions/environment.yml  |   6 +
 .../local/remove_self_interactions/main.nf    |  40 ++++++
 modules/local/swissprot_map/environment.yml   |   5 +
 modules/local/swissprot_map/main.nf           |  22 +++
 nextflow.config                               |  44 ++++--
 nextflow_schema.json                          |  55 ++++++--
 subworkflows/local/collect_ddi_data/main.nf   |  71 +++++++---
 .../local/split_domainsplit_database/main.nf  |  62 +++++++--
 workflows/domainsplit.nf                      |  22 +--
 31 files changed, 1083 insertions(+), 174 deletions(-)
 create mode 100755 bin/build_swissprot_pfam_map.py
 create mode 100755 bin/ddi_db_utils.py
 create mode 100755 bin/insert_3did.py
 create mode 100755 bin/insert_negatome.py
 create mode 100755 bin/insert_ppidm.py
 create mode 100755 bin/insert_single_domain_ppi.py
 rename modules/local/{insert_ddis => external_validation_split}/environment.yml (100%)
 create mode 100644 modules/local/external_validation_split/main.nf
 create mode 100644 modules/local/insert_3did/environment.yml
 create mode 100644 modules/local/insert_3did/main.nf
 delete mode 100644 modules/local/insert_ddis/main.nf
 create mode 100644 modules/local/insert_negatome/environment.yml
 create mode 100644 modules/local/insert_negatome/main.nf
 create mode 100644 modules/local/insert_ppidm/environment.yml
 create mode 100644 modules/local/insert_ppidm/main.nf
 create mode 100644 modules/local/insert_single_domain_ppi/environment.yml
 create mode 100644 modules/local/insert_single_domain_ppi/main.nf
 create mode 100644 modules/local/remove_self_interactions/environment.yml
 create mode 100644 modules/local/remove_self_interactions/main.nf
 create mode 100644 modules/local/swissprot_map/environment.yml
 create mode 100644 modules/local/swissprot_map/main.nf

diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_ddis.py
index 23051b8..aae31d2 100755
--- a/bin/build_ppi_negative_ddis.py
+++ b/bin/build_ppi_negative_ddis.py
@@ -21,6 +21,8 @@
 import pyarrow.parquet as pq
 import requests
 
+from ddi_db_utils import pfam_sort_key
+
 
 TAG = "[ppi_neg]"
 BATCH_SIZE = 500_000
@@ -38,7 +40,7 @@ def parse_args():
     p.add_argument("--pfam-mapping-out", required=True,
                    help="Output path for UniProt -> Pfam JSON mapping")
     p.add_argument("--min-n-tested", type=int, required=True)
-    p.add_argument("--source-label", required=True)
+    p.add_argument("--source-label", default="inferred_ppi_screen_negative")
     p.add_argument(
         "--sampling-strategy",
         choices=["frequency", "degree_matched"],
@@ -46,6 +48,12 @@ def parse_args():
         help="'frequency' = top-N by co-occurrence (old behavior). "
              "'degree_matched' = sample to match positive degree distribution.",
     )
+    p.add_argument(
+        "--no-self",
+        action="store_true",
+        help="Skip self-pairs (domain interacting with itself) "
+             "when self_interaction is disabled.",
+    )
     return p.parse_args()
 
 
@@ -142,12 +150,17 @@ def fetch_gene_mappings(gene_names, batch_size=100):
     return gene_to_uniprot, uniprot_to_pfams
 
 
-def load_positive_pfams(conn):
+def load_3did_pfams(conn):
+    """Pfam IDs that appear in a 3did positive DDI.
+
+    Negatives are inferred (and degree-matched) only over the 3did domain
+    universe, so single-domain / PPIDM positives never widen the candidate set.
+    """
     cur = conn.execute(
         "SELECT DISTINCT d.pfam_id "
         "FROM domain AS d JOIN domain_domain_interaction AS ddi "
         "  ON d.id IN (ddi.domain_id_a, ddi.domain_id_b) "
-        "WHERE ddi.negative = 0"
+        "WHERE ddi.negative = 0 AND ddi.source = '3did'"
     )
     return {row[0] for row in cur}
 
@@ -159,7 +172,7 @@ def load_existing_pairs(conn):
         "JOIN domain AS da ON da.id = ddi.domain_id_a "
         "JOIN domain AS db ON db.id = ddi.domain_id_b"
     )
-    return {tuple(sorted((a, b))) for a, b in cur}
+    return {tuple(sorted((a, b), key=pfam_sort_key)) for a, b in cur}
 
 
 
@@ -199,13 +212,13 @@ def _collect_genes_and_pairs(parquet_path, min_n_tested):
 
 
 def _compute_positive_degree(conn):
-    """Per-Pfam degree in the positive DDI set."""
+    """Per-Pfam degree in the 3did positive DDI set."""
     rows = conn.execute(
         "SELECT da.pfam_id, db.pfam_id "
         "FROM domain_domain_interaction AS ddi "
         "JOIN domain AS da ON da.id = ddi.domain_id_a "
         "JOIN domain AS db ON db.id = ddi.domain_id_b "
-        "WHERE ddi.negative = 0"
+        "WHERE ddi.negative = 0 AND ddi.source = '3did'"
     ).fetchall()
     deg = defaultdict(int)
     for a, b in rows:
@@ -310,8 +323,8 @@ def main():
     conn.execute("PRAGMA journal_mode=OFF")
     conn.execute("PRAGMA synchronous=OFF")
 
-    pos_pfam = load_positive_pfams(conn)
-    log(f"n_positive_pfams = {len(pos_pfam)}")
+    pos_pfam = load_3did_pfams(conn)
+    log(f"n_3did_pfams = {len(pos_pfam)}")
 
     existing_pairs = load_existing_pairs(conn)
     log(f"n_existing_ddis = {len(existing_pairs)}")
@@ -335,7 +348,9 @@ def row_pfams(gene):
             continue
         row_pairs = set()
         for a, b in itertools.product(bait_pfams, prey_pfams):
-            row_pairs.add(tuple(sorted((a, b))))
+            if args.no_self and a == b:
+                continue
+            row_pairs.add(tuple(sorted((a, b), key=pfam_sort_key)))
         if not row_pairs:
             continue
         n_rows_with_pairs += 1
@@ -357,7 +372,8 @@ def row_pfams(gene):
         )
 
     n_positive = conn.execute(
-        "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative = 0"
+        "SELECT COUNT(*) FROM domain_domain_interaction "
+        "WHERE negative = 0 AND source = '3did'"
     ).fetchone()[0]
     n_negatome = conn.execute(
         "SELECT COUNT(*) FROM domain_domain_interaction "
@@ -404,9 +420,15 @@ def row_pfams(gene):
 
         insert_rows = []
         for (pfam_a, pfam_b), _ in chosen:
+            # normalise by Pfam accession number (matching ddi_db_utils.insert_ddis)
+            # so swapped pairs collapse and dedup consistently with the other sources
             for d_a in pfam_to_domain_ids.get(pfam_a, ()):
                 for d_b in pfam_to_domain_ids.get(pfam_b, ()):
-                    insert_rows.append((d_a, d_b, True, args.source_label))
+                    if pfam_sort_key(pfam_a) <= pfam_sort_key(pfam_b):
+                        lo, hi = d_a, d_b
+                    else:
+                        lo, hi = d_b, d_a
+                    insert_rows.append((lo, hi, True, args.source_label))
 
         conn.executemany(
             "INSERT OR IGNORE INTO domain_domain_interaction"
diff --git a/bin/build_swissprot_pfam_map.py b/bin/build_swissprot_pfam_map.py
new file mode 100755
index 0000000..7d923aa
--- /dev/null
+++ b/bin/build_swissprot_pfam_map.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""Build a reviewed-human UniProt -> Pfam map for single-domain detection.
+
+Downloads one UniProt stream (TSV, fields accession,id,gene_names,xref_pfam for
+``reviewed:true AND organism_id:9606``) and emits ``swissprot_pfam_map.json``:
+
+    {
+      "accession_to_pfams": {accession: [Pfam, ...]},
+      "name_to_accession":  {entry_name_or_gene: accession}
+    }
+
+``name_to_accession`` lets the single-domain step resolve HIPPIE identifiers that
+are entry names (e.g. ``AL1A1_HUMAN``) or gene names; accessions resolve directly
+against ``accession_to_pfams``.  Gene names that map to more than one accession are
+dropped as ambiguous; unique entry names always win.
+"""
+
+import argparse
+import gzip
+import json
+import os
+import shutil
+import ssl
+import sys
+import urllib.error
+import urllib.request
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--url", required=True, help="UniProt stream URL or local TSV(.gz) file")
+    p.add_argument("--out", required=True, help="Output JSON path")
+    p.add_argument("--versions", required=True)
+    p.add_argument("--process-name", required=True)
+    return p.parse_args()
+
+
+def fetch(url, out_path):
+    """Download ``url`` (http/https/ftp/file) or copy a local path to ``out_path``."""
+    def _download(ctx):
+        req = urllib.request.Request(url, headers={"User-Agent": "domainsplit-pipeline"})
+        with urllib.request.urlopen(req, context=ctx, timeout=600) as resp, open(out_path, "wb") as fh:
+            shutil.copyfileobj(resp, fh, 1024 * 1024)
+
+    if url.startswith(("http://", "https://", "ftp://", "file://")):
+        try:
+            _download(ssl.create_default_context())
+        except (urllib.error.URLError, ssl.SSLError) as exc:
+            # Retry unverified only for real TLS failures; HTTP/network errors propagate.
+            if not any(isinstance(e, ssl.SSLError) for e in (exc, getattr(exc, "reason", None))):
+                raise
+            print(f"WARNING: SSL validation failed for {url} ({exc!r}); retrying unverified.",
+                  file=sys.stderr, flush=True)
+            _download(ssl._create_unverified_context())
+    elif os.path.exists(url):
+        shutil.copy(url, out_path)
+    else:
+        raise SystemExit(f"url_uniprot_swissprot_pfam '{url}' is neither a URL nor a local file")
+
+
+def open_maybe_gzip(path):
+    """Open ``path`` as UTF-8 text, transparently decompressing gzip (magic 1f 8b)."""
+    with open(path, "rb") as fh:
+        opener = gzip.open if fh.read(2) == b"\x1f\x8b" else open
+    # pin the encoding so decoding never depends on the container's locale
+    return opener(path, "rt", encoding="utf-8")
+
+
+def main():
+    args = parse_args()
+
+    raw = "swissprot.tsv"
+    fetch(args.url, raw)
+
+    accession_to_pfams = {}
+    gene_to_accs = {}      # gene token -> set of accessions (for ambiguity check)
+    entry_name_to_acc = {}
+
+    n_lines = 0
+    with open_maybe_gzip(raw) as fh:
+        for i, line in enumerate(fh):
+            line = line.rstrip("\n")
+            if i == 0 and line.lower().startswith("entry"):
+                continue  # header
+            if not line:
+                continue
+            cols = line.split("\t")
+            if len(cols) < 4:
+                cols += [""] * (4 - len(cols))
+            accession, entry_name, gene_names, pfam_field = cols[0], cols[1], cols[2], cols[3]
+            if not accession:
+                continue
+            n_lines += 1
+
+            pfams = sorted({p for p in pfam_field.replace(",", ";").split(";") if p})
+            accession_to_pfams[accession] = pfams
+
+            if entry_name:
+                entry_name_to_acc[entry_name] = accession
+            for token in gene_names.split():
+                gene_to_accs.setdefault(token, set()).add(accession)
+
+    # entry names are unique and authoritative; add unambiguous gene names that
+    # do not collide with an entry name
+    name_to_accession = dict(entry_name_to_acc)
+    for token, accs in gene_to_accs.items():
+        if token in name_to_accession:
+            continue
+        if len(accs) == 1:
+            name_to_accession[token] = next(iter(accs))
+
+    n_single = sum(1 for pfams in accession_to_pfams.values() if len(pfams) == 1)
+    print(f"[swissprot_map] proteins={n_lines} single_domain={n_single} "
+          f"names={len(name_to_accession)}", flush=True)
+
+    with open(args.out, "w") as fh:
+        json.dump(
+            {"accession_to_pfams": accession_to_pfams,
+             "name_to_accession": name_to_accession},
+            fh,
+        )
+
+    with open(args.versions, "w") as f:
+        f.write(f'"{args.process_name}":\n')
+        f.write(f"    python: {sys.version.split()[0]}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/ddi_db_utils.py b/bin/ddi_db_utils.py
new file mode 100755
index 0000000..e2c6570
--- /dev/null
+++ b/bin/ddi_db_utils.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Shared helpers for inserting DDIs into the domainsplit SQLite.
+
+Every ``INSERT_<source>`` step uses these so all sources are handled uniformly:
+first bulk-create any missing ``domain`` rows for the Pfam IDs it references,
+then insert its DDIs.  Pairs are order-normalised by Pfam accession number (see
+:func:`pfam_sort_key`) so the stored ``(domain_id_a, domain_id_b)`` order is
+stable -- a pair is deduplicated regardless of the order it is supplied in and
+regardless of which source inserted it first or in what order domains were
+created -- and ``INSERT OR IGNORE`` keeps the earliest (positive *or* negative)
+row.
+"""
+
+import sqlite3
+
+
+def pfam_sort_key(pfam):
+    """Sort key for a Pfam accession, ordered by numeric part (``PF00028`` -> 28).
+
+    Used to canonicalise DDI pairs so the stored column order depends only on the
+    Pfam accessions, never on ``domain.id`` insertion order.  Only digits count
+    towards the number; the raw string breaks ties (e.g. ``PF28``/``PF00028``) so
+    distinct IDs never compare equal.  Digit-less IDs sort after numbered ones.
+    """
+    digits = "".join(c for c in pfam if c.isdigit())
+    return (0, int(digits), pfam) if digits else (1, pfam)
+
+
+def ensure_domains(conn, pfam_ids):
+    """Create any missing ``domain`` rows for ``pfam_ids`` (name stays NULL).
+
+    Falsy IDs are dropped; returns how many distinct Pfam IDs were offered.
+    """
+    distinct = set(filter(None, pfam_ids))
+    conn.executemany(
+        "INSERT OR IGNORE INTO domain(pfam_id) VALUES (?)",
+        ((pfam,) for pfam in distinct),
+    )
+    return len(distinct)
+
+
+def _pfam_to_id(conn):  # snapshot of every ``domain`` row as {pfam_id: id}
+    return {pfam: did for did, pfam in conn.execute("SELECT id, pfam_id FROM domain")}
+
+
+def insert_ddis(conn, pairs, negative, source):
+    """Insert DDIs for ``(pfam_a, pfam_b)`` pairs.
+
+    Domains must already exist (call :func:`ensure_domains` first); pairs whose
+    Pfam is missing from the ``domain`` table are skipped.  Each pair is ordered by
+    :func:`pfam_sort_key` (not by ``domain.id``) so swapped duplicates collapse.
+
+    Returns the number of rows offered to ``INSERT OR IGNORE`` (before dedup by
+    the DB).
+    """
+    pfam_to_id = _pfam_to_id(conn)
+    neg = int(bool(negative))  # store the flag as 0/1
+    rows = []
+    seen = set()  # canonical id pairs already queued in this call
+    for a, b in pairs:
+        ia = pfam_to_id.get(a)
+        ib = pfam_to_id.get(b)
+        if ia is None or ib is None:
+            continue
+        key = (ia, ib) if pfam_sort_key(a) <= pfam_sort_key(b) else (ib, ia)
+        if key in seen:
+            continue
+        seen.add(key)
+        rows.append((key[0], key[1], neg, source))
+
+    conn.executemany(
+        "INSERT OR IGNORE INTO domain_domain_interaction"
+        "(domain_id_a, domain_id_b, negative, source) VALUES (?, ?, ?, ?)",
+        rows,
+    )
+    return len(rows)
+
+
+def count_source(conn, source):
+    """Return how many DDI rows carry the given ``source`` tag."""
+    query = "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?"
+    # COUNT(*) always yields exactly one single-column row
+    (n_rows,) = conn.execute(query, (source,)).fetchone()
+    return n_rows
diff --git a/bin/insert_3did.py b/bin/insert_3did.py
new file mode 100755
index 0000000..33fec32
--- /dev/null
+++ b/bin/insert_3did.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Insert 3did positive DDIs into the domainsplit SQLite.
+
+3did is treated like every other source: read its DDI pairs, bulk-create any
+missing ``domain`` rows for the referenced Pfam IDs, then insert the
+interactions as ``negative=0, source='3did'``.
+"""
+
+import argparse
+import sqlite3
+import sys
+
+from ddi_db_utils import count_source, ensure_domains, insert_ddis
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--db", required=True, help="domainsplit SQLite (modified in place)")
+    p.add_argument("--sqlite-3did", required=True, help="3did SQLite from DOWNLOAD_3DID_SQLITE")
+    p.add_argument("--versions", required=True)
+    p.add_argument("--process-name", required=True)
+    return p.parse_args()
+
+
+def iter_3did_pairs(conn_3did):
+    """Generate one (pfam_a, pfam_b) tuple per 3did DDI, dropping any ``.version`` suffix."""
+    query = (
+        "SELECT d1.Pfam_id, d2.Pfam_id "
+        "FROM DDI1, Domain AS d1, Domain AS d2 "
+        "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name"
+    )
+    for versioned_a, versioned_b in conn_3did.execute(query):
+        yield versioned_a.partition(".")[0], versioned_b.partition(".")[0]
+
+
+def main():
+    args = parse_args()
+
+    conn_3did = sqlite3.connect(args.sqlite_3did)
+    conn = sqlite3.connect(args.db)
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.execute("PRAGMA journal_mode=OFF")
+    conn.execute("PRAGMA synchronous=OFF")
+
+    pairs = list(iter_3did_pairs(conn_3did))
+    conn_3did.close()
+    print(f"[3did] read {len(pairs)} DDI pairs", flush=True)
+
+    pfams = {p for pair in pairs for p in pair}
+    n_domains = ensure_domains(conn, pfams)
+    print(f"[3did] ensured {n_domains} domains", flush=True)
+
+    insert_ddis(conn, pairs, negative=False, source="3did")
+    conn.commit()
+    print(f"[3did] n_ddis_source_3did = {count_source(conn, '3did')}", flush=True)
+    conn.close()
+
+    with open(args.versions, "w") as f:
+        f.write(f'"{args.process_name}":\n')
+        f.write(f"    python: {sys.version.split()[0]}\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/insert_negatome.py b/bin/insert_negatome.py
new file mode 100755
index 0000000..48b33d1
--- /dev/null
+++ b/bin/insert_negatome.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""Insert Negatome negative DDIs into the domainsplit SQLite.
+
+Negatome ``combined_pfam.txt`` lists whitespace-separated Pfam pairs that do not
+interact.  Treated like every other source: bulk-create missing domains, then
+insert as ``negative=1, source='negatome'``.
+"""
+
+import argparse
+import sqlite3
+import sys
+
+from ddi_db_utils import count_source, ensure_domains, insert_ddis
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--db", required=True)
+    p.add_argument("--negatome", required=True)
+    p.add_argument("--versions", required=True)
+    p.add_argument("--process-name", required=True)
+    return p.parse_args()
+
+
+def iter_negatome_pairs(path):
+    """Yield the first two whitespace-separated tokens of every line that has them."""
+    with open(path) as handle:
+        for fields in map(str.split, handle):
+            if len(fields) >= 2:
+                first, second = fields[:2]
+                yield first, second
+
+
+def main():
+    args = parse_args()
+
+    pairs = list(iter_negatome_pairs(args.negatome))
+    print(f"[negatome] read {len(pairs)} pairs", flush=True)
+
+    conn = sqlite3.connect(args.db)
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.execute("PRAGMA journal_mode=OFF")
+    conn.execute("PRAGMA synchronous=OFF")
+
+    pfams = {p for pair in pairs for p in pair}
+    ensure_domains(conn, pfams)
+    insert_ddis(conn, pairs, negative=True, source="negatome")
+    conn.commit()
+    print(f"[negatome] n_ddis_source_negatome = {count_source(conn, 'negatome')}", flush=True)
+    conn.close()
+
+    with open(args.versions, "w") as f:
+        f.write(f'"{args.process_name}":\n')
+        f.write(f"    python: {sys.version.split()[0]}\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/insert_ppidm.py b/bin/insert_ppidm.py
new file mode 100755
index 0000000..9ed2556
--- /dev/null
+++ b/bin/insert_ppidm.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""Insert PPIDM predicted positive DDIs, keeping the class as the source.
+
+Input ``predicted_ddi_ppi.tsv`` columns: ``domain_1  domain_2  class`` where each
+domain token looks like ``10114/PF00069`` (the Pfam accession follows the slash).
+Rows are inserted as ``negative=0, source='PPIDM_<Class>'`` for the requested
+classes.  Classes are processed Gold -> Silver -> Bronze so that, on a duplicate
+domain pair, the highest-confidence class wins (``INSERT OR IGNORE``).
+"""
+
+import argparse
+import sqlite3
+import sys
+
+from ddi_db_utils import count_source, ensure_domains, insert_ddis
+
+# highest confidence first
+CLASS_ORDER = ["Gold", "Silver", "Bronze"]
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--db", required=True)
+    p.add_argument("--ppidm", required=True)
+    p.add_argument("--classes", required=True,
+                   help="comma-separated classes to include, e.g. 'Bronze,Silver,Gold'")
+    p.add_argument("--versions", required=True)
+    p.add_argument("--process-name", required=True)
+    return p.parse_args()
+
+
+def extract_pfam(token):
+    """Bare Pfam accession from ``10114/PF00069`` or ``PF00069.3``; None if not ``PF*``."""
+    accession = token.rsplit("/", 1)[-1].partition(".")[0].strip()
+    return accession if accession.startswith("PF") else None
+
+
+def main():
+    args = parse_args()
+
+    allowed = {c.strip().capitalize() for c in args.classes.split(",") if c.strip()}
+    classes = [c for c in CLASS_ORDER if c in allowed]
+    print(f"[ppidm] including classes: {classes}", flush=True)
+
+    # class -> list of (pfam_a, pfam_b)
+    pairs_by_class = {c: [] for c in classes}
+    n_rows = n_bad = 0
+    with open(args.ppidm) as fh:
+        for i, line in enumerate(fh):
+            line = line.rstrip("\n")
+            if not line:
+                continue
+            cols = line.split("\t")
+            if len(cols) < 3:
+                continue
+            if i == 0 and cols[2].strip().lower() == "class":
+                continue  # header
+            cls = cols[2].strip().capitalize()
+            if cls not in pairs_by_class:
+                continue
+            pfam_a = extract_pfam(cols[0])
+            pfam_b = extract_pfam(cols[1])
+            if pfam_a is None or pfam_b is None:
+                n_bad += 1
+                continue
+            pairs_by_class[cls].append((pfam_a, pfam_b))
+            n_rows += 1
+
+    print(f"[ppidm] parsed {n_rows} pairs ({n_bad} unparseable)", flush=True)
+
+    conn = sqlite3.connect(args.db)
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.execute("PRAGMA journal_mode=OFF")
+    conn.execute("PRAGMA synchronous=OFF")
+
+    all_pfams = {p for pairs in pairs_by_class.values() for pair in pairs for p in pair}
+    ensure_domains(conn, all_pfams)
+
+    for cls in classes:  # Gold first
+        source = f"PPIDM_{cls}"
+        insert_ddis(conn, pairs_by_class[cls], negative=False, source=source)
+        conn.commit()
+        print(f"[ppidm] n_ddis_source_{source} = {count_source(conn, source)}", flush=True)
+
+    conn.close()
+
+    with open(args.versions, "w") as f:
+        f.write(f'"{args.process_name}":\n')
+        f.write(f"    python: {sys.version.split()[0]}\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/insert_single_domain_ppi.py b/bin/insert_single_domain_ppi.py
new file mode 100755
index 0000000..996ecdf
--- /dev/null
+++ b/bin/insert_single_domain_ppi.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""Infer positive DDIs from HIPPIE PPIs between two single-domain proteins.
+
+A PPI contributes a positive DDI only when *both* interactors are reviewed human
+proteins annotated with exactly one Pfam domain; the DDI is then the pair of
+those two single domains.  Identifiers in the HIPPIE columns may be UniProt
+accessions or entry names (e.g. ``AL1A1_HUMAN``) -- both are resolved via the
+SwissProt map.  New domains are bulk-created so they get curated downstream.
+"""
+
+import argparse
+import json
+import sqlite3
+import sys
+
+from ddi_db_utils import count_source, ensure_domains, insert_ddis
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--db", required=True)
+    p.add_argument("--hippie", required=True)
+    p.add_argument("--swissprot-map", required=True)
+    p.add_argument("--min-score", type=float, required=True)
+    p.add_argument("--versions", required=True)
+    p.add_argument("--process-name", required=True)
+    return p.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    with open(args.swissprot_map) as fh:
+        smap = json.load(fh)
+    accession_to_pfams = smap["accession_to_pfams"]
+    name_to_accession = smap["name_to_accession"]
+
+    # single-domain proteins: accession -> its one Pfam
+    single_domain = {
+        acc: pfams[0]
+        for acc, pfams in accession_to_pfams.items()
+        if len(pfams) == 1
+    }
+    print(f"[single_domain_ppi] single-domain proteins: {len(single_domain)}", flush=True)
+
+    def resolve_pfam(token):
+        """Return the single Pfam of ``token`` (accession or name), else None."""
+        acc = token if token in accession_to_pfams else name_to_accession.get(token)
+        if acc is None:
+            return None
+        return single_domain.get(acc)
+
+    pairs = []
+    n_rows = n_kept = n_unresolved = 0
+    with open(args.hippie) as fh:
+        for line in fh:
+            line = line.rstrip("\n")
+            if not line:
+                continue
+            cols = line.split("\t")
+            if len(cols) < 5:
+                continue
+            n_rows += 1
+            try:
+                score = float(cols[4])
+            except ValueError:
+                continue
+            if score < args.min_score:
+                continue
+            pfam_a = resolve_pfam(cols[0])
+            pfam_b = resolve_pfam(cols[2])
+            if pfam_a is None or pfam_b is None:
+                n_unresolved += 1
+                continue
+            pairs.append((pfam_a, pfam_b))
+            n_kept += 1
+
+    print(f"[single_domain_ppi] hippie_rows={n_rows} score>= {args.min_score}: "
+          f"single_domain_pairs={n_kept} unresolved_or_multi={n_unresolved}", flush=True)
+
+    conn = sqlite3.connect(args.db)
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.execute("PRAGMA journal_mode=OFF")
+    conn.execute("PRAGMA synchronous=OFF")
+
+    pfams = {p for pair in pairs for p in pair}
+    ensure_domains(conn, pfams)
+    insert_ddis(conn, pairs, negative=False, source="single_domain_ppi")
+    conn.commit()
+    print(f"[single_domain_ppi] n_ddis_source = "
+          f"{count_source(conn, 'single_domain_ppi')}", flush=True)
+    conn.close()
+
+    with open(args.versions, "w") as f:
+        f.write(f'"{args.process_name}":\n')
+        f.write(f"    python: {sys.version.split()[0]}\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/conf/modules.config b/conf/modules.config
index 14931ed..a91b4f8 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -29,7 +29,7 @@ process {
     // default publishDir copy each of them under `insert/` / `smoke/` /
     // `init/` would create stale duplicates and racy filename collisions.
     // Disable publishing for these intermediates explicitly.
-    withName: 'INIT_DOMAINSPLIT_DB|INSERT_DDIS|INSERT_NEGATIVE_DDIS|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' {
+    withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' {
         publishDir = [ enabled: false ]
     }
 
diff --git a/modules/local/insert_ddis/environment.yml b/modules/local/external_validation_split/environment.yml
similarity index 100%
rename from modules/local/insert_ddis/environment.yml
rename to modules/local/external_validation_split/environment.yml
diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf
new file mode 100644
index 0000000..e7fc98d
--- /dev/null
+++ b/modules/local/external_validation_split/main.nf
@@ -0,0 +1,104 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    SUBSET_DDIS_BY_SOURCE -- build a single split database keeping only DDIs
+    whose `source` is in the requested set, then prune orphan domains/proteins.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Used for the External-Validation test set, which is placed "as is" (no
+    leakage-aware partitioning) from the held-out sources.
+----------------------------------------------------------------------------*/
+
+process SUBSET_DDIS_BY_SOURCE {
+    tag "subset_${split_name}"
+    label 'process_medium'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path 'domainsplit.sqlite3'
+    val  source_filter   // list of DDI source strings to keep
+    val  split_name      // output split name, e.g. 'test'
+
+    output:
+    path('*.sqlite3'), emit: split_dbs
+    val output_split_info, emit: split_info
+    path "versions.yml", emit: versions
+
+    script:
+    output_split_info = [["${split_name}.sqlite3", split_name]]
+    def src_list = source_filter.collect { "'${it}'" }.join(", ")
+
+    """
+    #!/usr/bin/env python3
+    import os
+    os.environ["SQLITE_TMPDIR"] = os.getcwd()
+
+    import sqlite3
+    import shutil
+    import sys
+
+    input_db_path = "domainsplit.sqlite3"
+    output_path = "${split_name}.sqlite3"
+    sources = (${src_list},)
+
+    shutil.copyfile(input_db_path, output_path)
+
+    conn = sqlite3.connect(output_path)
+    conn.executescript('''
+        PRAGMA foreign_keys=ON;
+        PRAGMA journal_mode=OFF;
+        PRAGMA synchronous=OFF;
+    ''')
+
+    placeholders = ",".join("?" for _ in sources)
+    n_keep = conn.execute(
+        f"SELECT COUNT(*) FROM domain_domain_interaction WHERE source IN ({placeholders})",
+        sources,
+    ).fetchone()[0]
+    print(f"Keeping {n_keep} DDIs with source in {sources}", flush=True)
+
+    conn.execute(
+        f"DELETE FROM domain_domain_interaction WHERE source NOT IN ({placeholders})",
+        sources,
+    )
+
+    conn.execute('''
+        DELETE FROM domain WHERE id IN (
+            SELECT d.id FROM domain d
+            LEFT JOIN domain_domain_interaction ddi
+                ON ddi.domain_id_a = d.id OR ddi.domain_id_b = d.id
+            LEFT JOIN domain_protein_map dpm
+                ON dpm.domain_id = d.id
+            WHERE ddi.id IS NULL OR dpm.domain_id IS NULL
+        )
+    ''')
+
+    conn.execute('''
+        DELETE FROM protein WHERE id IN (
+            SELECT p.id FROM protein p
+            LEFT JOIN domain_protein_map dpm
+                ON dpm.protein_id = p.id
+            WHERE dpm.domain_id IS NULL
+        )
+    ''')
+
+    conn.executescript('''
+        VACUUM;
+
+        CREATE INDEX IF NOT EXISTS idx_ddi_domain_a ON domain_domain_interaction(domain_id_a);
+        CREATE INDEX IF NOT EXISTS idx_ddi_domain_b ON domain_domain_interaction(domain_id_b);
+        CREATE INDEX IF NOT EXISTS idx_dpm_domain ON domain_protein_map(domain_id);
+        CREATE INDEX IF NOT EXISTS idx_dpm_protein ON domain_protein_map(protein_id);
+        CREATE INDEX IF NOT EXISTS idx_ppi_protein_a ON protein_protein_interaction(protein_id_a);
+        CREATE INDEX IF NOT EXISTS idx_ppi_protein_b ON protein_protein_interaction(protein_id_b);
+        CREATE INDEX IF NOT EXISTS idx_pgo_protein ON protein_go_terms(protein_id);
+    ''')
+
+    conn.close()
+    print(f"  {output_path}: done", flush=True)
+
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write(f"    python: {sys.version.split()[0]}\\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
+    """
+}
diff --git a/modules/local/insert_3did/environment.yml b/modules/local/insert_3did/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/insert_3did/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf
new file mode 100644
index 0000000..c2485a2
--- /dev/null
+++ b/modules/local/insert_3did/main.nf
@@ -0,0 +1,25 @@
+process INSERT_3DID {  // runs insert_3did.py against a working copy of the incoming Domainsplit database
+    tag "insert_3did"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'  // staged under a name distinct from the output; the script copies it to domainsplit.sqlite3
+    path sqlite_3did  // forwarded to insert_3did.py as --sqlite-3did
+
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db  // the copy that insert_3did.py received via --db
+    path "versions.yml",        emit: versions  // file name handed to insert_3did.py via --versions
+
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_3did.py \\
+        --db domainsplit.sqlite3 \\
+        --sqlite-3did ${sqlite_3did} \\
+        --versions versions.yml \\
+        --process-name "${task.process}"
+    """
+}
diff --git a/modules/local/insert_ddis/main.nf b/modules/local/insert_ddis/main.nf
deleted file mode 100644
index 1acf264..0000000
--- a/modules/local/insert_ddis/main.nf
+++ /dev/null
@@ -1,99 +0,0 @@
-process INSERT_DDIS {
-    tag "insert_ddis"
-    label 'process_low'
-    conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
-
-    input:
-    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
-    path sqlite_3did
-    path negatome_txt
-
-    output:
-    path "domainsplit.sqlite3", emit: domainsplit_db
-    path "versions.yml",        emit: versions
-
-    script:
-    """
-    #!/usr/bin/env python3
-    import shutil
-    import sqlite3
-    import sys
-
-    shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3")
-
-    conn_3did        = sqlite3.connect("${sqlite_3did}")
-    conn_domainsplit = sqlite3.connect("domainsplit.sqlite3")
-    conn_domainsplit.execute("PRAGMA foreign_keys=ON")
-    conn_domainsplit.execute("PRAGMA journal_mode=OFF")
-    conn_domainsplit.execute("PRAGMA synchronous=OFF")
-
-    def iter_negatome_pairs(path):
-        with open(path) as f:
-            for line in f:
-                tokens = line.split()
-                if len(tokens) < 2:
-                    continue
-                yield tokens[0], tokens[1]
-
-    def negatome_pfam_ids(path):
-        ids = set()
-        for a, b in iter_negatome_pairs(path):
-            ids.add(a)
-            ids.add(b)
-        return ids
-
-    # ---- domain rows: 3did Domain x domain_length + negatome pfam ids
-    print("Inserting domain information", flush=True)
-    cursor = conn_3did.execute(
-        "SELECT Name, Pfam_id, profile_length "
-        "FROM Domain, domain_length "
-        "WHERE domain_length.domain = Domain.Name"
-    )
-    domain_rows_3did = ((name, pfam_id.split(".")[0]) for (name, pfam_id, _length) in cursor)
-    domain_rows_negatome = ((None, pfam_id) for pfam_id in negatome_pfam_ids("${negatome_txt}"))
-
-    conn_domainsplit.executemany(
-        "INSERT OR IGNORE INTO domain(name, pfam_id) VALUES (?, ?);",
-        list(domain_rows_3did) + list(domain_rows_negatome),
-    )
-    cursor.close()
-    conn_domainsplit.commit()
-
-    # ---- positive DDIs from 3did
-    print("Inserting positive DDIs from 3did", flush=True)
-    cursor = conn_3did.execute(
-        "SELECT d1.Pfam_id, d2.Pfam_id "
-        "FROM DDI1, Domain AS d1, Domain AS d2 "
-        "WHERE DDI1.domain1 = d1.Name AND DDI1.domain2 = d2.Name;"
-    )
-    pos_iter = ((id_1.split(".")[0], id_2.split(".")[0]) for (id_1, id_2) in cursor)
-    conn_domainsplit.executemany(
-        '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source)
-           SELECT d1.id, d2.id, FALSE, '3did'
-           FROM domain AS d1, domain AS d2
-           WHERE d1.pfam_id = ? AND d2.pfam_id = ?;''',
-        pos_iter,
-    )
-    cursor.close()
-    conn_domainsplit.commit()
-
-    # ---- negative DDIs from negatome
-    print("Inserting negative DDIs from negatome", flush=True)
-    conn_domainsplit.executemany(
-        '''INSERT OR IGNORE INTO domain_domain_interaction(domain_id_a, domain_id_b, negative, source)
-           SELECT d1.id, d2.id, TRUE, 'negatome'
-           FROM domain AS d1, domain AS d2
-           WHERE d1.pfam_id = ? AND d2.pfam_id = ?;''',
-        iter_negatome_pairs("${negatome_txt}"),
-    )
-    conn_domainsplit.commit()
-    conn_domainsplit.close()
-    conn_3did.close()
-
-    with open("versions.yml", "w") as f:
-        f.write('"${task.process}":\\n')
-        f.write(f"    python: {sys.version.split()[0]}\\n")
-        f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
-    """
-}
diff --git a/modules/local/insert_negatome/environment.yml b/modules/local/insert_negatome/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/insert_negatome/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf
new file mode 100644
index 0000000..454982d
--- /dev/null
+++ b/modules/local/insert_negatome/main.nf
@@ -0,0 +1,25 @@
+process INSERT_NEGATOME {  // runs insert_negatome.py against a working copy of the incoming Domainsplit database
+    tag "insert_negatome"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'  // staged under a name distinct from the output; the script copies it to domainsplit.sqlite3
+    path negatome_txt  // forwarded to insert_negatome.py as --negatome
+
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db  // the copy that insert_negatome.py received via --db
+    path "versions.yml",        emit: versions  // file name handed to insert_negatome.py via --versions
+
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_negatome.py \\
+        --db domainsplit.sqlite3 \\
+        --negatome ${negatome_txt} \\
+        --versions versions.yml \\
+        --process-name "${task.process}"
+    """
+}
diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/insert_ppi_negative_ddis/main.nf
index 345760e..ba4792c 100644
--- a/modules/local/insert_ppi_negative_ddis/main.nf
+++ b/modules/local/insert_ppi_negative_ddis/main.nf
@@ -8,8 +8,8 @@ process INSERT_PPI_NEGATIVE_DDIS {
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
     path negative_ppi_parquet
     val  min_n_tested
-    val  source_label
     val  sampling_strategy
+    val  self_interaction
 
     output:
     path "domainsplit.sqlite3",        emit: domainsplit_db
@@ -17,6 +17,7 @@ process INSERT_PPI_NEGATIVE_DDIS {
     path "versions.yml",               emit: versions
 
     script:
+    def no_self = self_interaction ? "" : "--no-self"
     """
     cp "${domainsplit_db_in}" domainsplit.sqlite3
 
@@ -25,8 +26,8 @@ process INSERT_PPI_NEGATIVE_DDIS {
         --parquet "${negative_ppi_parquet}" \\
         --pfam-mapping-out uniprot_pfam_mapping.json \\
         --min-n-tested ${min_n_tested} \\
-        --source-label "${source_label}" \\
-        --sampling-strategy "${sampling_strategy}"
+        --sampling-strategy "${sampling_strategy}" \\
+        ${no_self}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/insert_ppidm/environment.yml b/modules/local/insert_ppidm/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/insert_ppidm/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf
new file mode 100644
index 0000000..749b705
--- /dev/null
+++ b/modules/local/insert_ppidm/main.nf
@@ -0,0 +1,27 @@
+process INSERT_PPIDM {  // runs insert_ppidm.py against a working copy of the incoming Domainsplit database
+    tag "insert_ppidm"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'  // staged under a name distinct from the output; the script copies it to domainsplit.sqlite3
+    path ppidm_tsv  // forwarded to insert_ppidm.py as --ppidm
+    val  classes  // forwarded verbatim (quoted) to insert_ppidm.py as --classes
+
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db  // the copy that insert_ppidm.py received via --db
+    path "versions.yml",        emit: versions  // file name handed to insert_ppidm.py via --versions
+
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_ppidm.py \\
+        --db domainsplit.sqlite3 \\
+        --ppidm ${ppidm_tsv} \\
+        --classes "${classes}" \\
+        --versions versions.yml \\
+        --process-name "${task.process}"
+    """
+}
diff --git a/modules/local/insert_single_domain_ppi/environment.yml b/modules/local/insert_single_domain_ppi/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/insert_single_domain_ppi/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf
new file mode 100644
index 0000000..3a14070
--- /dev/null
+++ b/modules/local/insert_single_domain_ppi/main.nf
@@ -0,0 +1,29 @@
+process INSERT_SINGLE_DOMAIN_PPI {  // runs insert_single_domain_ppi.py against a working copy of the incoming Domainsplit database
+    tag "insert_single_domain_ppi"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'  // staged under a name distinct from the output; the script copies it to domainsplit.sqlite3
+    path hippie_tsv  // forwarded to the script as --hippie
+    path swissprot_map  // forwarded to the script as --swissprot-map
+    val  min_score  // forwarded to the script as --min-score
+
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db  // the copy that the script received via --db
+    path "versions.yml",        emit: versions  // file name handed to the script via --versions
+
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_single_domain_ppi.py \\
+        --db domainsplit.sqlite3 \\
+        --hippie ${hippie_tsv} \\
+        --swissprot-map ${swissprot_map} \\
+        --min-score ${min_score} \\
+        --versions versions.yml \\
+        --process-name "${task.process}"
+    """
+}
diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf
index b0a4099..af546cb 100644
--- a/modules/local/minimal_leakage_split/main.nf
+++ b/modules/local/minimal_leakage_split/main.nf
@@ -41,6 +41,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
     path "domainsplit.sqlite3"
     val split_fractions  // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)]
     path ("domain_clusters.tsv")
+    val source_filter    // list of DDI source strings to include; [] = all sources
 
     output:
     path('*.sqlite3'), emit: split_dbs
@@ -59,6 +60,9 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
     def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ")
     def split_fraction_dict_py = "{" + split_fraction_dict_str + "}"
 
+    def src_list = source_filter.collect { "'${it}'" }.join(", ")
+    def where_clause = source_filter ? "WHERE source IN (${src_list})" : ""
+
     """
     #!/usr/bin/env python3
     \"\"\"
@@ -112,7 +116,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
     # ── Load DDI data ────────────────────────────────────────────────
     conn = sqlite3.connect(input_db_path)
     ddi_rows = conn.execute(
-        "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction"
+        "SELECT id, domain_id_a, domain_id_b FROM domain_domain_interaction ${where_clause}"
     ).fetchall()
     conn.close()
     print(f"Loaded {len(ddi_rows)} DDIs")
diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf
index 5e67e95..1d20ff4 100644
--- a/modules/local/random_ddi_split/main.nf
+++ b/modules/local/random_ddi_split/main.nf
@@ -7,6 +7,7 @@ process RANDOM_DDI_SPLIT {
     input:
     path 'domainsplit.sqlite3'
     val split_fractions  // e.g., [("train", 0.6), ("optimization", 0.2), ("test", 0.2)]
+    val source_filter    // list of DDI source strings to include; [] = all sources
 
     output:
     path('*.sqlite3'), emit: split_dbs
@@ -25,6 +26,9 @@ process RANDOM_DDI_SPLIT {
     def split_fraction_dict_str = output_file_fraction_dict.collect { k, v -> "'${k}': ${v}" }.join(", ")
     def split_fraction_dict_py = "{" + split_fraction_dict_str + "}"
 
+    def src_list = source_filter.collect { "'${it}'" }.join(", ")
+    def where_clause = source_filter ? "WHERE source IN (${src_list})" : ""
+
     """
     #!/usr/bin/env python3
 
@@ -39,7 +43,7 @@ process RANDOM_DDI_SPLIT {
     split_fractions = ${split_fraction_dict_py}
 
     conn = sqlite3.connect(input_db_path)
-    ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction")]
+    ddi_ids = [row[0] for row in conn.execute("SELECT id FROM domain_domain_interaction ${where_clause}")]
     conn.close()
 
     random.shuffle(ddi_ids)
diff --git a/modules/local/remove_self_interactions/environment.yml b/modules/local/remove_self_interactions/environment.yml
new file mode 100644
index 0000000..514346d
--- /dev/null
+++ b/modules/local/remove_self_interactions/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
+  - sqlite
diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf
new file mode 100644
index 0000000..c87e5a8
--- /dev/null
+++ b/modules/local/remove_self_interactions/main.nf
@@ -0,0 +1,40 @@
+process REMOVE_SELF_INTERACTIONS {  // deletes every domain_domain_interaction row with domain_id_a = domain_id_b from a copy of the input DB
+    tag "remove_self_interactions"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'  // staged under a name distinct from the output; the script copies it to domainsplit.sqlite3
+
+    output:
+    path "domainsplit.sqlite3", emit: domainsplit_db  // the modified copy; the staged input is only read
+    path "versions.yml",        emit: versions  // written by the inline script (python and sqlite3 library versions)
+
+    script:  // inline Python template: Nextflow interpolates the dollar-brace expressions before Python runs
+    """
+    #!/usr/bin/env python3
+    import shutil
+    import sqlite3
+    import sys
+
+    shutil.copy("${domainsplit_db_in}", "domainsplit.sqlite3")
+
+    conn = sqlite3.connect("domainsplit.sqlite3")
+    conn.execute("PRAGMA foreign_keys=ON")
+    before = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0]
+    conn.execute(
+        "DELETE FROM domain_domain_interaction WHERE domain_id_a = domain_id_b"
+    )
+    conn.commit()
+    after = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0]
+    conn.close()
+    print(f"[remove_self_interactions] removed {before - after} self-DDIs "
+          f"({before} -> {after})", flush=True)
+
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write(f"    python: {sys.version.split()[0]}\\n")
+        f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
+    """
+}
diff --git a/modules/local/swissprot_map/environment.yml b/modules/local/swissprot_map/environment.yml
new file mode 100644
index 0000000..150b843
--- /dev/null
+++ b/modules/local/swissprot_map/environment.yml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.11
diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf
new file mode 100644
index 0000000..e28b2e2
--- /dev/null
+++ b/modules/local/swissprot_map/main.nf
@@ -0,0 +1,22 @@
+process BUILD_SWISSPROT_PFAM_MAP {  // runs build_swissprot_pfam_map.py to produce swissprot_pfam_map.json
+    tag "swissprot_map"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    val url  // forwarded (quoted) to the script as --url
+
+    output:
+    path "swissprot_pfam_map.json", emit: map  // file name handed to the script via --out
+    path "versions.yml",            emit: versions  // file name handed to the script via --versions
+
+    script:
+    """
+    build_swissprot_pfam_map.py \\
+        --url "${url}" \\
+        --out swissprot_pfam_map.json \\
+        --versions versions.yml \\
+        --process-name "${task.process}"
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index e5693d9..97a108c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -25,9 +25,9 @@ params {
 
     url_pfam_template = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download'
 
-    // ProtT5 per-residue embeddings: local path to the EBI per-residue.h5 file.
-    // Download from https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/uniprot_sprot/per-residue.h5
-    // When null or file missing, ProtT5 embeddings are skipped with a warning.
+    // ProtT5 per-residue embeddings: optional local path to a pre-downloaded
+    // EBI per-residue.h5 file. When set and the file exists it is used; otherwise
+    // the embeddings are downloaded from url_uniprot_embeddings (ProtT5 always runs).
     prott5_per_residue_h5              = null
 
     // ESM embedding sharding + inference knobs.
@@ -62,20 +62,38 @@ params {
     // null (default) to disable the smoke filter entirely.
     smoke_test_n_ddis          = null
 
-    // Optional second negative-DDI source derived from a Y2H/MS PPI parquet
-    // (columns: gene_name_bait, gene_name_prey, n_tested, ...). When the path
-    // is non-null the COLLECT_DDI_DATA subworkflow filters rows by `n_tested`,
-    // maps genes to UniProt via the Swiss-Prot flat file, looks up Pfam domains via
-    // the UniProt REST API, enumerates every Pfam-pair cross
-    // product restricted to Pfam IDs already present in positive DDIs, ranks
-    // pairs by co-occurrence frequency, and inserts the top-N as negatives so
-    // the total negative count matches the positive count. Leave the parquet
-    // null (default) to skip the whole step.
+    // Negative-DDI source derived from a Y2H/MS PPI parquet (columns:
+    // gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA filters
+    // rows by `n_tested`, maps genes to UniProt then Pfam (via the UniProt REST
+    // API), enumerates the Pfam-pair cross product restricted to Pfam IDs already
+    // present in positive DDIs, samples pairs (frequency or degree-matched), and
+    // inserts them as negatives so the total negative count matches the positive
+    // count. Required input (no default; must be supplied per run).
     negative_ppi_parquet         = null
     negative_ppi_min_n_tested    = 5
-    negative_ppi_source_label    = 'y2h_ms'
     negative_sampling_strategy   = 'degree_matched'  // 'frequency' or 'degree_matched'
 
+    // Reviewed-human UniProt -> Pfam stream used to detect single-domain
+    // proteins (accession, entry name, gene names, Pfam xrefs).
+    url_uniprot_swissprot_pfam   = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29'
+
+    // Positive DDIs inferred from HIPPIE PPIs between two single-domain proteins.
+    // Required input (no default). Rows are kept when the HIPPIE confidence
+    // score (column 5) is >= hippie_min_score.
+    hippie_tsv                   = null
+    hippie_min_score             = 0.63
+
+    // Positive DDIs from PPIDM predictions (predicted_ddi_ppi.tsv with columns
+    // domain_1, domain_2, class). Class is kept as source 'PPIDM_<Class>'.
+    // Required input (no default).
+    ppidm_tsv                    = null
+    ppidm_classes                = 'Bronze,Silver,Gold'
+
+    // When false, all self-interactions (a domain interacting with itself) are
+    // removed after the positive/negatome sources are inserted, and the
+    // high-conf non-PPI negative builder skips self-pairs.
+    self_interaction             = true
+
     // Boilerplate options
     input                        = null
     outdir                       = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7fbe6ef..e74321b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -36,10 +36,11 @@
             }
         },
         "source_database_options": {
-            "title": "Source database URLs",
+            "title": "Source databases",
             "type": "object",
             "fa_icon": "fas fa-database",
-            "description": "URLs of the public source databases used to assemble the domainsplit database.",
+            "description": "Public source databases (URLs and local file paths) used to assemble the domainsplit database.",
+            "required": ["hippie_tsv", "ppidm_tsv", "negative_ppi_parquet"],
             "properties": {
                 "url_3did": {
                     "type": "string",
@@ -97,7 +98,7 @@
                 },
                 "prott5_per_residue_h5": {
                     "type": "string",
-                    "description": "Local path to the EBI ProtT5 per-residue HDF5 file. When null or file missing, ProtT5 embeddings are skipped with a warning.",
+                    "description": "Optional local path to a pre-downloaded ProtT5 per-residue HDF5 file. When set and present it is used; otherwise the file is downloaded from url_uniprot_embeddings.",
                     "default": null,
                     "fa_icon": "fas fa-file"
                 },
@@ -158,7 +159,7 @@
                 "negative_ppi_parquet": {
                     "type": ["string", "null"],
                     "format": "file-path",
-                    "description": "Optional path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). When set, COLLECT_DDI_DATA derives extra negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting the most-frequent Pfam-pair candidates restricted to domains already in positive DDIs. Null (default) disables this source.",
+                    "description": "Required path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA derives negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting Pfam-pair candidates (degree-matched or by frequency) restricted to domains already in positive DDIs.",
                     "default": null,
                     "fa_icon": "fas fa-file-import"
                 },
@@ -169,12 +170,6 @@
                     "minimum": 1,
                     "fa_icon": "fas fa-filter"
                 },
-                "negative_ppi_source_label": {
-                    "type": "string",
-                    "description": "Value written to the `source` column of domain_domain_interaction for DDIs derived from negative_ppi_parquet.",
-                    "default": "y2h_ms",
-                    "fa_icon": "fas fa-tag"
-                },
                 "pfam_download_batch_size": {
                     "type": "integer",
                     "description": "Number of Pfam IDs grouped into a single download job to reduce scheduler overhead.",
@@ -188,6 +183,46 @@
                     "default": "degree_matched",
                     "enum": ["frequency", "degree_matched"],
                     "fa_icon": "fas fa-random"
+                },
+                "url_uniprot_swissprot_pfam": {
+                    "type": "string",
+                    "description": "UniProt stream URL (or local TSV path) of reviewed human proteins with fields accession, entry name, gene names and Pfam xrefs. Used to detect single-domain proteins for the HIPPIE-derived positive DDIs.",
+                    "default": "https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=tsv&fields=accession%2Cid%2Cgene_names%2Cxref_pfam&query=%28%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29%29",
+                    "fa_icon": "fas fa-link"
+                },
+                "hippie_tsv": {
+                    "type": ["string", "null"],
+                    "format": "file-path",
+                    "description": "Required path to a HIPPIE PPI TSV. COLLECT_DDI_DATA adds positive DDIs inferred from PPIs between two single-domain proteins.",
+                    "default": null,
+                    "fa_icon": "fas fa-file-import"
+                },
+                "hippie_min_score": {
+                    "type": "number",
+                    "description": "Minimum HIPPIE confidence score (column 5) required to keep a PPI row when inferring single-domain positive DDIs.",
+                    "default": 0.63,
+                    "minimum": 0,
+                    "maximum": 1,
+                    "fa_icon": "fas fa-filter"
+                },
+                "ppidm_tsv": {
+                    "type": ["string", "null"],
+                    "format": "file-path",
+                    "description": "Required path to a PPIDM predictions TSV (columns: domain_1, domain_2, class). COLLECT_DDI_DATA adds positive DDIs tagged with source 'PPIDM_<Class>'.",
+                    "default": null,
+                    "fa_icon": "fas fa-file-import"
+                },
+                "ppidm_classes": {
+                    "type": "string",
+                    "description": "Comma-separated PPIDM confidence classes to include.",
+                    "default": "Bronze,Silver,Gold",
+                    "fa_icon": "fas fa-tags"
+                },
+                "self_interaction": {
+                    "type": "boolean",
+                    "description": "When false, all self-interactions (a domain interacting with itself) are removed after the positive/negatome sources are inserted, and the high-confidence non-PPI negative builder skips self-pairs.",
+                    "default": true,
+                    "fa_icon": "fas fa-redo"
                 }
             }
         },
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index 5d44942..6c3d9da 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -4,16 +4,29 @@
     pre-initialised Domainsplit SQLite. Downstream code consumes only the
     database; the 3did SQLite stays internal to this subworkflow.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Sources are inserted in a fixed order so that, on a duplicate domain pair,
+    the earlier source wins the label (INSERT OR IGNORE):
+
+        3did -> single-domain PPI -> PPIDM -> negatome
+          -> [optional self-interaction removal]
+          -> high-confidence non-PPI negatives (over 3did domains only)
+
     Add a new DDI source by:
-      1. Adding its download module (network fetch + format normalisation).
-      2. Calling it here and routing the parsed output into INSERT_DDIS (or a
-         per-source INSERT_* module if its parsing differs).
+      1. Adding its download/parse module.
+      2. Slotting an INSERT_<source> call into the chain below (each collects its
+         own unique Pfam IDs and bulk-creates missing domain rows via the shared
+         bin/ddi_db_utils.py helper).
       3. Tagging its rows with a unique source string in domain_domain_interaction.
 ----------------------------------------------------------------------------*/
 
 include { DOWNLOAD_3DID_SQLITE      } from '../../../modules/local/3did/main.nf'
 include { DOWNLOAD_NEGATOME         } from '../../../modules/local/negatome/main.nf'
-include { INSERT_DDIS               } from '../../../modules/local/insert_ddis/main.nf'
+include { INSERT_3DID               } from '../../../modules/local/insert_3did/main.nf'
+include { BUILD_SWISSPROT_PFAM_MAP  } from '../../../modules/local/swissprot_map/main.nf'
+include { INSERT_SINGLE_DOMAIN_PPI  } from '../../../modules/local/insert_single_domain_ppi/main.nf'
+include { INSERT_PPIDM              } from '../../../modules/local/insert_ppidm/main.nf'
+include { INSERT_NEGATOME           } from '../../../modules/local/insert_negatome/main.nf'
+include { REMOVE_SELF_INTERACTIONS  } from '../../../modules/local/remove_self_interactions/main.nf'
 include { INSERT_PPI_NEGATIVE_DDIS  } from '../../../modules/local/insert_ppi_negative_ddis/main.nf'
 include { SMOKE_FILTER              } from '../../../modules/local/smoke_filter/main.nf'
 
@@ -22,28 +35,54 @@ workflow COLLECT_DDI_DATA {
     domainsplit_db_in
     url_3did
     url_negatome
+    url_uniprot_swissprot_pfam
+    hippie_tsv
+    ppidm_tsv
+    negative_ppi_parquet
 
     main:
     file_3did     = file(url_3did)
     sqlite_3did   = DOWNLOAD_3DID_SQLITE(file_3did).sqlite
     negatome_file = DOWNLOAD_NEGATOME(url_negatome).negatome
 
-    domainsplit_db = INSERT_DDIS(domainsplit_db_in, sqlite_3did, negatome_file).domainsplit_db
+    // 1. 3did positives
+    domainsplit_db = INSERT_3DID(domainsplit_db_in, sqlite_3did).domainsplit_db
+
+    // 2-3. single-domain PPI positives (HIPPIE), using a reviewed-human SwissProt map
+    swissprot_map = BUILD_SWISSPROT_PFAM_MAP(url_uniprot_swissprot_pfam).map
+    domainsplit_db = INSERT_SINGLE_DOMAIN_PPI(
+        domainsplit_db,
+        file(hippie_tsv),
+        swissprot_map,
+        params.hippie_min_score,
+    ).domainsplit_db
 
-    pfam_mapping = Channel.empty()
+    // 4. PPIDM predicted positives (class kept as source)
+    domainsplit_db = INSERT_PPIDM(
+        domainsplit_db,
+        file(ppidm_tsv),
+        params.ppidm_classes,
+    ).domainsplit_db
 
-    if (params.negative_ppi_parquet != null) {
-        ppi_result = INSERT_PPI_NEGATIVE_DDIS(
-            domainsplit_db,
-            file(params.negative_ppi_parquet),
-            params.negative_ppi_min_n_tested,
-            params.negative_ppi_source_label,
-            params.negative_sampling_strategy,
-        )
-        domainsplit_db = ppi_result.domainsplit_db
-        pfam_mapping   = ppi_result.pfam_mapping
+    // 5. negatome negatives
+    domainsplit_db = INSERT_NEGATOME(domainsplit_db, negatome_file).domainsplit_db
+
+    // 6. optional removal of all self-interactions
+    if (!params.self_interaction) {
+        domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db
     }
 
+    // 7. high-confidence non-PPI negatives (inferred only over 3did domains)
+    ppi_result = INSERT_PPI_NEGATIVE_DDIS(
+        domainsplit_db,
+        file(negative_ppi_parquet),
+        params.negative_ppi_min_n_tested,
+        params.negative_sampling_strategy,
+        params.self_interaction,
+    )
+    domainsplit_db = ppi_result.domainsplit_db
+    pfam_mapping   = ppi_result.pfam_mapping
+
     if (params.smoke_test_n_ddis != null) {
         domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db
     }
diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf
index b88ef74..fc1ffb9 100644
--- a/subworkflows/local/split_domainsplit_database/main.nf
+++ b/subworkflows/local/split_domainsplit_database/main.nf
@@ -1,16 +1,23 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    SPLIT_DOMAINSPLIT_DATABASE -- split the Domainsplit DB into train/opt/test
-    sets using random and minimal-leakage strategies.
+    SPLIT_DOMAINSPLIT_DATABASE -- produce three split strategies:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Extracts domain sequences, clusters with MMseqs2, then runs the
-    splitting strategies (random DDI as biased baseline, spectral
-    graph-partitioning minimal leakage on domains) producing per-split
-    SQLite databases directly.
+      * random_ddi             biased baseline (random partition)
+      * minimal_leakage_domain leakage-aware spectral partition
+      * external_validation    leakage-aware train/validation on the "core"
+                               sources (3did + high-conf non-PPI negatives),
+                               plus an as-is test set from the held-out sources
+                               (single-domain PPI, PPIDM, negatome).
+
+    Domain sequences are extracted and clustered (MMseqs2) once; both the
+    minimal-leakage and external-validation train/val partitions reuse the
+    clusters.
 ----------------------------------------------------------------------------*/
 
 include { RANDOM_DDI_SPLIT                                                  } from '../../../modules/local/random_ddi_split/main'
 include { EXTRACT_DOMAIN_SEQUENCES; MINIMAL_LEAKAGE_SPLIT_DOMAIN            } from '../../../modules/local/minimal_leakage_split/main'
+include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MINIMAL_LEAKAGE_SPLIT_TRAINVAL    } from '../../../modules/local/minimal_leakage_split/main'
+include { SUBSET_DDIS_BY_SOURCE                                             } from '../../../modules/local/external_validation_split/main'
 include { MMSEQS_EASYCLUSTER                                                } from '../../../modules/nf-core/mmseqs/easycluster/main'
 
 
@@ -38,6 +45,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
     }
 
     clusters = MMSEQS_EASYCLUSTER(cluster_input)
+    def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }.first()
 
     def splits = [
         ["train", 0.6],
@@ -45,22 +53,58 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
         ["test", 0.2]
     ]
 
+    // "Core" sources (3did positives + high-confidence non-PPI negatives): the only
+    // input to random_ddi and minimal_leakage_domain, and to the train/validation
+    // part of external_validation. 'inferred_ppi_screen_negative' must stay in sync
+    // with the --source-label default in bin/build_ppi_negative_ddis.py.
+    def core_sources = ['3did', 'inferred_ppi_screen_negative']
+
+    // External-validation test set: held-out sources placed as is.
+    def test_sources = [
+        'single_domain_ppi',
+        'PPIDM_Bronze', 'PPIDM_Silver', 'PPIDM_Gold',
+        'negatome',
+    ]
+
     // Biased baseline: random DDI split (same proteins in train and test)
     RANDOM_DDI_SPLIT(
         domainsplit_db_ch,
-        Channel.of(splits)
+        Channel.of(splits),
+        core_sources
     )
 
     // Leakage-aware: spectral graph partitioning on domain clusters
     MINIMAL_LEAKAGE_SPLIT_DOMAIN(
         domainsplit_db_ch,
         splits,
-        clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }
+        clusters_tsv,
+        core_sources
+    )
+
+    // External validation: leakage-aware train/validation on core sources ...
+    def trainval_splits = [
+        ["train", 0.8],
+        ["validation", 0.2]
+    ]
+    MINIMAL_LEAKAGE_SPLIT_TRAINVAL(
+        domainsplit_db_ch,
+        trainval_splits,
+        clusters_tsv,
+        core_sources
+    )
+
+    // ... plus an as-is test set from the held-out sources
+    SUBSET_DDIS_BY_SOURCE(
+        domainsplit_db_ch,
+        test_sources,
+        "test"
     )
 
     split_ch = Channel.empty().mix(
         map_split_dbs(RANDOM_DDI_SPLIT.out.split_info, RANDOM_DDI_SPLIT.out.split_dbs, "random_ddi"),
-        map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain")
+        map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain"),
+        map_split_dbs(MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_info, MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_dbs, "external_validation"),
+        map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation")
     )
 
     emit:
diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf
index 877e784..93f93f4 100644
--- a/workflows/domainsplit.nf
+++ b/workflows/domainsplit.nf
@@ -23,22 +23,20 @@ include { ANALYZE_DDI_BIAS            } from '../modules/local/analyze_ddi_bias/
 workflow DOMAINSPLIT {
 main:
     input_uniprot_id_mapping = file(params.url_uniprot_id_mapping)
-    input_uniprot_embeddings = file(params.url_uniprot_embeddings)
     input_uniprot_go_terms   = file(params.url_uniprot_go_terms)
     input_uniprot_sequences  = file(params.url_uniprot_sequences)
     input_string             = file(params.url_string)
     input_pfam2go            = file(params.url_pfam2go)
 
-    def prott5_file = []
-    if (params.prott5_per_residue_h5) {
-        def f = file(params.prott5_per_residue_h5)
-        if (f.exists()) {
-            prott5_file = f
-        } else {
-            log.warn "ProtT5 HDF5 not found at '${params.prott5_per_residue_h5}' — skipping ProtT5 embeddings"
-        }
+    // ProtT5 per-residue embeddings: use params.prott5_per_residue_h5 when it is set
+    // and the file exists; otherwise (unset or missing) fall back to
+    // url_uniprot_embeddings. Always populated, so ProtT5 embeddings are compulsory.
+    def prott5_file = file(params.url_uniprot_embeddings)
+    if (params.prott5_per_residue_h5 && file(params.prott5_per_residue_h5).exists()) {
+        prott5_file = file(params.prott5_per_residue_h5)
+        log.info "Using local ProtT5 HDF5 at '${params.prott5_per_residue_h5}'"
     } else {
-        log.warn "params.prott5_per_residue_h5 not set — skipping ProtT5 embeddings"
+        log.info "Using ProtT5 HDF5 from url_uniprot_embeddings"
     }
 
     empty_db = INIT_DOMAINSPLIT_DB().domainsplit_db
@@ -47,6 +45,10 @@ main:
         empty_db,
         params.url_3did,
         params.url_negatome,
+        params.url_uniprot_swissprot_pfam,
+        params.hippie_tsv,
+        params.ppidm_tsv,
+        params.negative_ppi_parquet,
     )
 
     domainsplit_db_ddi = COLLECT_DDI_DATA.out.domainsplit_db

From 36b05f64992449e73bd9a5faace21fefa24eb2b7 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 19:16:31 +0200
Subject: [PATCH 02/16] updated mmseqs, made the negative sampling use dans

---
 bin/build_ppi_negative_ddis.py                | 143 ++++++++----------
 modules.json                                  |   2 +-
 .../insert_ppi_negative_ddis/environment.yml  |   1 +
 .../local/insert_ppi_negative_ddis/main.nf    |   3 +-
 modules/nf-core/mmseqs/easycluster/main.nf    |  12 +-
 modules/nf-core/mmseqs/easycluster/meta.yml   |  30 +++-
 .../easycluster/tests/main.nf.test.snap       |  36 +++--
 nextflow.config                               |   4 +-
 nextflow_schema.json                          |   7 -
 subworkflows/local/collect_ddi_data/main.nf   |   1 -
 10 files changed, 114 insertions(+), 125 deletions(-)

diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_ddis.py
index aae31d2..366ae06 100755
--- a/bin/build_ppi_negative_ddis.py
+++ b/bin/build_ppi_negative_ddis.py
@@ -2,22 +2,27 @@
 """
 Build negative DDIs from a Y2H/MS PPI parquet and append them to the
 domainsplit SQLite, restricted to Pfam domains already present in positive
-DDIs.  In degree_matched mode, selects pairs so each domain's negative
-degree matches its positive degree.  In frequency mode, takes the top-N
-by PPI co-occurrence count, capped at (n_positive - n_negatome).
+DDIs.
+
+Selection uses degree-aware node sampling (DANS, Cappelletti et al. 2024,
+Bioinformatics Advances vbae036) applied to the PPI-derived candidate pool:
+each candidate Pfam pair is sampled (without replacement) with probability
+proportional to the preferential attachment of its two domains in the positive
+graph -- the product of their positive degrees.  This makes the negative
+degree / preferential-attachment distribution track the positives, avoiding the
+inflated downstream evaluation that uniform negative sampling produces.
 """
 
 import argparse
-import heapq
 import itertools
 import json
-import random
 import sqlite3
 import sys
 import time
 import math
 from collections import defaultdict
 
+import numpy as np
 import pyarrow.parquet as pq
 import requests
 
@@ -42,11 +47,10 @@ def parse_args():
     p.add_argument("--min-n-tested", type=int, required=True)
     p.add_argument("--source-label", default="inferred_ppi_screen_negative")
     p.add_argument(
-        "--sampling-strategy",
-        choices=["frequency", "degree_matched"],
-        default="degree_matched",
-        help="'frequency' = top-N by co-occurrence (old behavior). "
-             "'degree_matched' = sample to match positive degree distribution.",
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for reproducible DANS negative sampling.",
     )
     p.add_argument(
         "--no-self",
@@ -227,65 +231,52 @@ def _compute_positive_degree(conn):
     return deg
 
 
-def select_degree_matched(fresh_candidates, pos_degree, n_take):
-    """Select negatives so each domain's negative degree matches its positive degree.
+def select_dans(fresh_candidates, pos_degree, n_take, seed=42):
+    """Degree-aware node sampling (DANS) over the PPI candidate pool.
 
-    Uses a lazy-deletion max-heap scored by combined degree deficit of both
-    domains in each candidate pair.  Candidates are shuffled for random
-    tiebreaking among equal-deficit pairs.
+    Cappelletti et al. 2024 (Bioinformatics Advances, vbae036) sample negative
+    edges so endpoint node-degrees track the positive distribution; the induced
+    edge probability is proportional to the preferential attachment of the two
+    endpoints, ``PA = deg(a) * deg(b)``.  Here we apply that distribution to the
+    fixed pool of PPI-derived candidate Pfam pairs: each candidate is drawn
+    without replacement with probability proportional to the product of its two
+    domains' positive degrees, so the selected negatives mirror the positive
+    degree / PA distribution while staying biologically grounded in the PPI
+    screen.
     """
-    if not fresh_candidates or n_take <= 0:
+    if n_take <= 0 or not fresh_candidates:
         return []
-    if not pos_degree:
-        return fresh_candidates[:n_take]
+    if n_take >= len(fresh_candidates):
+        return list(fresh_candidates)
 
-    target = dict(pos_degree)
-    current = defaultdict(int)
-
-    candidates = list(fresh_candidates)
-    random.shuffle(candidates)
-
-    remaining = set(range(len(candidates)))
-
-    def deficit(pfam):
-        return max(0, target.get(pfam, 0) - current[pfam])
-
-    def score(i):
-        (pfam_a, pfam_b), _ = candidates[i]
-        return deficit(pfam_a) + deficit(pfam_b)
-
-    heap = [(-score(i), i) for i in range(len(candidates))]
-    heapq.heapify(heap)
-
-    chosen = []
-    while len(chosen) < n_take and heap:
-        neg_s, i = heapq.heappop(heap)
-        if i not in remaining:
-            continue
-
-        actual = score(i)
-        if actual != -neg_s:
-            if actual > 0:
-                heapq.heappush(heap, (-actual, i))
-            else:
-                remaining.discard(i)
-            continue
-
-        if actual <= 0:
-            break
-
-        (pfam_a, pfam_b), count = candidates[i]
-        chosen.append(((pfam_a, pfam_b), count))
-        remaining.discard(i)
-        current[pfam_a] += 1
-        current[pfam_b] += 1
-
-    matched = sum(1 for p in target if current.get(p, 0) >= target[p])
-    over = sum(1 for p in target if current.get(p, 0) > target[p])
-    total_deficit = sum(max(0, target[p] - current.get(p, 0)) for p in target)
-    log(f"degree_matched: {matched}/{len(target)} domains reached target degree")
-    log(f"degree_matched: {over} domains exceeded target degree")
-    log(f"degree_matched: remaining total deficit = {total_deficit}")
+    weights = np.array(
+        [pos_degree.get(a, 0) * pos_degree.get(b, 0) for (a, b), _ in fresh_candidates],
+        dtype=float,
+    )
+    # Defensive fallback: if positive-degree info is missing/degenerate (or too
+    # few non-zero weights to draw n_take distinct pairs), sample uniformly.
+    if weights.sum() <= 0 or int((weights > 0).sum()) < n_take:
+        log("DANS: degenerate weights -> uniform fallback")
+        weights = np.ones(len(fresh_candidates), dtype=float)
+
+    probs = weights / weights.sum()
+    rng = np.random.default_rng(seed)
+    idx = rng.choice(len(fresh_candidates), size=n_take, replace=False, p=probs)
+
+    chosen = [fresh_candidates[i] for i in idx]
+
+    def pa(a, b):
+        return pos_degree.get(a, 0) * pos_degree.get(b, 0)
+
+    chosen_degree = defaultdict(int)
+    for (a, b), _ in chosen:
+        chosen_degree[a] += 1
+        chosen_degree[b] += 1
+    pool_pa = float(np.mean([pa(a, b) for (a, b), _ in fresh_candidates]))
+    sel_pa = float(np.mean([pa(a, b) for (a, b), _ in chosen]))
+    log(f"DANS: selected {len(chosen)} negatives from pool of {len(fresh_candidates)}")
+    log(f"DANS: {len(chosen_degree)} domains used ({len(pos_degree)} in the positive set); "
+        f"mean PA pool={pool_pa:.1f} selected={sel_pa:.1f}")
     return chosen
 
 
@@ -375,18 +366,7 @@ def row_pfams(gene):
         "SELECT COUNT(*) FROM domain_domain_interaction "
         "WHERE negative = 0 AND source = '3did'"
     ).fetchone()[0]
-    n_negatome = conn.execute(
-        "SELECT COUNT(*) FROM domain_domain_interaction "
-        "WHERE negative = 1 AND source = 'negatome'"
-    ).fetchone()[0]
     log(f"n_positive_ddis_in_db = {n_positive}")
-    log(f"n_negatome_negatives_in_db = {n_negatome}")
-    if args.sampling_strategy == "degree_matched":
-        n_take = n_positive
-        log(f"n_take = {n_take} (degree_matched: matching positive count)")
-    else:
-        n_take = max(0, n_positive - n_negatome)
-        log(f"n_take (target for source='{args.source_label}') = {n_take}")
 
     fresh_candidates = []
     n_positive_ddis_in_negative_ppis = 0
@@ -399,16 +379,11 @@ def row_pfams(gene):
 
     log(f"n_positive_ddis_in_negative_ppis = {n_positive_ddis_in_negative_ppis}")
 
-    fresh_candidates.sort(key=lambda kv: kv[1], reverse=True)
     log(f"n_fresh_candidates_after_dedup = {len(fresh_candidates)}")
 
-    if args.sampling_strategy == "degree_matched":
-        log("using degree-matched sampling strategy")
-        pos_degree = _compute_positive_degree(conn)
-        chosen = select_degree_matched(fresh_candidates, pos_degree, n_take)
-    else:
-        log("using frequency-ranked sampling strategy")
-        chosen = fresh_candidates[:n_take]
+    log("selecting negatives via degree-aware node sampling (DANS)")
+    pos_degree = _compute_positive_degree(conn)
+    chosen = select_dans(fresh_candidates, pos_degree, n_positive, seed=args.seed)
     log(f"n_chosen = {len(chosen)}")
 
     if chosen:
diff --git a/modules.json b/modules.json
index 6e03911..7c9f7ff 100644
--- a/modules.json
+++ b/modules.json
@@ -7,7 +7,7 @@
                 "nf-core": {
                     "mmseqs/easycluster": {
                         "branch": "master",
-                        "git_sha": "38697a933bef7041bb935c9b8374d9948ce6c794",
+                        "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120",
                         "installed_by": ["modules"]
                     }
                 }
diff --git a/modules/local/insert_ppi_negative_ddis/environment.yml b/modules/local/insert_ppi_negative_ddis/environment.yml
index 8b9582f..61b1f9f 100644
--- a/modules/local/insert_ppi_negative_ddis/environment.yml
+++ b/modules/local/insert_ppi_negative_ddis/environment.yml
@@ -6,4 +6,5 @@ dependencies:
   - sqlite
   - pandas
   - pyarrow
+  - numpy
   - requests
diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/insert_ppi_negative_ddis/main.nf
index ba4792c..3dc3998 100644
--- a/modules/local/insert_ppi_negative_ddis/main.nf
+++ b/modules/local/insert_ppi_negative_ddis/main.nf
@@ -8,7 +8,6 @@ process INSERT_PPI_NEGATIVE_DDIS {
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
     path negative_ppi_parquet
     val  min_n_tested
-    val  sampling_strategy
     val  self_interaction
 
     output:
@@ -26,13 +25,13 @@ process INSERT_PPI_NEGATIVE_DDIS {
         --parquet "${negative_ppi_parquet}" \\
         --pfam-mapping-out uniprot_pfam_mapping.json \\
         --min-n-tested ${min_n_tested} \\
-        --sampling-strategy "${sampling_strategy}" \\
         ${no_self}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python3 -c 'import sys; print(sys.version.split()[0])')
         pyarrow: \$(python3 -c 'import pyarrow; print(pyarrow.__version__)')
+        numpy: \$(python3 -c 'import numpy; print(numpy.__version__)')
         sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
     END_VERSIONS
     """
diff --git a/modules/nf-core/mmseqs/easycluster/main.nf b/modules/nf-core/mmseqs/easycluster/main.nf
index b4686ab..ded1cb8 100644
--- a/modules/nf-core/mmseqs/easycluster/main.nf
+++ b/modules/nf-core/mmseqs/easycluster/main.nf
@@ -3,7 +3,7 @@ process MMSEQS_EASYCLUSTER {
     label 'process_medium'
 
     conda "${moduleDir}/environment.yml"
-    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+    container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
         ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fe/fe49c17754753d6cd9a31e5894117edaf1c81e3d6053a12bf6dc8f3af1dffe23/data'
         : 'community.wave.seqera.io/library/mmseqs2:18.8cc5c--af05c9a98d9f6139'}"
 
@@ -14,7 +14,7 @@ process MMSEQS_EASYCLUSTER {
     tuple val(meta), path("*rep_seq.fasta"), emit: representatives
     tuple val(meta), path("*all_seqs.fasta"), emit: fasta
     tuple val(meta), path("*.tsv"), emit: tsv
-    path "versions.yml", emit: versions
+    tuple val("${task.process}"), val('mmseqs'), eval('mmseqs version'), topic: versions, emit: versions_mmseqs
 
     when:
     task.ext.when == null || task.ext.when
@@ -31,10 +31,6 @@ process MMSEQS_EASYCLUSTER {
         ${args} \\
         --threads ${task.cpus}
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
-    END_VERSIONS
     """
 
     stub:
@@ -47,9 +43,5 @@ process MMSEQS_EASYCLUSTER {
     touch ${prefix}_rep_seq.fasta
     touch ${prefix}_all_seqs.fasta
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
-    END_VERSIONS
     """
 }
diff --git a/modules/nf-core/mmseqs/easycluster/meta.yml b/modules/nf-core/mmseqs/easycluster/meta.yml
index 4451857..0b838ec 100644
--- a/modules/nf-core/mmseqs/easycluster/meta.yml
+++ b/modules/nf-core/mmseqs/easycluster/meta.yml
@@ -1,4 +1,3 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
 name: "mmseqs_easycluster"
 description: Cluster sequences using MMSeqs2 easy cluster.
 keywords:
@@ -15,7 +14,8 @@ tools:
       documentation: "https://mmseqs.com/latest/userguide.pdf"
       tool_dev_url: "https://github.com/soedinglab/MMseqs2"
       doi: "10.1093/bioinformatics/btw006"
-      licence: ["GPL v3"]
+      licence:
+        - "GPL v3"
       identifier: biotools:mmseqs
 input:
   - - meta:
@@ -62,13 +62,27 @@ output:
           description: an adjacency list file containing the clusters
           ontologies:
             - edam: http://edamontology.org/format_3475 # TSV
+  versions_mmseqs:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - mmseqs:
+          type: string
+          description: The name of the tool
+      - mmseqs version:
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
   versions:
-    - versions.yml:
-        type: file
-        description: File containing software versions
-        pattern: "versions.yml"
-        ontologies:
-          - edam: http://edamontology.org/format_3750 # YAML
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - mmseqs:
+          type: string
+          description: The name of the tool
+      - mmseqs version:
+          type: eval
+          description: The expression to obtain the version of the tool
 authors:
   - "@Joon-Klaps"
 maintainers:
diff --git a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap
index edb2c7e..5d27d6e 100644
--- a/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap
+++ b/modules/nf-core/mmseqs/easycluster/tests/main.nf.test.snap
@@ -30,7 +30,11 @@
                     ]
                 ],
                 "3": [
-                    "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13"
+                    [
+                        "MMSEQS_EASYCLUSTER",
+                        "mmseqs",
+                        "18.8cc5c"
+                    ]
                 ],
                 "fasta": [
                     [
@@ -59,16 +63,20 @@
                         "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
                     ]
                 ],
-                "versions": [
-                    "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13"
+                "versions_mmseqs": [
+                    [
+                        "MMSEQS_EASYCLUSTER",
+                        "mmseqs",
+                        "18.8cc5c"
+                    ]
                 ]
             }
         ],
         "meta": {
             "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
+            "nextflow": "25.10.4"
         },
-        "timestamp": "2025-11-01T16:21:16.919838587"
+        "timestamp": "2026-02-12T11:27:50.850138372"
     },
     "mmseqs/easycluster - sarscov2 - proteome": {
         "content": [
@@ -101,7 +109,11 @@
                     ]
                 ],
                 "3": [
-                    "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13"
+                    [
+                        "MMSEQS_EASYCLUSTER",
+                        "mmseqs",
+                        "18.8cc5c"
+                    ]
                 ],
                 "fasta": [
                     [
@@ -130,15 +142,19 @@
                         "test_cluster.tsv:md5,1cad5ce35cf71f8c438fd3ec5a786946"
                     ]
                 ],
-                "versions": [
-                    "versions.yml:md5,719ca0cf390aec3bd0edc9f819108c13"
+                "versions_mmseqs": [
+                    [
+                        "MMSEQS_EASYCLUSTER",
+                        "mmseqs",
+                        "18.8cc5c"
+                    ]
                 ]
             }
         ],
         "meta": {
             "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
+            "nextflow": "25.10.4"
         },
-        "timestamp": "2025-11-01T16:21:12.483762944"
+        "timestamp": "2026-02-12T11:27:44.451570131"
     }
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 97a108c..53ce85f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -66,12 +66,12 @@ params {
     // gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA filters
     // rows by `n_tested`, maps genes to UniProt then Pfam (via the UniProt REST
     // API), enumerates the Pfam-pair cross product restricted to Pfam IDs already
-    // present in positive DDIs, samples pairs (frequency or degree-matched), and
+    // present in positive DDIs, samples pairs by degree-aware node sampling
+    // (DANS; weight = preferential attachment in the positive graph), and
     // inserts them as negatives so the total negative count matches the positive
     // count. Required input (no default; must be supplied per run).
     negative_ppi_parquet         = null
     negative_ppi_min_n_tested    = 5
-    negative_sampling_strategy   = 'degree_matched'  // 'frequency' or 'degree_matched'
 
     // Reviewed-human UniProt -> Pfam stream used to detect single-domain
     // proteins (accession, entry name, gene names, Pfam xrefs).
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e74321b..ddf5853 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -177,13 +177,6 @@
                     "minimum": 1,
                     "fa_icon": "fas fa-layer-group"
                 },
-                "negative_sampling_strategy": {
-                    "type": "string",
-                    "description": "Strategy for sampling negative DDIs from the PPI parquet source.",
-                    "default": "degree_matched",
-                    "enum": ["frequency", "degree_matched"],
-                    "fa_icon": "fas fa-random"
-                },
                 "url_uniprot_swissprot_pfam": {
                     "type": "string",
                     "description": "UniProt stream URL (or local TSV path) of reviewed human proteins with fields accession, entry name, gene names and Pfam xrefs. Used to detect single-domain proteins for the HIPPIE-derived positive DDIs.",
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index 6c3d9da..3c377f4 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -77,7 +77,6 @@ workflow COLLECT_DDI_DATA {
         domainsplit_db,
         file(negative_ppi_parquet),
         params.negative_ppi_min_n_tested,
-        params.negative_sampling_strategy,
         params.self_interaction,
     )
     domainsplit_db = ppi_result.domainsplit_db

From 33b58944ad02f05a2b2c90b8b8bf53032dc42e75 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 22:26:58 +0200
Subject: [PATCH 03/16] fixed warnings

---
 modules.json                                  |  4 +-
 nextflow_schema.json                          |  1 +
 subworkflows/local/collect_ddi_data/meta.yml  | 59 +++++++++++++++++++
 subworkflows/local/curate_domains/meta.yml    | 30 ++++++++++
 .../local/enrich_ddi_database/meta.yml        | 58 ++++++++++++++++++
 .../local/generate_embeddings/meta.yml        | 29 +++++++++
 .../local/split_domainsplit_database/meta.yml |  8 +--
 .../utils_nfcore_domainsplit_pipeline/main.nf |  5 +-
 .../meta.yml                                  | 28 +++++++++
 .../nf-core/utils_nextflow_pipeline/main.nf   | 20 +++++--
 .../nf-core/utils_nfschema_plugin/main.nf     | 12 ++--
 .../nf-core/utils_nfschema_plugin/meta.yml    | 24 ++++++++
 .../utils_nfschema_plugin/tests/main.nf.test  |  5 ++
 .../tests/nextflow.config                     |  2 +-
 14 files changed, 266 insertions(+), 19 deletions(-)
 create mode 100644 subworkflows/local/collect_ddi_data/meta.yml
 create mode 100644 subworkflows/local/curate_domains/meta.yml
 create mode 100644 subworkflows/local/enrich_ddi_database/meta.yml
 create mode 100644 subworkflows/local/generate_embeddings/meta.yml
 create mode 100644 subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml

diff --git a/modules.json b/modules.json
index 7c9f7ff..5595224 100644
--- a/modules.json
+++ b/modules.json
@@ -16,7 +16,7 @@
                 "nf-core": {
                     "utils_nextflow_pipeline": {
                         "branch": "master",
-                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "git_sha": "1a545fcbd762911c21a64ced3dbef99b2b51ac75",
                         "installed_by": ["subworkflows"]
                     },
                     "utils_nfcore_pipeline": {
@@ -26,7 +26,7 @@
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
-                        "git_sha": "fdc08b8b1ae74f56686ce21f7ea11ad11990ce57",
+                        "git_sha": "a7b27fd25bfa8dcc07d299e88bd790585901a436",
                         "installed_by": ["subworkflows"]
                     }
                 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ddf5853..5dd15a9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -15,6 +15,7 @@
                 "input": {
                     "type": "string",
                     "format": "file-path",
+                    "mimetype": "text/csv",
                     "description": "Unused samplesheet placeholder; this pipeline reads inputs from the url_* parameters instead.",
                     "help_text": "Kept for nf-core template compatibility. Source databases are configured via the url_* parameters in nextflow.config.",
                     "fa_icon": "fas fa-file-csv",
diff --git a/subworkflows/local/collect_ddi_data/meta.yml b/subworkflows/local/collect_ddi_data/meta.yml
new file mode 100644
index 0000000..750ebd0
--- /dev/null
+++ b/subworkflows/local/collect_ddi_data/meta.yml
@@ -0,0 +1,59 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "collect_ddi_data"
+description: Download and parse every DDI source into the pre-initialised Domainsplit SQLite, applying the fixed source-priority order and optional smoke filter.
+keywords:
+  - ddi
+  - 3did
+  - negatome
+  - ppi
+  - database
+components:
+  - download/3did/sqlite
+  - download/negatome
+  - insert/3did
+  - build/swissprot/pfam/map
+  - insert/single/domain/ppi
+  - insert/ppidm
+  - insert/negatome
+  - remove/self/interactions
+  - insert/ppi/negative/ddis
+  - smoke/filter
+input:
+  - domainsplit_db_in:
+      type: file
+      description: Pre-initialised empty Domainsplit SQLite database.
+      pattern: "*.sqlite3"
+  - url_3did:
+      type: string
+      description: URL of the 3did flat file dump.
+  - url_negatome:
+      type: string
+      description: URL of the Negatome combined dataset.
+  - url_uniprot_swissprot_pfam:
+      type: string
+      description: URL of the UniProt SwissProt-to-Pfam mapping used to build single-domain PPIs.
+  - hippie_tsv:
+      type: file
+      description: HIPPIE PPI table used to derive single-domain positive DDIs.
+      pattern: "*.{tsv,txt}"
+  - ppidm_tsv:
+      type: file
+      description: PPIDM inferred domain-domain interaction table.
+      pattern: "*.{tsv,txt}"
+  - negative_ppi_parquet:
+      type: file
+      description: High-confidence non-PPI pairs used to derive negative DDIs.
+      pattern: "*.parquet"
+output:
+  - domainsplit_db:
+      type: file
+      description: Domainsplit SQLite populated with positive and negative DDIs.
+      pattern: "*.sqlite3"
+  - pfam_mapping:
+      type: file
+      description: SwissProt protein-to-Pfam domain mapping produced while building single-domain PPIs.
+      pattern: "*.{tsv,parquet}"
+authors:
+  - "@KonstantinPelz"
+maintainers:
+  - "@KonstantinPelz"
diff --git a/subworkflows/local/curate_domains/meta.yml b/subworkflows/local/curate_domains/meta.yml
new file mode 100644
index 0000000..97c2e04
--- /dev/null
+++ b/subworkflows/local/curate_domains/meta.yml
@@ -0,0 +1,30 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "curate_domains"
+description: Enumerate the unique Pfam domains referenced by the DDI set, download their Pfam alignments, and build a protein-to-domain map.
+keywords:
+  - pfam
+  - domain
+  - alignment
+  - mapping
+components:
+  - extract/unique/domains
+  - download/pfam/alignments/batch
+  - create/protein/domain/mapping
+input:
+  - domainsplit_db:
+      type: file
+      description: Domainsplit SQLite after DDI collection and smoke filtering.
+      pattern: "*.sqlite3"
+  - input_uniprot_id_mapping:
+      type: file
+      description: UniProt ID mapping used to associate proteins with Pfam domains.
+      pattern: "*.{tsv,dat,gz}"
+output:
+  - protein_domain_map:
+      type: file
+      description: Protein-to-Pfam-domain mapping derived from the Pfam alignments.
+      pattern: "*.{tsv,parquet}"
+authors:
+  - "@KonstantinPelz"
+maintainers:
+  - "@KonstantinPelz"
diff --git a/subworkflows/local/enrich_ddi_database/meta.yml b/subworkflows/local/enrich_ddi_database/meta.yml
new file mode 100644
index 0000000..fdae557
--- /dev/null
+++ b/subworkflows/local/enrich_ddi_database/meta.yml
@@ -0,0 +1,58 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "enrich_ddi_database"
+description: Sequentially annotate the DDI database with domain GO terms, proteins plus per-residue embeddings, protein GO terms, STRING PPIs, and the per-domain protein/embedding map.
+keywords:
+  - ddi
+  - go
+  - embeddings
+  - ppi
+  - annotation
+components:
+  - insert/domain/go/terms
+  - insert/proteins/with/embeddings
+  - insert/protein/go/terms
+  - insert/ppi
+  - insert/domain/protein/mapping
+input:
+  - domainsplit_db_in:
+      type: file
+      description: Domainsplit SQLite with curated domains.
+      pattern: "*.sqlite3"
+  - input_pfam2go:
+      type: file
+      description: pfam2go mapping of Pfam domains to GO terms.
+  - input_uniprot_sequences:
+      type: file
+      description: UniProt protein sequences.
+      pattern: "*.{fasta,fa,gz}"
+  - protein_domain_map:
+      type: file
+      description: Protein-to-Pfam-domain mapping from CURATE_DOMAINS.
+  - prott5_embeddings:
+      type: file
+      description: ProtT5 per-residue protein embeddings.
+      pattern: "*.h5"
+  - input_uniprot_go_terms:
+      type: file
+      description: UniProt protein-to-GO-term annotations.
+  - input_string:
+      type: file
+      description: STRING protein-protein interaction table.
+  - input_uniprot_id_mapping:
+      type: file
+      description: UniProt ID mapping linking STRING IDs to UniProt accessions.
+  - esm_protein_embeddings:
+      type: file
+      description: ESM per-residue protein embeddings.
+  - esm_domain_embeddings:
+      type: file
+      description: ESM pooled per-domain embeddings.
+output:
+  - domainsplit_db:
+      type: file
+      description: Fully enriched Domainsplit SQLite database.
+      pattern: "*.sqlite3"
+authors:
+  - "@KonstantinPelz"
+maintainers:
+  - "@KonstantinPelz"
diff --git a/subworkflows/local/generate_embeddings/meta.yml b/subworkflows/local/generate_embeddings/meta.yml
new file mode 100644
index 0000000..dfd0981
--- /dev/null
+++ b/subworkflows/local/generate_embeddings/meta.yml
@@ -0,0 +1,29 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "generate_embeddings"
+description: Generate ESM per-residue protein embeddings and pooled per-domain embeddings against the supplied protein-to-domain map.
+keywords:
+  - esm
+  - embeddings
+  - protein
+  - domain
+components:
+  - generate/esm/embeddings
+input:
+  - protein_domain_map:
+      type: file
+      description: Protein-to-Pfam-domain mapping used to pool per-domain embeddings.
+  - input_uniprot_sequences:
+      type: file
+      description: UniProt protein sequences to embed.
+      pattern: "*.{fasta,fa,gz}"
+output:
+  - esm_protein_embeddings:
+      type: file
+      description: ESM per-residue protein embeddings.
+  - esm_domain_embeddings:
+      type: file
+      description: ESM pooled per-domain embeddings.
+authors:
+  - "@KonstantinPelz"
+maintainers:
+  - "@KonstantinPelz"
diff --git a/subworkflows/local/split_domainsplit_database/meta.yml b/subworkflows/local/split_domainsplit_database/meta.yml
index f2212c8..7298266 100644
--- a/subworkflows/local/split_domainsplit_database/meta.yml
+++ b/subworkflows/local/split_domainsplit_database/meta.yml
@@ -7,10 +7,10 @@ keywords:
   - clustering
 components:
   - mmseqs/easycluster
-  - random_ddi_split
-  - random_denoise_split
-  - minimal_leakage_split
-  - split_database
+  - random/ddi/split
+  - extract/domain/sequences
+  - minimal/leakage/split/domain
+  - subset/ddis/by/source
 input:
   - domainsplit_db_ch:
       type: file
diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
index 5ba5d41..2f778e3 100644
--- a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
@@ -10,7 +10,6 @@
 
 include { UTILS_NFSCHEMA_PLUGIN     } from '../../nf-core/utils_nfschema_plugin'
 include { paramsSummaryMap          } from 'plugin/nf-schema'
-include { paramsHelp                } from 'plugin/nf-schema'
 include { completionEmail           } from '../../nf-core/utils_nfcore_pipeline'
 include { completionSummary         } from '../../nf-core/utils_nfcore_pipeline'
 include { UTILS_NFCORE_PIPELINE     } from '../../nf-core/utils_nfcore_pipeline'
@@ -137,7 +136,6 @@ workflow PIPELINE_COMPLETION {
 // Generate methods description for MultiQC
 //
 def toolCitationText() {
-    // TODO nf-core: Optionally add in-text citation tools to this list.
     // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "",
     // Uncomment function in methodsDescriptionText to render in MultiQC report
     def citation_text = [
@@ -149,7 +147,6 @@ def toolCitationText() {
 }
 
 def toolBibliographyText() {
-    // TODO nf-core: Optionally add bibliographic entries to this list.
     // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
     // Uncomment function in methodsDescriptionText to render in MultiQC report
     def reference_text = [
@@ -182,7 +179,7 @@ def methodsDescriptionText(mqc_methods_yaml) {
     meta["tool_citations"] = ""
     meta["tool_bibliography"] = ""
 
-    // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled!
+    // Uncomment below once logic in toolCitationText/toolBibliographyText has been filled.
     // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".")
     // meta["tool_bibliography"] = toolBibliographyText()
 
diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml
new file mode 100644
index 0000000..5f3bc49
--- /dev/null
+++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/meta.yml
@@ -0,0 +1,28 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "utils_nfcore_domainsplit_pipeline"
+description: Subworkflow with functionality specific to the daisybio/domainsplit pipeline (initialisation and completion).
+keywords:
+  - utility
+  - pipeline
+  - initialise
+  - completion
+components:
+  - utils_nextflow_pipeline
+  - utils_nfcore_pipeline
+  - utils_nfschema_plugin
+  - completionemail
+  - completionsummary
+input:
+  - nextflow_cli_args:
+      type: list
+      description: |
+        Nextflow CLI positional arguments
+output:
+  - success:
+      type: boolean
+      description: |
+        Dummy output to indicate success
+authors:
+  - "@KonstantinPelz"
+maintainers:
+  - "@KonstantinPelz"
diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
index d6e593e..37939ac 100644
--- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
@@ -73,11 +73,23 @@ def getWorkflowVersion() {
 def dumpParametersToJSON(outdir) {
     def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
     def filename  = "params_${timestamp}.json"
-    def temp_pf   = new File(workflow.launchDir.toString(), ".${filename}")
-    def jsonStr   = groovy.json.JsonOutput.toJson(params)
+    def temp_pf       = workflow.launchDir.resolve(".${filename}")
+    def jsonGenerator = new groovy.json.JsonGenerator.Options()
+        .excludeNulls()
+        .addConverter(Path) { Path path -> path.toUriString() }
+        .addConverter(Duration) { Duration duration -> duration.toMillis() }
+        .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() }
+        .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() }
+        .build()
+    def jsonStr   = jsonGenerator.toJson(params)
     temp_pf.text  = groovy.json.JsonOutput.prettyPrint(jsonStr)
-
-    nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json")
+    if (outdir instanceof Path) {
+        temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}"))
+    } else if (outdir instanceof String) {
+        temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json")
+    } else {
+        log.warn("Could not determine type of outdir, parameters JSON file will not be copied to output directory!")
+    }
     temp_pf.delete()
 }
 
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
index 1df8b76..9ff0681 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf
+++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
@@ -22,6 +22,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     before_text         // string:   text to show before the help message and parameters summary
     after_text          // string:   text to show after the help message and parameters summary
     command             // string:   an example command of the pipeline
+    cli_typecast        // boolean:  whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour
 
     main:
 
@@ -34,11 +35,11 @@ workflow UTILS_NFSCHEMA_PLUGIN {
             fullHelp: help_full,
         ]
         if(parameters_schema) {
-            help_options << [parametersSchema: parameters_schema]
+            help_options << [parameters_schema: parameters_schema]
         }
         log.info paramsHelp(
             help_options,
-            (params.help instanceof String && params.help != "true") ? params.help : "",
+            (help instanceof String && help != "true") ? help : "",
         )
         exit 0
     }
@@ -50,7 +51,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
 
     summary_options = [:]
     if(parameters_schema) {
-        summary_options << [parametersSchema: parameters_schema]
+        summary_options << [parameters_schema: parameters_schema]
     }
     log.info before_text
     log.info paramsSummaryLog(summary_options, input_workflow)
@@ -63,7 +64,10 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     if(validate_params) {
         validateOptions = [:]
         if(parameters_schema) {
-            validateOptions << [parametersSchema: parameters_schema]
+            validateOptions << [parameters_schema: parameters_schema]
+        }
+        if(cli_typecast != null) {
+            validateOptions << [cast_cli_params: cli_typecast]
         }
         validateParameters(validateOptions)
     }
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml
index f7d9f02..1d8c75a 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml
+++ b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml
@@ -25,6 +25,30 @@ input:
         option. When this input is empty it will automatically use the configured schema or
         "${projectDir}/nextflow_schema.json" as default. The schema should not be given in this way
         for meta pipelines.
+  - help:
+      type: boolean, string
+      description: |
+        Show the help message and exit. When a parameter name is given, show the help message for that parameter instead of the general help message.
+  - help_full:
+      type: boolean
+      description: Show the full help message and exit.
+  - show_hidden:
+      type: boolean
+      description: Show hidden parameters in the help message.
+  - before_text:
+      type: string
+      description: Text to show before the parameters summary and help message.
+  - after_text:
+      type: string
+      description: Text to show after the parameters summary and help message.
+  - command:
+      type: string
+      description: An example command to run the pipeline, to show in the help message and the summary.
+  - cli_typecast:
+      type: boolean
+      description: |
+        Whether to apply typecasting to the parameters given via the CLI before validation.
+        Set this to `null` to use the default behavior.
 output:
   - dummy_emit:
       type: boolean
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
index c977917..1fd1eac 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
+++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
@@ -31,6 +31,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -63,6 +64,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -95,6 +97,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -127,6 +130,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -160,6 +164,7 @@ nextflow_workflow {
                 input[6] = "Before"
                 input[7] = "After"
                 input[8] = "nextflow run test/test"
+                input[9] = null
                 """
             }
         }
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config
index f6537cc..fd71cb8 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config
+++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config
@@ -1,5 +1,5 @@
 plugins {
-    id "nf-schema@2.6.1"
+    id "nf-schema@2.7.2"
 }
 
 validation {

From d9575d31cecba169a04673d1d662453b14a5128c Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 22:42:34 +0200
Subject: [PATCH 04/16] revert Claude-written changes to nf-core utils subworkflows (params JSON dump, nf-schema options, cli_typecast)

---
 .../nf-core/utils_nextflow_pipeline/main.nf   | 19 +++----------------
 .../nf-core/utils_nfschema_plugin/main.nf     | 12 ++++--------
 .../utils_nfschema_plugin/tests/main.nf.test  |  5 -----
 3 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
index 37939ac..207c487 100644
--- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
@@ -73,23 +73,10 @@ def getWorkflowVersion() {
 def dumpParametersToJSON(outdir) {
     def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
     def filename  = "params_${timestamp}.json"
-    def temp_pf       = workflow.launchDir.resolve(".${filename}")
-    def jsonGenerator = new groovy.json.JsonGenerator.Options()
-        .excludeNulls()
-        .addConverter(Path) { Path path -> path.toUriString() }
-        .addConverter(Duration) { Duration duration -> duration.toMillis() }
-        .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() }
-        .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() }
-        .build()
-    def jsonStr   = jsonGenerator.toJson(params)
+    def temp_pf   = new File(workflow.launchDir.toString(), ".${filename}")
+    def jsonStr   = groovy.json.JsonOutput.toJson(params)
     temp_pf.text  = groovy.json.JsonOutput.prettyPrint(jsonStr)
-    if (outdir instanceof Path) {
-        temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}"))
-    } else if (outdir instanceof String) {
-        temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json")
-    } else {
-        log.warn("Could not determine type of outdir, parameters JSON file will not be copied to output directory!")
-    }
+    nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json")
     temp_pf.delete()
 }
 
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
index 9ff0681..1df8b76 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf
+++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
@@ -22,7 +22,6 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     before_text         // string:   text to show before the help message and parameters summary
     after_text          // string:   text to show after the help message and parameters summary
     command             // string:   an example command of the pipeline
-    cli_typecast        // boolean:  whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour
 
     main:
 
@@ -35,11 +34,11 @@ workflow UTILS_NFSCHEMA_PLUGIN {
             fullHelp: help_full,
         ]
         if(parameters_schema) {
-            help_options << [parameters_schema: parameters_schema]
+            help_options << [parametersSchema: parameters_schema]
         }
         log.info paramsHelp(
             help_options,
-            (help instanceof String && help != "true") ? help : "",
+            (params.help instanceof String && params.help != "true") ? params.help : "",
         )
         exit 0
     }
@@ -51,7 +50,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
 
     summary_options = [:]
     if(parameters_schema) {
-        summary_options << [parameters_schema: parameters_schema]
+        summary_options << [parametersSchema: parameters_schema]
     }
     log.info before_text
     log.info paramsSummaryLog(summary_options, input_workflow)
@@ -64,10 +63,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     if(validate_params) {
         validateOptions = [:]
         if(parameters_schema) {
-            validateOptions << [parameters_schema: parameters_schema]
-        }
-        if(cli_typecast != null) {
-            validateOptions << [cast_cli_params: cli_typecast]
+            validateOptions << [parametersSchema: parameters_schema]
         }
         validateParameters(validateOptions)
     }
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
index 1fd1eac..c977917 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
+++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
@@ -31,7 +31,6 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
-                input[9] = null
                 """
             }
         }
@@ -64,7 +63,6 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
-                input[9] = null
                 """
             }
         }
@@ -97,7 +95,6 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
-                input[9] = null
                 """
             }
         }
@@ -130,7 +127,6 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
-                input[9] = null
                 """
             }
         }
@@ -164,7 +160,6 @@ nextflow_workflow {
                 input[6] = "Before"
                 input[7] = "After"
                 input[8] = "nextflow run test/test"
-                input[9] = null
                 """
             }
         }

From b790ec8a29312d1befb994474ac3082d73f2ed40 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 22:52:31 +0200
Subject: [PATCH 05/16] fix errors: restore nf-core utils subworkflow updates and pass null cli_typecast to UTILS_NFSCHEMA_PLUGIN

---
 .../utils_nfcore_domainsplit_pipeline/main.nf |  3 ++-
 .../nf-core/utils_nextflow_pipeline/main.nf   | 19 ++++++++++++++++---
 .../nf-core/utils_nfschema_plugin/main.nf     | 12 ++++++++----
 .../utils_nfschema_plugin/tests/main.nf.test  |  5 +++++
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
index 2f778e3..c09a912 100644
--- a/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_domainsplit_pipeline/main.nf
@@ -68,7 +68,8 @@ workflow PIPELINE_INITIALISATION {
         show_hidden,
         before_text,
         after_text,
-        command
+        command,
+        null
     )
 
     //
diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
index 207c487..37939ac 100644
--- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
@@ -73,10 +73,23 @@ def getWorkflowVersion() {
 def dumpParametersToJSON(outdir) {
     def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
     def filename  = "params_${timestamp}.json"
-    def temp_pf   = new File(workflow.launchDir.toString(), ".${filename}")
-    def jsonStr   = groovy.json.JsonOutput.toJson(params)
+    def temp_pf       = workflow.launchDir.resolve(".${filename}")
+    def jsonGenerator = new groovy.json.JsonGenerator.Options()
+        .excludeNulls()
+        .addConverter(Path) { Path path -> path.toUriString() }
+        .addConverter(Duration) { Duration duration -> duration.toMillis() }
+        .addConverter(MemoryUnit) { MemoryUnit memory -> memory.toBytes() }
+        .addConverter(nextflow.script.types.VersionNumber) { nextflow.script.types.VersionNumber version -> version.toString() }
+        .build()
+    def jsonStr   = jsonGenerator.toJson(params)
     temp_pf.text  = groovy.json.JsonOutput.prettyPrint(jsonStr)
-    nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json")
+    if (outdir instanceof Path) {
+        temp_pf.copyTo(outdir.resolve("pipeline_info/${filename}"))
+    } else if (outdir instanceof String) {
+        temp_pf.copyTo("${outdir}/pipeline_info/params_${timestamp}.json")
+    } else {
+        log.warn("Could not determine type of outdir, parameters JSON file will not be copied to output directory!")
+    }
     temp_pf.delete()
 }
 
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
index 1df8b76..9ff0681 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf
+++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf
@@ -22,6 +22,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     before_text         // string:   text to show before the help message and parameters summary
     after_text          // string:   text to show after the help message and parameters summary
     command             // string:   an example command of the pipeline
+    cli_typecast        // boolean:  whether to perform typecasting of CLI parameters. Set this to `null` to use the default behaviour
 
     main:
 
@@ -34,11 +35,11 @@ workflow UTILS_NFSCHEMA_PLUGIN {
             fullHelp: help_full,
         ]
         if(parameters_schema) {
-            help_options << [parametersSchema: parameters_schema]
+            help_options << [parameters_schema: parameters_schema]
         }
         log.info paramsHelp(
             help_options,
-            (params.help instanceof String && params.help != "true") ? params.help : "",
+            (help instanceof String && help != "true") ? help : "",
         )
         exit 0
     }
@@ -50,7 +51,7 @@ workflow UTILS_NFSCHEMA_PLUGIN {
 
     summary_options = [:]
     if(parameters_schema) {
-        summary_options << [parametersSchema: parameters_schema]
+        summary_options << [parameters_schema: parameters_schema]
     }
     log.info before_text
     log.info paramsSummaryLog(summary_options, input_workflow)
@@ -63,7 +64,10 @@ workflow UTILS_NFSCHEMA_PLUGIN {
     if(validate_params) {
         validateOptions = [:]
         if(parameters_schema) {
-            validateOptions << [parametersSchema: parameters_schema]
+            validateOptions << [parameters_schema: parameters_schema]
+        }
+        if(cli_typecast != null) {
+            validateOptions << [cast_cli_params: cli_typecast]
         }
         validateParameters(validateOptions)
     }
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
index c977917..1fd1eac 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
+++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
@@ -31,6 +31,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -63,6 +64,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -95,6 +97,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -127,6 +130,7 @@ nextflow_workflow {
                 input[6] = ""
                 input[7] = ""
                 input[8] = ""
+                input[9] = null
                 """
             }
         }
@@ -160,6 +164,7 @@ nextflow_workflow {
                 input[6] = "Before"
                 input[7] = "After"
                 input[8] = "nextflow run test/test"
+                input[9] = null
                 """
             }
         }

From 9a4832c4bdf32ed77dee5dd5197af20b4dc0200e Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Tue, 9 Jun 2026 23:51:57 +0200
Subject: [PATCH 06/16] changed two parameters for prott5 into one

---
 nextflow.config                                       |  9 ++-------
 nextflow_schema.json                                  |  8 +-------
 subworkflows/local/enrich_ddi_database/main.nf        |  2 +-
 subworkflows/local/generate_embeddings/main.nf        |  2 +-
 subworkflows/local/split_domainsplit_database/main.nf |  2 +-
 workflows/domainsplit.nf                              | 11 +----------
 6 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 53ce85f..08bf661 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -10,11 +10,11 @@
 params {
 
     // Source database URLs
-    url_3did                      = 'https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz'
+    url_3did                      = 'https://3did.irbbarcelona.org/download/current/3did.sql.gz'
 
     // URLs for uniprot data sources
     url_uniprot_id_mapping = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz'
-    url_uniprot_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5'
+    url_uniprot_prott5_embeddings = 'https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5'
     url_uniprot_go_terms = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Cgo_id&format=tsv&query=%28%28database%3AGO%29+AND+%28reviewed%3Atrue%29%29'
     url_uniprot_sequences = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz'
 
@@ -25,11 +25,6 @@ params {
 
     url_pfam_template = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download'
 
-    // ProtT5 per-residue embeddings: optional local path to a pre-downloaded
-    // EBI per-residue.h5 file. When set and the file exists it is used; otherwise
-    // the embeddings are downloaded from url_uniprot_embeddings (ProtT5 always runs).
-    prott5_per_residue_h5              = null
-
     // ESM embedding sharding + inference knobs.
     // The protein FASTA is split into `esm_protein_shards` shards that run in
     // parallel as GPU tasks (capped by cluster QoS via `maxForks` in slurm.config).
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5dd15a9..77796a4 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -55,7 +55,7 @@
                     "default": "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz",
                     "fa_icon": "fas fa-link"
                 },
-                "url_uniprot_embeddings": {
+                "url_uniprot_prott5_embeddings": {
                     "type": "string",
                     "description": "URL of the per-residue ProtT5 UniProt embeddings HDF5.",
                     "default": "https://ftp.ebi.ac.uk/pub/contrib/UniProt/embeddings/current_release/UP000005640_9606/per-residue.h5",
@@ -97,12 +97,6 @@
                     "default": "https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/{pfam_id}/?annotation=alignment:full&download",
                     "fa_icon": "fas fa-link"
                 },
-                "prott5_per_residue_h5": {
-                    "type": "string",
-                    "description": "Optional local path to a pre-downloaded ProtT5 per-residue HDF5 file. When set and present it is used; otherwise the file is downloaded from url_uniprot_embeddings.",
-                    "default": null,
-                    "fa_icon": "fas fa-file"
-                },
                 "esm_protein_shards": {
                     "type": "integer",
                     "description": "Number of FASTA shards for parallel per-residue ESM embedding generation over the protein sequences. Each shard runs as one GPU task; cluster QoS caps concurrency (see slurm.config).",
diff --git a/subworkflows/local/enrich_ddi_database/main.nf b/subworkflows/local/enrich_ddi_database/main.nf
index da9eaeb..56df49b 100644
--- a/subworkflows/local/enrich_ddi_database/main.nf
+++ b/subworkflows/local/enrich_ddi_database/main.nf
@@ -58,7 +58,7 @@ workflow ENRICH_DDI_DATABASE {
         db_after_ppi,
         protein_domain_map,
         esm_domain_embeddings
-    ).domainsplit_db.first()
+    ).domainsplit_db
 
     emit:
     domainsplit_db
diff --git a/subworkflows/local/generate_embeddings/main.nf b/subworkflows/local/generate_embeddings/main.nf
index 1abbeae..c043aec 100644
--- a/subworkflows/local/generate_embeddings/main.nf
+++ b/subworkflows/local/generate_embeddings/main.nf
@@ -6,7 +6,7 @@
     ESM path produces both per-residue protein embeddings and pooled domain
     embeddings against the supplied protein <-> domain map.
 
-    ProtT5 embeddings are supplied externally via params.prott5_per_residue_h5
+    ProtT5 embeddings are supplied externally via params.url_uniprot_prott5_embeddings
     and resolved in the top-level workflow (domainsplit.nf).
 ----------------------------------------------------------------------------*/
 
diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf
index fc1ffb9..972b410 100644
--- a/subworkflows/local/split_domainsplit_database/main.nf
+++ b/subworkflows/local/split_domainsplit_database/main.nf
@@ -45,7 +45,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
     }
 
     clusters = MMSEQS_EASYCLUSTER(cluster_input)
-    def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }.first()
+    def clusters_tsv = clusters.tsv.filter { it[0].id == "domain" }.map { it[1] }
 
     def splits = [
         ["train", 0.6],
diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf
index 93f93f4..7f5fe85 100644
--- a/workflows/domainsplit.nf
+++ b/workflows/domainsplit.nf
@@ -28,16 +28,7 @@ main:
     input_string             = file(params.url_string)
     input_pfam2go            = file(params.url_pfam2go)
 
-    // ProtT5 per-residue embeddings: prefer a pre-downloaded local file when it
-    // exists, otherwise fall back to downloading url_uniprot_embeddings. Always
-    // populated, so ProtT5 embeddings are a compulsory step.
-    def prott5_file = file(params.url_uniprot_embeddings)
-    if (params.prott5_per_residue_h5 && file(params.prott5_per_residue_h5).exists()) {
-        prott5_file = file(params.prott5_per_residue_h5)
-        log.info "Using local ProtT5 HDF5 at '${params.prott5_per_residue_h5}'"
-    } else {
-        log.info "Using ProtT5 HDF5 from url_uniprot_embeddings"
-    }
+    def prott5_file = file(params.url_uniprot_prott5_embeddings)
 
     empty_db = INIT_DOMAINSPLIT_DB().domainsplit_db
 

From c37ab02cf84a557071b07fadd376037297db00a6 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Wed, 10 Jun 2026 02:28:07 +0200
Subject: [PATCH 07/16] tried new improved method for sampling negative data

---
 ...ive_ddis.py => build_ppi_negative_pool.py} | 199 +++++-----------
 bin/insert_ppi_negative_selection.py          | 116 +++++++++
 bin/select_ppi_negative_dans.py               | 225 ++++++++++++++++++
 conf/modules.config                           |  15 +-
 .../environment.yml                           |   0
 .../main.nf                                   |   8 +-
 .../environment.yml                           |   7 +
 .../insert_ppi_negative_selection/main.nf     |  31 +++
 .../select_ppi_negative_dans/environment.yml  |   6 +
 .../local/select_ppi_negative_dans/main.nf    |  28 +++
 nextflow.config                               |   3 +
 nextflow_schema.json                          |   6 +
 subworkflows/local/collect_ddi_data/main.nf   |  33 ++-
 .../local/split_domainsplit_database/main.nf  |   2 +-
 14 files changed, 534 insertions(+), 145 deletions(-)
 rename bin/{build_ppi_negative_ddis.py => build_ppi_negative_pool.py} (64%)
 create mode 100755 bin/insert_ppi_negative_selection.py
 create mode 100755 bin/select_ppi_negative_dans.py
 rename modules/local/{insert_ppi_negative_ddis => build_ppi_negative_pool}/environment.yml (100%)
 rename modules/local/{insert_ppi_negative_ddis => build_ppi_negative_pool}/main.nf (86%)
 create mode 100644 modules/local/insert_ppi_negative_selection/environment.yml
 create mode 100644 modules/local/insert_ppi_negative_selection/main.nf
 create mode 100644 modules/local/select_ppi_negative_dans/environment.yml
 create mode 100644 modules/local/select_ppi_negative_dans/main.nf

diff --git a/bin/build_ppi_negative_ddis.py b/bin/build_ppi_negative_pool.py
similarity index 64%
rename from bin/build_ppi_negative_ddis.py
rename to bin/build_ppi_negative_pool.py
index 366ae06..e44fa31 100755
--- a/bin/build_ppi_negative_ddis.py
+++ b/bin/build_ppi_negative_pool.py
@@ -1,16 +1,20 @@
 #!/usr/bin/env python3
 """
-Build negative DDIs from a Y2H/MS PPI parquet and append them to the
-domainsplit SQLite, restricted to Pfam domains already present in positive
-DDIs.
-
-Selection uses degree-aware node sampling (DANS, Cappelletti et al. 2024,
-Bioinformatics Advances vbae036) applied to the PPI-derived candidate pool:
-each candidate Pfam pair is sampled (without replacement) with probability
-proportional to the preferential attachment of its two domains in the positive
-graph -- the product of their positive degrees.  This makes the negative
-degree / preferential-attachment distribution track the positives, avoiding the
-inflated downstream evaluation that uniform negative sampling produces.
+Build the candidate pool for negative DDIs from a Y2H/MS PPI parquet and dump it
+to ``neg_pool.npz`` for the (seed-dependent) selection step.
+
+This is the EXPENSIVE, DETERMINISTIC half of negative-DDI construction: it
+streams the parquet, maps bait/prey genes to UniProt + Pfam via the UniProt REST
+API, and assembles the pool of candidate Pfam pairs (restricted to Pfam domains
+that already appear in a 3did positive DDI, and excluding pairs that already
+exist as DDIs).  It performs NO sampling and NO insertion -- selection fans out
+into parallel per-seed jobs (``select_ppi_negative_dans.py``) that read the dump,
+and the winning selection is inserted by ``insert_ppi_negative_selection.py``.
+
+The dump also carries the positive-graph statistics the selector needs:
+``pos_degree`` (per-domain positive degree, the cap), the positive-edge
+preferential-attachment (PA = deg(a)*deg(b)) array, the target negative count
+(``n_positive``) and the positive domain count (``n_positive_domains``).
 """
 
 import argparse
@@ -44,14 +48,9 @@ def parse_args():
     p.add_argument("--parquet", required=True)
     p.add_argument("--pfam-mapping-out", required=True,
                    help="Output path for UniProt -> Pfam JSON mapping")
+    p.add_argument("--pool-out", required=True,
+                   help="Output path for the candidate-pool .npz dump")
     p.add_argument("--min-n-tested", type=int, required=True)
-    p.add_argument("--source-label", default="inferred_ppi_screen_negative")
-    p.add_argument(
-        "--seed",
-        type=int,
-        default=42,
-        help="Random seed for reproducible DANS negative sampling.",
-    )
     p.add_argument(
         "--no-self",
         action="store_true",
@@ -179,6 +178,16 @@ def load_existing_pairs(conn):
     return {tuple(sorted((a, b), key=pfam_sort_key)) for a, b in cur}
 
 
+def load_positive_3did_edges(conn):
+    """Return every 3did positive DDI as a ``(pfam_a, pfam_b)`` row."""
+    cursor = conn.execute(
+        "SELECT da.pfam_id, db.pfam_id FROM domain_domain_interaction AS ddi "
+        "JOIN domain AS da ON da.id = ddi.domain_id_a "
+        "JOIN domain AS db ON db.id = ddi.domain_id_b "
+        "WHERE ddi.negative = 0 AND ddi.source = '3did'"
+    )
+    return cursor.fetchall()
+
 
 def _validate_columns(parquet_schema):
     available = set(parquet_schema.names)
@@ -215,71 +224,6 @@ def _collect_genes_and_pairs(parquet_path, min_n_tested):
     return n_input, unique_genes, baits, preys
 
 
-def _compute_positive_degree(conn):
-    """Per-Pfam degree in the 3did positive DDI set."""
-    rows = conn.execute(
-        "SELECT da.pfam_id, db.pfam_id "
-        "FROM domain_domain_interaction AS ddi "
-        "JOIN domain AS da ON da.id = ddi.domain_id_a "
-        "JOIN domain AS db ON db.id = ddi.domain_id_b "
-        "WHERE ddi.negative = 0 AND ddi.source = '3did'"
-    ).fetchall()
-    deg = defaultdict(int)
-    for a, b in rows:
-        deg[a] += 1
-        deg[b] += 1
-    return deg
-
-
-def select_dans(fresh_candidates, pos_degree, n_take, seed=42):
-    """Degree-aware node sampling (DANS) over the PPI candidate pool.
-
-    Cappelletti et al. 2024 (Bioinformatics Advances, vbae036) sample negative
-    edges so endpoint node-degrees track the positive distribution; the induced
-    edge probability is proportional to the preferential attachment of the two
-    endpoints, ``PA = deg(a) * deg(b)``.  Here we apply that distribution to the
-    fixed pool of PPI-derived candidate Pfam pairs: each candidate is drawn
-    without replacement with probability proportional to the product of its two
-    domains' positive degrees, so the selected negatives mirror the positive
-    degree / PA distribution while staying biologically grounded in the PPI
-    screen.
-    """
-    if n_take <= 0 or not fresh_candidates:
-        return []
-    if n_take >= len(fresh_candidates):
-        return list(fresh_candidates)
-
-    weights = np.array(
-        [pos_degree.get(a, 0) * pos_degree.get(b, 0) for (a, b), _ in fresh_candidates],
-        dtype=float,
-    )
-    # Defensive fallback: if positive-degree info is missing/degenerate (or too
-    # few non-zero weights to draw n_take distinct pairs), sample uniformly.
-    if weights.sum() <= 0 or int((weights > 0).sum()) < n_take:
-        log("DANS: degenerate weights -> uniform fallback")
-        weights = np.ones(len(fresh_candidates), dtype=float)
-
-    probs = weights / weights.sum()
-    rng = np.random.default_rng(seed)
-    idx = rng.choice(len(fresh_candidates), size=n_take, replace=False, p=probs)
-
-    chosen = [fresh_candidates[i] for i in idx]
-
-    def pa(a, b):
-        return pos_degree.get(a, 0) * pos_degree.get(b, 0)
-
-    chosen_degree = defaultdict(int)
-    for (a, b), _ in chosen:
-        chosen_degree[a] += 1
-        chosen_degree[b] += 1
-    pool_pa = float(np.mean([pa(a, b) for (a, b), _ in fresh_candidates]))
-    sel_pa = float(np.mean([pa(a, b) for (a, b), _ in chosen]))
-    log(f"DANS: selected {len(chosen)} negatives from pool of {len(fresh_candidates)}")
-    log(f"DANS: {len(chosen_degree)} domains used ({len(pos_degree)} in the positive set); "
-        f"mean PA pool={pool_pa:.1f} selected={sel_pa:.1f}")
-    return chosen
-
-
 def main():
     args = parse_args()
 
@@ -310,9 +254,6 @@ def main():
     log(f"n_pfam_domains_for_input_proteins = {n_pfam_unique}")
 
     conn = sqlite3.connect(args.db)
-    conn.execute("PRAGMA foreign_keys=ON")
-    conn.execute("PRAGMA journal_mode=OFF")
-    conn.execute("PRAGMA synchronous=OFF")
 
     pos_pfam = load_3did_pfams(conn)
     log(f"n_3did_pfams = {len(pos_pfam)}")
@@ -362,65 +303,53 @@ def row_pfams(gene):
             f"(observed in {most_common_count} PPI rows)"
         )
 
-    n_positive = conn.execute(
-        "SELECT COUNT(*) FROM domain_domain_interaction "
-        "WHERE negative = 0 AND source = '3did'"
-    ).fetchone()[0]
+    # Positive 3did graph statistics: degree (the per-domain cap), edge PA, and
+    # the target negative count.
+    pos_edges = load_positive_3did_edges(conn)
+    pos_degree = defaultdict(int)
+    for a, b in pos_edges:
+        pos_degree[a] += 1
+        pos_degree[b] += 1
+    n_positive = len(pos_edges)
+    n_positive_domains = len(pos_degree)
+    pos_edge_pa = np.array(
+        [pos_degree[a] * pos_degree[b] for a, b in pos_edges], dtype=np.int64
+    )
     log(f"n_positive_ddis_in_db = {n_positive}")
+    log(f"n_positive_domains = {n_positive_domains}")
+    log(f"positive mean PA = {float(pos_edge_pa.mean()):.1f}")
 
-    fresh_candidates = []
+    # Drop candidates that already exist as a DDI (positive or negative).
+    fresh_pairs = []
     n_positive_ddis_in_negative_ppis = 0
-
-    for key, count in candidate_counts.items():
+    for key in candidate_counts:
         if key in existing_pairs:
             n_positive_ddis_in_negative_ppis += 1
         else:
-            fresh_candidates.append((key, count))
+            fresh_pairs.append(key)
 
     log(f"n_positive_ddis_in_negative_ppis = {n_positive_ddis_in_negative_ppis}")
-
-    log(f"n_fresh_candidates_after_dedup = {len(fresh_candidates)}")
-
-    log("selecting negatives via degree-aware node sampling (DANS)")
-    pos_degree = _compute_positive_degree(conn)
-    chosen = select_dans(fresh_candidates, pos_degree, n_positive, seed=args.seed)
-    log(f"n_chosen = {len(chosen)}")
-
-    if chosen:
-        # Pre-load pfam_id -> domain.id mapping to avoid per-row subqueries
-        pfam_to_domain_ids = defaultdict(list)
-        for did, pfam in conn.execute("SELECT id, pfam_id FROM domain"):
-            pfam_to_domain_ids[pfam].append(did)
-        log(f"loaded {len(pfam_to_domain_ids)} pfam -> domain mappings")
-
-        insert_rows = []
-        for (pfam_a, pfam_b), _ in chosen:
-            # normalise by Pfam accession number (matching ddi_db_utils.insert_ddis)
-            # so swapped pairs collapse and dedup consistently with the other sources
-            for d_a in pfam_to_domain_ids.get(pfam_a, ()):
-                for d_b in pfam_to_domain_ids.get(pfam_b, ()):
-                    if pfam_sort_key(pfam_a) <= pfam_sort_key(pfam_b):
-                        lo, hi = d_a, d_b
-                    else:
-                        lo, hi = d_b, d_a
-                    insert_rows.append((lo, hi, True, args.source_label))
-
-        conn.executemany(
-            "INSERT OR IGNORE INTO domain_domain_interaction"
-            "(domain_id_a, domain_id_b, negative, source) "
-            "VALUES (?, ?, ?, ?)",
-            insert_rows,
-        )
-        conn.commit()
-        log(f"batch-inserted {len(insert_rows)} rows")
-
-    n_inserted = conn.execute(
-        "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?",
-        (args.source_label,),
-    ).fetchone()[0]
-    log(f"n_inserted_for_source = {n_inserted}")
+    log(f"n_fresh_candidates_after_dedup = {len(fresh_pairs)}")
     conn.close()
 
+    cand_a = np.array([a for a, b in fresh_pairs], dtype=object)
+    cand_b = np.array([b for a, b in fresh_pairs], dtype=object)
+    pos_dom = np.array(list(pos_degree.keys()), dtype=object)
+    pos_deg = np.array([pos_degree[d] for d in pos_dom], dtype=np.int64)
+
+    log(f"writing candidate pool to {args.pool_out}")
+    np.savez(
+        args.pool_out,
+        cand_a=cand_a,
+        cand_b=cand_b,
+        pos_dom=pos_dom,
+        pos_deg=pos_deg,
+        pos_edge_pa=pos_edge_pa,
+        n_positive=np.int64(n_positive),
+        n_positive_domains=np.int64(n_positive_domains),
+    )
+    log("done")
+
 
 if __name__ == "__main__":
     main()
diff --git a/bin/insert_ppi_negative_selection.py b/bin/insert_ppi_negative_selection.py
new file mode 100755
index 0000000..769f127
--- /dev/null
+++ b/bin/insert_ppi_negative_selection.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Pick the best per-seed negative-DDI selection and insert it into the domainsplit
+SQLite.
+
+Reads every ``score_*.json`` + ``pairs_*.tsv`` produced by the parallel
+``select_ppi_negative_dans.py`` jobs, picks the selection with the lowest
+objective ``J`` (ties broken by the smaller seed, so the result is fully
+deterministic regardless of which SLURM task finished first), and inserts that
+seed's Pfam pairs as negatives via the shared ``ddi_db_utils`` helpers.
+
+Prints a positive reference line (absolute baseline) followed by one line per
+seed and a WINNER line, and writes the same data to a published scores TSV.
+"""
+
+import argparse
+import glob
+import json
+import sqlite3
+
+from ddi_db_utils import count_source, ensure_domains, insert_ddis
+
+
+TAG = "[neg_insert]"
+J_ROUND = 12
+
+
+def log(msg):
+    print(f"{TAG} {msg}", flush=True)
+
+
+def parse_args():
+    """Parse the CLI: database, scores TSV, input patterns and source label."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", required=True)
+    parser.add_argument("--scores-out", required=True, help="output consolidated scores TSV path")
+    parser.add_argument("--score-glob", default="score_*.json")
+    parser.add_argument("--pairs-template", default="pairs_{seed}.tsv")
+    parser.add_argument("--source-label", default="inferred_ppi_screen_negative")
+    return parser.parse_args()
+
+
+def read_pairs(path):
+    """Load the tab-separated Pfam pairs in *path* as ``(a, b)`` tuples.
+
+    Empty lines are skipped; any other line must hold exactly two fields.
+    """
+    with open(path) as handle:
+        rows = (raw.rstrip("\n") for raw in handle)
+        fields = [row.split("\t") for row in rows if row]
+    # Two-target unpacking keeps the ValueError on malformed rows.
+    return [(a, b) for a, b in fields]
+
+
+def main():
+    args = parse_args()
+
+    score_files = sorted(glob.glob(args.score_glob))
+    if not score_files:
+        raise SystemExit(f"{TAG} no score files matching {args.score_glob}")
+
+    records = []
+    for path in score_files:
+        with open(path) as fh:
+            records.append(json.load(fh))
+
+    # Deterministic winner: lowest J (rounded), ties broken by smaller seed.
+    records.sort(key=lambda r: (round(r["J"], J_ROUND), r["seed"]))
+    winner = records[0]
+    winner_seed = winner["seed"]
+
+    pairs_path = args.pairs_template.format(seed=winner_seed)
+    pairs = read_pairs(pairs_path)
+
+    conn = sqlite3.connect(args.db)
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.execute("PRAGMA journal_mode=OFF")
+    conn.execute("PRAGMA synchronous=OFF")
+
+    ensure_domains(conn, (p for pair in pairs for p in pair))
+    insert_ddis(conn, pairs, negative=True, source=args.source_label)
+    conn.commit()
+    n_inserted = count_source(conn, args.source_label)
+    conn.close()
+
+    # --- report: positive reference, then every seed, then the winner ---
+    ref = records[0]
+    log(f"set=positive n_sel={ref['pos_n_sel']} n_dom={ref['pos_n_dom']} "
+        f"mean_pa={ref['pos_mean_pa']:.1f}")
+    for r in records:
+        log(f"set=negative seed={r['seed']} J={r['J']:.4f} pa={r['pa']:.4f} "
+            f"deg={r['deg']:.4f} cov={r['cov']:.4f} n_sel={r['n_sel']} "
+            f"n_dom={r['n_dom']} mean_pa={r['mean_pa']:.1f}")
+    log(f"WINNER seed={winner_seed} J={winner['J']:.4f} n_inserted={n_inserted}")
+
+    cols = ["set", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom",
+            "mean_pa", "winner"]
+    with open(args.scores_out, "w") as fh:
+        fh.write("\t".join(cols) + "\n")
+        fh.write("\t".join([
+            "positive", "NA", "NA", "NA", "NA", "NA",
+            str(ref["pos_n_sel"]), str(ref["pos_n_dom"]),
+            f"{ref['pos_mean_pa']:.2f}", "NA",
+        ]) + "\n")
+        for r in records:
+            fh.write("\t".join([
+                "negative", str(r["seed"]),
+                f"{r['J']:.6f}", f"{r['pa']:.6f}", f"{r['deg']:.6f}",
+                f"{r['cov']:.6f}", str(r["n_sel"]), str(r["n_dom"]),
+                f"{r['mean_pa']:.2f}",
+                "1" if r["seed"] == winner_seed else "0",
+            ]) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/select_ppi_negative_dans.py b/bin/select_ppi_negative_dans.py
new file mode 100755
index 0000000..c83b3d9
--- /dev/null
+++ b/bin/select_ppi_negative_dans.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Select negative DDIs from a candidate pool (``neg_pool.npz``) for one random
+seed, score the selection, and emit the chosen Pfam pairs + a score JSON.
+
+Degree-aware node sampling (DANS, Cappelletti et al. 2024, Bioinformatics
+Advances vbae036) matches the negative degree distribution to the positives by
+sampling edges with probability proportional to preferential attachment
+(PA = deg(a)*deg(b)).  Applied naively to a fixed candidate pool that already has
+the positive mean PA, that overshoots: sampling proportional to PA draws edges
+with mean PA = E[PA^2]/E[PA] >> pool mean, concentrating on a few hub domains.
+
+This selector keeps the PA-proportional draw but adds a hard per-domain CAP at
+the positive degree, and refills in multiple passes until the target negative
+count (== positive count) is reached:
+
+  * pass 1 draws ``target`` candidates PA-weighted without replacement and adds
+    them while no endpoint exceeds its cap;
+  * each refill pass prunes already-picked candidates and any candidate touching
+    a saturated domain (it would only be skipped again), renormalises the PA
+    weights over the survivors, and draws the deficit (floored at ``min_batch``);
+  * the loop stops when ``target`` is hit, or early if the pruned pool can no
+    longer supply an eligible edge -- that shortfall is the true feasibility
+    ceiling of the fixed pool.
+
+Capping bounds each domain's negative degree by its positive degree (degree
+distribution kept close, hub-driven mean-PA blow-up impossible); the PA-weighted
+draw fills high-degree domains toward their cap first, approximating the
+positive PA distribution; the refill drives the count toward the positive total.
+
+The selection is scored against the positives with a combined objective (lower is
+better):  J = w_pa*pa + w_deg*deg + w_cov*cov, where
+  pa  = Wasserstein-1 between log1p(PA_neg) and log1p(PA_pos), normalised by the
+        spread of the positive log1p(PA);
+  deg = Kolmogorov-Smirnov statistic between the per-domain negative degree
+        distribution (0 for unused domains) and the positive degree distribution;
+  cov = 1 - domains_used_neg / domains_pos.
+"""
+
+import argparse
+import json
+
+import numpy as np
+
+
+TAG = "[neg_select]"
+
+
+def log(msg):
+    print(f"{TAG} {msg}", flush=True)
+
+
+def parse_args():
+    """Parse the CLI: pool/output paths, seed, objective weights, batch floor."""
+    parser = argparse.ArgumentParser()
+    add = parser.add_argument
+    add("--pool", required=True, help="candidate-pool .npz from BUILD step")
+    add("--seed", type=int, required=True)
+    add("--score-out", required=True, help="output score JSON path")
+    add("--pairs-out", required=True, help="output selected-pairs TSV path")
+    # Weights of the PA, degree and coverage terms of the objective J.
+    for flag, weight in (("--w-pa", 0.5), ("--w-deg", 0.3), ("--w-cov", 0.2)):
+        add(flag, type=float, default=weight)
+    add(
+        "--min-batch", type=int, default=20,
+        help="Minimum candidates drawn per refill pass so the tail keeps progressing.",
+    )
+    return parser.parse_args()
+
+
+def wasserstein1(x, y):
+    """1-Wasserstein distance between two 1D empirical samples (numpy only).
+
+    Integrates |ECDF_x - ECDF_y| over the pooled, sorted sample values.
+    """
+    xs = np.sort(np.asarray(x, dtype=float))
+    ys = np.sort(np.asarray(y, dtype=float))
+    support = np.sort(np.concatenate([xs, ys]))
+    ecdf_x = np.searchsorted(xs, support[:-1], side="right") / xs.size
+    ecdf_y = np.searchsorted(ys, support[:-1], side="right") / ys.size
+    return float(np.sum(np.abs(ecdf_x - ecdf_y) * np.diff(support)))
+
+
+def ks_statistic(x, y):
+    """Two-sample Kolmogorov-Smirnov statistic (numpy only)."""
+    xs = np.sort(np.asarray(x, dtype=float))
+    ys = np.sort(np.asarray(y, dtype=float))
+    support = np.sort(np.concatenate([xs, ys]))
+    ecdf_x = np.searchsorted(xs, support, side="right") / xs.size
+    ecdf_y = np.searchsorted(ys, support, side="right") / ys.size
+    # The statistic is the largest vertical gap between the two ECDFs.
+    return float(np.abs(ecdf_x - ecdf_y).max())
+
+
+def select(cand_ai, cand_bi, pa, cap, target, seed, min_batch):
+    """PA-weighted, degree-capped, multi-pass refill selection.
+
+    A self-pair (a == b) uses two cap units, so no domain exceeds ``cap``.
+    Returns the array of selected candidate indices.
+    """
+    rng = np.random.default_rng(seed)
+    remaining = cap.copy()
+    picked = np.zeros(cand_ai.size, dtype=bool)
+    need = np.where(cand_ai == cand_bi, 2, 1)  # cap units used per endpoint
+    selected = []
+
+    def draw_and_add(batch):
+        elig = np.flatnonzero(
+            (~picked) & (remaining[cand_ai] >= need) & (remaining[cand_bi] >= need)
+        )
+        if elig.size == 0:
+            return 0
+        w = pa[elig].astype(float)
+        total = w.sum()
+        p = (w / total) if total > 0 else None
+        k = int(min(batch, elig.size))
+        chosen = rng.choice(elig, size=k, replace=False, p=p)
+        added = 0
+        for ci in chosen:
+            a, b = cand_ai[ci], cand_bi[ci]
+            if remaining[a] >= need[ci] and remaining[b] >= need[ci]:
+                selected.append(int(ci))
+                picked[ci] = True
+                remaining[a] -= 1
+                remaining[b] -= 1
+                added += 1
+                if len(selected) >= target:
+                    break
+        return added
+
+    draw_and_add(target)
+    n_passes = 1
+    while len(selected) < target:
+        missing = target - len(selected)
+        added = draw_and_add(max(missing, min_batch))
+        n_passes += 1
+        if added == 0:
+            log(f"pool exhausted after {n_passes} passes; "
+                f"selected {len(selected)}/{target}")
+            break
+
+    log(f"selection done in {n_passes} passes: {len(selected)}/{target} edges")
+    return np.array(selected, dtype=np.int64)
+
+
+def main():
+    args = parse_args()
+
+    data = np.load(args.pool, allow_pickle=True)
+    cand_a = data["cand_a"]
+    cand_b = data["cand_b"]
+    pos_dom = data["pos_dom"]
+    pos_deg = data["pos_deg"].astype(np.int64)
+    pos_edge_pa = data["pos_edge_pa"].astype(np.int64)
+    n_positive = int(data["n_positive"])
+    n_positive_domains = int(data["n_positive_domains"])
+
+    log(f"pool: {cand_a.size} candidates, {pos_dom.size} positive domains, "
+        f"target = {n_positive}")
+
+    # Map every domain to an integer index over the positive-domain universe.
+    domain_index = {d: i for i, d in enumerate(pos_dom)}
+    cand_ai = np.fromiter((domain_index[a] for a in cand_a), dtype=np.int64,
+                          count=cand_a.size)
+    cand_bi = np.fromiter((domain_index[b] for b in cand_b), dtype=np.int64,
+                          count=cand_b.size)
+    cap = pos_deg.copy()
+    pa = pos_deg[cand_ai] * pos_deg[cand_bi]
+
+    sel = select(cand_ai, cand_bi, pa, cap, n_positive, args.seed, args.min_batch)
+
+    # --- selected-set statistics ---
+    sel_ai = cand_ai[sel]
+    sel_bi = cand_bi[sel]
+    neg_pa = pa[sel]
+    n_sel = int(sel.size)
+    mean_pa_neg = float(neg_pa.mean()) if n_sel else 0.0
+
+    neg_deg = np.zeros(pos_dom.size, dtype=np.int64)
+    np.add.at(neg_deg, sel_ai, 1)
+    np.add.at(neg_deg, sel_bi, 1)
+    n_dom = int(np.count_nonzero(neg_deg))
+
+    # --- objective ---
+    pos_logpa = np.log1p(pos_edge_pa.astype(float))
+    neg_logpa = np.log1p(neg_pa.astype(float))
+    spread = float(pos_logpa.max() - pos_logpa.min())
+    pa_term = wasserstein1(neg_logpa, pos_logpa) / spread if spread > 0 else 0.0
+    deg_term = ks_statistic(neg_deg, pos_deg)
+    cov_term = 1.0 - (n_dom / n_positive_domains) if n_positive_domains else 0.0
+    j = args.w_pa * pa_term + args.w_deg * deg_term + args.w_cov * cov_term
+
+    pos_mean_pa = float(pos_edge_pa.mean())
+
+    log(f"set=positive n_sel={n_positive} n_dom={n_positive_domains} "
+        f"mean_pa={pos_mean_pa:.1f}")
+    log(f"set=negative seed={args.seed} J={j:.4f} pa={pa_term:.4f} "
+        f"deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} n_dom={n_dom} "
+        f"mean_pa={mean_pa_neg:.1f}")
+
+    score = {
+        "seed": int(args.seed),
+        "J": j,
+        "pa": pa_term,
+        "deg": deg_term,
+        "cov": cov_term,
+        "n_sel": n_sel,
+        "n_dom": n_dom,
+        "mean_pa": mean_pa_neg,
+        "pos_n_sel": n_positive,
+        "pos_n_dom": n_positive_domains,
+        "pos_mean_pa": pos_mean_pa,
+        "w_pa": args.w_pa,
+        "w_deg": args.w_deg,
+        "w_cov": args.w_cov,
+    }
+    with open(args.score_out, "w") as fh:
+        json.dump(score, fh)
+
+    with open(args.pairs_out, "w") as fh:
+        for ci in sel:
+            fh.write(f"{cand_a[ci]}\t{cand_b[ci]}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/conf/modules.config b/conf/modules.config
index a91b4f8..aa3bdda 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -29,10 +29,23 @@ process {
     // default publishDir copy each of them under `insert/` / `smoke/` /
     // `init/` would create stale duplicates and racy filename collisions.
     // Disable publishing for these intermediates explicitly.
-    withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|INSERT_PPI_NEGATIVE_DDIS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' {
+    withName: 'INIT_DOMAINSPLIT_DB|INSERT_3DID|INSERT_SINGLE_DOMAIN_PPI|INSERT_PPIDM|INSERT_NEGATOME|REMOVE_SELF_INTERACTIONS|BUILD_SWISSPROT_PFAM_MAP|BUILD_PPI_NEGATIVE_POOL|SELECT_PPI_NEGATIVE_DANS|SMOKE_FILTER|INSERT_DOMAIN_GO_TERMS|INSERT_PROTEINS_WITH_EMBEDDINGS|INSERT_PROTEIN_GO_TERMS|INSERT_PPI|INSERT_DOMAIN_PROTEIN_MAPPING' {
         publishDir = [ enabled: false ]
     }
 
+    // INSERT_PPI_NEGATIVE_SELECTION's domainsplit.sqlite3 is an intermediate
+    // (published only via the workflow-level output block), but its
+    // negative_ppi_seed_scores.tsv diagnostic IS published.
+    withName: 'INSERT_PPI_NEGATIVE_SELECTION' {
+        publishDir = [
+            path: { "${params.outdir}/negative_ppi" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename ->
+                (filename.equals('versions.yml') || filename.endsWith('.sqlite3')) ? null : filename
+            }
+        ]
+    }
+
     // EXTRACT_UNIQUE_DOMAINS emits a transient pfam_ids.txt consumed only by
     // DOWNLOAD_PFAM_ALIGNMENTS_BATCH in the same subworkflow.
     withName: 'EXTRACT_UNIQUE_DOMAINS' {
diff --git a/modules/local/insert_ppi_negative_ddis/environment.yml b/modules/local/build_ppi_negative_pool/environment.yml
similarity index 100%
rename from modules/local/insert_ppi_negative_ddis/environment.yml
rename to modules/local/build_ppi_negative_pool/environment.yml
diff --git a/modules/local/insert_ppi_negative_ddis/main.nf b/modules/local/build_ppi_negative_pool/main.nf
similarity index 86%
rename from modules/local/insert_ppi_negative_ddis/main.nf
rename to modules/local/build_ppi_negative_pool/main.nf
index 3dc3998..3ab76ec 100644
--- a/modules/local/insert_ppi_negative_ddis/main.nf
+++ b/modules/local/build_ppi_negative_pool/main.nf
@@ -1,5 +1,5 @@
-process INSERT_PPI_NEGATIVE_DDIS {
-    tag "insert_ppi_negative_ddis"
+process BUILD_PPI_NEGATIVE_POOL {
+    tag "build_ppi_negative_pool"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
     container "docker://konstantinpelz/domainsplit-general:1.0.0"
@@ -12,6 +12,7 @@ process INSERT_PPI_NEGATIVE_DDIS {
 
     output:
     path "domainsplit.sqlite3",        emit: domainsplit_db
+    path "neg_pool.npz",               emit: neg_pool
     path "uniprot_pfam_mapping.json",  emit: pfam_mapping
     path "versions.yml",               emit: versions
 
@@ -20,10 +21,11 @@ process INSERT_PPI_NEGATIVE_DDIS {
     """
     cp "${domainsplit_db_in}" domainsplit.sqlite3
 
-    build_ppi_negative_ddis.py \\
+    build_ppi_negative_pool.py \\
         --db domainsplit.sqlite3 \\
         --parquet "${negative_ppi_parquet}" \\
         --pfam-mapping-out uniprot_pfam_mapping.json \\
+        --pool-out neg_pool.npz \\
         --min-n-tested ${min_n_tested} \\
         ${no_self}
 
diff --git a/modules/local/insert_ppi_negative_selection/environment.yml b/modules/local/insert_ppi_negative_selection/environment.yml
new file mode 100644
index 0000000..ac84956
--- /dev/null
+++ b/modules/local/insert_ppi_negative_selection/environment.yml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.12
+  - sqlite
+  - numpy
diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
new file mode 100644
index 0000000..f31467b
--- /dev/null
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -0,0 +1,31 @@
+process INSERT_PPI_NEGATIVE_SELECTION {
+    tag "insert_ppi_negative_selection"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
+    path score_jsons
+    path pairs_tsvs
+
+    output:
+    path "domainsplit.sqlite3",          emit: domainsplit_db
+    path "negative_ppi_seed_scores.tsv", emit: scores
+    path "versions.yml",                 emit: versions
+
+    script:
+    """
+    cp "${domainsplit_db_in}" domainsplit.sqlite3
+
+    insert_ppi_negative_selection.py \\
+        --db domainsplit.sqlite3 \\
+        --scores-out negative_ppi_seed_scores.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python3 -c 'import sys; print(sys.version.split()[0])')
+        sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/select_ppi_negative_dans/environment.yml b/modules/local/select_ppi_negative_dans/environment.yml
new file mode 100644
index 0000000..22cb361
--- /dev/null
+++ b/modules/local/select_ppi_negative_dans/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.12
+  - numpy
diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf
new file mode 100644
index 0000000..4163661
--- /dev/null
+++ b/modules/local/select_ppi_negative_dans/main.nf
@@ -0,0 +1,28 @@
+process SELECT_PPI_NEGATIVE_DANS {
+    tag "select_ppi_negative_dans:seed=${seed}"
+    label 'process_low'
+    conda "${moduleDir}/environment.yml"
+    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+
+    input:
+    tuple val(seed), path(neg_pool)
+
+    output:
+    tuple val(seed), path("score_${seed}.json"), path("pairs_${seed}.tsv"), emit: result
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    select_ppi_negative_dans.py \\
+        --pool "${neg_pool}" \\
+        --seed ${seed} \\
+        --score-out score_${seed}.json \\
+        --pairs-out pairs_${seed}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python3 -c 'import sys; print(sys.version.split()[0])')
+        numpy: \$(python3 -c 'import numpy; print(numpy.__version__)')
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 08bf661..aa5af2a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -67,6 +67,9 @@ params {
     // count. Required input (no default; must be supplied per run).
     negative_ppi_parquet         = null
     negative_ppi_min_n_tested    = 5
+    // Base seed for the multi-seed negative-DDI selection: 5 parallel SLURM
+    // jobs run with seeds base+1..+5 and the best-scoring selection is kept.
+    negative_ppi_seed            = 42
 
     // Reviewed-human UniProt -> Pfam stream used to detect single-domain
     // proteins (accession, entry name, gene names, Pfam xrefs).
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 77796a4..ef83a4b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -165,6 +165,12 @@
                     "minimum": 1,
                     "fa_icon": "fas fa-filter"
                 },
+                "negative_ppi_seed": {
+                    "type": "integer",
+                    "description": "Base random seed for the multi-seed negative-DDI selection. Five parallel jobs run the degree-capped PA-weighted sampler with seeds base+1..+5; the lowest-scoring (best degree/PA/coverage-matched) selection is inserted.",
+                    "default": 42,
+                    "fa_icon": "fas fa-dice"
+                },
                 "pfam_download_batch_size": {
                     "type": "integer",
                     "description": "Number of Pfam IDs grouped into a single download job to reduce scheduler overhead.",
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index 3c377f4..9c32b68 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -27,7 +27,9 @@ include { INSERT_SINGLE_DOMAIN_PPI  } from '../../../modules/local/insert_single
 include { INSERT_PPIDM              } from '../../../modules/local/insert_ppidm/main.nf'
 include { INSERT_NEGATOME           } from '../../../modules/local/insert_negatome/main.nf'
 include { REMOVE_SELF_INTERACTIONS  } from '../../../modules/local/remove_self_interactions/main.nf'
-include { INSERT_PPI_NEGATIVE_DDIS  } from '../../../modules/local/insert_ppi_negative_ddis/main.nf'
+include { BUILD_PPI_NEGATIVE_POOL       } from '../../../modules/local/build_ppi_negative_pool/main.nf'
+include { SELECT_PPI_NEGATIVE_DANS      } from '../../../modules/local/select_ppi_negative_dans/main.nf'
+include { INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf'
 include { SMOKE_FILTER              } from '../../../modules/local/smoke_filter/main.nf'
 
 workflow COLLECT_DDI_DATA {
@@ -72,15 +74,36 @@ workflow COLLECT_DDI_DATA {
         domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db
     }
 
-    // 7. high-confidence non-PPI negatives (inferred only over 3did domains)
-    ppi_result = INSERT_PPI_NEGATIVE_DDIS(
+    // 7. high-confidence non-PPI negatives (inferred only over 3did domains).
+    //    The expensive, deterministic UniProt fetch + candidate-pool build runs
+    //    once; selection fans out over 5 seeds (base+1..+5) in parallel SLURM
+    //    jobs, and the best-scoring (degree/PA-matched) selection is inserted.
+    pool = BUILD_PPI_NEGATIVE_POOL(
         domainsplit_db,
         file(negative_ppi_parquet),
         params.negative_ppi_min_n_tested,
         params.self_interaction,
     )
-    domainsplit_db = ppi_result.domainsplit_db
-    pfam_mapping   = ppi_result.pfam_mapping
+
+    // Pair the single shared pool file with every seed via combine(): a one-item
+    // queue channel fed alongside the 5-seed queue would only trigger one task.
+    seeds = Channel.of(1, 2, 3, 4, 5).map { params.negative_ppi_seed + it }
+    sel   = SELECT_PPI_NEGATIVE_DANS(seeds.combine(pool.neg_pool))
+
+    sel.result
+        .multiMap { seed, score, pairs ->
+            scores: score
+            pairs:  pairs
+        }
+        .set { selection }
+
+    best = INSERT_PPI_NEGATIVE_SELECTION(
+        pool.domainsplit_db,
+        selection.scores.collect(),
+        selection.pairs.collect(),
+    )
+    domainsplit_db = best.domainsplit_db
+    pfam_mapping   = pool.pfam_mapping
 
     if (params.smoke_test_n_ddis != null) {
         domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db
diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf
index 972b410..4f6e3a7 100644
--- a/subworkflows/local/split_domainsplit_database/main.nf
+++ b/subworkflows/local/split_domainsplit_database/main.nf
@@ -56,7 +56,7 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
     // The two methods that mimic a within-distribution evaluation use only the
     // "core" sources: 3did positives + high-confidence non-PPI negatives.
     // 'inferred_ppi_screen_negative' must stay in sync with the --source-label
-    // default in bin/build_ppi_negative_ddis.py.
+    // default in bin/insert_ppi_negative_selection.py.
     def core_sources = ['3did', 'inferred_ppi_screen_negative']
 
     // External-validation test set: held-out sources placed as is.

From 097bd3bf1f5fdcbffae54a16c237a8e00d496dc0 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Wed, 10 Jun 2026 18:16:21 +0200
Subject: [PATCH 08/16] reworked generating negatives

---
 bin/build_ppi_negative_pool.py                |  57 +++-
 bin/ddi_db_utils.py                           |  27 +-
 bin/insert_ppi_negative_selection.py          | 162 ++++++----
 bin/select_ppi_negative_dans.py               | 278 ++++++++++--------
 conf/modules.config                           |   2 +-
 modules/local/init_domainsplit_db/main.nf     |   2 +-
 .../insert_ppi_negative_selection/main.nf     |  21 +-
 .../local/select_ppi_negative_dans/main.nf    |  16 +-
 nextflow.config                               |   4 +-
 nextflow_schema.json                          |   2 +-
 subworkflows/local/collect_ddi_data/main.nf   |  44 +--
 .../local/split_domainsplit_database/main.nf  | 102 ++++---
 .../test_insert_ppi_negative_selection.py     | 135 +++++++++
 tests/python/test_select_ppi_negative_dans.py | 148 ++++++++++
 14 files changed, 725 insertions(+), 275 deletions(-)
 create mode 100644 tests/python/test_insert_ppi_negative_selection.py
 create mode 100644 tests/python/test_select_ppi_negative_dans.py

diff --git a/bin/build_ppi_negative_pool.py b/bin/build_ppi_negative_pool.py
index e44fa31..b2d6b2a 100755
--- a/bin/build_ppi_negative_pool.py
+++ b/bin/build_ppi_negative_pool.py
@@ -11,10 +11,16 @@
 into parallel per-seed jobs (``select_ppi_negative_dans.py``) that read the dump,
 and the winning selection is inserted by ``insert_ppi_negative_selection.py``.
 
-The dump also carries the positive-graph statistics the selector needs:
-``pos_degree`` (per-domain positive degree, the cap), the positive-edge
-preferential-attachment (PA = deg(a)*deg(b)) array, the target negative count
-(``n_positive``) and the positive domain count (``n_positive_domains``).
+The dump carries what both negative-construction methods need (uncapped DANS,
+Cappelletti et al. vbae036):
+  * Method 1 "deletion" -- the candidate pool ``cand_a``/``cand_b``, the pool
+    domain universe ``pool_dom`` and the *reduced* positive degrees
+    ``pool_deg_r`` (3did positives restricted to pool domains), plus the reduced
+    positive-edge PA and target count.
+  * Method 2 "random_addition" -- the full positive edge endpoint multiset
+    ``pos_a``/``pos_b`` (DANS samples node-pairs proportional to degree from it),
+    the full positive degrees/PA/count, and the forbidden-pair set
+    ``forbidden_a``/``forbidden_b`` (all existing DDIs) that DANS must avoid.
 """
 
 import argparse
@@ -334,19 +340,62 @@ def row_pfams(gene):
 
     cand_a = np.array([a for a, b in fresh_pairs], dtype=object)
     cand_b = np.array([b for a, b in fresh_pairs], dtype=object)
+
+    # ---- Method 1 ("deletion"): reduce the positives to the candidate-domain
+    #      universe so positive and candidate domains coincide; DANS then draws
+    #      degree-aware over the fixed candidate pool. ----
+    pool_domains = {d for pair in fresh_pairs for d in pair}
+    pos_edges_r = [
+        (a, b) for a, b in pos_edges if a in pool_domains and b in pool_domains
+    ]
+    pos_degree_r = defaultdict(int)
+    for a, b in pos_edges_r:
+        pos_degree_r[a] += 1
+        pos_degree_r[b] += 1
+    n_positive_r = len(pos_edges_r)
+    n_positive_domains_r = len(pos_degree_r)
+    pos_edge_pa_r = np.array(
+        [pos_degree_r[a] * pos_degree_r[b] for a, b in pos_edges_r], dtype=np.int64
+    )
+    # Every pool domain carries its reduced-positive degree (0 if it has no edge
+    # in the reduced positive graph); the selector turns these into PA weights.
+    pool_dom = np.array(sorted(pool_domains, key=pfam_sort_key), dtype=object)
+    pool_deg_r = np.array([pos_degree_r[d] for d in pool_dom], dtype=np.int64)
+    log(f"n_pool_domains = {len(pool_dom)}")
+    log(f"n_reduced_positive_ddis = {n_positive_r}")
+    log(f"n_reduced_positive_domains = {n_positive_domains_r}")
+
+    # ---- Method 2 ("random_addition"): plain DANS over the full positive set.
+    #      The selector samples node-pairs proportional to degree by drawing from
+    #      the endpoint multiset of these edges and rejects existing pairs. ----
+    pos_a = np.array([a for a, b in pos_edges], dtype=object)
+    pos_b = np.array([b for a, b in pos_edges], dtype=object)
     pos_dom = np.array(list(pos_degree.keys()), dtype=object)
     pos_deg = np.array([pos_degree[d] for d in pos_dom], dtype=np.int64)
+    forbidden_a = np.array([a for a, b in existing_pairs], dtype=object)
+    forbidden_b = np.array([b for a, b in existing_pairs], dtype=object)
 
     log(f"writing candidate pool to {args.pool_out}")
     np.savez(
         args.pool_out,
+        # --- Method 1 (deletion) ---
         cand_a=cand_a,
         cand_b=cand_b,
+        pool_dom=pool_dom,
+        pool_deg_r=pool_deg_r,
+        pos_edge_pa_r=pos_edge_pa_r,
+        n_positive_r=np.int64(n_positive_r),
+        n_positive_domains_r=np.int64(n_positive_domains_r),
+        # --- Method 2 (random_addition) ---
+        pos_a=pos_a,
+        pos_b=pos_b,
         pos_dom=pos_dom,
         pos_deg=pos_deg,
         pos_edge_pa=pos_edge_pa,
         n_positive=np.int64(n_positive),
         n_positive_domains=np.int64(n_positive_domains),
+        forbidden_a=forbidden_a,
+        forbidden_b=forbidden_b,
     )
     log("done")
 
diff --git a/bin/ddi_db_utils.py b/bin/ddi_db_utils.py
index e2c6570..4d451c4 100755
--- a/bin/ddi_db_utils.py
+++ b/bin/ddi_db_utils.py
@@ -7,8 +7,14 @@
 :func:`pfam_sort_key`) so the stored ``(domain_id_a, domain_id_b)`` order is
 stable -- a pair is deduplicated regardless of the order it is supplied in and
 regardless of which source inserted it first or in what order domains were
-created -- and ``INSERT OR IGNORE`` keeps the earliest (positive *or* negative)
-row.
+created.
+
+The table's ``UNIQUE(domain_id_a, domain_id_b, source)`` lets the same pair be
+stored under different sources.  To keep the historical "earliest source wins"
+behaviour for the canonical sources, :func:`insert_ddis` defaults to
+``dedup_across_sources=True``, which skips any pair already present under another
+source.  The negative-DDI method copies (which intentionally duplicate a pair
+under a new label) pass ``dedup_across_sources=False``.
 """
 
 import sqlite3
@@ -43,18 +49,31 @@ def _pfam_to_id(conn):
     return {pfam: did for did, pfam in conn.execute("SELECT id, pfam_id FROM domain")}
 
 
-def insert_ddis(conn, pairs, negative, source):
+def insert_ddis(conn, pairs, negative, source, dedup_across_sources=True):
     """Insert DDIs for ``(pfam_a, pfam_b)`` pairs.
 
     Domains must already exist (call :func:`ensure_domains` first); pairs whose
     Pfam is missing from the ``domain`` table are skipped.  Each pair is stored
     as ``(min(id), max(id))`` so swapped duplicates collapse onto one row.
 
+    With ``dedup_across_sources=True`` (default) a pair already present under any
+    other source is skipped, preserving the "earliest source wins" semantics for
+    the canonical sources.  Pass ``False`` to allow the pair to be duplicated
+    under this ``source`` (used by the negative-DDI method copies); same-source
+    duplicates are still collapsed by ``UNIQUE(domain_id_a, domain_id_b, source)``.
+
     Returns the number of rows offered to ``INSERT OR IGNORE`` (before dedup by
     the DB).
     """
     pfam_to_id = _pfam_to_id(conn)
     neg = int(bool(negative))
+    existing = set()
+    if dedup_across_sources:
+        existing = {
+            (a, b) for a, b in conn.execute(
+                "SELECT domain_id_a, domain_id_b FROM domain_domain_interaction"
+            )
+        }
     rows = []
     seen = set()
     for a, b in pairs:
@@ -63,7 +82,7 @@ def insert_ddis(conn, pairs, negative, source):
         if ia is None or ib is None:
             continue
         key = (ia, ib) if pfam_sort_key(a) <= pfam_sort_key(b) else (ib, ia)
-        if key in seen:
+        if key in seen or key in existing:
             continue
         seen.add(key)
         rows.append((key[0], key[1], neg, source))
diff --git a/bin/insert_ppi_negative_selection.py b/bin/insert_ppi_negative_selection.py
index 769f127..bfced58 100755
--- a/bin/insert_ppi_negative_selection.py
+++ b/bin/insert_ppi_negative_selection.py
@@ -1,28 +1,37 @@
 #!/usr/bin/env python3
 """
-Pick the best per-seed negative-DDI selection and insert it into the domainsplit
-SQLite.
-
-Reads every ``score_*.json`` + ``pairs_*.tsv`` produced by the parallel
-``select_ppi_negative_dans.py`` jobs, picks the selection with the lowest
-objective ``J`` (ties broken by the smaller seed, so the result is fully
-deterministic regardless of which SLURM task finished first), and inserts that
-seed's Pfam pairs as negatives via the shared ``ddi_db_utils`` helpers.
-
-Prints a positive reference line (absolute baseline) followed by one line per
-seed and a WINNER line, and writes the same data to a published scores TSV.
+Insert the two negative-DDI construction methods into the domainsplit SQLite,
+each under its own ``source`` labels so both can coexist in one database and be
+selected independently by the downstream splits.
+
+Both methods use uncapped Degree-Aware Node Sampling (DANS); the per-seed pairs
+are produced by ``select_ppi_negative_dans.py`` (one run per method, no
+pick-best).  This step copies the matching positives and inserts the negatives:
+
+  * "deletion"        -- positives = 3did restricted to the candidate-pool domain
+    universe (``3did_deletion``); negatives =
+    ``inferred_ppi_screen_negative_for_deletion``.
+  * "random_addition" -- positives = the full 3did set (``3did_random_addition``);
+    negatives = ``inferred_ppi_screen_negative_for_random_addition``.
+
+The four method labels are inserted with ``dedup_across_sources=False`` so they
+may duplicate a pair already stored under the canonical ``3did`` source (the
+table's ``UNIQUE(domain_id_a, domain_id_b, source)`` keeps the labels distinct).
+
+Prints a positive reference + the negative selection for each method and writes
+the same data to a published scores TSV.
 """
 
 import argparse
-import glob
 import json
 import sqlite3
 
+import numpy as np
+
 from ddi_db_utils import count_source, ensure_domains, insert_ddis
 
 
 TAG = "[neg_insert]"
-J_ROUND = 12
 
 
 def log(msg):
@@ -32,11 +41,21 @@ def log(msg):
 def parse_args():
     p = argparse.ArgumentParser()
     p.add_argument("--db", required=True)
+    p.add_argument("--pool", required=True,
+                   help="candidate-pool .npz (for the pool-domain universe)")
+    p.add_argument("--pairs-deletion", required=True)
+    p.add_argument("--pairs-random-addition", required=True)
+    p.add_argument("--score-deletion", required=True)
+    p.add_argument("--score-random-addition", required=True)
     p.add_argument("--scores-out", required=True,
                    help="output consolidated scores TSV path")
-    p.add_argument("--score-glob", default="score_*.json")
-    p.add_argument("--pairs-template", default="pairs_{seed}.tsv")
-    p.add_argument("--source-label", default="inferred_ppi_screen_negative")
+    p.add_argument("--source-3did", default="3did")
+    p.add_argument("--label-pos-deletion", default="3did_deletion")
+    p.add_argument("--label-pos-random-addition", default="3did_random_addition")
+    p.add_argument("--label-neg-deletion",
+                   default="inferred_ppi_screen_negative_for_deletion")
+    p.add_argument("--label-neg-random-addition",
+                   default="inferred_ppi_screen_negative_for_random_addition")
     return p.parse_args()
 
 
@@ -52,63 +71,90 @@ def read_pairs(path):
     return pairs
 
 
-def main():
-    args = parse_args()
-
-    score_files = sorted(glob.glob(args.score_glob))
-    if not score_files:
-        raise SystemExit(f"{TAG} no score files matching {args.score_glob}")
+def load_positive_pairs(conn, source):
+    """Positive (pfam_a, pfam_b) pairs stored in the DB under ``source``."""
+    return conn.execute(
+        "SELECT da.pfam_id, db.pfam_id "
+        "FROM domain_domain_interaction AS ddi "
+        "JOIN domain AS da ON da.id = ddi.domain_id_a "
+        "JOIN domain AS db ON db.id = ddi.domain_id_b "
+        "WHERE ddi.negative = 0 AND ddi.source = ?",
+        (source,),
+    ).fetchall()
 
-    records = []
-    for path in score_files:
-        with open(path) as fh:
-            records.append(json.load(fh))
 
-    # Deterministic winner: lowest J (rounded), ties broken by smaller seed.
-    records.sort(key=lambda r: (round(r["J"], J_ROUND), r["seed"]))
-    winner = records[0]
-    winner_seed = winner["seed"]
+def main():
+    args = parse_args()
 
-    pairs_path = args.pairs_template.format(seed=winner_seed)
-    pairs = read_pairs(pairs_path)
+    pool_domains = set(np.load(args.pool, allow_pickle=True)["pool_dom"].tolist())
+    log(f"pool-domain universe: {len(pool_domains)} domains")
 
     conn = sqlite3.connect(args.db)
     conn.execute("PRAGMA foreign_keys=ON")
     conn.execute("PRAGMA journal_mode=OFF")
     conn.execute("PRAGMA synchronous=OFF")
 
-    ensure_domains(conn, (p for pair in pairs for p in pair))
-    insert_ddis(conn, pairs, negative=True, source=args.source_label)
+    # --- positives: full (random_addition) and pool-restricted (deletion) copies ---
+    pos_3did = load_positive_pairs(conn, args.source_3did)
+    pos_reduced = [
+        (a, b) for a, b in pos_3did if a in pool_domains and b in pool_domains
+    ]
+    log(f"3did positives: {len(pos_3did)} total, {len(pos_reduced)} within pool")
+
+    insert_ddis(conn, pos_3did, negative=False,
+                source=args.label_pos_random_addition, dedup_across_sources=False)
+    insert_ddis(conn, pos_reduced, negative=False,
+                source=args.label_pos_deletion, dedup_across_sources=False)
+
+    # --- negatives: one DANS selection per method ---
+    pairs_del = read_pairs(args.pairs_deletion)
+    pairs_rand = read_pairs(args.pairs_random_addition)
+
+    ensure_domains(conn, (p for pair in pairs_del for p in pair))
+    ensure_domains(conn, (p for pair in pairs_rand for p in pair))
+    insert_ddis(conn, pairs_del, negative=True,
+                source=args.label_neg_deletion, dedup_across_sources=False)
+    insert_ddis(conn, pairs_rand, negative=True,
+                source=args.label_neg_random_addition, dedup_across_sources=False)
+
     conn.commit()
-    n_inserted = count_source(conn, args.source_label)
+    counts = {
+        args.label_pos_random_addition: count_source(conn, args.label_pos_random_addition),
+        args.label_pos_deletion: count_source(conn, args.label_pos_deletion),
+        args.label_neg_random_addition: count_source(conn, args.label_neg_random_addition),
+        args.label_neg_deletion: count_source(conn, args.label_neg_deletion),
+    }
     conn.close()
 
-    # --- report: positive reference, then every seed, then the winner ---
-    ref = records[0]
-    log(f"set=positive n_sel={ref['pos_n_sel']} n_dom={ref['pos_n_dom']} "
-        f"mean_pa={ref['pos_mean_pa']:.1f}")
-    for r in records:
-        log(f"set=negative seed={r['seed']} J={r['J']:.4f} pa={r['pa']:.4f} "
-            f"deg={r['deg']:.4f} cov={r['cov']:.4f} n_sel={r['n_sel']} "
-            f"n_dom={r['n_dom']} mean_pa={r['mean_pa']:.1f}")
-    log(f"WINNER seed={winner_seed} J={winner['J']:.4f} n_inserted={n_inserted}")
-
-    cols = ["set", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom",
-            "mean_pa", "winner"]
+    with open(args.score_deletion) as fh:
+        sc_del = json.load(fh)
+    with open(args.score_random_addition) as fh:
+        sc_rand = json.load(fh)
+
+    for label, n in counts.items():
+        log(f"inserted source={label} rows={n}")
+    for sc in (sc_del, sc_rand):
+        log(f"method={sc['method']} set=positive n_sel={sc['pos_n_sel']} "
+            f"n_dom={sc['pos_n_dom']} mean_pa={sc['pos_mean_pa']:.1f}")
+        log(f"method={sc['method']} set=negative seed={sc['seed']} J={sc['J']:.4f} "
+            f"pa={sc['pa']:.4f} deg={sc['deg']:.4f} cov={sc['cov']:.4f} "
+            f"n_sel={sc['n_sel']} n_dom={sc['n_dom']} mean_pa={sc['mean_pa']:.1f}")
+
+    cols = ["set", "method", "seed", "J", "pa", "deg", "cov", "n_sel", "n_dom",
+            "mean_pa"]
     with open(args.scores_out, "w") as fh:
         fh.write("\t".join(cols) + "\n")
-        fh.write("\t".join([
-            "positive", "NA", "NA", "NA", "NA", "NA",
-            str(ref["pos_n_sel"]), str(ref["pos_n_dom"]),
-            f"{ref['pos_mean_pa']:.2f}", "NA",
-        ]) + "\n")
-        for r in records:
+        for sc in (sc_del, sc_rand):
+            fh.write("\t".join([
+                "positive", sc["method"], "NA", "NA", "NA", "NA", "NA",
+                str(sc["pos_n_sel"]), str(sc["pos_n_dom"]),
+                f"{sc['pos_mean_pa']:.2f}",
+            ]) + "\n")
             fh.write("\t".join([
-                "negative", str(r["seed"]),
-                f"{r['J']:.6f}", f"{r['pa']:.6f}", f"{r['deg']:.6f}",
-                f"{r['cov']:.6f}", str(r["n_sel"]), str(r["n_dom"]),
-                f"{r['mean_pa']:.2f}",
-                "1" if r["seed"] == winner_seed else "0",
+                "negative", sc["method"], str(sc["seed"]),
+                f"{sc['J']:.6f}", f"{sc['pa']:.6f}", f"{sc['deg']:.6f}",
+                f"{sc['cov']:.6f}", str(sc["n_sel"]), str(sc["n_dom"]),
+                f"{sc['mean_pa']:.2f}",
             ]) + "\n")
 
 
diff --git a/bin/select_ppi_negative_dans.py b/bin/select_ppi_negative_dans.py
index c83b3d9..a81235d 100755
--- a/bin/select_ppi_negative_dans.py
+++ b/bin/select_ppi_negative_dans.py
@@ -1,35 +1,31 @@
 #!/usr/bin/env python3
 """
-Select negative DDIs from a candidate pool (``neg_pool.npz``) for one random
-seed, score the selection, and emit the chosen Pfam pairs + a score JSON.
-
-Degree-aware node sampling (DANS, Cappelletti et al. 2024, Bioinformatics
-Advances vbae036) matched the negative degree distribution to the positives by
-sampling edges with probability proportional to preferential attachment
-(PA = deg(a)*deg(b)).  Applied naively to a fixed candidate pool that already has
-the positive mean PA, that overshoots: sampling proportional to PA draws edges
-with mean PA = E[PA^2]/E[PA] >> pool mean, concentrating on a few hub domains.
-
-This selector keeps the PA-proportional draw but adds a hard per-domain CAP at
-the positive degree, and refills in multiple passes until the target negative
-count (== positive count) is reached:
-
-  * pass 1 draws ``target`` candidates PA-weighted without replacement and adds
-    them while no endpoint exceeds its cap;
-  * each refill pass prunes already-picked candidates and any candidate touching
-    a saturated domain (it would only be skipped again), renormalises the PA
-    weights over the survivors, and draws the deficit (floored at ``min_batch``);
-  * the loop stops when ``target`` is hit, or early if the pruned pool can no
-    longer supply an eligible edge -- that shortfall is the true feasibility
-    ceiling of the fixed pool.
-
-Capping pins the negative degree sequence to the positive one (degree
-distribution matched, hub-driven mean-PA blow-up impossible); the PA-weighted
-draw fills high-degree domains toward their cap first, reproducing the positive
-PA distribution; the refill drives the count to the positive total.
-
-The selection is scored against the positives with a combined objective (lower is
-better):  J = w_pa*pa + w_deg*deg + w_cov*cov, where
+Select negative DDIs by Degree-Aware Node Sampling (DANS, Cappelletti et al.
+2024, Bioinformatics Advances vbae036) for one of two methods, and write the
+chosen Pfam pairs plus a small score JSON (reporting only -- there is no
+multi-seed pick-best).
+
+DANS draws a negative edge by sampling two endpoints proportional to node degree
+(equivalently: take the source of one random positive edge and the destination
+of another) and accepts the pair iff it is not an existing edge.  It is
+UNCAPPED: the negative degree *distribution* tracks the positive one without
+pinning an exact per-node degree sequence.
+
+  * method "deletion"        -- DANS restricted to the PPI-derived candidate pool
+    (``cand_a``/``cand_b``), with the positive degrees first reduced to the
+    candidate-domain universe (``pool_deg_r``).  Candidate edges are drawn
+    without replacement with probability proportional to the reduced preferential
+    attachment PA_r = deg_r(a)*deg_r(b); target = ``n_positive_r``.
+
+  * method "random_addition" -- plain DANS over the *full* positive set: sample
+    node-pairs from the positive endpoint multiset (``pos_a``/``pos_b``),
+    rejecting self-pairs, existing edges (``forbidden_*``) and duplicates;
+    target = ``n_positive``.  Domains absent from the candidate pool are reachable
+    here, so coverage and the degree distribution match the full positives.
+
+The selection is scored against the method-appropriate positives with a combined
+objective (lower is better), reported for inspection only:
+  J = w_pa*pa + w_deg*deg + w_cov*cov, where
   pa  = Wasserstein-1 between log1p(PA_neg) and log1p(PA_pos), normalised by the
         spread of the positive log1p(PA);
   deg = Kolmogorov-Smirnov statistic between the per-domain negative degree
@@ -42,6 +38,8 @@
 
 import numpy as np
 
+from ddi_db_utils import pfam_sort_key
+
 
 TAG = "[neg_select]"
 
@@ -53,18 +51,14 @@ def log(msg):
 def parse_args():
     p = argparse.ArgumentParser()
     p.add_argument("--pool", required=True, help="candidate-pool .npz from BUILD step")
+    p.add_argument("--method", required=True,
+                   choices=["deletion", "random_addition"])
     p.add_argument("--seed", type=int, required=True)
     p.add_argument("--score-out", required=True, help="output score JSON path")
     p.add_argument("--pairs-out", required=True, help="output selected-pairs TSV path")
     p.add_argument("--w-pa", type=float, default=0.5)
     p.add_argument("--w-deg", type=float, default=0.3)
     p.add_argument("--w-cov", type=float, default=0.2)
-    p.add_argument(
-        "--min-batch",
-        type=int,
-        default=20,
-        help="Minimum candidates drawn per refill pass so the tail keeps progressing.",
-    )
     return p.parse_args()
 
 
@@ -91,113 +85,145 @@ def ks_statistic(x, y):
     return float(np.max(np.abs(cx - cy)))
 
 
-def select(cand_ai, cand_bi, pa, cap, target, seed, min_batch):
-    """PA-weighted, degree-capped, multi-pass refill selection.
+def select_deletion(cand_a, cand_b, pool_dom, pool_deg_r, target, seed):
+    """DANS over the fixed candidate pool: draw up to `target` edges without
+    replacement with probability proportional to the reduced PA. No cap.
 
-    Returns the array of selected candidate indices.
+    Returns (selected_indices, cand_ai, cand_bi), where cand_ai/cand_bi give
+    the index into pool_dom of each candidate's endpoints (cand_a/cand_b).
     """
     rng = np.random.default_rng(seed)
-    n_cand = cand_ai.size
-    remaining = cap.copy()
-    picked = np.zeros(n_cand, dtype=bool)
-    selected = []
-
-    def draw_and_add(batch):
-        elig = np.flatnonzero(
-            (~picked) & (remaining[cand_ai] > 0) & (remaining[cand_bi] > 0)
-        )
-        if elig.size == 0:
-            return 0
-        w = pa[elig].astype(float)
-        total = w.sum()
-        p = (w / total) if total > 0 else None
-        k = int(min(batch, elig.size))
-        chosen = rng.choice(elig, size=k, replace=False, p=p)
-        added = 0
-        for ci in chosen:
-            a = cand_ai[ci]
-            b = cand_bi[ci]
-            if remaining[a] > 0 and remaining[b] > 0:
-                selected.append(int(ci))
-                picked[ci] = True
-                remaining[a] -= 1
-                remaining[b] -= 1
-                added += 1
-                if len(selected) >= target:
-                    break
-        return added
-
-    draw_and_add(target)
-    n_passes = 1
-    while len(selected) < target:
-        missing = target - len(selected)
-        added = draw_and_add(max(missing, min_batch))
-        n_passes += 1
-        if added == 0:
-            log(f"pool exhausted after {n_passes} passes; "
-                f"selected {len(selected)}/{target}")
-            break
-
-    log(f"selection done in {n_passes} passes: {len(selected)}/{target} edges")
-    return np.array(selected, dtype=np.int64)
-
-
-def main():
-    args = parse_args()
-
-    data = np.load(args.pool, allow_pickle=True)
-    cand_a = data["cand_a"]
-    cand_b = data["cand_b"]
-    pos_dom = data["pos_dom"]
-    pos_deg = data["pos_deg"].astype(np.int64)
-    pos_edge_pa = data["pos_edge_pa"].astype(np.int64)
-    n_positive = int(data["n_positive"])
-    n_positive_domains = int(data["n_positive_domains"])
-
-    log(f"pool: {cand_a.size} candidates, {pos_dom.size} positive domains, "
-        f"target = {n_positive}")
-
-    # Map every domain to an integer index over the positive-domain universe.
-    domain_index = {d: i for i, d in enumerate(pos_dom)}
+    domain_index = {d: i for i, d in enumerate(pool_dom)}
     cand_ai = np.fromiter((domain_index[a] for a in cand_a), dtype=np.int64,
                           count=cand_a.size)
     cand_bi = np.fromiter((domain_index[b] for b in cand_b), dtype=np.int64,
                           count=cand_b.size)
-    cap = pos_deg.copy()
-    pa = pos_deg[cand_ai] * pos_deg[cand_bi]
+    pa = (pool_deg_r[cand_ai] * pool_deg_r[cand_bi]).astype(float)
+    n_cand = int(cand_a.size)
+    k = int(min(target, n_cand))
+    if k < target:
+        log(f"deletion: candidate pool smaller than target "
+            f"({n_cand} < {target}); taking all")
+    total = pa.sum()
+    p = (pa / total) if total > 0 else None
+    sel = (rng.choice(n_cand, size=k, replace=False, p=p)
+           if k > 0 else np.empty(0, dtype=np.int64))
+    log(f"deletion: selected {sel.size}/{target} candidate edges")
+    return sel, cand_ai, cand_bi
+
+
+def select_random_addition(pos_a, pos_b, forbidden, target, seed):
+    """Canonical DANS over the full positive set: sample node-pairs from the
+    positive endpoint multiset (so endpoints are drawn proportional to degree),
+    rejecting self-pairs, existing edges and duplicates. No cap.
+    """
+    rng = np.random.default_rng(seed)
+    endpoints = np.concatenate([pos_a, pos_b])
+    m = int(endpoints.size)
+    picked = set()
+    out_a = []
+    out_b = []
+    attempts = 0
+    max_attempts = 200 * target + 1000
+    while len(out_a) < target and attempts < max_attempts:
+        need = target - len(out_a)
+        batch = int(min(max(need * 2, 1024), 5_000_000))
+        ui = rng.integers(0, m, size=batch)
+        vi = rng.integers(0, m, size=batch)
+        attempts += batch
+        for iu, iv in zip(ui.tolist(), vi.tolist()):
+            a = endpoints[iu]
+            b = endpoints[iv]
+            if a == b:
+                continue
+            key = (a, b) if pfam_sort_key(a) <= pfam_sort_key(b) else (b, a)
+            if key in forbidden or key in picked:
+                continue
+            picked.add(key)
+            out_a.append(key[0])
+            out_b.append(key[1])
+            if len(out_a) >= target:
+                break
+    if len(out_a) < target:
+        log(f"random_addition: only {len(out_a)}/{target} edges after "
+            f"{attempts} attempts (forbidden/duplicate saturation)")
+    else:
+        log(f"random_addition: selected {target}/{target} edges in "
+            f"{attempts} attempts")
+    return np.array(out_a, dtype=object), np.array(out_b, dtype=object)
+
 
-    sel = select(cand_ai, cand_bi, pa, cap, n_positive, args.seed, args.min_batch)
+def main():
+    args = parse_args()
+    data = np.load(args.pool, allow_pickle=True)
 
-    # --- selected-set statistics ---
-    sel_ai = cand_ai[sel]
-    sel_bi = cand_bi[sel]
-    neg_pa = pa[sel]
-    n_sel = int(sel.size)
+    if args.method == "deletion":
+        cand_a = data["cand_a"]
+        cand_b = data["cand_b"]
+        pool_dom = data["pool_dom"]
+        dom_deg = data["pool_deg_r"].astype(np.int64)
+        pos_edge_pa = data["pos_edge_pa_r"].astype(np.int64)
+        target = int(data["n_positive_r"])
+        n_pos_domains = int(data["n_positive_domains_r"])
+
+        log(f"pool: {cand_a.size} candidate edges, {pool_dom.size} pool domains, "
+            f"target = {target}")
+        sel, cand_ai, cand_bi = select_deletion(
+            cand_a, cand_b, pool_dom, dom_deg, target, args.seed
+        )
+        neg_ai = cand_ai[sel]
+        neg_bi = cand_bi[sel]
+        out_a = cand_a[sel]
+        out_b = cand_b[sel]
+    else:
+        pos_a = data["pos_a"]
+        pos_b = data["pos_b"]
+        pos_dom = data["pos_dom"]
+        dom_deg = data["pos_deg"].astype(np.int64)
+        pos_edge_pa = data["pos_edge_pa"].astype(np.int64)
+        target = int(data["n_positive"])
+        n_pos_domains = int(data["n_positive_domains"])
+        forbidden = set(zip(data["forbidden_a"].tolist(),
+                            data["forbidden_b"].tolist()))
+
+        log(f"positives: {pos_a.size} edges, {pos_dom.size} domains, "
+            f"{len(forbidden)} forbidden pairs, target = {target}")
+        out_a, out_b = select_random_addition(
+            pos_a, pos_b, forbidden, target, args.seed
+        )
+        domain_index = {d: i for i, d in enumerate(pos_dom)}
+        neg_ai = np.fromiter((domain_index[a] for a in out_a), dtype=np.int64,
+                             count=out_a.size)
+        neg_bi = np.fromiter((domain_index[b] for b in out_b), dtype=np.int64,
+                             count=out_b.size)
+
+    # --- selected-set statistics / objective (reporting only) ---
+    n_sel = int(out_a.size)
+    neg_pa = (dom_deg[neg_ai] * dom_deg[neg_bi]).astype(np.int64)
     mean_pa_neg = float(neg_pa.mean()) if n_sel else 0.0
 
-    neg_deg = np.zeros(pos_dom.size, dtype=np.int64)
-    np.add.at(neg_deg, sel_ai, 1)
-    np.add.at(neg_deg, sel_bi, 1)
+    neg_deg = np.zeros(dom_deg.size, dtype=np.int64)
+    np.add.at(neg_deg, neg_ai, 1)
+    np.add.at(neg_deg, neg_bi, 1)
     n_dom = int(np.count_nonzero(neg_deg))
 
-    # --- objective ---
     pos_logpa = np.log1p(pos_edge_pa.astype(float))
     neg_logpa = np.log1p(neg_pa.astype(float))
-    spread = float(pos_logpa.max() - pos_logpa.min())
+    spread = float(pos_logpa.max() - pos_logpa.min()) if pos_logpa.size else 0.0
     pa_term = wasserstein1(neg_logpa, pos_logpa) / spread if spread > 0 else 0.0
-    deg_term = ks_statistic(neg_deg, pos_deg)
-    cov_term = 1.0 - (n_dom / n_positive_domains) if n_positive_domains else 0.0
+    deg_term = ks_statistic(neg_deg, dom_deg)
+    cov_term = 1.0 - (n_dom / n_pos_domains) if n_pos_domains else 0.0
     j = args.w_pa * pa_term + args.w_deg * deg_term + args.w_cov * cov_term
+    pos_mean_pa = float(pos_edge_pa.mean()) if pos_edge_pa.size else 0.0
 
-    pos_mean_pa = float(pos_edge_pa.mean())
-
-    log(f"set=positive n_sel={n_positive} n_dom={n_positive_domains} "
+    log(f"method={args.method} set=positive n_sel={target} n_dom={n_pos_domains} "
         f"mean_pa={pos_mean_pa:.1f}")
-    log(f"set=negative seed={args.seed} J={j:.4f} pa={pa_term:.4f} "
-        f"deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} n_dom={n_dom} "
-        f"mean_pa={mean_pa_neg:.1f}")
+    log(f"method={args.method} set=negative seed={args.seed} J={j:.4f} "
+        f"pa={pa_term:.4f} deg={deg_term:.4f} cov={cov_term:.4f} n_sel={n_sel} "
+        f"n_dom={n_dom} mean_pa={mean_pa_neg:.1f}")
 
     score = {
+        "method": args.method,
         "seed": int(args.seed),
         "J": j,
         "pa": pa_term,
@@ -206,8 +232,8 @@ def main():
         "n_sel": n_sel,
         "n_dom": n_dom,
         "mean_pa": mean_pa_neg,
-        "pos_n_sel": n_positive,
-        "pos_n_dom": n_positive_domains,
+        "pos_n_sel": target,
+        "pos_n_dom": n_pos_domains,
         "pos_mean_pa": pos_mean_pa,
         "w_pa": args.w_pa,
         "w_deg": args.w_deg,
@@ -217,8 +243,8 @@ def main():
         json.dump(score, fh)
 
     with open(args.pairs_out, "w") as fh:
-        for ci in sel:
-            fh.write(f"{cand_a[ci]}\t{cand_b[ci]}\n")
+        for a, b in zip(out_a.tolist(), out_b.tolist()):
+            fh.write(f"{a}\t{b}\n")
 
 
 if __name__ == "__main__":
diff --git a/conf/modules.config b/conf/modules.config
index aa3bdda..7325cec 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -35,7 +35,7 @@ process {
 
     // INSERT_PPI_NEGATIVE_SELECTION's domainsplit.sqlite3 is an intermediate
     // (published only via the workflow-level output block), but its
-    // negative_ppi_seed_scores.tsv diagnostic IS published.
+    // negative_ppi_method_scores.tsv diagnostic IS published.
     withName: 'INSERT_PPI_NEGATIVE_SELECTION' {
         publishDir = [
             path: { "${params.outdir}/negative_ppi" },
diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf
index 95cb1c1..5f70b1b 100644
--- a/modules/local/init_domainsplit_db/main.nf
+++ b/modules/local/init_domainsplit_db/main.nf
@@ -31,7 +31,7 @@ process INIT_DOMAINSPLIT_DB {
             source VARCHAR(255),
             FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE,
             FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE,
-            UNIQUE(domain_id_a, domain_id_b)
+            UNIQUE(domain_id_a, domain_id_b, source)
         );
 
         CREATE TABLE protein (
diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
index f31467b..652e64b 100644
--- a/modules/local/insert_ppi_negative_selection/main.nf
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -6,13 +6,16 @@ process INSERT_PPI_NEGATIVE_SELECTION {
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
-    path score_jsons
-    path pairs_tsvs
+    path neg_pool
+    path pairs_deletion
+    path pairs_random_addition
+    path score_deletion
+    path score_random_addition
 
     output:
-    path "domainsplit.sqlite3",          emit: domainsplit_db
-    path "negative_ppi_seed_scores.tsv", emit: scores
-    path "versions.yml",                 emit: versions
+    path "domainsplit.sqlite3",            emit: domainsplit_db
+    path "negative_ppi_method_scores.tsv", emit: scores
+    path "versions.yml",                   emit: versions
 
     script:
     """
@@ -20,11 +23,17 @@ process INSERT_PPI_NEGATIVE_SELECTION {
 
     insert_ppi_negative_selection.py \\
         --db domainsplit.sqlite3 \\
-        --scores-out negative_ppi_seed_scores.tsv
+        --pool "${neg_pool}" \\
+        --pairs-deletion "${pairs_deletion}" \\
+        --pairs-random-addition "${pairs_random_addition}" \\
+        --score-deletion "${score_deletion}" \\
+        --score-random-addition "${score_random_addition}" \\
+        --scores-out negative_ppi_method_scores.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python3 -c 'import sys; print(sys.version.split()[0])')
+        numpy: \$(python3 -c 'import numpy; print(numpy.__version__)')
         sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
     END_VERSIONS
     """
diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf
index 4163661..fd1a631 100644
--- a/modules/local/select_ppi_negative_dans/main.nf
+++ b/modules/local/select_ppi_negative_dans/main.nf
@@ -1,23 +1,27 @@
 process SELECT_PPI_NEGATIVE_DANS {
-    tag "select_ppi_negative_dans:seed=${seed}"
+    tag "select_ppi_negative_dans:${method}"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
     container "docker://konstantinpelz/domainsplit-general:1.0.0"
 
     input:
-    tuple val(seed), path(neg_pool)
+    val  method
+    val  seed
+    path neg_pool
 
     output:
-    tuple val(seed), path("score_${seed}.json"), path("pairs_${seed}.tsv"), emit: result
-    path "versions.yml", emit: versions
+    path "score_${method}.json", emit: score
+    path "pairs_${method}.tsv",  emit: pairs
+    path "versions.yml",         emit: versions
 
     script:
     """
     select_ppi_negative_dans.py \\
         --pool "${neg_pool}" \\
+        --method ${method} \\
         --seed ${seed} \\
-        --score-out score_${seed}.json \\
-        --pairs-out pairs_${seed}.tsv
+        --score-out score_${method}.json \\
+        --pairs-out pairs_${method}.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index aa5af2a..2f39696 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -67,8 +67,8 @@ params {
     // count. Required input (no default; must be supplied per run).
     negative_ppi_parquet         = null
     negative_ppi_min_n_tested    = 5
-    // Base seed for the multi-seed negative-DDI selection: 5 parallel SLURM
-    // jobs run with seeds base+1..+5 and the best-scoring selection is kept.
+    // Random seed for the (single-run, uncapped DANS) negative-DDI selection;
+    // both methods ("deletion" and "random_addition") use this seed.
     negative_ppi_seed            = 42
 
     // Reviewed-human UniProt -> Pfam stream used to detect single-domain
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ef83a4b..bb61aef 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -167,7 +167,7 @@
                 },
                 "negative_ppi_seed": {
                     "type": "integer",
-                    "description": "Base random seed for the multi-seed negative-DDI selection. Five parallel jobs run the degree-capped PA-weighted sampler with seeds base+1..+5; the lowest-scoring (best degree/PA/coverage-matched) selection is inserted.",
+                    "description": "Random seed for the single-run, uncapped DANS negative-DDI selection (used by both the deletion and random_addition methods).",
                     "default": 42,
                     "fa_icon": "fas fa-dice"
                 },
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index 9c32b68..d893eab 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -27,9 +27,10 @@ include { INSERT_SINGLE_DOMAIN_PPI  } from '../../../modules/local/insert_single
 include { INSERT_PPIDM              } from '../../../modules/local/insert_ppidm/main.nf'
 include { INSERT_NEGATOME           } from '../../../modules/local/insert_negatome/main.nf'
 include { REMOVE_SELF_INTERACTIONS  } from '../../../modules/local/remove_self_interactions/main.nf'
-include { BUILD_PPI_NEGATIVE_POOL       } from '../../../modules/local/build_ppi_negative_pool/main.nf'
-include { SELECT_PPI_NEGATIVE_DANS      } from '../../../modules/local/select_ppi_negative_dans/main.nf'
-include { INSERT_PPI_NEGATIVE_SELECTION } from '../../../modules/local/insert_ppi_negative_selection/main.nf'
+include { BUILD_PPI_NEGATIVE_POOL                            } from '../../../modules/local/build_ppi_negative_pool/main.nf'
+include { SELECT_PPI_NEGATIVE_DANS as SELECT_DELETION        } from '../../../modules/local/select_ppi_negative_dans/main.nf'
+include { SELECT_PPI_NEGATIVE_DANS as SELECT_RANDOM_ADDITION } from '../../../modules/local/select_ppi_negative_dans/main.nf'
+include { INSERT_PPI_NEGATIVE_SELECTION                      } from '../../../modules/local/insert_ppi_negative_selection/main.nf'
 include { SMOKE_FILTER              } from '../../../modules/local/smoke_filter/main.nf'
 
 workflow COLLECT_DDI_DATA {
@@ -74,10 +75,17 @@ workflow COLLECT_DDI_DATA {
         domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db
     }
 
-    // 7. high-confidence non-PPI negatives (inferred only over 3did domains).
+    // 7. high-confidence non-PPI negatives via uncapped DANS (Cappelletti et al.
+    //    vbae036), in two flavours that coexist under distinct source labels:
+    //      * "deletion"        -- DANS over the PPI candidate pool, with the
+    //                             positives reduced to the candidate-domain
+    //                             universe (labels 3did_deletion /
+    //                             inferred_ppi_screen_negative_for_deletion).
+    //      * "random_addition" -- plain DANS over the full positive set (labels
+    //                             3did_random_addition /
+    //                             inferred_ppi_screen_negative_for_random_addition).
     //    The expensive, deterministic UniProt fetch + candidate-pool build runs
-    //    once; selection fans out over 5 seeds (base+1..+5) in parallel SLURM
-    //    jobs, and the best-scoring (degree/PA-matched) selection is inserted.
+    //    once; each method is a single deterministic selection (no pick-best).
     pool = BUILD_PPI_NEGATIVE_POOL(
         domainsplit_db,
         file(negative_ppi_parquet),
@@ -85,24 +93,18 @@ workflow COLLECT_DDI_DATA {
         params.self_interaction,
     )
 
-    // Pair the single shared pool file with each seed (combine avoids the
-    // queue-exhaustion that mixing a one-shot channel with a 5-item queue causes).
-    seeds = Channel.of(1, 2, 3, 4, 5).map { params.negative_ppi_seed + it }
-    sel   = SELECT_PPI_NEGATIVE_DANS(seeds.combine(pool.neg_pool))
+    del  = SELECT_DELETION('deletion', params.negative_ppi_seed, pool.neg_pool)
+    rand = SELECT_RANDOM_ADDITION('random_addition', params.negative_ppi_seed, pool.neg_pool)
 
-    sel.result
-        .multiMap { seed, score, pairs ->
-            scores: score
-            pairs:  pairs
-        }
-        .set { selection }
-
-    best = INSERT_PPI_NEGATIVE_SELECTION(
+    inserted = INSERT_PPI_NEGATIVE_SELECTION(
         pool.domainsplit_db,
-        selection.scores.collect(),
-        selection.pairs.collect(),
+        pool.neg_pool,
+        del.pairs,
+        rand.pairs,
+        del.score,
+        rand.score,
     )
-    domainsplit_db = best.domainsplit_db
+    domainsplit_db = inserted.domainsplit_db
     pfam_mapping   = pool.pfam_mapping
 
     if (params.smoke_test_n_ddis != null) {
diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf
index 4f6e3a7..1427c03 100644
--- a/subworkflows/local/split_domainsplit_database/main.nf
+++ b/subworkflows/local/split_domainsplit_database/main.nf
@@ -1,24 +1,35 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    SPLIT_DOMAINSPLIT_DATABASE -- produce three split strategies:
+    SPLIT_DOMAINSPLIT_DATABASE -- produce the split strategies, each run ONCE
+    PER NEGATIVE-DDI METHOD ("deletion" and "random_addition") so the two
+    methods' core sources stay isolated.  Strategies:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       * random_ddi             biased baseline (random partition)
       * minimal_leakage_domain leakage-aware spectral partition
       * external_validation    leakage-aware train/validation on the "core"
-                               sources (3did + high-conf non-PPI negatives),
-                               plus an as-is test set from the held-out sources
-                               (single-domain PPI, PPIDM, negatome).
-
-    Domain sequences are extracted and clustered (MMseqs2) once; both the
-    minimal-leakage and external-validation train/val partitions reuse the
-    clusters.
+                               sources (3did copy + the method's PPI-screen
+                               negatives), plus an as-is test set from the
+                               held-out sources (single-domain PPI, PPIDM,
+                               negatome).  The held-out test set is
+                               method-independent, so it is built once and routed
+                               into both external_validation_* folders.
+
+    Each strategy therefore yields two method folders, e.g.
+    random_ddi_deletion / random_ddi_random_addition -> 6 method folders total.
+
+    Domain sequences are extracted and clustered (MMseqs2) once; every
+    leakage-aware partition reuses the clusters.
 ----------------------------------------------------------------------------*/
 
-include { RANDOM_DDI_SPLIT                                                  } from '../../../modules/local/random_ddi_split/main'
-include { EXTRACT_DOMAIN_SEQUENCES; MINIMAL_LEAKAGE_SPLIT_DOMAIN            } from '../../../modules/local/minimal_leakage_split/main'
-include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MINIMAL_LEAKAGE_SPLIT_TRAINVAL    } from '../../../modules/local/minimal_leakage_split/main'
-include { SUBSET_DDIS_BY_SOURCE                                             } from '../../../modules/local/external_validation_split/main'
-include { MMSEQS_EASYCLUSTER                                                } from '../../../modules/nf-core/mmseqs/easycluster/main'
+include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_DEL          } from '../../../modules/local/random_ddi_split/main'
+include { RANDOM_DDI_SPLIT as RANDOM_DDI_SPLIT_RAND         } from '../../../modules/local/random_ddi_split/main'
+include { EXTRACT_DOMAIN_SEQUENCES                          } from '../../../modules/local/minimal_leakage_split/main'
+include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_DEL    } from '../../../modules/local/minimal_leakage_split/main'
+include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_DOMAIN_RAND   } from '../../../modules/local/minimal_leakage_split/main'
+include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_DEL  } from '../../../modules/local/minimal_leakage_split/main'
+include { MINIMAL_LEAKAGE_SPLIT_DOMAIN as MLS_TRAINVAL_RAND } from '../../../modules/local/minimal_leakage_split/main'
+include { SUBSET_DDIS_BY_SOURCE                             } from '../../../modules/local/external_validation_split/main'
+include { MMSEQS_EASYCLUSTER                                } from '../../../modules/nf-core/mmseqs/easycluster/main'
 
 
 def map_split_dbs(split_info_ch, split_dbs_ch, method) {
@@ -52,14 +63,25 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
         ["optimization", 0.2],
         ["test", 0.2]
     ]
+    def trainval_splits = [
+        ["train", 0.8],
+        ["validation", 0.2]
+    ]
 
-    // The two methods that mimic a within-distribution evaluation use only the
-    // "core" sources: 3did positives + high-confidence non-PPI negatives.
-    // 'inferred_ppi_screen_negative' must stay in sync with the --source-label
-    // default in bin/insert_ppi_negative_selection.py.
-    def core_sources = ['3did', 'inferred_ppi_screen_negative']
+    // Core sources per negative-DDI method: the method's 3did positive copy plus
+    // its PPI-screen negatives.  Must stay in sync with the source labels written
+    // by bin/insert_ppi_negative_selection.py.
+    def core_deletion = [
+        '3did_deletion',
+        'inferred_ppi_screen_negative_for_deletion',
+    ]
+    def core_random_addition = [
+        '3did_random_addition',
+        'inferred_ppi_screen_negative_for_random_addition',
+    ]
 
-    // External-validation test set: held-out sources placed as is.
+    // External-validation test set: held-out sources placed as is
+    // (method-independent).
     def test_sources = [
         'single_domain_ppi',
         'PPIDM_Bronze', 'PPIDM_Silver', 'PPIDM_Gold',
@@ -67,33 +89,19 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
     ]
 
     // Biased baseline: random DDI split (same proteins in train and test)
-    RANDOM_DDI_SPLIT(
-        domainsplit_db_ch,
-        Channel.of(splits),
-        core_sources
-    )
+    RANDOM_DDI_SPLIT_DEL(domainsplit_db_ch, splits, core_deletion)
+    RANDOM_DDI_SPLIT_RAND(domainsplit_db_ch, splits, core_random_addition)
 
     // Leakage-aware: spectral graph partitioning on domain clusters
-    MINIMAL_LEAKAGE_SPLIT_DOMAIN(
-        domainsplit_db_ch,
-        splits,
-        clusters_tsv,
-        core_sources
-    )
+    MLS_DOMAIN_DEL(domainsplit_db_ch, splits, clusters_tsv, core_deletion)
+    MLS_DOMAIN_RAND(domainsplit_db_ch, splits, clusters_tsv, core_random_addition)
 
     // External validation: leakage-free train/validation on core sources ...
-    def trainval_splits = [
-        ["train", 0.8],
-        ["validation", 0.2]
-    ]
-    MINIMAL_LEAKAGE_SPLIT_TRAINVAL(
-        domainsplit_db_ch,
-        trainval_splits,
-        clusters_tsv,
-        core_sources
-    )
+    MLS_TRAINVAL_DEL(domainsplit_db_ch, trainval_splits, clusters_tsv, core_deletion)
+    MLS_TRAINVAL_RAND(domainsplit_db_ch, trainval_splits, clusters_tsv, core_random_addition)
 
-    // ... plus an as-is test set from the held-out sources
+    // ... plus an as-is test set from the held-out sources (shared by both
+    //     external_validation_* methods).
     SUBSET_DDIS_BY_SOURCE(
         domainsplit_db_ch,
         test_sources,
@@ -101,10 +109,14 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
     )
 
     split_ch = Channel.empty().mix(
-        map_split_dbs(RANDOM_DDI_SPLIT.out.split_info, RANDOM_DDI_SPLIT.out.split_dbs, "random_ddi"),
-        map_split_dbs(MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_info, MINIMAL_LEAKAGE_SPLIT_DOMAIN.out.split_dbs, "minimal_leakage_domain"),
-        map_split_dbs(MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_info, MINIMAL_LEAKAGE_SPLIT_TRAINVAL.out.split_dbs, "external_validation"),
-        map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation")
+        map_split_dbs(RANDOM_DDI_SPLIT_DEL.out.split_info,  RANDOM_DDI_SPLIT_DEL.out.split_dbs,  "random_ddi_deletion"),
+        map_split_dbs(RANDOM_DDI_SPLIT_RAND.out.split_info, RANDOM_DDI_SPLIT_RAND.out.split_dbs, "random_ddi_random_addition"),
+        map_split_dbs(MLS_DOMAIN_DEL.out.split_info,  MLS_DOMAIN_DEL.out.split_dbs,  "minimal_leakage_domain_deletion"),
+        map_split_dbs(MLS_DOMAIN_RAND.out.split_info, MLS_DOMAIN_RAND.out.split_dbs, "minimal_leakage_domain_random_addition"),
+        map_split_dbs(MLS_TRAINVAL_DEL.out.split_info,  MLS_TRAINVAL_DEL.out.split_dbs,  "external_validation_deletion"),
+        map_split_dbs(MLS_TRAINVAL_RAND.out.split_info, MLS_TRAINVAL_RAND.out.split_dbs, "external_validation_random_addition"),
+        map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_deletion"),
+        map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_random_addition")
     )
 
     emit:
diff --git a/tests/python/test_insert_ppi_negative_selection.py b/tests/python/test_insert_ppi_negative_selection.py
new file mode 100644
index 0000000..d3c8add
--- /dev/null
+++ b/tests/python/test_insert_ppi_negative_selection.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Local integration check for the dual-source negative insertion (no cluster).
+
+Validates the schema change (UNIQUE(domain_id_a, domain_id_b, source)) and
+bin/insert_ppi_negative_selection.py together:
+
+  * the four method labels are inserted, with 3did_random_addition copying the
+    full 3did set and 3did_deletion only the pool-domain subset;
+  * a pair can coexist under '3did' and '3did_random_addition' (duplicate by
+    source);
+  * the canonical sources still dedup across each other (a PPIDM pair equal to a
+    3did pair is dropped) because insert_ddis defaults to dedup_across_sources.
+
+Run directly or via pytest.
+"""
+
+import json
+import os
+import sqlite3
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+
+REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+BIN = os.path.join(REPO, "bin")
+sys.path.insert(0, BIN)
+
+from ddi_db_utils import ensure_domains, insert_ddis  # noqa: E402
+
+INSERTER = os.path.join(BIN, "insert_ppi_negative_selection.py")
+
+SCHEMA = """
+CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id));
+CREATE TABLE domain_domain_interaction (
+    id INTEGER PRIMARY KEY,
+    domain_id_a, domain_id_b, negative,
+    source VARCHAR(255),
+    FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE,
+    FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE,
+    UNIQUE(domain_id_a, domain_id_b, source)
+);
+"""
+
+
+def pf(i):
+    return f"PF{i:05d}"
+
+
+def count(conn, source):
+    return conn.execute(
+        "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?",
+        (source,),
+    ).fetchone()[0]
+
+
+def write_score(path, method):
+    json.dump({
+        "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1,
+        "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0,
+        "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0,
+    }, open(path, "w"))
+
+
+def write_pairs(path, pairs):
+    with open(path, "w") as fh:
+        for a, b in pairs:
+            fh.write(f"{a}\t{b}\n")
+
+
+def test_dual_source_insert():
+    with tempfile.TemporaryDirectory() as tmp:
+        db = os.path.join(tmp, "domainsplit.sqlite3")
+        conn = sqlite3.connect(db)
+        conn.executescript(SCHEMA)
+        ensure_domains(conn, [pf(i) for i in range(1, 7)])
+
+        # 3did positives, then a PPIDM batch overlapping (1,2).
+        insert_ddis(conn, [(pf(1), pf(2)), (pf(1), pf(3)), (pf(1), pf(4))],
+                    negative=False, source="3did")
+        insert_ddis(conn, [(pf(1), pf(2)), (pf(5), pf(6))],
+                    negative=False, source="PPIDM_Gold")
+        conn.commit()
+        assert count(conn, "3did") == 3
+        assert count(conn, "PPIDM_Gold") == 1, "cross-source dedup broken"
+        conn.close()
+
+        # Pool covers only domains 1,2,3 -> 3did_deletion keeps (1,2),(1,3).
+        pool = os.path.join(tmp, "neg_pool.npz")
+        np.savez(pool, pool_dom=np.array([pf(1), pf(2), pf(3)], dtype=object))
+
+        write_pairs(os.path.join(tmp, "pairs_deletion.tsv"),
+                    [(pf(2), pf(5)), (pf(3), pf(6))])
+        write_pairs(os.path.join(tmp, "pairs_random_addition.tsv"),
+                    [(pf(1), pf(6)), (pf(4), pf(5))])
+        write_score(os.path.join(tmp, "score_deletion.json"), "deletion")
+        write_score(os.path.join(tmp, "score_random_addition.json"), "random_addition")
+
+        env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", ""))
+        subprocess.run(
+            [sys.executable, INSERTER, "--db", db, "--pool", pool,
+             "--pairs-deletion", os.path.join(tmp, "pairs_deletion.tsv"),
+             "--pairs-random-addition", os.path.join(tmp, "pairs_random_addition.tsv"),
+             "--score-deletion", os.path.join(tmp, "score_deletion.json"),
+             "--score-random-addition", os.path.join(tmp, "score_random_addition.json"),
+             "--scores-out", os.path.join(tmp, "scores.tsv")],
+            check=True, env=env,
+        )
+
+        conn = sqlite3.connect(db)
+        assert count(conn, "3did_random_addition") == 3, "full 3did copy wrong"
+        assert count(conn, "3did_deletion") == 2, "pool-restricted copy wrong"
+        assert count(conn, "inferred_ppi_screen_negative_for_deletion") == 2
+        assert count(conn, "inferred_ppi_screen_negative_for_random_addition") == 2
+        # original sources untouched
+        assert count(conn, "3did") == 3
+        assert count(conn, "PPIDM_Gold") == 1
+
+        # The same pair (1,2) coexists under '3did' and '3did_random_addition'.
+        n_dup = conn.execute(
+            "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi "
+            "JOIN domain da ON da.id = ddi.domain_id_a "
+            "JOIN domain db ON db.id = ddi.domain_id_b "
+            "WHERE da.pfam_id = ? AND db.pfam_id = ?",
+            (pf(1), pf(2)),
+        ).fetchone()[0]
+        assert n_dup >= 2, f"(1,2) should exist under >=2 sources, got {n_dup}"
+        conn.close()
+
+    print("OK: dual-source insert + schema invariants hold")
+
+
+if __name__ == "__main__":
+    test_dual_source_insert()
diff --git a/tests/python/test_select_ppi_negative_dans.py b/tests/python/test_select_ppi_negative_dans.py
new file mode 100644
index 0000000..41bfba9
--- /dev/null
+++ b/tests/python/test_select_ppi_negative_dans.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""Local unit-check for bin/select_ppi_negative_dans.py (no Nextflow, no cluster).
+
+Builds a tiny synthetic candidate pool and runs both DANS methods, asserting the
+core invariants:
+
+  * deletion        -- draws exactly n_positive_r edges, all from the candidate
+                       pool, all endpoints within the pool-domain universe.
+  * random_addition -- draws exactly n_positive edges, none a positive/forbidden
+                       pair, no self-pairs, no duplicates, and reaches domains
+                       that are absent from the candidate pool.
+
+Run directly (`python3 tests/python/test_select_ppi_negative_dans.py`) or via
+pytest.
+"""
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+
+REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # three directory levels above this file
+BIN = os.path.join(REPO, "bin")  # also prepended to PYTHONPATH for the child process in run_method
+SELECTOR = os.path.join(BIN, "select_ppi_negative_dans.py")  # script under test, run via subprocess
+
+
+def pf(i):
+    return "PF" + format(i, "05d")  # "PF" + number zero-padded to five digits, e.g. PF00001
+
+
+def build_pool(path):
+    """Save the synthetic pool arrays to `path` (np.savez); return expected values."""
+    def degrees(domains, edges):
+        tally = dict.fromkeys(domains, 0)
+        for u, v in edges:
+            tally[u] += 1
+            tally[v] += 1
+        return tally
+
+    def products(tally, edges):
+        return np.array([tally[u] * tally[v] for u, v in edges], dtype=np.int64)
+
+    def names(ids):
+        return np.array([pf(i) for i in ids], dtype=object)
+
+    # Positive edges, each listed with the smaller domain number first.
+    positives = [(1, 2), (1, 3), (1, 4), (2, 3), (5, 6),
+                 (7, 8), (9, 10), (1, 9), (2, 10)]
+    # Candidate pool: non-positive pairs over domains 1..6 only.
+    candidates = [(1, 5), (1, 6), (2, 5), (2, 6), (3, 5),
+                  (3, 6), (4, 5), (4, 6), (3, 4)]
+    pool_ids = sorted({d for pair in candidates for d in pair})
+    every_id = list(range(1, 11))
+
+    # Reduced positives: edges whose two endpoints both lie in the pool.
+    reduced = [(u, v) for u, v in positives if u in pool_ids and v in pool_ids]
+    reduced_deg = degrees(pool_ids, reduced)
+    full_deg = degrees(every_id, positives)
+
+    np.savez(
+        path,
+        cand_a=names(u for u, _ in candidates),
+        cand_b=names(v for _, v in candidates),
+        pool_dom=names(pool_ids),
+        pool_deg_r=np.array([reduced_deg[d] for d in pool_ids], dtype=np.int64),
+        pos_edge_pa_r=products(reduced_deg, reduced),
+        n_positive_r=np.int64(len(reduced)),
+        n_positive_domains_r=np.int64(sum(1 for d in pool_ids if reduced_deg[d])),
+        pos_a=names(u for u, _ in positives),
+        pos_b=names(v for _, v in positives),
+        pos_dom=names(every_id),
+        pos_deg=np.array([full_deg[d] for d in every_id], dtype=np.int64),
+        pos_edge_pa=products(full_deg, positives),
+        n_positive=np.int64(len(positives)),
+        n_positive_domains=np.int64(len(every_id)),
+        forbidden_a=names(u for u, _ in positives),
+        forbidden_b=names(v for _, v in positives),
+    )
+    pool_names = {pf(d) for d in pool_ids}
+    return {
+        "cand": {tuple(sorted((pf(u), pf(v)))) for u, v in candidates},
+        "pool_domains": pool_names,
+        "n_positive_r": len(reduced),
+        "n_positive": len(positives),
+        "forbidden": {(pf(u), pf(v)) for u, v in positives},
+        "pool_only": set(pool_names),
+        "extra_domains": {pf(d) for d in every_id if d not in pool_ids},  # all pool-absent ids
+    }
+
+
+def run_method(pool_path, method, workdir):
+    """Run the selector script for `method`; return pairs read from its --pairs-out file."""
+    score_file = os.path.join(workdir, f"score_{method}.json")
+    pairs_file = os.path.join(workdir, f"pairs_{method}.tsv")
+
+    # Child environment: a copy of ours with bin/ prepended to PYTHONPATH.
+    child_env = dict(os.environ)
+    child_env["PYTHONPATH"] = BIN + os.pathsep + os.environ.get("PYTHONPATH", "")
+    cmd = [sys.executable, SELECTOR, "--pool", pool_path, "--method", method,
+           "--seed", "7", "--score-out", score_file, "--pairs-out", pairs_file]
+    subprocess.run(cmd, check=True, env=child_env)  # raises on non-zero exit
+
+    # One tab-separated pair per line; blank lines are skipped.
+    with open(pairs_file) as fh:
+        rows = [raw.rstrip("\n") for raw in fh]
+    fields = (row.split("\t") for row in rows if row)
+    return [(a, b) for a, b in fields]
+
+
+def test_dans_methods():
+    """Run both selector methods on the synthetic pool and assert their invariants."""
+    with tempfile.TemporaryDirectory() as workdir:
+        npz = os.path.join(workdir, "neg_pool.npz")
+        expected = build_pool(npz)
+
+        # deletion: exact count, every edge a pool candidate, no repeats
+        deleted = run_method(npz, "deletion", workdir)
+        n_del, want_del = len(deleted), expected["n_positive_r"]
+        assert n_del == want_del, f"deletion count {n_del} != {want_del}"
+        for a, b in deleted:
+            assert tuple(sorted((a, b))) in expected["cand"], f"{a},{b} not in pool"
+            assert {a, b} <= expected["pool_domains"]
+        distinct = {tuple(sorted(p)) for p in deleted}
+        assert len(distinct) == len(deleted), "dup in deletion"
+
+        # random_addition: exact count, no self, forbidden or repeated pairs
+        added = run_method(npz, "random_addition", workdir)
+        n_add, want_add = len(added), expected["n_positive"]
+        assert n_add == want_add, f"random_addition count {n_add} != {want_add}"
+        seen, touched = set(), set()
+        for a, b in added:
+            assert a != b, f"self pair {a}"
+            key = (b, a) if b < a else (a, b)
+            assert key not in expected["forbidden"], f"{key} is a positive/forbidden pair"
+            assert key not in seen, f"duplicate {key}"
+            seen.add(key)
+            touched |= {a, b}
+        # At least one drawn pair must involve a domain listed in extra_domains.
+        assert touched & expected["extra_domains"], \
+            "random_addition never reached the pool-absent domains"
+
+    print("OK: both DANS methods satisfy invariants")
+
+
+if __name__ == "__main__":  # direct `python3 tests/python/...` invocation (pytest imports instead)
+    test_dans_methods()  # run the single check; it prints an OK line on success

From 6be7b1a4bc72c06c940c36174368db620243d835 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Thu, 11 Jun 2026 14:48:50 +0200
Subject: [PATCH 09/16] updated schema

---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index bb61aef..a4d72f8 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -46,7 +46,7 @@
                 "url_3did": {
                     "type": "string",
                     "description": "URL of the 3did SQL dump archive.",
-                    "default": "https://3did.irbbarcelona.org/download/2022_01/3did.sql.gz",
+                    "default": "https://3did.irbbarcelona.org/download/current/3did.sql.gz",
                     "fa_icon": "fas fa-link"
                 },
                 "url_uniprot_id_mapping": {

From 77df26dab13636cf3810fcc774a509c06c729be4 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Thu, 11 Jun 2026 14:59:48 +0200
Subject: [PATCH 10/16] don't add nf-test to GitHub

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 7e07892..c85aa62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,6 +208,9 @@ other_scripts/*
 test-eval-env/*
 *.txt
 
+# Tests
+.nf-test/
+
 # Added by code-review-graph
 .code-review-graph/
 # Claude

From 64f915bd0610517b4fdee08b780bcfe13871628c Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Thu, 11 Jun 2026 15:14:06 +0200
Subject: [PATCH 11/16] marked required files as required, fixed json in test

---
 nextflow_schema.json                               |  9 +++------
 subworkflows/local/collect_ddi_data/main.nf        |  4 ++++
 tests/python/test_insert_ppi_negative_selection.py | 11 ++++++-----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index a4d72f8..bfb39b3 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -152,10 +152,9 @@
                     "fa_icon": "fas fa-hashtag"
                 },
                 "negative_ppi_parquet": {
-                    "type": ["string", "null"],
+                    "type": ["string"],
                     "format": "file-path",
                     "description": "Required path to a Y2H/MS PPI parquet (columns: gene_name_bait, gene_name_prey, n_tested, ...). COLLECT_DDI_DATA derives negative DDIs by mapping bait/prey genes to UniProt then Pfam (via UniProt REST API) and inserting Pfam-pair candidates (degree-matched or by frequency) restricted to domains already in positive DDIs.",
-                    "default": null,
                     "fa_icon": "fas fa-file-import"
                 },
                 "negative_ppi_min_n_tested": {
@@ -185,10 +184,9 @@
                     "fa_icon": "fas fa-link"
                 },
                 "hippie_tsv": {
-                    "type": ["string", "null"],
+                    "type": ["string"],
                     "format": "file-path",
                     "description": "Required path to a HIPPIE PPI TSV. COLLECT_DDI_DATA adds positive DDIs inferred from PPIs between two single-domain proteins.",
-                    "default": null,
                     "fa_icon": "fas fa-file-import"
                 },
                 "hippie_min_score": {
@@ -200,10 +198,9 @@
                     "fa_icon": "fas fa-filter"
                 },
                 "ppidm_tsv": {
-                    "type": ["string", "null"],
+                    "type": ["string"],
                     "format": "file-path",
                     "description": "Required path to a PPIDM predictions TSV (columns: domain_1, domain_2, class). COLLECT_DDI_DATA adds positive DDIs tagged with source 'PPIDM_<Class>'.",
-                    "default": null,
                     "fa_icon": "fas fa-file-import"
                 },
                 "ppidm_classes": {
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index d893eab..386d71b 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -44,6 +44,10 @@ workflow COLLECT_DDI_DATA {
     negative_ppi_parquet
 
     main:
+    if( !hippie_tsv || !ppidm_tsv || !negative_ppi_parquet ) {
+        log.error "Required inputs missing: hippie_tsv, ppidm_tsv, and negative_ppi_parquet must be provided"
+        exit 1
+    }
     file_3did     = file(url_3did)
     sqlite_3did   = DOWNLOAD_3DID_SQLITE(file_3did).sqlite
     negatome_file = DOWNLOAD_NEGATOME(url_negatome).negatome
diff --git a/tests/python/test_insert_ppi_negative_selection.py b/tests/python/test_insert_ppi_negative_selection.py
index d3c8add..d39e10c 100644
--- a/tests/python/test_insert_ppi_negative_selection.py
+++ b/tests/python/test_insert_ppi_negative_selection.py
@@ -56,11 +56,12 @@ def count(conn, source):
 
 
 def write_score(path, method):
-    json.dump({
-        "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1,
-        "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0,
-        "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0,
-    }, open(path, "w"))
+    with open(path, "w") as fh:
+        json.dump({
+            "method": method, "seed": 7, "J": 0.1, "pa": 0.1, "deg": 0.1,
+            "cov": 0.0, "n_sel": 2, "n_dom": 3, "mean_pa": 1.0,
+            "pos_n_sel": 3, "pos_n_dom": 4, "pos_mean_pa": 2.0,
+        }, fh)
 
 
 def write_pairs(path, pairs):

From bc681c57bff37d28118c49d463c504ca8a798767 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Fri, 12 Jun 2026 11:18:19 +0200
Subject: [PATCH 12/16] updated tests

---
 .github/actions/nf-test/action.yml            |   8 +
 .gitignore                                    |   4 +
 conf/test.config                              |  26 ++-
 modules/local/3did/main.nf                    |   7 +
 modules/local/analyze_ddi_bias/main.nf        |   7 +
 modules/local/build_ppi_negative_pool/main.nf |   7 +
 .../extract_unique_domains/main.nf            |   7 +
 .../enrich/insert_domain_go_terms/main.nf     |   7 +
 .../insert_domain_protein_mapping/main.nf     |   7 +
 modules/local/enrich/insert_ppi/main.nf       |   7 +
 .../enrich/insert_protein_go_terms/main.nf    |   7 +
 .../insert_proteins_with_embeddings/main.nf   |   7 +
 modules/local/esm_embeddings/main.nf          |  34 +++
 .../local/external_validation_split/main.nf   |   8 +
 modules/local/init_domainsplit_db/main.nf     |   7 +
 modules/local/insert_3did/main.nf             |   7 +
 modules/local/insert_negatome/main.nf         |   7 +
 .../insert_ppi_negative_selection/main.nf     |   7 +
 modules/local/insert_ppidm/main.nf            |   7 +
 .../local/insert_single_domain_ppi/main.nf    |   7 +
 modules/local/minimal_leakage_split/main.nf   |  17 ++
 modules/local/negatome/main.nf                |   7 +
 modules/local/pfam/main.nf                    |  15 ++
 modules/local/random_ddi_split/main.nf        |  10 +
 .../local/remove_self_interactions/main.nf    |   7 +
 .../local/select_ppi_negative_dans/main.nf    |   7 +
 modules/local/smoke_filter/main.nf            |   7 +
 modules/local/swissprot_map/main.nf           |   7 +
 modules/local/util/main.nf                    |  14 ++
 subworkflows/local/collect_ddi_data/main.nf   |  19 ++
 subworkflows/local/collect_ddi_data/meta.yml  |   4 +-
 subworkflows/local/curate_domains/main.nf     |   7 +
 .../local/enrich_ddi_database/main.nf         |   9 +
 .../local/generate_embeddings/main.nf         |  26 ---
 .../local/generate_embeddings/meta.yml        |  29 ---
 .../local/split_domainsplit_database/main.nf  |  14 ++
 tests/bin/mmseqs                              |  11 +
 tests/data/3did.sql.gz                        |   0
 tests/data/hippie.tsv                         |   0
 tests/data/negative_ppi.parquet               |   0
 tests/data/negatome.txt                       |   0
 tests/data/pfam2go.txt                        |   0
 tests/data/ppidm.tsv                          |   0
 tests/data/prott5.h5                          |   0
 tests/data/string.txt.gz                      |   0
 tests/data/swissprot_pfam.tsv                 |   0
 tests/data/uniprot_go_terms.tsv               |   0
 tests/data/uniprot_id_mapping.dat.gz          |   0
 tests/data/uniprot_sequences.fasta.gz         |   0
 tests/default.nf.test                         |  30 +--
 tests/default.nf.test.snap                    | 194 ++++++++++++++++++
 tests/nextflow.config                         |   9 +
 tests/python/test_insert_negatome.py          |  82 ++++++++
 tests/python/test_insert_ppidm.py             | 141 +++++++++++++
 workflows/domainsplit.nf                      |  33 ++-
 55 files changed, 805 insertions(+), 79 deletions(-)
 delete mode 100644 subworkflows/local/generate_embeddings/main.nf
 delete mode 100644 subworkflows/local/generate_embeddings/meta.yml
 create mode 100755 tests/bin/mmseqs
 create mode 100644 tests/data/3did.sql.gz
 create mode 100644 tests/data/hippie.tsv
 create mode 100644 tests/data/negative_ppi.parquet
 create mode 100644 tests/data/negatome.txt
 create mode 100644 tests/data/pfam2go.txt
 create mode 100644 tests/data/ppidm.tsv
 create mode 100644 tests/data/prott5.h5
 create mode 100644 tests/data/string.txt.gz
 create mode 100644 tests/data/swissprot_pfam.tsv
 create mode 100644 tests/data/uniprot_go_terms.tsv
 create mode 100644 tests/data/uniprot_id_mapping.dat.gz
 create mode 100644 tests/data/uniprot_sequences.fasta.gz
 create mode 100644 tests/default.nf.test.snap
 create mode 100644 tests/python/test_insert_negatome.py
 create mode 100644 tests/python/test_insert_ppidm.py

diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml
index ad686e8..ea59134 100644
--- a/.github/actions/nf-test/action.yml
+++ b/.github/actions/nf-test/action.yml
@@ -56,6 +56,14 @@ runs:
         channel-priority: strict
         conda-remove-defaults: true
 
+    - name: Set dummy Nextflow secrets for stub tests
+      shell: bash
+      run: |
+        # The pipeline stub test (tests/default.nf.test) exercises the ESM
+        # embedding processes, which declare `secret 'HF_TOKEN'`. Nextflow
+        # requires the secret to exist even under -stub, so register a dummy.
+        nextflow secrets set HF_TOKEN "stub" || true
+
     - name: Run nf-test
       shell: bash
       env:
diff --git a/.gitignore b/.gitignore
index c85aa62..d0ab513 100644
--- a/.gitignore
+++ b/.gitignore
@@ -216,3 +216,7 @@ test-eval-env/*
 # Claude
 .claude/
 .mcp.json
+
+# nf-test fixtures (override the broad ignores above)
+!tests/data/
+!tests/data/**
diff --git a/conf/test.config b/conf/test.config
index 536b85e..7a6b2a8 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -20,10 +20,26 @@ process {
 
 params {
     config_profile_name        = 'Test profile'
-    config_profile_description = 'Minimal test dataset to check pipeline function'
+    config_profile_description = 'Minimal stub test dataset, fully offline (run with -stub)'
 
-    // Input data
-    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
-    // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input  = params.pipelines_testdata_base_path
+    // Every input below is a tiny local placeholder under tests/data/. The
+    // pipeline test runs in stub mode (`-stub`), so processes never read these
+    // files; they only need to exist so Nextflow stages local paths instead of
+    // downloading the real multi-GB sources or hitting REST APIs.
+
+    // Required file params (no defaults in nextflow.config)
+    hippie_tsv           = "${projectDir}/tests/data/hippie.tsv"
+    ppidm_tsv            = "${projectDir}/tests/data/ppidm.tsv"
+    negative_ppi_parquet = "${projectDir}/tests/data/negative_ppi.parquet"
+
+    // Source URLs -> local fixtures
+    url_3did                      = "${projectDir}/tests/data/3did.sql.gz"
+    url_negatome                  = "${projectDir}/tests/data/negatome.txt"
+    url_uniprot_swissprot_pfam    = "${projectDir}/tests/data/swissprot_pfam.tsv"
+    url_uniprot_id_mapping        = "${projectDir}/tests/data/uniprot_id_mapping.dat.gz"
+    url_uniprot_go_terms          = "${projectDir}/tests/data/uniprot_go_terms.tsv"
+    url_uniprot_sequences         = "${projectDir}/tests/data/uniprot_sequences.fasta.gz"
+    url_uniprot_prott5_embeddings = "${projectDir}/tests/data/prott5.h5"
+    url_string                    = "${projectDir}/tests/data/string.txt.gz"
+    url_pfam2go                   = "${projectDir}/tests/data/pfam2go.txt"
 }
diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf
index b9e3fc5..e483d7d 100644
--- a/modules/local/3did/main.nf
+++ b/modules/local/3did/main.nf
@@ -27,4 +27,11 @@ process DOWNLOAD_3DID_SQLITE {
         sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
     END_VERSIONS
     """
+
+    stub:
+    """
+    touch 3did.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf
index 2ab9179..e0c6cc5 100644
--- a/modules/local/analyze_ddi_bias/main.nf
+++ b/modules/local/analyze_ddi_bias/main.nf
@@ -24,4 +24,11 @@ process ANALYZE_DDI_BIAS {
         matplotlib: \$(python3 -c 'import matplotlib; print(matplotlib.__version__)')
     END_VERSIONS
     """
+
+    stub:
+    """
+    mkdir bias_analysis
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf
index 3ab76ec..ec15f4c 100644
--- a/modules/local/build_ppi_negative_pool/main.nf
+++ b/modules/local/build_ppi_negative_pool/main.nf
@@ -37,4 +37,11 @@ process BUILD_PPI_NEGATIVE_POOL {
         sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
     END_VERSIONS
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3 neg_pool.npz uniprot_pfam_mapping.json
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf
index 86edded..e59d20d 100644
--- a/modules/local/curate_domains/extract_unique_domains/main.nf
+++ b/modules/local/curate_domains/extract_unique_domains/main.nf
@@ -32,4 +32,11 @@ process EXTRACT_UNIQUE_DOMAINS {
         sqlite3: \$(sqlite3 --version | awk '{print \$1}')
     END_VERSIONS
     """
+
+    stub:
+    """
+    echo PF00001 > pfam_ids.txt
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf
index 2eb9437..3e6409a 100644
--- a/modules/local/enrich/insert_domain_go_terms/main.nf
+++ b/modules/local/enrich/insert_domain_go_terms/main.nf
@@ -20,4 +20,11 @@ process INSERT_DOMAIN_GO_TERMS {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf
index 7f7d00c..9fb4b44 100644
--- a/modules/local/enrich/insert_domain_protein_mapping/main.nf
+++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf
@@ -22,4 +22,11 @@ process INSERT_DOMAIN_PROTEIN_MAPPING {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf
index 9a27846..afd5f32 100644
--- a/modules/local/enrich/insert_ppi/main.nf
+++ b/modules/local/enrich/insert_ppi/main.nf
@@ -22,4 +22,11 @@ process INSERT_PPI {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf
index 3148d9c..64110a0 100644
--- a/modules/local/enrich/insert_protein_go_terms/main.nf
+++ b/modules/local/enrich/insert_protein_go_terms/main.nf
@@ -20,4 +20,11 @@ process INSERT_PROTEIN_GO_TERMS {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
index 79201a8..096318c 100644
--- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf
+++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
@@ -27,4 +27,11 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf
index 7f65c06..a96bd5d 100644
--- a/modules/local/esm_embeddings/main.nf
+++ b/modules/local/esm_embeddings/main.nf
@@ -66,6 +66,15 @@ process FILTER_SEQUENCES {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    biopython: {Bio.__version__}\\n")
     """
+
+    stub:
+    protein_meta = [id: "protein_sequences"]
+    domain_meta = [id: "domain_sequences"]
+    """
+    touch uniprot_filtered.fasta.gz domain_sequences.fasta.gz
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 // Per-residue protein embeddings. One task per FASTA shard.
@@ -108,6 +117,13 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK {
         --max-len ${params.esm_max_len} \\
         --smoke-limit ${smoke}
     """
+
+    stub:
+    """
+    touch ${input_fasta.simpleName}.esm.h5
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 // GPU-pooled domain embeddings. One task per FASTA shard.
@@ -150,6 +166,13 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK {
         --max-len ${params.esm_max_len} \\
         --smoke-limit ${smoke}
     """
+
+    stub:
+    """
+    touch ${input_fasta.simpleName}.esm.h5
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 workflow generate_esm_embeddings {
@@ -169,7 +192,18 @@ workflow generate_esm_embeddings {
     protein_embeddings = JOIN_PROTEIN_EMBEDDINGS('esm_protein_embeddings', protein_chunks.chunk.collect()).joined
     domain_embeddings  = JOIN_DOMAIN_EMBEDDINGS('esm_domain_embeddings',  domain_chunks.chunk.collect() ).joined
 
+    ch_versions = Channel.empty().mix(
+        FILTER_SEQUENCES.out.versions,
+        SHARD_PROTEIN_FASTA.out.versions,
+        SHARD_DOMAIN_FASTA.out.versions,
+        GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK.out.versions,
+        GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK.out.versions,
+        JOIN_PROTEIN_EMBEDDINGS.out.versions,
+        JOIN_DOMAIN_EMBEDDINGS.out.versions,
+    )
+
     emit:
     protein_embeddings
     domain_embeddings
+    versions = ch_versions
 }
diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf
index e7fc98d..5760486 100644
--- a/modules/local/external_validation_split/main.nf
+++ b/modules/local/external_validation_split/main.nf
@@ -101,4 +101,12 @@ process SUBSET_DDIS_BY_SOURCE {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
     """
+
+    stub:
+    output_split_info = [["${split_name}.sqlite3", split_name]]
+    """
+    touch ${split_name}.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf
index 5f70b1b..41dab03 100644
--- a/modules/local/init_domainsplit_db/main.nf
+++ b/modules/local/init_domainsplit_db/main.nf
@@ -83,4 +83,11 @@ process INIT_DOMAINSPLIT_DB {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf
index c2485a2..021939a 100644
--- a/modules/local/insert_3did/main.nf
+++ b/modules/local/insert_3did/main.nf
@@ -22,4 +22,11 @@ process INSERT_3DID {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf
index 454982d..429b495 100644
--- a/modules/local/insert_negatome/main.nf
+++ b/modules/local/insert_negatome/main.nf
@@ -22,4 +22,11 @@ process INSERT_NEGATOME {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
index 652e64b..b0380e8 100644
--- a/modules/local/insert_ppi_negative_selection/main.nf
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -37,4 +37,11 @@ process INSERT_PPI_NEGATIVE_SELECTION {
         sqlite3: \$(python3 -c 'import sqlite3; print(sqlite3.sqlite_version)')
     END_VERSIONS
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3 negative_ppi_method_scores.tsv
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf
index 749b705..553e79d 100644
--- a/modules/local/insert_ppidm/main.nf
+++ b/modules/local/insert_ppidm/main.nf
@@ -24,4 +24,11 @@ process INSERT_PPIDM {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf
index 3a14070..57b30df 100644
--- a/modules/local/insert_single_domain_ppi/main.nf
+++ b/modules/local/insert_single_domain_ppi/main.nf
@@ -26,4 +26,11 @@ process INSERT_SINGLE_DOMAIN_PPI {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf
index af546cb..a548b12 100644
--- a/modules/local/minimal_leakage_split/main.nf
+++ b/modules/local/minimal_leakage_split/main.nf
@@ -29,6 +29,13 @@ process EXTRACT_DOMAIN_SEQUENCES {
         sqlite3: \$(sqlite3 --version | awk '{print \$1}')
     END_VERSIONS
     """
+
+    stub:
+    """
+    touch domain_sequences.fasta.gz
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
@@ -397,4 +404,14 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
         f.write(f"    python: {_sys.version.split()[0]}\\n")
         f.write(f"    numpy: {np.__version__}\\n")
     """
+
+    stub:
+    output_split_info = []
+    split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] }
+    def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n    ")
+    """
+    ${touch_cmds}
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf
index fae1152..79b8485 100644
--- a/modules/local/negatome/main.nf
+++ b/modules/local/negatome/main.nf
@@ -53,4 +53,11 @@ process DOWNLOAD_NEGATOME {
         f.write('"${task.process}":\\n')
         f.write(f"    python: {sys.version.split()[0]}\\n")
     """
+
+    stub:
+    """
+    touch combined_pfam.txt
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf
index b5507f4..f853c2e 100644
--- a/modules/local/pfam/main.nf
+++ b/modules/local/pfam/main.nf
@@ -75,6 +75,13 @@ with open("versions.yml", "w") as f:
     f.write('"${task.process}":\\n')
     f.write(f"    python: {sys.version.split()[0]}\\n")
     """
+
+    stub:
+    """
+    touch PF00001.alignment.full.gz
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 process CREATE_PROTEIN_DOMAIN_MAPPING {
@@ -169,4 +176,12 @@ with open("versions.yml", "w") as f:
     f.write('"${task.process}":\\n')
     f.write(f"    python: {sys.version.split()[0]}\\n")
     """
+
+    stub:
+    out_path = 'protein_domain_mapping.csv.gz'
+    """
+    touch ${out_path}
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf
index 1d20ff4..79fee98 100644
--- a/modules/local/random_ddi_split/main.nf
+++ b/modules/local/random_ddi_split/main.nf
@@ -126,4 +126,14 @@ process RANDOM_DDI_SPLIT {
         f.write('"${task.process}":\\n')
         f.write(f"    python: {_sys.version.split()[0]}\\n")
     """
+
+    stub:
+    output_split_info = []
+    split_fractions.each { name, fraction -> output_split_info << ["${name}.sqlite3", name] }
+    def touch_cmds = output_split_info.collect { "touch ${it[0]}" }.join("\n    ")
+    """
+    ${touch_cmds}
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf
index c87e5a8..668c375 100644
--- a/modules/local/remove_self_interactions/main.nf
+++ b/modules/local/remove_self_interactions/main.nf
@@ -37,4 +37,11 @@ process REMOVE_SELF_INTERACTIONS {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
     """
+
+    stub:
+    """
+    touch domainsplit.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf
index fd1a631..cbd009a 100644
--- a/modules/local/select_ppi_negative_dans/main.nf
+++ b/modules/local/select_ppi_negative_dans/main.nf
@@ -29,4 +29,11 @@ process SELECT_PPI_NEGATIVE_DANS {
         numpy: \$(python3 -c 'import numpy; print(numpy.__version__)')
     END_VERSIONS
     """
+
+    stub:
+    """
+    touch score_${method}.json pairs_${method}.tsv
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/smoke_filter/main.nf b/modules/local/smoke_filter/main.nf
index 3f60a3b..e884cb9 100644
--- a/modules/local/smoke_filter/main.nf
+++ b/modules/local/smoke_filter/main.nf
@@ -88,4 +88,11 @@ process SMOKE_FILTER {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    sqlite3: {sqlite3.sqlite_version}\\n")
     """
+
+    stub:
+    """
+    touch domainsplit.smoke.sqlite3
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf
index e28b2e2..3ad8330 100644
--- a/modules/local/swissprot_map/main.nf
+++ b/modules/local/swissprot_map/main.nf
@@ -19,4 +19,11 @@ process BUILD_SWISSPROT_PFAM_MAP {
         --versions versions.yml \\
         --process-name "${task.process}"
     """
+
+    stub:
+    """
+    touch swissprot_pfam_map.json
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf
index 2d4d80c..4be930d 100644
--- a/modules/local/util/main.nf
+++ b/modules/local/util/main.nf
@@ -39,6 +39,13 @@ process SHARD_FASTA {
     print(f"    biopython: {Bio.__version__}")
     PY
     """
+
+    stub:
+    """
+    touch ${meta.id}_shard_0.fasta.gz
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
 
 // Merge a collection of HDF5 chunks (plain or gzipped) into one HDF5 file.
@@ -89,4 +96,11 @@ process JOIN_HDF_FILES {
         f.write(f"    python: {sys.version.split()[0]}\\n")
         f.write(f"    h5py: {h5py.__version__}\\n")
     """
+
+    stub:
+    """
+    touch ${output_name}.h5
+    echo '"${task.process}":' > versions.yml
+    echo '    stub: "true"' >> versions.yml
+    """
 }
diff --git a/subworkflows/local/collect_ddi_data/main.nf b/subworkflows/local/collect_ddi_data/main.nf
index 386d71b..fe21fea 100644
--- a/subworkflows/local/collect_ddi_data/main.nf
+++ b/subworkflows/local/collect_ddi_data/main.nf
@@ -44,6 +44,8 @@ workflow COLLECT_DDI_DATA {
     negative_ppi_parquet
 
     main:
+    ch_versions = Channel.empty()
+
     if( !hippie_tsv || !ppidm_tsv || !negative_ppi_parquet ) {
         log.error "Required inputs missing: hippie_tsv, ppidm_tsv, and negative_ppi_parquet must be provided"
         exit 1
@@ -77,6 +79,7 @@ workflow COLLECT_DDI_DATA {
     // 6. optional removal of all self-interactions
     if (!params.self_interaction) {
         domainsplit_db = REMOVE_SELF_INTERACTIONS(domainsplit_db).domainsplit_db
+        ch_versions = ch_versions.mix(REMOVE_SELF_INTERACTIONS.out.versions)
     }
 
     // 7. high-confidence non-PPI negatives via uncapped DANS (Cappelletti et al.
@@ -113,9 +116,25 @@ workflow COLLECT_DDI_DATA {
 
     if (params.smoke_test_n_ddis != null) {
         domainsplit_db = SMOKE_FILTER(domainsplit_db, params.smoke_test_n_ddis).domainsplit_db
+        ch_versions = ch_versions.mix(SMOKE_FILTER.out.versions)
     }
 
+    ch_versions = ch_versions.mix(
+        DOWNLOAD_3DID_SQLITE.out.versions,
+        DOWNLOAD_NEGATOME.out.versions,
+        INSERT_3DID.out.versions,
+        BUILD_SWISSPROT_PFAM_MAP.out.versions,
+        INSERT_SINGLE_DOMAIN_PPI.out.versions,
+        INSERT_PPIDM.out.versions,
+        INSERT_NEGATOME.out.versions,
+        BUILD_PPI_NEGATIVE_POOL.out.versions,
+        SELECT_DELETION.out.versions,
+        SELECT_RANDOM_ADDITION.out.versions,
+        INSERT_PPI_NEGATIVE_SELECTION.out.versions,
+    )
+
     emit:
     domainsplit_db
     pfam_mapping
+    versions = ch_versions
 }
diff --git a/subworkflows/local/collect_ddi_data/meta.yml b/subworkflows/local/collect_ddi_data/meta.yml
index 750ebd0..a158369 100644
--- a/subworkflows/local/collect_ddi_data/meta.yml
+++ b/subworkflows/local/collect_ddi_data/meta.yml
@@ -16,7 +16,9 @@ components:
   - insert/ppidm
   - insert/negatome
   - remove/self/interactions
-  - insert/ppi/negative/ddis
+  - build/ppi/negative/pool
+  - select/ppi/negative/dans
+  - insert/ppi/negative/selection
   - smoke/filter
 input:
   - domainsplit_db_in:
diff --git a/subworkflows/local/curate_domains/main.nf b/subworkflows/local/curate_domains/main.nf
index a97b28e..91ee944 100644
--- a/subworkflows/local/curate_domains/main.nf
+++ b/subworkflows/local/curate_domains/main.nf
@@ -28,6 +28,13 @@ workflow CURATE_DOMAINS {
         pfam_files.collect()
     ).mapping
 
+    ch_versions = Channel.empty().mix(
+        EXTRACT_UNIQUE_DOMAINS.out.versions,
+        DOWNLOAD_PFAM_ALIGNMENTS_BATCH.out.versions,
+        CREATE_PROTEIN_DOMAIN_MAPPING.out.versions,
+    )
+
     emit:
     protein_domain_map
+    versions = ch_versions
 }
diff --git a/subworkflows/local/enrich_ddi_database/main.nf b/subworkflows/local/enrich_ddi_database/main.nf
index 56df49b..bddb71e 100644
--- a/subworkflows/local/enrich_ddi_database/main.nf
+++ b/subworkflows/local/enrich_ddi_database/main.nf
@@ -60,6 +60,15 @@ workflow ENRICH_DDI_DATABASE {
         esm_domain_embeddings
     ).domainsplit_db
 
+    ch_versions = Channel.empty().mix(
+        INSERT_DOMAIN_GO_TERMS.out.versions,
+        INSERT_PROTEINS_WITH_EMBEDDINGS.out.versions,
+        INSERT_PROTEIN_GO_TERMS.out.versions,
+        INSERT_PPI.out.versions,
+        INSERT_DOMAIN_PROTEIN_MAPPING.out.versions,
+    )
+
     emit:
     domainsplit_db
+    versions = ch_versions
 }
diff --git a/subworkflows/local/generate_embeddings/main.nf b/subworkflows/local/generate_embeddings/main.nf
deleted file mode 100644
index c043aec..0000000
--- a/subworkflows/local/generate_embeddings/main.nf
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    GENERATE_EMBEDDINGS -- run protein-level ESM (protein + domain)
-    embedding generation.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ESM path produces both per-residue protein embeddings and pooled domain
-    embeddings against the supplied protein <-> domain map.
-
-    ProtT5 embeddings are supplied externally via params.url_uniprot_prott5_embeddings
-    and resolved in the top-level workflow (domainsplit.nf).
-----------------------------------------------------------------------------*/
-
-include { generate_esm_embeddings    } from '../../../modules/local/esm_embeddings/main.nf'
-
-workflow GENERATE_EMBEDDINGS {
-    take:
-    protein_domain_map
-    input_uniprot_sequences
-
-    main:
-    generate_esm_embeddings(input_uniprot_sequences, protein_domain_map)
-
-    emit:
-    esm_protein_embeddings  = generate_esm_embeddings.out.protein_embeddings
-    esm_domain_embeddings   = generate_esm_embeddings.out.domain_embeddings
-}
diff --git a/subworkflows/local/generate_embeddings/meta.yml b/subworkflows/local/generate_embeddings/meta.yml
deleted file mode 100644
index dfd0981..0000000
--- a/subworkflows/local/generate_embeddings/meta.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
-name: "generate_embeddings"
-description: Generate ESM per-residue protein embeddings and pooled per-domain embeddings against the supplied protein-to-domain map.
-keywords:
-  - esm
-  - embeddings
-  - protein
-  - domain
-components:
-  - generate/esm/embeddings
-input:
-  - protein_domain_map:
-      type: file
-      description: Protein-to-Pfam-domain mapping used to pool per-domain embeddings.
-  - input_uniprot_sequences:
-      type: file
-      description: UniProt protein sequences to embed.
-      pattern: "*.{fasta,fa,gz}"
-output:
-  - esm_protein_embeddings:
-      type: file
-      description: ESM per-residue protein embeddings.
-  - esm_domain_embeddings:
-      type: file
-      description: ESM pooled per-domain embeddings.
-authors:
-  - "@KonstantinPelz"
-maintainers:
-  - "@KonstantinPelz"
diff --git a/subworkflows/local/split_domainsplit_database/main.nf b/subworkflows/local/split_domainsplit_database/main.nf
index 1427c03..4536cca 100644
--- a/subworkflows/local/split_domainsplit_database/main.nf
+++ b/subworkflows/local/split_domainsplit_database/main.nf
@@ -119,6 +119,20 @@ workflow SPLIT_DOMAINSPLIT_DATABASE {
         map_split_dbs(SUBSET_DDIS_BY_SOURCE.out.split_info, SUBSET_DDIS_BY_SOURCE.out.split_dbs, "external_validation_random_addition")
     )
 
+    // NB: MMSEQS_EASYCLUSTER (nf-core) reports its version via the `versions`
+    // channel topic, not an `emit: versions` output, so it is not mixed here.
+    ch_versions = Channel.empty().mix(
+        EXTRACT_DOMAIN_SEQUENCES.out.versions,
+        RANDOM_DDI_SPLIT_DEL.out.versions,
+        RANDOM_DDI_SPLIT_RAND.out.versions,
+        MLS_DOMAIN_DEL.out.versions,
+        MLS_DOMAIN_RAND.out.versions,
+        MLS_TRAINVAL_DEL.out.versions,
+        MLS_TRAINVAL_RAND.out.versions,
+        SUBSET_DDIS_BY_SOURCE.out.versions,
+    )
+
     emit:
     split_db = split_ch
+    versions = ch_versions
 }
diff --git a/tests/bin/mmseqs b/tests/bin/mmseqs
new file mode 100755
index 0000000..a99de53
--- /dev/null
+++ b/tests/bin/mmseqs
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Test-only stub of the `mmseqs` binary.
+#
+# The nf-test pipeline test (tests/default.nf.test) runs with `-stub` and the
+# `test` profile, which enables no container engine — every process runs on the
+# host. The nf-core MMSEQS_EASYCLUSTER module captures its version via an
+# `eval('mmseqs version')` output directive, which Nextflow executes even in
+# stub mode. Without mmseqs installed that fails with exit 127, so this shim
+# provides a deterministic fake version. It is only ever on PATH for test runs
+# (added via env.PATH in tests/nextflow.config); real runs use the container.
+echo "stub"
diff --git a/tests/data/3did.sql.gz b/tests/data/3did.sql.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/hippie.tsv b/tests/data/hippie.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/negative_ppi.parquet b/tests/data/negative_ppi.parquet
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/negatome.txt b/tests/data/negatome.txt
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/pfam2go.txt b/tests/data/pfam2go.txt
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/ppidm.tsv b/tests/data/ppidm.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/prott5.h5 b/tests/data/prott5.h5
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/string.txt.gz b/tests/data/string.txt.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/swissprot_pfam.tsv b/tests/data/swissprot_pfam.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/uniprot_go_terms.tsv b/tests/data/uniprot_go_terms.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/uniprot_id_mapping.dat.gz b/tests/data/uniprot_id_mapping.dat.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/uniprot_sequences.fasta.gz b/tests/data/uniprot_sequences.fasta.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/default.nf.test b/tests/default.nf.test
index 86ea2e3..dde4cdc 100644
--- a/tests/default.nf.test
+++ b/tests/default.nf.test
@@ -6,6 +6,8 @@ nextflow_pipeline {
 
     test("-profile test") {
 
+        options "-stub"
+
         when {
             params {
                 outdir = "$outputDir"
@@ -13,20 +15,22 @@ nextflow_pipeline {
         }
 
         then {
-            // stable_path: All files + folders in ${params.outdir}/ with a stable path (including file name)
-            def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}'])
-            // stable_content: All files in ${params.outdir}/ with stable content
-            def stable_content = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore')
-            assert workflow.success
+            // This is a `-stub` wiring test: every process runs its stub block and
+            // emits empty placeholder files, so file *content* is meaningless (and
+            // empty .gz/.h5 stubs break content hashing). We therefore snapshot only
+            // the set of produced output paths -- this verifies the whole DAG wires
+            // together (channel topology, the split fan-out, publish paths) end to
+            // end without any downloads, GPU, or containers. pipeline_info/ files
+            // whose names embed a run timestamp (reports, DAG, params) are ignored.
+            def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/execution_*', 'pipeline_info/pipeline_dag_*', 'pipeline_info/params_*'])
+            // Snapshot the collated software versions with the Nextflow version line
+            // stripped (so the assertion survives Nextflow upgrades). This also
+            // satisfies the nf-core `nf_test_content` lint rule, which requires a
+            // `versions.yml` to be snapshotted by every `*.nf.test`.
+            def versions_yml = removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")
             assertAll(
-                { assert snapshot(
-                    // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions
-                    removeNextflowVersion("$outputDir/pipeline_info/domainsplit_software_mqc_versions.yml"),
-                    // All stable path name, with a relative path
-                    stable_path,
-                    // All files with stable contents
-                    stable_content
-                ).match() }
+                { assert workflow.success },
+                { assert snapshot(stable_path, versions_yml).match() }
             )
         }
     }
diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap
new file mode 100644
index 0000000..483c015
--- /dev/null
+++ b/tests/default.nf.test.snap
@@ -0,0 +1,193 @@
+{
+    "-profile test": {
+        "content": [
+            [
+                "analyze",
+                "analyze/bias_analysis",
+                "create",
+                "create/protein_domain_mapping.csv.gz",
+                "databases",
+                "databases/external_validation_deletion",
+                "databases/external_validation_deletion/test.sqlite3",
+                "databases/external_validation_deletion/train.sqlite3",
+                "databases/external_validation_deletion/validation.sqlite3",
+                "databases/external_validation_random_addition",
+                "databases/external_validation_random_addition/test.sqlite3",
+                "databases/external_validation_random_addition/train.sqlite3",
+                "databases/external_validation_random_addition/validation.sqlite3",
+                "databases/minimal_leakage_domain_deletion",
+                "databases/minimal_leakage_domain_deletion/optimization.sqlite3",
+                "databases/minimal_leakage_domain_deletion/test.sqlite3",
+                "databases/minimal_leakage_domain_deletion/train.sqlite3",
+                "databases/minimal_leakage_domain_random_addition",
+                "databases/minimal_leakage_domain_random_addition/optimization.sqlite3",
+                "databases/minimal_leakage_domain_random_addition/test.sqlite3",
+                "databases/minimal_leakage_domain_random_addition/train.sqlite3",
+                "databases/random_ddi_deletion",
+                "databases/random_ddi_deletion/optimization.sqlite3",
+                "databases/random_ddi_deletion/test.sqlite3",
+                "databases/random_ddi_deletion/train.sqlite3",
+                "databases/random_ddi_random_addition",
+                "databases/random_ddi_random_addition/optimization.sqlite3",
+                "databases/random_ddi_random_addition/test.sqlite3",
+                "databases/random_ddi_random_addition/train.sqlite3",
+                "domainsplit.sqlite3",
+                "download",
+                "download/3did.sqlite3",
+                "download/PF00001.alignment.full.gz",
+                "download/combined_pfam.txt",
+                "extract",
+                "extract/domain_sequences.fasta.gz",
+                "filter",
+                "filter/domain_sequences.fasta.gz",
+                "filter/uniprot_filtered.fasta.gz",
+                "generate",
+                "generate/domain_sequences_shard_0.esm.h5",
+                "generate/protein_sequences_shard_0.esm.h5",
+                "join",
+                "join/esm_domain_embeddings.h5",
+                "join/esm_protein_embeddings.h5",
+                "mls",
+                "mls/optimization.sqlite3",
+                "mls/test.sqlite3",
+                "mls/train.sqlite3",
+                "mls/validation.sqlite3",
+                "mmseqs",
+                "mmseqs/domain.tsv",
+                "mmseqs/domain_all_seqs.fasta",
+                "mmseqs/domain_rep_seq.fasta",
+                "negative_ppi",
+                "negative_ppi/negative_ppi_method_scores.tsv",
+                "pipeline_info",
+                "pipeline_info/nf_core_pipeline_software_mqc_versions.yml",
+                "random",
+                "random/optimization.sqlite3",
+                "random/test.sqlite3",
+                "random/train.sqlite3",
+                "shard",
+                "shard/domain_sequences_shard_0.fasta.gz",
+                "shard/protein_sequences_shard_0.fasta.gz",
+                "subset",
+                "subset/test.sqlite3"
+            ],
+            {
+                "ANALYZE_DDI_BIAS": {
+                    "stub": "true"
+                },
+                "BUILD_PPI_NEGATIVE_POOL": {
+                    "stub": "true"
+                },
+                "BUILD_SWISSPROT_PFAM_MAP": {
+                    "stub": "true"
+                },
+                "CREATE_PROTEIN_DOMAIN_MAPPING": {
+                    "stub": "true"
+                },
+                "DOWNLOAD_3DID_SQLITE": {
+                    "stub": "true"
+                },
+                "DOWNLOAD_NEGATOME": {
+                    "stub": "true"
+                },
+                "DOWNLOAD_PFAM_ALIGNMENTS_BATCH": {
+                    "stub": "true"
+                },
+                "EXTRACT_DOMAIN_SEQUENCES": {
+                    "stub": "true"
+                },
+                "EXTRACT_UNIQUE_DOMAINS": {
+                    "stub": "true"
+                },
+                "FILTER_SEQUENCES": {
+                    "stub": "true"
+                },
+                "GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK": {
+                    "stub": "true"
+                },
+                "GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK": {
+                    "stub": "true"
+                },
+                "INIT_DOMAINSPLIT_DB": {
+                    "stub": "true"
+                },
+                "INSERT_3DID": {
+                    "stub": "true"
+                },
+                "INSERT_DOMAIN_GO_TERMS": {
+                    "stub": "true"
+                },
+                "INSERT_DOMAIN_PROTEIN_MAPPING": {
+                    "stub": "true"
+                },
+                "INSERT_NEGATOME": {
+                    "stub": "true"
+                },
+                "INSERT_PPI": {
+                    "stub": "true"
+                },
+                "INSERT_PPIDM": {
+                    "stub": "true"
+                },
+                "INSERT_PPI_NEGATIVE_SELECTION": {
+                    "stub": "true"
+                },
+                "INSERT_PROTEINS_WITH_EMBEDDINGS": {
+                    "stub": "true"
+                },
+                "INSERT_PROTEIN_GO_TERMS": {
+                    "stub": "true"
+                },
+                "INSERT_SINGLE_DOMAIN_PPI": {
+                    "stub": "true"
+                },
+                "JOIN_DOMAIN_EMBEDDINGS": {
+                    "stub": "true"
+                },
+                "JOIN_PROTEIN_EMBEDDINGS": {
+                    "stub": "true"
+                },
+                "MLS_DOMAIN_DEL": {
+                    "stub": "true"
+                },
+                "MLS_DOMAIN_RAND": {
+                    "stub": "true"
+                },
+                "MLS_TRAINVAL_DEL": {
+                    "stub": "true"
+                },
+                "MLS_TRAINVAL_RAND": {
+                    "stub": "true"
+                },
+                "RANDOM_DDI_SPLIT_DEL": {
+                    "stub": "true"
+                },
+                "RANDOM_DDI_SPLIT_RAND": {
+                    "stub": "true"
+                },
+                "SELECT_DELETION": {
+                    "stub": "true"
+                },
+                "SELECT_RANDOM_ADDITION": {
+                    "stub": "true"
+                },
+                "SHARD_DOMAIN_FASTA": {
+                    "stub": "true"
+                },
+                "SHARD_PROTEIN_FASTA": {
+                    "stub": "true"
+                },
+                "SUBSET_DDIS_BY_SOURCE": {
+                    "stub": "true"
+                },
+                "Workflow": {
+                    "daisybio/domainsplit": "v1.0.0dev"
+                }
+            }
+        ],
+        "timestamp": "2026-06-12T11:10:24.222393192",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "26.04.0"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/nextflow.config b/tests/nextflow.config
index 12b3258..ab87256 100644
--- a/tests/nextflow.config
+++ b/tests/nextflow.config
@@ -12,3 +12,12 @@ params {
 }
 
 aws.client.anonymous = true // fixes S3 access issues on self-hosted runners
+
+// The pipeline test (tests/default.nf.test) runs with `-stub` and no container
+// engine, so every process executes on the host. The nf-core MMSEQS_EASYCLUSTER
+// module captures its version with an `eval('mmseqs version')` output that runs
+// even under -stub; prepend a test-only shim dir so that resolves without the
+// real binary. Only applied to nf-test runs (this config is test-only).
+env {
+    PATH = "${projectDir}/tests/bin:\$PATH"
+}
diff --git a/tests/python/test_insert_negatome.py b/tests/python/test_insert_negatome.py
new file mode 100644
index 0000000..9691d7d
--- /dev/null
+++ b/tests/python/test_insert_negatome.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Local unit-check for bin/insert_negatome.py (no Nextflow, no cluster).
+
+Builds a tiny empty Domainsplit SQLite and runs the Negatome inserter against a
+small synthetic ``combined_pfam.txt``, asserting:
+
+  * each whitespace-separated Pfam pair is stored as ``negative=1,
+    source='negatome'`` with its domains auto-created;
+  * lines without at least two tokens (blank / single-token) are skipped.
+
+Run directly (`python3 tests/python/test_insert_negatome.py`) or via pytest.
+"""
+
+import os
+import sqlite3
+import subprocess
+import sys
+import tempfile
+
+REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+BIN = os.path.join(REPO, "bin")
+INSERTER = os.path.join(BIN, "insert_negatome.py")
+
+SCHEMA = """
+CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id));
+CREATE TABLE domain_domain_interaction (
+    id INTEGER PRIMARY KEY,
+    domain_id_a, domain_id_b, negative,
+    source VARCHAR(255),
+    FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE,
+    FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE,
+    UNIQUE(domain_id_a, domain_id_b, source)
+);
+"""
+
+NEGATOME_LINES = [
+    "PF00001 PF00002",   # kept
+    "PF00003\tPF00004",  # kept (tab separated)
+    "PF00005 PF00006",   # kept
+    "PF00007",           # single token -> skipped
+    "",                  # blank -> skipped
+]
+
+
+def test_insert_negatome():
+    with tempfile.TemporaryDirectory() as tmp:
+        db = os.path.join(tmp, "domainsplit.sqlite3")
+        conn = sqlite3.connect(db)
+        conn.executescript(SCHEMA)
+        conn.commit()
+        conn.close()
+
+        negatome = os.path.join(tmp, "combined_pfam.txt")
+        with open(negatome, "w") as fh:
+            fh.write("\n".join(NEGATOME_LINES) + "\n")
+
+        env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", ""))
+        subprocess.run(
+            [sys.executable, INSERTER, "--db", db, "--negatome", negatome,
+             "--versions", os.path.join(tmp, "versions.yml"),
+             "--process-name", "TEST:INSERT_NEGATOME"],
+            check=True, env=env,
+        )
+
+        conn = sqlite3.connect(db)
+        total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0]
+        assert total == 3, f"expected 3 negatome DDIs, got {total}"
+
+        rows = conn.execute(
+            "SELECT COUNT(*) FROM domain_domain_interaction "
+            "WHERE source = 'negatome' AND negative != 0"
+        ).fetchone()[0]
+        assert rows == 3, "all negatome rows must be negative with source 'negatome'"
+
+        n_domains = conn.execute("SELECT COUNT(*) FROM domain").fetchone()[0]
+        assert n_domains == 6, f"expected 6 auto-created domains, got {n_domains}"
+        conn.close()
+
+
+if __name__ == "__main__":
+    test_insert_negatome()
+    print("OK: insert_negatome invariants hold")
diff --git a/tests/python/test_insert_ppidm.py b/tests/python/test_insert_ppidm.py
new file mode 100644
index 0000000..a296053
--- /dev/null
+++ b/tests/python/test_insert_ppidm.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Local unit-check for bin/insert_ppidm.py (no Nextflow, no cluster).
+
+Builds a tiny empty Domainsplit SQLite and runs the PPIDM inserter against a
+small synthetic ``predicted_ddi_ppi.tsv``, asserting:
+
+  * domain tokens like ``10114/PF00069`` are parsed down to the Pfam accession;
+  * each kept row is stored as ``negative=0, source='PPIDM_<Class>'``;
+  * classes are processed Gold -> Silver -> Bronze, so a pair appearing under
+    two classes is kept only under the highest-confidence one (cross-source
+    dedup in insert_ddis);
+  * unparseable tokens are skipped, and ``--classes`` filters which classes are
+    inserted at all.
+
+Run directly (`python3 tests/python/test_insert_ppidm.py`) or via pytest.
+"""
+
+import os
+import sqlite3
+import subprocess
+import sys
+import tempfile
+
+REPO = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+BIN = os.path.join(REPO, "bin")
+INSERTER = os.path.join(BIN, "insert_ppidm.py")
+
+# Matches the schema the pipeline's INIT_DOMAINSPLIT_DB creates for these tables
+# (see tests/python/test_insert_ppi_negative_selection.py).
+SCHEMA = """
+CREATE TABLE domain (id INTEGER PRIMARY KEY, pfam_id, name, UNIQUE(pfam_id));
+CREATE TABLE domain_domain_interaction (
+    id INTEGER PRIMARY KEY,
+    domain_id_a, domain_id_b, negative,
+    source VARCHAR(255),
+    FOREIGN KEY(domain_id_a) REFERENCES domain ON DELETE CASCADE,
+    FOREIGN KEY(domain_id_b) REFERENCES domain ON DELETE CASCADE,
+    UNIQUE(domain_id_a, domain_id_b, source)
+);
+"""
+
+
+def count(conn, source):
+    return conn.execute(
+        "SELECT COUNT(*) FROM domain_domain_interaction WHERE source = ?",
+        (source,),
+    ).fetchone()[0]
+
+
+def run_inserter(db, ppidm, classes, tmp):
+    env = dict(os.environ, PYTHONPATH=BIN + os.pathsep + os.environ.get("PYTHONPATH", ""))
+    subprocess.run(
+        [sys.executable, INSERTER, "--db", db, "--ppidm", ppidm,
+         "--classes", classes,
+         "--versions", os.path.join(tmp, "versions.yml"),
+         "--process-name", "TEST:INSERT_PPIDM"],
+        check=True, env=env,
+    )
+
+
+# Tokens carry a leading numeric id before the slash, as in real PPIDM output.
+PPIDM_ROWS = [
+    "domain_1\tdomain_2\tclass",          # header (skipped)
+    "10/PF00001\t20/PF00002\tGold",       # kept -> PPIDM_Gold
+    "30/PF00003\t40/PF00004\tSilver",     # kept -> PPIDM_Silver
+    "50/PF00005\t60/PF00006\tBronze",     # kept -> PPIDM_Bronze
+    "10/PF00001\t20/PF00002\tSilver",     # duplicate pair, lower class -> dropped
+    "junk\tnonsense\tGold",               # unparseable -> skipped
+]
+
+
+def write_ppidm(path, rows):
+    with open(path, "w") as fh:
+        fh.write("\n".join(rows) + "\n")
+
+
+def test_insert_ppidm_all_classes():
+    with tempfile.TemporaryDirectory() as tmp:
+        db = os.path.join(tmp, "domainsplit.sqlite3")
+        conn = sqlite3.connect(db)
+        conn.executescript(SCHEMA)
+        conn.commit()
+        conn.close()
+
+        ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv")
+        write_ppidm(ppidm, PPIDM_ROWS)
+
+        run_inserter(db, ppidm, "Bronze,Silver,Gold", tmp)
+
+        conn = sqlite3.connect(db)
+        # One pair per class; the duplicate (PF00001, PF00002) is kept only under
+        # Gold (processed first) and dropped for Silver via cross-source dedup.
+        assert count(conn, "PPIDM_Gold") == 1, "Gold count wrong"
+        assert count(conn, "PPIDM_Silver") == 1, "Silver count wrong (dedup failed?)"
+        assert count(conn, "PPIDM_Bronze") == 1, "Bronze count wrong"
+
+        # All kept rows are positives stored under a PPIDM_* source only.
+        total = conn.execute("SELECT COUNT(*) FROM domain_domain_interaction").fetchone()[0]
+        assert total == 3, f"expected 3 DDIs total, got {total}"
+        neg = conn.execute(
+            "SELECT COUNT(*) FROM domain_domain_interaction WHERE negative != 0"
+        ).fetchone()[0]
+        assert neg == 0, "PPIDM rows must be positives"
+
+        # The duplicate pair exists only under Gold, not Silver.
+        n_sources = conn.execute(
+            "SELECT COUNT(DISTINCT source) FROM domain_domain_interaction ddi "
+            "JOIN domain da ON da.id = ddi.domain_id_a "
+            "JOIN domain db ON db.id = ddi.domain_id_b "
+            "WHERE da.pfam_id = ? AND db.pfam_id = ?",
+            ("PF00001", "PF00002"),
+        ).fetchone()[0]
+        assert n_sources == 1, f"(PF00001,PF00002) should be under 1 source, got {n_sources}"
+        conn.close()
+
+
+def test_insert_ppidm_class_filter():
+    """--classes restricts which classes are inserted at all."""
+    with tempfile.TemporaryDirectory() as tmp:
+        db = os.path.join(tmp, "domainsplit.sqlite3")
+        conn = sqlite3.connect(db)
+        conn.executescript(SCHEMA)
+        conn.commit()
+        conn.close()
+
+        ppidm = os.path.join(tmp, "predicted_ddi_ppi.tsv")
+        write_ppidm(ppidm, PPIDM_ROWS)
+
+        run_inserter(db, ppidm, "Gold", tmp)
+
+        conn = sqlite3.connect(db)
+        assert count(conn, "PPIDM_Gold") == 1
+        assert count(conn, "PPIDM_Silver") == 0, "Silver should be excluded"
+        assert count(conn, "PPIDM_Bronze") == 0, "Bronze should be excluded"
+        conn.close()
+
+
+if __name__ == "__main__":
+    test_insert_ppidm_all_classes()
+    test_insert_ppidm_class_filter()
+    print("OK: insert_ppidm class handling + dedup invariants hold")
diff --git a/workflows/domainsplit.nf b/workflows/domainsplit.nf
index 7f5fe85..4474982 100644
--- a/workflows/domainsplit.nf
+++ b/workflows/domainsplit.nf
@@ -9,7 +9,7 @@ include { methodsDescriptionText      } from '../subworkflows/local/utils_nfcore
 include { INIT_DOMAINSPLIT_DB         } from '../modules/local/init_domainsplit_db/main.nf'
 include { COLLECT_DDI_DATA            } from '../subworkflows/local/collect_ddi_data/main.nf'
 include { CURATE_DOMAINS              } from '../subworkflows/local/curate_domains/main.nf'
-include { GENERATE_EMBEDDINGS         } from '../subworkflows/local/generate_embeddings/main.nf'
+include { generate_esm_embeddings     } from '../modules/local/esm_embeddings/main.nf'
 include { ENRICH_DDI_DATABASE         } from '../subworkflows/local/enrich_ddi_database/main.nf'
 include { SPLIT_DOMAINSPLIT_DATABASE  } from '../subworkflows/local/split_domainsplit_database/main.nf'
 include { ANALYZE_DDI_BIAS            } from '../modules/local/analyze_ddi_bias/main.nf'
@@ -22,6 +22,8 @@ include { ANALYZE_DDI_BIAS            } from '../modules/local/analyze_ddi_bias/
 
 workflow DOMAINSPLIT {
 main:
+    ch_versions = Channel.empty()
+
     input_uniprot_id_mapping = file(params.url_uniprot_id_mapping)
     input_uniprot_go_terms   = file(params.url_uniprot_go_terms)
     input_uniprot_sequences  = file(params.url_uniprot_sequences)
@@ -51,9 +53,9 @@ main:
 
     protein_domain_map = CURATE_DOMAINS.out.protein_domain_map
 
-    GENERATE_EMBEDDINGS(
-        protein_domain_map,
+    generate_esm_embeddings(
         input_uniprot_sequences,
+        protein_domain_map,
     )
 
     ENRICH_DDI_DATABASE(
@@ -65,8 +67,8 @@ main:
         input_uniprot_go_terms,
         input_string,
         input_uniprot_id_mapping,
-        GENERATE_EMBEDDINGS.out.esm_protein_embeddings,
-        GENERATE_EMBEDDINGS.out.esm_domain_embeddings,
+        generate_esm_embeddings.out.protein_embeddings,
+        generate_esm_embeddings.out.domain_embeddings,
     )
 
     ANALYZE_DDI_BIAS(
@@ -77,6 +79,27 @@ main:
         ENRICH_DDI_DATABASE.out.domainsplit_db
     )
 
+    //
+    // Collate and save software versions
+    //
+    ch_versions = ch_versions.mix(
+        INIT_DOMAINSPLIT_DB.out.versions,
+        COLLECT_DDI_DATA.out.versions,
+        CURATE_DOMAINS.out.versions,
+        generate_esm_embeddings.out.versions,
+        ENRICH_DDI_DATABASE.out.versions,
+        ANALYZE_DDI_BIAS.out.versions,
+        SPLIT_DOMAINSPLIT_DATABASE.out.versions,
+    )
+
+    softwareVersionsToYAML(ch_versions)
+        .collectFile(
+            storeDir: "${params.outdir}/pipeline_info",
+            name: 'nf_core_' + 'pipeline_software_' + 'mqc_' + 'versions.yml',
+            sort: true,
+            newLine: true,
+        )
+
 emit:
     domainsplit_db  = ENRICH_DDI_DATABASE.out.domainsplit_db
     split_db        = SPLIT_DOMAINSPLIT_DATABASE.out.split_db

From 3450738ecf05b783164fabb700495829e22512d0 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Fri, 12 Jun 2026 12:24:44 +0200
Subject: [PATCH 13/16] removed docker:// prefix for containers

---
 modules/local/3did/main.nf                                  | 2 +-
 modules/local/analyze_ddi_bias/main.nf                      | 2 +-
 modules/local/build_ppi_negative_pool/main.nf               | 2 +-
 modules/local/curate_domains/extract_unique_domains/main.nf | 2 +-
 modules/local/enrich/insert_domain_go_terms/main.nf         | 2 +-
 modules/local/enrich/insert_domain_protein_mapping/main.nf  | 2 +-
 modules/local/enrich/insert_ppi/main.nf                     | 2 +-
 modules/local/enrich/insert_protein_go_terms/main.nf        | 2 +-
 .../local/enrich/insert_proteins_with_embeddings/main.nf    | 2 +-
 modules/local/esm_embeddings/main.nf                        | 6 +++---
 modules/local/external_validation_split/main.nf             | 2 +-
 modules/local/init_domainsplit_db/main.nf                   | 2 +-
 modules/local/insert_3did/main.nf                           | 2 +-
 modules/local/insert_negatome/main.nf                       | 2 +-
 modules/local/insert_ppi_negative_selection/main.nf         | 2 +-
 modules/local/insert_ppidm/main.nf                          | 2 +-
 modules/local/insert_single_domain_ppi/main.nf              | 2 +-
 modules/local/minimal_leakage_split/main.nf                 | 4 ++--
 modules/local/negatome/main.nf                              | 2 +-
 modules/local/pfam/main.nf                                  | 4 ++--
 modules/local/random_ddi_split/main.nf                      | 2 +-
 modules/local/remove_self_interactions/main.nf              | 2 +-
 modules/local/select_ppi_negative_dans/main.nf              | 2 +-
 modules/local/smoke_filter/main.nf                          | 2 +-
 modules/local/swissprot_map/main.nf                         | 2 +-
 modules/local/util/main.nf                                  | 4 ++--
 26 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf
index e483d7d..3f1cf0f 100644
--- a/modules/local/3did/main.nf
+++ b/modules/local/3did/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_3DID_SQLITE {
     tag "3did"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path mysql_gz_file
diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf
index e0c6cc5..6f5f77e 100644
--- a/modules/local/analyze_ddi_bias/main.nf
+++ b/modules/local/analyze_ddi_bias/main.nf
@@ -2,7 +2,7 @@ process ANALYZE_DDI_BIAS {
     tag "bias_analysis"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf
index ec15f4c..3ddf835 100644
--- a/modules/local/build_ppi_negative_pool/main.nf
+++ b/modules/local/build_ppi_negative_pool/main.nf
@@ -2,7 +2,7 @@ process BUILD_PPI_NEGATIVE_POOL {
     tag "build_ppi_negative_pool"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf
index e59d20d..7a607e9 100644
--- a/modules/local/curate_domains/extract_unique_domains/main.nf
+++ b/modules/local/curate_domains/extract_unique_domains/main.nf
@@ -2,7 +2,7 @@ process EXTRACT_UNIQUE_DOMAINS {
     tag { "${domainsplit_db.simpleName}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db
diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf
index 3e6409a..3f8f013 100644
--- a/modules/local/enrich/insert_domain_go_terms/main.nf
+++ b/modules/local/enrich/insert_domain_go_terms/main.nf
@@ -2,7 +2,7 @@ process INSERT_DOMAIN_GO_TERMS {
     tag "insert_domain_go_terms"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf
index 9fb4b44..950840a 100644
--- a/modules/local/enrich/insert_domain_protein_mapping/main.nf
+++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf
@@ -2,7 +2,7 @@ process INSERT_DOMAIN_PROTEIN_MAPPING {
     tag "insert_domain_protein_mapping"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf
index afd5f32..d5c67c8 100644
--- a/modules/local/enrich/insert_ppi/main.nf
+++ b/modules/local/enrich/insert_ppi/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPI {
     tag "insert_ppi"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf
index 64110a0..ec34f33 100644
--- a/modules/local/enrich/insert_protein_go_terms/main.nf
+++ b/modules/local/enrich/insert_protein_go_terms/main.nf
@@ -2,7 +2,7 @@ process INSERT_PROTEIN_GO_TERMS {
     tag "insert_protein_go_terms"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
index 096318c..ee2b6d2 100644
--- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf
+++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
@@ -2,7 +2,7 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS {
     tag "insert_proteins_with_embeddings"
     label 'process_high'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf
index a96bd5d..00bac7d 100644
--- a/modules/local/esm_embeddings/main.nf
+++ b/modules/local/esm_embeddings/main.nf
@@ -22,7 +22,7 @@ process FILTER_SEQUENCES {
     tag { "${protein_domain_map.simpleName}" }
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path protein_domain_map
@@ -83,7 +83,7 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK {
     label 'process_gpu_large'
     secret 'HF_TOKEN'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-gpu:1.0.0"
+    container "konstantinpelz/domainsplit-gpu:1.0.0"
     containerOptions {
         workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer'
             ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE'
@@ -132,7 +132,7 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK {
     label 'process_gpu_large'
     secret 'HF_TOKEN'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-gpu:1.0.0"
+    container "konstantinpelz/domainsplit-gpu:1.0.0"
     containerOptions {
         workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer'
             ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE'
diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf
index 5760486..92f1570 100644
--- a/modules/local/external_validation_split/main.nf
+++ b/modules/local/external_validation_split/main.nf
@@ -11,7 +11,7 @@ process SUBSET_DDIS_BY_SOURCE {
     tag "subset_${split_name}"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path 'domainsplit.sqlite3'
diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf
index 41dab03..548921c 100644
--- a/modules/local/init_domainsplit_db/main.nf
+++ b/modules/local/init_domainsplit_db/main.nf
@@ -2,7 +2,7 @@ process INIT_DOMAINSPLIT_DB {
     tag "init_domainsplit_db"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     output:
     path "domainsplit.sqlite3", emit: domainsplit_db
diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf
index 021939a..73e26bb 100644
--- a/modules/local/insert_3did/main.nf
+++ b/modules/local/insert_3did/main.nf
@@ -2,7 +2,7 @@ process INSERT_3DID {
     tag "insert_3did"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf
index 429b495..928f4d1 100644
--- a/modules/local/insert_negatome/main.nf
+++ b/modules/local/insert_negatome/main.nf
@@ -2,7 +2,7 @@ process INSERT_NEGATOME {
     tag "insert_negatome"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
index b0380e8..d6c199f 100644
--- a/modules/local/insert_ppi_negative_selection/main.nf
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPI_NEGATIVE_SELECTION {
     tag "insert_ppi_negative_selection"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf
index 553e79d..804bd95 100644
--- a/modules/local/insert_ppidm/main.nf
+++ b/modules/local/insert_ppidm/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPIDM {
     tag "insert_ppidm"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf
index 57b30df..a2c4ea4 100644
--- a/modules/local/insert_single_domain_ppi/main.nf
+++ b/modules/local/insert_single_domain_ppi/main.nf
@@ -2,7 +2,7 @@ process INSERT_SINGLE_DOMAIN_PPI {
     tag "insert_single_domain_ppi"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf
index a548b12..be5dc15 100644
--- a/modules/local/minimal_leakage_split/main.nf
+++ b/modules/local/minimal_leakage_split/main.nf
@@ -2,7 +2,7 @@ process EXTRACT_DOMAIN_SEQUENCES {
     tag "domains"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
@@ -42,7 +42,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
     tag "minimal_leakage_domain"
     label 'process_high'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf
index 79b8485..c1e04fc 100644
--- a/modules/local/negatome/main.nf
+++ b/modules/local/negatome/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_NEGATOME {
     tag "negatome"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val url
diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf
index f853c2e..2025a10 100644
--- a/modules/local/pfam/main.nf
+++ b/modules/local/pfam/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_PFAM_ALIGNMENTS_BATCH {
     tag { "batch_${pfam_ids_list.size()}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     maxRetries 3
     errorStrategy { task.attempt <= 3 ? 'retry' : 'ignore' }
@@ -88,7 +88,7 @@ process CREATE_PROTEIN_DOMAIN_MAPPING {
     tag { "${uniprot_map_file.simpleName}" }
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path uniprot_map_file
diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf
index 79fee98..cbe6e3e 100644
--- a/modules/local/random_ddi_split/main.nf
+++ b/modules/local/random_ddi_split/main.nf
@@ -2,7 +2,7 @@ process RANDOM_DDI_SPLIT {
     tag "random_ddi"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path 'domainsplit.sqlite3'
diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf
index 668c375..7b77ce4 100644
--- a/modules/local/remove_self_interactions/main.nf
+++ b/modules/local/remove_self_interactions/main.nf
@@ -2,7 +2,7 @@ process REMOVE_SELF_INTERACTIONS {
     tag "remove_self_interactions"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf
index cbd009a..6163df8 100644
--- a/modules/local/select_ppi_negative_dans/main.nf
+++ b/modules/local/select_ppi_negative_dans/main.nf
@@ -2,7 +2,7 @@ process SELECT_PPI_NEGATIVE_DANS {
     tag "select_ppi_negative_dans:${method}"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val  method
diff --git a/modules/local/smoke_filter/main.nf b/modules/local/smoke_filter/main.nf
index e884cb9..718258d 100644
--- a/modules/local/smoke_filter/main.nf
+++ b/modules/local/smoke_filter/main.nf
@@ -2,7 +2,7 @@ process SMOKE_FILTER {
     tag "smoke_filter"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db
diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf
index 3ad8330..674cd01 100644
--- a/modules/local/swissprot_map/main.nf
+++ b/modules/local/swissprot_map/main.nf
@@ -2,7 +2,7 @@ process BUILD_SWISSPROT_PFAM_MAP {
     tag "swissprot_map"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val url
diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf
index 4be930d..c8328d1 100644
--- a/modules/local/util/main.nf
+++ b/modules/local/util/main.nf
@@ -15,7 +15,7 @@ process SHARD_FASTA {
     tag { "${input_fasta.simpleName}:${num_shards}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     tuple val(meta), path(input_fasta)
@@ -58,7 +58,7 @@ process JOIN_HDF_FILES {
     tag { output_name }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "docker://konstantinpelz/domainsplit-general:1.0.0"
+    container "konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val output_name

From a40248a3db27d951f094051f7565a608eeb6a115 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Fri, 12 Jun 2026 12:31:29 +0200
Subject: [PATCH 14/16] added explicit docker.io/ registry prefix to container images

---
 modules/local/3did/main.nf                                  | 2 +-
 modules/local/analyze_ddi_bias/main.nf                      | 2 +-
 modules/local/build_ppi_negative_pool/main.nf               | 2 +-
 modules/local/curate_domains/extract_unique_domains/main.nf | 2 +-
 modules/local/enrich/insert_domain_go_terms/main.nf         | 2 +-
 modules/local/enrich/insert_domain_protein_mapping/main.nf  | 2 +-
 modules/local/enrich/insert_ppi/main.nf                     | 2 +-
 modules/local/enrich/insert_protein_go_terms/main.nf        | 2 +-
 .../local/enrich/insert_proteins_with_embeddings/main.nf    | 2 +-
 modules/local/esm_embeddings/main.nf                        | 6 +++---
 modules/local/external_validation_split/main.nf             | 2 +-
 modules/local/init_domainsplit_db/main.nf                   | 2 +-
 modules/local/insert_3did/main.nf                           | 2 +-
 modules/local/insert_negatome/main.nf                       | 2 +-
 modules/local/insert_ppi_negative_selection/main.nf         | 2 +-
 modules/local/insert_ppidm/main.nf                          | 2 +-
 modules/local/insert_single_domain_ppi/main.nf              | 2 +-
 modules/local/minimal_leakage_split/main.nf                 | 4 ++--
 modules/local/negatome/main.nf                              | 2 +-
 modules/local/pfam/main.nf                                  | 4 ++--
 modules/local/random_ddi_split/main.nf                      | 2 +-
 modules/local/remove_self_interactions/main.nf              | 2 +-
 modules/local/select_ppi_negative_dans/main.nf              | 2 +-
 modules/local/smoke_filter/main.nf                          | 2 +-
 modules/local/swissprot_map/main.nf                         | 2 +-
 modules/local/util/main.nf                                  | 4 ++--
 26 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/modules/local/3did/main.nf b/modules/local/3did/main.nf
index 3f1cf0f..99873b6 100644
--- a/modules/local/3did/main.nf
+++ b/modules/local/3did/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_3DID_SQLITE {
     tag "3did"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path mysql_gz_file
diff --git a/modules/local/analyze_ddi_bias/main.nf b/modules/local/analyze_ddi_bias/main.nf
index 6f5f77e..706d3b4 100644
--- a/modules/local/analyze_ddi_bias/main.nf
+++ b/modules/local/analyze_ddi_bias/main.nf
@@ -2,7 +2,7 @@ process ANALYZE_DDI_BIAS {
     tag "bias_analysis"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
diff --git a/modules/local/build_ppi_negative_pool/main.nf b/modules/local/build_ppi_negative_pool/main.nf
index 3ddf835..6d647ec 100644
--- a/modules/local/build_ppi_negative_pool/main.nf
+++ b/modules/local/build_ppi_negative_pool/main.nf
@@ -2,7 +2,7 @@ process BUILD_PPI_NEGATIVE_POOL {
     tag "build_ppi_negative_pool"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/curate_domains/extract_unique_domains/main.nf b/modules/local/curate_domains/extract_unique_domains/main.nf
index 7a607e9..856709c 100644
--- a/modules/local/curate_domains/extract_unique_domains/main.nf
+++ b/modules/local/curate_domains/extract_unique_domains/main.nf
@@ -2,7 +2,7 @@ process EXTRACT_UNIQUE_DOMAINS {
     tag { "${domainsplit_db.simpleName}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db
diff --git a/modules/local/enrich/insert_domain_go_terms/main.nf b/modules/local/enrich/insert_domain_go_terms/main.nf
index 3f8f013..a57c5f5 100644
--- a/modules/local/enrich/insert_domain_go_terms/main.nf
+++ b/modules/local/enrich/insert_domain_go_terms/main.nf
@@ -2,7 +2,7 @@ process INSERT_DOMAIN_GO_TERMS {
     tag "insert_domain_go_terms"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_domain_protein_mapping/main.nf b/modules/local/enrich/insert_domain_protein_mapping/main.nf
index 950840a..04d3d2e 100644
--- a/modules/local/enrich/insert_domain_protein_mapping/main.nf
+++ b/modules/local/enrich/insert_domain_protein_mapping/main.nf
@@ -2,7 +2,7 @@ process INSERT_DOMAIN_PROTEIN_MAPPING {
     tag "insert_domain_protein_mapping"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_ppi/main.nf b/modules/local/enrich/insert_ppi/main.nf
index d5c67c8..0ecc502 100644
--- a/modules/local/enrich/insert_ppi/main.nf
+++ b/modules/local/enrich/insert_ppi/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPI {
     tag "insert_ppi"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_protein_go_terms/main.nf b/modules/local/enrich/insert_protein_go_terms/main.nf
index ec34f33..a77973f 100644
--- a/modules/local/enrich/insert_protein_go_terms/main.nf
+++ b/modules/local/enrich/insert_protein_go_terms/main.nf
@@ -2,7 +2,7 @@ process INSERT_PROTEIN_GO_TERMS {
     tag "insert_protein_go_terms"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/enrich/insert_proteins_with_embeddings/main.nf b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
index ee2b6d2..6e296e7 100644
--- a/modules/local/enrich/insert_proteins_with_embeddings/main.nf
+++ b/modules/local/enrich/insert_proteins_with_embeddings/main.nf
@@ -2,7 +2,7 @@ process INSERT_PROTEINS_WITH_EMBEDDINGS {
     tag "insert_proteins_with_embeddings"
     label 'process_high'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/esm_embeddings/main.nf b/modules/local/esm_embeddings/main.nf
index 00bac7d..e8d9eeb 100644
--- a/modules/local/esm_embeddings/main.nf
+++ b/modules/local/esm_embeddings/main.nf
@@ -22,7 +22,7 @@ process FILTER_SEQUENCES {
     tag { "${protein_domain_map.simpleName}" }
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path protein_domain_map
@@ -83,7 +83,7 @@ process GENERATE_PROTEIN_ESM_EMBEDDINGS_CHUNK {
     label 'process_gpu_large'
     secret 'HF_TOKEN'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-gpu:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0"
     containerOptions {
         workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer'
             ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE'
@@ -132,7 +132,7 @@ process GENERATE_DOMAIN_ESM_EMBEDDINGS_CHUNK {
     label 'process_gpu_large'
     secret 'HF_TOKEN'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-gpu:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-gpu:1.0.0"
     containerOptions {
         workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer'
             ? '--env HF_TOKEN --env HF_HOME --env HUGGINGFACE_HUB_CACHE'
diff --git a/modules/local/external_validation_split/main.nf b/modules/local/external_validation_split/main.nf
index 92f1570..cbd11bd 100644
--- a/modules/local/external_validation_split/main.nf
+++ b/modules/local/external_validation_split/main.nf
@@ -11,7 +11,7 @@ process SUBSET_DDIS_BY_SOURCE {
     tag "subset_${split_name}"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path 'domainsplit.sqlite3'
diff --git a/modules/local/init_domainsplit_db/main.nf b/modules/local/init_domainsplit_db/main.nf
index 548921c..53bf98e 100644
--- a/modules/local/init_domainsplit_db/main.nf
+++ b/modules/local/init_domainsplit_db/main.nf
@@ -2,7 +2,7 @@ process INIT_DOMAINSPLIT_DB {
     tag "init_domainsplit_db"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     output:
     path "domainsplit.sqlite3", emit: domainsplit_db
diff --git a/modules/local/insert_3did/main.nf b/modules/local/insert_3did/main.nf
index 73e26bb..5434f10 100644
--- a/modules/local/insert_3did/main.nf
+++ b/modules/local/insert_3did/main.nf
@@ -2,7 +2,7 @@ process INSERT_3DID {
     tag "insert_3did"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_negatome/main.nf b/modules/local/insert_negatome/main.nf
index 928f4d1..5db9dc8 100644
--- a/modules/local/insert_negatome/main.nf
+++ b/modules/local/insert_negatome/main.nf
@@ -2,7 +2,7 @@ process INSERT_NEGATOME {
     tag "insert_negatome"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_ppi_negative_selection/main.nf b/modules/local/insert_ppi_negative_selection/main.nf
index d6c199f..9d5daf9 100644
--- a/modules/local/insert_ppi_negative_selection/main.nf
+++ b/modules/local/insert_ppi_negative_selection/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPI_NEGATIVE_SELECTION {
     tag "insert_ppi_negative_selection"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_ppidm/main.nf b/modules/local/insert_ppidm/main.nf
index 804bd95..3c5244e 100644
--- a/modules/local/insert_ppidm/main.nf
+++ b/modules/local/insert_ppidm/main.nf
@@ -2,7 +2,7 @@ process INSERT_PPIDM {
     tag "insert_ppidm"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/insert_single_domain_ppi/main.nf b/modules/local/insert_single_domain_ppi/main.nf
index a2c4ea4..3884ba6 100644
--- a/modules/local/insert_single_domain_ppi/main.nf
+++ b/modules/local/insert_single_domain_ppi/main.nf
@@ -2,7 +2,7 @@ process INSERT_SINGLE_DOMAIN_PPI {
     tag "insert_single_domain_ppi"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/minimal_leakage_split/main.nf b/modules/local/minimal_leakage_split/main.nf
index be5dc15..f5130d9 100644
--- a/modules/local/minimal_leakage_split/main.nf
+++ b/modules/local/minimal_leakage_split/main.nf
@@ -2,7 +2,7 @@ process EXTRACT_DOMAIN_SEQUENCES {
     tag "domains"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
@@ -42,7 +42,7 @@ process MINIMAL_LEAKAGE_SPLIT_DOMAIN {
     tag "minimal_leakage_domain"
     label 'process_high'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path "domainsplit.sqlite3"
diff --git a/modules/local/negatome/main.nf b/modules/local/negatome/main.nf
index c1e04fc..a3eaf61 100644
--- a/modules/local/negatome/main.nf
+++ b/modules/local/negatome/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_NEGATOME {
     tag "negatome"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val url
diff --git a/modules/local/pfam/main.nf b/modules/local/pfam/main.nf
index 2025a10..b48cf04 100644
--- a/modules/local/pfam/main.nf
+++ b/modules/local/pfam/main.nf
@@ -2,7 +2,7 @@ process DOWNLOAD_PFAM_ALIGNMENTS_BATCH {
     tag { "batch_${pfam_ids_list.size()}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     maxRetries 3
     errorStrategy { task.attempt <= 3 ? 'retry' : 'ignore' }
@@ -88,7 +88,7 @@ process CREATE_PROTEIN_DOMAIN_MAPPING {
     tag { "${uniprot_map_file.simpleName}" }
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path uniprot_map_file
diff --git a/modules/local/random_ddi_split/main.nf b/modules/local/random_ddi_split/main.nf
index cbe6e3e..e9a4564 100644
--- a/modules/local/random_ddi_split/main.nf
+++ b/modules/local/random_ddi_split/main.nf
@@ -2,7 +2,7 @@ process RANDOM_DDI_SPLIT {
     tag "random_ddi"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path 'domainsplit.sqlite3'
diff --git a/modules/local/remove_self_interactions/main.nf b/modules/local/remove_self_interactions/main.nf
index 7b77ce4..f453df5 100644
--- a/modules/local/remove_self_interactions/main.nf
+++ b/modules/local/remove_self_interactions/main.nf
@@ -2,7 +2,7 @@ process REMOVE_SELF_INTERACTIONS {
     tag "remove_self_interactions"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db_in, stageAs: 'input.domainsplit.sqlite3'
diff --git a/modules/local/select_ppi_negative_dans/main.nf b/modules/local/select_ppi_negative_dans/main.nf
index 6163df8..d06c827 100644
--- a/modules/local/select_ppi_negative_dans/main.nf
+++ b/modules/local/select_ppi_negative_dans/main.nf
@@ -2,7 +2,7 @@ process SELECT_PPI_NEGATIVE_DANS {
     tag "select_ppi_negative_dans:${method}"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val  method
diff --git a/modules/local/smoke_filter/main.nf b/modules/local/smoke_filter/main.nf
index 718258d..fb2232b 100644
--- a/modules/local/smoke_filter/main.nf
+++ b/modules/local/smoke_filter/main.nf
@@ -2,7 +2,7 @@ process SMOKE_FILTER {
     tag "smoke_filter"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     path domainsplit_db
diff --git a/modules/local/swissprot_map/main.nf b/modules/local/swissprot_map/main.nf
index 674cd01..d79b51c 100644
--- a/modules/local/swissprot_map/main.nf
+++ b/modules/local/swissprot_map/main.nf
@@ -2,7 +2,7 @@ process BUILD_SWISSPROT_PFAM_MAP {
     tag "swissprot_map"
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val url
diff --git a/modules/local/util/main.nf b/modules/local/util/main.nf
index c8328d1..4d57032 100644
--- a/modules/local/util/main.nf
+++ b/modules/local/util/main.nf
@@ -15,7 +15,7 @@ process SHARD_FASTA {
     tag { "${input_fasta.simpleName}:${num_shards}" }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     tuple val(meta), path(input_fasta)
@@ -58,7 +58,7 @@ process JOIN_HDF_FILES {
     tag { output_name }
     label 'process_low'
     conda "${moduleDir}/environment.yml"
-    container "konstantinpelz/domainsplit-general:1.0.0"
+    container "docker.io/konstantinpelz/domainsplit-general:1.0.0"
 
     input:
     val output_name

From 2aca8c52017eab6e622a96532dd15ed69c00cc52 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Fri, 12 Jun 2026 12:38:19 +0200
Subject: [PATCH 15/16] added docker home directory

---
 nextflow.config | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 2f39696..6533539 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -162,7 +162,10 @@ profiles {
         shifter.enabled         = false
         charliecloud.enabled    = false
         apptainer.enabled       = false
-        docker.runOptions       = '-u $(id -u):$(id -g)'
+        // -e HOME=/tmp: the micromamba-based images run `micromamba run` as their
+        // entrypoint, which needs a writable HOME for its proc dir. With `-u uid:gid`
+        // the container has no home (HOME=/), so point it at world-writable /tmp.
+        docker.runOptions       = '-u $(id -u):$(id -g) -e HOME=/tmp'
     }
     arm64 {
         process.arch            = 'arm64'
@@ -176,7 +179,7 @@ profiles {
         wave.strategy           = 'conda,container'
     }
     emulate_amd64 {
-        docker.runOptions       = '-u $(id -u):$(id -g) --platform=linux/amd64'
+        docker.runOptions       = '-u $(id -u):$(id -g) -e HOME=/tmp --platform=linux/amd64'
     }
     singularity {
         singularity.enabled     = true
@@ -233,7 +236,7 @@ profiles {
         wave.strategy           = 'conda,container'
     }
     gpu {
-        docker.runOptions       = '-u $(id -u):$(id -g) --gpus all'
+        docker.runOptions       = '-u $(id -u):$(id -g) -e HOME=/tmp --gpus all'
         apptainer.runOptions    = '--nv'
         singularity.runOptions  = '--nv'
     }

From a09986f7628033eff6b1dd523a5d89ff261867b0 Mon Sep 17 00:00:00 2001
From: Konstantin Pelz <konstantin.pelz@tum.de>
Date: Fri, 12 Jun 2026 12:49:23 +0200
Subject: [PATCH 16/16] removed stuff from snapshot

---
 tests/default.nf.test      | 7 ++++---
 tests/default.nf.test.snap | 2 --
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/default.nf.test b/tests/default.nf.test
index dde4cdc..d929657 100644
--- a/tests/default.nf.test
+++ b/tests/default.nf.test
@@ -20,9 +20,10 @@ nextflow_pipeline {
             // empty .gz/.h5 stubs break content hashing). We therefore snapshot only
             // the set of produced output paths -- this verifies the whole DAG wires
             // together (channel topology, the split fan-out, publish paths) end to
-            // end without any downloads, GPU, or containers. pipeline_info/ is
-            // ignored because its filenames embed a run timestamp.
-            def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/execution_*', 'pipeline_info/pipeline_dag_*'])
+            // end without any downloads, GPU, or containers. Everything under
+            // pipeline_info/ is ignored because some filenames there embed a run
+            // timestamp (e.g. params_<timestamp>.json), which is non-deterministic.
+            def stable_path = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*'])
             // Snapshot the collated software versions with the Nextflow version line
             // stripped (so the assertion survives Nextflow upgrades). This also
             // satisfies the nf-core `nf_test_content` lint rule, which requires a
diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap
index 483c015..864ab56 100644
--- a/tests/default.nf.test.snap
+++ b/tests/default.nf.test.snap
@@ -59,8 +59,6 @@
                 "negative_ppi",
                 "negative_ppi/negative_ppi_method_scores.tsv",
                 "pipeline_info",
-                "pipeline_info/nf_core_pipeline_software_mqc_versions.yml",
-                "pipeline_info/params_2026-06-12_11-10-10.json",
                 "random",
                 "random/optimization.sqlite3",
                 "random/test.sqlite3",