From 7b778443b14bfdd87c061d406af3eeacd2013a4c Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 13:15:10 +0000
Subject: [PATCH 01/13] download ms2pip models to correct folder

---
 bin/chunked_ms2rescore.py                 |  2 +-
 bin/ms2rescore_check_or_download_model.py | 33 +++++++++++++++++++++++
 src/postprocessing/ms2rescore.nf          | 27 +++++++++++++++++--
 3 files changed, 59 insertions(+), 3 deletions(-)
 create mode 100755 bin/ms2rescore_check_or_download_model.py

diff --git a/bin/chunked_ms2rescore.py b/bin/chunked_ms2rescore.py
index 14dbea9..e22eb86 100755
--- a/bin/chunked_ms2rescore.py
+++ b/bin/chunked_ms2rescore.py
@@ -16,7 +16,7 @@ def argparse_setup():
     parser.add_argument("-spectra", help="Corresponding mzML file or .d path for PSMs file", required=True, type=str)
 
     parser.add_argument("-model", help="Model for MS2PIP", default="HCD", type=str)
-    parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="/mnt/data/ms2pip-model", type=str)
+    parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="./ms2pip-model", type=str)
     parser.add_argument("-ms2_tolerance", help="The MS2/fragment tolerance", default=0.02, type=float)
     parser.add_argument("-spectrum_id_pattern", help="The spectrum ID pattern to correspond PSMs to spectra", default="(.*)", type=str)
     parser.add_argument("-processes", help="Number of processes / threads to use", default=8, type=int)
diff --git a/bin/ms2rescore_check_or_download_model.py b/bin/ms2rescore_check_or_download_model.py
new file mode 100755
index 0000000..696055a
--- /dev/null
+++ b/bin/ms2rescore_check_or_download_model.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+"""Validate / download the files of the requested MS2PIP model in the given model directory."""
+import argparse
+import logging
+import sys
+
+from ms2pip.constants import MODELS
+from ms2pip._utils.xgb_models import validate_requested_xgb_model
+
+
+def argparse_setup():
+    """Parse the ``-ms2pip_model`` and ``-model_dir`` command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-ms2pip_model", help="Model for MS2PIP", default="HCD", type=str)
+    parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="./ms2pip-model", type=str)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = argparse_setup()
+    logging.basicConfig(level=logging.INFO)
+    ms2pip_model = args.ms2pip_model
+    model_dir = args.model_dir
+
+    # An unknown model name would otherwise be skipped silently; exit with a non-zero status instead.
+    if ms2pip_model not in MODELS:
+        sys.exit(f"Unknown MS2PIP model '{ms2pip_model}', known models: {', '.join(sorted(MODELS))}")
+
+    # Validate / download requested model; only models with an "xgboost_model_files" entry are checked.
+    print(f"Checking {ms2pip_model} model")
+    model_info = MODELS[ms2pip_model]
+    if "xgboost_model_files" in model_info:
+        validate_requested_xgb_model(model_info["xgboost_model_files"], model_info["model_hash"], model_dir)
diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf
index a9c3ffd..c13d701 100644
--- a/src/postprocessing/ms2rescore.nf
+++ b/src/postprocessing/ms2rescore.nf
@@ -5,6 +5,7 @@ params.ms2rescore_threads = 4
 params.ms2rescore_mem = "64 GB"
 params.ms2rescore_model = "HCD"
 params.ms2rescore_chunk_size = 100000
+params.ms2pip_model_dir = "./ms2pip-model"
 
 workflow ms2rescore_workflow {
     take:
@@ -15,7 +16,10 @@ workflow ms2rescore_workflow {
     searchengine
 
     main:
-    ms2rescore_pre_pins = run_chunked_ms2rescore(psm_tsvs_and_mzmls, psm_tsvs, mzmls, spectrum_id_pattern, params.fragment_tol_da)
+    ms2pip_model_dir = Channel.fromPath(params.ms2pip_model_dir, type: 'dir').first()
+    check_or_download_model(ms2pip_model_dir, params.ms2rescore_model)
+
+    ms2rescore_pre_pins = run_chunked_ms2rescore(psm_tsvs_and_mzmls, psm_tsvs, mzmls, spectrum_id_pattern, params.fragment_tol_da, ms2pip_model_dir)
     ms2rescore_pins = correct_psm_utils_pins(ms2rescore_pre_pins, searchengine)
 
     emit:
@@ -36,6 +40,7 @@ process run_chunked_ms2rescore {
     path mzmls
     val spectrum_id_pattern
     val fragment_tol_da
+    path ms2pip_model_dir
     
     output:
     path "*.ms2rescore.pin", emit: features_file
@@ -44,7 +49,7 @@ process run_chunked_ms2rescore {
     """
     chunked_ms2rescore.py -psms_file ${psm_utils_tsvs} \
         -spectra ${mzml_for_psms} \
-        -model ${params.ms2rescore_model} -model_dir "/mnt/data/ms2pip-model" \
+        -model ${params.ms2rescore_model} -model_dir "${ms2pip_model_dir}" \
         -ms2_tolerance ${fragment_tol_da} \
         -spectrum_id_pattern '${spectrum_id_pattern}' \
         -processes ${params.ms2rescore_threads} \
@@ -75,3 +80,21 @@ process correct_psm_utils_pins {
     awk '{FS="\t";OFS="\t"; if (NR>1) { \$3=\$1; \$1=NR-1; gsub(".*=", "", \$3)  } print}' ${psm_utils_pins} > ${psm_utils_pins.baseName}.corrected.pin
     """
 }
+
+/** Runs ms2rescore_check_or_download_model.py with the given MS2PIP model name and model directory. */
+process check_or_download_model {
+    cpus 1
+    memory '2 GB'
+    maxForks 1
+
+    container { params.python_image }
+
+    input:
+    path model_dir
+    val ms2rescore_model
+    // NOTE(review): no output is declared and the workflow hands the original directory channel to run_chunked_ms2rescore, so nothing orders that process after this one - confirm it cannot start before the download has finished
+    script:
+    """
+    ms2rescore_check_or_download_model.py -ms2pip_model ${ms2rescore_model} -model_dir "${model_dir}"
+    """
+}
\ No newline at end of file

From dd36a203f9eedfa95af1b4cb629f09afdb525556 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 13:38:18 +0000
Subject: [PATCH 02/13] different memory params for oktoberfest tasks

---
 src/postprocessing/oktoberfest.nf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index cc87213..b362551 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -1,7 +1,8 @@
 nextflow.enable.dsl=2
 
 // parameters for oktoberfest
-params.oktoberfest_memory = "64 GB"
+params.oktoberfest_memory = "64.GB"
+params.oktoberfest_to_pin_memory = "4.GB"
 params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD"
 params.oktoberfest_irt_model = "Prosit_2019_irt"
 params.oktoberfest_forks = 1 // have some mercy with the koina servers
@@ -85,7 +86,7 @@ process run_oktoberfest_feature_gen {
  */
 process oktoberfest_features_to_pin {
     cpus 1
-    memory { params.oktoberfest_memory }
+    memory { params.oktoberfest_to_pin_memory }
 
     container { params.oktoberfest_image }
 

From 9dabbe9bba0c32c777dd0b83a34c7260a476fb47 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 13:45:34 +0000
Subject: [PATCH 03/13] parameter for memory consumption of fdrbench

---
 src/preprocess/create_entrapment_database.nf | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf
index 257185a..1bf6d3c 100644
--- a/src/preprocess/create_entrapment_database.nf
+++ b/src/preprocess/create_entrapment_database.nf
@@ -2,6 +2,8 @@ nextflow.enable.dsl=2
 
 params.fdrbench_image = 'quay.io/medbioinf/fdrbench-nightly:146f77'
 
+params.fdrbench_mem_gb = 16
+
 
 /**
  * Adds decoys and/or entapments to the FASTA file.
@@ -16,7 +18,7 @@ workflow create_entrapment_database {
         fold
 
     main:
-        entrapment_fasta = call_entrapment_database(fasta, fold)
+        entrapment_fasta = call_entrapment_database(fasta, fold, params.fdrbench_mem_gb)
         
     emit:
         entrapment_fasta
@@ -34,18 +36,21 @@ workflow create_entrapment_database {
  */
 process call_entrapment_database {
     cpus 1
+    memory "${ memory_limit }.GB"
+
     container { params.fdrbench_image }
 
     input: 
     path fasta
     val fold
+    val memory_limit
 
     output:
     path "${fasta.baseName}-entrapment.fasta"
 
     script:
     """
-    java -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check
+    java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check
     # 'Reheader' to add entrapment index to database and accession part of the header
     sed -r -i "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" ${fasta.baseName}-entrapment.fasta
     """

From 3bafb1d87104a6f35303c949d5bc3925b2423d86 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 14:03:05 +0000
Subject: [PATCH 04/13] info for ms2pip model download

---
 src/postprocessing/ms2rescore.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf
index c13d701..e9d833d 100644
--- a/src/postprocessing/ms2rescore.nf
+++ b/src/postprocessing/ms2rescore.nf
@@ -85,7 +85,7 @@ process correct_psm_utils_pins {
 process check_or_download_model {
     cpus 1
     memory '2 GB'
-    maxForks 1
+    maxForks 1  // this makes sure that the download is only performed once, not more in parallel
 
     container { params.python_image }
 

From ac8af461d3c63e1e2463e3de9f2af5afde428da5 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 14:44:53 +0000
Subject: [PATCH 05/13] container images as labels in config

---
 main.nf                                       |  37 -----
 nextflow.config                               | 138 +++++++++++++++++-
 src/identification/comet_identification.nf    |  10 +-
 src/identification/maxquant_identification.nf |   7 +-
 src/identification/msamanda_identification.nf |   7 +-
 .../msfragger_identification.nf               |  10 +-
 src/identification/msgfplus_identification.nf |  20 +--
 src/identification/sage_identification.nf     |  13 +-
 src/identification/xtandem_identification.nf  |  10 +-
 .../convert_and_enhance_psm_tsv.nf            |   9 +-
 src/postprocessing/ms2rescore.nf              |   7 +-
 src/postprocessing/oktoberfest.nf             |   4 +-
 src/postprocessing/percolator.nf              |   7 +-
 src/preprocess/convert_to_mzml.nf             |  16 +-
 src/preprocess/create_decoy_database.nf       |   8 +-
 src/preprocess/create_entrapment_database.nf  |   7 +-
 16 files changed, 189 insertions(+), 121 deletions(-)

diff --git a/main.nf b/main.nf
index 003c608..d4409e4 100644
--- a/main.nf
+++ b/main.nf
@@ -2,43 +2,6 @@
 // Nextflow pipeline for peptide identification with multiple search engines and post-processing tools
 //
 
-// default python image
-params.python_image = 'ghcr.io/medbioinf/pipeline-of-identification:latest'
-params.oktoberfest_image = 'medbioinf/oktoberfest'
-
-// parameters set by the command line
-params.raw_files = ''
-params.mzml_files = ''    // may contain globs
-params.fasta = ''
-params.fasta_target_decoy = ''
-params.precursor_tol_ppm = 10
-params.fragment_tol_da = 0.02
-params.is_timstof = false
-params.entrapment_fold = 0
-params.use_only_rank1_psms = true
-
-// keep the (converted) mzML files
-params.keep_mzmls = true
-
-// should the search engines be executed?
-params.execute_comet = true
-params.execute_maxquant = true
-params.execute_msamanda = true
-params.execute_msfragger = true
-params.execute_msgfplus = true
-params.execute_sage = true
-params.execute_xtandem = true
-
-// default parameter files
-params.outdir = './'
-params.comet_params_file = "${baseDir}/config/comet.params"
-params.maxquant_params_file = "${baseDir}/config/mqpar.xml"
-params.msamanda_config_file = "${baseDir}/config/msamanda_settings.xml"
-params.msfragger_config_file = "${baseDir}/config/closed_fragger.params"
-params.msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt"
-params.sage_config_file = "${baseDir}/config/sage_config.json"
-params.xtandem_config_file = "${baseDir}/config/xtandem_input.xml"
-
 // including modules
 include {create_entrapment_database} from "./src/preprocess/create_entrapment_database.nf"
 include {create_decoy_database} from "./src/preprocess/create_decoy_database.nf"
diff --git a/nextflow.config b/nextflow.config
index 3b375e3..54e5418 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,9 +1,135 @@
-workflow.output.mode = "copy"
+// Global default params, used in configs
+params {
+  // parameters set by the command line
+  raw_files = ''
+  mzml_files = ''    // may contain globs
+  fasta = ''
+  fasta_target_decoy = ''
+  precursor_tol_ppm = 10
+  fragment_tol_da = 0.02
+  is_timstof = false
+  entrapment_fold = 0
+  use_only_rank1_psms = true
+
+  // keep the (converted) mzML files
+  keep_mzmls = true
+
+  // should the search engines be executed?
+  execute_comet = true
+  execute_maxquant = true
+  execute_msamanda = true
+  execute_msfragger = true
+  execute_msgfplus = true
+  execute_sage = true
+  execute_xtandem = true
+
+  // default parameter files
+  outdir = './'
+  comet_params_file = "${baseDir}/config/comet.params"
+  maxquant_params_file = "${baseDir}/config/mqpar.xml"
+  msamanda_config_file = "${baseDir}/config/msamanda_settings.xml"
+  msfragger_config_file = "${baseDir}/config/closed_fragger.params"
+  msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt"
+  sage_config_file = "${baseDir}/config/sage_config.json"
+  xtandem_config_file = "${baseDir}/config/xtandem_input.xml"  
+}
+
+manifest {
+  name = 'mpc-bioinformatics/McQuaC'
+  contributors = [
+    [
+      name: 'Julian Uszkoreit',
+      affiliation: 'Ruhr University Bochum, Medical Bioinformatics',
+      github: '@julianu',
+      contribution: ['author', 'maintainer', 'contributor'],
+      orcid: '0000-0001-7522-4007',
+    ],
+    [
+      name: 'Dirk Winkelhardt',
+      affiliation: 'Ruhr University Bochum, Medical Bioinformatics',
+      github: '@di-hardt',
+      contribution: ['author', 'maintainer', 'contributor'],
+      orcid: '0000-0001-8770-2221',
+    ],
+  ]
+  homePage = 'https://github.com/medbioinf/pipeline-of-identification'
+  description = """A pipeline for the identification of peptides from mass spectrometry data, integrating multiple search engines and post-processing tools."""
+  mainScript = 'main.nf'
+  defaultBranch = 'main'
+  nextflowVersion = '!>=24.10.6'
+  version = '0.1.0'
+}
 
 profiles {
-    docker {
-        docker.enabled = true
-        docker.runOptions = "--user=root"
-        docker.fixOwnership = true
+  docker {
+    docker.enabled = true
+    docker.runOptions = "--user=root"
+    docker.fixOwnership = true
+
+    process {
+      withLabel: python_image {
+        container = 'ghcr.io/medbioinf/pipeline-of-identification:latest'
+      }
+
+      withLabel: comet_image {
+        container = 'quay.io/medbioinf/comet-ms:v2024.01.0'
+      }
+
+      withLabel: maxquant_image {
+        container = 'quay.io/medbioinf/maxquant:2.6.3.0'
+      }
+
+      withLabel: msamanda_image {
+        container = 'quay.io/medbioinf/msamanda:3.0.22.071'
+      }
+
+      withLabel: msfragger_image {
+        container = 'medbioinf/msfragger'
+      }
+
+      withLabel: msgfplus_image {
+        container = 'quay.io/medbioinf/msgfplus:v2024.03.26'
+      }
+
+      withLabel: mzidmerger_image {
+        container = 'quay.io/medbioinf/mzid-merger:1.4.26'
+      }
+
+      withLabel: sage_image {
+        container = 'quay.io/medbioinf/sage:v0.15.0-beta.1'
+      }
+
+      withLabel: xtandem_image {
+        container = 'quay.io/medbioinf/xtandem:2017.2.1.4'
+      }
+
+      withLabel: percolator_image {
+        container = 'ghcr.io/percolator/percolator:branch-3-08'
+      }
+
+      withLabel: msconvert_image {
+        container = 'proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef'
+      }
+
+      withLabel: tdf2mzml_image {
+        container = 'quay.io/medbioinf/tdf2mzml:0.4'
+      }
+
+      withLabel: oktoberfest_image {
+        container = 'medbioinf/oktoberfest'
+      }
+
+      withLabel: openms_image {
+        container = 'quay.io/medbioinf/openms:3.4.1'
+      }
+
+      withLabel: fdrbench_image {
+        container = 'quay.io/medbioinf/fdrbench-nightly:146f77'
+      }
     }
-}
\ No newline at end of file
+  }
+}
+
+plugins {
+  id 'nf-schema@2.5.0'
+}
diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index 73aec6d..4905b78 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.comet_image = 'quay.io/medbioinf/comet-ms:v2024.01.0'
-
 // number of threads used by comet
 params.comet_threads = 16
 params.comet_mem = "8 GB"
@@ -51,7 +47,8 @@ workflow comet_identification {
 process adjust_comet_param_file {
     cpus 2
     memory "1 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path comet_params_file
@@ -87,7 +84,8 @@ process adjust_comet_param_file {
 process identification_with_comet {
     cpus { params.comet_threads }
     memory { params.comet_mem }
-    container { params.comet_image }
+
+    label 'comet_image'
 
 	publishDir "${params.outdir}/comet", mode: 'copy'
 
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index 07a87e6..a58af47 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.maxquant_image = 'quay.io/medbioinf/maxquant:2.6.3.0'
-
 // number of threads used by maxquant
 params.maxquant_threads = 4
 params.maxquant_mem = "32 GB"
@@ -88,7 +84,8 @@ workflow maxquant_identification {
 process identification_with_maxquant {
     cpus { params.maxquant_threads }
     memory { params.maxquant_mem }
-    container { params.maxquant_image }
+
+    label 'maxquant_image'
 
     publishDir "${params.outdir}/maxquant", mode: 'copy'
 
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index b746108..2812daa 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.msamanda_image = 'quay.io/medbioinf/msamanda:3.0.22.071'
-
 // number of threads used by msamanda
 params.msamanda_threads = 16
 params.msamanda_mem = "64 GB"
@@ -56,7 +52,8 @@ workflow msamanda_identification {
 process identification_with_msamanda {
     cpus { params.msamanda_threads }
     memory { params.msamanda_mem }
-    container { params.msamanda_image }
+
+    label 'msamanda_image'
 
     publishDir "${params.outdir}/msamanda", mode: 'copy'
 
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index 7c69d57..408dfc7 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.msfragger_image = 'medbioinf/msfragger'
-
 // parameters for MSFragger
 params.msfragger_threads = 16
 params.msfragger_mem_gb = 16
@@ -50,7 +46,8 @@ workflow msfragger_identification {
 process adjust_msfragger_param_file {
     cpus 2
     memory "1 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path fragger_params_file
@@ -89,7 +86,8 @@ process adjust_msfragger_param_file {
 process identification_with_msfragger {
     cpus { params.msfragger_threads }
     memory { params.msfragger_mem_gb + " GB" }
-    container { params.msfragger_image }
+
+    label 'msfragger_image'
     
     publishDir "${params.outdir}/msfragger", mode: 'copy'
 
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index 88b9ea5..c7c8c4b 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -1,8 +1,3 @@
-nextflow.enable.dsl=2
-
-params.msgfplus_image = 'quay.io/medbioinf/msgfplus:v2024.03.26'
-params.mzidmerger_image = 'quay.io/medbioinf/mzid-merger:1.4.26'
-
 // params for MS-GF+
 params.msgfplus_threads = 6
 params.msgfplus_mem_gb = 16
@@ -103,7 +98,8 @@ workflow msgfplus_identification {
 process identification_with_msgfplus {
     cpus { params.msgfplus_threads }
     memory { params.msgfplus_mem_gb + " GB" }
-    container { params.msgfplus_image }
+
+    label 'msgfplus_image'
 
     publishDir "${params.outdir}/msgfplus", mode: 'copy', enabled: { publish_results }
 
@@ -136,7 +132,8 @@ process identification_with_msgfplus {
 process split_fasta {
     cpus 2
     memory "8 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path fasta
@@ -154,7 +151,8 @@ process split_fasta {
 process build_msgfplus_index {
     cpus { params.msgfplus_threads }
     memory { params.msgfplus_mem_gb + " GB" }
-    container { params.msgfplus_image }
+
+    label 'msgfplus_image'
 
     input:
     path fasta
@@ -172,7 +170,8 @@ process build_msgfplus_index {
 process merge_psms {
     cpus 2
     memory { params.msgfplus_merge_mem_gb + " GB" }
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     tuple val(original_mzml_basename), path(psm_tsvs)
@@ -190,7 +189,8 @@ process merge_psms {
 process mzid_merger {
     cpus 2
     memory "8 GB"
-    container { params.mzidmerger_image }
+
+    label 'mzidmerger_image'
 
     publishDir "${params.outdir}/msgfplus", mode: 'copy'
 
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index 0e69d79..96d8c15 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.sage_image = 'quay.io/medbioinf/sage:v0.15.0-beta.1'
-
 // number of threads used by sage
 params.sage_threads = 16
 params.sage_mem = "128 GB"
@@ -56,7 +52,8 @@ workflow sage_identification {
 process adjust_sage_config {
     cpus 2
     memory "1 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path default_config_file
@@ -95,7 +92,8 @@ with open("./adjusted_sage_config.json", "w") as outfile:
 process identification_with_sage {
     cpus { params.sage_threads }
     memory { params.sage_mem }
-    container { params.sage_image }
+
+    label 'sage_image'
 
     input:
     path sage_config_file
@@ -115,7 +113,8 @@ process identification_with_sage {
 process separate_sage_results {
     cpus 2
     memory "1 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     publishDir "${params.outdir}/sage", mode: 'copy'
 
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 3f7856a..8354144 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.xtandem_image = 'quay.io/medbioinf/xtandem:2017.2.1.4'
-
 // number of threads used by xtandem
 params.xtandem_threads = 16
 params.xtandem_mem = "128 GB"
@@ -60,7 +56,8 @@ workflow xtandem_identification {
 process create_xtandem_params_files_from_default {
     cpus 2
     memory "1 GB"
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path xtandem_config_file
@@ -116,7 +113,8 @@ process create_xtandem_params_files_from_default {
 process identification_with_xtandem {
     cpus { params.xtandem_threads }
     memory { params.xtandem_mem }
-    container { params.xtandem_image }
+
+    label 'xtandem_image'
     
     publishDir "${params.outdir}/xtandem", mode: 'copy'
 
diff --git a/src/postprocessing/convert_and_enhance_psm_tsv.nf b/src/postprocessing/convert_and_enhance_psm_tsv.nf
index b43a54b..11dfa1a 100644
--- a/src/postprocessing/convert_and_enhance_psm_tsv.nf
+++ b/src/postprocessing/convert_and_enhance_psm_tsv.nf
@@ -48,7 +48,8 @@ workflow enhance_psm_tsv {
 process convert_searchengine_to_psm_utils {
     cpus 2
     memory { params.convert_psm_tsv_mem }
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     path searchengine_results
@@ -66,7 +67,8 @@ process convert_searchengine_to_psm_utils {
 process convert_chunked_result_to_psm_utils {
     cpus 2
     memory { params.convert_psm_tsv_mem }
-    container { params.python_image }
+
+    label 'python_image'
 
     input:
     tuple val(original_mzml_basename), path(searchengine_results)
@@ -91,7 +93,8 @@ process convert_chunked_result_to_psm_utils {
 process enhance_psms_and_create_pin {
     cpus 2
     memory { params.enhance_psm_tsv_mem }
-    container { params.python_image }
+
+    label 'python_image'
 
 	publishDir "${params.outdir}/${searchengine}", mode: 'copy'
 
diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf
index e9d833d..8965ae7 100644
--- a/src/postprocessing/ms2rescore.nf
+++ b/src/postprocessing/ms2rescore.nf
@@ -31,8 +31,7 @@ process run_chunked_ms2rescore {
     cpus  { params.ms2rescore_threads }
     memory { params.ms2rescore_mem }
 
-    container { params.python_image }
-    containerOptions { "-v /mnt/data/projects/pipeline-of-identification/bin/ms2pip-model:/mnt/data/ms2pip-model" }
+    label 'python_image'
 
     input:
     tuple val(psm_utils_tsvs), val(mzml_for_psms)
@@ -63,7 +62,7 @@ process correct_psm_utils_pins {
     cpus  2
     memory '8 GB'
 
-    container { params.python_image }
+    label 'python_image'
 
 	publishDir "${params.outdir}/${searchengine}", mode: 'copy'
 
@@ -87,7 +86,7 @@ process check_or_download_model {
     memory '2 GB'
     maxForks 1  // this makes sure that the download is only performed once, not more in parallel
 
-    container { params.python_image }
+    label 'python_image'
 
     input:
     path model_dir
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index b362551..d266d87 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -48,7 +48,7 @@ process run_oktoberfest_feature_gen {
     maxForks params.oktoberfest_forks
     memory { params.oktoberfest_memory }
 
-    container { params.oktoberfest_image }
+    label 'oktoberfest_image'
 
     input:
     tuple val(psm_utils_tsvs), val(mzml_for_psms)
@@ -88,7 +88,7 @@ process oktoberfest_features_to_pin {
     cpus 1
     memory { params.oktoberfest_to_pin_memory }
 
-    container { params.oktoberfest_image }
+    label 'oktoberfest_image'
 
 	publishDir "${params.outdir}/${searchengine}", mode: 'copy'
 
diff --git a/src/postprocessing/percolator.nf b/src/postprocessing/percolator.nf
index 8c227f7..f32f61a 100644
--- a/src/postprocessing/percolator.nf
+++ b/src/postprocessing/percolator.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.percolator_image = 'ghcr.io/percolator/percolator:branch-3-08'
-
 // number of threads used by percolator
 params.percolator_threads = 4
 params.percolator_mem = "4 GB"
@@ -27,7 +23,8 @@ workflow psm_percolator {
 process run_percolator {
     cpus  { params.percolator_threads }
     memory { params.percolator_mem }
-    container { params.percolator_image }
+
+    label 'percolator_image'
 
 	publishDir "${params.outdir}/${searchengine}", mode: 'copy'
 
diff --git a/src/preprocess/convert_to_mzml.nf b/src/preprocess/convert_to_mzml.nf
index c7a96c2..01ea3fe 100644
--- a/src/preprocess/convert_to_mzml.nf
+++ b/src/preprocess/convert_to_mzml.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.msconvert_image = 'proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef'
-params.tdf2mzml_image = 'quay.io/medbioinf/tdf2mzml:0.4'
 params.tdf2mzml_threads = 8
 
 workflow convert_to_mzml {
@@ -23,7 +19,8 @@ workflow convert_to_mzml {
 process convert_thermo_raw {
     cpus 2
     memory "8 GB"
-    container { params.msconvert_image }
+
+    label 'msconvert_image'
 
 	publishDir "${params.outdir}/mzmls", mode: 'copy', enabled: params.keep_mzmls
 
@@ -42,7 +39,8 @@ process convert_thermo_raw {
 process convert_bruker_d {
     cpus { params.tdf2mzml_threads }
     memory "8 GB"
-    container { params.tdf2mzml_image }
+    
+    label 'tdf2mzml_image'
 
     input:
     path input_d
@@ -63,7 +61,8 @@ process convert_bruker_d {
 process adjust_mzML {
     cpus 2
     memory "8 GB"
-    container { params.msconvert_image }
+
+    label 'msconvert_image'
 
 	publishDir "${params.outdir}/mzmls", mode: 'copy', enabled: params.keep_mzmls
 
@@ -94,7 +93,8 @@ process adjust_mzML {
 process split_mzml_into_chunks {
     cpus 2
     memory "8 GB"
-    container { params.msconvert_image }
+
+    label 'msconvert_image'
 
     input:
     val chunksize
diff --git a/src/preprocess/create_decoy_database.nf b/src/preprocess/create_decoy_database.nf
index a82adad..e20023e 100644
--- a/src/preprocess/create_decoy_database.nf
+++ b/src/preprocess/create_decoy_database.nf
@@ -1,7 +1,3 @@
-nextflow.enable.dsl=2
-
-params.openms_image = 'quay.io/medbioinf/openms:3.4.1'
-
 // number of threads used by maxquant
 params.decoy_database_threads = 4
 
@@ -26,7 +22,9 @@ workflow create_decoy_database {
 
 process call_decoy_database {
     cpus { params.decoy_database_threads }
-    container { params.openms_image }
+    memory '8.GB'
+
+    label 'openms_image'
 
     input:
     path fasta
diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf
index 1bf6d3c..6d7759d 100644
--- a/src/preprocess/create_entrapment_database.nf
+++ b/src/preprocess/create_entrapment_database.nf
@@ -1,10 +1,5 @@
-nextflow.enable.dsl=2
-
-params.fdrbench_image = 'quay.io/medbioinf/fdrbench-nightly:146f77'
-
 params.fdrbench_mem_gb = 16
 
-
 /**
  * Adds decoys and/or entapments to the FASTA file.
  *
@@ -38,7 +33,7 @@ process call_entrapment_database {
     cpus 1
     memory "${ memory_limit }.GB"
 
-    container { params.fdrbench_image }
+    label 'fdrbench_image'
 
     input: 
     path fasta

From d90a2fbd9d624d1b20b49c548753774a0796b061 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 15:13:05 +0000
Subject: [PATCH 06/13] moving parameters from main and comet to config

---
 nextflow.config                            |  37 ++--
 nextflow_schema.json                       | 198 +++++++++++++++++++++
 src/identification/comet_identification.nf |   8 -
 3 files changed, 218 insertions(+), 25 deletions(-)
 create mode 100644 nextflow_schema.json

diff --git a/nextflow.config b/nextflow.config
index 54e5418..4000c0d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,36 +1,39 @@
 // Global default params, used in configs
 params {
-  // parameters set by the command line
-  raw_files = ''
-  mzml_files = ''    // may contain globs
+  // input / output parameters
+  outdir = './'
+  raw_files = './msms-files/*.raw'
+  mzml_files = ''
   fasta = ''
   fasta_target_decoy = ''
+  entrapment_fold = 0
+  keep_mzmls = true
+
+  // search specific parameters
   precursor_tol_ppm = 10
   fragment_tol_da = 0.02
   is_timstof = false
-  entrapment_fold = 0
   use_only_rank1_psms = true
 
-  // keep the (converted) mzML files
-  keep_mzmls = true
-
-  // should the search engines be executed?
   execute_comet = true
-  execute_maxquant = true
-  execute_msamanda = true
-  execute_msfragger = true
-  execute_msgfplus = true
-  execute_sage = true
-  execute_xtandem = true
-
-  // default parameter files
-  outdir = './'
   comet_params_file = "${baseDir}/config/comet.params"
+  comet_threads = 16
+  comet_mem = '8.GB'
+  comet_psm_id_pattern = '(.*)'
+  comet_spectrum_id_pattern = '.*scan=(\\d+)$'
+  comet_scan_id_pattern = '^(?P<scan_id>\\d+)$'
+
+  execute_maxquant = true
   maxquant_params_file = "${baseDir}/config/mqpar.xml"
+  execute_msamanda = true
   msamanda_config_file = "${baseDir}/config/msamanda_settings.xml"
+  execute_msfragger = true
   msfragger_config_file = "${baseDir}/config/closed_fragger.params"
+  execute_msgfplus = true
   msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt"
+  execute_sage = true
   sage_config_file = "${baseDir}/config/sage_config.json"
+  execute_xtandem = true
   xtandem_config_file = "${baseDir}/config/xtandem_input.xml"  
 }
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
new file mode 100644
index 0000000..13ccfcb
--- /dev/null
+++ b/nextflow_schema.json
@@ -0,0 +1,198 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://raw.githubusercontent.com/mpc-bioinformatics/McQuaC//nextflow_schema.json",
+  "title": "mpc-bioinformatics/McQuaC pipeline parameters",
+  "description": "A pipeline for the identification of peptides from mass spectrometry data, integrating multiple search engines and post-processing tools.",
+  "type": "object",
+  "$defs": {
+    "input_output_options": {
+      "title": "Input/output options",
+      "type": "object",
+      "fa_icon": "fas fa-terminal",
+      "description": "Define where the pipeline should find input data and save output data.",
+      "required": ["outdir", "raw_files"],
+      "properties": {
+        "outdir": {
+          "type": "string",
+          "format": "directory-path",
+          "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
+          "fa_icon": "fas fa-folder-open",
+          "default": "./"
+        },
+        "raw_files": {
+          "type": "string",
+          "description": "Path to the raw spectra files (either .raw or .d), can contain * to process multiple files",
+          "default": "./msms-files/*.raw"
+        },
+        "mzml_files": {
+          "type": "string",
+          "description": "If you already have the spectra converted to mzML, specify the files here. They must have the same base names as the raw spectra files."
+        },
+        "fasta": {
+          "type": "string",
+          "description": "Path to the protein FASTA file to use, in UniProt header format.",
+          "format": "file-path",
+          "mimetype": "text/fasta",
+          "exists": true
+        },
+        "fasta_target_decoy": {
+          "type": "string",
+          "description": "If you already have a target-decoy protein FASTA from a previous run, it can be given here.",
+          "format": "file-path",
+          "mimetype": "text/fasta"
+        },
+        "entrapment_fold": {
+          "type": "integer",
+          "default": 0,
+          "description": "This parameter specifies for an entrapment search the fold of entrapment proteins per target protein. No database will be created if 0."
+        },
+        "keep_mzmls": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether the converted mzML files should be kept in the results folder."
+        }
+      }
+    },
+    "general_search_parameters": {
+      "title": "General search parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "precursor_tol_ppm": {
+          "type": "integer",
+          "default": 10,
+          "description": "The precursor tolerance in PPM"
+        },
+        "fragment_tol_da": {
+          "type": "number",
+          "default": 0.02,
+          "description": "The fragment tolerance in Dalton"
+        },
+        "is_timstof": {
+          "type": "boolean",
+          "description": "Specify whether the data is timsTOF data"
+        },
+        "use_only_rank1_psms": {
+          "type": "boolean",
+          "default": true,
+          "description": "Use only top identifications for any post processing. Otherwise, all identifications will be used."
+        },
+        "execute_maxquant": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with MaxQuant"
+        },
+        "maxquant_params_file": {
+          "type": "string",
+          "default": "${projectDir}/config/mqpar.xml",
+          "description": "Path to the MaxQuant params file for additional, search engine specific settings."
+        },
+        "execute_msamanda": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with MSAmanda"
+        },
+        "msamanda_config_file": {
+          "type": "string",
+          "default": "${projectDir}/config/msamanda_settings.xml",
+          "description": "Path to the MSAmanda params file for additional, search engine specific settings."
+        },
+        "execute_msfragger": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with MS Fragger"
+        },
+        "msfragger_config_file": {
+          "type": "string",
+          "default": "${projectDir}/config/closed_fragger.params",
+          "description": "Path to the MS Fragger params file for additional, search engine specific settings."
+        },
+        "execute_msgfplus": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with MS-GF+"
+        },
+        "msgfplus_params_file": {
+          "type": "string",
+          "default": "${projectDir}/config/MSGFPlus_Params.txt",
+          "description": "Path to the MS-GF+ params file for additional, search engine specific settings."
+        },
+        "execute_sage": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with Sage"
+        },
+        "sage_config_file": {
+          "type": "string",
+          "default": "${projectDir}/config/sage_config.json",
+          "description": "Path to the Sage params file for additional, search engine specific settings."
+        },
+        "execute_xtandem": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with X!Tandem"
+        },
+        "xtandem_config_file": {
+          "type": "string",
+          "default": "${projectDir}/config/xtandem_input.xml",
+          "description": "Path to the X!Tandem params file for additional, search engine specific settings."
+        }
+      }
+    },
+    "comet_parameters": {
+      "title": "Comet parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "execute_comet": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with Comet"
+        },
+        "comet_params_file": {
+          "type": "string",
+          "default": "${projectDir}/config/comet.params",
+          "description": "Path to the Comet params file for additional, search engine specific settings."
+        },
+        "comet_threads": {
+          "type": "integer",
+          "default": 16,
+          "description": "Number of allowed threads / CPUs for Comet"
+        },
+        "comet_mem": {
+          "type": "string",
+          "default": "8.GB",
+          "description": "Number of allowed memory for Comet"
+        },
+        "comet_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "comet_spectrum_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(\\\\d+)$",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "comet_scan_id_pattern": {
+          "type": "string",
+          "default": "^(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID from Comet for Oktoberfest"
+        }
+      }
+    }
+  },
+  "allOf": [
+    {
+      "$ref": "#/$defs/input_output_options"
+    },
+    {
+      "$ref": "#/$defs/general_search_parameters"
+    },
+    {
+      "$ref": "#/$defs/comet_parameters"
+    }
+  ]
+}
diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index 4905b78..f34e681 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -1,11 +1,3 @@
-// number of threads used by comet
-params.comet_threads = 16
-params.comet_mem = "8 GB"
-
-params.comet_psm_id_pattern = "(.*)"
-params.comet_spectrum_id_pattern = '.*scan=(\\d+)$'
-params.comet_scan_id_pattern = '^(?P<scan_id>\\d+)$'
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'

From 524c3de0f7479227c1579a5b905dc82176cd1212 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 15:18:52 +0000
Subject: [PATCH 07/13] maxquant params to config and schema

---
 nextflow.config                               | 10 ++++
 nextflow_schema.json                          | 55 +++++++++++++++----
 src/identification/maxquant_identification.nf |  8 ---
 3 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 4000c0d..0791197 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -25,14 +25,24 @@ params {
 
   execute_maxquant = true
   maxquant_params_file = "${baseDir}/config/mqpar.xml"
+  maxquant_threads = 4
+  maxquant_mem = "32.GB"
+  maxquant_psm_id_pattern = ""
+  maxquant_spectrum_id_pattern = ""
+  maxquant_scan_id_pattern = ""
+
   execute_msamanda = true
   msamanda_config_file = "${baseDir}/config/msamanda_settings.xml"
+
   execute_msfragger = true
   msfragger_config_file = "${baseDir}/config/closed_fragger.params"
+
   execute_msgfplus = true
   msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt"
+
   execute_sage = true
   sage_config_file = "${baseDir}/config/sage_config.json"
+
   execute_xtandem = true
   xtandem_config_file = "${baseDir}/config/xtandem_input.xml"  
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 13ccfcb..187d157 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -78,16 +78,6 @@
           "default": true,
           "description": "Use only top identifications for any post processing. Otherwise, all identifications will be used."
         },
-        "execute_maxquant": {
-          "type": "boolean",
-          "default": true,
-          "description": "Whether to execute identification with MaxQuant"
-        },
-        "maxquant_params_file": {
-          "type": "string",
-          "default": "${projectDir}/config/mqpar.xml",
-          "description": "Path to the MaxQuant params file for additional, search engine specific settings."
-        },
         "execute_msamanda": {
           "type": "boolean",
           "default": true,
@@ -179,7 +169,47 @@
         "comet_scan_id_pattern": {
           "type": "string",
           "default": "^(?P<scan_id>\\\\d+)$",
-          "description": "Regular expression to parse the PSM ID from Comet for Oktoberfest"
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "maxquant_parameters": {
+      "title": "MaxQuant parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "execute_maxquant": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether to execute identification with MaxQuant"
+        },
+        "maxquant_params_file": {
+          "type": "string",
+          "default": "${projectDir}/config/mqpar.xml",
+          "description": "Path to the MaxQuant params file for additional, search engine specific settings."
+        },
+        "maxquant_threads": {
+          "type": "integer",
+          "default": 4,
+          "description": "Number of allowed threads / CPUs for MaxQuant"
+        },
+        "maxquant_mem": {
+          "type": "string",
+          "default": "32.GB",
+          "description": "Number of allowed memory for MaxQuant"
+        },
+        "maxquant_psm_id_pattern": {
+          "type": "string",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "maxquant_spectrum_id_pattern": {
+          "type": "string",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "maxquant_scan_id_pattern": {
+          "type": "string",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
         }
       }
     }
@@ -193,6 +223,9 @@
     },
     {
       "$ref": "#/$defs/comet_parameters"
+    },
+    {
+      "$ref": "#/$defs/maxquant_parameters"
     }
   ]
 }
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index a58af47..3fb9266 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -1,11 +1,3 @@
-// number of threads used by maxquant
-params.maxquant_threads = 4
-params.maxquant_mem = "32 GB"
-
-params.maxquant_psm_id_pattern = ""
-params.maxquant_spectrum_id_pattern = ""
-params.maxquant_scan_id_pattern = ""
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'

From 3f22ea2a49b5bc5661310e563e824d3eef66ff69 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 10 Sep 2025 15:28:59 +0000
Subject: [PATCH 08/13] move all parameters to config

---
 nextflow.config                               | 58 +++++++++++++++++++
 src/identification/msamanda_identification.nf |  8 ---
 .../msfragger_identification.nf               | 10 ----
 src/identification/msgfplus_identification.nf | 15 -----
 src/identification/sage_identification.nf     | 10 ----
 src/identification/xtandem_identification.nf  |  8 ---
 .../convert_and_enhance_psm_tsv.nf            |  5 --
 src/postprocessing/ms2rescore.nf              |  9 ---
 src/postprocessing/oktoberfest.nf             |  9 ---
 src/postprocessing/percolator.nf              |  4 --
 src/preprocess/convert_to_mzml.nf             |  2 -
 src/preprocess/create_decoy_database.nf       |  4 --
 src/preprocess/create_entrapment_database.nf  |  2 -
 13 files changed, 58 insertions(+), 86 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 0791197..c54cd76 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -8,12 +8,17 @@ params {
   fasta_target_decoy = ''
   entrapment_fold = 0
   keep_mzmls = true
+  tdf2mzml_threads = 8
+  decoy_database_threads = 4
+  fdrbench_mem_gb = 16
 
   // search specific parameters
   precursor_tol_ppm = 10
   fragment_tol_da = 0.02
   is_timstof = false
   use_only_rank1_psms = true
+  convert_psm_tsv_mem = '60.GB'
+  enhance_psm_tsv_mem = '8.GB'
 
   execute_comet = true
   comet_params_file = "${baseDir}/config/comet.params"
@@ -33,18 +38,71 @@ params {
 
   execute_msamanda = true
   msamanda_config_file = "${baseDir}/config/msamanda_settings.xml"
+  msamanda_threads = 16
+  msamanda_mem = '64.GB'
+  msamanda_psm_id_pattern = '(.*)'
+  msamanda_spectrum_id_pattern = '(.*)'
+  msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
   execute_msfragger = true
   msfragger_config_file = "${baseDir}/config/closed_fragger.params"
+  msfragger_threads = 16
+  msfragger_mem_gb = 16
+  msfragger_db_split = 0
+  msfragger_calibrate = 2
+  msfragger_psm_id_pattern = '(.*)'
+  msfragger_spectrum_id_pattern = '(.*)'
+  msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
   execute_msgfplus = true
   msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt"
+  msgfplus_threads = 6
+  msgfplus_mem_gb = 16
+  msgfplus_tasks = 0
+  msgfplus_instrument = 1 // 0: Low-res LCQ/LTQ, 1: Orbitrap/FTICR/Lumos, 2: TOF, 3: Q-Exactive
+  msgfplus_split_input = 10000     // split input mzMLs into chunks of this size, 0 to disable
+  msgfplus_merge_mem_gb = 16       // memory for merging PSMs, used in merge_psms process
+  msgfplus_split_fasta = 0         // split the fasta into this many chunks, 0 to disable
+  msgfplus_psm_id_pattern = '(.*)'
+  msgfplus_spectrum_id_pattern = '(.*)'
+  msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
   execute_sage = true
   sage_config_file = "${baseDir}/config/sage_config.json"
+  sage_threads = 16
+  sage_mem = '128.GB'
+  sage_prefilter = 'false'
+  sage_prefilter_chunk_size = 0
+  sage_psm_id_pattern = '(.*)'
+  sage_spectrum_id_pattern = '(.*)'
+  sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
   execute_xtandem = true
   xtandem_config_file = "${baseDir}/config/xtandem_input.xml"  
+  xtandem_threads = 16
+  xtandem_mem = '128.GB'
+  xtandem_psm_id_pattern = '(.*)'
+  xtandem_spectrum_id_pattern = '(.*)'
+  xtandem_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
+
+  // parameters for ms2rescore
+  ms2rescore_threads = 4
+  ms2rescore_mem = '64.GB'
+  ms2rescore_model = 'HCD'
+  ms2rescore_chunk_size = 100000
+  ms2pip_model_dir = './ms2pip-model'
+
+  // parameters for oktoberfest
+  oktoberfest_memory = '64.GB'
+  oktoberfest_to_pin_memory = '4.GB'
+  oktoberfest_intensity_model = 'Prosit_2020_intensity_HCD'
+  oktoberfest_irt_model = 'Prosit_2019_irt'
+  oktoberfest_forks = 1 // have some mercy with the koina servers
+
+  // number of threads used by percolator
+  percolator_threads = 4
+  percolator_mem = '4.GB'
+
 }
 
 manifest {
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index 2812daa..c2817f6 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -1,11 +1,3 @@
-// number of threads used by msamanda
-params.msamanda_threads = 16
-params.msamanda_mem = "64 GB"
-
-params.msamanda_psm_id_pattern = "(.*)"
-params.msamanda_spectrum_id_pattern = '(.*)'
-params.msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index 408dfc7..3e412e5 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -1,13 +1,3 @@
-// parameters for MSFragger
-params.msfragger_threads = 16
-params.msfragger_mem_gb = 16
-params.msfragger_db_split = 0
-params.msfragger_calibrate = 2
-
-params.msfragger_psm_id_pattern = "(.*)"
-params.msfragger_spectrum_id_pattern = "(.*)"
-params.msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index c7c8c4b..342c795 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -1,18 +1,3 @@
-// params for MS-GF+
-params.msgfplus_threads = 6
-params.msgfplus_mem_gb = 16
-params.msgfplus_tasks = 0
-
-params.msgfplus_instrument = "1" // 0: Low-res LCQ/LTQ, 1: Orbitrap/FTICR/Lumos, 2: TOF, 3: Q-Exactive
-
-params.msgfplus_split_input = 10000     // split input mzMLs into chunks of this size, 0 to disable
-params.msgfplus_merge_mem_gb = 16       // memory for merging PSMs, used in merge_psms process
-params.msgfplus_split_fasta = 0         // split the fasta into this many chunks, 0 to disable
-
-params.msgfplus_psm_id_pattern = "(.*)"
-params.msgfplus_spectrum_id_pattern = '(.*)'
-params.msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
-
 include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index 96d8c15..ff89a90 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -1,13 +1,3 @@
-// number of threads used by sage
-params.sage_threads = 16
-params.sage_mem = "128 GB"
-params.sage_prefilter = "false"
-params.sage_prefilter_chunk_size = 0
-
-params.sage_psm_id_pattern = "(.*)"
-params.sage_spectrum_id_pattern = '(.*)'
-params.sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 8354144..38494fa 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -1,11 +1,3 @@
-// number of threads used by xtandem
-params.xtandem_threads = 16
-params.xtandem_mem = "128 GB"
-
-params.xtandem_psm_id_pattern = "(.*)"
-params.xtandem_spectrum_id_pattern = '(.*)'
-params.xtandem_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
-
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
diff --git a/src/postprocessing/convert_and_enhance_psm_tsv.nf b/src/postprocessing/convert_and_enhance_psm_tsv.nf
index 11dfa1a..4d0b487 100644
--- a/src/postprocessing/convert_and_enhance_psm_tsv.nf
+++ b/src/postprocessing/convert_and_enhance_psm_tsv.nf
@@ -1,8 +1,3 @@
-nextflow.enable.dsl=2
-
-params.convert_psm_tsv_mem = "60 GB"
-params.enhance_psm_tsv_mem = "8 GB"
-
 /**
  * Executes postprocessing steps to enhance the psm_utils TSV and prepare the PIN files
  *
diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf
index 8965ae7..0dc2282 100644
--- a/src/postprocessing/ms2rescore.nf
+++ b/src/postprocessing/ms2rescore.nf
@@ -1,12 +1,3 @@
-nextflow.enable.dsl=2
-
-// parameters for ms2rescore
-params.ms2rescore_threads = 4
-params.ms2rescore_mem = "64 GB"
-params.ms2rescore_model = "HCD"
-params.ms2rescore_chunk_size = 100000
-params.ms2pip_model_dir = "./ms2pip-model"
-
 workflow ms2rescore_workflow {
     take:
     psm_tsvs_and_mzmls
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index d266d87..59f413e 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -1,12 +1,3 @@
-nextflow.enable.dsl=2
-
-// parameters for oktoberfest
-params.oktoberfest_memory = "64.GB"
-params.oktoberfest_to_pin_memory = "4.GB"
-params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD"
-params.oktoberfest_irt_model = "Prosit_2019_irt"
-params.oktoberfest_forks = 1 // have some mercy with the koina servers
-
 /**
  * Runs oktoberfest rescoring for the given PSMs and mzML files.
  * 
diff --git a/src/postprocessing/percolator.nf b/src/postprocessing/percolator.nf
index f32f61a..39e1cee 100644
--- a/src/postprocessing/percolator.nf
+++ b/src/postprocessing/percolator.nf
@@ -1,7 +1,3 @@
-// number of threads used by percolator
-params.percolator_threads = 4
-params.percolator_mem = "4 GB"
-
 /**
  * Executes percolator for the given PIN files
  *
diff --git a/src/preprocess/convert_to_mzml.nf b/src/preprocess/convert_to_mzml.nf
index 01ea3fe..1f0baf2 100644
--- a/src/preprocess/convert_to_mzml.nf
+++ b/src/preprocess/convert_to_mzml.nf
@@ -1,5 +1,3 @@
-params.tdf2mzml_threads = 8
-
 workflow convert_to_mzml {
     take:
     input_path
diff --git a/src/preprocess/create_decoy_database.nf b/src/preprocess/create_decoy_database.nf
index e20023e..3cccd2c 100644
--- a/src/preprocess/create_decoy_database.nf
+++ b/src/preprocess/create_decoy_database.nf
@@ -1,7 +1,3 @@
-// number of threads used by maxquant
-params.decoy_database_threads = 4
-
-
 /**
  * Creates a concatenated target-decoy database
  *
diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf
index 6d7759d..41b9959 100644
--- a/src/preprocess/create_entrapment_database.nf
+++ b/src/preprocess/create_entrapment_database.nf
@@ -1,5 +1,3 @@
-params.fdrbench_mem_gb = 16
-
 /**
  * Adds decoys and/or entapments to the FASTA file.
  *

From 3cff3cbe4a3582c3b135f5e60bed9e5b9738fe39 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Thu, 11 Sep 2025 09:20:19 +0000
Subject: [PATCH 09/13] cleaning of entrapment DBs for rare empty proteins

---
 src/preprocess/create_entrapment_database.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf
index 41b9959..637ec53 100644
--- a/src/preprocess/create_entrapment_database.nf
+++ b/src/preprocess/create_entrapment_database.nf
@@ -45,6 +45,7 @@ process call_entrapment_database {
     """
     java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check
     # 'Reheader' to add entrapment index to database and accession part of the header
-    sed -r -i "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" ${fasta.baseName}-entrapment.fasta
+    # and remove empty entrapment sequences (which can appear if the original sequence has many Xs)
+    sed -r -i -e "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" -e '\$!N;/>.*\\n\$/d;P;D'  ${fasta.baseName}-entrapment.fasta
     """
 }

From 803f103b9646aa8d0eb69d5320752500c33f140b Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Thu, 18 Sep 2025 15:42:39 +0000
Subject: [PATCH 10/13] updating docker name to match repo

---
 nextflow.config | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index c54cd76..fb77fc2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -102,7 +102,6 @@ params {
   // number of threads used by percolator
   percolator_threads = 4
   percolator_mem = '4.GB'
-
 }
 
 manifest {
@@ -139,7 +138,7 @@ profiles {
 
     process {
       withLabel: python_image {
-        container = 'ghcr.io/medbioinf/pipeline-of-identification:latest'
+        container = 'ghcr.io/medbioinf/mspepid:latest'
       }
 
       withLabel: comet_image {

From 4a3ff3778f32aa645a0a77ad3d5263cff5420236 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Thu, 18 Sep 2025 16:21:37 +0000
Subject: [PATCH 11/13] Updated schema for parameters

---
 nextflow.config      |   2 +-
 nextflow_schema.json | 326 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 325 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index fb77fc2..e8fae22 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -17,7 +17,7 @@ params {
   fragment_tol_da = 0.02
   is_timstof = false
   use_only_rank1_psms = true
-  convert_psm_tsv_mem = '60.GB'
+  convert_psm_tsv_mem = '32.GB'
   enhance_psm_tsv_mem = '8.GB'
 
   execute_comet = true
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 187d157..6b7a237 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -41,15 +41,30 @@
           "format": "file-path",
           "mimetype": "text/fasta"
         },
+        "decoy_database_threads": {
+          "type": "integer",
+          "default": 4,
+          "description": "Number of allowed threads / CPUs for creation of the target-decoy database"
+        },
         "entrapment_fold": {
           "type": "integer",
           "default": 0,
           "description": "This parameter specifies for an entrapment search the fold of entrapment proteins per target protein. No database will be created if 0."
         },
+        "fdrbench_mem_gb": {
+          "type": "integer",
+          "default": 16,
+          "description": "Amount of allowed memory in GB for the creation of the entrapment database"
+        },
         "keep_mzmls": {
           "type": "boolean",
           "default": true,
           "description": "Whether the converted mzML files should be kept in the results folder."
+        },
+        "tdf2mzml_threads": {
+          "type": "integer",
+          "default": 8,
+          "description": "Number of allowed threads / CPUs for conversion of .d to mzML using tdf2mzml"
         }
       }
     },
@@ -154,7 +169,7 @@
         "comet_mem": {
           "type": "string",
           "default": "8.GB",
-          "description": "Number of allowed memory for Comet"
+          "description": "Amount of allowed memory for Comet"
         },
         "comet_psm_id_pattern": {
           "type": "string",
@@ -197,7 +212,7 @@
         "maxquant_mem": {
           "type": "string",
           "default": "32.GB",
-          "description": "Number of allowed memory for MaxQuant"
+          "description": "Amount of allowed memory for MaxQuant"
         },
         "maxquant_psm_id_pattern": {
           "type": "string",
@@ -212,6 +227,295 @@
           "description": "Regular expression to parse the PSM ID for Oktoberfest"
         }
       }
+    },
+    "msamanda_parameters": {
+      "title": "MSAmanda parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "msamanda_threads": {
+          "type": "integer",
+          "default": 16,
+          "description": "Number of allowed threads / CPUs for MSAmanda"
+        },
+        "msamanda_mem": {
+          "type": "string",
+          "default": "64.GB",
+          "description": "Amount of allowed memory for MSAmanda"
+        },
+        "msamanda_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "msamanda_spectrum_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "msamanda_scan_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "ms_fragger_parameters": {
+      "title": "MS Fragger parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "msfragger_threads": {
+          "type": "integer",
+          "default": 16,
+          "description": "Number of allowed threads / CPUs for MS Fragger"
+        },
+        "msfragger_mem_gb": {
+          "type": "integer",
+          "default": 16,
+          "description": "Amount of allowed memory in GB for MS Fragger"
+        },
+        "msfragger_db_split": {
+          "type": "integer",
+          "default": 0,
+          "description": "Number of splits for the sequence database to save memory. 0 means no split. You should set msfragger_calibrate to 0 for larger numbers."
+        },
+        "msfragger_calibrate": {
+          "type": "integer",
+          "default": 2,
+          "description": "Parameter \"calibrate_mass\" for MS Fragger. Perform mass calibration (0 for OFF, 1 for ON, 2 for ON and find optimal parameters, 4 for ON and find the optimal fragment mass tolerance)."
+        },
+        "msfragger_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "msfragger_spectrum_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "msfragger_scan_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "ms_gf_parameters": {
+      "title": "MS-GF+ parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "msgfplus_threads": {
+          "type": "integer",
+          "default": 6,
+          "description": "Number of allowed threads / CPUs for MS-GF+"
+        },
+        "msgfplus_mem_gb": {
+          "type": "integer",
+          "default": 16,
+          "description": "Amount of allowed memory in GB for MS-GF+"
+        },
+        "msgfplus_tasks": {
+          "type": "integer",
+          "default": 0,
+          "description": "MS-GF+ parameter tasks. Please refer to the MS-GF+ documentation for more help."
+        },
+        "msgfplus_instrument": {
+          "type": "integer",
+          "default": 1,
+          "description": "0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra; 1 means High-res LTQ (Default for HCD; also appropriate for high res CID); use InstrumentID=1 for Orbitrap, Lumos, and QEHFX instruments; 2 means TOF; 3 means Q-Exactive",
+          "minimum": 0,
+          "maximum": 3
+        },
+        "msgfplus_split_input": {
+          "type": "integer",
+          "default": 10000,
+          "description": "The input file will be split into chunks of the given number of MS2 spectra. Splitting greatly enhances speed and reduces memory consumption."
+        },
+        "msgfplus_merge_mem_gb": {
+          "type": "integer",
+          "default": 16,
+          "description": "Amount of allowed memory in GB to merge the results after a database split"
+        },
+        "msgfplus_split_fasta": {
+          "type": "integer",
+          "default": 0,
+          "description": "Number of splits for the sequence database to save memory and decrease the search time. 0 means no split."
+        },
+        "msgfplus_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "msgfplus_spectrum_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "msgfplus_scan_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "new_group_2": {
+      "title": "Sage parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "sage_threads": {
+          "type": "integer",
+          "default": 16,
+          "description": "Number of allowed threads / CPUs for Sage"
+        },
+        "sage_mem": {
+          "type": "string",
+          "default": "128.GB",
+          "description": "Amount of allowed memory for Sage"
+        },
+        "sage_prefilter": {
+          "type": "boolean",
+          "description": "Whether the database should be split during Sage search. This greatly reduces the memory consumption when using large databases (beta feature)"
+        },
+        "sage_prefilter_chunk_size": {
+          "type": "integer",
+          "default": 0,
+          "description": "Number of proteins per database chunk when prefiltering is active."
+        },
+        "sage_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "sage_spectrum_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "sage_scan_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "new_group_3": {
+      "title": "X!Tandem parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "xtandem_threads": {
+          "type": "integer",
+          "default": 16,
+          "description": "Number of allowed threads / CPUs for X!Tandem"
+        },
+        "xtandem_mem": {
+          "type": "string",
+          "default": "128.GB",
+          "description": "Amount of allowed memory for X!Tandem"
+        },
+        "xtandem_psm_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the PSM ID for MS2Rescore"
+        },
+        "xtandem_spectrum_id_pattern": {
+          "type": "string",
+          "default": "(.*)",
+          "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore"
+        },
+        "xtandem_scan_id_pattern": {
+          "type": "string",
+          "default": ".*scan=(?P<scan_id>\\\\d+)$",
+          "description": "Regular expression to parse the PSM ID for Oktoberfest"
+        }
+      }
+    },
+    "new_group_1": {
+      "title": "Post-processing parameters",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "properties": {
+        "convert_psm_tsv_mem": {
+          "type": "string",
+          "default": "32.GB",
+          "description": "Amount of allowed memory for converting results into psm-utils format"
+        },
+        "enhance_psm_tsv_mem": {
+          "type": "string",
+          "default": "8.GB",
+          "description": "Amount of allowed memory for enhancing psm-utils results with additional information"
+        },
+        "ms2rescore_threads": {
+          "type": "integer",
+          "default": 4,
+          "description": "Number of allowed threads / CPUs for MS2Rescore"
+        },
+        "ms2rescore_mem": {
+          "type": "string",
+          "default": "64.GB",
+          "description": "Amount of allowed memory for MS2Rescore"
+        },
+        "ms2rescore_model": {
+          "type": "string",
+          "default": "HCD",
+          "description": "The model used for MS2Rescore"
+        },
+        "ms2rescore_chunk_size": {
+          "type": "integer",
+          "default": 100000,
+          "description": "Chunk size (number of PSMs) processed per MS2Rescore instance. Reduce to decrease memory consumption."
+        },
+        "ms2pip_model_dir": {
+          "type": "string",
+          "default": "./ms2pip-model",
+          "description": "Path to the directory in which the MS2PIP model is stored / looked up"
+        },
+        "oktoberfest_memory": {
+          "type": "string",
+          "default": "64.GB",
+          "description": "Amount of allowed memory for Oktoberfest feature generation"
+        },
+        "oktoberfest_to_pin_memory": {
+          "type": "string",
+          "default": "4.GB",
+          "description": "Amount of allowed memory for Oktoberfest conversion to PIN"
+        },
+        "oktoberfest_intensity_model": {
+          "type": "string",
+          "default": "Prosit_2020_intensity_HCD",
+          "description": "The intensity model used by Oktoberfest"
+        },
+        "oktoberfest_irt_model": {
+          "type": "string",
+          "default": "Prosit_2019_irt",
+          "description": "The iRT model used by Oktoberfest"
+        },
+        "oktoberfest_forks": {
+          "type": "integer",
+          "default": 1,
+          "description": "Number of forks / parallel instances allowed for Oktoberfest"
+        },
+        "percolator_threads": {
+          "type": "integer",
+          "default": 4,
+          "description": "Number of allowed threads / CPUs for Percolator"
+        },
+        "percolator_mem": {
+          "type": "string",
+          "default": "4.GB",
+          "description": "Amount of allowed memory to run Percolator"
+        }
+      }
     }
   },
   "allOf": [
@@ -226,6 +530,24 @@
     },
     {
       "$ref": "#/$defs/maxquant_parameters"
+    },
+    {
+      "$ref": "#/$defs/msamanda_parameters"
+    },
+    {
+      "$ref": "#/$defs/ms_fragger_parameters"
+    },
+    {
+      "$ref": "#/$defs/ms_gf_parameters"
+    },
+    {
+      "$ref": "#/$defs/new_group_2"
+    },
+    {
+      "$ref": "#/$defs/new_group_3"
+    },
+    {
+      "$ref": "#/$defs/new_group_1"
     }
   ]
 }

From 3f2d3a8ceea8f3180a097028298117aabca68fc6 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Fri, 10 Oct 2025 08:59:30 +0000
Subject: [PATCH 12/13] fixing absolute path in schema default

---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6b7a237..7acf8eb 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -140,7 +140,7 @@
         },
         "xtandem_config_file": {
           "type": "string",
-          "default": "/mnt/data/projects/pipeline-of-identification/config/xtandem_input.xml",
+          "default": "${projectDir}/config/xtandem_input.xml",
           "description": "Path to the X!Tandem params file for additional, search engine specific settings."
         }
       }

From 7b3bdf28abb7351d9df66e12cf8793b67920da05 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Fri, 10 Oct 2025 09:02:08 +0000
Subject: [PATCH 13/13] documentation for complex entrapment sed command

---
 src/preprocess/create_entrapment_database.nf | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf
index 637ec53..60344c0 100644
--- a/src/preprocess/create_entrapment_database.nf
+++ b/src/preprocess/create_entrapment_database.nf
@@ -46,6 +46,18 @@ process call_entrapment_database {
     java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check
     # 'Reheader' to add entrapment index to database and accession part of the header
     # and remove empty entrapment sequences (which can appear if the original sequence has many Xs)
+    # The following sed command performs two operations:
+    # 1. Substitutes FASTA headers to include the entrapment index in both the database and accession parts.
+    #    Regex breakdown:
+    #      ^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$
+    #        - Matches headers starting with '>ENTRAPMENT_' followed by three fields separated by '|', with the last field ending in '_[number]'.
+    #      >ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4
+    #        - Rewrites the header to include the entrapment index (\\4) in both the database and accession parts.
+    # 2. Removes empty entrapment sequences (headers followed by an empty line).
+    #    Control flow:
+    #      \$!N;/>.*\\n\$/d;P;D
+    #        - Keeps a sliding two-line window (N appends the next line, P prints and D drops the first line); if a header is followed by an empty line, deletes both.
+
     sed -r -i -e "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" -e '\$!N;/>.*\\n\$/d;P;D'  ${fasta.baseName}-entrapment.fasta
     """
 }