From 7b778443b14bfdd87c061d406af3eeacd2013a4c Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 13:15:10 +0000 Subject: [PATCH 01/13] download ms2pip models to correct folder --- bin/chunked_ms2rescore.py | 2 +- bin/ms2rescore_check_or_download_model.py | 33 +++++++++++++++++++++++ src/postprocessing/ms2rescore.nf | 27 +++++++++++++++++-- 3 files changed, 59 insertions(+), 3 deletions(-) create mode 100755 bin/ms2rescore_check_or_download_model.py diff --git a/bin/chunked_ms2rescore.py b/bin/chunked_ms2rescore.py index 14dbea9..e22eb86 100755 --- a/bin/chunked_ms2rescore.py +++ b/bin/chunked_ms2rescore.py @@ -16,7 +16,7 @@ def argparse_setup(): parser.add_argument("-spectra", help="Corresponding mzML file or .d path for PSMs file", required=True, type=str) parser.add_argument("-model", help="Model for MS2PIP", default="HCD", type=str) - parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="/mnt/data/ms2pip-model", type=str) + parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="./ms2pip-model", type=str) parser.add_argument("-ms2_tolerance", help="The MS2/fragment tolerance", default=0.02, type=float) parser.add_argument("-spectrum_id_pattern", help="The spectrum ID pattern to correspond PSMs to spectra", default="(.*)", type=str) parser.add_argument("-processes", help="Number of processes / threads to use", default=8, type=int) diff --git a/bin/ms2rescore_check_or_download_model.py b/bin/ms2rescore_check_or_download_model.py new file mode 100755 index 0000000..696055a --- /dev/null +++ b/bin/ms2rescore_check_or_download_model.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +import argparse +import logging + +from ms2pip.constants import MODELS +from ms2pip._utils.xgb_models import validate_requested_xgb_model + + +def argparse_setup(): + parser = argparse.ArgumentParser() + parser.add_argument("-ms2pip_model", help="Model for MS2PIP", default="HCD", type=str) + 
parser.add_argument("-model_dir", help="Directory to store/find MS2PIP model", default="./ms2pip-model", type=str) + + return parser.parse_args() + +if __name__ == "__main__": + args = argparse_setup() + logging.basicConfig(level=logging.INFO) + + ms2pip_model = args.ms2pip_model + model_dir = args.model_dir + + # Validate / download requested model + if ms2pip_model in MODELS.keys(): + print(f"Checking {ms2pip_model} model") + if "xgboost_model_files" in MODELS[ms2pip_model].keys(): + validate_requested_xgb_model( + MODELS[ms2pip_model]["xgboost_model_files"], + MODELS[ms2pip_model]["model_hash"], + model_dir, + ) + \ No newline at end of file diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf index a9c3ffd..c13d701 100644 --- a/src/postprocessing/ms2rescore.nf +++ b/src/postprocessing/ms2rescore.nf @@ -5,6 +5,7 @@ params.ms2rescore_threads = 4 params.ms2rescore_mem = "64 GB" params.ms2rescore_model = "HCD" params.ms2rescore_chunk_size = 100000 +params.ms2pip_model_dir = "./ms2pip-model" workflow ms2rescore_workflow { take: @@ -15,7 +16,10 @@ workflow ms2rescore_workflow { searchengine main: - ms2rescore_pre_pins = run_chunked_ms2rescore(psm_tsvs_and_mzmls, psm_tsvs, mzmls, spectrum_id_pattern, params.fragment_tol_da) + ms2pip_model_dir = Channel.fromPath(params.ms2pip_model_dir, type: 'dir').first() + check_or_download_model(ms2pip_model_dir, params.ms2rescore_model) + + ms2rescore_pre_pins = run_chunked_ms2rescore(psm_tsvs_and_mzmls, psm_tsvs, mzmls, spectrum_id_pattern, params.fragment_tol_da, ms2pip_model_dir) ms2rescore_pins = correct_psm_utils_pins(ms2rescore_pre_pins, searchengine) emit: @@ -36,6 +40,7 @@ process run_chunked_ms2rescore { path mzmls val spectrum_id_pattern val fragment_tol_da + path ms2pip_model_dir output: path "*.ms2rescore.pin", emit: features_file @@ -44,7 +49,7 @@ process run_chunked_ms2rescore { """ chunked_ms2rescore.py -psms_file ${psm_utils_tsvs} \ -spectra ${mzml_for_psms} \ - -model 
${params.ms2rescore_model} -model_dir "/mnt/data/ms2pip-model" \ + -model ${params.ms2rescore_model} -model_dir "${ms2pip_model_dir}" \ -ms2_tolerance ${fragment_tol_da} \ -spectrum_id_pattern '${spectrum_id_pattern}' \ -processes ${params.ms2rescore_threads} \ @@ -75,3 +80,21 @@ process correct_psm_utils_pins { awk '{FS="\t";OFS="\t"; if (NR>1) { \$3=\$1; \$1=NR-1; gsub(".*=", "", \$3) } print}' ${psm_utils_pins} > ${psm_utils_pins.baseName}.corrected.pin """ } + + +process check_or_download_model { + cpus 1 + memory '2 GB' + maxForks 1 + + container { params.python_image } + + input: + path model_dir + val ms2rescore_model + + script: + """ + ms2rescore_check_or_download_model.py -ms2pip_model ${ms2rescore_model} -model_dir "${model_dir}" + """ +} \ No newline at end of file From dd36a203f9eedfa95af1b4cb629f09afdb525556 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 13:38:18 +0000 Subject: [PATCH 02/13] different memory params for oktoberfest tasks --- src/postprocessing/oktoberfest.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index cc87213..b362551 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -1,7 +1,8 @@ nextflow.enable.dsl=2 // parameters for oktoberfest -params.oktoberfest_memory = "64 GB" +params.oktoberfest_memory = "64.GB" +params.oktoberfest_to_pin_memory = "4.GB" params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" params.oktoberfest_irt_model = "Prosit_2019_irt" params.oktoberfest_forks = 1 // have some mercy with the koina servers @@ -85,7 +86,7 @@ process run_oktoberfest_feature_gen { */ process oktoberfest_features_to_pin { cpus 1 - memory { params.oktoberfest_memory } + memory { params.oktoberfest_to_pin_memory } container { params.oktoberfest_image } From 9dabbe9bba0c32c777dd0b83a34c7260a476fb47 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 13:45:34 +0000 Subject: 
[PATCH 03/13] parameter for memory consumption of fdrbench --- src/preprocess/create_entrapment_database.nf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf index 257185a..1bf6d3c 100644 --- a/src/preprocess/create_entrapment_database.nf +++ b/src/preprocess/create_entrapment_database.nf @@ -2,6 +2,8 @@ nextflow.enable.dsl=2 params.fdrbench_image = 'quay.io/medbioinf/fdrbench-nightly:146f77' +params.fdrbench_mem_gb = 16 + /** * Adds decoys and/or entapments to the FASTA file. @@ -16,7 +18,7 @@ workflow create_entrapment_database { fold main: - entrapment_fasta = call_entrapment_database(fasta, fold) + entrapment_fasta = call_entrapment_database(fasta, fold, params.fdrbench_mem_gb) emit: entrapment_fasta @@ -34,18 +36,21 @@ workflow create_entrapment_database { */ process call_entrapment_database { cpus 1 + memory "${ memory_limit }.GB" + container { params.fdrbench_image } input: path fasta val fold + val memory_limit output: path "${fasta.baseName}-entrapment.fasta" script: """ - java -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check + java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check # 'Reheader' to add entrapment index to database and accession part of the header sed -r -i "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" ${fasta.baseName}-entrapment.fasta """ From 3bafb1d87104a6f35303c949d5bc3925b2423d86 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 14:03:05 +0000 Subject: [PATCH 04/13] info for ms2pip model download --- src/postprocessing/ms2rescore.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf index c13d701..e9d833d 100644 --- a/src/postprocessing/ms2rescore.nf +++ b/src/postprocessing/ms2rescore.nf @@ -85,7 +85,7 @@ process correct_psm_utils_pins { process check_or_download_model { cpus 1 memory '2 GB' - maxForks 1 + maxForks 1 // this makes sure that the download is only performed once, not more in parallel container { params.python_image } From ac8af461d3c63e1e2463e3de9f2af5afde428da5 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 14:44:53 +0000 Subject: [PATCH 05/13] container images as labels in config --- main.nf | 37 ----- nextflow.config | 138 +++++++++++++++++- src/identification/comet_identification.nf | 10 +- src/identification/maxquant_identification.nf | 7 +- src/identification/msamanda_identification.nf | 7 +- .../msfragger_identification.nf | 10 +- src/identification/msgfplus_identification.nf | 20 +-- src/identification/sage_identification.nf | 13 +- src/identification/xtandem_identification.nf | 10 +- .../convert_and_enhance_psm_tsv.nf | 9 +- src/postprocessing/ms2rescore.nf | 7 +- src/postprocessing/oktoberfest.nf | 4 +- src/postprocessing/percolator.nf | 7 +- src/preprocess/convert_to_mzml.nf | 16 +- src/preprocess/create_decoy_database.nf | 8 +- src/preprocess/create_entrapment_database.nf | 7 +- 16 files changed, 189 insertions(+), 121 deletions(-) diff --git a/main.nf b/main.nf index 003c608..d4409e4 100644 --- a/main.nf +++ b/main.nf @@ -2,43 +2,6 @@ // Nextflow pipeline for peptide identification with multiple search engines and post-processing tools // -// default python image -params.python_image = 'ghcr.io/medbioinf/pipeline-of-identification:latest' -params.oktoberfest_image = 'medbioinf/oktoberfest' - -// parameters set by the command line -params.raw_files = '' -params.mzml_files = '' // may contain globs -params.fasta = '' -params.fasta_target_decoy = '' -params.precursor_tol_ppm = 10 -params.fragment_tol_da = 0.02 -params.is_timstof 
= false -params.entrapment_fold = 0 -params.use_only_rank1_psms = true - -// keep the (converted) mzML files -params.keep_mzmls = true - -// should the search engines be executed? -params.execute_comet = true -params.execute_maxquant = true -params.execute_msamanda = true -params.execute_msfragger = true -params.execute_msgfplus = true -params.execute_sage = true -params.execute_xtandem = true - -// default parameter files -params.outdir = './' -params.comet_params_file = "${baseDir}/config/comet.params" -params.maxquant_params_file = "${baseDir}/config/mqpar.xml" -params.msamanda_config_file = "${baseDir}/config/msamanda_settings.xml" -params.msfragger_config_file = "${baseDir}/config/closed_fragger.params" -params.msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt" -params.sage_config_file = "${baseDir}/config/sage_config.json" -params.xtandem_config_file = "${baseDir}/config/xtandem_input.xml" - // including modules include {create_entrapment_database} from "./src/preprocess/create_entrapment_database.nf" include {create_decoy_database} from "./src/preprocess/create_decoy_database.nf" diff --git a/nextflow.config b/nextflow.config index 3b375e3..54e5418 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,9 +1,135 @@ -workflow.output.mode = "copy" +// Global default params, used in configs +params { + // parameters set by the command line + raw_files = '' + mzml_files = '' // may contain globs + fasta = '' + fasta_target_decoy = '' + precursor_tol_ppm = 10 + fragment_tol_da = 0.02 + is_timstof = false + entrapment_fold = 0 + use_only_rank1_psms = true + + // keep the (converted) mzML files + keep_mzmls = true + + // should the search engines be executed? 
+ execute_comet = true + execute_maxquant = true + execute_msamanda = true + execute_msfragger = true + execute_msgfplus = true + execute_sage = true + execute_xtandem = true + + // default parameter files + outdir = './' + comet_params_file = "${baseDir}/config/comet.params" + maxquant_params_file = "${baseDir}/config/mqpar.xml" + msamanda_config_file = "${baseDir}/config/msamanda_settings.xml" + msfragger_config_file = "${baseDir}/config/closed_fragger.params" + msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt" + sage_config_file = "${baseDir}/config/sage_config.json" + xtandem_config_file = "${baseDir}/config/xtandem_input.xml" +} + +manifest { + name = 'mpc-bioinformatics/McQuaC' + contributors = [ + [ + name: 'Julian Uszkoreit', + affiliation: 'Ruhr University Bochum, Medical Bioinformatics', + github: '@julianu', + contribution: ['author', 'maintainer', 'contributor'], + orcid: '0000-0001-7522-4007', + ], + [ + name: 'Dirk Winkelhardt', + affiliation: 'Ruhr University Bochum, Medical Bioinformatics', + github: '@di-hardt', + contribution: ['author', 'maintainer', 'contributor'], + orcid: '0000-0001-8770-2221', + ], + ] + homePage = 'https://github.com/medbioinf/pipeline-of-identification' + description = """A pipeline for the identification of peptides from mass spectrometry data, integrating multiple search engines and post-processing tools.""" + mainScript = 'main.nf' + defaultBranch = 'main' + nextflowVersion = '!>=24.10.6' + version = '0.1.0' +} profiles { - docker { - docker.enabled = true - docker.runOptions = "--user=root" - docker.fixOwnership = true + docker { + docker.enabled = true + docker.runOptions = "--user=root" + docker.fixOwnership = true + + process { + withLabel: python_image { + container = 'ghcr.io/medbioinf/pipeline-of-identification:latest' + } + + withLabel: comet_image { + container = 'quay.io/medbioinf/comet-ms:v2024.01.0' + } + + withLabel: maxquant_image { + container = 'quay.io/medbioinf/maxquant:2.6.3.0' + } + + 
withLabel: msamanda_image { + container = 'quay.io/medbioinf/msamanda:3.0.22.071' + } + + withLabel: msfragger_image { + container = 'medbioinf/msfragger' + } + + withLabel: msgfplus_image { + container = 'quay.io/medbioinf/msgfplus:v2024.03.26' + } + + withLabel: mzidmerger_image { + container = 'quay.io/medbioinf/mzid-merger:1.4.26' + } + + withLabel: sage_image { + container = 'quay.io/medbioinf/sage:v0.15.0-beta.1' + } + + withLabel: xtandem_image { + container = 'quay.io/medbioinf/xtandem:2017.2.1.4' + } + + withLabel: percolator_image { + container = 'ghcr.io/percolator/percolator:branch-3-08' + } + + withLabel: msconvert_image { + container = 'proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef' + } + + withLabel: tdf2mzml_image { + container = 'quay.io/medbioinf/tdf2mzml:0.4' + } + + withLabel: oktoberfest_image { + container = 'medbioinf/oktoberfest' + } + + withLabel: openms_image { + container = 'quay.io/medbioinf/openms:3.4.1' + } + + withLabel: fdrbench_image { + container = 'quay.io/medbioinf/fdrbench-nightly:146f77' + } } -} \ No newline at end of file + } +} + +plugins { + id 'nf-schema@2.5.0' +} diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index 73aec6d..4905b78 100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.comet_image = 'quay.io/medbioinf/comet-ms:v2024.01.0' - // number of threads used by comet params.comet_threads = 16 params.comet_mem = "8 GB" @@ -51,7 +47,8 @@ workflow comet_identification { process adjust_comet_param_file { cpus 2 memory "1 GB" - container { params.python_image } + + label 'python_image' input: path comet_params_file @@ -87,7 +84,8 @@ process adjust_comet_param_file { process identification_with_comet { cpus { params.comet_threads } memory { params.comet_mem } - container { params.comet_image } + + label 'comet_image' publishDir 
"${params.outdir}/comet", mode: 'copy' diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index 07a87e6..a58af47 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.maxquant_image = 'quay.io/medbioinf/maxquant:2.6.3.0' - // number of threads used by maxquant params.maxquant_threads = 4 params.maxquant_mem = "32 GB" @@ -88,7 +84,8 @@ workflow maxquant_identification { process identification_with_maxquant { cpus { params.maxquant_threads } memory { params.maxquant_mem } - container { params.maxquant_image } + + label 'maxquant_image' publishDir "${params.outdir}/maxquant", mode: 'copy' diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index b746108..2812daa 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.msamanda_image = 'quay.io/medbioinf/msamanda:3.0.22.071' - // number of threads used by msamanda params.msamanda_threads = 16 params.msamanda_mem = "64 GB" @@ -56,7 +52,8 @@ workflow msamanda_identification { process identification_with_msamanda { cpus { params.msamanda_threads } memory { params.msamanda_mem } - container { params.msamanda_image } + + label 'msamanda_image' publishDir "${params.outdir}/msamanda", mode: 'copy' diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index 7c69d57..408dfc7 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.msfragger_image = 'medbioinf/msfragger' - // parameters for MSFragger params.msfragger_threads = 16 params.msfragger_mem_gb = 16 @@ -50,7 +46,8 @@ workflow msfragger_identification { process adjust_msfragger_param_file { cpus 2 
memory "1 GB" - container { params.python_image } + + label 'python_image' input: path fragger_params_file @@ -89,7 +86,8 @@ process adjust_msfragger_param_file { process identification_with_msfragger { cpus { params.msfragger_threads } memory { params.msfragger_mem_gb + " GB" } - container { params.msfragger_image } + + label 'msfragger_image' publishDir "${params.outdir}/msfragger", mode: 'copy' diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index 88b9ea5..c7c8c4b 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -1,8 +1,3 @@ -nextflow.enable.dsl=2 - -params.msgfplus_image = 'quay.io/medbioinf/msgfplus:v2024.03.26' -params.mzidmerger_image = 'quay.io/medbioinf/mzid-merger:1.4.26' - // params for MS-GF+ params.msgfplus_threads = 6 params.msgfplus_mem_gb = 16 @@ -103,7 +98,8 @@ workflow msgfplus_identification { process identification_with_msgfplus { cpus { params.msgfplus_threads } memory { params.msgfplus_mem_gb + " GB" } - container { params.msgfplus_image } + + label 'msgfplus_image' publishDir "${params.outdir}/msgfplus", mode: 'copy', enabled: { publish_results } @@ -136,7 +132,8 @@ process identification_with_msgfplus { process split_fasta { cpus 2 memory "8 GB" - container { params.python_image } + + label 'python_image' input: path fasta @@ -154,7 +151,8 @@ process split_fasta { process build_msgfplus_index { cpus { params.msgfplus_threads } memory { params.msgfplus_mem_gb + " GB" } - container { params.msgfplus_image } + + label 'msgfplus_image' input: path fasta @@ -172,7 +170,8 @@ process build_msgfplus_index { process merge_psms { cpus 2 memory { params.msgfplus_merge_mem_gb + " GB" } - container { params.python_image } + + label 'python_image' input: tuple val(original_mzml_basename), path(psm_tsvs) @@ -190,7 +189,8 @@ process merge_psms { process mzid_merger { cpus 2 memory "8 GB" - container { params.mzidmerger_image } + + 
label 'mzidmerger_image' publishDir "${params.outdir}/msgfplus", mode: 'copy' diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 0e69d79..96d8c15 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.sage_image = 'quay.io/medbioinf/sage:v0.15.0-beta.1' - // number of threads used by sage params.sage_threads = 16 params.sage_mem = "128 GB" @@ -56,7 +52,8 @@ workflow sage_identification { process adjust_sage_config { cpus 2 memory "1 GB" - container { params.python_image } + + label 'python_image' input: path default_config_file @@ -95,7 +92,8 @@ with open("./adjusted_sage_config.json", "w") as outfile: process identification_with_sage { cpus { params.sage_threads } memory { params.sage_mem } - container { params.sage_image } + + label 'sage_image' input: path sage_config_file @@ -115,7 +113,8 @@ process identification_with_sage { process separate_sage_results { cpus 2 memory "1 GB" - container { params.python_image } + + label 'python_image' publishDir "${params.outdir}/sage", mode: 'copy' diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 3f7856a..8354144 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.xtandem_image = 'quay.io/medbioinf/xtandem:2017.2.1.4' - // number of threads used by xtandem params.xtandem_threads = 16 params.xtandem_mem = "128 GB" @@ -60,7 +56,8 @@ workflow xtandem_identification { process create_xtandem_params_files_from_default { cpus 2 memory "1 GB" - container { params.python_image } + + label 'python_image' input: path xtandem_config_file @@ -116,7 +113,8 @@ process create_xtandem_params_files_from_default { process identification_with_xtandem { cpus { params.xtandem_threads } memory { params.xtandem_mem } - container { 
params.xtandem_image } + + label 'xtandem_image' publishDir "${params.outdir}/xtandem", mode: 'copy' diff --git a/src/postprocessing/convert_and_enhance_psm_tsv.nf b/src/postprocessing/convert_and_enhance_psm_tsv.nf index b43a54b..11dfa1a 100644 --- a/src/postprocessing/convert_and_enhance_psm_tsv.nf +++ b/src/postprocessing/convert_and_enhance_psm_tsv.nf @@ -48,7 +48,8 @@ workflow enhance_psm_tsv { process convert_searchengine_to_psm_utils { cpus 2 memory { params.convert_psm_tsv_mem } - container { params.python_image } + + label 'python_image' input: path searchengine_results @@ -66,7 +67,8 @@ process convert_searchengine_to_psm_utils { process convert_chunked_result_to_psm_utils { cpus 2 memory { params.convert_psm_tsv_mem } - container { params.python_image } + + label 'python_image' input: tuple val(original_mzml_basename), path(searchengine_results) @@ -91,7 +93,8 @@ process convert_chunked_result_to_psm_utils { process enhance_psms_and_create_pin { cpus 2 memory { params.enhance_psm_tsv_mem } - container { params.python_image } + + label 'python_image' publishDir "${params.outdir}/${searchengine}", mode: 'copy' diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf index e9d833d..8965ae7 100644 --- a/src/postprocessing/ms2rescore.nf +++ b/src/postprocessing/ms2rescore.nf @@ -31,8 +31,7 @@ process run_chunked_ms2rescore { cpus { params.ms2rescore_threads } memory { params.ms2rescore_mem } - container { params.python_image } - containerOptions { "-v /mnt/data/projects/pipeline-of-identification/bin/ms2pip-model:/mnt/data/ms2pip-model" } + label 'python_image' input: tuple val(psm_utils_tsvs), val(mzml_for_psms) @@ -63,7 +62,7 @@ process correct_psm_utils_pins { cpus 2 memory '8 GB' - container { params.python_image } + label 'python_image' publishDir "${params.outdir}/${searchengine}", mode: 'copy' @@ -87,7 +86,7 @@ process check_or_download_model { memory '2 GB' maxForks 1 // this makes sure that the download is only performed once, 
not more in parallel - container { params.python_image } + label 'python_image' input: path model_dir diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index b362551..d266d87 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -48,7 +48,7 @@ process run_oktoberfest_feature_gen { maxForks params.oktoberfest_forks memory { params.oktoberfest_memory } - container { params.oktoberfest_image } + label 'oktoberfest_image' input: tuple val(psm_utils_tsvs), val(mzml_for_psms) @@ -88,7 +88,7 @@ process oktoberfest_features_to_pin { cpus 1 memory { params.oktoberfest_to_pin_memory } - container { params.oktoberfest_image } + label 'oktoberfest_image' publishDir "${params.outdir}/${searchengine}", mode: 'copy' diff --git a/src/postprocessing/percolator.nf b/src/postprocessing/percolator.nf index 8c227f7..f32f61a 100644 --- a/src/postprocessing/percolator.nf +++ b/src/postprocessing/percolator.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.percolator_image = 'ghcr.io/percolator/percolator:branch-3-08' - // number of threads used by percolator params.percolator_threads = 4 params.percolator_mem = "4 GB" @@ -27,7 +23,8 @@ workflow psm_percolator { process run_percolator { cpus { params.percolator_threads } memory { params.percolator_mem } - container { params.percolator_image } + + label 'percolator_image' publishDir "${params.outdir}/${searchengine}", mode: 'copy' diff --git a/src/preprocess/convert_to_mzml.nf b/src/preprocess/convert_to_mzml.nf index c7a96c2..01ea3fe 100644 --- a/src/preprocess/convert_to_mzml.nf +++ b/src/preprocess/convert_to_mzml.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.msconvert_image = 'proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef' -params.tdf2mzml_image = 'quay.io/medbioinf/tdf2mzml:0.4' params.tdf2mzml_threads = 8 workflow convert_to_mzml { @@ -23,7 +19,8 @@ workflow convert_to_mzml { process convert_thermo_raw { cpus 2 memory "8 GB" - 
container { params.msconvert_image } + + label 'msconvert_image' publishDir "${params.outdir}/mzmls", mode: 'copy', enabled: params.keep_mzmls @@ -42,7 +39,8 @@ process convert_thermo_raw { process convert_bruker_d { cpus { params.tdf2mzml_threads } memory "8 GB" - container { params.tdf2mzml_image } + + label 'tdf2mzml_image' input: path input_d @@ -63,7 +61,8 @@ process convert_bruker_d { process adjust_mzML { cpus 2 memory "8 GB" - container { params.msconvert_image } + + label 'msconvert_image' publishDir "${params.outdir}/mzmls", mode: 'copy', enabled: params.keep_mzmls @@ -94,7 +93,8 @@ process adjust_mzML { process split_mzml_into_chunks { cpus 2 memory "8 GB" - container { params.msconvert_image } + + label 'msconvert_image' input: val chunksize diff --git a/src/preprocess/create_decoy_database.nf b/src/preprocess/create_decoy_database.nf index a82adad..e20023e 100644 --- a/src/preprocess/create_decoy_database.nf +++ b/src/preprocess/create_decoy_database.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -params.openms_image = 'quay.io/medbioinf/openms:3.4.1' - // number of threads used by maxquant params.decoy_database_threads = 4 @@ -26,7 +22,9 @@ workflow create_decoy_database { process call_decoy_database { cpus { params.decoy_database_threads } - container { params.openms_image } + memory '8.GB' + + label 'openms_image' input: path fasta diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf index 1bf6d3c..6d7759d 100644 --- a/src/preprocess/create_entrapment_database.nf +++ b/src/preprocess/create_entrapment_database.nf @@ -1,10 +1,5 @@ -nextflow.enable.dsl=2 - -params.fdrbench_image = 'quay.io/medbioinf/fdrbench-nightly:146f77' - params.fdrbench_mem_gb = 16 - /** * Adds decoys and/or entapments to the FASTA file. 
* @@ -38,7 +33,7 @@ process call_entrapment_database { cpus 1 memory "${ memory_limit }.GB" - container { params.fdrbench_image } + label 'fdrbench_image' input: path fasta From d90a2fbd9d624d1b20b49c548753774a0796b061 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 15:13:05 +0000 Subject: [PATCH 06/13] moving parameters from main and comet to config --- nextflow.config | 37 ++-- nextflow_schema.json | 198 +++++++++++++++++++++ src/identification/comet_identification.nf | 8 - 3 files changed, 218 insertions(+), 25 deletions(-) create mode 100644 nextflow_schema.json diff --git a/nextflow.config b/nextflow.config index 54e5418..4000c0d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,36 +1,39 @@ // Global default params, used in configs params { - // parameters set by the command line - raw_files = '' - mzml_files = '' // may contain globs + // input / output parameters + outdir = './' + raw_files = './msms-files/*.raw' + mzml_files = '' fasta = '' fasta_target_decoy = '' + entrapment_fold = 0 + keep_mzmls = true + + // search specific parameters precursor_tol_ppm = 10 fragment_tol_da = 0.02 is_timstof = false - entrapment_fold = 0 use_only_rank1_psms = true - // keep the (converted) mzML files - keep_mzmls = true - - // should the search engines be executed? 
execute_comet = true - execute_maxquant = true - execute_msamanda = true - execute_msfragger = true - execute_msgfplus = true - execute_sage = true - execute_xtandem = true - - // default parameter files - outdir = './' comet_params_file = "${baseDir}/config/comet.params" + comet_threads = 16 + comet_mem = '8.GB' + comet_psm_id_pattern = '(.*)' + comet_spectrum_id_pattern = '.*scan=(\\d+)$' + comet_scan_id_pattern = '^(?P\\d+)$' + + execute_maxquant = true maxquant_params_file = "${baseDir}/config/mqpar.xml" + execute_msamanda = true msamanda_config_file = "${baseDir}/config/msamanda_settings.xml" + execute_msfragger = true msfragger_config_file = "${baseDir}/config/closed_fragger.params" + execute_msgfplus = true msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt" + execute_sage = true sage_config_file = "${baseDir}/config/sage_config.json" + execute_xtandem = true xtandem_config_file = "${baseDir}/config/xtandem_input.xml" } diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..13ccfcb --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,198 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/mpc-bioinformatics/McQuaC//nextflow_schema.json", + "title": "mpc-bioinformatics/McQuaC pipeline parameters", + "description": "A pipeline for the identification of peptides from mass spectrometry data, integrating multiple search engines and post-processing tools.", + "type": "object", + "$defs": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["outdir", "raw_files"], + "properties": { + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open", + "default": "./" + }, + "raw_files": { + "type": "string", + "description": "Path to the raw spectra files (either .raw or .d), can contain * to process multiple files", + "default": "./msms-files/*.raw" + }, + "mzml_files": { + "type": "string", + "description": "If you already have conversions to mzML, specify them here. They need to have the same name basis as the raw spectrum files" + }, + "fasta": { + "type": "string", + "description": "Path to the protein FASTA file to use, in UniProt header format.", + "format": "file-path", + "mimetype": "text/fasta", + "exists": true + }, + "fasta_target_decoy": { + "type": "string", + "description": "If you already have a target-decoy protein FASTA from a previous run, it can be given here.", + "format": "file-path", + "mimetype": "text/fasta" + }, + "entrapment_fold": { + "type": "integer", + "default": 0, + "description": "This parameter specifies for an entrapment search the fold of entrapment proteins per target protein. No database will be created if 0." + }, + "keep_mzmls": { + "type": "boolean", + "default": true, + "description": "Whether the converted mzML files should be kept in the results folder." + } + } + }, + "general_search_parameters": { + "title": "General search parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "precursor_tol_ppm": { + "type": "integer", + "default": 10, + "description": "The precursor tolerance in PPM" + }, + "fragment_tol_da": { + "type": "number", + "default": 0.02, + "description": "The fragment tolerance in Dalton" + }, + "is_timstof": { + "type": "boolean", + "description": "Specify whether the data is timsTOF data" + }, + "use_only_rank1_psms": { + "type": "boolean", + "default": true, + "description": "Use only top identifications for any post processing. Otherwise, all identifications will be used." 
+ }, + "execute_maxquant": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with MaxQuant" + }, + "maxquant_params_file": { + "type": "string", + "default": "${projectDir}/config/mqpar.xml", + "description": "Path to the MaxQuant params file for additional, search engine specific settings." + }, + "execute_msamanda": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with MSAmanda" + }, + "msamanda_config_file": { + "type": "string", + "default": "${projectDir}/config/msamanda_settings.xml", + "description": "Path to the MSAmanda params file for additional, search engine specific settings." + }, + "execute_msfragger": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with MS Fragger" + }, + "msfragger_config_file": { + "type": "string", + "default": "${projectDir}/config/closed_fragger.params", + "description": "Path to the MS Fragger params file for additional, search engine specific settings." + }, + "execute_msgfplus": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with MS-GF+" + }, + "msgfplus_params_file": { + "type": "string", + "default": "${projectDir}/config/MSGFPlus_Params.txt", + "description": "Path to the MS-GF+ params file for additional, search engine specific settings." + }, + "execute_sage": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with Sage" + }, + "sage_config_file": { + "type": "string", + "default": "${projectDir}/config/sage_config.json", + "description": "Path to the Sage params file for additional, search engine specific settings." 
+ }, + "execute_xtandem": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with X!Tandem" + }, + "xtandem_config_file": { + "type": "string", + "default": "/mnt/data/projects/pipeline-of-identification/config/xtandem_input.xml", + "description": "Path to the X!Tandem params file for additional, search engine specific settings." + } + } + }, + "comet_parameters": { + "title": "Comet parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "execute_comet": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with Comet" + }, + "comet_params_file": { + "type": "string", + "default": "${projectDir}/config/comet.params", + "description": "Path to the Comet params file for additional, search engine specific settings." + }, + "comet_threads": { + "type": "integer", + "default": 16, + "description": "Number of allowed threads / CPUs for Comet" + }, + "comet_mem": { + "type": "string", + "default": "8.GB", + "description": "Number of allowed memory for Comet" + }, + "comet_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "comet_spectrum_id_pattern": { + "type": "string", + "default": ".*scan=(\\\\d+)$", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "comet_scan_id_pattern": { + "type": "string", + "default": "^(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID from Comet for Oktoberfest" + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input_output_options" + }, + { + "$ref": "#/$defs/general_search_parameters" + }, + { + "$ref": "#/$defs/comet_parameters" + } + ] +} diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index 4905b78..f34e681 100644 --- a/src/identification/comet_identification.nf +++ 
b/src/identification/comet_identification.nf @@ -1,11 +1,3 @@ -// number of threads used by comet -params.comet_threads = 16 -params.comet_mem = "8 GB" - -params.comet_psm_id_pattern = "(.*)" -params.comet_spectrum_id_pattern = '.*scan=(\\d+)$' -params.comet_scan_id_pattern = '^(?P\\d+)$' - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' From 524c3de0f7479227c1579a5b905dc82176cd1212 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 15:18:52 +0000 Subject: [PATCH 07/13] maxquant params to config and schema --- nextflow.config | 10 ++++ nextflow_schema.json | 55 +++++++++++++++---- src/identification/maxquant_identification.nf | 8 --- 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/nextflow.config b/nextflow.config index 4000c0d..0791197 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,14 +25,24 @@ params { execute_maxquant = true maxquant_params_file = "${baseDir}/config/mqpar.xml" + maxquant_threads = 4 + maxquant_mem = "32.GB" + maxquant_psm_id_pattern = "" + maxquant_spectrum_id_pattern = "" + maxquant_scan_id_pattern = "" + execute_msamanda = true msamanda_config_file = "${baseDir}/config/msamanda_settings.xml" + execute_msfragger = true msfragger_config_file = "${baseDir}/config/closed_fragger.params" + execute_msgfplus = true msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt" + execute_sage = true sage_config_file = "${baseDir}/config/sage_config.json" + execute_xtandem = true xtandem_config_file = "${baseDir}/config/xtandem_input.xml" } diff --git a/nextflow_schema.json b/nextflow_schema.json index 13ccfcb..187d157 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -78,16 +78,6 @@ "default": true, "description": "Use only top identifications 
for any post processing. Otherwise, all identifications will be used." }, - "execute_maxquant": { - "type": "boolean", - "default": true, - "description": "Whether to execute identification with MaxQuant" - }, - "maxquant_params_file": { - "type": "string", - "default": "${projectDir}/config/mqpar.xml", - "description": "Path to the MaxQuant params file for additional, search engine specific settings." - }, "execute_msamanda": { "type": "boolean", "default": true, @@ -179,7 +169,47 @@ "comet_scan_id_pattern": { "type": "string", "default": "^(?P\\\\d+)$", - "description": "Regular expression to parse the PSM ID from Comet for Oktoberfest" + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "maxquant_parameters": { + "title": "MaxQuant parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "execute_maxquant": { + "type": "boolean", + "default": true, + "description": "Whether to execute identification with MaxQuant" + }, + "maxquant_params_file": { + "type": "string", + "default": "${projectDir}/config/mqpar.xml", + "description": "Path to the MaxQuant params file for additional, search engine specific settings." 
+ }, + "maxquant_threads": { + "type": "integer", + "default": 4, + "description": "Number of allowed threads / CPUs for MaxQuant" + }, + "maxquant_mem": { + "type": "string", + "default": "32.GB", + "description": "Number of allowed memory for MaxQuant" + }, + "maxquant_psm_id_pattern": { + "type": "string", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "maxquant_spectrum_id_pattern": { + "type": "string", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "maxquant_scan_id_pattern": { + "type": "string", + "description": "Regular expression to parse the PSM ID for Oktoberfest" } } } @@ -193,6 +223,9 @@ }, { "$ref": "#/$defs/comet_parameters" + }, + { + "$ref": "#/$defs/maxquant_parameters" } ] } diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index a58af47..3fb9266 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -1,11 +1,3 @@ -// number of threads used by maxquant -params.maxquant_threads = 4 -params.maxquant_mem = "32 GB" - -params.maxquant_psm_id_pattern = "" -params.maxquant_spectrum_id_pattern = "" -params.maxquant_scan_id_pattern = "" - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' From 3f22ea2a49b5bc5661310e563e824d3eef66ff69 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 10 Sep 2025 15:28:59 +0000 Subject: [PATCH 08/13] move all parameters to config --- nextflow.config | 58 +++++++++++++++++++ src/identification/msamanda_identification.nf | 8 --- .../msfragger_identification.nf | 10 ---- src/identification/msgfplus_identification.nf | 15 ----- 
src/identification/sage_identification.nf | 10 ---- src/identification/xtandem_identification.nf | 8 --- .../convert_and_enhance_psm_tsv.nf | 5 -- src/postprocessing/ms2rescore.nf | 9 --- src/postprocessing/oktoberfest.nf | 9 --- src/postprocessing/percolator.nf | 4 -- src/preprocess/convert_to_mzml.nf | 2 - src/preprocess/create_decoy_database.nf | 4 -- src/preprocess/create_entrapment_database.nf | 2 - 13 files changed, 58 insertions(+), 86 deletions(-) diff --git a/nextflow.config b/nextflow.config index 0791197..c54cd76 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,12 +8,17 @@ params { fasta_target_decoy = '' entrapment_fold = 0 keep_mzmls = true + tdf2mzml_threads = 8 + decoy_database_threads = 4 + fdrbench_mem_gb = 16 // search specific parameters precursor_tol_ppm = 10 fragment_tol_da = 0.02 is_timstof = false use_only_rank1_psms = true + convert_psm_tsv_mem = '60.GB' + enhance_psm_tsv_mem = '8.GB' execute_comet = true comet_params_file = "${baseDir}/config/comet.params" @@ -33,18 +38,71 @@ params { execute_msamanda = true msamanda_config_file = "${baseDir}/config/msamanda_settings.xml" + msamanda_threads = 16 + msamanda_mem = '64.GB' + msamanda_psm_id_pattern = '(.*)' + msamanda_spectrum_id_pattern = '(.*)' + msamanda_scan_id_pattern = '.*scan=(?P\\d+)$' execute_msfragger = true msfragger_config_file = "${baseDir}/config/closed_fragger.params" + msfragger_threads = 16 + msfragger_mem_gb = 16 + msfragger_db_split = 0 + msfragger_calibrate = 2 + msfragger_psm_id_pattern = '(.*)' + msfragger_spectrum_id_pattern = '(.*)' + msfragger_scan_id_pattern = '.*scan=(?P\\d+)$' execute_msgfplus = true msgfplus_params_file = "${baseDir}/config/MSGFPlus_Params.txt" + msgfplus_threads = 6 + msgfplus_mem_gb = 16 + msgfplus_tasks = 0 + msgfplus_instrument = 1 // 0: Low-res LCQ/LTQ, 1: Orbitrap/FTICR/Lumos, 2: TOF, 3: Q-Exactive + msgfplus_split_input = 10000 // split input mzMLs into chunks of this size, 0 to disable + msgfplus_merge_mem_gb = 16 // memory for 
merging PSMs, used in merge_psms process + msgfplus_split_fasta = 0 // split the fasta into this many chunks, 0 to disable + msgfplus_psm_id_pattern = '(.*)' + msgfplus_spectrum_id_pattern = '(.*)' + msgfplus_scan_id_pattern = '.*scan=(?P\\d+)$' execute_sage = true sage_config_file = "${baseDir}/config/sage_config.json" + sage_threads = 16 + sage_mem = '128.GB' + sage_prefilter = 'false' + sage_prefilter_chunk_size = 0 + sage_psm_id_pattern = '(.*)' + sage_spectrum_id_pattern = '(.*)' + sage_scan_id_pattern = '.*scan=(?P\\d+)$' execute_xtandem = true xtandem_config_file = "${baseDir}/config/xtandem_input.xml" + xtandem_threads = 16 + xtandem_mem = '128.GB' + xtandem_psm_id_pattern = '(.*)' + xtandem_spectrum_id_pattern = '(.*)' + xtandem_scan_id_pattern = '.*scan=(?P\\d+)$' + + // parameters for ms2rescore + ms2rescore_threads = 4 + ms2rescore_mem = '64.GB' + ms2rescore_model = 'HCD' + ms2rescore_chunk_size = 100000 + ms2pip_model_dir = './ms2pip-model' + + // parameters for oktoberfest + oktoberfest_memory = '64.GB' + oktoberfest_to_pin_memory = '4.GB' + oktoberfest_intensity_model = 'Prosit_2020_intensity_HCD' + oktoberfest_irt_model = 'Prosit_2019_irt' + oktoberfest_forks = 1 // have some mercy with the koina servers + + // number of threads used by percolator + percolator_threads = 4 + percolator_mem = '4.GB' + } manifest { diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index 2812daa..c2817f6 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -1,11 +1,3 @@ -// number of threads used by msamanda -params.msamanda_threads = 16 -params.msamanda_mem = "64 GB" - -params.msamanda_psm_id_pattern = "(.*)" -params.msamanda_spectrum_id_pattern = '(.*)' -params.msamanda_scan_id_pattern = '.*scan=(?P\\d+)$' - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as 
ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index 408dfc7..3e412e5 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -1,13 +1,3 @@ -// parameters for MSFragger -params.msfragger_threads = 16 -params.msfragger_mem_gb = 16 -params.msfragger_db_split = 0 -params.msfragger_calibrate = 2 - -params.msfragger_psm_id_pattern = "(.*)" -params.msfragger_spectrum_id_pattern = "(.*)" -params.msfragger_scan_id_pattern = '.*scan=(?P\\d+)$' - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index c7c8c4b..342c795 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -1,18 +1,3 @@ -// params for MS-GF+ -params.msgfplus_threads = 6 -params.msgfplus_mem_gb = 16 -params.msgfplus_tasks = 0 - -params.msgfplus_instrument = "1" // 0: Low-res LCQ/LTQ, 1: Orbitrap/FTICR/Lumos, 2: TOF, 3: Q-Exactive - -params.msgfplus_split_input = 10000 // split input mzMLs into chunks of this size, 0 to disable -params.msgfplus_merge_mem_gb = 16 // memory for merging PSMs, used in merge_psms process -params.msgfplus_split_fasta = 0 // split the fasta into this many chunks, 0 to disable - -params.msgfplus_psm_id_pattern = "(.*)" -params.msgfplus_spectrum_id_pattern = '(.*)' -params.msgfplus_scan_id_pattern = '.*scan=(?P\\d+)$' - include {convert_chunked_result_to_psm_utils; 
enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 96d8c15..ff89a90 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -1,13 +1,3 @@ -// number of threads used by sage -params.sage_threads = 16 -params.sage_mem = "128 GB" -params.sage_prefilter = "false" -params.sage_prefilter_chunk_size = 0 - -params.sage_psm_id_pattern = "(.*)" -params.sage_spectrum_id_pattern = '(.*)' -params.sage_scan_id_pattern = '.*scan=(?P\\d+)$' - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 8354144..38494fa 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -1,11 +1,3 @@ -// number of threads used by xtandem -params.xtandem_threads = 16 -params.xtandem_mem = "128 GB" - -params.xtandem_psm_id_pattern = "(.*)" -params.xtandem_spectrum_id_pattern = '(.*)' -params.xtandem_scan_id_pattern = '.*scan=(?P\\d+)$' - include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' diff --git 
a/src/postprocessing/convert_and_enhance_psm_tsv.nf b/src/postprocessing/convert_and_enhance_psm_tsv.nf index 11dfa1a..4d0b487 100644 --- a/src/postprocessing/convert_and_enhance_psm_tsv.nf +++ b/src/postprocessing/convert_and_enhance_psm_tsv.nf @@ -1,8 +1,3 @@ -nextflow.enable.dsl=2 - -params.convert_psm_tsv_mem = "60 GB" -params.enhance_psm_tsv_mem = "8 GB" - /** * Executes postprocessing steps to enhance the psm_utils TSV and prepare the PIN files * diff --git a/src/postprocessing/ms2rescore.nf b/src/postprocessing/ms2rescore.nf index 8965ae7..0dc2282 100644 --- a/src/postprocessing/ms2rescore.nf +++ b/src/postprocessing/ms2rescore.nf @@ -1,12 +1,3 @@ -nextflow.enable.dsl=2 - -// parameters for ms2rescore -params.ms2rescore_threads = 4 -params.ms2rescore_mem = "64 GB" -params.ms2rescore_model = "HCD" -params.ms2rescore_chunk_size = 100000 -params.ms2pip_model_dir = "./ms2pip-model" - workflow ms2rescore_workflow { take: psm_tsvs_and_mzmls diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index d266d87..59f413e 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -1,12 +1,3 @@ -nextflow.enable.dsl=2 - -// parameters for oktoberfest -params.oktoberfest_memory = "64.GB" -params.oktoberfest_to_pin_memory = "4.GB" -params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" -params.oktoberfest_irt_model = "Prosit_2019_irt" -params.oktoberfest_forks = 1 // have some mercy with the koina servers - /** * Runs oktoberfest rescoring for the given PSMs and mzML files. 
* diff --git a/src/postprocessing/percolator.nf b/src/postprocessing/percolator.nf index f32f61a..39e1cee 100644 --- a/src/postprocessing/percolator.nf +++ b/src/postprocessing/percolator.nf @@ -1,7 +1,3 @@ -// number of threads used by percolator -params.percolator_threads = 4 -params.percolator_mem = "4 GB" - /** * Executes percolator for the given PIN files * diff --git a/src/preprocess/convert_to_mzml.nf b/src/preprocess/convert_to_mzml.nf index 01ea3fe..1f0baf2 100644 --- a/src/preprocess/convert_to_mzml.nf +++ b/src/preprocess/convert_to_mzml.nf @@ -1,5 +1,3 @@ -params.tdf2mzml_threads = 8 - workflow convert_to_mzml { take: input_path diff --git a/src/preprocess/create_decoy_database.nf b/src/preprocess/create_decoy_database.nf index e20023e..3cccd2c 100644 --- a/src/preprocess/create_decoy_database.nf +++ b/src/preprocess/create_decoy_database.nf @@ -1,7 +1,3 @@ -// number of threads used by maxquant -params.decoy_database_threads = 4 - - /** * Creates a concatenated target-decoy database * diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf index 6d7759d..41b9959 100644 --- a/src/preprocess/create_entrapment_database.nf +++ b/src/preprocess/create_entrapment_database.nf @@ -1,5 +1,3 @@ -params.fdrbench_mem_gb = 16 - /** * Adds decoys and/or entapments to the FASTA file. 
* From 3cff3cbe4a3582c3b135f5e60bed9e5b9738fe39 Mon Sep 17 00:00:00 2001 From: julianu Date: Thu, 11 Sep 2025 09:20:19 +0000 Subject: [PATCH 09/13] cleaning of entrapment DBs for rare empty proteins --- src/preprocess/create_entrapment_database.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf index 41b9959..637ec53 100644 --- a/src/preprocess/create_entrapment_database.nf +++ b/src/preprocess/create_entrapment_database.nf @@ -45,6 +45,7 @@ process call_entrapment_database { """ java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check # 'Reheader' to add entrapment index to database and accession part of the header - sed -r -i "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" ${fasta.baseName}-entrapment.fasta + # and remove empty entrapment sequences (which can appear if the original sequence has many Xs) + sed -r -i -e "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" -e '\$!N;/>.*\\n\$/d;P;D' ${fasta.baseName}-entrapment.fasta """ } From 803f103b9646aa8d0eb69d5320752500c33f140b Mon Sep 17 00:00:00 2001 From: julianu Date: Thu, 18 Sep 2025 15:42:39 +0000 Subject: [PATCH 10/13] updating docker name to match repo --- nextflow.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index c54cd76..fb77fc2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -102,7 +102,6 @@ params { // number of threads used by percolator percolator_threads = 4 percolator_mem = '4.GB' - } manifest { @@ -139,7 +138,7 @@ profiles { process { withLabel: python_image { - container = 'ghcr.io/medbioinf/pipeline-of-identification:latest' + container = 'ghcr.io/medbioinf/mspepid:latest' } 
withLabel: comet_image { From 4a3ff3778f32aa645a0a77ad3d5263cff5420236 Mon Sep 17 00:00:00 2001 From: julianu Date: Thu, 18 Sep 2025 16:21:37 +0000 Subject: [PATCH 11/13] Updated schema for parameters --- nextflow.config | 2 +- nextflow_schema.json | 326 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 325 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index fb77fc2..e8fae22 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,7 @@ params { fragment_tol_da = 0.02 is_timstof = false use_only_rank1_psms = true - convert_psm_tsv_mem = '60.GB' + convert_psm_tsv_mem = '32.GB' enhance_psm_tsv_mem = '8.GB' execute_comet = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 187d157..6b7a237 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -41,15 +41,30 @@ "format": "file-path", "mimetype": "text/fasta" }, + "decoy_database_threads": { + "type": "integer", + "default": 4, + "description": "Number of allowed threads / CPUs for the creation of the target-decoy database" + }, "entrapment_fold": { "type": "integer", "default": 0, "description": "This parameter specifies for an entrapment search the fold of entrapment proteins per target protein. No database will be created if 0." }, + "fdrbench_mem_gb": { + "type": "integer", + "default": 16, + "description": "Amount of allowed memory in GB for the creation of the entrapment database" + }, "keep_mzmls": { "type": "boolean", "default": true, "description": "Whether the converted mzML files should be kept in the results folder."
+ }, + "tdf2mzml_threads": { + "type": "integer", + "default": 8, + "description": "Number of allowed threads / CPUs for conversion of .d to mzML using tdf2mzml" } } }, @@ -154,7 +169,7 @@ "comet_mem": { "type": "string", "default": "8.GB", - "description": "Number of allowed memory for Comet" + "description": "Amount of allowed memory for Comet" }, "comet_psm_id_pattern": { "type": "string", @@ -197,7 +212,7 @@ "maxquant_mem": { "type": "string", "default": "32.GB", - "description": "Number of allowed memory for MaxQuant" + "description": "Amount of allowed memory for MaxQuant" }, "maxquant_psm_id_pattern": { "type": "string", @@ -212,6 +227,295 @@ "description": "Regular expression to parse the PSM ID for Oktoberfest" } } + }, + "msamanda_parameters": { + "title": "MSAmanda parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "msamanda_threads": { + "type": "integer", + "default": 16, + "description": "Number of allowed threads / CPUs for MSAmanda" + }, + "msamanda_mem": { + "type": "string", + "default": "64.GB", + "description": "Amount of allowed memory for MSAmanda" + }, + "msamanda_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "msamanda_spectrum_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "msamanda_scan_id_pattern": { + "type": "string", + "default": ".*scan=(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "ms_fragger_parameters": { + "title": "MS Fragger parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "msfragger_threads": { + "type": "integer", + "default": 16, + "description": "Number of allowed threads / CPUs for MS Fragger" + }, + "msfragger_mem_gb": { + "type": "integer", + "default": 16, + 
"description": "Amount of allowed memory in GB for MS Fragger" + }, + "msfragger_db_split": { + "type": "integer", + "default": 0, + "description": "Number of splits for the sequence database to save memory. 0 means no split. You should set msfragger_calibrate to 0 for larger numbers." + }, + "msfragger_calibrate": { + "type": "integer", + "default": 2, + "description": "Parameter \"calibrate_mass\" for MS Fragger. Perform mass calibration (0 for OFF, 1 for ON, 2 for ON and find optimal parameters, 4 for ON and find the optimal fragment mass tolerance)." + }, + "msfragger_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "msfragger_spectrum_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "msfragger_scan_id_pattern": { + "type": "string", + "default": ".*scan=(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "ms_gf_parameters": { + "title": "MS-GF+ parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "msgfplus_threads": { + "type": "integer", + "default": 6, + "description": "Number of allowed threads / CPUs for MS-GF+" + }, + "msgfplus_mem_gb": { + "type": "integer", + "default": 16, + "description": "Amount of allowed memory in GB for MS-GF+" + }, + "msgfplus_tasks": { + "type": "integer", + "default": 0, + "description": "MS-GF+ parameter tasks. Please refer to the MS-GF+ documentation for more help." 
+ }, + "msgfplus_instrument": { + "type": "integer", + "default": 1, + "description": "0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra; 1 means High-res LTQ (Default for HCD; also appropriate for high res CID); use InstrumentID=1 for Orbitrap, Lumos, and QEHFX instruments; 2 means TOF; 3 means Q-Exactive", + "minimum": 0, + "maximum": 3 + }, + "msgfplus_split_input": { + "type": "integer", + "default": 10000, + "description": "The input file will be split to the given number of MS2 spectra. Splitting greatly enhances speed and reduces memory consumption." + }, + "msgfplus_merge_mem_gb": { + "type": "integer", + "default": 16, + "description": "Amount of allowed memory in GB to merge the results after a database split" + }, + "msgfplus_split_fasta": { + "type": "integer", + "default": 0, + "description": "Number of splits for the sequence database to save memory and decrease the search time. 0 means no split." 
+ }, + "msgfplus_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "msgfplus_spectrum_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "msgfplus_scan_id_pattern": { + "type": "string", + "default": ".*scan=(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "new_group_2": { + "title": "Sage parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "sage_threads": { + "type": "integer", + "default": 16, + "description": "Number of allowed threads / CPUs for Sage" + }, + "sage_mem": { + "type": "string", + "default": "128.GB", + "description": "Amount of allowed memory for Sage" + }, + "sage_prefilter": { + "type": "boolean", + "description": "Whether the database should be split during Sage search. This greatly reduces the memory consumption when using large databases (beta feature)" + }, + "sage_prefilter_chunk_size": { + "type": "integer", + "default": 0, + "description": "Number of proteins per database chunk when prefiltering is active."
+ }, + "sage_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "sage_spectrum_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "sage_scan_id_pattern": { + "type": "string", + "default": ".*scan=(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "new_group_3": { + "title": "X!Tandem parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "xtandem_threads": { + "type": "integer", + "default": 16, + "description": "Number of allowed threads / CPUs for X!Tandem" + }, + "xtandem_mem": { + "type": "string", + "default": "128.GB", + "description": "Amount of allowed memory for X!Tandem" + }, + "xtandem_psm_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the PSM ID for MS2Rescore" + }, + "xtandem_spectrum_id_pattern": { + "type": "string", + "default": "(.*)", + "description": "Regular expression to parse the spectrum ID which is matched against the PSM ID for MS2Rescore" + }, + "xtandem_scan_id_pattern": { + "type": "string", + "default": ".*scan=(?P\\\\d+)$", + "description": "Regular expression to parse the PSM ID for Oktoberfest" + } + } + }, + "new_group_1": { + "title": "Post-processing parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "convert_psm_tsv_mem": { + "type": "string", + "default": "32.GB", + "description": "Amount of allowed memory for converting results into psm-utils format" + }, + "enhance_psm_tsv_mem": { + "type": "string", + "default": "8.GB", + "description": "Amount of allowed memory for enhancing psm-utils results with additional information" + }, + "ms2rescore_threads": { + "type": "integer", + "default": 4, + "description": "Number of allowed threads / CPUs for MS2Rescore" +
}, + "ms2rescore_mem": { + "type": "string", + "default": "64.GB", + "description": "Amount of allowed memory for MS2Rescore" + }, + "ms2rescore_model": { + "type": "string", + "default": "HCD", + "description": "The model used for MS2Rescore" + }, + "ms2rescore_chunk_size": { + "type": "integer", + "default": 100000, + "description": "Chunksize in PSMs that is processed per MS2Rescore instance. Reduce to decrease memory consumption." + }, + "ms2pip_model_dir": { + "type": "string", + "default": "./ms2pip-model", + "description": "Path to store the MS2Pip model" + }, + "oktoberfest_memory": { + "type": "string", + "default": "64.GB", + "description": "Amount of allowed memory for Oktoberfest feature generation" + }, + "oktoberfest_to_pin_memory": { + "type": "string", + "default": "4.GB", + "description": "Amount of allowed memory for Oktoberfest conversion to PIN" + }, + "oktoberfest_intensity_model": { + "type": "string", + "default": "Prosit_2020_intensity_HCD", + "description": "The intensity model used by Oktoberfest" + }, + "oktoberfest_irt_model": { + "type": "string", + "default": "Prosit_2019_irt", + "description": "The iRT model used by Oktoberfest" + }, + "oktoberfest_forks": { + "type": "integer", + "default": 1, + "description": "Number of forks / parallel instances allowed for Oktoberfest" + }, + "percolator_threads": { + "type": "integer", + "default": 4, + "description": "Number of allowed threads / CPUs for Percolator" + }, + "percolator_mem": { + "type": "string", + "default": "4.GB", + "description": "Amount of allowed memory to run Percolator" + } + } } }, "allOf": [ @@ -226,6 +530,24 @@ }, { "$ref": "#/$defs/maxquant_parameters" + }, + { + "$ref": "#/$defs/msamanda_parameters" + }, + { + "$ref": "#/$defs/ms_fragger_parameters" + }, + { + "$ref": "#/$defs/ms_gf_parameters" + }, + { + "$ref": "#/$defs/new_group_2" + }, + { + "$ref": "#/$defs/new_group_3" + }, + { + "$ref": "#/$defs/new_group_1" } ] } From 3f2d3a8ceea8f3180a097028298117aabca68fc6 
Mon Sep 17 00:00:00 2001 From: julianu Date: Fri, 10 Oct 2025 08:59:30 +0000 Subject: [PATCH 12/13] fixing absolute path in schema default --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6b7a237..7acf8eb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -140,7 +140,7 @@ }, "xtandem_config_file": { "type": "string", - "default": "/mnt/data/projects/pipeline-of-identification/config/xtandem_input.xml", + "default": "${projectDir}/config/xtandem_input.xml", "description": "Path to the X!Tandem params file for additional, search engine specific settings." } } From 7b3bdf28abb7351d9df66e12cf8793b67920da05 Mon Sep 17 00:00:00 2001 From: julianu Date: Fri, 10 Oct 2025 09:02:08 +0000 Subject: [PATCH 13/13] documentation for complex entrapment sed command --- src/preprocess/create_entrapment_database.nf | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/preprocess/create_entrapment_database.nf b/src/preprocess/create_entrapment_database.nf index 637ec53..60344c0 100644 --- a/src/preprocess/create_entrapment_database.nf +++ b/src/preprocess/create_entrapment_database.nf @@ -46,6 +46,18 @@ process call_entrapment_database { java -Xmx${memory_limit}G -jar /opt/fdrbench/fdrbench.jar -db ${fasta} -o ${fasta.baseName}-entrapment.fasta -fold ${fold} -level protein -entrapment_label ENTRAPMENT_ -entrapment_pos 0 -uniprot -check # 'Reheader' to add entrapment index to database and accession part of the header # and remove empty entrapment sequences (which can appear if the original sequence has many Xs) + # The following sed command performs two operations: + # 1. Substitutes FASTA headers to include the entrapment index in both the database and accession parts. 
+ # Regex breakdown: + # ^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$ + # - Matches headers starting with '>ENTRAPMENT_' followed by three fields separated by '|', with the last field ending in '_[number]'. + # >ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4 + # - Rewrites the header to include the entrapment index (\\4) in both the database and accession parts. + # 2. Removes empty entrapment sequences (headers followed by an empty line). + # Control flow: + # \$!N;/>.*\\n\$/d;P;D + # - Keeps a sliding two-line window: N appends the next line, and if the window is a header followed by an empty line, d deletes both; otherwise P prints the first line and D drops it before the cycle restarts. + sed -r -i -e "s;^>ENTRAPMENT_(.+)\\|(.+)\\|(.+)_([0-9]+)\$;>ENTRAPMENT_\\4_\\1|ENTRAPMENT_\\4_\\2|\\3_\\4;g" -e '\$!N;/>.*\\n\$/d;P;D' ${fasta.baseName}-entrapment.fasta """ }