From b5eef0866ea8a130c014e0167d55a8f4a280ce16 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Fri, 11 Jul 2025 12:29:01 +0000 Subject: [PATCH 01/31] implement feature creation using oktoberfest --- bin/oktoberfest_feature_gen.py | 196 ++++++++++++++++++++++++++++++ requirements.txt | 3 +- src/postprocessing/oktoberfest.nf | 75 ++++++++++++ 3 files changed, 273 insertions(+), 1 deletion(-) create mode 100755 bin/oktoberfest_feature_gen.py create mode 100644 src/postprocessing/oktoberfest.nf diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py new file mode 100755 index 0000000..7fab7a0 --- /dev/null +++ b/bin/oktoberfest_feature_gen.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python + +""" +Generates features for PSM-rescoring using Oktoberfest. +The rescoring itself is suppresed by setting an unknown FDR estimation method.s +""" + +import argparse +import copy +import json +import logging +from pathlib import Path + +import oktoberfest as ok +from oktoberfest import runner as ok_runner +import pandas as pd +import psm_utils +import psm_utils.io + +OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"' +"""Oktoberfest's error message for unknown FDR estimation methods. +There is a typo in the oktberfest code, as it is not substituting the f-string correctly. +""" + + +def argparse_setup() -> argparse.Namespace: + """ + Creates the argument parser for the Oktoberfest feature generation script. 
+ """ + + parser = argparse.ArgumentParser() + # files + parser.add_argument( + "-psms-file", help="Input PSMs TSV file", required=True, type=Path + ) + parser.add_argument( + "-spectra-file", + help="Corresponding spectrum file for PSMs file", + required=True, + type=Path, + ) + + # prediction + parser.add_argument("-intensity-model", help="Koina intensity model", type=str) + parser.add_argument("-irt-model", help="Koina IRT model", type=str) + + # mass spec parameters + parser.add_argument( + "-mass-tolerance", + help="Defines the allowed tolerance between theoretical and experimentally observered fragment mass during peak annotation; default = 20 (FTMS), 40 (TOF), 0.35 (ITMS)", + default=20, + type=float, + ) + parser.add_argument( + "-mass-tolerance-unit", + help="Defines the measure of tolerance, either “da” or “ppm”; default = da (mass analyzer is ITMS), ppm (mass analyzer is FTMS or TOF)", + type=str, + choices=["da", "ppm"], + ) + + parser.add_argument( + "-spectra-file-type", + help=".d|raw|mzml", + type=str, + choices=["d", "raw", "mzml"], + default="mzml", + ) + + parser.add_argument( + "-out-folder", help="Output folder for ", required=True, type=Path + ) + + return parser.parse_args() + + +def main(): + """ + Generates features for PSM-rescoring using Oktoberfest. + The rescoring itself is suppresed by setting an unknown FDR estimation method. 
+ + The esulting features can be found as `/none/rescore.tab` + """ + + args = argparse_setup() + logging.basicConfig(level=logging.INFO) + + oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv") + + psms = psm_utils.io.read_file(args.psms_file) + + # Necessary columns according to the docs: + # RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE, + # SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH + oktoberfest_df = pd.DataFrame() + + # RAW_FILE, + oktoberfest_df["RAW_FILE"] = [args.mzml_file.stem] * len(psms) + + # SCAN_NUMBER + oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms] + + # MODIFIED_SEQUENCE + oktoberfest_df["MODIFIED_SEQUENCE"] = [ + psm.peptidoform.modified_sequence for psm in psms + ] + oktoberfest_df["MODIFIED_SEQUENCE"] = oktoberfest_df["MODIFIED_SEQUENCE"].apply( + lambda x: x.replace("[UNIMOD:Carbamidomethyl]", "[UNIMOD:4]").replace( + "[UNIMOD:Oxidation]", "[UNIMOD:35]" + ) + ) + + # PRECURSOR_CHARGE + oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms] + + # SCAN_EVENT_NUMBER + # TODO: Some search engines do not provide this information, skipt it entirely? + + # MASS + oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms] + + # SCORE + # TODO: Does the psmutil score means higher is better? 
+ oktoberfest_df["SCORE"] = [psm.score for psm in psms] + + # REVERSE + oktoberfest_df["REVERSE"] = [psm.is_decoy for psm in psms] + + # SEQUENCE + oktoberfest_df["SEQUENCE"] = [psm.peptidoform.sequence for psm in psms] + + # PEPTIDE_LENGTH + oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms] + + del psms + + psms_df = pd.read_csv(args.psms_file, sep="\t") + + # PROTEINS (not in the docs, but required by oktberfest) + oktoberfest_df["PROTEINS"] = psms_df["protein_list"].apply( + lambda x: x.replace("[", "").replace("]", "").replace("'", "") + ) # remove the brackets and quotes + + # adding present rescoring columns + present_rescoring_cols = list( + filter(lambda x: x.startswith("rescoring:"), psms_df.columns) + ) + for rescoring_col in present_rescoring_cols: + oktoberfest_df[rescoring_col] = psms_df[rescoring_col] + + oktoberfest_df.to_csv( + oktoberfest_input_csv_path, + sep=",", + index=False, + ) + + # create the config file + config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING) + + # misc + config_dict["output"] = str(args.out_folder) + config_dict["num_threads"] = 1 # Set to 1 for debugging, can be increased later + # mass spec parameters + config_dict["mass_tolerance"] = args.mass_tolerance + config_dict["unitMassTolerance"] = args.mass_tolerance_unit + # predicition params + config_dict["models"]["irt_model"] = args.irt_model + config_dict["models"]["intensity_model"] = args.intensity_model + # input params + config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) + config_dict["inputs"]["search_results_type"] = "Internal" + config_dict["inputs"]["spectra"] = "./" + config_dict["inputs"]["spectra_type"] = args.spectra_file_type + # resocring params + # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring + # by raising a ValueError which we can catch later. 
+ # This has the effect, that the generated features + # are stored in the subfolder `results/none` of the output folder. + config_dict["fdr_estimation_method"] = "NONE" + config_dict["quantification"] = False + config_dict["add_feature_cols"] = "all" + + config_path = Path("oktoberfest.config.json") + + with config_path.open("w", encoding="utf-8") as json_file: + json_file.write(json.dumps(config_dict)) + + try: + ok_runner.run_job(config_path) + except ValueError as e: + # Catch the specific ValueError raised when "NONE" is used as fdr_estimation + if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD: + raise e + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 1f01b16..3ff7f46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes #mokapot~=0.10.0 ms2rescore_rs @ git+https://github.com/di-hardt/ms2rescore-rs.git@70c15002a9f065ea2cd01a9a9a95b8bcff762f53 -ms2rescore~=3.1.5 \ No newline at end of file +ms2rescore~=3.1.5 +oktoberfest~=0.10.0 \ No newline at end of file diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf new file mode 100644 index 0000000..75bfee1 --- /dev/null +++ b/src/postprocessing/oktoberfest.nf @@ -0,0 +1,75 @@ +nextflow.enable.dsl=2 + +// parameters for oktoberfest +params.oktoberfest_memory = "64 GB" +params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" +params.oktoberfest_irt_mode = "Prosit_2019_irt" + +/** + * Runs oktoberfest rescoring for the given PSMs and mzML files. + * + * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. + * @param psm_tsvs: The PSM TSV files. + * @param mzmls: The mzML files. + * @param fragment_tol: The fragment tolerance for the rescoring. + * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton). 
+ * + * @return: The oktoberfest rescored PSMs in TSV format. + */ +workflow oktoberfest_rescore_workflow { + take: + psm_tsvs_and_mzmls + psm_tsvs + mzmls + fragment_tol + fragment_tol_unit + + main: + oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, fragment_tol, fragment_tol_unit) + + emit: + oktoberfest_pins +} + +/** + * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. + * @param psm_tsvs: The PSM TSV files. + * @param mzmls: The mzML files. + * @param fragment_tol: The fragment tolerance for the rescoring. + * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton). + * + * @return: The oktoberfest rescored PSMs in TSV format. + */ +process run_oktoberfest_feature_gen { + cpus 1 + memory { params.oktoberfest_memory } + + container { params.python_image } + + input: + tuple val(psm_utils_tsvs), val(mzml_for_psms) + path psm_tsvs + path mzmls + val fragment_tol + val fragment_tol_unit + + output: + path "oktoberfest.features.tsv" + + script: + """ + oktoberfest_feature_gen.py \ + -out-folder ./oktoberfest_out \ + -psms-file ${psm_utils_tsvs} \ + -spectra-file ${mzml_for_psms} \ + -intensity-model ${params.oktoberfest_intensity_model} \ + -irt-model ${params.oktoberfest_irt_mode} \ + -mass-tolerance ${fragment_tol} \ + -mass-tolerance-unit da + + mv ./oktoberfest_out/results/none/rescore.tab oktoberfest.features.tsv + + // Clean up the output directory + rm -r oktoberfest_out + """ +} From a6bd038c448fb3a85d96df9f5701c9c5098decbd Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 Jul 2025 12:54:26 +0000 Subject: [PATCH 02/31] removing todo about optional feature --- bin/oktoberfest_feature_gen.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 7fab7a0..4d26837 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -112,9 +112,6 @@ def 
main(): # PRECURSOR_CHARGE oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms] - # SCAN_EVENT_NUMBER - # TODO: Some search engines do not provide this information, skipt it entirely? - # MASS oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms] From ecc3bbdf4786dcb53ac3d3303bc239c05ba3f722 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 Jul 2025 12:55:06 +0000 Subject: [PATCH 03/31] fix argument name --- bin/oktoberfest_feature_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 4d26837..e18597c 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -94,7 +94,7 @@ def main(): oktoberfest_df = pd.DataFrame() # RAW_FILE, - oktoberfest_df["RAW_FILE"] = [args.mzml_file.stem] * len(psms) + oktoberfest_df["RAW_FILE"] = [args.spectra_file.stem] * len(psms) # SCAN_NUMBER oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms] From 45b6838e4ed0c101cb30b11f946048953fe40fcf Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 Jul 2025 12:55:31 +0000 Subject: [PATCH 04/31] finalize arguments --- bin/oktoberfest_feature_gen.py | 37 +++++++++++++++++++++++++------ src/postprocessing/oktoberfest.nf | 25 +++++++++------------ 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index e18597c..24644da 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -22,6 +22,31 @@ There is a typo in the oktberfest code, as it is not substituting the f-string correctly. """ +def parse_str_bool(value: str) -> bool: + """ + Parses a string argument to a boolean value. + + Arguments + --------- + value : str + The string value to parse. 
Accepts 'true', 'false', '1', ' + + Returns + ------- + bool + Returns True for 'true' or '1', and False for 'false' or ' + + Raises + ------ + ValueError + If the value is not a valid boolean representation. + """ + if value.lower() in ['true', '1']: + return True + elif value.lower() in ['false', '0']: + return False + else: + raise ValueError(f"Invalid boolean value: {value}") def argparse_setup() -> argparse.Namespace: """ @@ -59,11 +84,9 @@ def argparse_setup() -> argparse.Namespace: ) parser.add_argument( - "-spectra-file-type", - help=".d|raw|mzml", - type=str, - choices=["d", "raw", "mzml"], - default="mzml", + "-is-timstof", + help="If true, the spectra file type is set to 'd' (for timsTOF); otherwise, it defaults to 'mzml'", + type=parse_str_bool ) parser.add_argument( @@ -166,8 +189,8 @@ def main(): config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) config_dict["inputs"]["search_results_type"] = "Internal" config_dict["inputs"]["spectra"] = "./" - config_dict["inputs"]["spectra_type"] = args.spectra_file_type - # resocring params + config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml" + # resocreing params # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring # by raising a ValueError which we can catch later. # This has the effect, that the generated features diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 75bfee1..1d034e5 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -11,8 +11,6 @@ params.oktoberfest_irt_mode = "Prosit_2019_irt" * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. * @param psm_tsvs: The PSM TSV files. * @param mzmls: The mzML files. - * @param fragment_tol: The fragment tolerance for the rescoring. - * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton). 
* * @return: The oktoberfest rescored PSMs in TSV format. */ @@ -21,11 +19,9 @@ workflow oktoberfest_rescore_workflow { psm_tsvs_and_mzmls psm_tsvs mzmls - fragment_tol - fragment_tol_unit main: - oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, fragment_tol, fragment_tol_unit) + oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da) emit: oktoberfest_pins @@ -35,10 +31,9 @@ workflow oktoberfest_rescore_workflow { * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. * @param psm_tsvs: The PSM TSV files. * @param mzmls: The mzML files. - * @param fragment_tol: The fragment tolerance for the rescoring. - * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton). + * @param fragment_tol_da: The fragment tolerance for the rescoring. * - * @return: The oktoberfest rescored PSMs in TSV format. + * @return The oktoberfest rescored PSMs in TSV format. 
*/ process run_oktoberfest_feature_gen { cpus 1 @@ -50,11 +45,10 @@ process run_oktoberfest_feature_gen { tuple val(psm_utils_tsvs), val(mzml_for_psms) path psm_tsvs path mzmls - val fragment_tol - val fragment_tol_unit + val fragment_tol_da output: - path "oktoberfest.features.tsv" + path "${psm_utils_tsvs.baseName}.features.tsv" script: """ @@ -63,11 +57,12 @@ process run_oktoberfest_feature_gen { -psms-file ${psm_utils_tsvs} \ -spectra-file ${mzml_for_psms} \ -intensity-model ${params.oktoberfest_intensity_model} \ - -irt-model ${params.oktoberfest_irt_mode} \ - -mass-tolerance ${fragment_tol} \ - -mass-tolerance-unit da + -irt-model ${params.oktoberfest_irt_modeö} \ + -mass-tolerance ${fragment_tol_da} \ + -mass-tolerance-unit da \ + -is-timstof ${params.is_timstof} \ - mv ./oktoberfest_out/results/none/rescore.tab oktoberfest.features.tsv + mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv" // Clean up the output directory rm -r oktoberfest_out From 9ecd81714e42d68de9fcfe123790ed859b841593 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 Jul 2025 13:00:16 +0000 Subject: [PATCH 05/31] free up some memory --- bin/oktoberfest_feature_gen.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 24644da..e313725 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -151,6 +151,7 @@ def main(): # PEPTIDE_LENGTH oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms] + # free up some memory del psms psms_df = pd.read_csv(args.psms_file, sep="\t") @@ -173,6 +174,10 @@ def main(): index=False, ) + # free up some more memory as the dataframe is read fromt disk again + del psms_df + del oktoberfest_df + # create the config file config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING) From 89b8a3ee451b1081db451a868f0f86aaef7ffc3b Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 
Jul 2025 13:00:34 +0000 Subject: [PATCH 06/31] fix typos --- bin/oktoberfest_feature_gen.py | 4 +- bin/oktoberfest_feature_to_pin.py | 71 +++++++++++++++++++++++++++++++ src/postprocessing/oktoberfest.nf | 33 ++++++++++++-- 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100755 bin/oktoberfest_feature_to_pin.py diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index e313725..fc6537f 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -2,7 +2,7 @@ """ Generates features for PSM-rescoring using Oktoberfest. -The rescoring itself is suppresed by setting an unknown FDR estimation method.s +The rescoring itself is suppresed by setting an unknown FDR estimation method. """ import argparse @@ -101,7 +101,7 @@ def main(): Generates features for PSM-rescoring using Oktoberfest. The rescoring itself is suppresed by setting an unknown FDR estimation method. - The esulting features can be found as `/none/rescore.tab` + The resulting features can be found as `/none/rescore.tab` """ args = argparse_setup() diff --git a/bin/oktoberfest_feature_to_pin.py b/bin/oktoberfest_feature_to_pin.py new file mode 100755 index 0000000..4293a53 --- /dev/null +++ b/bin/oktoberfest_feature_to_pin.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +""" +Converts Oktoberfest feature files to Percolator's PIN file. +""" + +import argparse +import logging +from pathlib import Path + +import pandas as pd + +COLS_TO_REMOVE = [ + "filename" # str column +] +"""Columns to remove from the Oktoberfest feature file. +E.g. due to wrong type +""" + +COLS_TO_RENAME = { + "SpecId": "id", +} +"""Columns to rename in the Oktoberfest feature file. +""" + + +def argparse_setup() -> argparse.Namespace: + """ + Creates the argument parser for the Oktoberfest feature generation script. 
+ """ + + parser = argparse.ArgumentParser() + # files + parser.add_argument( + "-in-file", help="Input feature TSV file", required=True, type=Path + ) + parser.add_argument("-out-file", help="Pin file ", required=True, type=Path) + + return parser.parse_args() + + +def main(): + """ + Converts Oktoberfest feature files to Percolator's PIN file. + """ + + args = argparse_setup() + logging.basicConfig(level=logging.INFO) + + # feature dataframe + feature_df = pd.read_csv(args.in_file, sep="\t") + + for col in COLS_TO_REMOVE: + if col in feature_df.columns: + feature_df.drop(columns=col, inplace=True) + + for col, new_col in COLS_TO_RENAME.items(): + if col in feature_df.columns: + feature_df.rename(columns={col: new_col}, inplace=True) + + for col in feature_df.columns: + feature_df.rename(columns={col: col.lower()}, inplace=True) + + feature_df.to_csv( + args.out_file, + sep="\t", + index=False, + ) + +if __name__ == "__main__": + main() diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 1d034e5..458cfa7 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl=2 // parameters for oktoberfest params.oktoberfest_memory = "64 GB" params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" -params.oktoberfest_irt_mode = "Prosit_2019_irt" +params.oktoberfest_irt_model = "Prosit_2019_irt" /** * Runs oktoberfest rescoring for the given PSMs and mzML files. 
@@ -21,7 +21,9 @@ workflow oktoberfest_rescore_workflow { mzmls main: - oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da) + oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da) + oktoberfest_pins = oktoberfest_features_to_pin(oktoberfest_features) + emit: oktoberfest_pins @@ -57,7 +59,7 @@ process run_oktoberfest_feature_gen { -psms-file ${psm_utils_tsvs} \ -spectra-file ${mzml_for_psms} \ -intensity-model ${params.oktoberfest_intensity_model} \ - -irt-model ${params.oktoberfest_irt_modeö} \ + -irt-model ${params.oktoberfest_irt_model} \ -mass-tolerance ${fragment_tol_da} \ -mass-tolerance-unit da \ -is-timstof ${params.is_timstof} \ @@ -68,3 +70,28 @@ process run_oktoberfest_feature_gen { rm -r oktoberfest_out """ } + +/** + * @param okt_features_tsv: Oktoberfest feature file. + * + * @return Oktoberfest feature file in PIN format ready to use with percolator. + */ +process oktoberfest_features_to_pin { + cpus 1 + memory { params.oktoberfest_memory } + + container { params.python_image } + + input: + path okt_features_tsv + + output: + path "${okt_features_tsv.baseName}.oktoberfest.pin" + + script: + """ + oktoberfest_feature_to_pin.py \ + -features-file ${okt_features_tsv} \ + -out-folder ./{okt_features_tsv.baseName}.oktoberfest.pin + """ +} From eeffc19d24e20ef52bb174c0df04da22bff0bfe7 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 14 Jul 2025 14:37:32 +0000 Subject: [PATCH 07/31] integration into workflow --- src/identification/comet_identification.nf | 9 +++++++-- src/identification/maxquant_identification.nf | 8 ++++++-- src/identification/msamanda_identification.nf | 8 ++++++-- src/identification/msfragger_identification.nf | 8 ++++++-- src/identification/msgfplus_identification.nf | 8 ++++++-- src/identification/sage_identification.nf | 8 ++++++-- src/identification/xtandem_identification.nf | 8 ++++++-- 7 files changed, 43 
insertions(+), 14 deletions(-) diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index dce4596..5d65cf4 100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -10,8 +10,9 @@ params.comet_psm_id_pattern = "(.*)" params.comet_spectrum_id_pattern = '.*scan=(\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Exports the identification using Comet configured by a SDRF files @@ -38,9 +39,11 @@ workflow comet_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: comet_mzids >> 'comet' @@ -49,6 +52,8 @@ workflow comet_identification { pout_files >> 'comet' ms2rescore_pins >> 'comet' ms2rescore_percolator_results >> 'comet' + oktoberfest_pins >> 'comet' + oktoberfest_percolator_results >> 'comet' } diff --git a/src/identification/maxquant_identification.nf 
b/src/identification/maxquant_identification.nf index 21bc147..9ab1370 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -10,7 +10,7 @@ params.maxquant_psm_id_pattern = "" params.maxquant_spectrum_id_pattern = "" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' /** @@ -63,9 +63,11 @@ workflow maxquant_identification { } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)) publish: maxquant_results >> 'maxquant' @@ -74,6 +76,8 @@ workflow maxquant_identification { pout_files >> 'maxquant' ms2rescore_pins >> 'maxquant' ms2rescore_percolator_results >> 'maxquant' + oktoberfest_pins >> 'maxquant' + oktoberfest_percolator_results >> 'maxquant' } diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index 69a8ae7..fd99da0 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -10,7 +10,7 @@ params.msamanda_psm_id_pattern = "(.*)" params.msamanda_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from 
'../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' // msamanda needs explicit "scan=" in the id of a scan (not there in e.g. TimsTOF converted mzML data) @@ -43,9 +43,11 @@ workflow msamanda_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: msamanda_results.msamanda_csv >> 'msamanda' @@ -54,6 +56,8 @@ workflow msamanda_identification { pout_files >> 'msamanda' ms2rescore_pins >> 'msamanda' ms2rescore_percolator_results >> 'msamanda' + oktoberfest_pins >> 'msamanda' + oktoberfest_percolator_results >> 'msamanda' } diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index ac8fad1..bca5a68 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -12,7 +12,7 @@ params.msfragger_psm_id_pattern = "(.*)" params.msfragger_spectrum_id_pattern = "(.*)" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as 
ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' workflow msfragger_identification { @@ -37,9 +37,11 @@ workflow msfragger_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: fragger_results_pepxml >> 'msfragger' @@ -48,6 +50,8 @@ workflow msfragger_identification { pout_files >> 'msfragger' ms2rescore_pins >> 'msfragger' ms2rescore_percolator_results >> 'msfragger' + oktoberfest_pins >> 'msfragger' + oktoberfest_percolator_results >> 'msfragger' } diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index 2429ab8..517f2a6 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -18,7 +18,7 @@ params.msgfplus_psm_id_pattern = "(.*)" params.msgfplus_spectrum_id_pattern = '(.*)' include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as 
oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' include {split_mzml_into_chunks} from '../preprocess/convert_to_mzml.nf' @@ -91,9 +91,11 @@ workflow msgfplus_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)) publish: fasta_merged_results.map{ it -> it[1] } >> 'msgfplus' @@ -102,6 +104,8 @@ workflow msgfplus_identification { pout_files >> 'msgfplus' ms2rescore_pins >> 'msgfplus' ms2rescore_percolator_results >> 'msgfplus' + oktoberfest_pins >> 'msgfplus' + oktoberfest_percolator_results >> 'msgfplus' } process identification_with_msgfplus { diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 5654799..aeaf7a2 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -12,7 +12,7 @@ params.sage_psm_id_pattern = "(.*)" params.sage_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from 
'../postprocessing/ms2rescore.nf' /** @@ -49,9 +49,11 @@ workflow sage_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: return_files >> 'sage' @@ -60,6 +62,8 @@ workflow sage_identification { pout_files >> 'sage' ms2rescore_pins >> 'sage' ms2rescore_percolator_results >> 'sage' + oktoberfest_pins >> 'sage' + oktoberfest_percolator_results >> 'sage' } diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 2823f95..7e85e3a 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -10,7 +10,7 @@ params.xtandem_psm_id_pattern = "(.*)" params.xtandem_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' /** @@ -39,9 +39,11 @@ workflow xtandem_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, 
psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: tandem_xmls >> 'xtandem' @@ -50,6 +52,8 @@ workflow xtandem_identification { pout_files >> 'xtandem' ms2rescore_pins >> 'xtandem' ms2rescore_percolator_results >> 'xtandem' + oktoberfest_pins >> 'xtandem' + oktoberfest_percolator_results >> 'xtandem' } /** From 9e7dec6934fef91c6e55aa694c141238544881b5 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 07:57:19 +0000 Subject: [PATCH 08/31] build separate container for oktoberfest --- Makefile | 4 +++- docker/oktoberfest/Dockerfile | 29 +++++++++++++++++++++++++++++ docker/oktoberfest/environment.yml | 10 ++++++++++ docker/oktoberfest/requirements.txt | 2 ++ main.nf | 1 + requirements.txt | 3 +-- src/postprocessing/oktoberfest.nf | 4 ++-- 7 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 docker/oktoberfest/Dockerfile create mode 100644 docker/oktoberfest/environment.yml create mode 100644 docker/oktoberfest/requirements.txt diff --git a/Makefile b/Makefile index 7c19bdd..50b3f98 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ docker-imgs: docker pull ghcr.io/medbioinf/pipeline-of-identification:latest - + docker pull proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef docker pull quay.io/medbioinf/tdf2mzml:0.4 docker pull quay.io/medbioinf/openms:3.4.1 @@ -17,3 +17,5 @@ docker-imgs: docker pull ghcr.io/percolator/percolator:branch-3-08 docker build --platform linux/amd64 -t medbioinf/msfragger -f docker/msfragger/Dockerfile docker/msfragger/. 
+ + docker build --platform linux/amd64 -t medbioinf/oktoberfest:latest -f docker/oktoberfest/Dockerfile docker/oktoberfest diff --git a/docker/oktoberfest/Dockerfile b/docker/oktoberfest/Dockerfile new file mode 100644 index 0000000..d083afb --- /dev/null +++ b/docker/oktoberfest/Dockerfile @@ -0,0 +1,29 @@ +# AMD64 needed explicitly on ARM as some sofwtare is only available in AMD64 +FROM --platform=amd64 mambaorg/micromamba:2.1.0-ubuntu22.04 + +WORKDIR /home/mambauser +# Copy backend and environment.yml +COPY --chown=mambauser:mambauser environment.yml . +COPY --chown=mambauser:mambauser requirements.txt . + +USER root + +RUN apt update \ + && apt install -y libglib2.0-0 git \ + && apt clean + +USER mambauser +ENV HOME=/home/mambauser +ENV ENV_NAME=oktoberfest + +RUN echo 'show_banner: false' > ~/.mambarc + +RUN micromamba env create -y -f environment.yml \ + && micromamba clean --all --yes + +# TODO: remove build-essential + +USER root +# First is necessary for base_image to actvate the conda environment second is entrypoint +# which adds the python file to PATH +ENTRYPOINT [ "/usr/local/bin/_entrypoint.sh"] diff --git a/docker/oktoberfest/environment.yml b/docker/oktoberfest/environment.yml new file mode 100644 index 0000000..8a46af3 --- /dev/null +++ b/docker/oktoberfest/environment.yml @@ -0,0 +1,10 @@ +name: oktoberfest +channels: + - defaults +dependencies: + - python=3.11 + - pip + - setuptools + - pip: + - -r requirements.txt + diff --git a/docker/oktoberfest/requirements.txt b/docker/oktoberfest/requirements.txt new file mode 100644 index 0000000..fe5f1b5 --- /dev/null +++ b/docker/oktoberfest/requirements.txt @@ -0,0 +1,2 @@ +oktoberfest~=0.10.0 +psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes \ No newline at end of file diff --git a/main.nf b/main.nf index 2127322..8174524 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nextflow.preview.output = true // default python image params.python_image = 
'ghcr.io/medbioinf/pipeline-of-identification:latest' +params.oktoberfest_image = 'medbioinf/oktoberfest' // parameters set by the command line params.raw_files = '' diff --git a/requirements.txt b/requirements.txt index 3ff7f46..1f01b16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,4 @@ psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes #mokapot~=0.10.0 ms2rescore_rs @ git+https://github.com/di-hardt/ms2rescore-rs.git@70c15002a9f065ea2cd01a9a9a95b8bcff762f53 -ms2rescore~=3.1.5 -oktoberfest~=0.10.0 \ No newline at end of file +ms2rescore~=3.1.5 \ No newline at end of file diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 458cfa7..328b9fa 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -41,7 +41,7 @@ process run_oktoberfest_feature_gen { cpus 1 memory { params.oktoberfest_memory } - container { params.python_image } + container { params.oktoberfest_image } input: tuple val(psm_utils_tsvs), val(mzml_for_psms) @@ -80,7 +80,7 @@ process oktoberfest_features_to_pin { cpus 1 memory { params.oktoberfest_memory } - container { params.python_image } + container { params.oktoberfest_image } input: path okt_features_tsv From 3611bbf7cb74708cfba1ad474b2f1fe75417bd4e Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 08:07:15 +0000 Subject: [PATCH 09/31] fix comment --- src/postprocessing/oktoberfest.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 328b9fa..07f8087 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -66,7 +66,7 @@ process run_oktoberfest_feature_gen { mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv" - // Clean up the output directory + # Clean up the output directory rm -r oktoberfest_out """ } From d800c75f34c06e5024fdfda2c7dffacf62a91448 Mon Sep 
17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 08:08:14 +0000 Subject: [PATCH 10/31] fix filenames --- src/postprocessing/oktoberfest.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 07f8087..88ed48b 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -50,7 +50,7 @@ process run_oktoberfest_feature_gen { val fragment_tol_da output: - path "${psm_utils_tsvs.baseName}.features.tsv" + path "${psm_utils_tsvs}.features.tsv" script: """ @@ -64,7 +64,7 @@ process run_oktoberfest_feature_gen { -mass-tolerance-unit da \ -is-timstof ${params.is_timstof} \ - mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv" + mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv" # Clean up the output directory rm -r oktoberfest_out @@ -86,12 +86,12 @@ process oktoberfest_features_to_pin { path okt_features_tsv output: - path "${okt_features_tsv.baseName}.oktoberfest.pin" + path "${okt_features_tsv}.oktoberfest.pin" script: """ oktoberfest_feature_to_pin.py \ -features-file ${okt_features_tsv} \ - -out-folder ./{okt_features_tsv.baseName}.oktoberfest.pin + -out-folder ./{okt_features_tsv}.oktoberfest.pin """ } From 8d9fc9fe7ee5fe208fe7ff36714578221e105fd8 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 08:08:53 +0000 Subject: [PATCH 11/31] fix params --- src/identification/comet_identification.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index 5d65cf4..bf8d857 100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -39,7 +39,7 @@ workflow comet_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = 
ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) From 7536143dd3987758f2b1ed6b7204f2cf719fc0a4 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 08:09:08 +0000 Subject: [PATCH 12/31] import missing workflow --- src/identification/maxquant_identification.nf | 5 +++-- src/identification/msamanda_identification.nf | 3 ++- src/identification/msfragger_identification.nf | 3 ++- src/identification/msgfplus_identification.nf | 5 +++-- src/identification/sage_identification.nf | 3 ++- src/identification/xtandem_identification.nf | 3 ++- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index 9ab1370..ab2bcaa 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -12,6 +12,7 @@ params.maxquant_spectrum_id_pattern = "" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Executes the identification using MaxQuant @@ -63,11 +64,11 @@ workflow maxquant_identification { } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, 
spectrum_id_pattern, '', 'maxquant') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) - oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: maxquant_results >> 'maxquant' diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index fd99da0..3faa4a5 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -12,6 +12,7 @@ params.msamanda_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' // msamanda needs explicit "scan=" in the id of a scan (not there in e.g. 
TimsTOF converted mzML data) // 1) ms-convert with "--noindex" @@ -43,7 +44,7 @@ workflow msamanda_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index bca5a68..4697ed8 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -14,6 +14,7 @@ params.msfragger_spectrum_id_pattern = "(.*)" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' workflow msfragger_identification { take: @@ -37,7 +38,7 @@ workflow msfragger_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + 
oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index 517f2a6..6a76f9e 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -20,6 +20,7 @@ params.msgfplus_spectrum_id_pattern = '(.*)' include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' include {split_mzml_into_chunks} from '../preprocess/convert_to_mzml.nf' @@ -91,11 +92,11 @@ workflow msgfplus_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) - oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: fasta_merged_results.map{ it -> it[1] } >> 'msgfplus' diff --git 
a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index aeaf7a2..2c23d69 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -14,6 +14,7 @@ params.sage_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Executes the identification using Sage @@ -49,7 +50,7 @@ workflow sage_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 7e85e3a..2e58235 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -12,6 +12,7 @@ params.xtandem_spectrum_id_pattern = '(.*)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include 
{oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Exports the identification using Comet configured by a SDRF files @@ -39,7 +40,7 @@ workflow xtandem_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) From 014825dbf56dfe9858c8947a58ff420fc800fb10 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 15 Jul 2025 09:09:16 +0000 Subject: [PATCH 13/31] correcting names --- src/postprocessing/oktoberfest.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 88ed48b..7873d23 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -86,12 +86,12 @@ process oktoberfest_features_to_pin { path okt_features_tsv output: - path "${okt_features_tsv}.oktoberfest.pin" + path "${okt_features_tsv.baseName}.oktoberfest.pin" script: """ oktoberfest_feature_to_pin.py \ - -features-file ${okt_features_tsv} \ - -out-folder ./{okt_features_tsv}.oktoberfest.pin + -in-file ${okt_features_tsv} \ + -out-file ./${okt_features_tsv.baseName}.oktoberfest.pin """ } From d2e0111088ed7b87a8dd26a72590e4b5d5b22241 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Wed, 16 Jul 2025 07:50:05 +0000 Subject: [PATCH 14/31] fix variable name --- src/identification/maxquant_identification.nf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index ab2bcaa..74b9b19 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -64,7 +64,7 @@ workflow maxquant_identification { } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect()) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) From cf1a3f0a1f40ae2014d441fc15f078ac7fca267a Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Wed, 16 Jul 2025 07:56:49 +0000 Subject: [PATCH 15/31] ignoring result folder --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 4ce9325..90c0236 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,11 @@ .nextflow/ .nextflow.log* work/ + +# workflow results +results/ + + # binary nextflow trace* From 946ab7374c446ca472a52576314b09ce64f0a113 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Wed, 16 Jul 2025 14:42:24 +0000 Subject: [PATCH 16/31] use regex to get scan id --- bin/oktoberfest_feature_gen.py | 19 ++++++++++++++++++- src/identification/comet_identification.nf | 3 ++- src/identification/maxquant_identification.nf | 9 ++++++++- src/identification/msamanda_identification.nf | 3 ++- .../msfragger_identification.nf | 3 ++- src/identification/msgfplus_identification.nf | 3 ++- src/identification/sage_identification.nf | 3 ++- src/identification/xtandem_identification.nf | 3 ++- src/postprocessing/oktoberfest.nf | 7 ++++++- 9 files changed, 44 insertions(+), 9 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py 
b/bin/oktoberfest_feature_gen.py index fc6537f..4bf27b2 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -10,6 +10,7 @@ import json import logging from pathlib import Path +import re import oktoberfest as ok from oktoberfest import runner as ok_runner @@ -47,6 +48,12 @@ def parse_str_bool(value: str) -> bool: return False else: raise ValueError(f"Invalid boolean value: {value}") + +def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int: + match = scan_id_regex.match(spectrum_id) + if not match: + raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}") + return int(match.group("scan_id")) def argparse_setup() -> argparse.Namespace: """ @@ -89,6 +96,15 @@ def argparse_setup() -> argparse.Namespace: type=parse_str_bool ) + parser.add_argument( + "-scan-id-regex", + help=( + "Regular expression to extract the scan number from the spectrum ID." + "Use `scan_id` for the matching group, e.g. `scan=(?P<scan_id>\\d+)`)" + ), + type=str, + ) + parser.add_argument( "-out-folder", help="Output folder for ", required=True, type=Path ) @@ -106,6 +122,7 @@ def main(): args = argparse_setup() logging.basicConfig(level=logging.INFO) + scan_id_regex = re.compile(args.scan_id_regex) oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv") @@ -120,7 +137,7 @@ def main(): oktoberfest_df["RAW_FILE"] = [args.spectra_file.stem] * len(psms) # SCAN_NUMBER - oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms] + oktoberfest_df["SCAN_NUMBER"] = [get_scan_id(psm.spectrum_id, scan_id_regex) for psm in psms] # MODIFIED_SEQUENCE oktoberfest_df["MODIFIED_SEQUENCE"] = [ diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index bf8d857..18d3f9e 100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -8,6 +8,7 @@ params.comet_mem = "8 GB" params.comet_psm_id_pattern = "(.*)"
params.comet_spectrum_id_pattern = '.*scan=(\\d+)$' +params.comet_scan_id_pattern = '(?P<scan_id>\\d+)' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -39,7 +40,7 @@ workflow comet_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index 74b9b19..c330219 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -8,6 +8,7 @@ params.maxquant_mem = "32 GB" params.maxquant_psm_id_pattern = "" params.maxquant_spectrum_id_pattern = "" +params.maxquant_scan_id_pattern = "" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -56,6 +57,12 @@ workflow maxquant_identification { spectrum_id_pattern = '.*scan=(\\d+)$' } } + if (params.maxquant_scan_id_pattern) { + scan_id_pattern = params.maxquant_scan_id_pattern + } else{ + // no difference between psm TSVs derived from Bruker and Thermo measurments + scan_id_pattern = '(?P<scan_id>\\d+)' + } if
(params.is_timstof) { psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.d' ] } @@ -64,7 +71,7 @@ workflow maxquant_identification { } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index 3faa4a5..177c8c3 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -8,6 +8,7 @@ params.msamanda_mem = "64 GB" params.msamanda_psm_id_pattern = "(.*)" params.msamanda_spectrum_id_pattern = '(.*)' +params.msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -44,7 +45,7 @@ workflow msamanda_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index 4697ed8..fbfebe0 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -10,6 +10,7 @@ params.msfragger_calibrate = 2 params.msfragger_psm_id_pattern = "(.*)" params.msfragger_spectrum_id_pattern = "(.*)" +params.msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -38,7 +39,7 @@ workflow msfragger_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index 6a76f9e..c6815a3 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -16,6 +16,7 @@ params.msgfplus_split_fasta = 0 // split the fasta into this many chunks params.msgfplus_psm_id_pattern = "(.*)" params.msgfplus_spectrum_id_pattern = '(.*)' +params.msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -92,7 +93,7 @@ workflow msgfplus_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 2c23d69..5758e9e 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -10,6 +10,7 @@ params.sage_prefilter_chunk_size = 0 params.sage_psm_id_pattern = "(.*)" params.sage_spectrum_id_pattern = '(.*)' +params.sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -50,7 +51,7 @@ workflow sage_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 2e58235..58f6c23 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -8,6 +8,7 @@ params.xtandem_mem = "128 GB" params.xtandem_psm_id_pattern = "(.*)" params.xtandem_spectrum_id_pattern = '(.*)' +params.xtandem_scan_id_pattern = '.*scan=(?P\\d+)*.' 
include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' @@ -40,7 +41,7 @@ workflow xtandem_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect()) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 7873d23..5ab635f 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -11,6 +11,7 @@ params.oktoberfest_irt_model = "Prosit_2019_irt" * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. * @param psm_tsvs: The PSM TSV files. * @param mzmls: The mzML files. + * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID. * * @return: The oktoberfest rescored PSMs in TSV format. 
*/ @@ -19,9 +20,10 @@ workflow oktoberfest_rescore_workflow { psm_tsvs_and_mzmls psm_tsvs mzmls + scan_id_regex main: - oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da) + oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da, scan_id_regex) oktoberfest_pins = oktoberfest_features_to_pin(oktoberfest_features) @@ -34,6 +36,7 @@ workflow oktoberfest_rescore_workflow { * @param psm_tsvs: The PSM TSV files. * @param mzmls: The mzML files. * @param fragment_tol_da: The fragment tolerance for the rescoring. + * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID. * * @return The oktoberfest rescored PSMs in TSV format. */ @@ -48,6 +51,7 @@ process run_oktoberfest_feature_gen { path psm_tsvs path mzmls val fragment_tol_da + val scan_id_regex output: path "${psm_utils_tsvs}.features.tsv" @@ -63,6 +67,7 @@ process run_oktoberfest_feature_gen { -mass-tolerance ${fragment_tol_da} \ -mass-tolerance-unit da \ -is-timstof ${params.is_timstof} \ + -scan-id-regex '${scan_id_regex}' \ mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv" From ea08484d86fe68f30d21bc4e34ea3edf6277da54 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Wed, 16 Jul 2025 14:42:55 +0000 Subject: [PATCH 17/31] implement retry on koina server error --- bin/oktoberfest_feature_gen.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 4bf27b2..184582e 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -11,18 +11,23 @@ import logging from pathlib import Path import re +from time import sleep import oktoberfest as ok from oktoberfest import runner as ok_runner import pandas as pd import psm_utils import psm_utils.io +import tritonclient OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 
'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"' """Oktoberfest's error message for unknown FDR estimation methods. There is a typo in the oktberfest code, as it is not substituting the f-string correctly. """ +OKTOBERFEST_RETRIES = 5 +"""Number of retries for the oktberfest job in case of a server error.""" + def parse_str_bool(value: str) -> bool: """ Parses a string argument to a boolean value. @@ -226,12 +231,26 @@ def main(): with config_path.open("w", encoding="utf-8") as json_file: json_file.write(json.dumps(config_dict)) - try: - ok_runner.run_job(config_path) - except ValueError as e: - # Catch the specific ValueError raised when "NONE" is used as fdr_estimation - if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD: - raise e + is_successfull = False + for i in range(OKTOBERFEST_RETRIES): + try: + ok_runner.run_job(config_path) + except ValueError as e: + # Catch the specific ValueError raised when "NONE" is used as fdr_estimation + if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD: + raise e + else: + is_successfull = True + break + except tritonclient.utils.InferenceServerException as e: + if str(e.status) == "504": + logging.error("Koina server not available, retrying in 10 seconds...") + sleep(10) + + if not is_successfull: + logging.error("Oktoberfest job failed after multiple retries.") + exit(101) + if __name__ == "__main__": From 5c127372d9003477cb7aeeb5020f5441f1c02b36 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Wed, 16 Jul 2025 14:45:02 +0000 Subject: [PATCH 18/31] limit max forks of oktoberfest feature generation --- src/postprocessing/oktoberfest.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 5ab635f..071e452 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -4,6 +4,7 @@ nextflow.enable.dsl=2 params.oktoberfest_memory = "64 GB" 
params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" params.oktoberfest_irt_model = "Prosit_2019_irt" +params.oktoberfest_forks = 1 // have some mercy with the koina servers /** * Runs oktoberfest rescoring for the given PSMs and mzML files. @@ -42,6 +43,7 @@ workflow oktoberfest_rescore_workflow { */ process run_oktoberfest_feature_gen { cpus 1 + maxForks params.oktoberfest_forks memory { params.oktoberfest_memory } container { params.oktoberfest_image } From 66c85edfbf0ce4a6e53b5219f31224995f026303 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Thu, 17 Jul 2025 12:34:36 +0000 Subject: [PATCH 19/31] select specific spectrum file instead of general folder --- bin/oktoberfest_feature_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 184582e..83a29c4 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -215,7 +215,7 @@ def main(): # input params config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) config_dict["inputs"]["search_results_type"] = "Internal" - config_dict["inputs"]["spectra"] = "./" + config_dict["inputs"]["spectra"] = str(args.spectra_file) config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml" # resocreing params # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring From 173521ed710dc3ddb91c071b5f3ce22f359324a8 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Fri, 18 Jul 2025 14:26:12 +0000 Subject: [PATCH 20/31] fix missing proteins accession and attribute filters --- bin/oktoberfest_feature_gen.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 83a29c4..7b98e8d 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -28,6 +28,9 @@ OKTOBERFEST_RETRIES = 5 """Number of retries for the 
oktberfest job in case of a server error.""" +OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS = {"U", "O"} +"""According to documentation, Oktoberfest does not support the amino acids U and O.""" + def parse_str_bool(value: str) -> bool: """ Parses a string argument to a boolean value. @@ -190,14 +193,37 @@ def main(): for rescoring_col in present_rescoring_cols: oktoberfest_df[rescoring_col] = psms_df[rescoring_col] + # free up some more memory as the dataframe is read fromt disk again + del psms_df + + # Filter unsupported amino acids + psms_len = len(oktoberfest_df) + oktoberfest_df = oktoberfest_df[~oktoberfest_df["MODIFIED_SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)] + if len(oktoberfest_df) < psms_len: + logging.warning( + f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}" + ) + + # Filter peptide length > 30 + psms_len = len(oktoberfest_df) + oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30] + if len(oktoberfest_df) < psms_len: + logging.warning( + f"Removed {psms_len - len(oktoberfest_df)} PSMs with peptide length > 30" + ) + + # Some search engines do not provide protein accessions for decoys. + # In this case, we set the PROTEINS column to the `PEP_y` like in the Oktberfest docs. 
+ oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True) + oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True) + oktoberfest_df.to_csv( oktoberfest_input_csv_path, sep=",", index=False, ) - # free up some more memory as the dataframe is read fromt disk again - del psms_df + # free up more memory del oktoberfest_df # create the config file From 4e58ba1ae448891cf2b5c594b6ddb631c68ae2f7 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Fri, 18 Jul 2025 14:50:35 +0000 Subject: [PATCH 21/31] fix column --- bin/oktoberfest_feature_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 7b98e8d..a897a73 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -198,7 +198,7 @@ def main(): # Filter unsupported amino acids psms_len = len(oktoberfest_df) - oktoberfest_df = oktoberfest_df[~oktoberfest_df["MODIFIED_SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)] + oktoberfest_df = oktoberfest_df[~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)] if len(oktoberfest_df) < psms_len: logging.warning( f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}" From 8cf6a20308eccc6efb1832517d130e9526fa13b1 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 21 Jul 2025 09:42:25 +0000 Subject: [PATCH 22/31] fix issues with PSM at last amino acid in sequence --- bin/oktoberfest_feature_gen.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index a897a73..b3ab1b3 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -7,6 +7,7 @@ import argparse import copy +from functools import reduce import json import logging from pathlib import Path @@ -31,6 
+32,13 @@ OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS = {"U", "O"} """According to documentation, Oktoberfest does not support the amino acids U and O.""" +OKTOBERFEST_MODIFICATION_REPLACEMENTS = [ + ("C[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"), + ("C-[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"), # MaxQant-specific if C is last in sequence, PSM is annotated with C- instead of C + ("M[UNIMOD:Oxidation]", "M[UNIMOD:35]"), + ("M-[UNIMOD:Oxidation]", "M[UNIMOD:35]"), # MaxQant-specific if M is last in sequence, PSM is annotated with M- instead of M +] + def parse_str_bool(value: str) -> bool: """ Parses a string argument to a boolean value. @@ -151,9 +159,12 @@ def main(): oktoberfest_df["MODIFIED_SEQUENCE"] = [ psm.peptidoform.modified_sequence for psm in psms ] + # sequentially apply the replacements using functools.reduce oktoberfest_df["MODIFIED_SEQUENCE"] = oktoberfest_df["MODIFIED_SEQUENCE"].apply( - lambda x: x.replace("[UNIMOD:Carbamidomethyl]", "[UNIMOD:4]").replace( - "[UNIMOD:Oxidation]", "[UNIMOD:35]" + lambda seq: reduce( + lambda seq_x, repl: seq_x.replace(repl[0], repl[1]), # lambda to replace + OKTOBERFEST_MODIFICATION_REPLACEMENTS, # replacements + seq # starting with the sequences as stated in the PSMs file ) ) @@ -216,7 +227,6 @@ def main(): # In this case, we set the PROTEINS column to the `PEP_y` like in the Oktberfest docs. 
oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True) oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True) - oktoberfest_df.to_csv( oktoberfest_input_csv_path, sep=",", From 9ff4dc661c29dccc2b4e33a4c8186fce3e2cc2a6 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Mon, 21 Jul 2025 09:43:35 +0000 Subject: [PATCH 23/31] add doc string --- bin/oktoberfest_feature_gen.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index b3ab1b3..010e992 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -66,6 +66,20 @@ def parse_str_bool(value: str) -> bool: raise ValueError(f"Invalid boolean value: {value}") def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int: + """ + Using the provided regex to extract the scan number from the spectrum ID. + Arguments + --------- + spectrum_id : str + The spectrum ID from which to extract the scan number. + scan_id_regex : re.Pattern + A compiled regular expression pattern to match the scan number. + + Returns + ------- + int + The extracted scan number as an integer. 
+ """ match = scan_id_regex.match(spectrum_id) if not match: raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}") From 62adfa8266fe1c9b91720d0fbd4cb49b7fdc461c Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 22 Jul 2025 08:35:11 +0000 Subject: [PATCH 24/31] copy parts of Oktoberfest's code for a more stable use --- bin/oktoberfest_feature_gen.py | 112 ++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 010e992..363734f 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -13,18 +13,18 @@ from pathlib import Path import re from time import sleep +from typing import Union import oktoberfest as ok -from oktoberfest import runner as ok_runner +from oktoberfest.runner import _preprocess, _ce_calib, _refinement_learn, _calculate_features +from oktoberfest.utils import Config, JobPool, ProcessStep +from oktoberfest import rescore as ok_re +from oktoberfest import preprocessing as ok_pp import pandas as pd import psm_utils import psm_utils.io import tritonclient -OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"' -"""Oktoberfest's error message for unknown FDR estimation methods. -There is a typo in the oktberfest code, as it is not substituting the f-string correctly. -""" OKTOBERFEST_RETRIES = 5 """Number of retries for the oktberfest job in case of a server error.""" @@ -142,6 +142,86 @@ def argparse_setup() -> argparse.Namespace: return parser.parse_args() +def feature_generation(config_path: Union[str, Path]): + """ + Parts of Oktoberfest's [run_rescore-function without the rescoring step](https://github.com/wilhelm-lab/oktoberfest/blob/ce8d909ebf64aaaf9c0eebcc2bb33b9c4492ae90/oktoberfest/runner.py#L1238-L1312) + with some renamed dependencies to avoid conflicts e.g. 
`re` => `ok_re` (for Oktoberfests rescoring module) to avoid conflicts with the built-in `re` module. + + Arguments + --------- + config_path : Union[str, Path] + Path to the configuration file for Oktoberfest. This file should contain all necessary parameters for the + """ + config = Config() + config.read(config_path) + config.check() + + # load spectra file names + spectra_files = ok_pp.list_spectra(input_dir=config.spectra, input_format=config.spectra_type) + + proc_dir = config.output / "proc" + proc_dir.mkdir(parents=True, exist_ok=True) + + spectra_files = _preprocess(spectra_files, config) + + # TODO is this the most elegant way to multi-thread CE calibration before running refinement learning? + # Should we store the returned libraries and pass them to _calculate_features and _refinement_learn instead of + # _ce_calib returning cached outputs? + if config.num_threads > 1: + processing_pool = JobPool(processes=config.num_threads) + for spectra_file in spectra_files: + _ = processing_pool.apply_async(_ce_calib, [spectra_file, config]) + processing_pool.check_pool() + else: + for spectra_file in spectra_files: + _ = _ce_calib(spectra_file, config) + + if config.do_refinement_learning: + _refinement_learn(spectra_files, config) + + if config.num_threads > 1: + processing_pool = JobPool(processes=config.num_threads) + for spectra_file in spectra_files: + if "xl" in config.models["intensity"].lower(): + if "cms2" in config.models["intensity"].lower(): + cms2 = True + else: + cms2 = False + processing_pool.apply_async(_calculate_features, [spectra_file, config], xl=True, cms2=cms2) + else: + processing_pool.apply_async(_calculate_features, [spectra_file, config]) + processing_pool.check_pool() + else: + for spectra_file in spectra_files: + if "xl" in config.models["intensity"].lower(): + if "cms2" in config.models["intensity"].lower(): + cms2 = True + else: + cms2 = False + _calculate_features(spectra_file, config, xl=True, cms2=cms2) + else: + 
_calculate_features(spectra_file, config) + + # prepare rescoring + + fdr_dir = config.output / "results" / config.fdr_estimation_method + original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files] + rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files] + + prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original") + prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit") + + if not prepare_tab_original_step.is_done(): + logging.info("Merging input tab files for rescoring without peptide property prediction") + ok_re.merge_input(tab_files=original_tab_files, output_file=fdr_dir / "original.tab") + prepare_tab_original_step.mark_done() + + if not prepare_tab_rescore_step.is_done(): + logging.info("Merging input tab files for rescoring with peptide property prediction") + ok_re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab") + prepare_tab_rescore_step.mark_done() + + def main(): """ Generates features for PSM-rescoring using Oktoberfest. 
@@ -226,7 +306,7 @@ def main(): oktoberfest_df = oktoberfest_df[~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)] if len(oktoberfest_df) < psms_len: logging.warning( - f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}" + "Removed %i PSMs with unsupported amino acids: %s", psms_len - len(oktoberfest_df), OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS ) # Filter peptide length > 30 @@ -234,7 +314,7 @@ def main(): oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30] if len(oktoberfest_df) < psms_len: logging.warning( - f"Removed {psms_len - len(oktoberfest_df)} PSMs with peptide length > 30" + "Removed %i PSMs with peptide length > 30", psms_len - len(oktoberfest_df) ) # Some search engines do not provide protein accessions for decoys. @@ -267,10 +347,7 @@ def main(): config_dict["inputs"]["search_results_type"] = "Internal" config_dict["inputs"]["spectra"] = str(args.spectra_file) config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml" - # resocreing params - # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring - # by raising a ValueError which we can catch later. - # This has the effect, that the generated features + # Setting this to none has the effect, that the generated features # are stored in the subfolder `results/none` of the output folder. 
config_dict["fdr_estimation_method"] = "NONE" config_dict["quantification"] = False @@ -282,16 +359,10 @@ def main(): json_file.write(json.dumps(config_dict)) is_successfull = False - for i in range(OKTOBERFEST_RETRIES): + for _ in range(OKTOBERFEST_RETRIES): try: - ok_runner.run_job(config_path) - except ValueError as e: - # Catch the specific ValueError raised when "NONE" is used as fdr_estimation - if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD: - raise e - else: - is_successfull = True - break + feature_generation(config_path) + is_successfull = True except tritonclient.utils.InferenceServerException as e: if str(e.status) == "504": logging.error("Koina server not available, retrying in 10 seconds...") @@ -302,6 +373,5 @@ def main(): exit(101) - if __name__ == "__main__": main() From 22cf67acbc52fcbb7c36f53f53918619b5f03569 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 22 Jul 2025 08:44:03 +0000 Subject: [PATCH 25/31] get rid of deprecation warning --- bin/oktoberfest_feature_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 363734f..a5a3bc5 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -319,8 +319,8 @@ def main(): # Some search engines do not provide protein accessions for decoys. # In this case, we set the PROTEINS column to the `PEP_y` like in the Oktberfest docs. 
- oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True) - oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True) + oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True) + oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True) oktoberfest_df.to_csv( oktoberfest_input_csv_path, sep=",", From 2a1c86f0edfa4a004132bbffe00ad81280cc9a1a Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 22 Jul 2025 08:44:36 +0000 Subject: [PATCH 26/31] correct comment --- bin/oktoberfest_feature_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index a5a3bc5..4873fd3 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -318,7 +318,7 @@ def main(): ) # Some search engines do not provide protein accessions for decoys. - # In this case, we set the PROTEINS column to the `PEP_y` like in the Oktberfest docs. + # In this case, we set the PROTEINS column to the `PEP_` like in the ms2rescore. 
oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True) oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True) oktoberfest_df.to_csv( From 760cc2919cf3f99deebf075964f1dd3ee7fc7ca9 Mon Sep 17 00:00:00 2001 From: Dirk Winkelhardt Date: Tue, 22 Jul 2025 08:58:07 +0000 Subject: [PATCH 27/31] remove unnecessary filter --- bin/oktoberfest_feature_gen.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 4873fd3..2b18e03 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -309,14 +309,6 @@ def main(): "Removed %i PSMs with unsupported amino acids: %s", psms_len - len(oktoberfest_df), OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS ) - # Filter peptide length > 30 - psms_len = len(oktoberfest_df) - oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30] - if len(oktoberfest_df) < psms_len: - logging.warning( - "Removed %i PSMs with peptide length > 30", psms_len - len(oktoberfest_df) - ) - # Some search engines do not provide protein accessions for decoys. # In this case, we set the PROTEINS column to the `PEP_` like in the ms2rescore. 
oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True) From 66730459de446d80327d3545ee82f4aa7212f7b3 Mon Sep 17 00:00:00 2001 From: julianu Date: Mon, 1 Sep 2025 16:05:59 +0000 Subject: [PATCH 28/31] typo fixes --- bin/oktoberfest_feature_gen.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 2b18e03..9f28abd 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -327,13 +327,13 @@ def main(): # misc config_dict["output"] = str(args.out_folder) - config_dict["num_threads"] = 1 # Set to 1 for debugging, can be increased later + config_dict["numThreads"] = 1 # Set to 1 for debugging, can be increased later # mass spec parameters - config_dict["mass_tolerance"] = args.mass_tolerance + config_dict["massTolerance"] = args.mass_tolerance config_dict["unitMassTolerance"] = args.mass_tolerance_unit # predicition params - config_dict["models"]["irt_model"] = args.irt_model - config_dict["models"]["intensity_model"] = args.intensity_model + config_dict["models"]["irt"] = args.irt_model + config_dict["models"]["intensity"] = args.intensity_model # input params config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) config_dict["inputs"]["search_results_type"] = "Internal" From f5a70db73f2b0a1a167858a0fccf97865272c396 Mon Sep 17 00:00:00 2001 From: julianu Date: Tue, 2 Sep 2025 11:56:09 +0000 Subject: [PATCH 29/31] fixing typo in regex for spectra --- src/identification/comet_identification.nf | 2 +- src/identification/msamanda_identification.nf | 2 +- src/identification/msfragger_identification.nf | 2 +- src/identification/msgfplus_identification.nf | 2 +- src/identification/sage_identification.nf | 2 +- src/identification/xtandem_identification.nf | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index 18d3f9e..87b4953 
100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -8,7 +8,7 @@ params.comet_mem = "8 GB" params.comet_psm_id_pattern = "(.*)" params.comet_spectrum_id_pattern = '.*scan=(\\d+)$' -params.comet_scan_id_pattern = '(?P\\d+)' +params.comet_scan_id_pattern = '^(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index 177c8c3..abbe794 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -8,7 +8,7 @@ params.msamanda_mem = "64 GB" params.msamanda_psm_id_pattern = "(.*)" params.msamanda_spectrum_id_pattern = '(.*)' -params.msamanda_scan_id_pattern = '.*scan=(?P\\d+)*.' +params.msamanda_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index fbfebe0..7a21f0b 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -10,7 +10,7 @@ params.msfragger_calibrate = 2 params.msfragger_psm_id_pattern = "(.*)" params.msfragger_spectrum_id_pattern = "(.*)" -params.msfragger_scan_id_pattern = '.*scan=(?P\\d+)*.' 
+params.msfragger_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index c6815a3..489855c 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -16,7 +16,7 @@ params.msgfplus_split_fasta = 0 // split the fasta into this many chunks params.msgfplus_psm_id_pattern = "(.*)" params.msgfplus_spectrum_id_pattern = '(.*)' -params.msgfplus_scan_id_pattern = '.*scan=(?P\\d+)*.' +params.msgfplus_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 5758e9e..8a126d8 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -10,7 +10,7 @@ params.sage_prefilter_chunk_size = 0 params.sage_psm_id_pattern = "(.*)" params.sage_spectrum_id_pattern = '(.*)' -params.sage_scan_id_pattern = '.*scan=(?P\\d+)*.' 
+params.sage_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 58f6c23..38c6459 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -8,7 +8,7 @@ params.xtandem_mem = "128 GB" params.xtandem_psm_id_pattern = "(.*)" params.xtandem_spectrum_id_pattern = '(.*)' -params.xtandem_scan_id_pattern = '.*scan=(?P\\d+)*.' +params.xtandem_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' From 092dc6206150689419a08ff8304feb09db687a47 Mon Sep 17 00:00:00 2001 From: julianu Date: Wed, 3 Sep 2025 07:17:56 +0000 Subject: [PATCH 30/31] fixingf usage of d. 
folder only for maxquant tims data --- bin/oktoberfest_feature_gen.py | 2 +- src/identification/maxquant_identification.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 9f28abd..7b31879 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -338,7 +338,7 @@ def main(): config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) config_dict["inputs"]["search_results_type"] = "Internal" config_dict["inputs"]["spectra"] = str(args.spectra_file) - config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml" + config_dict["inputs"]["spectra_type"] = args.spectra_file.suffix.replace(".", "").lower() # Setting this to none has the effect, that the generated features # are stored in the subfolder `results/none` of the output folder. config_dict["fdr_estimation_method"] = "NONE" diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index c330219..fa894e1 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -71,7 +71,7 @@ workflow maxquant_identification { } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) From 326dec3a89d55ddb5e0e6bc56c87212d51a76784 Mon Sep 17 00:00:00 2001 From: julianu Date: Thu, 4 Sep 2025 06:01:09 +0000 Subject: [PATCH 31/31] always use mzML for oktoberfest (also for timsTOF) --- 
bin/oktoberfest_feature_gen.py | 8 +------- src/identification/maxquant_identification.nf | 7 ++++++- src/postprocessing/oktoberfest.nf | 1 - 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py index 7b31879..a6f1415 100755 --- a/bin/oktoberfest_feature_gen.py +++ b/bin/oktoberfest_feature_gen.py @@ -120,12 +120,6 @@ def argparse_setup() -> argparse.Namespace: choices=["da", "ppm"], ) - parser.add_argument( - "-is-timstof", - help="If true, the spectra file type is set to 'd' (for timsTOF); otherwise, it defaults to 'mzml'", - type=parse_str_bool - ) - parser.add_argument( "-scan-id-regex", help=( @@ -338,7 +332,7 @@ def main(): config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path) config_dict["inputs"]["search_results_type"] = "Internal" config_dict["inputs"]["spectra"] = str(args.spectra_file) - config_dict["inputs"]["spectra_type"] = args.spectra_file.suffix.replace(".", "").lower() + config_dict["inputs"]["spectra_type"] = "mzml" # Setting this to none has the effect, that the generated features # are stored in the subfolder `results/none` of the output folder. 
config_dict["fdr_estimation_method"] = "NONE" diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index fa894e1..54b9476 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -65,13 +65,18 @@ workflow maxquant_identification { } if (params.is_timstof) { + // MS2Rescore takes the .d files psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.d' ] } + + // oktoberfest needs the mzML files + psm_tsvs_and_spectra_oktoberfest = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML' ] } } else { psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML' ] } + psm_tsvs_and_spectra_oktoberfest = psm_tsvs_and_spectrafiles } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') - oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), scan_id_pattern) + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectra_oktoberfest, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern) // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf index 071e452..28e9165 100644 --- a/src/postprocessing/oktoberfest.nf +++ b/src/postprocessing/oktoberfest.nf @@ -68,7 +68,6 @@ process run_oktoberfest_feature_gen { -irt-model ${params.oktoberfest_irt_model} \ -mass-tolerance ${fragment_tol_da} \ -mass-tolerance-unit da \ - -is-timstof ${params.is_timstof} \ -scan-id-regex '${scan_id_regex}' \ mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv"