diff --git a/.gitignore b/.gitignore index 4ce9325..90c0236 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,11 @@ .nextflow/ .nextflow.log* work/ + +# workflow results +results/ + + # binary nextflow trace* diff --git a/Makefile b/Makefile index 7c19bdd..50b3f98 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ docker-imgs: docker pull ghcr.io/medbioinf/pipeline-of-identification:latest - + docker pull proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef docker pull quay.io/medbioinf/tdf2mzml:0.4 docker pull quay.io/medbioinf/openms:3.4.1 @@ -17,3 +17,5 @@ docker-imgs: docker pull ghcr.io/percolator/percolator:branch-3-08 docker build --platform linux/amd64 -t medbioinf/msfragger -f docker/msfragger/Dockerfile docker/msfragger/. + + docker build --platform linux/amd64 -t medbioinf/oktoberfest:latest -f docker/oktoberfest/Dockerfile docker/oktoberfest diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py new file mode 100755 index 0000000..a6f1415 --- /dev/null +++ b/bin/oktoberfest_feature_gen.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python + +""" +Generates features for PSM-rescoring using Oktoberfest. +The rescoring itself is suppresed by setting an unknown FDR estimation method. 
def parse_str_bool(value: str) -> bool:
    """
    Parses a string argument to a boolean value.

    Arguments
    ---------
    value : str
        The string value to parse. Accepts 'true', 'false', '1' and '0'.
        The comparison is case-insensitive and ignores surrounding whitespace.

    Returns
    -------
    bool
        Returns True for 'true' or '1', and False for 'false' or '0'.

    Raises
    ------
    ValueError
        If the value is not a valid boolean representation.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "1"):
        return True
    if normalized in ("false", "0"):
        return False
    raise ValueError(f"Invalid boolean value: {value}")


def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int:
    """
    Using the provided regex to extract the scan number from the spectrum ID.

    The pattern is applied with `re.Pattern.match`, i.e. it is anchored at the
    beginning of the spectrum ID. Patterns for IDs with a leading prefix
    therefore need to consume it, e.g. `.*scan=(?P<scan_id>\\d+)$`.

    Arguments
    ---------
    spectrum_id : str
        The spectrum ID from which to extract the scan number.
    scan_id_regex : re.Pattern
        A compiled regular expression pattern to match the scan number.
        It has to contain a named group `scan_id`.

    Returns
    -------
    int
        The extracted scan number as an integer.

    Raises
    ------
    ValueError
        If the pattern does not match the spectrum ID, if the pattern has no
        (participating) `scan_id` group or if the captured text is not an integer.
    """
    match = scan_id_regex.match(spectrum_id)
    if not match:
        raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}")

    try:
        scan_id = match.group("scan_id")
    except IndexError as error:
        # A pattern without the named group is a configuration error; report it
        # as such instead of leaking the opaque "no such group" IndexError.
        raise ValueError(
            f"Scan ID regex `{scan_id_regex.pattern}` does not define the named group `scan_id`"
        ) from error

    if scan_id is None:
        # The group exists but sits in an optional branch that did not match.
        raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}")

    return int(scan_id)
def argparse_setup(argv: Union[list[str], None] = None) -> argparse.Namespace:
    """
    Creates the argument parser for the Oktoberfest feature generation script
    and parses the command line.

    Arguments
    ---------
    argv : Union[list[str], None]
        Arguments to parse. Defaults to None, in which case argparse reads
        them from `sys.argv` (the previous behaviour).

    Returns
    -------
    argparse.Namespace
        The parsed arguments: `psms_file`, `spectra_file`, `intensity_model`,
        `irt_model`, `mass_tolerance`, `mass_tolerance_unit`, `scan_id_regex`
        and `out_folder`.
    """

    parser = argparse.ArgumentParser()
    # files
    parser.add_argument(
        "-psms-file", help="Input PSMs TSV file", required=True, type=Path
    )
    parser.add_argument(
        "-spectra-file",
        help="Corresponding spectrum file for PSMs file",
        required=True,
        type=Path,
    )

    # prediction
    parser.add_argument("-intensity-model", help="Koina intensity model", type=str)
    parser.add_argument("-irt-model", help="Koina IRT model", type=str)

    # mass spec parameters
    parser.add_argument(
        "-mass-tolerance",
        help="Defines the allowed tolerance between theoretical and experimentally observed fragment mass during peak annotation; default = 20 (FTMS), 40 (TOF), 0.35 (ITMS)",
        default=20,
        type=float,
    )
    parser.add_argument(
        "-mass-tolerance-unit",
        help="Defines the measure of tolerance, either “da” or “ppm”; default = da (mass analyzer is ITMS), ppm (mass analyzer is FTMS or TOF)",
        type=str,
        choices=["da", "ppm"],
    )

    # The regex is compiled unconditionally by the script, so a missing value
    # is rejected here with a usage message instead of failing later.
    parser.add_argument(
        "-scan-id-regex",
        help=(
            "Regular expression to extract the scan number from the spectrum ID. "
            "Use `scan_id` for the matching group, e.g. `scan=(?P<scan_id>\\d+)`"
        ),
        required=True,
        type=str,
    )

    parser.add_argument(
        "-out-folder",
        help="Output folder for the Oktoberfest results",
        required=True,
        type=Path,
    )

    return parser.parse_args(argv)
def _feature_calculation_kwargs(config: Config) -> dict:
    """Returns the extra keyword arguments `_calculate_features` needs for the configured intensity model."""
    intensity_model = config.models["intensity"].lower()
    if "xl" not in intensity_model:
        return {}
    # Cross-linking models need the xl flag, plus cms2 when the model name contains it.
    return {"xl": True, "cms2": "cms2" in intensity_model}


def feature_generation(config_path: Union[str, Path]):
    """
    Parts of Oktoberfest's [run_rescore-function without the rescoring step](https://github.com/wilhelm-lab/oktoberfest/blob/ce8d909ebf64aaaf9c0eebcc2bb33b9c4492ae90/oktoberfest/runner.py#L1238-L1312)
    with some renamed dependencies to avoid conflicts e.g. `re` => `ok_re` (for Oktoberfests rescoring module) to avoid conflicts with the built-in `re` module.

    The function preprocesses the spectra, runs the CE calibration, optionally the
    refinement learning, calculates the features and finally merges the per-file
    tab files into `original.tab` and `rescore.tab` below
    `<output>/results/<fdr_estimation_method>`.

    Arguments
    ---------
    config_path : Union[str, Path]
        Path to the configuration file for Oktoberfest. It is read and
        validated with `Config.read` and `Config.check`.
    """
    config = Config()
    config.read(config_path)
    config.check()

    # load spectra file names
    spectra_files = ok_pp.list_spectra(input_dir=config.spectra, input_format=config.spectra_type)

    proc_dir = config.output / "proc"
    proc_dir.mkdir(parents=True, exist_ok=True)

    spectra_files = _preprocess(spectra_files, config)

    use_job_pool = config.num_threads > 1

    # TODO is this the most elegant way to multi-thread CE calibration before running refinement learning?
    # Should we store the returned libraries and pass them to _calculate_features and _refinement_learn instead of
    # _ce_calib returning cached outputs?
    if use_job_pool:
        processing_pool = JobPool(processes=config.num_threads)
        for spectra_file in spectra_files:
            processing_pool.apply_async(_ce_calib, [spectra_file, config])
        processing_pool.check_pool()
    else:
        for spectra_file in spectra_files:
            _ce_calib(spectra_file, config)

    if config.do_refinement_learning:
        _refinement_learn(spectra_files, config)

    # The model dependent keyword arguments are evaluated per file, exactly as
    # before, in case an earlier step changed the configured model.
    if use_job_pool:
        processing_pool = JobPool(processes=config.num_threads)
        for spectra_file in spectra_files:
            processing_pool.apply_async(
                _calculate_features, [spectra_file, config], **_feature_calculation_kwargs(config)
            )
        processing_pool.check_pool()
    else:
        for spectra_file in spectra_files:
            _calculate_features(spectra_file, config, **_feature_calculation_kwargs(config))

    # prepare rescoring

    fdr_dir = config.output / "results" / config.fdr_estimation_method
    original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files]
    rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files]

    prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original")
    prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit")

    if not prepare_tab_original_step.is_done():
        logging.info("Merging input tab files for rescoring without peptide property prediction")
        ok_re.merge_input(tab_files=original_tab_files, output_file=fdr_dir / "original.tab")
        prepare_tab_original_step.mark_done()

    if not prepare_tab_rescore_step.is_done():
        logging.info("Merging input tab files for rescoring with peptide property prediction")
        ok_re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab")
        prepare_tab_rescore_step.mark_done()
def _replace_modification_names(modified_sequence: str) -> str:
    """Applies OKTOBERFEST_MODIFICATION_REPLACEMENTS, in order, to one modified sequence."""
    for old, new in OKTOBERFEST_MODIFICATION_REPLACEMENTS:
        modified_sequence = modified_sequence.replace(old, new)
    return modified_sequence


def _write_oktoberfest_input(
    psms_file: Path, spectra_file: Path, scan_id_regex: re.Pattern, output_csv: Path
) -> None:
    """Converts the PSMs TSV file into the CSV file used as Oktoberfest's `Internal` search result."""
    psms = psm_utils.io.read_file(psms_file)

    # Necessary columns according to the docs:
    # RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,
    # SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH
    oktoberfest_df = pd.DataFrame()
    oktoberfest_df["RAW_FILE"] = [spectra_file.stem] * len(psms)
    oktoberfest_df["SCAN_NUMBER"] = [get_scan_id(psm.spectrum_id, scan_id_regex) for psm in psms]
    # modification names are rewritten as listed in OKTOBERFEST_MODIFICATION_REPLACEMENTS
    oktoberfest_df["MODIFIED_SEQUENCE"] = [
        _replace_modification_names(psm.peptidoform.modified_sequence) for psm in psms
    ]
    oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms]
    oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms]
    # TODO: Does the psm_utils score mean higher is better?
    oktoberfest_df["SCORE"] = [psm.score for psm in psms]
    oktoberfest_df["REVERSE"] = [psm.is_decoy for psm in psms]
    oktoberfest_df["SEQUENCE"] = [psm.peptidoform.sequence for psm in psms]
    oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms]

    # free up some memory
    del psms

    psms_df = pd.read_csv(psms_file, sep="\t")

    # PROTEINS (not in the docs, but required by Oktoberfest)
    oktoberfest_df["PROTEINS"] = psms_df["protein_list"].apply(
        lambda x: x.replace("[", "").replace("]", "").replace("'", "")
    )  # remove the brackets and quotes

    # adding present rescoring columns
    for rescoring_col in psms_df.columns:
        if rescoring_col.startswith("rescoring:"):
            oktoberfest_df[rescoring_col] = psms_df[rescoring_col]

    # free up some more memory as the dataframe was read from disk again
    del psms_df

    # Filter unsupported amino acids. The explicit copy makes the in-place
    # operations below act on an independent frame instead of a slice.
    psms_len = len(oktoberfest_df)
    oktoberfest_df = oktoberfest_df[
        ~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)
    ].copy()
    if len(oktoberfest_df) < psms_len:
        logging.warning(
            "Removed %i PSMs with unsupported amino acids: %s",
            psms_len - len(oktoberfest_df),
            OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS,
        )

    # Some search engines do not provide protein accessions for decoys.
    # In this case, we set the PROTEINS column to `PEP_<modified sequence>`.
    oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True)
    oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True)
    oktoberfest_df.to_csv(
        output_csv,
        sep=",",
        index=False,
    )


def _write_oktoberfest_config(args: argparse.Namespace, oktoberfest_input_csv_path: Path) -> Path:
    """Writes the Oktoberfest config as `oktoberfest.config.json` into the working directory and returns its path."""
    config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING)

    # misc
    config_dict["output"] = str(args.out_folder)
    config_dict["numThreads"] = 1  # Set to 1 for debugging, can be increased later
    # mass spec parameters
    config_dict["massTolerance"] = args.mass_tolerance
    config_dict["unitMassTolerance"] = args.mass_tolerance_unit
    # prediction params
    config_dict["models"]["irt"] = args.irt_model
    config_dict["models"]["intensity"] = args.intensity_model
    # input params
    config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
    config_dict["inputs"]["search_results_type"] = "Internal"
    config_dict["inputs"]["spectra"] = str(args.spectra_file)
    config_dict["inputs"]["spectra_type"] = "mzml"
    # Setting this to none has the effect, that the generated features
    # are stored in the subfolder `results/none` of the output folder.
    config_dict["fdr_estimation_method"] = "NONE"
    config_dict["quantification"] = False
    config_dict["add_feature_cols"] = "all"

    config_path = Path("oktoberfest.config.json")
    with config_path.open("w", encoding="utf-8") as json_file:
        json.dump(config_dict, json_file)

    return config_path


def _run_feature_generation_with_retries(config_path: Path) -> bool:
    """Runs `feature_generation` up to OKTOBERFEST_RETRIES times and returns True on the first success."""
    for attempt in range(1, OKTOBERFEST_RETRIES + 1):
        try:
            feature_generation(config_path)
        except tritonclient.utils.InferenceServerException as error:
            # NOTE(review): tritonclient exposes the status via a `status()` method;
            # plain attributes are tolerated as well in case this differs between versions.
            status = getattr(error, "status", None)
            if callable(status):
                status = status()
            if str(status) == "504":
                logging.error("Koina server not available, retrying in 10 seconds...")
                sleep(10)
            else:
                # Still retried as before, but no longer swallowed silently.
                logging.error(
                    "Oktoberfest attempt %i of %i failed with a server error: %s",
                    attempt,
                    OKTOBERFEST_RETRIES,
                    error,
                )
        else:
            # Stop after the first successful run instead of repeating the job.
            return True
    return False


def main():
    """
    Generates features for PSM-rescoring using Oktoberfest.
    The rescoring itself is suppressed by setting an unknown FDR estimation method.

    The resulting features can be found as `<out-folder>/results/none/rescore.tab`

    Raises
    ------
    ValueError
        If no scan ID regex was given or a spectrum ID does not yield a scan number.
    SystemExit
        With exit code 101 if the feature generation failed in all
        OKTOBERFEST_RETRIES attempts.
    """

    args = argparse_setup()
    logging.basicConfig(level=logging.INFO)

    if args.scan_id_regex is None:
        raise ValueError("A scan ID regex is required, use `-scan-id-regex`")
    scan_id_regex = re.compile(args.scan_id_regex)

    oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv")
    _write_oktoberfest_input(args.psms_file, args.spectra_file, scan_id_regex, oktoberfest_input_csv_path)

    config_path = _write_oktoberfest_config(args, oktoberfest_input_csv_path)

    if not _run_feature_generation_with_retries(config_path):
        logging.error("Oktoberfest job failed after multiple retries.")
        # same exit code as before, without relying on the `site` builtin `exit`
        raise SystemExit(101)
# Columns to remove from the Oktoberfest feature file.
# E.g. due to wrong type
COLS_TO_REMOVE = [
    "filename"  # str column
]

# Columns to rename in the Oktoberfest feature file.
COLS_TO_RENAME = {
    "SpecId": "id",
}


def argparse_setup() -> argparse.Namespace:
    """
    Creates the argument parser for the Oktoberfest feature to PIN conversion
    script and parses the command line.

    Returns
    -------
    argparse.Namespace
        The parsed arguments `in_file` and `out_file`.
    """

    parser = argparse.ArgumentParser()
    # files
    parser.add_argument(
        "-in-file", help="Input feature TSV file", required=True, type=Path
    )
    parser.add_argument("-out-file", help="Pin file ", required=True, type=Path)

    return parser.parse_args()


def main():
    """
    Converts Oktoberfest feature files to Percolator's PIN file.

    Reads the tab separated feature file, drops the columns listed in
    COLS_TO_REMOVE, renames the columns listed in COLS_TO_RENAME, lower-cases
    all column names and writes the result tab separated to the output file.
    """

    args = argparse_setup()
    logging.basicConfig(level=logging.INFO)

    # feature dataframe
    feature_df = pd.read_csv(args.in_file, sep="\t")

    # Each step handles all affected columns at once instead of one per loop pass.
    # Columns missing in the file are ignored, as before.
    present_cols_to_remove = [col for col in COLS_TO_REMOVE if col in feature_df.columns]
    feature_df = feature_df.drop(columns=present_cols_to_remove)
    feature_df = feature_df.rename(columns=COLS_TO_RENAME)
    feature_df = feature_df.rename(columns=str.lower)

    feature_df.to_csv(
        args.out_file,
        sep="\t",
        index=False,
    )
+ +USER root + +RUN apt update \ + && apt install -y libglib2.0-0 git \ + && apt clean + +USER mambauser +ENV HOME=/home/mambauser +ENV ENV_NAME=oktoberfest + +RUN echo 'show_banner: false' > ~/.mambarc + +RUN micromamba env create -y -f environment.yml \ + && micromamba clean --all --yes + +# TODO: remove build-essential + +USER root +# First is necessary for base_image to actvate the conda environment second is entrypoint +# which adds the python file to PATH +ENTRYPOINT [ "/usr/local/bin/_entrypoint.sh"] diff --git a/docker/oktoberfest/environment.yml b/docker/oktoberfest/environment.yml new file mode 100644 index 0000000..8a46af3 --- /dev/null +++ b/docker/oktoberfest/environment.yml @@ -0,0 +1,10 @@ +name: oktoberfest +channels: + - defaults +dependencies: + - python=3.11 + - pip + - setuptools + - pip: + - -r requirements.txt + diff --git a/docker/oktoberfest/requirements.txt b/docker/oktoberfest/requirements.txt new file mode 100644 index 0000000..fe5f1b5 --- /dev/null +++ b/docker/oktoberfest/requirements.txt @@ -0,0 +1,2 @@ +oktoberfest~=0.10.0 +psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes \ No newline at end of file diff --git a/main.nf b/main.nf index 2127322..8174524 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nextflow.preview.output = true // default python image params.python_image = 'ghcr.io/medbioinf/pipeline-of-identification:latest' +params.oktoberfest_image = 'medbioinf/oktoberfest' // parameters set by the command line params.raw_files = '' diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf index dce4596..87b4953 100644 --- a/src/identification/comet_identification.nf +++ b/src/identification/comet_identification.nf @@ -8,10 +8,12 @@ params.comet_mem = "8 GB" params.comet_psm_id_pattern = "(.*)" params.comet_spectrum_id_pattern = '.*scan=(\\d+)$' +params.comet_scan_id_pattern = '^(?P\\d+)$' include {convert_and_enhance_psm_tsv} from 
'../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Exports the identification using Comet configured by a SDRF files @@ -38,9 +40,11 @@ workflow comet_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: comet_mzids >> 'comet' @@ -49,6 +53,8 @@ workflow comet_identification { pout_files >> 'comet' ms2rescore_pins >> 'comet' ms2rescore_percolator_results >> 'comet' + oktoberfest_pins >> 'comet' + oktoberfest_percolator_results >> 'comet' } diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf index 21bc147..54b9476 100644 --- a/src/identification/maxquant_identification.nf +++ b/src/identification/maxquant_identification.nf @@ -8,10 +8,12 @@ params.maxquant_mem = "32 GB" params.maxquant_psm_id_pattern = "" params.maxquant_spectrum_id_pattern = "" +params.maxquant_scan_id_pattern = "" include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' 
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Executes the identification using MaxQuant @@ -55,17 +57,30 @@ workflow maxquant_identification { spectrum_id_pattern = '.*scan=(\\d+)$' } } + if (params.maxquant_scan_id_pattern) { + scan_id_pattern = params.maxquant_scan_id_pattern + } else{ + // no difference between psm TSVs derived from Bruker and Thermo measurments + scan_id_pattern = '(?P\\d+)' + } if (params.is_timstof) { + // MS2Rescore takes the .d files psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.d' ] } + + // oktoberfest needs the mzML files + psm_tsvs_and_spectra_oktoberfest = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML' ] } } else { psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML' ] } + psm_tsvs_and_spectra_oktoberfest = psm_tsvs_and_spectrafiles } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectra_oktoberfest, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: maxquant_results >> 'maxquant' @@ -74,6 +89,8 @@ workflow maxquant_identification { pout_files >> 'maxquant' ms2rescore_pins >> 'maxquant' 
ms2rescore_percolator_results >> 'maxquant' + oktoberfest_pins >> 'maxquant' + oktoberfest_percolator_results >> 'maxquant' } diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf index 69a8ae7..abbe794 100644 --- a/src/identification/msamanda_identification.nf +++ b/src/identification/msamanda_identification.nf @@ -8,10 +8,12 @@ params.msamanda_mem = "64 GB" params.msamanda_psm_id_pattern = "(.*)" params.msamanda_spectrum_id_pattern = '(.*)' +params.msamanda_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' // msamanda needs explicit "scan=" in the id of a scan (not there in e.g. 
TimsTOF converted mzML data) // 1) ms-convert with "--noindex" @@ -43,9 +45,11 @@ workflow msamanda_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: msamanda_results.msamanda_csv >> 'msamanda' @@ -54,6 +58,8 @@ workflow msamanda_identification { pout_files >> 'msamanda' ms2rescore_pins >> 'msamanda' ms2rescore_percolator_results >> 'msamanda' + oktoberfest_pins >> 'msamanda' + oktoberfest_percolator_results >> 'msamanda' } diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf index ac8fad1..7a21f0b 100644 --- a/src/identification/msfragger_identification.nf +++ b/src/identification/msfragger_identification.nf @@ -10,10 +10,12 @@ params.msfragger_calibrate = 2 params.msfragger_psm_id_pattern = "(.*)" params.msfragger_spectrum_id_pattern = "(.*)" +params.msfragger_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} 
from '../postprocessing/oktoberfest.nf' workflow msfragger_identification { take: @@ -37,9 +39,11 @@ workflow msfragger_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_scan_id_pattern) - // // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: fragger_results_pepxml >> 'msfragger' @@ -48,6 +52,8 @@ workflow msfragger_identification { pout_files >> 'msfragger' ms2rescore_pins >> 'msfragger' ms2rescore_percolator_results >> 'msfragger' + oktoberfest_pins >> 'msfragger' + oktoberfest_percolator_results >> 'msfragger' } diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf index 2429ab8..489855c 100644 --- a/src/identification/msgfplus_identification.nf +++ b/src/identification/msgfplus_identification.nf @@ -16,10 +16,12 @@ params.msgfplus_split_fasta = 0 // split the fasta into this many chunks params.msgfplus_psm_id_pattern = "(.*)" params.msgfplus_spectrum_id_pattern = '(.*)' +params.msgfplus_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} 
from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' include {split_mzml_into_chunks} from '../preprocess/convert_to_mzml.nf' @@ -91,9 +93,11 @@ workflow msgfplus_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: fasta_merged_results.map{ it -> it[1] } >> 'msgfplus' @@ -102,6 +106,8 @@ workflow msgfplus_identification { pout_files >> 'msgfplus' ms2rescore_pins >> 'msgfplus' ms2rescore_percolator_results >> 'msgfplus' + oktoberfest_pins >> 'msgfplus' + oktoberfest_percolator_results >> 'msgfplus' } process identification_with_msgfplus { diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf index 5654799..8a126d8 100644 --- a/src/identification/sage_identification.nf +++ b/src/identification/sage_identification.nf @@ -10,10 +10,12 @@ params.sage_prefilter_chunk_size = 0 params.sage_psm_id_pattern = "(.*)" params.sage_spectrum_id_pattern = '(.*)' +params.sage_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from 
'../postprocessing/percolator.nf' include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Executes the identification using Sage @@ -49,9 +51,11 @@ workflow sage_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: return_files >> 'sage' @@ -60,6 +64,8 @@ workflow sage_identification { pout_files >> 'sage' ms2rescore_pins >> 'sage' ms2rescore_percolator_results >> 'sage' + oktoberfest_pins >> 'sage' + oktoberfest_percolator_results >> 'sage' } diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf index 2823f95..38c6459 100644 --- a/src/identification/xtandem_identification.nf +++ b/src/identification/xtandem_identification.nf @@ -8,10 +8,12 @@ params.xtandem_mem = "128 GB" params.xtandem_psm_id_pattern = "(.*)" params.xtandem_spectrum_id_pattern = '(.*)' +params.xtandem_scan_id_pattern = '.*scan=(?P\\d+)$' include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf' -include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf' +include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf' include {ms2rescore_workflow} from 
'../postprocessing/ms2rescore.nf' +include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf' /** * Exports the identification using Comet configured by a SDRF files @@ -39,9 +41,11 @@ workflow xtandem_identification { psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML' ] } ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem') + oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_scan_id_pattern) - // perform percolation on MS2Rescore results + // perform percolation ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins) + oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins) publish: tandem_xmls >> 'xtandem' @@ -50,6 +54,8 @@ workflow xtandem_identification { pout_files >> 'xtandem' ms2rescore_pins >> 'xtandem' ms2rescore_percolator_results >> 'xtandem' + oktoberfest_pins >> 'xtandem' + oktoberfest_percolator_results >> 'xtandem' } /** diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf new file mode 100644 index 0000000..28e9165 --- /dev/null +++ b/src/postprocessing/oktoberfest.nf @@ -0,0 +1,103 @@ +nextflow.enable.dsl=2 + +// parameters for oktoberfest +params.oktoberfest_memory = "64 GB" +params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD" +params.oktoberfest_irt_model = "Prosit_2019_irt" +params.oktoberfest_forks = 1 // have some mercy with the koina servers + +/** + * Runs oktoberfest rescoring for the given PSMs and mzML files. + * + * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs. + * @param psm_tsvs: The PSM TSV files. + * @param mzmls: The mzML files. 
/**
 * Generates Oktoberfest features for the given PSMs and mzML files and
 * converts them into PIN files.
 *
 * The workflow chains `run_oktoberfest_feature_gen` and `oktoberfest_features_to_pin`.
 * The fragment tolerance is taken from `params.fragment_tol_da`.
 *
 * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
 * @param psm_tsvs: The PSM TSV files.
 * @param mzmls: The mzML files.
 * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID.
 *
 * @return: oktoberfest_pins, the output of `oktoberfest_features_to_pin` (one `.oktoberfest.pin` file per feature file).
 */
workflow oktoberfest_rescore_workflow {
    take:
    psm_tsvs_and_mzmls
    psm_tsvs
    mzmls
    scan_id_regex

    main:
    oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da, scan_id_regex)
    oktoberfest_pins = oktoberfest_features_to_pin(oktoberfest_features)


    emit:
    oktoberfest_pins
}

/**
 * Runs `oktoberfest_feature_gen.py` for one PSM file and its spectrum file.
 *
 * The script's `results/none/rescore.tab` is moved to `<psm file name>.features.tsv`
 * and the Oktoberfest output folder is removed afterwards.
 *
 * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
 * @param psm_tsvs: The PSM TSV files.
 * @param mzmls: The mzML files.
 * @param fragment_tol_da: The fragment tolerance, passed to the script as `-mass-tolerance` with the unit `da`.
 * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID.
 *
 * @return The Oktoberfest feature file `<psm file name>.features.tsv`.
 */
process run_oktoberfest_feature_gen {
    cpus 1
    // limits the number of parallel instances of this process
    maxForks params.oktoberfest_forks
    memory { params.oktoberfest_memory }

    container { params.oktoberfest_image }

    input:
    // file names only; the files themselves are staged by the two path inputs below
    tuple val(psm_utils_tsvs), val(mzml_for_psms)
    path psm_tsvs
    path mzmls
    val fragment_tol_da
    val scan_id_regex

    output:
    path "${psm_utils_tsvs}.features.tsv"

    script:
    """
    oktoberfest_feature_gen.py \
        -out-folder ./oktoberfest_out \
        -psms-file ${psm_utils_tsvs} \
        -spectra-file ${mzml_for_psms} \
        -intensity-model ${params.oktoberfest_intensity_model} \
        -irt-model ${params.oktoberfest_irt_model} \
        -mass-tolerance ${fragment_tol_da} \
        -mass-tolerance-unit da \
        -scan-id-regex '${scan_id_regex}' \

    mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv"

    # Clean up the output directory
    rm -r oktoberfest_out
    """
}
/**
 * Converts one Oktoberfest feature file with `oktoberfest_feature_to_pin.py`.
 *
 * @param okt_features_tsv: Oktoberfest feature file.
 *
 * @return Oktoberfest feature file in PIN format ready to use with percolator,
 *         named `<base name of the feature file>.oktoberfest.pin`.
 */
process oktoberfest_features_to_pin {
    cpus 1
    memory { params.oktoberfest_memory }

    container { params.oktoberfest_image }

    input:
    path okt_features_tsv

    output:
    path "${okt_features_tsv.baseName}.oktoberfest.pin"

    script:
    """
    oktoberfest_feature_to_pin.py \
        -in-file ${okt_features_tsv} \
        -out-file ./${okt_features_tsv.baseName}.oktoberfest.pin
    """
}