Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b5eef08
implement feature creation using oktoberfest
di-hardt Jul 11, 2025
a6bd038
removing todo about optional feature
di-hardt Jul 14, 2025
ecc3bbd
fix argument name
di-hardt Jul 14, 2025
45b6838
finalize arguments
di-hardt Jul 14, 2025
9ecd817
free up some memory
di-hardt Jul 14, 2025
89b8a3e
fix typos
di-hardt Jul 14, 2025
eeffc19
integration into workflow
di-hardt Jul 14, 2025
9e7dec6
build separate container for oktoberfest
di-hardt Jul 15, 2025
3611bbf
fix comment
di-hardt Jul 15, 2025
d800c75
fix filenames
di-hardt Jul 15, 2025
8d9fc9f
fix params
di-hardt Jul 15, 2025
7536143
import missing workflow
di-hardt Jul 15, 2025
014825d
correcting names
di-hardt Jul 15, 2025
d2e0111
fix variable name
di-hardt Jul 16, 2025
cf1a3f0
ignoring result folder
di-hardt Jul 16, 2025
946ab73
use regex to get scan id
di-hardt Jul 16, 2025
ea08484
implement retry on koina server error
di-hardt Jul 16, 2025
5c12737
limit max forks of oktoberfest feature generation
di-hardt Jul 16, 2025
66c85ed
select specific spectrum file instead of general folder
di-hardt Jul 17, 2025
173521e
fix missing proteins accession and attribute filters
di-hardt Jul 18, 2025
4e58ba1
fix column
di-hardt Jul 18, 2025
8cf6a20
fix issues with PSM at last amino acid in sequence
di-hardt Jul 21, 2025
9ff4dc6
add doc string
di-hardt Jul 21, 2025
62adfa8
copy parts of Oktoberfest's code for a more stable use
di-hardt Jul 22, 2025
22cf67a
get rid of deprecation warning
di-hardt Jul 22, 2025
2a1c86f
correct comment
di-hardt Jul 22, 2025
760cc29
remove unnecessary filter
di-hardt Jul 22, 2025
6673045
typo fixes
julianu Sep 1, 2025
f5a70db
fixing typo in regex for spectra
julianu Sep 2, 2025
092dc62
fixingf usage of d. folder only for maxquant tims data
julianu Sep 3, 2025
326dec3
always use mzML for oktoberfest (also for timsTOF)
julianu Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
.nextflow/
.nextflow.log*
work/

# workflow results
results/


# binary
nextflow
trace*
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
docker-imgs:
docker pull ghcr.io/medbioinf/pipeline-of-identification:latest

docker pull proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef
docker pull quay.io/medbioinf/tdf2mzml:0.4
docker pull quay.io/medbioinf/openms:3.4.1
Expand All @@ -17,3 +17,5 @@ docker-imgs:
docker pull ghcr.io/percolator/percolator:branch-3-08

docker build --platform linux/amd64 -t medbioinf/msfragger -f docker/msfragger/Dockerfile docker/msfragger/.

docker build --platform linux/amd64 -t medbioinf/oktoberfest:latest -f docker/oktoberfest/Dockerfile docker/oktoberfest
363 changes: 363 additions & 0 deletions bin/oktoberfest_feature_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,363 @@
#!/usr/bin/env python

"""
Generates features for PSM-rescoring using Oktoberfest.
The rescoring itself is suppresed by setting an unknown FDR estimation method.
"""

import argparse
import copy
from functools import reduce
import json
import logging
from pathlib import Path
import re
from time import sleep
from typing import Union

import oktoberfest as ok
from oktoberfest.runner import _preprocess, _ce_calib, _refinement_learn, _calculate_features
from oktoberfest.utils import Config, JobPool, ProcessStep
from oktoberfest import rescore as ok_re
from oktoberfest import preprocessing as ok_pp
import pandas as pd
import psm_utils
import psm_utils.io
import tritonclient


OKTOBERFEST_RETRIES = 5
"""Number of retries for the oktberfest job in case of a server error."""

OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS = {"U", "O"}
"""According to documentation, Oktoberfest does not support the amino acids U and O."""

OKTOBERFEST_MODIFICATION_REPLACEMENTS = [
("C[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"),
("C-[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"), # MaxQant-specific if C is last in sequence, PSM is annotated with C- instead of C
("M[UNIMOD:Oxidation]", "M[UNIMOD:35]"),
("M-[UNIMOD:Oxidation]", "M[UNIMOD:35]"), # MaxQant-specific if M is last in sequence, PSM is annotated with M- instead of M
]

def parse_str_bool(value: str) -> bool:
"""
Parses a string argument to a boolean value.

Arguments
---------
value : str
The string value to parse. Accepts 'true', 'false', '1', '

Returns
-------
bool
Returns True for 'true' or '1', and False for 'false' or '

Raises
------
ValueError
If the value is not a valid boolean representation.
"""
if value.lower() in ['true', '1']:
return True
elif value.lower() in ['false', '0']:
return False
else:
raise ValueError(f"Invalid boolean value: {value}")

def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int:
"""
Using the provided regex to extract the scan number from the spectrum ID.
Arguments
---------
spectrum_id : str
The spectrum ID from which to extract the scan number.
scan_id_regex : re.Pattern
A compiled regular expression pattern to match the scan number.

Returns
-------
int
The extracted scan number as an integer.
"""
match = scan_id_regex.match(spectrum_id)
if not match:
raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}")
return int(match.group("scan_id"))

def argparse_setup() -> argparse.Namespace:
"""
Creates the argument parser for the Oktoberfest feature generation script.
"""

parser = argparse.ArgumentParser()
# files
parser.add_argument(
"-psms-file", help="Input PSMs TSV file", required=True, type=Path
)
parser.add_argument(
"-spectra-file",
help="Corresponding spectrum file for PSMs file",
required=True,
type=Path,
)

# prediction
parser.add_argument("-intensity-model", help="Koina intensity model", type=str)
parser.add_argument("-irt-model", help="Koina IRT model", type=str)

# mass spec parameters
parser.add_argument(
"-mass-tolerance",
help="Defines the allowed tolerance between theoretical and experimentally observered fragment mass during peak annotation; default = 20 (FTMS), 40 (TOF), 0.35 (ITMS)",
default=20,
type=float,
)
parser.add_argument(
"-mass-tolerance-unit",
help="Defines the measure of tolerance, either “da” or “ppm”; default = da (mass analyzer is ITMS), ppm (mass analyzer is FTMS or TOF)",
type=str,
choices=["da", "ppm"],
)

parser.add_argument(
"-scan-id-regex",
help=(
"Regular expression to extract the scan number from the spectrum ID."
"Use `scan_id` for the matching group, e.g. `scan=(?P<scan_id>\\d+)`)"
),
type=str,
)

parser.add_argument(
"-out-folder", help="Output folder for ", required=True, type=Path
)

return parser.parse_args()


def feature_generation(config_path: Union[str, Path]):
"""
Parts of Oktoberfest's [run_rescore-function without the rescoring step](https://github.com/wilhelm-lab/oktoberfest/blob/ce8d909ebf64aaaf9c0eebcc2bb33b9c4492ae90/oktoberfest/runner.py#L1238-L1312)
with some renamed dependencies to avoid conflicts e.g. `re` => `ok_re` (for Oktoberfests rescoring module) to avoid conflicts with the built-in `re` module.

Arguments
---------
config_path : Union[str, Path]
Path to the configuration file for Oktoberfest. This file should contain all necessary parameters for the
"""
config = Config()
config.read(config_path)
config.check()

# load spectra file names
spectra_files = ok_pp.list_spectra(input_dir=config.spectra, input_format=config.spectra_type)

proc_dir = config.output / "proc"
proc_dir.mkdir(parents=True, exist_ok=True)

spectra_files = _preprocess(spectra_files, config)

# TODO is this the most elegant way to multi-thread CE calibration before running refinement learning?
# Should we store the returned libraries and pass them to _calculate_features and _refinement_learn instead of
# _ce_calib returning cached outputs?
if config.num_threads > 1:
processing_pool = JobPool(processes=config.num_threads)
for spectra_file in spectra_files:
_ = processing_pool.apply_async(_ce_calib, [spectra_file, config])
processing_pool.check_pool()
else:
for spectra_file in spectra_files:
_ = _ce_calib(spectra_file, config)

if config.do_refinement_learning:
_refinement_learn(spectra_files, config)

if config.num_threads > 1:
processing_pool = JobPool(processes=config.num_threads)
for spectra_file in spectra_files:
if "xl" in config.models["intensity"].lower():
if "cms2" in config.models["intensity"].lower():
cms2 = True
else:
cms2 = False
processing_pool.apply_async(_calculate_features, [spectra_file, config], xl=True, cms2=cms2)
else:
processing_pool.apply_async(_calculate_features, [spectra_file, config])
processing_pool.check_pool()
else:
for spectra_file in spectra_files:
if "xl" in config.models["intensity"].lower():
if "cms2" in config.models["intensity"].lower():
cms2 = True
else:
cms2 = False
_calculate_features(spectra_file, config, xl=True, cms2=cms2)
else:
_calculate_features(spectra_file, config)

# prepare rescoring

fdr_dir = config.output / "results" / config.fdr_estimation_method
original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files]
rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files]

prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original")
prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit")

if not prepare_tab_original_step.is_done():
logging.info("Merging input tab files for rescoring without peptide property prediction")
ok_re.merge_input(tab_files=original_tab_files, output_file=fdr_dir / "original.tab")
prepare_tab_original_step.mark_done()

if not prepare_tab_rescore_step.is_done():
logging.info("Merging input tab files for rescoring with peptide property prediction")
ok_re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab")
prepare_tab_rescore_step.mark_done()


def main():
"""
Generates features for PSM-rescoring using Oktoberfest.
The rescoring itself is suppresed by setting an unknown FDR estimation method.

The resulting features can be found as `<output_folder>/none/rescore.tab`
"""

args = argparse_setup()
logging.basicConfig(level=logging.INFO)
scan_id_regex = re.compile(args.scan_id_regex)

oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv")

psms = psm_utils.io.read_file(args.psms_file)

# Necessary columns according to the docs:
# RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,
# SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH
oktoberfest_df = pd.DataFrame()

# RAW_FILE,
oktoberfest_df["RAW_FILE"] = [args.spectra_file.stem] * len(psms)

# SCAN_NUMBER
oktoberfest_df["SCAN_NUMBER"] = [get_scan_id(psm.spectrum_id, scan_id_regex) for psm in psms]

# MODIFIED_SEQUENCE
oktoberfest_df["MODIFIED_SEQUENCE"] = [
psm.peptidoform.modified_sequence for psm in psms
]
# sequentially apply the replacements using functools.reduce
oktoberfest_df["MODIFIED_SEQUENCE"] = oktoberfest_df["MODIFIED_SEQUENCE"].apply(
lambda seq: reduce(
lambda seq_x, repl: seq_x.replace(repl[0], repl[1]), # lambda to replace
OKTOBERFEST_MODIFICATION_REPLACEMENTS, # replacements
seq # starting with the sequences as stated in the PSMs file
)
)

# PRECURSOR_CHARGE
oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms]

# MASS
oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms]

# SCORE
# TODO: Does the psmutil score means higher is better?
oktoberfest_df["SCORE"] = [psm.score for psm in psms]

# REVERSE
oktoberfest_df["REVERSE"] = [psm.is_decoy for psm in psms]

# SEQUENCE
oktoberfest_df["SEQUENCE"] = [psm.peptidoform.sequence for psm in psms]

# PEPTIDE_LENGTH
oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms]

# free up some memory
del psms

psms_df = pd.read_csv(args.psms_file, sep="\t")

# PROTEINS (not in the docs, but required by oktberfest)
oktoberfest_df["PROTEINS"] = psms_df["protein_list"].apply(
lambda x: x.replace("[", "").replace("]", "").replace("'", "")
) # remove the brackets and quotes

# adding present rescoring columns
present_rescoring_cols = list(
filter(lambda x: x.startswith("rescoring:"), psms_df.columns)
)
for rescoring_col in present_rescoring_cols:
oktoberfest_df[rescoring_col] = psms_df[rescoring_col]

# free up some more memory as the dataframe is read fromt disk again
del psms_df

# Filter unsupported amino acids
psms_len = len(oktoberfest_df)
oktoberfest_df = oktoberfest_df[~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)]
if len(oktoberfest_df) < psms_len:
logging.warning(
"Removed %i PSMs with unsupported amino acids: %s", psms_len - len(oktoberfest_df), OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS
)

# Some search engines do not provide protein accessions for decoys.
# In this case, we set the PROTEINS column to the `PEP_<MODIFIED_SEQUENCE>` like in the ms2rescore.
oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True)
oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True)
oktoberfest_df.to_csv(
oktoberfest_input_csv_path,
sep=",",
index=False,
)

# free up more memory
del oktoberfest_df

# create the config file
config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING)

# misc
config_dict["output"] = str(args.out_folder)
config_dict["numThreads"] = 1 # Set to 1 for debugging, can be increased later
# mass spec parameters
config_dict["massTolerance"] = args.mass_tolerance
config_dict["unitMassTolerance"] = args.mass_tolerance_unit
# predicition params
config_dict["models"]["irt"] = args.irt_model
config_dict["models"]["intensity"] = args.intensity_model
# input params
config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
config_dict["inputs"]["search_results_type"] = "Internal"
config_dict["inputs"]["spectra"] = str(args.spectra_file)
config_dict["inputs"]["spectra_type"] = "mzml"
# Setting this to none has the effect, that the generated features
# are stored in the subfolder `results/none` of the output folder.
config_dict["fdr_estimation_method"] = "NONE"
config_dict["quantification"] = False
config_dict["add_feature_cols"] = "all"

config_path = Path("oktoberfest.config.json")

with config_path.open("w", encoding="utf-8") as json_file:
json_file.write(json.dumps(config_dict))

is_successfull = False
for _ in range(OKTOBERFEST_RETRIES):
try:
feature_generation(config_path)
is_successfull = True
except tritonclient.utils.InferenceServerException as e:
if str(e.status) == "504":
logging.error("Koina server not available, retrying in 10 seconds...")
sleep(10)

if not is_successfull:
logging.error("Oktoberfest job failed after multiple retries.")
exit(101)


if __name__ == "__main__":
main()
Loading