From b5eef0866ea8a130c014e0167d55a8f4a280ce16 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Fri, 11 Jul 2025 12:29:01 +0000
Subject: [PATCH 01/31] implement feature creation using oktoberfest

---
 bin/oktoberfest_feature_gen.py    | 196 ++++++++++++++++++++++++++++++
 requirements.txt                  |   3 +-
 src/postprocessing/oktoberfest.nf |  75 ++++++++++++
 3 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100755 bin/oktoberfest_feature_gen.py
 create mode 100644 src/postprocessing/oktoberfest.nf

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
new file mode 100755
index 0000000..7fab7a0
--- /dev/null
+++ b/bin/oktoberfest_feature_gen.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+
+"""
+Generates features for PSM-rescoring using Oktoberfest.
+The rescoring itself is suppresed by setting an unknown FDR estimation method.s
+"""
+
+import argparse
+import copy
+import json
+import logging
+from pathlib import Path
+
+import oktoberfest as ok
+from oktoberfest import runner as ok_runner
+import pandas as pd
+import psm_utils
+import psm_utils.io
+
+OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
+"""Oktoberfest's error message for unknown FDR estimation methods.
+There is a typo in the oktberfest code, as it is not substituting the f-string correctly.
+"""
+
+
+def argparse_setup() -> argparse.Namespace:
+    """
+    Creates the argument parser for the Oktoberfest feature generation script.
+    """
+
+    parser = argparse.ArgumentParser()
+    # files
+    parser.add_argument(
+        "-psms-file", help="Input PSMs TSV file", required=True, type=Path
+    )
+    parser.add_argument(
+        "-spectra-file",
+        help="Corresponding spectrum file for PSMs file",
+        required=True,
+        type=Path,
+    )
+
+    # prediction
+    parser.add_argument("-intensity-model", help="Koina intensity model", type=str)
+    parser.add_argument("-irt-model", help="Koina IRT model", type=str)
+
+    # mass spec parameters
+    parser.add_argument(
+        "-mass-tolerance",
+        help="Defines the allowed tolerance between theoretical and experimentally observered fragment mass during peak annotation; default = 20 (FTMS), 40 (TOF), 0.35 (ITMS)",
+        default=20,
+        type=float,
+    )
+    parser.add_argument(
+        "-mass-tolerance-unit",
+        help="Defines the measure of tolerance, either “da” or “ppm”; default = da (mass analyzer is ITMS), ppm (mass analyzer is FTMS or TOF)",
+        type=str,
+        choices=["da", "ppm"],
+    )
+
+    parser.add_argument(
+        "-spectra-file-type",
+        help=".d|raw|mzml",
+        type=str,
+        choices=["d", "raw", "mzml"],
+        default="mzml",
+    )
+
+    parser.add_argument(
+        "-out-folder", help="Output folder for ", required=True, type=Path
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """
+    Generates features for PSM-rescoring using Oktoberfest.
+    The rescoring itself is suppresed by setting an unknown FDR estimation method.
+
+    The esulting features can be found as `<output_folder>/none/rescore.tab`
+    """
+
+    args = argparse_setup()
+    logging.basicConfig(level=logging.INFO)
+
+    oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv")
+
+    psms = psm_utils.io.read_file(args.psms_file)
+
+    # Necessary columns according to the docs:
+    # RAW_FILE,SCAN_NUMBER,MODIFIED_SEQUENCE,PRECURSOR_CHARGE,
+    # SCAN_EVENT_NUMBER,MASS,SCORE,REVERSE,SEQUENCE,PEPTIDE_LENGTH
+    oktoberfest_df = pd.DataFrame()
+
+    # RAW_FILE,
+    oktoberfest_df["RAW_FILE"] = [args.mzml_file.stem] * len(psms)
+
+    # SCAN_NUMBER
+    oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms]
+
+    # MODIFIED_SEQUENCE
+    oktoberfest_df["MODIFIED_SEQUENCE"] = [
+        psm.peptidoform.modified_sequence for psm in psms
+    ]
+    oktoberfest_df["MODIFIED_SEQUENCE"] = oktoberfest_df["MODIFIED_SEQUENCE"].apply(
+        lambda x: x.replace("[UNIMOD:Carbamidomethyl]", "[UNIMOD:4]").replace(
+            "[UNIMOD:Oxidation]", "[UNIMOD:35]"
+        )
+    )
+
+    # PRECURSOR_CHARGE
+    oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms]
+
+    # SCAN_EVENT_NUMBER
+    # TODO: Some search engines do not provide this information, skipt it entirely?
+
+    # MASS
+    oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms]
+
+    # SCORE
+    # TODO: Does the psmutil score means higher is better?
+    oktoberfest_df["SCORE"] = [psm.score for psm in psms]
+
+    # REVERSE
+    oktoberfest_df["REVERSE"] = [psm.is_decoy for psm in psms]
+
+    # SEQUENCE
+    oktoberfest_df["SEQUENCE"] = [psm.peptidoform.sequence for psm in psms]
+
+    # PEPTIDE_LENGTH
+    oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms]
+
+    del psms
+
+    psms_df = pd.read_csv(args.psms_file, sep="\t")
+
+    # PROTEINS (not in the docs, but required by oktberfest)
+    oktoberfest_df["PROTEINS"] = psms_df["protein_list"].apply(
+        lambda x: x.replace("[", "").replace("]", "").replace("'", "")
+    )  # remove the brackets and quotes
+
+    # adding present rescoring columns
+    present_rescoring_cols = list(
+        filter(lambda x: x.startswith("rescoring:"), psms_df.columns)
+    )
+    for rescoring_col in present_rescoring_cols:
+        oktoberfest_df[rescoring_col] = psms_df[rescoring_col]
+
+    oktoberfest_df.to_csv(
+        oktoberfest_input_csv_path,
+        sep=",",
+        index=False,
+    )
+
+    # create the config file
+    config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING)
+
+    # misc
+    config_dict["output"] = str(args.out_folder)
+    config_dict["num_threads"] = 1  # Set to 1 for debugging, can be increased later
+    # mass spec parameters
+    config_dict["mass_tolerance"] = args.mass_tolerance
+    config_dict["unitMassTolerance"] = args.mass_tolerance_unit
+    # predicition params
+    config_dict["models"]["irt_model"] = args.irt_model
+    config_dict["models"]["intensity_model"] = args.intensity_model
+    # input params
+    config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
+    config_dict["inputs"]["search_results_type"] = "Internal"
+    config_dict["inputs"]["spectra"] = "./"
+    config_dict["inputs"]["spectra_type"] = args.spectra_file_type
+    # resocring params
+    # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring
+    # by raising a ValueError which we can catch later.
+    # This has the effect, that the generated features
+    # are stored in the subfolder `results/none` of the output folder.
+    config_dict["fdr_estimation_method"] = "NONE"
+    config_dict["quantification"] = False
+    config_dict["add_feature_cols"] = "all"
+
+    config_path = Path("oktoberfest.config.json")
+
+    with config_path.open("w", encoding="utf-8") as json_file:
+        json_file.write(json.dumps(config_dict))
+
+    try:
+        ok_runner.run_job(config_path)
+    except ValueError as e:
+        # Catch the specific ValueError raised when "NONE" is used as fdr_estimation
+        if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD:
+            raise e
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 1f01b16..3ff7f46 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@
 psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes
 #mokapot~=0.10.0
 ms2rescore_rs @ git+https://github.com/di-hardt/ms2rescore-rs.git@70c15002a9f065ea2cd01a9a9a95b8bcff762f53
-ms2rescore~=3.1.5
\ No newline at end of file
+ms2rescore~=3.1.5
+oktoberfest~=0.10.0
\ No newline at end of file
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
new file mode 100644
index 0000000..75bfee1
--- /dev/null
+++ b/src/postprocessing/oktoberfest.nf
@@ -0,0 +1,75 @@
+nextflow.enable.dsl=2
+
+// parameters for oktoberfest
+params.oktoberfest_memory = "64 GB"
+params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD"
+params.oktoberfest_irt_mode = "Prosit_2019_irt"
+
+/**
+ * Runs oktoberfest rescoring for the given PSMs and mzML files.
+ * 
+ * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
+ * @param psm_tsvs: The PSM TSV files.
+ * @param mzmls: The mzML files. 
+ * @param fragment_tol: The fragment tolerance for the rescoring.
+ * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton).
+ *
+ * @return: The oktoberfest rescored PSMs in TSV format.
+ */
+workflow oktoberfest_rescore_workflow {
+    take:
+    psm_tsvs_and_mzmls
+    psm_tsvs
+    mzmls
+    fragment_tol
+    fragment_tol_unit
+
+    main:
+    oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, fragment_tol, fragment_tol_unit)
+
+    emit:
+    oktoberfest_pins
+}
+
+/**
+ * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
+ * @param psm_tsvs: The PSM TSV files.
+ * @param mzmls: The mzML files. 
+ * @param fragment_tol: The fragment tolerance for the rescoring.
+ * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton).
+ * 
+ * @return: The oktoberfest rescored PSMs in TSV format.
+ */
+process run_oktoberfest_feature_gen {
+    cpus 1
+    memory { params.oktoberfest_memory }
+
+    container { params.python_image }
+
+    input:
+    tuple val(psm_utils_tsvs), val(mzml_for_psms)
+    path psm_tsvs
+    path mzmls
+    val fragment_tol
+    val fragment_tol_unit
+    
+    output:
+    path "oktoberfest.features.tsv"
+    
+    script:
+    """
+    oktoberfest_feature_gen.py \
+        -out-folder ./oktoberfest_out \
+        -psms-file ${psm_utils_tsvs} \
+        -spectra-file ${mzml_for_psms} \
+        -intensity-model ${params.oktoberfest_intensity_model} \
+        -irt-model ${params.oktoberfest_irt_mode} \
+        -mass-tolerance ${fragment_tol} \
+        -mass-tolerance-unit da
+
+    mv ./oktoberfest_out/results/none/rescore.tab oktoberfest.features.tsv
+
+    // Clean up the output directory
+    rm -r oktoberfest_out
+    """
+}

From a6bd038c448fb3a85d96df9f5701c9c5098decbd Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 12:54:26 +0000
Subject: [PATCH 02/31] removing todo about optional feature

---
 bin/oktoberfest_feature_gen.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 7fab7a0..4d26837 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -112,9 +112,6 @@ def main():
     # PRECURSOR_CHARGE
     oktoberfest_df["PRECURSOR_CHARGE"] = [psm.get_precursor_charge() for psm in psms]
 
-    # SCAN_EVENT_NUMBER
-    # TODO: Some search engines do not provide this information, skipt it entirely?
-
     # MASS
     oktoberfest_df["MASS"] = [psm.peptidoform.theoretical_mass for psm in psms]
 

From ecc3bbdf4786dcb53ac3d3303bc239c05ba3f722 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 12:55:06 +0000
Subject: [PATCH 03/31] fix argument name

---
 bin/oktoberfest_feature_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 4d26837..e18597c 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -94,7 +94,7 @@ def main():
     oktoberfest_df = pd.DataFrame()
 
     # RAW_FILE,
-    oktoberfest_df["RAW_FILE"] = [args.mzml_file.stem] * len(psms)
+    oktoberfest_df["RAW_FILE"] = [args.spectra_file.stem] * len(psms)
 
     # SCAN_NUMBER
     oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms]

From 45b6838e4ed0c101cb30b11f946048953fe40fcf Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 12:55:31 +0000
Subject: [PATCH 04/31] finalize arguments

---
 bin/oktoberfest_feature_gen.py    | 37 +++++++++++++++++++++++++------
 src/postprocessing/oktoberfest.nf | 25 +++++++++------------
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index e18597c..24644da 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -22,6 +22,31 @@
 There is a typo in the oktberfest code, as it is not substituting the f-string correctly.
 """
 
+def parse_str_bool(value: str) -> bool:
+    """
+    Parses a string argument to a boolean value.
+
+    Arguments
+    ---------
+    value : str
+        The string value to parse. Accepts 'true', 'false', '1', '0' (case-insensitive).
+
+    Returns
+    -------
+    bool
+        Returns True for 'true' or '1', and False for 'false' or '0'.
+
+    Raises
+    ------
+    ValueError
+        If the value is not a valid boolean representation.
+    """
+    if value.lower() in ['true', '1']:
+        return True
+    elif value.lower() in ['false', '0']:
+        return False
+    else:
+        raise ValueError(f"Invalid boolean value: {value}")
 
 def argparse_setup() -> argparse.Namespace:
     """
@@ -59,11 +84,9 @@ def argparse_setup() -> argparse.Namespace:
     )
 
     parser.add_argument(
-        "-spectra-file-type",
-        help=".d|raw|mzml",
-        type=str,
-        choices=["d", "raw", "mzml"],
-        default="mzml",
+        "-is-timstof",
+        help="If true, the spectra file type is set to 'd' (for timsTOF); otherwise, it defaults to 'mzml'",
+        type=parse_str_bool
     )
 
     parser.add_argument(
@@ -166,8 +189,8 @@ def main():
     config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
     config_dict["inputs"]["search_results_type"] = "Internal"
     config_dict["inputs"]["spectra"] = "./"
-    config_dict["inputs"]["spectra_type"] = args.spectra_file_type
-    # resocring params
+    config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml"
+    # rescoring params
     # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring
     # by raising a ValueError which we can catch later.
     # This has the effect, that the generated features
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 75bfee1..1d034e5 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -11,8 +11,6 @@ params.oktoberfest_irt_mode = "Prosit_2019_irt"
  * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
  * @param psm_tsvs: The PSM TSV files.
  * @param mzmls: The mzML files. 
- * @param fragment_tol: The fragment tolerance for the rescoring.
- * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton).
  *
  * @return: The oktoberfest rescored PSMs in TSV format.
  */
@@ -21,11 +19,9 @@ workflow oktoberfest_rescore_workflow {
     psm_tsvs_and_mzmls
     psm_tsvs
     mzmls
-    fragment_tol
-    fragment_tol_unit
 
     main:
-    oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, fragment_tol, fragment_tol_unit)
+    oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da)
 
     emit:
     oktoberfest_pins
@@ -35,10 +31,9 @@ workflow oktoberfest_rescore_workflow {
  * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
  * @param psm_tsvs: The PSM TSV files.
  * @param mzmls: The mzML files. 
- * @param fragment_tol: The fragment tolerance for the rescoring.
- * @param fragment_tol_unit: The unit of the fragment tolerance (e.g., "da" for Dalton).
+ * @param fragment_tol_da: The fragment tolerance for the rescoring.
  * 
- * @return: The oktoberfest rescored PSMs in TSV format.
+ * @return The oktoberfest rescored PSMs in TSV format.
  */
 process run_oktoberfest_feature_gen {
     cpus 1
@@ -50,11 +45,10 @@ process run_oktoberfest_feature_gen {
     tuple val(psm_utils_tsvs), val(mzml_for_psms)
     path psm_tsvs
     path mzmls
-    val fragment_tol
-    val fragment_tol_unit
+    val fragment_tol_da
     
     output:
-    path "oktoberfest.features.tsv"
+    path "${psm_utils_tsvs.baseName}.features.tsv"
     
     script:
     """
@@ -63,11 +57,12 @@ process run_oktoberfest_feature_gen {
         -psms-file ${psm_utils_tsvs} \
         -spectra-file ${mzml_for_psms} \
         -intensity-model ${params.oktoberfest_intensity_model} \
-        -irt-model ${params.oktoberfest_irt_mode} \
-        -mass-tolerance ${fragment_tol} \
-        -mass-tolerance-unit da
+        -irt-model ${params.oktoberfest_irt_modeö} \
+        -mass-tolerance ${fragment_tol_da} \
+        -mass-tolerance-unit da \
+        -is-timstof ${params.is_timstof} \
 
-    mv ./oktoberfest_out/results/none/rescore.tab oktoberfest.features.tsv
+    mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv"
 
     // Clean up the output directory
     rm -r oktoberfest_out

From 9ecd81714e42d68de9fcfe123790ed859b841593 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 13:00:16 +0000
Subject: [PATCH 05/31] free up some memory

---
 bin/oktoberfest_feature_gen.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 24644da..e313725 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -151,6 +151,7 @@ def main():
     # PEPTIDE_LENGTH
     oktoberfest_df["PEPTIDE_LENGTH"] = [len(psm.peptidoform.sequence) for psm in psms]
 
+    # free up some memory
     del psms
 
     psms_df = pd.read_csv(args.psms_file, sep="\t")
@@ -173,6 +174,10 @@ def main():
         index=False,
     )
 
+    # free up some more memory as the dataframe is read from disk again
+    del psms_df
+    del oktoberfest_df
+
     # create the config file
     config_dict = copy.deepcopy(ok.utils.example_configs.RESCORING)
 

From 89b8a3ee451b1081db451a868f0f86aaef7ffc3b Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 13:00:34 +0000
Subject: [PATCH 06/31] fix typos and add Oktoberfest feature-to-PIN conversion

---
 bin/oktoberfest_feature_gen.py    |  4 +-
 bin/oktoberfest_feature_to_pin.py | 71 +++++++++++++++++++++++++++++++
 src/postprocessing/oktoberfest.nf | 33 ++++++++++++--
 3 files changed, 103 insertions(+), 5 deletions(-)
 create mode 100755 bin/oktoberfest_feature_to_pin.py

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index e313725..fc6537f 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -2,7 +2,7 @@
 
 """
 Generates features for PSM-rescoring using Oktoberfest.
-The rescoring itself is suppresed by setting an unknown FDR estimation method.s
+The rescoring itself is suppresed by setting an unknown FDR estimation method.
 """
 
 import argparse
@@ -101,7 +101,7 @@ def main():
     Generates features for PSM-rescoring using Oktoberfest.
     The rescoring itself is suppresed by setting an unknown FDR estimation method.
 
-    The esulting features can be found as `<output_folder>/none/rescore.tab`
+    The resulting features can be found as `<output_folder>/none/rescore.tab`
     """
 
     args = argparse_setup()
diff --git a/bin/oktoberfest_feature_to_pin.py b/bin/oktoberfest_feature_to_pin.py
new file mode 100755
index 0000000..4293a53
--- /dev/null
+++ b/bin/oktoberfest_feature_to_pin.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+"""
+Converts Oktoberfest feature files to Percolator's PIN file.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+COLS_TO_REMOVE = [
+    "filename"  # str column
+]
+"""Columns to remove from the Oktoberfest feature file.
+E.g. due to wrong type
+"""
+
+COLS_TO_RENAME = {
+    "SpecId": "id",
+}
+"""Columns to rename in the Oktoberfest feature file.
+"""
+
+
+def argparse_setup() -> argparse.Namespace:
+    """
+    Parses the command line arguments of the Oktoberfest feature-to-PIN conversion script.
+    """
+
+    parser = argparse.ArgumentParser()
+    # files
+    parser.add_argument(
+        "-in-file", help="Input feature TSV file", required=True, type=Path
+    )
+    parser.add_argument("-out-file", help="Pin file ", required=True, type=Path)
+
+    return parser.parse_args()
+
+
+def main():
+    """
+    Converts Oktoberfest feature files to Percolator's PIN file.
+    """
+
+    args = argparse_setup()
+    logging.basicConfig(level=logging.INFO)
+
+    # feature dataframe
+    feature_df = pd.read_csv(args.in_file, sep="\t")
+
+    for col in COLS_TO_REMOVE:
+        if col in feature_df.columns:
+            feature_df.drop(columns=col, inplace=True)
+
+    for col, new_col in COLS_TO_RENAME.items():
+        if col in feature_df.columns:
+            feature_df.rename(columns={col: new_col}, inplace=True)
+
+    for col in feature_df.columns:
+        feature_df.rename(columns={col: col.lower()}, inplace=True)
+
+    feature_df.to_csv(
+        args.out_file,
+        sep="\t",
+        index=False,
+    )
+
+if __name__ == "__main__":
+    main()
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 1d034e5..458cfa7 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -3,7 +3,7 @@ nextflow.enable.dsl=2
 // parameters for oktoberfest
 params.oktoberfest_memory = "64 GB"
 params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD"
-params.oktoberfest_irt_mode = "Prosit_2019_irt"
+params.oktoberfest_irt_model = "Prosit_2019_irt"
 
 /**
  * Runs oktoberfest rescoring for the given PSMs and mzML files.
@@ -21,7 +21,9 @@ workflow oktoberfest_rescore_workflow {
     mzmls
 
     main:
-    oktoberfest_pins = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da)
+    oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_features_to_pin(oktoberfest_features)
+
 
     emit:
     oktoberfest_pins
@@ -57,7 +59,7 @@ process run_oktoberfest_feature_gen {
         -psms-file ${psm_utils_tsvs} \
         -spectra-file ${mzml_for_psms} \
         -intensity-model ${params.oktoberfest_intensity_model} \
-        -irt-model ${params.oktoberfest_irt_modeö} \
+        -irt-model ${params.oktoberfest_irt_model} \
         -mass-tolerance ${fragment_tol_da} \
         -mass-tolerance-unit da \
         -is-timstof ${params.is_timstof} \
@@ -68,3 +70,28 @@ process run_oktoberfest_feature_gen {
     rm -r oktoberfest_out
     """
 }
+
+/**
+ * Converts an Oktoberfest feature file into a PIN file using oktoberfest_feature_to_pin.py.
+ * @param okt_features_tsv: Oktoberfest feature file.
+ * @return Oktoberfest feature file in PIN format ready to use with percolator.
+ */
+process oktoberfest_features_to_pin {
+    cpus 1
+    memory { params.oktoberfest_memory }
+
+    container { params.python_image }
+
+    input:
+    path okt_features_tsv
+
+    output:
+    path "${okt_features_tsv.baseName}.oktoberfest.pin"
+
+    script:
+    """
+    oktoberfest_feature_to_pin.py \
+        -in-file ${okt_features_tsv} \
+        -out-file ./${okt_features_tsv.baseName}.oktoberfest.pin
+    """
+}

From eeffc19d24e20ef52bb174c0df04da22bff0bfe7 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 14 Jul 2025 14:37:32 +0000
Subject: [PATCH 07/31] integration into workflow

---
 src/identification/comet_identification.nf     | 9 +++++++--
 src/identification/maxquant_identification.nf  | 8 ++++++--
 src/identification/msamanda_identification.nf  | 8 ++++++--
 src/identification/msfragger_identification.nf | 8 ++++++--
 src/identification/msgfplus_identification.nf  | 8 ++++++--
 src/identification/sage_identification.nf      | 8 ++++++--
 src/identification/xtandem_identification.nf   | 8 ++++++--
 7 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index dce4596..5d65cf4 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -10,8 +10,9 @@ params.comet_psm_id_pattern = "(.*)"
 params.comet_spectrum_id_pattern = '.*scan=(\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 /**
  * Exports the identification using Comet configured by a SDRF files
@@ -38,9 +39,11 @@ workflow comet_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
     
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     comet_mzids >> 'comet'
@@ -49,6 +52,8 @@ workflow comet_identification {
     pout_files >> 'comet'
     ms2rescore_pins >> 'comet'
     ms2rescore_percolator_results >> 'comet'
+    oktoberfest_pins >> 'comet'
+    oktoberfest_percolator_results >> 'comet'
 }
 
 
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index 21bc147..9ab1370 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -10,7 +10,7 @@ params.maxquant_psm_id_pattern = ""
 params.maxquant_spectrum_id_pattern = ""
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 /**
@@ -63,9 +63,11 @@ workflow maxquant_identification {
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
     
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     maxquant_results >> 'maxquant'
@@ -74,6 +76,8 @@ workflow maxquant_identification {
     pout_files >> 'maxquant'
     ms2rescore_pins >> 'maxquant'
     ms2rescore_percolator_results >> 'maxquant'
+    oktoberfest_pins >> 'maxquant'
+    oktoberfest_percolator_results >> 'maxquant'
 }
 
 
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index 69a8ae7..fd99da0 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -10,7 +10,7 @@ params.msamanda_psm_id_pattern = "(.*)"
 params.msamanda_spectrum_id_pattern = '(.*)'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 // msamanda needs explicit "scan=" in the id of a scan (not there in e.g. TimsTOF converted mzML data)
@@ -43,9 +43,11 @@ workflow msamanda_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
 
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     msamanda_results.msamanda_csv >> 'msamanda'
@@ -54,6 +56,8 @@ workflow msamanda_identification {
     pout_files >> 'msamanda'
     ms2rescore_pins >> 'msamanda'
     ms2rescore_percolator_results >> 'msamanda'
+    oktoberfest_pins >> 'msamanda'
+    oktoberfest_percolator_results >> 'msamanda'
 }
 
 
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index ac8fad1..bca5a68 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -12,7 +12,7 @@ params.msfragger_psm_id_pattern = "(.*)"
 params.msfragger_spectrum_id_pattern = "(.*)"
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 workflow msfragger_identification {
@@ -37,9 +37,11 @@ workflow msfragger_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
     
-    // // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     fragger_results_pepxml >> 'msfragger'
@@ -48,6 +50,8 @@ workflow msfragger_identification {
     pout_files >> 'msfragger'
     ms2rescore_pins >> 'msfragger'
     ms2rescore_percolator_results >> 'msfragger'
+    oktoberfest_pins >> 'msfragger'
+    oktoberfest_percolator_results >> 'msfragger'
 }
 
 
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index 2429ab8..517f2a6 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -18,7 +18,7 @@ params.msgfplus_psm_id_pattern = "(.*)"
 params.msgfplus_spectrum_id_pattern = '(.*)'
 
 include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 include {split_mzml_into_chunks} from '../preprocess/convert_to_mzml.nf'
@@ -91,9 +91,11 @@ workflow msgfplus_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
 
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins))
 
     publish:
     fasta_merged_results.map{ it -> it[1] } >> 'msgfplus'
@@ -102,6 +104,8 @@ workflow msgfplus_identification {
     pout_files >> 'msgfplus'
     ms2rescore_pins >> 'msgfplus'
     ms2rescore_percolator_results >> 'msgfplus'
+    oktoberfest_pins >> 'msgfplus'
+    oktoberfest_percolator_results >> 'msgfplus'
 }
 
 process identification_with_msgfplus {
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index 5654799..aeaf7a2 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -12,7 +12,7 @@ params.sage_psm_id_pattern = "(.*)"
 params.sage_spectrum_id_pattern = '(.*)'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 /**
@@ -49,9 +49,11 @@ workflow sage_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
 
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     return_files >> 'sage'
@@ -60,6 +62,8 @@ workflow sage_identification {
     pout_files >> 'sage'
     ms2rescore_pins >> 'sage'
     ms2rescore_percolator_results >> 'sage'
+    oktoberfest_pins >> 'sage'
+    oktoberfest_percolator_results >> 'sage'
 }
 
 
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 2823f95..7e85e3a 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -10,7 +10,7 @@ params.xtandem_psm_id_pattern = "(.*)"
 params.xtandem_spectrum_id_pattern = '(.*)'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
-include {psm_percolator; psm_percolator as ms2rescore_percolator} from '../postprocessing/percolator.nf'
+include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
 
 /**
@@ -39,9 +39,11 @@ workflow xtandem_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem')
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
     
-    // perform percolation on MS2Rescore results
+    // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
     
     publish:
     tandem_xmls >> 'xtandem'
@@ -50,6 +52,8 @@ workflow xtandem_identification {
     pout_files >> 'xtandem'
     ms2rescore_pins >> 'xtandem'
     ms2rescore_percolator_results >> 'xtandem'
+    oktoberfest_pins >> 'xtandem'
+    oktoberfest_percolator_results >> 'xtandem'
 }
 
 /**

From 9e7dec6934fef91c6e55aa694c141238544881b5 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 07:57:19 +0000
Subject: [PATCH 08/31] build separate container for oktoberfest

---
 Makefile                            |  4 +++-
 docker/oktoberfest/Dockerfile       | 29 +++++++++++++++++++++++++++++
 docker/oktoberfest/environment.yml  | 10 ++++++++++
 docker/oktoberfest/requirements.txt |  2 ++
 main.nf                             |  1 +
 requirements.txt                    |  3 +--
 src/postprocessing/oktoberfest.nf   |  4 ++--
 7 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 docker/oktoberfest/Dockerfile
 create mode 100644 docker/oktoberfest/environment.yml
 create mode 100644 docker/oktoberfest/requirements.txt

diff --git a/Makefile b/Makefile
index 7c19bdd..50b3f98 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 docker-imgs:
 	docker pull ghcr.io/medbioinf/pipeline-of-identification:latest
-
+	
 	docker pull proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses:3.0.25073-842baef
 	docker pull quay.io/medbioinf/tdf2mzml:0.4
 	docker pull quay.io/medbioinf/openms:3.4.1
@@ -17,3 +17,5 @@ docker-imgs:
 	docker pull ghcr.io/percolator/percolator:branch-3-08
 
 	docker build --platform linux/amd64 -t medbioinf/msfragger -f docker/msfragger/Dockerfile docker/msfragger/.
+
+	docker build --platform linux/amd64 -t medbioinf/oktoberfest:latest -f docker/oktoberfest/Dockerfile docker/oktoberfest
diff --git a/docker/oktoberfest/Dockerfile b/docker/oktoberfest/Dockerfile
new file mode 100644
index 0000000..d083afb
--- /dev/null
+++ b/docker/oktoberfest/Dockerfile
@@ -0,0 +1,29 @@
+# AMD64 needed explicitly on ARM as some software is only available in AMD64
+FROM --platform=amd64 mambaorg/micromamba:2.1.0-ubuntu22.04
+
+WORKDIR /home/mambauser
+# Copy backend and environment.yml
+COPY --chown=mambauser:mambauser environment.yml .
+COPY --chown=mambauser:mambauser requirements.txt .
+
+USER root
+
+RUN apt update \
+    && apt install -y libglib2.0-0 git \
+    && apt clean
+
+USER mambauser
+ENV HOME=/home/mambauser
+ENV ENV_NAME=oktoberfest
+
+RUN echo 'show_banner: false' > ~/.mambarc
+
+RUN micromamba env create -y -f environment.yml \
+    && micromamba clean --all --yes
+
+# TODO: remove build-essential
+
+USER root
+# First is necessary for base_image to activate the conda environment, second is entrypoint
+# which adds the python file to PATH
+ENTRYPOINT [ "/usr/local/bin/_entrypoint.sh"]
diff --git a/docker/oktoberfest/environment.yml b/docker/oktoberfest/environment.yml
new file mode 100644
index 0000000..8a46af3
--- /dev/null
+++ b/docker/oktoberfest/environment.yml
@@ -0,0 +1,10 @@
+name: oktoberfest
+channels:
+  - defaults
+dependencies:
+  - python=3.11
+  - pip
+  - setuptools
+  - pip:
+    - -r requirements.txt
+  
diff --git a/docker/oktoberfest/requirements.txt b/docker/oktoberfest/requirements.txt
new file mode 100644
index 0000000..fe5f1b5
--- /dev/null
+++ b/docker/oktoberfest/requirements.txt
@@ -0,0 +1,2 @@
+oktoberfest~=0.10.0
+psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 2127322..8174524 100644
--- a/main.nf
+++ b/main.nf
@@ -4,6 +4,7 @@ nextflow.preview.output = true
 
 // default python image
 params.python_image = 'ghcr.io/medbioinf/pipeline-of-identification:latest'
+params.oktoberfest_image = 'medbioinf/oktoberfest'
 
 // parameters set by the command line
 params.raw_files = ''
diff --git a/requirements.txt b/requirements.txt
index 3ff7f46..1f01b16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,4 @@
 psm-utils @ git+https://github.com/julianu/psm_utils.git@pepxml-and-mzid-fixes
 #mokapot~=0.10.0
 ms2rescore_rs @ git+https://github.com/di-hardt/ms2rescore-rs.git@70c15002a9f065ea2cd01a9a9a95b8bcff762f53
-ms2rescore~=3.1.5
-oktoberfest~=0.10.0
\ No newline at end of file
+ms2rescore~=3.1.5
\ No newline at end of file
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 458cfa7..328b9fa 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -41,7 +41,7 @@ process run_oktoberfest_feature_gen {
     cpus 1
     memory { params.oktoberfest_memory }
 
-    container { params.python_image }
+    container { params.oktoberfest_image }
 
     input:
     tuple val(psm_utils_tsvs), val(mzml_for_psms)
@@ -80,7 +80,7 @@ process oktoberfest_features_to_pin {
     cpus 1
     memory { params.oktoberfest_memory }
 
-    container { params.python_image }
+    container { params.oktoberfest_image }
 
     input:
     path okt_features_tsv

From 3611bbf7cb74708cfba1ad474b2f1fe75417bd4e Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 08:07:15 +0000
Subject: [PATCH 09/31] fix comment

---
 src/postprocessing/oktoberfest.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 328b9fa..07f8087 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -66,7 +66,7 @@ process run_oktoberfest_feature_gen {
 
     mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv"
 
-    // Clean up the output directory
+    # Clean up the output directory
     rm -r oktoberfest_out
     """
 }

From d800c75f34c06e5024fdfda2c7dffacf62a91448 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 08:08:14 +0000
Subject: [PATCH 10/31] fix filenames

---
 src/postprocessing/oktoberfest.nf | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 07f8087..88ed48b 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -50,7 +50,7 @@ process run_oktoberfest_feature_gen {
     val fragment_tol_da
     
     output:
-    path "${psm_utils_tsvs.baseName}.features.tsv"
+    path "${psm_utils_tsvs}.features.tsv"
     
     script:
     """
@@ -64,7 +64,7 @@ process run_oktoberfest_feature_gen {
         -mass-tolerance-unit da \
         -is-timstof ${params.is_timstof} \
 
-    mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs.baseName}.features.tsv"
+    mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv"
 
     # Clean up the output directory
     rm -r oktoberfest_out
@@ -86,12 +86,12 @@ process oktoberfest_features_to_pin {
     path okt_features_tsv
 
     output:
-    path "${okt_features_tsv.baseName}.oktoberfest.pin"
+    path "${okt_features_tsv}.oktoberfest.pin"
 
     script:
     """
     oktoberfest_feature_to_pin.py \
         -features-file ${okt_features_tsv} \
-        -out-folder ./{okt_features_tsv.baseName}.oktoberfest.pin
+        -out-folder ./{okt_features_tsv}.oktoberfest.pin
     """
 }

From 8d9fc9fe7ee5fe208fe7ff36714578221e105fd8 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 08:08:53 +0000
Subject: [PATCH 11/31] fix params

---
 src/identification/comet_identification.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index 5d65cf4..bf8d857 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -39,7 +39,7 @@ workflow comet_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)

From 7536143dd3987758f2b1ed6b7204f2cf719fc0a4 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 08:09:08 +0000
Subject: [PATCH 12/31] import missing workflow

---
 src/identification/maxquant_identification.nf  | 5 +++--
 src/identification/msamanda_identification.nf  | 3 ++-
 src/identification/msfragger_identification.nf | 3 ++-
 src/identification/msgfplus_identification.nf  | 5 +++--
 src/identification/sage_identification.nf      | 3 ++-
 src/identification/xtandem_identification.nf   | 3 ++-
 6 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index 9ab1370..ab2bcaa 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -12,6 +12,7 @@ params.maxquant_spectrum_id_pattern = ""
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 /**
  * Executes the identification using MaxQuant
@@ -63,11 +64,11 @@ workflow maxquant_identification {
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
-    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins))
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     maxquant_results >> 'maxquant'
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index fd99da0..3faa4a5 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -12,6 +12,7 @@ params.msamanda_spectrum_id_pattern = '(.*)'
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 // msamanda needs explicit "scan=" in the id of a scan (not there in e.g. TimsTOF converted mzML data)
 // 1) ms-convert with "--noindex"
@@ -43,7 +44,7 @@ workflow msamanda_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index bca5a68..4697ed8 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -14,6 +14,7 @@ params.msfragger_spectrum_id_pattern = "(.*)"
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 workflow msfragger_identification {
     take:
@@ -37,7 +38,7 @@ workflow msfragger_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index 517f2a6..6a76f9e 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -20,6 +20,7 @@ params.msgfplus_spectrum_id_pattern = '(.*)'
 include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 include {split_mzml_into_chunks} from '../preprocess/convert_to_mzml.nf'
 
@@ -91,11 +92,11 @@ workflow msgfplus_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
-    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins))
+    oktoberfest_percolator_results = oktoberfest_percolator(oktoberfest_pins.oktoberfest_pins)
 
     publish:
     fasta_merged_results.map{ it -> it[1] } >> 'msgfplus'
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index aeaf7a2..2c23d69 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -14,6 +14,7 @@ params.sage_spectrum_id_pattern = '(.*)'
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 /**
  * Executes the identification using Sage
@@ -49,7 +50,7 @@ workflow sage_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 7e85e3a..2e58235 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -12,6 +12,7 @@ params.xtandem_spectrum_id_pattern = '(.*)'
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
 include {ms2rescore_workflow} from '../postprocessing/ms2rescore.nf'
+include {oktoberfest_rescore_workflow} from '../postprocessing/oktoberfest.nf'
 
 /**
  * Exports the identification using Comet configured by a SDRF files
@@ -39,7 +40,7 @@ workflow xtandem_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.fragment_tol_da)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)

From 014825dbf56dfe9858c8947a58ff420fc800fb10 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 15 Jul 2025 09:09:16 +0000
Subject: [PATCH 13/31] correcting names

---
 src/postprocessing/oktoberfest.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 88ed48b..7873d23 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -86,12 +86,12 @@ process oktoberfest_features_to_pin {
     path okt_features_tsv
 
     output:
-    path "${okt_features_tsv}.oktoberfest.pin"
+    path "${okt_features_tsv.baseName}.oktoberfest.pin"
 
     script:
     """
     oktoberfest_feature_to_pin.py \
-        -features-file ${okt_features_tsv} \
-        -out-folder ./{okt_features_tsv}.oktoberfest.pin
+        -in-file ${okt_features_tsv} \
+        -out-file ./${okt_features_tsv.baseName}.oktoberfest.pin
     """
 }

From d2e0111088ed7b87a8dd26a72590e4b5d5b22241 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Wed, 16 Jul 2025 07:50:05 +0000
Subject: [PATCH 14/31] fix variable name

---
 src/identification/maxquant_identification.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index ab2bcaa..74b9b19 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -64,7 +64,7 @@ workflow maxquant_identification {
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect())
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)

From cf1a3f0a1f40ae2014d441fc15f078ac7fca267a Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Wed, 16 Jul 2025 07:56:49 +0000
Subject: [PATCH 15/31] ignoring result folder

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index 4ce9325..90c0236 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,11 @@
 .nextflow/
 .nextflow.log*
 work/
+
+# workflow results
+results/
+
+
 # binary
 nextflow
 trace*

From 946ab7374c446ca472a52576314b09ce64f0a113 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Wed, 16 Jul 2025 14:42:24 +0000
Subject: [PATCH 16/31] use regex to get scan id

---
 bin/oktoberfest_feature_gen.py                | 19 ++++++++++++++++++-
 src/identification/comet_identification.nf    |  3 ++-
 src/identification/maxquant_identification.nf |  9 ++++++++-
 src/identification/msamanda_identification.nf |  3 ++-
 .../msfragger_identification.nf               |  3 ++-
 src/identification/msgfplus_identification.nf |  3 ++-
 src/identification/sage_identification.nf     |  3 ++-
 src/identification/xtandem_identification.nf  |  3 ++-
 src/postprocessing/oktoberfest.nf             |  7 ++++++-
 9 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index fc6537f..4bf27b2 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -10,6 +10,7 @@
 import json
 import logging
 from pathlib import Path
+import re
 
 import oktoberfest as ok
 from oktoberfest import runner as ok_runner
@@ -47,6 +48,12 @@ def parse_str_bool(value: str) -> bool:
         return False
     else:
         raise ValueError(f"Invalid boolean value: {value}")
+    
+def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int:
+    match = scan_id_regex.match(spectrum_id)
+    if not match:
+        raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}")
+    return int(match.group("scan_id"))
 
 def argparse_setup() -> argparse.Namespace:
     """
@@ -89,6 +96,15 @@ def argparse_setup() -> argparse.Namespace:
         type=parse_str_bool
     )
 
+    parser.add_argument(
+        "-scan-id-regex",
+        help=(
+            "Regular expression to extract the scan number from the spectrum ID."
+            "Use `scan_id` for the matching group, e.g. `scan=(?P<scan_id>\\d+)`)"
+        ),
+        type=str,
+    )
+
     parser.add_argument(
         "-out-folder", help="Output folder for ", required=True, type=Path
     )
@@ -106,6 +122,7 @@ def main():
 
     args = argparse_setup()
     logging.basicConfig(level=logging.INFO)
+    scan_id_regex = re.compile(args.scan_id_regex)
 
     oktoberfest_input_csv_path = args.psms_file.with_suffix(".oktoberfest.input.csv")
 
@@ -120,7 +137,7 @@ def main():
     oktoberfest_df["RAW_FILE"] = [args.spectra_file.stem] * len(psms)
 
     # SCAN_NUMBER
-    oktoberfest_df["SCAN_NUMBER"] = [psm.spectrum_id for psm in psms]
+    oktoberfest_df["SCAN_NUMBER"] = [get_scan_id(psm.spectrum_id, scan_id_regex) for psm in psms]
 
     # MODIFIED_SEQUENCE
     oktoberfest_df["MODIFIED_SEQUENCE"] = [
diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index bf8d857..18d3f9e 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -8,6 +8,7 @@ params.comet_mem = "8 GB"
 
 params.comet_psm_id_pattern = "(.*)"
 params.comet_spectrum_id_pattern = '.*scan=(\\d+)$'
+params.comet_scan_id_pattern = '(?P<scan_id>\\d+)'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -39,7 +40,7 @@ workflow comet_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_psm_id_pattern, params.comet_spectrum_id_pattern, '^DECOY_', 'comet')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.comet_scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index 74b9b19..c330219 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -8,6 +8,7 @@ params.maxquant_mem = "32 GB"
 
 params.maxquant_psm_id_pattern = ""
 params.maxquant_spectrum_id_pattern = ""
+params.maxquant_scan_id_pattern = ""
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -56,6 +57,12 @@ workflow maxquant_identification {
             spectrum_id_pattern = '.*scan=(\\d+)$'
         }
     }
+    if (params.maxquant_scan_id_pattern) {
+        scan_id_pattern = params.maxquant_scan_id_pattern
+    } else{
+        // no difference between psm TSVs derived from Bruker and Thermo measurements
+        scan_id_pattern = '(?P<scan_id>\\d+)'
+    }
 
     if (params.is_timstof) {
         psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.d'  ] }
@@ -64,7 +71,7 @@ workflow maxquant_identification {
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index 3faa4a5..177c8c3 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -8,6 +8,7 @@ params.msamanda_mem = "64 GB"
 
 params.msamanda_psm_id_pattern = "(.*)"
 params.msamanda_spectrum_id_pattern = '(.*)'
+params.msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+).*'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -44,7 +45,7 @@ workflow msamanda_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msamanda.csv')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_psm_id_pattern, params.msamanda_spectrum_id_pattern, '^DECOY_', 'msamanda')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msamanda_scan_id_pattern)
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index 4697ed8..fbfebe0 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -10,6 +10,7 @@ params.msfragger_calibrate = 2
 
 params.msfragger_psm_id_pattern = "(.*)"
 params.msfragger_spectrum_id_pattern = "(.*)"
+params.msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+).*'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -38,7 +39,7 @@ workflow msfragger_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.pepXML')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_psm_id_pattern, params.msfragger_spectrum_id_pattern, '^DECOY_', 'msfragger')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msfragger_scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index 6a76f9e..c6815a3 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -16,6 +16,7 @@ params.msgfplus_split_fasta = 0         // split the fasta into this many chunks
 
 params.msgfplus_psm_id_pattern = "(.*)"
 params.msgfplus_spectrum_id_pattern = '(.*)'
+params.msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+).*'
 
 include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -92,7 +93,7 @@ workflow msgfplus_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.mzid')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_psm_id_pattern, params.msgfplus_spectrum_id_pattern, '^DECOY_', 'msgfplus')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.msgfplus_scan_id_pattern)
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index 2c23d69..5758e9e 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -10,6 +10,7 @@ params.sage_prefilter_chunk_size = 0
 
 params.sage_psm_id_pattern = "(.*)"
 params.sage_spectrum_id_pattern = '(.*)'
+params.sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+).*'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -50,7 +51,7 @@ workflow sage_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.sage')) ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_psm_id_pattern, params.sage_spectrum_id_pattern, '^DECOY_', 'sage')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.sage_scan_id_pattern)
 
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 2e58235..58f6c23 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -8,6 +8,7 @@ params.xtandem_mem = "128 GB"
 
 params.xtandem_psm_id_pattern = "(.*)"
 params.xtandem_spectrum_id_pattern = '(.*)'
+params.xtandem_scan_id_pattern = '.*scan=(?P<scan_id>\\d+).*'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
@@ -40,7 +41,7 @@ workflow xtandem_identification {
 
     psm_tsvs_and_mzmls = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('.xtandem_identification')) + '.mzML'  ] }
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_psm_id_pattern, params.xtandem_spectrum_id_pattern, '^DECOY_', 'xtandem')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect())
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_mzmls, psm_tsvs.collect(), mzmls.collect(), params.xtandem_scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 7873d23..5ab635f 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -11,6 +11,7 @@ params.oktoberfest_irt_model = "Prosit_2019_irt"
  * @param psm_tsvs_and_mzmls: A tuple containing the PSM utils TSV files and the mzML files for the PSMs.
  * @param psm_tsvs: The PSM TSV files.
  * @param mzmls: The mzML files. 
+ * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID.
  *
  * @return: The oktoberfest rescored PSMs in TSV format.
  */
@@ -19,9 +20,10 @@ workflow oktoberfest_rescore_workflow {
     psm_tsvs_and_mzmls
     psm_tsvs
     mzmls
+    scan_id_regex
 
     main:
-    oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da)
+    oktoberfest_features = run_oktoberfest_feature_gen(psm_tsvs_and_mzmls, psm_tsvs, mzmls, params.fragment_tol_da, scan_id_regex)
     oktoberfest_pins = oktoberfest_features_to_pin(oktoberfest_features)
 
 
@@ -34,6 +36,7 @@ workflow oktoberfest_rescore_workflow {
  * @param psm_tsvs: The PSM TSV files.
  * @param mzmls: The mzML files. 
  * @param fragment_tol_da: The fragment tolerance for the rescoring.
+ * @param scan_id_regex: A regex pattern to extract the scan number from the spectrum ID.
  * 
  * @return The oktoberfest rescored PSMs in TSV format.
  */
@@ -48,6 +51,7 @@ process run_oktoberfest_feature_gen {
     path psm_tsvs
     path mzmls
     val fragment_tol_da
+    val scan_id_regex
     
     output:
     path "${psm_utils_tsvs}.features.tsv"
@@ -63,6 +67,7 @@ process run_oktoberfest_feature_gen {
         -mass-tolerance ${fragment_tol_da} \
         -mass-tolerance-unit da \
         -is-timstof ${params.is_timstof} \
+        -scan-id-regex '${scan_id_regex}' \
 
     mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv"
 

From ea08484d86fe68f30d21bc4e34ea3edf6277da54 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Wed, 16 Jul 2025 14:42:55 +0000
Subject: [PATCH 17/31] implement retry on Koina server error

---
 bin/oktoberfest_feature_gen.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 4bf27b2..184582e 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -11,18 +11,23 @@
 import logging
 from pathlib import Path
 import re
+from time import sleep
 
 import oktoberfest as ok
 from oktoberfest import runner as ok_runner
 import pandas as pd
 import psm_utils
 import psm_utils.io
+import tritonclient
 
 OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
 """Oktoberfest's error message for unknown FDR estimation methods.
 There is a typo in the oktberfest code, as it is not substituting the f-string correctly.
 """
 
+OKTOBERFEST_RETRIES = 5
+"""Number of retries for the oktberfest job in case of a server error."""
+
 def parse_str_bool(value: str) -> bool:
     """
     Parses a string argument to a boolean value.
@@ -226,12 +231,26 @@ def main():
     with config_path.open("w", encoding="utf-8") as json_file:
         json_file.write(json.dumps(config_dict))
 
-    try:
-        ok_runner.run_job(config_path)
-    except ValueError as e:
-        # Catch the specific ValueError raised when "NONE" is used as fdr_estimation
-        if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD:
-            raise e
+    is_successfull = False
+    for i in range(OKTOBERFEST_RETRIES):
+        try:
+            ok_runner.run_job(config_path)
+        except ValueError as e:
+            # Catch the specific ValueError raised when "NONE" is used as fdr_estimation
+            if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD:
+                raise e
+            else:
+                is_successfull = True
+                break
+        except tritonclient.utils.InferenceServerException as e:
+            if str(e.status) == "504":
+                logging.error("Koina server not available, retrying in 10 seconds...")
+                sleep(10)
+
+    if not is_successfull:
+        logging.error("Oktoberfest job failed after multiple retries.")
+        exit(101)
+
 
 
 if __name__ == "__main__":

From 5c127372d9003477cb7aeeb5020f5441f1c02b36 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Wed, 16 Jul 2025 14:45:02 +0000
Subject: [PATCH 18/31] limit max forks of oktoberfest feature generation

---
 src/postprocessing/oktoberfest.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 5ab635f..071e452 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -4,6 +4,7 @@ nextflow.enable.dsl=2
 params.oktoberfest_memory = "64 GB"
 params.oktoberfest_intensity_model = "Prosit_2020_intensity_HCD"
 params.oktoberfest_irt_model = "Prosit_2019_irt"
+params.oktoberfest_forks = 1 // have some mercy with the koina servers
 
 /**
  * Runs oktoberfest rescoring for the given PSMs and mzML files.
@@ -42,6 +43,7 @@ workflow oktoberfest_rescore_workflow {
  */
 process run_oktoberfest_feature_gen {
     cpus 1
+    maxForks params.oktoberfest_forks
     memory { params.oktoberfest_memory }
 
     container { params.oktoberfest_image }

From 66c85edfbf0ce4a6e53b5219f31224995f026303 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Thu, 17 Jul 2025 12:34:36 +0000
Subject: [PATCH 19/31] select specific spectrum file instead of general folder

---
 bin/oktoberfest_feature_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 184582e..83a29c4 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -215,7 +215,7 @@ def main():
     # input params
     config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
     config_dict["inputs"]["search_results_type"] = "Internal"
-    config_dict["inputs"]["spectra"] = "./"
+    config_dict["inputs"]["spectra"] = str(args.spectra_file)
     config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml"
     # resocreing params
     # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring

From 173521ed710dc3ddb91c071b5f3ce22f359324a8 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Fri, 18 Jul 2025 14:26:12 +0000
Subject: [PATCH 20/31] fix missing proteins accession and attribute filters

---
 bin/oktoberfest_feature_gen.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 83a29c4..7b98e8d 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -28,6 +28,9 @@
 OKTOBERFEST_RETRIES = 5
 """Number of retries for the oktberfest job in case of a server error."""
 
+OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS = {"U", "O"}
+"""According to documentation, Oktoberfest does not support the amino acids U and O."""
+
 def parse_str_bool(value: str) -> bool:
     """
     Parses a string argument to a boolean value.
@@ -190,14 +193,37 @@ def main():
     for rescoring_col in present_rescoring_cols:
         oktoberfest_df[rescoring_col] = psms_df[rescoring_col]
 
+    # free up some more memory as the dataframe is read from disk again
+    del psms_df
+
+    # Filter unsupported amino acids
+    psms_len = len(oktoberfest_df)
+    oktoberfest_df = oktoberfest_df[~oktoberfest_df["MODIFIED_SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)]
+    if len(oktoberfest_df) < psms_len:
+        logging.warning(
+            f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}"
+        )
+
+    # Filter peptide length > 30
+    psms_len = len(oktoberfest_df)
+    oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30]
+    if len(oktoberfest_df) < psms_len:
+        logging.warning(
+            f"Removed {psms_len - len(oktoberfest_df)} PSMs with peptide length > 30"
+        )
+
+    # Some search engines do not provide protein accessions for decoys.
+    # In this case, we set the PROTEINS column to the `PEP_y<MODIFIED_SEQUENCE>` like in the Oktberfest docs.
+    oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True)
+    oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True)
+
     oktoberfest_df.to_csv(
         oktoberfest_input_csv_path,
         sep=",",
         index=False,
     )
 
-    # free up some more memory as the dataframe is read fromt disk again
-    del psms_df
+    # free up more memory
     del oktoberfest_df
 
     # create the config file

From 4e58ba1ae448891cf2b5c594b6ddb631c68ae2f7 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Fri, 18 Jul 2025 14:50:35 +0000
Subject: [PATCH 21/31] fix column

---
 bin/oktoberfest_feature_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 7b98e8d..a897a73 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -198,7 +198,7 @@ def main():
 
     # Filter unsupported amino acids
     psms_len = len(oktoberfest_df)
-    oktoberfest_df = oktoberfest_df[~oktoberfest_df["MODIFIED_SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)]
+    oktoberfest_df = oktoberfest_df[~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)]
     if len(oktoberfest_df) < psms_len:
         logging.warning(
             f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}"

From 8cf6a20308eccc6efb1832517d130e9526fa13b1 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 21 Jul 2025 09:42:25 +0000
Subject: [PATCH 22/31] fix issues with PSM at last amino acid in sequence

---
 bin/oktoberfest_feature_gen.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index a897a73..b3ab1b3 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -7,6 +7,7 @@
 
 import argparse
 import copy
+from functools import reduce
 import json
 import logging
 from pathlib import Path
@@ -31,6 +32,13 @@
 OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS = {"U", "O"}
 """According to documentation, Oktoberfest does not support the amino acids U and O."""
 
+OKTOBERFEST_MODIFICATION_REPLACEMENTS = [  # (search, replacement) pairs, applied in order to each modified sequence
+    ("C[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"),
+    ("C-[UNIMOD:Carbamidomethyl]", "C[UNIMOD:4]"), # MaxQuant-specific: if C is last in sequence, the PSM is annotated with C- instead of C
+    ("M[UNIMOD:Oxidation]", "M[UNIMOD:35]"),
+    ("M-[UNIMOD:Oxidation]", "M[UNIMOD:35]"), # MaxQuant-specific: if M is last in sequence, the PSM is annotated with M- instead of M
+]
+
 def parse_str_bool(value: str) -> bool:
     """
     Parses a string argument to a boolean value.
@@ -151,9 +159,12 @@ def main():
     oktoberfest_df["MODIFIED_SEQUENCE"] = [
         psm.peptidoform.modified_sequence for psm in psms
     ]
+    # sequentially apply the replacements using functools.reduce
     oktoberfest_df["MODIFIED_SEQUENCE"] = oktoberfest_df["MODIFIED_SEQUENCE"].apply(
-        lambda x: x.replace("[UNIMOD:Carbamidomethyl]", "[UNIMOD:4]").replace(
-            "[UNIMOD:Oxidation]", "[UNIMOD:35]"
+        lambda seq: reduce(
+            lambda seq_x, repl: seq_x.replace(repl[0], repl[1]), # lambda to replace
+            OKTOBERFEST_MODIFICATION_REPLACEMENTS, # replacements
+            seq # starting with the sequences as stated in the PSMs file
         )
     )
 
@@ -216,7 +227,6 @@ def main():
     # In this case, we set the PROTEINS column to the `PEP_y<MODIFIED_SEQUENCE>` like in the Oktberfest docs.
     oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True)
     oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True)
-
     oktoberfest_df.to_csv(
         oktoberfest_input_csv_path,
         sep=",",

From 9ff4dc661c29dccc2b4e33a4c8186fce3e2cc2a6 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Mon, 21 Jul 2025 09:43:35 +0000
Subject: [PATCH 23/31] add doc string

---
 bin/oktoberfest_feature_gen.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index b3ab1b3..010e992 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -66,6 +66,20 @@ def parse_str_bool(value: str) -> bool:
         raise ValueError(f"Invalid boolean value: {value}")
     
 def get_scan_id(spectrum_id: str, scan_id_regex: re.Pattern) -> int:
+    """
+    Uses the provided regex to extract the scan number from the spectrum ID.
+    Arguments
+    ---------
+    spectrum_id : str
+        The spectrum ID from which to extract the scan number.
+    scan_id_regex : re.Pattern
+        A compiled regular expression pattern to match the scan number. 
+    
+    Returns
+    -------
+    int
+        The extracted scan number as an integer.
+    """
     match = scan_id_regex.match(spectrum_id)
     if not match:
         raise ValueError(f"Could not extract scan number from spectrum ID: {spectrum_id}")

From 62adfa8266fe1c9b91720d0fbd4cb49b7fdc461c Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 22 Jul 2025 08:35:11 +0000
Subject: [PATCH 24/31] copy parts of Oktoberfest's code for a more stable use

---
 bin/oktoberfest_feature_gen.py | 112 ++++++++++++++++++++++++++-------
 1 file changed, 91 insertions(+), 21 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 010e992..363734f 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -13,18 +13,18 @@
 from pathlib import Path
 import re
 from time import sleep
+from typing import Union
 
 import oktoberfest as ok
-from oktoberfest import runner as ok_runner
+from oktoberfest.runner import _preprocess, _ce_calib, _refinement_learn, _calculate_features
+from oktoberfest.utils import Config, JobPool, ProcessStep
+from oktoberfest import rescore as ok_re
+from oktoberfest import preprocessing as ok_pp
 import pandas as pd
 import psm_utils
 import psm_utils.io
 import tritonclient
 
-OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD = 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
-"""Oktoberfest's error message for unknown FDR estimation methods.
-There is a typo in the oktberfest code, as it is not substituting the f-string correctly.
-"""
 
 OKTOBERFEST_RETRIES = 5
 """Number of retries for the oktberfest job in case of a server error."""
@@ -142,6 +142,86 @@ def argparse_setup() -> argparse.Namespace:
     return parser.parse_args()
 
 
+def feature_generation(config_path: Union[str, Path]):
+    """
+    Runs the parts of Oktoberfest's [run_rescore-function that precede the rescoring step](https://github.com/wilhelm-lab/oktoberfest/blob/ce8d909ebf64aaaf9c0eebcc2bb33b9c4492ae90/oktoberfest/runner.py#L1238-L1312),
+    with some dependencies renamed, e.g. `re` => `ok_re` (Oktoberfest's rescoring module), to avoid conflicts with the built-in `re` module.
+
+    Arguments
+    ---------
+    config_path : Union[str, Path]
+        Path to the Oktoberfest configuration file. It is read and checked before any processing starts.
+    """
+    config = Config()
+    config.read(config_path)
+    config.check()
+
+    # load spectra file names
+    spectra_files = ok_pp.list_spectra(input_dir=config.spectra, input_format=config.spectra_type)
+
+    proc_dir = config.output / "proc"
+    proc_dir.mkdir(parents=True, exist_ok=True)
+
+    spectra_files = _preprocess(spectra_files, config)
+
+    # TODO (taken over from Oktoberfest): is this the most elegant way to multi-thread CE calibration before running refinement learning?
+    # Should we store the returned libraries and pass them to _calculate_features and _refinement_learn instead of
+    # _ce_calib returning cached outputs?
+    if config.num_threads > 1:
+        processing_pool = JobPool(processes=config.num_threads)
+        for spectra_file in spectra_files:
+            _ = processing_pool.apply_async(_ce_calib, [spectra_file, config])
+        processing_pool.check_pool()
+    else:
+        for spectra_file in spectra_files:
+            _ = _ce_calib(spectra_file, config)
+
+    if config.do_refinement_learning:
+        _refinement_learn(spectra_files, config)
+
+    if config.num_threads > 1:
+        processing_pool = JobPool(processes=config.num_threads)
+        for spectra_file in spectra_files:
+            if "xl" in config.models["intensity"].lower():
+                if "cms2" in config.models["intensity"].lower():
+                    cms2 = True
+                else:
+                    cms2 = False
+                processing_pool.apply_async(_calculate_features, [spectra_file, config], xl=True, cms2=cms2)
+            else:
+                processing_pool.apply_async(_calculate_features, [spectra_file, config])
+        processing_pool.check_pool()
+    else:
+        for spectra_file in spectra_files:
+            if "xl" in config.models["intensity"].lower():
+                if "cms2" in config.models["intensity"].lower():
+                    cms2 = True
+                else:
+                    cms2 = False
+                _calculate_features(spectra_file, config, xl=True, cms2=cms2)
+            else:
+                _calculate_features(spectra_file, config)
+
+    # prepare rescoring: merge the per-file tab files into `original.tab` and `rescore.tab`; no rescoring is run here
+
+    fdr_dir = config.output / "results" / config.fdr_estimation_method
+    original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files]
+    rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files]
+
+    prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original")
+    prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit")
+
+    if not prepare_tab_original_step.is_done():
+        logging.info("Merging input tab files for rescoring without peptide property prediction")
+        ok_re.merge_input(tab_files=original_tab_files, output_file=fdr_dir / "original.tab")
+        prepare_tab_original_step.mark_done()
+
+    if not prepare_tab_rescore_step.is_done():
+        logging.info("Merging input tab files for rescoring with peptide property prediction")
+        ok_re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab")
+        prepare_tab_rescore_step.mark_done()
+
+
 def main():
     """
     Generates features for PSM-rescoring using Oktoberfest.
@@ -226,7 +306,7 @@ def main():
     oktoberfest_df = oktoberfest_df[~oktoberfest_df["SEQUENCE"].str.contains("|".join(OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS), regex=True)]
     if len(oktoberfest_df) < psms_len:
         logging.warning(
-            f"Removed {psms_len - len(oktoberfest_df)} PSMs with unsupported amino acids: {OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS}"
+            "Removed %i PSMs with unsupported amino acids: %s", psms_len - len(oktoberfest_df), OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS
         )
 
     # Filter peptide length > 30
@@ -234,7 +314,7 @@ def main():
     oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30]
     if len(oktoberfest_df) < psms_len:
         logging.warning(
-            f"Removed {psms_len - len(oktoberfest_df)} PSMs with peptide length > 30"
+            "Removed %i PSMs with peptide length > 30", psms_len - len(oktoberfest_df)
         )
 
     # Some search engines do not provide protein accessions for decoys.
@@ -267,10 +347,7 @@ def main():
     config_dict["inputs"]["search_results_type"] = "Internal"
     config_dict["inputs"]["spectra"] = str(args.spectra_file)
     config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml"
-    # resocreing params
-    # deliberately set to NONE, which will cause Oktoberfest to shut down before rescoring
-    # by raising a ValueError which we can catch later.
-    # This has the effect, that the generated features
+    # Setting this to NONE has the effect that the generated features
     # are stored in the subfolder `results/none` of the output folder.
     config_dict["fdr_estimation_method"] = "NONE"
     config_dict["quantification"] = False
@@ -282,16 +359,13 @@ def main():
         json_file.write(json.dumps(config_dict))
 
     is_successfull = False
-    for i in range(OKTOBERFEST_RETRIES):
+    for _ in range(OKTOBERFEST_RETRIES):
         try:
-            ok_runner.run_job(config_path)
-        except ValueError as e:
-            # Catch the specific ValueError raised when "NONE" is used as fdr_estimation
-            if str(e) != OKTOBERFEST_UNKNOWN_FDR_ESTIMATION_METHOD:
-                raise e
-            else:
-                is_successfull = True
-                break
+            feature_generation(config_path)
+            is_successfull = True
+            # Features were generated; stop here instead of re-running the job on every remaining iteration.
+            break
         except tritonclient.utils.InferenceServerException as e:
-            if str(e.status) == "504":
+            # `status` is a method of InferenceServerException; without calling it the comparison is never true.
+            if str(e.status()) == "504":
                 logging.error("Koina server not available, retrying in 10 seconds...")
@@ -302,6 +373,5 @@ def main():
         exit(101)
 
 
-
 if __name__ == "__main__":
     main()

From 22cf67acbc52fcbb7c36f53f53918619b5f03569 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 22 Jul 2025 08:44:03 +0000
Subject: [PATCH 25/31] get rid of deprecation warning

---
 bin/oktoberfest_feature_gen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 363734f..a5a3bc5 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -319,8 +319,8 @@ def main():
 
     # Some search engines do not provide protein accessions for decoys.
     # In this case, we set the PROTEINS column to the `PEP_y<MODIFIED_SEQUENCE>` like in the Oktberfest docs.
-    oktoberfest_df["PROTEINS"].replace("", pd.NA, inplace=True)
-    oktoberfest_df["PROTEINS"].fillna("PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"], inplace=True)
+    oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True)
+    oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True)
     oktoberfest_df.to_csv(
         oktoberfest_input_csv_path,
         sep=",",

From 2a1c86f0edfa4a004132bbffe00ad81280cc9a1a Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 22 Jul 2025 08:44:36 +0000
Subject: [PATCH 26/31] correct comment

---
 bin/oktoberfest_feature_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index a5a3bc5..4873fd3 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -318,7 +318,7 @@ def main():
         )
 
     # Some search engines do not provide protein accessions for decoys.
-    # In this case, we set the PROTEINS column to the `PEP_y<MODIFIED_SEQUENCE>` like in the Oktberfest docs.
+    # In this case, we set the PROTEINS column to the `PEP_<MODIFIED_SEQUENCE>` like in the ms2rescore.
     oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True)
     oktoberfest_df.fillna({"PROTEINS": "PEP_" + oktoberfest_df["MODIFIED_SEQUENCE"]}, inplace=True)
     oktoberfest_df.to_csv(

From 760cc2919cf3f99deebf075964f1dd3ee7fc7ca9 Mon Sep 17 00:00:00 2001
From: Dirk Winkelhardt <dirk.winkelhardt@rub.de>
Date: Tue, 22 Jul 2025 08:58:07 +0000
Subject: [PATCH 27/31] remove unnecessary filter

---
 bin/oktoberfest_feature_gen.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 4873fd3..2b18e03 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -309,14 +309,6 @@ def main():
             "Removed %i PSMs with unsupported amino acids: %s", psms_len - len(oktoberfest_df), OKTOBERFEST_UNSUPPORTED_AMINO_ACIDS
         )
 
-    # Filter peptide length > 30
-    psms_len = len(oktoberfest_df)
-    oktoberfest_df = oktoberfest_df[oktoberfest_df["PEPTIDE_LENGTH"] <= 30]
-    if len(oktoberfest_df) < psms_len:
-        logging.warning(
-            "Removed %i PSMs with peptide length > 30", psms_len - len(oktoberfest_df)
-        )
-
     # Some search engines do not provide protein accessions for decoys.
     # In this case, we set the PROTEINS column to the `PEP_<MODIFIED_SEQUENCE>` like in the ms2rescore.
     oktoberfest_df.replace({"PROTEINS": ""}, pd.NA, inplace=True)

From 66730459de446d80327d3545ee82f4aa7212f7b3 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Mon, 1 Sep 2025 16:05:59 +0000
Subject: [PATCH 28/31] typo fixes

---
 bin/oktoberfest_feature_gen.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 2b18e03..9f28abd 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -327,13 +327,13 @@ def main():
 
     # misc
     config_dict["output"] = str(args.out_folder)
-    config_dict["num_threads"] = 1  # Set to 1 for debugging, can be increased later
+    config_dict["numThreads"] = 1  # Set to 1 for debugging, can be increased later
     # mass spec parameters
-    config_dict["mass_tolerance"] = args.mass_tolerance
+    config_dict["massTolerance"] = args.mass_tolerance
     config_dict["unitMassTolerance"] = args.mass_tolerance_unit
     # predicition params
-    config_dict["models"]["irt_model"] = args.irt_model
-    config_dict["models"]["intensity_model"] = args.intensity_model
+    config_dict["models"]["irt"] = args.irt_model
+    config_dict["models"]["intensity"] = args.intensity_model
     # input params
     config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
     config_dict["inputs"]["search_results_type"] = "Internal"

From f5a70db73f2b0a1a167858a0fccf97865272c396 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Tue, 2 Sep 2025 11:56:09 +0000
Subject: [PATCH 29/31] fixing typo in regex for spectra

---
 src/identification/comet_identification.nf     | 2 +-
 src/identification/msamanda_identification.nf  | 2 +-
 src/identification/msfragger_identification.nf | 2 +-
 src/identification/msgfplus_identification.nf  | 2 +-
 src/identification/sage_identification.nf      | 2 +-
 src/identification/xtandem_identification.nf   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/identification/comet_identification.nf b/src/identification/comet_identification.nf
index 18d3f9e..87b4953 100644
--- a/src/identification/comet_identification.nf
+++ b/src/identification/comet_identification.nf
@@ -8,7 +8,7 @@ params.comet_mem = "8 GB"
 
 params.comet_psm_id_pattern = "(.*)"
 params.comet_spectrum_id_pattern = '.*scan=(\\d+)$'
-params.comet_scan_id_pattern = '(?P<scan_id>\\d+)'
+params.comet_scan_id_pattern = '^(?P<scan_id>\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
diff --git a/src/identification/msamanda_identification.nf b/src/identification/msamanda_identification.nf
index 177c8c3..abbe794 100644
--- a/src/identification/msamanda_identification.nf
+++ b/src/identification/msamanda_identification.nf
@@ -8,7 +8,7 @@ params.msamanda_mem = "64 GB"
 
 params.msamanda_psm_id_pattern = "(.*)"
 params.msamanda_spectrum_id_pattern = '(.*)'
-params.msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
+params.msamanda_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
diff --git a/src/identification/msfragger_identification.nf b/src/identification/msfragger_identification.nf
index fbfebe0..7a21f0b 100644
--- a/src/identification/msfragger_identification.nf
+++ b/src/identification/msfragger_identification.nf
@@ -10,7 +10,7 @@ params.msfragger_calibrate = 2
 
 params.msfragger_psm_id_pattern = "(.*)"
 params.msfragger_spectrum_id_pattern = "(.*)"
-params.msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
+params.msfragger_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
diff --git a/src/identification/msgfplus_identification.nf b/src/identification/msgfplus_identification.nf
index c6815a3..489855c 100644
--- a/src/identification/msgfplus_identification.nf
+++ b/src/identification/msgfplus_identification.nf
@@ -16,7 +16,7 @@ params.msgfplus_split_fasta = 0         // split the fasta into this many chunks
 
 params.msgfplus_psm_id_pattern = "(.*)"
 params.msgfplus_spectrum_id_pattern = '(.*)'
-params.msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
+params.msgfplus_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
 include {convert_chunked_result_to_psm_utils; enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
diff --git a/src/identification/sage_identification.nf b/src/identification/sage_identification.nf
index 5758e9e..8a126d8 100644
--- a/src/identification/sage_identification.nf
+++ b/src/identification/sage_identification.nf
@@ -10,7 +10,7 @@ params.sage_prefilter_chunk_size = 0
 
 params.sage_psm_id_pattern = "(.*)"
 params.sage_spectrum_id_pattern = '(.*)'
-params.sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
+params.sage_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'
diff --git a/src/identification/xtandem_identification.nf b/src/identification/xtandem_identification.nf
index 58f6c23..38c6459 100644
--- a/src/identification/xtandem_identification.nf
+++ b/src/identification/xtandem_identification.nf
@@ -8,7 +8,7 @@ params.xtandem_mem = "128 GB"
 
 params.xtandem_psm_id_pattern = "(.*)"
 params.xtandem_spectrum_id_pattern = '(.*)'
-params.xtandem_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)*.'
+params.xtandem_scan_id_pattern = '.*scan=(?P<scan_id>\\d+)$'
 
 include {convert_and_enhance_psm_tsv} from '../postprocessing/convert_and_enhance_psm_tsv.nf'
 include {psm_percolator; psm_percolator as ms2rescore_percolator; psm_percolator as oktoberfest_percolator} from '../postprocessing/percolator.nf'

From 092dc6206150689419a08ff8304feb09db687a47 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Wed, 3 Sep 2025 07:17:56 +0000
Subject: [PATCH 30/31] fixing usage of .d folder only for maxquant tims data

---
 bin/oktoberfest_feature_gen.py                | 2 +-
 src/identification/maxquant_identification.nf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 9f28abd..7b31879 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -338,7 +338,7 @@ def main():
     config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
     config_dict["inputs"]["search_results_type"] = "Internal"
     config_dict["inputs"]["spectra"] = str(args.spectra_file)
-    config_dict["inputs"]["spectra_type"] = "d" if args.is_timstof else "mzml"
+    config_dict["inputs"]["spectra_type"] = args.spectra_file.suffix.replace(".", "").lower()
     # Setting this to none has the effect, that the generated features
     # are stored in the subfolder `results/none` of the output folder.
     config_dict["fdr_estimation_method"] = "NONE"
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index c330219..fa894e1 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -71,7 +71,7 @@ workflow maxquant_identification {
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)

From 326dec3a89d55ddb5e0e6bc56c87212d51a76784 Mon Sep 17 00:00:00 2001
From: julianu <julian.uszkoreit@rub.de>
Date: Thu, 4 Sep 2025 06:01:09 +0000
Subject: [PATCH 31/31] always use mzML for oktoberfest (also for timsTOF)

---
 bin/oktoberfest_feature_gen.py                | 8 +-------
 src/identification/maxquant_identification.nf | 7 ++++++-
 src/postprocessing/oktoberfest.nf             | 1 -
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/bin/oktoberfest_feature_gen.py b/bin/oktoberfest_feature_gen.py
index 7b31879..a6f1415 100755
--- a/bin/oktoberfest_feature_gen.py
+++ b/bin/oktoberfest_feature_gen.py
@@ -120,12 +120,6 @@ def argparse_setup() -> argparse.Namespace:
         choices=["da", "ppm"],
     )
 
-    parser.add_argument(
-        "-is-timstof",
-        help="If true, the spectra file type is set to 'd' (for timsTOF); otherwise, it defaults to 'mzml'",
-        type=parse_str_bool
-    )
-
     parser.add_argument(
         "-scan-id-regex",
         help=(
@@ -338,7 +332,7 @@ def main():
     config_dict["inputs"]["search_results"] = str(oktoberfest_input_csv_path)
     config_dict["inputs"]["search_results_type"] = "Internal"
     config_dict["inputs"]["spectra"] = str(args.spectra_file)
-    config_dict["inputs"]["spectra_type"] = args.spectra_file.suffix.replace(".", "").lower()
+    config_dict["inputs"]["spectra_type"] = "mzml"
     # Setting this to none has the effect, that the generated features
     # are stored in the subfolder `results/none` of the output folder.
     config_dict["fdr_estimation_method"] = "NONE"
diff --git a/src/identification/maxquant_identification.nf b/src/identification/maxquant_identification.nf
index fa894e1..54b9476 100644
--- a/src/identification/maxquant_identification.nf
+++ b/src/identification/maxquant_identification.nf
@@ -65,13 +65,18 @@ workflow maxquant_identification {
     }
 
     if (params.is_timstof) {
+        // MS2Rescore takes the .d files
         psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.d'  ] }
+
+        // oktoberfest needs the mzML files
+        psm_tsvs_and_spectra_oktoberfest = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML'  ] }
     } else {
         psm_tsvs_and_spectrafiles = psm_tsvs.map { it -> [ it.name, it.name.take(it.name.lastIndexOf('_msms')) + '.mzML'  ] }
+        psm_tsvs_and_spectra_oktoberfest = psm_tsvs_and_spectrafiles
     }
 
     ms2rescore_pins = ms2rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), psm_id_pattern, spectrum_id_pattern, '', 'maxquant')
-    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectrafiles, psm_tsvs.collect(), process_files.collect(), scan_id_pattern)
+    oktoberfest_pins = oktoberfest_rescore_workflow(psm_tsvs_and_spectra_oktoberfest, psm_tsvs.collect(), mzmls.collect(), scan_id_pattern)
     
     // perform percolation
     ms2rescore_percolator_results = ms2rescore_percolator(ms2rescore_pins.ms2rescore_pins)
diff --git a/src/postprocessing/oktoberfest.nf b/src/postprocessing/oktoberfest.nf
index 071e452..28e9165 100644
--- a/src/postprocessing/oktoberfest.nf
+++ b/src/postprocessing/oktoberfest.nf
@@ -68,7 +68,6 @@ process run_oktoberfest_feature_gen {
         -irt-model ${params.oktoberfest_irt_model} \
         -mass-tolerance ${fragment_tol_da} \
         -mass-tolerance-unit da \
-        -is-timstof ${params.is_timstof} \
         -scan-id-regex '${scan_id_regex}' \
 
     mv ./oktoberfest_out/results/none/rescore.tab "${psm_utils_tsvs}.features.tsv"