pluskal-lab · segef · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/data/ec_to_substrate_mapping_2026_03_14.json b/data/ec_to_substrate_mapping_2026_03_14.json
diff --git a/data/marts2ec_2026_03_14.csv b/data/marts2ec_2026_03_14.csv
diff --git a/data/martsDB_reactions_2026_02_22.csv b/data/martsDB_reactions_2026_02_22.csv
diff --git a/data/rhea-directions_2026_03_14.tsv b/data/rhea-directions_2026_03_14.tsv
diff --git a/data/rhea-reaction-smiles_2026_03_14.tsv b/data/rhea-reaction-smiles_2026_03_14.tsv
diff --git a/data/rhea2ec_2026_03_14.tsv b/data/rhea2ec_2026_03_14.tsv
diff --git a/enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml b/enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml
@@ -16,15 +16,13 @@ random_state: 0
 hyperparam_dimensions: none
 seq_col_name: "Amino acid sequence"
 n_calls_hyperparams_opt: 0
-clean_installation_root: "/mnt/c/Users/raman/Documents/bio_ml/CLEAN"
-rhea2ec_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv"
-rhea_reaction_smiles_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-reaction-smiles.tsv"
-rhea_directions_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv"
-clean_working_dir: "_clean_working_dir"
+clean_installation_root: "~/CLEAN"  # NOTE(review): YAML does not expand "~" — confirm the consumer calls expanduser on this path
+ec_2_substrates_json_path: "data/ec_to_substrate_mapping_2026_03_14.json"
 is_halo: false
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
 tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
+pretrained_models_link: "https://drive.google.com/file/d/1J3FXW08At6LTjDckWVTaMx6VdJdYrLNp/view?usp=sharing"
diff --git a/enzymeexplorer/src/data_preparation/clean_dataset_prep.py b/enzymeexplorer/src/data_preparation/clean_dataset_prep.py
@@ -0,0 +1,85 @@
+import argparse
+import pandas as pd
+import logging
+from collections import defaultdict
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)  # module name, not __file__: the "." in a ".py" path would be read as a logger-hierarchy separator
+
+def parse_args():
+    """Parse the command-line options of this dataset preparation script.
+
+    All options are optional strings; the path defaults point into ``./data``.
+
+    Returns:
+        argparse.Namespace: parsed options with the attributes
+        ``marts2ec_csv_path``, ``swissprot_csv_path``,
+        ``enzyme_explorer_dataset_csv_path``, ``prefix`` and ``output_dir``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Prepare per-fold train/test splits and a combined dataset with EC annotations"
+    )
+    # (flag, default, help) for every option; all of them are plain strings.
+    options = (
+        ("--marts2ec-csv-path", "./data/marts2ec_2026_03_14.csv", "Path to the Marts2EC CSV file"),
+        (
+            "--swissprot-csv-path",
+            "./data/swissprot_2026_03_14.tsv",
+            # The flag keeps its "csv" name for compatibility, but main() reads this file with sep="\t".
+            "Path to the SwissProt TSV (tab-separated) file",
+        ),
+        (
+            "--enzyme-explorer-dataset-csv-path",
+            "./data/EnzymeExplorer_Dataset.csv",
+            "Path to the EnzymeExplorer dataset CSV file",
+        ),
+        ("--prefix", "enzexp", "Prefix for the output files"),
+        ("--output-dir", "./data/clean_datasets", "Path to the output directory"),
+    )
+    for flag, default, help_text in options:
+        parser.add_argument(flag, type=str, default=default, help=help_text)
+    return parser.parse_args()
+
+
+def main():
+    """Write train TSVs and test FASTAs for folds 0-4 plus one combined TSV.
+
+    ECs come from Marts2EC and from SwissProt rows whose EC field has no "-";
+    dataset rows without an EC are dropped. Tables are tab-separated ``.csv``.
+    """
+    args = parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    marts2ec = pd.read_csv(args.marts2ec_csv_path)
+    dataset = pd.read_csv(args.enzyme_explorer_dataset_csv_path)
+    swissprot = pd.read_csv(args.swissprot_csv_path, sep="\t")
+    swissprot = swissprot[["Entry", "EC number", "Sequence"]].dropna()
+    swissprot = swissprot[~swissprot["EC number"].str.contains("-", regex=False)]
+    swissprot["EC number"] = swissprot["EC number"].str.split(";").apply(lambda x: [ec.strip() for ec in x])
+    swissprot = swissprot.explode("EC number").reset_index(drop=True)
+
+    marts_id_to_ec = marts2ec[["Enzyme_marts_ID", "ID"]].groupby("Enzyme_marts_ID")["ID"].apply(set).to_dict()
+    swissprot_to_ec = swissprot[["Entry", "EC number"]].groupby("Entry")["EC number"].apply(set).to_dict()
+    # For an ID present in both mappings the SwissProt EC set replaces the Marts one.
+    dataset["ECs"] = dataset["ID"].map({**marts_id_to_ec, **swissprot_to_ec})
+    dataset = dataset[dataset["ECs"].notna()]
+
+    def write_table(frame, name):
+        # Sets iterate in no fixed order, so sort the ECs to keep the EC column stable across runs.
+        table = frame[["ID", "ECs", "Aminoacid_sequence"]].drop_duplicates("ID")
+        table["ECs"] = table["ECs"].apply(lambda ecs: ";".join(sorted(ecs)))
+        table.columns = ["Entry", "EC number", "Sequence"]
+        table.to_csv(output_dir / f"{args.prefix}_{name}.csv", index=False, sep="\t")
+
+    for i in range(5):
+        write_table(dataset[dataset["Fold"] != i], f"fold_{i}_train")
+        fold_i_test = dataset[dataset["Fold"] == i][["ID", "Aminoacid_sequence"]].drop_duplicates("ID")
+        with open(output_dir / f"{args.prefix}_fold_{i}_test.fasta", "w", encoding="utf-8") as f:
+            for prot_id, seq in zip(fold_i_test["ID"], fold_i_test["Aminoacid_sequence"]):
+                f.write(f">{prot_id}\n{seq}\n")
+    write_table(dataset, "combined_data")
+
+if __name__ == "__main__":
+    main()
diff --git a/enzymeexplorer/src/data_preparation/constants.py b/enzymeexplorer/src/data_preparation/constants.py
@@ -0,0 +1,106 @@
+import numpy as np
+
+# Potential TPS SwissProt IDs without any functional annotation (manually curated)
+PUTATIVE_TPS_IDS = [
+    "A0A2B7YDW3",
+    "P9WEH1",
+    "A0A2Z4HPY4",
+    "P0CJ42",
+    "B8NHE1",
+    "A0A084R1K7",
+    "A0A6A6H2E0",
+    # NOTE(review): "P9WEH1" was listed a second time here; duplicate dropped — confirm it was not a typo for another ID.
+    "Q2UEK4",
+    "A0A5Q0QMX6",
+    "F0ZHE2",
+    "A0A5Q0QRK8",
+    "O65688",
+    "A0A8H5Z7W4",
+    "M2SPA3",
+    "A0A348B794",
+    "A0A1V0QSA9",
+    "P0DXD5",
+]
+
+
+TPS_ECS_TO_SUBSTRATES_BASE = {  # EC number -> substrate SMILES; "precursor substr" is a non-SMILES label that is also the last entry of MAJOR_CLASSES
+    "2.5.1.30": "precursor substr",
+    "2.5.1.67": "precursor substr",
+    "2.5.1.69": "precursor substr",
+    "2.5.1.82": "precursor substr",
+    "2.5.1.83": "precursor substr",
+    "2.5.1.84": "precursor substr",
+    "2.5.1.85": "precursor substr",
+    "2.5.1.90": "precursor substr",
+    "2.5.1.91": "precursor substr",
+    "2.5.1.150": "precursor substr",
+    "3.1.7.5": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "3.1.7.10": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "3.1.7.12": "CC1CCC2(C)C(CCC=C2C)C1(C)CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.1.123": "CC(C)=CCCC(C)=CCCC(C)=CCCC=C(C)CCC=C(C)CCC=C(C)C",
+    "4.2.1.138": "CC1CCC2C(CC2(C)C)C(=C)CCC=1",
+    "4.2.3.8": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.41": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.63": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.64": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.142": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.151": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.156": "CC(CCC=C(C)CCC=C(C)C)=CC1C(COP([O-])(=O)OP([O-])([O-])=O)C1(C)CCC=C(C)CCC=C(C)C",
+    "4.2.3.205": "CC1C(C)C(C)C(C)(CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O)C=1C",
+    "4.2.3.207": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.208": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.209": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.210": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.211": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.212": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.213": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.216": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.217": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.222": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.224": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "4.2.3.229": "CC1(C)CCCC2(C)C1CCC(=C)C2CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+    "5.5.1.8": "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
+}
+
+NON_TPS_ECS = {"2.5.1.142", "2.5.1.28", "2.5.1.68", "2.5.1.92", "4.1.99.16"}  # NOTE(review): presumably EC numbers to exclude as non-TPS — confirm against the consumer
+
+# GO terms blacklisted for TPS identification, even though TPS enzymes may
+# be annotated with them. Kept as a set: entry order carries no meaning and
+# an ID repeated by accident would collapse silently.
+TPS_GO_BLACKLIST = {
+    "GO:0003723",
+    "GO:0009975",
+    "GO:0016491",
+    "GO:0016740",
+    "GO:0016787",
+    "GO:0016829",
+    "GO:0016853",
+    "GO:0042802",
+    "GO:0000287",
+    "GO:0004452",
+    "GO:0016746",
+    "GO:0016765",
+    "GO:0016791",
+    "GO:0016823",
+    "GO:0016836",
+    "GO:0016866",
+    "GO:0016872",
+    "GO:0042803",
+    "GO:0046872",
+    "GO:0030955",
+    "GO:0005506",
+    "GO:0030145",
+    "GO:0016838",
+}
+
+MAJOR_CLASSES = [  # substrate SMILES, named in the per-line comments; "." separates the two molecules of a paired entry
+    "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # FPP
+    "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # GPP
+    "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # GGPP
+    "CC(C)=CCCC(C)=CCCC(C)=CCCC=C(C)CCC=C(C)CCC1OC1(C)C",  # squalene epoxide
+    "CC1(C)CCCC2(C)C1CCC(=C)C2CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # copalyl PP
+    "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # GFPP
+    "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # FPP + FPP
+    "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",  # GGPP + GGPP
+    "precursor substr",  # non-SMILES label; the same string is used as a value in TPS_ECS_TO_SUBSTRATES_BASE
+]