Skip to content
883 changes: 883 additions & 0 deletions data/ec_to_substrate_mapping_2026_03_14.json

Large diffs are not rendered by default.

4,957 changes: 4,957 additions & 0 deletions data/marts2ec_2026_03_14.csv

Large diffs are not rendered by default.

4,206 changes: 4,206 additions & 0 deletions data/martsDB_reactions_2026_02_22.csv

Large diffs are not rendered by default.

18,344 changes: 18,344 additions & 0 deletions data/rhea-directions_2026_03_14.tsv

Large diffs are not rendered by default.

36,014 changes: 36,014 additions & 0 deletions data/rhea-reaction-smiles_2026_03_14.tsv

Large diffs are not rendered by default.

7,903 changes: 7,903 additions & 0 deletions data/rhea2ec_2026_03_14.tsv

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@ random_state: 0
hyperparam_dimensions: none
seq_col_name: "Amino acid sequence"
n_calls_hyperparams_opt: 0
clean_installation_root: "/mnt/c/Users/raman/Documents/bio_ml/CLEAN"
rhea2ec_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv"
rhea_reaction_smiles_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-reaction-smiles.tsv"
rhea_directions_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv"
clean_working_dir: "_clean_working_dir"
clean_installation_root: "~/CLEAN"
ec_2_substrates_json_path: "data/ec_to_substrate_mapping_2026_03_14.json"
is_halo: false
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
pretrained_models_link: "https://drive.google.com/file/d/1J3FXW08At6LTjDckWVTaMx6VdJdYrLNp/view?usp=sharing"
85 changes: 85 additions & 0 deletions enzymeexplorer/src/data_preparation/clean_dataset_prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import pandas as pd
import logging
from collections import defaultdict
from pathlib import Path

# Configure the root logger at import time so the script emits timestamped messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Name the logger after the module rather than its file path, so it slots into
# the standard dotted logger hierarchy.
logger = logging.getLogger(__name__)

def parse_args(argv=None):
    """Parse the command-line options of the dataset preparation script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace with the attributes ``marts2ec_csv_path``,
        ``swissprot_csv_path``, ``enzyme_explorer_dataset_csv_path``,
        ``prefix`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Build per-fold train tables, per-fold test FASTA files and a combined "
            "EC-annotated table from the EnzymeExplorer dataset"
        ),
    )
    parser.add_argument(
        "--marts2ec-csv-path",
        type=str,
        help="Path to the Marts2EC CSV file",
        default="./data/marts2ec_2026_03_14.csv",
    )
    parser.add_argument(
        "--swissprot-csv-path",
        type=str,
        # The file is read with sep="\t", so describe it as a TSV.
        help="Path to the SwissProt TSV (tab-separated) file",
        default="./data/swissprot_2026_03_14.tsv",
    )
    parser.add_argument(
        "--enzyme-explorer-dataset-csv-path",
        type=str,
        help="Path to the EnzymeExplorer dataset CSV file",
        default="./data/EnzymeExplorer_Dataset.csv",
    )
    parser.add_argument(
        "--prefix",
        type=str,
        help="Prefix for the output files",
        default="enzexp",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        help="Path to the output directory",
        default="./data/clean_datasets",
    )
    return parser.parse_args(argv)


def _load_swissprot_ec_table(path):
    """Read the SwissProt TSV and return one (Entry, EC number, Sequence) row per EC."""
    table = pd.read_csv(path, sep="\t")
    table = table[["Entry", "EC number", "Sequence"]].dropna()
    # Entries whose EC field contains "-" (partially specified ECs) are discarded.
    table = table[~table["EC number"].str.contains("-", regex=False)]
    # The EC field may hold several ";"-separated numbers: split, one per row, trim.
    table = table.assign(**{"EC number": table["EC number"].str.split(";")})
    table = table.explode("EC number").reset_index(drop=True)
    table["EC number"] = table["EC number"].str.strip()
    return table


def _to_entry_table(frame):
    """Return one row per ID with the columns Entry / EC number / Sequence."""
    table = frame[["ID", "ECs", "Aminoacid_sequence"]].drop_duplicates("ID")
    # Sets have no stable iteration order; sort so the output is reproducible.
    table = table.assign(ECs=table["ECs"].map(lambda ecs: ";".join(sorted(ecs))))
    table.columns = ["Entry", "EC number", "Sequence"]
    return table


def _write_fasta(frame, path):
    """Write the ID / Aminoacid_sequence pairs of ``frame`` as FASTA records."""
    records = frame[["ID", "Aminoacid_sequence"]].drop_duplicates("ID")
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            f">{seq_id}\n{sequence}\n"
            for seq_id, sequence in zip(records["ID"], records["Aminoacid_sequence"])
        )


def main():
    """Annotate the EnzymeExplorer dataset with EC numbers and export it per fold.

    EC numbers are looked up by protein ID in the Marts2EC table and in
    SwissProt (SwissProt wins when an ID occurs in both). Rows without any EC
    number are dropped. For each of the five folds a tab-separated training
    table (all other folds) and a test FASTA file (that fold) are written,
    followed by one combined tab-separated table of all annotated rows.
    """
    args = parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    marts2ec = pd.read_csv(args.marts2ec_csv_path)
    dataset = pd.read_csv(args.enzyme_explorer_dataset_csv_path)
    swissprot = _load_swissprot_ec_table(args.swissprot_csv_path)

    # A missing EC value would end up as NaN inside the set and break the join below.
    marts2ec = marts2ec[["Enzyme_marts_ID", "ID"]].dropna(subset=["ID"])
    marts_id_to_ec = marts2ec.groupby("Enzyme_marts_ID")["ID"].apply(set).to_dict()
    swissprot_to_ec = swissprot.groupby("Entry")["EC number"].apply(set).to_dict()
    # SwissProt entries come last so they override Marts entries with the same ID.
    prot_id_to_ec = {**marts_id_to_ec, **swissprot_to_ec}

    dataset = dataset.assign(ECs=dataset["ID"].map(prot_id_to_ec))
    annotated = dataset[dataset["ECs"].notna()]
    logger.info("Kept %d of %d dataset rows with an EC annotation", len(annotated), len(dataset))

    for fold in range(5):
        in_fold = annotated["Fold"] == fold
        _to_entry_table(annotated[~in_fold]).to_csv(
            output_dir / f"{args.prefix}_fold_{fold}_train.csv", index=False, sep="\t"
        )
        _write_fasta(annotated[in_fold], output_dir / f"{args.prefix}_fold_{fold}_test.fasta")

    _to_entry_table(annotated).to_csv(
        output_dir / f"{args.prefix}_combined_data.csv", index=False, sep="\t"
    )


if __name__ == "__main__":
    main()
106 changes: 106 additions & 0 deletions enzymeexplorer/src/data_preparation/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import numpy as np

# Potential TPS SwissProt IDs without any functional annotation (manually curated).
# Each accession is listed exactly once.
PUTATIVE_TPS_IDS = [
    "A0A2B7YDW3",
    "P9WEH1",
    "A0A2Z4HPY4",
    "P0CJ42",
    "B8NHE1",
    "A0A084R1K7",
    "A0A6A6H2E0",
    "Q2UEK4",
    "A0A5Q0QMX6",
    "F0ZHE2",
    "A0A5Q0QRK8",
    "O65688",
    "A0A8H5Z7W4",
    "M2SPA3",
    "A0A348B794",
    "A0A1V0QSA9",
    "P0DXD5",
]


# Substrate strings that occur for several EC numbers, spelled out once.
_SUBSTR_PRECURSOR = "precursor substr"
_SUBSTR_GPP = "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
_SUBSTR_FPP = "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
_SUBSTR_GGPP = "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
_SUBSTR_GFPP = "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

# EC number -> substrate (a SMILES string, or the "precursor substr" placeholder).
TPS_ECS_TO_SUBSTRATES_BASE = {
    "2.5.1.30": _SUBSTR_PRECURSOR,
    "2.5.1.67": _SUBSTR_PRECURSOR,
    "2.5.1.69": _SUBSTR_PRECURSOR,
    "2.5.1.82": _SUBSTR_PRECURSOR,
    "2.5.1.83": _SUBSTR_PRECURSOR,
    "2.5.1.84": _SUBSTR_PRECURSOR,
    "2.5.1.85": _SUBSTR_PRECURSOR,
    "2.5.1.90": _SUBSTR_PRECURSOR,
    "2.5.1.91": _SUBSTR_PRECURSOR,
    "2.5.1.150": _SUBSTR_PRECURSOR,
    "3.1.7.5": _SUBSTR_GGPP,
    "3.1.7.10": _SUBSTR_GGPP,
    "3.1.7.12": "CC1CCC2(C)C(CCC=C2C)C1(C)CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "4.2.1.123": "CC(C)=CCCC(C)=CCCC(C)=CCCC=C(C)CCC=C(C)CCC=C(C)C",
    "4.2.1.138": "CC1CCC2C(CC2(C)C)C(=C)CCC=1",
    "4.2.3.8": _SUBSTR_GGPP,
    "4.2.3.41": _SUBSTR_GGPP,
    "4.2.3.63": _SUBSTR_FPP,
    "4.2.3.64": _SUBSTR_FPP,
    "4.2.3.142": _SUBSTR_FPP,
    "4.2.3.151": _SUBSTR_GGPP,
    "4.2.3.156": (
        "CC(CCC=C(C)CCC=C(C)C)=CC1C(COP([O-])(=O)OP([O-])([O-])=O)C1(C)CCC=C(C)CCC=C(C)C"
    ),
    "4.2.3.205": "CC1C(C)C(C)C(C)(CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O)C=1C",
    "4.2.3.207": _SUBSTR_GGPP,
    "4.2.3.208": _SUBSTR_GGPP,
    "4.2.3.209": _SUBSTR_GGPP,
    "4.2.3.210": _SUBSTR_GGPP,
    "4.2.3.211": _SUBSTR_FPP,
    "4.2.3.212": _SUBSTR_FPP,
    "4.2.3.213": (
        "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "4.2.3.216": _SUBSTR_GFPP,
    "4.2.3.217": _SUBSTR_GFPP,
    "4.2.3.222": _SUBSTR_GGPP,
    "4.2.3.224": _SUBSTR_GGPP,
    "4.2.3.229": "CC1(C)CCCC2(C)C1CCC(=C)C2CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "5.5.1.8": _SUBSTR_GPP,
}

# EC numbers in the non-TPS set (NOTE(review): presumably excluded from TPS
# handling by callers — confirm against usage).
NON_TPS_ECS = {
    "2.5.1.28",
    "2.5.1.68",
    "2.5.1.92",
    "2.5.1.142",
    "4.1.99.16",
}

# GO terms that are blacklisted for TPS identification, even though TPS enzymes
# may carry these annotations. Kept in ascending order for easy lookup.
TPS_GO_BLACKLIST = {
    "GO:0000287",
    "GO:0003723",
    "GO:0004452",
    "GO:0005506",
    "GO:0009975",
    "GO:0016491",
    "GO:0016740",
    "GO:0016746",
    "GO:0016765",
    "GO:0016787",
    "GO:0016791",
    "GO:0016823",
    "GO:0016829",
    "GO:0016836",
    "GO:0016838",
    "GO:0016853",
    "GO:0016866",
    "GO:0016872",
    "GO:0030145",
    "GO:0030955",
    "GO:0042802",
    "GO:0042803",
    "GO:0046872",
}

# Single-molecule major classes, keyed by a short label (insertion order is the
# order in which they appear in MAJOR_CLASSES).
_MAJOR_SINGLE_SMILES = {
    "FPP": "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "GPP": "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "GGPP": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "squalene epoxide": "CC(C)=CCCC(C)=CCCC(C)=CCCC=C(C)CCC=C(C)CCC1OC1(C)C",
    "copalyl PP": "CC1(C)CCCC2(C)C1CCC(=C)C2CCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
    "GFPP": "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
}

MAJOR_CLASSES = [
    *_MAJOR_SINGLE_SMILES.values(),
    # Two-molecule classes: the same SMILES twice, joined with ".".
    ".".join([_MAJOR_SINGLE_SMILES["FPP"]] * 2),  # FPP + FPP
    ".".join([_MAJOR_SINGLE_SMILES["GGPP"]] * 2),  # GGPP + GGPP
    "precursor substr",
]
Loading
Loading