pluskal-lab · segef · Mar 11, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs.csv b/data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs.csv
diff --git a/data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs_30_50.csv b/data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs_30_50.csv
diff --git a/data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs.csv b/data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs.csv
diff --git a/data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv b/data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv
diff --git a/data/mmseqs_clusters.pkl b/data/mmseqs_clusters.pkl
diff --git a/data/mmseqs_clusters_30pct.pkl b/data/mmseqs_clusters_30pct.pkl
diff --git a/data/mmseqs_clusters_30pct_50cov.pkl b/data/mmseqs_clusters_30pct_50cov.pkl
diff --git a/data/mmseqs_clusters_40pct_80cov.pkl b/data/mmseqs_clusters_40pct_80cov.pkl
diff --git a/data/sampled_id_2_seq.pkl b/data/sampled_id_2_seq.pkl
diff --git a/data/sampled_id_2_seq_filtered.pkl b/data/sampled_id_2_seq_filtered.pkl
diff --git a/data/sampled_id_2_seq_original_backup.pkl b/data/sampled_id_2_seq_original_backup.pkl
diff --git a/enzymeexplorer/configs/Blastp/with_minor_reactions/config.yaml b/enzymeexplorer/configs/Blastp/with_minor_reactions/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -22,7 +22,7 @@ n_jobs: 64
 pred_batch_size: 32
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml b/enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -16,14 +16,15 @@ random_state: 0
 hyperparam_dimensions: none
 seq_col_name: "Amino acid sequence"
 n_calls_hyperparams_opt: 0
-clean_installation_root: "/home/samusevich/CLEAN"
+clean_installation_root: "/mnt/c/Users/raman/Documents/bio_ml/CLEAN"
 rhea2ec_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv"
 rhea_reaction_smiles_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-reaction-smiles.tsv"
 rhea_directions_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv"
 clean_working_dir: "_clean_working_dir"
+is_halo: false
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/DomainsRandomForest/main_config.yaml b/enzymeexplorer/configs/DomainsRandomForest/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -41,7 +41,7 @@ max_train_negs_proportion: 0.98
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 per_class_with_multilabel_regularization: 0
 reuse_existing_partial_results: false

diff --git a/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions/config.yaml b/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions/config.yaml
@@ -1,2 +1,2 @@
 include: ../main_config.yaml
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
diff --git a/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions_foldseek/config.yaml b/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions_foldseek/config.yaml
@@ -1,3 +1,3 @@
 include: ../main_config.yaml
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 foldseek_distances: true
diff --git a/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions_global_tuning/config.yaml b/enzymeexplorer/configs/DomainsRandomForest/with_minor_reactions_global_tuning/config.yaml
@@ -1,3 +1,3 @@
 include: ../main_config.yaml
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
diff --git a/enzymeexplorer/configs/Foldseek/with_minor_reactions/config.yaml b/enzymeexplorer/configs/Foldseek/with_minor_reactions/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -23,7 +23,7 @@ pred_batch_size: 32
 local_pdb_storage_path: "foldseek_temp"
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/HMM/with_minor_reactions/config.yaml b/enzymeexplorer/configs/HMM/with_minor_reactions/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -23,7 +23,7 @@ zero_conf_level: 0.1
 group_column_name: "Kingdom (plant, fungi, bacteria)"
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/PfamSUPFAM/pfam/config.yaml b/enzymeexplorer/configs/PfamSUPFAM/pfam/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["isTPS"]
 optimize_hyperparams: false
 random_state: 0
@@ -13,7 +13,7 @@ root_path_to_models: "data/pfam_models"
 working_directory: "pfam_temp"
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/PfamSUPFAM/supfam/config.yaml b/enzymeexplorer/configs/PfamSUPFAM/supfam/config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["isTPS"]
 optimize_hyperparams: false
 random_state: 0
@@ -13,7 +13,7 @@ root_path_to_models: "data/supfam_models"
 working_directory: "supfam_temp"
 neg_val: "Unknown"
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/PlmDomainsLogisticRegression/main_config.yaml b/enzymeexplorer/configs/PlmDomainsLogisticRegression/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -36,7 +36,7 @@ max_train_negs_proportion: 0.98
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: true
 reuse_existing_partial_results: false
 load_per_class_params_from: ""

diff --git a/enzymeexplorer/configs/PlmDomainsMLP/main_config.yaml b/enzymeexplorer/configs/PlmDomainsMLP/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -31,7 +31,7 @@ max_train_negs_proportion: 0.98
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: true
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/PlmDomainsRandomForest/main_config.yaml b/enzymeexplorer/configs/PlmDomainsRandomForest/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -41,7 +41,7 @@ max_train_negs_proportion: 0.5
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: false
 per_class_with_multilabel_regularization: 0
 reuse_existing_partial_results: false

diff --git a/enzymeexplorer/configs/PlmRandomForest/main_config.yaml b/enzymeexplorer/configs/PlmRandomForest/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -11,7 +11,7 @@ class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "isTPS"]
-optimize_hyperparams: true
+optimize_hyperparams: false
 random_state: 0
 n_calls_hyperparams_opt: 150
 hyperparam_dimensions:
@@ -41,8 +41,8 @@ max_train_negs_proportion: 0.98
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
-per_class_optimization: true
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
+per_class_optimization: false
 per_class_with_multilabel_regularization: 0
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/configs/PlmRandomForest/tps_esm1v_quick_check/config.yaml b/enzymeexplorer/configs/PlmRandomForest/tps_esm1v_quick_check/config.yaml
@@ -0,0 +1,3 @@
+include: ../main_config.yaml
+representations_path: "data/gathered_embs_esm-1v_embs_avg.h5"
+per_class_optimization: false
diff --git a/enzymeexplorer/configs/PlmXgb/main_config.yaml b/enzymeexplorer/configs/PlmXgb/main_config.yaml
@@ -1,6 +1,6 @@
 id_col_name: "Uniprot ID"
 target_col_name: "SMILES_substrate_canonical_no_stereo"
-split_col_name: "stratified_phylogeny_based_split_with_minor_products"
+split_col_name: "stratified_mmseqs_based_split_with_minor_products"
 class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
             "precursor substr",
@@ -57,7 +57,7 @@ max_train_negs_proportion: 0.98
 neg_val: "Unknown"
 save_trained_model: true
 negatives_sample_path: "data/sampled_id_2_seq.pkl"
-tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
+tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
 per_class_optimization: true
 reuse_existing_partial_results: false
 load_per_class_params_from: ""
diff --git a/enzymeexplorer/src/data_preparation/get_balanced_stratified_group_kfolds.py b/enzymeexplorer/src/data_preparation/get_balanced_stratified_group_kfolds.py
@@ -48,6 +48,25 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--split-description", type=str, default="stratified_phylogeny_based_split"
     )
+    parser.add_argument(
+        "--cluster-source",
+        type=str,
+        choices=["phylogenetic", "mmseqs"],
+        default="phylogenetic",
+        help="Source of sequence clusters: 'phylogenetic' (MSA-based) or 'mmseqs' (sequence identity-based)",
+    )
+    parser.add_argument(
+        "--phylogenetic-clusters-path",
+        type=str,
+        default="data/phylogenetic_clusters.pkl",
+        help="Path to phylogenetic clusters pickle file",
+    )
+    parser.add_argument(
+        "--mmseqs-clusters-path",
+        type=str,
+        default="data/mmseqs_clusters.pkl",
+        help="Path to mmseqs clusters pickle file",
+    )
     args = parser.parse_args()
     return args
 
@@ -308,6 +327,14 @@ def stratified_kfold_phylogeny_based(
         len(terpene_synthases_df),
     )
 
+    # Pick the clusters pickle path according to --cluster-source ("mmseqs" or the default phylogenetic)
+    if cli_args.cluster_source == "mmseqs":
+        clusters_path = cli_args.mmseqs_clusters_path
+        logger.info("Using mmseqs clusters from %s", clusters_path)
+    else:
+        clusters_path = cli_args.phylogenetic_clusters_path
+        logger.info("Using phylogenetic clusters from %s", clusters_path)
+
     logger.info(
         "Computing a balanced StratifiedGroupKFold for split named %s",
         cli_args.split_description,
@@ -316,6 +343,7 @@ def stratified_kfold_phylogeny_based(
         terpene_synthases_df,
         cli_args,
         target_col_name="SMILES_substrate_canonical_no_stereo",
+        phylogenetic_clusters_path=clusters_path,
         major_classes=[
             {"CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"},
             {"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"},