Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,297 changes: 2,297 additions & 0 deletions data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs.csv

Large diffs are not rendered by default.

2,297 changes: 2,297 additions & 0 deletions data/TPS-Nov19_2023_verified_all_reactions_with_folds_mmseqs_30_50.csv

Large diffs are not rendered by default.

12,270 changes: 12,270 additions & 0 deletions data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs.csv

Large diffs are not rendered by default.

12,270 changes: 12,270 additions & 0 deletions data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv

Large diffs are not rendered by default.

Binary file added data/mmseqs_clusters.pkl
Binary file not shown.
Binary file added data/mmseqs_clusters_30pct.pkl
Binary file not shown.
Binary file added data/mmseqs_clusters_30pct_50cov.pkl
Binary file not shown.
Binary file added data/mmseqs_clusters_40pct_80cov.pkl
Binary file not shown.
Binary file modified data/sampled_id_2_seq.pkl
Binary file not shown.
Binary file added data/sampled_id_2_seq_filtered.pkl
Binary file not shown.
Binary file added data/sampled_id_2_seq_original_backup.pkl
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand All @@ -22,7 +22,7 @@ n_jobs: 64
pred_batch_size: 32
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
7 changes: 4 additions & 3 deletions enzymeexplorer/configs/CLEAN/with_minor_reactions/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand All @@ -16,14 +16,15 @@ random_state: 0
hyperparam_dimensions: none
seq_col_name: "Amino acid sequence"
n_calls_hyperparams_opt: 0
clean_installation_root: "/home/samusevich/CLEAN"
clean_installation_root: "/mnt/c/Users/raman/Documents/bio_ml/CLEAN"
rhea2ec_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv"
rhea_reaction_smiles_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-reaction-smiles.tsv"
rhea_directions_link: "https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv"
clean_working_dir: "_clean_working_dir"
is_halo: false
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/DomainsRandomForest/main_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand Down Expand Up @@ -41,7 +41,7 @@ max_train_negs_proportion: 0.98
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
per_class_with_multilabel_regularization: 0
reuse_existing_partial_results: false
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
include: ../main_config.yaml
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
include: ../main_config.yaml
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
foldseek_distances: true
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
include: ../main_config.yaml
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand All @@ -23,7 +23,7 @@ pred_batch_size: 32
local_pdb_storage_path: "foldseek_temp"
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/HMM/with_minor_reactions/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand All @@ -23,7 +23,7 @@ zero_conf_level: 0.1
group_column_name: "Kingdom (plant, fungi, bacteria)"
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/PfamSUPFAM/pfam/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["isTPS"]
optimize_hyperparams: false
random_state: 0
Expand All @@ -13,7 +13,7 @@ root_path_to_models: "data/pfam_models"
working_directory: "pfam_temp"
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/PfamSUPFAM/supfam/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["isTPS"]
optimize_hyperparams: false
random_state: 0
Expand All @@ -13,7 +13,7 @@ root_path_to_models: "data/supfam_models"
working_directory: "supfam_temp"
neg_val: "Unknown"
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
reuse_existing_partial_results: false
load_per_class_params_from: ""
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand Down Expand Up @@ -36,7 +36,7 @@ max_train_negs_proportion: 0.98
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: true
reuse_existing_partial_results: false
load_per_class_params_from: ""
Expand Down
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/PlmDomainsMLP/main_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand Down Expand Up @@ -31,7 +31,7 @@ max_train_negs_proportion: 0.98
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: true
reuse_existing_partial_results: false
load_per_class_params_from: ""
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand Down Expand Up @@ -41,7 +41,7 @@ max_train_negs_proportion: 0.5
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
per_class_with_multilabel_regularization: 0
reuse_existing_partial_results: false
Expand Down
8 changes: 4 additions & 4 deletions enzymeexplorer/configs/PlmRandomForest/main_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand All @@ -11,7 +11,7 @@ class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"isTPS"]
optimize_hyperparams: true
optimize_hyperparams: false
random_state: 0
n_calls_hyperparams_opt: 150
hyperparam_dimensions:
Expand Down Expand Up @@ -41,8 +41,8 @@ max_train_negs_proportion: 0.98
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
per_class_optimization: true
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: false
per_class_with_multilabel_regularization: 0
reuse_existing_partial_results: false
load_per_class_params_from: ""
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include: ../main_config.yaml
representations_path: "data/gathered_embs_esm-1v_embs_avg.h5"
per_class_optimization: false
4 changes: 2 additions & 2 deletions enzymeexplorer/configs/PlmXgb/main_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id_col_name: "Uniprot ID"
target_col_name: "SMILES_substrate_canonical_no_stereo"
split_col_name: "stratified_phylogeny_based_split_with_minor_products"
split_col_name: "stratified_mmseqs_based_split_with_minor_products"
class_names: ["CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O",
"precursor substr",
Expand Down Expand Up @@ -57,7 +57,7 @@ max_train_negs_proportion: 0.98
neg_val: "Unknown"
save_trained_model: true
negatives_sample_path: "data/sampled_id_2_seq.pkl"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds.csv"
tps_cleaned_csv_path: "data/TPS-Nov19_2023_verified_all_reactions_with_neg_with_folds_mmseqs_30_50.csv"
per_class_optimization: true
reuse_existing_partial_results: false
load_per_class_params_from: ""
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,25 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--split-description", type=str, default="stratified_phylogeny_based_split"
)
parser.add_argument(
"--cluster-source",
type=str,
choices=["phylogenetic", "mmseqs"],
default="phylogenetic",
help="Source of sequence clusters: 'phylogenetic' (MSA-based) or 'mmseqs' (sequence identity-based)",
)
parser.add_argument(
"--phylogenetic-clusters-path",
type=str,
default="data/phylogenetic_clusters.pkl",
help="Path to phylogenetic clusters pickle file",
)
parser.add_argument(
"--mmseqs-clusters-path",
type=str,
default="data/mmseqs_clusters.pkl",
help="Path to mmseqs clusters pickle file",
)
args = parser.parse_args()
return args

Expand Down Expand Up @@ -308,6 +327,14 @@ def stratified_kfold_phylogeny_based(
len(terpene_synthases_df),
)

# Determine which cluster file to use
if cli_args.cluster_source == "mmseqs":
clusters_path = cli_args.mmseqs_clusters_path
logger.info("Using mmseqs clusters from %s", clusters_path)
else:
clusters_path = cli_args.phylogenetic_clusters_path
logger.info("Using phylogenetic clusters from %s", clusters_path)

logger.info(
"Computing a balanced StratifiedGroupKFold for split named %s",
cli_args.split_description,
Expand All @@ -316,6 +343,7 @@ def stratified_kfold_phylogeny_based(
terpene_synthases_df,
cli_args,
target_col_name="SMILES_substrate_canonical_no_stereo",
phylogenetic_clusters_path=clusters_path,
major_classes=[
{"CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"},
{"CC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"},
Expand Down
Loading
Loading