
Commit 4cee5aa

Remove hardcoded plm checkpoint paths

1 parent 43cc933 commit 4cee5aa

3 files changed: 10 additions & 4 deletions

enzymeexplorer/src/embeddings_extraction/ankh_transformer_utils.py

Lines changed: 3 additions & 1 deletion
@@ -6,10 +6,12 @@
 
 def get_model_and_tokenizer(
     model_name: str,
+    checkpoint_dir: Optional[str] = "data/plm_checkpoints"
 ) -> tuple:
     """
     This function returns bert model and batch converter (basically a tokenizer) based on the model name
     :param model_name: model name
+    :param checkpoint_dir: directory where checkpoints are stored
     :return: a pair of the bert protein model and its tokenizer
     """
     assert model_name in {
@@ -22,7 +24,7 @@ def get_model_and_tokenizer(
     elif model_name == "ankh_tps":
         model, tokenizer = ankh.load_base_model(generation=True)
         model.load_state_dict(
-            torch.load("data/plm_checkpoints/tps_ankh_lr=5e-05_bs=32.pth")[
+            torch.load(f"{checkpoint_dir}/tps_ankh_lr=5e-05_bs=32.pth")[
                 "model_state_dict"
             ],
             strict=False,
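
With this change the Ankh loader accepts a configurable checkpoint directory instead of the hardcoded `data/plm_checkpoints` path. A minimal usage sketch — the import path is inferred from the file's location in the repo, and the directory value is purely illustrative:

    from enzymeexplorer.src.embeddings_extraction.ankh_transformer_utils import (
        get_model_and_tokenizer,
    )

    # Load the base Ankh model and overlay the fine-tuned TPS weights found in
    # a non-default directory (illustrative path, not one shipped with the repo).
    model, tokenizer = get_model_and_tokenizer(
        "ankh_tps", checkpoint_dir="/mnt/checkpoints/plm"
    )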

enzymeexplorer/src/embeddings_extraction/esm_transformer_utils.py

Lines changed: 3 additions & 1 deletion
@@ -24,20 +24,22 @@ def get_model_and_tokenizer(
     model_name: str,
     checkpoint_names: Optional[dict[str, str]] = None,
     return_alphabet: bool = False,
+    checkpoint_dir: Optional[str] = "data/plm_checkpoints",
 ) -> tuple:
     """
     This function returns bert model and batch converter (basically a tokenizer) based on the name
     :param model_name: model name
     :param checkpoint_names: mapping between model name and checkpoint file
     :param return_alphabet: flag to return alphabet object
+    :param checkpoint_dir: directory where checkpoints are stored
     :return: a pair of the bert protein model and its batch converter
     """
     if checkpoint_names is None:
         checkpoint_names = CHECKPOINT_NAMES
     if model_name in checkpoint_names:
         checkpoint_name = checkpoint_names[model_name]
         ckpt = torch.load(
-            f"data/plm_checkpoints/{checkpoint_name}",
+            f"{checkpoint_dir}/{checkpoint_name}",
             map_location=torch.device("cpu"),
         )
         bert_model, bert_alphabet = getattr(esm.pretrained, "esm1v_t33_650M_UR90S_1")()
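
The ESM loader gains the same override. A sketch under the same assumption about the import path; `"esm_model_key"` stands in for a real key of `CHECKPOINT_NAMES`, which this diff does not show:

    from enzymeexplorer.src.embeddings_extraction.esm_transformer_utils import (
        get_model_and_tokenizer,
    )

    # "esm_model_key" is hypothetical and must match an entry in CHECKPOINT_NAMES;
    # the checkpoint directory is likewise illustrative. With return_alphabet=True
    # the alphabet object is returned alongside the model and batch converter.
    model, batch_converter, alphabet = get_model_and_tokenizer(
        "esm_model_key",
        return_alphabet=True,
        checkpoint_dir="/mnt/checkpoints/plm",
    )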

enzymeexplorer/src/screening/tps_predict_fasta.py

Lines changed: 4 additions & 2 deletions
@@ -56,6 +56,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--ckpt-root-path", type=str, default="data/classifier_checkpoints.pkl"
     )
+    parser.add_argument("--plm_checkpoint_dir", type=str, default="data/plm_checkpoints")
     parser.add_argument("--detection-threshold", type=float, default=0.2)
     parser.add_argument("--detect-precursor-synthases", help="Flag to detect precursor synthases as well. Set to False with `--no-detect-precursor-synthases`.", default=True, action=argparse.BooleanOptionalAction)
     parser.add_argument("--gpu", type=str, default="0")
@@ -94,6 +95,7 @@ def main(args: argparse.Namespace):
         - clf_batch_size: Number of samples processed in each classification batch.
         - output_root: Directory to store prediction outputs.
         - ckpt_root_path: Path to the checkpoint file containing pre-trained classifiers.
+        - plm_checkpoint_dir: Directory where PLM checkpoints are stored.
         - detect_precursor_synthases: Boolean flag to detect precursor synthases.
         - starting_i, end_i: Range of indices to process sequences.
         - gpu: GPU identifier for processing sequences.
@@ -105,7 +107,7 @@
 
     if "esm" in args.model:
         model, batch_converter, alphabet = get_model_and_tokenizer(
-            args.model, return_alphabet=True
+            args.model, return_alphabet=True, checkpoint_dir=args.plm_checkpoint_dir
         )
 
         compute_embeddings_partial = partial(
@@ -119,7 +121,7 @@
     elif "ankh" in args.model:
         model, tokenizer = ankh_get_model_and_tokenizer(args.model)
         compute_embeddings_partial = partial(
-            ankh_compute_embeddings, bert_model=model, tokenizer=tokenizer
+            ankh_compute_embeddings, bert_model=model, tokenizer=tokenizer, checkpoint_dir=args.plm_checkpoint_dir
         )
     else:
         raise NotImplementedError(
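
At the script level the directory is now exposed as a command-line flag. An illustrative invocation using only the flags visible in this diff; the model-selection and FASTA-input arguments that `main` evidently also expects are omitted because their spellings do not appear here:

    python enzymeexplorer/src/screening/tps_predict_fasta.py \
        --plm_checkpoint_dir /mnt/checkpoints/plm \
        --detection-threshold 0.2 \
        --gpu 0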
