usadellab · felicitas215 · Mar 28, 2025 · Mar 25, 2025 · Mar 25, 2025 · Mar 25, 2025
diff --git a/Helixer.py b/Helixer.py
@@ -10,7 +10,7 @@
 from termcolor import colored
 
 from helixer.core.scripts import ParameterParser
-from helixer.core.data import prioritized_models, report_if_current_not_best, identify_current, MODEL_PATH
+from helixer.core.data import prioritized_models, report_if_current_not_best, identify_current, set_model_path
 from helixer.prediction.HybridModel import HybridModel
 from helixer.export.exporter import HelixerFastaToH5Controller
 
@@ -31,9 +31,9 @@ def __init__(self, config_file_path=''):
                                           'lineage dependent from 21384 to 213840).')
         self.data_group.add_argument('--write-by', type=int,
                                      help='convert genomic sequence in super-chunks to numerical matrices with this '
-                                          'many base pairs; for lower memory consumption, which will be rounded to be '
-                                          'divisible by subsequence-length; ; needs to be equal to or larger than '
-                                          'subsequence length, for lower memory consumption, consider setting a '
+                                          'many base pairs, which will be rounded to be divisible by '
+                                          'subsequence-length; needs to be equal to or larger than '
+                                          'subsequence length; for lower memory consumption, consider setting a '
                                           'lower number')
         self.data_group.add_argument('--lineage', type=str, default=None,
                                      choices=['vertebrate', 'land_plant', 'fungi', 'invertebrate'],
@@ -42,6 +42,10 @@ def __init__(self, config_file_path=''):
                                      help='set this to override the default model for any given '
                                           'lineage and instead take a specific model',
                                      type=str)
+        self.data_group.add_argument('--downloaded-model-path', type=str,
+                                     help='set to override the default download directory (<users_home_directory>/'
+                                          '.local/share/Helixer/models) Helixer is checking to see if you use the '
+                                          'newest model; only works with --lineage')
         self.pred_group = self.parser.add_argument_group("Prediction parameters")
         self.pred_group.add_argument('--batch-size', type=int,
                                      help='The batch size for the raw predictions in TensorFlow. Should be as large as '
@@ -84,6 +88,7 @@ def __init__(self, config_file_path=''):
             'write_by': 20_000_000,
             'lineage': None,
             'model_filepath': None,
+            'downloaded_model_path': None,
             'batch_size': 32,
             'no_overlap': False,
             'overlap_offset': None,
@@ -96,15 +101,15 @@ def __init__(self, config_file_path=''):
         self.defaults = {**self.defaults, **helixer_defaults}
 
     @staticmethod
-    def check_for_lineage_model(lineage):
+    def check_for_lineage_model(lineage, downloaded_model_path):
         # which models are available?
-        priorty_ms = prioritized_models(lineage)
+        model_path = set_model_path(downloaded_model_path)
+        priorty_ms = prioritized_models(lineage, model_path)
         # which model is already downloaded / will be used?
-        current_model = identify_current(lineage, priorty_ms)
+        current_model = identify_current(lineage, priorty_ms, model_path)
         # provide feedback if not up to date, error out if missing
         report_if_current_not_best(priorty_ms, current_model)
-
-        return os.path.join(MODEL_PATH, lineage, current_model)
+        return os.path.join(model_path, lineage, current_model)
 
     def check_args(self, args):
 
@@ -119,7 +124,7 @@ def check_args(self, args):
         else:
             assert args.lineage is not None, ("Either --lineage or --model-filepath is required. Run `Helixer.py "
                                               "--help` to see lineage options.")
-            model_filepath = self.check_for_lineage_model(args.lineage)
+            model_filepath = self.check_for_lineage_model(args.lineage, args.downloaded_model_path)
             if args.subsequence_length is None:
                 key = {'vertebrate': 213840, 'land_plant': 64152, 'fungi': 21384, 'invertebrate': 213840}
                 args.subsequence_length = key[args.lineage]

diff --git a/README.md b/README.md
@@ -88,13 +88,17 @@ The best models are:
 The best models for all lineages are best downloaded by running:
 
 ```bash
-# the models will be at /home/<user>/.local/share/Helixer/models
+# by default the models will be at /home/<user>/.local/share/Helixer/models
 scripts/fetch_helixer_models.py
 ```
 
 If desired, the `--lineage` (`land_plant`, `vertebrate`, `invertebrate`,
 and `fungi`) can be specified, or `--all` released models
-can be fetched. 
+can be fetched. To download the models to a different directory,
+run `fetch_helixer_models.py --custom-path <path_to_download_models_to>`.
+For `Helixer.py` to look in this custom directory for lineage models and new
+releases, pass `--downloaded-model-path <path_to_download_models_to>` when
+running `Helixer.py`. Otherwise, only the default directory is checked.
 
 Downloaded models (and any new releases) can also be found at
 https://zenodo.org/records/10836346, but we recommend simply using

diff --git a/docs/fine_tuning.md b/docs/fine_tuning.md
@@ -344,6 +344,15 @@ HybridModel.py --batch-size 50 --val-test-batch-size 100 -e 100 \
 | -s/--save-model-path  | ./best_model.h5 | Path to save the best model (model with the best validation genic F1 (the F1 for the classes CDS, UTR and Intron)) to.                                                                                                       |
 | -v/--verbose          | False           | Add to run HybridModel.py in verbosity mode (additional information will be printed)                                                                                                                                         |
 
+> **RESUMING TRAINING**   
+> To resume fine-tuning, rerun the fine-tuning command above with two arguments changed:
+> - replace the previous pretrained Helixer model with your fine-tuned model,
+> i.e. `--load-model-path <your_fine_tuned_model>`
+> - replace `--fine-tune` with `--fine-tune-resume`   
+> 
+> **Hint**: if you save the resumed model to the same path as before, it is a good idea
+> to back up your previous model checkpoint(s) first, since they may otherwise be overwritten.
+
 ## 3. Train new final layer(s) with extrinsic information
 ### 3.1 Info
 Extrinsic information that could be used to help gene calling

diff --git a/docs/helixer_options.md b/docs/helixer_options.md
@@ -13,16 +13,17 @@ working directory. If that file isn't provided, the parameters are expected to b
 command line.
 
 ### General parameters
-| Parameter            | Default                                                                   | Explanation                                                                                                                                                                                                                                                                                   |
-|:---------------------|:--------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| --fasta-path         | /                                                                         | FASTA input file                                                                                                                                                                                                                                                                              |
-| --gff-output-path    | /                                                                         | Output GFF3 file path                                                                                                                                                                                                                                                                         |
-| --species            | /                                                                         | Species name. Will be added to the GFF3 file.                                                                                                                                                                                                                                                 |
-| --temporary-dir      | system default                                                            | Use supplied (instead of system default) for temporary directory (place where temporary h5 files from fasta to h5 conversion and Helixer's raw base-wise predictions get saved)                                                                                                               |
-| --subsequence-length | vertebrate: 213840, land_plant: 64152, fungi: 21384, invertebrate: 213840 | How to slice the genomic sequence. Set moderately longer than length of typical genic loci. Tested up to 213840. Must be evenly divisible by the timestep width of the used model, which is typically 9. (Lineage dependent defaults)                                                         |
-| --write-by           | 20_000_000                                                                | Convert genomic sequence in super-chunks to numerical matrices with this many base pairs, which will be rounded to be divisible by subsequence-length; needs to be equal to or larger than subsequence length; for lower memory consumption, consider setting a lower number                  |
-| --lineage            | /                                                                         | What model to use for the annotation. Options are: vertebrate, land_plant, fungi or invertebrate.                                                                                                                                                                                             |
-| --model-filepath     | /                                                                         | Set this to override the default model for any given lineage and instead take a specific model                                                                                                                                                                                                |
+| Parameter               | Default                                                                   | Explanation                                                                                                                                                                                                                                                                  |
+|:------------------------|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| --fasta-path            | /                                                                         | FASTA input file                                                                                                                                                                                                                                                             |
+| --gff-output-path       | /                                                                         | Output GFF3 file path                                                                                                                                                                                                                                                        |
+| --species               | /                                                                         | Species name. Will be added to the GFF3 file.                                                                                                                                                                                                                                |
+| --temporary-dir         | system default                                                            | Use supplied (instead of system default) for temporary directory (place where temporary h5 files from fasta to h5 conversion and Helixer's raw base-wise predictions get saved)                                                                                              |
+| --subsequence-length    | vertebrate: 213840, land_plant: 64152, fungi: 21384, invertebrate: 213840 | How to slice the genomic sequence. Set moderately longer than length of typical genic loci. Tested up to 213840. Must be evenly divisible by the timestep width of the used model, which is typically 9. (Lineage dependent defaults)                                        |
+| --write-by              | 20_000_000                                                                | Convert genomic sequence in super-chunks to numerical matrices with this many base pairs, which will be rounded to be divisible by subsequence-length; needs to be equal to or larger than subsequence length; for lower memory consumption, consider setting a lower number |
+| --lineage               | /                                                                         | What model to use for the annotation. Options are: vertebrate, land_plant, fungi or invertebrate.                                                                                                                                                                            |
+| --model-filepath        | /                                                                         | Set this to override the default model for any given lineage and instead take a specific model                                                                                                                                                                               |
+| --downloaded-model-path | /                                                                         | Directory in which Helixer looks for the downloaded lineage model and checks whether it is the newest one, instead of the default download directory (`<users_home_directory>/.local/share/Helixer/models`); only works with --lineage                                       |
 
 ### Prediction parameters
 | Parameter             | Default                                                                                                     | Explanation                                                                                                                                                                                                                                                                                                                                                                                 |
@@ -77,8 +78,8 @@ command line.
 | Parameter               | Default   | Explanation                                                                                                                                                                                                                  |
 |:------------------------|:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | -e/--epochs             | 10,000    | Number of training runs                                                                                                                                                                                                      |
-| -b/--batch-size         | 8         | Batch size for training data                                                                                                                                                                                                 |
-| --val-test-batch-size   | 32        | Batch size for validation/test data                                                                                                                                                                                          |
+| -b/--batch-size         | 8         | Batch size for training data; for multi-GPU training please multiply by the number of GPUs                                                                                                                                   |
+| --val-test-batch-size   | 32        | Batch size for validation/test data; for multi-GPU training please multiply by the number of GPUs                                                                                                                            |
 | --loss                  | /         | Loss function specification                                                                                                                                                                                                  |
 | --patience              | 3         | Allowed epochs without the validation genic F1 improving before stopping training                                                                                                                                            |
 | --check-every-nth-batch | 1,000,000 | Check validation genic F1 every nth batch, on default this check gets executed once every epoch regardless of the number of batches                                                                                          |
@@ -122,6 +123,7 @@ command line.
 | Parameter                    | Default | Explanation                                                                                                                             |
 |:-----------------------------|:--------|:----------------------------------------------------------------------------------------------------------------------------------------|
 | --fine-tune                  | False   | Add/Use with --resume-training to replace and fine tune just the very last layer                                                        |
+| --fine-tune-resume           | False   | Add/Use with --resume-training to resume your fine tuning of the very last layer                                                        |
 | --pretrained-model-path      | /       | Required when predicting with a model fine tuned with coverage                                                                          |
 | --input-coverage             | False   | Add to use "evaluation/rnaseq_(spliced_)coverage" from HDF5 training/validation files as additional input for a late layer of the model |
 | --coverage-norm              | None    | None, linear or log (recommended); how coverage will be normalized before inputting                                                     |

diff --git a/docs/training.md b/docs/training.md
@@ -132,6 +132,11 @@ via HelixerPost. For a detailed explanation of all possible parameters see the
 HybridModel.py --data-dir example/train/ --save-model-path example/best_helixer_model.h5 \
   --epochs 5 --predict-phase
 ```
+> **Hint**: Multi-GPU training uses all GPUs available on your machine. To restrict
+> Helixer in training (and eval) mode to a single GPU, pass the ID of the GPU that
+> should be used via `--gpu-id <ID>` (example: `--gpu-id 0`). If you are using a job scheduler
+> like Sun Grid Engine or PBSPro, the scheduling system determines how many GPUs Helixer is
+> able to use.
 
 The rest of this example will continue with the model example/best_helixer_model.h5 produced above.