Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions Helixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from termcolor import colored

from helixer.core.scripts import ParameterParser
from helixer.core.data import prioritized_models, report_if_current_not_best, identify_current, MODEL_PATH
from helixer.core.data import prioritized_models, report_if_current_not_best, identify_current, set_model_path
from helixer.prediction.HybridModel import HybridModel
from helixer.export.exporter import HelixerFastaToH5Controller

Expand All @@ -31,9 +31,9 @@ def __init__(self, config_file_path=''):
'lineage dependent from 21384 to 213840).')
self.data_group.add_argument('--write-by', type=int,
help='convert genomic sequence in super-chunks to numerical matrices with this '
'many base pairs; for lower memory consumption, which will be rounded to be '
'divisible by subsequence-length; ; needs to be equal to or larger than '
'subsequence length, for lower memory consumption, consider setting a '
'many base pairs, which will be rounded to be divisible by '
'subsequence-length; needs to be equal to or larger than '
'subsequence length; for lower memory consumption, consider setting a '
'lower number')
self.data_group.add_argument('--lineage', type=str, default=None,
choices=['vertebrate', 'land_plant', 'fungi', 'invertebrate'],
Expand All @@ -42,6 +42,10 @@ def __init__(self, config_file_path=''):
help='set this to override the default model for any given '
'lineage and instead take a specific model',
type=str)
self.data_group.add_argument('--downloaded-model-path', type=str,
help='set to override the default download directory (<users_home_directory>/'
'.local/share/Helixer/models) Helixer is checking to see if you use the '
'newest model; only works with --lineage')
self.pred_group = self.parser.add_argument_group("Prediction parameters")
self.pred_group.add_argument('--batch-size', type=int,
help='The batch size for the raw predictions in TensorFlow. Should be as large as '
Expand Down Expand Up @@ -84,6 +88,7 @@ def __init__(self, config_file_path=''):
'write_by': 20_000_000,
'lineage': None,
'model_filepath': None,
'downloaded_model_path': None,
'batch_size': 32,
'no_overlap': False,
'overlap_offset': None,
Expand All @@ -96,15 +101,15 @@ def __init__(self, config_file_path=''):
self.defaults = {**self.defaults, **helixer_defaults}

@staticmethod
def check_for_lineage_model(lineage):
def check_for_lineage_model(lineage, downloaded_model_path):
# which models are available?
priorty_ms = prioritized_models(lineage)
model_path = set_model_path(downloaded_model_path)
priorty_ms = prioritized_models(lineage, model_path)
# which model is already downloaded / will be used?
current_model = identify_current(lineage, priorty_ms)
current_model = identify_current(lineage, priorty_ms, model_path)
# provide feedback if not up to date, error out if missing
report_if_current_not_best(priorty_ms, current_model)

return os.path.join(MODEL_PATH, lineage, current_model)
return os.path.join(model_path, lineage, current_model)

def check_args(self, args):

Expand All @@ -119,7 +124,7 @@ def check_args(self, args):
else:
assert args.lineage is not None, ("Either --lineage or --model-filepath is required. Run `Helixer.py "
"--help` to see lineage options.")
model_filepath = self.check_for_lineage_model(args.lineage)
model_filepath = self.check_for_lineage_model(args.lineage, args.downloaded_model_path)
if args.subsequence_length is None:
key = {'vertebrate': 213840, 'land_plant': 64152, 'fungi': 21384, 'invertebrate': 213840}
args.subsequence_length = key[args.lineage]
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,17 @@ The best models are:
The best models for all lineages are best downloaded by running:

```bash
# the models will be at /home/<user>/.local/share/Helixer/models
# by default the models will be at /home/<user>/.local/share/Helixer/models
scripts/fetch_helixer_models.py
```

If desired, the `--lineage` (`land_plant`, `vertebrate`, `invertebrate`,
and `fungi`) can be specified, or `--all` released models
can be fetched.
can be fetched. If the models should be downloaded to a different directory,
you can specify it with `fetch_helixer_models.py --custom-path <path_to_download_models_to>`.
If you want `Helixer.py` to use this custom path when checking for new releases/lineage
models, please provide `--downloaded-model-path <path_to_download_models_to>` when
running `Helixer.py`. Otherwise, only the default folder will be checked.

Downloaded models (and any new releases) can also be found at
https://zenodo.org/records/10836346, but we recommend simply using
Expand Down
9 changes: 9 additions & 0 deletions docs/fine_tuning.md
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,15 @@ HybridModel.py --batch-size 50 --val-test-batch-size 100 -e 100 \
| -s/--save-model-path | ./best_model.h5 | Path to save the best model (model with the best validation genic F1 (the F1 for the classes CDS, UTR and Intron)) to. |
| -v/--verbose | False | Add to run HybridModel.py in verbosity mode (additional information will be printed) |

> **RESUMING TRAINING**
> Simply replace two arguments:
> - replace the previous pretrained Helixer model with your fine-tuned model,
> i.e. `--load-model-path <your_fine_tuned_model>`
> - replace `--fine-tune` with `--fine-tune-resume`
>
> **Hint**: if you want to save your fine-tuned model to the same path as before,
> it might be a good idea to back up your previous model checkpoint(s) somewhere first.

## 3. Train new final layer(s) with extrinsic information
### 3.1 Info
Extrinsic information that could be used to help gene calling
Expand Down
26 changes: 14 additions & 12 deletions docs/helixer_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@ working directory. If that file isn't provided, the parameters are expected to b
command line.

### General parameters
| Parameter | Default | Explanation |
|:---------------------|:--------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| --fasta-path | / | FASTA input file |
| --gff-output-path | / | Output GFF3 file path |
| --species | / | Species name. Will be added to the GFF3 file. |
| --temporary-dir | system default | Use supplied (instead of system default) for temporary directory (place where temporary h5 files from fasta to h5 conversion and Helixer's raw base-wise predictions get saved) |
| --subsequence-length | vertebrate: 213840, land_plant: 64152, fungi: 21384, invertebrate: 213840 | How to slice the genomic sequence. Set moderately longer than length of typical genic loci. Tested up to 213840. Must be evenly divisible by the timestep width of the used model, which is typically 9. (Lineage dependent defaults) |
| --write-by | 20_000_000 | Convert genomic sequence in super-chunks to numerical matrices with this many base pairs, which will be rounded to be divisible by subsequence-length; needs to be equal to or larger than subsequence length; for lower memory consumption, consider setting a lower number |
| --lineage | / | What model to use for the annotation. Options are: vertebrate, land_plant, fungi or invertebrate. |
| --model-filepath | / | Set this to override the default model for any given lineage and instead take a specific model |
| Parameter | Default | Explanation |
|:------------------------|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| --fasta-path | / | FASTA input file |
| --gff-output-path | / | Output GFF3 file path |
| --species | / | Species name. Will be added to the GFF3 file. |
| --temporary-dir | system default | Use supplied (instead of system default) for temporary directory (place where temporary h5 files from fasta to h5 conversion and Helixer's raw base-wise predictions get saved) |
| --subsequence-length | vertebrate: 213840, land_plant: 64152, fungi: 21384, invertebrate: 213840 | How to slice the genomic sequence. Set moderately longer than length of typical genic loci. Tested up to 213840. Must be evenly divisible by the timestep width of the used model, which is typically 9. (Lineage dependent defaults) |
| --write-by | 20_000_000 | Convert genomic sequence in super-chunks to numerical matrices with this many base pairs, which will be rounded to be divisible by subsequence-length; needs to be equal to or larger than subsequence length; for lower memory consumption, consider setting a lower number |
| --lineage | / | What model to use for the annotation. Options are: vertebrate, land_plant, fungi or invertebrate. |
| --model-filepath | / | Set this to override the default model for any given lineage and instead take a specific model |
| --downloaded-model-path | / | Set to override the default download directory (`<users_home_directory>/.local/share/Helixer/models`) that Helixer checks to see whether you are using the newest model; only works with --lineage |

### Prediction parameters
| Parameter | Default | Explanation |
Expand Down Expand Up @@ -77,8 +78,8 @@ command line.
| Parameter | Default | Explanation |
|:------------------------|:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| -e/--epochs | 10,000 | Number of training runs |
| -b/--batch-size | 8 | Batch size for training data |
| --val-test-batch-size | 32 | Batch size for validation/test data |
| -b/--batch-size | 8 | Batch size for training data; for multi-GPU training please multiply by the number of GPUs |
| --val-test-batch-size | 32 | Batch size for validation/test data; for multi-GPU training please multiply by the number of GPUs |
| --loss | / | Loss function specification |
| --patience | 3 | Allowed epochs without the validation genic F1 improving before stopping training |
| --check-every-nth-batch | 1,000,000 | Check validation genic F1 every nth batch; by default this check gets executed once every epoch regardless of the number of batches |
Expand Down Expand Up @@ -122,6 +123,7 @@ command line.
| Parameter | Default | Explanation |
|:-----------------------------|:--------|:----------------------------------------------------------------------------------------------------------------------------------------|
| --fine-tune | False | Add/Use with --resume-training to replace and fine tune just the very last layer |
| --fine-tune-resume | False | Add/Use with --resume-training to resume your fine tuning of the very last layer |
| --pretrained-model-path | / | Required when predicting with a model fine tuned with coverage |
| --input-coverage | False | Add to use "evaluation/rnaseq_(spliced_)coverage" from HDF5 training/validation files as additional input for a late layer of the model |
| --coverage-norm | None | None, linear or log (recommended); how coverage will be normalized before inputting |
Expand Down
5 changes: 5 additions & 0 deletions docs/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ via HelixerPost. For a detailed explanation of all possible parameters see the
HybridModel.py --data-dir example/train/ --save-model-path example/best_helixer_model.h5 \
--epochs 5 --predict-phase
```
> **Hint**: Multi-GPU training works by using all GPUs available on your machine. To restrict
> Helixer in training (and eval) mode to a single GPU, provide the ID of the GPU that
> should be used via `--gpu-id <ID>` (example: `--gpu-id 0`). If you are using a job scheduler
> like Sun Grid Engine or PBSPro, the scheduling system will determine how many GPUs Helixer is
> able to use.

The rest of this example will continue with the model example/best_helixer_model.h5 produced above.

Expand Down
Loading