-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
54 lines (39 loc) · 1.15 KB
/
train.py
File metadata and controls
54 lines (39 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
OpenResearch pretraining script. Single-GPU, package-backed wrapper.
Usage: uv run train.py
"""
from __future__ import annotations
from pathlib import Path
import sys
def _bootstrap_legacy_package() -> None:
    """Make the in-repo ``openresearch_legacy`` sources importable.

    Looks for ``packages/openresearch_legacy/src`` next to this file and, when
    that directory exists, puts it at the front of ``sys.path``. When the
    directory is absent, ``sys.path`` is left untouched.

    The call is idempotent: if the directory is already the first entry of
    ``sys.path`` it is not inserted again, so re-importing or re-running this
    module does not accumulate duplicate entries.
    """
    repo_root = Path(__file__).resolve().parent
    legacy_src = repo_root / "packages" / "openresearch_legacy" / "src"
    if not legacy_src.is_dir():
        return

    entry = str(legacy_src)
    # Only skip when the entry is already first. If it sits further down the
    # list we still insert at index 0 so it keeps the highest import
    # precedence, exactly as an unconditional insert would give it.
    if sys.path[:1] != [entry]:
        sys.path.insert(0, entry)
# Runs before the import below on purpose: the call puts the in-repo
# packages/openresearch_legacy/src directory (when it exists) first on
# sys.path, so the import can resolve to that copy.
_bootstrap_legacy_package()
import openresearch_legacy.training as _training  # noqa: E402 - must follow the sys.path setup
# ---------------------------------------------------------------------------
# Hyperparameters (edit these directly, no CLI flags needed)
# ---------------------------------------------------------------------------
# NOTE(review): main() hands this whole module object to
# openresearch_legacy.training.main, so these names are presumably looked up
# as module attributes there. Keep the names unchanged; the meanings and units
# noted below are assumptions to confirm against that package.
# Model architecture
ASPECT_RATIO = 64  # presumably model width per unit of DEPTH - TODO confirm
HEAD_DIM = 128  # presumably the per-attention-head dimension - TODO confirm
WINDOW_PATTERN = "SSSL"  # meaning of the S/L letters is defined by the training package - TODO confirm
# Optimization
TOTAL_BATCH_SIZE = 2**19  # = 524,288; presumably tokens per optimizer step - TODO confirm
# Learning rates, presumably one per parameter group named by the prefix - TODO confirm
EMBEDDING_LR = 0.6
UNEMBEDDING_LR = 0.004
MATRIX_LR = 0.04
SCALAR_LR = 0.5
WEIGHT_DECAY = 0.2
ADAM_BETAS = (0.8, 0.95)  # presumably (beta1, beta2) - TODO confirm
# Schedule shape: presumably fractions of the run spent warming up / warming
# down, and the final LR as a fraction of the peak - TODO confirm
WARMUP_RATIO = 0.0
WARMDOWN_RATIO = 0.5
FINAL_LR_FRAC = 0.0
# Model size
DEPTH = 8  # presumably the number of layers - TODO confirm
DEVICE_BATCH_SIZE = 128  # presumably the per-device batch size - TODO confirm
def main():
    """Delegate to ``_training.main``, passing it this module object.

    The module is looked up in ``sys.modules`` under its own ``__name__``
    (``"__main__"`` when run as a script). Whatever ``_training.main``
    returns is returned unchanged.
    """
    this_module = sys.modules[__name__]
    return _training.main(this_module)
if __name__ == "__main__":
    # Script entry point: exit with whatever main() returns as the status
    # (None or 0 means success).
    sys.exit(main())