-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare.py
More file actions
82 lines (69 loc) · 1.83 KB
/
prepare.py
File metadata and controls
82 lines (69 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
One-time data preparation for openresearch experiments.
Downloads data shards and trains a BPE tokenizer.
Usage:
    python prepare.py                  # full prep (download + tokenizer)
    python prepare.py --num-shards 8   # download only 8 shards (for testing)
"""
from pathlib import Path
import sys
def _bootstrap_legacy_package() -> None:
    """Put the bundled ``openresearch_legacy`` sources on ``sys.path``.

    Looks for ``packages/openresearch_legacy/src`` inside the directory that
    contains this file. If that directory exists it is inserted at the front
    of ``sys.path``; otherwise the function does nothing.
    """
    here = Path(__file__).resolve().parent
    candidate = here.joinpath("packages", "openresearch_legacy", "src")
    if not candidate.is_dir():
        return
    # Index 0: this entry is searched before everything already on sys.path.
    sys.path.insert(0, str(candidate))


# Must run before the ``openresearch_legacy`` imports that follow.
_bootstrap_legacy_package()
from openresearch_legacy.constants import (
BASE_URL,
BOS_TOKEN,
CACHE_DIR,
DATA_DIR,
EVAL_TOKENS,
MAX_SEQ_LEN,
MAX_SHARD,
SPECIAL_TOKENS,
SPLIT_PATTERN,
TIME_BUDGET,
TOKENIZER_DIR,
VAL_FILENAME,
VAL_SHARD,
VOCAB_SIZE,
)
from openresearch_legacy.runtime import Tokenizer, evaluate_bpb, get_token_bytes, list_parquet_files, make_dataloader
import openresearch_legacy.data_prep as _data_prep
# Re-export the helpers from ``openresearch_legacy.data_prep`` (imported above
# as ``_data_prep``) as attributes of this module; each name is also listed in
# ``__all__``.
build_parser = _data_prep.build_parser
download_data = _data_prep.download_data
download_single_shard = _data_prep.download_single_shard
text_iterator = _data_prep.text_iterator
train_tokenizer = _data_prep.train_tokenizer
def main(argv=None):
    """Run the data-prep entry point.

    Thin wrapper: ``argv`` is forwarded unchanged to
    ``openresearch_legacy.data_prep.main`` and that call's return value is
    returned as-is.
    """
    result = _data_prep.main(argv)
    return result
# Names exported by ``from <this module> import *``: the constants and runtime
# helpers imported from ``openresearch_legacy``, the data-prep aliases defined
# in this module, and ``main``.
__all__ = [
    "BASE_URL",
    "BOS_TOKEN",
    "CACHE_DIR",
    "DATA_DIR",
    "EVAL_TOKENS",
    "MAX_SEQ_LEN",
    "MAX_SHARD",
    "SPECIAL_TOKENS",
    "SPLIT_PATTERN",
    "TIME_BUDGET",
    "TOKENIZER_DIR",
    "Tokenizer",
    "VAL_FILENAME",
    "VAL_SHARD",
    "VOCAB_SIZE",
    "build_parser",
    "download_data",
    "download_single_shard",
    "evaluate_bpb",
    "get_token_bytes",
    "list_parquet_files",
    "main",
    "make_dataloader",
    "text_iterator",
    "train_tokenizer",
]
if __name__ == "__main__":
    # Script entry point: sys.exit raises SystemExit carrying main()'s
    # return value.
    sys.exit(main())