5 changes: 3 additions & 2 deletions scripts/extend_existing_tokenizer.sh
@@ -1,16 +1,17 @@
model_name=gpt-4 # download from https://huggingface.co/Xenova/gpt-4o
dataset_name=olmo2_p99_truncate
corpus_dir=./$dataset_name
orig_tokenizer_dir=tokenizer_json/$model_name
num_inherit_merges=100000 # leave this unspecified if extending the entire tokenizer
vocab_size=128000
num_bytes=$((10**10))
regex_string="\p{N}{1,3}| ?[^\s\p{L}\p{N}]{2,}[\r\n/]*| +(?!\S)"

# create a str called num_inherit_merges_str, which turns 100000 into 100K
-if [ $num_inherit_merges -ge 1000 ]; then
+if [ ${num_inherit_merges:-0} -ge 1000 ]; then
num_inherit_merges_str=$(($num_inherit_merges / 1000))K
else
-num_inherit_merges_str=${num_inherit_merges}
+num_inherit_merges_str=${num_inherit_merges:-0}
fi

# convert vocab_size to something like 100K, depending on the value
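The ${num_inherit_merges:-0} expansion substitutes 0 when the variable is unset or empty, so the numeric test no longer breaks when the merge count is left unspecified. A minimal standalone sketch of the behavior (illustrative only, not part of the scripts):

# ${var:-0} falls back to 0 when var is unset or empty, so the -ge
# comparison always receives an integer instead of an empty string.
unset num_inherit_merges
if [ ${num_inherit_merges:-0} -ge 1000 ]; then
    num_inherit_merges_str=$((num_inherit_merges / 1000))K
else
    num_inherit_merges_str=${num_inherit_merges:-0}
fi
echo "$num_inherit_merges_str"  # prints 0; without the default, [ -ge 1000 ] errors out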
4 changes: 2 additions & 2 deletions scripts/extend_tokenizer.sh
@@ -5,10 +5,10 @@ vocab_size=128000
regex_string="\p{N}{1,3}| ?[^\s\p{L}\p{N}]{2,}[\r\n/]*| +(?!\S)"

# create a str called num_inherit_merges_str, which turns 100000 into 100K
-if [ $num_inherit_merges -ge 1000 ]; then
+if [ ${num_inherit_merges:-0} -ge 1000 ]; then
num_inherit_merges_str=$(($num_inherit_merges / 1000))K
else
-num_inherit_merges_str=${num_inherit_merges}
+num_inherit_merges_str=${num_inherit_merges:-0}
fi

# convert vocab_size to something like 100K, depending on the value
4 changes: 2 additions & 2 deletions utils.py
@@ -14,7 +14,7 @@
from tokenizers.trainers import BpeTrainer, UnigramTrainer
from tokenizers import Tokenizer
from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
-from tokenizers.decoders import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder


def ensure_dir(d):
@@ -188,7 +188,7 @@ def get_files_with_num_bytes(data_dir, num_bytes=None, loop_around=True):
def construct_hf_tokenizer(tokenizer_dir):
tokenizer_dir = Path(tokenizer_dir)
base_tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
-base_tokenizer.decoder = ByteLevel(
+base_tokenizer.decoder = ByteLevelDecoder(
add_prefix_space=True, trim_offsets=True, use_regex=True
)
eos_token_id = base_tokenizer.get_vocab_size()
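The alias distinguishes the decoder from the identically named ByteLevel pre-tokenizer class that the tokenizers package also ships. A minimal, self-contained sketch of the pattern (the BPE model and settings below are illustrative placeholders, not the repository's configuration):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel                 # byte-level pre-tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder   # same class name, different module

tok = Tokenizer(BPE())
tok.pre_tokenizer = ByteLevel(add_prefix_space=True)
tok.decoder = ByteLevelDecoder()  # maps byte-level tokens back to readable text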