From 55b83132b4cd69e3bb7c3a1e0d2e98aa6d00298b Mon Sep 17 00:00:00 2001
From: klasocki
Date: Tue, 16 Dec 2025 18:26:21 +0100
Subject: [PATCH] Fix bug with double import of ByteDecoder, small script
 improvements

Better handling of unspecified number of inherit merges
---
 scripts/extend_existing_tokenizer.sh | 5 +++--
 scripts/extend_tokenizer.sh          | 4 ++--
 utils.py                             | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/scripts/extend_existing_tokenizer.sh b/scripts/extend_existing_tokenizer.sh
index ac50aaf..973661f 100644
--- a/scripts/extend_existing_tokenizer.sh
+++ b/scripts/extend_existing_tokenizer.sh
@@ -1,5 +1,6 @@
 model_name=gpt-4 # download from https://huggingface.co/Xenova/gpt-4o
 dataset_name=olmo2_p99_truncate
+corpus_dir=./$dataset_name
 orig_tokenizer_dir=tokenizer_json/$model_name
 num_inherit_merges=100000 # leave this unspecified if extending the entire tokenizer
 vocab_size=128000
@@ -7,10 +8,10 @@ num_bytes=$((10**10))
 regex_string="\p{N}{1,3}| ?[^\s\p{L}\p{N}]{2,}[\r\n/]*| +(?!\S)"
 
 # create a str called num_inherit_merges_str, which turns 100000 into 100K
-if [ $num_inherit_merges -ge 1000 ]; then
+if [ ${num_inherit_merges:-0} -ge 1000 ]; then
     num_inherit_merges_str=$(($num_inherit_merges / 1000))K
 else
-    num_inherit_merges_str=${num_inherit_merges}
+    num_inherit_merges_str=${num_inherit_merges:-0}
 fi
 
 # convert vocab_size to something like 100K, depending on the value
diff --git a/scripts/extend_tokenizer.sh b/scripts/extend_tokenizer.sh
index 296e866..9836a0c 100644
--- a/scripts/extend_tokenizer.sh
+++ b/scripts/extend_tokenizer.sh
@@ -5,10 +5,10 @@ vocab_size=128000
 regex_string="\p{N}{1,3}| ?[^\s\p{L}\p{N}]{2,}[\r\n/]*| +(?!\S)"
 
 # create a str called num_inherit_merges_str, which turns 100000 into 100K
-if [ $num_inherit_merges -ge 1000 ]; then
+if [ ${num_inherit_merges:-0} -ge 1000 ]; then
     num_inherit_merges_str=$(($num_inherit_merges / 1000))K
 else
-    num_inherit_merges_str=${num_inherit_merges}
+    num_inherit_merges_str=${num_inherit_merges:-0}
 fi
 
 # convert vocab_size to something like 100K, depending on the value
diff --git a/utils.py b/utils.py
index 0f3f22a..3b0d984 100755
--- a/utils.py
+++ b/utils.py
@@ -14,7 +14,7 @@
 from tokenizers.trainers import BpeTrainer, UnigramTrainer
 from tokenizers import Tokenizer
 from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
-from tokenizers.decoders import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
 
 
 def ensure_dir(d):
@@ -188,7 +188,7 @@ def get_files_with_num_bytes(data_dir, num_bytes=None, loop_around=True):
 def construct_hf_tokenizer(tokenizer_dir):
     tokenizer_dir = Path(tokenizer_dir)
     base_tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
-    base_tokenizer.decoder = ByteLevel(
+    base_tokenizer.decoder = ByteLevelDecoder(
         add_prefix_space=True, trim_offsets=True, use_regex=True
     )
     eos_token_id = base_tokenizer.get_vocab_size()
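
Note on the utils.py hunk: the tokenizers package ships two classes named ByteLevel, a pre-tokenizer in tokenizers.pre_tokenizers and a decoder in tokenizers.decoders, and the commit title suggests that two bare ByteLevel imports were shadowing each other in utils.py. The sketch below illustrates why aliasing the decoder import resolves the collision; it is a minimal, hypothetical example rather than code from the repository, and the pre-tokenizer import and the tokenizer.json path are assumptions.

    from tokenizers import Tokenizer
    from tokenizers.pre_tokenizers import ByteLevel                 # byte-level pre-tokenizer
    from tokenizers.decoders import ByteLevel as ByteLevelDecoder   # byte-level decoder, aliased so it no longer shadows the line above

    # Hypothetical path; construct_hf_tokenizer builds it from tokenizer_dir.
    tok = Tokenizer.from_file("tokenizer_json/gpt-4/tokenizer.json")
    tok.pre_tokenizer = ByteLevel(add_prefix_space=True, use_regex=True)
    tok.decoder = ByteLevelDecoder()  # with two bare ByteLevel imports, this name would resolve to the pre-tokenizer class instead
    print(tok.decode(tok.encode("num_inherit_merges").ids))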