forked from turboderp/exllama
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.py
More file actions
29 lines (20 loc) · 732 Bytes
/
tokenizer.py
File metadata and controls
29 lines (20 loc) · 732 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from sentencepiece import SentencePieceProcessor
import os
import torch
class ExLlamaTokenizer:
    """Thin wrapper around a SentencePiece model that speaks torch tensors.

    Attributes set by ``__init__``:
        path: the model path exactly as passed in.
        tokenizer: the underlying ``SentencePieceProcessor``.
        eos_token_id: value reported by ``tokenizer.eos_id()``.
        bos_token_id: value reported by ``tokenizer.bos_id()``.
        newline_token_id: hard-coded to 13 (see note in ``__init__``).
    """

    def __init__(self, tokenizer_model_path):
        """Load the SentencePiece model found at ``tokenizer_model_path``.

        The path is forwarded unchanged as ``model_file`` to
        ``SentencePieceProcessor``.
        """
        self.path = tokenizer_model_path
        self.tokenizer = SentencePieceProcessor(model_file=self.path)

        # Special token ids come from the model itself.
        self.eos_token_id = self.tokenizer.eos_id()
        self.bos_token_id = self.tokenizer.bos_id()

        # NOTE(review): fixed constant, not read from the model. Presumably
        # the id of the newline piece in the vocabulary this was written
        # for -- confirm before using with a different tokenizer model.
        self.newline_token_id = 13

    def encode(self, text: str) -> torch.Tensor:
        """Tokenize ``text`` and return the ids as a tensor of shape (1, n).

        The leading dimension of size 1 is added with ``unsqueeze(0)``.
        The result is always an integer (``torch.long``) tensor, including
        when ``text`` produces no tokens.
        """
        token_ids = self.tokenizer.Encode(text)
        # Pin the dtype: without it an empty id list would be inferred as a
        # floating-point tensor, which is not usable as token ids.
        return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

    def decode(self, ids):
        """Convert a tensor of token ids back to text.

        ``ids`` is turned into a plain Python list with ``.tolist()`` and
        handed to the SentencePiece ``Decode`` call; whatever that call
        returns is returned unchanged.

        NOTE(review): a 2-D tensor (such as the (1, n) result of ``encode``)
        becomes a nested list here; what ``Decode`` yields for nested input
        is up to SentencePiece -- confirm before relying on it.
        """
        id_list = ids.tolist()
        return self.tokenizer.Decode(id_list)

    def num_tokens(self, text: str) -> int:
        """Return how many token ids ``text`` encodes to."""
        return len(self.tokenizer.Encode(text))