-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordPieceTokenizer.cpp
More file actions
44 lines (36 loc) · 1.06 KB
/
WordPieceTokenizer.cpp
File metadata and controls
44 lines (36 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#include "WordPieceTokenizer.h"
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <cstdint>
// Constructor
//
// Builds the token -> id mapping from the vocabulary file at `vocabPath`.
// The file is read one token per line; a token's id is its zero-based line
// index. If the same token appears on several lines, the last occurrence wins.
//
// If the file cannot be opened or is empty, no entries are loaded and every
// special-token id falls back to 0.
WordPieceTokenizer::WordPieceTokenizer(const std::string& vocabPath) {
    std::ifstream f(vocabPath);
    std::string token;
    int64_t id = 0;

    // Load vocab file and build token to ID mapping
    while (std::getline(f, token)) {
        // std::getline only strips '\n'. A vocab file saved with CRLF line
        // endings would otherwise produce keys such as "[PAD]\r" that never
        // match any lookup.
        if (!token.empty() && token.back() == '\r') {
            token.pop_back();
        }
        vocab[token] = id++;
    }

    // Resolve a special token with find() rather than operator[]: operator[]
    // would insert a spurious "token -> 0" entry when the token is missing.
    // The fallback id stays 0, matching the value operator[] used to yield.
    const auto specialId = [this](const char* name) -> int64_t {
        const auto it = vocab.find(name);
        return it != vocab.end() ? it->second : 0;
    };

    // Set special token IDs
    pad_id = specialId("[PAD]");
    cls_id = specialId("[CLS]");
    sep_id = specialId("[SEP]");
    unk_id = specialId("[UNK]");
}
// Encode text into token IDs with padding/truncation
//
// Splits `text` on whitespace and looks each word up in the vocabulary as a
// whole; words that are not present map to `unk_id`.
// NOTE(review): no lowercasing or "##" subword splitting is performed here,
// despite the class name - confirm this is intentional.
//
// The result always has exactly `maxLen` elements:
//   [CLS] w1 w2 ... [SEP] [PAD] [PAD] ...
// At most `maxLen - 2` words are kept so that [CLS] and [SEP] fit. When
// `maxLen == 1` only [CLS] survives, and when `maxLen == 0` the result is
// empty.
//
// @param text    Input text, tokenized on whitespace.
// @param maxLen  Exact length of the returned sequence.
// @return        Vector of `maxLen` token ids.
std::vector<int64_t> WordPieceTokenizer::encode(const std::string& text, size_t maxLen) const {
    // Nothing fits in a zero-length sequence. Returning early also avoids
    // tokenizing the whole input just to discard it.
    if (maxLen == 0) {
        return {};
    }

    std::vector<int64_t> ids;
    ids.reserve(maxLen);
    ids.push_back(cls_id);

    std::istringstream iss(text);
    std::string word;
    // `ids.size() + 1 < maxLen` reserves one slot for [SEP] and, unlike
    // `ids.size() < maxLen - 1`, cannot wrap around on unsigned arithmetic.
    // The size check comes first so no word is extracted once the buffer is full.
    while (ids.size() + 1 < maxLen && iss >> word) {
        const auto it = vocab.find(word);
        ids.push_back(it != vocab.end() ? it->second : unk_id);
    }

    ids.push_back(sep_id);
    // Pads with `pad_id` up to maxLen; for maxLen == 1 this truncates the
    // [SEP] that was just appended, leaving only [CLS].
    ids.resize(maxLen, pad_id);
    return ids;
}