forked from nntrainer/Quick.AI
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizers_cpp.h
More file actions
126 lines (114 loc) · 3.55 KB
/
Copy pathtokenizers_cpp.h
File metadata and controls
126 lines (114 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/**
* Copyright (c) 2023 by Contributors
* @file tokenizers_cpp.h
* @brief A C++ binding to common set of tokenizers
* @author Contributors
* @bug No known bugs
*/
#ifndef TOKENIZERS_CPP_H_
#define TOKENIZERS_CPP_H_
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
namespace tokenizers {
/**
* @brief A universal tokenizer interface. The concrete backend
* (HF tokenizers JSON, byte-level BPE, SentencePiece or RWKV World)
* is chosen by the static factory function used to create the instance.
*/
class Tokenizer {
public:
  /** @brief Virtual destructor so instances can be deleted through a base pointer. */
  virtual ~Tokenizer() = default;

  /**
   * @brief Encode text into ids.
   * @param text The input text.
   * @return The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text) = 0;

  /**
   * @brief Encode text into ids with special tokens option.
   * @param text The input text.
   * @param add_special_tokens Whether to add special tokens.
   * @return The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text,
                                      bool add_special_tokens) = 0;

  /**
   * @brief Encode a batch of texts into ids.
   *
   * The default implementation is a fallback for derived classes that do not
   * provide their own batch encoding: it calls the single-argument Encode()
   * once per input, in order.
   *
   * @param texts The input texts.
   * @return The encoded token ids, one vector per input text and in the same
   * order as @p texts. Empty when @p texts is empty.
   */
  virtual std::vector<std::vector<int32_t>>
  EncodeBatch(const std::vector<std::string> &texts) {
    std::vector<std::vector<int32_t>> ret;
    // One output entry per input, so reserve once up front.
    ret.reserve(texts.size());
    for (const auto &text : texts) {
      ret.push_back(Encode(text));
    }
    return ret;
  }

  /**
   * @brief Decode token ids into text.
   * @param ids The token ids.
   * @return The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids) = 0;

  /**
   * @brief Returns the vocabulary size. Special tokens are considered.
   * @return The vocabulary size.
   */
  virtual size_t GetVocabSize() = 0;

  /**
   * @brief Convert the given id to its corresponding token if it exists. If
   * not, return an empty string.
   * @param token_id The token id to look up.
   * @return The token for @p token_id, or an empty string if there is none.
   */
  virtual std::string IdToToken(int32_t token_id) = 0;

  /**
   * @brief Convert the given token to its corresponding id if it exists. If
   * not, return -1.
   * @param token The token to look up.
   * @return The id for @p token, or -1 if there is none.
   */
  virtual int32_t TokenToId(const std::string &token) = 0;

  //---------------------------------------------------
  // Factory functions from byte-blobs
  // These factory functions take in-memory blobs
  // so the library can be independent of the filesystem
  //---------------------------------------------------

  /**
   * @brief Create HF tokenizer from a single in-memory json blob.
   *
   * @param json_blob The json blob.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobJSON(const std::string &json_blob);

  /**
   * @brief Create BPE tokenizer
   *
   * @param vocab_blob The blob that contains vocabs.
   * @param merges_blob The blob that contains the merges.
   * @param added_tokens The added tokens. Defaults to an empty string.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobByteLevelBPE(const std::string &vocab_blob,
                       const std::string &merges_blob,
                       const std::string &added_tokens = "");

  /**
   * @brief Create SentencePiece.
   *
   * @param model_blob The blob that contains vocabs.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobSentencePiece(const std::string &model_blob);

  /**
   * @brief Create RWKVWorldTokenizer.
   *
   * @param model_blob The blob that contains vocabs.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobRWKVWorld(const std::string &model_blob);
};
} // namespace tokenizers
#endif // TOKENIZERS_CPP_H_