@@ -20,19 +20,55 @@ limitations under the License.
2020
2121namespace xllm {
2222
23- FastTokenizer::FastTokenizer (const std::string& tokenizer_json_path )
24- : tokenizer_json_path_(tokenizer_json_path ) {
25- handle_ = tokenizers_new_from_path (tokenizer_json_path .c_str ());
23+ FastTokenizer::FastTokenizer (const TokenizerArgs& tokenizer_args )
24+ : tokenizer_args_(tokenizer_args ) {
25+ handle_ = tokenizers_new_from_path (tokenizer_args. vocab_file () .c_str ());
2626 CHECK (handle_ != nullptr )
27- << " Failed to load tokenizer from file: " << tokenizer_json_path ;
27+ << " Failed to load tokenizer from file: " << tokenizer_args. vocab_file () ;
2828}
2929
3030std::unique_ptr<Tokenizer> FastTokenizer::clone () const {
31- return std::make_unique<FastTokenizer>(tokenizer_json_path_ );
31+ return std::make_unique<FastTokenizer>(tokenizer_args_ );
3232}
3333
3434FastTokenizer::~FastTokenizer () { tokenizers_free (handle_); }
3535
36+ namespace {
37+ // Helper function to add a special token to the beginning or end of ids
38+ // Checks if token already exists before adding to avoid duplication
39+ // Returns true on success, false if token is not found, empty, or already
40+ // exists
41+ bool add_special_token_id (const std::string& token,
42+ std::optional<int32_t > token_id,
43+ std::vector<int32_t >* ids,
44+ bool prepend) {
45+ if (token.empty () || !token_id.has_value ()) {
46+ if (!token.empty () && !token_id.has_value ()) {
47+ LOG (WARNING) << " Failed to find token ID for token: " << token;
48+ }
49+ return false ;
50+ }
51+
52+ const int32_t id = token_id.value ();
53+
54+ // Check if token already exists at the expected position
55+ if (prepend) {
56+ // For BOS: check if already at the beginning
57+ if (!ids->empty () && ids->front () == id) {
58+ return false ; // Already exists, skip adding
59+ }
60+ ids->insert (ids->begin (), id);
61+ } else {
62+ // For EOS: check if already at the end
63+ if (!ids->empty () && ids->back () == id) {
64+ return false ; // Already exists, skip adding
65+ }
66+ ids->push_back (id);
67+ }
68+ return true ;
69+ }
70+ } // namespace
71+
3672bool FastTokenizer::encode (const std::string_view& text,
3773 std::vector<int32_t >* ids,
3874 bool add_special_tokens) const {
@@ -43,6 +79,31 @@ bool FastTokenizer::encode(const std::string_view& text,
4379 std::vector<int32_t > ret (result.token_ids , result.token_ids + result.len );
4480 *ids = std::move (ret);
4581
82+ // Free the memory allocated by Rust tokenizer
83+ // The token_ids pointer is allocated by Rust's Box::into_raw and must be
84+ // freed
85+ if (result.token_ids != nullptr && result.len > 0 ) {
86+ tokenizers_free_encode_results (&result, 1 );
87+ }
88+
89+ // Add BOS token if configured
90+ if (tokenizer_args_.add_bos_token () && !tokenizer_args_.bos_token ().empty ()) {
91+ const auto bos_id = token_to_id (tokenizer_args_.bos_token ());
92+ add_special_token_id (tokenizer_args_.bos_token (),
93+ bos_id,
94+ ids,
95+ /* prepend=*/ true );
96+ }
97+
98+ // Add EOS token if configured
99+ if (tokenizer_args_.add_eos_token () && !tokenizer_args_.eos_token ().empty ()) {
100+ const auto eos_id = token_to_id (tokenizer_args_.eos_token ());
101+ add_special_token_id (tokenizer_args_.eos_token (),
102+ eos_id,
103+ ids,
104+ /* prepend=*/ false );
105+ }
106+
46107 return true ;
47108}
48109
0 commit comments