Skip to content

Commit 5c5d5e4

Browse files
a120092009, phantomlei3
authored and committed
bugfix: setup the tokenizer config function of bos and eos to fast tokenizer. (#367)
Co-authored-by: phantomlei <phantomlei3@gmail.com>
1 parent 3e35b5a commit 5c5d5e4

File tree

8 files changed

+480
-13
lines changed

8 files changed

+480
-13
lines changed

xllm/core/framework/hf_model_loader.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,15 @@ bool HFModelLoader::load_tokenizer_args(const std::string& model_weights_path) {
259259
JsonReader tokenizer_reader;
260260
const std::string tokenizer_args_file_path =
261261
model_weights_path_ + "/tokenizer_config.json";
262+
263+
// If tokenizer.json exists, set the tokenizer type to "fast" and point vocab_file at it.
264+
const std::string tokenizer_json_path =
265+
model_weights_path + "/tokenizer.json";
266+
if (std::filesystem::exists(tokenizer_json_path)) {
267+
tokenizer_args_.tokenizer_type() = "fast";
268+
tokenizer_args_.vocab_file() = tokenizer_json_path;
269+
}
270+
262271
if (tokenizer_reader.parse(tokenizer_args_file_path)) {
263272
// read chat template if exists
264273
if (auto v = load_chat_template_file(model_weights_path_)) {

xllm/core/framework/tokenizer/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,14 @@ cc_library(
3030
re2::re2
3131
)
3232

33+
# Test target for the fast tokenizer; depends on the tokenizer library,
# glog, and the GTest main entry point.
cc_test(
  NAME fast_tokenizer_test
  SRCS tests/fast_tokenizer_tests.cpp
  DEPS
    :tokenizer
    glog::glog
    GTest::gtest_main
)
43+

xllm/core/framework/tokenizer/fast_tokenizer.cpp

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,55 @@ limitations under the License.
2020

2121
namespace xllm {
2222

23-
FastTokenizer::FastTokenizer(const std::string& tokenizer_json_path)
24-
: tokenizer_json_path_(tokenizer_json_path) {
25-
handle_ = tokenizers_new_from_path(tokenizer_json_path.c_str());
23+
FastTokenizer::FastTokenizer(const TokenizerArgs& tokenizer_args)
24+
: tokenizer_args_(tokenizer_args) {
25+
handle_ = tokenizers_new_from_path(tokenizer_args.vocab_file().c_str());
2626
CHECK(handle_ != nullptr)
27-
<< "Failed to load tokenizer from file: " << tokenizer_json_path;
27+
<< "Failed to load tokenizer from file: " << tokenizer_args.vocab_file();
2828
}
2929

3030
std::unique_ptr<Tokenizer> FastTokenizer::clone() const {
31-
return std::make_unique<FastTokenizer>(tokenizer_json_path_);
31+
return std::make_unique<FastTokenizer>(tokenizer_args_);
3232
}
3333

3434
// Releases the tokenizer handle that the constructor obtained from
// tokenizers_new_from_path().
FastTokenizer::~FastTokenizer() {
  tokenizers_free(handle_);
}
3535

36+
namespace {
37+
// Helper function to add a special token to the beginning or end of ids
38+
// Checks if token already exists before adding to avoid duplication
39+
// Returns true on success, false if token is not found, empty, or already
40+
// exists
41+
bool add_special_token_id(const std::string& token,
42+
std::optional<int32_t> token_id,
43+
std::vector<int32_t>* ids,
44+
bool prepend) {
45+
if (token.empty() || !token_id.has_value()) {
46+
if (!token.empty() && !token_id.has_value()) {
47+
LOG(WARNING) << "Failed to find token ID for token: " << token;
48+
}
49+
return false;
50+
}
51+
52+
const int32_t id = token_id.value();
53+
54+
// Check if token already exists at the expected position
55+
if (prepend) {
56+
// For BOS: check if already at the beginning
57+
if (!ids->empty() && ids->front() == id) {
58+
return false; // Already exists, skip adding
59+
}
60+
ids->insert(ids->begin(), id);
61+
} else {
62+
// For EOS: check if already at the end
63+
if (!ids->empty() && ids->back() == id) {
64+
return false; // Already exists, skip adding
65+
}
66+
ids->push_back(id);
67+
}
68+
return true;
69+
}
70+
} // namespace
71+
3672
bool FastTokenizer::encode(const std::string_view& text,
3773
std::vector<int32_t>* ids,
3874
bool add_special_tokens) const {
@@ -43,6 +79,31 @@ bool FastTokenizer::encode(const std::string_view& text,
4379
std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
4480
*ids = std::move(ret);
4581

82+
// Free the memory allocated by Rust tokenizer
83+
// The token_ids pointer is allocated by Rust's Box::into_raw and must be
84+
// freed
85+
if (result.token_ids != nullptr && result.len > 0) {
86+
tokenizers_free_encode_results(&result, 1);
87+
}
88+
89+
// Add BOS token if configured
90+
if (tokenizer_args_.add_bos_token() && !tokenizer_args_.bos_token().empty()) {
91+
const auto bos_id = token_to_id(tokenizer_args_.bos_token());
92+
add_special_token_id(tokenizer_args_.bos_token(),
93+
bos_id,
94+
ids,
95+
/*prepend=*/true);
96+
}
97+
98+
// Add EOS token if configured
99+
if (tokenizer_args_.add_eos_token() && !tokenizer_args_.eos_token().empty()) {
100+
const auto eos_id = token_to_id(tokenizer_args_.eos_token());
101+
add_special_token_id(tokenizer_args_.eos_token(),
102+
eos_id,
103+
ids,
104+
/*prepend=*/false);
105+
}
106+
46107
return true;
47108
}
48109

xllm/core/framework/tokenizer/fast_tokenizer.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@ limitations under the License.
1717
#pragma once
1818

1919
#include "tokenizer.h"
20+
#include "tokenizer_args.h"
2021
#include "tokenizers/tokenizers.h"
2122

2223
namespace xllm {
2324

2425
class FastTokenizer : public Tokenizer {
2526
public:
26-
FastTokenizer(const std::string& tokenizer_json_path);
27+
FastTokenizer(const TokenizerArgs& tokenizer_args);
2728

2829
~FastTokenizer() override;
2930

@@ -44,8 +45,7 @@ class FastTokenizer : public Tokenizer {
4445
std::unique_ptr<Tokenizer> clone() const override;
4546

4647
private:
47-
std::string tokenizer_json_path_;
48-
48+
TokenizerArgs tokenizer_args_;
4949
TokenizerHandle handle_ = nullptr;
5050
};
5151

0 commit comments

Comments
 (0)