Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,20 @@ if(USE_NPU)
if(DEVICE_TYPE STREQUAL "USE_A3")
message("downloading a3 arm xllm kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a3.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a3.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
if(DEVICE_ARCH STREQUAL "ARM")
message("downloading a2 arm xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
message("downloading a2 x86 xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.x86.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.x86.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
endif()
Expand Down
5 changes: 3 additions & 2 deletions vcpkg.json
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,10 @@
"version>=": "1.3.1"
},
{
"name": "opencv",
"name": "opencv4",
"version>=": "4.7.0",
"default-features": false
"default-features": false,
"features": ["ffmpeg", "jpeg", "png","tiff","webp","openexr","quirc"]
},
{
"name": "yaml-cpp",
Expand Down
142 changes: 139 additions & 3 deletions xllm/core/framework/batch/mposition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,34 @@ limitations under the License.

#include "mposition.h"

#include <absl/strings/match.h>

#include "framework/model/model_args.h"
#include "framework/request/sequence.h"

namespace xllm {

namespace {
// Groups consecutive runs of equal values in `token_types`.
// Returns one (type, start, end) tuple per run, where `start` is inclusive
// and `end` is exclusive, in input order. An empty input yields no groups.
std::vector<std::tuple<std::string, int, int>> groupByTokenType(
    const std::vector<std::string>& token_types) {
  std::vector<std::tuple<std::string, int, int>> groups;
  if (token_types.empty()) return groups;

  // Hoist the size as int once: avoids a signed/unsigned comparison in the
  // loop condition (`int i < size_t`) on every iteration.
  const int num_types = static_cast<int>(token_types.size());
  std::string current_key = token_types[0];
  int start = 0;

  for (int i = 1; i < num_types; ++i) {
    if (token_types[i] != current_key) {
      // Close the previous run [start, i) and open a new one at i.
      groups.emplace_back(current_key, start, i);
      current_key = token_types[i];
      start = i;
    }
  }
  // Close the final run, which always extends to the end of the input.
  groups.emplace_back(current_key, start, num_types);
  return groups;
}
} // namespace

torch::Tensor MPositionHelper::get_positions() {
// if (seq_.is_chunked_prefill_stage()) {
if (seq_.kv_state().kv_cache_tokens_num() < seq_.num_prompt_tokens()) {
Expand All @@ -35,16 +59,128 @@ torch::Tensor MPositionHelper::get_positions() {
torch::Tensor second_per_grid_ts;
if (auto res = mm_data.get<torch::Tensor>("second_per_grid_ts"))
second_per_grid_ts = res.value();
auto res =
get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
std::tuple<torch::Tensor, int> res;
if (!absl::StartsWith(args_.model_type(), "glm4v")) {
res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
} else {
res = get_positions_glm(image_grid_thw, video_grid_thw);
}
seq_.set_mrope_position_delta(std::get<1>(res));

return std::get<0>(res);
} else {
return get_positions_d();
}
}

// Computes 3D M-RoPE position ids (temporal, height, width rows) for
// GLM4V-style multimodal prompts during prefill.
//
// Text tokens get identical, monotonically increasing positions on all three
// axes; image/video placeholder tokens get positions spread over their
// (t, h, w) grid (h/w divided by the spatial merge size). Video placeholders
// are detected by video_start/end marker tokens rather than by token id,
// since GLM reuses the image placeholder id inside video spans.
//
// Returns {positions [3, num_tokens], mrope_position_delta}, where the delta
// is (max position + 1 - num_tokens) and is used to offset decode positions.
std::tuple<torch::Tensor, int> MPositionHelper::get_positions_glm(
    torch::Tensor image_grid_thw,
    torch::Tensor video_grid_thw) {
  auto input_tokens = seq_.tokens();
  auto spatial_merge_size = args_.mm_spatial_merge_size();
  auto image_token_id = args_.image_token_id();
  auto video_token_id = args_.video_token_id();
  auto video_start_token_id = args_.video_start_token_id();
  auto video_end_token_id = args_.video_end_token_id();

  auto dtype = torch::kInt32;

  // Classify every token as "text", "image", or "video". GLM marks video
  // spans with start/end tokens, and image placeholder ids inside such a
  // span belong to the video.
  std::vector<std::string> input_token_type;
  bool in_video = false;
  const int num_tokens = static_cast<int>(input_tokens.size());
  input_token_type.reserve(num_tokens);

  for (int index = 0; index < num_tokens; ++index) {
    auto token = input_tokens[index];
    if (token == video_start_token_id) {
      in_video = true;
    } else if (token == video_end_token_id) {
      in_video = false;
    }

    if (token == image_token_id && !in_video) {
      input_token_type.push_back("image");
    } else if (token == image_token_id && in_video) {
      input_token_type.push_back("video");
    } else {
      input_token_type.push_back("text");
    }
  }
  auto input_type_group = groupByTokenType(input_token_type);
  int image_index = 0;        // next row of image_grid_thw to consume
  int video_index = 0;        // next row of video_grid_thw to consume
  int video_group_index = 0;  // frames of the current video consumed so far

  std::vector<torch::Tensor> llm_pos_ids_list;
  int video_frame_num = 1;
  for (const auto& group : input_type_group) {
    const auto& modality_type = std::get<0>(group);
    int start_idx = std::get<1>(group);
    int end_idx = std::get<2>(group);
    // Each group's positions start one past the max position emitted so far.
    int st_idx = 0;
    if (!llm_pos_ids_list.empty()) {
      st_idx = llm_pos_ids_list.back().max().item<int>() + 1;
    }

    if (modality_type == "image") {
      auto grid = image_grid_thw[image_index];
      int t = grid[0].item<int>();
      int h = grid[1].item<int>() / spatial_merge_size;
      int w = grid[2].item<int>() / spatial_merge_size;

      // Per-axis indices over the flattened t*h*w patch grid.
      auto t_arange =
          torch::arange(t, dtype).view({-1, 1}).expand({-1, h * w}).flatten();
      auto h_arange =
          torch::arange(h, dtype).view({1, -1, 1}).expand({t, -1, w}).flatten();
      auto w_arange =
          torch::arange(w, dtype).view({1, 1, -1}).expand({t, h, -1}).flatten();

      auto pos = torch::stack({t_arange, h_arange, w_arange}) + st_idx;
      llm_pos_ids_list.push_back(pos);
      video_frame_num = 1;
      image_index++;
    } else if (modality_type == "video") {
      // A video group covers one frame's worth of placeholder tokens;
      // consecutive frames of the same video arrive as separate groups.
      int t = video_frame_num;
      int h = video_grid_thw[video_index][1].item<int>() / spatial_merge_size;
      int w = video_grid_thw[video_index][2].item<int>() / spatial_merge_size;

      for (int t_idx = 0; t_idx < t; ++t_idx) {
        auto t_tensor = torch::full({1, h * w}, t_idx, dtype).flatten();
        auto h_tensor = torch::arange(h, dtype)
                            .view({1, -1, 1})
                            .expand({1, -1, w})
                            .flatten();
        auto w_tensor = torch::arange(w, dtype)
                            .view({1, 1, -1})
                            .expand({1, h, -1})
                            .flatten();

        auto pos = torch::stack({t_tensor, h_tensor, w_tensor}) + st_idx;
        llm_pos_ids_list.push_back(pos);
      }

      // Advance to the next video once all of its frames (grid t) are seen.
      video_group_index++;
      if (video_group_index >= video_grid_thw[video_index][0].item<int>()) {
        video_index++;
        video_group_index = 0;
      }
      video_frame_num++;
    } else {  // text
      // Text positions advance identically on all three axes.
      int text_len = end_idx - start_idx;
      auto arange =
          torch::arange(text_len, dtype).view({1, -1}).expand({3, -1}) + st_idx;
      llm_pos_ids_list.push_back(arange);
      video_frame_num = 1;
    }
  }

  torch::Tensor llm_positions =
      torch::cat(llm_pos_ids_list, /*dim=*/1).reshape({3, -1});
  // Cast size() to int before subtracting: otherwise the expression is
  // evaluated in unsigned arithmetic and a negative delta would wrap before
  // being narrowed back to int.
  int mrope_position_delta =
      llm_positions.max().item<int>() + 1 - static_cast<int>(num_tokens);

  return {llm_positions, mrope_position_delta};
}

std::tuple<torch::Tensor, int> MPositionHelper::get_positions_p(
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw,
Expand Down
4 changes: 4 additions & 0 deletions xllm/core/framework/batch/mposition.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ class MPositionHelper {
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw,
torch::Tensor second_per_grid_ts);
std::tuple<torch::Tensor, int> get_positions_glm(
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw);

torch::Tensor get_positions_d();

private:
Expand Down
9 changes: 9 additions & 0 deletions xllm/core/framework/chat_template/jinja_chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,15 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content(

if (item.type == "text") {
item_json["text"] = item.text;
} else if (item.type == "video_url") {
item_json["video"] = "mm place holder";
item_json["video_url"] = "mm place holder";
} else if (item.type == "image_url") {
item_json["image"] = "mm place holder";
item_json["image_url"] = "mm place holder";
} else if (item.type == "audio_url") {
item_json["audio"] = "mm place holder";
item_json["audio_url"] = "mm place holder";
} else {
item_json[item.type] = "mm place holder";
}
Expand Down
25 changes: 25 additions & 0 deletions xllm/core/framework/hf_model_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ bool HFModelLoader::load_args(const std::string& model_weights_path) {
return false;
}

if (!load_video_preprocessor_args(model_weights_path)) {
LOG(ERROR) << "Failed to load video preprocess args from "
<< model_weights_path;
return false;
}
// Some hacky logics to support loading of old models
// always use float16 for quantization
// TODO: support quantization for other data types
Expand Down Expand Up @@ -416,4 +421,24 @@ bool HFModelLoader::load_image_preprocessor_args(
return true;
}

// Loads optional video preprocessing parameters from
// <model_weights_path>/video_preprocessor_config.json into args_
// (mm_video_shortest_edge / mm_video_longest_edge, defaulting to 0 when
// the keys are absent).
//
// The config file is optional: a missing or unparsable file is silently
// skipped and this function always returns true.
bool HFModelLoader::load_video_preprocessor_args(
    const std::string& model_weights_path) {
  // video preprocessor args (optional per-model config file)
  JsonReader video_preprocess_reader;
  const std::string video_preprocess_file_path =
      model_weights_path + "/video_preprocessor_config.json";
  if (video_preprocess_reader.parse(video_preprocess_file_path)) {
    LOG(INFO) << "Success to parse video preprocess args file: "
              << video_preprocess_file_path;

    args_.mm_video_shortest_edge() =
        video_preprocess_reader.value_or<int>("size.shortest_edge", 0);

    args_.mm_video_longest_edge() =
        video_preprocess_reader.value_or<int>("size.longest_edge", 0);
  }

  return true;
}

} // namespace xllm
1 change: 1 addition & 0 deletions xllm/core/framework/hf_model_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class HFModelLoader : public ModelLoader {
bool load_quant_args(const std::string& model_weights_path);
bool load_tokenizer_args(const std::string& model_weights_path);
bool load_image_preprocessor_args(const std::string& model_weights_path);
bool load_video_preprocessor_args(const std::string& model_weights_path);
std::string model_weights_path() const override {
return model_weights_path_;
}
Expand Down
10 changes: 10 additions & 0 deletions xllm/core/framework/model/model_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ struct ModelArgs {
PROPERTY(int32_t, image_token_id) = 0;
PROPERTY(int32_t, video_token_id) = 0;

// glm4v moe
PROPERTY(int32_t, image_start_token_id) = 0;
PROPERTY(int32_t, image_end_token_id) = 0;
PROPERTY(int32_t, video_start_token_id) = 0;
PROPERTY(int32_t, video_end_token_id) = 0;

PROPERTY(std::string, vision_custom_adapter);
PROPERTY(int32_t, vision_max_slice_nums) = 0;

Expand Down Expand Up @@ -297,6 +303,10 @@ struct ModelArgs {
PROPERTY(int64_t, mm_image_shortest_edge) = 0;
PROPERTY(int64_t, mm_image_longest_edge) = 0;

// GLM
PROPERTY(int64_t, mm_video_shortest_edge) = 0;
PROPERTY(int64_t, mm_video_longest_edge) = 0;

PROPERTY(int, mm_image_patch_size) = 0;
PROPERTY(int, mm_image_temporal_patch_size) = 0;
PROPERTY(int, mm_image_merge_size) = 0;
Expand Down
Loading
Loading