Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,20 @@ if(USE_NPU)
if(DEVICE_TYPE STREQUAL "USE_A3")
message("downloading a3 arm xllm kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a3.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a3.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
if(DEVICE_ARCH STREQUAL "ARM")
message("downloading a2 arm xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.arm.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.arm.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
else()
message("downloading a2 x86 xllm_kernels")
file(DOWNLOAD
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.x86.rpm"
"https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.x86.rpm"
"${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
)
endif()
Expand Down
5 changes: 3 additions & 2 deletions vcpkg.json
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,10 @@
"version>=": "1.3.1"
},
{
"name": "opencv",
"name": "opencv4",
"version>=": "4.7.0",
"default-features": false
"default-features": false,
"features": ["ffmpeg", "jpeg", "png","tiff","webp","openexr","quirc"]
},
{
"name": "yaml-cpp",
Expand Down
142 changes: 139 additions & 3 deletions xllm/core/framework/batch/mposition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,34 @@ limitations under the License.

#include "mposition.h"

#include <absl/strings/match.h>

#include "framework/model/model_args.h"
#include "framework/request/sequence.h"

namespace xllm {

namespace {
// Groups consecutive runs of equal values in `token_types`.
// Returns one (type, start, end) tuple per run, where `start` is inclusive
// and `end` is exclusive, in input order. An empty input yields no groups.
std::vector<std::tuple<std::string, int, int>> groupByTokenType(
    const std::vector<std::string>& token_types) {
  std::vector<std::tuple<std::string, int, int>> groups;
  if (token_types.empty()) return groups;

  // Hoist the size as int once: avoids a signed/unsigned comparison in the
  // loop condition (`int i < size_t`) on every iteration.
  const int num_types = static_cast<int>(token_types.size());
  std::string current_key = token_types[0];
  int start = 0;

  for (int i = 1; i < num_types; ++i) {
    if (token_types[i] != current_key) {
      // Close the previous run [start, i) and open a new one at i.
      groups.emplace_back(current_key, start, i);
      current_key = token_types[i];
      start = i;
    }
  }
  // Close the final run, which always extends to the end of the input.
  groups.emplace_back(current_key, start, num_types);
  return groups;
}
} // namespace

torch::Tensor MPositionHelper::get_positions() {
// if (seq_.is_chunked_prefill_stage()) {
if (seq_.kv_state().kv_cache_tokens_num() < seq_.num_prompt_tokens()) {
Expand All @@ -35,16 +59,128 @@ torch::Tensor MPositionHelper::get_positions() {
torch::Tensor second_per_grid_ts;
if (auto res = mm_data.get<torch::Tensor>("second_per_grid_ts"))
second_per_grid_ts = res.value();
auto res =
get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
std::tuple<torch::Tensor, int> res;
if (!absl::StartsWith(args_.model_type(), "glm4v")) {
res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
} else {
res = get_positions_glm(image_grid_thw, video_grid_thw);
}
seq_.set_mrope_position_delta(std::get<1>(res));

return std::get<0>(res);
} else {
return get_positions_d();
}
}

// Computes 3D M-RoPE position ids (temporal, height, width rows) for
// GLM4V-style multimodal prompts during prefill.
//
// Text tokens get identical, monotonically increasing positions on all three
// axes; image/video placeholder tokens get positions spread over their
// (t, h, w) grid (h/w divided by the spatial merge size). Video placeholders
// are detected by video_start/end marker tokens rather than by token id,
// since GLM reuses the image placeholder id inside video spans.
//
// Returns {positions [3, num_tokens], mrope_position_delta}, where the delta
// is (max position + 1 - num_tokens) and is used to offset decode positions.
std::tuple<torch::Tensor, int> MPositionHelper::get_positions_glm(
    torch::Tensor image_grid_thw,
    torch::Tensor video_grid_thw) {
  auto input_tokens = seq_.tokens();
  auto spatial_merge_size = args_.mm_spatial_merge_size();
  auto image_token_id = args_.image_token_id();
  auto video_token_id = args_.video_token_id();
  auto video_start_token_id = args_.video_start_token_id();
  auto video_end_token_id = args_.video_end_token_id();

  auto dtype = torch::kInt32;

  // Classify every token as "text", "image", or "video". GLM marks video
  // spans with start/end tokens, and image placeholder ids inside such a
  // span belong to the video.
  std::vector<std::string> input_token_type;
  bool in_video = false;
  const int num_tokens = static_cast<int>(input_tokens.size());
  input_token_type.reserve(num_tokens);

  for (int index = 0; index < num_tokens; ++index) {
    auto token = input_tokens[index];
    if (token == video_start_token_id) {
      in_video = true;
    } else if (token == video_end_token_id) {
      in_video = false;
    }

    if (token == image_token_id && !in_video) {
      input_token_type.push_back("image");
    } else if (token == image_token_id && in_video) {
      input_token_type.push_back("video");
    } else {
      input_token_type.push_back("text");
    }
  }
  auto input_type_group = groupByTokenType(input_token_type);
  int image_index = 0;        // next row of image_grid_thw to consume
  int video_index = 0;        // next row of video_grid_thw to consume
  int video_group_index = 0;  // frames of the current video consumed so far

  std::vector<torch::Tensor> llm_pos_ids_list;
  int video_frame_num = 1;
  for (const auto& group : input_type_group) {
    const auto& modality_type = std::get<0>(group);
    int start_idx = std::get<1>(group);
    int end_idx = std::get<2>(group);
    // Each group's positions start one past the max position emitted so far.
    int st_idx = 0;
    if (!llm_pos_ids_list.empty()) {
      st_idx = llm_pos_ids_list.back().max().item<int>() + 1;
    }

    if (modality_type == "image") {
      auto grid = image_grid_thw[image_index];
      int t = grid[0].item<int>();
      int h = grid[1].item<int>() / spatial_merge_size;
      int w = grid[2].item<int>() / spatial_merge_size;

      // Per-axis indices over the flattened t*h*w patch grid.
      auto t_arange =
          torch::arange(t, dtype).view({-1, 1}).expand({-1, h * w}).flatten();
      auto h_arange =
          torch::arange(h, dtype).view({1, -1, 1}).expand({t, -1, w}).flatten();
      auto w_arange =
          torch::arange(w, dtype).view({1, 1, -1}).expand({t, h, -1}).flatten();

      auto pos = torch::stack({t_arange, h_arange, w_arange}) + st_idx;
      llm_pos_ids_list.push_back(pos);
      video_frame_num = 1;
      image_index++;
    } else if (modality_type == "video") {
      // A video group covers one frame's worth of placeholder tokens;
      // consecutive frames of the same video arrive as separate groups.
      int t = video_frame_num;
      int h = video_grid_thw[video_index][1].item<int>() / spatial_merge_size;
      int w = video_grid_thw[video_index][2].item<int>() / spatial_merge_size;

      for (int t_idx = 0; t_idx < t; ++t_idx) {
        auto t_tensor = torch::full({1, h * w}, t_idx, dtype).flatten();
        auto h_tensor = torch::arange(h, dtype)
                            .view({1, -1, 1})
                            .expand({1, -1, w})
                            .flatten();
        auto w_tensor = torch::arange(w, dtype)
                            .view({1, 1, -1})
                            .expand({1, h, -1})
                            .flatten();

        auto pos = torch::stack({t_tensor, h_tensor, w_tensor}) + st_idx;
        llm_pos_ids_list.push_back(pos);
      }

      // Advance to the next video once all of its frames (grid t) are seen.
      video_group_index++;
      if (video_group_index >= video_grid_thw[video_index][0].item<int>()) {
        video_index++;
        video_group_index = 0;
      }
      video_frame_num++;
    } else {  // text
      // Text positions advance identically on all three axes.
      int text_len = end_idx - start_idx;
      auto arange =
          torch::arange(text_len, dtype).view({1, -1}).expand({3, -1}) + st_idx;
      llm_pos_ids_list.push_back(arange);
      video_frame_num = 1;
    }
  }

  torch::Tensor llm_positions =
      torch::cat(llm_pos_ids_list, /*dim=*/1).reshape({3, -1});
  // Cast size() to int before subtracting: otherwise the expression is
  // evaluated in unsigned arithmetic and a negative delta would wrap before
  // being narrowed back to int.
  int mrope_position_delta =
      llm_positions.max().item<int>() + 1 - static_cast<int>(num_tokens);

  return {llm_positions, mrope_position_delta};
}

std::tuple<torch::Tensor, int> MPositionHelper::get_positions_p(
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw,
Expand Down
4 changes: 4 additions & 0 deletions xllm/core/framework/batch/mposition.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ class MPositionHelper {
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw,
torch::Tensor second_per_grid_ts);
std::tuple<torch::Tensor, int> get_positions_glm(
torch::Tensor image_grid_thw,
torch::Tensor video_grid_thw);

torch::Tensor get_positions_d();

private:
Expand Down
9 changes: 9 additions & 0 deletions xllm/core/framework/chat_template/jinja_chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,15 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content(

if (item.type == "text") {
item_json["text"] = item.text;
} else if (item.type == "video_url") {
item_json["video"] = "mm place holder";
item_json["video_url"] = "mm place holder";
} else if (item.type == "image_url") {
item_json["image"] = "mm place holder";
item_json["image_url"] = "mm place holder";
} else if (item.type == "audio_url") {
item_json["audio"] = "mm place holder";
item_json["audio_url"] = "mm place holder";
} else {
item_json[item.type] = "mm place holder";
}
Expand Down
25 changes: 25 additions & 0 deletions xllm/core/framework/hf_model_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ bool HFModelLoader::load_args(const std::string& model_weights_path) {
return false;
}

if (!load_video_preprocessor_args(model_weights_path)) {
LOG(ERROR) << "Failed to load video preprocess args from "
<< model_weights_path;
return false;
}
// Some hacky logics to support loading of old models
// always use float16 for quantization
// TODO: support quantization for other data types
Expand Down Expand Up @@ -416,4 +421,24 @@ bool HFModelLoader::load_image_preprocessor_args(
return true;
}

// Loads optional video preprocessing parameters from
// <model_weights_path>/video_preprocessor_config.json into args_
// (mm_video_shortest_edge / mm_video_longest_edge, defaulting to 0 when
// the keys are absent).
//
// The config file is optional: a missing or unparsable file is silently
// skipped and this function always returns true.
bool HFModelLoader::load_video_preprocessor_args(
    const std::string& model_weights_path) {
  // video preprocessor args (optional per-model config file)
  JsonReader video_preprocess_reader;
  const std::string video_preprocess_file_path =
      model_weights_path + "/video_preprocessor_config.json";
  if (video_preprocess_reader.parse(video_preprocess_file_path)) {
    LOG(INFO) << "Success to parse video preprocess args file: "
              << video_preprocess_file_path;

    args_.mm_video_shortest_edge() =
        video_preprocess_reader.value_or<int>("size.shortest_edge", 0);

    args_.mm_video_longest_edge() =
        video_preprocess_reader.value_or<int>("size.longest_edge", 0);
  }

  return true;
}

} // namespace xllm
1 change: 1 addition & 0 deletions xllm/core/framework/hf_model_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class HFModelLoader : public ModelLoader {
bool load_quant_args(const std::string& model_weights_path);
bool load_tokenizer_args(const std::string& model_weights_path);
bool load_image_preprocessor_args(const std::string& model_weights_path);
bool load_video_preprocessor_args(const std::string& model_weights_path);
std::string model_weights_path() const override {
return model_weights_path_;
}
Expand Down
10 changes: 10 additions & 0 deletions xllm/core/framework/model/model_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ struct ModelArgs {
PROPERTY(int32_t, image_token_id) = 0;
PROPERTY(int32_t, video_token_id) = 0;

// glm4v moe
PROPERTY(int32_t, image_start_token_id) = 0;
PROPERTY(int32_t, image_end_token_id) = 0;
PROPERTY(int32_t, video_start_token_id) = 0;
PROPERTY(int32_t, video_end_token_id) = 0;

PROPERTY(std::string, vision_custom_adapter);
PROPERTY(int32_t, vision_max_slice_nums) = 0;

Expand Down Expand Up @@ -297,6 +303,10 @@ struct ModelArgs {
PROPERTY(int64_t, mm_image_shortest_edge) = 0;
PROPERTY(int64_t, mm_image_longest_edge) = 0;

// GLM
PROPERTY(int64_t, mm_video_shortest_edge) = 0;
PROPERTY(int64_t, mm_video_longest_edge) = 0;

PROPERTY(int, mm_image_patch_size) = 0;
PROPERTY(int, mm_image_temporal_patch_size) = 0;
PROPERTY(int, mm_image_merge_size) = 0;
Expand Down
Loading
Loading