From 8ab298234bde9c790944311d8f2a483146ced037 Mon Sep 17 00:00:00 2001 From: "wangziyue.28" Date: Sun, 30 Nov 2025 17:27:11 +0800 Subject: [PATCH 01/20] feat: support video modal. --- vcpkg.json | 5 +- .../chat_template/jinja_chat_template.cpp | 18 ++ xllm/core/framework/request/mm_codec.cpp | 216 +++++++++++++++++- xllm/core/framework/request/mm_codec.h | 11 + xllm/core/framework/request/mm_data.h | 11 + xllm/core/framework/request/mm_handler.cpp | 29 ++- xllm/core/framework/request/mm_handler.h | 12 + xllm/core/framework/request/mm_input.h | 13 ++ xllm/models/vlm/qwen2_5_vl.h | 27 +++ xllm/processors/image_processor.cpp | 187 +++++++++++++++ xllm/processors/image_processor.h | 9 + xllm/processors/qwen2_vl_image_processor.cpp | 178 ++++++++++++++- xllm/processors/qwen2_vl_image_processor.h | 13 ++ xllm/pybind/CMakeLists.txt | 1 + 14 files changed, 719 insertions(+), 11 deletions(-) diff --git a/vcpkg.json b/vcpkg.json index 3600c2dab..bc61fca13 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -98,9 +98,10 @@ "version>=": "1.3.1" }, { - "name": "opencv", + "name": "opencv4", "version>=": "4.7.0", - "default-features": false + "default-features": false, + "features": ["ffmpeg", "jpeg", "png","tiff","webp","openexr","quirc"] }, { "name": "yaml-cpp", diff --git a/xllm/core/framework/chat_template/jinja_chat_template.cpp b/xllm/core/framework/chat_template/jinja_chat_template.cpp index f206cc0f0..44caf09ee 100644 --- a/xllm/core/framework/chat_template/jinja_chat_template.cpp +++ b/xllm/core/framework/chat_template/jinja_chat_template.cpp @@ -121,6 +121,24 @@ std::optional JinjaChatTemplate::apply( nlohmann::ordered_json& messages, const nlohmann::ordered_json& tools, const nlohmann::ordered_json& chat_template_kwargs) const { + for (auto& msg : messages) { + if (!msg.contains("content")) continue; + auto& content = msg["content"]; + auto normalize_item = [](nlohmann::ordered_json& item) { + if (item.contains("type") && item["type"].is_string()) { + std::string t = item["type"].get(); + if (t == "video_url") item["type"] = "video"; + } + if (item.contains("video_url") && !item.contains("video")) + item["video"] = item["video_url"]; + }; + + if (content.is_array()) { + for (auto& it : content) normalize_item(it); + } else if (content.is_object()) { + normalize_item(content); + } + } minja::chat_template_inputs input; input.messages = messages; input.tools = tools; diff --git a/xllm/core/framework/request/mm_codec.cpp b/xllm/core/framework/request/mm_codec.cpp index cdb1abc1c..0c78b5933 100644 --- a/xllm/core/framework/request/mm_codec.cpp +++ b/xllm/core/framework/request/mm_codec.cpp @@ -13,7 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ - +extern "C" { +#include +#include +#include +} #include "mm_codec.h" namespace xllm { @@ -73,4 +77,214 @@ bool OpenCVImageEncoder::valid(const torch::Tensor& t) { return true; } +bool OpenCVVideoDecoder::decode(const std::string& raw_data, + torch::Tensor& t, + VideoMetadata& metadata) { + struct MemCtx { + const uint8_t* p; + size_t sz; + size_t off; + }; + + struct Reader { + static int read(void* opaque, uint8_t* buf, int buf_size) { + auto* mc = static_cast(opaque); + size_t remain = mc->sz - mc->off; + int n = (int)std::min(remain, (size_t)buf_size); + if (n <= 0) return AVERROR_EOF; + memcpy(buf, mc->p + mc->off, n); + mc->off += (size_t)n; + return n; + } + + static int64_t seek(void* opaque, int64_t offset, int whence) { + auto* mc = static_cast(opaque); + + if (whence == AVSEEK_SIZE) { + return (int64_t)mc->sz; + } + + int64_t pos = 0; + switch (whence) { + case SEEK_SET: + pos = offset; + break; + case SEEK_CUR: + pos = (int64_t)mc->off + offset; + break; + case SEEK_END: + pos = (int64_t)mc->sz + offset; + break; + default: + return AVERROR(EINVAL); + } + + if (pos < 0 || pos > (int64_t)mc->sz) return AVERROR_EOF; + + mc->off = (size_t)pos; + return pos; + } + }; + + AVFormatContext* fmt = avformat_alloc_context(); + const int avio_buf_sz = 1 << 16; + uint8_t* avio_buf = (uint8_t*)av_malloc(avio_buf_sz); + if (!fmt || !avio_buf) { + if (fmt) avformat_free_context(fmt); + if (avio_buf) av_free(avio_buf); + return false; + } + + MemCtx mc{(const uint8_t*)raw_data.data(), raw_data.size(), 0}; + + AVIOContext* avio = avio_alloc_context( + avio_buf, avio_buf_sz, 0, &mc, &Reader::read, nullptr, &Reader::seek); + if (!avio) { + av_free(avio_buf); + avformat_free_context(fmt); + return false; + } + + avio->seekable = AVIO_SEEKABLE_NORMAL; + + fmt->pb = avio; + fmt->flags |= AVFMT_FLAG_CUSTOM_IO; + + fmt->probesize = std::min(raw_data.size(), 20 * 1024 * 1024); + fmt->max_analyze_duration = 5LL * AV_TIME_BASE; + + bool ok = false; + + AVDictionary* opts = nullptr; + av_dict_set(&opts, "probesize", "20000000", 0); + av_dict_set(&opts, "analyzeduration", "5000000", 0); + + const AVInputFormat* in_fmt = av_find_input_format("mp4"); + int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts); + av_dict_free(&opts); + + if (ret < 0) { + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_free_context(fmt); + return false; + } + + ret = avformat_find_stream_info(fmt, nullptr); + if (ret < 0) { + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + return false; + } + + int vs = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); + if (vs < 0) { + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + return false; + } + + AVStream* st = fmt->streams[vs]; + AVCodecParameters* par = st->codecpar; + const AVCodec* dec = avcodec_find_decoder(par->codec_id); + if (!dec) { + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + return false; + } + + AVCodecContext* cc = avcodec_alloc_context3(dec); + if (!cc) { + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + return false; + } + + if (avcodec_parameters_to_context(cc, par) < 0 || + avcodec_open2(cc, dec, nullptr) < 0) { + avcodec_free_context(&cc); + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + return false; + } + + AVRational r = st->avg_frame_rate.num ? 
st->avg_frame_rate : st->r_frame_rate; + double fps = (r.num && r.den) ? av_q2d(r) : 0.0; + metadata.fps = fps; + + SwsContext* sws = nullptr; + AVPacket* pkt = av_packet_alloc(); + AVFrame* frm = av_frame_alloc(); + std::vector frames; + + auto push_frame = [&](AVFrame* f) -> bool { + if (!sws) { + sws = sws_getContext(f->width, + f->height, + (AVPixelFormat)f->format, + f->width, + f->height, + AV_PIX_FMT_RGB24, + SWS_BILINEAR, + nullptr, + nullptr, + nullptr); + if (!sws) return false; + } + + torch::Tensor rgb = torch::empty({f->height, f->width, 3}, torch::kUInt8); + uint8_t* dst_data[4] = {rgb.data_ptr(), nullptr, nullptr, nullptr}; + int dst_linesize[4] = {(int)rgb.stride(0), 0, 0, 0}; + + sws_scale(sws, f->data, f->linesize, 0, f->height, dst_data, dst_linesize); + + frames.emplace_back(rgb.permute({2, 0, 1}).clone()); // [C,H,W] + return true; + }; + + while (av_read_frame(fmt, pkt) >= 0) { + if (pkt->stream_index == vs) { + if (avcodec_send_packet(cc, pkt) == 0) { + while (avcodec_receive_frame(cc, frm) == 0) { + if (!push_frame(frm)) break; + } + } + } + av_packet_unref(pkt); + } + + // flush + avcodec_send_packet(cc, nullptr); + while (avcodec_receive_frame(cc, frm) == 0) { + if (!push_frame(frm)) break; + } + + if (!frames.empty()) { + t = torch::stack(frames); // [T,C,H,W] + metadata.total_num_frames = static_cast(frames.size()); + if (metadata.fps > 0.0) { + metadata.duration = metadata.total_num_frames / metadata.fps; + } else { + metadata.duration = 0.0; + } + ok = true; + } + + if (sws) sws_freeContext(sws); + av_frame_free(&frm); + av_packet_free(&pkt); + avcodec_free_context(&cc); + + av_freep(&avio->buffer); + avio_context_free(&avio); + avformat_close_input(&fmt); + + return ok; +} + } // namespace xllm diff --git a/xllm/core/framework/request/mm_codec.h b/xllm/core/framework/request/mm_codec.h index eea7d9d32..92dd0f001 100644 --- a/xllm/core/framework/request/mm_codec.h +++ b/xllm/core/framework/request/mm_codec.h @@ -20,6 +20,8 @@ limitations under the License. 
#include #include +#include "mm_data.h" + namespace xllm { class OpenCVImageDecoder { @@ -41,4 +43,13 @@ class OpenCVImageEncoder { bool valid(const torch::Tensor& t); }; +class OpenCVVideoDecoder { + public: + OpenCVVideoDecoder() = default; + ~OpenCVVideoDecoder() = default; + + bool decode(const std::string& raw_data, + torch::Tensor& t, + VideoMetadata& meta); +}; } // namespace xllm diff --git a/xllm/core/framework/request/mm_data.h b/xllm/core/framework/request/mm_data.h index a6bca46c3..78b299044 100644 --- a/xllm/core/framework/request/mm_data.h +++ b/xllm/core/framework/request/mm_data.h @@ -52,6 +52,15 @@ class MMType { Value value = Value::NONE; }; +struct VideoMetadata { + double fps = 0.0; // original fps + int64_t total_num_frames = 0; // original frames + double duration = 0.0; + double sampled_fps = 0.0; + torch::Tensor frame_indices; + std::vector timestamps; +}; + using MMKey = std::string; using MMValue = std::variant>; using MMDict = std::unordered_map; @@ -138,6 +147,8 @@ struct MMData { uint32_t ty_ = MMType::NONE; MMDict data_; + + std::vector video_metadata; }; } // namespace xllm diff --git a/xllm/core/framework/request/mm_handler.cpp b/xllm/core/framework/request/mm_handler.cpp index 7a93dfd5e..bb2397b69 100644 --- a/xllm/core/framework/request/mm_handler.cpp +++ b/xllm/core/framework/request/mm_handler.cpp @@ -83,9 +83,36 @@ bool ImageHandler::decode(MMInputItem& input) { return decoder.decode(input.raw_data_, input.decode_data_); } +bool VideoHandler::load(const MMContent& content, MMInputItem& input) { + input.clear(); + + const auto& video_url = content.video_url; + const auto& url = video_url.url; + + if (url.compare(0, dataurl_prefix_.size(), dataurl_prefix_) == + 0) { // data url + + input.type_ = MMType::VIDEO; + return this->load_from_dataurl(url, input.raw_data_); + } else if (url.compare(0, httpurl_prefix_.size(), httpurl_prefix_) == + 0) { // http url + + input.type_ = MMType::VIDEO; + return this->load_from_http(url, input.raw_data_); + } else { + LOG(ERROR) << " video url is invalid, url is " << url; + return false; + } +} + +bool VideoHandler::decode(MMInputItem& input) { + OpenCVVideoDecoder decoder; + return decoder.decode(input.raw_data_, input.decode_data_, input.video_meta_); +} + MMHandlerSet::MMHandlerSet() { handlers_["image_url"] = std::make_unique(); - // handlers_["video_url"] = std::make_unique(); + handlers_["video_url"] = std::make_unique(); // handlers_["audio_url"] = std::make_unique(); } diff --git a/xllm/core/framework/request/mm_handler.h b/xllm/core/framework/request/mm_handler.h index db6d8ac1d..ff8d55c9c 100644 --- a/xllm/core/framework/request/mm_handler.h +++ b/xllm/core/framework/request/mm_handler.h @@ -59,6 +59,18 @@ class ImageHandler : public MMHandlerBase { std::string dataurl_prefix_{"data:image"}; }; +class VideoHandler : public MMHandlerBase { + public: + VideoHandler() = default; + ~VideoHandler() = default; + + virtual bool load(const MMContent& content, MMInputItem& input) override; + virtual bool decode(MMInputItem& input) override; + + private: + std::string dataurl_prefix_{"data:video"}; +}; + class MMHandlerSet { public: MMHandlerSet(); diff --git a/xllm/core/framework/request/mm_input.h b/xllm/core/framework/request/mm_input.h index 32deea294..1b5fc57b3 100644 --- a/xllm/core/framework/request/mm_input.h +++ b/xllm/core/framework/request/mm_input.h @@ -35,6 +35,8 @@ struct MMInputItem { std::string raw_data_; // binary torch::Tensor decode_data_; // image: rgb, [c,h,w], uint8 + + VideoMetadata video_meta_; }; 
struct MMInput { @@ -56,6 +58,17 @@ struct MMInput { return std::move(vec); } + std::vector get_video_metadata(MMType type) const { + std::vector metas; + metas.reserve(items_.size()); + for (auto& item : items_) { + if (item.type_ == type) { + metas.push_back(item.video_meta_); + } + } + return metas; + } + std::vector items_; }; diff --git a/xllm/models/vlm/qwen2_5_vl.h b/xllm/models/vlm/qwen2_5_vl.h index ec6e6aa4a..48683a5ad 100644 --- a/xllm/models/vlm/qwen2_5_vl.h +++ b/xllm/models/vlm/qwen2_5_vl.h @@ -697,6 +697,15 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module { auto is_multimodal = torch::isin(input_ids, model_args_.image_token_id()); inputs_embeds.index_put_({is_multimodal}, image_embeds); } + if (video_input) { + // visual + auto video_embeds = visual_(video_input->pixel_values_videos.to(options_), + video_input->video_grid_thw, + input_params); + // merge + auto is_multimodal = torch::isin(input_ids, model_args_.video_token_id()); + inputs_embeds.index_put_({is_multimodal}, video_embeds); + } return inputs_embeds; } @@ -715,11 +724,29 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module { if (const auto& res = mm_data.get("image_grid_thw")) image_grid_thw = res.value(); + torch::Tensor pixel_values_videos; + if (const auto& res = mm_data.get("pixel_values_videos")) + pixel_values_videos = res.value(); + + torch::Tensor video_grid_thw; + if (const auto& res = mm_data.get("video_grid_thw")) + video_grid_thw = res.value(); + + torch::Tensor second_per_grid_ts; + if (const auto& res = mm_data.get("second_per_grid_ts")) + second_per_grid_ts = res.value(); + std::optional image_inputs; std::optional video_inputs; if (pixel_values.defined() && image_grid_thw.defined()) image_inputs = Qwen2_5_VLImageInputs{pixel_values, image_grid_thw}; + + if (pixel_values_videos.defined() && video_grid_thw.defined() && + second_per_grid_ts.defined()) + video_inputs = Qwen2_5_VLVideoInputs{ + pixel_values_videos, video_grid_thw, second_per_grid_ts}; + auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params); input_params.input_embedding = inputs_embeds; diff --git a/xllm/processors/image_processor.cpp b/xllm/processors/image_processor.cpp index f77f82b24..645905001 100644 --- a/xllm/processors/image_processor.cpp +++ b/xllm/processors/image_processor.cpp @@ -129,4 +129,191 @@ torch::Tensor ImageProcessor::normalize(const torch::Tensor& image, return result.div_(s_tensor); } +torch::Tensor ImageProcessor::init_frames(const VideoMetadata& metadata) { + int total_num_frames = metadata.total_num_frames; + int nframes_len = 32; + if (total_num_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + auto idx = torch::linspace( + 0, total_num_frames - 1, nframes_len, torch::dtype(torch::kLong)); + return idx; +} + +torch::Tensor ImageProcessor::sample_frames(const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames, + double set_fps) { + if (set_fps > 0.0 && num_frames > 0) { + LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " + "use only one!"; + } + + double fps = set_fps; + + int total_num_frames = metadata.total_num_frames; + + if (num_frames > 0) { + double double_num_frames = + std::round(static_cast(num_frames) / temporal_patch_size) * + temporal_patch_size; + num_frames = static_cast(double_num_frames); + } else if (fps > 0.0) { + if (metadata.fps <= 0.0) { + LOG(FATAL) + << "Asked to sample `fps` frames per second 
but no video metadata " + "was provided which is required when sampling with `fps`. "; + } + + max_frames = + (std::min(max_frames, total_num_frames) / temporal_patch_size) * + temporal_patch_size; + double double_num_frames = + static_cast(total_num_frames) / metadata.fps * fps; + double_num_frames = std::min( + std::min(std::max(double_num_frames, static_cast(min_frames)), + static_cast(max_frames)), + static_cast(total_num_frames)); + double_num_frames = std::floor(double_num_frames / temporal_patch_size) * + temporal_patch_size; + + num_frames = static_cast(double_num_frames); + } + + if (num_frames > total_num_frames) { + LOG(FATAL) << "Video can't be sampled. The inferred num_frames=" + << num_frames << " exceeds total_num_frames=" << total_num_frames + << "."; + } + + if (num_frames > 0) { + std::vector indices; + indices.reserve(num_frames); + for (int i = 0; i < num_frames; ++i) { + int64_t k = static_cast( + (static_cast(i) * total_num_frames) / num_frames); + if (k >= total_num_frames) k = total_num_frames - 1; + indices.push_back(k); + } + return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); + } else { + return torch::arange(0, + static_cast(total_num_frames), + torch::TensorOptions().dtype(torch::kLong)); + } +} + +torch::Tensor ImageProcessor::GLM_sample_frames(const VideoMetadata& metadata, + int temporal_patch_size) { + // video: [T, C, H, W] + const int total_frames = metadata.total_num_frames; + if (total_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + + if (metadata.fps <= 0.0) { + LOG(FATAL) << "invalid metadata.fps <= 0"; + } + + const int max_frame_idx = total_frames - 1; + + // duration = metadata.duration or round(max_idx / fps) + 1 + double duration = metadata.duration; + if (duration <= 0.0) { + duration = + std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; + } + + constexpr double DYN_FPS_30 = 3.0; + constexpr double DYN_FPS_300 = 1.0; + constexpr double DYN_FPS_2400 = 0.5; + constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; + constexpr double MAX_DURATION = 2400.0; + + const double effective_duration = std::min(duration, MAX_DURATION); + + double target_fps = 0.0; + if (effective_duration <= 30.0) { + target_fps = DYN_FPS_30; + } else if (effective_duration <= 300.0) { + target_fps = DYN_FPS_300; + } else { + target_fps = DYN_FPS_2400; + } + + // extract_t = int(effective_duration * target_fps * temporal_patch_size) + int extract_t = static_cast(effective_duration * target_fps * + static_cast(temporal_patch_size)); + extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); + + const double duration_per_frame = 1.0 / metadata.fps; + std::vector timestamps(total_frames); + for (int i = 0; i < total_frames; ++i) { + timestamps[i] = static_cast(i) * duration_per_frame; + } + const int max_second = static_cast(duration); + + torch::Tensor frame_indices; + + if (total_frames < extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } else { + std::vector tmp; + tmp.reserve(static_cast(total_frames)); + double current_second = 0.0; + const double inv_fps = + 1.0 / (static_cast(temporal_patch_size) * target_fps); + + for (int frame_index = 0; frame_index < total_frames; frame_index++) { + if (timestamps[frame_index] >= current_second) { + current_second += inv_fps; + tmp.push_back(frame_index); + if (current_second >= static_cast(max_second)) { + break; + } + } + } + frame_indices = + torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong)); + } + 
int64_t len = frame_indices.size(0); + if (len < extract_t) { + int64_t start, end; + if (len == 0) { + start = 0; + end = std::max(total_frames - 1, 0); + } else { + start = frame_indices[0].item(); + end = frame_indices[len - 1].item(); + } + frame_indices = + torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); + } else if (len > extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } + + len = frame_indices.size(0); + std::unordered_set seen; + seen.reserve(static_cast(len) * 2); + std::vector uniq; + uniq.reserve(static_cast(len)); + + for (int64_t i = 0; i < len; ++i) { + auto idx = frame_indices[i].item(); + if (seen.insert(idx).second) { + uniq.push_back(idx); + } + } + + if (!uniq.empty() && (uniq.size() & 1)) { + uniq.push_back(uniq.back()); + } + + return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); +} + } // namespace xllm diff --git a/xllm/processors/image_processor.h b/xllm/processors/image_processor.h index be7fac5a1..f5b5f33ef 100644 --- a/xllm/processors/image_processor.h +++ b/xllm/processors/image_processor.h @@ -39,6 +39,15 @@ class ImageProcessor { virtual torch::Tensor normalize(const torch::Tensor& image, const std::vector& mean, const std::vector& std); + virtual torch::Tensor init_frames(const VideoMetadata& metadata); + virtual torch::Tensor sample_frames(const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames = -1, + double set_fps = -1.0); + virtual torch::Tensor GLM_sample_frames(const VideoMetadata& metadata, + int temporal_patch_size); }; } // namespace xllm diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp index 16adc17d4..84ebe977a 100644 --- a/xllm/processors/qwen2_vl_image_processor.cpp +++ b/xllm/processors/qwen2_vl_image_processor.cpp @@ -92,14 +92,27 @@ Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) { bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); - if (images.empty()) { - LOG(ERROR) << " image tensor not found."; + std::vector videos = inputs.get_decode_data(MMType::VIDEO); + std::vector video_meta_list = + inputs.get_video_metadata(MMType::VIDEO); + + if (images.empty() && (videos.empty() || video_meta_list.empty())) { + LOG(ERROR) << "no image/video tensor found."; return false; } - if (!this->process_images(images, datas)) { - LOG(ERROR) << " process image failed."; - return false; + if (!images.empty()) { + if (!this->process_images(images, datas)) { + LOG(ERROR) << " process image failed."; + return false; + } + } + + if (!videos.empty()) { + if (!this->process_videos(videos, video_meta_list, datas)) { + LOG(ERROR) << " process video failed."; + return false; + } } return true; @@ -120,9 +133,9 @@ bool Qwen2VLImageProcessor::process_images(std::vector images, auto thw = torch::tensor(grids); thw = thw.clone().reshape({-1, 3}); - mm_datas = std::move(MMData( - MMType::IMAGE, {{"image_grid_thw", thw}, {"pixel_values", values}})); + mm_datas.update(MMType::IMAGE, "image_grid_thw", thw); + mm_datas.update(MMType::IMAGE, "pixel_values", values); return true; } @@ -198,4 +211,155 @@ bool Qwen2VLImageProcessor::process_image( return true; } +bool Qwen2VLImageProcessor::process_videos( + std::vector videos, + std::vector video_meta_list, + MMData& mm_datas) { + std::vector pixel_values; + std::vector grids; + + const size_t video_size = 
videos.size(); + for (size_t i = 0; i < video_size; ++i) { + auto& vid = videos[i]; + auto& metadata = video_meta_list[i]; + if (!this->process_video(vid, metadata, pixel_values, grids)) { + return false; + } + } + + auto values = torch::cat(pixel_values); + auto thw = torch::tensor(grids).clone().reshape({-1, 3}); + + const size_t num_videos = videos.size(); + std::vector second_per_grid; + second_per_grid.reserve(num_videos); + for (size_t i = 0; i < num_videos; ++i) { + const auto& metadata = video_meta_list[i]; + double fps = + metadata.sampled_fps > 0.0 ? metadata.sampled_fps : metadata.fps; + double seconds_per_grid = static_cast(temporal_patch_size_) / fps; + second_per_grid.push_back(seconds_per_grid); + } + + auto opts = torch::TensorOptions().dtype(torch::kFloat32); + auto second_per_grid_ts = torch::tensor(second_per_grid, opts); + + mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.update(MMType::VIDEO, "second_per_grid_ts", second_per_grid_ts); + + mm_datas.video_metadata = std::move(video_meta_list); + return true; +} + +bool Qwen2VLImageProcessor::process_video( + torch::Tensor origin_video, + VideoMetadata& metadata, + std::vector& pixel_values, + std::vector& grids) { + if (origin_video.dim() != 4) { + LOG(FATAL) << "video must be TCHW"; + } + + torch::Tensor indices; + if (do_sample_frame_) { + indices = this->sample_frames(metadata, + temporal_patch_size_, + min_frames_, + max_frames_, + /*num_frames=*/-1, + /*set_fps=*/2.0); + } else { + indices = this->init_frames(metadata); // default sample to 32 frames + } + auto video = origin_video.index_select(/*dim=*/0, indices); + int64_t sampled_total_frames = video.size(0); + + metadata.frame_indices = indices; + metadata.timestamps.clear(); + metadata.timestamps.reserve(static_cast(sampled_total_frames)); + double fps_for_ts = (metadata.fps > 0.0) ? 
metadata.fps : 24.0; + for (int64_t i = 0; i < sampled_total_frames; ++i) { + int64_t frame_idx = metadata.frame_indices[i].item(); + metadata.timestamps.push_back(static_cast(frame_idx) / fps_for_ts); + } + + if (metadata.total_num_frames > 0 && metadata.fps > 0.0) { + metadata.sampled_fps = double(sampled_total_frames) / + double(metadata.total_num_frames) * metadata.fps; + } else { + metadata.sampled_fps = fps_for_ts; + } + + auto shape = video.sizes(); + auto time_len = shape[0]; + auto channel = shape[1]; + auto resized_height = shape[2]; + auto resized_width = shape[3]; + + if (do_resize_) { + auto size = smart_resize(resized_height, + resized_width, + patch_size_ * merge_size_, + size_["shortest_edge"], + size_["longest_edge"]); + if (!size) { + return false; + } + std::tie(resized_height, resized_width) = *size; + } + + std::vector out_frames; + out_frames.reserve(time_len); + // for each frame + auto frames = video.unbind(0); + for (auto& frame : frames) { + // resize + if (do_resize_) + frame = + this->resize(frame, {resized_height, resized_width}, resample_, true); + // normalize + if (do_normalize_) frame = this->normalize(frame, image_mean_, image_std_); + // rescale + if (do_rescale_) frame = this->rescale(frame, rescale_factor_); + out_frames.push_back(frame); + } + + auto out_video = torch::stack(out_frames); // [T,C,H,W] + + auto pad_t = (temporal_patch_size_ - (time_len % temporal_patch_size_)) % + temporal_patch_size_; + if (pad_t > 0) { + auto last = + out_video.index({time_len - 1}).unsqueeze(0).repeat({pad_t, 1, 1, 1}); + out_video = torch::cat({out_video, last}, 0); + } + + shape = out_video.sizes(); + auto grid_h = resized_height / patch_size_; + auto grid_w = resized_width / patch_size_; + auto grid_t = shape[0] / temporal_patch_size_; + + out_video = out_video.contiguous(); + + auto patches = out_video.view({grid_t, + temporal_patch_size_, + channel, + grid_h / merge_size_, + merge_size_, + patch_size_, + grid_w / merge_size_, + merge_size_, + patch_size_}); + + patches = patches.permute({0, 3, 6, 4, 7, 2, 1, 5, 8}); + patches = patches.reshape( + {grid_t * grid_h * grid_w, + channel * temporal_patch_size_ * patch_size_ * patch_size_}); + + pixel_values.emplace_back(patches); + grids.insert(grids.end(), {grid_t, grid_h, grid_w}); + return true; +} + } // namespace xllm diff --git a/xllm/processors/qwen2_vl_image_processor.h b/xllm/processors/qwen2_vl_image_processor.h index 0cdd4c0d9..974ec83e5 100644 --- a/xllm/processors/qwen2_vl_image_processor.h +++ b/xllm/processors/qwen2_vl_image_processor.h @@ -36,6 +36,14 @@ class Qwen2VLImageProcessor : public ImageProcessor { std::vector& pixel_values, std::vector& grids); + bool process_videos(std::vector videos, + std::vector video_meta_list, + MMData& mm_datas); + bool process_video(torch::Tensor video, + VideoMetadata& metadata, + std::vector& pixel_values, + std::vector& grids); + private: bool do_convert_rgb_ = true; bool do_normalize_ = true; @@ -57,6 +65,11 @@ class Qwen2VLImageProcessor : public ImageProcessor { std::unordered_map size_; int temporal_patch_size_ = 2; + + bool do_sample_frame_ = false; + + int min_frames_ = 4; + int max_frames_ = 768; }; } // namespace xllm diff --git a/xllm/pybind/CMakeLists.txt b/xllm/pybind/CMakeLists.txt index 964678802..6eccea0c9 100644 --- a/xllm/pybind/CMakeLists.txt +++ b/xllm/pybind/CMakeLists.txt @@ -24,6 +24,7 @@ pybind_extension( torch c10 ) +target_link_options(xllm_export PRIVATE -Wl,-Bsymbolic) target_link_libraries(common PRIVATE leveldb::leveldb ZLIB::ZLIB 
OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) add_dependencies(common brpc-static) From ad30dcc1d6b6a830313dc3fe7cb9598d226b801a Mon Sep 17 00:00:00 2001 From: "wangyunlong.115" Date: Fri, 28 Nov 2025 18:09:11 +0800 Subject: [PATCH 02/20] feat: support GLM46v modal processor. --- xllm/processors/CMakeLists.txt | 2 + xllm/processors/glm4v_image_processor.cpp | 365 ++++++++++++++++++++++ xllm/processors/glm4v_image_processor.h | 74 +++++ xllm/processors/image_processor.cpp | 3 +- 4 files changed, 442 insertions(+), 2 deletions(-) create mode 100644 xllm/processors/glm4v_image_processor.cpp create mode 100755 xllm/processors/glm4v_image_processor.h mode change 100644 => 100755 xllm/processors/image_processor.cpp diff --git a/xllm/processors/CMakeLists.txt b/xllm/processors/CMakeLists.txt index 27365efe8..fe24f2f4d 100755 --- a/xllm/processors/CMakeLists.txt +++ b/xllm/processors/CMakeLists.txt @@ -21,6 +21,7 @@ cc_library( processors HDRS image_processor.h + glm4v_image_processor.h clip_image_processor.h minicpmv_image_processor.h qwen2_vl_image_processor.h @@ -28,6 +29,7 @@ cc_library( input_processor.h SRCS image_processor.cpp + glm4v_image_processor.cpp clip_image_processor.cpp minicpmv_image_processor.cpp qwen2_vl_image_processor.cpp diff --git a/xllm/processors/glm4v_image_processor.cpp b/xllm/processors/glm4v_image_processor.cpp new file mode 100644 index 000000000..dca14cbd5 --- /dev/null +++ b/xllm/processors/glm4v_image_processor.cpp @@ -0,0 +1,365 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "glm4v_image_processor.h"
+
+namespace xllm {
+
+namespace {
+
+using Size = std::pair<int, int>;
+
+std::optional<Size> smart_resize(int num_frames,
+                                 int height,
+                                 int width,
+                                 int temporal_factor,
+                                 int factor = 28,
+                                 int min_pixels = 56 * 56,
+                                 int max_pixels = 14 * 14 * 4 * 1280) {
+  if (height < factor || width < factor) {
+    LOG(ERROR) << "Height or width must be larger than factor";
+    return std::nullopt;
+  }
+  if (num_frames < temporal_factor) {
+    LOG(ERROR) << "t: " << num_frames
+               << " must be larger than temporal_factor: " << temporal_factor;
+    return std::nullopt;
+  }
+
+  if (static_cast<double>(std::max(height, width)) / std::min(height, width) >
+      200) {
+    LOG(ERROR) << "Absolute aspect ratio must be smaller than 200";
+    return std::nullopt;
+  }
+  int t_bar = static_cast<int>(std::round(
+                  num_frames / static_cast<double>(temporal_factor))) *
+              temporal_factor;
+  int h_bar =
+      static_cast<int>(std::round(height / static_cast<double>(factor))) *
+      factor;
+  int w_bar =
+      static_cast<int>(std::round(width / static_cast<double>(factor))) *
+      factor;
+
+  if (t_bar * h_bar * w_bar > max_pixels) {
+    double beta = std::sqrt((num_frames * height * width) /
+                            static_cast<double>(max_pixels));
+    h_bar = static_cast<int>(
+                std::floor(height / beta / static_cast<double>(factor))) *
+            factor;
+    w_bar = static_cast<int>(
+                std::floor(width / beta / static_cast<double>(factor))) *
+            factor;
+  } else if (t_bar * h_bar * w_bar < min_pixels) {
+    double beta = std::sqrt(min_pixels /
+                            static_cast<double>(height * width * num_frames));
+    h_bar = static_cast<int>(
+                std::ceil(height * beta / static_cast<double>(factor))) *
+            factor;
+    w_bar = static_cast<int>(
+                std::ceil(width * beta / static_cast<double>(factor))) *
+            factor;
+  }
+
+  return std::make_pair(h_bar, w_bar);
+}
+} // namespace
+
+Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) {
+  image_mean_ = args.mm_image_normalize_mean();
+  image_std_ = args.mm_image_normalize_std();
+
+  if (args.mm_image_max_pixels() && args.mm_image_min_pixels()) {
+    min_pixels_ = args.mm_image_min_pixels();
+    max_pixels_ = args.mm_image_max_pixels();
+  } else if (args.mm_image_shortest_edge() && args.mm_image_longest_edge()) {
+    min_pixels_ = args.mm_image_shortest_edge();
+    max_pixels_ = args.mm_image_longest_edge();
+  }
+
+  patch_size_ = args.mm_image_patch_size();
+  temporal_patch_size_ = args.mm_image_temporal_patch_size();
+
+  merge_size_ = args.mm_image_merge_size();
+  size_ = {{"longest_edge", 12845056}, {"shortest_edge", 3136}};
+
+  // fuse image mean/std and rescale_factor
+  if (do_rescale_ && do_normalize_) {
+    for (auto& item : image_mean_) {
+      item = item * (1.0 / rescale_factor_);
+    }
+
+    for (auto& item : image_std_) {
+      item = item * (1.0 / rescale_factor_);
+    }
+
+    do_rescale_ = false;
+  }
+}
+
+bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) {
+  std::vector<torch::Tensor> images = inputs.get_decode_data(MMType::IMAGE);
+  std::vector<torch::Tensor> videos = inputs.get_decode_data(MMType::VIDEO);
+  std::vector<VideoMetadata> video_meta_list =
+      inputs.get_video_metadata(MMType::VIDEO);
+
+  if (images.empty() && (videos.empty() || video_meta_list.empty())) {
+    LOG(ERROR) << "no image/video tensor found.";
+    return false;
+  }
+
+  if (!images.empty()) {
+    if (!this->process_images(images, datas)) {
+      LOG(ERROR) << " process image failed.";
+      return false;
+    }
+  }
+
+  if (!videos.empty()) {
+    if (!this->process_videos(videos, video_meta_list, datas)) {
+      LOG(ERROR) << " process video failed.";
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool Glm4VImageProcessor::process_images(std::vector<torch::Tensor>
images, + MMData& mm_datas) { + std::vector pixel_values; + std::vector grids; + + for (const auto& img : images) { + if (!this->process_image(img, pixel_values, grids)) { + return false; + } + } + + auto values = torch::cat(pixel_values); + auto thw = torch::tensor(grids); + + thw = thw.clone().reshape({-1, 3}); + mm_datas = std::move(MMData( + MMType::IMAGE, {{"image_grid_thw", thw}, {"pixel_values", values}})); + + return true; +} + +bool Glm4VImageProcessor::process_image( + torch::Tensor image, + std::vector& pixel_values, + std::vector& grids) { + auto shape = image.sizes(); + + auto resized_height = shape[1]; + auto resized_width = shape[2]; + + // do_convert_rgb + + // resize + if (do_resize_) { + auto size = smart_resize(temporal_patch_size_, + resized_height, + resized_width, + temporal_patch_size_, + patch_size_ * merge_size_, + min_pixels_, + max_pixels_); + if (!size) { + return false; + } + + std::tie(resized_height, resized_width) = *size; + image = + this->resize(image, {resized_height, resized_width}, resample_, true); + } + + // normalize + if (do_normalize_) { + image = this->normalize(image, image_mean_, image_std_); + } + + // rescale + if (do_rescale_) { + image = this->rescale(image, rescale_factor_); + } + + auto patches = torch::stack({image}, 0); + + auto repeats = patches[-1].unsqueeze(0).repeat( + /*{temporal_patch_size_ - (shape[0] % temporal_patch_size_)*/ { + temporal_patch_size_ - 1, 1, 1, 1}); + patches = torch::cat({patches, repeats}, 0); + shape = patches.sizes(); + auto channel = shape[1]; + auto grid_t = shape[0] / temporal_patch_size_; + + auto grid_h = resized_height / patch_size_; + auto grid_w = resized_width / patch_size_; + + patches = patches.view({grid_t, + temporal_patch_size_, + channel, + grid_h / merge_size_, + merge_size_, + patch_size_, + grid_w / merge_size_, + merge_size_, + patch_size_}); + patches = patches.permute({0, 3, 6, 4, 7, 2, 1, 5, 8}); + patches = patches.reshape( + {grid_t * grid_h * grid_w, + channel * temporal_patch_size_ * patch_size_ * patch_size_}); + + pixel_values.emplace_back(patches); + grids.insert(grids.end(), {grid_t, grid_h, grid_w}); + + return true; +} + +bool Glm4VImageProcessor::process_videos( + std::vector videos, + std::vector video_meta_list, + MMData& mm_datas) { + std::vector pixel_values; + std::vector grids; + + const size_t video_size = videos.size(); + for (size_t i = 0; i < video_size; ++i) { + auto& vid = videos[i]; + auto& metadata = video_meta_list[i]; + if (!this->process_video(vid, metadata, pixel_values, grids)) { + return false; + } + } + + auto values = torch::cat(pixel_values); + auto thw = torch::tensor(grids).clone().reshape({-1, 3}); + mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.video_metadata = std::move(video_meta_list); + return true; +} + +bool Glm4VImageProcessor::process_video( + torch::Tensor origin_video, + VideoMetadata& metadata, + std::vector& pixel_values, + std::vector& grids) { + if (origin_video.dim() != 4) { + LOG(FATAL) << "video must be TCHW"; + } + + torch::Tensor indices; + if (do_sample_frame_) { + indices = this->GLM_sample_frames(metadata, temporal_patch_size_); + } else { + indices = this->init_frames(metadata); // default sample to 32 frames + } + auto video = origin_video.index_select(/*dim=*/0, indices); + int64_t sampled_total_frames = video.size(0); + + metadata.frame_indices = indices; + metadata.timestamps.clear(); + 
metadata.timestamps.reserve(static_cast(sampled_total_frames)); + double fps_for_ts = (metadata.fps > 0.0) ? metadata.fps : 24.0; + for (int64_t i = 0; i < sampled_total_frames; ++i) { + int64_t frame_idx = metadata.frame_indices[i].item(); + metadata.timestamps.push_back(static_cast(frame_idx) / fps_for_ts); + } + + if (metadata.total_num_frames > 0 && metadata.fps > 0.0) { + metadata.sampled_fps = double(sampled_total_frames) / + double(metadata.total_num_frames) * metadata.fps; + } else { + metadata.sampled_fps = fps_for_ts; + } + + auto shape = video.sizes(); + auto time_len = shape[0]; + auto channel = shape[1]; + auto resized_height = shape[2]; + auto resized_width = shape[3]; + + if (do_resize_) { + auto size = smart_resize(temporal_patch_size_, + resized_height, + resized_width, + temporal_patch_size_, + patch_size_ * merge_size_, + min_pixels_, + max_pixels_); + if (!size) { + return false; + } + std::tie(resized_height, resized_width) = *size; + } + + std::vector out_frames; + out_frames.reserve(time_len); + // for each frame + auto frames = video.unbind(0); + + for (auto& frame : frames) { + // resize + if (do_resize_) + frame = + this->resize(frame, {resized_height, resized_width}, resample_, true); + // normalize + if (do_normalize_) frame = this->normalize(frame, image_mean_, image_std_); + // rescale + if (do_rescale_) frame = this->rescale(frame, rescale_factor_); + out_frames.push_back(frame); + } + + auto out_video = torch::stack(out_frames); // [T,C,H,W] + + if (out_video.size(0) % temporal_patch_size_) { + auto last = out_video.index({time_len - 1}) + .unsqueeze(0) + .repeat({temporal_patch_size_ - 1, 1, 1, 1}); + out_video = torch::cat({out_video, last}, 0); + } + + shape = out_video.sizes(); + auto grid_h = resized_height / patch_size_; + auto grid_w = resized_width / patch_size_; + auto grid_t = shape[0] / temporal_patch_size_; + + out_video = out_video.contiguous(); + + auto patches = out_video.view({grid_t, + temporal_patch_size_, + channel, + grid_h / merge_size_, + merge_size_, + patch_size_, + grid_w / merge_size_, + merge_size_, + patch_size_}); + + patches = patches.permute({0, 3, 6, 4, 7, 2, 1, 5, 8}); + patches = patches.reshape( + {grid_t * grid_h * grid_w, + channel * temporal_patch_size_ * patch_size_ * patch_size_}); + + pixel_values.emplace_back(patches); + + grids.insert(grids.end(), {grid_t, grid_h, grid_w}); + return true; +} + +} // namespace xllm \ No newline at end of file diff --git a/xllm/processors/glm4v_image_processor.h b/xllm/processors/glm4v_image_processor.h new file mode 100755 index 000000000..7a2202250 --- /dev/null +++ b/xllm/processors/glm4v_image_processor.h @@ -0,0 +1,74 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include +#include +#include + +#include "image_processor.h" + +namespace xllm { + +class Glm4VImageProcessor : public ImageProcessor { + public: + Glm4VImageProcessor(const ModelArgs&); + ~Glm4VImageProcessor() override = default; + + bool process(const MMInput& mm_inputs, MMData& mm_datas) override; + + private: + bool process_images(std::vector images, MMData& mm_datas); + bool process_image(torch::Tensor image, + std::vector& pixel_values, + std::vector& grids); + bool process_videos(std::vector videos, + std::vector video_meta_list, + MMData& mm_datas); + bool process_video(torch::Tensor video, + VideoMetadata& metadata, + std::vector& pixel_values, + std::vector& grids); + + private: + bool do_convert_rgb_ = true; + bool do_normalize_ = true; + + bool do_rescale_ = true; + bool do_resize_ = true; + + std::vector image_mean_; + std::vector image_std_; + + int max_pixels_ = 12845056; + int min_pixels_ = 3136; + + int merge_size_ = 2; + int patch_size_ = 14; + + int resample_ = 3; + double rescale_factor_ = 0.00392156862745098; + + std::unordered_map size_; + int temporal_patch_size_ = 2; + + bool do_sample_frame_ = true; + + int min_frames_ = 4; + int max_frames_ = 768; +}; + +} // namespace xllm diff --git a/xllm/processors/image_processor.cpp b/xllm/processors/image_processor.cpp old mode 100644 new mode 100755 index 645905001..7317ea911 --- a/xllm/processors/image_processor.cpp +++ b/xllm/processors/image_processor.cpp @@ -118,9 +118,8 @@ torch::Tensor ImageProcessor::normalize(const torch::Tensor& image, result = image.to(torch::kFloat32); } - auto dtype = image.dtype(); auto device = image.device(); - auto options = torch::dtype(dtype).device(device); + auto options = torch::dtype(torch::kFloat32).device(device); auto m_tensor = torch::tensor(mean, options).reshape({-1, 1, 1}); auto s_tensor = torch::tensor(std, options).reshape({-1, 1, 1}); From f5a7fdf61e68d48b263482415b01b0ec38e2d0c2 Mon Sep 17 00:00:00 2001 From: "wangziyue.28" Date: Mon, 1 Dec 2025 20:27:12 +0800 Subject: [PATCH 03/20] feat: update VideoMetada in MMData. 
--- xllm/core/framework/request/mm_data.h | 11 ++++++++++- xllm/processors/glm4v_image_processor.cpp | 7 ++++--- xllm/processors/qwen2_vl_image_processor.cpp | 12 ++++++------ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/xllm/core/framework/request/mm_data.h b/xllm/core/framework/request/mm_data.h index 78b299044..12d4dff61 100644 --- a/xllm/core/framework/request/mm_data.h +++ b/xllm/core/framework/request/mm_data.h @@ -142,13 +142,22 @@ struct MMData { void debug_print() const; + const std::vector& get_video_metadata() const { + return video_metadata_; + } + + void set_video_metadata(const std::vector& meta) { + video_metadata_ = meta; + } + static MMData to(const MMData& mm_data, const torch::Device& device); static MMData batch(const std::vector& mm_datas); uint32_t ty_ = MMType::NONE; MMDict data_; - std::vector video_metadata; + private: + std::vector video_metadata_; }; } // namespace xllm diff --git a/xllm/processors/glm4v_image_processor.cpp b/xllm/processors/glm4v_image_processor.cpp index dca14cbd5..be548f613 100644 --- a/xllm/processors/glm4v_image_processor.cpp +++ b/xllm/processors/glm4v_image_processor.cpp @@ -152,8 +152,8 @@ bool Glm4VImageProcessor::process_images(std::vector images, auto thw = torch::tensor(grids); thw = thw.clone().reshape({-1, 3}); - mm_datas = std::move(MMData( - MMType::IMAGE, {{"image_grid_thw", thw}, {"pixel_values", values}})); + mm_datas.add(MMType::IMAGE, "image_grid_thw", thw); + mm_datas.add(MMType::IMAGE, "pixel_values", values); return true; } @@ -245,12 +245,13 @@ bool Glm4VImageProcessor::process_videos( return false; } } + mm_datas.set_video_metadata(video_meta_list); auto values = torch::cat(pixel_values); auto thw = torch::tensor(grids).clone().reshape({-1, 3}); mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); - mm_datas.video_metadata = std::move(video_meta_list); + return true; } diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp index 84ebe977a..e1639dd6b 100644 --- a/xllm/processors/qwen2_vl_image_processor.cpp +++ b/xllm/processors/qwen2_vl_image_processor.cpp @@ -134,8 +134,8 @@ bool Qwen2VLImageProcessor::process_images(std::vector images, thw = thw.clone().reshape({-1, 3}); - mm_datas.update(MMType::IMAGE, "image_grid_thw", thw); - mm_datas.update(MMType::IMAGE, "pixel_values", values); + mm_datas.add(MMType::IMAGE, "image_grid_thw", thw); + mm_datas.add(MMType::IMAGE, "pixel_values", values); return true; } @@ -240,15 +240,15 @@ bool Qwen2VLImageProcessor::process_videos( double seconds_per_grid = static_cast(temporal_patch_size_) / fps; second_per_grid.push_back(seconds_per_grid); } + mm_datas.set_video_metadata(video_meta_list); auto opts = torch::TensorOptions().dtype(torch::kFloat32); auto second_per_grid_ts = torch::tensor(second_per_grid, opts); - mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); - mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); - mm_datas.update(MMType::VIDEO, "second_per_grid_ts", second_per_grid_ts); + mm_datas.add(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.add(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.add(MMType::VIDEO, "second_per_grid_ts", second_per_grid_ts); - mm_datas.video_metadata = std::move(video_meta_list); return true; } From 95f4f017c21242164b1eb706ed0b093c3f05e200 Mon Sep 17 00:00:00 2001 From: "wangziyue.28" Date: Mon, 1 Dec 2025 20:35:55 +0800 Subject: [PATCH 04/20] feat: add args loader for video 
 preprocess config.

---
 xllm/core/framework/hf_model_loader.cpp | 25 +++++++++++++++++++++++++
 xllm/core/framework/hf_model_loader.h   |  1 +
 xllm/core/framework/model/model_args.h  |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/xllm/core/framework/hf_model_loader.cpp b/xllm/core/framework/hf_model_loader.cpp
index e5fd7c348..23738289b 100644
--- a/xllm/core/framework/hf_model_loader.cpp
+++ b/xllm/core/framework/hf_model_loader.cpp
@@ -102,6 +102,11 @@ bool HFModelLoader::load_args(const std::string& model_weights_path) {
     return false;
   }
 
+  if (!load_video_preprocessor_args(model_weights_path)) {
+    LOG(ERROR) << "Failed to load video preprocess args from "
+               << model_weights_path;
+    return false;
+  }
   // Some hacky logics to support loading of old models
   // always use float16 for quantization
   // TODO: support quantization for other data types
@@ -416,4 +421,24 @@ bool HFModelLoader::load_image_preprocessor_args(
   return true;
 }
 
+bool HFModelLoader::load_video_preprocessor_args(
+    const std::string& model_weights_path) {
+  // video preprocessor args
+  JsonReader video_preprocess_reader;
+  const std::string video_preprocess_file_path =
+      model_weights_path + "/video_preprocessor_config.json";
+  if (video_preprocess_reader.parse(video_preprocess_file_path)) {
+    LOG(INFO) << "Success to parse video preprocess args file: "
+              << video_preprocess_file_path;
+
+    args_.mm_video_shortest_edge() =
+        video_preprocess_reader.value_or("size.shortest_edge", 0);
+
+    args_.mm_video_longest_edge() =
+        video_preprocess_reader.value_or("size.longest_edge", 0);
+  }
+
+  return true;
+}
+
 } // namespace xllm
diff --git a/xllm/core/framework/hf_model_loader.h b/xllm/core/framework/hf_model_loader.h
index bb2401b0f..f98939adb 100644
--- a/xllm/core/framework/hf_model_loader.h
+++ b/xllm/core/framework/hf_model_loader.h
@@ -40,6 +40,7 @@ class HFModelLoader : public ModelLoader {
   bool load_quant_args(const std::string& model_weights_path);
   bool load_tokenizer_args(const std::string& model_weights_path);
   bool load_image_preprocessor_args(const std::string& model_weights_path);
+  bool load_video_preprocessor_args(const std::string& model_weights_path);
   std::string model_weights_path() const override {
     return model_weights_path_;
   }
diff --git a/xllm/core/framework/model/model_args.h b/xllm/core/framework/model/model_args.h
index 168565e89..c8c421904 100644
--- a/xllm/core/framework/model/model_args.h
+++ b/xllm/core/framework/model/model_args.h
@@ -297,6 +297,10 @@ struct ModelArgs {
   PROPERTY(int64_t, mm_image_shortest_edge) = 0;
   PROPERTY(int64_t, mm_image_longest_edge) = 0;
 
+  // GLM
+  PROPERTY(int64_t, mm_video_shortest_edge) = 0;
+  PROPERTY(int64_t, mm_video_longest_edge) = 0;
+
   PROPERTY(int, mm_image_patch_size) = 0;
   PROPERTY(int, mm_image_temporal_patch_size) = 0;
   PROPERTY(int, mm_image_merge_size) = 0;

From 9c2007f9a63f225dfa6f5f1dfbf414fec42317eb Mon Sep 17 00:00:00 2001
From: chenxiaoyu8
Date: Tue, 2 Dec 2025 12:13:57 +0800
Subject: [PATCH 05/20] feat: add glm4_6_vl input processor.

---
 xllm/models/models.h      |   1 +
 xllm/models/vlm/glm4_vl.h | 180 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 xllm/models/vlm/glm4_vl.h

diff --git a/xllm/models/models.h b/xllm/models/models.h
index 0460d6ff5..7b318e484 100644
--- a/xllm/models/models.h
+++ b/xllm/models/models.h
@@ -33,6 +33,7 @@ limitations under the License.
#include "llm/llama.h" // IWYU pragma: keep #include "llm/llama3.h" // IWYU pragma: keep #include "llm/qwen3_embedding.h" // IWYU pragma: keep +#include "vlm/glm4_vl.h" // IWYU pragma: keep #include "vlm/minicpmv.h" // IWYU pragma: keep #include "vlm/qwen2_5_vl.h" // IWYU pragma: keep #include "vlm/qwen3_vl.h" // IWYU pragma: keep diff --git a/xllm/models/vlm/glm4_vl.h b/xllm/models/vlm/glm4_vl.h new file mode 100644 index 000000000..a9afaceaa --- /dev/null +++ b/xllm/models/vlm/glm4_vl.h @@ -0,0 +1,180 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once + +#include + +#include +#include + +#include "core/framework/model/model_args.h" +#include "core/framework/request/mm_data.h" +#include "processors/input_processor.h" + +namespace xllm { + +class GLM4_6_VLInputProcessor : public InputProcessor { + enum class TokenType { + INVALID, + IMAGE, + VIDEO, + }; + + public: + GLM4_6_VLInputProcessor(const ModelArgs& args) { + merge_size_ = args.mm_image_merge_size(); + } + + void process(std::string& prompt, const MMData& mm_data) override { + torch::Tensor image_grid_thw; + if (auto res = mm_data.get("image_grid_thw")) + image_grid_thw = res.value(); + + torch::Tensor video_grid_thw; + if (auto res = mm_data.get("video_grid_thw")) + video_grid_thw = res.value(); + + if (!image_grid_thw.defined() && !video_grid_thw.defined()) return; + + const auto& video_metadata = mm_data.get_video_metadata(); + if (video_metadata.size() > 0) { + CHECK(video_metadata.size() == + static_cast(video_grid_thw.sizes()[0])); + } + + auto merge_length = merge_size_ * merge_size_; + int total_image_token = 0; + + if (image_grid_thw.defined()) { + auto count = image_grid_thw.sizes()[0]; + for (int idx = 0; idx < count; ++idx) + total_image_token += + image_grid_thw[idx].prod().item() / merge_length; + } + + int total_video_token = 0; + if (video_grid_thw.defined()) { + auto count = video_grid_thw.sizes()[0]; + for (int idx = 0; idx < count; ++idx) + total_video_token += video_grid_thw[idx].prod().item() / + merge_length / video_grid_thw[idx][0].item(); + } + + size_t total_token_len = total_image_token * image_token_.size() + + total_video_token * image_token_.size(); + std::string data; + data.reserve(prompt.size() + total_token_len); + + int image_index = 0; + int video_index = 0; + + size_t begin = 0; + auto pair = find_vision_token(prompt, begin); + + while (pair.second != std::string::npos) { + data.append(prompt, begin, pair.second - begin); + + if (pair.first == TokenType::IMAGE) { + auto token_num = + image_grid_thw[image_index].prod().item() / merge_length; + while (token_num--) data.append(image_token_); + + image_index++; + begin = pair.second + image_token_.size(); + } else if (pair.first == TokenType::VIDEO) { + auto num_frames = video_grid_thw[video_index][0].item(); + auto timestamps = video_metadata[video_index].timestamps; + CHECK(!timestamps.empty()); + + auto 
selected = build_timestamps(timestamps, num_frames);
+        auto token_num = video_grid_thw[video_index].prod().item<int64_t>() /
+                         merge_length / num_frames;
+
+        for (size_t idx = 0; idx < num_frames; ++idx) {
+          data.append(begin_of_image_token_);
+
+          auto num = token_num;
+          while (num--) data.append(image_token_);
+
+          data.append(end_of_image_token_);
+          data.append(format_timestamp_str(selected[idx]));
+        }
+
+        video_index++;
+        begin = pair.second + video_token_.size();
+      } else {
+        assert(false);
+      }
+
+      pair = find_vision_token(prompt, begin);
+    }
+
+    if (begin < prompt.size()) data.append(prompt, begin, std::string::npos);
+
+    prompt = std::move(data);
+  }
+
+ private:
+  std::pair<TokenType, size_t> find_vision_token(const std::string& prompt,
+                                                 size_t begin) {
+    auto img_pos = prompt.find(image_token_, begin);
+    auto vid_pos = prompt.find(video_token_, begin);
+
+    if (img_pos == std::string::npos && vid_pos == std::string::npos)
+      return {TokenType::INVALID, std::string::npos};
+    else if (vid_pos == std::string::npos)
+      return {TokenType::IMAGE, img_pos};
+    else if (img_pos == std::string::npos)
+      return {TokenType::VIDEO, vid_pos};
+    else
+      return img_pos < vid_pos ? std::make_pair(TokenType::IMAGE, img_pos)
+                               : std::make_pair(TokenType::VIDEO, vid_pos);
+  }
+
+  std::vector<double> build_timestamps(const std::vector<double>& timestamps,
+                                       size_t num_frames) {
+    std::vector<double> vec;
+    vec.reserve(num_frames);
+
+    for (size_t i = 0; i < timestamps.size(); i += 2) {
+      vec.push_back(timestamps[i]);
+      if (vec.size() == num_frames) break;
+    }
+
+    while (vec.size() < num_frames) {
+      vec.push_back(vec.back());
+    }
+
+    return vec;
+  }
+
+  std::string format_timestamp_str(double timestamp) {
+    char buffer[32];
+    sprintf(buffer, "%.1f seconds", timestamp);
+    return buffer;
+  }
+
+ private:
+  const std::string image_token_ = "<|image|>";
+  const std::string video_token_ = "<|video|>";
+
+  const std::string begin_of_image_token_ = "<|begin_of_image|>";
+  const std::string end_of_image_token_ = "<|end_of_image|>";
+
+  int merge_size_ = 0;
+};
+
+} // namespace xllm

From fa7a5eedc4a0004486869864b604a32793043114 Mon Sep 17 00:00:00 2001
From: "wangziyue.28"
Date: Wed, 3 Dec 2025 15:47:42 +0800
Subject: [PATCH 06/20] feat: 1. move sample_frames from image_processor. 2. support different types in chat_template content.
--- .../chat_template/jinja_chat_template.cpp | 27 +-- xllm/core/framework/request/mm_codec.cpp | 3 +- xllm/core/framework/request/mm_input.h | 6 +- xllm/processors/glm4v_image_processor.cpp | 124 +++++++++++- xllm/processors/glm4v_image_processor.h | 2 + xllm/processors/image_processor.cpp | 187 ------------------ xllm/processors/image_processor.h | 9 - xllm/processors/qwen2_vl_image_processor.cpp | 73 ++++++- xllm/processors/qwen2_vl_image_processor.h | 8 +- 9 files changed, 209 insertions(+), 230 deletions(-) mode change 100755 => 100644 xllm/processors/glm4v_image_processor.h mode change 100755 => 100644 xllm/processors/image_processor.cpp diff --git a/xllm/core/framework/chat_template/jinja_chat_template.cpp b/xllm/core/framework/chat_template/jinja_chat_template.cpp index 44caf09ee..fcd1f2166 100644 --- a/xllm/core/framework/chat_template/jinja_chat_template.cpp +++ b/xllm/core/framework/chat_template/jinja_chat_template.cpp @@ -121,24 +121,6 @@ std::optional JinjaChatTemplate::apply( nlohmann::ordered_json& messages, const nlohmann::ordered_json& tools, const nlohmann::ordered_json& chat_template_kwargs) const { - for (auto& msg : messages) { - if (!msg.contains("content")) continue; - auto& content = msg["content"]; - auto normalize_item = [](nlohmann::ordered_json& item) { - if (item.contains("type") && item["type"].is_string()) { - std::string t = item["type"].get(); - if (t == "video_url") item["type"] = "video"; - } - if (item.contains("video_url") && !item.contains("video")) - item["video"] = item["video_url"]; - }; - - if (content.is_array()) { - for (auto& it : content) normalize_item(it); - } else if (content.is_object()) { - normalize_item(content); - } - } minja::chat_template_inputs input; input.messages = messages; input.tools = tools; @@ -159,6 +141,15 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content( if (item.type == "text") { item_json["text"] = item.text; + } else if (item.type == "video_url") { + item_json["video"] = "mm place holder"; + item_json["video_url"] = "mm place holder"; + } else if (item.type == "image_url") { + item_json["image"] = "mm place holder"; + item_json["image_url"] = "mm place holder"; + } else if (item.type == "audio_url") { + item_json["audio"] = "mm place holder"; + item_json["audio_url"] = "mm place holder"; } else { item_json[item.type] = "mm place holder"; } diff --git a/xllm/core/framework/request/mm_codec.cpp b/xllm/core/framework/request/mm_codec.cpp index 0c78b5933..e862b76d4 100644 --- a/xllm/core/framework/request/mm_codec.cpp +++ b/xllm/core/framework/request/mm_codec.cpp @@ -159,8 +159,7 @@ bool OpenCVVideoDecoder::decode(const std::string& raw_data, av_dict_set(&opts, "probesize", "20000000", 0); av_dict_set(&opts, "analyzeduration", "5000000", 0); - const AVInputFormat* in_fmt = av_find_input_format("mp4"); - int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts); + int ret = avformat_open_input(&fmt, nullptr, nullptr, &opts); av_dict_free(&opts); if (ret < 0) { diff --git a/xllm/core/framework/request/mm_input.h b/xllm/core/framework/request/mm_input.h index 1b5fc57b3..9f2d3237c 100644 --- a/xllm/core/framework/request/mm_input.h +++ b/xllm/core/framework/request/mm_input.h @@ -58,13 +58,11 @@ struct MMInput { return std::move(vec); } - std::vector get_video_metadata(MMType type) const { + std::vector get_video_metadata() const { std::vector metas; metas.reserve(items_.size()); for (auto& item : items_) { - if (item.type_ == type) { - metas.push_back(item.video_meta_); - } + metas.push_back(item.video_meta_); } 
return metas; } diff --git a/xllm/processors/glm4v_image_processor.cpp b/xllm/processors/glm4v_image_processor.cpp index be548f613..6e00b7e2b 100644 --- a/xllm/processors/glm4v_image_processor.cpp +++ b/xllm/processors/glm4v_image_processor.cpp @@ -77,6 +77,117 @@ std::optional smart_resize(int num_frames, } } // namespace +torch::Tensor Glm4VImageProcessor::sample_frames(const VideoMetadata& metadata, + int temporal_patch_size) { + // video: [T, C, H, W] + const int total_frames = metadata.total_num_frames; + if (total_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + + if (metadata.fps <= 0.0) { + LOG(FATAL) << "invalid metadata.fps <= 0"; + } + + const int max_frame_idx = total_frames - 1; + + // duration = metadata.duration or round(max_idx / fps) + 1 + double duration = metadata.duration; + if (duration <= 0.0) { + duration = + std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; + } + + constexpr double DYN_FPS_30 = 3.0; + constexpr double DYN_FPS_300 = 1.0; + constexpr double DYN_FPS_2400 = 0.5; + constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; + constexpr double MAX_DURATION = 2400.0; + + const double effective_duration = std::min(duration, MAX_DURATION); + + double target_fps = 0.0; + if (effective_duration <= 30.0) { + target_fps = DYN_FPS_30; + } else if (effective_duration <= 300.0) { + target_fps = DYN_FPS_300; + } else { + target_fps = DYN_FPS_2400; + } + + // extract_t = int(effective_duration * target_fps * temporal_patch_size) + int extract_t = static_cast(effective_duration * target_fps * + static_cast(temporal_patch_size)); + extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); + + const double duration_per_frame = 1.0 / metadata.fps; + std::vector timestamps(total_frames); + for (int i = 0; i < total_frames; ++i) { + timestamps[i] = static_cast(i) * duration_per_frame; + } + const int max_second = static_cast(duration); + + torch::Tensor frame_indices; + + if (total_frames < extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } else { + std::vector tmp; + tmp.reserve(static_cast(total_frames)); + double current_second = 0.0; + const double inv_fps = + 1.0 / (static_cast(temporal_patch_size) * target_fps); + + for (int frame_index = 0; frame_index < total_frames; frame_index++) { + if (timestamps[frame_index] >= current_second) { + current_second += inv_fps; + tmp.push_back(frame_index); + if (current_second >= static_cast(max_second)) { + break; + } + } + } + frame_indices = + torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong)); + } + int64_t len = frame_indices.size(0); + if (len < extract_t) { + int64_t start, end; + if (len == 0) { + start = 0; + end = std::max(total_frames - 1, 0); + } else { + start = frame_indices[0].item(); + end = frame_indices[len - 1].item(); + } + frame_indices = + torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); + } else if (len > extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } + + len = frame_indices.size(0); + std::unordered_set seen; + seen.reserve(static_cast(len) * 2); + std::vector uniq; + uniq.reserve(static_cast(len)); + + for (int64_t i = 0; i < len; ++i) { + auto idx = frame_indices[i].item(); + if (seen.insert(idx).second) { + uniq.push_back(idx); + } + } + + if (!uniq.empty() && (uniq.size() & 1)) { + uniq.push_back(uniq.back()); + } + + return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); +} + 
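+// Editorial comments (illustrative only; not generated by the original
+// patch): sample_frames above derives a dynamic target fps from the clip
+// duration (3.0 fps up to 30 s, 1.0 fps up to 300 s, 0.5 fps beyond, with
+// the duration capped at 2400 s) and extracts
+//   extract_t = min(int(duration * target_fps * temporal_patch_size), 640)
+// frames by walking timestamps in steps of 1 / (temporal_patch_size * fps).
+// Worked example under those constants: a 60 s clip decoded at 30 fps
+// (1800 frames) with temporal_patch_size = 2 gives target_fps = 1.0 and
+// extract_t = min(int(60 * 1.0 * 2), 640) = 120, i.e. roughly one frame
+// every 0.5 s; duplicate indices are then dropped and the index list is
+// padded to an even length so it stays divisible by the temporal patch size.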
Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { image_mean_ = args.mm_image_normalize_mean(); image_std_ = args.mm_image_normalize_std(); @@ -112,8 +223,7 @@ Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); std::vector videos = inputs.get_decode_data(MMType::VIDEO); - std::vector video_meta_list = - inputs.get_video_metadata(MMType::VIDEO); + std::vector video_meta_list = inputs.get_video_metadata(); if (images.empty() && (videos.empty() || video_meta_list.empty())) { LOG(ERROR) << "no image/video tensor found."; @@ -249,8 +359,8 @@ bool Glm4VImageProcessor::process_videos( auto values = torch::cat(pixel_values); auto thw = torch::tensor(grids).clone().reshape({-1, 3}); - mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); - mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.add(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.add(MMType::VIDEO, "pixel_values_videos", values); return true; } @@ -266,9 +376,11 @@ bool Glm4VImageProcessor::process_video( torch::Tensor indices; if (do_sample_frame_) { - indices = this->GLM_sample_frames(metadata, temporal_patch_size_); + indices = this->sample_frames(metadata, temporal_patch_size_); } else { - indices = this->init_frames(metadata); // default sample to 32 frames + indices = torch::arange(0, + static_cast(origin_video.size(0)), + torch::TensorOptions().dtype(torch::kLong)); } auto video = origin_video.index_select(/*dim=*/0, indices); int64_t sampled_total_frames = video.size(0); diff --git a/xllm/processors/glm4v_image_processor.h b/xllm/processors/glm4v_image_processor.h old mode 100755 new mode 100644 index 7a2202250..2313fb9bf --- a/xllm/processors/glm4v_image_processor.h +++ b/xllm/processors/glm4v_image_processor.h @@ -42,6 +42,8 @@ class Glm4VImageProcessor : public ImageProcessor { VideoMetadata& metadata, std::vector& pixel_values, std::vector& grids); + torch::Tensor sample_frames(const VideoMetadata& metadata, + int temporal_patch_size); private: bool do_convert_rgb_ = true; diff --git a/xllm/processors/image_processor.cpp b/xllm/processors/image_processor.cpp old mode 100755 new mode 100644 index 7317ea911..82195f36a --- a/xllm/processors/image_processor.cpp +++ b/xllm/processors/image_processor.cpp @@ -128,191 +128,4 @@ torch::Tensor ImageProcessor::normalize(const torch::Tensor& image, return result.div_(s_tensor); } -torch::Tensor ImageProcessor::init_frames(const VideoMetadata& metadata) { - int total_num_frames = metadata.total_num_frames; - int nframes_len = 32; - if (total_num_frames <= 0) { - return torch::empty({0}, torch::dtype(torch::kLong)); - } - auto idx = torch::linspace( - 0, total_num_frames - 1, nframes_len, torch::dtype(torch::kLong)); - return idx; -} - -torch::Tensor ImageProcessor::sample_frames(const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames, - double set_fps) { - if (set_fps > 0.0 && num_frames > 0) { - LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " - "use only one!"; - } - - double fps = set_fps; - - int total_num_frames = metadata.total_num_frames; - - if (num_frames > 0) { - double double_num_frames = - std::round(static_cast(num_frames) / temporal_patch_size) * - temporal_patch_size; - num_frames = static_cast(double_num_frames); - } else if (fps > 0.0) { - if (metadata.fps <= 0.0) { - LOG(FATAL) - << "Asked to sample 
`fps` frames per second but no video metadata " - "was provided which is required when sampling with `fps`. "; - } - - max_frames = - (std::min(max_frames, total_num_frames) / temporal_patch_size) * - temporal_patch_size; - double double_num_frames = - static_cast(total_num_frames) / metadata.fps * fps; - double_num_frames = std::min( - std::min(std::max(double_num_frames, static_cast(min_frames)), - static_cast(max_frames)), - static_cast(total_num_frames)); - double_num_frames = std::floor(double_num_frames / temporal_patch_size) * - temporal_patch_size; - - num_frames = static_cast(double_num_frames); - } - - if (num_frames > total_num_frames) { - LOG(FATAL) << "Video can't be sampled. The inferred num_frames=" - << num_frames << " exceeds total_num_frames=" << total_num_frames - << "."; - } - - if (num_frames > 0) { - std::vector indices; - indices.reserve(num_frames); - for (int i = 0; i < num_frames; ++i) { - int64_t k = static_cast( - (static_cast(i) * total_num_frames) / num_frames); - if (k >= total_num_frames) k = total_num_frames - 1; - indices.push_back(k); - } - return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); - } else { - return torch::arange(0, - static_cast(total_num_frames), - torch::TensorOptions().dtype(torch::kLong)); - } -} - -torch::Tensor ImageProcessor::GLM_sample_frames(const VideoMetadata& metadata, - int temporal_patch_size) { - // video: [T, C, H, W] - const int total_frames = metadata.total_num_frames; - if (total_frames <= 0) { - return torch::empty({0}, torch::dtype(torch::kLong)); - } - - if (metadata.fps <= 0.0) { - LOG(FATAL) << "invalid metadata.fps <= 0"; - } - - const int max_frame_idx = total_frames - 1; - - // duration = metadata.duration or round(max_idx / fps) + 1 - double duration = metadata.duration; - if (duration <= 0.0) { - duration = - std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; - } - - constexpr double DYN_FPS_30 = 3.0; - constexpr double DYN_FPS_300 = 1.0; - constexpr double DYN_FPS_2400 = 0.5; - constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; - constexpr double MAX_DURATION = 2400.0; - - const double effective_duration = std::min(duration, MAX_DURATION); - - double target_fps = 0.0; - if (effective_duration <= 30.0) { - target_fps = DYN_FPS_30; - } else if (effective_duration <= 300.0) { - target_fps = DYN_FPS_300; - } else { - target_fps = DYN_FPS_2400; - } - - // extract_t = int(effective_duration * target_fps * temporal_patch_size) - int extract_t = static_cast(effective_duration * target_fps * - static_cast(temporal_patch_size)); - extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); - - const double duration_per_frame = 1.0 / metadata.fps; - std::vector timestamps(total_frames); - for (int i = 0; i < total_frames; ++i) { - timestamps[i] = static_cast(i) * duration_per_frame; - } - const int max_second = static_cast(duration); - - torch::Tensor frame_indices; - - if (total_frames < extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } else { - std::vector tmp; - tmp.reserve(static_cast(total_frames)); - double current_second = 0.0; - const double inv_fps = - 1.0 / (static_cast(temporal_patch_size) * target_fps); - - for (int frame_index = 0; frame_index < total_frames; frame_index++) { - if (timestamps[frame_index] >= current_second) { - current_second += inv_fps; - tmp.push_back(frame_index); - if (current_second >= static_cast(max_second)) { - break; - } - } - } - frame_indices = - torch::tensor(tmp, 
torch::TensorOptions().dtype(torch::kLong)); - } - int64_t len = frame_indices.size(0); - if (len < extract_t) { - int64_t start, end; - if (len == 0) { - start = 0; - end = std::max(total_frames - 1, 0); - } else { - start = frame_indices[0].item(); - end = frame_indices[len - 1].item(); - } - frame_indices = - torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); - } else if (len > extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } - - len = frame_indices.size(0); - std::unordered_set seen; - seen.reserve(static_cast(len) * 2); - std::vector uniq; - uniq.reserve(static_cast(len)); - - for (int64_t i = 0; i < len; ++i) { - auto idx = frame_indices[i].item(); - if (seen.insert(idx).second) { - uniq.push_back(idx); - } - } - - if (!uniq.empty() && (uniq.size() & 1)) { - uniq.push_back(uniq.back()); - } - - return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); -} - } // namespace xllm diff --git a/xllm/processors/image_processor.h b/xllm/processors/image_processor.h index f5b5f33ef..be7fac5a1 100644 --- a/xllm/processors/image_processor.h +++ b/xllm/processors/image_processor.h @@ -39,15 +39,6 @@ class ImageProcessor { virtual torch::Tensor normalize(const torch::Tensor& image, const std::vector& mean, const std::vector& std); - virtual torch::Tensor init_frames(const VideoMetadata& metadata); - virtual torch::Tensor sample_frames(const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames = -1, - double set_fps = -1.0); - virtual torch::Tensor GLM_sample_frames(const VideoMetadata& metadata, - int temporal_patch_size); }; } // namespace xllm diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp index e1639dd6b..cd30d8146 100644 --- a/xllm/processors/qwen2_vl_image_processor.cpp +++ b/xllm/processors/qwen2_vl_image_processor.cpp @@ -60,6 +60,72 @@ std::optional smart_resize(int height, } } // namespace +torch::Tensor Qwen2VLImageProcessor::sample_frames( + const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames, + double set_fps) { + if (set_fps > 0.0 && num_frames > 0) { + LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " + "use only one!"; + } + + double fps = set_fps; + + int total_num_frames = metadata.total_num_frames; + + if (num_frames > 0) { + double double_num_frames = + std::round(static_cast(num_frames) / temporal_patch_size) * + temporal_patch_size; + num_frames = static_cast(double_num_frames); + } else if (fps > 0.0) { + if (metadata.fps <= 0.0) { + LOG(FATAL) + << "Asked to sample `fps` frames per second but no video metadata " + "was provided which is required when sampling with `fps`. "; + } + + max_frames = + (std::min(max_frames, total_num_frames) / temporal_patch_size) * + temporal_patch_size; + double double_num_frames = + static_cast(total_num_frames) / metadata.fps * fps; + double_num_frames = std::min( + std::min(std::max(double_num_frames, static_cast(min_frames)), + static_cast(max_frames)), + static_cast(total_num_frames)); + double_num_frames = std::floor(double_num_frames / temporal_patch_size) * + temporal_patch_size; + + num_frames = static_cast(double_num_frames); + } + + if (num_frames > total_num_frames) { + LOG(FATAL) << "Video can't be sampled. 
The inferred num_frames=" + << num_frames << " exceeds total_num_frames=" << total_num_frames + << "."; + } + + if (num_frames > 0) { + std::vector indices; + indices.reserve(num_frames); + for (int i = 0; i < num_frames; ++i) { + int64_t k = static_cast( + (static_cast(i) * total_num_frames) / num_frames); + if (k >= total_num_frames) k = total_num_frames - 1; + indices.push_back(k); + } + return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); + } else { + return torch::arange(0, + static_cast(total_num_frames), + torch::TensorOptions().dtype(torch::kLong)); + } +} + Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) { image_mean_ = args.mm_image_normalize_mean(); image_std_ = args.mm_image_normalize_std(); @@ -93,8 +159,7 @@ Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) { bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); std::vector videos = inputs.get_decode_data(MMType::VIDEO); - std::vector video_meta_list = - inputs.get_video_metadata(MMType::VIDEO); + std::vector video_meta_list = inputs.get_video_metadata(); if (images.empty() && (videos.empty() || video_meta_list.empty())) { LOG(ERROR) << "no image/video tensor found."; @@ -270,7 +335,9 @@ bool Qwen2VLImageProcessor::process_video( /*num_frames=*/-1, /*set_fps=*/2.0); } else { - indices = this->init_frames(metadata); // default sample to 32 frames + indices = torch::arange(0, + static_cast(origin_video.size(0)), + torch::TensorOptions().dtype(torch::kLong)); } auto video = origin_video.index_select(/*dim=*/0, indices); int64_t sampled_total_frames = video.size(0); diff --git a/xllm/processors/qwen2_vl_image_processor.h b/xllm/processors/qwen2_vl_image_processor.h index 974ec83e5..3e35ac501 100644 --- a/xllm/processors/qwen2_vl_image_processor.h +++ b/xllm/processors/qwen2_vl_image_processor.h @@ -43,6 +43,12 @@ class Qwen2VLImageProcessor : public ImageProcessor { VideoMetadata& metadata, std::vector& pixel_values, std::vector& grids); + torch::Tensor sample_frames(const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames = -1, + double set_fps = -1.0); private: bool do_convert_rgb_ = true; @@ -66,7 +72,7 @@ class Qwen2VLImageProcessor : public ImageProcessor { std::unordered_map size_; int temporal_patch_size_ = 2; - bool do_sample_frame_ = false; + bool do_sample_frame_ = true; int min_frames_ = 4; int max_frames_ = 768; From d00f6040cc8961d7a25f703bae0d61aefd347fdf Mon Sep 17 00:00:00 2001 From: jindonghe1 Date: Fri, 28 Nov 2025 17:04:18 +0800 Subject: [PATCH 07/20] feat: support new model glm4. 
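
Adds a dense GLM4 decoder path: an ATB chatglm-based NPU layer
(npu_glm4_decoder_layer_impl) with post_self_attn / post_mlp layer norms, a
glm4.h causal-LM wrapper that uses interleaved (stacked) rotary embeddings
via the new get_chatglm_rotary_embedding helper, and the glm4 model-args
registration; the xllm_kernels download is bumped to 1.3.4 for the chatglm
layer.

Editorial sketch (not part of the original patch): with the defaults
registered below (num_attention_heads = 32, num_key_value_heads = 2,
head_dim = 128, hidden_size = 4096) on a single rank, merge_loaded_weights
packs the q/k/v projections along dim 0 into one tensor:

    // (32 + 2 + 2) * 128 = 4608 output rows
    qkv_weight: [4608, 4096]
    qkv_bias:   [4608]

so the fused QKV projection runs as a single matmul per layer.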
--- CMakeLists.txt | 6 +- xllm/core/layers/CMakeLists.txt | 1 + xllm/core/layers/glm4_decoder_layer.h | 45 ++ xllm/core/layers/npu/CMakeLists.txt | 2 + .../npu/npu_glm4_decoder_layer_impl.cpp | 395 ++++++++++++++++++ .../layers/npu/npu_glm4_decoder_layer_impl.h | 111 +++++ xllm/models/llm/glm4.h | 221 ++++++++++ xllm/models/llm/llm_model_base.h | 31 +- xllm/models/models.h | 1 + 9 files changed, 805 insertions(+), 8 deletions(-) create mode 100644 xllm/core/layers/glm4_decoder_layer.h create mode 100644 xllm/core/layers/npu/npu_glm4_decoder_layer_impl.cpp create mode 100644 xllm/core/layers/npu/npu_glm4_decoder_layer_impl.h create mode 100644 xllm/models/llm/glm4.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f18bca2ff..1bdcb25e5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,20 +32,20 @@ if(USE_NPU) if(DEVICE_TYPE STREQUAL "USE_A3") message("downloading a3 arm xllm kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a3.arm.rpm" + "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a3.arm.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) else() if(DEVICE_ARCH STREQUAL "ARM") message("downloading a2 arm xllm_kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.arm.rpm" + "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.arm.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) else() message("downloading a2 x86 xllm_kernels") file(DOWNLOAD - "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.x86.rpm" + "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.4-Linux.a2.x86.rpm" "${CMAKE_BINARY_DIR}/xllm_kernels.rpm" ) endif() diff --git a/xllm/core/layers/CMakeLists.txt b/xllm/core/layers/CMakeLists.txt index 6d56faccd..608cab254 100644 --- a/xllm/core/layers/CMakeLists.txt +++ b/xllm/core/layers/CMakeLists.txt @@ -63,6 +63,7 @@ cc_library( qwen3_vision_encode_layer.h qwen3_decoder_layer.h qwen3_moe_decoder_layer.h + glm4_decoder_layer.h rms_norm.h siglip_encoder_layer.h pos_embedding.h diff --git a/xllm/core/layers/glm4_decoder_layer.h b/xllm/core/layers/glm4_decoder_layer.h new file mode 100644 index 000000000..8fd399450 --- /dev/null +++ b/xllm/core/layers/glm4_decoder_layer.h @@ -0,0 +1,45 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include "npu/npu_glm4_decoder_layer_impl.h" + +namespace xllm { +namespace layer { + +#if defined(USE_NPU) +class Glm4DecoderLayer + : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = NpuGlm4DecoderLayerImpl; + + Glm4DecoderLayer(const ModelContext& context) + : ModuleHolder(std::make_shared(context)) {} +}; +#else +class Glm4DecoderLayer : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = Qwen2DecoderImpl; + + Glm4DecoderLayer(const ModelContext& context) + : ModuleHolder(std::make_shared(context)) {} +}; +#endif + +} // namespace layer +} // namespace xllm diff --git a/xllm/core/layers/npu/CMakeLists.txt b/xllm/core/layers/npu/CMakeLists.txt index 61f7759d9..d8aff02f2 100644 --- a/xllm/core/layers/npu/CMakeLists.txt +++ b/xllm/core/layers/npu/CMakeLists.txt @@ -23,6 +23,7 @@ cc_library( npu_llama_decoder_layer_impl.h npu_qwen2_decoder_layer_impl.h npu_qwen3_decoder_layer_impl.h + npu_glm4_decoder_layer_impl.h npu_rms_norm_impl.h npu_siglip_encoder_layer_impl.h SRCS @@ -43,6 +44,7 @@ cc_library( npu_llama_decoder_layer_impl.cpp npu_qwen2_decoder_layer_impl.cpp npu_qwen3_decoder_layer_impl.cpp + npu_glm4_decoder_layer_impl.cpp npu_rms_norm_impl.cpp npu_siglip_encoder_layer_impl.cpp DEPS diff --git a/xllm/core/layers/npu/npu_glm4_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_glm4_decoder_layer_impl.cpp new file mode 100644 index 000000000..26a87bae6 --- /dev/null +++ b/xllm/core/layers/npu/npu_glm4_decoder_layer_impl.cpp @@ -0,0 +1,395 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "npu_glm4_decoder_layer_impl.h" + +#include +#include + +#include + +#include "common/global_flags.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUException.h" + +namespace xllm { +namespace layer { + +enum DecoderLayerTensorId : int { + IN_NORM_WEIGHT = 0, // weight + IN_NORM_BIAS = 1, // bias + IN_NORM_NEW_WEIGHT = 2, // new weight + IN_NORM_NEW_BIAS = 3, // new bias + + IN_Q_WEIGHT = 4, // weight + IN_Q_BIAS = 5, // bias + IN_Q_DEQSCALE = 6, // deq_scale + IN_Q_OFFSET = 7, // offset + IN_Q_SCALE = 8, // scale + IN_Q_COMPRESS_IDX = 9, + + IN_K_WEIGHT = 10, // weight + IN_K_BIAS = 11, // bias + IN_K_DEQSCALE = 12, // deq_scale + IN_K_OFFSET = 13, // offset + IN_K_SCALE = 14, // scale + IN_K_COMPRESS_IDX = 15, + + IN_V_WEIGHT = 16, // weight + IN_V_BIAS = 17, // bias + IN_V_DEQSCALE = 18, // deq_scale + IN_V_OFFSET = 19, // offset + IN_V_SCALE = 20, // scale + IN_V_COMPRESS_IDX = 21, + + IN_ATTENTION_OUT_WEIGHT = 22, // weight + IN_ATTENTION_OUT_BIAS = 23, // bias + IN_ATTENTION_OUT_DEQSCALE = 24, // deq_scale + IN_ATTENTION_OUT_OFFSET = 25, // offset + IN_ATTENTION_OUT_SCALE = 26, // scale + IN_ATTENTION_OUT_COMPRESS_IDX = 27, + + IN_SELFOUT_NORM_WEIGHT = 28, // weight + IN_SELFOUT_NORM_BIAS = 29, // bias + IN_SELFOUT_NORM_NEW_WEIGHT = 30, // new weight + IN_SELFOUT_NORM_NEW_BIAS = 31, // new bias + + IN_MLP_GATEUP_WEIGHT = 32, // weight + IN_MLP_GATEUP_BIAS = 33, // bias + IN_MLP_GATEUP_DEQSCALE = 34, // deq_scale + IN_MLP_GATEUP_OFFSET = 35, // offset + IN_MLP_GATEUP_SCALE = 36, // scale + IN_MLP_GATEUP_COMPRESS_IDX = 37, + + IN_MLP_W1_WEIGHT = 38, // weight + IN_MLP_W1_BIAS = 39, // bias + IN_MLP_W1_DEQSCALE = 40, // deq_scale + IN_MLP_W1_OFFSET = 41, // offset + IN_MLP_W1_SCALE = 42, // scale + IN_MLP_W1_COMPRESS_IDX = 43, + + IN_MLP_CPROJ_WEIGHT = 44, // weight + IN_MLP_CPROJ_BIAS = 45, // bias + IN_MLP_CPROJ_DEQSCALE = 46, // deq_scale + IN_MLP_CPROJ_OFFSET = 47, // offset + IN_MLP_CPROJ_SCALE = 48, // scale + IN_MLP_CPROJ_COMPRESS_IDX = 49, + + IN_SELFIN_NORM_WEIGHT = 50, + IN_MLPOUT_NORM_WEIGHT = 51 +}; + +const uint64_t WEIGHT_COUNT_PER_LAYER = 52; + +static std::unordered_map WEIGHT_MAPPING = { + {"input_layernorm.weight", IN_NORM_WEIGHT}, + + {"self_attn.q_proj.weight", IN_Q_WEIGHT}, + {"self_attn.q_proj.bias", IN_Q_BIAS}, + + {"self_attn.k_proj.weight", IN_K_WEIGHT}, + {"self_attn.k_proj.bias", IN_K_BIAS}, + + {"self_attn.v_proj.weight", IN_V_WEIGHT}, + {"self_attn.v_proj.bias", IN_V_BIAS}, + + {"self_attn.o_proj.weight", IN_ATTENTION_OUT_WEIGHT}, + + {"post_attention_layernorm.weight", IN_SELFOUT_NORM_WEIGHT}, + + // mlp + {"mlp.gate_up_proj.weight", IN_MLP_GATEUP_WEIGHT}, + + {"mlp.down_proj.weight", IN_MLP_CPROJ_WEIGHT}, + + {"post_self_attn_layernorm.weight", IN_SELFIN_NORM_WEIGHT}, + {"post_mlp_layernorm.weight", IN_MLPOUT_NORM_WEIGHT} + +}; + +static std::map WEIGHT_SHARD = {{IN_Q_WEIGHT, 0}, + {IN_Q_BIAS, 0}, + {IN_K_WEIGHT, 0}, + {IN_K_BIAS, 0}, + {IN_V_WEIGHT, 0}, + {IN_V_BIAS, 0}, + {IN_ATTENTION_OUT_WEIGHT, 1}, + {IN_MLP_GATEUP_WEIGHT, 0}, + {IN_MLP_CPROJ_WEIGHT, 1}}; + +void NpuGlm4DecoderLayerImpl::param_from_args( + atb_speed::chatglm::ChatglmLayerParam& param, + const ModelArgs& args, + const ParallelArgs& parallel_args, + bool isPrefill) { + param.isFA = false; + param.enableSwiGLU = true; + + param.enableLcoc = false; + param.rmsnormQKNorm = false; + param.isPrefill = isPrefill; + param.isBF16 = args.dtype() == 
"bfloat16"; + param.enableSplitFuse = FLAGS_enable_chunked_prefill && isPrefill; + param.loraEnableGMM = false; + + param.linearTransposeType = {1, -1, -1, 1, 1, -1, 1}; // TODO + param.quantGroupSize = 0; + param.normEps = args.rms_norm_eps(); + param.numAttentionHeadsPerRank = args.n_heads() / parallel_args.world_size(); + param.hiddenSizePerAttentionHead = args.head_dim(); + std::optional optionalValue = args.n_kv_heads(); + param.numKeyValueHeadsPerRank = + static_cast(optionalValue.value()) / parallel_args.world_size(); + param.backend = FLAGS_communication_backend; + param.tensorParallelInfo = {parallel_args.rank(), + parallel_args.world_size(), + FLAGS_communication_backend}; + param.linearHasBias = {true, false, false, false}; + param.useQKNorm = false; + + param.numHiddenLayers = args.n_layers(); + param.usePostSelfAttnLayerNorm = true; + param.usePostMlpLayerNorm = true; + initialize_quantization_parameters(param); +} +void NpuGlm4DecoderLayerImpl::initialize_quantization_parameters( + atb_speed::chatglm::ChatglmLayerParam& param) { + param.linearDescs = {static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID), + static_cast(LinearTypeV2::INVALID)}; + param.packQuantType = {static_cast(PackType::ALL_FP), + static_cast(PackType::ALL_FP)}; + param.linearQuantType = {static_cast(LinearType::FP), + static_cast(LinearType::INVALID), + static_cast(LinearType::INVALID), + static_cast(LinearType::FP), + static_cast(LinearType::FP), + static_cast(LinearType::INVALID), + static_cast(LinearType::FP)}; +} + +NpuGlm4DecoderLayerImpl::NpuGlm4DecoderLayerImpl(const ModelContext& context) + : NpuBaseLayer(context) { + auto model_args = context.get_model_args(); + auto parallel_args = context.get_parallel_args(); + auto options = context.get_tensor_options(); + + param_from_args(prefill_param_, model_args, parallel_args, true); + param_from_args(decode_param_, model_args, parallel_args, false); + at_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER); + atb_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER); + placeholder_vec_ = {1}; + dtype_ = c10::typeMetaToScalarType(options.dtype()); + rank_id_ = parallel_args.rank(); + placeholder_ = atb_speed::Utils::AtTensor2Tensor( + torch::zeros({1}).to(device_).to(dtype_)); + at_placeholder_ = torch::zeros({1}).to(device_).to(dtype_); + for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { + at_weight_tensors_[i] = torch::zeros({1}).to(options); + } +} +void NpuGlm4DecoderLayerImpl::verify_loaded_weights() const { + for (const auto& [name, index] : WEIGHT_MAPPING) { + CHECK(at_weight_tensors_[index].sizes() != std::vector({1})) + << "weight is not loaded for " << name; + } +} + +void NpuGlm4DecoderLayerImpl::merge_loaded_weights() { + at_weight_tensors_[IN_Q_WEIGHT] = + torch::cat({at_weight_tensors_[IN_Q_WEIGHT], + at_weight_tensors_[IN_K_WEIGHT], + at_weight_tensors_[IN_V_WEIGHT]}, + 0) + .contiguous(); + at_weight_tensors_[IN_Q_BIAS] = torch::cat({at_weight_tensors_[IN_Q_BIAS], + at_weight_tensors_[IN_K_BIAS], + at_weight_tensors_[IN_V_BIAS]}, + 0) + .contiguous(); + + for (auto idx : + {IN_MLP_W1_WEIGHT, IN_K_WEIGHT, IN_V_WEIGHT, IN_K_BIAS, IN_V_BIAS}) { + at_weight_tensors_[idx] = at_placeholder_; + } + + c10_npu::NPUCachingAllocator::emptyCache(); + for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { + atb_weight_tensors_[i] = + atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[i]); + } + + 
init_layer(); +} + +void NpuGlm4DecoderLayerImpl::load_state_dict(const StateDict& state_dict) { + for (const auto& [name, index] : WEIGHT_MAPPING) { + if (WEIGHT_SHARD.find(index) != WEIGHT_SHARD.end()) { + set_weight(state_dict, name, index, WEIGHT_SHARD[index]); + } else { + set_weight(state_dict, name, index); + } + } +} + +int64_t NpuGlm4DecoderLayerImpl::init_layer() { + init_attn_mask(); + name_ = "glm4_decoder_layer"; + model_name_ = "glm4"; + CHECK_OPERATION_STATUS_RETURN(init_node(prefill_node_, prefill_param_)); + CHECK_OPERATION_STATUS_RETURN(init_node(decode_node_, decode_param_)); + + return atb::NO_ERROR; +} + +int64_t NpuGlm4DecoderLayerImpl::init_attn_mask() { + torch::Dtype dtype = + prefill_param_.isBF16 ? torch::kBFloat16 : torch::kFloat16; + decode_attn_mask_ = torch::zeros({1}).to(device_).to(dtype); + + return atb::NO_ERROR; +} + +int64_t NpuGlm4DecoderLayerImpl::init_node( + atb_speed::Model::Node& node, + atb_speed::chatglm::ChatglmLayerParam& param) { + atb::Operation* operation = nullptr; + atb_speed::chatglm::ChatglmDecoderLayer decoder_layer(param); + decoder_layer.BuildGraph(&operation); + node.operation.reset(operation); + if (node.operation == nullptr) { + LOG(ERROR) << "node.operation is null"; + return -1; + } + if (node.operation->GetInputNum() < 1) { + LOG(ERROR) << "Can not resize number which is smaller than 1"; + return -1; + } + node.inTensors.resize(node.operation->GetInputNum()); + node.outTensors.resize(1); + size_t inTensorId = 1; + + for (size_t weightTensorId = 0; weightTensorId < WEIGHT_COUNT_PER_LAYER; + ++weightTensorId) { + node.inTensors.at(weightTensorId) = &atb_weight_tensors_[weightTensorId]; + } + node.variantPack.inTensors.reserve(node.inTensors.size()); + node.variantPack.inTensors.resize(node.inTensors.size()); + node.variantPack.outTensors.reserve(1); + node.variantPack.outTensors.resize(1); + + return atb::NO_ERROR; +} + +torch::Tensor NpuGlm4DecoderLayerImpl::forward(torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& attn_mask, + KVCache& kv_cache, + ModelInputParams& input_params, + aclrtEvent* event, + std::atomic* event_flag, + int node_id) { + atb::Status st; + if (input_params.decode_seq_range.second != + input_params.q_seq_lens.size(0) - 1) { + // if (input_params.empty_kv_cache) { + // mstxRangeId id = mstxRangeStartA("prefill build variant", nullptr); + build_node_variant_pack(prefill_node_, + x, + cos_pos, + sin_pos, + attn_mask, + kv_cache, + input_params, + true); + // mstxRangeEnd(id); + st = execute_node(prefill_node_, node_id, event, event_flag); + LOG_IF(FATAL, st != 0) << model_name_ + << "excute prefill layer fail, error code: " << st; + } else { + build_node_variant_pack(decode_node_, + x, + cos_pos, + sin_pos, + decode_attn_mask_, + kv_cache, + input_params, + false); + st = execute_node(decode_node_, node_id + 1000, event, event_flag); + LOG_IF(FATAL, st != 0) << model_name_ + << "excute decode layer fail, error code: " << st; + } + + return at_placeholder_; +} + +void NpuGlm4DecoderLayerImpl::build_node_variant_pack( + atb_speed::Model::Node& node, + torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + at::Tensor& attn_mask, + KVCache& kv_cache, + ModelInputParams& input_params, + bool is_prefill) { + internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x); + // std::cout<<"node.variantPack.inTensors.size:"< +#include +#else +#include +#include +#endif + +#include + +#include + +#include "atb/atb_infer.h" +#include "framework/kv_cache/kv_cache.h" 
+#include "framework/model/model_input_params.h" +#include "framework/model_context.h" +#include "framework/state_dict/state_dict.h" +#include "nlohmann/json.hpp" +#include "npu_base_layer.h" +#include "pytorch/adapter/utils/utils.h" +#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" +#include "xllm_kernels/core/include/atb_speed/base/model.h" +#include "xllm_kernels/core/include/atb_speed/log.h" +#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" +#include "xllm_kernels/models/glm/layer/decoder_layer.h" + +namespace xllm { +namespace layer { + +class NpuGlm4DecoderLayerImpl : public NpuBaseLayer { + public: + explicit NpuGlm4DecoderLayerImpl(const ModelContext& context); + + ~NpuGlm4DecoderLayerImpl() {}; + + virtual void load_state_dict(const StateDict& state_dict) override; + + virtual void verify_loaded_weights() const override; + + virtual void merge_loaded_weights() override; + + virtual int64_t init_layer() override; + + torch::Tensor forward(torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& attn_mask, + KVCache& kv_cache, + ModelInputParams& input_params, + aclrtEvent* event = nullptr, + std::atomic* event_flag = nullptr, + int node_id = 0); + + private: + void param_from_args(atb_speed::chatglm::ChatglmLayerParam& param, + const ModelArgs& args, + const ParallelArgs& parallel_args, + bool isPrefill); + + void build_node_variant_pack(atb_speed::Model::Node& node, + torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& attn_mask, + KVCache& kv_cache, + ModelInputParams& input_params, + bool is_prefill); + + void initialize_quantization_parameters( + atb_speed::chatglm::ChatglmLayerParam& param); + + int64_t init_node(atb_speed::Model::Node& node, + atb_speed::chatglm::ChatglmLayerParam& param); + + int64_t init_attn_mask(); + + atb_speed::Model::Node prefill_node_; + atb_speed::Model::Node decode_node_; + std::string model_name_; + atb_speed::chatglm::ChatglmLayerParam prefill_param_; + atb_speed::chatglm::ChatglmLayerParam decode_param_; + atb::Tensor internal_tensors_; + atb::Tensor placeholder_; + + at::Tensor decode_attn_mask_; + + at::Tensor at_placeholder_; + + int device_id_; + int32_t layer_id_; + int rank_id_; +}; + +} // namespace layer +} // namespace xllm diff --git a/xllm/models/llm/glm4.h b/xllm/models/llm/glm4.h new file mode 100644 index 000000000..db657dac5 --- /dev/null +++ b/xllm/models/llm/glm4.h @@ -0,0 +1,221 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include "core/layers/glm4_decoder_layer.h" +#include "llm_model_base.h" + +namespace xllm { + +class Glm4DecoderLayerImpl + : public LlmDecoderLayerImplBase { + public: + Glm4DecoderLayerImpl(const ModelContext& context) + : LlmDecoderLayerImplBase(context) {} +}; +TORCH_MODULE(Glm4DecoderLayer); + +class Glm4ModelImpl : public LlmModelImplBase { + public: + Glm4ModelImpl(const ModelContext& context) + : LlmModelImplBase("glm4", context.get_model_args()) { + // register submodules + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + auto parallel_args = context.get_parallel_args(); + auto dp_local_tp_size = + parallel_args.world_size() / parallel_args.dp_size(); + dp_rank_ = parallel_args.rank() / dp_local_tp_size; + + blocks_ = register_module("layers", torch::nn::ModuleList()); + layers_.reserve(model_args.n_layers()); + norm_ = register_module("norm", layer::RmsNorm(context)); + embed_tokens_ = + register_module("embed_tokens", layer::WordEmbedding(context)); +#if defined(USE_NPU) + atb_pos_emb_ = layer::PosEmbedding(context); +#endif + cos_sin_ = + get_chatglm_rotary_embedding(64, + model_args.max_position_embeddings(), + model_args.rope_theta(), + options); +#if defined(USE_NPU) + int32_t mask_value = FLAGS_enable_chunked_prefill ? -9984 : 1; + attn_mask_ = layer::AttentionMask(options.device(), + options.dtype().toScalarType(), + /*mask_value=*/mask_value); +#endif + + for (int32_t i = 0; i < model_args.n_layers(); i++) { + auto block = Glm4DecoderLayer(context); + layers_.push_back(block); + blocks_->push_back(block); + } + } + + virtual torch::Tensor forward(torch::Tensor tokens, + torch::Tensor positions, + std::vector& kv_caches, + const ModelInputParams& input_params) { + ModelInputParams& input_params_new = + const_cast(input_params); + + if (tokens.numel() == 0) { + tokens = torch::tensor({1}).to(torch::kInt32).to(tokens.device()); + positions = torch::tensor({0}).to(torch::kInt32).to(tokens.device()); + } + auto inputs_embeds = input_params.input_embedding; + torch::Tensor h; + if (inputs_embeds.defined()) { + h = inputs_embeds; + } else { + h = embed_tokens_(tokens, 0); + } + + auto target_cos_sin = atb_pos_emb_(cos_sin_, positions, 0); + auto target_cos_sin_chunks = target_cos_sin.chunk(/*chunks=*/2, /*dim=*/-1); + auto cos_pos = target_cos_sin_chunks[0].contiguous(); + + auto sin_pos = target_cos_sin_chunks[1].contiguous(); + + if (positions.dim() == 2) { // mrope + auto apply = [this](torch::Tensor x) { + auto freqs_t = x[0].clone(); + for (int dim_idx = 1; dim_idx <= 2; ++dim_idx) { + int64_t offset = dim_idx; + int64_t section_len = mrope_section_[dim_idx]; + int64_t length = section_len * 3; + auto idx_first_half = torch::arange(offset, length, 3, torch::kLong); + auto idx_second_half = torch::arange(offset, length, 3, torch::kLong); + auto idx_tensor = + torch::cat({idx_first_half, idx_second_half}, 0).to(x.device()); + // freqs_t[..., idx] = freqs[dim_idx][..., idx] + auto src = x[dim_idx].index_select(-1, idx_tensor); + freqs_t.index_copy_(-1, idx_tensor, src); + } + return freqs_t; + }; + cos_pos = apply(cos_pos.reshape( + {positions.sizes().front(), -1, cos_pos.sizes().back()})); + sin_pos = apply(sin_pos.reshape( + {positions.sizes().front(), -1, sin_pos.sizes().back()})); + } + + torch::Tensor attn_mask; + if (FLAGS_enable_chunked_prefill) { + int max_kv_seq = input_params.kv_max_seq_len; + int num_sequences = 
input_params.num_sequences; + if (num_sequences > 0) { + std::vector req_mask_vec; + req_mask_vec.reserve(num_sequences); + + for (int j = 0; j < num_sequences; j++) { + auto mask = + attn_mask_.gen_append_mask(input_params.q_seq_lens_vec[j], + input_params.kv_seq_lens_vec[j], + max_kv_seq, + cos_pos.dtype().toScalarType(), + cos_pos.device()); + req_mask_vec.emplace_back(mask); + } + attn_mask = torch::cat(req_mask_vec, 0); + } + } else { + if (FLAGS_num_speculative_tokens == 0 || + input_params.global_empty_kv_cache) { + attn_mask = attn_mask_.get_attn_mask( + 128, cos_pos.dtype().toScalarType(), cos_pos.device()); + } else { + attn_mask = attn_mask_.gen_free_mask(FLAGS_num_speculative_tokens + 1, + cos_pos.dtype().toScalarType(), + cos_pos.device()); + } + } + + for (size_t i = 0; i < layers_.size(); i++) { + aclrtEvent* event{nullptr}; + std::atomic* event_flag{nullptr}; + + if (input_params.layer_synchronizer != nullptr) { + event = input_params.layer_synchronizer->get_event(i); + event_flag = input_params.layer_synchronizer->get_event_flag(i); + } + if (input_params.layer_wise_load_synchronizer != nullptr) { + if (!input_params.layer_wise_load_synchronizer->synchronize_layer(i)) { + return torch::Tensor(); + } + } + + auto& layer = layers_[i]; + + layer(h, + cos_pos, + sin_pos, + attn_mask, + kv_caches[i], + input_params_new, + i, + event, + event_flag); + } + return norm_(h, 0); + } + + private: + torch::Tensor viusal_pos_mask_; +}; +TORCH_MODULE(Glm4Model); + +class Glm4ForCausalLMImpl : public LlmForCausalLMImplBase { + public: + Glm4ForCausalLMImpl(const ModelContext& context) + : LlmForCausalLMImplBase(context) {} +}; +TORCH_MODULE(Glm4ForCausalLM); + +// register the causal model +REGISTER_CAUSAL_MODEL(glm4, Glm4ForCausalLM); + +// register the model args +REGISTER_MODEL_ARGS(glm4, [&] { + LOAD_ARG_OR(model_type, "model_type", "glm4"); + + LOAD_ARG_OR(dtype, "torch_dtype", ""); + LOAD_ARG_OR(attention_bias, "attention_bias", true); + LOAD_ARG_OR(attention_dropout, "attention_dropout", 0.0f); + LOAD_ARG_OR(eos_token_id_vec, "eos_token_id", std::vector{151329}); + LOAD_ARG_OR(head_dim, "head_dim", 128); + LOAD_ARG_OR(hidden_act, "hidden_act", "silu"); + LOAD_ARG_OR(hidden_size, "hidden_size", 4096); + LOAD_ARG_OR(initializer_range, "initializer_range", 0.02f); + LOAD_ARG_OR(intermediate_size, "intermediate_size", 13696); + LOAD_ARG_OR(max_position_embeddings, "max_position_embeddings", 32768); + LOAD_ARG_OR(n_heads, "num_attention_heads", 32); + LOAD_ARG_OR(n_layers, "num_hidden_layers", 40); + LOAD_ARG_OR(n_kv_heads, "num_key_value_heads", 2); + LOAD_ARG_OR(pad_token_id, "pad_token_id", 151329); + LOAD_ARG_OR(rms_norm_eps, "rms_norm_eps", 1e-5); + LOAD_ARG_OR(rope_theta, "rope_theta", 10000.0f); + LOAD_ARG_OR(tie_word_embeddings, "tie_word_embeddings", false); + LOAD_ARG_OR(vocab_size, "vocab_size", 151552); + + SET_ARG(stop_token_ids, + std::unordered_set(args->eos_token_id_vec().begin(), + args->eos_token_id_vec().end())); +}); + +} // namespace xllm diff --git a/xllm/models/llm/llm_model_base.h b/xllm/models/llm/llm_model_base.h index 441b52004..25b0b0000 100644 --- a/xllm/models/llm/llm_model_base.h +++ b/xllm/models/llm/llm_model_base.h @@ -50,10 +50,11 @@ limitations under the License. 
namespace xllm { -torch::Tensor get_concat_rotary_embedding(int64_t dim, - int64_t seq_len, - double rope_theta, - const torch::TensorOptions& options) { +torch::Tensor compute_rotary_embedding(int64_t dim, + int64_t seq_len, + double rope_theta, + const torch::TensorOptions& options, + bool use_cat) { auto options_new = torch::device(options.device()).dtype(at::ScalarType::Double); auto inv_freq = @@ -62,7 +63,12 @@ torch::Tensor get_concat_rotary_embedding(int64_t dim, auto seq_idx = torch::arange(seq_len, options_new); auto freqs = torch::ger(seq_idx, inv_freq).to(torch::kFloat32); - auto emb = torch::cat({freqs, freqs}, -1); + torch::Tensor emb; + if (use_cat) { + emb = torch::cat({freqs, freqs}, -1); + } else { + emb = torch::stack({freqs, freqs}, -1); + } auto rope_cos = torch::cos(emb); auto rope_sin = torch::sin(emb); @@ -81,6 +87,21 @@ torch::Tensor get_concat_rotary_embedding(int64_t dim, return torch::cat(cos_sin, -1); } +torch::Tensor get_concat_rotary_embedding(int64_t dim, + int64_t seq_len, + double rope_theta, + const torch::TensorOptions& options) { + return compute_rotary_embedding(dim, seq_len, rope_theta, options, true); +} + +torch::Tensor get_chatglm_rotary_embedding( + int64_t dim, + int64_t seq_len, + double rope_theta, + const torch::TensorOptions& options) { + return compute_rotary_embedding(dim, seq_len, rope_theta, options, false); +} + template class LlmDecoderLayerImplBase : public torch::nn::Module { public: diff --git a/xllm/models/models.h b/xllm/models/models.h index 7b318e484..48ab4ba88 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -27,6 +27,7 @@ limitations under the License. #include "llm/deepseek_v2.h" // IWYU pragma: keep #include "llm/deepseek_v2_mtp.h" // IWYU pragma: keep #include "llm/deepseek_v3.h" // IWYU pragma: keep +#include "llm/glm4.h" // IWYU pragma: keep #include "llm/glm4_moe.h" // IWYU pragma: keep #include "llm/glm4_moe_mtp.h" // IWYU pragma: keep #include "llm/kimi_k2.h" // IWYU pragma: keep From 73cd22c0601193995de476aed85f6a04ef387486 Mon Sep 17 00:00:00 2001 From: xiongjun3 Date: Sun, 30 Nov 2025 15:50:49 +0800 Subject: [PATCH 08/20] feat: add glm4v_moe.h for GLM-4.6V-Air. 
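
Adds a Glm4vMoeForConditionalGeneration skeleton for GLM-4.6V-Air: the
vision tower is still commented out, so the wrapper currently just runs
token embeddings through Glm4MoeForCausalLM (whose load_model now accepts a
weight prefix, here "model.language_model."). Image/video start and end
token ids are added to ModelArgs, and the glm4v_moe arguments are read from
the nested text_config / vision_config sections of the model's config.

Illustrative sketch only (keys mirror the LOAD_ARG_OR calls below; the
literal itself is not part of this patch) of the nesting the loader expects:

    #include <nlohmann/json.hpp>

    nlohmann::json cfg = {
        {"model_type", "glm4v_moe"},
        {"image_token_id", 151363},
        {"video_token_id", 151364},
        {"text_config",
         {{"hidden_size", 4096},
          {"num_hidden_layers", 46},
          {"num_attention_heads", 96},
          {"num_key_value_heads", 8},
          {"rope_scaling", {{"type", "mrope"}}}}},
        {"vision_config",
         {{"depth", 24}, {"hidden_size", 1536}, {"patch_size", 14}}}};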
--- xllm/core/framework/model/model_args.h | 6 + xllm/models/llm/glm4_moe.h | 19 ++- xllm/models/models.h | 3 +- xllm/models/vlm/glm4v_moe.h | 210 +++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 4 deletions(-) create mode 100644 xllm/models/vlm/glm4v_moe.h diff --git a/xllm/core/framework/model/model_args.h b/xllm/core/framework/model/model_args.h index c8c421904..2f3ce9d96 100644 --- a/xllm/core/framework/model/model_args.h +++ b/xllm/core/framework/model/model_args.h @@ -136,6 +136,12 @@ struct ModelArgs { PROPERTY(int32_t, image_token_id) = 0; PROPERTY(int32_t, video_token_id) = 0; + // glm4v moe + PROPERTY(int32_t, image_start_token_id) = 0; + PROPERTY(int32_t, image_end_token_id) = 0; + PROPERTY(int32_t, video_start_token_id) = 0; + PROPERTY(int32_t, video_end_token_id) = 0; + PROPERTY(std::string, vision_custom_adapter); PROPERTY(int32_t, vision_max_slice_nums) = 0; diff --git a/xllm/models/llm/glm4_moe.h b/xllm/models/llm/glm4_moe.h index f9e582838..401931ceb 100644 --- a/xllm/models/llm/glm4_moe.h +++ b/xllm/models/llm/glm4_moe.h @@ -117,6 +117,14 @@ class Glm4MoeModelImpl : public torch::nn::Module { } } + torch::Tensor get_input_embeddings(torch::Tensor input_ids) { +#if defined(USE_NPU) + return embed_tokens_(input_ids, 0); +#else + return embed_tokens_(input_ids); +#endif + } + // tokens: [num_tokens] // positions: [num_tokens] token pos in the sequence torch::Tensor forward(torch::Tensor tokens, @@ -260,6 +268,10 @@ class Glm4MoeForCausalLMImpl : public torch::nn::Module { lm_head_ = register_module("lm_head", layer::LmHead(context)); } + torch::Tensor get_input_embeddings(torch::Tensor input_ids) { + return model_->get_input_embeddings(input_ids); + } + // tokens: [num_tokens] // positions: [num_tokens] token pos in the sequence // returns: [num_tokens, hidden_size] @@ -280,14 +292,15 @@ class Glm4MoeForCausalLMImpl : public torch::nn::Module { return lm_head_(hidden_states, seleted_idxes, 0); } - void load_model(std::unique_ptr loader) { + void load_model(std::unique_ptr loader, + std::string prefix = "model." /*llm model weight prefix*/) { for (const auto& state_dict : loader->get_state_dicts()) { - model_->load_state_dict(state_dict->get_dict_with_prefix("model.")); + model_->load_state_dict(state_dict->get_dict_with_prefix(prefix)); lm_head_->load_state_dict(state_dict->get_dict_with_prefix("lm_head.")); } // verify - model_->verify_loaded_weights("model."); + model_->verify_loaded_weights(prefix); lm_head_->verify_loaded_weights("lm_head."); model_->merge_loaded_weights(); diff --git a/xllm/models/models.h b/xllm/models/models.h index 48ab4ba88..f79a9ed99 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -34,7 +34,8 @@ limitations under the License. #include "llm/llama.h" // IWYU pragma: keep #include "llm/llama3.h" // IWYU pragma: keep #include "llm/qwen3_embedding.h" // IWYU pragma: keep -#include "vlm/glm4_vl.h" // IWYU pragma: keep +#include "vlm/glm4v.h" // IWYU pragma: keep +#include "vlm/glm4v_moe.h" // IWYU pragma: keep #include "vlm/minicpmv.h" // IWYU pragma: keep #include "vlm/qwen2_5_vl.h" // IWYU pragma: keep #include "vlm/qwen3_vl.h" // IWYU pragma: keep diff --git a/xllm/models/vlm/glm4v_moe.h b/xllm/models/vlm/glm4v_moe.h new file mode 100644 index 000000000..4452abdce --- /dev/null +++ b/xllm/models/vlm/glm4v_moe.h @@ -0,0 +1,210 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "core/framework/kv_cache/kv_cache.h" +#include "core/framework/model/model_input_params.h" +#include "core/framework/model_context.h" +#include "core/layers/lm_head.h" +#include "core/layers/rms_norm.h" +#include "models/llm/glm4_moe.h" +#include "models/model_registry.h" +#include "processors/input_processor.h" +#include "xllm_kernels/core/include/atb_speed/log.h" + +namespace xllm { + +class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { + public: + Glm4vMoeForConditionalGenerationImpl(const ModelContext& context) + : model_args_(context.get_model_args()), + options_(context.get_tensor_options()) { + // visual_ = register_module("visual", Glm4VisionTransformer(context)); + + language_model_ = + register_module("language_model", Glm4MoeForCausalLM(context)); + } + + torch::Tensor get_input_embeddings( + torch::Tensor input_ids, + // const std::optional& image_input, + // const std::optional& video_input, + const ModelInputParams& input_params) { + auto inputs_embeds = language_model_->get_input_embeddings(input_ids); + // if (image_input) { + // // visual + // auto [image_embeds, deep_stacks] = + // visual_(image_input->pixel_values.to(options_), + // image_input->image_grid_thw, + // input_params); + // input_params.deep_stacks = deep_stacks; + // // merge + // auto is_multimodal = torch::isin(input_ids, + // model_args_.image_token_id()); input_params.visual_pos_masks = + // is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); + // } + return inputs_embeds; + } + + torch::Tensor forward(const torch::Tensor& tokens, + const torch::Tensor& positions, + std::vector& kv_caches, + const ModelInputParams& input_params) { + torch::NoGradGuard no_grad; + // const auto& mm_data = input_params.mm_data; + // torch::Tensor pixel_values; + // if (const auto& res = mm_data.get("pixel_values")) + // pixel_values = res.value(); + + // torch::Tensor image_grid_thw; + // if (const auto& res = mm_data.get("image_grid_thw")) + // image_grid_thw = res.value(); + // std::optional image_inputs; + // std::optional video_inputs; + + // if (pixel_values.defined() && image_grid_thw.defined()) + // image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; + + auto inputs_embeds = get_input_embeddings(tokens, input_params); + input_params.input_embedding = inputs_embeds; + auto emb = language_model_(tokens, positions, kv_caches, input_params); + + return emb; + } + + torch::Tensor logits(const torch::Tensor& hidden_states, + const torch::Tensor& seleted_idxes) { + return language_model_->logits(hidden_states, seleted_idxes); + } + + void load_model(std::unique_ptr loader) { + // for (const auto& state_dict : loader->get_state_dicts()) { + // visual_->load_state_dict( + // state_dict->get_dict_with_prefix("model.visual.")); + // } + // // verify + // visual_->verify_loaded_weights("model.visual."); + // visual_->merge_loaded_weights(); + if (!model_args_.image_embedding_mode()) { + 
language_model_->load_model(std::move(loader), "model.language_model."); + } + } + + layer::LmHead get_lm_head() { return language_model_->get_lm_head(); } + void set_lm_head(layer::LmHead& head) { language_model_->set_lm_head(head); } + + layer::WordEmbedding get_word_embedding() { + return language_model_->get_word_embedding(); + } + + void set_word_embedding(layer::WordEmbedding& word_embedding) { + language_model_->set_word_embedding(word_embedding); + } + + private: + ModelArgs model_args_; + torch::TensorOptions options_; + // Glm4VisionTransformer visual_{nullptr}; + Glm4MoeForCausalLM language_model_{nullptr}; +}; +TORCH_MODULE(Glm4vMoeForConditionalGeneration); + +// REGISTER_INPUT_PROCESSOR(glm4v_moe, GLM4VInputProcessor); +REGISTER_CAUSAL_VLM_MODEL(glm4v_moe, Glm4vMoeForConditionalGeneration); +// REGISTER_IMAGE_PROCESSOR(glm4v_moe, Glm4vImageProcessor); +// register the model args +REGISTER_MODEL_ARGS(glm4v_moe, [&] { + LOAD_ARG_OR(model_type, "model_type", "glm4v_moe"); + LOAD_ARG_OR(image_start_token_id, "image_start_token_id", 151339); + LOAD_ARG_OR(image_end_token_id, "image_end_token_id", 151340); + LOAD_ARG_OR(video_start_token_id, "video_start_token_id", 151341); + LOAD_ARG_OR(video_end_token_id, "video_end_token_id", 151342); + LOAD_ARG_OR(image_token_id, "image_token_id", 151363); + LOAD_ARG_OR(video_token_id, "video_token_id", 151364); + LOAD_ARG_OR(tie_word_embeddings, "tie_word_embeddings", false); + + // text config + LOAD_ARG_OR(vocab_size, "text_config.vocab_size", 151552); + // LOAD_ARG_OR(pad_token_id, "text_config.pad_token_id", 151329); + LOAD_ARG_OR( + eos_token_id_vec, "text_config.eos_token_id", std::vector{151329}); + LOAD_ARG_OR_FUNC(head_dim, "text_config.head_dim", [&] { + return args->hidden_size() / args->n_heads(); + }); + LOAD_ARG_OR(attention_bias, "text_config.attention_bias", true); + LOAD_ARG_OR(attention_dropout, "text_config.attention_dropout", 0.0f); + LOAD_ARG_OR(first_k_dense_replace, "text_config.first_k_dense_replace", 1); + LOAD_ARG_OR(hidden_act, "text_config.hidden_act", "silu"); + LOAD_ARG_OR(hidden_size, "text_config.hidden_size", 4096); + LOAD_ARG_OR(initializer_range, "text_config.initializer_range", 0.02); + LOAD_ARG_OR(intermediate_size, "text_config.intermediate_size", 10944); + LOAD_ARG_OR( + max_position_embeddings, "text_config.max_position_embeddings", 131072); + LOAD_ARG_OR(moe_intermediate_size, "text_config.moe_intermediate_size", 1408); + LOAD_ARG_OR(n_group, "text_config.n_group", 1); + LOAD_ARG_OR(num_experts, "text_config.n_routed_experts", 128); + LOAD_ARG_OR(n_shared_experts, "text_config.n_shared_experts", 1); + LOAD_ARG_OR(norm_topk_prob, "text_config.norm_topk_prob", true); + LOAD_ARG_OR(n_heads, "text_config.num_attention_heads", 96); + LOAD_ARG_OR(num_experts_per_tok, "text_config.num_experts_per_tok", 8); + LOAD_ARG_OR(n_layers, "text_config.num_hidden_layers", 46); + LOAD_ARG_OR(n_kv_heads, "text_config.num_key_value_heads", 8); + // LOAD_ARG_OR(partial_rotary_factor, "text_config.partial_rotary_factor", + // 0.5); + LOAD_ARG_OR(rms_norm_eps, "text_config.rms_norm_eps", 1e-05); + LOAD_ARG_OR(dtype, "text_config.dtype", "bfloat16"); + LOAD_ARG_OR(rope_scaling_rope_type, "text_config.rope_scaling.type", "mrope"); + LOAD_ARG(rope_scaling_mrope_section, + "text_config.rope_scaling.mrope_section"); + LOAD_ARG_OR(rope_theta, "text_config.rope_theta", 500000.0f); + LOAD_ARG_OR(routed_scaling_factor, "text_config.routed_scaling_factor", 1.0); + LOAD_ARG_OR(topk_group, "text_config.topk_group", 1); + // 
LOAD_ARG_OR(use_cache, "text_config.use_cache", true); + LOAD_ARG_OR(use_qk_norm, "text_config.use_qk_norm", false); + + // vision config + // LOAD_ARG_OR(mm_attention_bias, "vision_config.attention_bias", false); + // LOAD_ARG_OR(mm_attention_dropout, "vision_config.attention_dropout", 0.0f); + LOAD_ARG_OR(mm_num_hidden_layers, "vision_config.depth", 24); + LOAD_ARG_OR(mm_hidden_act, "vision_config.hidden_act", "silu"); + LOAD_ARG_OR(mm_hidden_size, "vision_config.hidden_size", 1536); + LOAD_ARG_OR(mm_image_size, "vision_config.image_size", 336); + LOAD_ARG_OR(mm_num_channels, "vision_config.in_channels", 3); + LOAD_ARG_OR(mm_initializer_range, "vision_config.initializer_range", 0.02); + LOAD_ARG_OR(mm_intermediate_size, "vision_config.intermediate_size", 10944); + LOAD_ARG_OR(mm_num_attention_heads, "vision_config.num_heads", 12); + LOAD_ARG_OR(mm_projection_dim, "vision_config.out_hidden_size", 4096); + LOAD_ARG_OR(mm_patch_size, "vision_config.patch_size", 14); + // LOAD_ARG_OR(mm_rms_norm_eps, "text_config.rms_norm_eps", 1e-05); + LOAD_ARG_OR(mm_spatial_merge_size, "vision_config.spatial_merge_size", 2); + LOAD_ARG_OR(mm_temporal_patch_size, "vision_config.temporal_patch_size", 2); + LOAD_ARG_OR_FUNC(mm_head_dim, "head_dim", [&] { + return args->mm_hidden_size() / args->mm_num_attention_heads(); + }); + + SET_ARG(stop_token_ids, + std::unordered_set(args->eos_token_id_vec().begin(), + args->eos_token_id_vec().end())); +}); +} // namespace xllm From d3e7baf268bbd2f61842ecb86332903c4bf37bc6 Mon Sep 17 00:00:00 2001 From: chenxiaoyu8 Date: Tue, 2 Dec 2025 12:13:57 +0800 Subject: [PATCH 09/20] feat: add glm4_6_vl input processor. --- xllm/models/models.h | 1 + 1 file changed, 1 insertion(+) diff --git a/xllm/models/models.h b/xllm/models/models.h index f79a9ed99..55a6a3b10 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -36,6 +36,7 @@ limitations under the License. #include "llm/qwen3_embedding.h" // IWYU pragma: keep #include "vlm/glm4v.h" // IWYU pragma: keep #include "vlm/glm4v_moe.h" // IWYU pragma: keep +#include "vlm/glm4_vl.h" // IWYU pragma: keep #include "vlm/minicpmv.h" // IWYU pragma: keep #include "vlm/qwen2_5_vl.h" // IWYU pragma: keep #include "vlm/qwen3_vl.h" // IWYU pragma: keep From 96e605bc8e2a59d4c7af163b561de39f7d1d9a4e Mon Sep 17 00:00:00 2001 From: "wangziyue.28" Date: Wed, 3 Dec 2025 15:47:42 +0800 Subject: [PATCH 10/20] feat: move sample_frames from image_processor. --- xllm/core/framework/chat_template/jinja_chat_template.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xllm/core/framework/chat_template/jinja_chat_template.cpp b/xllm/core/framework/chat_template/jinja_chat_template.cpp index fcd1f2166..920b768c4 100644 --- a/xllm/core/framework/chat_template/jinja_chat_template.cpp +++ b/xllm/core/framework/chat_template/jinja_chat_template.cpp @@ -137,7 +137,11 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content( for (const auto& item : vec) { nlohmann::ordered_json item_json; - item_json["type"] = item.type; + if (item.type == "video_url") { + item_json["type"] = "video"; + } else { + item_json["type"] = item.type; + } if (item.type == "text") { item_json["text"] = item.text; From 48284cece9c9e6e50b655718d8919038d8eb3b3f Mon Sep 17 00:00:00 2001 From: jindonghe1 Date: Wed, 3 Dec 2025 18:47:56 +0800 Subject: [PATCH 11/20] feat: support new model glm4v. 
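
Adds the GLM4V vision stack: an ATB-based NPU vision encoder layer
(npu_glm4_vision_encoder_layer_impl plus the Glm4VisionEncoderLayer module
holder), the matching CMake entries, and a new vlm/glm4v.h model that
replaces vlm/glm4_vl.h, with corresponding updates to vlm/glm4v_moe.h.

Note that, unlike Glm4DecoderLayer, the vision encoder holder below is only
declared under USE_NPU, so non-NPU builds get no fallback type. A caller
would need a guard along these lines (illustrative sketch, not code taken
from this patch):

    #if defined(USE_NPU)
      blocks_->push_back(layer::Glm4VisionEncoderLayer(context));
    #else
      LOG(FATAL) << "glm4v vision encoder is only implemented for NPU builds";
    #endif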
--- xllm/core/layers/CMakeLists.txt | 1 + xllm/core/layers/glm4_vision_encode_layer.h | 39 + xllm/core/layers/npu/CMakeLists.txt | 2 + .../npu_glm4_vision_encoder_layer_impl.cpp | 263 +++++ .../npu/npu_glm4_vision_encoder_layer_impl.h | 121 +++ xllm/models/models.h | 2 +- xllm/models/vlm/glm4_vl.h | 180 ---- xllm/models/vlm/glm4v.h | 969 ++++++++++++++++++ xllm/models/vlm/glm4v_moe.h | 79 +- 9 files changed, 1436 insertions(+), 220 deletions(-) create mode 100644 xllm/core/layers/glm4_vision_encode_layer.h create mode 100644 xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.cpp create mode 100644 xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.h delete mode 100644 xllm/models/vlm/glm4_vl.h create mode 100644 xllm/models/vlm/glm4v.h diff --git a/xllm/core/layers/CMakeLists.txt b/xllm/core/layers/CMakeLists.txt index 608cab254..bb7d67ab9 100644 --- a/xllm/core/layers/CMakeLists.txt +++ b/xllm/core/layers/CMakeLists.txt @@ -61,6 +61,7 @@ cc_library( qwen2_decoder_layer.h qwen2dot5_vision_decode_layer.h qwen3_vision_encode_layer.h + glm4_vision_encode_layer.h qwen3_decoder_layer.h qwen3_moe_decoder_layer.h glm4_decoder_layer.h diff --git a/xllm/core/layers/glm4_vision_encode_layer.h b/xllm/core/layers/glm4_vision_encode_layer.h new file mode 100644 index 000000000..792700dfb --- /dev/null +++ b/xllm/core/layers/glm4_vision_encode_layer.h @@ -0,0 +1,39 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#if defined(USE_NPU) +#include "npu/npu_glm4_vision_encoder_layer_impl.h" +#endif + +namespace xllm { +namespace layer { + +#if defined(USE_NPU) +class Glm4VisionEncoderLayer + : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = NpuGlm4VisionEncoderLayerImpl; + + Glm4VisionEncoderLayer(const ModelContext& context) + : ModuleHolder( + std::make_shared(context)) {} +}; +#endif + +} // namespace layer +} // namespace xllm \ No newline at end of file diff --git a/xllm/core/layers/npu/CMakeLists.txt b/xllm/core/layers/npu/CMakeLists.txt index d8aff02f2..8bae3ec2f 100644 --- a/xllm/core/layers/npu/CMakeLists.txt +++ b/xllm/core/layers/npu/CMakeLists.txt @@ -18,6 +18,7 @@ cc_library( buffer/atb_workspace.h npu_base_layer.h npu_column_parallel_linear_impl.h + npu_glm4_vision_encoder_layer_impl.h npu_glm4_moe_decoder_layer.h npu_deepseek_v2_decoder_layer_impl.h npu_llama_decoder_layer_impl.h @@ -39,6 +40,7 @@ cc_library( buffer/atb_workspace.cpp npu_base_layer.cpp npu_column_parallel_linear_impl.cpp + npu_glm4_vision_encoder_layer_impl.cpp npu_glm4_moe_decoder_layer.cpp npu_deepseek_v2_decoder_layer_impl.cpp npu_llama_decoder_layer_impl.cpp diff --git a/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.cpp b/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.cpp new file mode 100644 index 000000000..866ec9c77 --- /dev/null +++ b/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.cpp @@ -0,0 +1,263 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// copy from qwen3 vl, please follow its modifications +#include "npu_glm4_vision_encoder_layer_impl.h" + +#include +#include + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "xllm_kernels/models/glm4v/glm4v_encoder.h" + +namespace xllm { +namespace layer { + +enum Glm4VisionEncoderLayerTensorId : int { + IN_INPUT_NORM_WEIGHT = 0, + IN_POST_NORM_WEIGHT, + IN_QKV_WEIGHT, + IN_ATTN_PROJ_WEIGHT, + IN_LINEAR_GATE_UP_WEIGHT, + IN_LINEAR_DOWN_WEIGHT, + IN_LINEAR_UP_WEIGHT, + IN_LINEAR_GATE_WEIGHT +}; + +const uint64_t WEIGHT_COUNT_PER_LAYER = 8; + +static std::vector> WEIGHT_MAPPING = { + {IN_INPUT_NORM_WEIGHT, "norm1.weight"}, + {IN_POST_NORM_WEIGHT, "norm2.weight"}, + {IN_QKV_WEIGHT, "attn.qkv.weight"}, + {IN_ATTN_PROJ_WEIGHT, "attn.proj.weight"}, + {IN_LINEAR_GATE_WEIGHT, "mlp.gate_proj.weight"}, + {IN_LINEAR_UP_WEIGHT, "mlp.up_proj.weight"}, + {IN_LINEAR_DOWN_WEIGHT, "mlp.down_proj.weight"}}; + +// {weight,dim} +// IN_QKV_WEIGHT SHARD artificially in merge_loaded_weights +static std::map WEIGHT_SHARD = { + {IN_ATTN_PROJ_WEIGHT, 1}, + {IN_LINEAR_UP_WEIGHT, 0}, + {IN_LINEAR_GATE_WEIGHT, 0}, + {IN_LINEAR_DOWN_WEIGHT, 1}}; +// TODO: check shape with atb log -- HW pxy + +void NpuGlm4VisionEncoderLayerImpl::param_from_args( + atb_speed::glm::VisionEncoderLayerParam& param, + const ModelArgs& args, + const ParallelArgs& parallel_args) { + param.isBF16 = args.dtype() == "bfloat16"; + param.supportLcoc = false; + param.supportLora = false; + param.loraEnableGMM = false; + param.enableLogN = false; + param.backend = "hccl"; + param.rank = parallel_args.rank(); + param.worldSize = parallel_args.world_size(); + + param.quantType = 0; + param.quantGroupSize = 64; + + param.numAttentionHeadsPerRank = + args.mm_num_attention_heads() / param.worldSize; + param.hiddenSizePerAttentionHead = + args.mm_hidden_size() / args.mm_num_attention_heads(); + std::optional optionalValue = args.mm_num_attention_heads(); + param.numKeyValueHeadsPerRank = + static_cast(optionalValue.value()) / param.worldSize; + + param.rmsNormEps = args.rms_norm_eps(); +} + +NpuGlm4VisionEncoderLayerImpl::NpuGlm4VisionEncoderLayerImpl( + const ModelContext& context) + : NpuBaseLayer(context) { + auto model_args = context.get_model_args(); + auto parallel_args = context.get_parallel_args(); + auto options = context.get_tensor_options(); + param_from_args(encode_param_, model_args, parallel_args); + at_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER); + atb_weight_tensors_.resize(WEIGHT_COUNT_PER_LAYER); + dtype_ = c10::typeMetaToScalarType(options.dtype()); + device_id_ = options.device().index(); + placeholder_ = + atb_speed::Utils::AtTensor2Tensor(torch::zeros({1}).to(device_).to( + dtype_)); // seems not to be used -- HW pxy + at_placeholder_ = torch::zeros({1}).to(device_).to(dtype_); + for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { + at_weight_tensors_[i] = torch::zeros({1}).to(options); + } +} + +void NpuGlm4VisionEncoderLayerImpl::verify_loaded_weights() const { + for (const auto& [index, name] : WEIGHT_MAPPING) { + CHECK(at_weight_tensors_[index].sizes() != std::vector({1})) + << "weight is not loaded for " << name; + } +} + +void NpuGlm4VisionEncoderLayerImpl::merge_loaded_weights() { + if (encode_param_.worldSize > 1) { + // spilt pack qkv weight when enable tp + get_weights_col_packed_qkv(); + } + + at_weight_tensors_[IN_LINEAR_GATE_UP_WEIGHT] = 
torch::cat({ + at_weight_tensors_[IN_LINEAR_GATE_WEIGHT], + at_weight_tensors_[IN_LINEAR_UP_WEIGHT]}, + 0); + at_weight_tensors_[IN_LINEAR_GATE_WEIGHT] = torch::empty({}, device_); + at_weight_tensors_[IN_LINEAR_UP_WEIGHT] = torch::empty({}, device_); + + c10_npu::NPUCachingAllocator::emptyCache(); + for (int i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) { + atb_weight_tensors_[i] = + atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[i]); + } + + init_layer(); +} + +// tp spilt weight +void NpuGlm4VisionEncoderLayerImpl::get_weights_col_packed_qkv() { + int rank = encode_param_.rank; + int worldSize = encode_param_.worldSize; + // split qkv weight + auto qkv_weight = torch::chunk(at_weight_tensors_[IN_QKV_WEIGHT], 3, 0); + // get local weight and merge + auto new_qkv_weight = torch::cat({(qkv_weight[0].chunk(worldSize, 0))[rank], + (qkv_weight[1].chunk(worldSize, 0))[rank], + (qkv_weight[2].chunk(worldSize, 0))[rank]}, + 0); + at_weight_tensors_[IN_QKV_WEIGHT] = new_qkv_weight; +} + +void NpuGlm4VisionEncoderLayerImpl::load_state_dict( + const StateDict& state_dict) { + for (const auto& [index, name] : WEIGHT_MAPPING) { + if (WEIGHT_SHARD.find(index) != WEIGHT_SHARD.end()) { + set_weight(state_dict, name, index, WEIGHT_SHARD[index]); + } else { + set_weight(state_dict, name, index); + } + } +} + +int64_t NpuGlm4VisionEncoderLayerImpl::init_layer() { + name_ = "glm4_vision_encoder_layer"; + model_name_ = "glm4v"; + CHECK_OPERATION_STATUS_RETURN(init_node(encode_node_, encode_param_)); + return atb::NO_ERROR; +} + +int64_t NpuGlm4VisionEncoderLayerImpl::init_node( + atb_speed::Model::Node& node, + atb_speed::glm::VisionEncoderLayerParam& param) { + atb::Operation* operation = nullptr; + atb_speed::glm::Glm4v_EncoderLayer(param, &operation); + node.operation.reset(operation); + if (node.operation == nullptr) { + LOG(ERROR) << "node.operation is null"; + return -1; + } + if (node.operation->GetInputNum() < 1) { + LOG(ERROR) << "Can not resize number which is smaller than 1"; + return -1; + } + node.inTensors.resize(node.operation->GetInputNum()); + node.outTensors.resize(1); + size_t inTensorId = 1; + + for (size_t weightTensorId = 0; weightTensorId < WEIGHT_COUNT_PER_LAYER; + ++weightTensorId) { + node.inTensors.at(weightTensorId) = &atb_weight_tensors_[weightTensorId]; + } + + node.variantPack.inTensors.reserve(node.inTensors.size()); + node.variantPack.inTensors.resize(node.inTensors.size()); + node.variantPack.outTensors.reserve(1); + node.variantPack.outTensors.resize(1); + return atb::NO_ERROR; +} + +torch::Tensor NpuGlm4VisionEncoderLayerImpl::forward( + torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& cu_seqlen, + std::vector& cu_seqlen_vec, + ModelInputParams& input_params, + int node_id, + aclrtEvent* event, + std::atomic* event_flag) { + atb::Status st; + + build_node_variant_pack(encode_node_, + x, + cos_pos, + sin_pos, + cu_seqlen, + cu_seqlen_vec, + input_params, + true); + // mstxRangeEnd(id); + st = execute_node(encode_node_, node_id); + LOG_IF(FATAL, st != 0) << model_name_ + << "excute encode layer fail, error code: " << st; + return x; +} + +void NpuGlm4VisionEncoderLayerImpl::build_node_variant_pack( + atb_speed::Model::Node& node, + torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& cu_seqlen, + std::vector& cu_seqlen_vec, + ModelInputParams& input_params, + bool is_prefill) { + internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x); + + auto actual_weight_num = WEIGHT_COUNT_PER_LAYER - 2; + for 
(size_t i = 0; i < actual_weight_num; ++i) { + CHECK_THROW(node.inTensors.at(i) == nullptr, + model_name_ << "inTensor " << i << "is NULL"); + node.variantPack.inTensors.at(i) = *node.inTensors.at(i); + // LOG(INFO) << model_name_ << "inTensors[" << i << "]:" + // << atb_speed::TensorUtil::TensorToString( + // node.variantPack.inTensors.at(i)); + } + node.variantPack.inTensors.at(actual_weight_num) = internal_tensors_; + node.variantPack.inTensors.at(actual_weight_num + 1) = + atb_speed::Utils::AtTensor2Tensor(cos_pos); + node.variantPack.inTensors.at(actual_weight_num + 2) = + atb_speed::Utils::AtTensor2Tensor(sin_pos); + node.variantPack.inTensors.at(actual_weight_num + 3) = + atb_speed::Utils::AtTensor2Tensor(cu_seqlen); + node.variantPack.inTensors.at(actual_weight_num + 3).hostData = + cu_seqlen_vec.data(); + + + node.variantPack.outTensors.at(0) = internal_tensors_; +} + +} // namespace layer +} // namespace xllm \ No newline at end of file diff --git a/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.h b/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.h new file mode 100644 index 000000000..75f72aadf --- /dev/null +++ b/xllm/core/layers/npu/npu_glm4_vision_encoder_layer_impl.h @@ -0,0 +1,121 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once +#ifdef TORCH_HIGHER_THAN_PTA6 +#include +#include +#else +#include +#include +#endif + +#include + +#include + +#include "atb/atb_infer.h" +#include "atb_speed/base/hosttensor_binder.h" +#include "atb_speed/base/model.h" +#include "atb_speed/log.h" +#include "atb_speed/utils/model_factory.h" +#include "core/framework/model/model_args.h" +#include "core/framework/model/model_input_params.h" +#include "core/framework/state_dict/state_dict.h" +#include "nlohmann/json.hpp" +#include "npu_base_layer.h" +#include "pytorch/adapter/utils/utils.h" +#include "xllm_kernels/models/glm4v/glm4v_encoder.h" + +namespace xllm { +namespace layer { + +// copy from qwen3 vl, please follow its modifications +class NpuGlm4VisionEncoderLayerImpl : public NpuBaseLayer { + public: + explicit NpuGlm4VisionEncoderLayerImpl(const ModelContext& context); + + ~NpuGlm4VisionEncoderLayerImpl() {}; + + void load_state_dict(const StateDict& state_dict) override; + + void verify_loaded_weights() const override; + + void merge_loaded_weights() override; + + int64_t init_layer() override; + + torch::Tensor forward(torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& cu_seqlen, + std::vector& cu_seqlen_vec, + ModelInputParams& input_params, + int node_id = 0, + aclrtEvent* event = nullptr, + std::atomic* event_flag = nullptr); + void build_node_variant_pack(atb_speed::Model::Node& node, + torch::Tensor& x, + torch::Tensor& cos_pos, + torch::Tensor& sin_pos, + torch::Tensor& cu_seqlen, + std::vector& cu_seqlen_vec, + ModelInputParams& input_params, + bool is_prefill); + + void get_weights_col_packed_qkv(); + + void param_from_args(atb_speed::glm::VisionEncoderLayerParam& param, + const ModelArgs& args, + const ParallelArgs& parallel_args); + + int64_t init_node(atb_speed::Model::Node& node, + atb_speed::glm::VisionEncoderLayerParam& param); + + void pad_qkv_weights(); + + void pad_mlp_weights(); + + torch::Tensor pad_tensor(const torch::Tensor& tensor, + int64_t target_shape, + int64_t dim = 0) { + int64_t pad_size = target_shape - tensor.size(dim); + if (tensor.dim() == 1) { + return torch::nn::functional::pad( + tensor, torch::nn::functional::PadFuncOptions({0, pad_size})); + } else if (tensor.dim() == 2) { + if (1 == dim) + return torch::nn::functional::pad( + tensor, torch::nn::functional::PadFuncOptions({0, pad_size, 0, 0})); + else + return torch::nn::functional::pad( + tensor, torch::nn::functional::PadFuncOptions({0, 0, 0, pad_size})); + } + return tensor; + } + + atb_speed::Model::Node encode_node_; + std::string model_name_; + + atb_speed::glm::VisionEncoderLayerParam encode_param_; + atb::Tensor internal_tensors_; + atb::Tensor placeholder_; + at::Tensor cu_seqlen_; + at::Tensor at_placeholder_; + int device_id_; +}; + +} // namespace layer +} // namespace xllm \ No newline at end of file diff --git a/xllm/models/models.h b/xllm/models/models.h index 55a6a3b10..9fbc3ea45 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -36,7 +36,7 @@ limitations under the License. 
#include "llm/qwen3_embedding.h" // IWYU pragma: keep #include "vlm/glm4v.h" // IWYU pragma: keep #include "vlm/glm4v_moe.h" // IWYU pragma: keep -#include "vlm/glm4_vl.h" // IWYU pragma: keep +#include "vlm/glm4v.h" // IWYU pragma: keep #include "vlm/minicpmv.h" // IWYU pragma: keep #include "vlm/qwen2_5_vl.h" // IWYU pragma: keep #include "vlm/qwen3_vl.h" // IWYU pragma: keep diff --git a/xllm/models/vlm/glm4_vl.h b/xllm/models/vlm/glm4_vl.h deleted file mode 100644 index a9afaceaa..000000000 --- a/xllm/models/vlm/glm4_vl.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once - -#include - -#include -#include - -#include "core/framework/model/model_args.h" -#include "core/framework/request/mm_data.h" -#include "processors/input_processor.h" - -namespace xllm { - -class GLM4_6_VLInputProcessor : public InputProcessor { - enum class TokenType { - INVALID, - IMAGE, - VIDEO, - }; - - public: - GLM4_6_VLInputProcessor(const ModelArgs& args) { - merge_size_ = args.mm_image_merge_size(); - } - - void process(std::string& prompt, const MMData& mm_data) override { - torch::Tensor image_grid_thw; - if (auto res = mm_data.get("image_grid_thw")) - image_grid_thw = res.value(); - - torch::Tensor video_grid_thw; - if (auto res = mm_data.get("video_grid_thw")) - video_grid_thw = res.value(); - - if (!image_grid_thw.defined() && !video_grid_thw.defined()) return; - - const auto& video_metadata = mm_data.get_video_metadata(); - if (video_metadata.size() > 0) { - CHECK(video_metadata.size() == - static_cast(video_grid_thw.sizes()[0])); - } - - auto merge_length = merge_size_ * merge_size_; - int total_image_token = 0; - - if (image_grid_thw.defined()) { - auto count = image_grid_thw.sizes()[0]; - for (int idx = 0; idx < count; ++idx) - total_image_token += - image_grid_thw[idx].prod().item() / merge_length; - } - - int total_video_token = 0; - if (video_grid_thw.defined()) { - auto count = video_grid_thw.sizes()[0]; - for (int idx = 0; idx < count; ++idx) - total_video_token += video_grid_thw[idx].prod().item() / - merge_length / video_grid_thw[idx][0].item(); - } - - size_t total_token_len = total_image_token * image_token_.size() + - total_video_token * image_token_.size(); - std::string data; - data.reserve(prompt.size() + total_token_len); - - int image_index = 0; - int video_index = 0; - - size_t begin = 0; - auto pair = find_vision_token(prompt, begin); - - while (pair.second != std::string::npos) { - data.append(prompt, begin, pair.second - begin); - - if (pair.first == TokenType::IMAGE) { - auto token_num = - image_grid_thw[image_index].prod().item() / merge_length; - while (token_num--) data.append(image_token_); - - image_index++; - begin = pair.second + image_token_.size(); - } else if (pair.first == TokenType::VIDEO) { - auto num_frames = video_grid_thw[video_index][0].item(); - auto timestamps = 
video_metadata[video_index].timestamps; - CHECK(!timestamps.empty()); - - auto selected = build_timestamps(timestamps, num_frames); - auto token_num = video_grid_thw[video_index].prod().item() / - merge_length / num_frames; - - for (size_t idx = 0; idx < num_frames; ++idx) { - data.append(begin_of_image_token_); - - auto num = token_num; - while (num--) data.append(image_token_); - - data.append(end_of_image_token_); - data.append(format_timestamp_str(selected[idx])); - } - - video_index++; - begin = pair.second + video_token_.size(); - } else { - assert(false); - } - - pair = find_vision_token(prompt, begin); - } - - if (begin < prompt.size()) data.append(prompt, begin, std::string::npos); - - prompt = std::move(data); - } - - private: - std::pair find_vision_token(const std::string& prompt, - size_t begin) { - auto img_pos = prompt.find(image_token_, begin); - auto vid_pos = prompt.find(video_token_, begin); - - if (img_pos == std::string::npos && vid_pos == std::string::npos) - return {TokenType::INVALID, std::string::npos}; - else if (vid_pos == std::string::npos) - return {TokenType::IMAGE, img_pos}; - else if (img_pos == std::string::npos) - return {TokenType::VIDEO, vid_pos}; - else - return img_pos < vid_pos ? std::make_pair(TokenType::IMAGE, img_pos) - : std::make_pair(TokenType::VIDEO, vid_pos); - } - - std::vector build_timestamps(const std::vector& timestamps, - size_t num_frames) { - std::vector vec; - vec.reserve(num_frames); - - for (size_t i = 0; i < timestamps.size(); i += 2) { - vec.push_back(timestamps[i]); - if (vec.size() == num_frames) break; - } - - while (vec.size() < num_frames) { - vec.push_back(vec.back()); - } - - return vec; - } - - std::string format_timestamp_str(double timestamp) { - char buffer[32]; - sprintf(buffer, "%.1f seconds", timestamp); - return buffer; - } - - private: - const std::string image_token_ = "<|image|>"; - const std::string video_token_ = "<|video|>"; - - const std::string begin_of_image_token_ = "<|begin_of_image|>"; - const std::string end_of_image_token_ = "<|end_of_image|>"; - - int merge_size_ = 0; -}; - -} // namespace xllm diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h new file mode 100644 index 000000000..ce8693424 --- /dev/null +++ b/xllm/models/vlm/glm4v.h @@ -0,0 +1,969 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "core/framework/kv_cache/kv_cache.h" +#include "core/framework/model/model_input_params.h" +#include "core/layers/lm_head.h" +#include "models/model_registry.h" +#include "processors/input_processor.h" +#include "xllm_kernels/core/include/atb_speed/log.h" +#include "models/llm/glm4.h" +#include "xllm/core/layers/glm4_vision_encode_layer.h" +#include "torch_npu/csrc/aten/CustomFunctions.h" + + +namespace xllm { + +class GLM4_6_VLInputProcessor : public InputProcessor { + enum class TokenType { + INVALID, + IMAGE, + VIDEO, + }; + + public: + GLM4_6_VLInputProcessor(const ModelArgs& args) { + merge_size_ = args.mm_image_merge_size(); + } + + void process(std::string& prompt, const MMData& mm_data) override { + torch::Tensor image_grid_thw; + if (auto res = mm_data.get("image_grid_thw")) + image_grid_thw = res.value(); + + torch::Tensor video_grid_thw; + if (auto res = mm_data.get("video_grid_thw")) + video_grid_thw = res.value(); + + if (!image_grid_thw.defined() && !video_grid_thw.defined()) return; + + const auto& video_metadata = mm_data.get_video_metadata(); + if (video_metadata.size() > 0) { + CHECK(video_metadata.size() == + static_cast(video_grid_thw.sizes()[0])); + } + + auto merge_length = merge_size_ * merge_size_; + int total_image_token = 0; + + if (image_grid_thw.defined()) { + auto count = image_grid_thw.sizes()[0]; + for (int idx = 0; idx < count; ++idx) + total_image_token += + image_grid_thw[idx].prod().item() / merge_length; + } + + int total_video_token = 0; + if (video_grid_thw.defined()) { + auto count = video_grid_thw.sizes()[0]; + for (int idx = 0; idx < count; ++idx) + total_video_token += video_grid_thw[idx].prod().item() / + merge_length / video_grid_thw[idx][0].item(); + } + + size_t total_token_len = total_image_token * image_token_.size() + + total_video_token * image_token_.size(); + std::string data; + data.reserve(prompt.size() + total_token_len); + + int image_index = 0; + int video_index = 0; + + size_t begin = 0; + auto pair = find_vision_token(prompt, begin); + + while (pair.second != std::string::npos) { + data.append(prompt, begin, pair.second - begin); + + if (pair.first == TokenType::IMAGE) { + auto token_num = + image_grid_thw[image_index].prod().item() / merge_length; + while (token_num--) data.append(image_token_); + + image_index++; + begin = pair.second + image_token_.size(); + } else if (pair.first == TokenType::VIDEO) { + auto num_frames = video_grid_thw[video_index][0].item(); + auto timestamps = video_metadata[video_index].timestamps; + CHECK(!timestamps.empty()); + + auto selected = build_timestamps(timestamps, num_frames); + auto token_num = video_grid_thw[video_index].prod().item() / + merge_length / num_frames; + + for (size_t idx = 0; idx < num_frames; ++idx) { + data.append(begin_of_image_token_); + + auto num = token_num; + while (num--) data.append(image_token_); + + data.append(end_of_image_token_); + data.append(format_timestamp_str(selected[idx])); + } + + video_index++; + begin = pair.second + video_token_.size(); + } else { + assert(false); + } + + pair = find_vision_token(prompt, begin); + } + + if (begin < prompt.size()) data.append(prompt, begin, std::string::npos); + + prompt = std::move(data); + } + + private: + std::pair find_vision_token(const std::string& prompt, + size_t begin) { + auto img_pos = prompt.find(image_token_, begin); + auto vid_pos = 
prompt.find(video_token_, begin); + + if (img_pos == std::string::npos && vid_pos == std::string::npos) + return {TokenType::INVALID, std::string::npos}; + else if (vid_pos == std::string::npos) + return {TokenType::IMAGE, img_pos}; + else if (img_pos == std::string::npos) + return {TokenType::VIDEO, vid_pos}; + else + return img_pos < vid_pos ? std::make_pair(TokenType::IMAGE, img_pos) + : std::make_pair(TokenType::VIDEO, vid_pos); + } + + std::vector build_timestamps(const std::vector& timestamps, + size_t num_frames) { + std::vector vec; + vec.reserve(num_frames); + + for (size_t i = 0; i < timestamps.size(); i += 2) { + vec.push_back(timestamps[i]); + if (vec.size() == num_frames) break; + } + + while (vec.size() < num_frames) { + vec.push_back(vec.back()); + } + + return vec; + } + + std::string format_timestamp_str(double timestamp) { + char buffer[32]; + sprintf(buffer, "%.1f seconds", timestamp); + return buffer; + } + + private: + const std::string image_token_ = "<|image|>"; + const std::string video_token_ = "<|video|>"; + + const std::string begin_of_image_token_ = "<|begin_of_image|>"; + const std::string end_of_image_token_ = "<|end_of_image|>"; + + int merge_size_ = 0; +}; + +class Glm4VisionRmsNormImpl : public torch::nn::Module { + public: + torch::Tensor weight; + Glm4VisionRmsNormImpl(const ModelContext& context){ + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + weight = torch::empty({model_args.mm_hidden_size()}, options); + epsilon_ = 1e-5; + } + + torch::Tensor forward(torch::Tensor& x){ + auto results = at_npu::native::custom_ops::npu_rms_norm(x, weight, epsilon_); + return std::get<0>(results); + } + private: + double epsilon_; +}; +TORCH_MODULE(Glm4VisionRmsNorm); + +class Glm4VisionPatchEmbedImpl : public torch::nn::Module { + public: + Glm4VisionPatchEmbedImpl(const ModelContext& context) { + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + + auto in_features = model_args.mm_num_channels() * + model_args.mm_temporal_patch_size() * + model_args.mm_patch_size() * model_args.mm_patch_size(); + + auto out_features = model_args.mm_hidden_size(); + + proj_ = register_module( + "proj", + torch::nn::Linear( + torch::nn::LinearOptions(in_features, out_features).bias(true))); + + proj_->weight.set_data(proj_->weight.to(options)); + proj_->bias.set_data(proj_->bias.to(options)); + } + + torch::Tensor forward(torch::Tensor x) { return proj_(x); } + + void load_state_dict(const StateDict& state_dict) { + auto weight = state_dict.get_tensor("proj.weight"); + if (weight.defined()) { + weight = weight.reshape({weight.size(0), -1}); + DCHECK_EQ(proj_->weight.sizes(), weight.sizes()) + << "proj weight size mismatch for " << name(); + proj_->weight.data().copy_(weight); + proj_weight_loaded_ = true; + } + auto bias = state_dict.get_tensor("proj.bias"); + if (bias.defined()) { + bias = bias.reshape({bias.size(0)}); + DCHECK_EQ(proj_->bias.sizes(), bias.sizes()) + << "proj bias size mismatch for " << name(); + proj_->bias.data().copy_(bias); + proj_bias_loaded_ = true; + } + } + + void verify_loaded_weights(const std::string& prefix) const { + CHECK(proj_weight_loaded_) + << "weight is not loaded for " << prefix + "proj.weight"; + CHECK(proj_bias_loaded_) + << "bias is not loaded for " << prefix + "proj.bias"; + } + + private: + bool proj_weight_loaded_ = false; + bool proj_bias_loaded_ = false; + torch::nn::Linear proj_{nullptr}; +}; +TORCH_MODULE(Glm4VisionPatchEmbed); + +class 
Glm4_VisionBlockImpl : public torch::nn::Module { + public: + Glm4_VisionBlockImpl(const ModelContext& context) { + // register submodules + encoder_layer_ = register_module("encoder_layer", + layer::Glm4VisionEncoderLayer(context)); + } + //TO DO + torch::Tensor forward(torch::Tensor& x, + torch::Tensor& m_cos_pos, + torch::Tensor& m_sin_pos, + torch::Tensor& cu_seq_len, + std::vector& cu_seq_len_vec, + ModelInputParams& input_params, + int node_id) { + return encoder_layer_(x, + m_cos_pos, + m_sin_pos, + cu_seq_len, + cu_seq_len_vec, + input_params, + node_id); + } + + // load the weight from the checkpoint + void load_state_dict(const StateDict& state_dict) { + // call each submodule's load_state_dict function + encoder_layer_->load_state_dict(state_dict); + } + + void verify_loaded_weights(const std::string& prefix) const { + encoder_layer_->verify_loaded_weights(); + } + void merge_loaded_weights() { encoder_layer_->merge_loaded_weights(); } + + private: + layer::Glm4VisionEncoderLayer encoder_layer_{nullptr}; +}; +TORCH_MODULE(Glm4_VisionBlock); + +class Glm4VisionRotaryEmbeddingImpl : public torch::nn::Module { + public: + Glm4VisionRotaryEmbeddingImpl(const ModelContext& context) { + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + + dim_ = model_args.mm_head_dim() / 2; + theta_ = 10000.0; + + auto opts = options.dtype(torch::kFloat32); + auto inv_freq = + 1.0 / torch::pow(theta_, torch::arange(0, dim_, 2, opts) / dim_); + inv_freq_ = register_buffer("inv_freq", inv_freq); + } + + void update_freqs_cache(int64_t seqlen) { + if (seqlen <= seq_len_cached_) return; + + seqlen *= 2; + seq_len_cached_ = seqlen; + + auto options = torch::TensorOptions() + .dtype(torch::kFloat32) + .device(inv_freq_.device()); + inv_freq_ = + 1.0 / torch::pow(theta_, torch::arange(0, dim_, 2, options) / dim_); + auto seq = torch::arange(seqlen, options); + freqs_cached_ = torch::outer(seq, inv_freq_); + } + + torch::Tensor forward(int seqlen) { + update_freqs_cache(seqlen); + return freqs_cached_.slice(0, 0, seqlen); + } + + private: + int dim_ = 0; + double theta_ = 0.0; + + int64_t seq_len_cached_ = 0; + torch::Tensor inv_freq_; + torch::Tensor freqs_cached_; +}; +TORCH_MODULE(Glm4VisionRotaryEmbedding); + +class Glm4vVisionEmbeddingsImpl : public torch::nn::Module { + public: + Glm4vVisionEmbeddingsImpl(const ModelContext& context) { + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + embed_dim_ = model_args.mm_hidden_size(); + image_size_ = model_args.mm_image_size(); + patch_size_ = model_args.mm_patch_size(); + num_positions_ = image_size_ / patch_size_; + num_positions_ = num_positions_ * num_positions_; + position_embedding_ = register_module( + "position_embedding", + torch::nn::Embedding(num_positions_, embed_dim_) + ); + position_embedding_->weight.set_data(position_embedding_->weight.to(options)); + } + torch::Tensor forward( + torch::Tensor x, + std::vector lengths, + torch::Tensor image_shapes, + torch::Tensor h_coords, + torch::Tensor w_coords + ) { + const auto& pos_embed_weight = position_embedding_->weight; + const int64_t hidden_size = pos_embed_weight.size(1); + const int64_t total_seq = x.size(0); + const auto device = pos_embed_weight.device(); + const auto dtype = pos_embed_weight.dtype(); + + image_shapes = image_shapes.to(device); + h_coords = h_coords.to(device); + w_coords = w_coords.to(device); + x = x.to(device, dtype); + + torch::Tensor adapted_pos_embed; + if (total_seq == 0) { + 
adapted_pos_embed = torch::empty( + {0, hidden_size}, + torch::TensorOptions().device(device).dtype(dtype) + ); + } else { + const int64_t batch_size = static_cast(lengths.size()); + const int64_t orig_size_sq = pos_embed_weight.size(0); + const int64_t orig_size = static_cast(std::sqrt(orig_size_sq)); + auto pos_embed_2d = pos_embed_weight + .view({orig_size, orig_size, hidden_size}) + .permute({2, 0, 1}) + .unsqueeze(0) + .to(torch::kFloat32); + + std::vector target_h_list; + std::vector target_w_list; + target_h_list.reserve(batch_size); + target_w_list.reserve(batch_size); + + for (int64_t i = 0; i < batch_size; ++i) { + const int64_t seq_len = lengths[i]; + const auto img_h = image_shapes.index({i, 1}).to(torch::kFloat32); + const auto img_w = image_shapes.index({i, 2}).to(torch::kFloat32); + + target_h_list.push_back(img_h.repeat({seq_len})); + target_w_list.push_back(img_w.repeat({seq_len})); + } + + auto target_h = torch::cat(target_h_list, 0); + auto target_w = torch::cat(target_w_list, 0); + + auto h_coords_fp32 = h_coords.to(torch::kFloat32); + auto w_coords_fp32 = w_coords.to(torch::kFloat32); + + const auto norm_w = ((w_coords_fp32 + 0.5f) / target_w) * 2.0f - 1.0f; + const auto norm_h = ((h_coords_fp32 + 0.5f) / target_h) * 2.0f - 1.0f; + + auto grid = torch::stack({norm_w, norm_h}, -1) + .unsqueeze(0) + .unsqueeze(2); + + namespace F = torch::nn::functional; + auto interpolated_embed = F::grid_sample( + pos_embed_2d, + grid); + // namespace F = torch::nn::functional; + // auto interpolated_embed = F::grid_sample( + // pos_embed_2d, + // grid, + // F::GridSampleFuncOptions().mode(torch::kBicubic).padding_mode(torch::kBorder).align_corners(false)); + // TODO + adapted_pos_embed = interpolated_embed + .squeeze(0) + .squeeze(-1) + .permute({1, 0}) + .to(dtype); + } + + return x + adapted_pos_embed; + } + + void load_state_dict(const StateDict& state_dict) { + auto weight = state_dict.get_tensor("position_embedding.weight"); + if (weight.defined()) { + position_embedding_->weight.data().copy_(weight); + position_embedding_weight_loaded_ = true; + } + } + + void verify_loaded_weights(const std::string& prefix) const { + CHECK(position_embedding_weight_loaded_) + << "weight is not loaded for " << prefix + "position_embedding.weight"; + } + private: + int64_t embed_dim_ = 0; + int64_t image_size_ = 0; + int64_t patch_size_ = 0 ; + int64_t num_positions_ = 0; + bool position_embedding_weight_loaded_ = false; + torch::nn::Embedding position_embedding_{nullptr}; +}; +TORCH_MODULE(Glm4vVisionEmbeddings); + +class Glm4_VisionPatchMergerImpl : public torch::nn::Module { + public: + Glm4_VisionPatchMergerImpl(const ModelContext& context) { + auto model_args = context.get_model_args(); + auto options = context.get_tensor_options(); + auto parallel_args = context.get_parallel_args(); + int64_t dim = model_args.mm_projection_dim(); + int64_t context_dim = model_args.mm_intermediate_size(); + norm_ = register_module("norm", torch::nn::LayerNorm(torch::nn::LayerNormOptions({dim}))); + norm_->weight.set_data(norm_->weight.to(options)); + norm_->bias.set_data(norm_->bias.to(options)); + proj_ = register_module( + "proj", + torch::nn::Linear(torch::nn::LinearOptions(dim, dim).bias(false))); + + act_ = register_module("act", torch::nn::GELU()); + silu_ = register_module("silu", torch::nn::SiLU()); + + gate_ = register_module( + "gate", + torch::nn::Linear(torch::nn::LinearOptions(dim, context_dim).bias(false))); + + up_ = register_module( + "up", + torch::nn::Linear(torch::nn::LinearOptions(dim, 
context_dim).bias(false))); + + down_ = register_module( + "down", + torch::nn::Linear(torch::nn::LinearOptions(context_dim, dim).bias(false))); + } + + torch::Tensor forward(torch::Tensor x) { + x = proj_(x); + x = act_(norm_(x)); + x = down_(torch::mul(silu_((gate_(x))), up_(x))); + return x; + } + + void load_state_dict(const StateDict& state_dict) { + // norm + const auto& norm_dict = state_dict.get_dict_with_prefix("post_projection_norm."); + const auto& norm_weight = norm_dict.get_tensor("weight"); + if (norm_weight.defined()) { + CHECK_EQ(norm_->weight.sizes(), norm_weight.sizes()) + << "weight size mismatch for " << name(); + norm_->weight.data().copy_(norm_weight); + is_norm_weight_loaded = true; + } + const auto norm_bias = norm_dict.get_tensor("bias"); + if (norm_bias.defined()) { + CHECK_EQ(norm_->bias.sizes(), norm_bias.sizes()) + << "bias size mismatch for " << name(); + norm_->bias.data().copy_(norm_bias); + is_norm_bias_loaded = true; + } + + const auto& proj_dict = state_dict.get_dict_with_prefix("proj."); + const auto& proj_weight = proj_dict.get_tensor("weight"); + if (proj_weight.defined()) { + proj_->weight.data().copy_(proj_weight); + is_proj_weight_loaded = true; + } + + const auto& up_dict = state_dict.get_dict_with_prefix("up_proj."); + const auto& up_weight = up_dict.get_tensor("weight"); + if (up_weight.defined()) { + up_->weight.data().copy_(up_weight); + is_up_weight_loaded = true; + } + + const auto& down_dict = state_dict.get_dict_with_prefix("down_proj."); + const auto& down_weight = down_dict.get_tensor("weight"); + if (down_weight.defined()) { + down_->weight.data().copy_(down_weight); + is_down_weight_loaded = true; + } + + const auto& gate_dict = state_dict.get_dict_with_prefix("gate_proj."); + const auto& gate_weight = gate_dict.get_tensor("weight"); + if (gate_weight.defined()) { + gate_->weight.data().copy_(gate_weight); + is_gate_weight_loaded = true; + } + } + + void verify_loaded_weights(const std::string& prefix) const { + CHECK(is_proj_weight_loaded) + << "weight is not loaded for " << prefix + "proj_weight" + ".weight"; + CHECK(is_up_weight_loaded) + << "weight is not loaded for " << prefix + "up_weight" + ".weight"; + CHECK(is_down_weight_loaded) + << "weight is not loaded for " << prefix + "down_weight" + ".weight"; + CHECK(is_gate_weight_loaded) + << "weight is not loaded for " << prefix + "gate_weight" + ".weight"; + CHECK(is_norm_weight_loaded) + << "weight is not loaded for " << prefix + "norm" + ".weight"; + CHECK(is_norm_bias_loaded) + << "bias is not loaded for " << prefix + "norm" + ".bias"; + } + + private: + torch::nn::LayerNorm norm_{nullptr}; + torch::nn::Linear proj_{nullptr}; + torch::nn::Linear up_{nullptr}; + torch::nn::Linear gate_{nullptr}; + torch::nn::Linear down_{nullptr}; + torch::nn::GELU act_{nullptr}; + torch::nn::SiLU silu_{nullptr}; + + + bool is_proj_weight_loaded = false; + bool is_up_weight_loaded = false; + bool is_down_weight_loaded = false; + bool is_gate_weight_loaded = false; + bool is_norm_weight_loaded = false; + bool is_norm_bias_loaded = false; +}; +TORCH_MODULE(Glm4_VisionPatchMerger); + +class Glm4VisionTransformerImpl : public torch::nn::Module { + public: + Glm4VisionTransformerImpl(const ModelContext& context): options_(context.get_tensor_options()) { + auto model_args = context.get_model_args(); + spatial_merge_size_ = model_args.mm_spatial_merge_size(); + hidden_size_ = model_args.mm_hidden_size(); + out_hidden_size_ = model_args.mm_projection_dim(); + + patch_embed_ = + 
register_module("patch_embed", Glm4VisionPatchEmbed(context)); + rotary_pos_emb_ = + register_module("rotary_pos_emb", Glm4VisionRotaryEmbedding(context)); + post_conv_layernorm_ = register_module("post_conv_layernorm", Glm4VisionRmsNorm(context)); + + embeddings_ = register_module("embeddings", Glm4vVisionEmbeddings(context)); + + blocks_ = register_module("blocks", torch::nn::ModuleList()); + + for (int32_t idx = 0; idx < model_args.mm_num_hidden_layers(); idx++) { + auto block = Glm4_VisionBlock(context); + blocks_->push_back(block); + layers_.push_back(block); + } + // TODO čžåˆįŽ—å­ + post_layernorm_ = register_module("post_layernorm", Glm4VisionRmsNorm(context)); + + downsample_ = register_module("downsample", torch::nn::Conv2d(torch::nn::Conv2dOptions(hidden_size_, out_hidden_size_, spatial_merge_size_) + .stride(spatial_merge_size_).bias(true).padding(0))); + merger_ = register_module("merger", Glm4_VisionPatchMerger(context)); + + } + std::tuple rot_pos_emb(torch::Tensor grid_thw) { + std::vector pos_ids_vec; + auto count = grid_thw.sizes()[0]; + pos_ids_vec.reserve(count); + auto options = + torch::TensorOptions().dtype(torch::kLong).device(grid_thw.device()); + + auto grid_thw_cpu = grid_thw.cpu(); + for (int idx = 0; idx < count; ++idx) { + auto t = grid_thw_cpu[idx][0].item(); + auto h = grid_thw_cpu[idx][1].item(); + auto w = grid_thw_cpu[idx][2].item(); + auto hpos_ids = torch::arange(h, options).unsqueeze(1).expand({-1, w}); + hpos_ids = hpos_ids.reshape({h / spatial_merge_size_, + spatial_merge_size_, + w / spatial_merge_size_, + spatial_merge_size_}).permute({0, 2, 1, 3}).flatten(); + auto wpos_ids = torch::arange(w, options).unsqueeze(0).expand({h, -1}); + wpos_ids = wpos_ids.reshape({h / spatial_merge_size_, + spatial_merge_size_, + w / spatial_merge_size_, + spatial_merge_size_}).permute({0, 2, 1, 3}).flatten(); + pos_ids_vec.push_back(torch::stack({hpos_ids, wpos_ids}, -1).repeat({t, 1})); + } + auto pos_ids = torch::cat(pos_ids_vec, 0); + auto max_grid_size = grid_thw.index({torch::indexing::Slice(), + torch::indexing::Slice(1, torch::indexing::None)}).max(); + auto rotary_pos_emb_full = rotary_pos_emb_(max_grid_size.item()); + auto rotary_pos_emb = rotary_pos_emb_full.index({pos_ids}).flatten(1); + + return std::make_tuple(rotary_pos_emb, pos_ids); + } + + torch::Tensor forward( + torch::Tensor hidden_states, + torch::Tensor grid_thw, + const ModelInputParams& input_params) { + hidden_states = patch_embed_(hidden_states); + // at_npu::native::custom_ops::npu_rms_norm() + hidden_states = post_conv_layernorm_(hidden_states); + // hidden_states = at_npu::native::custom_ops::npu_rms_norm(hidden_states); + + auto [rotary_pos_emb, image_type_ids] = rot_pos_emb(grid_thw); + auto emb = torch::cat({rotary_pos_emb, rotary_pos_emb}, -1); + auto m_cos = emb.cos(); + auto m_sin = emb.sin(); + + auto device = grid_thw.device(); + auto grid_t = grid_thw.index_select(1, torch::tensor({0}, torch::TensorOptions().dtype(torch::kLong).device(device))); + auto grid_h = grid_thw.index_select(1, torch::tensor({1}, torch::TensorOptions().dtype(torch::kLong).device(device))); + auto grid_w = grid_thw.index_select(1, torch::tensor({2}, torch::TensorOptions().dtype(torch::kLong).device(device))); + auto h_times_w = (grid_h * grid_w).squeeze(1); + auto repeats = grid_t.squeeze(1); + auto repeated = torch::repeat_interleave(h_times_w, repeats, 0); + c10::optional cumsum_dtype; + // if (torch::jit::is_tracing()) { + // cumsum_dtype = grid_thw.scalar_type(); + // } else { + cumsum_dtype = 
torch::kInt32; + auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype); + namespace F = torch::nn::functional; + cu_seqlens = F::pad( + cu_seqlens, F::PadFuncOptions({1, 0}).mode(torch::kConstant).value(0)); + cu_seqlens = torch::diff(cu_seqlens); + torch::Tensor cu_seqlens_slice1 = cu_seqlens.narrow(0, 1, cu_seqlens.size(0) - 1); + torch::Tensor cu_seqlens_slice0 = cu_seqlens.narrow(0, 0, cu_seqlens.size(0) - 1); + torch::Tensor seqlens_tensor = cu_seqlens_slice1 - cu_seqlens_slice0; + std::vector seqlens; + seqlens.assign( + seqlens_tensor.cpu().to(torch::kLong).data_ptr(), + seqlens_tensor.cpu().to(torch::kLong).data_ptr() + seqlens_tensor.numel() + ); + + hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1)); + ModelInputParams& input_params_new = + const_cast(input_params); + torch::Tensor cu_seqlens_cpu = cu_seqlens.cpu(); + std::vector cu_seqlens_vec( + cu_seqlens_cpu.data_ptr(), // full seqlen vec + cu_seqlens_cpu.data_ptr() + cu_seqlens_cpu.numel()); + for (int idx = 0; idx < blocks_->size(); ++idx) { + hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); //TODO + } + hidden_states = post_layernorm_(hidden_states); + hidden_states = hidden_states.view({-1, spatial_merge_size_, spatial_merge_size_, hidden_states.size(-1)}); + // TO down sample merge op + hidden_states = hidden_states.permute({0, 3, 1, 2}); + hidden_states = downsample_(hidden_states).view({-1, out_hidden_size_}); + hidden_states = merger_(hidden_states); + return hidden_states; + }; + + void load_state_dict(const StateDict& state_dict) { + patch_embed_->load_state_dict( + state_dict.get_dict_with_prefix("patch_embed.")); + embeddings_->load_state_dict(state_dict.get_dict_with_prefix("embeddings.")); + const auto& norm_weight = state_dict.get_dict_with_prefix("post_conv_layernorm.").get_tensor("weight"); + if (norm_weight.defined()) { + CHECK_EQ(post_conv_layernorm_->weight.sizes(), norm_weight.sizes()) + << "weight size mismatch for " << name(); + post_conv_layernorm_->weight.data().copy_(norm_weight); + is_post_conv_layernorm_weight_loaded = true; + } + for (int idx = 0; idx < layers_.size(); ++idx) { + layers_[idx]->load_state_dict(state_dict.get_dict_with_prefix( + "blocks." 
+ std::to_string(idx) + ".")); + } + + const auto& post_norm_weight = state_dict.get_dict_with_prefix("post_layernorm.").get_tensor("weight"); + if (post_norm_weight.defined()) { + CHECK_EQ(post_layernorm_->weight.sizes(), post_norm_weight.sizes()) + << "weight size mismatch for " << name(); + post_layernorm_->weight.data().copy_(post_norm_weight); + is_post_layernorm_weight_loaded = true; + } + const auto& downsample_dict = state_dict.get_dict_with_prefix("downsample."); + const auto& downsample_weight = downsample_dict.get_tensor("weight"); + const auto& downsample_bias = downsample_dict.get_tensor("bias"); + if (downsample_weight.defined()) { + downsample_->weight.data().copy_(downsample_weight); + is_downsample_weight_loaded_ = true; + } + if (downsample_bias.defined()) { + downsample_->bias.data().copy_(downsample_bias); + is_downsample_bias_loaded_ = true; + } + merger_->load_state_dict(state_dict.get_dict_with_prefix("merger.")); + } + + void verify_loaded_weights(const std::string& prefix) const { + patch_embed_->verify_loaded_weights(prefix + "patch_embed."); + embeddings_->verify_loaded_weights(prefix + "embeddings."); + CHECK(is_post_conv_layernorm_weight_loaded) + << "weight is not loaded for " << prefix + "post_conv_layernorm.weight"; + for (int idx = 0; idx < blocks_->size(); ++idx) { + layers_[idx]->verify_loaded_weights(prefix + "blocks." + + std::to_string(idx) + "."); + } + CHECK(is_post_layernorm_weight_loaded) + << "weight is not loaded for " << prefix + "post_layernorm.weight"; + merger_->verify_loaded_weights(prefix + "merger."); + + CHECK(is_downsample_weight_loaded_) + << "weight is not loaded for " << prefix + "downsample.weight"; + CHECK(is_downsample_bias_loaded_) + << "bias is not loaded for " << prefix + "downsample.bias"; + } + + void merge_loaded_weights() { + for (int idx = 0; idx < layers_.size(); ++idx) { + layers_[idx]->merge_loaded_weights(); + } + } + private: + int hidden_size_ = 0; + int out_hidden_size_ = 0; + int spatial_merge_size_ = 0; + + Glm4VisionPatchEmbed patch_embed_{nullptr}; + Glm4VisionRotaryEmbedding rotary_pos_emb_{nullptr}; + torch::nn::ModuleList blocks_{nullptr}; + Glm4vVisionEmbeddings embeddings_{nullptr}; + Glm4VisionRmsNorm post_conv_layernorm_{nullptr}; + Glm4VisionRmsNorm post_layernorm_{nullptr}; + torch::nn::Conv2d downsample_{nullptr}; + std::vector layers_; + Glm4_VisionPatchMerger merger_{nullptr}; + torch::TensorOptions options_; + bool is_post_conv_layernorm_weight_loaded = false; + bool is_post_layernorm_weight_loaded = false; + bool is_downsample_weight_loaded_ = false; + bool is_downsample_bias_loaded_ = false; +}; +TORCH_MODULE(Glm4VisionTransformer); + +struct Glm4VImageInputs { + torch::Tensor pixel_values; + torch::Tensor image_grid_thw; +}; + +struct Glm4VVideoInputs { + torch::Tensor pixel_values_videos; + torch::Tensor video_grid_thw; + torch::Tensor second_per_grid_ts; +}; + +class Glm4vForConditionalGenerationImpl : public torch::nn::Module { + public: + Glm4vForConditionalGenerationImpl(const ModelContext& context) + : model_args_(context.get_model_args()), + options_(context.get_tensor_options()) { + visual_ = register_module("visual", Glm4VisionTransformer(context)); + + language_model_ = + register_module("language_model", Glm4ForCausalLM(context)); + } + + torch::Tensor get_input_embeddings( + torch::Tensor input_ids, + const std::optional& image_input, + const std::optional& video_input, + const ModelInputParams& input_params) { + auto inputs_embeds = language_model_->get_input_embeddings(input_ids); + 
if (image_input) { + // visual + auto image_embeds = + visual_(image_input->pixel_values.to(options_), + image_input->image_grid_thw, + input_params); + // merge + auto is_multimodal = torch::isin(input_ids, + model_args_.image_token_id()); input_params.visual_pos_masks = + is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); + } + return inputs_embeds; + } + + torch::Tensor forward(const torch::Tensor& tokens, + const torch::Tensor& positions, + std::vector& kv_caches, + const ModelInputParams& input_params) { + torch::NoGradGuard no_grad; + const auto& mm_data = input_params.mm_data; + torch::Tensor pixel_values; + if (const auto& res = mm_data.get("pixel_values")) + pixel_values = res.value(); + + torch::Tensor image_grid_thw; + if (const auto& res = mm_data.get("image_grid_thw")) + image_grid_thw = res.value(); + std::optional image_inputs; + std::optional video_inputs; + + if (pixel_values.defined() && image_grid_thw.defined()) + image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; + + auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params); + input_params.input_embedding = inputs_embeds; + auto emb = language_model_(tokens, positions, kv_caches, input_params); + + return emb; + } + + torch::Tensor logits(const torch::Tensor& hidden_states, + const torch::Tensor& seleted_idxes) { + return language_model_->logits(hidden_states, seleted_idxes); + } + + void load_model(std::unique_ptr loader) { + for (const auto& state_dict : loader->get_state_dicts()) { + visual_->load_state_dict( + state_dict->get_dict_with_prefix("model.visual.")); + } + // verify + visual_->verify_loaded_weights("model.visual."); + visual_->merge_loaded_weights(); + if (!model_args_.image_embedding_mode()) { + language_model_->load_model(std::move(loader), "model.language_model."); + } + } + + layer::LmHead get_lm_head() { return language_model_->get_lm_head(); } + void set_lm_head(layer::LmHead& head) { language_model_->set_lm_head(head); } + + layer::WordEmbedding get_word_embedding() { + return language_model_->get_word_embedding(); + } + + void set_word_embedding(layer::WordEmbedding& word_embedding) { + language_model_->set_word_embedding(word_embedding); + } + + private: + ModelArgs model_args_; + torch::TensorOptions options_; + Glm4VisionTransformer visual_{nullptr}; + Glm4ForCausalLM language_model_{nullptr}; +}; +TORCH_MODULE(Glm4vForConditionalGeneration); + +REGISTER_INPUT_PROCESSOR(glm4v, GLM4_6_VLInputProcessor); +REGISTER_CAUSAL_VLM_MODEL(glm4v, Glm4vForConditionalGeneration); +REGISTER_IMAGE_PROCESSOR(glm4v, Glm4VImageProcessor); +// register the model args +REGISTER_MODEL_ARGS(glm4v, [&] { + LOAD_ARG_OR(model_type, "model_type", "glm4v"); + LOAD_ARG_OR(image_start_token_id, "image_start_token_id", 151339); + LOAD_ARG_OR(image_end_token_id, "image_end_token_id", 151340); + LOAD_ARG_OR(video_start_token_id, "video_start_token_id", 151341); + LOAD_ARG_OR(video_end_token_id, "video_end_token_id", 151342); + LOAD_ARG_OR(image_token_id, "image_token_id", 151363); + LOAD_ARG_OR(video_token_id, "video_token_id", 151364); + LOAD_ARG_OR(tie_word_embeddings, "tie_word_embeddings", false); + + // text config + LOAD_ARG_OR(vocab_size, "text_config.vocab_size", 151552); + // LOAD_ARG_OR(pad_token_id, "text_config.pad_token_id", 151329); + LOAD_ARG_OR( + eos_token_id_vec, "text_config.eos_token_id", std::vector{151329}); + LOAD_ARG_OR_FUNC(head_dim, "text_config.head_dim", [&] { + return args->hidden_size() / args->n_heads(); + }); + 
LOAD_ARG_OR(attention_bias, "text_config.attention_bias", true); + LOAD_ARG_OR(attention_dropout, "text_config.attention_dropout", 0.0f); + LOAD_ARG_OR(first_k_dense_replace, "text_config.first_k_dense_replace", 1); + LOAD_ARG_OR(hidden_act, "text_config.hidden_act", "silu"); + LOAD_ARG_OR(hidden_size, "text_config.hidden_size", 4096); + LOAD_ARG_OR(initializer_range, "text_config.initializer_range", 0.02); + LOAD_ARG_OR(intermediate_size, "text_config.intermediate_size", 10944); + LOAD_ARG_OR( + max_position_embeddings, "text_config.max_position_embeddings", 131072); + LOAD_ARG_OR(n_heads, "text_config.num_attention_heads", 96); + LOAD_ARG_OR(num_experts_per_tok, "text_config.num_experts_per_tok", 8); + LOAD_ARG_OR(n_layers, "text_config.num_hidden_layers", 46); + LOAD_ARG_OR(n_kv_heads, "text_config.num_key_value_heads", 8); + // LOAD_ARG_OR(partial_rotary_factor, "text_config.partial_rotary_factor", + // 0.5); + LOAD_ARG_OR(rms_norm_eps, "text_config.rms_norm_eps", 1e-05); + LOAD_ARG_OR(dtype, "text_config.dtype", "bfloat16"); + LOAD_ARG_OR(rope_scaling_rope_type, "text_config.rope_scaling.type", "mrope"); + LOAD_ARG(rope_scaling_mrope_section, + "text_config.rope_scaling.mrope_section"); + LOAD_ARG_OR(rope_theta, "text_config.rope_theta", 500000.0f); + LOAD_ARG_OR(routed_scaling_factor, "text_config.routed_scaling_factor", 1.0); + LOAD_ARG_OR(topk_group, "text_config.topk_group", 1); + // LOAD_ARG_OR(use_cache, "text_config.use_cache", true); + LOAD_ARG_OR(use_qk_norm, "text_config.use_qk_norm", false); + + // vision config + // LOAD_ARG_OR(mm_attention_bias, "vision_config.attention_bias", false); + // LOAD_ARG_OR(mm_attention_dropout, "vision_config.attention_dropout", 0.0f); + LOAD_ARG_OR(mm_num_hidden_layers, "vision_config.depth", 24); + LOAD_ARG_OR(mm_hidden_act, "vision_config.hidden_act", "silu"); + LOAD_ARG_OR(mm_hidden_size, "vision_config.hidden_size", 1536); + LOAD_ARG_OR(mm_image_size, "vision_config.image_size", 336); + LOAD_ARG_OR(mm_num_channels, "vision_config.in_channels", 3); + LOAD_ARG_OR(mm_initializer_range, "vision_config.initializer_range", 0.02); + LOAD_ARG_OR(mm_intermediate_size, "vision_config.intermediate_size", 10944); + LOAD_ARG_OR(mm_num_attention_heads, "vision_config.num_heads", 12); + LOAD_ARG_OR(mm_projection_dim, "vision_config.out_hidden_size", 4096); + LOAD_ARG_OR(mm_patch_size, "vision_config.patch_size", 14); + // LOAD_ARG_OR(mm_rms_norm_eps, "vision_config.rms_norm_eps", 1e-05); + LOAD_ARG_OR(mm_spatial_merge_size, "vision_config.spatial_merge_size", 2); + LOAD_ARG_OR(mm_temporal_patch_size, "vision_config.temporal_patch_size", 2); + LOAD_ARG_OR_FUNC(mm_head_dim, "head_dim", [&] { + return args->mm_hidden_size() / args->mm_num_attention_heads(); + }); + + SET_ARG(stop_token_ids, + std::unordered_set(args->eos_token_id_vec().begin(), + args->eos_token_id_vec().end())); +}); +} // namespace xllm \ No newline at end of file diff --git a/xllm/models/vlm/glm4v_moe.h b/xllm/models/vlm/glm4v_moe.h index 4452abdce..bce4a80d6 100644 --- a/xllm/models/vlm/glm4v_moe.h +++ b/xllm/models/vlm/glm4v_moe.h @@ -31,7 +31,9 @@ limitations under the License. 
#include "models/llm/glm4_moe.h" #include "models/model_registry.h" #include "processors/input_processor.h" +#include "processors/glm4v_image_processor.h" #include "xllm_kernels/core/include/atb_speed/log.h" +#include "models/vlm/glm4v.h" namespace xllm { @@ -40,7 +42,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { Glm4vMoeForConditionalGenerationImpl(const ModelContext& context) : model_args_(context.get_model_args()), options_(context.get_tensor_options()) { - // visual_ = register_module("visual", Glm4VisionTransformer(context)); + visual_ = register_module("visual", Glm4VisionTransformer(context)); language_model_ = register_module("language_model", Glm4MoeForCausalLM(context)); @@ -48,22 +50,21 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { torch::Tensor get_input_embeddings( torch::Tensor input_ids, - // const std::optional& image_input, - // const std::optional& video_input, + const std::optional& image_input, + const std::optional& video_input, const ModelInputParams& input_params) { auto inputs_embeds = language_model_->get_input_embeddings(input_ids); - // if (image_input) { - // // visual - // auto [image_embeds, deep_stacks] = - // visual_(image_input->pixel_values.to(options_), - // image_input->image_grid_thw, - // input_params); - // input_params.deep_stacks = deep_stacks; - // // merge - // auto is_multimodal = torch::isin(input_ids, - // model_args_.image_token_id()); input_params.visual_pos_masks = - // is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); - // } + if (image_input) { + // visual + auto image_embeds = + visual_(image_input->pixel_values.to(options_), + image_input->image_grid_thw, + input_params); + // merge + auto is_multimodal = torch::isin(input_ids, + model_args_.image_token_id()); input_params.visual_pos_masks = + is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); + } return inputs_embeds; } @@ -72,21 +73,21 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { std::vector& kv_caches, const ModelInputParams& input_params) { torch::NoGradGuard no_grad; - // const auto& mm_data = input_params.mm_data; - // torch::Tensor pixel_values; - // if (const auto& res = mm_data.get("pixel_values")) - // pixel_values = res.value(); + const auto& mm_data = input_params.mm_data; + torch::Tensor pixel_values; + if (const auto& res = mm_data.get("pixel_values")) + pixel_values = res.value(); - // torch::Tensor image_grid_thw; - // if (const auto& res = mm_data.get("image_grid_thw")) - // image_grid_thw = res.value(); - // std::optional image_inputs; - // std::optional video_inputs; + torch::Tensor image_grid_thw; + if (const auto& res = mm_data.get("image_grid_thw")) + image_grid_thw = res.value(); + std::optional image_inputs; + std::optional video_inputs; - // if (pixel_values.defined() && image_grid_thw.defined()) - // image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; + if (pixel_values.defined() && image_grid_thw.defined()) + image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; - auto inputs_embeds = get_input_embeddings(tokens, input_params); + auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params); input_params.input_embedding = inputs_embeds; auto emb = language_model_(tokens, positions, kv_caches, input_params); @@ -99,13 +100,13 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { } void load_model(std::unique_ptr loader) { - // for (const auto& state_dict : 
loader->get_state_dicts()) { - // visual_->load_state_dict( - // state_dict->get_dict_with_prefix("model.visual.")); - // } - // // verify - // visual_->verify_loaded_weights("model.visual."); - // visual_->merge_loaded_weights(); + for (const auto& state_dict : loader->get_state_dicts()) { + visual_->load_state_dict( + state_dict->get_dict_with_prefix("model.visual.")); + } + // verify + visual_->verify_loaded_weights("model.visual."); + visual_->merge_loaded_weights(); if (!model_args_.image_embedding_mode()) { language_model_->load_model(std::move(loader), "model.language_model."); } @@ -125,14 +126,14 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { private: ModelArgs model_args_; torch::TensorOptions options_; - // Glm4VisionTransformer visual_{nullptr}; + Glm4VisionTransformer visual_{nullptr}; Glm4MoeForCausalLM language_model_{nullptr}; }; TORCH_MODULE(Glm4vMoeForConditionalGeneration); -// REGISTER_INPUT_PROCESSOR(glm4v_moe, GLM4VInputProcessor); +REGISTER_INPUT_PROCESSOR(glm4v_moe, GLM4_6_VLInputProcessor); REGISTER_CAUSAL_VLM_MODEL(glm4v_moe, Glm4vMoeForConditionalGeneration); -// REGISTER_IMAGE_PROCESSOR(glm4v_moe, Glm4vImageProcessor); +REGISTER_IMAGE_PROCESSOR(glm4v_moe, Glm4VImageProcessor); // register the model args REGISTER_MODEL_ARGS(glm4v_moe, [&] { LOAD_ARG_OR(model_type, "model_type", "glm4v_moe"); @@ -196,7 +197,7 @@ REGISTER_MODEL_ARGS(glm4v_moe, [&] { LOAD_ARG_OR(mm_num_attention_heads, "vision_config.num_heads", 12); LOAD_ARG_OR(mm_projection_dim, "vision_config.out_hidden_size", 4096); LOAD_ARG_OR(mm_patch_size, "vision_config.patch_size", 14); - // LOAD_ARG_OR(mm_rms_norm_eps, "text_config.rms_norm_eps", 1e-05); + // LOAD_ARG_OR(mm_rms_norm_eps, "vision_config.rms_norm_eps", 1e-05); LOAD_ARG_OR(mm_spatial_merge_size, "vision_config.spatial_merge_size", 2); LOAD_ARG_OR(mm_temporal_patch_size, "vision_config.temporal_patch_size", 2); LOAD_ARG_OR_FUNC(mm_head_dim, "head_dim", [&] { @@ -207,4 +208,4 @@ REGISTER_MODEL_ARGS(glm4v_moe, [&] { std::unordered_set(args->eos_token_id_vec().begin(), args->eos_token_id_vec().end())); }); -} // namespace xllm +} // namespace xllm \ No newline at end of file From e00e8102b00147debc0ba98c1381528b23befd48 Mon Sep 17 00:00:00 2001 From: "ext.xingsilan1" Date: Wed, 3 Dec 2025 17:24:11 +0800 Subject: [PATCH 12/20] fix: add gridsample op --- xllm/models/vlm/glm4v.h | 15 +++++++-------- xllm/models/vlm/glm4v_moe.h | 5 ++++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index ce8693424..522321c55 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -425,13 +425,8 @@ class Glm4vVisionEmbeddingsImpl : public torch::nn::Module { namespace F = torch::nn::functional; auto interpolated_embed = F::grid_sample( pos_embed_2d, - grid); - // namespace F = torch::nn::functional; - // auto interpolated_embed = F::grid_sample( - // pos_embed_2d, - // grid, - // F::GridSampleFuncOptions().mode(torch::kBicubic).padding_mode(torch::kBorder).align_corners(false)); - // TODO + grid, + F::GridSampleFuncOptions().mode(torch::kBicubic).padding_mode(torch::kBorder).align_corners(false)); adapted_pos_embed = interpolated_embed .squeeze(0) .squeeze(-1) @@ -651,6 +646,7 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { torch::Tensor hidden_states, torch::Tensor grid_thw, const ModelInputParams& input_params) { + LOG(INFO) << " Glm4VisionTransformerImpl forward beging "; hidden_states = patch_embed_(hidden_states); // 
at_npu::native::custom_ops::npu_rms_norm() hidden_states = post_conv_layernorm_(hidden_states); @@ -686,7 +682,7 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { seqlens_tensor.cpu().to(torch::kLong).data_ptr(), seqlens_tensor.cpu().to(torch::kLong).data_ptr() + seqlens_tensor.numel() ); - + LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before "; hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1)); ModelInputParams& input_params_new = const_cast(input_params); @@ -697,12 +693,15 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { for (int idx = 0; idx < blocks_->size(); ++idx) { hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); //TODO } + LOG(INFO) << " Glm4VisionTransformerImpl forward layer after "; hidden_states = post_layernorm_(hidden_states); hidden_states = hidden_states.view({-1, spatial_merge_size_, spatial_merge_size_, hidden_states.size(-1)}); // TO down sample merge op hidden_states = hidden_states.permute({0, 3, 1, 2}); hidden_states = downsample_(hidden_states).view({-1, out_hidden_size_}); + LOG(INFO) << " Glm4VisionTransformerImpl downsample after"; hidden_states = merger_(hidden_states); + LOG(INFO) << " Glm4VisionTransformerImpl forward end"; return hidden_states; }; diff --git a/xllm/models/vlm/glm4v_moe.h b/xllm/models/vlm/glm4v_moe.h index bce4a80d6..002df8397 100644 --- a/xllm/models/vlm/glm4v_moe.h +++ b/xllm/models/vlm/glm4v_moe.h @@ -72,6 +72,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { const torch::Tensor& positions, std::vector& kv_caches, const ModelInputParams& input_params) { + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl beging "; torch::NoGradGuard no_grad; const auto& mm_data = input_params.mm_data; torch::Tensor pixel_values; @@ -83,10 +84,12 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { image_grid_thw = res.value(); std::optional image_inputs; std::optional video_inputs; - + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward Glm4VImageInputs beging "; if (pixel_values.defined() && image_grid_thw.defined()) image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings beging "; + auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params); input_params.input_embedding = inputs_embeds; auto emb = language_model_(tokens, positions, kv_caches, input_params); From d5dd564bc420bbb230c7a84c5ed14edbc851cac8 Mon Sep 17 00:00:00 2001 From: "wangyunlong.115" Date: Tue, 2 Dec 2025 14:11:07 +0800 Subject: [PATCH 13/20] feat: support Glm46 mpositions. 
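This adds MPositionHelper::get_positions_glm, which splits the prompt into text / image / video runs (via groupByTokenType) and emits 3-row (t, h, w) M-RoPE position ids per run. The snippet below is a minimal sketch of how a single image run expands into its [3, t*h*w] grid after spatial merging; image_block_positions is a hypothetical standalone helper written only for this illustration, not code added by the patch.

    #include <torch/torch.h>

    // Illustration only: positions for one image run, mirroring the image
    // branch of get_positions_glm. `t`, `h`, `w` come from image_grid_thw
    // (patch units), `merge` is mm_spatial_merge_size, `start` is the running
    // position offset of the preceding runs.
    torch::Tensor image_block_positions(int64_t t, int64_t h, int64_t w,
                                        int64_t merge, int64_t start) {
      h /= merge;
      w /= merge;
      auto opts = torch::dtype(torch::kInt32);
      auto t_idx =
          torch::arange(t, opts).view({-1, 1}).expand({-1, h * w}).flatten();
      auto h_idx =
          torch::arange(h, opts).view({1, -1, 1}).expand({t, -1, w}).flatten();
      auto w_idx =
          torch::arange(w, opts).view({1, 1, -1}).expand({t, h, -1}).flatten();
      return torch::stack({t_idx, h_idx, w_idx}) + start;  // shape [3, t*h*w]
    }
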
--- vcpkg.json | 2 +- xllm/core/framework/batch/mposition.cpp | 142 +++++++++++++++++++++++- xllm/core/framework/batch/mposition.h | 4 + 3 files changed, 144 insertions(+), 4 deletions(-) mode change 100644 => 100755 vcpkg.json diff --git a/vcpkg.json b/vcpkg.json old mode 100644 new mode 100755 index bc61fca13..a64c9030b --- a/vcpkg.json +++ b/vcpkg.json @@ -101,7 +101,7 @@ "name": "opencv4", "version>=": "4.7.0", "default-features": false, - "features": ["ffmpeg", "jpeg", "png","tiff","webp","openexr","quirc"] + "features": ["ffmpeg", "jpeg", "png"] }, { "name": "yaml-cpp", diff --git a/xllm/core/framework/batch/mposition.cpp b/xllm/core/framework/batch/mposition.cpp index d4421aad5..ef4f0ee64 100755 --- a/xllm/core/framework/batch/mposition.cpp +++ b/xllm/core/framework/batch/mposition.cpp @@ -15,10 +15,34 @@ limitations under the License. #include "mposition.h" +#include + #include "framework/model/model_args.h" #include "framework/request/sequence.h" + namespace xllm { +namespace { +std::vector> groupByTokenType( + const std::vector& token_types) { + std::vector> groups; + if (token_types.empty()) return groups; + + std::string current_key = token_types[0]; + int start = 0; + + for (int i = 1; i < token_types.size(); ++i) { + if (token_types[i] != current_key) { + groups.emplace_back(current_key, start, i); + current_key = token_types[i]; + start = i; + } + } + groups.emplace_back(current_key, start, static_cast(token_types.size())); + return groups; +} +} // namespace + torch::Tensor MPositionHelper::get_positions() { // if (seq_.is_chunked_prefill_stage()) { if (seq_.kv_state().kv_cache_tokens_num() < seq_.num_prompt_tokens()) { @@ -35,16 +59,128 @@ torch::Tensor MPositionHelper::get_positions() { torch::Tensor second_per_grid_ts; if (auto res = mm_data.get("second_per_grid_ts")) second_per_grid_ts = res.value(); - auto res = - get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts); + std::tuple res; + if (!absl::StartsWith(args_.model_type(), "glm4v")) { + res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts); + } else { + res = get_positions_glm(image_grid_thw, video_grid_thw); + } seq_.set_mrope_position_delta(std::get<1>(res)); - return std::get<0>(res); } else { return get_positions_d(); } } +std::tuple MPositionHelper::get_positions_glm( + torch::Tensor image_grid_thw, + torch::Tensor video_grid_thw) { + auto input_tokens = seq_.tokens(); + auto spatial_merge_size = args_.mm_spatial_merge_size(); + auto image_token_id = args_.image_token_id(); + auto video_token_id = args_.video_token_id(); + auto video_start_token_id = args_.video_start_token_id(); + auto video_end_token_id = args_.video_end_token_id(); + + auto dtype = torch::kInt32; + + std::vector input_token_type; + bool in_video = false; + int num_tokens = input_tokens.size(); + + for (int index = 0; index < num_tokens; ++index) { + auto token = input_tokens[index]; + if (token == video_start_token_id) { + in_video = true; + } else if (token == video_end_token_id) { + in_video = false; + } + + if (token == image_token_id && !in_video) { + input_token_type.push_back("image"); + } else if (token == image_token_id && in_video) { + input_token_type.push_back("video"); + } else { + input_token_type.push_back("text"); + } + } + auto input_type_group = groupByTokenType(input_token_type); + int image_index = 0; + int video_index = 0; + int video_group_index = 0; + + std::vector llm_pos_ids_list; + int video_frame_num = 1; + for (const auto& group : input_type_group) { + const auto& modality_type 
= std::get<0>(group); + int start_idx = std::get<1>(group); + int end_idx = std::get<2>(group); + int st_idx = 0; + if (!llm_pos_ids_list.empty()) { + st_idx = llm_pos_ids_list.back().max().item() + 1; + } + + if (modality_type == "image") { + auto grid = image_grid_thw[image_index]; + int t = grid[0].item(); + int h = grid[1].item() / spatial_merge_size; + int w = grid[2].item() / spatial_merge_size; + + auto t_arange = + torch::arange(t, dtype).view({-1, 1}).expand({-1, h * w}).flatten(); + auto h_arange = + torch::arange(h, dtype).view({1, -1, 1}).expand({t, -1, w}).flatten(); + auto w_arange = + torch::arange(w, dtype).view({1, 1, -1}).expand({t, h, -1}).flatten(); + + auto pos = torch::stack({t_arange, h_arange, w_arange}) + st_idx; + llm_pos_ids_list.push_back(pos); + video_frame_num = 1; + image_index++; + } else if (modality_type == "video") { + int t = video_frame_num; + int h = video_grid_thw[video_index][1].item() / spatial_merge_size; + int w = video_grid_thw[video_index][2].item() / spatial_merge_size; + + for (int t_idx = 0; t_idx < t; ++t_idx) { + auto t_tensor = torch::full({1, h * w}, t_idx, dtype).flatten(); + auto h_tensor = torch::arange(h, dtype) + .view({1, -1, 1}) + .expand({1, -1, w}) + .flatten(); + auto w_tensor = torch::arange(w, dtype) + .view({1, 1, -1}) + .expand({1, h, -1}) + .flatten(); + + auto pos = torch::stack({t_tensor, h_tensor, w_tensor}) + st_idx; + llm_pos_ids_list.push_back(pos); + } + + video_group_index++; + if (video_group_index >= video_grid_thw[video_index][0].item()) { + video_index++; + video_group_index = 0; + } + video_frame_num++; + } else { // text + int text_len = end_idx - start_idx; + auto arange = + torch::arange(text_len, dtype).view({1, -1}).expand({3, -1}) + st_idx; + llm_pos_ids_list.push_back(arange); + video_frame_num = 1; + } + } + + torch::Tensor llm_positions = + torch::cat(llm_pos_ids_list, /*dim=*/1).reshape({3, -1}); + llm_positions = llm_positions; + int mrope_position_delta = + (llm_positions.max().item() + 1 - input_tokens.size()); + + return std::make_pair(llm_positions, mrope_position_delta); +} + std::tuple MPositionHelper::get_positions_p( torch::Tensor image_grid_thw, torch::Tensor video_grid_thw, diff --git a/xllm/core/framework/batch/mposition.h b/xllm/core/framework/batch/mposition.h index c4575526c..466660baa 100644 --- a/xllm/core/framework/batch/mposition.h +++ b/xllm/core/framework/batch/mposition.h @@ -37,6 +37,10 @@ class MPositionHelper { torch::Tensor image_grid_thw, torch::Tensor video_grid_thw, torch::Tensor second_per_grid_ts); + std::tuple get_positions_glm( + torch::Tensor image_grid_thw, + torch::Tensor video_grid_thw); + torch::Tensor get_positions_d(); private: From dc7633edf1faa776b5b4e12519e501ab9afb88e2 Mon Sep 17 00:00:00 2001 From: jindonghe1 Date: Thu, 4 Dec 2025 15:32:52 +0800 Subject: [PATCH 14/20] feat: support glm4v position embedding. 
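Besides moving the head_dim derivation after num_attention_heads, this reshapes the stacked rotary embedding to [seq_len, dim] and splits cos/sin into (dim/2, 2) pairs for the GLM4V text stack. For reference, the sketch below contrasts the concatenated and interleaved rotary table layouts; rotary_tables and its signature are invented for the illustration and are not code from this patch.

    #include <torch/torch.h>
    #include <utility>

    // Sketch: concatenated vs. interleaved rotary tables. With interleaved=true
    // the frequencies are ordered (f0, f0, f1, f1, ...), which is what the
    // stack(...).reshape({seq_len, dim}) path in compute_rotary_embedding yields.
    std::pair<torch::Tensor, torch::Tensor> rotary_tables(int64_t dim,
                                                          int64_t seq_len,
                                                          double theta,
                                                          bool interleaved) {
      auto inv_freq = torch::pow(theta,
                                 torch::arange(0, dim, 2, torch::kFloat32) / dim)
                          .reciprocal();
      auto pos = torch::arange(seq_len, torch::kFloat32);
      auto freqs = torch::outer(pos, inv_freq);  // [seq_len, dim / 2]
      auto emb = interleaved
                     ? torch::stack({freqs, freqs}, -1).reshape({seq_len, dim})
                     : torch::cat({freqs, freqs}, -1);
      return {emb.cos(), emb.sin()};
    }
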
--- vcpkg.json | 2 +- xllm/models/llm/glm4.h | 6 +++--- xllm/models/llm/llm_model_base.h | 1 + xllm/models/vlm/glm4v.h | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vcpkg.json b/vcpkg.json index a64c9030b..bc61fca13 100755 --- a/vcpkg.json +++ b/vcpkg.json @@ -101,7 +101,7 @@ "name": "opencv4", "version>=": "4.7.0", "default-features": false, - "features": ["ffmpeg", "jpeg", "png"] + "features": ["ffmpeg", "jpeg", "png","tiff","webp","openexr","quirc"] }, { "name": "yaml-cpp", diff --git a/xllm/models/llm/glm4.h b/xllm/models/llm/glm4.h index db657dac5..dbaeac860 100644 --- a/xllm/models/llm/glm4.h +++ b/xllm/models/llm/glm4.h @@ -85,7 +85,6 @@ class Glm4ModelImpl : public LlmModelImplBase { } else { h = embed_tokens_(tokens, 0); } - auto target_cos_sin = atb_pos_emb_(cos_sin_, positions, 0); auto target_cos_sin_chunks = target_cos_sin.chunk(/*chunks=*/2, /*dim=*/-1); auto cos_pos = target_cos_sin_chunks[0].contiguous(); @@ -98,7 +97,7 @@ class Glm4ModelImpl : public LlmModelImplBase { for (int dim_idx = 1; dim_idx <= 2; ++dim_idx) { int64_t offset = dim_idx; int64_t section_len = mrope_section_[dim_idx]; - int64_t length = section_len * 3; + int64_t length = section_len * 2; auto idx_first_half = torch::arange(offset, length, 3, torch::kLong); auto idx_second_half = torch::arange(offset, length, 3, torch::kLong); auto idx_tensor = @@ -114,7 +113,8 @@ class Glm4ModelImpl : public LlmModelImplBase { sin_pos = apply(sin_pos.reshape( {positions.sizes().front(), -1, sin_pos.sizes().back()})); } - + cos_pos = cos_pos.reshape({-1, cos_pos.sizes().back() /2, 2}); + sin_pos = sin_pos.reshape({-1, sin_pos.sizes().back() /2, 2}); torch::Tensor attn_mask; if (FLAGS_enable_chunked_prefill) { int max_kv_seq = input_params.kv_max_seq_len; diff --git a/xllm/models/llm/llm_model_base.h b/xllm/models/llm/llm_model_base.h index 25b0b0000..6c380cde0 100644 --- a/xllm/models/llm/llm_model_base.h +++ b/xllm/models/llm/llm_model_base.h @@ -68,6 +68,7 @@ torch::Tensor compute_rotary_embedding(int64_t dim, emb = torch::cat({freqs, freqs}, -1); } else { emb = torch::stack({freqs, freqs}, -1); + emb = emb.reshape({seq_len, dim}); } auto rope_cos = torch::cos(emb); auto rope_sin = torch::sin(emb); diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index 522321c55..afc418a8c 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -912,9 +912,6 @@ REGISTER_MODEL_ARGS(glm4v, [&] { // LOAD_ARG_OR(pad_token_id, "text_config.pad_token_id", 151329); LOAD_ARG_OR( eos_token_id_vec, "text_config.eos_token_id", std::vector{151329}); - LOAD_ARG_OR_FUNC(head_dim, "text_config.head_dim", [&] { - return args->hidden_size() / args->n_heads(); - }); LOAD_ARG_OR(attention_bias, "text_config.attention_bias", true); LOAD_ARG_OR(attention_dropout, "text_config.attention_dropout", 0.0f); LOAD_ARG_OR(first_k_dense_replace, "text_config.first_k_dense_replace", 1); @@ -925,6 +922,9 @@ REGISTER_MODEL_ARGS(glm4v, [&] { LOAD_ARG_OR( max_position_embeddings, "text_config.max_position_embeddings", 131072); LOAD_ARG_OR(n_heads, "text_config.num_attention_heads", 96); + LOAD_ARG_OR_FUNC(head_dim, "text_config.head_dim", [&] { + return args->hidden_size() / args->n_heads(); + }); LOAD_ARG_OR(num_experts_per_tok, "text_config.num_experts_per_tok", 8); LOAD_ARG_OR(n_layers, "text_config.num_hidden_layers", 46); LOAD_ARG_OR(n_kv_heads, "text_config.num_key_value_heads", 8); From 71add7e911225d061aa9420dba7b52f0abf81b9b Mon Sep 17 00:00:00 2001 From: "ext.xingsilan1" Date: Wed, 3 Dec 2025 19:43:35 
+0800 Subject: [PATCH 15/20] bug fix --- xllm/models/vlm/glm4v.h | 8 +++----- xllm/models/vlm/glm4v_moe.h | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index afc418a8c..353a35e7e 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -648,9 +648,9 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { const ModelInputParams& input_params) { LOG(INFO) << " Glm4VisionTransformerImpl forward beging "; hidden_states = patch_embed_(hidden_states); - // at_npu::native::custom_ops::npu_rms_norm() + LOG(INFO) << " Glm4VisionTransformerImpl patch_embed_ beging "; hidden_states = post_conv_layernorm_(hidden_states); - // hidden_states = at_npu::native::custom_ops::npu_rms_norm(hidden_states); + LOG(INFO) << " Glm4VisionTransformerImpl post_conv_layernorm_ beging "; auto [rotary_pos_emb, image_type_ids] = rot_pos_emb(grid_thw); auto emb = torch::cat({rotary_pos_emb, rotary_pos_emb}, -1); @@ -665,9 +665,7 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { auto repeats = grid_t.squeeze(1); auto repeated = torch::repeat_interleave(h_times_w, repeats, 0); c10::optional cumsum_dtype; - // if (torch::jit::is_tracing()) { - // cumsum_dtype = grid_thw.scalar_type(); - // } else { + cumsum_dtype = torch::kInt32; auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype); namespace F = torch::nn::functional; diff --git a/xllm/models/vlm/glm4v_moe.h b/xllm/models/vlm/glm4v_moe.h index 002df8397..5693e18e2 100644 --- a/xllm/models/vlm/glm4v_moe.h +++ b/xllm/models/vlm/glm4v_moe.h @@ -42,6 +42,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { Glm4vMoeForConditionalGenerationImpl(const ModelContext& context) : model_args_(context.get_model_args()), options_(context.get_tensor_options()) { + std::cout << "----------------Glm4vMoeForConditionalGenerationImpl init begin ----------------- " << std::endl; visual_ = register_module("visual", Glm4VisionTransformer(context)); language_model_ = @@ -53,18 +54,20 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { const std::optional& image_input, const std::optional& video_input, const ModelInputParams& input_params) { + // visual + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings visual_ begin "; + torch::Tensor pixel = image_input->pixel_values.to(options_); + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings pixel aft "; + auto image_embeds = + visual_(pixel, + image_input->image_grid_thw, + input_params); + LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings visual_ end "; auto inputs_embeds = language_model_->get_input_embeddings(input_ids); - if (image_input) { - // visual - auto image_embeds = - visual_(image_input->pixel_values.to(options_), - image_input->image_grid_thw, - input_params); - // merge - auto is_multimodal = torch::isin(input_ids, - model_args_.image_token_id()); input_params.visual_pos_masks = - is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); - } + // merge + auto is_multimodal = torch::isin(input_ids, + model_args_.image_token_id()); input_params.visual_pos_masks = + is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); return inputs_embeds; } @@ -72,6 +75,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { const torch::Tensor& positions, std::vector& kv_caches, const ModelInputParams& input_params) { + 
std::cout << "----------------Glm4vMoeForConditionalGenerationImpl beging ----------------- " << std::endl; LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl beging "; torch::NoGradGuard no_grad; const auto& mm_data = input_params.mm_data; @@ -87,6 +91,8 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward Glm4VImageInputs beging "; if (pixel_values.defined() && image_grid_thw.defined()) image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; + else + LOG(FATAL) << "Pixel value or image grid thw is null"; LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings beging "; From 7ee36cc0d2112b9a5b5c2b210ca2a5fd7231d3db Mon Sep 17 00:00:00 2001 From: "ext.xingsilan1" Date: Thu, 4 Dec 2025 09:06:12 +0800 Subject: [PATCH 16/20] fix: cos sin errror --- xllm/models/vlm/glm4v.h | 43 ++++++++++++++++++------------------- xllm/models/vlm/glm4v_moe.h | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index 353a35e7e..e422bc640 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -363,7 +363,7 @@ class Glm4vVisionEmbeddingsImpl : public torch::nn::Module { } torch::Tensor forward( torch::Tensor x, - std::vector lengths, + std::vector lengths, torch::Tensor image_shapes, torch::Tensor h_coords, torch::Tensor w_coords @@ -399,12 +399,12 @@ class Glm4vVisionEmbeddingsImpl : public torch::nn::Module { std::vector target_w_list; target_h_list.reserve(batch_size); target_w_list.reserve(batch_size); - + LOG(INFO) << " Glm4vVisionEmbeddingsImpl forward batch_size: " << batch_size << "image_shapes " << image_shapes; for (int64_t i = 0; i < batch_size; ++i) { const int64_t seq_len = lengths[i]; const auto img_h = image_shapes.index({i, 1}).to(torch::kFloat32); const auto img_w = image_shapes.index({i, 2}).to(torch::kFloat32); - + LOG(INFO) << " Glm4vVisionEmbeddingsImpl forward batch_size idx " << i; target_h_list.push_back(img_h.repeat({seq_len})); target_w_list.push_back(img_w.repeat({seq_len})); } @@ -417,16 +417,17 @@ class Glm4vVisionEmbeddingsImpl : public torch::nn::Module { const auto norm_w = ((w_coords_fp32 + 0.5f) / target_w) * 2.0f - 1.0f; const auto norm_h = ((h_coords_fp32 + 0.5f) / target_h) * 2.0f - 1.0f; - + LOG(INFO) << " Glm4vVisionEmbeddingsImpl stack"; auto grid = torch::stack({norm_w, norm_h}, -1) .unsqueeze(0) .unsqueeze(2); - + LOG(INFO) << " Glm4vVisionEmbeddingsImpl stack after"; namespace F = torch::nn::functional; auto interpolated_embed = F::grid_sample( pos_embed_2d, grid, F::GridSampleFuncOptions().mode(torch::kBicubic).padding_mode(torch::kBorder).align_corners(false)); + LOG(INFO) << " Glm4vVisionEmbeddingsImpl interpolated_embed"; adapted_pos_embed = interpolated_embed .squeeze(0) .squeeze(-1) @@ -656,34 +657,32 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { auto emb = torch::cat({rotary_pos_emb, rotary_pos_emb}, -1); auto m_cos = emb.cos(); auto m_sin = emb.sin(); + LOG(INFO) << " Glm4VisionTransformerImpl" << " numel=" << grid_thw.numel() << " min=" << grid_thw.min().item(); + LOG(INFO) << grid_thw; auto device = grid_thw.device(); - auto grid_t = grid_thw.index_select(1, torch::tensor({0}, torch::TensorOptions().dtype(torch::kLong).device(device))); - auto grid_h = grid_thw.index_select(1, torch::tensor({1}, torch::TensorOptions().dtype(torch::kLong).device(device))); - auto grid_w = grid_thw.index_select(1, torch::tensor({2}, 
torch::TensorOptions().dtype(torch::kLong).device(device))); + auto grid_t = grid_thw.index_select(1, torch::tensor({0}, torch::TensorOptions().dtype(torch::kInt).device(device))); + auto grid_h = grid_thw.index_select(1, torch::tensor({1}, torch::TensorOptions().dtype(torch::kInt).device(device))); + auto grid_w = grid_thw.index_select(1, torch::tensor({2}, torch::TensorOptions().dtype(torch::kInt).device(device))); auto h_times_w = (grid_h * grid_w).squeeze(1); auto repeats = grid_t.squeeze(1); auto repeated = torch::repeat_interleave(h_times_w, repeats, 0); c10::optional cumsum_dtype; + LOG(INFO) << " Glm4VisionTransformerImpl repeated " << repeated; + cumsum_dtype = torch::kInt32; auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype); namespace F = torch::nn::functional; - cu_seqlens = F::pad( - cu_seqlens, F::PadFuncOptions({1, 0}).mode(torch::kConstant).value(0)); - cu_seqlens = torch::diff(cu_seqlens); - torch::Tensor cu_seqlens_slice1 = cu_seqlens.narrow(0, 1, cu_seqlens.size(0) - 1); - torch::Tensor cu_seqlens_slice0 = cu_seqlens.narrow(0, 0, cu_seqlens.size(0) - 1); - torch::Tensor seqlens_tensor = cu_seqlens_slice1 - cu_seqlens_slice0; - std::vector seqlens; - seqlens.assign( - seqlens_tensor.cpu().to(torch::kLong).data_ptr(), - seqlens_tensor.cpu().to(torch::kLong).data_ptr() + seqlens_tensor.numel() - ); - LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before "; + cu_seqlens = F::pad(cu_seqlens, F::PadFuncOptions({1, 0}).mode(torch::kConstant).value(0)); + cu_seqlens = torch::diff(cu_seqlens).cpu().to(torch::kInt); + std::vector seqlens; + seqlens.assign(cu_seqlens.data_ptr(),cu_seqlens.data_ptr() + cu_seqlens.numel()); + + LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before cu_seqlens " << cu_seqlens << "seqlens.size()" << seqlens.size(); hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1)); - ModelInputParams& input_params_new = - const_cast(input_params); + LOG(INFO) << " Glm4VisionTransformerImpl forward embedding after "; + ModelInputParams& input_params_new = const_cast(input_params); torch::Tensor cu_seqlens_cpu = cu_seqlens.cpu(); std::vector cu_seqlens_vec( cu_seqlens_cpu.data_ptr(), // full seqlen vec diff --git a/xllm/models/vlm/glm4v_moe.h b/xllm/models/vlm/glm4v_moe.h index 5693e18e2..e772a04c9 100644 --- a/xllm/models/vlm/glm4v_moe.h +++ b/xllm/models/vlm/glm4v_moe.h @@ -60,7 +60,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module { LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings pixel aft "; auto image_embeds = visual_(pixel, - image_input->image_grid_thw, + image_input->image_grid_thw.to(pixel.device()), input_params); LOG(INFO) << " Glm4vMoeForConditionalGenerationImpl forward get_input_embeddings visual_ end "; auto inputs_embeds = language_model_->get_input_embeddings(input_ids); From 6cda3eeb6f644500b2fed87f20d31e497e7ad02b Mon Sep 17 00:00:00 2001 From: "ext.xingsilan1" Date: Thu, 4 Dec 2025 13:51:42 +0800 Subject: [PATCH 17/20] feat: vit forward run success --- xllm/models/vlm/glm4v.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index e422bc640..ffe44d0b0 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -464,34 +464,36 @@ class Glm4_VisionPatchMergerImpl : public torch::nn::Module { public: Glm4_VisionPatchMergerImpl(const ModelContext& context) { auto model_args = 
context.get_model_args(); - auto options = context.get_tensor_options(); + options_ = context.get_tensor_options(); auto parallel_args = context.get_parallel_args(); int64_t dim = model_args.mm_projection_dim(); int64_t context_dim = model_args.mm_intermediate_size(); norm_ = register_module("norm", torch::nn::LayerNorm(torch::nn::LayerNormOptions({dim}))); - norm_->weight.set_data(norm_->weight.to(options)); - norm_->bias.set_data(norm_->bias.to(options)); + norm_->weight.set_data(norm_->weight.to(options_)); + norm_->bias.set_data(norm_->bias.to(options_)); proj_ = register_module( "proj", torch::nn::Linear(torch::nn::LinearOptions(dim, dim).bias(false))); - + proj_->weight.set_data(proj_->weight.to(options_)); act_ = register_module("act", torch::nn::GELU()); silu_ = register_module("silu", torch::nn::SiLU()); gate_ = register_module( "gate", torch::nn::Linear(torch::nn::LinearOptions(dim, context_dim).bias(false))); - + gate_->weight.set_data(gate_->weight.to(options_)); up_ = register_module( "up", torch::nn::Linear(torch::nn::LinearOptions(dim, context_dim).bias(false))); - + up_->weight.set_data(up_->weight.to(options_)); down_ = register_module( "down", torch::nn::Linear(torch::nn::LinearOptions(context_dim, dim).bias(false))); + down_->weight.set_data(down_->weight.to(options_)); } torch::Tensor forward(torch::Tensor x) { + LOG(INFO) << " Glm4_VisionPatchMergerImpl forward beging " << x.device() << "options_.device() : " << options_.device(); x = proj_(x); x = act_(norm_(x)); x = down_(torch::mul(silu_((gate_(x))), up_(x))); @@ -568,6 +570,7 @@ class Glm4_VisionPatchMergerImpl : public torch::nn::Module { torch::nn::Linear down_{nullptr}; torch::nn::GELU act_{nullptr}; torch::nn::SiLU silu_{nullptr}; + torch::TensorOptions options_; bool is_proj_weight_loaded = false; @@ -607,6 +610,8 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { downsample_ = register_module("downsample", torch::nn::Conv2d(torch::nn::Conv2dOptions(hidden_size_, out_hidden_size_, spatial_merge_size_) .stride(spatial_merge_size_).bias(true).padding(0))); + downsample_->weight.set_data(downsample_->weight.to(options_)); + downsample_->bias.set_data(downsample_->bias.to(options_)); merger_ = register_module("merger", Glm4_VisionPatchMerger(context)); } @@ -655,10 +660,8 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { auto [rotary_pos_emb, image_type_ids] = rot_pos_emb(grid_thw); auto emb = torch::cat({rotary_pos_emb, rotary_pos_emb}, -1); - auto m_cos = emb.cos(); - auto m_sin = emb.sin(); - LOG(INFO) << " Glm4VisionTransformerImpl" << " numel=" << grid_thw.numel() << " min=" << grid_thw.min().item(); - LOG(INFO) << grid_thw; + auto m_cos = emb.cos().type_as(hidden_states); + auto m_sin = emb.sin().type_as(hidden_states); auto device = grid_thw.device(); auto grid_t = grid_thw.index_select(1, torch::tensor({0}, torch::TensorOptions().dtype(torch::kInt).device(device))); @@ -689,6 +692,7 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { cu_seqlens_cpu.data_ptr() + cu_seqlens_cpu.numel()); for (int idx = 0; idx < blocks_->size(); ++idx) { hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); //TODO + LOG(INFO) << " Glm4VisionTransformerImpl forward layer "<< idx; } LOG(INFO) << " Glm4VisionTransformerImpl forward layer after "; hidden_states = post_layernorm_(hidden_states); @@ -782,6 +786,8 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { bool is_post_layernorm_weight_loaded = false; bool 
is_downsample_weight_loaded_ = false; bool is_downsample_bias_loaded_ = false; + torch::Tensor m_cos; + torch::Tensor m_sin; }; TORCH_MODULE(Glm4VisionTransformer); From 01f2c508d984afff8651a2f58e77c595b03bc4c9 Mon Sep 17 00:00:00 2001 From: jindonghe1 Date: Thu, 4 Dec 2025 17:18:32 +0800 Subject: [PATCH 18/20] feat: support new model glm4-flash. --- .../chat_template/jinja_chat_template.cpp | 33 ++-- xllm/core/framework/request/mm_codec.cpp | 3 +- xllm/core/framework/request/mm_input.h | 6 +- xllm/core/runtime/vlm_master.cpp | 7 +- xllm/models/llm/glm4.h | 24 ++- xllm/models/vlm/glm4v.h | 19 +- xllm/processors/glm4v_image_processor.cpp | 124 +----------- xllm/processors/glm4v_image_processor.h | 2 - xllm/processors/image_processor.cpp | 187 ++++++++++++++++++ xllm/processors/image_processor.h | 9 + xllm/processors/qwen2_vl_image_processor.cpp | 73 +------ xllm/processors/qwen2_vl_image_processor.h | 8 +- 12 files changed, 250 insertions(+), 245 deletions(-) mode change 100644 => 100755 xllm/processors/glm4v_image_processor.h mode change 100644 => 100755 xllm/processors/image_processor.cpp diff --git a/xllm/core/framework/chat_template/jinja_chat_template.cpp b/xllm/core/framework/chat_template/jinja_chat_template.cpp index 920b768c4..44caf09ee 100644 --- a/xllm/core/framework/chat_template/jinja_chat_template.cpp +++ b/xllm/core/framework/chat_template/jinja_chat_template.cpp @@ -121,6 +121,24 @@ std::optional JinjaChatTemplate::apply( nlohmann::ordered_json& messages, const nlohmann::ordered_json& tools, const nlohmann::ordered_json& chat_template_kwargs) const { + for (auto& msg : messages) { + if (!msg.contains("content")) continue; + auto& content = msg["content"]; + auto normalize_item = [](nlohmann::ordered_json& item) { + if (item.contains("type") && item["type"].is_string()) { + std::string t = item["type"].get(); + if (t == "video_url") item["type"] = "video"; + } + if (item.contains("video_url") && !item.contains("video")) + item["video"] = item["video_url"]; + }; + + if (content.is_array()) { + for (auto& it : content) normalize_item(it); + } else if (content.is_object()) { + normalize_item(content); + } + } minja::chat_template_inputs input; input.messages = messages; input.tools = tools; @@ -137,23 +155,10 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content( for (const auto& item : vec) { nlohmann::ordered_json item_json; - if (item.type == "video_url") { - item_json["type"] = "video"; - } else { - item_json["type"] = item.type; - } + item_json["type"] = item.type; if (item.type == "text") { item_json["text"] = item.text; - } else if (item.type == "video_url") { - item_json["video"] = "mm place holder"; - item_json["video_url"] = "mm place holder"; - } else if (item.type == "image_url") { - item_json["image"] = "mm place holder"; - item_json["image_url"] = "mm place holder"; - } else if (item.type == "audio_url") { - item_json["audio"] = "mm place holder"; - item_json["audio_url"] = "mm place holder"; } else { item_json[item.type] = "mm place holder"; } diff --git a/xllm/core/framework/request/mm_codec.cpp b/xllm/core/framework/request/mm_codec.cpp index e862b76d4..0c78b5933 100644 --- a/xllm/core/framework/request/mm_codec.cpp +++ b/xllm/core/framework/request/mm_codec.cpp @@ -159,7 +159,8 @@ bool OpenCVVideoDecoder::decode(const std::string& raw_data, av_dict_set(&opts, "probesize", "20000000", 0); av_dict_set(&opts, "analyzeduration", "5000000", 0); - int ret = avformat_open_input(&fmt, nullptr, nullptr, &opts); + const AVInputFormat* in_fmt = 
av_find_input_format("mp4"); + int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts); av_dict_free(&opts); if (ret < 0) { diff --git a/xllm/core/framework/request/mm_input.h b/xllm/core/framework/request/mm_input.h index 9f2d3237c..1b5fc57b3 100644 --- a/xllm/core/framework/request/mm_input.h +++ b/xllm/core/framework/request/mm_input.h @@ -58,11 +58,13 @@ struct MMInput { return std::move(vec); } - std::vector get_video_metadata() const { + std::vector get_video_metadata(MMType type) const { std::vector metas; metas.reserve(items_.size()); for (auto& item : items_) { - metas.push_back(item.video_meta_); + if (item.type_ == type) { + metas.push_back(item.video_meta_); + } } return metas; } diff --git a/xllm/core/runtime/vlm_master.cpp b/xllm/core/runtime/vlm_master.cpp index a031f2aa7..6eb12f1eb 100755 --- a/xllm/core/runtime/vlm_master.cpp +++ b/xllm/core/runtime/vlm_master.cpp @@ -220,7 +220,11 @@ void VLMMaster::handle_request(const std::vector& messages, "Image processor process failed."); return; } - + if (const auto& res = mm_data.get("image_grid_thw")) + { + auto image_grid_thw = res.value(); + LOG(INFO)<<"image_grid_thw:"<handle_request(messages, mm_data, sp, callback); } @@ -307,7 +311,6 @@ std::shared_ptr VLMMaster::generate_request(std::string prompt, } Timer timer; input_processor_->process(prompt, mm_data); - std::vector prompt_tokens; if (!tokenizer_->encode(prompt, &prompt_tokens)) { LOG(ERROR) << "Failed to encode prompt: " << prompt; diff --git a/xllm/models/llm/glm4.h b/xllm/models/llm/glm4.h index dbaeac860..3fae6a20d 100644 --- a/xllm/models/llm/glm4.h +++ b/xllm/models/llm/glm4.h @@ -93,20 +93,18 @@ class Glm4ModelImpl : public LlmModelImplBase { if (positions.dim() == 2) { // mrope auto apply = [this](torch::Tensor x) { - auto freqs_t = x[0].clone(); - for (int dim_idx = 1; dim_idx <= 2; ++dim_idx) { - int64_t offset = dim_idx; - int64_t section_len = mrope_section_[dim_idx]; - int64_t length = section_len * 2; - auto idx_first_half = torch::arange(offset, length, 3, torch::kLong); - auto idx_second_half = torch::arange(offset, length, 3, torch::kLong); - auto idx_tensor = - torch::cat({idx_first_half, idx_second_half}, 0).to(x.device()); - // freqs_t[..., idx] = freqs[dim_idx][..., idx] - auto src = x[dim_idx].index_select(-1, idx_tensor); - freqs_t.index_copy_(-1, idx_tensor, src); + auto sections = mrope_section_; + sections.insert(sections.end(), sections.begin(), sections.end()); + + auto vec = x.split(sections, -1); + std::vector selects; + selects.reserve(vec.size()); + + for (int64_t i = 0; i < vec.size(); ++i) { + auto m = vec[i]; + selects.push_back(m[i % mrope_section_.size()]); } - return freqs_t; + return torch::cat(selects, -1); }; cos_pos = apply(cos_pos.reshape( {positions.sizes().front(), -1, cos_pos.sizes().back()})); diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h index ffe44d0b0..8ef661a3c 100644 --- a/xllm/models/vlm/glm4v.h +++ b/xllm/models/vlm/glm4v.h @@ -605,7 +605,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { blocks_->push_back(block); layers_.push_back(block); } - // TODO čžåˆįŽ—å­ post_layernorm_ = register_module("post_layernorm", Glm4VisionRmsNorm(context)); downsample_ = register_module("downsample", torch::nn::Conv2d(torch::nn::Conv2dOptions(hidden_size_, out_hidden_size_, spatial_merge_size_) @@ -672,8 +671,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { auto repeated = torch::repeat_interleave(h_times_w, repeats, 0); c10::optional cumsum_dtype; - LOG(INFO) << " 
Glm4VisionTransformerImpl repeated " << repeated; - cumsum_dtype = torch::kInt32; auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype); namespace F = torch::nn::functional; @@ -682,27 +679,21 @@ class Glm4VisionTransformerImpl : public torch::nn::Module { std::vector seqlens; seqlens.assign(cu_seqlens.data_ptr(),cu_seqlens.data_ptr() + cu_seqlens.numel()); - LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before cu_seqlens " << cu_seqlens << "seqlens.size()" << seqlens.size(); hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1)); - LOG(INFO) << " Glm4VisionTransformerImpl forward embedding after "; ModelInputParams& input_params_new = const_cast(input_params); torch::Tensor cu_seqlens_cpu = cu_seqlens.cpu(); std::vector cu_seqlens_vec( - cu_seqlens_cpu.data_ptr(), // full seqlen vec + cu_seqlens_cpu.data_ptr(), cu_seqlens_cpu.data_ptr() + cu_seqlens_cpu.numel()); + cu_seqlens = cu_seqlens.to(hidden_states.device()); for (int idx = 0; idx < blocks_->size(); ++idx) { - hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); //TODO - LOG(INFO) << " Glm4VisionTransformerImpl forward layer "<< idx; + hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); } - LOG(INFO) << " Glm4VisionTransformerImpl forward layer after "; hidden_states = post_layernorm_(hidden_states); hidden_states = hidden_states.view({-1, spatial_merge_size_, spatial_merge_size_, hidden_states.size(-1)}); - // TO down sample merge op hidden_states = hidden_states.permute({0, 3, 1, 2}); hidden_states = downsample_(hidden_states).view({-1, out_hidden_size_}); - LOG(INFO) << " Glm4VisionTransformerImpl downsample after"; hidden_states = merger_(hidden_states); - LOG(INFO) << " Glm4VisionTransformerImpl forward end"; return hidden_states; }; @@ -820,12 +811,10 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module { const ModelInputParams& input_params) { auto inputs_embeds = language_model_->get_input_embeddings(input_ids); if (image_input) { - // visual auto image_embeds = visual_(image_input->pixel_values.to(options_), image_input->image_grid_thw, input_params); - // merge auto is_multimodal = torch::isin(input_ids, model_args_.image_token_id()); input_params.visual_pos_masks = is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds); @@ -851,7 +840,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module { if (pixel_values.defined() && image_grid_thw.defined()) image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw}; - auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params); input_params.input_embedding = inputs_embeds; auto emb = language_model_(tokens, positions, kv_caches, input_params); @@ -869,7 +857,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module { visual_->load_state_dict( state_dict->get_dict_with_prefix("model.visual.")); } - // verify visual_->verify_loaded_weights("model.visual."); visual_->merge_loaded_weights(); if (!model_args_.image_embedding_mode()) { diff --git a/xllm/processors/glm4v_image_processor.cpp b/xllm/processors/glm4v_image_processor.cpp index 6e00b7e2b..be548f613 100644 --- a/xllm/processors/glm4v_image_processor.cpp +++ b/xllm/processors/glm4v_image_processor.cpp @@ -77,117 +77,6 @@ std::optional smart_resize(int num_frames, } } // namespace -torch::Tensor Glm4VImageProcessor::sample_frames(const 
VideoMetadata& metadata, - int temporal_patch_size) { - // video: [T, C, H, W] - const int total_frames = metadata.total_num_frames; - if (total_frames <= 0) { - return torch::empty({0}, torch::dtype(torch::kLong)); - } - - if (metadata.fps <= 0.0) { - LOG(FATAL) << "invalid metadata.fps <= 0"; - } - - const int max_frame_idx = total_frames - 1; - - // duration = metadata.duration or round(max_idx / fps) + 1 - double duration = metadata.duration; - if (duration <= 0.0) { - duration = - std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; - } - - constexpr double DYN_FPS_30 = 3.0; - constexpr double DYN_FPS_300 = 1.0; - constexpr double DYN_FPS_2400 = 0.5; - constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; - constexpr double MAX_DURATION = 2400.0; - - const double effective_duration = std::min(duration, MAX_DURATION); - - double target_fps = 0.0; - if (effective_duration <= 30.0) { - target_fps = DYN_FPS_30; - } else if (effective_duration <= 300.0) { - target_fps = DYN_FPS_300; - } else { - target_fps = DYN_FPS_2400; - } - - // extract_t = int(effective_duration * target_fps * temporal_patch_size) - int extract_t = static_cast(effective_duration * target_fps * - static_cast(temporal_patch_size)); - extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); - - const double duration_per_frame = 1.0 / metadata.fps; - std::vector timestamps(total_frames); - for (int i = 0; i < total_frames; ++i) { - timestamps[i] = static_cast(i) * duration_per_frame; - } - const int max_second = static_cast(duration); - - torch::Tensor frame_indices; - - if (total_frames < extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } else { - std::vector tmp; - tmp.reserve(static_cast(total_frames)); - double current_second = 0.0; - const double inv_fps = - 1.0 / (static_cast(temporal_patch_size) * target_fps); - - for (int frame_index = 0; frame_index < total_frames; frame_index++) { - if (timestamps[frame_index] >= current_second) { - current_second += inv_fps; - tmp.push_back(frame_index); - if (current_second >= static_cast(max_second)) { - break; - } - } - } - frame_indices = - torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong)); - } - int64_t len = frame_indices.size(0); - if (len < extract_t) { - int64_t start, end; - if (len == 0) { - start = 0; - end = std::max(total_frames - 1, 0); - } else { - start = frame_indices[0].item(); - end = frame_indices[len - 1].item(); - } - frame_indices = - torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); - } else if (len > extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } - - len = frame_indices.size(0); - std::unordered_set seen; - seen.reserve(static_cast(len) * 2); - std::vector uniq; - uniq.reserve(static_cast(len)); - - for (int64_t i = 0; i < len; ++i) { - auto idx = frame_indices[i].item(); - if (seen.insert(idx).second) { - uniq.push_back(idx); - } - } - - if (!uniq.empty() && (uniq.size() & 1)) { - uniq.push_back(uniq.back()); - } - - return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); -} - Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { image_mean_ = args.mm_image_normalize_mean(); image_std_ = args.mm_image_normalize_std(); @@ -223,7 +112,8 @@ Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); std::vector videos = 
inputs.get_decode_data(MMType::VIDEO); - std::vector video_meta_list = inputs.get_video_metadata(); + std::vector video_meta_list = + inputs.get_video_metadata(MMType::VIDEO); if (images.empty() && (videos.empty() || video_meta_list.empty())) { LOG(ERROR) << "no image/video tensor found."; @@ -359,8 +249,8 @@ bool Glm4VImageProcessor::process_videos( auto values = torch::cat(pixel_values); auto thw = torch::tensor(grids).clone().reshape({-1, 3}); - mm_datas.add(MMType::VIDEO, "video_grid_thw", thw); - mm_datas.add(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); return true; } @@ -376,11 +266,9 @@ bool Glm4VImageProcessor::process_video( torch::Tensor indices; if (do_sample_frame_) { - indices = this->sample_frames(metadata, temporal_patch_size_); + indices = this->GLM_sample_frames(metadata, temporal_patch_size_); } else { - indices = torch::arange(0, - static_cast(origin_video.size(0)), - torch::TensorOptions().dtype(torch::kLong)); + indices = this->init_frames(metadata); // default sample to 32 frames } auto video = origin_video.index_select(/*dim=*/0, indices); int64_t sampled_total_frames = video.size(0); diff --git a/xllm/processors/glm4v_image_processor.h b/xllm/processors/glm4v_image_processor.h old mode 100644 new mode 100755 index 2313fb9bf..7a2202250 --- a/xllm/processors/glm4v_image_processor.h +++ b/xllm/processors/glm4v_image_processor.h @@ -42,8 +42,6 @@ class Glm4VImageProcessor : public ImageProcessor { VideoMetadata& metadata, std::vector& pixel_values, std::vector& grids); - torch::Tensor sample_frames(const VideoMetadata& metadata, - int temporal_patch_size); private: bool do_convert_rgb_ = true; diff --git a/xllm/processors/image_processor.cpp b/xllm/processors/image_processor.cpp old mode 100644 new mode 100755 index 82195f36a..7317ea911 --- a/xllm/processors/image_processor.cpp +++ b/xllm/processors/image_processor.cpp @@ -128,4 +128,191 @@ torch::Tensor ImageProcessor::normalize(const torch::Tensor& image, return result.div_(s_tensor); } +torch::Tensor ImageProcessor::init_frames(const VideoMetadata& metadata) { + int total_num_frames = metadata.total_num_frames; + int nframes_len = 32; + if (total_num_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + auto idx = torch::linspace( + 0, total_num_frames - 1, nframes_len, torch::dtype(torch::kLong)); + return idx; +} + +torch::Tensor ImageProcessor::sample_frames(const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames, + double set_fps) { + if (set_fps > 0.0 && num_frames > 0) { + LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " + "use only one!"; + } + + double fps = set_fps; + + int total_num_frames = metadata.total_num_frames; + + if (num_frames > 0) { + double double_num_frames = + std::round(static_cast(num_frames) / temporal_patch_size) * + temporal_patch_size; + num_frames = static_cast(double_num_frames); + } else if (fps > 0.0) { + if (metadata.fps <= 0.0) { + LOG(FATAL) + << "Asked to sample `fps` frames per second but no video metadata " + "was provided which is required when sampling with `fps`. 
"; + } + + max_frames = + (std::min(max_frames, total_num_frames) / temporal_patch_size) * + temporal_patch_size; + double double_num_frames = + static_cast(total_num_frames) / metadata.fps * fps; + double_num_frames = std::min( + std::min(std::max(double_num_frames, static_cast(min_frames)), + static_cast(max_frames)), + static_cast(total_num_frames)); + double_num_frames = std::floor(double_num_frames / temporal_patch_size) * + temporal_patch_size; + + num_frames = static_cast(double_num_frames); + } + + if (num_frames > total_num_frames) { + LOG(FATAL) << "Video can't be sampled. The inferred num_frames=" + << num_frames << " exceeds total_num_frames=" << total_num_frames + << "."; + } + + if (num_frames > 0) { + std::vector indices; + indices.reserve(num_frames); + for (int i = 0; i < num_frames; ++i) { + int64_t k = static_cast( + (static_cast(i) * total_num_frames) / num_frames); + if (k >= total_num_frames) k = total_num_frames - 1; + indices.push_back(k); + } + return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); + } else { + return torch::arange(0, + static_cast(total_num_frames), + torch::TensorOptions().dtype(torch::kLong)); + } +} + +torch::Tensor ImageProcessor::GLM_sample_frames(const VideoMetadata& metadata, + int temporal_patch_size) { + // video: [T, C, H, W] + const int total_frames = metadata.total_num_frames; + if (total_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + + if (metadata.fps <= 0.0) { + LOG(FATAL) << "invalid metadata.fps <= 0"; + } + + const int max_frame_idx = total_frames - 1; + + // duration = metadata.duration or round(max_idx / fps) + 1 + double duration = metadata.duration; + if (duration <= 0.0) { + duration = + std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; + } + + constexpr double DYN_FPS_30 = 3.0; + constexpr double DYN_FPS_300 = 1.0; + constexpr double DYN_FPS_2400 = 0.5; + constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; + constexpr double MAX_DURATION = 2400.0; + + const double effective_duration = std::min(duration, MAX_DURATION); + + double target_fps = 0.0; + if (effective_duration <= 30.0) { + target_fps = DYN_FPS_30; + } else if (effective_duration <= 300.0) { + target_fps = DYN_FPS_300; + } else { + target_fps = DYN_FPS_2400; + } + + // extract_t = int(effective_duration * target_fps * temporal_patch_size) + int extract_t = static_cast(effective_duration * target_fps * + static_cast(temporal_patch_size)); + extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); + + const double duration_per_frame = 1.0 / metadata.fps; + std::vector timestamps(total_frames); + for (int i = 0; i < total_frames; ++i) { + timestamps[i] = static_cast(i) * duration_per_frame; + } + const int max_second = static_cast(duration); + + torch::Tensor frame_indices; + + if (total_frames < extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } else { + std::vector tmp; + tmp.reserve(static_cast(total_frames)); + double current_second = 0.0; + const double inv_fps = + 1.0 / (static_cast(temporal_patch_size) * target_fps); + + for (int frame_index = 0; frame_index < total_frames; frame_index++) { + if (timestamps[frame_index] >= current_second) { + current_second += inv_fps; + tmp.push_back(frame_index); + if (current_second >= static_cast(max_second)) { + break; + } + } + } + frame_indices = + torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong)); + } + int64_t len = frame_indices.size(0); + if (len < extract_t) { + int64_t start, end; + 
if (len == 0) { + start = 0; + end = std::max(total_frames - 1, 0); + } else { + start = frame_indices[0].item(); + end = frame_indices[len - 1].item(); + } + frame_indices = + torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); + } else if (len > extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } + + len = frame_indices.size(0); + std::unordered_set seen; + seen.reserve(static_cast(len) * 2); + std::vector uniq; + uniq.reserve(static_cast(len)); + + for (int64_t i = 0; i < len; ++i) { + auto idx = frame_indices[i].item(); + if (seen.insert(idx).second) { + uniq.push_back(idx); + } + } + + if (!uniq.empty() && (uniq.size() & 1)) { + uniq.push_back(uniq.back()); + } + + return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); +} + } // namespace xllm diff --git a/xllm/processors/image_processor.h b/xllm/processors/image_processor.h index be7fac5a1..f5b5f33ef 100644 --- a/xllm/processors/image_processor.h +++ b/xllm/processors/image_processor.h @@ -39,6 +39,15 @@ class ImageProcessor { virtual torch::Tensor normalize(const torch::Tensor& image, const std::vector& mean, const std::vector& std); + virtual torch::Tensor init_frames(const VideoMetadata& metadata); + virtual torch::Tensor sample_frames(const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames = -1, + double set_fps = -1.0); + virtual torch::Tensor GLM_sample_frames(const VideoMetadata& metadata, + int temporal_patch_size); }; } // namespace xllm diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp index cd30d8146..e1639dd6b 100644 --- a/xllm/processors/qwen2_vl_image_processor.cpp +++ b/xllm/processors/qwen2_vl_image_processor.cpp @@ -60,72 +60,6 @@ std::optional smart_resize(int height, } } // namespace -torch::Tensor Qwen2VLImageProcessor::sample_frames( - const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames, - double set_fps) { - if (set_fps > 0.0 && num_frames > 0) { - LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " - "use only one!"; - } - - double fps = set_fps; - - int total_num_frames = metadata.total_num_frames; - - if (num_frames > 0) { - double double_num_frames = - std::round(static_cast(num_frames) / temporal_patch_size) * - temporal_patch_size; - num_frames = static_cast(double_num_frames); - } else if (fps > 0.0) { - if (metadata.fps <= 0.0) { - LOG(FATAL) - << "Asked to sample `fps` frames per second but no video metadata " - "was provided which is required when sampling with `fps`. "; - } - - max_frames = - (std::min(max_frames, total_num_frames) / temporal_patch_size) * - temporal_patch_size; - double double_num_frames = - static_cast(total_num_frames) / metadata.fps * fps; - double_num_frames = std::min( - std::min(std::max(double_num_frames, static_cast(min_frames)), - static_cast(max_frames)), - static_cast(total_num_frames)); - double_num_frames = std::floor(double_num_frames / temporal_patch_size) * - temporal_patch_size; - - num_frames = static_cast(double_num_frames); - } - - if (num_frames > total_num_frames) { - LOG(FATAL) << "Video can't be sampled. 
The inferred num_frames=" - << num_frames << " exceeds total_num_frames=" << total_num_frames - << "."; - } - - if (num_frames > 0) { - std::vector indices; - indices.reserve(num_frames); - for (int i = 0; i < num_frames; ++i) { - int64_t k = static_cast( - (static_cast(i) * total_num_frames) / num_frames); - if (k >= total_num_frames) k = total_num_frames - 1; - indices.push_back(k); - } - return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); - } else { - return torch::arange(0, - static_cast(total_num_frames), - torch::TensorOptions().dtype(torch::kLong)); - } -} - Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) { image_mean_ = args.mm_image_normalize_mean(); image_std_ = args.mm_image_normalize_std(); @@ -159,7 +93,8 @@ Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) { bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); std::vector videos = inputs.get_decode_data(MMType::VIDEO); - std::vector video_meta_list = inputs.get_video_metadata(); + std::vector video_meta_list = + inputs.get_video_metadata(MMType::VIDEO); if (images.empty() && (videos.empty() || video_meta_list.empty())) { LOG(ERROR) << "no image/video tensor found."; @@ -335,9 +270,7 @@ bool Qwen2VLImageProcessor::process_video( /*num_frames=*/-1, /*set_fps=*/2.0); } else { - indices = torch::arange(0, - static_cast(origin_video.size(0)), - torch::TensorOptions().dtype(torch::kLong)); + indices = this->init_frames(metadata); // default sample to 32 frames } auto video = origin_video.index_select(/*dim=*/0, indices); int64_t sampled_total_frames = video.size(0); diff --git a/xllm/processors/qwen2_vl_image_processor.h b/xllm/processors/qwen2_vl_image_processor.h index 3e35ac501..974ec83e5 100644 --- a/xllm/processors/qwen2_vl_image_processor.h +++ b/xllm/processors/qwen2_vl_image_processor.h @@ -43,12 +43,6 @@ class Qwen2VLImageProcessor : public ImageProcessor { VideoMetadata& metadata, std::vector& pixel_values, std::vector& grids); - torch::Tensor sample_frames(const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames = -1, - double set_fps = -1.0); private: bool do_convert_rgb_ = true; @@ -72,7 +66,7 @@ class Qwen2VLImageProcessor : public ImageProcessor { std::unordered_map size_; int temporal_patch_size_ = 2; - bool do_sample_frame_ = true; + bool do_sample_frame_ = false; int min_frames_ = 4; int max_frames_ = 768; From 3546418f03a83399033203355a696ed163545f9c Mon Sep 17 00:00:00 2001 From: "wangziyue.28" Date: Wed, 3 Dec 2025 15:47:42 +0800 Subject: [PATCH 19/20] feat: 1.move sample_frames from image_processor. 2.support diffenrent types in chat_template content. 
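Part 2 reverts the message normalization previously added to JinjaChatTemplate::apply() and instead expands the multimodal placeholders inside get_mm_content, so "video_url", "image_url" and "audio_url" items carry both the bare key and the original *_url key. The snippet below sketches that mapping for a single content item; make_mm_item is a hypothetical helper written only for this illustration.

    #include <nlohmann/json.hpp>
    #include <string>

    // Sketch: placeholder expansion for one content item. For "video_url" this
    // produces {"type":"video_url","video":"mm place holder","video_url":"mm place holder"},
    // so templates keyed on either "video" or "video_url" resolve the slot.
    nlohmann::ordered_json make_mm_item(const std::string& type,
                                        const std::string& text) {
      nlohmann::ordered_json item;
      item["type"] = type;
      if (type == "text") {
        item["text"] = text;
      } else if (type == "video_url" || type == "image_url" ||
                 type == "audio_url") {
        const std::string bare = type.substr(0, type.size() - 4);  // drop "_url"
        item[bare] = "mm place holder";
        item[type] = "mm place holder";
      } else {
        item[type] = "mm place holder";
      }
      return item;
    }
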
--- .../chat_template/jinja_chat_template.cpp | 27 +-- xllm/core/framework/request/mm_codec.cpp | 3 +- xllm/core/framework/request/mm_input.h | 6 +- xllm/processors/glm4v_image_processor.cpp | 124 +++++++++++- xllm/processors/glm4v_image_processor.h | 2 + xllm/processors/image_processor.cpp | 187 ------------------ xllm/processors/image_processor.h | 9 - xllm/processors/qwen2_vl_image_processor.cpp | 73 ++++++- xllm/processors/qwen2_vl_image_processor.h | 8 +- 9 files changed, 209 insertions(+), 230 deletions(-) mode change 100755 => 100644 xllm/processors/glm4v_image_processor.h mode change 100755 => 100644 xllm/processors/image_processor.cpp diff --git a/xllm/core/framework/chat_template/jinja_chat_template.cpp b/xllm/core/framework/chat_template/jinja_chat_template.cpp index 44caf09ee..fcd1f2166 100644 --- a/xllm/core/framework/chat_template/jinja_chat_template.cpp +++ b/xllm/core/framework/chat_template/jinja_chat_template.cpp @@ -121,24 +121,6 @@ std::optional JinjaChatTemplate::apply( nlohmann::ordered_json& messages, const nlohmann::ordered_json& tools, const nlohmann::ordered_json& chat_template_kwargs) const { - for (auto& msg : messages) { - if (!msg.contains("content")) continue; - auto& content = msg["content"]; - auto normalize_item = [](nlohmann::ordered_json& item) { - if (item.contains("type") && item["type"].is_string()) { - std::string t = item["type"].get(); - if (t == "video_url") item["type"] = "video"; - } - if (item.contains("video_url") && !item.contains("video")) - item["video"] = item["video_url"]; - }; - - if (content.is_array()) { - for (auto& it : content) normalize_item(it); - } else if (content.is_object()) { - normalize_item(content); - } - } minja::chat_template_inputs input; input.messages = messages; input.tools = tools; @@ -159,6 +141,15 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content( if (item.type == "text") { item_json["text"] = item.text; + } else if (item.type == "video_url") { + item_json["video"] = "mm place holder"; + item_json["video_url"] = "mm place holder"; + } else if (item.type == "image_url") { + item_json["image"] = "mm place holder"; + item_json["image_url"] = "mm place holder"; + } else if (item.type == "audio_url") { + item_json["audio"] = "mm place holder"; + item_json["audio_url"] = "mm place holder"; } else { item_json[item.type] = "mm place holder"; } diff --git a/xllm/core/framework/request/mm_codec.cpp b/xllm/core/framework/request/mm_codec.cpp index 0c78b5933..e862b76d4 100644 --- a/xllm/core/framework/request/mm_codec.cpp +++ b/xllm/core/framework/request/mm_codec.cpp @@ -159,8 +159,7 @@ bool OpenCVVideoDecoder::decode(const std::string& raw_data, av_dict_set(&opts, "probesize", "20000000", 0); av_dict_set(&opts, "analyzeduration", "5000000", 0); - const AVInputFormat* in_fmt = av_find_input_format("mp4"); - int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts); + int ret = avformat_open_input(&fmt, nullptr, nullptr, &opts); av_dict_free(&opts); if (ret < 0) { diff --git a/xllm/core/framework/request/mm_input.h b/xllm/core/framework/request/mm_input.h index 1b5fc57b3..9f2d3237c 100644 --- a/xllm/core/framework/request/mm_input.h +++ b/xllm/core/framework/request/mm_input.h @@ -58,13 +58,11 @@ struct MMInput { return std::move(vec); } - std::vector get_video_metadata(MMType type) const { + std::vector get_video_metadata() const { std::vector metas; metas.reserve(items_.size()); for (auto& item : items_) { - if (item.type_ == type) { - metas.push_back(item.video_meta_); - } + metas.push_back(item.video_meta_); } 
return metas; } diff --git a/xllm/processors/glm4v_image_processor.cpp b/xllm/processors/glm4v_image_processor.cpp index be548f613..6e00b7e2b 100644 --- a/xllm/processors/glm4v_image_processor.cpp +++ b/xllm/processors/glm4v_image_processor.cpp @@ -77,6 +77,117 @@ std::optional smart_resize(int num_frames, } } // namespace +torch::Tensor Glm4VImageProcessor::sample_frames(const VideoMetadata& metadata, + int temporal_patch_size) { + // video: [T, C, H, W] + const int total_frames = metadata.total_num_frames; + if (total_frames <= 0) { + return torch::empty({0}, torch::dtype(torch::kLong)); + } + + if (metadata.fps <= 0.0) { + LOG(FATAL) << "invalid metadata.fps <= 0"; + } + + const int max_frame_idx = total_frames - 1; + + // duration = metadata.duration or round(max_idx / fps) + 1 + double duration = metadata.duration; + if (duration <= 0.0) { + duration = + std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; + } + + constexpr double DYN_FPS_30 = 3.0; + constexpr double DYN_FPS_300 = 1.0; + constexpr double DYN_FPS_2400 = 0.5; + constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; + constexpr double MAX_DURATION = 2400.0; + + const double effective_duration = std::min(duration, MAX_DURATION); + + double target_fps = 0.0; + if (effective_duration <= 30.0) { + target_fps = DYN_FPS_30; + } else if (effective_duration <= 300.0) { + target_fps = DYN_FPS_300; + } else { + target_fps = DYN_FPS_2400; + } + + // extract_t = int(effective_duration * target_fps * temporal_patch_size) + int extract_t = static_cast(effective_duration * target_fps * + static_cast(temporal_patch_size)); + extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); + + const double duration_per_frame = 1.0 / metadata.fps; + std::vector timestamps(total_frames); + for (int i = 0; i < total_frames; ++i) { + timestamps[i] = static_cast(i) * duration_per_frame; + } + const int max_second = static_cast(duration); + + torch::Tensor frame_indices; + + if (total_frames < extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } else { + std::vector tmp; + tmp.reserve(static_cast(total_frames)); + double current_second = 0.0; + const double inv_fps = + 1.0 / (static_cast(temporal_patch_size) * target_fps); + + for (int frame_index = 0; frame_index < total_frames; frame_index++) { + if (timestamps[frame_index] >= current_second) { + current_second += inv_fps; + tmp.push_back(frame_index); + if (current_second >= static_cast(max_second)) { + break; + } + } + } + frame_indices = + torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong)); + } + int64_t len = frame_indices.size(0); + if (len < extract_t) { + int64_t start, end; + if (len == 0) { + start = 0; + end = std::max(total_frames - 1, 0); + } else { + start = frame_indices[0].item(); + end = frame_indices[len - 1].item(); + } + frame_indices = + torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); + } else if (len > extract_t) { + frame_indices = torch::linspace( + 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); + } + + len = frame_indices.size(0); + std::unordered_set seen; + seen.reserve(static_cast(len) * 2); + std::vector uniq; + uniq.reserve(static_cast(len)); + + for (int64_t i = 0; i < len; ++i) { + auto idx = frame_indices[i].item(); + if (seen.insert(idx).second) { + uniq.push_back(idx); + } + } + + if (!uniq.empty() && (uniq.size() & 1)) { + uniq.push_back(uniq.back()); + } + + return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); +} + 
Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { image_mean_ = args.mm_image_normalize_mean(); image_std_ = args.mm_image_normalize_std(); @@ -112,8 +223,7 @@ Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) { bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) { std::vector images = inputs.get_decode_data(MMType::IMAGE); std::vector videos = inputs.get_decode_data(MMType::VIDEO); - std::vector video_meta_list = - inputs.get_video_metadata(MMType::VIDEO); + std::vector video_meta_list = inputs.get_video_metadata(); if (images.empty() && (videos.empty() || video_meta_list.empty())) { LOG(ERROR) << "no image/video tensor found."; @@ -249,8 +359,8 @@ bool Glm4VImageProcessor::process_videos( auto values = torch::cat(pixel_values); auto thw = torch::tensor(grids).clone().reshape({-1, 3}); - mm_datas.update(MMType::VIDEO, "video_grid_thw", thw); - mm_datas.update(MMType::VIDEO, "pixel_values_videos", values); + mm_datas.add(MMType::VIDEO, "video_grid_thw", thw); + mm_datas.add(MMType::VIDEO, "pixel_values_videos", values); return true; } @@ -266,9 +376,11 @@ bool Glm4VImageProcessor::process_video( torch::Tensor indices; if (do_sample_frame_) { - indices = this->GLM_sample_frames(metadata, temporal_patch_size_); + indices = this->sample_frames(metadata, temporal_patch_size_); } else { - indices = this->init_frames(metadata); // default sample to 32 frames + indices = torch::arange(0, + static_cast(origin_video.size(0)), + torch::TensorOptions().dtype(torch::kLong)); } auto video = origin_video.index_select(/*dim=*/0, indices); int64_t sampled_total_frames = video.size(0); diff --git a/xllm/processors/glm4v_image_processor.h b/xllm/processors/glm4v_image_processor.h old mode 100755 new mode 100644 index 7a2202250..2313fb9bf --- a/xllm/processors/glm4v_image_processor.h +++ b/xllm/processors/glm4v_image_processor.h @@ -42,6 +42,8 @@ class Glm4VImageProcessor : public ImageProcessor { VideoMetadata& metadata, std::vector& pixel_values, std::vector& grids); + torch::Tensor sample_frames(const VideoMetadata& metadata, + int temporal_patch_size); private: bool do_convert_rgb_ = true; diff --git a/xllm/processors/image_processor.cpp b/xllm/processors/image_processor.cpp old mode 100755 new mode 100644 index 7317ea911..82195f36a --- a/xllm/processors/image_processor.cpp +++ b/xllm/processors/image_processor.cpp @@ -128,191 +128,4 @@ torch::Tensor ImageProcessor::normalize(const torch::Tensor& image, return result.div_(s_tensor); } -torch::Tensor ImageProcessor::init_frames(const VideoMetadata& metadata) { - int total_num_frames = metadata.total_num_frames; - int nframes_len = 32; - if (total_num_frames <= 0) { - return torch::empty({0}, torch::dtype(torch::kLong)); - } - auto idx = torch::linspace( - 0, total_num_frames - 1, nframes_len, torch::dtype(torch::kLong)); - return idx; -} - -torch::Tensor ImageProcessor::sample_frames(const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames, - double set_fps) { - if (set_fps > 0.0 && num_frames > 0) { - LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " - "use only one!"; - } - - double fps = set_fps; - - int total_num_frames = metadata.total_num_frames; - - if (num_frames > 0) { - double double_num_frames = - std::round(static_cast(num_frames) / temporal_patch_size) * - temporal_patch_size; - num_frames = static_cast(double_num_frames); - } else if (fps > 0.0) { - if (metadata.fps <= 0.0) { - LOG(FATAL) - << "Asked to sample 
`fps` frames per second but no video metadata " - "was provided which is required when sampling with `fps`. "; - } - - max_frames = - (std::min(max_frames, total_num_frames) / temporal_patch_size) * - temporal_patch_size; - double double_num_frames = - static_cast(total_num_frames) / metadata.fps * fps; - double_num_frames = std::min( - std::min(std::max(double_num_frames, static_cast(min_frames)), - static_cast(max_frames)), - static_cast(total_num_frames)); - double_num_frames = std::floor(double_num_frames / temporal_patch_size) * - temporal_patch_size; - - num_frames = static_cast(double_num_frames); - } - - if (num_frames > total_num_frames) { - LOG(FATAL) << "Video can't be sampled. The inferred num_frames=" - << num_frames << " exceeds total_num_frames=" << total_num_frames - << "."; - } - - if (num_frames > 0) { - std::vector indices; - indices.reserve(num_frames); - for (int i = 0; i < num_frames; ++i) { - int64_t k = static_cast( - (static_cast(i) * total_num_frames) / num_frames); - if (k >= total_num_frames) k = total_num_frames - 1; - indices.push_back(k); - } - return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong)); - } else { - return torch::arange(0, - static_cast(total_num_frames), - torch::TensorOptions().dtype(torch::kLong)); - } -} - -torch::Tensor ImageProcessor::GLM_sample_frames(const VideoMetadata& metadata, - int temporal_patch_size) { - // video: [T, C, H, W] - const int total_frames = metadata.total_num_frames; - if (total_frames <= 0) { - return torch::empty({0}, torch::dtype(torch::kLong)); - } - - if (metadata.fps <= 0.0) { - LOG(FATAL) << "invalid metadata.fps <= 0"; - } - - const int max_frame_idx = total_frames - 1; - - // duration = metadata.duration or round(max_idx / fps) + 1 - double duration = metadata.duration; - if (duration <= 0.0) { - duration = - std::round(static_cast(max_frame_idx) / metadata.fps) + 1.0; - } - - constexpr double DYN_FPS_30 = 3.0; - constexpr double DYN_FPS_300 = 1.0; - constexpr double DYN_FPS_2400 = 0.5; - constexpr int MAX_FRAME_COUNT_DYNAMIC = 640; - constexpr double MAX_DURATION = 2400.0; - - const double effective_duration = std::min(duration, MAX_DURATION); - - double target_fps = 0.0; - if (effective_duration <= 30.0) { - target_fps = DYN_FPS_30; - } else if (effective_duration <= 300.0) { - target_fps = DYN_FPS_300; - } else { - target_fps = DYN_FPS_2400; - } - - // extract_t = int(effective_duration * target_fps * temporal_patch_size) - int extract_t = static_cast(effective_duration * target_fps * - static_cast(temporal_patch_size)); - extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC); - - const double duration_per_frame = 1.0 / metadata.fps; - std::vector timestamps(total_frames); - for (int i = 0; i < total_frames; ++i) { - timestamps[i] = static_cast(i) * duration_per_frame; - } - const int max_second = static_cast(duration); - - torch::Tensor frame_indices; - - if (total_frames < extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } else { - std::vector tmp; - tmp.reserve(static_cast(total_frames)); - double current_second = 0.0; - const double inv_fps = - 1.0 / (static_cast(temporal_patch_size) * target_fps); - - for (int frame_index = 0; frame_index < total_frames; frame_index++) { - if (timestamps[frame_index] >= current_second) { - current_second += inv_fps; - tmp.push_back(frame_index); - if (current_second >= static_cast(max_second)) { - break; - } - } - } - frame_indices = - torch::tensor(tmp, 
torch::TensorOptions().dtype(torch::kLong)); - } - int64_t len = frame_indices.size(0); - if (len < extract_t) { - int64_t start, end; - if (len == 0) { - start = 0; - end = std::max(total_frames - 1, 0); - } else { - start = frame_indices[0].item(); - end = frame_indices[len - 1].item(); - } - frame_indices = - torch::linspace(start, end, extract_t, torch::dtype(torch::kLong)); - } else if (len > extract_t) { - frame_indices = torch::linspace( - 0, total_frames - 1, extract_t, torch::dtype(torch::kLong)); - } - - len = frame_indices.size(0); - std::unordered_set seen; - seen.reserve(static_cast(len) * 2); - std::vector uniq; - uniq.reserve(static_cast(len)); - - for (int64_t i = 0; i < len; ++i) { - auto idx = frame_indices[i].item(); - if (seen.insert(idx).second) { - uniq.push_back(idx); - } - } - - if (!uniq.empty() && (uniq.size() & 1)) { - uniq.push_back(uniq.back()); - } - - return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong)); -} - } // namespace xllm diff --git a/xllm/processors/image_processor.h b/xllm/processors/image_processor.h index f5b5f33ef..be7fac5a1 100644 --- a/xllm/processors/image_processor.h +++ b/xllm/processors/image_processor.h @@ -39,15 +39,6 @@ class ImageProcessor { virtual torch::Tensor normalize(const torch::Tensor& image, const std::vector& mean, const std::vector& std); - virtual torch::Tensor init_frames(const VideoMetadata& metadata); - virtual torch::Tensor sample_frames(const VideoMetadata& metadata, - int temporal_patch_size, - int min_frames, - int max_frames, - int num_frames = -1, - double set_fps = -1.0); - virtual torch::Tensor GLM_sample_frames(const VideoMetadata& metadata, - int temporal_patch_size); }; } // namespace xllm diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp index e1639dd6b..cd30d8146 100644 --- a/xllm/processors/qwen2_vl_image_processor.cpp +++ b/xllm/processors/qwen2_vl_image_processor.cpp @@ -60,6 +60,72 @@ std::optional smart_resize(int height, } } // namespace +torch::Tensor Qwen2VLImageProcessor::sample_frames( + const VideoMetadata& metadata, + int temporal_patch_size, + int min_frames, + int max_frames, + int num_frames, + double set_fps) { + if (set_fps > 0.0 && num_frames > 0) { + LOG(FATAL) << "num_frames and fps are mutually exclusive arguments, please " + "use only one!"; + } + + double fps = set_fps; + + int total_num_frames = metadata.total_num_frames; + + if (num_frames > 0) { + double double_num_frames = + std::round(static_cast(num_frames) / temporal_patch_size) * + temporal_patch_size; + num_frames = static_cast(double_num_frames); + } else if (fps > 0.0) { + if (metadata.fps <= 0.0) { + LOG(FATAL) + << "Asked to sample `fps` frames per second but no video metadata " + "was provided which is required when sampling with `fps`. "; + } + + max_frames = + (std::min(max_frames, total_num_frames) / temporal_patch_size) * + temporal_patch_size; + double double_num_frames = + static_cast(total_num_frames) / metadata.fps * fps; + double_num_frames = std::min( + std::min(std::max(double_num_frames, static_cast(min_frames)), + static_cast(max_frames)), + static_cast(total_num_frames)); + double_num_frames = std::floor(double_num_frames / temporal_patch_size) * + temporal_patch_size; + + num_frames = static_cast(double_num_frames); + } + + if (num_frames > total_num_frames) { + LOG(FATAL) << "Video can't be sampled. 
The inferred num_frames="
+               << num_frames << " exceeds total_num_frames=" << total_num_frames
+               << ".";
+  }
+
+  if (num_frames > 0) {
+    std::vector indices;
+    indices.reserve(num_frames);
+    for (int i = 0; i < num_frames; ++i) {
+      int64_t k = static_cast(
+          (static_cast(i) * total_num_frames) / num_frames);
+      if (k >= total_num_frames) k = total_num_frames - 1;
+      indices.push_back(k);
+    }
+    return torch::tensor(indices, torch::TensorOptions().dtype(torch::kLong));
+  } else {
+    return torch::arange(0,
+                         static_cast(total_num_frames),
+                         torch::TensorOptions().dtype(torch::kLong));
+  }
+}
+
 Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) {
   image_mean_ = args.mm_image_normalize_mean();
   image_std_ = args.mm_image_normalize_std();
@@ -93,8 +159,7 @@ Qwen2VLImageProcessor::Qwen2VLImageProcessor(const ModelArgs& args) {
 bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) {
   std::vector images = inputs.get_decode_data(MMType::IMAGE);
   std::vector videos = inputs.get_decode_data(MMType::VIDEO);
-  std::vector video_meta_list =
-      inputs.get_video_metadata(MMType::VIDEO);
+  std::vector video_meta_list = inputs.get_video_metadata();

   if (images.empty() && (videos.empty() || video_meta_list.empty())) {
     LOG(ERROR) << "no image/video tensor found.";
@@ -270,7 +335,9 @@ bool Qwen2VLImageProcessor::process_video(
         /*num_frames=*/-1,
         /*set_fps=*/2.0);
   } else {
-    indices = this->init_frames(metadata);  // default sample to 32 frames
+    indices = torch::arange(0,
+                            static_cast(origin_video.size(0)),
+                            torch::TensorOptions().dtype(torch::kLong));
   }
   auto video = origin_video.index_select(/*dim=*/0, indices);
   int64_t sampled_total_frames = video.size(0);
diff --git a/xllm/processors/qwen2_vl_image_processor.h b/xllm/processors/qwen2_vl_image_processor.h
index 974ec83e5..3e35ac501 100644
--- a/xllm/processors/qwen2_vl_image_processor.h
+++ b/xllm/processors/qwen2_vl_image_processor.h
@@ -43,6 +43,12 @@ class Qwen2VLImageProcessor : public ImageProcessor {
                      VideoMetadata& metadata,
                      std::vector& pixel_values,
                      std::vector& grids);
+  torch::Tensor sample_frames(const VideoMetadata& metadata,
+                              int temporal_patch_size,
+                              int min_frames,
+                              int max_frames,
+                              int num_frames = -1,
+                              double set_fps = -1.0);

  private:
   bool do_convert_rgb_ = true;
@@ -66,7 +72,7 @@ class Qwen2VLImageProcessor : public ImageProcessor {
   std::unordered_map size_;

   int temporal_patch_size_ = 2;
-  bool do_sample_frame_ = false;
+  bool do_sample_frame_ = true;
   int min_frames_ = 4;
   int max_frames_ = 768;

From 3b95422805da5a321828ee285261b3b681054958 Mon Sep 17 00:00:00 2001
From: "wangziyue.28"
Date: Wed, 3 Dec 2025 15:47:42 +0800
Subject: [PATCH 20/20] feat: 1. move sample_frames from image_processor.
 2. support different types in chat_template content.
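
The GLM4V sampler that moved out of image_processor in the previous patch keeps
its own duration-tiered budget instead of a caller-supplied fps. For context, a
minimal standalone sketch of that tiering; the 120 s clip length is an assumed
value:

#include <algorithm>
#include <iostream>

int main() {
  // Assumed example: a 120 s clip with temporal_patch_size = 2.
  const double duration_s = 120.0;
  const int temporal_patch_size = 2;
  const double kMaxDuration = 2400.0;  // duration cap used by the sampler
  const int kMaxFramesDynamic = 640;   // hard ceiling on sampled frames

  const double effective = std::min(duration_s, kMaxDuration);
  // Duration tiers: <= 30 s -> 3 fps, <= 300 s -> 1 fps, longer -> 0.5 fps.
  const double target_fps =
      effective <= 30.0 ? 3.0 : (effective <= 300.0 ? 1.0 : 0.5);

  int extract_t = static_cast<int>(
      effective * target_fps * static_cast<double>(temporal_patch_size));
  extract_t = std::min(extract_t, kMaxFramesDynamic);

  // For the assumed 120 s clip: target_fps = 1.0 and extract_t = 240 frames.
  std::cout << "target_fps=" << target_fps << " extract_t=" << extract_t << '\n';
  return 0;
}

After index selection the sampler de-duplicates and pads to an even count, so
frames pair up cleanly into temporal patches.
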