
Commit 01f2c50

feat: support new model glm4-flash.

1 parent 6cda3ee commit 01f2c50

12 files changed, +250 −245 lines changed

xllm/core/framework/chat_template/jinja_chat_template.cpp

Lines changed: 19 additions & 14 deletions

```diff
@@ -121,6 +121,24 @@ std::optional<std::string> JinjaChatTemplate::apply(
     nlohmann::ordered_json& messages,
     const nlohmann::ordered_json& tools,
     const nlohmann::ordered_json& chat_template_kwargs) const {
+  for (auto& msg : messages) {
+    if (!msg.contains("content")) continue;
+    auto& content = msg["content"];
+    auto normalize_item = [](nlohmann::ordered_json& item) {
+      if (item.contains("type") && item["type"].is_string()) {
+        std::string t = item["type"].get<std::string>();
+        if (t == "video_url") item["type"] = "video";
+      }
+      if (item.contains("video_url") && !item.contains("video"))
+        item["video"] = item["video_url"];
+    };
+
+    if (content.is_array()) {
+      for (auto& it : content) normalize_item(it);
+    } else if (content.is_object()) {
+      normalize_item(content);
+    }
+  }
   minja::chat_template_inputs input;
   input.messages = messages;
   input.tools = tools;
@@ -137,23 +155,10 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content(
 
   for (const auto& item : vec) {
     nlohmann::ordered_json item_json;
-    if (item.type == "video_url") {
-      item_json["type"] = "video";
-    } else {
-      item_json["type"] = item.type;
-    }
+    item_json["type"] = item.type;
 
     if (item.type == "text") {
       item_json["text"] = item.text;
-    } else if (item.type == "video_url") {
-      item_json["video"] = "mm place holder";
-      item_json["video_url"] = "mm place holder";
-    } else if (item.type == "image_url") {
-      item_json["image"] = "mm place holder";
-      item_json["image_url"] = "mm place holder";
-    } else if (item.type == "audio_url") {
-      item_json["audio"] = "mm place holder";
-      item_json["audio_url"] = "mm place holder";
     } else {
       item_json[item.type] = "mm place holder";
     }
```
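The first hunk normalizes incoming OpenAI-style message parts before templating: `video_url` parts are retyped to `video`, and the payload is mirrored under a `video` key so GLM-style chat templates can find it. The second hunk then collapses the per-type placeholder branches, since the normalization now happens upstream. A minimal standalone sketch of the effect, assuming nlohmann/json is available (the message content and file path are illustrative):

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
  using json = nlohmann::ordered_json;

  json item;
  item["type"] = "video_url";
  item["video_url"] = {{"url", "file:///tmp/demo.mp4"}};  // illustrative path

  json msg;
  msg["role"] = "user";
  msg["content"] = json::array({item});

  // Mirror of the normalization added in apply():
  for (auto& it : msg["content"]) {
    if (it.contains("type") && it["type"] == "video_url") it["type"] = "video";
    if (it.contains("video_url") && !it.contains("video"))
      it["video"] = it["video_url"];
  }

  std::cout << msg.dump(2) << std::endl;  // "type" is now "video"
}
```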

xllm/core/framework/request/mm_codec.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -159,7 +159,8 @@ bool OpenCVVideoDecoder::decode(const std::string& raw_data,
   av_dict_set(&opts, "probesize", "20000000", 0);
   av_dict_set(&opts, "analyzeduration", "5000000", 0);
 
-  int ret = avformat_open_input(&fmt, nullptr, nullptr, &opts);
+  const AVInputFormat* in_fmt = av_find_input_format("mp4");
+  int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts);
   av_dict_free(&opts);
 
   if (ret < 0) {
```
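This pins the demuxer instead of letting FFmpeg probe the in-memory stream: `av_find_input_format("mp4")` resolves to the combined `mov,mp4,m4a,3gp,3g2,mj2` input format. Two caveats: a pinned format means non-MP4 buffers will now fail to open, and the lookup itself returns `nullptr` on builds without that demuxer. A hedged sketch of a slightly more defensive variant (the fallback to probing is an assumption, not what the commit does):

```cpp
// Sketch only: guards the lookup, since av_find_input_format() returns
// nullptr when the named demuxer is absent from the linked FFmpeg build;
// passing nullptr back to avformat_open_input() restores auto-probing.
extern "C" {
#include <libavformat/avformat.h>
}

int open_as_mp4(AVFormatContext** fmt, AVDictionary** opts) {
  const AVInputFormat* in_fmt = av_find_input_format("mp4");
  return avformat_open_input(fmt, /*url=*/nullptr, in_fmt, opts);
}
```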

xllm/core/framework/request/mm_input.h

Lines changed: 4 additions & 2 deletions

```diff
@@ -58,11 +58,13 @@ struct MMInput {
     return std::move(vec);
   }
 
-  std::vector<VideoMetadata> get_video_metadata() const {
+  std::vector<VideoMetadata> get_video_metadata(MMType type) const {
     std::vector<VideoMetadata> metas;
     metas.reserve(items_.size());
     for (auto& item : items_) {
-      metas.push_back(item.video_meta_);
+      if (item.type_ == type) {
+        metas.push_back(item.video_meta_);
+      }
     }
     return metas;
   }
```
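Previously `get_video_metadata()` returned one (possibly default-constructed) `VideoMetadata` per item, images included; it now filters by the requested `MMType`. A self-contained sketch of the behavior with stand-in types (simplified stand-ins, not xllm's real definitions):

```cpp
#include <vector>

enum class MMType { IMAGE, VIDEO };
struct VideoMetadata { int total_num_frames = 0; };
struct Item { MMType type_; VideoMetadata video_meta_; };

std::vector<VideoMetadata> get_video_metadata(const std::vector<Item>& items,
                                              MMType type) {
  std::vector<VideoMetadata> metas;
  metas.reserve(items.size());
  for (const auto& item : items) {
    if (item.type_ == type) metas.push_back(item.video_meta_);  // filter by type
  }
  return metas;
}

int main() {
  std::vector<Item> items = {{MMType::IMAGE, {}}, {MMType::VIDEO, {32}}};
  // Only the VIDEO item's metadata survives: size() == 1.
  return get_video_metadata(items, MMType::VIDEO).size() == 1 ? 0 : 1;
}
```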

xllm/core/runtime/vlm_master.cpp

Lines changed: 5 additions & 2 deletions

```diff
@@ -220,7 +220,11 @@ void VLMMaster::handle_request(const std::vector<Message>& messages,
                     "Image processor process failed.");
     return;
   }
-
+  if (const auto& res = mm_data.get<torch::Tensor>("image_grid_thw"))
+  {
+    auto image_grid_thw = res.value();
+    LOG(INFO) << "image_grid_thw:" << image_grid_thw;
+  }
   this->handle_request(messages, mm_data, sp, callback);
 }
 
@@ -307,7 +311,6 @@ std::shared_ptr<Request> VLMMaster::generate_request(std::string prompt,
   }
   Timer timer;
   input_processor_->process(prompt, mm_data);
-
   std::vector<int> prompt_tokens;
   if (!tokenizer_->encode(prompt, &prompt_tokens)) {
     LOG(ERROR) << "Failed to encode prompt: " << prompt;
```
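The added guard uses the if-initializer idiom: `mm_data.get<torch::Tensor>(...)` evidently returns an optional-like handle, so the log fires only when the image processor actually produced a grid. A stand-alone sketch of the same idiom (stand-in types; MMData's real `get<T>()` signature is not shown in this diff):

```cpp
#include <iostream>
#include <optional>
#include <string>

// Stand-in for MMData::get<T>(): empty when the key was never produced.
std::optional<std::string> lookup(const std::string& key) {
  if (key == "image_grid_thw") return "[[1, 34, 52]]";  // illustrative value
  return std::nullopt;
}

int main() {
  // Binding in the if-condition scopes `res` to the branch, exactly like
  // the guard added around the image_grid_thw log.
  if (const auto res = lookup("image_grid_thw")) {
    std::cout << "image_grid_thw:" << *res << std::endl;
  }
}
```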

xllm/models/llm/glm4.h

Lines changed: 11 additions & 13 deletions

```diff
@@ -93,20 +93,18 @@ class Glm4ModelImpl : public LlmModelImplBase<Glm4DecoderLayer> {
 
     if (positions.dim() == 2) {  // mrope
       auto apply = [this](torch::Tensor x) {
-        auto freqs_t = x[0].clone();
-        for (int dim_idx = 1; dim_idx <= 2; ++dim_idx) {
-          int64_t offset = dim_idx;
-          int64_t section_len = mrope_section_[dim_idx];
-          int64_t length = section_len * 2;
-          auto idx_first_half = torch::arange(offset, length, 3, torch::kLong);
-          auto idx_second_half = torch::arange(offset, length, 3, torch::kLong);
-          auto idx_tensor =
-              torch::cat({idx_first_half, idx_second_half}, 0).to(x.device());
-          // freqs_t[..., idx] = freqs[dim_idx][..., idx]
-          auto src = x[dim_idx].index_select(-1, idx_tensor);
-          freqs_t.index_copy_(-1, idx_tensor, src);
+        auto sections = mrope_section_;
+        sections.insert(sections.end(), sections.begin(), sections.end());
+
+        auto vec = x.split(sections, -1);
+        std::vector<torch::Tensor> selects;
+        selects.reserve(vec.size());
+
+        for (int64_t i = 0; i < vec.size(); ++i) {
+          auto m = vec[i];
+          selects.push_back(m[i % mrope_section_.size()]);
         }
-        return freqs_t;
+        return torch::cat(selects, -1);
       };
       cos_pos = apply(cos_pos.reshape(
          {positions.sizes().front(), -1, cos_pos.sizes().back()}));
```
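The loop-and-`index_copy_` construction is replaced by the split/concat formulation familiar from Hugging Face's Qwen2-VL mrope code: the t/h/w section list is repeated once so it spans both rotary halves, the last dimension is split into those six chunks, and chunk i is taken from position stream i mod 3. A self-contained sketch with illustrative section sizes (note the sketch copies from a second vector; inserting a vector's own range into itself, as the committed lambda does, is not guaranteed safe by `std::vector` if reallocation occurs):

```cpp
#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
  // Illustrative t/h/w sections summing to half of the last dim (8 / 2 = 4).
  std::vector<int64_t> section = {2, 1, 1};
  auto x = torch::arange(3 * 5 * 8, torch::kFloat).reshape({3, 5, 8});

  // Repeat the section list so it covers both rotary halves of the last dim.
  std::vector<int64_t> sections = section;
  sections.insert(sections.end(), section.begin(), section.end());

  auto chunks = x.split_with_sizes(sections, /*dim=*/-1);
  std::vector<torch::Tensor> selects;
  selects.reserve(chunks.size());
  for (size_t i = 0; i < chunks.size(); ++i) {
    // Chunk i comes from position stream i mod 3: t, h, w, t, h, w.
    selects.push_back(chunks[i][static_cast<int64_t>(i % section.size())]);
  }
  auto merged = torch::cat(selects, /*dim=*/-1);
  std::cout << merged.sizes() << std::endl;  // [5, 8]
}
```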

xllm/models/vlm/glm4v.h

Lines changed: 3 additions & 16 deletions

```diff
@@ -605,7 +605,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
       blocks_->push_back(block);
       layers_.push_back(block);
     }
-    // TODO: fused operators
     post_layernorm_ = register_module("post_layernorm", Glm4VisionRmsNorm(context));
 
     downsample_ = register_module("downsample", torch::nn::Conv2d(torch::nn::Conv2dOptions(hidden_size_, out_hidden_size_, spatial_merge_size_)
@@ -672,8 +671,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
     auto repeated = torch::repeat_interleave(h_times_w, repeats, 0);
     c10::optional<torch::ScalarType> cumsum_dtype;
 
-    LOG(INFO) << " Glm4VisionTransformerImpl repeated " << repeated;
-
     cumsum_dtype = torch::kInt32;
     auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype);
     namespace F = torch::nn::functional;
@@ -682,27 +679,21 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
     std::vector<int> seqlens;
     seqlens.assign(cu_seqlens.data_ptr<int>(), cu_seqlens.data_ptr<int>() + cu_seqlens.numel());
 
-    LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before cu_seqlens " << cu_seqlens << "seqlens.size()" << seqlens.size();
     hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1));
-    LOG(INFO) << " Glm4VisionTransformerImpl forward embedding after ";
     ModelInputParams& input_params_new = const_cast<ModelInputParams&>(input_params);
     torch::Tensor cu_seqlens_cpu = cu_seqlens.cpu();
     std::vector<int> cu_seqlens_vec(
-        cu_seqlens_cpu.data_ptr<int>(),  // full seqlen vec
+        cu_seqlens_cpu.data_ptr<int>(),
         cu_seqlens_cpu.data_ptr<int>() + cu_seqlens_cpu.numel());
+    cu_seqlens = cu_seqlens.to(hidden_states.device());
     for (int idx = 0; idx < blocks_->size(); ++idx) {
-      hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx);  // TODO
-      LOG(INFO) << " Glm4VisionTransformerImpl forward layer " << idx;
+      hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx);
     }
-    LOG(INFO) << " Glm4VisionTransformerImpl forward layer after ";
     hidden_states = post_layernorm_(hidden_states);
     hidden_states = hidden_states.view({-1, spatial_merge_size_, spatial_merge_size_, hidden_states.size(-1)});
-    // TO down sample merge op
     hidden_states = hidden_states.permute({0, 3, 1, 2});
     hidden_states = downsample_(hidden_states).view({-1, out_hidden_size_});
-    LOG(INFO) << " Glm4VisionTransformerImpl downsample after";
     hidden_states = merger_(hidden_states);
-    LOG(INFO) << " Glm4VisionTransformerImpl forward end";
     return hidden_states;
   };
@@ -820,12 +811,10 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
                            const ModelInputParams& input_params) {
     auto inputs_embeds = language_model_->get_input_embeddings(input_ids);
     if (image_input) {
-      // visual
       auto image_embeds =
           visual_(image_input->pixel_values.to(options_),
                   image_input->image_grid_thw,
                   input_params);
-      // merge
       auto is_multimodal = torch::isin(input_ids, model_args_.image_token_id());
       input_params.visual_pos_masks = is_multimodal;
       inputs_embeds.index_put_({is_multimodal}, image_embeds);
@@ -851,7 +840,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
 
     if (pixel_values.defined() && image_grid_thw.defined())
       image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw};
-
     auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params);
     input_params.input_embedding = inputs_embeds;
     auto emb = language_model_(tokens, positions, kv_caches, input_params);
@@ -869,7 +857,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
       visual_->load_state_dict(
           state_dict->get_dict_with_prefix("model.visual."));
     }
-    // verify
     visual_->verify_loaded_weights("model.visual.");
     visual_->merge_loaded_weights();
     if (!model_args_.image_embedding_mode()) {
```
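Besides the debug-log cleanup, one functional change hides here: `cu_seqlens` is now explicitly moved to the device of `hidden_states` before the block loop, while the int32 CPU copy keeps feeding `cu_seqlens_vec` (`data_ptr<int>()` requires host memory). A minimal sketch of that host/device split, with illustrative values:

```cpp
#include <torch/torch.h>
#include <vector>

int main() {
  // Illustrative per-image patch counts; cumsum yields cu_seqlens = [4, 10].
  auto cu_seqlens = torch::cumsum(torch::tensor({4, 6}), 0, torch::kInt32);

  // Host copy for C++-side bookkeeping: data_ptr<int>() needs CPU memory.
  auto cpu = cu_seqlens.cpu();
  std::vector<int> vec(cpu.data_ptr<int>(), cpu.data_ptr<int>() + cpu.numel());

  // Device copy for the attention kernels; a no-op if already resident there.
  auto device = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
  cu_seqlens = cu_seqlens.to(device);
}
```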

xllm/processors/glm4v_image_processor.cpp

Lines changed: 6 additions & 118 deletions

```diff
@@ -77,117 +77,6 @@ std::optional<Size> smart_resize(int num_frames,
 }
 }  // namespace
 
-torch::Tensor Glm4VImageProcessor::sample_frames(const VideoMetadata& metadata,
-                                                 int temporal_patch_size) {
-  // video: [T, C, H, W]
-  const int total_frames = metadata.total_num_frames;
-  if (total_frames <= 0) {
-    return torch::empty({0}, torch::dtype(torch::kLong));
-  }
-
-  if (metadata.fps <= 0.0) {
-    LOG(FATAL) << "invalid metadata.fps <= 0";
-  }
-
-  const int max_frame_idx = total_frames - 1;
-
-  // duration = metadata.duration or round(max_idx / fps) + 1
-  double duration = metadata.duration;
-  if (duration <= 0.0) {
-    duration =
-        std::round(static_cast<double>(max_frame_idx) / metadata.fps) + 1.0;
-  }
-
-  constexpr double DYN_FPS_30 = 3.0;
-  constexpr double DYN_FPS_300 = 1.0;
-  constexpr double DYN_FPS_2400 = 0.5;
-  constexpr int MAX_FRAME_COUNT_DYNAMIC = 640;
-  constexpr double MAX_DURATION = 2400.0;
-
-  const double effective_duration = std::min(duration, MAX_DURATION);
-
-  double target_fps = 0.0;
-  if (effective_duration <= 30.0) {
-    target_fps = DYN_FPS_30;
-  } else if (effective_duration <= 300.0) {
-    target_fps = DYN_FPS_300;
-  } else {
-    target_fps = DYN_FPS_2400;
-  }
-
-  // extract_t = int(effective_duration * target_fps * temporal_patch_size)
-  int extract_t = static_cast<int>(effective_duration * target_fps *
-                                   static_cast<double>(temporal_patch_size));
-  extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC);
-
-  const double duration_per_frame = 1.0 / metadata.fps;
-  std::vector<double> timestamps(total_frames);
-  for (int i = 0; i < total_frames; ++i) {
-    timestamps[i] = static_cast<double>(i) * duration_per_frame;
-  }
-  const int max_second = static_cast<int>(duration);
-
-  torch::Tensor frame_indices;
-
-  if (total_frames < extract_t) {
-    frame_indices = torch::linspace(
-        0, total_frames - 1, extract_t, torch::dtype(torch::kLong));
-  } else {
-    std::vector<int64_t> tmp;
-    tmp.reserve(static_cast<size_t>(total_frames));
-    double current_second = 0.0;
-    const double inv_fps =
-        1.0 / (static_cast<double>(temporal_patch_size) * target_fps);
-
-    for (int frame_index = 0; frame_index < total_frames; frame_index++) {
-      if (timestamps[frame_index] >= current_second) {
-        current_second += inv_fps;
-        tmp.push_back(frame_index);
-        if (current_second >= static_cast<double>(max_second)) {
-          break;
-        }
-      }
-    }
-    frame_indices =
-        torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong));
-  }
-  int64_t len = frame_indices.size(0);
-  if (len < extract_t) {
-    int64_t start, end;
-    if (len == 0) {
-      start = 0;
-      end = std::max<int64_t>(total_frames - 1, 0);
-    } else {
-      start = frame_indices[0].item<int64_t>();
-      end = frame_indices[len - 1].item<int64_t>();
-    }
-    frame_indices =
-        torch::linspace(start, end, extract_t, torch::dtype(torch::kLong));
-  } else if (len > extract_t) {
-    frame_indices = torch::linspace(
-        0, total_frames - 1, extract_t, torch::dtype(torch::kLong));
-  }
-
-  len = frame_indices.size(0);
-  std::unordered_set<int64_t> seen;
-  seen.reserve(static_cast<size_t>(len) * 2);
-  std::vector<int64_t> uniq;
-  uniq.reserve(static_cast<size_t>(len));
-
-  for (int64_t i = 0; i < len; ++i) {
-    auto idx = frame_indices[i].item<int64_t>();
-    if (seen.insert(idx).second) {
-      uniq.push_back(idx);
-    }
-  }
-
-  if (!uniq.empty() && (uniq.size() & 1)) {
-    uniq.push_back(uniq.back());
-  }
-
-  return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong));
-}
-
 Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) {
   image_mean_ = args.mm_image_normalize_mean();
   image_std_ = args.mm_image_normalize_std();
@@ -223,7 +112,8 @@ Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) {
 bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) {
   std::vector<torch::Tensor> images = inputs.get_decode_data(MMType::IMAGE);
   std::vector<torch::Tensor> videos = inputs.get_decode_data(MMType::VIDEO);
-  std::vector<VideoMetadata> video_meta_list = inputs.get_video_metadata();
+  std::vector<VideoMetadata> video_meta_list =
+      inputs.get_video_metadata(MMType::VIDEO);
 
   if (images.empty() && (videos.empty() || video_meta_list.empty())) {
     LOG(ERROR) << "no image/video tensor found.";
@@ -359,8 +249,8 @@ bool Glm4VImageProcessor::process_videos(
 
   auto values = torch::cat(pixel_values);
   auto thw = torch::tensor(grids).clone().reshape({-1, 3});
-  mm_datas.add(MMType::VIDEO, "video_grid_thw", thw);
-  mm_datas.add(MMType::VIDEO, "pixel_values_videos", values);
+  mm_datas.update(MMType::VIDEO, "video_grid_thw", thw);
+  mm_datas.update(MMType::VIDEO, "pixel_values_videos", values);
 
   return true;
 }
@@ -376,11 +266,9 @@ bool Glm4VImageProcessor::process_video(
 
   torch::Tensor indices;
   if (do_sample_frame_) {
-    indices = this->sample_frames(metadata, temporal_patch_size_);
+    indices = this->GLM_sample_frames(metadata, temporal_patch_size_);
   } else {
-    indices = torch::arange(0,
-                            static_cast<int64_t>(origin_video.size(0)),
-                            torch::TensorOptions().dtype(torch::kLong));
+    indices = this->init_frames(metadata);  // default sample to 32 frames
  }
  auto video = origin_video.index_select(/*dim=*/0, indices);
  int64_t sampled_total_frames = video.size(0);
```
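The bespoke `sample_frames` (dynamic fps: 3 fps under 30 s, 1 fps under 300 s, 0.5 fps up to 2400 s, capped at 640 frames, deduplicated and padded to an even count) is deleted in favor of `GLM_sample_frames`, and the old take-every-frame fallback becomes `init_frames`, which per the inline comment samples a default of 32 frames. Neither new body appears in this diff; as a rough sketch only, a uniform sampler consistent with that comment might look like this (the name and spacing policy are assumptions):

```cpp
#include <torch/torch.h>
#include <algorithm>

// Hypothetical sketch: evenly spaced frame indices, capped at `target`.
// init_frames' real implementation is not shown in this commit.
torch::Tensor uniform_frame_indices(int64_t total_frames, int64_t target = 32) {
  if (total_frames <= 0) return torch::empty({0}, torch::kLong);
  const int64_t n = std::min(total_frames, target);
  return torch::linspace(0, total_frames - 1, n).to(torch::kLong);
}
```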

xllm/processors/glm4v_image_processor.h

File mode changed: 100644 → 100755
Lines changed: 0 additions & 2 deletions

```diff
@@ -42,8 +42,6 @@ class Glm4VImageProcessor : public ImageProcessor {
                      VideoMetadata& metadata,
                      std::vector<torch::Tensor>& pixel_values,
                      std::vector<int64_t>& grids);
-  torch::Tensor sample_frames(const VideoMetadata& metadata,
-                              int temporal_patch_size);
 
  private:
   bool do_convert_rgb_ = true;
```
