feat: support new model glm4-flash.

DongheJin · DongheJin · commit a7e15a621b22 · 2025-12-04T17:18:32.000+08:00
diff --git a/xllm/core/runtime/vlm_master.cpp b/xllm/core/runtime/vlm_master.cpp
@@ -220,7 +220,11 @@ void VLMMaster::handle_request(const std::vector<Message>& messages,
                         "Image processor process failed.");
     return;
   }
-
+  if (const auto& res = mm_data.get<torch::Tensor>("image_grid_thw"))
+  {
+    auto image_grid_thw = res.value();
+  LOG(INFO)<<"image_grid_thw:"<<image_grid_thw;
+  }
   this->handle_request(messages, mm_data, sp, callback);
 }
 
@@ -307,7 +311,6 @@ std::shared_ptr<Request> VLMMaster::generate_request(std::string prompt,
   }
   Timer timer;
   input_processor_->process(prompt, mm_data);
-
   std::vector<int> prompt_tokens;
   if (!tokenizer_->encode(prompt, &prompt_tokens)) {
     LOG(ERROR) << "Failed to encode prompt: " << prompt;
diff --git a/xllm/models/llm/glm4.h b/xllm/models/llm/glm4.h
@@ -93,20 +93,18 @@ class Glm4ModelImpl : public LlmModelImplBase<Glm4DecoderLayer> {
 
     if (positions.dim() == 2) {  // mrope
       auto apply = [this](torch::Tensor x) {
-        auto freqs_t = x[0].clone();
-        for (int dim_idx = 1; dim_idx <= 2; ++dim_idx) {
-          int64_t offset = dim_idx;
-          int64_t section_len = mrope_section_[dim_idx];
-          int64_t length = section_len * 2;
-          auto idx_first_half = torch::arange(offset, length, 3, torch::kLong);
-          auto idx_second_half = torch::arange(offset, length, 3, torch::kLong);
-          auto idx_tensor =
-              torch::cat({idx_first_half, idx_second_half}, 0).to(x.device());
-          // freqs_t[..., idx] = freqs[dim_idx][..., idx]
-          auto src = x[dim_idx].index_select(-1, idx_tensor);
-          freqs_t.index_copy_(-1, idx_tensor, src);
+        auto sections = mrope_section_;
+        sections.insert(sections.end(), sections.begin(), sections.end());
+
+        auto vec = x.split(sections, -1);
+        std::vector<torch::Tensor> selects;
+        selects.reserve(vec.size());
+
+        for (int64_t i = 0; i < vec.size(); ++i) {
+          auto m = vec[i];
+          selects.push_back(m[i % mrope_section_.size()]);
         }
-        return freqs_t;
+        return torch::cat(selects, -1);
       };
       cos_pos = apply(cos_pos.reshape(
           {positions.sizes().front(), -1, cos_pos.sizes().back()}));
diff --git a/xllm/models/vlm/glm4v.h b/xllm/models/vlm/glm4v.h
@@ -605,7 +605,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
       blocks_->push_back(block);
       layers_.push_back(block);
     }
-     // TODO 融合算子
     post_layernorm_ = register_module("post_layernorm", Glm4VisionRmsNorm(context));
 
     downsample_ = register_module("downsample", torch::nn::Conv2d(torch::nn::Conv2dOptions(hidden_size_, out_hidden_size_, spatial_merge_size_)
@@ -672,8 +671,6 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
     auto repeated = torch::repeat_interleave(h_times_w, repeats, 0);
     c10::optional<torch::ScalarType> cumsum_dtype;
 
-    LOG(INFO) << " Glm4VisionTransformerImpl repeated " << repeated;
-
     cumsum_dtype = torch::kInt32;
     auto cu_seqlens = torch::cumsum(repeated, 0, cumsum_dtype);
     namespace F = torch::nn::functional;
@@ -682,27 +679,21 @@ class Glm4VisionTransformerImpl : public torch::nn::Module {
     std::vector<int> seqlens;
     seqlens.assign(cu_seqlens.data_ptr<int>(),cu_seqlens.data_ptr<int>() + cu_seqlens.numel());
 
-    LOG(INFO) << " Glm4VisionTransformerImpl forward embedding before cu_seqlens " << cu_seqlens << "seqlens.size()" << seqlens.size();
     hidden_states = embeddings_(hidden_states, seqlens, grid_thw, image_type_ids.select(1, 0), image_type_ids.select(1, 1));
-    LOG(INFO) << " Glm4VisionTransformerImpl forward embedding after ";
     ModelInputParams& input_params_new = const_cast<ModelInputParams&>(input_params);
     torch::Tensor cu_seqlens_cpu = cu_seqlens.cpu();
     std::vector<int> cu_seqlens_vec( 
-        cu_seqlens_cpu.data_ptr<int>(),  // full seqlen vec
+        cu_seqlens_cpu.data_ptr<int>(), 
         cu_seqlens_cpu.data_ptr<int>() + cu_seqlens_cpu.numel());
+    cu_seqlens = cu_seqlens.to(hidden_states.device());
     for (int idx = 0; idx < blocks_->size(); ++idx) {  
-      hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx); //TODO
-      LOG(INFO) << " Glm4VisionTransformerImpl forward layer "<< idx;
+      hidden_states = layers_[idx](hidden_states, m_cos, m_sin, cu_seqlens, cu_seqlens_vec, input_params_new, idx);
     }
-    LOG(INFO) << " Glm4VisionTransformerImpl forward layer after ";
     hidden_states = post_layernorm_(hidden_states);
     hidden_states = hidden_states.view({-1, spatial_merge_size_, spatial_merge_size_, hidden_states.size(-1)});
-    // TO down sample  merge op
     hidden_states = hidden_states.permute({0, 3, 1, 2});
     hidden_states = downsample_(hidden_states).view({-1, out_hidden_size_});
-    LOG(INFO) << " Glm4VisionTransformerImpl downsample after";
     hidden_states = merger_(hidden_states);
-    LOG(INFO) << " Glm4VisionTransformerImpl forward end";
     return hidden_states;
   };
 
@@ -820,12 +811,10 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
       const ModelInputParams& input_params) {
     auto inputs_embeds = language_model_->get_input_embeddings(input_ids);
     if (image_input) {
-      // visual
       auto image_embeds =
           visual_(image_input->pixel_values.to(options_),
                   image_input->image_grid_thw,
                   input_params);
-      // merge
       auto is_multimodal = torch::isin(input_ids,
       model_args_.image_token_id()); input_params.visual_pos_masks =
       is_multimodal; inputs_embeds.index_put_({is_multimodal}, image_embeds);
@@ -851,7 +840,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
 
     if (pixel_values.defined() && image_grid_thw.defined())
       image_inputs = Glm4VImageInputs{pixel_values, image_grid_thw};
-
     auto inputs_embeds = get_input_embeddings(tokens, image_inputs, video_inputs, input_params);
     input_params.input_embedding = inputs_embeds;
     auto emb = language_model_(tokens, positions, kv_caches, input_params);
@@ -869,7 +857,6 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
       visual_->load_state_dict(
           state_dict->get_dict_with_prefix("model.visual."));
     }
-    // verify
     visual_->verify_loaded_weights("model.visual.");
     visual_->merge_loaded_weights();
     if (!model_args_.image_embedding_mode()) {

Original file line number	Diff line number	Diff line change
`@@ -220,7 +220,11 @@ void VLMMaster::handle_request(const std::vector<Message>& messages,`
`220`	`220`	`"Image processor process failed.");`
`221`	`221`	`return;`
`222`	`222`	`}`
`223`		`-`
	`223`	`+ if (const auto& res = mm_data.get<torch::Tensor>("image_grid_thw"))`
	`224`	`+ {`
	`225`	`+ auto image_grid_thw = res.value();`
	`226`	`+ LOG(INFO)<<"image_grid_thw:"<<image_grid_thw;`
	`227`	`+ }`
`224`	`228`	`this->handle_request(messages, mm_data, sp, callback);`
`225`	`229`	`}`
`226`	`230`
`@@ -307,7 +311,6 @@ std::shared_ptr<Request> VLMMaster::generate_request(std::string prompt,`
`307`	`311`	`}`
`308`	`312`	`Timer timer;`
`309`	`313`	`input_processor_->process(prompt, mm_data);`
`310`		`-`
`311`	`314`	`std::vector<int> prompt_tokens;`
`312`	`315`	`if (!tokenizer_->encode(prompt, &prompt_tokens)) {`
`313`	`316`	`LOG(ERROR) << "Failed to encode prompt: " << prompt;`