
Commit 3546418 (1 parent: 01f2c50)

xanecdotex authored and DongheJin committed

feat: 1. move sample_frames from image_processor. 2. support different types in chat_template content.

File tree: 9 files changed (+209 −230 lines)


xllm/core/framework/chat_template/jinja_chat_template.cpp (9 additions, 18 deletions)

@@ -121,24 +121,6 @@ std::optional<std::string> JinjaChatTemplate::apply(
     nlohmann::ordered_json& messages,
     const nlohmann::ordered_json& tools,
     const nlohmann::ordered_json& chat_template_kwargs) const {
-  for (auto& msg : messages) {
-    if (!msg.contains("content")) continue;
-    auto& content = msg["content"];
-    auto normalize_item = [](nlohmann::ordered_json& item) {
-      if (item.contains("type") && item["type"].is_string()) {
-        std::string t = item["type"].get<std::string>();
-        if (t == "video_url") item["type"] = "video";
-      }
-      if (item.contains("video_url") && !item.contains("video"))
-        item["video"] = item["video_url"];
-    };
-
-    if (content.is_array()) {
-      for (auto& it : content) normalize_item(it);
-    } else if (content.is_object()) {
-      normalize_item(content);
-    }
-  }
   minja::chat_template_inputs input;
   input.messages = messages;
   input.tools = tools;

@@ -159,6 +141,15 @@ nlohmann::ordered_json JinjaChatTemplate::get_mm_content(

   if (item.type == "text") {
     item_json["text"] = item.text;
+  } else if (item.type == "video_url") {
+    item_json["video"] = "mm place holder";
+    item_json["video_url"] = "mm place holder";
+  } else if (item.type == "image_url") {
+    item_json["image"] = "mm place holder";
+    item_json["image_url"] = "mm place holder";
+  } else if (item.type == "audio_url") {
+    item_json["audio"] = "mm place holder";
+    item_json["audio_url"] = "mm place holder";
   } else {
     item_json[item.type] = "mm place holder";
   }
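
The effect of this pair of hunks: the in-place normalization pass that rewrote "video_url" items to "video" inside apply() is deleted, and get_mm_content() instead emits a placeholder under both the raw type key and its alias, so chat templates written against either name keep rendering. A self-contained sketch of that mapping (the helper name make_mm_item is mine, not from the commit):

// Minimal sketch of the placeholder mapping introduced in this commit.
// make_mm_item is a hypothetical helper for illustration only.
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

nlohmann::ordered_json make_mm_item(const std::string& type) {
  nlohmann::ordered_json item_json;
  item_json["type"] = type;
  if (type == "video_url") {
    item_json["video"] = "mm place holder";
    item_json["video_url"] = "mm place holder";
  } else if (type == "image_url") {
    item_json["image"] = "mm place holder";
    item_json["image_url"] = "mm place holder";
  } else if (type == "audio_url") {
    item_json["audio"] = "mm place holder";
    item_json["audio_url"] = "mm place holder";
  } else {
    item_json[type] = "mm place holder";
  }
  return item_json;
}

int main() {
  // Prints both "video" and "video_url" placeholder keys, so a template
  // that references either one resolves.
  std::cout << make_mm_item("video_url").dump(2) << "\n";
}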

xllm/core/framework/request/mm_codec.cpp (1 addition, 2 deletions)

@@ -159,8 +159,7 @@ bool OpenCVVideoDecoder::decode(const std::string& raw_data,
   av_dict_set(&opts, "probesize", "20000000", 0);
   av_dict_set(&opts, "analyzeduration", "5000000", 0);

-  const AVInputFormat* in_fmt = av_find_input_format("mp4");
-  int ret = avformat_open_input(&fmt, nullptr, in_fmt, &opts);
+  int ret = avformat_open_input(&fmt, nullptr, nullptr, &opts);
   av_dict_free(&opts);

   if (ret < 0) {
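
The old code forced the "mp4" demuxer via av_find_input_format("mp4"), which breaks any input that is not MP4; passing nullptr lets FFmpeg probe the container from the data itself, with probesize and analyzeduration bounding how much it reads. A minimal sketch of the auto-detect path, assuming a plain file path for brevity (the real decoder presumably feeds raw_data through a custom AVIOContext, which this hunk does not show):

// Sketch only: open a media input and let FFmpeg detect the demuxer.
extern "C" {
#include <libavformat/avformat.h>
}
#include <cstdio>

int probe_container(const char* path) {
  AVFormatContext* fmt = avformat_alloc_context();
  AVDictionary* opts = nullptr;
  av_dict_set(&opts, "probesize", "20000000", 0);       // max bytes to probe
  av_dict_set(&opts, "analyzeduration", "5000000", 0);  // max microseconds to analyze
  // Third argument nullptr: probe the format instead of forcing one.
  int ret = avformat_open_input(&fmt, path, nullptr, &opts);
  av_dict_free(&opts);
  if (ret < 0) {
    return ret;  // fmt is freed by avformat_open_input on failure
  }
  std::printf("detected demuxer: %s\n", fmt->iformat->name);
  avformat_close_input(&fmt);
  return 0;
}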

xllm/core/framework/request/mm_input.h (2 additions, 4 deletions)

@@ -58,13 +58,11 @@ struct MMInput {
     return std::move(vec);
   }

-  std::vector<VideoMetadata> get_video_metadata(MMType type) const {
+  std::vector<VideoMetadata> get_video_metadata() const {
     std::vector<VideoMetadata> metas;
     metas.reserve(items_.size());
     for (auto& item : items_) {
-      if (item.type_ == type) {
-        metas.push_back(item.video_meta_);
-      }
+      metas.push_back(item.video_meta_);
     }
     return metas;
  }
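
One side effect worth flagging (my reading, not stated in the commit message): without the MMType filter, get_video_metadata() now returns one entry per item in items_, including whatever video_meta_ non-video items carry; the call site in Glm4VImageProcessor::process pairs this list with get_decode_data(MMType::VIDEO), which presumably stays consistent only when the request carries a single modality.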

xllm/processors/glm4v_image_processor.cpp (118 additions, 6 deletions)

@@ -77,6 +77,117 @@ std::optional<Size> smart_resize(int num_frames,
 }
 }  // namespace

+torch::Tensor Glm4VImageProcessor::sample_frames(const VideoMetadata& metadata,
+                                                 int temporal_patch_size) {
+  // video: [T, C, H, W]
+  const int total_frames = metadata.total_num_frames;
+  if (total_frames <= 0) {
+    return torch::empty({0}, torch::dtype(torch::kLong));
+  }
+
+  if (metadata.fps <= 0.0) {
+    LOG(FATAL) << "invalid metadata.fps <= 0";
+  }
+
+  const int max_frame_idx = total_frames - 1;
+
+  // duration = metadata.duration or round(max_idx / fps) + 1
+  double duration = metadata.duration;
+  if (duration <= 0.0) {
+    duration =
+        std::round(static_cast<double>(max_frame_idx) / metadata.fps) + 1.0;
+  }
+
+  constexpr double DYN_FPS_30 = 3.0;
+  constexpr double DYN_FPS_300 = 1.0;
+  constexpr double DYN_FPS_2400 = 0.5;
+  constexpr int MAX_FRAME_COUNT_DYNAMIC = 640;
+  constexpr double MAX_DURATION = 2400.0;
+
+  const double effective_duration = std::min(duration, MAX_DURATION);
+
+  double target_fps = 0.0;
+  if (effective_duration <= 30.0) {
+    target_fps = DYN_FPS_30;
+  } else if (effective_duration <= 300.0) {
+    target_fps = DYN_FPS_300;
+  } else {
+    target_fps = DYN_FPS_2400;
+  }
+
+  // extract_t = int(effective_duration * target_fps * temporal_patch_size)
+  int extract_t = static_cast<int>(effective_duration * target_fps *
+                                   static_cast<double>(temporal_patch_size));
+  extract_t = std::min(extract_t, MAX_FRAME_COUNT_DYNAMIC);

+  const double duration_per_frame = 1.0 / metadata.fps;
+  std::vector<double> timestamps(total_frames);
+  for (int i = 0; i < total_frames; ++i) {
+    timestamps[i] = static_cast<double>(i) * duration_per_frame;
+  }
+  const int max_second = static_cast<int>(duration);
+
+  torch::Tensor frame_indices;
+
+  if (total_frames < extract_t) {
+    frame_indices = torch::linspace(
+        0, total_frames - 1, extract_t, torch::dtype(torch::kLong));
+  } else {
+    std::vector<int64_t> tmp;
+    tmp.reserve(static_cast<size_t>(total_frames));
+    double current_second = 0.0;
+    const double inv_fps =
+        1.0 / (static_cast<double>(temporal_patch_size) * target_fps);
+
+    for (int frame_index = 0; frame_index < total_frames; frame_index++) {
+      if (timestamps[frame_index] >= current_second) {
+        current_second += inv_fps;
+        tmp.push_back(frame_index);
+        if (current_second >= static_cast<double>(max_second)) {
+          break;
+        }
+      }
+    }
+    frame_indices =
+        torch::tensor(tmp, torch::TensorOptions().dtype(torch::kLong));
+  }
+  int64_t len = frame_indices.size(0);
+  if (len < extract_t) {
+    int64_t start, end;
+    if (len == 0) {
+      start = 0;
+      end = std::max<int64_t>(total_frames - 1, 0);
+    } else {
+      start = frame_indices[0].item<int64_t>();
+      end = frame_indices[len - 1].item<int64_t>();
+    }
+    frame_indices =
+        torch::linspace(start, end, extract_t, torch::dtype(torch::kLong));
+  } else if (len > extract_t) {
+    frame_indices = torch::linspace(
+        0, total_frames - 1, extract_t, torch::dtype(torch::kLong));
+  }
+
+  len = frame_indices.size(0);
+  std::unordered_set<int64_t> seen;
+  seen.reserve(static_cast<size_t>(len) * 2);
+  std::vector<int64_t> uniq;
+  uniq.reserve(static_cast<size_t>(len));
+
+  for (int64_t i = 0; i < len; ++i) {
+    auto idx = frame_indices[i].item<int64_t>();
+    if (seen.insert(idx).second) {
+      uniq.push_back(idx);
+    }
+  }
+
+  if (!uniq.empty() && (uniq.size() & 1)) {
+    uniq.push_back(uniq.back());
+  }
+
+  return torch::tensor(uniq, torch::TensorOptions().dtype(torch::kLong));
+}
+
 Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) {
   image_mean_ = args.mm_image_normalize_mean();
   image_std_ = args.mm_image_normalize_std();
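
A worked example of the new sampling arithmetic (mine, not from the commit): for a 120 s clip at 24 fps with temporal_patch_size = 2, effective_duration = 120 falls in the (30, 300] bucket, so target_fps = DYN_FPS_300 = 1.0 and extract_t = int(120 * 1.0 * 2) = 240, well under the 640-frame cap. total_frames = 2880 >= 240, so the timestamp walk keeps one frame roughly every 1 / (2 * 1.0) = 0.5 s until max_second = 120; the indices are then deduplicated and padded to an even count so sampled frames pair cleanly into temporal patches of size 2.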
@@ -112,8 +223,7 @@ Glm4VImageProcessor::Glm4VImageProcessor(const ModelArgs& args) {
 bool Glm4VImageProcessor::process(const MMInput& inputs, MMData& datas) {
   std::vector<torch::Tensor> images = inputs.get_decode_data(MMType::IMAGE);
   std::vector<torch::Tensor> videos = inputs.get_decode_data(MMType::VIDEO);
-  std::vector<VideoMetadata> video_meta_list =
-      inputs.get_video_metadata(MMType::VIDEO);
+  std::vector<VideoMetadata> video_meta_list = inputs.get_video_metadata();

   if (images.empty() && (videos.empty() || video_meta_list.empty())) {
     LOG(ERROR) << "no image/video tensor found.";

@@ -249,8 +359,8 @@ bool Glm4VImageProcessor::process_videos(

   auto values = torch::cat(pixel_values);
   auto thw = torch::tensor(grids).clone().reshape({-1, 3});
-  mm_datas.update(MMType::VIDEO, "video_grid_thw", thw);
-  mm_datas.update(MMType::VIDEO, "pixel_values_videos", values);
+  mm_datas.add(MMType::VIDEO, "video_grid_thw", thw);
+  mm_datas.add(MMType::VIDEO, "pixel_values_videos", values);

   return true;
 }

@@ -266,9 +376,11 @@ bool Glm4VImageProcessor::process_video(

   torch::Tensor indices;
   if (do_sample_frame_) {
-    indices = this->GLM_sample_frames(metadata, temporal_patch_size_);
+    indices = this->sample_frames(metadata, temporal_patch_size_);
   } else {
-    indices = this->init_frames(metadata);  // default sample to 32 frames
+    indices = torch::arange(0,
+                            static_cast<int64_t>(origin_video.size(0)),
+                            torch::TensorOptions().dtype(torch::kLong));
   }
   auto video = origin_video.index_select(/*dim=*/0, indices);
   int64_t sampled_total_frames = video.size(0);
xllm/processors/glm4v_image_processor.h (mode changed 100755 to 100644; 2 additions, 0 deletions)

@@ -42,6 +42,8 @@ class Glm4VImageProcessor : public ImageProcessor {
       VideoMetadata& metadata,
       std::vector<torch::Tensor>& pixel_values,
       std::vector<int64_t>& grids);
+  torch::Tensor sample_frames(const VideoMetadata& metadata,
+                              int temporal_patch_size);

  private:
   bool do_convert_rgb_ = true;
