feat: support async layer-wise batch copy.

Kang-Meng · Kang-Meng · commit c67cf1294be5 · 2025-12-11T14:49:10.000+08:00
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
@@ -168,6 +168,12 @@ DEFINE_uint32(prefetch_timeout,
               0,
               "Prefetch timeout for prefetch from kv cache store.");
 
+DEFINE_uint32(prefetch_bacth_size,
+              2,
+              "Prefetch from kvcache store copy batch size.");
+
+DEFINE_uint32(layers_wise_copy_batchs, 4, "Layer wise H2D copy batchs.");
+
 // --- parallel config ---
 
 DEFINE_int32(dp_size, 1, "Data parallel size for MLA attention.");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
@@ -155,6 +155,10 @@ DECLARE_int32(max_decode_token_per_sequence);
 
 DECLARE_uint32(prefetch_timeout);
 
+DECLARE_uint32(prefetch_bacth_size);
+
+DECLARE_uint32(layers_wise_copy_batchs);
+
 DECLARE_string(priority_strategy);
 
 DECLARE_bool(enable_online_preempt_offline);
diff --git a/xllm/core/common/options.cpp b/xllm/core/common/options.cpp
@@ -54,6 +54,8 @@ std::string Options::to_string() const {
      << ", enable_cache_upload: " << enable_cache_upload()
      << ", enable_kvcache_store: " << enable_kvcache_store()
      << ", prefetch_timeout: " << prefetch_timeout()
+     << ", prefetch_bacth_size: " << prefetch_bacth_size()
+     << ", layers_wise_copy_batchs: " << layers_wise_copy_batchs()
      << ", store_protocol: " << store_protocol()
      << ", store_master_server_address: " << store_master_server_address()
      << ", store_metadata_server: " << store_metadata_server()
diff --git a/xllm/core/common/options.h b/xllm/core/common/options.h
@@ -192,6 +192,12 @@ class Options {
 
   // Prefetch timeout for prefetch from kv cache store
   PROPERTY(uint32_t, prefetch_timeout) = 0;
+
+  // Batch size for prefetch copies from the kvcache store
+  PROPERTY(uint32_t, prefetch_bacth_size) = 2;
+
+  // Number of batches the layers are split into for layer-wise H2D copy
+  PROPERTY(uint32_t, layers_wise_copy_batchs) = 4;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/worker_service.cpp b/xllm/core/distributed_runtime/worker_service.cpp
@@ -33,8 +33,6 @@ limitations under the License.
 
 namespace xllm {
 
-constexpr uint32_t COPY_BATCH_SIZE = 1;
-
 WorkerService::WorkerService(runtime::Options options,
                              const torch::Device& device)
     : options_(options), device_(device), initialized_(false) {
@@ -477,21 +475,23 @@ void WorkerService::PrefetchFromStorage(
         auto close_future = stream_handler->get_close_future();
         bool is_completed = false;
 
-        for (size_t i = 0; i < transfer_slice.size(); i += COPY_BATCH_SIZE) {
-          auto current_slice = transfer_slice.slice(
-              i, std::min(i + COPY_BATCH_SIZE, transfer_slice.size()));
+        for (size_t i = 0; i < transfer_slice.size();
+             i += options_.prefetch_bacth_size()) {
+          auto current_slice =
+              transfer_slice.slice(i,
+                                   std::min(i + options_.prefetch_bacth_size(),
+                                            transfer_slice.size()));
 
           auto success_cnt = worker_->prefetch_from_storage(current_slice);
 
           if (success_cnt != current_slice.size() ||
-              i + COPY_BATCH_SIZE >= transfer_slice.size()) {
+              i + options_.prefetch_bacth_size() >= transfer_slice.size()) {
             is_completed = true;
           }
 
           butil::IOBuf buf;
           buf.append(std::to_string(success_cnt));
           if (brpc::StreamWrite(*stream_id.get(), buf) != 0) {
-            is_completed = false;
             break;
           }
 
diff --git a/xllm/core/framework/request/sequence.cpp b/xllm/core/framework/request/sequence.cpp
@@ -382,6 +382,7 @@ void Sequence::reset() {
   kv_state_.reset();
   host_kv_state_.reset();
   timer_.reset();
+  is_timeout_set_ = false;
   volatile_num_prompt_tokens_ = num_tokens_;
 }
 
@@ -462,12 +463,13 @@ bool Sequence::update_prefetch_result(uint32_t timeout) {
   }
 
   if (timeout != 0 && !termination_flag_.load(std::memory_order_acquire)) {
-    if (timer_ != nullptr) {
-      timer_ = std::make_shared<Timer>();
+    if (!is_timeout_set_) {
+      timer_.reset();
+      is_timeout_set_ = true;
       return false;
     }
 
-    if (timer_->elapsed_milliseconds() < timeout) {
+    if (timer_.elapsed_milliseconds() < timeout) {
       return false;
     }
   }
diff --git a/xllm/core/framework/request/sequence.h b/xllm/core/framework/request/sequence.h
@@ -364,7 +364,8 @@ class Sequence final {
   std::atomic<bool> termination_flag_{false};
   std::vector<std::shared_ptr<std::atomic<uint32_t>>> prefetch_results_;
 
-  std::shared_ptr<Timer> timer_ = nullptr;
+  Timer timer_;
+  bool is_timeout_set_ = false;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/kernels/npu/xllm_ops/top_k_top_p.h b/xllm/core/kernels/npu/xllm_ops/top_k_top_p.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "acl/acl.h"
-#include "aclnn_apply_top_k_top_p.h"
+#include "aclnnop/aclnn_apply_top_k_top_p.h"
 #include "acltensor_utils.h"
 #include "util/tensor_helper.h"
 
diff --git a/xllm/core/runtime/master.cpp b/xllm/core/runtime/master.cpp
@@ -218,6 +218,8 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
         .store_master_server_address(options_.store_master_server_address())
         .store_metadata_server(options_.store_metadata_server())
         .store_local_hostname(options_.store_local_hostname())
+        .prefetch_bacth_size(options_.prefetch_bacth_size())
+        .layers_wise_copy_batchs(options_.layers_wise_copy_batchs())
         .enable_continuous_kvcache(options_.enable_continuous_kvcache())
         .enable_offline_inference(options_.enable_offline_inference())
         .spawn_worker_path(options_.spawn_worker_path())
diff --git a/xllm/core/runtime/options.h b/xllm/core/runtime/options.h
@@ -158,6 +158,12 @@ struct Options {
   //  value used if port is not included)
   PROPERTY(std::string, store_local_hostname) = "";
 
+  // Batch size for prefetch copies from the kvcache store
+  PROPERTY(uint32_t, prefetch_bacth_size) = 2;
+
+  // Number of batches the layers are split into for layer-wise H2D copy
+  PROPERTY(uint32_t, layers_wise_copy_batchs) = 4;
+
   // dit
   // max requests per batch
   PROPERTY(int, max_requests_per_batch) = 0;
diff --git a/xllm/core/runtime/worker_impl.cpp b/xllm/core/runtime/worker_impl.cpp
@@ -638,7 +638,6 @@ bool WorkerImpl::init_model(const std::string& model_weights_path,
   if (!status) {
     return false;
   }
-  layers_per_copy_ = context_.get_model_args().n_layers() / 4;
 
   this->load_model(std::move(model_loader));
 
@@ -899,13 +898,15 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
   }
 
   const int64_t num_layers = context_.get_model_args().n_layers();
-  uint32_t layers_per_copy = layers_per_copy_;
+  uint32_t layers_per_bacth_copy =
+      num_layers / options_.layers_wise_copy_batchs();
   uint32_t num_batches = block_transfer_info.size() * 2;
-  while (num_batches * layers_per_copy > BATCH_COPY_MAX_SIZE) {
-    layers_per_copy--;
+  while (num_batches * layers_per_bacth_copy > BATCH_COPY_MAX_SIZE) {
+    layers_per_bacth_copy--;
   }
 
-  uint32_t copy_cnt = (num_layers + layers_per_copy - 1) / layers_per_copy;
+  uint32_t copy_cnt =
+      (num_layers + layers_per_bacth_copy - 1) / layers_per_bacth_copy;
   auto synchronizer = std::make_shared<NPULayerSynchronizerImpl>(copy_cnt);
   {
     std::lock_guard<std::mutex> lock(mutex_);
@@ -923,17 +924,18 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
   c10::StreamGuard streamGuard = stream->set_stream_guard();
   aclError ret = 0;
 
-  void** srcs = new void*[num_batches * layers_per_copy];
-  void** dsts = new void*[num_batches * layers_per_copy];
-  size_t* copy_size = new size_t[num_batches * layers_per_copy];
+  void** srcs = new void*[num_batches * layers_per_bacth_copy];
+  void** dsts = new void*[num_batches * layers_per_bacth_copy];
+  size_t* copy_size = new size_t[num_batches * layers_per_bacth_copy];
 
   for (int index = 0; index < copy_cnt; index++) {
-    int layer_id = index * layers_per_copy;
+    int layer_id = index * layers_per_bacth_copy;
     size_t fail_index = 0;
     uint32_t curr_index = 0;
     uint32_t layer_cnt = 0;
 
-    while (layer_id < (index + 1) * layers_per_copy && layer_id < num_layers) {
+    while (layer_id < (index + 1) * layers_per_bacth_copy &&
+           layer_id < num_layers) {
       auto dst_k_cache = kv_caches_.at(layer_id).get_k_cache();
       auto dst_v_cache = kv_caches_.at(layer_id).get_v_cache();
 
@@ -955,18 +957,16 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
       layer_cnt++;
     }
 
-    // TODO(kangmeng): change to async API
-    CHECK(layer_cnt <= layers_per_copy)
-        << "layer_cnt should less equal to layers_per_copy.";
-    ret = aclrtMemcpyBatch(dsts,
-                           copy_size,
-                           srcs,
-                           copy_size,
-                           num_batches * layer_cnt,
-                           attrs,
-                           attrs_indexes,
-                           1,
-                           &fail_index);
+    ret = aclrtMemcpyBatchAsync(dsts,
+                                copy_size,
+                                srcs,
+                                copy_size,
+                                num_batches * layer_cnt,
+                                attrs,
+                                attrs_indexes,
+                                1,
+                                &fail_index,
+                                stream->get_stream()->stream());
 
     if (ret != 0 || fail_index != SIZE_MAX) {
       LOG(ERROR) << "aclrtMemcpyBatch error: " << ret
@@ -1020,6 +1020,7 @@ AlignedTensorCreater::AlignedTensorCreater(
     const torch::ScalarType dtype,
     const uint32_t num_tensors,
     std::vector<xllm::KVCache>* tensors) {
+#if defined(USE_NPU)
   CHECK(tensor_shapes.size() == 2)
       << "tensor_shapes.size() must equal to 2, but got "
       << tensor_shapes.size();
@@ -1057,6 +1058,14 @@ AlignedTensorCreater::AlignedTensorCreater(
     LOG(FATAL) << "Failed to lock memory pool!";
   }
 
+  auto ret = aclrtHostRegister(base_ptr_,
+                               total_size_,
+                               aclrtHostRegisterType::ACL_HOST_REGISTER_MAPPED,
+                               &mapped_ptr_);
+  if (ret != 0) {
+    LOG(FATAL) << "aclrtHostRegister fail: " << ret;
+  }
+
   size_t current_offset = 0;
   auto options = torch::TensorOptions().dtype(dtype).device(torch::kCPU);
   tensors->reserve(num_tensors);
@@ -1077,5 +1086,6 @@ AlignedTensorCreater::AlignedTensorCreater(
 
   LOG(INFO) << "Page aligned: "
             << ((uintptr_t)base_ptr_ % page_size == 0 ? "YES" : "NO");
+#endif
 }
 }  // namespace xllm
diff --git a/xllm/core/runtime/worker_impl.h b/xllm/core/runtime/worker_impl.h
@@ -268,7 +268,6 @@ class WorkerImpl {
 
   uint64_t key_cache_size_per_layer_;
   uint64_t value_cache_size_per_layer_;
-  uint32_t layers_per_copy_;
 
   bool is_spec_draft_ = false;
 
@@ -280,6 +279,7 @@ class WorkerImpl {
 class AlignedTensorCreater {
  private:
   void* base_ptr_;
+  void* mapped_ptr_;
   size_t total_size_;
 
  public:
@@ -290,6 +290,9 @@ class AlignedTensorCreater {
 
   ~AlignedTensorCreater() {
     if (base_ptr_ != nullptr) {
+#if defined(USE_NPU)
+      aclrtHostUnregister(base_ptr_);
+#endif
       munlock(base_ptr_, total_size_);
       munmap(base_ptr_, total_size_);
     }
diff --git a/xllm/models/llm/npu/deepseek_v2_mtp.h b/xllm/models/llm/npu/deepseek_v2_mtp.h
@@ -124,15 +124,25 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module {
 
     auto attn_mask = attn_mask_.get_attn_mask(
         128, cos_pos.dtype().toScalarType(), cos_pos.device());
+
+    uint32_t layers_per_bacth_copy = 0;
+    if (input_params.layer_wise_load_synchronizer != nullptr) {
+      uint32_t event_cnt =
+          input_params.layer_wise_load_synchronizer->get_event_size();
+      layers_per_bacth_copy = layers_.size() / event_cnt +
+                              uint32_t(layers_.size() % event_cnt == 0);
+    }
     for (size_t i = 0; i < layers_.size(); i++) {
       aclrtEvent* event = nullptr;
       std::atomic<bool>* event_flag = nullptr;
       if (input_params.layer_synchronizer != nullptr) {
         event = input_params.layer_synchronizer->get_event(i);
         event_flag = input_params.layer_synchronizer->get_event_flag(i);
       }
-      if (input_params.layer_wise_load_synchronizer != nullptr) {
-        if (!input_params.layer_wise_load_synchronizer->synchronize_layer(i)) {
+      if (layers_per_bacth_copy <= layers_.size() &&
+          i % layers_per_bacth_copy == 0) {
+        if (!input_params.layer_wise_load_synchronizer->synchronize_layer(
+                i / layers_per_bacth_copy)) {
           return torch::Tensor();
         }
       }
diff --git a/xllm/models/llm/npu/glm4_moe.h b/xllm/models/llm/npu/glm4_moe.h
@@ -202,15 +202,24 @@ class Glm4MoeModelImpl : public torch::nn::Module {
       }
     }
 
+    uint32_t layers_per_bacth_copy = 0;
+    if (input_params.layer_wise_load_synchronizer != nullptr) {
+      uint32_t event_cnt =
+          input_params.layer_wise_load_synchronizer->get_event_size();
+      layers_per_bacth_copy = layers_.size() / event_cnt +
+                              uint32_t(layers_.size() % event_cnt == 0);
+    }
+
     for (size_t i = 0; i < layers_.size(); i++) {
       aclrtEvent* event = nullptr;
       std::atomic<bool>* event_flag = nullptr;
       if (input_params.layer_synchronizer != nullptr) {
         event = input_params.layer_synchronizer->get_event(i);
         event_flag = input_params.layer_synchronizer->get_event_flag(i);
       }
-      if (input_params.layer_wise_load_synchronizer != nullptr) {
-        if (!input_params.layer_wise_load_synchronizer->synchronize_layer(i)) {
+      if (layers_per_bacth_copy > 0 && i % layers_per_bacth_copy == 0) {
+        if (!input_params.layer_wise_load_synchronizer->synchronize_layer(
+                i / layers_per_bacth_copy)) {
           return torch::Tensor();
         }
       }
diff --git a/xllm/models/llm/npu/glm4_moe_mtp.h b/xllm/models/llm/npu/glm4_moe_mtp.h
@@ -147,19 +147,18 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module {
         input_length * num_experts_per_tok_,
         torch::TensorOptions().dtype(torch::kInt32).device(tokens.device()));
 
+    // TODO(liangzhiwei20): MTP needs more support for layer-wise copy.
+    if (input_params.layer_wise_load_synchronizer != nullptr) {
+      LOG(FATAL) << "MTP not support layer wise copy!";
+    }
+
     for (size_t i = 0; i < layers_.size(); i++) {
       aclrtEvent* event = nullptr;
       std::atomic<bool>* event_flag = nullptr;
       if (input_params.layer_synchronizer != nullptr) {
         event = input_params.layer_synchronizer->get_event(i);
         event_flag = input_params.layer_synchronizer->get_event_flag(i);
       }
-      // TODO(liangzhiwei20): MTP need more support for layer wise copy.
-      if (input_params.layer_wise_load_synchronizer != nullptr) {
-        if (!input_params.layer_wise_load_synchronizer->synchronize_layer(i)) {
-          return torch::Tensor();
-        }
-      }
 
       auto& layer = layers_[i];
       layer(h,
diff --git a/xllm/xllm.cpp b/xllm/xllm.cpp
@@ -182,6 +182,8 @@ int run() {
                             FLAGS_enable_prefix_cache &&
                             (FLAGS_host_blocks_factor > 0.0))
       .prefetch_timeout(FLAGS_prefetch_timeout)
+      .prefetch_bacth_size(FLAGS_prefetch_bacth_size)
+      .layers_wise_copy_batchs(FLAGS_layers_wise_copy_batchs)
       .store_protocol(FLAGS_store_protocol)
       .store_master_server_address(FLAGS_store_master_server_address)
       .store_metadata_server(FLAGS_store_metadata_server)

Original file line number	Diff line number	Diff line change
`@@ -382,6 +382,7 @@ void Sequence::reset() {`
`382`	`382`	`kv_state_.reset();`
`383`	`383`	`host_kv_state_.reset();`
`384`	`384`	`timer_.reset();`
	`385`	`+ is_timeout_set_ = false;`
`385`	`386`	`volatile_num_prompt_tokens_ = num_tokens_;`
`386`	`387`	`}`
`387`	`388`
`@@ -462,12 +463,13 @@ bool Sequence::update_prefetch_result(uint32_t timeout) {`
`462`	`463`	`}`
`463`	`464`
`464`	`465`	`if (timeout != 0 && !termination_flag_.load(std::memory_order_acquire)) {`
`465`		`- if (timer_ != nullptr) {`
`466`		`- timer_ = std::make_shared<Timer>();`
	`466`	`+ if (!is_timeout_set_) {`
	`467`	`+ timer_.reset();`
	`468`	`+ is_timeout_set_ = true;`
`467`	`469`	`return false;`
`468`	`470`	`}`
`469`	`471`
`470`		`- if (timer_->elapsed_milliseconds() < timeout) {`
	`472`	`+ if (timer_.elapsed_milliseconds() < timeout) {`
`471`	`473`	`return false;`
`472`	`474`	`}`
`473`	`475`	`}`