jd-opensource
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
Lines changed: 10 additions & 10 deletions
diff --git a/xllm/core/distributed_runtime/comm_channel.cpp b/xllm/core/distributed_runtime/comm_channel.cpp
Lines changed: 5 additions & 5 deletions
diff --git a/xllm/core/distributed_runtime/comm_channel.h b/xllm/core/distributed_runtime/comm_channel.h
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/distributed_runtime/remote_worker.cpp b/xllm/core/distributed_runtime/remote_worker.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/xllm/core/distributed_runtime/remote_worker.h b/xllm/core/distributed_runtime/remote_worker.h
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/framework/block/CMakeLists.txt b/xllm/core/framework/block/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/framework/block/block_manager_pool.h b/xllm/core/framework/block/block_manager_pool.h
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/core/framework/block/multi_tier_block_manager_pool.cpp b/xllm/core/framework/block/hierarchy_block_manager_pool.cpp
rename from xllm/core/framework/block/multi_tier_block_manager_pool.cpp
rename to xllm/core/framework/block/hierarchy_block_manager_pool.cpp
Lines changed: 13 additions & 12 deletions
diff --git a/xllm/core/framework/block/multi_tier_block_manager_pool.h b/xllm/core/framework/block/hierarchy_block_manager_pool.h
rename from xllm/core/framework/block/multi_tier_block_manager_pool.h
rename to xllm/core/framework/block/hierarchy_block_manager_pool.h
Lines changed: 4 additions & 4 deletions
diff --git a/xllm/core/framework/block/kv_cache_manager.h b/xllm/core/framework/block/kv_cache_manager.h
Lines changed: 1 addition & 1 deletion
@@ -164,16 +164,6 @@ DEFINE_int32(
     256,
     "Max decode token per sequence which used for ZeroEvictionScheduler.");
 
-DEFINE_uint32(prefetch_timeout,
-              0,
-              "Prefetch timeout for prefetch from kv cache store.");
-
-DEFINE_uint32(prefetch_bacth_size,
-              2,
-              "Prefetch from kvcache store copy batch size.");
-
-DEFINE_uint32(layers_wise_copy_batchs, 4, "Layer wise H2D copy batchs.");
-
 // --- parallel config ---
 
 DEFINE_int32(dp_size, 1, "Data parallel size for MLA attention.");
@@ -341,6 +331,16 @@ DEFINE_bool(enable_online_preempt_offline,
 
 // --- kvcache store config ---
 
+DEFINE_uint32(prefetch_timeout,
+              0,
+              "Prefetch timeout for prefetch from kv cache store.");
+
+DEFINE_uint32(prefetch_bacth_size,
+              2,
+              "Prefetch from kvcache store copy batch size.");
+
+DEFINE_uint32(layers_wise_copy_batchs, 4, "Layer wise H2D copy batchs.");
+
 DEFINE_double(host_blocks_factor,
               0.0,
               "Host block factor, e.g. host block num = host_blocks_factor * "
 
@@ -372,14 +372,14 @@ void CommChannel::transfer_kv_blocks(
 
 class ClientStreamReceiver : public brpc::StreamInputHandler {
  private:
-  std::atomic<bool>* termination_flag_;
+  std::shared_ptr<std::atomic<bool>> termination_flag_;
   std::shared_ptr<std::atomic<uint32_t>> success_cnt_;
   std::promise<void> close_promise_;
   std::atomic<bool> promise_set_{false};
 
  public:
-  ClientStreamReceiver(std::atomic<bool>* termination_flag,
-                       std::shared_ptr<std::atomic<uint32_t>>& success_cnt)
+  ClientStreamReceiver(std::shared_ptr<std::atomic<bool>> termination_flag,
+                       std::shared_ptr<std::atomic<uint32_t>> success_cnt)
       : termination_flag_(termination_flag), success_cnt_(success_cnt) {}
 
   ~ClientStreamReceiver() {
@@ -427,8 +427,8 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {
 
 void CommChannel::prefetch_from_storage(
     const std::vector<BlockTransferInfo>& block_transfer_info,
-    std::atomic<bool>* flag,
-    std::shared_ptr<std::atomic<uint32_t>>& success_cnt) {
+    std::shared_ptr<std::atomic<bool>> flag,
+    std::shared_ptr<std::atomic<uint32_t>> success_cnt) {
   proto::BlockTransferInfos pb_block_transfer_info;
   if (!block_transfer_info_to_proto(block_transfer_info,
                                     &pb_block_transfer_info)) {
 
@@ -99,8 +99,8 @@ class CommChannel {
 
   virtual void prefetch_from_storage(
       const std::vector<BlockTransferInfo>& block_transfer_info,
-      std::atomic<bool>* flag,
-      std::shared_ptr<std::atomic<uint32_t>>& success_cnt);
+      std::shared_ptr<std::atomic<bool>> flag,
+      std::shared_ptr<std::atomic<uint32_t>> success_cnt);
 
   virtual bool get_last_step_result_async(
       folly::Promise<std::optional<RawForwardOutput>>& promise);
 
@@ -314,12 +314,12 @@ void RemoteWorker::transfer_kv_blocks(
 
 void RemoteWorker::prefetch_from_storage(
     const std::vector<BlockTransferInfo>& block_transfer_info,
-    std::atomic<bool>* flag,
-    std::shared_ptr<std::atomic<uint32_t>>& success_cnt) {
+    std::shared_ptr<std::atomic<bool>> flag,
+    std::shared_ptr<std::atomic<uint32_t>> success_cnt) {
   copy_threadpool_.schedule(
       [this,
-       flag = flag,
        block_transfer_info = std::move(block_transfer_info),
+       flag = flag,
        success_cnt = success_cnt]() mutable {
         channel_->prefetch_from_storage(block_transfer_info, flag, success_cnt);
       });
 
@@ -121,8 +121,8 @@ class RemoteWorker : public WorkerClient {
 
   virtual void prefetch_from_storage(
       const std::vector<BlockTransferInfo>& block_transfer_info,
-      std::atomic<bool>* flag,
-      std::shared_ptr<std::atomic<uint32_t>>& success_cnt) override;
+      std::shared_ptr<std::atomic<bool>> flag,
+      std::shared_ptr<std::atomic<uint32_t>> success_cnt) override;
 
   // Run the model and return the output.
   virtual folly::SemiFuture<std::optional<ForwardOutput>> step_async(
 
@@ -11,13 +11,13 @@ cc_library(
     block_manager_pool.h
     block_manager_impl.h
     concurrent_block_manager_impl.h
-    multi_tier_block_manager_pool.h
+    hierarchy_block_manager_pool.h
   SRCS 
     block.cpp
     block_manager_pool.cpp
     concurrent_block_manager_impl.cpp
     block_manager_impl.cpp
-    multi_tier_block_manager_pool.cpp
+    hierarchy_block_manager_pool.cpp
   DEPS
     $<$<BOOL:${USE_NPU}>:torch_npu>
     $<$<BOOL:${USE_NPU}>:graph>
 
@@ -76,7 +76,7 @@ class BlockManagerPool : public KVCacheManager {
   int32_t get_manager_with_max_free_blocks() const;
   int32_t get_dp_rank(Sequence* sequence) const;
 
-  void process_beam_search(Sequence* sequence, bool need_swap = false);
+  bool process_beam_search(Sequence* sequence, bool need_swap = false);
 
  private:
   std::vector<std::vector<BlockTransferInfo>> swap_block_transfer_infos_;
 
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "multi_tier_block_manager_pool.h"
+#include "hierarchy_block_manager_pool.h"
 
 #include "block_manager_impl.h"
 #include "concurrent_block_manager_impl.h"
 
 namespace xllm {
 
-MultiTierBlockManagerPool::MultiTierBlockManagerPool(
+HierarchyBlockManagerPool::HierarchyBlockManagerPool(
     const BlockManagerPool::Options& options,
     Engine* engine,
     int32_t dp_size)
@@ -52,7 +52,7 @@ MultiTierBlockManagerPool::MultiTierBlockManagerPool(
   saved_device_blocks_.resize(host_block_managers_.size());
 }
 
-void MultiTierBlockManagerPool::deallocate(Sequence* sequence) {
+void HierarchyBlockManagerPool::deallocate(Sequence* sequence) {
   DCHECK(sequence != nullptr);
   // add blocks to the prefix cache
   int32_t dp_rank = BlockManagerPool::get_dp_rank(sequence);
@@ -65,7 +65,7 @@ void MultiTierBlockManagerPool::deallocate(Sequence* sequence) {
     return;
   }
 
-  int cached_block_num =
+  size_t cached_block_num =
       sequence->host_kv_state().kv_cache_tokens_num() / options_.block_size();
 
   if (host_blocks->size() > 0) {
@@ -82,7 +82,7 @@ void MultiTierBlockManagerPool::deallocate(Sequence* sequence) {
   sequence->host_kv_state().add_kv_blocks(
       host_block_managers_[dp_rank]->allocate(needed_block_num));
 
-  for (int i = cached_block_num; i < host_blocks->size(); i++) {
+  for (size_t i = cached_block_num; i < host_blocks->size(); i++) {
     if (blocks->at(i).ref_count() != 2) {
       continue;
     }
@@ -107,7 +107,7 @@ void MultiTierBlockManagerPool::deallocate(Sequence* sequence) {
   sequence->reset();
 }
 
-bool MultiTierBlockManagerPool::allocate(Sequence* sequence,
+bool HierarchyBlockManagerPool::allocate(Sequence* sequence,
                                          size_t num_tokens) {
   BlockManagerPool::allocate(sequence, num_tokens);
 
@@ -137,7 +137,7 @@ bool MultiTierBlockManagerPool::allocate(Sequence* sequence,
   return true;
 }
 
-void MultiTierBlockManagerPool::allocate_host_shared(Sequence* sequence) {
+void HierarchyBlockManagerPool::allocate_host_shared(Sequence* sequence) {
   if (options_.enable_prefix_cache()) {
     int32_t dp_rank = BlockManagerPool::get_dp_rank(sequence);
     std::vector<Block> shared_blocks =
@@ -146,7 +146,7 @@ void MultiTierBlockManagerPool::allocate_host_shared(Sequence* sequence) {
   }
 }
 
-void MultiTierBlockManagerPool::prefetch_from_storage(
+void HierarchyBlockManagerPool::prefetch_from_storage(
     std::shared_ptr<Request>& request) {
   if (!options_.enable_kvcache_store()) {
     return;
@@ -202,7 +202,7 @@ void MultiTierBlockManagerPool::prefetch_from_storage(
   }
 }
 
-bool MultiTierBlockManagerPool::update_prefetch_result(
+bool HierarchyBlockManagerPool::update_prefetch_result(
     std::shared_ptr<Request>& request,
     const uint32_t timeout) {
   if (!options_.enable_kvcache_store()) {
@@ -216,8 +216,9 @@ bool MultiTierBlockManagerPool::update_prefetch_result(
   return prefetch_result;
 }
 
-void MultiTierBlockManagerPool::transfer_blocks(std::vector<Batch>* batches) {
-  if (batches != nullptr) {
+void HierarchyBlockManagerPool::transfer_blocks(
+    std::optional<std::vector<Batch>> batches) {
+  if (batches.has_value()) {
     // load blocks from host to device
     for (int i = 0; i < batches->size(); i++) {
       if (!load_block_transfer_infos_[i].empty()) {
@@ -265,7 +266,7 @@ void MultiTierBlockManagerPool::transfer_blocks(std::vector<Batch>* batches) {
   saved_device_blocks_.resize(host_block_managers_.size());
 }
 
-void MultiTierBlockManagerPool::get_merged_kvcache_event(
+void HierarchyBlockManagerPool::get_merged_kvcache_event(
     KvCacheEvent* event) const {
   if (host_block_managers_.empty()) {
     BlockManagerPool::get_merged_kvcache_event(event);
 
@@ -22,18 +22,18 @@ namespace xllm {
 
 class Engine;
 
-class MultiTierBlockManagerPool : public BlockManagerPool {
+class HierarchyBlockManagerPool : public BlockManagerPool {
  public:
-  explicit MultiTierBlockManagerPool(const BlockManagerPool::Options& options,
+  explicit HierarchyBlockManagerPool(const BlockManagerPool::Options& options,
                                      Engine* engine,
                                      int32_t dp_size = 1);
-  ~MultiTierBlockManagerPool() = default;
+  ~HierarchyBlockManagerPool() = default;
 
   bool allocate(Sequence* sequence, size_t num_tokens) override;
 
   void deallocate(Sequence* sequence) override;
 
-  void transfer_blocks(std::vector<Batch>* batches = nullptr) override;
+  void transfer_blocks(std::optional<std::vector<Batch>> batches) override;
 
   void prefetch_from_storage(std::shared_ptr<Request>& request) override;
 
 
@@ -33,7 +33,7 @@ class KVCacheManager {
   virtual bool allocate(std::vector<Sequence*>& sequences) = 0;
   virtual bool allocate(Sequence* sequence, size_t num_tokens) = 0;
 
-  virtual void transfer_blocks(std::vector<Batch>* batches = nullptr) {
+  virtual void transfer_blocks(std::optional<std::vector<Batch>> batches) {
     return;
   };