
Commit 57082c3

feat: extend KVCache store to support MLU format with index cache.
1 parent 449e7a5 commit 57082c3

File tree

xllm/core/framework/kv_cache/kv_cache.cpp
xllm/core/framework/kv_cache/kv_cache_store.cpp
xllm/core/framework/kv_cache/kv_cache_store.h
xllm/core/framework/kv_cache/multi_tier_kv_cache_transfer.cpp
xllm/core/framework/kv_cache/multi_tier_kv_cache_transfer.h

5 files changed: +72 -57 lines changed

xllm/core/framework/kv_cache/kv_cache.cpp

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@ torch::Tensor KVCache::get_index_cache() const { return index_cache_; }
 
 std::vector<std::vector<int64_t>> KVCache::get_shapes() {
   std::vector<std::vector<int64_t>> tensor_shapes(3);
-  if (key_cache_.defined()) {
+  if (key_cache_.defined() && key_cache_.numel() != 0) {
     std::vector<int64_t> shape;
     auto sizes = key_cache_.sizes();
     shape.resize(sizes.size());
@@ -47,7 +47,7 @@ std::vector<std::vector<int64_t>> KVCache::get_shapes() {
     tensor_shapes[0] = std::move(shape);
   }
 
-  if (value_cache_.defined() && key_cache_.numel() != 0) {
+  if (value_cache_.defined() && value_cache_.numel() != 0) {
     std::vector<int64_t> shape;
     auto sizes = value_cache_.sizes();
     shape.resize(sizes.size());
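
As background for the new defined() && numel() != 0 guard, a small standalone libtorch sketch (not part of this commit) showing that a defined tensor can still hold zero elements, which is the case the old defined()-only check missed:

#include <torch/torch.h>

#include <iostream>

int main() {
  // A default-constructed tensor is undefined.
  torch::Tensor undefined;
  // An empty tensor is defined but holds no elements.
  torch::Tensor empty_but_defined = torch::empty({0});

  std::cout << std::boolalpha;
  std::cout << "undefined.defined():  " << undefined.defined() << "\n";          // false
  std::cout << "empty.defined():      " << empty_but_defined.defined() << "\n";  // true
  std::cout << "empty.numel() != 0:   " << (empty_but_defined.numel() != 0)
            << "\n";                                                             // false
  return 0;
}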

xllm/core/framework/kv_cache/kv_cache_store.cpp

Lines changed: 53 additions & 39 deletions
@@ -43,16 +43,22 @@ bool KVCacheStore::init(const StoreConfig& config,
   }
   client_ptr_ = client_opt.value();
 
-  auto k_tensor_one_block = host_kv_caches_->at(0).get_k_cache();
-  auto v_tensor_one_block = host_kv_caches_->at(0).get_v_cache();
-
-  k_cache_size_per_block_ =
-      k_tensor_one_block.numel() * k_tensor_one_block.element_size();
-  v_cache_size_per_block_ =
-      v_tensor_one_block.numel() * v_tensor_one_block.element_size();
+  auto k_cache = host_kv_caches_->at(0).get_k_cache();
+  k_cache_size_per_block_ = k_cache.numel() * k_cache.element_size();
+  LOG(INFO) << "key cache size per block: " << k_cache_size_per_block_;
+
+  auto v_cache = host_kv_caches_->at(0).get_v_cache();
+  if (v_cache.defined() && v_cache.numel() != 0) {
+    v_cache_size_per_block_ = v_cache.numel() * v_cache.element_size();
+    LOG(INFO) << "value cache size per block: " << v_cache_size_per_block_;
+  }
 
-  LOG(INFO) << "k_cache_size_per_block: " << k_cache_size_per_block_;
-  LOG(INFO) << "v_cache_size_per_block: " << v_cache_size_per_block_;
+  auto index_cache = host_kv_caches_->at(0).get_index_cache();
+  if (index_cache.defined() && index_cache.numel() != 0) {
+    index_cache_size_per_block_ =
+        index_cache.numel() * index_cache.element_size();
+    LOG(INFO) << "index cache size per block: " << index_cache_size_per_block_;
+  }
 
   if (config_.protocol == "rdma") {
     if (config_.total_size > 0 && config_.tensor_data != nullptr) {
@@ -103,14 +109,28 @@ uint32_t KVCacheStore::batch_put(
 
     str_keys.emplace_back(str_key);
 
+    std::vector<mooncake::Slice> slice;
+    slice.reserve(3);
+
     void* k_cache =
         host_kv_caches_->at(block_info.dst_block_id).get_k_cache().data_ptr();
-    void* v_cache =
-        host_kv_caches_->at(block_info.dst_block_id).get_k_cache().data_ptr();
+    slice.emplace_back(mooncake::Slice{k_cache, k_cache_size_per_block_});
+
+    if (v_cache_size_per_block_ != 0) {
+      void* v_cache =
+          host_kv_caches_->at(block_info.dst_block_id).get_v_cache().data_ptr();
+      slice.emplace_back(mooncake::Slice{v_cache, v_cache_size_per_block_});
+    }
 
-    slices.emplace_back(std::vector<mooncake::Slice>{
-        mooncake::Slice{k_cache, k_cache_size_per_block_},
-        mooncake::Slice{v_cache, v_cache_size_per_block_}});
+    if (index_cache_size_per_block_ != 0) {
+      void* index_cache = host_kv_caches_->at(block_info.dst_block_id)
+                              .get_index_cache()
+                              .data_ptr();
+      slice.emplace_back(
+          mooncake::Slice{index_cache, index_cache_size_per_block_});
+    }
+
+    slices.emplace_back(std::move(slice));
   }
 
   if (str_keys.size() == 0) {
@@ -150,16 +170,28 @@ uint32_t KVCacheStore::batch_get(
 
     str_keys.emplace_back(str_key);
 
+    std::vector<mooncake::Slice> slice;
+    slice.reserve(3);
+
     void* k_cache =
         host_kv_caches_->at(block_info.dst_block_id).get_k_cache().data_ptr();
-    void* v_cache =
-        host_kv_caches_->at(block_info.dst_block_id).get_k_cache().data_ptr();
+    slice.emplace_back(mooncake::Slice{k_cache, k_cache_size_per_block_});
 
-    slices.insert(
-        std::make_pair(str_key,
-                       std::vector<mooncake::Slice>{
-                           mooncake::Slice{k_cache, k_cache_size_per_block_},
-                           mooncake::Slice{v_cache, v_cache_size_per_block_}}));
+    if (v_cache_size_per_block_ != 0) {
+      void* v_cache =
+          host_kv_caches_->at(block_info.dst_block_id).get_v_cache().data_ptr();
+      slice.emplace_back(mooncake::Slice{v_cache, v_cache_size_per_block_});
+    }
+
+    if (index_cache_size_per_block_ != 0) {
+      void* index_cache = host_kv_caches_->at(block_info.dst_block_id)
+                              .get_index_cache()
+                              .data_ptr();
+      slice.emplace_back(
+          mooncake::Slice{index_cache, index_cache_size_per_block_});
+    }
+
+    slices.insert(std::make_pair(str_key, std::move(slice)));
   }
 
   if (str_keys.size() == 0) {
@@ -177,24 +209,6 @@ uint32_t KVCacheStore::batch_get(
   return success_cnt;
 }
 
-uint32_t KVCacheStore::batch_remove(
-    Slice<BlockTransferInfo>& block_transfer_info) {
-  CHECK(is_initialized_) << "KVCacheStore is not initialized.";
-  uint32_t success_cnt = 0;
-  for (auto block_info : block_transfer_info) {
-    std::string str_key(reinterpret_cast<const char*>(block_info.hash_key),
-                        MURMUR_HASH3_VALUE_LEN);
-    str_key.append(std::to_string(config_.tp_rank));
-
-    auto result = client_ptr_->Remove(str_key);
-
-    if (result.has_value()) {
-      success_cnt++;
-    }
-  }
-  return success_cnt;
-}
-
 uint32_t KVCacheStore::batch_exist(std::vector<std::string>&& keys) {
   if (!is_initialized_) {
     return 0;
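
To make the new per-block layout concrete, here is a minimal, self-contained C++ sketch (not code from this commit) of the slice-assembly pattern batch_put/batch_get now follow: one slice for the key cache, plus optional slices for the value and index caches when their per-block sizes are non-zero. The local Slice struct is a stand-in for mooncake::Slice, and the buffer sizes are hypothetical placeholders.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Local stand-in for mooncake::Slice: a raw pointer plus a byte count.
struct Slice {
  void* ptr;
  size_t size;
};

int main() {
  // Hypothetical host buffers for one block; a layout like the one this
  // commit targets may leave the value cache empty and carry an index cache.
  std::vector<uint8_t> k_block(4096);
  std::vector<uint8_t> v_block;            // empty: no value cache for this layout
  std::vector<uint8_t> index_block(1024);  // index cache present

  const size_t k_size = k_block.size();
  const size_t v_size = v_block.size();
  const size_t index_size = index_block.size();

  std::vector<Slice> slices;
  slices.reserve(3);
  slices.push_back({k_block.data(), k_size});  // key cache is always stored
  if (v_size != 0) {
    slices.push_back({v_block.data(), v_size});  // value cache only if present
  }
  if (index_size != 0) {
    slices.push_back({index_block.data(), index_size});  // index cache only if present
  }

  // A real put/get would hand these slices to the store client for this block's key.
  std::cout << "slices for this block: " << slices.size() << "\n";  // prints 2
  return 0;
}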

xllm/core/framework/kv_cache/kv_cache_store.h

Lines changed: 3 additions & 4 deletions
@@ -49,8 +49,6 @@ class KVCacheStore {
 
   uint32_t batch_get(Slice<BlockTransferInfo>& block_transfer_info);
 
-  uint32_t batch_remove(Slice<BlockTransferInfo>& block_transfer_info);
-
   uint32_t batch_exist(std::vector<std::string>&& keys);
 
   static KVCacheStore& get_instance() {
@@ -71,8 +69,9 @@ class KVCacheStore {
 
   std::vector<xllm::KVCache>* host_kv_caches_;
 
-  uint64_t k_cache_size_per_block_;
-  uint64_t v_cache_size_per_block_;
+  uint64_t k_cache_size_per_block_ = 0;
+  uint64_t v_cache_size_per_block_ = 0;
+  uint64_t index_cache_size_per_block_ = 0;
 
   std::shared_ptr<mooncake::Client> client_ptr_;
 };

xllm/core/framework/kv_cache/multi_tier_kv_cache_transfer.cpp

Lines changed: 13 additions & 11 deletions
@@ -192,6 +192,7 @@ uint32_t MultiTierKVCacheTransfer::offload_kv_blocks(
 
 bool MultiTierKVCacheTransfer::d2h_batch_copy(
     Slice<BlockTransferInfo>& block_transfer_info) {
+#if defined(USE_NPU)
   const int64_t num_layers = options_.layers();
   uint32_t num_batches =
       block_transfer_info.size() * num_layers * cache_tensor_cnt_;
@@ -266,12 +267,14 @@ bool MultiTierKVCacheTransfer::d2h_batch_copy(
   delete[] dsts;
   delete[] srcs;
   delete[] copy_size;
+#endif
   return true;
 }
 
 bool MultiTierKVCacheTransfer::h2d_batch_copy(
     const uint64_t batch_id,
     Slice<BlockTransferInfo>& block_transfer_info) {
+#if defined(USE_NPU)
   CHECK(block_transfer_info.size() < BATCH_COPY_MAX_SIZE / cache_tensor_cnt_)
       << "h2d_batch_copy support copy blocks less than "
       << BATCH_COPY_MAX_SIZE / cache_tensor_cnt_ << ", but got "
@@ -353,16 +356,15 @@ bool MultiTierKVCacheTransfer::h2d_batch_copy(
     layer_cnt++;
   }
 
-  ret = aclrtMemcpyBatchAsync(dsts,
-                              copy_size,
-                              srcs,
-                              copy_size,
-                              num_batches * layer_cnt,
-                              attrs,
-                              attrs_indexes,
-                              1,
-                              &fail_index,
-                              stream->get_stream()->stream());
+  ret = aclrtMemcpyBatch(dsts,
+                         copy_size,
+                         srcs,
+                         copy_size,
+                         num_batches * layer_cnt,
+                         attrs,
+                         attrs_indexes,
+                         1,
+                         &fail_index);
 
   if (ret != 0 || fail_index != SIZE_MAX) {
     LOG(ERROR) << "aclrtMemcpyBatch error: " << ret
@@ -390,7 +392,7 @@ bool MultiTierKVCacheTransfer::h2d_batch_copy(
   delete[] dsts;
   delete[] srcs;
   delete[] copy_size;
-
+#endif
   return true;
 }
 

xllm/core/framework/kv_cache/multi_tier_kv_cache_transfer.h

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include <memory>
 
-#include "acl/acl_rt.h"
 #include "common/types.h"
 #include "framework/kv_cache/kv_cache_store.h"
 #include "framework/model/model_input_params.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "util/threadpool.h"
 
 #if defined(USE_NPU)
+#include "acl/acl_rt.h"
 #include "platform/npu/npu_layer_synchronizer.h"
 #endif
 

0 commit comments