Commit aaa9676

refactor: separate the weight loading in the npu layer class.
1 parent 559240b commit aaa9676

File tree: 57 files changed (+5240 −3699 lines)

Note: this is a large commit, so only part of the diff is shown below.

xllm/core/layers/base_layer.h

Lines changed: 39 additions & 11 deletions
@@ -27,11 +27,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "framework/kv_cache/kv_cache.h"
-#include "framework/model/model_input_params.h"
-#include "framework/model_context.h"
-#include "framework/state_dict/state_dict.h"
-
+#if defined(USE_NPU)
+#include "npu/loader/base_loader.h"
+#endif
 namespace xllm {
 namespace layer {
 
@@ -92,14 +90,44 @@ class BaseLayer : public torch::nn::Module {
 
   virtual ~BaseLayer() {};
 
-  virtual void load_state_dict(const StateDict& state_dict) {};
+  virtual void load_state_dict(const StateDict& state_dict) {
+#if defined(USE_NPU)
+    if (loader_) {
+      loader_->load_state_dict(state_dict);
+    }
+#endif
+  };
+
+  virtual void verify_loaded_weights() const {
+#if defined(USE_NPU)
+    if (loader_) {
+      loader_->verify_loaded_weights();
+    }
+#endif
+  };
 
-  virtual void verify_loaded_weights() const {};
+  virtual void verify_loaded_weights(const std::string& prefix) const {
+#if defined(USE_NPU)
+    if (loader_) {
+      loader_->verify_loaded_weights(prefix);
+    }
+#endif
+  };
 
-  virtual void merge_loaded_weights() {};
+  virtual void merge_loaded_weights() {
+#if defined(USE_NPU)
+    if (loader_) {
+      loader_->merge_loaded_weights();
+    }
+    init_layer();
+#endif
+  };
 
   virtual int64_t init_layer() { return 0; };
 
+  virtual void run_task(std::string taskName, std::function<int()> task) const {
+  };
+
   void set_weight(const StateDict& state_dict,
                   const std::string& tensor_name,
                   int weight_position,
@@ -116,16 +144,16 @@ class BaseLayer : public torch::nn::Module {
                   int rank,
                   int world_size);
 
-  virtual void run_task(std::string taskName, std::function<int()> task) const {
-  };
-
   torch::Dtype string2dtype(const std::string& dtype_str);
 
   void correct_tensor_dtype(torch::Tensor& tensor,
                             const std::string& tensorName);
 
  protected:
   std::vector<at::Tensor> at_weight_tensors_;
+#if defined(USE_NPU)
+  std::unique_ptr<BaseLoader> loader_ = nullptr;
+#endif
   at::Device device_;
   std::string name_;
   torch::ScalarType dtype_;
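With these defaults in place, a concrete NPU layer gets loading, verification, and merging for free once it installs a loader_. A minimal sketch of the pattern; the constructor signature and weight count here are assumptions for illustration, not part of this diff:

// Sketch only (not in this commit): RmsNormLoader is one of the loaders
// added under xllm/core/layers/npu/loader/, but its constructor signature
// and the layer's own construction details are assumed here.
class NpuRmsNormImpl : public BaseLayer {
 public:
  explicit NpuRmsNormImpl(const ModelContext& context) {
#if defined(USE_NPU)
    // Install the matching loader; the inherited load_state_dict(),
    // verify_loaded_weights(), and merge_loaded_weights() delegate to it,
    // and merge_loaded_weights() finishes by calling init_layer().
    loader_ = std::make_unique<RmsNormLoader>(/*weight_count=*/1, context);
#endif
  }
};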

xllm/core/layers/npu/CMakeLists.txt

Lines changed: 28 additions & 0 deletions
@@ -25,6 +25,20 @@ cc_library(
     npu_qwen3_decoder_layer_impl.h
     npu_rms_norm_impl.h
     npu_siglip_encoder_layer_impl.h
+    loader/qwen3_decoder_loader.h
+    loader/qwen2_decoder_loader.h
+    loader/qwen3_moe_decoder_loader.h
+    loader/word_embedding_loader.h
+    loader/lm_head_loader.h
+    loader/column_parallel_linear_loader.h
+    loader/deepseek_v2_decoder_loader.h
+    loader/glm4_moe_decoder_loader.h
+    loader/llama_decoder_loader.h
+    loader/qwen2dot5_vision_encoder_loader.h
+    loader/qwen3_vision_encoder_loader.h
+    loader/rms_norm_loader.h
+    loader/siglip_encoder_loader.h
+    loader/base_loader.h
   SRCS
     npu_word_embedding_impl.cpp
     npu_pos_embedding_impl.cpp
@@ -45,6 +59,20 @@ cc_library(
     npu_qwen3_decoder_layer_impl.cpp
     npu_rms_norm_impl.cpp
     npu_siglip_encoder_layer_impl.cpp
+    loader/qwen3_decoder_loader.cpp
+    loader/qwen2_decoder_loader.cpp
+    loader/qwen3_moe_decoder_loader.cpp
+    loader/word_embedding_loader.cpp
+    loader/lm_head_loader.cpp
+    loader/column_parallel_linear_loader.cpp
+    loader/deepseek_v2_decoder_loader.cpp
+    loader/glm4_moe_decoder_loader.cpp
+    loader/llama_decoder_loader.cpp
+    loader/qwen2dot5_vision_encoder_loader.cpp
+    loader/qwen3_vision_encoder_loader.cpp
+    loader/rms_norm_loader.cpp
+    loader/siglip_encoder_loader.cpp
+    loader/base_loader.cpp
   DEPS
     "-Wl,--whole-archive"
     "-Wl,--no-whole-archive"
xllm/core/layers/npu/loader/base_loader.cpp (new file)

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "base_loader.h"
+
+namespace xllm {
+namespace layer {
+
+BaseLoader::BaseLoader(uint64_t weight_count, const ModelContext& context)
+    : weight_count_(weight_count),
+      parallel_args_(context.get_parallel_args()),
+      device_(context.get_tensor_options().device()) {
+  auto quant_args = context.get_quant_args();
+  if (!quant_args.quantize_type().empty()) {
+    quantize_type_ = quant_args.quantize_type();
+  }
+
+  if (!quant_args.torch_dtype().empty()) {
+    torch_dtype_ = quant_args.torch_dtype();
+  }
+
+  dp_size_ = parallel_args_.dp_size();
+  dp_local_tp_size_ = parallel_args_.world_size() / dp_size_;
+  dp_rank_ = parallel_args_.rank() / dp_local_tp_size_;
+  CHECK_EQ(parallel_args_.world_size(), dp_size_ * dp_local_tp_size_);
+  dp_local_tp_rank_ = parallel_args_.rank() % dp_local_tp_size_;
+
+  at_weight_tensors_.resize(weight_count_);
+}
+
+void BaseLoader::set_weight(const StateDict& state_dict,
+                            const std::string& tensor_name,
+                            int weight_position) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      at::Tensor mutable_tensor = tensor;
+      correct_tensor_dtype(mutable_tensor, tensor_name);
+      at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+    }
+  }
+}
+
+void BaseLoader::set_weight(const StateDict& state_dict,
+                            const std::string& tensor_name,
+                            int weight_position,
+                            int dim) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      if (parallel_args_.world_size() <= 1) {
+        at::Tensor mutable_tensor = tensor;
+        correct_tensor_dtype(mutable_tensor, tensor_name);
+        at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+      } else {
+        at_weight_tensors_[weight_position] =
+            state_dict
+                .get_sharded_tensor(tensor_name,
+                                    /*dim=*/dim,
+                                    /*rank=*/parallel_args_.rank(),
+                                    /*world_size=*/parallel_args_.world_size())
+                .to(device_);
+        correct_tensor_dtype(at_weight_tensors_[weight_position], tensor_name);
+      }
+    }
+  }
+}
+
+void BaseLoader::set_weight(const StateDict& state_dict,
+                            const std::string& tensor_name,
+                            int weight_position,
+                            int dim,
+                            int rank,
+                            int world_size) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      if (world_size <= 1) {
+        at::Tensor mutable_tensor = tensor;
+        correct_tensor_dtype(mutable_tensor, tensor_name);
+        at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+      } else {
+        at_weight_tensors_[weight_position] =
+            state_dict
+                .get_sharded_tensor(tensor_name,
+                                    /*dim=*/dim,
+                                    /*rank=*/rank,
+                                    /*world_size=*/world_size)
+                .to(device_);
+        correct_tensor_dtype(at_weight_tensors_[weight_position], tensor_name);
+      }
+    }
+  }
+}
+
+void BaseLoader::correct_tensor_dtype(torch::Tensor& tensor,
+                                      const std::string& tensorName) {
+  if (absl::EndsWith(tensorName, "deq_scale") &&
+      (torch_dtype_.compare("bfloat16") == 0)) {
+    return;
+  }
+
+  if (tensor.dtype() != torch::kInt8 && tensor.dtype() != torch::kInt32 &&
+      tensor.dtype() != torch::kInt64) {
+    torch::Dtype dtype = string2dtype(torch_dtype_);
+    tensor = tensor.to(dtype);
+  }
+}
+
+torch::Dtype BaseLoader::string2dtype(const std::string& dtype_str) {
+  if (dtype_str.compare("float16") == 0) {
+    return torch::kFloat16;
+  } else if (dtype_str.compare("bfloat16") == 0) {
+    return torch::kBFloat16;
+  } else if (dtype_str.compare("float32") == 0) {
+    return torch::kFloat32;
+  } else if (dtype_str.compare("float64") == 0) {
+    return torch::kFloat64;
+  } else if (dtype_str.compare("int8") == 0) {
+    return torch::kInt8;
+  } else if (dtype_str.compare("int16") == 0) {
+    return torch::kInt16;
+  } else if (dtype_str.compare("int32") == 0) {
+    return torch::kInt32;
+  } else if (dtype_str.compare("int64") == 0) {
+    return torch::kInt64;
+  } else if (dtype_str.compare("uint8") == 0) {
+    return torch::kUInt8;
+  } else if (dtype_str.compare("bool") == 0) {
+    return torch::kBool;
+  }
+
+  LOG(FATAL) << "Unsupported dtype string: " << dtype_str;
+}
+
+} // namespace layer
+} // namespace xllm
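A quick, standalone check of the constructor's data-parallel arithmetic above; the world size, DP size, and rank are invented for illustration:

// Standalone mirror of BaseLoader's rank bookkeeping (values invented).
#include <cassert>

int main() {
  const int world_size = 8, dp_size = 2, rank = 5;
  const int dp_local_tp_size = world_size / dp_size;     // 8 / 2 = 4
  const int dp_rank = rank / dp_local_tp_size;           // 5 / 4 = 1
  const int dp_local_tp_rank = rank % dp_local_tp_size;  // 5 % 4 = 1
  // Same invariant as the CHECK_EQ in the constructor: the world must
  // factor exactly into dp_size groups of dp_local_tp_size TP ranks.
  assert(world_size == dp_size * dp_local_tp_size);
  return 0;
}

So global rank 5 lands in the second data-parallel group (dp_rank = 1) as that group's second tensor-parallel rank.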
xllm/core/layers/npu/loader/base_loader.h (new file)

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <absl/strings/match.h>
+#include <torch/torch.h>
+
+#include "framework/eplb/expert_buffer_manager.h"
+#include "framework/kv_cache/kv_cache.h"
+#include "framework/model/model_input_params.h"
+#include "framework/model_context.h"
+#include "framework/state_dict/state_dict.h"
+#include "xllm_kernels/pytorch/atb_torch/core/include/base_operation.h"
+#include "xllm_kernels/pytorch/atb_torch/core/include/graph_operation.h"
+
+namespace xllm {
+namespace layer {
+
+class BaseLoader {
+ public:
+  explicit BaseLoader(uint64_t weight_count, const ModelContext& context);
+  virtual ~BaseLoader() = default;
+
+  virtual void load_state_dict(const StateDict& state_dict) {};
+  virtual void verify_loaded_weights() const {};
+  virtual void verify_loaded_weights(const std::string& prefix) const {};
+  virtual void merge_loaded_weights() {};
+  virtual void resize_experts_weights(int num_of_device_experts) {};
+  torch::Dtype string2dtype(const std::string& dtype_str);
+
+  void correct_tensor_dtype(torch::Tensor& tensor,
+                            const std::string& tensorName);
+
+  std::vector<at::Tensor>& get_at_weight_tensors() {
+    return at_weight_tensors_;
+  }
+
+  std::unordered_map<std::string, std::vector<torch::Tensor>>&
+  get_experts_weight_tensors() {
+    return experts_weights_;
+  }
+
+  std::unique_ptr<ExpertBufferManager>& get_expert_shared_buffer() {
+    return shared_buffer_;
+  }
+
+  std::vector<int32_t>& get_device_expert_list() { return device_expert_list_; }
+
+  atb_torch::TorchTensorMap& get_weights_map() { return weights_map_; }
+
+ protected:
+  uint64_t weight_count_;
+  xllm::ParallelArgs parallel_args_;
+  std::string quantize_type_;
+  std::string torch_dtype_;
+  torch::ScalarType dtype_;
+  torch::TensorOptions options_;
+  std::vector<at::Tensor> at_weight_tensors_;
+  std::unique_ptr<ExpertBufferManager> shared_buffer_ = nullptr;
+  std::unordered_map<std::string, torch::Tensor> shared_experts_weights_;
+  std::unordered_map<std::string, std::vector<torch::Tensor>> experts_weights_;
+  std::vector<int32_t> device_expert_list_;
+  atb_torch::TorchTensorMap weights_map_;
+
+  at::Device device_;
+  int32_t dp_size_;
+  int32_t dp_local_tp_size_;
+  int32_t dp_rank_;
+  int32_t dp_local_tp_rank_;
+
+  void set_weight(const StateDict& state_dict,
+                  const std::string& tensor_name,
+                  int weight_position);
+
+  void set_weight(const StateDict& state_dict,
+                  const std::string& tensor_name,
+                  int weight_position,
+                  int dim);
+
+  void set_weight(const StateDict& state_dict,
+                  const std::string& tensor_name,
+                  int weight_position,
+                  int dim,
+                  int rank,
+                  int world_size);
+};
+
+} // namespace layer
+} // namespace xllm
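Concrete loaders subclass BaseLoader and override these hooks. A hypothetical sketch (the class name, tensor suffix, and weight slot are invented) showing how the protected set_weight helpers are intended to be used:

// Hypothetical loader for a single replicated weight (names invented).
class ExampleNormLoader : public BaseLoader {
 public:
  explicit ExampleNormLoader(uint64_t weight_count,
                             const ModelContext& context)
      : BaseLoader(weight_count, context) {}

  void load_state_dict(const StateDict& state_dict) override {
    // Suffix-match "weight" in the state dict and copy it, unsharded,
    // into slot 0 (the three-argument set_weight overload).
    set_weight(state_dict, "weight", /*weight_position=*/0);
  }

  void verify_loaded_weights(const std::string& prefix) const override {
    CHECK(at_weight_tensors_[0].defined())
        << "weight is not loaded for " << prefix;
  }
};

A tensor-parallel weight would use the four-argument overload instead, e.g. set_weight(state_dict, "weight", 0, /*dim=*/0), which shards dimension 0 across parallel_args_.world_size() ranks via StateDict::get_sharded_tensor.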
