
Commit 672e63c

bugfix: fix bug caused by atten_mask nullptr on mlu device. (#513)
Signed-off-by: pengtao.156 <pengtao.156@jd.com>
Parent: 8a2110c

17 files changed: +98 / -93 lines

xllm/core/layers/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -21,9 +21,9 @@ cc_library(
   NAME
   attention_mask
   HDRS
-  common/attention_mask_impl.h
+  common/attention_mask.h
   SRCS
-  common/attention_mask_impl.cpp
+  common/attention_mask.cpp
   DEPS
   :state_dict
   :block

xllm/core/layers/common/attention_mask_impl.cpp renamed to xllm/core/layers/common/attention_mask.cpp

Lines changed: 22 additions & 23 deletions
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "attention_mask_impl.h"
+#include "attention_mask.h"
 
 namespace xllm {
 namespace layer {
 
-AttentionMaskImpl::AttentionMaskImpl(at::Device device,
-                                     torch::Dtype dtype,
-                                     float mask_value) {
+AttentionMask::AttentionMask(at::Device device,
+                             torch::Dtype dtype,
+                             float mask_value) {
   int max_seq_len = 128;
   seq_len_cached_ = max_seq_len;
   auto bias_cache =
@@ -37,25 +37,24 @@ AttentionMaskImpl::AttentionMaskImpl(at::Device device,
       .to(device);
 }
 
-torch::Tensor AttentionMaskImpl::get_decode_attn_mask(
-    torch::Tensor input_lengths,
-    int64_t max_s,
-    torch::Dtype dtype,
-    torch::Device device) {
+torch::Tensor AttentionMask::get_decode_attn_mask(torch::Tensor input_lengths,
+                                                  int64_t max_s,
+                                                  torch::Dtype dtype,
+                                                  torch::Device device) {
   update_attn_cache(dtype, device, max_s);
   return atten_mask_cache_.index_select(0, input_lengths).view({-1, 1, max_s});
 }
 
-torch::Tensor AttentionMaskImpl::get_attn_mask(int64_t max_s,
-                                               torch::Dtype dtype,
-                                               torch::Device device) {
+torch::Tensor AttentionMask::get_attn_mask(int64_t max_s,
+                                           torch::Dtype dtype,
+                                           torch::Device device) {
   update_attn_cache(dtype, device, max_s);
   return atten_mask_cache_.slice(0, 0, max_s).slice(1, 0, max_s);
 }
 
-torch::Tensor AttentionMaskImpl::gen_free_mask(int32_t q_len,
-                                               torch::Dtype dtype,
-                                               torch::Device device) {
+torch::Tensor AttentionMask::gen_free_mask(int32_t q_len,
+                                           torch::Dtype dtype,
+                                           torch::Device device) {
   float pre_mask_factor = -10000.0f;
   if (dtype == torch::kBFloat16) {
     pre_mask_factor = 1.0f;
@@ -68,11 +67,11 @@ torch::Tensor AttentionMaskImpl::gen_free_mask(int32_t q_len,
   return mask_free;
 }
 
-torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
-                                                 int32_t kv_len,
-                                                 int32_t max_kv_len,
-                                                 torch::Dtype dtype,
-                                                 torch::Device device) {
+torch::Tensor AttentionMask::gen_append_mask(int32_t q_len,
+                                             int32_t kv_len,
+                                             int32_t max_kv_len,
+                                             torch::Dtype dtype,
+                                             torch::Device device) {
   int diagonal = kv_len - q_len;
   auto options = torch::TensorOptions().dtype(torch::kBool).device(device);
   auto bias = torch::tril(torch::ones({q_len, max_kv_len}, options), diagonal);
@@ -84,9 +83,9 @@ torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
   return mask;
 }
 
-void AttentionMaskImpl::update_attn_cache(torch::Dtype dtype,
-                                          torch::Device device,
-                                          int64_t seqlen) {
+void AttentionMask::update_attn_cache(torch::Dtype dtype,
+                                      torch::Device device,
+                                      int64_t seqlen) {
   if (seqlen > seq_len_cached_ || atten_mask_cache_.dtype() != dtype) {
     seq_len_cached_ = seqlen;
 

xllm/core/layers/common/attention_mask_impl.h renamed to xllm/core/layers/common/attention_mask.h

Lines changed: 5 additions & 6 deletions
@@ -19,13 +19,13 @@ limitations under the License.
 namespace xllm {
 namespace layer {
 
-class AttentionMaskImpl : public torch::nn::Module {
+class AttentionMask : public torch::nn::Module {
  public:
-  AttentionMaskImpl() = default;
+  AttentionMask() = default;
 
-  explicit AttentionMaskImpl(at::Device device,
-                             torch::Dtype dtype,
-                             float mask_value = -9984);
+  explicit AttentionMask(at::Device device,
+                         torch::Dtype dtype,
+                         float mask_value = -9984);
 
   torch::Tensor get_decode_attn_mask(torch::Tensor input_lengths,
                                      int64_t max_s,
@@ -55,7 +55,6 @@ class AttentionMaskImpl : public torch::nn::Module {
   float mask_value_;
   at::Tensor atten_mask_cache_;
 };
-TORCH_MODULE(AttentionMask);
 
 }  // namespace layer
 }  // namespace xllm
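
Note that beyond the rename, the header above also drops TORCH_MODULE(AttentionMask): AttentionMask is now used as a plain value type rather than through a generated torch::nn::ModuleHolder, which is the heart of the nullptr fix in the commit title. A minimal sketch of the difference; the constructor shown here is an assumption, since the real initialization sites are in files outside this diff:

#include "core/layers/common/attention_mask.h"

// Before this commit the models held the mask through the generated holder:
//   layer::AttentionMask attn_mask_{nullptr};         // empty ModuleHolder
//   attn_mask_->get_attn_mask(128, dtype_, device_);  // fails: no impl behind ->
// Now the class itself is the member, so there is nothing left to be null:
class MaskOwnerSketch : public torch::nn::Module {
 public:
  // Assumed initialization; a value member is always a constructed object.
  MaskOwnerSketch(at::Device device, torch::Dtype dtype)
      : attn_mask_(device, dtype) {}

  torch::Tensor full_mask(torch::Dtype dtype, torch::Device device) {
    // '.' access, matching the call-site changes in the model diffs below.
    return attn_mask_.get_attn_mask(/*max_s=*/128, dtype, device);
  }

 private:
  xllm::layer::AttentionMask attn_mask_;
};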

xllm/core/layers/common/rotary_embedding.cpp

Lines changed: 0 additions & 1 deletion
@@ -88,7 +88,6 @@ MRotaryEmbeddingImpl::MRotaryEmbeddingImpl(
           rope_theta,
           interleaved,
           options),
-      interleaved_(interleaved),
       mrope_section_(rope_scaling_mrope_section) {
   mrope_cu_seq_lens_ = torch::zeros(2, torch::kInt32).to(options.device());
 }

xllm/core/layers/common/rotary_embedding.h

Lines changed: 3 additions & 1 deletion
@@ -52,8 +52,10 @@ class RotaryEmbeddingImpl : public torch::nn::Module {
 
   torch::Tensor get_cos_sin_cache() { return cos_sin_cache_; }
 
- private:
+ protected:
   bool interleaved_;
+
+ private:
   torch::Tensor sin_;
   torch::Tensor cos_;
   torch::Tensor cos_sin_cache_;
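
This pairs with the rotary_embedding.cpp hunk above: once interleaved_ is protected in RotaryEmbeddingImpl, the derived MRotaryEmbeddingImpl no longer initializes its own interleaved_(interleaved) and can read the flag stored in the base. A tiny, hypothetical sketch of the pattern (class names are placeholders, not from the repo):

// Only the protected-member pattern mirrors this diff; the classes are made up.
class BaseEmbedding {
 public:
  explicit BaseEmbedding(bool interleaved) : interleaved_(interleaved) {}

 protected:
  bool interleaved_;  // visible to derived classes, so no shadow copy is needed
};

class DerivedEmbedding : public BaseEmbedding {
 public:
  explicit DerivedEmbedding(bool interleaved) : BaseEmbedding(interleaved) {}

  // Reads the single flag kept in the base instead of a second,
  // separately initialized copy.
  bool is_interleaved() const { return interleaved_; }
};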

xllm/core/layers/config.h

Lines changed: 12 additions & 1 deletion
@@ -112,4 +112,15 @@ REGISTER_NOT_IMPLEMENTED_CLASS(SiglipEncoderLayerImpl);
 #include "npu/npu_glm4_decoder_layer_impl.h"
 #else
 REGISTER_NOT_IMPLEMENTED_CLASS(Glm4DecoderLayerImpl);
-#endif
+#endif
+
+#if defined(USE_NPU)
+#include "npu/npu_glm4_vision_encoder_layer_impl.h"
+namespace xllm {
+namespace layer {
+using Glm4VisionEncoderLayerImpl = NpuGlm4VisionEncoderLayerImpl;
+}
+}  // namespace xllm
+#else
+REGISTER_NOT_IMPLEMENTED_CLASS(Glm4VisionEncoderLayerImpl);
+#endif

xllm/core/layers/glm4_vision_encode_layer.h

Lines changed: 5 additions & 10 deletions
@@ -15,25 +15,20 @@ limitations under the License.
 
 #pragma once
 
-#if defined(USE_NPU)
-#include "npu/npu_glm4_vision_encoder_layer_impl.h"
-#endif
+#include "config.h"
 
 namespace xllm {
 namespace layer {
 
-#if defined(USE_NPU)
 class Glm4VisionEncoderLayer
-    : public torch::nn::ModuleHolder<NpuGlm4VisionEncoderLayerImpl> {
+    : public torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl> {
  public:
-  using torch::nn::ModuleHolder<NpuGlm4VisionEncoderLayerImpl>::ModuleHolder;
-  using Impl __attribute__((__unused__)) = NpuGlm4VisionEncoderLayerImpl;
+  using torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl>::ModuleHolder;
+  using Impl __attribute__((__unused__)) = Glm4VisionEncoderLayerImpl;
 
   Glm4VisionEncoderLayer(const ModelContext& context)
-      : ModuleHolder(std::make_shared<NpuGlm4VisionEncoderLayerImpl>(context)) {
-      }
+      : ModuleHolder(std::make_shared<Glm4VisionEncoderLayerImpl>(context)) {}
 };
-#endif
 
 }  // namespace layer
 }  // namespace xllm
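
Read together with the config.h hunk above, the pattern is: config.h selects a backend class and exposes it under the neutral alias Glm4VisionEncoderLayerImpl (or, presumably, registers a stub via REGISTER_NOT_IMPLEMENTED_CLASS so the name still exists), and this header wraps whatever the alias names, so the wrapper no longer carries its own #if defined(USE_NPU) guards. A condensed sketch of the two pieces as they look after this commit, with unrelated includes omitted:

// config.h side: pick the backend implementation behind a common name.
#if defined(USE_NPU)
#include "npu/npu_glm4_vision_encoder_layer_impl.h"
namespace xllm {
namespace layer {
using Glm4VisionEncoderLayerImpl = NpuGlm4VisionEncoderLayerImpl;
}  // namespace layer
}  // namespace xllm
#else
// Presumed to define a placeholder type on non-NPU builds.
REGISTER_NOT_IMPLEMENTED_CLASS(Glm4VisionEncoderLayerImpl);
#endif

// glm4_vision_encode_layer.h side: backend-agnostic wrapper around the alias.
namespace xllm {
namespace layer {
class Glm4VisionEncoderLayer
    : public torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl> {
 public:
  using torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl>::ModuleHolder;
  Glm4VisionEncoderLayer(const ModelContext& context)
      : ModuleHolder(std::make_shared<Glm4VisionEncoderLayerImpl>(context)) {}
};
}  // namespace layer
}  // namespace xllm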

xllm/core/layers/npu/npu_llama_decoder_layer_impl.cpp

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ limitations under the License.
 #include <map>
 
 #include "common/global_flags.h"
-#include "core/layers/common/attention_mask_impl.h"
+#include "core/layers/common/attention_mask.h"
 #include "loader/llama_decoder_loader.h"
 #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
 #include "torch_npu/csrc/core/npu/NPUException.h"

xllm/models/llm/deepseek_v2.h

Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,7 @@ limitations under the License.
 #include "core/framework/model/model_input_params.h"
 #include "core/framework/model/npu_dp_ep_padding.h"
 #include "core/framework/model_context.h"
-#include "core/layers/common/attention_mask_impl.h"
+#include "core/layers/common/attention_mask.h"
 #include "core/layers/deepseek_v2_decoder_layer.h"
 #include "core/layers/lm_head.h"
 #include "core/layers/npu/npu_rms_norm_impl.h"
@@ -159,9 +159,9 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
 
     torch::Tensor attn_mask;
     if (num_speculative_tokens_ == 0 || input_params.global_empty_kv_cache) {
-      attn_mask = attn_mask_->get_attn_mask(128, dtype_, device_);
+      attn_mask = attn_mask_.get_attn_mask(128, dtype_, device_);
     } else {
-      attn_mask = attn_mask_->gen_free_mask(
+      attn_mask = attn_mask_.gen_free_mask(
           num_speculative_tokens_ + 1, dtype_, device_);
     }
 
@@ -251,7 +251,7 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
   layer::WordEmbedding embed_tokens_{nullptr};
   std::shared_ptr<RotaryEmbedding> pos_emb_{nullptr};
   layer::PosEmbedding atb_pos_emb_{nullptr};
-  layer::AttentionMask attn_mask_{nullptr};
+  layer::AttentionMask attn_mask_;
   layer::RMSNorm norm_{nullptr};
 };
 TORCH_MODULE(DeepseekV2Model);
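
The -> to . change above is mechanical, but the branch it touches shows how the two mask helpers are chosen: the cached dense mask for ordinary prefill/decode, and a small free-form mask when speculative tokens are in flight. A hedged sketch of just that selection, lifted out of DeepseekV2ModelImpl; the parameters stand in for the member variables and input_params fields used in the hunk:

// Sketch only; in the real model these values live on DeepseekV2ModelImpl.
torch::Tensor select_attn_mask(xllm::layer::AttentionMask& attn_mask,
                               int num_speculative_tokens,
                               bool global_empty_kv_cache,
                               torch::Dtype dtype,
                               torch::Device device) {
  if (num_speculative_tokens == 0 || global_empty_kv_cache) {
    // Ordinary path: slice the cached mask (the cache is seeded with 128
    // positions in the AttentionMask constructor and grown on demand).
    return attn_mask.get_attn_mask(128, dtype, device);
  }
  // Speculative-decoding path: a compact mask covering the draft tokens plus one.
  return attn_mask.gen_free_mask(num_speculative_tokens + 1, dtype, device);
}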

xllm/models/llm/deepseek_v2_mtp.h

Lines changed: 3 additions & 3 deletions
@@ -25,7 +25,7 @@ limitations under the License.
 #include "core/framework/model/model_input_params.h"
 #include "core/framework/model/npu_dp_ep_padding.h"
 #include "core/framework/model_context.h"
-#include "core/layers/common/attention_mask_impl.h"
+#include "core/layers/common/attention_mask.h"
 #include "core/layers/deepseek_v2_decoder_layer.h"
 #include "core/layers/lm_head.h"
 #include "core/layers/npu/npu_column_parallel_linear_impl.h"
@@ -122,7 +122,7 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module {
     auto cos_pos = cos_sin_chunks[0].contiguous();
     auto sin_pos = cos_sin_chunks[1].contiguous();
 
-    auto attn_mask = attn_mask_->get_attn_mask(
+    auto attn_mask = attn_mask_.get_attn_mask(
         128, cos_pos.dtype().toScalarType(), cos_pos.device());
     for (size_t i = 0; i < layers_.size(); i++) {
       aclrtEvent* event = nullptr;
@@ -205,7 +205,7 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module {
   layer::WordEmbedding embed_tokens_{nullptr};
   std::shared_ptr<RotaryEmbedding> pos_emb_{nullptr};
   layer::PosEmbedding atb_pos_emb_{nullptr};
-  layer::AttentionMask attn_mask_{nullptr};
+  layer::AttentionMask attn_mask_;
   layer::ColumnParallelLinear eh_proj_{nullptr};
   layer::RMSNorm enorm_{nullptr};
   layer::RMSNorm hnorm_{nullptr};
