From 71ece5fea507b2bfc0294bc4f6f437f21d826fe1 Mon Sep 17 00:00:00 2001 From: Xuejun Date: Mon, 13 Apr 2026 22:58:04 +0800 Subject: [PATCH] OpenVINO backend: fix attention size computation in LLM params --- ggml/src/ggml-openvino/ggml-decoder.cpp | 11 ++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 4 ++-- ggml/src/ggml-openvino/openvino/translate_session.cpp | 4 +++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d75915cd00d..acb6a485d79 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -286,7 +286,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr for (int i = 0; i < cgraph->n_nodes; i++) { auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); - if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr)) { + if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr && node->src[0]->src[1] != nullptr)) { compute_params.input_len = node->src[0]->ne[1]; auto * q_perm = node->src[0]; @@ -342,6 +342,15 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.token_len_per_seq = 1; } } + + if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE && + node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) { + if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW && + node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) { + compute_params.attention_size = node->ne[0]; + } + } + // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT) if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE && node->src[0]->src[0]->op == GGML_OP_VIEW) { diff --git 
a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index ff8f81e8ae6..c39410ffde2 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -248,8 +248,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { - return (op->op == GGML_OP_SET_ROWS && op->src[2] == tensor) || - tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY; + return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || + (op->op == GGML_OP_SET_ROWS && op->src[2] == tensor); } inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 8283777cdd0..828c0b8a47f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -146,7 +146,9 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) // Create common patterns void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { - add_sliced_mask(tensor_map, ggml_model_decoder); + if (ggml_model_decoder.is_stateful()) { + add_sliced_mask(tensor_map, ggml_model_decoder); + } add_rope_sin_cos(tensor_map, ggml_model_decoder); }