Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
for (int i = 0; i < cgraph->n_nodes; i++) {
auto * node = cgraph->nodes[i];
std::string name = std::string(node->name);
if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr)) {
if (node->op == GGML_OP_FLASH_ATTN_EXT || (node->op == GGML_OP_SOFT_MAX && node->src[1] != nullptr && node->src[0]->src[1] != nullptr)) {
compute_params.input_len = node->src[0]->ne[1];

auto * q_perm = node->src[0];
Expand Down Expand Up @@ -342,6 +342,15 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
compute_params.token_len_per_seq = 1;
}
}

if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE &&
node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) {
if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW &&
node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) {
compute_params.attention_size = node->ne[0];
}
}

// Fallback for graphs without GGML_OP_FLASH_ATTN_EXT: when a TRANSPOSE node's input is a PERMUTE whose source is a VIEW, take the attention size from the TRANSPOSE node's ne[0].
if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
node->src[0]->src[0]->op == GGML_OP_VIEW) {
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
}

// Returns true when `tensor` should be treated as a KV-cache tensor, i.e. when either
//   * the buffer backing `tensor` reports GGML_BACKEND_BUFFER_USAGE_ANY, or
//   * `op` is a GGML_OP_SET_ROWS node whose src[2] is `tensor`.
//
// The buffer check is evaluated first so that `op` is only inspected when the buffer
// check does not already decide the answer. Both pointers are null-guarded: callers
// in this backend pass `view_src` pointers here (e.g. `node->view_src` as `op`), and
// those are presumably null for tensors that are not views -- TODO confirm.
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
    if (tensor != nullptr && tensor->buffer != nullptr &&
        tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
        return true;
    }
    // A null `op` cannot be a SET_ROWS node, so it never marks `tensor` as KV cache.
    return op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
}

inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-openvino/openvino/translate_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,9 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder)

// Create common patterns shared by the translated graph.
//
// Runs the pre-translation passes over `tensor_map`:
//   * add_sliced_mask   -- only when the decoder reports is_stateful(); it must be
//                          invoked at most once and not at all for stateless decoders.
//   * add_rope_sin_cos  -- always.
void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
    if (ggml_model_decoder.is_stateful()) {
        add_sliced_mask(tensor_map, ggml_model_decoder);
    }
    add_rope_sin_cos(tensor_map, ggml_model_decoder);
}

Expand Down