Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,15 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {

// Tears down an OpenVINO backend instance.
//
// Drops this backend's share of the runtime context's backend_count and, when
// the count reaches zero, asks the runtime context to clear its caches. The
// backend's own context object and the backend struct are then deleted.
static void ggml_backend_openvino_free(ggml_backend_t backend) {
    ggml_backend_openvino_context * ov_ctx = (ggml_backend_openvino_context *) backend->context;

    if (ov_ctx->runtime_context) {
        auto runtime = std::static_pointer_cast<ov_runtime_context>(ov_ctx->runtime_context);
        // Decrement first, then test: only the last backend released triggers the cache purge.
        const bool last_backend_released = (--runtime->backend_count == 0);
        if (last_backend_released) {
            runtime->clear_caches();
        }
    }

    delete ov_ctx;
    delete backend;
}
Expand Down Expand Up @@ -672,6 +681,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
r_ctx->device = ggml_openvino_get_device_name();
r_ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
r_ctx->backend_count++;

ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
Expand Down
12 changes: 11 additions & 1 deletion ggml/src/ggml-openvino/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,21 @@ struct ov_runtime_context {
// Simultaneous stateful inference request support to be added.
size_t stateful_kv_size;
std::map<std::string, std::string> kv_state_input_name_map;
int backend_count;

ov_runtime_context() :
device("CPU"),
stateful(false),
stateful_kv_size(0) {}
stateful_kv_size(0),
backend_count(0) {}

// Empties every cache held by this runtime context, in this order: the decoder
// cache, the infer-request cache, the prefill infer-request cache, and the
// OpenVINO input/output name caches.
// ggml_backend_openvino_free() invokes this once backend_count drops to zero.
// NOTE(review): the cache members are declared outside this view; presumably
// they are standard containers whose clear() destroys the held entries — confirm.
// NOTE(review): the call order is kept as written in case later caches hold
// references into earlier ones — verify before reordering.
void clear_caches() {
decoder_cache.clear();
infer_request_cache.clear();
infer_request_cache_prefill.clear();
ov_input_names_cache.clear();
ov_output_names_cache.clear();
}
};

enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
Expand Down