Refactor weight tensor processing

wine99 · wine99 · commit 6d71ded5faff · 2026-02-06T21:02:34.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -550,11 +550,6 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-// Static cache for quantized weight nodes (keyed by tensor data pointer)
-// This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
-
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
             // F16/F32/BF16 weight with shared-memory constant
             auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
-            if (weight_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
-                return weight_extra->constant;
+            if (weight_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
+                return weight_extra->weight_node;
             }
         } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
             // Quantized weight with pre-extracted data
             auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
-            if (quant_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
-                return quant_extra->constant;
+            if (quant_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
+                return quant_extra->weight_node;
             }
         }
     }
 
-    // Fallback: Check static cache for quantized weights (keyed by data pointer)
-    // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
-
-    GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);
+    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
+    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
+    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
+    // Build the weight node and store it in tensor->extra for future reuse.
+    GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
 
-    std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
-                                        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
+    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16,
+                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
         throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                  ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
-    result->set_friendly_name(tensor->name);
-
-    // Cache the quantized weight node for future reuse
+    OvWeight ov_weight;
     if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+        // For quantized weights, copy raw data to a temp buffer first because
+        // process_weight_tensor reads from data and writes extracted results
+        // (weights/scales/zp) to output_base_ptr — they would overlap if both
+        // point to tensor->data.
+        size_t raw_size = ggml_nbytes(tensor);
+        std::vector<uint8_t> tmp(raw_size);
+        memcpy(tmp.data(), tensor->data, raw_size);
+        ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+    } else {
+        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
+        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
+        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
+    }
+
+    ov_weight.weight_node->set_friendly_name(tensor->name);
+
+    ggml_openvino_extra_base * extra;
+    if (ov_weight.is_quantized()) {
+        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
+                                                         std::move(ov_weight.zp), ov_weight.weight_node);
+    } else {
+        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
     }
+    ggml_openvino_buffer_register_extra(tensor, extra);
 
-    return result;
+    return ov_weight.weight_node;
 }
 
 void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
     layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
     layout.total_size = layout.zp_offset + layout.zp_size;
+    layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
 
     return layout;
 }
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -102,27 +102,30 @@ struct ggml_openvino_extra_base {
     explicit ggml_openvino_extra_base(Type t) : type(t) {}
 };
 
-// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
+// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
 struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO Constant node
+    ov::Tensor weights;                     // The underlying weight data tensor
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight node
 
-    explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
-        : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
+    ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
+        ggml_openvino_extra_base(Type::WEIGHT),
+        weights(std::move(w)),
+        weight_node(std::move(n)) {}
 };
 
-// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
+// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
     ov::Tensor weights;   // U4 or U8 extracted weights
     ov::Tensor scales;    // F16 scales
     ov::Tensor zp;        // U4 or U8 zero points (same type as weights)
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph
 
-    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
+    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
         ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
         weights(std::move(w)),
         scales(std::move(s)),
         zp(std::move(z)),
-        constant(std::move(c)) {}
+        weight_node(std::move(n)) {}
 };
 
 // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -140,23 +143,27 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
 // Returns the total size needed in the buffer for extracted data.
 
 struct ggml_openvino_extracted_layout {
-    size_t total_size;        // Total bytes needed
-    size_t weights_offset;    // Offset to weights in buffer
-    size_t weights_size;      // Size of weights in bytes
-    size_t scales_offset;     // Offset to scales in buffer
-    size_t scales_size;       // Size of scales in bytes
-    size_t zp_offset;         // Offset to zero points in buffer
-    size_t zp_size;           // Size of zero points in bytes (U4 or U8)
-    bool is_u4;               // true for U4 weights, false for U8
+    size_t total_size = 0;      // Total bytes needed
+    size_t weights_offset = 0;  // Offset to weights in buffer
+    size_t weights_size = 0;    // Size of weights in bytes
+    size_t scales_offset = 0;   // Offset to scales in buffer
+    size_t scales_size = 0;     // Size of scales in bytes
+    size_t zp_offset = 0;       // Offset to zero points in buffer
+    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
+    bool is_u4;                 // true for U4 weights, false for U8
     int64_t weights_per_block;  // weights per scale/zp block
     bool is_symmetric;        // true for symmetric quantization
 
     // Requantization info
-    bool is_requant;                              // true if this tensor needs requantization
+    bool is_requant = false;                      // true if this tensor needs requantization
     std::optional<ExtraQuantType> requant_type;   // target requant type if is_requant
 };
 
 // Calculate the buffer layout for extracted quantized data
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
 
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
+
+// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
+// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
     // 2D tensor (typical weight shape)
     bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
 
-    // Check if this is a quantized weight tensor that needs extraction/requantization
-    ggml_openvino_extracted_layout layout = {};
-    if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
-        layout = ggml_openvino_get_extracted_layout(tensor);
-    }
+    if (is_weight_buffer && is_full_tensor_set && is_2d) {
+        try {
+            auto result = process_weight_tensor(tensor, data, tensor->data);
+            result.weight_node->set_friendly_name(tensor->name);
 
-    if (layout.total_size > 0) {
-        // Quantized weight tensor with extraction/requantization
-        uint8_t * buf_base = (uint8_t *) tensor->data;
+            const auto & layout = result.layout;
+            ggml_openvino_extra_base * extra;
 
-        try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
-            constant->set_friendly_name(tensor->name);
-
-            // Store in tensor->extra
-            if (layout.is_requant && layout.requant_type.has_value() &&
-                layout.requant_type.value() == ExtraQuantType::F16) {
-                // F16 requant case - use weight_extra
-                auto * extra = new ggml_openvino_weight_extra(constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
-                GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
-            } else {
-                // Quantized case - use quantized_weight_extra
-                // Create tensors with external memory (already filled by process_weight_tensor)
-                ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-                ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
-                ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
-                                         static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
-                // zp shape: scalar for symmetric, per-block for asymmetric
-                ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-                ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
-                ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
-
-                auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
-                                                                        std::move(zp), constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
+            // Quantized path with extracted weight/scale/zp tensors
+            if (result.is_quantized()) {
+                extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
+                                                                 std::move(result.zp), result.weight_node);
 
                 if (layout.is_requant) {
                     GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
-                                   layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
-                                   layout.is_u4 ? 4 : 8, layout.weights_per_block);
+                                   extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
+                                   layout.weights_per_block);
                 } else {
                     int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
-                    GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
-                                   tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
+                    GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
+                                   __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
                 }
-            }
+            } else {
+                // F16/F32/BF16 weight or F16-requant
+                extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);
 
-        } catch (const std::exception & e) {
-            GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
-            // Fall back to storing raw data
-            memcpy((char *) tensor->data + offset, data, size);
-        }
-    } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
-               (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
-        // F16/F32/BF16 weight tensor
-        try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
-            constant->set_friendly_name(tensor->name);
+                if (layout.total_size > 0) {
+                    GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
+                } else {
+                    GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
+                }
+            }
 
-            // Store in tensor->extra
-            ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
             ctx->tensor_extras[tensor] = extra;
             tensor->extra = extra;
 
-            GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
-
         } catch (const std::exception & e) {
-            GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
-                           e.what());
+            GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
+            memcpy((char *) tensor->data + offset, data, size);
         }
     } else {
         // Non-weight tensor (KV cache, activations, etc.) - copy data
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
     return ctx->id;
 }
 
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
+    GGML_ASSERT(tensor != nullptr);
+    GGML_ASSERT(tensor->buffer != nullptr);
+    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));
+
+    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
+
+    auto it = ctx->tensor_extras.find(tensor);
+    if (it != ctx->tensor_extras.end()) {
+        delete it->second;
+    }
+
+    ctx->tensor_extras[tensor] = extra;
+    tensor->extra = extra;
+}
+
 bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
 }
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten` |
| `319` | `319` | `layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;` |
| `320` | `320` | `layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;` |
| `321` | `321` | `layout.total_size = layout.zp_offset + layout.zp_size;` |
|  | `322` | `+ layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));` |
| `322` | `323` |  |
| `323` | `324` | `return layout;` |
| `324` | `325` | `}` |