@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
230230 // 2D tensor (typical weight shape)
231231 bool is_2d = (tensor->ne [2 ] == 1 && tensor->ne [3 ] == 1 );
232232
233- // Check if this is a quantized weight tensor that needs extraction/requantization
234- ggml_openvino_extracted_layout layout = {};
235- if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized (tensor->type )) {
236- layout = ggml_openvino_get_extracted_layout (tensor);
237- }
233+ if (is_weight_buffer && is_full_tensor_set && is_2d) {
234+ try {
235+ auto result = process_weight_tensor (tensor, data, tensor->data );
236+ result.weight_node ->set_friendly_name (tensor->name );
238237
239- if (layout.total_size > 0 ) {
240- // Quantized weight tensor with extraction/requantization
241- uint8_t * buf_base = (uint8_t *) tensor->data ;
238+ const auto & layout = result.layout ;
239+ ggml_openvino_extra_base * extra;
242240
243- try {
244- std::shared_ptr<ov::Node> constant = process_weight_tensor (tensor, data, buf_base);
245- constant->set_friendly_name (tensor->name );
246-
247- // Store in tensor->extra
248- if (layout.is_requant && layout.requant_type .has_value () &&
249- layout.requant_type .value () == ExtraQuantType::F16) {
250- // F16 requant case - use weight_extra
251- auto * extra = new ggml_openvino_weight_extra (constant);
252- ctx->tensor_extras [tensor] = extra;
253- tensor->extra = extra;
254- GGML_LOG_DEBUG (" %s: requantized %s to F16\n " , __func__, tensor->name );
255- } else {
256- // Quantized case - use quantized_weight_extra
257- // Create tensors with external memory (already filled by process_weight_tensor)
258- ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8 ;
259- ov::Shape weight_shape = {static_cast <size_t >(tensor->ne [1 ]), static_cast <size_t >(tensor->ne [0 ])};
260- ov::Shape scale_shape = {static_cast <size_t >(tensor->ne [1 ]),
261- static_cast <size_t >(tensor->ne [0 ] / layout.weights_per_block )};
262- // zp shape: scalar for symmetric, per-block for asymmetric
263- ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
264-
265- ov::Tensor weights (weight_type, weight_shape, buf_base + layout.weights_offset );
266- ov::Tensor scales (ov::element::f16 , scale_shape, buf_base + layout.scales_offset );
267- ov::Tensor zp (weight_type, zp_shape, buf_base + layout.zp_offset );
268-
269- auto * extra = new ggml_openvino_quantized_weight_extra (std::move (weights), std::move (scales),
270- std::move (zp), constant);
271- ctx->tensor_extras [tensor] = extra;
272- tensor->extra = extra;
241+ // Quantized path with extracted weight/scale/zp tensors
242+ if (result.is_quantized ()) {
243+ extra = new ggml_openvino_quantized_weight_extra (std::move (result.weights ), std::move (result.scales ),
244+ std::move (result.zp ), result.weight_node );
273245
274246 if (layout.is_requant ) {
275247 GGML_LOG_DEBUG (" %s: requantized %s to %s (u%d, block_size=%ld)\n " , __func__, tensor->name ,
276- layout.requant_type .value () == ExtraQuantType::Q4_0_128 ? " Q4_0_128 " : " Q8_0_32 " ,
277- layout.is_u4 ? 4 : 8 , layout. weights_per_block );
248+ extra_quant_type_name ( layout.requant_type .value ()), layout. is_u4 ? 4 : 8 ,
249+ layout.weights_per_block );
278250 } else {
279251 int64_t n_blocks = ggml_nelements (tensor) / layout.weights_per_block ;
280- GGML_LOG_DEBUG (" %s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n " , __func__ ,
281- tensor->name , layout.is_u4 ? 4 : 8 , layout.weights_size , n_blocks);
252+ GGML_LOG_DEBUG (" %s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n " ,
253+ __func__, tensor->name , layout.is_u4 ? 4 : 8 , layout.weights_size , n_blocks);
282254 }
283- }
255+ } else {
256+ // F16/F32/BF16 weight or F16-requant
257+ extra = new ggml_openvino_weight_extra (std::move (result.weights ), result.weight_node );
284258
285- } catch (const std::exception & e) {
286- GGML_LOG_ERROR (" %s: failed to process quantized data for %s: %s\n " , __func__, tensor->name , e.what ());
287- // Fall back to storing raw data
288- memcpy ((char *) tensor->data + offset, data, size);
289- }
290- } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
291- (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
292- // F16/F32/BF16 weight tensor
293- try {
294- std::shared_ptr<ov::Node> constant = process_weight_tensor (tensor, data, tensor->data );
295- constant->set_friendly_name (tensor->name );
259+ if (layout.total_size > 0 ) {
260+ GGML_LOG_DEBUG (" %s: requantized %s to F16\n " , __func__, tensor->name );
261+ } else {
262+ GGML_LOG_DEBUG (" %s: created shared-memory weight node for %s\n " , __func__, tensor->name );
263+ }
264+ }
296265
297- // Store in tensor->extra
298- ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra (constant);
299266 ctx->tensor_extras [tensor] = extra;
300267 tensor->extra = extra;
301268
302- GGML_LOG_DEBUG (" %s: created shared-memory constant for %s\n " , __func__, tensor->name );
303-
304269 } catch (const std::exception & e) {
305- GGML_LOG_DEBUG (" %s: failed to create shared-memory constant for %s: %s\n " , __func__, tensor->name ,
306- e. what () );
270+ GGML_LOG_ERROR (" %s: failed to process weight tensor for %s: %s\n " , __func__, tensor->name , e. what ());
271+ memcpy (( char *) tensor-> data + offset, data, size );
307272 }
308273 } else {
309274 // Non-weight tensor (KV cache, activations, etc.) - copy data
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
604569 return ctx->id ;
605570}
606571
// NOTE(review): lines below are new-side (+) lines of a rendered diff; code kept byte-identical.
// Registers `extra` as the backend-specific payload for `tensor`, taking ownership of it.
// The buffer context keeps the owning map entry; `tensor->extra` is the mirror pointer
// that the rest of the backend reads. Any previously registered extra for the same
// tensor is deleted first, so re-registration does not leak.
572+ void ggml_openvino_buffer_register_extra (ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
// Preconditions: tensor must live in an OpenVINO-backed buffer, otherwise the
// context cast below would be invalid.
573+ GGML_ASSERT (tensor != nullptr );
574+ GGML_ASSERT (tensor->buffer != nullptr );
575+ GGML_ASSERT (ggml_backend_buffer_is_openvino (tensor->buffer ));
576+
577+ auto * ctx = static_cast <ggml_backend_openvino_buffer_context *>(tensor->buffer ->context );
578+
// Replace-in-place: free the old extra (the map owns its values) before overwriting.
579+ auto it = ctx->tensor_extras .find (tensor);
580+ if (it != ctx->tensor_extras .end ()) {
581+ delete it->second ;
582+ }
583+
// Store the new owner in the map and expose it through tensor->extra.
584+ ctx->tensor_extras [tensor] = extra;
585+ tensor->extra = extra;
586+ }
587+
// Returns true iff `buft` is the OpenVINO buffer type. Identity is established by
// comparing the `get_name` function pointer in the iface vtable against this
// backend's implementation, so no string comparison or RTTI is needed.
607588bool ggml_backend_buft_is_openvino (ggml_backend_buffer_type_t buft) {
608589 return buft->iface .get_name == ggml_backend_openvino_buffer_type_get_name;
609590}
0 commit comments