Skip to content

Commit 6d71ded

Browse files
committed
Refactor weight tensor processing
1 parent 907d832 commit 6d71ded

6 files changed

Lines changed: 181 additions & 177 deletions

File tree

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 39 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -550,11 +550,6 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
550550
return model_weights;
551551
}
552552

553-
// Static cache for quantized weight nodes (keyed by tensor data pointer)
554-
// This is a fallback for when tensors don't have pre-built constants in extra
555-
static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
556-
static std::mutex s_quantized_weight_cache_mutex;
557-
558553
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
559554
// Check if we have a pre-built constant from the OpenVINO backend buffer
560555
// This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
569564
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
570565
// F16/F32/BF16 weight with shared-memory constant
571566
auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
572-
if (weight_extra->constant) {
573-
GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
574-
return weight_extra->constant;
567+
if (weight_extra->weight_node) {
568+
GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
569+
return weight_extra->weight_node;
575570
}
576571
} else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
577572
// Quantized weight with pre-extracted data
578573
auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
579-
if (quant_extra->constant) {
580-
GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
581-
return quant_extra->constant;
574+
if (quant_extra->weight_node) {
575+
GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
576+
return quant_extra->weight_node;
582577
}
583578
}
584579
}
585580

586-
// Fallback: Check static cache for quantized weights (keyed by data pointer)
587-
// This handles cases where tensors weren't loaded through OpenVINO buffer
588-
if (ggml_is_quantized(tensor->type)) {
589-
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
590-
auto it = s_quantized_weight_cache.find(tensor->data);
591-
if (it != s_quantized_weight_cache.end()) {
592-
GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
593-
return it->second;
594-
}
595-
}
596-
597-
GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);
581+
// Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
582+
// openvino_host_buffer_type, which has enough space (get_alloc_size returns
583+
// layout.total_size for quantized 2D tensors) to store extracted data in-place.
584+
// Build the weight node and store it in tensor->extra for future reuse.
585+
GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
598586

599-
std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
600-
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
587+
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
588+
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
589+
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
601590
if (weight_types.find(tensor->type) == weight_types.end()) {
602591
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
603592
ggml_type_name(tensor->type));
604593
}
605594

606-
std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
607-
result->set_friendly_name(tensor->name);
608-
609-
// Cache the quantized weight node for future reuse
595+
OvWeight ov_weight;
610596
if (ggml_is_quantized(tensor->type)) {
611-
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
612-
s_quantized_weight_cache[tensor->data] = result;
613-
GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
597+
// For quantized weights, copy raw data to a temp buffer first because
598+
// process_weight_tensor reads from data and writes extracted results
599+
// (weights/scales/zp) to output_base_ptr — they would overlap if both
600+
// point to tensor->data.
601+
size_t raw_size = ggml_nbytes(tensor);
602+
std::vector<uint8_t> tmp(raw_size);
603+
memcpy(tmp.data(), tensor->data, raw_size);
604+
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
605+
} else {
606+
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
607+
// process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
608+
ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
609+
}
610+
611+
ov_weight.weight_node->set_friendly_name(tensor->name);
612+
613+
ggml_openvino_extra_base * extra;
614+
if (ov_weight.is_quantized()) {
615+
extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
616+
std::move(ov_weight.zp), ov_weight.weight_node);
617+
} else {
618+
extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
614619
}
620+
ggml_openvino_buffer_register_extra(tensor, extra);
615621

616-
return result;
622+
return ov_weight.weight_node;
617623
}
618624

619625
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {

ggml/src/ggml-openvino/ggml-openvino-extra.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
319319
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
320320
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
321321
layout.total_size = layout.zp_offset + layout.zp_size;
322+
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
322323

323324
return layout;
324325
}

ggml/src/ggml-openvino/ggml-openvino-extra.h

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -102,27 +102,30 @@ struct ggml_openvino_extra_base {
102102
explicit ggml_openvino_extra_base(Type t) : type(t) {}
103103
};
104104

105-
// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
105+
// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
106106
struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
107-
std::shared_ptr<ov::Node> constant; // Pre-built OpenVINO Constant node
107+
ov::Tensor weights; // The underlying weight data tensor
108+
std::shared_ptr<ov::Node> weight_node; // Pre-built OpenVINO weight node
108109

109-
explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
110-
: ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
110+
ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
111+
ggml_openvino_extra_base(Type::WEIGHT),
112+
weights(std::move(w)),
113+
weight_node(std::move(n)) {}
111114
};
112115

113-
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
116+
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
114117
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
115118
ov::Tensor weights; // U4 or U8 extracted weights
116119
ov::Tensor scales; // F16 scales
117120
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
118-
std::shared_ptr<ov::Node> constant; // Pre-built OpenVINO weight subgraph
121+
std::shared_ptr<ov::Node> weight_node; // Pre-built OpenVINO weight subgraph
119122

120-
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
123+
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
121124
ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
122125
weights(std::move(w)),
123126
scales(std::move(s)),
124127
zp(std::move(z)),
125-
constant(std::move(c)) {}
128+
weight_node(std::move(n)) {}
126129
};
127130

128131
// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -140,23 +143,27 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
140143
// Returns the total size needed in the buffer for extracted data.
141144

142145
struct ggml_openvino_extracted_layout {
143-
size_t total_size; // Total bytes needed
144-
size_t weights_offset; // Offset to weights in buffer
145-
size_t weights_size; // Size of weights in bytes
146-
size_t scales_offset; // Offset to scales in buffer
147-
size_t scales_size; // Size of scales in bytes
148-
size_t zp_offset; // Offset to zero points in buffer
149-
size_t zp_size; // Size of zero points in bytes (U4 or U8)
150-
bool is_u4; // true for U4 weights, false for U8
146+
size_t total_size = 0; // Total bytes needed
147+
size_t weights_offset = 0; // Offset to weights in buffer
148+
size_t weights_size = 0; // Size of weights in bytes
149+
size_t scales_offset = 0; // Offset to scales in buffer
150+
size_t scales_size = 0; // Size of scales in bytes
151+
size_t zp_offset = 0; // Offset to zero points in buffer
152+
size_t zp_size = 0; // Size of zero points in bytes (U4 or U8)
153+
bool is_u4; // true for U4 weights, false for U8
151154
int64_t weights_per_block; // weights per scale/zp block
152155
bool is_symmetric; // true for symmetric quantization
153156

154157
// Requantization info
155-
bool is_requant; // true if this tensor needs requantization
158+
bool is_requant = false; // true if this tensor needs requantization
156159
std::optional<ExtraQuantType> requant_type; // target requant type if is_requant
157160
};
158161

159162
// Calculate the buffer layout for extracted quantized data
160163
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
161164

162165
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
166+
167+
// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
168+
// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
169+
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 41 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
230230
// 2D tensor (typical weight shape)
231231
bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
232232

233-
// Check if this is a quantized weight tensor that needs extraction/requantization
234-
ggml_openvino_extracted_layout layout = {};
235-
if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
236-
layout = ggml_openvino_get_extracted_layout(tensor);
237-
}
233+
if (is_weight_buffer && is_full_tensor_set && is_2d) {
234+
try {
235+
auto result = process_weight_tensor(tensor, data, tensor->data);
236+
result.weight_node->set_friendly_name(tensor->name);
238237

239-
if (layout.total_size > 0) {
240-
// Quantized weight tensor with extraction/requantization
241-
uint8_t * buf_base = (uint8_t *) tensor->data;
238+
const auto & layout = result.layout;
239+
ggml_openvino_extra_base * extra;
242240

243-
try {
244-
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
245-
constant->set_friendly_name(tensor->name);
246-
247-
// Store in tensor->extra
248-
if (layout.is_requant && layout.requant_type.has_value() &&
249-
layout.requant_type.value() == ExtraQuantType::F16) {
250-
// F16 requant case - use weight_extra
251-
auto * extra = new ggml_openvino_weight_extra(constant);
252-
ctx->tensor_extras[tensor] = extra;
253-
tensor->extra = extra;
254-
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
255-
} else {
256-
// Quantized case - use quantized_weight_extra
257-
// Create tensors with external memory (already filled by process_weight_tensor)
258-
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
259-
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
260-
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
261-
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
262-
// zp shape: scalar for symmetric, per-block for asymmetric
263-
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
264-
265-
ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
266-
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
267-
ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
268-
269-
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
270-
std::move(zp), constant);
271-
ctx->tensor_extras[tensor] = extra;
272-
tensor->extra = extra;
241+
// Quantized path with extracted weight/scale/zp tensors
242+
if (result.is_quantized()) {
243+
extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
244+
std::move(result.zp), result.weight_node);
273245

274246
if (layout.is_requant) {
275247
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
276-
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
277-
layout.is_u4 ? 4 : 8, layout.weights_per_block);
248+
extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
249+
layout.weights_per_block);
278250
} else {
279251
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
280-
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
281-
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
252+
GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
253+
__func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
282254
}
283-
}
255+
} else {
256+
// F16/F32/BF16 weight or F16-requant
257+
extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);
284258

285-
} catch (const std::exception & e) {
286-
GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
287-
// Fall back to storing raw data
288-
memcpy((char *) tensor->data + offset, data, size);
289-
}
290-
} else if (is_weight_buffer && is_full_tensor_set && is_2d &&
291-
(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
292-
// F16/F32/BF16 weight tensor
293-
try {
294-
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
295-
constant->set_friendly_name(tensor->name);
259+
if (layout.total_size > 0) {
260+
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
261+
} else {
262+
GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
263+
}
264+
}
296265

297-
// Store in tensor->extra
298-
ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
299266
ctx->tensor_extras[tensor] = extra;
300267
tensor->extra = extra;
301268

302-
GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
303-
304269
} catch (const std::exception & e) {
305-
GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
306-
e.what());
270+
GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
271+
memcpy((char *) tensor->data + offset, data, size);
307272
}
308273
} else {
309274
// Non-weight tensor (KV cache, activations, etc.) - copy data
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
604569
return ctx->id;
605570
}
606571

572+
// Attach `extra` to `tensor` and record it in the tensor's OpenVINO buffer
// context, which tracks extras per tensor.
//
// Preconditions (asserted): `tensor` is non-null and lives in an OpenVINO buffer.
//
// If a different extra is already registered for this tensor it is deleted
// before being replaced. Registering the pointer that is already stored is a
// no-op for the map: it must not be deleted, otherwise both the map entry and
// tensor->extra would be left pointing at freed memory.
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(tensor->buffer != nullptr);
    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));

    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);

    auto it = ctx->tensor_extras.find(tensor);
    if (it != ctx->tensor_extras.end()) {
        // Only free the previous extra when it is actually being replaced.
        if (it->second != extra) {
            delete it->second;
        }
        it->second = extra;  // reuse the iterator instead of a second lookup
    } else {
        ctx->tensor_extras[tensor] = extra;
    }

    tensor->extra = extra;
}
587+
607588
// Reports whether `buft` is the OpenVINO buffer type, recognised by comparing
// its get_name callback against the OpenVINO implementation.
bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
    const bool has_openvino_name_fn = (buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name);
    return has_openvino_name_fn;
}

0 commit comments

Comments
 (0)