diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 815fc6aa69a60..e26d22558c1d1 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1608,6 +1608,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return *prepacked_weights_for_graph_; } + // Tags a fusion-generated initializer (whose name is not stable across sessions) with a stable, + // content-derived identity that SessionState uses to key cross-session pre-pack sharing. + void SetSharedPrepackInitializerId(const std::string& initializer_name, std::string share_id) { + generated_shared_prepack_ids_[initializer_name] = std::move(share_id); + } + + // Returns the tagged id, or nullptr if untagged. NOTE(review): aliases a map entry; may not survive a later Set. + const std::string* GetSharedPrepackInitializerId(const std::string& initializer_name) const { + auto it = generated_shared_prepack_ids_.find(initializer_name); + return it == generated_shared_prepack_ids_.end() ? nullptr : &it->second; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -2011,6 +2023,10 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // This is optional due to delayed construction. std::optional prepacked_weights_for_graph_; + // Maps a fusion-generated initializer name to its cross-session sharing identity. + // See SetSharedPrepackInitializerId. + InlinedHashMap generated_shared_prepack_ids_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. 
// Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 6bd1690fca815..162d7257d0a4c 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -4,6 +4,7 @@ #include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" #include +#include #include #include @@ -162,6 +163,13 @@ class MatMulNBits final : public OpKernel { const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_{}; size_t packed_b_size_{0}; + // True once PrePack(InputIndex::B) has folded the scales and (constant) zero points into packed_b_, + // leaving the CompInt8 buffer fully packed and compute-ready. Pre-packed weight sharing + // content-hashes the buffer right after the B PrePack returns, so everything that affects the + // packed bytes (in particular the block sum / BZpCorr, which depend on the zero points) must be + // folded in by then. Once set, the later scales/zero_point PrePack calls must not pack again: the + // CompInt8 packing is single-shot, and the buffer may by then be one shared from another session. + bool packed_b_finalized_{false}; IAllocatorUniquePtr scales_fp32_{}; IAllocatorUniquePtr bias_fp32_{}; @@ -227,7 +235,6 @@ template Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { - ORT_UNUSED_PARAMETER(prepacked_weights); is_packed = false; if (has_g_idx_) { return Status::OK(); @@ -308,10 +315,12 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All static_cast(packed_b_.get()), threadpool_ptr); - if (prepacked_weights != nullptr) { - prepacked_weights->buffers_.push_back(std::move(packed_b_)); - prepacked_weights->buffer_sizes_.push_back(packed_b_size_); - } + // Do not append packed_b_ here. 
Both the LUT and non-LUT branches share the single append + // after this if/else, so each records exactly one buffer. Appending here as well would move + // packed_b_ out now and then have the shared append record a second, moved-from/null buffer + // with a non-zero packed_b_size_. PrePackedWeights::GetHash() skips null buffers so sharing + // appears to work, but the prepacked-blob save path writes buffer_sizes_[i] bytes from + // buffers_[i].get() and would dereference that null pointer. } else { // For HQNBIT_CompInt8, route through SQNBIT_CompInt8 for sizing and packing. // This gets KleidiAI-sized buffer when available for 4-bit and packs B+scales correctly. @@ -341,24 +350,64 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); + // The framework content-hashes this packed buffer to deduplicate pre-packed weights, both + // within a session and across sessions (the shared container). The session-state prepack pass + // (SessionState::PrepackConstantInitializedTensors) passes a non-null prepacked_weights on both + // the container and the default single-session paths, so this zero-fill runs on essentially + // every prepack at load, not only when a sharing container is configured -- the guard below + // only skips a caller that asks for no cacheable buffer. The pack routines need not write every + // byte (alignment padding between the CompInt8 sub-regions; any layout could gain padding) and + // the reserve allocation is not zero-filled, so the hash would otherwise depend on uninitialized + // bytes. Zeroing the whole buffer is a one-time O(packed_b_size_) load cost (the pack overwrites + // the data regions, leaving only padding zeroed); inference is unaffected. 
+ if (prepacked_weights != nullptr) { + std::memset(packed_b_.get(), 0, packed_b_size_); + } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr, has_zp_input_, nullptr, threadpool_ptr, &mlas_backend_kernel_selector_config_); -#if defined(MLAS_TARGET_ARM64) - // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales and zero_points are accessible. - if (compute_type_ == HQNBIT_CompInt8 && nbits_ == 4 && has_zp_input_ && scales_fp32_ && - MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, SQNBIT_CompInt8, has_zp_input_, &mlas_backend_kernel_selector_config_)) { - const Tensor* zp_tensor = nullptr; - OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); - if (zp_tensor != nullptr) { - auto zptr = zp_tensor->Data(); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), - scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); + // Fold the scales and (constant) zero points into packed_b_ now, during the B PrePack, instead + // of deferring them to the later scales/zero_points PrePack calls. Pre-packed weight sharing + // content-hashes this buffer immediately after the B PrePack returns; the CompInt8 block sum + // (and the KleidiAI BZpCorr) is a function of the zero points, so they must already be folded + // in for the hash to reflect them. Otherwise two initializers with identical B and scales but + // different zero points would hash equal and the second would wrongly adopt the first's buffer + // and silently compute wrong results. scales and zero_points are constant initializers, so they + // are available here. The B pack above only partially populates the buffer (on x64 the block sum + // is deferred; on ARM64 8-bit the scales are ignored during B packing), so issue one more pack + // call with QuantBData == nullptr to finalize it. 
This is byte-identical to the staged + // scales + zero_points packing it replaces. + bool finalize_scale_zp_into_packed_b = effective_compute_type == SQNBIT_CompInt8 && scale_ptr != nullptr; +#if !defined(MLAS_TARGET_AMD64_IX86) + // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes + // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be + // passed to the packing routine, which would dereference the null QuantBData buffer. + finalize_scale_zp_into_packed_b = + finalize_scale_zp_into_packed_b && + (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, effective_compute_type, + has_zp_input_, &mlas_backend_kernel_selector_config_)); +#endif + if (finalize_scale_zp_into_packed_b) { + const uint8_t* zp_ptr = nullptr; + if (has_zp_input_) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor != nullptr) { + zp_ptr = zp_tensor->Data(); + } } + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, nullptr /*QuantBData*/, + packed_b_.get(), scale_ptr, has_zp_input_, zp_ptr, nullptr, + &mlas_backend_kernel_selector_config_); + packed_b_finalized_ = true; } -#endif // MLAS_TARGET_ARM64 } is_packed = true; + + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(std::move(packed_b_)); + prepacked_weights->buffer_sizes_.push_back(packed_b_size_); + } } else if (compute_type_ == SQNBIT_CompInt8 && !prefer_lut_gemm_) { // Packing scales and zero points // Guard: for LUT-eligible nodes, scales/ZP are already packed inside @@ -376,7 +425,10 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All }(); if (should_pack_scale_and_zp_inputs) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr) { + // packed_b_ is already finalized during the B PrePack (scales and zero points folded in there so + // the sharing content hash 
captures them), so skip packing here. The CompInt8 packing is + // single-shot and packed_b_ may now be a buffer shared from another session. + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) { auto sptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr, has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -384,7 +436,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } // Packing zero_point - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -410,13 +462,21 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } scales_are_packed_ = true; - is_packed = true; - // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales are still accessible. - // After this PrePack returns is_packed=true, ORT may erase scales from the constant - // input table (use count drops to 0), making them unavailable in later PrePack calls. - // Zero points haven't been PrePacked yet so they are still accessible. - if (has_zp_input_ && nbits_ == 4) { + // The scales were folded into packed_b_ during the B PrePack, so there is no separate packed + // scales buffer to cache or share. Report is_packed = false (as the x64 path already does for + // the scales input) so the framework does not engage pre-packed weight sharing for scales. 
+ // Engaging it would require pushing a placeholder buffer, but the real scales live inside + // packed_b_ so the placeholder would be null - and PrePackedWeights::GetHash() skips null + // buffers, making the scales container key identical for every MatMulNBits node. That would + // falsely increment the shared-weights counter for unrelated nodes without sharing any real + // buffer. The quantized weight B (which carries the folded-in scales) is shared on its own. + is_packed = false; + + // BZpCorr was already folded into packed_b_ during the B PrePack (so the sharing content hash + // captures the zero points), so re-folding it here must be skipped: the packing is single-shot + // and packed_b_ may now be a buffer shared from another session. + if (has_zp_input_ && nbits_ == 4 && !packed_b_finalized_) { const Tensor* zp_tensor = nullptr; OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); if (zp_tensor != nullptr) { @@ -457,7 +517,14 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // BZpCorr was already computed during B packing in Step 1 (if applicable). scales_are_packed_ = true; - is_packed = true; + + // The scales were folded into the packed B buffer during the B PrePack, so there is no + // separate packed scales buffer to cache or share. Report is_packed = false (mirroring the + // x64 path and the SQNBIT_CompInt8 path above) so the framework does not engage sharing for + // the scales input; engaging it would push a null placeholder whose content hash is identical + // for every node, falsely incrementing the shared-weights counter without sharing any real + // buffer. + is_packed = false; } else #endif // MLAS_TARGET_ARM64 { @@ -471,7 +538,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack scales separately only for 8-bit. 
For 4-bit on ARM64, scales are already packed // during B packing or used as a raw pointer at compute time (matching standard // SQNBIT_CompInt8 behavior where should_pack_scale_and_zp_inputs = (nbits_ == 8) on ARM64). - if (nbits_ == 8) { + // Skip when packed_b_ was already finalized during the B PrePack (scales/zero points folded + // in there for the sharing content hash); it may now be a buffer shared from another session. + if (nbits_ == 8 && !packed_b_finalized_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -482,7 +551,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack zero_points separately only for 8-bit (matching standard SQNBIT_CompInt8 behavior). // For 4-bit, zero_points are passed directly in data params or handled via KleidiAI BZpCorr. - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8 && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -540,8 +609,6 @@ template <> Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { - ORT_UNUSED_PARAMETER(prepacked_weights); - if (input_idx == InputIndex::scales || input_idx == InputIndex::bias) { auto sptr = tensor.Data(); auto tensor_size = static_cast(tensor.Shape().Size()); @@ -565,8 +632,12 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou if (input_idx == InputIndex::B) { const Tensor* scales = nullptr; OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); - if (scales && 
MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, - has_zp_input_, &mlas_backend_kernel_selector_config_)) { + // Convert the constant fp16 scales to fp32 up front so they (and the zero points) can be folded + // into packed_b_ during this B PrePack, mirroring the primary float PrePack above. Pre-packed + // weight sharing content-hashes the buffer right after this B PrePack returns, so for CompInt8 + // everything that affects the packed bytes (the scales, and the block sum / KleidiAI BZpCorr that + // depend on the zero points) must be folded in by now. + if (scales && compute_type_ == SQNBIT_CompInt8) { auto sptr = scales->Data(); auto scales_size = static_cast(scales->Shape().Size()); auto ptr = IAllocator::MakeUniquePtr(alloc, scales_size, true); @@ -581,25 +652,55 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); + // See the primary PrePack() above: SessionState::PrepackConstantInitializedTensors passes a + // non-null prepacked_weights on both the container and the default single-session paths, so this + // zero-fill runs on essentially every prepack at load (the guard only skips a caller that asks for + // no cacheable buffer). It keeps the dedup content hash reproducible regardless of bytes the pack + // leaves uninitialized (alignment padding), for any compute type. One-time O(packed_b_size_) load + // cost; inference is unaffected. + if (prepacked_weights != nullptr) { + std::memset(packed_b_.get(), 0, packed_b_size_); + } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); -#if defined(MLAS_TARGET_ARM64) - // For KleidiAI asymmetric 4-bit path: compute BZpCorr during B packing. 
- // The fp16 specialization packs B here (with scales already converted to fp32), - // so we also compute BZpCorr now while both scales and zero_points are accessible. - if (has_zp_input_ && nbits_ == 4 && scales_fp32_ != nullptr) { - const Tensor* zp_tensor = nullptr; - OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); - if (zp_tensor != nullptr) { - auto zptr = zp_tensor->Data(); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), - scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); + // Fold the scales and (constant) zero points into packed_b_ now (see the primary PrePack above): + // the CompInt8 block sum and the KleidiAI BZpCorr depend on the zero points, so they must be + // folded in before the sharing content hash is taken. Otherwise two initializers with identical B + // and scales but different zero points would hash equal and the second would wrongly adopt the + // first's buffer. The B pack above only partially populates the buffer, so issue one more pack + // call with QuantBData == nullptr to finalize it. This is byte-identical to the staged + // scales + zero_points packing it replaces. + bool finalize_scale_zp_into_packed_b = compute_type_ == SQNBIT_CompInt8 && scales_fp32_ != nullptr; +#if !defined(MLAS_TARGET_AMD64_IX86) + // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes + // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be + // passed to the packing routine, which would dereference the null QuantBData buffer. 
+ finalize_scale_zp_into_packed_b = + finalize_scale_zp_into_packed_b && + (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, + has_zp_input_, &mlas_backend_kernel_selector_config_)); +#endif + if (finalize_scale_zp_into_packed_b) { + const uint8_t* zp_ptr = nullptr; + if (has_zp_input_) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor != nullptr) { + zp_ptr = zp_tensor->Data(); + } } + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr /*QuantBData*/, + packed_b_.get(), scales_fp32_.get(), has_zp_input_, zp_ptr, nullptr, + &mlas_backend_kernel_selector_config_); + packed_b_finalized_ = true; } -#endif // MLAS_TARGET_ARM64 - is_packed = true; + + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(std::move(packed_b_)); + prepacked_weights->buffer_sizes_.push_back(packed_b_size_); + } } else if (compute_type_ == SQNBIT_CompInt8) { bool should_pack_scale_and_zp = [&]() { #if defined(MLAS_TARGET_AMD64_IX86) @@ -610,11 +711,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou }(); if (should_pack_scale_and_zp) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr) { + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); is_packed = false; - } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { + } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -635,6 +736,11 @@ Status 
MatMulNBits::UseSharedPrePackedBuffers(std::vector& used_shared_buffers = false; if (input_idx == InputIndex::B && !prepacked_buffers.empty()) { + // The buffer handed back is fully finalized: the producing session folded the scales and zero + // points (block sums / KleidiAI BZpCorr) into it during its PrePack(B), which is also when this + // kernel set packed_b_finalized_ on its own (identical) B PrePack. The later scale/zero-point + // PrePack calls already skip the staged packing whenever packed_b_finalized_ is set, so simply + // adopt the shared buffer here - no extra bookkeeping is needed to avoid re-folding into it. packed_b_ = std::move(prepacked_buffers[0]); used_shared_buffers = true; @@ -643,6 +749,9 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& packed_b_size_ = MlasLutGemmPackedSize(N_, K_, nbits_, block_size_, has_zp_input_); } } + // Only the quantized weight B yields a separately cached pre-packed buffer. The scales (and zero + // points) are folded into packed_b_ during the B PrePack and reported with is_packed = false, so + // the framework never asks this kernel to adopt a shared buffer for them. return Status::OK(); } diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 6ef2319c1d3f4..85def4898d21c 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -498,8 +498,13 @@ Status SessionState::PrepackConstantInitializedTensors( auto iter = initializers_to_share_map.find(input_name); bool is_shared_initializer = (iter != initializers_to_share_map.end()); - // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now - if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers && + // CPU EP only. 
An initializer joins the shared pre-packed container either when it was + // registered via OrtApi::AddInitializer (is_shared_initializer) or when a graph transformer + // tagged this synthesized initializer with a sharing identity (tagged_share_id). + const std::string* tagged_share_id = st->graph_.GetSharedPrepackInitializerId(input_name); + const bool enroll_tagged_initializer = (tagged_share_id != nullptr); + if ((is_shared_initializer || enroll_tagged_initializer) && + should_cache_prepacked_weights_for_shared_initializers && node.GetExecutionProviderType() == kCpuExecutionProvider) { // caching of pre-packed weights' turned ON @@ -530,12 +535,12 @@ Status SessionState::PrepackConstantInitializedTensors( // TODO: Check if some version of the ONNX IR allows op_type to be empty ORT_ENFORCE(!op_type.empty(), "The op type of a node cannot be empty"); - // The key for the pre-packed weights container lookup is the op_type + hash of the prepacked-weight - // that we just got by invoking PrePack() on this kernel. - + // Tagged initializers are keyed by their sharing identity; AddInitializer ones by the + // packed-bytes hash. Both carry the op_type prefix. const std::string prepacked_weights_container_key = - GenerateKeyForPrepackedWeightsMap(op_type, - weights_to_be_filled_in); + enroll_tagged_initializer + ? (op_type + "+id+" + *tagged_share_id) + : GenerateKeyForPrepackedWeightsMap(op_type, weights_to_be_filled_in); bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( prepacked_weights_container_key); @@ -615,11 +620,9 @@ Status SessionState::PrepackConstantInitializedTensors( is_packed, &weights_to_be_filled_in)); - // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // Some kernels (non-CPU related kernels) do not share their pre-packed results // even though they set is_packed = true so we leave it up to them. 
// We can change their behavior if we wish do so in a separate PR - // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not - // produce them. if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { const auto& op_type = node.OpType(); const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( diff --git a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc index f3956d5e9e0f3..07fccef64fee1 100644 --- a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc +++ b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc @@ -12,6 +12,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/initializer.h" +#include "core/optimizer/matmul_nbits_sharing_identity.h" #include "core/optimizer/utils.h" #include @@ -447,7 +448,6 @@ std::vector CollectDirectDQMatches( return direct_matches; } -// --------------------------------------------------------------------------- // Pattern 1 rewriting: DQ+Reshape+Transpose+[Cast]+MatMul/Gemm -> MatMulNBits // --------------------------------------------------------------------------- @@ -569,6 +569,10 @@ void ApplyReshapeTransposeFusions( zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); } + // Cross-session sharing identity for the generated weight group; computed before the tensors move. 
+ const std::string share_id = + ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level); + NodeAttributes mnb_attrs; utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs); utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs); @@ -578,7 +582,10 @@ void ApplyReshapeTransposeFusions( std::vector mnb_inputs; mnb_inputs.push_back(const_cast(mm_node->InputDefs()[0])); - mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst))); + NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + mnb_inputs.push_back(&b_weight_arg); mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst))); if (zp_mnb_tp) { mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst))); @@ -749,6 +756,10 @@ void ApplyDirectDQFusions( zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); } + // Cross-session sharing identity for the generated weight group; computed before the tensors move. 
+ const std::string share_id = + ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level); + NodeAttributes mnb_attrs; utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs); utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs); @@ -758,7 +769,10 @@ void ApplyDirectDQFusions( std::vector mnb_inputs; mnb_inputs.push_back(const_cast(mm_node->InputDefs()[0])); - mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst))); + NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + mnb_inputs.push_back(&b_weight_arg); mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst))); if (zp_mnb_tp) { mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst)));
diff --git a/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h
new file mode 100644
index 0000000000000..597c8a292afd8
--- /dev/null
+++ b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h
@@ -0,0 +1,71 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <string>
+
+#include "core/framework/murmurhash3.h"
+#include "core/framework/tensor.h"
+
+namespace onnxruntime {
+
+// Content-derived identity for a fusion-generated MatMulNBits weight group. SessionState uses it,
+// instead of a hash of the packed bytes, as the key for sharing the group's pre-packed buffer across
+// sessions, so two different weight groups must not end up with the same id.
+//
+// The id covers the raw bytes and byte length of weight and scale, whether zero_point is present
+// (plus its bytes and byte length when it is), and N, K, block_size, bits and accuracy_level. Equal
+// inputs always give equal ids; unequal inputs give unequal ids unless a 128-bit hash collides.
+// accuracy_level is hashed so that buffers packed for different compute types get different ids.
+// Pass zero_point only when it is an actual kernel input.
+inline std::string ComputeMatMulNBitsSharingId(const Tensor& weight, const Tensor& scale,
+                                               const std::optional<Tensor>& zero_point,
+                                               int64_t N, int64_t K, int64_t block_size,
+                                               int64_t bits, int64_t accuracy_level) {
+  // Each tensor is reduced to its byte length plus its own 128-bit digest, and those are hashed
+  // together with the scalar parameters in one final pass. Chaining the calls by feeding one 32-bit
+  // word of the previous digest in as the next seed would carry only 32 bits of state from tensor to
+  // tensor, and folding the result to 64 bits would discard half of the final digest.
+  constexpr size_t kTensorWords = 3;  // byte length + 128-bit digest as two 64-bit words
+  constexpr size_t kParamWords = 5;   // N, K, block_size, bits, accuracy_level
+  // Layout: weight, scale, zero_point presence flag, zero_point, params.
+  uint64_t record[3 * kTensorWords + 1 + kParamWords] = {};
+  size_t pos = 0;
+
+  const auto pack64 = [](uint32_t low, uint32_t high) {
+    return (static_cast<uint64_t>(high) << 32) | low;
+  };
+  const auto append_tensor = [&record, &pos, &pack64](const Tensor* tensor) {
+    uint32_t digest[4] = {0, 0, 0, 0};
+    uint64_t size_in_bytes = 0;
+    if (tensor != nullptr) {
+      size_in_bytes = static_cast<uint64_t>(tensor->SizeInBytes());
+      MurmurHash3::x86_128(tensor->DataRaw(), tensor->SizeInBytes(), 0, digest);
+    }
+    record[pos++] = size_in_bytes;
+    record[pos++] = pack64(digest[0], digest[1]);
+    record[pos++] = pack64(digest[2], digest[3]);
+  };
+
+  append_tensor(&weight);
+  append_tensor(&scale);
+  // The presence flag keeps "no zero_point" distinct from an empty zero_point tensor.
+  record[pos++] = zero_point.has_value() ? 1u : 0u;
+  append_tensor(zero_point.has_value() ? &*zero_point : nullptr);
+  const int64_t params[kParamWords] = {N, K, block_size, bits, accuracy_level};
+  for (const int64_t param : params) {
+    record[pos++] = static_cast<uint64_t>(param);
+  }
+
+  uint32_t id[4] = {0, 0, 0, 0};
+  MurmurHash3::x86_128(record, sizeof(record), 0, id);
+  // Emit all 128 bits. The separator keeps the two decimal halves from running together.
+  return "MatMulNBits.DQ:" + std::to_string(pack64(id[0], id[1])) + "." +
+         std::to_string(pack64(id[2], id[3]));
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index b9d7e898157bd..6bd5e157d8b65 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -7,6 +7,7 @@ #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" #include "core/optimizer/qdq_transformer/qdq_util.h" #include "core/optimizer/initializer.h" +#include "core/optimizer/matmul_nbits_sharing_identity.h" #include "core/graph/node_attr_utils.h" #include "core/graph/graph_utils.h" #include "core/framework/tensorprotoutils.h" @@ -646,8 +647,23 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, ORT_RETURN_IF_ERROR(TransposeDQWeightsForMatMulNBits( graph, *dq_node, "fused_DQ_MatMul", intra_op_thread_pool_, effective_bs, transposed)); + // Cross-session sharing identity for the generated B weight; computed before it is moved. 
+ const auto* weight_arg = dq_node->InputDefs()[0]; + const auto* weight_shape = weight_arg->Shape(); + ORT_RETURN_IF_NOT(weight_shape != nullptr && weight_shape->dim_size() >= 2, + "Weight shape unavailable for DQ node ", dq_node->Name()); + const int64_t bits = DQWeightBits(weight_arg->TypeAsProto()->tensor_type().elem_type()); + const std::string share_id = ComputeMatMulNBitsSharingId( + transposed.weight, transposed.scale, transposed.zero_point, + weight_shape->dim(1).dim_value(), weight_shape->dim(0).dim_value(), + effective_bs, bits, accuracy_level_); + auto& input_defs = replacement_node.MutableInputDefs(); - input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight))); + NodeArg& b_weight_arg = + graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + input_defs.push_back(&b_weight_arg); replacement_node.MutableInputArgsCount().push_back(1); input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.scale_proto, std::move(transposed.scale))); diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index 9deb064a90853..8e133caa15d55 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -3,6 +3,7 @@ #ifndef ORT_MINIMAL_BUILD +#include #include #include "gtest/gtest.h" @@ -26,6 +27,9 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" +#include "core/graph/model.h" +#include "test/util/include/inference_session_wrapper.h" +#include "test/util/include/test/test_environment.h" #include "core/providers/webgpu/webgpu_provider_options.h" #ifdef USE_WEBGPU #include 
"contrib_ops/webgpu/quantization/matmul_nbits_common.h" @@ -461,6 +465,117 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_Batch32_256x256_Bias) { TestMatMul2BitsLutGemm(32, 256, 256, 32, /*has_zero_point=*/true, /*has_bias=*/true); } +// Regression test for the LUT GEMM pre-pack + prepacked-save path. A 2-bit MatMulNBits node pre-packed +// via the LUT path must record its packed B buffer exactly once. A prior bug appended packed_b_ twice +// on the LUT path (inside the LUT branch and again in the shared append at the end of the B block), so +// the second entry was a moved-from/null buffer paired with a non-zero packed_b_size_. The pre-packed +// content hash skips null buffers, so cross-session sharing appeared to work, but saving pre-packed +// initializers iterates every recorded buffer and writes buffer_sizes_[i] bytes from buffers_[i].get(), +// dereferencing the null pointer when mlas.use_lut_gemm=1. This drives mlas.use_lut_gemm=1 together with +// session.save_external_prepacked_constant_initializers=1 and a non-empty optimized_model_filepath, and +// asserts that initialization (which performs the save) and a subsequent run both succeed. +TEST(MatMulNBitsLutGemm, Float32_2Bits_PrepackSaveDoesNotCrash) { + constexpr int64_t M = 1, N = 128, K = 128, block_size = 32; + if (!MlasIsLutGemmAvailable(static_cast(N), static_cast(K), 2, static_cast(block_size))) { + GTEST_SKIP() << "LUT GEMM not available on this platform"; + } + + // Quantize random weights into valid 2-bit MatMulNBits B/scales/zero_points initializers. 
+ RandomValueGenerator random{1234}; + std::vector b_fp32(random.Gaussian(AsSpan({K, N}), 0.0f, 0.25f)); + + int q_rows = 0, q_cols = 0; + MlasBlockwiseQuantizedShape(static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), q_rows, q_cols); + size_t q_data_size_in_bytes = 0, q_scale_size = 0, q_zp_size_in_bytes = 0; + MlasBlockwiseQuantizedBufferSizes(static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), + q_data_size_in_bytes, q_scale_size, &q_zp_size_in_bytes); + + std::vector b_data(q_data_size_in_bytes); + std::vector scales(q_scale_size); + std::vector zp(q_zp_size_in_bytes); + + auto& ortenv = **ort_env.get(); + onnxruntime::concurrency::ThreadPool* tp = ortenv.GetEnvironment().GetIntraOpThreadPool(); + MlasQuantizeBlockwise(b_data.data(), scales.data(), zp.data(), b_fp32.data(), + static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), + static_cast(N), tp); + + // Single-node MatMulNBits model: A is a runtime input; B/scales/zero_points are constant initializers + // (so they are pre-packed at session initialization). 
+ const int64_t k_blocks = (K + block_size - 1) / block_size; + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("matmul_2bits_lut_prepack_save", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder builder(graph); + + ONNX_NAMESPACE::TypeProto float_2d; + float_2d.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType()); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(M); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(K); + NodeArg* A = &graph.GetOrCreateNodeArg("A", &float_2d); + NodeArg* Y = &graph.GetOrCreateNodeArg("Y", nullptr); + + NodeArg* B = builder.MakeInitializer( + {static_cast(q_cols), k_blocks, static_cast(q_rows) / k_blocks}, b_data); + NodeArg* scales_arg = builder.MakeInitializer({N, static_cast(q_scale_size) / N}, scales); + NodeArg* zero_points = + builder.MakeInitializer({N, static_cast(q_zp_size_in_bytes) / N}, zp); + + Node& node = builder.AddNode("MatMulNBits", {A, B, scales_arg, zero_points}, {Y}, kMSDomain); + node.AddAttribute("K", K); + node.AddAttribute("N", N); + node.AddAttribute("block_size", block_size); + node.AddAttribute("bits", static_cast(QBits)); + node.AddAttribute("accuracy_level", static_cast(0)); + + graph.SetOutputs(std::vector{Y}); + ASSERT_STATUS_OK(graph.Resolve()); + + std::string model_bytes; + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); + + // Save the optimized model + pre-packed initializers into a unique temp dir. Writing the prepacked + // initializers is the path that dereferenced the duplicate null buffer before the fix. 
+ namespace fs = std::filesystem; + const fs::path tmp_dir = fs::temp_directory_path() / "ort_matmul2bits_lut_prepack_save_test"; + std::error_code ec; + fs::remove_all(tmp_dir, ec); + ASSERT_TRUE(fs::create_directories(tmp_dir, ec)) << ec.message(); + const fs::path optimized_model_path = tmp_dir / "optimized.onnx"; + + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsMlasLutGemm, "1")); + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")); + so.optimized_model_filepath = optimized_model_path.native(); + + std::vector fetches; + { + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + // Initialization performs the LUT pre-pack and writes the optimized model with external + // pre-packed initializers. Before the fix this dereferenced the duplicate null packed buffer. + ASSERT_STATUS_OK(session.Initialize()); + + auto cpu_allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; + std::vector a_data = random.Gaussian(AsSpan({M, K}), 0.0f, 0.25f); + OrtValue a_value; + CreateMLValue(cpu_allocator, AsSpan({M, K}), a_data, &a_value); + NameMLValMap feeds{{"A", a_value}}; + + ASSERT_STATUS_OK(session.Run(RunOptions{}, feeds, std::vector{"Y"}, &fetches)); + } + + ASSERT_EQ(fetches.size(), static_cast(1)); + EXPECT_TRUE(fs::exists(optimized_model_path)); + + fs::remove_all(tmp_dir, ec); +} + // Float zero point tests — directed QAD scenario (zp=1.5) void RunTest2BitsFloatZP(int64_t M, int64_t N, int64_t K, int64_t block_size, float zp_value) { RandomValueGenerator random{1234}; diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index aadbbab1c135b..dd5cfb73dfe31 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -22,10 +22,14 @@ #include 
"test/unittest_util/graph_transform_test_builder.h" #include "test/util/include/default_providers.h" #include "test/util/include/scoped_env_vars.h" +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" #include "core/providers/webgpu/webgpu_provider_options.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "test/util/include/test/test_environment.h" extern std::unique_ptr ort_env; @@ -87,6 +91,10 @@ struct TestOptions { bool legacy_shape{false}; // for backward compatibility + // When set, RunTest validates cross-session sharing of the pre-packed weights instead of doing a + // single run. The model is run in two sessions that use the same pre-packed weights container. + std::optional prepack_sharing_mode{}; + std::optional output_abs_error{}; std::optional output_rel_error{}; }; @@ -269,6 +277,13 @@ void RunTest(const TestOptions& opts, test.SetOutputRelErr("Y", *opts.output_rel_error); } + if (opts.prepack_sharing_mode.has_value()) { + // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP + // in two sessions and validates the sharing counters. + CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode, {N, k_blocks, blob_size}, input1_vals); + return; + } + if (!explicit_eps.empty()) { test.ConfigEps(std::move(explicit_eps)); } @@ -597,6 +612,55 @@ TEST(MatMulNBits, Float32_4b_Accuracy4_Batch) { RunTest(opts); } +#ifndef ENABLE_TRAINING +// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full +// training build, so there is nothing to exercise there. + +namespace { +// Builds a representative MatMulNBits TestOptions for the pre-packed weight sharing tests. 
+TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level, + bool has_zero_point, bool has_bias, PrepackSharingMode mode) { + TestOptions opts{}; + opts.M = 8; + opts.N = N; + opts.K = K; + opts.block_size = block_size; + opts.accuracy_level = accuracy_level; + opts.has_zero_point = has_zero_point; + opts.zp_is_4bit = true; + opts.has_bias = has_bias; + opts.prepack_sharing_mode = mode; + opts.output_abs_error = 0.1f; + opts.output_rel_error = 0.02f; + return opts; +} +} // namespace + +// Legacy sharing path: the weight B is registered as a shared initializer via +// SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. +TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + } + } +} + +// Negative control: with the shared container present but neither opt-in mechanism enabled, no +// pre-packed weights are shared across sessions. 
+TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, + /*has_bias*/ true, PrepackSharingMode::kNoSharing)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, + /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); +} + +#endif // !ENABLE_TRAINING + #endif #endif diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index f99334c4f33ef..411e83536c190 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -21,6 +21,7 @@ #include "test/unittest_util/graph_transform_test_builder.h" #include "test/util/include/default_providers.h" #include "test/util/include/scoped_env_vars.h" +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" @@ -51,6 +52,10 @@ struct TestOptions8Bits { bool has_g_idx{false}; bool has_bias{false}; + // When set, RunTest8Bits validates cross-session sharing of the pre-packed weights instead of + // doing a single run. The model is run in two CPU sessions that use the same container. + std::optional prepack_sharing_mode{}; + std::optional output_abs_error{}; std::optional output_rel_error{}; }; @@ -221,6 +226,14 @@ void RunTest8Bits(const TestOptions8Bits& opts) { test.SetOutputRelErr("Y", *opts.output_rel_error); } + if (opts.prepack_sharing_mode.has_value()) { + // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP + // in two sessions and validates the sharing counters. 
+ CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode, + {q_cols, k_blocks, q_rows / k_blocks}, input1_vals); + return; + } + std::vector> execution_providers; #ifdef USE_CUDA execution_providers.emplace_back(DefaultCudaExecutionProvider()); @@ -671,6 +684,56 @@ TEST(MatMulNBits, BFloat16_Int8_Chunked_BFloat16ZeroPoint) { } #endif +#if !defined(USE_CUDA) && !defined(USE_WEBGPU) +#ifndef ENABLE_TRAINING +// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full +// training build and is only implemented for the CPU EP, so these tests are CPU-only. + +namespace { +// Builds a representative 8-bit MatMulNBits TestOptions for the pre-packed weight sharing tests. +// accuracy_level 4 selects the int8 compute type (SQNBIT_CompInt8 / HQNBIT_CompInt8), which is the +// 8-bit path that pre-packs the quantized B weight. +TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_point, bool has_bias, + PrepackSharingMode mode) { + TestOptions8Bits opts{}; + opts.M = 8; + opts.N = 32; + opts.K = 256; + opts.block_size = block_size; + opts.accuracy_level = 4; + opts.has_zero_point = has_zero_point; + opts.has_bias = has_bias; + opts.prepack_sharing_mode = mode; + opts.output_abs_error = 0.1f; + opts.output_rel_error = 0.02f; + return opts; +} +} // namespace + +// Legacy sharing path for 8-bit weights: B is registered as a shared initializer via +// SessionOptions::AddInitializer. 
+TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, + PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, + PrepackSharingMode::kAddInitializer)); + } + } +} + +// Negative control for 8-bit weights: with the shared container present but neither opt-in mechanism +// enabled, no pre-packed weights are shared across sessions. +TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, + PrepackSharingMode::kNoSharing)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); +} +#endif // !ENABLE_TRAINING +#endif // !USE_CUDA && !USE_WEBGPU + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc new file mode 100644 index 0000000000000..97566afe02489 --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "core/framework/tensor.h" +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/asserts.h" +#include "test/util/include/default_providers.h" + +namespace onnxruntime { +namespace test { + +void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, + const std::vector& b_dims, + std::vector& b_data) { + SessionOptions so; + OrtValue b_ortvalue; + + switch (mode) { + case PrepackSharingMode::kAddInitializer: + // Register B as an explicitly shared initializer (the pre-existing sharing mechanism). + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(b_dims), b_data.data(), + OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator), b_ortvalue); + ASSERT_STATUS_OK(so.AddInitializer("B", &b_ortvalue)); + break; + case PrepackSharingMode::kNoSharing: + // Neither opt-in mechanism is used. + break; + } + + // Have all sessions created by this OpTester use the same pre-packed weights container. + test.EnableSharingOfPrePackedWeightsAcrossSessions(); + + // Pre-packing is limited to the CPU EP, so the sharing behavior is only exercised there. + auto cpu_ep = []() -> std::vector> { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + return execution_providers; + }; + + size_t number_of_pre_packed_weights_counter_session_1 = 0; + size_t number_of_shared_pre_packed_weights_counter = 0; + + // Session 1 + { + auto ep_vec = cpu_ep(); + test.Run(so, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &ep_vec, {}, + &number_of_pre_packed_weights_counter_session_1, + &number_of_shared_pre_packed_weights_counter); + // Nothing can be shared yet because this is the first session. 
+ ASSERT_EQ(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } + + const auto number_of_elements_in_shared_container = test.GetNumPrePackedWeightsShared(); + + if (mode == PrepackSharingMode::kNoSharing) { + // Without opting in, pre-packed weights must not be placed in the shared container. + ASSERT_EQ(number_of_elements_in_shared_container, static_cast(0)); + } + + // On some platforms/architectures MLAS may choose not to pre-pack, in which case there is nothing + // to share and we cannot meaningfully continue. + if (number_of_pre_packed_weights_counter_session_1 == 0) { + return; + } + + if (mode != PrepackSharingMode::kNoSharing) { + // At least the quantized weight B is content-addressed into the shared container. Some + // architectures (e.g. ARM64 KleidiAI) additionally pre-pack scales, but in the AddInitializer + // mode only the explicitly-registered B participates, so the container can hold fewer elements + // than the total number of pre-packed weights. + ASSERT_GT(number_of_elements_in_shared_container, static_cast(0)); + ASSERT_LE(number_of_elements_in_shared_container, number_of_pre_packed_weights_counter_session_1); + } + + // Session 2 + { + size_t number_of_pre_packed_weights_counter_session_2 = 0; + auto ep_vec = cpu_ep(); + test.Run(so, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &ep_vec, {}, + &number_of_pre_packed_weights_counter_session_2, + &number_of_shared_pre_packed_weights_counter); + + // The same number of weights is pre-packed in both sessions. + ASSERT_EQ(number_of_pre_packed_weights_counter_session_1, number_of_pre_packed_weights_counter_session_2); + + // Every weight stored in the shared container is served from it (i.e. shared) in the second + // session. For the no-sharing control this is zero; otherwise it matches the container size. 
+ ASSERT_EQ(number_of_shared_pre_packed_weights_counter, number_of_elements_in_shared_container); + + if (mode == PrepackSharingMode::kNoSharing) { + ASSERT_EQ(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } else { + ASSERT_GT(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } + } +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h new file mode 100644 index 0000000000000..1de0bbaa4bb85 --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#include +#include + +namespace onnxruntime { +namespace test { + +class OpTester; + +// How two sessions are configured to share the pre-packed weights of a MatMulNBits node. +enum class PrepackSharingMode { + // Legacy path: the weight is explicitly registered as a shared initializer via + // SessionOptions::AddInitializer. + kAddInitializer, + // Negative control: the shared container exists but neither opt-in mechanism is used, so no + // cross-session sharing must happen. + kNoSharing, +}; + +// Runs the already-configured MatMulNBits OpTester in two CPU sessions that share the same +// pre-packed weights container and asserts that the pre-packed weights are shared as expected. +// This logic is independent of the weight bit width, so it is shared by the 4-bit and 8-bit tests. +// `b_dims`/`b_data` describe the quantized B initializer and are only needed for the +// PrepackSharingMode::kAddInitializer path (to register B as a shared initializer). 
+void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, + const std::vector& b_dims, + std::vector& b_data); + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 8aa4c88052742..385aa7ffebc66 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -7,14 +7,20 @@ #include "core/common/span_utils.h" #include "core/framework/int4.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/dq_matmulnbits_fusion.h" +#include "core/session/inference_session.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "test/test_environment.h" #include "test/unittest_util/framework_test_utils.h" #include "test/unittest_util/graph_transform_test_builder.h" #include "test/optimizer/graph_transform_test_fixture.h" #include "test/util/include/asserts.h" +#include "test/util/include/inference_session_wrapper.h" #include "gtest/gtest.h" @@ -354,6 +360,152 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) { TransformerLevel::Level1, 1, pre_check, post_check)); } +// Validates the cross-session-sharing tag the fusion attaches to the generated B weight. The tag is a +// stable, content-derived identity: identical source quantization groups must yield the SAME identity +// (so two sessions optimizing the same model share the pre-packed B), while any semantic difference -- +// here, different zero points -- must yield a DIFFERENT identity (so they must not falsely share). 
+TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(N * num_blocks * block_size)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(N * num_blocks)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + // Non-default (non-8) zero points so the fusion keeps them (it elides uniform-8 zero points). + std::vector zp_a(static_cast(N * num_blocks), 3); + std::vector zp_b(zp_a.size(), 5); + + // Runs the fusion on a Pattern-1 model built from the given zero points and returns the sharing + // identity tagged onto the generated MatMulNBits B weight. + auto tag_for = [&](const std::vector& zp) -> std::string { + std::string captured; + auto build = [&](ModelTestBuilder& builder) { + BuildPattern1Graph(builder, M, N, K, block_size, /*with_zp*/ true, /*with_cast*/ false, + /*use_gemm*/ false, &weight, &scale, &zp); + }; + auto pre_check = [](Graph&) -> Status { return Status::OK(); }; + auto post_check = [&](Graph& graph) -> Status { + int matmulnbits = 0; + for (const auto& node : graph.Nodes()) { + if (node.OpType() == "MatMulNBits") { + ++matmulnbits; + const std::string& b_name = node.InputDefs()[1]->Name(); // input 1 == quantized B + const std::string* id = graph.GetSharedPrepackInitializerId(b_name); + EXPECT_NE(id, nullptr) << "generated B weight was not tagged for cross-session sharing"; + if (id != nullptr) { + captured = *id; + } + } + } + EXPECT_EQ(matmulnbits, 1); + return Status::OK(); + }; + auto transformer = std::make_unique(4); + EXPECT_TRUE(TestGraphTransformer(build, 21, *logger_, std::move(transformer), + TransformerLevel::Level1, 1, pre_check, post_check) + .IsOK()); + return captured; + }; + + const std::string id_a1 = tag_for(zp_a); + const std::string id_a2 = 
tag_for(zp_a); + const std::string id_b = tag_for(zp_b); + + ASSERT_FALSE(id_a1.empty()); + EXPECT_EQ(id_a1, id_a2); // stable: identical source quantization group -> identical identity + EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity +} + +// Builds and serializes a Pattern-1 DQ->Reshape->Transpose->MatMul model (UINT4 constant weight). When +// loaded into a session with the DQ->MatMulNBits fusion enabled, it becomes a MatMulNBits whose B is +// tagged for cross-session sharing. +static void SerializeDQMatMulModel(int64_t M, int64_t N, int64_t K, int64_t block_size, + const std::vector& weight, const std::vector& scale, + const std::vector& zp, std::string& model_bytes) { + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("dq_matmulnbits_share", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + ModelTestBuilder builder(model.MainGraph()); + BuildPattern1Graph(builder, M, N, K, block_size, /*with_zp*/ true, /*with_cast*/ false, + /*use_gemm*/ false, &weight, &scale, &zp); + builder.SetGraphOutputs(); + ASSERT_STATUS_OK(model.MainGraph().Resolve()); + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); +} + +// Loads the serialized model on the CPU EP with the DQ->MatMulNBits fusion enabled and the supplied +// shared container. Reports whether the fusion produced a MatMulNBits and how many pre-packed weights +// this session served from the container. 
+static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeightsContainer& container, + bool& produced_matmulnbits, size_t& used_shared_count) { + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsEnableDQMatMulNBitsFusion, "1")); + + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + ASSERT_STATUS_OK(session.Initialize()); + + produced_matmulnbits = false; + for (const auto& node : session.GetGraph().Nodes()) { + if (node.OpType() == "MatMulNBits") { + produced_matmulnbits = true; + break; + } + } + used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); +} + +// End-to-end: two sessions optimizing the same DQ+MatMul model share the fused MatMulNBits B weight +// through a common container WITHOUT any session option -- the fusion tags it and SessionState enrolls +// it by that identity. A session over a model that differs only in its zero points must NOT share. 
+TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(N * num_blocks * block_size)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(N * num_blocks)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(N * num_blocks), 3); + std::vector zp_b(zp_a.size(), 5); // differs only in zero points + + std::string model_a, model_b; + SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_a, model_a); + SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_b, model_b); + + PrepackedWeightsContainer container; + bool fused1 = false, fused2 = false, fused_b = false; + size_t used1 = 0, used2 = 0, used_b = 0; + + RunSharedFusionSession(model_a, container, fused1, used1); + ASSERT_TRUE(fused1) << "DQ -> MatMulNBits fusion did not run"; + if (container.GetNumberOfElements() == 0) { + GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform"; + } + EXPECT_EQ(used1, static_cast(0)); // first session: nothing to share yet + + // Second session over the SAME model shares the tagged B from the container. + RunSharedFusionSession(model_a, container, fused2, used2); + ASSERT_TRUE(fused2); + EXPECT_GT(used2, static_cast(0)); + + // A model differing only in zero points has a different identity and must NOT reuse the buffer. 
+ RunSharedFusionSession(model_b, container, fused_b, used_b); + ASSERT_TRUE(fused_b); + EXPECT_EQ(used_b, static_cast(0)); +} + TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_WithDefaultZP8) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index a1c0f8adfffb7..d16707e9a9ad4 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -2,10 +2,14 @@ // Licensed under the MIT License. #include +#include #include "core/common/span_utils.h" #include "core/common/float16.h" #include "core/framework/int4.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" @@ -1462,6 +1466,219 @@ TEST(QDQTransformerTests, DQGemmNotConvertedToMatMulNBits_Alpha) { 1e-5, 2e-5); } +// --------------------------------------------------------------------------- +// Cross-session pre-pack sharing for the DEFAULT DQ->MatMulNBits path +// --------------------------------------------------------------------------- +// DQMatMulToMatMulNBitsAction (in the QDQ selector/action transformer) runs without the +// session.enable_dq_matmulnbits_fusion flag and synthesizes the MatMulNBits B/scales/zp initializers +// with names that are NOT stable across sessions. It tags the generated B weight with a stable, +// content-derived identity that SessionState uses to share the pre-packed buffer across sessions. + +// Packs uint4 nibble values (row-major, 2 per byte) into UInt4x2 storage. 
+static std::vector PackUint4Nibbles(const std::vector& values) { + const size_t num_pairs = UInt4x2::CalcNumInt4Pairs(values.size()); + std::vector packed(num_pairs); + for (size_t i = 0; i < values.size(); i += 2) { + const uint8_t lo = values[i] & 0x0F; + const uint8_t hi = (i + 1 < values.size()) ? (values[i + 1] & 0x0F) : 0; + packed[i / 2] = UInt4x2(lo, hi); + } + return packed; +} + +// Builds a default-path model: a constant UINT4 weight [K, N] block-quantized along axis 0 feeding a +// DequantizeLinear whose output is the second input to a single MatMul. The QDQ selector/action +// transformer converts this into a MatMulNBits. Explicit weight/scale/zp give a deterministic identity. +static void BuildDefaultPathDQMatMul(ModelTestBuilder& builder, int64_t M, int64_t N, int64_t K, + int64_t block_size, const std::vector& weight, + const std::vector& scale, const std::vector& zp) { + const int64_t num_blocks = (K + block_size - 1) / block_size; + + auto* input_a = builder.MakeInput({M, K}, -1.0f, 1.0f); + auto* output = builder.MakeOutput(); + + auto* weight_arg = builder.MakeInitializer({K, N}, PackUint4Nibbles(weight)); + auto* scale_arg = builder.MakeInitializer({num_blocks, N}, scale); + auto* zp_arg = builder.MakeInitializer({num_blocks, N}, PackUint4Nibbles(zp)); + + NodeAttributes dq_attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", static_cast(0)), dq_attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), dq_attrs); + auto* dq_output = builder.MakeIntermediate(); + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &dq_attrs); + + builder.AddNode("MatMul", {input_a, dq_output}, {output}); +} + +// Serializes a default-path DQ->MatMul model built from explicit quantization data. 
+static void SerializeDefaultPathModel(int64_t M, int64_t N, int64_t K, int64_t block_size, + const std::vector& weight, const std::vector& scale, + const std::vector& zp, std::string& model_bytes) { + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("dq_matmul_default_share", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + ModelTestBuilder builder(model.MainGraph()); + BuildDefaultPathDQMatMul(builder, M, N, K, block_size, weight, scale, zp); + builder.SetGraphOutputs(); + ASSERT_STATUS_OK(model.MainGraph().Resolve()); + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); +} + +// Loads the model on the CPU EP with the given shared container and DEFAULT options (no fusion flag). +// Reports whether a MatMulNBits was produced, the sharing identity tagged onto its B weight, and how +// many pre-packed weights this session served from the container. 
+static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeightsContainer& container, + bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count, + int accuracy_level = -1) { + SessionOptions so; + if (accuracy_level >= 0) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str())); + } + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + ASSERT_STATUS_OK(session.Initialize()); + + produced_matmulnbits = false; + b_tag.clear(); + const Graph& graph = session.GetGraph(); + for (const auto& node : graph.Nodes()) { + if (node.OpType() == "MatMulNBits") { + produced_matmulnbits = true; + const std::string& b_name = node.InputDefs()[1]->Name(); // input 1 == quantized B + if (const std::string* id = graph.GetSharedPrepackInitializerId(b_name); id != nullptr) { + b_tag = *id; + } + break; + } + } + used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); +} + +// Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived +// identity: identical quantization data yields the SAME identity, while different zero points yield a +// DIFFERENT identity (so two models differing only in zp must not falsely share a pre-packed buffer). 
+TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(num_blocks * N), 3); + std::vector zp_b(zp_a.size(), 5); + + auto tag_for = [&](const std::vector& zp) -> std::string { + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + PrepackedWeightsContainer container; + bool produced = false; + std::string tag; + size_t used = 0; + RunDefaultPathSession(model_bytes, container, produced, tag, used); + EXPECT_TRUE(produced) << "DQ -> MatMulNBits conversion did not run on the default path"; + return tag; + }; + + const std::string id_a1 = tag_for(zp_a); + const std::string id_a2 = tag_for(zp_a); + const std::string id_b = tag_for(zp_b); + + ASSERT_FALSE(id_a1.empty()) << "generated B weight was not tagged for cross-session sharing"; + EXPECT_EQ(id_a1, id_a2); // stable: identical quantization data -> identical identity + EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity +} + +// End-to-end: two sessions converting the same model via the default path share the MatMulNBits B +// pre-packed buffer through a common container (no session option). A model differing only in zero +// points has a different identity and must not reuse the buffer. 
+TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(num_blocks * N), 3); + std::vector zp_b(zp_a.size(), 5); // differs only in zero points + + std::string model_a, model_b; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_a, model_a); + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_b, model_b); + + PrepackedWeightsContainer container; + bool produced1 = false, produced2 = false, produced_b = false; + std::string tag1, tag2, tag_b; + size_t used1 = 0, used2 = 0, used_b = 0; + + RunDefaultPathSession(model_a, container, produced1, tag1, used1); + ASSERT_TRUE(produced1) << "DQ -> MatMulNBits conversion did not run on the default path"; + if (container.GetNumberOfElements() == 0) { + GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform"; + } + EXPECT_EQ(used1, static_cast(0)); // first session: nothing to share yet + + // Second session over the SAME model reuses the tagged B from the container. + RunDefaultPathSession(model_a, container, produced2, tag2, used2); + ASSERT_TRUE(produced2); + EXPECT_GT(used2, static_cast(0)); + + // A model differing only in zero points must NOT reuse the buffer. + RunDefaultPathSession(model_b, container, produced_b, tag_b, used_b); + ASSERT_TRUE(produced_b); + EXPECT_EQ(used_b, static_cast(0)); +} + +// The sharing identity includes accuracy_level, so the same weights compiled for different compute +// types (e.g. 
CompFp32 at level 0 vs CompInt8 at level 4) get DIFFERENT identities and must not share +// a pre-packed buffer whose layout depends on that compute type. +TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelDoesNotShare) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp(static_cast(num_blocks * N), 3); + + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + + PrepackedWeightsContainer container; + bool produced0 = false, produced4 = false; + std::string tag0, tag4; + size_t used0 = 0, used4 = 0; + + RunDefaultPathSession(model_bytes, container, produced0, tag0, used0, /*accuracy_level*/ 0); + ASSERT_TRUE(produced0) << "DQ -> MatMulNBits conversion did not run on the default path"; + + // Same model/weights, different accuracy level, sharing the same container. + RunDefaultPathSession(model_bytes, container, produced4, tag4, used4, /*accuracy_level*/ 4); + ASSERT_TRUE(produced4); + + ASSERT_FALSE(tag0.empty()); + ASSERT_FALSE(tag4.empty()); + EXPECT_NE(tag0, tag4); // accuracy_level participates in the identity + EXPECT_EQ(used4, static_cast(0)); // different identity => no cross-accuracy sharing +} + #endif // !defined(DISABLE_CONTRIB_OPS) } // namespace test