From 64329127cd4532762043d249eff6dba46ae77635 Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Fri, 19 Jun 2026 10:51:35 +0200 Subject: [PATCH 01/13] [CPU] Enable pre-packed weights sharing for MatMulNBits --- .../onnxruntime_session_options_config_keys.h | 23 ++++ .../cpu/quantization/matmul_nbits.cc | 52 +++++++- onnxruntime/core/framework/session_state.cc | 25 +++- .../test/contrib_ops/matmul_4bits_test.cc | 109 +++++++++++++++++ .../test/contrib_ops/matmul_8bits_test.cc | 88 ++++++++++++++ .../matmul_nbits_prepack_sharing_test_util.cc | 112 ++++++++++++++++++ .../matmul_nbits_prepack_sharing_test_util.h | 38 ++++++ 7 files changed, 438 insertions(+), 9 deletions(-) create mode 100644 onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc create mode 100644 onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 3efa6ae50faa7..ce4222129e24b 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -372,6 +372,29 @@ static const char* const kOrtSessionOptionsModelExternalInitializersFileFolderPa static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = "session.save_external_prepacked_constant_initializers"; +// Enables cross-session sharing of pre-packed weights for *all* CPU constant initializers via an +// OrtPrepackedWeightsContainer, not just those explicitly registered with OrtApi::AddInitializer. +// +// Without this flag, only initializers registered via AddInitializer participate in the +// OrtPrepackedWeightsContainer cache. That requires knowing initializer names ahead of time, which is +// not possible when graph-level optimizers synthesize new initializers at session-creation time (e.g. 
+// the DQ + MatMul -> MatMulNBits fusion produces transposed/repacked weights with auto-generated +// names that do not exist in the input model file). +// +// With this flag set to "1", any CPU constant initializer that a kernel pre-packs participates in the +// shared container, content-addressed by hash(packed_bytes). Two sessions that pack byte-identical +// weights deduplicate to a single buffer owned by the container. +// +// Requirements: +// - The session must be created via OrtApi::CreateSessionWithPrepackedWeightsContainer. +// - All sessions intended to share must set this flag consistently and use the same container. +// +// - "0": Default. Only AddInitializer-registered initializers can share pre-packed weights cross-session. +// - "1": All CPU constant initializers can share pre-packed weights cross-session via the container. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1") +static const char* const kOrtSessionOptionsShareAllPrepackedCpuInitializers = + "session.share_all_prepacked_cpu_initializers"; + // Use this config when you want to collect memory stats for each node in the graph. // The file format is a CSV file with the following columns: // The file will be created if it does not exist, and will be overwritten if it does. 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 6bd1690fca815..19e6d0389fe7d 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -4,6 +4,7 @@ #include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" #include +#include #include #include @@ -227,7 +228,6 @@ template Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { - ORT_UNUSED_PARAMETER(prepacked_weights); is_packed = false; if (has_g_idx_) { return Status::OK(); @@ -341,6 +341,14 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); + // The CompInt8 packed layout is populated across several PrePack() calls (B data here, then + // scales and zero_points). Pre-packed weight sharing keys this buffer by a content hash taken + // right after this B pack returns, so any bytes not yet written (later-packed regions and + // alignment padding) must be deterministic. The allocation above uses reserve and is not + // zero-filled, so zero it here to keep the hash identical across sessions and enable sharing. 
+ if (effective_compute_type == SQNBIT_CompInt8) { + std::memset(packed_b_.get(), 0, packed_b_size_); + } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr, has_zp_input_, nullptr, threadpool_ptr, &mlas_backend_kernel_selector_config_); @@ -359,6 +367,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All #endif // MLAS_TARGET_ARM64 } is_packed = true; + + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(std::move(packed_b_)); + prepacked_weights->buffer_sizes_.push_back(packed_b_size_); + } } else if (compute_type_ == SQNBIT_CompInt8 && !prefer_lut_gemm_) { // Packing scales and zero points // Guard: for LUT-eligible nodes, scales/ZP are already packed inside @@ -412,6 +425,14 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All scales_are_packed_ = true; is_packed = true; + // The scales have already been absorbed into the shared packed B buffer. + // Push a nullptr placeholder so the framework's sharing logic recognizes this + // input as participating in pre-packing without allocating an additional buffer. + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(nullptr); + prepacked_weights->buffer_sizes_.push_back(0); + } + // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales are still accessible. // After this PrePack returns is_packed=true, ORT may erase scales from the constant // input table (use count drops to 0), making them unavailable in later PrePack calls. @@ -458,6 +479,16 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // BZpCorr was already computed during B packing in Step 1 (if applicable). scales_are_packed_ = true; is_packed = true; + + // The scales have already been absorbed into the packed B buffer (and BZpCorr folded in + // during B packing), so there is no separate buffer to cache. 
Push a nullptr placeholder + // so the pre-packed weight sharing logic recognizes this input as participating in + // pre-packing, mirroring the SQNBIT_CompInt8 path above. Without it the shared-container + // path asserts (buffers_.size() > 0) because is_packed is true but no buffer was provided. + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(nullptr); + prepacked_weights->buffer_sizes_.push_back(0); + } } else #endif // MLAS_TARGET_ARM64 { @@ -540,8 +571,6 @@ template <> Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { - ORT_UNUSED_PARAMETER(prepacked_weights); - if (input_idx == InputIndex::scales || input_idx == InputIndex::bias) { auto sptr = tensor.Data(); auto tensor_size = static_cast(tensor.Shape().Size()); @@ -581,6 +610,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); + // See the primary PrePack() above: zero CompInt8 buffers so the pre-packed weight sharing + // content hash (taken right after this B pack) is stable across sessions. 
+ if (compute_type_ == SQNBIT_CompInt8 || compute_type_ == HQNBIT_CompInt8) { + std::memset(packed_b_.get(), 0, packed_b_size_); + } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -600,6 +634,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou #endif // MLAS_TARGET_ARM64 is_packed = true; + + if (prepacked_weights != nullptr) { + prepacked_weights->buffers_.push_back(std::move(packed_b_)); + prepacked_weights->buffer_sizes_.push_back(packed_b_size_); + } } else if (compute_type_ == SQNBIT_CompInt8) { bool should_pack_scale_and_zp = [&]() { #if defined(MLAS_TARGET_AMD64_IX86) @@ -642,6 +681,13 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& MlasInitLutGemmKernelConfig(N_, K_, nbits_, block_size_, has_zp_input_); packed_b_size_ = MlasLutGemmPackedSize(N_, K_, nbits_, block_size_, has_zp_input_); } + } else if (input_idx == InputIndex::scales) { + // When the scales were absorbed into the shared packed B buffer (see PrePack), + // a nullptr placeholder is pushed to keep the framework's sharing bookkeeping + // consistent. There is no separate buffer to adopt here - the packed B buffer + // (already populated via UseSharedPrePackedBuffers for input_idx == B) carries + // the packed scales. 
+ used_shared_buffers = true; } return Status::OK(); diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 6ef2319c1d3f4..78627e76a4abd 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -470,7 +470,17 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, Status SessionState::PrepackConstantInitializedTensors( InlinedHashMap& constant_initializers_use_count, const std::unordered_map& initializers_to_share_map) { - auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( + // When set, any CPU constant initializer that a kernel pre-packs is content-addressed + // into the shared OrtPrepackedWeightsContainer. This is required for cross-session sharing + // of pre-packed weights synthesized by graph optimizers at session-creation time (e.g. the + // DQ + MatMul -> MatMulNBits fusion), whose initializer names are auto-generated and + // therefore cannot be registered via OrtApi::AddInitializer ahead of time. 
+ const bool share_all_cpu_prepacked_initializers = + sess_options_.config_options.GetConfigOrDefault( + kOrtSessionOptionsShareAllPrepackedCpuInitializers, "0") == "1"; + + auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map, + share_all_cpu_prepacked_initializers]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { if (sess_options_.IsLoadCancellationFlagSet()) { @@ -498,8 +508,13 @@ Status SessionState::PrepackConstantInitializedTensors( auto iter = initializers_to_share_map.find(input_name); bool is_shared_initializer = (iter != initializers_to_share_map.end()); - // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now - if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers && + // Caching pre-packed weights is limited to CPU EP for now. By default only initializers + // explicitly registered via OrtApi::AddInitializer (is_shared_initializer) participate. + // When share_all_cpu_prepacked_initializers is enabled, every CPU constant initializer is + // eligible; deduplication is content-addressed via hash(packed_bytes) so different + // per-session synthesized names still collide on identical packed contents. + if ((is_shared_initializer || share_all_cpu_prepacked_initializers) && + should_cache_prepacked_weights_for_shared_initializers && node.GetExecutionProviderType() == kCpuExecutionProvider) { // caching of pre-packed weights' turned ON @@ -615,11 +630,9 @@ Status SessionState::PrepackConstantInitializedTensors( is_packed, &weights_to_be_filled_in)); - // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // Some kernels (non-CPU related kernels) do not share their pre-packed results // even though they set is_packed = true so we leave it up to them. 
// We can change their behavior if we wish do so in a separate PR - // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not - // produce them. if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { const auto& op_type = node.OpType(); const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index aadbbab1c135b..32df70e0d5138 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -22,6 +22,7 @@ #include "test/unittest_util/graph_transform_test_builder.h" #include "test/util/include/default_providers.h" #include "test/util/include/scoped_env_vars.h" +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" @@ -87,6 +88,10 @@ struct TestOptions { bool legacy_shape{false}; // for backward compatibility + // When set, RunTest validates cross-session sharing of the pre-packed weights instead of doing a + // single run. The model is run in two sessions that use the same pre-packed weights container. + std::optional prepack_sharing_mode{}; + std::optional output_abs_error{}; std::optional output_rel_error{}; }; @@ -269,6 +274,13 @@ void RunTest(const TestOptions& opts, test.SetOutputRelErr("Y", *opts.output_rel_error); } + if (opts.prepack_sharing_mode.has_value()) { + // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP + // in two sessions and validates the sharing counters. 
+ CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode, {N, k_blocks, blob_size}, input1_vals); + return; + } + if (!explicit_eps.empty()) { test.ConfigEps(std::move(explicit_eps)); } @@ -597,6 +609,103 @@ TEST(MatMulNBits, Float32_4b_Accuracy4_Batch) { RunTest(opts); } +#ifndef ENABLE_TRAINING +// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full +// training build, so there is nothing to exercise there. + +namespace { +// Builds a representative MatMulNBits TestOptions for the pre-packed weight sharing tests. +TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level, + bool has_zero_point, bool has_bias, PrepackSharingMode mode) { + TestOptions opts{}; + opts.M = 8; + opts.N = N; + opts.K = K; + opts.block_size = block_size; + opts.accuracy_level = accuracy_level; + opts.has_zero_point = has_zero_point; + opts.zp_is_4bit = true; + opts.has_bias = has_bias; + opts.prepack_sharing_mode = mode; + opts.output_abs_error = 0.1f; + opts.output_rel_error = 0.02f; + return opts; +} +} // namespace + +// Hash-based sharing: session.share_all_prepacked_cpu_initializers = "1" with no AddInitializer call. +// This is the path used by the DQ + MatMul -> MatMulNBits fusion, whose weights are synthesized at +// session-creation time with auto-generated names. Covers symmetric/asymmetric quantization, with and +// without bias, across several block sizes. 
+TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float32) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + for (int64_t block_size : {16, 32, 128}) { + RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + } + } + } +} + +TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + for (int64_t block_size : {16, 32, 128}) { + RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + } + } + } +} + +// Hash-based sharing across different accuracy levels. Each accuracy level selects a different MLAS +// compute type and therefore a different packed-weight layout, all of which must share correctly. +// accuracy_level 4 (DP4A/int8) requires block_size % 32 == 0, K % 128 == 0 and N % 16 == 0. +TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_AccuracyLevels) { + for (int64_t accuracy_level : {0, 1, 4}) { + RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, accuracy_level, /*has_zero_point*/ true, + /*has_bias*/ false, PrepackSharingMode::kShareAllCpuInitializers)); + } +} + +// Hash-based sharing for the fp16 + accuracy_level 4 path (HQNBIT_CompInt8). This is distinct from +// the fp32 int8 path: fp16 scales are converted to fp32 at pack time, and on ARM64 KleidiAI the +// asymmetric 4-bit case packs scales as a separate placeholder buffer and folds the zero points in +// via BZpCorr during B packing. Cover symmetric/asymmetric (BZpCorr) with and without bias. 
+TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16_AccuracyLevel4) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, /*accuracy_level*/ 4, has_zero_point, + has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + } + } +} + +// Legacy sharing path: the weight B is registered as a shared initializer via +// SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. +TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + } + } +} + +// Negative control: with the shared container present but neither opt-in mechanism enabled, no +// pre-packed weights are shared across sessions. 
+TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, + /*has_bias*/ true, PrepackSharingMode::kNoSharing)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, + /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); +} +#endif // !ENABLE_TRAINING + #endif #endif diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index f99334c4f33ef..bf4b81da2219c 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -21,6 +21,7 @@ #include "test/unittest_util/graph_transform_test_builder.h" #include "test/util/include/default_providers.h" #include "test/util/include/scoped_env_vars.h" +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" @@ -51,6 +52,10 @@ struct TestOptions8Bits { bool has_g_idx{false}; bool has_bias{false}; + // When set, RunTest8Bits validates cross-session sharing of the pre-packed weights instead of + // doing a single run. The model is run in two CPU sessions that use the same container. + std::optional prepack_sharing_mode{}; + std::optional output_abs_error{}; std::optional output_rel_error{}; }; @@ -221,6 +226,14 @@ void RunTest8Bits(const TestOptions8Bits& opts) { test.SetOutputRelErr("Y", *opts.output_rel_error); } + if (opts.prepack_sharing_mode.has_value()) { + // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP + // in two sessions and validates the sharing counters. 
+ CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode, + {q_cols, k_blocks, q_rows / k_blocks}, input1_vals); + return; + } + std::vector> execution_providers; #ifdef USE_CUDA execution_providers.emplace_back(DefaultCudaExecutionProvider()); @@ -671,6 +684,81 @@ TEST(MatMulNBits, BFloat16_Int8_Chunked_BFloat16ZeroPoint) { } #endif +#if !defined(USE_CUDA) && !defined(USE_WEBGPU) +#ifndef ENABLE_TRAINING +// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full +// training build and is only implemented for the CPU EP, so these tests are CPU-only. + +namespace { +// Builds a representative 8-bit MatMulNBits TestOptions for the pre-packed weight sharing tests. +// accuracy_level 4 selects the int8 compute type (SQNBIT_CompInt8 / HQNBIT_CompInt8), which is the +// 8-bit path that pre-packs the quantized B weight. +TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_point, bool has_bias, + PrepackSharingMode mode) { + TestOptions8Bits opts{}; + opts.M = 8; + opts.N = 32; + opts.K = 256; + opts.block_size = block_size; + opts.accuracy_level = 4; + opts.has_zero_point = has_zero_point; + opts.has_bias = has_bias; + opts.prepack_sharing_mode = mode; + opts.output_abs_error = 0.1f; + opts.output_rel_error = 0.02f; + return opts; +} +} // namespace + +// Hash-based sharing for 8-bit weights: session.share_all_prepacked_cpu_initializers = "1" with no +// AddInitializer call. Covers symmetric/asymmetric quantization, with and without bias, across +// several block sizes. 
+TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float32) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + for (int64_t block_size : {16, 32, 128}) { + RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, + PrepackSharingMode::kShareAllCpuInitializers)); + } + } + } +} + +TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float16) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + for (int64_t block_size : {16, 32, 128}) { + RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, + PrepackSharingMode::kShareAllCpuInitializers)); + } + } + } +} + +// Legacy sharing path for 8-bit weights: B is registered as a shared initializer via +// SessionOptions::AddInitializer. +TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, + PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, + PrepackSharingMode::kAddInitializer)); + } + } +} + +// Negative control for 8-bit weights: with the shared container present but neither opt-in mechanism +// enabled, no pre-packed weights are shared across sessions. 
+TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, + PrepackSharingMode::kNoSharing)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); +} +#endif // !ENABLE_TRAINING +#endif // !USE_CUDA && !USE_WEBGPU + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc new file mode 100644 index 0000000000000..f9f18547fd1ad --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc @@ -0,0 +1,112 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h" + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "core/framework/tensor.h" +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/asserts.h" +#include "test/util/include/default_providers.h" + +namespace onnxruntime { +namespace test { + +void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, + const std::vector& b_dims, + std::vector& b_data) { + SessionOptions so; + OrtValue b_ortvalue; + + switch (mode) { + case PrepackSharingMode::kShareAllCpuInitializers: + // Opt in to hash-based sharing of every CPU constant initializer that gets pre-packed. + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1")); + break; + case PrepackSharingMode::kAddInitializer: + // Register B as an explicitly shared initializer (the pre-existing sharing mechanism). 
+ Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(b_dims), b_data.data(), + OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator), b_ortvalue); + ASSERT_STATUS_OK(so.AddInitializer("B", &b_ortvalue)); + break; + case PrepackSharingMode::kNoSharing: + // Neither opt-in mechanism is used. + break; + } + + // Have all sessions created by this OpTester use the same pre-packed weights container. + test.EnableSharingOfPrePackedWeightsAcrossSessions(); + + // Pre-packing is limited to the CPU EP, so the sharing behavior is only exercised there. + auto cpu_ep = []() -> std::vector> { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + return execution_providers; + }; + + size_t number_of_pre_packed_weights_counter_session_1 = 0; + size_t number_of_shared_pre_packed_weights_counter = 0; + + // Session 1 + { + auto ep_vec = cpu_ep(); + test.Run(so, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &ep_vec, {}, + &number_of_pre_packed_weights_counter_session_1, + &number_of_shared_pre_packed_weights_counter); + // Nothing can be shared yet because this is the first session. + ASSERT_EQ(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } + + const auto number_of_elements_in_shared_container = test.GetNumPrePackedWeightsShared(); + + if (mode == PrepackSharingMode::kNoSharing) { + // Without opting in, pre-packed weights must not be placed in the shared container. + ASSERT_EQ(number_of_elements_in_shared_container, static_cast(0)); + } + + // On some platforms/architectures MLAS may choose not to pre-pack, in which case there is nothing + // to share and we cannot meaningfully continue. + if (number_of_pre_packed_weights_counter_session_1 == 0) { + return; + } + + if (mode != PrepackSharingMode::kNoSharing) { + // At least the quantized weight B is content-addressed into the shared container. Some + // architectures (e.g. 
ARM64 KleidiAI) additionally pre-pack scales, but in the AddInitializer + // mode only the explicitly-registered B participates, so the container can hold fewer elements + // than the total number of pre-packed weights. + ASSERT_GT(number_of_elements_in_shared_container, static_cast(0)); + ASSERT_LE(number_of_elements_in_shared_container, number_of_pre_packed_weights_counter_session_1); + } + + // Session 2 + { + size_t number_of_pre_packed_weights_counter_session_2 = 0; + auto ep_vec = cpu_ep(); + test.Run(so, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &ep_vec, {}, + &number_of_pre_packed_weights_counter_session_2, + &number_of_shared_pre_packed_weights_counter); + + // The same number of weights is pre-packed in both sessions. + ASSERT_EQ(number_of_pre_packed_weights_counter_session_1, number_of_pre_packed_weights_counter_session_2); + + // Every weight stored in the shared container is served from it (i.e. shared) in the second + // session. For the no-sharing control this is zero; otherwise it matches the container size. + ASSERT_EQ(number_of_shared_pre_packed_weights_counter, number_of_elements_in_shared_container); + + if (mode == PrepackSharingMode::kNoSharing) { + ASSERT_EQ(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } else { + ASSERT_GT(number_of_shared_pre_packed_weights_counter, static_cast(0)); + } + } +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h new file mode 100644 index 0000000000000..557f312976a9c --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+#pragma once + +#include +#include + +namespace onnxruntime { +namespace test { + +class OpTester; + +// How two sessions are configured to share the pre-packed weights of a MatMulNBits node. +enum class PrepackSharingMode { + // session.share_all_prepacked_cpu_initializers = "1": any CPU constant initializer that the kernel + // pre-packs participates in the shared container, content-addressed by hash(packed_bytes). + // No OrtApi::AddInitializer call is required. This is the path used by the DQ + MatMul -> MatMulNBits + // fusion, whose weights are synthesized at session-creation time with auto-generated names. + kShareAllCpuInitializers, + // Legacy path: the weight is explicitly registered as a shared initializer via + // SessionOptions::AddInitializer. + kAddInitializer, + // Negative control: the shared container exists but neither opt-in mechanism is used, so no + // cross-session sharing must happen. + kNoSharing, +}; + +// Runs the already-configured MatMulNBits OpTester in two CPU sessions that share the same +// pre-packed weights container and asserts that the pre-packed weights are shared as expected. +// This logic is independent of the weight bit width, so it is shared by the 4-bit and 8-bit tests. +// `b_dims`/`b_data` describe the quantized B initializer and are only needed for the +// PrepackSharingMode::kAddInitializer path (to register B as a shared initializer). 
+void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, + const std::vector& b_dims, + std::vector& b_data); + +} // namespace test +} // namespace onnxruntime From 84f3f9a3fffa9724849d1f9126b15241b8d4416a Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Fri, 19 Jun 2026 17:54:18 +0200 Subject: [PATCH 02/13] [CPU] Fix MatMulNBits pre-packed weight sharing corrupting int8 buffers MatMulNBits' CompInt8 (accuracy_level 4) packing is staged and stateful: PrePack(B) packs the quantized weights and accumulates a partial block sum into the buffer, then PrePack(scales)/PrePack(zero_points) consume that state to finalize it. MLAS requires each step to run exactly once per buffer (see SQ8BitGemmPackQuantBDataAndBlkSum). Cross-session pre-packed weight sharing broke this contract: the second session adopts the buffer the first session already finalized and then re-runs PrePack(scales)/PrePack(zero_points) on it, finalizing a second time over already-folded data. That corrupts the block-sum correction and produces wrong results. It reproduces on Linux ARM64, where ArmNeonIsQuantActivationsUnsigned selects the stateful correction path, and is latent in the AVX2/AVX512 packers that use the same design. Track the buffer each instance packs, and in UseSharedPrePackedBuffers detect when the buffer handed back came from another session (it differs from the one this instance packed) and skip the staged scale/zero-point re-pack. The first session and the non-sharing path adopt their own buffer and are unchanged; only the redundant re-pack in later sessions is removed. All changes are in PrePack/UseSharedPrePackedBuffers, so inference and the single-session path are unaffected. 
--- .../cpu/quantization/matmul_nbits.cc | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 19e6d0389fe7d..b82df6a72101b 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -163,6 +163,15 @@ class MatMulNBits final : public OpKernel { const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_{}; size_t packed_b_size_{0}; + // Address of the packed B buffer this kernel instance produced in its own PrePack(B), captured + // before the buffer is moved into the pre-packed weights container. UseSharedPrePackedBuffers uses + // it to tell whether the buffer handed back is our own (still needs the staged scale/zero-point + // packing) or one another session already finalized (see packed_b_is_shared_). + const void* self_prepacked_b_data_{nullptr}; + // True when packed_b_ was adopted from another session's already-finalized shared buffer. The + // CompInt8 scale/zero-point packing is stateful and single-shot (it consumes the partial block sum + // produced during B packing), so it must not be re-run on a buffer another session finalized. + bool packed_b_is_shared_{false}; IAllocatorUniquePtr scales_fp32_{}; IAllocatorUniquePtr bias_fp32_{}; @@ -365,6 +374,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } } #endif // MLAS_TARGET_ARM64 + // Remember the buffer we packed so UseSharedPrePackedBuffers can distinguish it from a + // finalized buffer shared by another session. 
+ self_prepacked_b_data_ = packed_b_.get(); } is_packed = true; @@ -389,7 +401,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All }(); if (should_pack_scale_and_zp_inputs) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr) { + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_is_shared_) { auto sptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr, has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -397,7 +409,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } // Packing zero_point - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_is_shared_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -437,7 +449,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // After this PrePack returns is_packed=true, ORT may erase scales from the constant // input table (use count drops to 0), making them unavailable in later PrePack calls. // Zero points haven't been PrePacked yet so they are still accessible. - if (has_zp_input_ && nbits_ == 4) { + // Skip when packed_b_ was adopted from another session - it already carries BZpCorr. + if (has_zp_input_ && nbits_ == 4 && !packed_b_is_shared_) { const Tensor* zp_tensor = nullptr; OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); if (zp_tensor != nullptr) { @@ -502,7 +515,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack scales separately only for 8-bit. 
For 4-bit on ARM64, scales are already packed // during B packing or used as a raw pointer at compute time (matching standard // SQNBIT_CompInt8 behavior where should_pack_scale_and_zp_inputs = (nbits_ == 8) on ARM64). - if (nbits_ == 8) { + // Skip when packed_b_ was adopted from another session - it is already finalized. + if (nbits_ == 8 && !packed_b_is_shared_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -513,7 +527,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack zero_points separately only for 8-bit (matching standard SQNBIT_CompInt8 behavior). // For 4-bit, zero_points are passed directly in data params or handled via KleidiAI BZpCorr. - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8 && !packed_b_is_shared_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -633,6 +647,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou } #endif // MLAS_TARGET_ARM64 + // Remember the buffer we packed so UseSharedPrePackedBuffers can distinguish it from a + // finalized buffer shared by another session. 
+ self_prepacked_b_data_ = packed_b_.get(); is_packed = true; if (prepacked_weights != nullptr) { @@ -649,11 +666,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou }(); if (should_pack_scale_and_zp) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr) { + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_is_shared_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); is_packed = false; - } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { + } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_is_shared_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -674,6 +691,11 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& used_shared_buffers = false; if (input_idx == InputIndex::B && !prepacked_buffers.empty()) { + // If the buffer handed back is not the one this kernel packed in its own PrePack(B), it is a + // finalized buffer produced and shared by another session. The staged CompInt8 scale/zero-point + // packing must not be re-run on it (that would corrupt the already-folded block sums), so flag it + // and skip those steps in the scale/zero-point PrePack calls. 
+ packed_b_is_shared_ = prepacked_buffers[0].get() != self_prepacked_b_data_; packed_b_ = std::move(prepacked_buffers[0]); used_shared_buffers = true; From afb93d00c74aa0c2fb31644eaeb932d6012e373f Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Mon, 22 Jun 2026 16:16:57 +0200 Subject: [PATCH 03/13] Address PR comments --- .../cpu/quantization/matmul_nbits.cc | 221 ++++++++++-------- .../test/contrib_ops/matmul_4bits_test.cc | 145 ++++++++++++ 2 files changed, 272 insertions(+), 94 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index b82df6a72101b..39158ac90dcc3 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -163,15 +163,13 @@ class MatMulNBits final : public OpKernel { const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_{}; size_t packed_b_size_{0}; - // Address of the packed B buffer this kernel instance produced in its own PrePack(B), captured - // before the buffer is moved into the pre-packed weights container. UseSharedPrePackedBuffers uses - // it to tell whether the buffer handed back is our own (still needs the staged scale/zero-point - // packing) or one another session already finalized (see packed_b_is_shared_). - const void* self_prepacked_b_data_{nullptr}; - // True when packed_b_ was adopted from another session's already-finalized shared buffer. The - // CompInt8 scale/zero-point packing is stateful and single-shot (it consumes the partial block sum - // produced during B packing), so it must not be re-run on a buffer another session finalized. - bool packed_b_is_shared_{false}; + // True once PrePack(InputIndex::B) has folded the scales and (constant) zero points into packed_b_, + // leaving the CompInt8 buffer fully packed and compute-ready. 
Pre-packed weight sharing + // content-hashes the buffer right after the B PrePack returns, so everything that affects the + // packed bytes (in particular the block sum / BZpCorr, which depend on the zero points) must be + // folded in by then. Once set, the later scales/zero_point PrePack calls must not pack again: the + // CompInt8 packing is single-shot, and the buffer may by then be one shared from another session. + bool packed_b_finalized_{false}; IAllocatorUniquePtr scales_fp32_{}; IAllocatorUniquePtr bias_fp32_{}; @@ -350,33 +348,52 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - // The CompInt8 packed layout is populated across several PrePack() calls (B data here, then - // scales and zero_points). Pre-packed weight sharing keys this buffer by a content hash taken - // right after this B pack returns, so any bytes not yet written (later-packed regions and - // alignment padding) must be deterministic. The allocation above uses reserve and is not - // zero-filled, so zero it here to keep the hash identical across sessions and enable sharing. - if (effective_compute_type == SQNBIT_CompInt8) { + // The CompInt8 packed layout has alignment padding between its sub-regions that the packing + // routines never write. Pre-packed weight sharing content-hashes this buffer, so the padding + // must be deterministic for two sessions to produce the same key. The allocation above uses + // reserve and is not zero-filled, so zero it here. This only matters when the framework asks + // for the buffer to be cached for sharing (prepacked_weights != nullptr). 
+ if (prepacked_weights != nullptr && effective_compute_type == SQNBIT_CompInt8) { std::memset(packed_b_.get(), 0, packed_b_size_); } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr, has_zp_input_, nullptr, threadpool_ptr, &mlas_backend_kernel_selector_config_); -#if defined(MLAS_TARGET_ARM64) - // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales and zero_points are accessible. - if (compute_type_ == HQNBIT_CompInt8 && nbits_ == 4 && has_zp_input_ && scales_fp32_ && - MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, SQNBIT_CompInt8, has_zp_input_, &mlas_backend_kernel_selector_config_)) { - const Tensor* zp_tensor = nullptr; - OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); - if (zp_tensor != nullptr) { - auto zptr = zp_tensor->Data(); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), - scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); + // Fold the scales and (constant) zero points into packed_b_ now, during the B PrePack, instead + // of deferring them to the later scales/zero_points PrePack calls. Pre-packed weight sharing + // content-hashes this buffer immediately after the B PrePack returns; the CompInt8 block sum + // (and the KleidiAI BZpCorr) is a function of the zero points, so they must already be folded + // in for the hash to reflect them. Otherwise two initializers with identical B and scales but + // different zero points would hash equal and the second would wrongly adopt the first's buffer + // and silently compute wrong results. scales and zero_points are constant initializers, so they + // are available here. The B pack above only partially populates the buffer (on x64 the block sum + // is deferred; on ARM64 8-bit the scales are ignored during B packing), so issue one more pack + // call with QuantBData == nullptr to finalize it. 
This is byte-identical to the staged + // scales + zero_points packing it replaces. + bool finalize_scale_zp_into_packed_b = effective_compute_type == SQNBIT_CompInt8 && scale_ptr != nullptr; +#if !defined(MLAS_TARGET_AMD64_IX86) + // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes + // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be + // passed to the packing routine, which would dereference the null QuantBData buffer. + finalize_scale_zp_into_packed_b = + finalize_scale_zp_into_packed_b && + (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, effective_compute_type, + has_zp_input_, &mlas_backend_kernel_selector_config_)); +#endif + if (finalize_scale_zp_into_packed_b) { + const uint8_t* zp_ptr = nullptr; + if (has_zp_input_) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor != nullptr) { + zp_ptr = zp_tensor->Data(); + } } + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, nullptr /*QuantBData*/, + packed_b_.get(), scale_ptr, has_zp_input_, zp_ptr, nullptr, + &mlas_backend_kernel_selector_config_); + packed_b_finalized_ = true; } -#endif // MLAS_TARGET_ARM64 - // Remember the buffer we packed so UseSharedPrePackedBuffers can distinguish it from a - // finalized buffer shared by another session. - self_prepacked_b_data_ = packed_b_.get(); } is_packed = true; @@ -401,7 +418,10 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All }(); if (should_pack_scale_and_zp_inputs) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_is_shared_) { + // packed_b_ is already finalized during the B PrePack (scales and zero points folded in there so + // the sharing content hash captures them), so skip packing here. 
The CompInt8 packing is + // single-shot and packed_b_ may now be a buffer shared from another session. + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) { auto sptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr, has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -409,7 +429,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } // Packing zero_point - if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_is_shared_) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -435,22 +455,21 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } scales_are_packed_ = true; - is_packed = true; - // The scales have already been absorbed into the shared packed B buffer. - // Push a nullptr placeholder so the framework's sharing logic recognizes this - // input as participating in pre-packing without allocating an additional buffer. - if (prepacked_weights != nullptr) { - prepacked_weights->buffers_.push_back(nullptr); - prepacked_weights->buffer_sizes_.push_back(0); - } + // The scales were folded into packed_b_ during the B PrePack, so there is no separate packed + // scales buffer to cache or share. Report is_packed = false (as the x64 path already does for + // the scales input) so the framework does not engage pre-packed weight sharing for scales. + // Engaging it would require pushing a placeholder buffer, but the real scales live inside + // packed_b_ so the placeholder would be null - and PrePackedWeights::GetHash() skips null + // buffers, making the scales container key identical for every MatMulNBits node. 
That would + // falsely increment the shared-weights counter for unrelated nodes without sharing any real + // buffer. The quantized weight B (which carries the folded-in scales) is shared on its own. + is_packed = false; - // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales are still accessible. - // After this PrePack returns is_packed=true, ORT may erase scales from the constant - // input table (use count drops to 0), making them unavailable in later PrePack calls. - // Zero points haven't been PrePacked yet so they are still accessible. - // Skip when packed_b_ was adopted from another session - it already carries BZpCorr. - if (has_zp_input_ && nbits_ == 4 && !packed_b_is_shared_) { + // BZpCorr was already folded into packed_b_ during the B PrePack (so the sharing content hash + // captures the zero points), so re-folding it here must be skipped: the packing is single-shot + // and packed_b_ may now be a buffer shared from another session. + if (has_zp_input_ && nbits_ == 4 && !packed_b_finalized_) { const Tensor* zp_tensor = nullptr; OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); if (zp_tensor != nullptr) { @@ -491,17 +510,14 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // BZpCorr was already computed during B packing in Step 1 (if applicable). scales_are_packed_ = true; - is_packed = true; - - // The scales have already been absorbed into the packed B buffer (and BZpCorr folded in - // during B packing), so there is no separate buffer to cache. Push a nullptr placeholder - // so the pre-packed weight sharing logic recognizes this input as participating in - // pre-packing, mirroring the SQNBIT_CompInt8 path above. Without it the shared-container - // path asserts (buffers_.size() > 0) because is_packed is true but no buffer was provided. 
- if (prepacked_weights != nullptr) { - prepacked_weights->buffers_.push_back(nullptr); - prepacked_weights->buffer_sizes_.push_back(0); - } + + // The scales were folded into the packed B buffer during the B PrePack, so there is no + // separate packed scales buffer to cache or share. Report is_packed = false (mirroring the + // x64 path and the SQNBIT_CompInt8 path above) so the framework does not engage sharing for + // the scales input; engaging it would push a null placeholder whose content hash is identical + // for every node, falsely incrementing the shared-weights counter without sharing any real + // buffer. + is_packed = false; } else #endif // MLAS_TARGET_ARM64 { @@ -515,8 +531,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack scales separately only for 8-bit. For 4-bit on ARM64, scales are already packed // during B packing or used as a raw pointer at compute time (matching standard // SQNBIT_CompInt8 behavior where should_pack_scale_and_zp_inputs = (nbits_ == 8) on ARM64). - // Skip when packed_b_ was adopted from another session - it is already finalized. - if (nbits_ == 8 && !packed_b_is_shared_) { + // Skip when packed_b_ was already finalized during the B PrePack (scales/zero points folded + // in there for the sharing content hash); it may now be a buffer shared from another session. + if (nbits_ == 8 && !packed_b_finalized_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -527,7 +544,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All // Pack zero_points separately only for 8-bit (matching standard SQNBIT_CompInt8 behavior). // For 4-bit, zero_points are passed directly in data params or handled via KleidiAI BZpCorr. 
- if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8 && !packed_b_is_shared_) { + if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8 && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -608,8 +625,12 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou if (input_idx == InputIndex::B) { const Tensor* scales = nullptr; OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); - if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, - has_zp_input_, &mlas_backend_kernel_selector_config_)) { + // Convert the constant fp16 scales to fp32 up front so they (and the zero points) can be folded + // into packed_b_ during this B PrePack, mirroring the primary float PrePack above. Pre-packed + // weight sharing content-hashes the buffer right after this B PrePack returns, so for CompInt8 + // everything that affects the packed bytes (the scales, and the block sum / KleidiAI BZpCorr that + // depend on the zero points) must be folded in by now. + if (scales && compute_type_ == SQNBIT_CompInt8) { auto sptr = scales->Data(); auto scales_size = static_cast(scales->Shape().Size()); auto ptr = IAllocator::MakeUniquePtr(alloc, scales_size, true); @@ -624,32 +645,48 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - // See the primary PrePack() above: zero CompInt8 buffers so the pre-packed weight sharing - // content hash (taken right after this B pack) is stable across sessions. 
- if (compute_type_ == SQNBIT_CompInt8 || compute_type_ == HQNBIT_CompInt8) { + // See the primary PrePack() above: the CompInt8 packed layout has alignment padding that the + // packing routines never write, so zero the buffer to keep the sharing content hash (taken right + // after this B pack) deterministic. Only needed when the framework asks to cache the buffer for + // sharing (prepacked_weights != nullptr). + if (prepacked_weights != nullptr && + (compute_type_ == SQNBIT_CompInt8 || compute_type_ == HQNBIT_CompInt8)) { std::memset(packed_b_.get(), 0, packed_b_size_); } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); -#if defined(MLAS_TARGET_ARM64) - // For KleidiAI asymmetric 4-bit path: compute BZpCorr during B packing. - // The fp16 specialization packs B here (with scales already converted to fp32), - // so we also compute BZpCorr now while both scales and zero_points are accessible. - if (has_zp_input_ && nbits_ == 4 && scales_fp32_ != nullptr) { - const Tensor* zp_tensor = nullptr; - OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); - if (zp_tensor != nullptr) { - auto zptr = zp_tensor->Data(); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), - scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); + // Fold the scales and (constant) zero points into packed_b_ now (see the primary PrePack above): + // the CompInt8 block sum and the KleidiAI BZpCorr depend on the zero points, so they must be + // folded in before the sharing content hash is taken. Otherwise two initializers with identical B + // and scales but different zero points would hash equal and the second would wrongly adopt the + // first's buffer. 
The B pack above only partially populates the buffer, so issue one more pack + // call with QuantBData == nullptr to finalize it. This is byte-identical to the staged + // scales + zero_points packing it replaces. + bool finalize_scale_zp_into_packed_b = compute_type_ == SQNBIT_CompInt8 && scales_fp32_ != nullptr; +#if !defined(MLAS_TARGET_AMD64_IX86) + // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes + // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be + // passed to the packing routine, which would dereference the null QuantBData buffer. + finalize_scale_zp_into_packed_b = + finalize_scale_zp_into_packed_b && + (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, + has_zp_input_, &mlas_backend_kernel_selector_config_)); +#endif + if (finalize_scale_zp_into_packed_b) { + const uint8_t* zp_ptr = nullptr; + if (has_zp_input_) { + const Tensor* zp_tensor = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor); + if (zp_tensor != nullptr) { + zp_ptr = zp_tensor->Data(); + } } + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr /*QuantBData*/, + packed_b_.get(), scales_fp32_.get(), has_zp_input_, zp_ptr, nullptr, + &mlas_backend_kernel_selector_config_); + packed_b_finalized_ = true; } -#endif // MLAS_TARGET_ARM64 - - // Remember the buffer we packed so UseSharedPrePackedBuffers can distinguish it from a - // finalized buffer shared by another session. 
- self_prepacked_b_data_ = packed_b_.get(); is_packed = true; if (prepacked_weights != nullptr) { @@ -666,11 +703,11 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou }(); if (should_pack_scale_and_zp) { - if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_is_shared_) { + if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) { MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_); is_packed = false; - } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_is_shared_) { + } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) { auto zptr = tensor.Data(); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_); @@ -691,11 +728,11 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& used_shared_buffers = false; if (input_idx == InputIndex::B && !prepacked_buffers.empty()) { - // If the buffer handed back is not the one this kernel packed in its own PrePack(B), it is a - // finalized buffer produced and shared by another session. The staged CompInt8 scale/zero-point - // packing must not be re-run on it (that would corrupt the already-folded block sums), so flag it - // and skip those steps in the scale/zero-point PrePack calls. - packed_b_is_shared_ = prepacked_buffers[0].get() != self_prepacked_b_data_; + // The buffer handed back is fully finalized: the producing session folded the scales and zero + // points (block sums / KleidiAI BZpCorr) into it during its PrePack(B), which is also when this + // kernel set packed_b_finalized_ on its own (identical) B PrePack. 
The later scale/zero-point + // PrePack calls already skip the staged packing whenever packed_b_finalized_ is set, so simply + // adopt the shared buffer here - no extra bookkeeping is needed to avoid re-folding into it. packed_b_ = std::move(prepacked_buffers[0]); used_shared_buffers = true; @@ -703,14 +740,10 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& MlasInitLutGemmKernelConfig(N_, K_, nbits_, block_size_, has_zp_input_); packed_b_size_ = MlasLutGemmPackedSize(N_, K_, nbits_, block_size_, has_zp_input_); } - } else if (input_idx == InputIndex::scales) { - // When the scales were absorbed into the shared packed B buffer (see PrePack), - // a nullptr placeholder is pushed to keep the framework's sharing bookkeeping - // consistent. There is no separate buffer to adopt here - the packed B buffer - // (already populated via UseSharedPrePackedBuffers for input_idx == B) carries - // the packed scales. - used_shared_buffers = true; } + // Only the quantized weight B yields a separately cached pre-packed buffer. The scales (and zero + // points) are folded into packed_b_ during the B PrePack and reported with is_packed = false, so + // the framework never asks this kernel to adopt a shared buffer for them. 
return Status::OK(); } diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 32df70e0d5138..39672bfbbfdb2 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -27,6 +27,9 @@ #include "core/session/ort_env.h" #include "core/util/qmath.h" #include "core/providers/webgpu/webgpu_provider_options.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "test/util/include/test/test_environment.h" extern std::unique_ptr ort_env; @@ -704,6 +707,148 @@ TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { /*has_zero_point*/ false, /*has_bias*/ false, PrepackSharingMode::kNoSharing)); } + +namespace { +// Builds and serializes a single-node float MatMulNBits model. "A" is a runtime graph input; B, scales +// and zero_points are constant initializers and the single output is named "Y". The zero-point +// collision test below builds two such models that share byte-identical B and scales initializers but +// differ only in their zero_points. 
+void BuildMatMulNBitsModelBytes(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level, + const std::vector& b_data, const std::vector& scales_data, + const std::vector& zp_data, std::string& model_bytes) { + const int64_t k_blocks = (K + block_size - 1) / block_size; + const int64_t blob_size = (block_size * QBits + 7) / 8; + const int64_t zp_blob_size = (k_blocks * QBits + 7) / 8; + + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("matmul_nbits_zp_collision", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder builder(graph); + + ONNX_NAMESPACE::TypeProto float_2d; + float_2d.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType()); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(M); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(K); + NodeArg* A = &graph.GetOrCreateNodeArg("A", &float_2d); + NodeArg* Y = &graph.GetOrCreateNodeArg("Y", nullptr); + + NodeArg* B = builder.MakeInitializer({N, k_blocks, blob_size}, b_data); + NodeArg* scales = builder.MakeInitializer({N, k_blocks}, scales_data); + NodeArg* zero_points = builder.MakeInitializer({N, zp_blob_size}, zp_data); + + Node& node = builder.AddNode("MatMulNBits", {A, B, scales, zero_points}, {Y}, kMSDomain); + node.AddAttribute("K", K); + node.AddAttribute("N", N); + node.AddAttribute("block_size", block_size); + node.AddAttribute("bits", static_cast(QBits)); + node.AddAttribute("accuracy_level", accuracy_level); + + graph.SetOutputs(std::vector{Y}); + ASSERT_STATUS_OK(graph.Resolve()); + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); +} + +// Loads and runs the given serialized MatMulNBits model on the CPU EP with +// share_all_prepacked_cpu_initializers enabled, backed by the supplied shared pre-packed weights 
+// container. Returns the single "Y" output and the number of pre-packed weights this session served +// from the shared container. +void RunSharedPrepackSession(const std::string& model_bytes, const NameMLValMap& feeds, + PrepackedWeightsContainer& container, std::vector& fetches, + size_t& used_shared_count) { + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1")); + + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + ASSERT_STATUS_OK(session.Initialize()); + + RunOptions run_options; + ASSERT_STATUS_OK(session.Run(run_options, feeds, std::vector{"Y"}, &fetches)); + used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); +} +} // namespace + +// Regression test for pre-packed weight sharing: two MatMulNBits initializers with byte-identical +// quantized B and identical scales but DIFFERENT zero points must not be treated as identical by the +// content-addressed sharing cache. For the CompInt8 path (accuracy_level 4) the zero points are folded +// into the packed buffer (as the per-block sum). Before the fix the sharing hash was taken before the +// zero points were folded in, so the second session silently adopted the first session's buffer and +// produced a wrong result. +TEST(MatMulNBits, SharedPrepackedWeights_DifferentZeroPointsDoNotCollide) { + constexpr int64_t M = 2, N = 16, K = 128, block_size = 32; + constexpr int64_t accuracy_level = 4; // CompInt8 - the path that folds zero points into packed_b_. + const int64_t k_blocks = (K + block_size - 1) / block_size; + const int64_t blob_size = (block_size * QBits + 7) / 8; + const int64_t zp_blob_size = (k_blocks * QBits + 7) / 8; + + // Byte-identical B and scales for both models; only the zero points differ. 
+ std::vector b_data(static_cast(N * k_blocks * blob_size)); + for (size_t i = 0; i < b_data.size(); ++i) { + b_data[i] = static_cast((i * 7 + 3) & 0xFF); + } + std::vector scales(static_cast(N * k_blocks)); + for (size_t i = 0; i < scales.size(); ++i) { + scales[i] = 0.02f + 0.001f * static_cast(i % 17); + } + std::vector zp1(static_cast(N * zp_blob_size)); + std::vector zp2(zp1.size()); + for (size_t i = 0; i < zp1.size(); ++i) { + zp1[i] = static_cast((i * 5 + 1) & 0xFF); + zp2[i] = static_cast(~zp1[i]); // every 4-bit zero point differs between the two models + } + + std::string model_zp1, model_zp2; + BuildMatMulNBitsModelBytes(M, N, K, block_size, accuracy_level, b_data, scales, zp1, model_zp1); + BuildMatMulNBitsModelBytes(M, N, K, block_size, accuracy_level, b_data, scales, zp2, model_zp2); + + auto cpu_allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; + RandomValueGenerator random{1234}; + std::vector a_data = random.Gaussian(AsSpan({M, K}), 0.0f, 0.25f); + OrtValue a_value; + CreateMLValue(cpu_allocator, AsSpan({M, K}), a_data, &a_value); + NameMLValMap feeds{{"A", a_value}}; + + // Reference output for the zp2 model, computed in isolation (its own private container). + std::vector ref_fetches; + size_t ref_used = 0; + { + PrepackedWeightsContainer ref_container; + RunSharedPrepackSession(model_zp2, feeds, ref_container, ref_fetches, ref_used); + } + + // Share one container across a zp1 session (which warms the container with its finalized buffer) and + // a subsequent zp2 session. + PrepackedWeightsContainer shared_container; + std::vector warm_fetches, shared_fetches; + size_t warm_used = 0, shared_used = 0; + RunSharedPrepackSession(model_zp1, feeds, shared_container, warm_fetches, warm_used); + + // If the platform did not pre-pack B for this configuration there is nothing to collide on. 
+ if (shared_container.GetNumberOfElements() == 0) { + GTEST_SKIP() << "CompInt8 pre-packing not available on this platform"; + } + + RunSharedPrepackSession(model_zp2, feeds, shared_container, shared_fetches, shared_used); + + // The zp2 session must NOT have reused the zp1 session's finalized buffer: different zero points + // produce different packed bytes and therefore a different sharing hash. + EXPECT_EQ(shared_used, static_cast(0)); + + // And the shared zp2 run must match the isolated zp2 reference element for element. + ASSERT_EQ(ref_fetches.size(), static_cast(1)); + ASSERT_EQ(shared_fetches.size(), static_cast(1)); + const Tensor& ref_tensor = ref_fetches[0].Get(); + const Tensor& shared_tensor = shared_fetches[0].Get(); + ASSERT_EQ(ref_tensor.Shape(), shared_tensor.Shape()); + const auto ref_span = ref_tensor.DataAsSpan(); + const auto shared_span = shared_tensor.DataAsSpan(); + for (size_t i = 0; i < ref_span.size(); ++i) { + EXPECT_EQ(ref_span[i], shared_span[i]) << "output mismatch at index " << i; + } +} #endif // !ENABLE_TRAINING #endif From a5947ef69c6645327aa2752adbe0a2a0b08202cf Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Tue, 23 Jun 2026 19:00:45 +0200 Subject: [PATCH 04/13] Make CPU initializers sharing MatMulNBits specific --- .../onnxruntime_session_options_config_keys.h | 32 ++++++++----------- .../cpu/quantization/matmul_nbits.cc | 30 ++++++++++------- onnxruntime/core/framework/session_state.cc | 28 ++++++++-------- .../test/contrib_ops/matmul_4bits_test.cc | 14 ++++---- .../test/contrib_ops/matmul_8bits_test.cc | 6 ++-- .../matmul_nbits_prepack_sharing_test_util.cc | 6 ++-- .../matmul_nbits_prepack_sharing_test_util.h | 10 +++--- 7 files changed, 63 insertions(+), 63 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index ce4222129e24b..13d7405b0f9fb 100644 --- 
a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -372,28 +372,22 @@ static const char* const kOrtSessionOptionsModelExternalInitializersFileFolderPa static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = "session.save_external_prepacked_constant_initializers"; -// Enables cross-session sharing of pre-packed weights for *all* CPU constant initializers via an -// OrtPrepackedWeightsContainer, not just those explicitly registered with OrtApi::AddInitializer. +// Enables cross-session sharing of MatMulNBits pre-packed weights via an OrtPrepackedWeightsContainer, +// content-addressed by hash(packed_bytes) so weights with auto-generated names still deduplicate. This +// covers MatMulNBits weights synthesized at session-creation time (e.g. by the DQ + MatMul -> MatMulNBits +// fusion), whose names are not known ahead of time and so cannot be registered via OrtApi::AddInitializer. // -// Without this flag, only initializers registered via AddInitializer participate in the -// OrtPrepackedWeightsContainer cache. That requires knowing initializer names ahead of time, which is -// not possible when graph-level optimizers synthesize new initializers at session-creation time (e.g. -// the DQ + MatMul -> MatMulNBits fusion produces transposed/repacked weights with auto-generated -// names that do not exist in the input model file). +// Scoped to MatMulNBits: content-addressed sharing is only safe when a kernel's packed bytes fully +// determine its Compute result, which MatMulNBits satisfies. Other CPU kernels are unaffected. // -// With this flag set to "1", any CPU constant initializer that a kernel pre-packs participates in the -// shared container, content-addressed by hash(packed_bytes). Two sessions that pack byte-identical -// weights deduplicate to a single buffer owned by the container. 
+// Requires the session to be created via OrtApi::CreateSessionWithPrepackedWeightsContainer, with this +// option set consistently across all sharing sessions using the same container. // -// Requirements: -// - The session must be created via OrtApi::CreateSessionWithPrepackedWeightsContainer. -// - All sessions intended to share must set this flag consistently and use the same container. -// -// - "0": Default. Only AddInitializer-registered initializers can share pre-packed weights cross-session. -// - "1": All CPU constant initializers can share pre-packed weights cross-session via the container. -// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1") -static const char* const kOrtSessionOptionsShareAllPrepackedCpuInitializers = - "session.share_all_prepacked_cpu_initializers"; +// - "0": Default. Only AddInitializer-registered initializers share pre-packed weights cross-session. +// - "1": Also share MatMulNBits pre-packed weights cross-session via the container. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1") +static const char* const kOrtSessionOptionsShareMatMulNBitsPrepackedWeights = + "session.share_matmulnbits_prepacked_weights"; // Use this config when you want to collect memory stats for each node in the graph. 
// The file format is a CSV file with the following columns: diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 39158ac90dcc3..e8e79794f52eb 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -348,12 +348,17 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All } packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - // The CompInt8 packed layout has alignment padding between its sub-regions that the packing - // routines never write. Pre-packed weight sharing content-hashes this buffer, so the padding - // must be deterministic for two sessions to produce the same key. The allocation above uses - // reserve and is not zero-filled, so zero it here. This only matters when the framework asks - // for the buffer to be cached for sharing (prepacked_weights != nullptr). - if (prepacked_weights != nullptr && effective_compute_type == SQNBIT_CompInt8) { + // The framework content-hashes this packed buffer to deduplicate pre-packed weights, both + // within a session and across sessions (the shared container). The session-state prepack pass + // (SessionState::PrepackConstantInitializedTensors) passes a non-null prepacked_weights on both + // the container and the default single-session paths, so this zero-fill runs on essentially + // every prepack at load, not only when a sharing container is configured -- the guard below + // only skips a caller that asks for no cacheable buffer. The pack routines need not write every + // byte (alignment padding between the CompInt8 sub-regions; any layout could gain padding) and + // the reserve allocation is not zero-filled, so the hash would otherwise depend on uninitialized + // bytes. 
Zeroing the whole buffer is a one-time O(packed_b_size_) load cost (the pack overwrites + // the data regions, leaving only padding zeroed); inference is unaffected. + if (prepacked_weights != nullptr) { std::memset(packed_b_.get(), 0, packed_b_size_); } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr, @@ -645,12 +650,13 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - // See the primary PrePack() above: the CompInt8 packed layout has alignment padding that the - // packing routines never write, so zero the buffer to keep the sharing content hash (taken right - // after this B pack) deterministic. Only needed when the framework asks to cache the buffer for - // sharing (prepacked_weights != nullptr). - if (prepacked_weights != nullptr && - (compute_type_ == SQNBIT_CompInt8 || compute_type_ == HQNBIT_CompInt8)) { + // See the primary PrePack() above: SessionState::PrepackConstantInitializedTensors passes a + // non-null prepacked_weights on both the container and the default single-session paths, so this + // zero-fill runs on essentially every prepack at load (the guard only skips a caller that asks for + // no cacheable buffer). It keeps the dedup content hash reproducible regardless of bytes the pack + // leaves uninitialized (alignment padding), for any compute type. One-time O(packed_b_size_) load + // cost; inference is unaffected. 
+ if (prepacked_weights != nullptr) { std::memset(packed_b_.get(), 0, packed_b_size_); } MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 78627e76a4abd..1f42ca0258e4f 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -470,17 +470,15 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, Status SessionState::PrepackConstantInitializedTensors( InlinedHashMap& constant_initializers_use_count, const std::unordered_map& initializers_to_share_map) { - // When set, any CPU constant initializer that a kernel pre-packs is content-addressed - // into the shared OrtPrepackedWeightsContainer. This is required for cross-session sharing - // of pre-packed weights synthesized by graph optimizers at session-creation time (e.g. the - // DQ + MatMul -> MatMulNBits fusion), whose initializer names are auto-generated and - // therefore cannot be registered via OrtApi::AddInitializer ahead of time. - const bool share_all_cpu_prepacked_initializers = + // When set, MatMulNBits pre-packed weights are content-addressed into the shared + // OrtPrepackedWeightsContainer for cross-session sharing. Needed for fusion-synthesized weights (e.g. + // DQ + MatMul -> MatMulNBits) whose auto-generated names can't be pre-registered via AddInitializer. 
+ const bool share_matmulnbits_prepacked_weights = sess_options_.config_options.GetConfigOrDefault( - kOrtSessionOptionsShareAllPrepackedCpuInitializers, "0") == "1"; + kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "0") == "1"; auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map, - share_all_cpu_prepacked_initializers]( + share_matmulnbits_prepacked_weights]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { if (sess_options_.IsLoadCancellationFlagSet()) { @@ -508,12 +506,14 @@ Status SessionState::PrepackConstantInitializedTensors( auto iter = initializers_to_share_map.find(input_name); bool is_shared_initializer = (iter != initializers_to_share_map.end()); - // Caching pre-packed weights is limited to CPU EP for now. By default only initializers - // explicitly registered via OrtApi::AddInitializer (is_shared_initializer) participate. - // When share_all_cpu_prepacked_initializers is enabled, every CPU constant initializer is - // eligible; deduplication is content-addressed via hash(packed_bytes) so different - // per-session synthesized names still collide on identical packed contents. - if ((is_shared_initializer || share_all_cpu_prepacked_initializers) && + // CPU EP only. By default only AddInitializer-registered initializers (is_shared_initializer) + // participate; share_matmulnbits_prepacked_weights also enrolls MatMulNBits weights, + // deduplicated content-addressed via hash(packed_bytes). Enrollment is restricted to + // MatMulNBits because content-addressed sharing is only safe when packed bytes fully + // determine Compute (which MatMulNBits satisfies); this also keeps the BUG CHECK below valid. 
+ const bool enroll_matmulnbits_initializer = + share_matmulnbits_prepacked_weights && node.OpType() == "MatMulNBits"; + if ((is_shared_initializer || enroll_matmulnbits_initializer) && should_cache_prepacked_weights_for_shared_initializers && node.GetExecutionProviderType() == kCpuExecutionProvider) { // caching of pre-packed weights' turned ON diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 39672bfbbfdb2..14aefbad816aa 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -636,7 +636,7 @@ TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int } } // namespace -// Hash-based sharing: session.share_all_prepacked_cpu_initializers = "1" with no AddInitializer call. +// Hash-based sharing: session.share_matmulnbits_prepacked_weights = "1" with no AddInitializer call. // This is the path used by the DQ + MatMul -> MatMulNBits fusion, whose weights are synthesized at // session-creation time with auto-generated names. Covers symmetric/asymmetric quantization, with and // without bias, across several block sizes. 
@@ -645,7 +645,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float32) { for (bool has_bias : {false, true}) { for (int64_t block_size : {16, 32, 128}) { RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } } @@ -656,7 +656,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16) { for (bool has_bias : {false, true}) { for (int64_t block_size : {16, 32, 128}) { RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } } @@ -668,7 +668,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16) { TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_AccuracyLevels) { for (int64_t accuracy_level : {0, 1, 4}) { RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, accuracy_level, /*has_zero_point*/ true, - /*has_bias*/ false, PrepackSharingMode::kShareAllCpuInitializers)); + /*has_bias*/ false, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } @@ -680,7 +680,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16_AccuracyLevel4) { for (bool has_zero_point : {false, true}) { for (bool has_bias : {false, true}) { RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, /*accuracy_level*/ 4, has_zero_point, - has_bias, PrepackSharingMode::kShareAllCpuInitializers)); + has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } } @@ -751,14 +751,14 @@ void BuildMatMulNBitsModelBytes(int64_t M, int64_t N, int64_t K, int64_t block_s } // Loads and runs the given serialized MatMulNBits model on the CPU EP with -// share_all_prepacked_cpu_initializers enabled, backed by the supplied shared pre-packed weights +// share_matmulnbits_prepacked_weights enabled, backed by the supplied shared pre-packed 
weights // container. Returns the single "Y" output and the number of pre-packed weights this session served // from the shared container. void RunSharedPrepackSession(const std::string& model_bytes, const NameMLValMap& feeds, PrepackedWeightsContainer& container, std::vector& fetches, size_t& used_shared_count) { SessionOptions so; - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1")); + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1")); InferenceSessionWrapper session{so, GetEnvironment()}; ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index bf4b81da2219c..f066a9dbd80ac 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -710,7 +710,7 @@ TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_p } } // namespace -// Hash-based sharing for 8-bit weights: session.share_all_prepacked_cpu_initializers = "1" with no +// Hash-based sharing for 8-bit weights: session.share_matmulnbits_prepacked_weights = "1" with no // AddInitializer call. Covers symmetric/asymmetric quantization, with and without bias, across // several block sizes. 
TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float32) { @@ -718,7 +718,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float32) { for (bool has_bias : {false, true}) { for (int64_t block_size : {16, 32, 128}) { RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, - PrepackSharingMode::kShareAllCpuInitializers)); + PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } } @@ -729,7 +729,7 @@ TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float16) { for (bool has_bias : {false, true}) { for (int64_t block_size : {16, 32, 128}) { RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, - PrepackSharingMode::kShareAllCpuInitializers)); + PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); } } } diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc index f9f18547fd1ad..515553c28cba8 100644 --- a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc @@ -25,9 +25,9 @@ void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, OrtValue b_ortvalue; switch (mode) { - case PrepackSharingMode::kShareAllCpuInitializers: - // Opt in to hash-based sharing of every CPU constant initializer that gets pre-packed. - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareAllPrepackedCpuInitializers, "1")); + case PrepackSharingMode::kShareMatMulNBitsPrepackedWeights: + // Opt in to hash-based sharing of MatMulNBits pre-packed weights. + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1")); break; case PrepackSharingMode::kAddInitializer: // Register B as an explicitly shared initializer (the pre-existing sharing mechanism). 
diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h index 557f312976a9c..2d898f1c4a01f 100644 --- a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h @@ -12,11 +12,11 @@ class OpTester; // How two sessions are configured to share the pre-packed weights of a MatMulNBits node. enum class PrepackSharingMode { - // session.share_all_prepacked_cpu_initializers = "1": any CPU constant initializer that the kernel - // pre-packs participates in the shared container, content-addressed by hash(packed_bytes). - // No OrtApi::AddInitializer call is required. This is the path used by the DQ + MatMul -> MatMulNBits - // fusion, whose weights are synthesized at session-creation time with auto-generated names. - kShareAllCpuInitializers, + // session.share_matmulnbits_prepacked_weights = "1": MatMulNBits pre-packed weights participate in the + // shared container, content-addressed by hash(packed_bytes). No OrtApi::AddInitializer call is required. + // This is the path used by the DQ + MatMul -> MatMulNBits fusion, whose weights are synthesized at + // session-creation time with auto-generated names. + kShareMatMulNBitsPrepackedWeights, // Legacy path: the weight is explicitly registered as a shared initializer via // SessionOptions::AddInitializer. kAddInitializer, From 0902515915d1276643e51cd8fc4ecb9b595dd7be Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Wed, 24 Jun 2026 09:45:43 +0200 Subject: [PATCH 05/13] Fix MatMulNBits LUT tests. Add coverage UT. 
--- .../cpu/quantization/matmul_nbits.cc | 10 +- .../test/contrib_ops/matmul_2bits_test.cc | 115 ++++++++++++++++++ 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index e8e79794f52eb..162d7257d0a4c 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -315,10 +315,12 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All static_cast(packed_b_.get()), threadpool_ptr); - if (prepacked_weights != nullptr) { - prepacked_weights->buffers_.push_back(std::move(packed_b_)); - prepacked_weights->buffer_sizes_.push_back(packed_b_size_); - } + // Do not append packed_b_ here. Both the LUT and non-LUT branches share the single append + // after this if/else, so each records exactly one buffer. Appending here as well would move + // packed_b_ out now and then have the shared append record a second, moved-from/null buffer + // with a non-zero packed_b_size_. PrePackedWeights::GetHash() skips null buffers so sharing + // appears to work, but the prepacked-blob save path writes buffer_sizes_[i] bytes from + // buffers_[i].get() and would dereference that null pointer. } else { // For HQNBIT_CompInt8, route through SQNBIT_CompInt8 for sizing and packing. // This gets KleidiAI-sized buffer when available for 4-bit and packs B+scales correctly. 
diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index 9deb064a90853..8e133caa15d55 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -3,6 +3,7 @@ #ifndef ORT_MINIMAL_BUILD +#include #include #include "gtest/gtest.h" @@ -26,6 +27,9 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" +#include "core/graph/model.h" +#include "test/util/include/inference_session_wrapper.h" +#include "test/util/include/test/test_environment.h" #include "core/providers/webgpu/webgpu_provider_options.h" #ifdef USE_WEBGPU #include "contrib_ops/webgpu/quantization/matmul_nbits_common.h" @@ -461,6 +465,117 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_Batch32_256x256_Bias) { TestMatMul2BitsLutGemm(32, 256, 256, 32, /*has_zero_point=*/true, /*has_bias=*/true); } +// Regression test for the LUT GEMM pre-pack + prepacked-save path. A 2-bit MatMulNBits node pre-packed +// via the LUT path must record its packed B buffer exactly once. A prior bug appended packed_b_ twice +// on the LUT path (inside the LUT branch and again in the shared append at the end of the B block), so +// the second entry was a moved-from/null buffer paired with a non-zero packed_b_size_. The pre-packed +// content hash skips null buffers, so cross-session sharing appeared to work, but saving pre-packed +// initializers iterates every recorded buffer and writes buffer_sizes_[i] bytes from buffers_[i].get(), +// dereferencing the null pointer when mlas.use_lut_gemm=1. This drives mlas.use_lut_gemm=1 together with +// session.save_external_prepacked_constant_initializers=1 and a non-empty optimized_model_filepath, and +// asserts that initialization (which performs the save) and a subsequent run both succeed. 
+TEST(MatMulNBitsLutGemm, Float32_2Bits_PrepackSaveDoesNotCrash) { + constexpr int64_t M = 1, N = 128, K = 128, block_size = 32; + if (!MlasIsLutGemmAvailable(static_cast(N), static_cast(K), 2, static_cast(block_size))) { + GTEST_SKIP() << "LUT GEMM not available on this platform"; + } + + // Quantize random weights into valid 2-bit MatMulNBits B/scales/zero_points initializers. + RandomValueGenerator random{1234}; + std::vector b_fp32(random.Gaussian(AsSpan({K, N}), 0.0f, 0.25f)); + + int q_rows = 0, q_cols = 0; + MlasBlockwiseQuantizedShape(static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), q_rows, q_cols); + size_t q_data_size_in_bytes = 0, q_scale_size = 0, q_zp_size_in_bytes = 0; + MlasBlockwiseQuantizedBufferSizes(static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), + q_data_size_in_bytes, q_scale_size, &q_zp_size_in_bytes); + + std::vector b_data(q_data_size_in_bytes); + std::vector scales(q_scale_size); + std::vector zp(q_zp_size_in_bytes); + + auto& ortenv = **ort_env.get(); + onnxruntime::concurrency::ThreadPool* tp = ortenv.GetEnvironment().GetIntraOpThreadPool(); + MlasQuantizeBlockwise(b_data.data(), scales.data(), zp.data(), b_fp32.data(), + static_cast(block_size), /*columnwise*/ true, + static_cast(K), static_cast(N), + static_cast(N), tp); + + // Single-node MatMulNBits model: A is a runtime input; B/scales/zero_points are constant initializers + // (so they are pre-packed at session initialization). 
+ const int64_t k_blocks = (K + block_size - 1) / block_size; + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("matmul_2bits_lut_prepack_save", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder builder(graph); + + ONNX_NAMESPACE::TypeProto float_2d; + float_2d.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType()); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(M); + float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(K); + NodeArg* A = &graph.GetOrCreateNodeArg("A", &float_2d); + NodeArg* Y = &graph.GetOrCreateNodeArg("Y", nullptr); + + NodeArg* B = builder.MakeInitializer( + {static_cast(q_cols), k_blocks, static_cast(q_rows) / k_blocks}, b_data); + NodeArg* scales_arg = builder.MakeInitializer({N, static_cast(q_scale_size) / N}, scales); + NodeArg* zero_points = + builder.MakeInitializer({N, static_cast(q_zp_size_in_bytes) / N}, zp); + + Node& node = builder.AddNode("MatMulNBits", {A, B, scales_arg, zero_points}, {Y}, kMSDomain); + node.AddAttribute("K", K); + node.AddAttribute("N", N); + node.AddAttribute("block_size", block_size); + node.AddAttribute("bits", static_cast(QBits)); + node.AddAttribute("accuracy_level", static_cast(0)); + + graph.SetOutputs(std::vector{Y}); + ASSERT_STATUS_OK(graph.Resolve()); + + std::string model_bytes; + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); + + // Save the optimized model + pre-packed initializers into a unique temp dir. Writing the prepacked + // initializers is the path that dereferenced the duplicate null buffer before the fix. 
+ namespace fs = std::filesystem; + const fs::path tmp_dir = fs::temp_directory_path() / "ort_matmul2bits_lut_prepack_save_test"; + std::error_code ec; + fs::remove_all(tmp_dir, ec); + ASSERT_TRUE(fs::create_directories(tmp_dir, ec)) << ec.message(); + const fs::path optimized_model_path = tmp_dir / "optimized.onnx"; + + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsMlasLutGemm, "1")); + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")); + so.optimized_model_filepath = optimized_model_path.native(); + + std::vector fetches; + { + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + // Initialization performs the LUT pre-pack and writes the optimized model with external + // pre-packed initializers. Before the fix this dereferenced the duplicate null packed buffer. + ASSERT_STATUS_OK(session.Initialize()); + + auto cpu_allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; + std::vector a_data = random.Gaussian(AsSpan({M, K}), 0.0f, 0.25f); + OrtValue a_value; + CreateMLValue(cpu_allocator, AsSpan({M, K}), a_data, &a_value); + NameMLValMap feeds{{"A", a_value}}; + + ASSERT_STATUS_OK(session.Run(RunOptions{}, feeds, std::vector{"Y"}, &fetches)); + } + + ASSERT_EQ(fetches.size(), static_cast(1)); + EXPECT_TRUE(fs::exists(optimized_model_path)); + + fs::remove_all(tmp_dir, ec); +} + // Float zero point tests — directed QAD scenario (zp=1.5) void RunTest2BitsFloatZP(int64_t M, int64_t N, int64_t K, int64_t block_size, float zp_value) { RandomValueGenerator random{1234}; From bc2a0e83664abf448d06bc4eb25aed669eddf686 Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Wed, 24 Jun 2026 23:59:21 +0200 Subject: [PATCH 06/13] Share MatMulNBits pre-packed weights cross-session via content tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit DQ->MatMulNBits fusions synthesize new B/scales/zero-point initializers whose names are generated per-graph and are therefore not stable across sessions, so the prior content-hash-of-packed-bytes sharing could not safely deduplicate them. Replace the broad `session.share_matmulnbits_prepacked_weights` option with a tagging mechanism: each fusion computes a stable, content-derived identity over the generated weight/scale/zero-point bytes plus the quant params (N, K, block_size, bits, accuracy_level) and tags the generated B initializer with it. SessionState enrolls tagged initializers into the shared pre-packed-weights container keyed by that identity, so cross-session sharing needs no session option and cannot false-share across models differing in any semantic input (e.g. zero points, or accuracy_level, which changes the packed layout). Mechanism: - graph.h: add Graph::Set/GetSharedPrepackInitializerId and the backing name->identity side-map. - matmul_nbits_sharing_identity.h (new): shared ComputeMatMulNBitsSharingId helper (MurmurHash3 over the generated tensors + quant params). - dq_matmulnbits_fusion.cc: tag the generated B in both fusion patterns via the shared helper. - qdq_actions.cc: tag the generated B in the default DQMatMulToMatMulNBitsAction (QDQ selector/action) path, which runs without the fusion flag — closing the gap where typical QDQ models never shared. - session_state.cc: enroll tagged initializers by identity; drop the option gate. - onnxruntime_session_options_config_keys.h: remove the now-unused kOrtSessionOptionsShareMatMulNBitsPrepackedWeights config key. Tests: - dq_matmulnbits_fusion_test.cc: fusion-path tag stability/collision-safety + end-to-end cross-session sharing tests. - qdq_matmulnbits_transformer_test.cc: default-path equivalents, plus a test that a different accuracy_level yields a different identity (no cross-compute-type sharing).
- Remove the option-based sharing tests from matmul_4bits/8bits_test.cc and the shared test util; add an opt-in (disabled) pre-pack memset benchmark. --- include/onnxruntime/core/graph/graph.h | 16 ++ .../onnxruntime_session_options_config_keys.h | 17 -- onnxruntime/core/framework/session_state.cc | 34 +-- .../core/optimizer/dq_matmulnbits_fusion.cc | 20 +- .../optimizer/matmul_nbits_sharing_identity.h | 37 +++ .../selectors_actions/qdq_actions.cc | 18 +- .../test/contrib_ops/matmul_4bits_test.cc | 190 --------------- .../test/contrib_ops/matmul_8bits_test.cc | 25 -- .../matmul_nbits_prepack_sharing_test_util.cc | 4 - .../matmul_nbits_prepack_sharing_test_util.h | 5 - .../optimizer/dq_matmulnbits_fusion_test.cc | 152 ++++++++++++ .../qdq_matmulnbits_transformer_test.cc | 217 ++++++++++++++++++ 12 files changed, 468 insertions(+), 267 deletions(-) create mode 100644 onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 815fc6aa69a60..e26d22558c1d1 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1608,6 +1608,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return *prepacked_weights_for_graph_; } + // Tags a fusion-generated initializer (whose name is not stable across sessions) with a stable, + // content-derived identity that SessionState uses to key cross-session pre-pack sharing. + void SetSharedPrepackInitializerId(const std::string& initializer_name, std::string share_id) { + generated_shared_prepack_ids_[initializer_name] = std::move(share_id); + } + + // Returns the sharing identity for a generated initializer, or nullptr if it was not tagged. + const std::string* GetSharedPrepackInitializerId(const std::string& initializer_name) const { + auto it = generated_shared_prepack_ids_.find(initializer_name); + return it == generated_shared_prepack_ids_.end() ? 
nullptr : &it->second; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -2011,6 +2023,10 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // This is optional due to delayed construction. std::optional prepacked_weights_for_graph_; + // Maps a fusion-generated initializer name to its cross-session sharing identity. + // See SetSharedPrepackInitializerId. + InlinedHashMap generated_shared_prepack_ids_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 13d7405b0f9fb..3efa6ae50faa7 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -372,23 +372,6 @@ static const char* const kOrtSessionOptionsModelExternalInitializersFileFolderPa static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = "session.save_external_prepacked_constant_initializers"; -// Enables cross-session sharing of MatMulNBits pre-packed weights via an OrtPrepackedWeightsContainer, -// content-addressed by hash(packed_bytes) so weights with auto-generated names still deduplicate. This -// covers MatMulNBits weights synthesized at session-creation time (e.g. by the DQ + MatMul -> MatMulNBits -// fusion), whose names are not known ahead of time and so cannot be registered via OrtApi::AddInitializer. -// -// Scoped to MatMulNBits: content-addressed sharing is only safe when a kernel's packed bytes fully -// determine its Compute result, which MatMulNBits satisfies. Other CPU kernels are unaffected. 
-// -// Requires the session to be created via OrtApi::CreateSessionWithPrepackedWeightsContainer, with this -// option set consistently across all sharing sessions using the same container. -// -// - "0": Default. Only AddInitializer-registered initializers share pre-packed weights cross-session. -// - "1": Also share MatMulNBits pre-packed weights cross-session via the container. -// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1") -static const char* const kOrtSessionOptionsShareMatMulNBitsPrepackedWeights = - "session.share_matmulnbits_prepacked_weights"; - // Use this config when you want to collect memory stats for each node in the graph. // The file format is a CSV file with the following columns: // The file will be created if it does not exist, and will be overwritten if it does. diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 1f42ca0258e4f..85def4898d21c 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -470,15 +470,7 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, Status SessionState::PrepackConstantInitializedTensors( InlinedHashMap& constant_initializers_use_count, const std::unordered_map& initializers_to_share_map) { - // When set, MatMulNBits pre-packed weights are content-addressed into the shared - // OrtPrepackedWeightsContainer for cross-session sharing. Needed for fusion-synthesized weights (e.g. - // DQ + MatMul -> MatMulNBits) whose auto-generated names can't be pre-registered via AddInitializer. 
- const bool share_matmulnbits_prepacked_weights = - sess_options_.config_options.GetConfigOrDefault( - kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "0") == "1"; - - auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map, - share_matmulnbits_prepacked_weights]( + auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { if (sess_options_.IsLoadCancellationFlagSet()) { @@ -506,14 +498,12 @@ Status SessionState::PrepackConstantInitializedTensors( auto iter = initializers_to_share_map.find(input_name); bool is_shared_initializer = (iter != initializers_to_share_map.end()); - // CPU EP only. By default only AddInitializer-registered initializers (is_shared_initializer) - // participate; share_matmulnbits_prepacked_weights also enrolls MatMulNBits weights, - // deduplicated content-addressed via hash(packed_bytes). Enrollment is restricted to - // MatMulNBits because content-addressed sharing is only safe when packed bytes fully - // determine Compute (which MatMulNBits satisfies); this also keeps the BUG CHECK below valid. - const bool enroll_matmulnbits_initializer = - share_matmulnbits_prepacked_weights && node.OpType() == "MatMulNBits"; - if ((is_shared_initializer || enroll_matmulnbits_initializer) && + // CPU EP only. An initializer joins the shared pre-packed container either when it was + // registered via OrtApi::AddInitializer (is_shared_initializer) or when a graph transformer + // tagged this synthesized initializer with a sharing identity (tagged_share_id). 
+ const std::string* tagged_share_id = st->graph_.GetSharedPrepackInitializerId(input_name); + const bool enroll_tagged_initializer = (tagged_share_id != nullptr); + if ((is_shared_initializer || enroll_tagged_initializer) && should_cache_prepacked_weights_for_shared_initializers && node.GetExecutionProviderType() == kCpuExecutionProvider) { // caching of pre-packed weights' turned ON @@ -545,12 +535,12 @@ Status SessionState::PrepackConstantInitializedTensors( // TODO: Check if some version of the ONNX IR allows op_type to be empty ORT_ENFORCE(!op_type.empty(), "The op type of a node cannot be empty"); - // The key for the pre-packed weights container lookup is the op_type + hash of the prepacked-weight - // that we just got by invoking PrePack() on this kernel. - + // Tagged initializers are keyed by their sharing identity; AddInitializer ones by the + // packed-bytes hash. Both carry the op_type prefix. const std::string prepacked_weights_container_key = - GenerateKeyForPrepackedWeightsMap(op_type, - weights_to_be_filled_in); + enroll_tagged_initializer + ? 
(op_type + "+id+" + *tagged_share_id) + : GenerateKeyForPrepackedWeightsMap(op_type, weights_to_be_filled_in); bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( prepacked_weights_container_key); diff --git a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc index f3956d5e9e0f3..07fccef64fee1 100644 --- a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc +++ b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc @@ -12,6 +12,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/initializer.h" +#include "core/optimizer/matmul_nbits_sharing_identity.h" #include "core/optimizer/utils.h" #include @@ -447,7 +448,6 @@ std::vector CollectDirectDQMatches( return direct_matches; } -// --------------------------------------------------------------------------- // Pattern 1 rewriting: DQ+Reshape+Transpose+[Cast]+MatMul/Gemm -> MatMulNBits // --------------------------------------------------------------------------- @@ -569,6 +569,10 @@ void ApplyReshapeTransposeFusions( zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); } + // Cross-session sharing identity for the generated weight group; computed before the tensors move. 
+ const std::string share_id = + ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level); + NodeAttributes mnb_attrs; utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs); utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs); @@ -578,7 +582,10 @@ void ApplyReshapeTransposeFusions( std::vector mnb_inputs; mnb_inputs.push_back(const_cast(mm_node->InputDefs()[0])); - mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst))); + NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + mnb_inputs.push_back(&b_weight_arg); mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst))); if (zp_mnb_tp) { mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst))); @@ -749,6 +756,10 @@ void ApplyDirectDQFusions( zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); } + // Cross-session sharing identity for the generated weight group; computed before the tensors move. 
+ const std::string share_id = + ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level); + NodeAttributes mnb_attrs; utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs); utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs); @@ -758,7 +769,10 @@ void ApplyDirectDQFusions( std::vector mnb_inputs; mnb_inputs.push_back(const_cast(mm_node->InputDefs()[0])); - mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst))); + NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + mnb_inputs.push_back(&b_weight_arg); mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst))); if (zp_mnb_tp) { mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst))); diff --git a/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h new file mode 100644 index 0000000000000..597c8a292afd8 --- /dev/null +++ b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +#include "core/framework/murmurhash3.h" +#include "core/framework/tensor.h" + +namespace onnxruntime { + +// Stable, content-derived identity for a fusion-generated MatMulNBits weight group, used to share its +// pre-packed buffer across sessions. The id is identical for the same model in any session and differs +// whenever a semantic input differs. accuracy_level is hashed so buffers packed for different compute +// types never collide. Pass zero_point only when it is an actual kernel input. 
+inline std::string ComputeMatMulNBitsSharingId(const Tensor& weight, const Tensor& scale, + const std::optional& zero_point, + int64_t N, int64_t K, int64_t block_size, + int64_t bits, int64_t accuracy_level) { + uint32_t hash[4] = {0, 0, 0, 0}; + auto hash_bytes = [&hash](const void* data, size_t len) { + MurmurHash3::x86_128(data, len, hash[0], &hash); + }; + hash_bytes(weight.DataRaw(), weight.SizeInBytes()); + hash_bytes(scale.DataRaw(), scale.SizeInBytes()); + if (zero_point) { + hash_bytes(zero_point->DataRaw(), zero_point->SizeInBytes()); + } + const int64_t params[] = {N, K, block_size, bits, accuracy_level}; + hash_bytes(params, sizeof(params)); + return "MatMulNBits.DQ:" + std::to_string((static_cast(hash[1]) << 32) | hash[0]); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index b9d7e898157bd..6bd5e157d8b65 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -7,6 +7,7 @@ #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" #include "core/optimizer/qdq_transformer/qdq_util.h" #include "core/optimizer/initializer.h" +#include "core/optimizer/matmul_nbits_sharing_identity.h" #include "core/graph/node_attr_utils.h" #include "core/graph/graph_utils.h" #include "core/framework/tensorprotoutils.h" @@ -646,8 +647,23 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, ORT_RETURN_IF_ERROR(TransposeDQWeightsForMatMulNBits( graph, *dq_node, "fused_DQ_MatMul", intra_op_thread_pool_, effective_bs, transposed)); + // Cross-session sharing identity for the generated B weight; computed before it is moved. 
+ const auto* weight_arg = dq_node->InputDefs()[0]; + const auto* weight_shape = weight_arg->Shape(); + ORT_RETURN_IF_NOT(weight_shape != nullptr && weight_shape->dim_size() >= 2, + "Weight shape unavailable for DQ node ", dq_node->Name()); + const int64_t bits = DQWeightBits(weight_arg->TypeAsProto()->tensor_type().elem_type()); + const std::string share_id = ComputeMatMulNBitsSharingId( + transposed.weight, transposed.scale, transposed.zero_point, + weight_shape->dim(1).dim_value(), weight_shape->dim(0).dim_value(), + effective_bs, bits, accuracy_level_); + auto& input_defs = replacement_node.MutableInputDefs(); - input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight))); + NodeArg& b_weight_arg = + graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight)); + // Tag the generated B weight for cross-session pre-pack sharing. + graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id); + input_defs.push_back(&b_weight_arg); replacement_node.MutableInputArgsCount().push_back(1); input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.scale_proto, std::move(transposed.scale))); diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 14aefbad816aa..dd5cfb73dfe31 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -636,55 +636,6 @@ TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int } } // namespace -// Hash-based sharing: session.share_matmulnbits_prepacked_weights = "1" with no AddInitializer call. -// This is the path used by the DQ + MatMul -> MatMulNBits fusion, whose weights are synthesized at -// session-creation time with auto-generated names. Covers symmetric/asymmetric quantization, with and -// without bias, across several block sizes. 
-TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float32) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - for (int64_t block_size : {16, 32, 128}) { - RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } - } - } -} - -TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - for (int64_t block_size : {16, 32, 128}) { - RunTest(MakeSharingTestOptions(32, 256, block_size, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } - } - } -} - -// Hash-based sharing across different accuracy levels. Each accuracy level selects a different MLAS -// compute type and therefore a different packed-weight layout, all of which must share correctly. -// accuracy_level 4 (DP4A/int8) requires block_size % 32 == 0, K % 128 == 0 and N % 16 == 0. -TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_AccuracyLevels) { - for (int64_t accuracy_level : {0, 1, 4}) { - RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, accuracy_level, /*has_zero_point*/ true, - /*has_bias*/ false, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } -} - -// Hash-based sharing for the fp16 + accuracy_level 4 path (HQNBIT_CompInt8). This is distinct from -// the fp32 int8 path: fp16 scales are converted to fp32 at pack time, and on ARM64 KleidiAI the -// asymmetric 4-bit case packs scales as a separate placeholder buffer and folds the zero points in -// via BZpCorr during B packing. Cover symmetric/asymmetric (BZpCorr) with and without bias. 
-TEST(MatMulNBits, SharedPrepackedWeights_ShareAll_Float16_AccuracyLevel4) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - RunTest(MakeSharingTestOptions(32, 128, /*block_size*/ 32, /*accuracy_level*/ 4, has_zero_point, - has_bias, PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } - } -} - // Legacy sharing path: the weight B is registered as a shared initializer via // SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { @@ -708,147 +659,6 @@ TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { PrepackSharingMode::kNoSharing)); } -namespace { -// Builds and serializes a single-node float MatMulNBits model. "A" is a runtime graph input; B, scales -// and zero_points are constant initializers and the single output is named "Y". The zero-point -// collision test below builds two such models that share byte-identical B and scales initializers but -// differ only in their zero_points. 
-void BuildMatMulNBitsModelBytes(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level, - const std::vector& b_data, const std::vector& scales_data, - const std::vector& zp_data, std::string& model_bytes) { - const int64_t k_blocks = (K + block_size - 1) / block_size; - const int64_t blob_size = (block_size * QBits + 7) / 8; - const int64_t zp_blob_size = (k_blocks * QBits + 7) / 8; - - const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; - Model model("matmul_nbits_zp_collision", false, ModelMetaData(), PathString(), - IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, - std::vector(), DefaultLoggingManager().DefaultLogger()); - Graph& graph = model.MainGraph(); - ModelTestBuilder builder(graph); - - ONNX_NAMESPACE::TypeProto float_2d; - float_2d.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType()); - float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(M); - float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(K); - NodeArg* A = &graph.GetOrCreateNodeArg("A", &float_2d); - NodeArg* Y = &graph.GetOrCreateNodeArg("Y", nullptr); - - NodeArg* B = builder.MakeInitializer({N, k_blocks, blob_size}, b_data); - NodeArg* scales = builder.MakeInitializer({N, k_blocks}, scales_data); - NodeArg* zero_points = builder.MakeInitializer({N, zp_blob_size}, zp_data); - - Node& node = builder.AddNode("MatMulNBits", {A, B, scales, zero_points}, {Y}, kMSDomain); - node.AddAttribute("K", K); - node.AddAttribute("N", N); - node.AddAttribute("block_size", block_size); - node.AddAttribute("bits", static_cast(QBits)); - node.AddAttribute("accuracy_level", accuracy_level); - - graph.SetOutputs(std::vector{Y}); - ASSERT_STATUS_OK(graph.Resolve()); - ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); -} - -// Loads and runs the given serialized MatMulNBits model on the CPU EP with -// share_matmulnbits_prepacked_weights enabled, backed by the supplied shared pre-packed weights 
-// container. Returns the single "Y" output and the number of pre-packed weights this session served -// from the shared container. -void RunSharedPrepackSession(const std::string& model_bytes, const NameMLValMap& feeds, - PrepackedWeightsContainer& container, std::vector& fetches, - size_t& used_shared_count) { - SessionOptions so; - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1")); - - InferenceSessionWrapper session{so, GetEnvironment()}; - ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); - ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); - ASSERT_STATUS_OK(session.Initialize()); - - RunOptions run_options; - ASSERT_STATUS_OK(session.Run(run_options, feeds, std::vector{"Y"}, &fetches)); - used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); -} -} // namespace - -// Regression test for pre-packed weight sharing: two MatMulNBits initializers with byte-identical -// quantized B and identical scales but DIFFERENT zero points must not be treated as identical by the -// content-addressed sharing cache. For the CompInt8 path (accuracy_level 4) the zero points are folded -// into the packed buffer (as the per-block sum). Before the fix the sharing hash was taken before the -// zero points were folded in, so the second session silently adopted the first session's buffer and -// produced a wrong result. -TEST(MatMulNBits, SharedPrepackedWeights_DifferentZeroPointsDoNotCollide) { - constexpr int64_t M = 2, N = 16, K = 128, block_size = 32; - constexpr int64_t accuracy_level = 4; // CompInt8 - the path that folds zero points into packed_b_. - const int64_t k_blocks = (K + block_size - 1) / block_size; - const int64_t blob_size = (block_size * QBits + 7) / 8; - const int64_t zp_blob_size = (k_blocks * QBits + 7) / 8; - - // Byte-identical B and scales for both models; only the zero points differ. 
- std::vector b_data(static_cast(N * k_blocks * blob_size)); - for (size_t i = 0; i < b_data.size(); ++i) { - b_data[i] = static_cast((i * 7 + 3) & 0xFF); - } - std::vector scales(static_cast(N * k_blocks)); - for (size_t i = 0; i < scales.size(); ++i) { - scales[i] = 0.02f + 0.001f * static_cast(i % 17); - } - std::vector zp1(static_cast(N * zp_blob_size)); - std::vector zp2(zp1.size()); - for (size_t i = 0; i < zp1.size(); ++i) { - zp1[i] = static_cast((i * 5 + 1) & 0xFF); - zp2[i] = static_cast(~zp1[i]); // every 4-bit zero point differs between the two models - } - - std::string model_zp1, model_zp2; - BuildMatMulNBitsModelBytes(M, N, K, block_size, accuracy_level, b_data, scales, zp1, model_zp1); - BuildMatMulNBitsModelBytes(M, N, K, block_size, accuracy_level, b_data, scales, zp2, model_zp2); - - auto cpu_allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; - RandomValueGenerator random{1234}; - std::vector a_data = random.Gaussian(AsSpan({M, K}), 0.0f, 0.25f); - OrtValue a_value; - CreateMLValue(cpu_allocator, AsSpan({M, K}), a_data, &a_value); - NameMLValMap feeds{{"A", a_value}}; - - // Reference output for the zp2 model, computed in isolation (its own private container). - std::vector ref_fetches; - size_t ref_used = 0; - { - PrepackedWeightsContainer ref_container; - RunSharedPrepackSession(model_zp2, feeds, ref_container, ref_fetches, ref_used); - } - - // Share one container across a zp1 session (which warms the container with its finalized buffer) and - // a subsequent zp2 session. - PrepackedWeightsContainer shared_container; - std::vector warm_fetches, shared_fetches; - size_t warm_used = 0, shared_used = 0; - RunSharedPrepackSession(model_zp1, feeds, shared_container, warm_fetches, warm_used); - - // If the platform did not pre-pack B for this configuration there is nothing to collide on. 
- if (shared_container.GetNumberOfElements() == 0) { - GTEST_SKIP() << "CompInt8 pre-packing not available on this platform"; - } - - RunSharedPrepackSession(model_zp2, feeds, shared_container, shared_fetches, shared_used); - - // The zp2 session must NOT have reused the zp1 session's finalized buffer: different zero points - // produce different packed bytes and therefore a different sharing hash. - EXPECT_EQ(shared_used, static_cast(0)); - - // And the shared zp2 run must match the isolated zp2 reference element for element. - ASSERT_EQ(ref_fetches.size(), static_cast(1)); - ASSERT_EQ(shared_fetches.size(), static_cast(1)); - const Tensor& ref_tensor = ref_fetches[0].Get(); - const Tensor& shared_tensor = shared_fetches[0].Get(); - ASSERT_EQ(ref_tensor.Shape(), shared_tensor.Shape()); - const auto ref_span = ref_tensor.DataAsSpan(); - const auto shared_span = shared_tensor.DataAsSpan(); - for (size_t i = 0; i < ref_span.size(); ++i) { - EXPECT_EQ(ref_span[i], shared_span[i]) << "output mismatch at index " << i; - } -} #endif // !ENABLE_TRAINING #endif diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index f066a9dbd80ac..411e83536c190 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -710,31 +710,6 @@ TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_p } } // namespace -// Hash-based sharing for 8-bit weights: session.share_matmulnbits_prepacked_weights = "1" with no -// AddInitializer call. Covers symmetric/asymmetric quantization, with and without bias, across -// several block sizes. 
-TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float32) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - for (int64_t block_size : {16, 32, 128}) { - RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, - PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } - } - } -} - -TEST(MatMulNBits, SharedPrepackedWeights_8b_ShareAll_Float16) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - for (int64_t block_size : {16, 32, 128}) { - RunTest8Bits(MakeSharingTestOptions8Bits(block_size, has_zero_point, has_bias, - PrepackSharingMode::kShareMatMulNBitsPrepackedWeights)); - } - } - } -} - // Legacy sharing path for 8-bit weights: B is registered as a shared initializer via // SessionOptions::AddInitializer. TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc index 515553c28cba8..97566afe02489 100644 --- a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc @@ -25,10 +25,6 @@ void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode, OrtValue b_ortvalue; switch (mode) { - case PrepackSharingMode::kShareMatMulNBitsPrepackedWeights: - // Opt in to hash-based sharing of MatMulNBits pre-packed weights. - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsShareMatMulNBitsPrepackedWeights, "1")); - break; case PrepackSharingMode::kAddInitializer: // Register B as an explicitly shared initializer (the pre-existing sharing mechanism). 
Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(b_dims), b_data.data(), diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h index 2d898f1c4a01f..1de0bbaa4bb85 100644 --- a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h +++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h @@ -12,11 +12,6 @@ class OpTester; // How two sessions are configured to share the pre-packed weights of a MatMulNBits node. enum class PrepackSharingMode { - // session.share_matmulnbits_prepacked_weights = "1": MatMulNBits pre-packed weights participate in the - // shared container, content-addressed by hash(packed_bytes). No OrtApi::AddInitializer call is required. - // This is the path used by the DQ + MatMul -> MatMulNBits fusion, whose weights are synthesized at - // session-creation time with auto-generated names. - kShareMatMulNBitsPrepackedWeights, // Legacy path: the weight is explicitly registered as a shared initializer via // SessionOptions::AddInitializer. 
kAddInitializer, diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 8aa4c88052742..385aa7ffebc66 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -7,14 +7,20 @@ #include "core/common/span_utils.h" #include "core/framework/int4.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/dq_matmulnbits_fusion.h" +#include "core/session/inference_session.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "test/test_environment.h" #include "test/unittest_util/framework_test_utils.h" #include "test/unittest_util/graph_transform_test_builder.h" #include "test/optimizer/graph_transform_test_fixture.h" #include "test/util/include/asserts.h" +#include "test/util/include/inference_session_wrapper.h" #include "gtest/gtest.h" @@ -354,6 +360,152 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) { TransformerLevel::Level1, 1, pre_check, post_check)); } +// Validates the cross-session-sharing tag the fusion attaches to the generated B weight. The tag is a +// stable, content-derived identity: identical source quantization groups must yield the SAME identity +// (so two sessions optimizing the same model share the pre-packed B), while any semantic difference -- +// here, different zero points -- must yield a DIFFERENT identity (so they must not falsely share). 
+TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(N * num_blocks * block_size)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(N * num_blocks)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + // Non-default (non-8) zero points so the fusion keeps them (it elides uniform-8 zero points). + std::vector zp_a(static_cast(N * num_blocks), 3); + std::vector zp_b(zp_a.size(), 5); + + // Runs the fusion on a Pattern-1 model built from the given zero points and returns the sharing + // identity tagged onto the generated MatMulNBits B weight. + auto tag_for = [&](const std::vector& zp) -> std::string { + std::string captured; + auto build = [&](ModelTestBuilder& builder) { + BuildPattern1Graph(builder, M, N, K, block_size, /*with_zp*/ true, /*with_cast*/ false, + /*use_gemm*/ false, &weight, &scale, &zp); + }; + auto pre_check = [](Graph&) -> Status { return Status::OK(); }; + auto post_check = [&](Graph& graph) -> Status { + int matmulnbits = 0; + for (const auto& node : graph.Nodes()) { + if (node.OpType() == "MatMulNBits") { + ++matmulnbits; + const std::string& b_name = node.InputDefs()[1]->Name(); // input 1 == quantized B + const std::string* id = graph.GetSharedPrepackInitializerId(b_name); + EXPECT_NE(id, nullptr) << "generated B weight was not tagged for cross-session sharing"; + if (id != nullptr) { + captured = *id; + } + } + } + EXPECT_EQ(matmulnbits, 1); + return Status::OK(); + }; + auto transformer = std::make_unique(4); + EXPECT_TRUE(TestGraphTransformer(build, 21, *logger_, std::move(transformer), + TransformerLevel::Level1, 1, pre_check, post_check) + .IsOK()); + return captured; + }; + + const std::string id_a1 = tag_for(zp_a); + const std::string id_a2 = 
tag_for(zp_a); + const std::string id_b = tag_for(zp_b); + + ASSERT_FALSE(id_a1.empty()); + EXPECT_EQ(id_a1, id_a2); // stable: identical source quantization group -> identical identity + EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity +} + +// Builds and serializes a Pattern-1 DQ->Reshape->Transpose->MatMul model (UINT4 constant weight). When +// loaded into a session with the DQ->MatMulNBits fusion enabled, it becomes a MatMulNBits whose B is +// tagged for cross-session sharing. +static void SerializeDQMatMulModel(int64_t M, int64_t N, int64_t K, int64_t block_size, + const std::vector& weight, const std::vector& scale, + const std::vector& zp, std::string& model_bytes) { + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("dq_matmulnbits_share", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + ModelTestBuilder builder(model.MainGraph()); + BuildPattern1Graph(builder, M, N, K, block_size, /*with_zp*/ true, /*with_cast*/ false, + /*use_gemm*/ false, &weight, &scale, &zp); + builder.SetGraphOutputs(); + ASSERT_STATUS_OK(model.MainGraph().Resolve()); + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); +} + +// Loads the serialized model on the CPU EP with the DQ->MatMulNBits fusion enabled and the supplied +// shared container. Reports whether the fusion produced a MatMulNBits and how many pre-packed weights +// this session served from the container. 
+static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeightsContainer& container, + bool& produced_matmulnbits, size_t& used_shared_count) { + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsEnableDQMatMulNBitsFusion, "1")); + + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + ASSERT_STATUS_OK(session.Initialize()); + + produced_matmulnbits = false; + for (const auto& node : session.GetGraph().Nodes()) { + if (node.OpType() == "MatMulNBits") { + produced_matmulnbits = true; + break; + } + } + used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); +} + +// End-to-end: two sessions optimizing the same DQ+MatMul model share the fused MatMulNBits B weight +// through a common container WITHOUT any session option -- the fusion tags it and SessionState enrolls +// it by that identity. A session over a model that differs only in its zero points must NOT share. 
+TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(N * num_blocks * block_size)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(N * num_blocks)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(N * num_blocks), 3); + std::vector zp_b(zp_a.size(), 5); // differs only in zero points + + std::string model_a, model_b; + SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_a, model_a); + SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_b, model_b); + + PrepackedWeightsContainer container; + bool fused1 = false, fused2 = false, fused_b = false; + size_t used1 = 0, used2 = 0, used_b = 0; + + RunSharedFusionSession(model_a, container, fused1, used1); + ASSERT_TRUE(fused1) << "DQ -> MatMulNBits fusion did not run"; + if (container.GetNumberOfElements() == 0) { + GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform"; + } + EXPECT_EQ(used1, static_cast(0)); // first session: nothing to share yet + + // Second session over the SAME model shares the tagged B from the container. + RunSharedFusionSession(model_a, container, fused2, used2); + ASSERT_TRUE(fused2); + EXPECT_GT(used2, static_cast(0)); + + // A model differing only in zero points has a different identity and must NOT reuse the buffer. 
+ RunSharedFusionSession(model_b, container, fused_b, used_b); + ASSERT_TRUE(fused_b); + EXPECT_EQ(used_b, static_cast(0)); +} + TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_WithDefaultZP8) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index a1c0f8adfffb7..d16707e9a9ad4 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -2,10 +2,14 @@ // Licensed under the MIT License. #include +#include #include "core/common/span_utils.h" #include "core/common/float16.h" #include "core/framework/int4.h" +#include "core/framework/prepacked_weights_container.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" #include "core/graph/node_attr_utils.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" @@ -1462,6 +1466,219 @@ TEST(QDQTransformerTests, DQGemmNotConvertedToMatMulNBits_Alpha) { 1e-5, 2e-5); } +// --------------------------------------------------------------------------- +// Cross-session pre-pack sharing for the DEFAULT DQ->MatMulNBits path +// --------------------------------------------------------------------------- +// DQMatMulToMatMulNBitsAction (in the QDQ selector/action transformer) runs without the +// session.enable_dq_matmulnbits_fusion flag and synthesizes the MatMulNBits B/scales/zp initializers +// with names that are NOT stable across sessions. It tags the generated B weight with a stable, +// content-derived identity that SessionState uses to share the pre-packed buffer across sessions. + +// Packs uint4 nibble values (row-major, 2 per byte) into UInt4x2 storage. 
+static std::vector PackUint4Nibbles(const std::vector& values) { + const size_t num_pairs = UInt4x2::CalcNumInt4Pairs(values.size()); + std::vector packed(num_pairs); + for (size_t i = 0; i < values.size(); i += 2) { + const uint8_t lo = values[i] & 0x0F; + const uint8_t hi = (i + 1 < values.size()) ? (values[i + 1] & 0x0F) : 0; + packed[i / 2] = UInt4x2(lo, hi); + } + return packed; +} + +// Builds a default-path model: a constant UINT4 weight [K, N] block-quantized along axis 0 feeding a +// DequantizeLinear whose output is the second input to a single MatMul. The QDQ selector/action +// transformer converts this into a MatMulNBits. Explicit weight/scale/zp give a deterministic identity. +static void BuildDefaultPathDQMatMul(ModelTestBuilder& builder, int64_t M, int64_t N, int64_t K, + int64_t block_size, const std::vector& weight, + const std::vector& scale, const std::vector& zp) { + const int64_t num_blocks = (K + block_size - 1) / block_size; + + auto* input_a = builder.MakeInput({M, K}, -1.0f, 1.0f); + auto* output = builder.MakeOutput(); + + auto* weight_arg = builder.MakeInitializer({K, N}, PackUint4Nibbles(weight)); + auto* scale_arg = builder.MakeInitializer({num_blocks, N}, scale); + auto* zp_arg = builder.MakeInitializer({num_blocks, N}, PackUint4Nibbles(zp)); + + NodeAttributes dq_attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", static_cast(0)), dq_attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), dq_attrs); + auto* dq_output = builder.MakeIntermediate(); + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &dq_attrs); + + builder.AddNode("MatMul", {input_a, dq_output}, {output}); +} + +// Serializes a default-path DQ->MatMul model built from explicit quantization data. 
+static void SerializeDefaultPathModel(int64_t M, int64_t N, int64_t K, int64_t block_size, + const std::vector& weight, const std::vector& scale, + const std::vector& zp, std::string& model_bytes) { + const std::unordered_map domain_to_version{{"", 21}, {kMSDomain, 1}}; + Model model("dq_matmul_default_share", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, + std::vector(), DefaultLoggingManager().DefaultLogger()); + ModelTestBuilder builder(model.MainGraph()); + BuildDefaultPathDQMatMul(builder, M, N, K, block_size, weight, scale, zp); + builder.SetGraphOutputs(); + ASSERT_STATUS_OK(model.MainGraph().Resolve()); + ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes)); +} + +// Loads the model on the CPU EP with the given shared container and DEFAULT options (no fusion flag). +// Reports whether a MatMulNBits was produced, the sharing identity tagged onto its B weight, and how +// many pre-packed weights this session served from the container. 
+static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeightsContainer& container, + bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count, + int accuracy_level = -1) { + SessionOptions so; + if (accuracy_level >= 0) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str())); + } + InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); + ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); + ASSERT_STATUS_OK(session.Initialize()); + + produced_matmulnbits = false; + b_tag.clear(); + const Graph& graph = session.GetGraph(); + for (const auto& node : graph.Nodes()) { + if (node.OpType() == "MatMulNBits") { + produced_matmulnbits = true; + const std::string& b_name = node.InputDefs()[1]->Name(); // input 1 == quantized B + if (const std::string* id = graph.GetSharedPrepackInitializerId(b_name); id != nullptr) { + b_tag = *id; + } + break; + } + } + used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); +} + +// Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived +// identity: identical quantization data yields the SAME identity, while different zero points yield a +// DIFFERENT identity (so two models differing only in zp must not falsely share a pre-packed buffer). 
+TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(num_blocks * N), 3); + std::vector zp_b(zp_a.size(), 5); + + auto tag_for = [&](const std::vector& zp) -> std::string { + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + PrepackedWeightsContainer container; + bool produced = false; + std::string tag; + size_t used = 0; + RunDefaultPathSession(model_bytes, container, produced, tag, used); + EXPECT_TRUE(produced) << "DQ -> MatMulNBits conversion did not run on the default path"; + return tag; + }; + + const std::string id_a1 = tag_for(zp_a); + const std::string id_a2 = tag_for(zp_a); + const std::string id_b = tag_for(zp_b); + + ASSERT_FALSE(id_a1.empty()) << "generated B weight was not tagged for cross-session sharing"; + EXPECT_EQ(id_a1, id_a2); // stable: identical quantization data -> identical identity + EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity +} + +// End-to-end: two sessions converting the same model via the default path share the MatMulNBits B +// pre-packed buffer through a common container (no session option). A model differing only in zero +// points has a different identity and must not reuse the buffer. 
+TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(num_blocks * N), 3); + std::vector zp_b(zp_a.size(), 5); // differs only in zero points + + std::string model_a, model_b; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_a, model_a); + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_b, model_b); + + PrepackedWeightsContainer container; + bool produced1 = false, produced2 = false, produced_b = false; + std::string tag1, tag2, tag_b; + size_t used1 = 0, used2 = 0, used_b = 0; + + RunDefaultPathSession(model_a, container, produced1, tag1, used1); + ASSERT_TRUE(produced1) << "DQ -> MatMulNBits conversion did not run on the default path"; + if (container.GetNumberOfElements() == 0) { + GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform"; + } + EXPECT_EQ(used1, static_cast(0)); // first session: nothing to share yet + + // Second session over the SAME model reuses the tagged B from the container. + RunDefaultPathSession(model_a, container, produced2, tag2, used2); + ASSERT_TRUE(produced2); + EXPECT_GT(used2, static_cast(0)); + + // A model differing only in zero points must NOT reuse the buffer. + RunDefaultPathSession(model_b, container, produced_b, tag_b, used_b); + ASSERT_TRUE(produced_b); + EXPECT_EQ(used_b, static_cast(0)); +} + +// The sharing identity includes accuracy_level, so the same weights compiled for different compute +// types (e.g. 
CompFp32 at level 0 vs CompInt8 at level 4) get DIFFERENT identities and must not share +// a pre-packed buffer whose layout depends on that compute type. +TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelDoesNotShare) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp(static_cast(num_blocks * N), 3); + + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + + PrepackedWeightsContainer container; + bool produced0 = false, produced4 = false; + std::string tag0, tag4; + size_t used0 = 0, used4 = 0; + + RunDefaultPathSession(model_bytes, container, produced0, tag0, used0, /*accuracy_level*/ 0); + ASSERT_TRUE(produced0) << "DQ -> MatMulNBits conversion did not run on the default path"; + + // Same model/weights, different accuracy level, sharing the same container. + RunDefaultPathSession(model_bytes, container, produced4, tag4, used4, /*accuracy_level*/ 4); + ASSERT_TRUE(produced4); + + ASSERT_FALSE(tag0.empty()); + ASSERT_FALSE(tag4.empty()); + EXPECT_NE(tag0, tag4); // accuracy_level participates in the identity + EXPECT_EQ(used4, static_cast(0)); // different identity => no cross-accuracy sharing +} + #endif // !defined(DISABLE_CONTRIB_OPS) } // namespace test From 2f1e6ed8b1fe60af30d9286418620a5a294a0c48 Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 09:59:42 +0200 Subject: [PATCH 07/13] Key MatMulNBits prepack sharing by packed bytes; harden id and invariant Addresses review feedback on the MatMulNBits cross-session prepacked-weight sharing feature. 
Four related changes: session_state: key the shared-prepack container by the packed-bytes hash (GenerateKeyForPrepackedWeightsMap) for tagged initializers too, exactly as the AddInitializer path already does; the fusion tag is now only the enrollment signal. The tag is derived from the *unpacked* initializer content, so using it as the key let two sessions that differ in any option affecting the packed layout (mlas.use_lut_gemm, a CPU backend-selector difference, or the compute type) reuse an incompatible packed buffer -- wrong results/crash. Keying by the packed bytes only ever shares byte-identical buffers. graph: enforce the single-consumer invariant in SetSharedPrepackInitializerId. A MatMulNBits packed buffer folds in the consuming node's scales/zero-points/ attributes, so a sharing id is valid only for a B initializer with exactly one consumer (guaranteed today by the DQ->MatMulNBits producers). ORT_ENFORCE that a name is never re-tagged with a conflicting id so the guarantee survives later refactors. matmul_nbits_sharing_identity: fold each segment's full 128-bit MurmurHash3 output into a 64-bit accumulator instead of forwarding only hash[0] (a 32-bit seed bottleneck). Every input bit now reaches the id, raising collision resistance from ~2^32 to ~2^64; a collision would silently adopt another weight group's already-packed buffer. tests: make the negative "must not share" cases differ in the weight, which changes the packed bytes on every compute type, instead of the zero points or accuracy level (those only change the bytes under CompInt8 -- on CompFp32 they are applied at compute time and left out of the packed B, so such models correctly share a byte-identical buffer). Rename DefaultPath_DifferentAccuracyLevelDoesNotShare to ...GetsDistinctIdentity and assert the identity is distinct rather than a platform-dependent sharing count. Update comments to reflect packed-bytes keying. 
--- include/onnxruntime/core/graph/graph.h | 18 +++++- onnxruntime/core/framework/session_state.cc | 24 +++++--- .../optimizer/matmul_nbits_sharing_identity.h | 34 ++++++++--- .../optimizer/dq_matmulnbits_fusion_test.cc | 42 ++++++++----- .../qdq_matmulnbits_transformer_test.cc | 61 +++++++++++-------- 5 files changed, 122 insertions(+), 57 deletions(-) diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index e26d22558c1d1..a6d8eaecad0c0 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1610,8 +1610,24 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // Tags a fusion-generated initializer (whose name is not stable across sessions) with a stable, // content-derived identity that SessionState uses to key cross-session pre-pack sharing. + // + // Single-consumer invariant: a MatMulNBits packed buffer folds in the *consuming* node's + // scales/zero_points/attributes, not B alone, so this id is meaningful only for a B initializer that + // has exactly one consumer. The DQ->MatMulNBits producers guarantee that -- each generated B has a + // unique name with a single consumer, and the fusion bails when the source weight/scale is shared (the + // DQMatMulNotConvertedToMatMulNBits_SharedWeight case). If a future change ever tags a multi-consumer + // initializer whose consumers differ in scales/zp/attrs, they would compute different ids for the same + // name and the last writer would silently mis-share. Enforce that a name is never re-tagged with a + // conflicting id so the invariant survives later refactors. 
void SetSharedPrepackInitializerId(const std::string& initializer_name, std::string share_id) { - generated_shared_prepack_ids_[initializer_name] = std::move(share_id); + auto it = generated_shared_prepack_ids_.find(initializer_name); + if (it != generated_shared_prepack_ids_.end()) { + ORT_ENFORCE(it->second == share_id, "MatMulNBits pre-pack sharing id for initializer '", + initializer_name, "' was re-tagged with a different id; the single-consumer invariant ", + "is violated (a multi-consumer weight whose consumers differ in scales/zp/attrs)."); + return; + } + generated_shared_prepack_ids_.emplace(initializer_name, std::move(share_id)); } // Returns the sharing identity for a generated initializer, or nullptr if it was not tagged. diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 85def4898d21c..1a224e5908c23 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -500,9 +500,11 @@ Status SessionState::PrepackConstantInitializedTensors( // CPU EP only. An initializer joins the shared pre-packed container either when it was // registered via OrtApi::AddInitializer (is_shared_initializer) or when a graph transformer - // tagged this synthesized initializer with a sharing identity (tagged_share_id). - const std::string* tagged_share_id = st->graph_.GetSharedPrepackInitializerId(input_name); - const bool enroll_tagged_initializer = (tagged_share_id != nullptr); + // tagged this synthesized initializer with a sharing identity. Only the tag's *presence* + // matters here: it is the enrollment signal. The container key below is the packed-bytes + // hash, never the tag value (see the rationale at the key computation). 
+ const bool enroll_tagged_initializer = + (st->graph_.GetSharedPrepackInitializerId(input_name) != nullptr); if ((is_shared_initializer || enroll_tagged_initializer) && should_cache_prepacked_weights_for_shared_initializers && node.GetExecutionProviderType() == kCpuExecutionProvider) { @@ -535,12 +537,18 @@ Status SessionState::PrepackConstantInitializedTensors( // TODO: Check if some version of the ONNX IR allows op_type to be empty ORT_ENFORCE(!op_type.empty(), "The op type of a node cannot be empty"); - // Tagged initializers are keyed by their sharing identity; AddInitializer ones by the - // packed-bytes hash. Both carry the op_type prefix. + // Key by the packed-bytes hash (op_type + a hash of the packed buffer), exactly as the + // AddInitializer path does, so only byte-identical packed buffers are ever shared. The + // tag is solely the enrollment signal that opted this fusion-generated initializer into + // the container; it must NOT be used as the key, because it is derived from the + // *unpacked* initializer content and so cannot distinguish packings that differ by node + // options/attributes that change the packed layout (e.g. mlas.use_lut_gemm or a CPU + // backend-selector difference). Two sessions that share a container but differ in such an + // option compute the same tag yet produce different packed bytes; keying by the packed + // bytes gives them distinct keys and prevents reusing an incompatible buffer + // (wrong results/crash). const std::string prepacked_weights_container_key = - enroll_tagged_initializer - ? 
(op_type + "+id+" + *tagged_share_id) - : GenerateKeyForPrepackedWeightsMap(op_type, weights_to_be_filled_in); + GenerateKeyForPrepackedWeightsMap(op_type, weights_to_be_filled_in); bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( prepacked_weights_container_key); diff --git a/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h index 597c8a292afd8..829a78d3ebcf1 100644 --- a/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h +++ b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h @@ -20,18 +20,36 @@ inline std::string ComputeMatMulNBitsSharingId(const Tensor& weight, const Tenso const std::optional& zero_point, int64_t N, int64_t K, int64_t block_size, int64_t bits, int64_t accuracy_level) { - uint32_t hash[4] = {0, 0, 0, 0}; - auto hash_bytes = [&hash](const void* data, size_t len) { - MurmurHash3::x86_128(data, len, hash[0], &hash); + // MurmurHash3 fmix64 finalizer: a bijection that avalanches a 64-bit value so each input bit affects + // every output bit. + auto fmix64 = [](uint64_t x) { + x ^= x >> 33; + x *= 0xff51afd7ed558ccdULL; + x ^= x >> 33; + x *= 0xc4ceb9fe1a85ec53ULL; + x ^= x >> 33; + return x; }; - hash_bytes(weight.DataRaw(), weight.SizeInBytes()); - hash_bytes(scale.DataRaw(), scale.SizeInBytes()); + // Fold each segment's full 128-bit hash into the 64-bit accumulator and carry the whole accumulator + // forward, not just a 32-bit seed. Every bit of weight/scale/zero_point/params therefore reaches the + // id, so collision resistance tracks the 64-bit id width instead of the ~2^32 a chain forwarding only + // hash[0] would give. A collision would let one weight group adopt another's already-packed buffer and + // silently compute a wrong result, so the wider margin is worth the few extra mixing ops. 
+ uint64_t acc = 0; + auto mix = [&acc, &fmix64](const void* data, size_t len) { + uint32_t h[4]; + MurmurHash3::x86_128(data, len, static_cast(acc), h); + acc = fmix64(acc ^ ((static_cast(h[1]) << 32) | h[0])); + acc = fmix64(acc ^ ((static_cast(h[3]) << 32) | h[2])); + }; + mix(weight.DataRaw(), weight.SizeInBytes()); + mix(scale.DataRaw(), scale.SizeInBytes()); if (zero_point) { - hash_bytes(zero_point->DataRaw(), zero_point->SizeInBytes()); + mix(zero_point->DataRaw(), zero_point->SizeInBytes()); } const int64_t params[] = {N, K, block_size, bits, accuracy_level}; - hash_bytes(params, sizeof(params)); - return "MatMulNBits.DQ:" + std::to_string((static_cast(hash[1]) << 32) | hash[0]); + mix(params, sizeof(params)); + return "MatMulNBits.DQ:" + std::to_string(acc); } } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 385aa7ffebc66..0dbfc02423b77 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -4,6 +4,7 @@ // Unit tests for the DQMatMulNBitsFusion graph transformer. // Tests Pattern 1: DQ(3D,axis=2)->Reshape->Transpose([1,0])->[Cast]->MatMul/Gemm -> MatMulNBits // Tests Pattern 2: DQ(2D,axis=0)->MatMul/Gemm -> MatMulNBits + #include #include "core/common/span_utils.h" #include "core/framework/int4.h" @@ -361,9 +362,10 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) { } // Validates the cross-session-sharing tag the fusion attaches to the generated B weight. The tag is a -// stable, content-derived identity: identical source quantization groups must yield the SAME identity -// (so two sessions optimizing the same model share the pre-packed B), while any semantic difference -- -// here, different zero points -- must yield a DIFFERENT identity (so they must not falsely share). 
+// stable, content-derived enrollment identity: identical source quantization groups yield the SAME +// identity, while a semantic difference -- here, different zero points -- yields a DIFFERENT identity. +// (The tag only enrolls B into the shared container; the actual sharing is keyed by the packed-bytes +// hash, so a stable, content-distinct tag just keeps enrollment deterministic across sessions.) TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -463,8 +465,11 @@ static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeig } // End-to-end: two sessions optimizing the same DQ+MatMul model share the fused MatMulNBits B weight -// through a common container WITHOUT any session option -- the fusion tags it and SessionState enrolls -// it by that identity. A session over a model that differs only in its zero points must NOT share. +// through a common container WITHOUT any session option -- the fusion tags it to enroll it, and +// SessionState keys the sharing by the packed-bytes hash. A model whose quantized weight differs packs +// to different bytes, so it gets a different key and must NOT share. (A zero-point-only difference is +// intentionally NOT used: on the CompFp32 path the zero points are not folded into the packed B, so two +// such models pack identically and would correctly share a byte-identical buffer.) TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -473,20 +478,24 @@ TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { for (size_t i = 0; i < weight.size(); ++i) { weight[i] = static_cast(i % 16); } + // A different quantized weight -> different packed B on every compute type (unlike a zp-only change). 
+ std::vector weight_other(weight.size()); + for (size_t i = 0; i < weight_other.size(); ++i) { + weight_other[i] = static_cast((i + 7) % 16); + } std::vector scale(static_cast(N * num_blocks)); for (size_t i = 0; i < scale.size(); ++i) { scale[i] = 0.1f + 0.01f * static_cast(i % 10); } - std::vector zp_a(static_cast(N * num_blocks), 3); - std::vector zp_b(zp_a.size(), 5); // differs only in zero points + std::vector zp(static_cast(N * num_blocks), 3); - std::string model_a, model_b; - SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_a, model_a); - SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_b, model_b); + std::string model_a, model_other; + SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp, model_a); + SerializeDQMatMulModel(M, N, K, block_size, weight_other, scale, zp, model_other); PrepackedWeightsContainer container; - bool fused1 = false, fused2 = false, fused_b = false; - size_t used1 = 0, used2 = 0, used_b = 0; + bool fused1 = false, fused2 = false, fused_other = false; + size_t used1 = 0, used2 = 0, used_other = 0; RunSharedFusionSession(model_a, container, fused1, used1); ASSERT_TRUE(fused1) << "DQ -> MatMulNBits fusion did not run"; @@ -500,10 +509,11 @@ TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { ASSERT_TRUE(fused2); EXPECT_GT(used2, static_cast(0)); - // A model differing only in zero points has a different identity and must NOT reuse the buffer. - RunSharedFusionSession(model_b, container, fused_b, used_b); - ASSERT_TRUE(fused_b); - EXPECT_EQ(used_b, static_cast(0)); + // A model with a different quantized weight packs to different bytes -> different key, so it must NOT + // reuse the buffer (on any compute type). 
+ RunSharedFusionSession(model_other, container, fused_other, used_other); + ASSERT_TRUE(fused_other); + EXPECT_EQ(used_other, static_cast(0)); } TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_WithDefaultZP8) { diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index d16707e9a9ad4..14d0ade2ffa64 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1471,8 +1471,11 @@ TEST(QDQTransformerTests, DQGemmNotConvertedToMatMulNBits_Alpha) { // --------------------------------------------------------------------------- // DQMatMulToMatMulNBitsAction (in the QDQ selector/action transformer) runs without the // session.enable_dq_matmulnbits_fusion flag and synthesizes the MatMulNBits B/scales/zp initializers -// with names that are NOT stable across sessions. It tags the generated B weight with a stable, -// content-derived identity that SessionState uses to share the pre-packed buffer across sessions. +// with names that are NOT stable across sessions. It tags the generated B weight with a sharing +// identity that SessionState treats as the enrollment signal opting the buffer into the cross-session +// container; the actual sharing is keyed by the packed-bytes hash (only byte-identical packed buffers +// are reused, exactly like the AddInitializer path), so packings that differ by compute type/options +// are never falsely shared. // Packs uint4 nibble values (row-major, 2 per byte) into UInt4x2 storage. 
static std::vector PackUint4Nibbles(const std::vector& values) { @@ -1558,8 +1561,9 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh } // Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived -// identity: identical quantization data yields the SAME identity, while different zero points yield a -// DIFFERENT identity (so two models differing only in zp must not falsely share a pre-packed buffer). +// enrollment identity: identical quantization data yields the SAME identity, while different zero points +// yield a DIFFERENT identity. (The tag only enrolls the buffer for sharing; the container keys by the +// packed-bytes hash. A stable, content-distinct tag keeps enrollment deterministic across sessions.) TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1597,8 +1601,11 @@ TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdenti } // End-to-end: two sessions converting the same model via the default path share the MatMulNBits B -// pre-packed buffer through a common container (no session option). A model differing only in zero -// points has a different identity and must not reuse the buffer. +// pre-packed buffer through a common container (no session option). A model whose quantized weight +// differs packs to different bytes -> different container key, so it must not reuse the buffer. (A +// zero-point-only difference is intentionally NOT used here: on the CompFp32 path the zero points are +// applied at compute time and left out of the packed B, so two such models pack identically and would +// correctly share -- packed-bytes keying only ever reuses byte-identical buffers.) 
TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1607,21 +1614,25 @@ TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { for (size_t i = 0; i < weight.size(); ++i) { weight[i] = static_cast(i % 16); } + // A different quantized weight -> different packed B on every compute type (unlike a zp-only change). + std::vector weight_other(weight.size()); + for (size_t i = 0; i < weight_other.size(); ++i) { + weight_other[i] = static_cast((i + 7) % 16); + } std::vector scale(static_cast(num_blocks * N)); for (size_t i = 0; i < scale.size(); ++i) { scale[i] = 0.1f + 0.01f * static_cast(i % 10); } - std::vector zp_a(static_cast(num_blocks * N), 3); - std::vector zp_b(zp_a.size(), 5); // differs only in zero points + std::vector zp(static_cast(num_blocks * N), 3); - std::string model_a, model_b; - SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_a, model_a); - SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_b, model_b); + std::string model_a, model_other; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_a); + SerializeDefaultPathModel(M, N, K, block_size, weight_other, scale, zp, model_other); PrepackedWeightsContainer container; - bool produced1 = false, produced2 = false, produced_b = false; - std::string tag1, tag2, tag_b; - size_t used1 = 0, used2 = 0, used_b = 0; + bool produced1 = false, produced2 = false, produced_other = false; + std::string tag1, tag2, tag_other; + size_t used1 = 0, used2 = 0, used_other = 0; RunDefaultPathSession(model_a, container, produced1, tag1, used1); ASSERT_TRUE(produced1) << "DQ -> MatMulNBits conversion did not run on the default path"; @@ -1635,16 +1646,19 @@ TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { ASSERT_TRUE(produced2); EXPECT_GT(used2, static_cast(0)); - // A model differing only in zero points must 
NOT reuse the buffer. - RunDefaultPathSession(model_b, container, produced_b, tag_b, used_b); - ASSERT_TRUE(produced_b); - EXPECT_EQ(used_b, static_cast(0)); + // A model with a different quantized weight packs to different bytes -> different key, so it must NOT + // reuse the buffer (on any compute type). + RunDefaultPathSession(model_other, container, produced_other, tag_other, used_other); + ASSERT_TRUE(produced_other); + EXPECT_EQ(used_other, static_cast(0)); } -// The sharing identity includes accuracy_level, so the same weights compiled for different compute -// types (e.g. CompFp32 at level 0 vs CompInt8 at level 4) get DIFFERENT identities and must not share -// a pre-packed buffer whose layout depends on that compute type. -TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelDoesNotShare) { +// accuracy_level participates in the enrollment identity, so the same weights requested at different +// accuracy levels get distinct identities. Whether the two sessions then share the packed buffer is +// platform-dependent (level 4 may pack as CompInt8 -- different bytes, no share -- or fall back to the +// same CompFp32 packing as level 0 and benignly reuse the byte-identical buffer); packed-bytes keying +// makes either outcome safe, so this asserts the identity is distinct, not a fixed sharing count. 
+TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1675,8 +1689,7 @@ TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelDoesNotShare) { ASSERT_FALSE(tag0.empty()); ASSERT_FALSE(tag4.empty()); - EXPECT_NE(tag0, tag4); // accuracy_level participates in the identity - EXPECT_EQ(used4, static_cast(0)); // different identity => no cross-accuracy sharing + EXPECT_NE(tag0, tag4); // accuracy_level participates in the enrollment identity } #endif // !defined(DISABLE_CONTRIB_OPS) From 1bb7bd1c14db0a3395e6b350feb17aa1af6308dc Mon Sep 17 00:00:00 2001 From: derdeljan-msft Date: Thu, 25 Jun 2026 10:07:04 +0200 Subject: [PATCH 08/13] Update onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 0dbfc02423b77..7fd842ab0da83 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -4,7 +4,7 @@ // Unit tests for the DQMatMulNBitsFusion graph transformer. 
// Tests Pattern 1: DQ(3D,axis=2)->Reshape->Transpose([1,0])->[Cast]->MatMul/Gemm -> MatMulNBits // Tests Pattern 2: DQ(2D,axis=0)->MatMul/Gemm -> MatMulNBits - #include +#include #include "core/common/span_utils.h" #include "core/framework/int4.h" From 926a6de0783703ee2fe15bcc2600d27f49328f0b Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 12:25:22 +0200 Subject: [PATCH 09/13] debug: Try to disable newly introduced pre-packed weight sharing tests --- onnxruntime/test/contrib_ops/matmul_2bits_test.cc | 3 ++- onnxruntime/test/contrib_ops/matmul_4bits_test.cc | 6 ++++-- onnxruntime/test/contrib_ops/matmul_8bits_test.cc | 6 ++++-- onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc | 6 ++++-- .../test/optimizer/qdq_matmulnbits_transformer_test.cc | 9 ++++++--- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index 8e133caa15d55..c0d2a57913f9b 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -474,7 +474,8 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_Batch32_256x256_Bias) { // dereferencing the null pointer when mlas.use_lut_gemm=1. This drives mlas.use_lut_gemm=1 together with // session.save_external_prepacked_constant_initializers=1 and a non-empty optimized_model_filepath, and // asserts that initialization (which performs the save) and a subsequent run both succeed. -TEST(MatMulNBitsLutGemm, Float32_2Bits_PrepackSaveDoesNotCrash) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
+TEST(MatMulNBitsLutGemm, DISABLED_Float32_2Bits_PrepackSaveDoesNotCrash) { constexpr int64_t M = 1, N = 128, K = 128, block_size = 32; if (!MlasIsLutGemmAvailable(static_cast(N), static_cast(K), 2, static_cast(block_size))) { GTEST_SKIP() << "LUT GEMM not available on this platform"; diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index dd5cfb73dfe31..c820c64d838ff 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -638,7 +638,8 @@ TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int // Legacy sharing path: the weight B is registered as a shared initializer via // SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. -TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. +TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_AddInitializer) { for (bool has_zero_point : {false, true}) { for (bool has_bias : {false, true}) { RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, @@ -651,7 +652,8 @@ TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { // Negative control: with the shared container present but neither opt-in mechanism enabled, no // pre-packed weights are shared across sessions. -TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
+TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_NotSharedWithoutOptIn) { RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index 411e83536c190..92cfc5e5bb679 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -712,7 +712,8 @@ TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_p // Legacy sharing path for 8-bit weights: B is registered as a shared initializer via // SessionOptions::AddInitializer. -TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. +TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_8b_AddInitializer) { for (bool has_zero_point : {false, true}) { for (bool has_bias : {false, true}) { RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, @@ -725,7 +726,8 @@ TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { // Negative control for 8-bit weights: with the shared container present but neither opt-in mechanism // enabled, no pre-packed weights are shared across sessions. -TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
+TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 7fd842ab0da83..78db0d0799962 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -366,7 +366,8 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) { // identity, while a semantic difference -- here, different zero points -- yields a DIFFERENT identity. // (The tag only enrolls B into the shared container; the actual sharing is keyed by the packed-bytes // hash, so a stable, content-distinct tag just keeps enrollment deterministic across sessions.) -TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. +TEST_F(DQMatMulNBitsFusionTest, DISABLED_TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -470,7 +471,8 @@ static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeig // to different bytes, so it gets a different key and must NOT share. (A zero-point-only difference is // intentionally NOT used: on the CompFp32 path the zero points are not folded into the packed B, so two // such models pack identically and would correctly share a byte-identical buffer.) -TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
+TEST_F(DQMatMulNBitsFusionTest, DISABLED_SharesFusedWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index 14d0ade2ffa64..a6aed1801a9fc 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1564,7 +1564,8 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh // enrollment identity: identical quantization data yields the SAME identity, while different zero points // yield a DIFFERENT identity. (The tag only enrolls the buffer for sharing; the container keys by the // packed-bytes hash. A stable, content-distinct tag keeps enrollment deterministic across sessions.) -TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. +TEST(QDQTransformerTests, DISABLED_DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1606,7 +1607,8 @@ TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdenti // zero-point-only difference is intentionally NOT used here: on the CompFp32 path the zero points are // applied at compute time and left out of the packed B, so two such models pack identically and would // correctly share -- packed-bytes keying only ever reuses byte-identical buffers.) -TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
+TEST(QDQTransformerTests, DISABLED_DefaultPath_SharesWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1658,7 +1660,8 @@ TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { // platform-dependent (level 4 may pack as CompInt8 -- different bytes, no share -- or fall back to the // same CompFp32 packing as level 0 and benignly reuse the byte-identical buffer); packed-bytes keying // makes either outcome safe, so this asserts the identity is distinct, not a fixed sharing count. -TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { +// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. +TEST(QDQTransformerTests, DISABLED_DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; From 42b07b73b4503c7bc954c8b756182f66146aa38a Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 14:01:55 +0200 Subject: [PATCH 10/13] Re-enable and reduce new tests --- .../test/contrib_ops/matmul_2bits_test.cc | 3 +- .../test/contrib_ops/matmul_4bits_test.cc | 34 +++++++++---------- .../test/contrib_ops/matmul_8bits_test.cc | 29 ++++++++-------- .../optimizer/dq_matmulnbits_fusion_test.cc | 6 ++-- .../qdq_matmulnbits_transformer_test.cc | 9 ++--- 5 files changed, 37 insertions(+), 44 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc index c0d2a57913f9b..8e133caa15d55 100644 --- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc @@ -474,8 +474,7 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_Batch32_256x256_Bias) { // dereferencing the null pointer when mlas.use_lut_gemm=1. 
This drives mlas.use_lut_gemm=1 together with // session.save_external_prepacked_constant_initializers=1 and a non-empty optimized_model_filepath, and // asserts that initialization (which performs the save) and a subsequent run both succeed. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(MatMulNBitsLutGemm, DISABLED_Float32_2Bits_PrepackSaveDoesNotCrash) { +TEST(MatMulNBitsLutGemm, Float32_2Bits_PrepackSaveDoesNotCrash) { constexpr int64_t M = 1, N = 128, K = 128, block_size = 32; if (!MlasIsLutGemmAvailable(static_cast(N), static_cast(K), 2, static_cast(block_size))) { GTEST_SKIP() << "LUT GEMM not available on this platform"; diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index c820c64d838ff..ec03f5ff90101 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -637,28 +637,28 @@ TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int } // namespace // Legacy sharing path: the weight B is registered as a shared initializer via -// SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_AddInitializer) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kAddInitializer)); - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, - has_bias, PrepackSharingMode::kAddInitializer)); - } - } +// SessionOptions::AddInitializer. 
Pre-packed B sharing depends on the activation dtype path and on the +// zero points (which are folded into the packed B), but not on bias (bias is not pre-packed). Cover both +// dtypes x both zero-point states with bias varied across the cases, rather than the full zp x bias x +// dtype cross-product, to limit the number of InferenceSession constructions (the dominant memory cost +// under AddressSanitizer). +TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, + /*has_bias*/ true, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ false, + /*has_bias*/ false, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ false, + /*has_bias*/ true, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, + /*has_bias*/ false, PrepackSharingMode::kAddInitializer)); } // Negative control: with the shared container present but neither opt-in mechanism enabled, no -// pre-packed weights are shared across sessions. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_NotSharedWithoutOptIn) { +// pre-packed weights are shared across sessions. Opt-in gating is independent of dtype/zp/bias, so a +// single representative case suffices (keeping InferenceSession constructions low under AddressSanitizer). 
+TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, - /*has_zero_point*/ false, /*has_bias*/ false, - PrepackSharingMode::kNoSharing)); } #endif // !ENABLE_TRAINING diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index 92cfc5e5bb679..74e4b116e3dfc 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -711,27 +711,26 @@ TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_p } // namespace // Legacy sharing path for 8-bit weights: B is registered as a shared initializer via -// SessionOptions::AddInitializer. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_8b_AddInitializer) { - for (bool has_zero_point : {false, true}) { - for (bool has_bias : {false, true}) { - RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, +// SessionOptions::AddInitializer. As in the 4-bit SharedPrepackedWeights_AddInitializer test, cover both +// dtypes x both zero-point states (zero points are folded into the packed B) with bias varied across the +// cases, rather than the full cross-product, to limit InferenceSession constructions under AddressSanitizer. 
+TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, + PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ true, + PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ false, PrepackSharingMode::kAddInitializer)); - RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, - PrepackSharingMode::kAddInitializer)); - } - } } // Negative control for 8-bit weights: with the shared container present but neither opt-in mechanism -// enabled, no pre-packed weights are shared across sessions. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(MatMulNBits, DISABLED_SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { +// enabled, no pre-packed weights are shared across sessions. Opt-in gating is independent of dtype/zp/bias, +// so a single representative case suffices (keeping InferenceSession constructions low under AddressSanitizer). 
+TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); - RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, - PrepackSharingMode::kNoSharing)); } #endif // !ENABLE_TRAINING #endif // !USE_CUDA && !USE_WEBGPU diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 78db0d0799962..7fd842ab0da83 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc +++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -366,8 +366,7 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) { // identity, while a semantic difference -- here, different zero points -- yields a DIFFERENT identity. // (The tag only enrolls B into the shared container; the actual sharing is keyed by the packed-bytes // hash, so a stable, content-distinct tag just keeps enrollment deterministic across sessions.) -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST_F(DQMatMulNBitsFusionTest, DISABLED_TagsGeneratedWeightWithStableContentIdentity) { +TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -471,8 +470,7 @@ static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeig // to different bytes, so it gets a different key and must NOT share. (A zero-point-only difference is // intentionally NOT used: on the CompFp32 path the zero points are not folded into the packed B, so two // such models pack identically and would correctly share a byte-identical buffer.) -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
-TEST_F(DQMatMulNBitsFusionTest, DISABLED_SharesFusedWeightAcrossSessionsViaTag) { +TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index a6aed1801a9fc..14d0ade2ffa64 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1564,8 +1564,7 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh // enrollment identity: identical quantization data yields the SAME identity, while different zero points // yield a DIFFERENT identity. (The tag only enrolls the buffer for sharing; the container keys by the // packed-bytes hash. A stable, content-distinct tag keeps enrollment deterministic across sessions.) -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(QDQTransformerTests, DISABLED_DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { +TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1607,8 +1606,7 @@ TEST(QDQTransformerTests, DISABLED_DefaultPath_TagsGeneratedWeightWithStableCont // zero-point-only difference is intentionally NOT used here: on the CompFp32 path the zero points are // applied at compute time and left out of the packed B, so two such models pack identically and would // correctly share -- packed-bytes keying only ever reuses byte-identical buffers.) -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. 
-TEST(QDQTransformerTests, DISABLED_DefaultPath_SharesWeightAcrossSessionsViaTag) { +TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; @@ -1660,8 +1658,7 @@ TEST(QDQTransformerTests, DISABLED_DefaultPath_SharesWeightAcrossSessionsViaTag) // platform-dependent (level 4 may pack as CompInt8 -- different bytes, no share -- or fall back to the // same CompFp32 packing as level 0 and benignly reuse the byte-identical buffer); packed-bytes keying // makes either outcome safe, so this asserts the identity is distinct, not a fixed sharing count. -// DISABLED_ to verify the Windows x64 ASan CI OOM is not caused by the new MatMulNBits tests. -TEST(QDQTransformerTests, DISABLED_DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { +TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; const int64_t num_blocks = K / block_size; From a4b9a469d63ee30b6b419e2d1686b63beb006416 Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 16:18:17 +0200 Subject: [PATCH 11/13] Reduce the number of tests ran in CI --- .../qdq_matmulnbits_transformer_test.cc | 86 +------------------ 1 file changed, 1 insertion(+), 85 deletions(-) diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index 14d0ade2ffa64..e5ef1b69f98a3 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1532,13 +1532,8 @@ static void SerializeDefaultPathModel(int64_t M, int64_t N, int64_t K, int64_t b // Reports whether a MatMulNBits was produced, the sharing identity tagged onto its B weight, and how // many pre-packed weights this session served from the container. 
static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeightsContainer& container, - bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count, - int accuracy_level = -1) { + bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count) { SessionOptions so; - if (accuracy_level >= 0) { - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, - std::to_string(accuracy_level).c_str())); - } InferenceSessionWrapper session{so, GetEnvironment()}; ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); @@ -1560,46 +1555,6 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); } -// Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived -// enrollment identity: identical quantization data yields the SAME identity, while different zero points -// yield a DIFFERENT identity. (The tag only enrolls the buffer for sharing; the container keys by the -// packed-bytes hash. A stable, content-distinct tag keeps enrollment deterministic across sessions.) 
-TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { - constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; - const int64_t num_blocks = K / block_size; - - std::vector weight(static_cast(K * N)); - for (size_t i = 0; i < weight.size(); ++i) { - weight[i] = static_cast(i % 16); - } - std::vector scale(static_cast(num_blocks * N)); - for (size_t i = 0; i < scale.size(); ++i) { - scale[i] = 0.1f + 0.01f * static_cast(i % 10); - } - std::vector zp_a(static_cast(num_blocks * N), 3); - std::vector zp_b(zp_a.size(), 5); - - auto tag_for = [&](const std::vector& zp) -> std::string { - std::string model_bytes; - SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); - PrepackedWeightsContainer container; - bool produced = false; - std::string tag; - size_t used = 0; - RunDefaultPathSession(model_bytes, container, produced, tag, used); - EXPECT_TRUE(produced) << "DQ -> MatMulNBits conversion did not run on the default path"; - return tag; - }; - - const std::string id_a1 = tag_for(zp_a); - const std::string id_a2 = tag_for(zp_a); - const std::string id_b = tag_for(zp_b); - - ASSERT_FALSE(id_a1.empty()) << "generated B weight was not tagged for cross-session sharing"; - EXPECT_EQ(id_a1, id_a2); // stable: identical quantization data -> identical identity - EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity -} - // End-to-end: two sessions converting the same model via the default path share the MatMulNBits B // pre-packed buffer through a common container (no session option). A model whose quantized weight // differs packs to different bytes -> different container key, so it must not reuse the buffer. 
(A @@ -1653,45 +1608,6 @@ TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { EXPECT_EQ(used_other, static_cast(0)); } -// accuracy_level participates in the enrollment identity, so the same weights requested at different -// accuracy levels get distinct identities. Whether the two sessions then share the packed buffer is -// platform-dependent (level 4 may pack as CompInt8 -- different bytes, no share -- or fall back to the -// same CompFp32 packing as level 0 and benignly reuse the byte-identical buffer); packed-bytes keying -// makes either outcome safe, so this asserts the identity is distinct, not a fixed sharing count. -TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { - constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; - const int64_t num_blocks = K / block_size; - - std::vector weight(static_cast(K * N)); - for (size_t i = 0; i < weight.size(); ++i) { - weight[i] = static_cast(i % 16); - } - std::vector scale(static_cast(num_blocks * N)); - for (size_t i = 0; i < scale.size(); ++i) { - scale[i] = 0.1f + 0.01f * static_cast(i % 10); - } - std::vector zp(static_cast(num_blocks * N), 3); - - std::string model_bytes; - SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); - - PrepackedWeightsContainer container; - bool produced0 = false, produced4 = false; - std::string tag0, tag4; - size_t used0 = 0, used4 = 0; - - RunDefaultPathSession(model_bytes, container, produced0, tag0, used0, /*accuracy_level*/ 0); - ASSERT_TRUE(produced0) << "DQ -> MatMulNBits conversion did not run on the default path"; - - // Same model/weights, different accuracy level, sharing the same container. 
- RunDefaultPathSession(model_bytes, container, produced4, tag4, used4, /*accuracy_level*/ 4); - ASSERT_TRUE(produced4); - - ASSERT_FALSE(tag0.empty()); - ASSERT_FALSE(tag4.empty()); - EXPECT_NE(tag0, tag4); // accuracy_level participates in the enrollment identity -} - #endif // !defined(DISABLE_CONTRIB_OPS) } // namespace test From 14dd3a3a7bd7020c398feee9f7c348d88a05e8bb Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 18:57:32 +0200 Subject: [PATCH 12/13] Restore full MatMulNBits prepack-sharing tests; keep --test_parallel 2 for ASan CI Reverts the debug DISABLED_ and test-count reductions (commits 926a6de..a4b9a46) back to the pre-hack state at 1bb7bd1, so CI runs the full test set with reduced ctest parallelism only. --- .github/workflows/windows_build_x64_asan.yml | 2 +- .../test/contrib_ops/matmul_4bits_test.cc | 28 +++--- .../test/contrib_ops/matmul_8bits_test.cc | 23 +++-- .../qdq_matmulnbits_transformer_test.cc | 86 ++++++++++++++++++- 4 files changed, 110 insertions(+), 29 deletions(-) diff --git a/.github/workflows/windows_build_x64_asan.yml b/.github/workflows/windows_build_x64_asan.yml index 116938b5129db..74ad951d91f3a 100644 --- a/.github/workflows/windows_build_x64_asan.yml +++ b/.github/workflows/windows_build_x64_asan.yml @@ -44,4 +44,4 @@ jobs: @echo off echo %PATH% python -m pip install -r "%GITHUB_WORKSPACE%\tools\ci_build/github/windows\python\requirements.txt" - python "%GITHUB_WORKSPACE%\tools\ci_build\build.py" --config Debug --build_dir "%RUNNER_TEMP%\build" --skip_submodule_sync --parallel --test_parallel 4 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_generator "Visual Studio 17 2022" --disable_memleak_checker --enable_address_sanitizer + python "%GITHUB_WORKSPACE%\tools\ci_build\build.py" --config Debug --build_dir "%RUNNER_TEMP%\build" --skip_submodule_sync --parallel --test_parallel 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_generator "Visual Studio 17 2022" --disable_memleak_checker 
--enable_address_sanitizer diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index ec03f5ff90101..dd5cfb73dfe31 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -637,28 +637,26 @@ TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int } // namespace // Legacy sharing path: the weight B is registered as a shared initializer via -// SessionOptions::AddInitializer. Pre-packed B sharing depends on the activation dtype path and on the -// zero points (which are folded into the packed B), but not on bias (bias is not pre-packed). Cover both -// dtypes x both zero-point states with bias varied across the cases, rather than the full zp x bias x -// dtype cross-product, to limit the number of InferenceSession constructions (the dominant memory cost -// under AddressSanitizer). +// SessionOptions::AddInitializer. Covers float and float16 activations, symmetric/asymmetric, +/- bias. 
TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) { - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, - /*has_bias*/ true, PrepackSharingMode::kAddInitializer)); - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ false, - /*has_bias*/ false, PrepackSharingMode::kAddInitializer)); - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ false, - /*has_bias*/ true, PrepackSharingMode::kAddInitializer)); - RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, - /*has_bias*/ false, PrepackSharingMode::kAddInitializer)); + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, has_zero_point, + has_bias, PrepackSharingMode::kAddInitializer)); + } + } } // Negative control: with the shared container present but neither opt-in mechanism enabled, no -// pre-packed weights are shared across sessions. Opt-in gating is independent of dtype/zp/bias, so a -// single representative case suffices (keeping InferenceSession constructions low under AddressSanitizer). +// pre-packed weights are shared across sessions. 
TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) { RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); + RunTest(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, + /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); } #endif // !ENABLE_TRAINING diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index 74e4b116e3dfc..411e83536c190 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -711,26 +711,25 @@ TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_p } // namespace // Legacy sharing path for 8-bit weights: B is registered as a shared initializer via -// SessionOptions::AddInitializer. As in the 4-bit SharedPrepackedWeights_AddInitializer test, cover both -// dtypes x both zero-point states (zero points are folded into the packed B) with bias varied across the -// cases, rather than the full cross-product, to limit InferenceSession constructions under AddressSanitizer. +// SessionOptions::AddInitializer. 
TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) { - RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, - PrepackSharingMode::kAddInitializer)); - RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, - PrepackSharingMode::kAddInitializer)); - RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ true, - PrepackSharingMode::kAddInitializer)); - RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ false, + for (bool has_zero_point : {false, true}) { + for (bool has_bias : {false, true}) { + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, PrepackSharingMode::kAddInitializer)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, has_zero_point, has_bias, + PrepackSharingMode::kAddInitializer)); + } + } } // Negative control for 8-bit weights: with the shared container present but neither opt-in mechanism -// enabled, no pre-packed weights are shared across sessions. Opt-in gating is independent of dtype/zp/bias, -// so a single representative case suffices (keeping InferenceSession constructions low under AddressSanitizer). +// enabled, no pre-packed weights are shared across sessions. 
TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) { RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, PrepackSharingMode::kNoSharing)); + RunTest8Bits(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, + PrepackSharingMode::kNoSharing)); } #endif // !ENABLE_TRAINING #endif // !USE_CUDA && !USE_WEBGPU diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index e5ef1b69f98a3..14d0ade2ffa64 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1532,8 +1532,13 @@ static void SerializeDefaultPathModel(int64_t M, int64_t N, int64_t K, int64_t b // Reports whether a MatMulNBits was produced, the sharing identity tagged onto its B weight, and how // many pre-packed weights this session served from the container. static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeightsContainer& container, - bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count) { + bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count, + int accuracy_level = -1) { SessionOptions so; + if (accuracy_level >= 0) { + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str())); + } InferenceSessionWrapper session{so, GetEnvironment()}; ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container)); ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast(model_bytes.size()))); @@ -1555,6 +1560,46 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter(); } +// Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived +// enrollment identity: 
identical quantization data yields the SAME identity, while different zero points +// yield a DIFFERENT identity. (The tag only enrolls the buffer for sharing; the container keys by the +// packed-bytes hash. A stable, content-distinct tag keeps enrollment deterministic across sessions.) +TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp_a(static_cast(num_blocks * N), 3); + std::vector zp_b(zp_a.size(), 5); + + auto tag_for = [&](const std::vector& zp) -> std::string { + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + PrepackedWeightsContainer container; + bool produced = false; + std::string tag; + size_t used = 0; + RunDefaultPathSession(model_bytes, container, produced, tag, used); + EXPECT_TRUE(produced) << "DQ -> MatMulNBits conversion did not run on the default path"; + return tag; + }; + + const std::string id_a1 = tag_for(zp_a); + const std::string id_a2 = tag_for(zp_a); + const std::string id_b = tag_for(zp_b); + + ASSERT_FALSE(id_a1.empty()) << "generated B weight was not tagged for cross-session sharing"; + EXPECT_EQ(id_a1, id_a2); // stable: identical quantization data -> identical identity + EXPECT_NE(id_a1, id_b); // collision-safe: different zero points -> different identity +} + // End-to-end: two sessions converting the same model via the default path share the MatMulNBits B // pre-packed buffer through a common container (no session option). 
A model whose quantized weight // differs packs to different bytes -> different container key, so it must not reuse the buffer. (A @@ -1608,6 +1653,45 @@ TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) { EXPECT_EQ(used_other, static_cast(0)); } +// accuracy_level participates in the enrollment identity, so the same weights requested at different +// accuracy levels get distinct identities. Whether the two sessions then share the packed buffer is +// platform-dependent (level 4 may pack as CompInt8 -- different bytes, no share -- or fall back to the +// same CompFp32 packing as level 0 and benignly reuse the byte-identical buffer); packed-bytes keying +// makes either outcome safe, so this asserts the identity is distinct, not a fixed sharing count. +TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelGetsDistinctIdentity) { + constexpr int64_t M = 4, N = 8, K = 32, block_size = 16; + const int64_t num_blocks = K / block_size; + + std::vector weight(static_cast(K * N)); + for (size_t i = 0; i < weight.size(); ++i) { + weight[i] = static_cast(i % 16); + } + std::vector scale(static_cast(num_blocks * N)); + for (size_t i = 0; i < scale.size(); ++i) { + scale[i] = 0.1f + 0.01f * static_cast(i % 10); + } + std::vector zp(static_cast(num_blocks * N), 3); + + std::string model_bytes; + SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes); + + PrepackedWeightsContainer container; + bool produced0 = false, produced4 = false; + std::string tag0, tag4; + size_t used0 = 0, used4 = 0; + + RunDefaultPathSession(model_bytes, container, produced0, tag0, used0, /*accuracy_level*/ 0); + ASSERT_TRUE(produced0) << "DQ -> MatMulNBits conversion did not run on the default path"; + + // Same model/weights, different accuracy level, sharing the same container. 
+ RunDefaultPathSession(model_bytes, container, produced4, tag4, used4, /*accuracy_level*/ 4); + ASSERT_TRUE(produced4); + + ASSERT_FALSE(tag0.empty()); + ASSERT_FALSE(tag4.empty()); + EXPECT_NE(tag0, tag4); // accuracy_level participates in the enrollment identity +} + #endif // !defined(DISABLE_CONTRIB_OPS) } // namespace test From a074c775b505fdb8a73143bd9a8922531c067213 Mon Sep 17 00:00:00 2001 From: Dusan Erdeljan Date: Thu, 25 Jun 2026 22:22:41 +0200 Subject: [PATCH 13/13] Limit intra op thread pool for prepacking tests to 1 --- .github/workflows/windows_build_x64_asan.yml | 2 +- onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc | 5 +++++ .../test/optimizer/qdq_matmulnbits_transformer_test.cc | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/windows_build_x64_asan.yml b/.github/workflows/windows_build_x64_asan.yml index 74ad951d91f3a..116938b5129db 100644 --- a/.github/workflows/windows_build_x64_asan.yml +++ b/.github/workflows/windows_build_x64_asan.yml @@ -44,4 +44,4 @@ jobs: @echo off echo %PATH% python -m pip install -r "%GITHUB_WORKSPACE%\tools\ci_build/github/windows\python\requirements.txt" - python "%GITHUB_WORKSPACE%\tools\ci_build\build.py" --config Debug --build_dir "%RUNNER_TEMP%\build" --skip_submodule_sync --parallel --test_parallel 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_generator "Visual Studio 17 2022" --disable_memleak_checker --enable_address_sanitizer + python "%GITHUB_WORKSPACE%\tools\ci_build\build.py" --config Debug --build_dir "%RUNNER_TEMP%\build" --skip_submodule_sync --parallel --test_parallel 4 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_generator "Visual Studio 17 2022" --disable_memleak_checker --enable_address_sanitizer diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc index 7fd842ab0da83..47e08802c9e20 100644 --- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc
+++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc @@ -447,6 +447,11 @@ static void SerializeDQMatMulModel(int64_t M, int64_t N, int64_t K, int64_t bloc static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeightsContainer& container, bool& produced_matmulnbits, size_t& used_shared_count) { SessionOptions so; + // This test exercises prepack-weight sharing, not parallel execution. Cap the intra-op thread pool + // to a single thread so we don't spin up one worker per core: under AddressSanitizer each thread adds + // fake-stack and thread-local allocator overhead, which on a high-core CI runner multiplies across the + // sessions every test creates (the sibling SessionStatePrepackingTest caps it for the same reason). + so.intra_op_param.thread_pool_size = 1; ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsEnableDQMatMulNBitsFusion, "1")); InferenceSessionWrapper session{so, GetEnvironment()}; diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index 14d0ade2ffa64..b53577a81ff4a 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -1535,6 +1535,11 @@ static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeigh bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count, int accuracy_level = -1) { SessionOptions so; + // This test exercises prepack-weight sharing, not parallel execution. Cap the intra-op thread pool + // to a single thread so we don't spin up one worker per core: under AddressSanitizer each thread adds + // fake-stack and thread-local allocator overhead, which on a high-core CI runner multiplies across the + // sessions every test creates (the sibling SessionStatePrepackingTest caps it for the same reason). 
+ so.intra_op_param.thread_pool_size = 1; if (accuracy_level >= 0) { ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, std::to_string(accuracy_level).c_str()));