diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 815fc6aa69a60..e26d22558c1d1 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1608,6 +1608,18 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
     return *prepacked_weights_for_graph_;
   }
 
+  // Records a stable, content-derived sharing identity for a fusion-generated initializer. Such names
+  // are not stable across sessions, so SessionState keys cross-session pre-pack sharing on this id.
+  void SetSharedPrepackInitializerId(const std::string& initializer_name, std::string share_id) {
+    generated_shared_prepack_ids_.insert_or_assign(initializer_name, std::move(share_id));
+  }
+
+  // Looks up the sharing identity recorded for a generated initializer; nullptr means it was never tagged.
+  const std::string* GetSharedPrepackInitializerId(const std::string& initializer_name) const {
+    const auto entry = generated_shared_prepack_ids_.find(initializer_name);
+    return entry != generated_shared_prepack_ids_.end() ? &entry->second : nullptr;
+  }
+
   /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
   const Node* ParentNode() const { return parent_node_; }
 
@@ -2011,6 +2023,10 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   // This is optional due to delayed construction.
   std::optional<PrepackedWeightsForGraph> prepacked_weights_for_graph_;
 
+  // Maps a fusion-generated initializer name to its cross-session sharing identity.
+  // See SetSharedPrepackInitializerId.
+  InlinedHashMap<std::string, std::string> generated_shared_prepack_ids_;
+
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
   // Runtime optimization storage.
   // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index 6bd1690fca815..162d7257d0a4c 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -4,6 +4,7 @@
 #include "contrib_ops/cpu/quantization/matmul_nbits_impl.h"
 
 #include <cstdint>
+#include <cstring>
 #include <memory>
 #include <type_traits>
 
@@ -162,6 +163,13 @@ class MatMulNBits final : public OpKernel {
   const bool column_wise_quant_{true};
   IAllocatorUniquePtr<void> packed_b_{};
   size_t packed_b_size_{0};
+  // True once PrePack(InputIndex::B) has folded the scales and (constant) zero points into packed_b_,
+  // leaving the CompInt8 buffer fully packed and compute-ready. Pre-packed weight sharing
+  // content-hashes the buffer right after the B PrePack returns, so everything that affects the
+  // packed bytes (in particular the block sum / BZpCorr, which depend on the zero points) must be
+  // folded in by then. Once set, the later scales/zero_point PrePack calls must not pack again: the
+  // CompInt8 packing is single-shot, and the buffer may by then be one shared from another session.
+  bool packed_b_finalized_{false};
   IAllocatorUniquePtr<float> scales_fp32_{};
   IAllocatorUniquePtr<float> bias_fp32_{};
 
@@ -227,7 +235,6 @@ template <typename T1>
 Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
                                 /*out*/ bool& is_packed,
                                 /*out*/ PrePackedWeights* prepacked_weights) {
-  ORT_UNUSED_PARAMETER(prepacked_weights);
   is_packed = false;
   if (has_g_idx_) {
     return Status::OK();
@@ -308,10 +315,12 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
           static_cast<std::byte*>(packed_b_.get()),
           threadpool_ptr);
 
-      if (prepacked_weights != nullptr) {
-        prepacked_weights->buffers_.push_back(std::move(packed_b_));
-        prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
-      }
+      // Do not append packed_b_ here. Both the LUT and non-LUT branches share the single append
+      // after this if/else, so each records exactly one buffer. Appending here as well would move
+      // packed_b_ out now and then have the shared append record a second, moved-from/null buffer
+      // with a non-zero packed_b_size_. PrePackedWeights::GetHash() skips null buffers so sharing
+      // appears to work, but the prepacked-blob save path writes buffer_sizes_[i] bytes from
+      // buffers_[i].get() and would dereference that null pointer.
     } else {
       // For HQNBIT_CompInt8, route through SQNBIT_CompInt8 for sizing and packing.
       // This gets KleidiAI-sized buffer when available for 4-bit and packs B+scales correctly.
@@ -341,24 +350,64 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
       }
 
       packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
+      // The framework content-hashes this packed buffer to deduplicate pre-packed weights, both
+      // within a session and across sessions (the shared container). The session-state prepack pass
+      // (SessionState::PrepackConstantInitializedTensors) passes a non-null prepacked_weights on both
+      // the container and the default single-session paths, so this zero-fill runs on essentially
+      // every prepack at load, not only when a sharing container is configured -- the guard below
+      // only skips a caller that asks for no cacheable buffer. The pack routines need not write every
+      // byte (alignment padding between the CompInt8 sub-regions; any layout could gain padding) and
+      // the reserve allocation is not zero-filled, so the hash would otherwise depend on uninitialized
+      // bytes. Zeroing the whole buffer is a one-time O(packed_b_size_) load cost (the pack overwrites
+      // the data regions, leaving only padding zeroed); inference is unaffected.
+      if (prepacked_weights != nullptr) {
+        std::memset(packed_b_.get(), 0, packed_b_size_);
+      }
       MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, qptr, packed_b_.get(), scale_ptr,
                                   has_zp_input_, nullptr, threadpool_ptr, &mlas_backend_kernel_selector_config_);
 
-#if defined(MLAS_TARGET_ARM64)
-      // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales and zero_points are accessible.
-      if (compute_type_ == HQNBIT_CompInt8 && nbits_ == 4 && has_zp_input_ && scales_fp32_ &&
-          MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, SQNBIT_CompInt8, has_zp_input_, &mlas_backend_kernel_selector_config_)) {
-        const Tensor* zp_tensor = nullptr;
-        OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor);
-        if (zp_tensor != nullptr) {
-          auto zptr = zp_tensor->Data<uint8_t>();
-          MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(),
-                                      scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_);
+      // Fold the scales and (constant) zero points into packed_b_ now, during the B PrePack, instead
+      // of deferring them to the later scales/zero_points PrePack calls. Pre-packed weight sharing
+      // content-hashes this buffer immediately after the B PrePack returns; the CompInt8 block sum
+      // (and the KleidiAI BZpCorr) is a function of the zero points, so they must already be folded
+      // in for the hash to reflect them. Otherwise two initializers with identical B and scales but
+      // different zero points would hash equal and the second would wrongly adopt the first's buffer
+      // and silently compute wrong results. scales and zero_points are constant initializers, so they
+      // are available here. The B pack above only partially populates the buffer (on x64 the block sum
+      // is deferred; on ARM64 8-bit the scales are ignored during B packing), so issue one more pack
+      // call with QuantBData == nullptr to finalize it. This is byte-identical to the staged
+      // scales + zero_points packing it replaces.
+      bool finalize_scale_zp_into_packed_b = effective_compute_type == SQNBIT_CompInt8 && scale_ptr != nullptr;
+#if !defined(MLAS_TARGET_AMD64_IX86)
+      // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes
+      // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be
+      // passed to the packing routine, which would dereference the null QuantBData buffer.
+      finalize_scale_zp_into_packed_b =
+          finalize_scale_zp_into_packed_b &&
+          (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, effective_compute_type,
+                                                    has_zp_input_, &mlas_backend_kernel_selector_config_));
+#endif
+      if (finalize_scale_zp_into_packed_b) {
+        const uint8_t* zp_ptr = nullptr;
+        if (has_zp_input_) {
+          const Tensor* zp_tensor = nullptr;
+          OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor);
+          if (zp_tensor != nullptr) {
+            zp_ptr = zp_tensor->Data<uint8_t>();
+          }
         }
+        MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, effective_compute_type, nullptr /*QuantBData*/,
+                                    packed_b_.get(), scale_ptr, has_zp_input_, zp_ptr, nullptr,
+                                    &mlas_backend_kernel_selector_config_);
+        packed_b_finalized_ = true;
       }
-#endif  // MLAS_TARGET_ARM64
     }
     is_packed = true;
+
+    if (prepacked_weights != nullptr) {
+      prepacked_weights->buffers_.push_back(std::move(packed_b_));
+      prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
+    }
   } else if (compute_type_ == SQNBIT_CompInt8 && !prefer_lut_gemm_) {
     // Packing scales and zero points
     // Guard: for LUT-eligible nodes, scales/ZP are already packed inside
@@ -376,7 +425,10 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
     }();
 
     if (should_pack_scale_and_zp_inputs) {
-      if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
+      // packed_b_ is already finalized during the B PrePack (scales and zero points folded in there so
+      // the sharing content hash captures them), so skip packing here. The CompInt8 packing is
+      // single-shot and packed_b_ may now be a buffer shared from another session.
+      if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) {
         auto sptr = tensor.Data<float>();
         MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr,
                                     has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_);
@@ -384,7 +436,7 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
       }
 
       // Packing zero_point
-      if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
+      if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) {
         auto zptr = tensor.Data<uint8_t>();
         MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr,
                                     has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_);
@@ -410,13 +462,21 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
       }
 
       scales_are_packed_ = true;
-      is_packed = true;
 
-      // For KleidiAI asymmetric 4-bit path: compute BZpCorr now while scales are still accessible.
-      // After this PrePack returns is_packed=true, ORT may erase scales from the constant
-      // input table (use count drops to 0), making them unavailable in later PrePack calls.
-      // Zero points haven't been PrePacked yet so they are still accessible.
-      if (has_zp_input_ && nbits_ == 4) {
+      // The scales were folded into packed_b_ during the B PrePack, so there is no separate packed
+      // scales buffer to cache or share. Report is_packed = false (as the x64 path already does for
+      // the scales input) so the framework does not engage pre-packed weight sharing for scales.
+      // Engaging it would require pushing a placeholder buffer, but the real scales live inside
+      // packed_b_ so the placeholder would be null - and PrePackedWeights::GetHash() skips null
+      // buffers, making the scales container key identical for every MatMulNBits node. That would
+      // falsely increment the shared-weights counter for unrelated nodes without sharing any real
+      // buffer. The quantized weight B (which carries the folded-in scales) is shared on its own.
+      is_packed = false;
+
+      // BZpCorr was already folded into packed_b_ during the B PrePack (so the sharing content hash
+      // captures the zero points), so re-folding it here must be skipped: the packing is single-shot
+      // and packed_b_ may now be a buffer shared from another session.
+      if (has_zp_input_ && nbits_ == 4 && !packed_b_finalized_) {
         const Tensor* zp_tensor = nullptr;
         OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor);
         if (zp_tensor != nullptr) {
@@ -457,7 +517,14 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 
         // BZpCorr was already computed during B packing in Step 1 (if applicable).
         scales_are_packed_ = true;
-        is_packed = true;
+
+        // The scales were folded into the packed B buffer during the B PrePack, so there is no
+        // separate packed scales buffer to cache or share. Report is_packed = false (mirroring the
+        // x64 path and the SQNBIT_CompInt8 path above) so the framework does not engage sharing for
+        // the scales input; engaging it would push a null placeholder whose content hash is identical
+        // for every node, falsely incrementing the shared-weights counter without sharing any real
+        // buffer.
+        is_packed = false;
       } else
 #endif  // MLAS_TARGET_ARM64
       {
@@ -471,7 +538,9 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
         // Pack scales separately only for 8-bit. For 4-bit on ARM64, scales are already packed
         // during B packing or used as a raw pointer at compute time (matching standard
         // SQNBIT_CompInt8 behavior where should_pack_scale_and_zp_inputs = (nbits_ == 8) on ARM64).
-        if (nbits_ == 8) {
+        // Skip when packed_b_ was already finalized during the B PrePack (scales/zero points folded
+        // in there for the sharing content hash); it may now be a buffer shared from another session.
+        if (nbits_ == 8 && !packed_b_finalized_) {
           MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(),
                                       scales_fp32_.get(), has_zp_input_, nullptr, nullptr,
                                       &mlas_backend_kernel_selector_config_);
@@ -482,7 +551,7 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 
     // Pack zero_points separately only for 8-bit (matching standard SQNBIT_CompInt8 behavior).
     // For 4-bit, zero_points are passed directly in data params or handled via KleidiAI BZpCorr.
-    if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8) {
+    if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && nbits_ == 8 && !packed_b_finalized_) {
       auto zptr = tensor.Data<uint8_t>();
       MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, SQNBIT_CompInt8, nullptr, packed_b_.get(), nullptr,
                                   has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_);
@@ -540,8 +609,6 @@ template <>
 Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
                                        /*out*/ bool& is_packed,
                                        /*out*/ PrePackedWeights* prepacked_weights) {
-  ORT_UNUSED_PARAMETER(prepacked_weights);
-
   if (input_idx == InputIndex::scales || input_idx == InputIndex::bias) {
     auto sptr = tensor.Data<MLFloat16>();
     auto tensor_size = static_cast<size_t>(tensor.Shape().Size());
@@ -565,8 +632,12 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
   if (input_idx == InputIndex::B) {
     const Tensor* scales = nullptr;
     OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales);
-    if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_,
-                                            has_zp_input_, &mlas_backend_kernel_selector_config_)) {
+    // Convert the constant fp16 scales to fp32 up front so they (and the zero points) can be folded
+    // into packed_b_ during this B PrePack, mirroring the primary float PrePack above. Pre-packed
+    // weight sharing content-hashes the buffer right after this B PrePack returns, so for CompInt8
+    // everything that affects the packed bytes (the scales, and the block sum / KleidiAI BZpCorr that
+    // depend on the zero points) must be folded in by now.
+    if (scales && compute_type_ == SQNBIT_CompInt8) {
       auto sptr = scales->Data<MLFloat16>();
       auto scales_size = static_cast<size_t>(scales->Shape().Size());
       auto ptr = IAllocator::MakeUniquePtr<float>(alloc, scales_size, true);
@@ -581,25 +652,55 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
     }
     auto qptr = tensor.DataRaw();
     packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
+    // See the primary PrePack() above: SessionState::PrepackConstantInitializedTensors passes a
+    // non-null prepacked_weights on both the container and the default single-session paths, so this
+    // zero-fill runs on essentially every prepack at load (the guard only skips a caller that asks for
+    // no cacheable buffer). It keeps the dedup content hash reproducible regardless of bytes the pack
+    // leaves uninitialized (alignment padding), for any compute type. One-time O(packed_b_size_) load
+    // cost; inference is unaffected.
+    if (prepacked_weights != nullptr) {
+      std::memset(packed_b_.get(), 0, packed_b_size_);
+    }
     MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(),
                                 scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_);
 
-#if defined(MLAS_TARGET_ARM64)
-    // For KleidiAI asymmetric 4-bit path: compute BZpCorr during B packing.
-    // The fp16 specialization packs B here (with scales already converted to fp32),
-    // so we also compute BZpCorr now while both scales and zero_points are accessible.
-    if (has_zp_input_ && nbits_ == 4 && scales_fp32_ != nullptr) {
-      const Tensor* zp_tensor = nullptr;
-      OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor);
-      if (zp_tensor != nullptr) {
-        auto zptr = zp_tensor->Data<uint8_t>();
-        MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
-                                    scales_fp32_.get(), has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_);
+    // Fold the scales and (constant) zero points into packed_b_ now (see the primary PrePack above):
+    // the CompInt8 block sum and the KleidiAI BZpCorr depend on the zero points, so they must be
+    // folded in before the sharing content hash is taken. Otherwise two initializers with identical B
+    // and scales but different zero points would hash equal and the second would wrongly adopt the
+    // first's buffer. The B pack above only partially populates the buffer, so issue one more pack
+    // call with QuantBData == nullptr to finalize it. This is byte-identical to the staged
+    // scales + zero_points packing it replaces.
+    bool finalize_scale_zp_into_packed_b = compute_type_ == SQNBIT_CompInt8 && scales_fp32_ != nullptr;
+#if !defined(MLAS_TARGET_AMD64_IX86)
+    // On ARM64 the scales/zero points are folded into B only for 8-bit, or for 4-bit when MLAS bakes
+    // them in (KleidiAI). For 4-bit non-KleidiAI they are applied at compute time and must not be
+    // passed to the packing routine, which would dereference the null QuantBData buffer.
+    finalize_scale_zp_into_packed_b =
+        finalize_scale_zp_into_packed_b &&
+        (nbits_ == 8 || MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_,
+                                                  has_zp_input_, &mlas_backend_kernel_selector_config_));
+#endif
+    if (finalize_scale_zp_into_packed_b) {
+      const uint8_t* zp_ptr = nullptr;
+      if (has_zp_input_) {
+        const Tensor* zp_tensor = nullptr;
+        OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zp_tensor);
+        if (zp_tensor != nullptr) {
+          zp_ptr = zp_tensor->Data<uint8_t>();
+        }
       }
+      MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr /*QuantBData*/,
+                                  packed_b_.get(), scales_fp32_.get(), has_zp_input_, zp_ptr, nullptr,
+                                  &mlas_backend_kernel_selector_config_);
+      packed_b_finalized_ = true;
     }
-#endif  // MLAS_TARGET_ARM64
-
     is_packed = true;
+
+    if (prepacked_weights != nullptr) {
+      prepacked_weights->buffers_.push_back(std::move(packed_b_));
+      prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
+    }
   } else if (compute_type_ == SQNBIT_CompInt8) {
     bool should_pack_scale_and_zp = [&]() {
 #if defined(MLAS_TARGET_AMD64_IX86)
@@ -610,11 +711,11 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
     }();
 
     if (should_pack_scale_and_zp) {
-      if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
+      if (input_idx == InputIndex::scales && packed_b_ != nullptr && !packed_b_finalized_) {
         MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
                                     scales_fp32_.get(), has_zp_input_, nullptr, nullptr, &mlas_backend_kernel_selector_config_);
         is_packed = false;
-      } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
+      } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr && !packed_b_finalized_) {
         auto zptr = tensor.Data<uint8_t>();
         MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
                                     nullptr, has_zp_input_, zptr, nullptr, &mlas_backend_kernel_selector_config_);
@@ -635,6 +736,11 @@ Status MatMulNBits<T1>::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>&
   used_shared_buffers = false;
 
   if (input_idx == InputIndex::B && !prepacked_buffers.empty()) {
+    // The buffer handed back is fully finalized: the producing session folded the scales and zero
+    // points (block sums / KleidiAI BZpCorr) into it during its PrePack(B), which is also when this
+    // kernel set packed_b_finalized_ on its own (identical) B PrePack. The later scale/zero-point
+    // PrePack calls already skip the staged packing whenever packed_b_finalized_ is set, so simply
+    // adopt the shared buffer here - no extra bookkeeping is needed to avoid re-folding into it.
     packed_b_ = std::move(prepacked_buffers[0]);
     used_shared_buffers = true;
 
@@ -643,6 +749,9 @@ Status MatMulNBits<T1>::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>&
       packed_b_size_ = MlasLutGemmPackedSize(N_, K_, nbits_, block_size_, has_zp_input_);
     }
   }
+  // Only the quantized weight B yields a separately cached pre-packed buffer. The scales (and zero
+  // points) are folded into packed_b_ during the B PrePack and reported with is_packed = false, so
+  // the framework never asks this kernel to adopt a shared buffer for them.
 
   return Status::OK();
 }
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 6ef2319c1d3f4..85def4898d21c 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -498,8 +498,13 @@ Status SessionState::PrepackConstantInitializedTensors(
                 auto iter = initializers_to_share_map.find(input_name);
                 bool is_shared_initializer = (iter != initializers_to_share_map.end());
 
-                // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now
-                if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers &&
+                // CPU EP only. An initializer joins the shared pre-packed container either when it was
+                // registered via OrtApi::AddInitializer (is_shared_initializer) or when a graph transformer
+                // tagged this synthesized initializer with a sharing identity (tagged_share_id).
+                const std::string* tagged_share_id = st->graph_.GetSharedPrepackInitializerId(input_name);
+                const bool enroll_tagged_initializer = (tagged_share_id != nullptr);
+                if ((is_shared_initializer || enroll_tagged_initializer) &&
+                    should_cache_prepacked_weights_for_shared_initializers &&
                     node.GetExecutionProviderType() == kCpuExecutionProvider) {
                   // caching of pre-packed weights' turned ON
 
@@ -530,12 +535,12 @@ Status SessionState::PrepackConstantInitializedTensors(
                     // TODO: Check if some version of the ONNX IR allows op_type to be empty
                     ORT_ENFORCE(!op_type.empty(), "The op type of a node cannot be empty");
 
-                    // The key for the pre-packed weights container lookup is the op_type + hash of the prepacked-weight
-                    // that we just got by invoking PrePack() on this kernel.
-
+                    // Tagged initializers are keyed by their sharing identity; AddInitializer ones by the
+                    // packed-bytes hash. Both carry the op_type prefix.
                     const std::string prepacked_weights_container_key =
-                        GenerateKeyForPrepackedWeightsMap(op_type,
-                                                          weights_to_be_filled_in);
+                        enroll_tagged_initializer
+                            ? (op_type + "+id+" + *tagged_share_id)
+                            : GenerateKeyForPrepackedWeightsMap(op_type, weights_to_be_filled_in);
 
                     bool container_contains_packed_weight = prepacked_weights_container_->HasWeight(
                         prepacked_weights_container_key);
@@ -615,11 +620,9 @@ Status SessionState::PrepackConstantInitializedTensors(
                                                       is_packed,
                                                       &weights_to_be_filled_in));
 
-                  // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results
+                  // Some kernels (non-CPU related kernels) do not share their pre-packed results
                   // even though they set is_packed = true so we leave it up to them.
                   // We can change their behavior if we wish do so in a separate PR
-                  // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not
-                  // produce them.
                   if (is_packed && !weights_to_be_filled_in.buffers_.empty()) {
                     const auto& op_type = node.OpType();
                     const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap(
diff --git a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc
index f3956d5e9e0f3..07fccef64fee1 100644
--- a/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc
+++ b/onnxruntime/core/optimizer/dq_matmulnbits_fusion.cc
@@ -12,6 +12,7 @@
 #include "core/graph/graph_utils.h"
 #include "core/graph/node_attr_utils.h"
 #include "core/optimizer/initializer.h"
+#include "core/optimizer/matmul_nbits_sharing_identity.h"
 #include "core/optimizer/utils.h"
 
 #include <cmath>
@@ -447,7 +448,6 @@ std::vector<DirectDQMatch> CollectDirectDQMatches(
   return direct_matches;
 }
 
-// ---------------------------------------------------------------------------
 // Pattern 1 rewriting: DQ+Reshape+Transpose+[Cast]+MatMul/Gemm -> MatMulNBits
 // ---------------------------------------------------------------------------
 
@@ -569,6 +569,10 @@ void ApplyReshapeTransposeFusions(
       zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true));
     }
 
+    // Cross-session sharing identity for the generated weight group; computed before the tensors move.
+    const std::string share_id =
+        ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level);
+
     NodeAttributes mnb_attrs;
     utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs);
     utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs);
@@ -578,7 +582,10 @@ void ApplyReshapeTransposeFusions(
 
     std::vector<NodeArg*> mnb_inputs;
     mnb_inputs.push_back(const_cast<NodeArg*>(mm_node->InputDefs()[0]));
-    mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)));
+    NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst));
+    // Tag the generated B weight for cross-session pre-pack sharing.
+    graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id);
+    mnb_inputs.push_back(&b_weight_arg);
     mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst)));
     if (zp_mnb_tp) {
       mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst)));
@@ -749,6 +756,10 @@ void ApplyDirectDQFusions(
       zp_mnb_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true));
     }
 
+    // Cross-session sharing identity for the generated weight group; computed before the tensors move.
+    const std::string share_id =
+        ComputeMatMulNBitsSharingId(weight_dst, scale_dst, zp_dst, N, K, block_size, /*bits*/ 4, accuracy_level);
+
     NodeAttributes mnb_attrs;
     utils::SetNodeAttribute(utils::MakeAttribute("K", K), mnb_attrs);
     utils::SetNodeAttribute(utils::MakeAttribute("N", N), mnb_attrs);
@@ -758,7 +769,10 @@ void ApplyDirectDQFusions(
 
     std::vector<NodeArg*> mnb_inputs;
     mnb_inputs.push_back(const_cast<NodeArg*>(mm_node->InputDefs()[0]));
-    mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst)));
+    NodeArg& b_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_mnb_tp, std::move(weight_dst));
+    // Tag the generated B weight for cross-session pre-pack sharing.
+    graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id);
+    mnb_inputs.push_back(&b_weight_arg);
     mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_mnb_tp, std::move(scale_dst)));
     if (zp_mnb_tp) {
       mnb_inputs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_mnb_tp.value(), std::move(*zp_dst)));
diff --git a/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h
new file mode 100644
index 0000000000000..597c8a292afd8
--- /dev/null
+++ b/onnxruntime/core/optimizer/matmul_nbits_sharing_identity.h
@@ -0,0 +1,37 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+
+#include "core/framework/murmurhash3.h"
+#include "core/framework/tensor.h"
+
+namespace onnxruntime {
+
+// Stable, content-derived identity for a fusion-generated MatMulNBits weight group, used to share its
+// pre-packed buffer across sessions. Every tensor contributes a full 128-bit digest (chaining stages via a
+// 32-bit seed would cap that at 32 bits); the digests are hashed with the attributes, so accuracy_level
+// (compute type) also separates ids. Pass zero_point only when it is an actual kernel input.
+inline std::string ComputeMatMulNBitsSharingId(const Tensor& weight, const Tensor& scale,
+                                               const std::optional<Tensor>& zero_point,
+                                               int64_t N, int64_t K, int64_t block_size,
+                                               int64_t bits, int64_t accuracy_level) {
+  // Padding-free (48 + 48 bytes), so its raw bytes hash deterministically; digest[2] stays 0 without zp.
+  struct { uint32_t digest[3][4]; int64_t params[6]; } state{
+      {}, {N, K, block_size, bits, accuracy_level, zero_point.has_value() ? int64_t{1} : int64_t{0}}};
+  MurmurHash3::x86_128(weight.DataRaw(), weight.SizeInBytes(), 0, state.digest[0]);
+  MurmurHash3::x86_128(scale.DataRaw(), scale.SizeInBytes(), 0, state.digest[1]);
+  if (zero_point) {
+    MurmurHash3::x86_128(zero_point->DataRaw(), zero_point->SizeInBytes(), 0, state.digest[2]);
+  }
+  uint32_t hash[4] = {0, 0, 0, 0};
+  MurmurHash3::x86_128(&state, sizeof(state), 0, hash);
+  return "MatMulNBits.DQ:" + std::to_string((static_cast<uint64_t>(hash[1]) << 32) | hash[0]) + "." +
+         std::to_string((static_cast<uint64_t>(hash[3]) << 32) | hash[2]);
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
index b9d7e898157bd..6bd5e157d8b65 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
@@ -7,6 +7,7 @@
 #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h"
 #include "core/optimizer/qdq_transformer/qdq_util.h"
 #include "core/optimizer/initializer.h"
+#include "core/optimizer/matmul_nbits_sharing_identity.h"
 #include "core/graph/node_attr_utils.h"
 #include "core/graph/graph_utils.h"
 #include "core/framework/tensorprotoutils.h"
@@ -646,8 +647,23 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph,
   ORT_RETURN_IF_ERROR(TransposeDQWeightsForMatMulNBits(
       graph, *dq_node, "fused_DQ_MatMul", intra_op_thread_pool_, effective_bs, transposed));
 
+  // Cross-session sharing identity for the generated B weight; computed before it is moved.
+  const auto* weight_arg = dq_node->InputDefs()[0];
+  const auto* weight_shape = weight_arg->Shape();
+  ORT_RETURN_IF_NOT(weight_shape != nullptr && weight_shape->dim_size() >= 2,
+                    "Weight shape unavailable for DQ node ", dq_node->Name());
+  const int64_t bits = DQWeightBits(weight_arg->TypeAsProto()->tensor_type().elem_type());
+  const std::string share_id = ComputeMatMulNBitsSharingId(
+      transposed.weight, transposed.scale, transposed.zero_point,
+      weight_shape->dim(1).dim_value(), weight_shape->dim(0).dim_value(),
+      effective_bs, bits, accuracy_level_);
+
   auto& input_defs = replacement_node.MutableInputDefs();
-  input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight)));
+  NodeArg& b_weight_arg =
+      graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight));
+  // Tag the generated B weight for cross-session pre-pack sharing.
+  graph.SetSharedPrepackInitializerId(b_weight_arg.Name(), share_id);
+  input_defs.push_back(&b_weight_arg);
   replacement_node.MutableInputArgsCount().push_back(1);
 
   input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.scale_proto, std::move(transposed.scale)));
diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc
index 9deb064a90853..8e133caa15d55 100644
--- a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc
@@ -3,6 +3,7 @@
 
 #ifndef ORT_MINIMAL_BUILD
 
+#include <filesystem>
 #include <optional>
 
 #include "gtest/gtest.h"
@@ -26,6 +27,9 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/ort_env.h"
 #include "core/util/qmath.h"
+#include "core/graph/model.h"
+#include "test/util/include/inference_session_wrapper.h"
+#include "test/util/include/test/test_environment.h"
 #include "core/providers/webgpu/webgpu_provider_options.h"
 #ifdef USE_WEBGPU
 #include "contrib_ops/webgpu/quantization/matmul_nbits_common.h"
@@ -461,6 +465,117 @@ TEST(MatMulNBitsLutGemm, Float32_2Bits_Asymmetric_Batch32_256x256_Bias) {
   TestMatMul2BitsLutGemm<float>(32, 256, 256, 32, /*has_zero_point=*/true, /*has_bias=*/true);
 }
 
+// Regression test for the LUT GEMM pre-pack + prepacked-save path. A 2-bit MatMulNBits node pre-packed
+// via the LUT path must record its packed B buffer exactly once. A prior bug appended packed_b_ twice
+// on the LUT path (inside the LUT branch and again in the shared append at the end of the B block), so
+// the second entry was a moved-from/null buffer paired with a non-zero packed_b_size_. The pre-packed
+// content hash skips null buffers, so cross-session sharing appeared to work, but saving pre-packed
+// initializers iterates every recorded buffer and writes buffer_sizes_[i] bytes from buffers_[i].get(),
+// dereferencing the null pointer when mlas.use_lut_gemm=1. This drives mlas.use_lut_gemm=1 together with
+// session.save_external_prepacked_constant_initializers=1 and a non-empty optimized_model_filepath, and
+// asserts that initialization (which performs the save) and a subsequent run both succeed.
+TEST(MatMulNBitsLutGemm, Float32_2Bits_PrepackSaveDoesNotCrash) {
+  constexpr int64_t M = 1, N = 128, K = 128, block_size = 32;
+  if (!MlasIsLutGemmAvailable(static_cast<size_t>(N), static_cast<size_t>(K), 2, static_cast<size_t>(block_size))) {
+    GTEST_SKIP() << "LUT GEMM not available on this platform";
+  }
+
+  // Quantize random weights into valid 2-bit MatMulNBits B/scales/zero_points initializers.
+  RandomValueGenerator random{1234};
+  std::vector<float> b_fp32(random.Gaussian<float>(AsSpan({K, N}), 0.0f, 0.25f));
+
+  int q_rows = 0, q_cols = 0;
+  MlasBlockwiseQuantizedShape<float, QBits>(static_cast<int>(block_size), /*columnwise*/ true,
+                                            static_cast<int>(K), static_cast<int>(N), q_rows, q_cols);
+  size_t q_data_size_in_bytes = 0, q_scale_size = 0, q_zp_size_in_bytes = 0;
+  MlasBlockwiseQuantizedBufferSizes<QBits>(static_cast<int>(block_size), /*columnwise*/ true,
+                                           static_cast<int>(K), static_cast<int>(N),
+                                           q_data_size_in_bytes, q_scale_size, &q_zp_size_in_bytes);
+
+  std::vector<uint8_t> b_data(q_data_size_in_bytes);
+  std::vector<float> scales(q_scale_size);
+  std::vector<uint8_t> zp(q_zp_size_in_bytes);
+
+  auto& ortenv = **ort_env.get();
+  onnxruntime::concurrency::ThreadPool* tp = ortenv.GetEnvironment().GetIntraOpThreadPool();
+  MlasQuantizeBlockwise<float, QBits>(b_data.data(), scales.data(), zp.data(), b_fp32.data(),
+                                      static_cast<int32_t>(block_size), /*columnwise*/ true,
+                                      static_cast<int32_t>(K), static_cast<int32_t>(N),
+                                      static_cast<int32_t>(N), tp);
+
+  // Single-node MatMulNBits model: A is a runtime input; B/scales/zero_points are constant initializers
+  // (so they are pre-packed at session initialization).
+  const int64_t k_blocks = (K + block_size - 1) / block_size;
+  const std::unordered_map<std::string, int> domain_to_version{{"", 21}, {kMSDomain, 1}};
+  Model model("matmul_2bits_lut_prepack_save", false, ModelMetaData(), PathString(),
+              IOnnxRuntimeOpSchemaRegistryList(), domain_to_version,
+              std::vector<ONNX_NAMESPACE::FunctionProto>(), DefaultLoggingManager().DefaultLogger());
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder builder(graph);
+
+  ONNX_NAMESPACE::TypeProto float_2d;
+  float_2d.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType<float>());
+  float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(M);
+  float_2d.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(K);
+  NodeArg* A = &graph.GetOrCreateNodeArg("A", &float_2d);
+  NodeArg* Y = &graph.GetOrCreateNodeArg("Y", nullptr);
+
+  NodeArg* B = builder.MakeInitializer<uint8_t>(
+      {static_cast<int64_t>(q_cols), k_blocks, static_cast<int64_t>(q_rows) / k_blocks}, b_data);
+  NodeArg* scales_arg = builder.MakeInitializer<float>({N, static_cast<int64_t>(q_scale_size) / N}, scales);
+  NodeArg* zero_points =
+      builder.MakeInitializer<uint8_t>({N, static_cast<int64_t>(q_zp_size_in_bytes) / N}, zp);
+
+  Node& node = builder.AddNode("MatMulNBits", {A, B, scales_arg, zero_points}, {Y}, kMSDomain);
+  node.AddAttribute("K", K);
+  node.AddAttribute("N", N);
+  node.AddAttribute("block_size", block_size);
+  node.AddAttribute("bits", static_cast<int64_t>(QBits));
+  node.AddAttribute("accuracy_level", static_cast<int64_t>(0));
+
+  graph.SetOutputs(std::vector<const NodeArg*>{Y});
+  ASSERT_STATUS_OK(graph.Resolve());
+
+  std::string model_bytes;
+  ASSERT_TRUE(model.ToProto().SerializeToString(&model_bytes));
+
+  // Save the optimized model + pre-packed initializers into a dedicated temp dir (fixed name, wiped
+  // first). Writing the prepacked initializers is the path that dereferenced the null buffer before the fix.
+  namespace fs = std::filesystem;
+  const fs::path tmp_dir = fs::temp_directory_path() / "ort_matmul2bits_lut_prepack_save_test";
+  std::error_code ec;
+  fs::remove_all(tmp_dir, ec);
+  ASSERT_TRUE(fs::create_directories(tmp_dir, ec)) << ec.message();
+  const fs::path optimized_model_path = tmp_dir / "optimized.onnx";
+
+  SessionOptions so;
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsMlasLutGemm, "1"));
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1"));
+  so.optimized_model_filepath = optimized_model_path.native();
+
+  std::vector<OrtValue> fetches;
+  {
+    InferenceSessionWrapper session{so, GetEnvironment()};
+    ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast<int>(model_bytes.size())));
+    // Initialization performs the LUT pre-pack and writes the optimized model with external
+    // pre-packed initializers. Before the fix this dereferenced the duplicate null packed buffer.
+    ASSERT_STATUS_OK(session.Initialize());
+
+    auto cpu_allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0];
+    std::vector<float> a_data = random.Gaussian<float>(AsSpan({M, K}), 0.0f, 0.25f);
+    OrtValue a_value;
+    CreateMLValue<float>(cpu_allocator, AsSpan({M, K}), a_data, &a_value);
+    NameMLValMap feeds{{"A", a_value}};
+
+    ASSERT_STATUS_OK(session.Run(RunOptions{}, feeds, std::vector<std::string>{"Y"}, &fetches));
+  }
+
+  ASSERT_EQ(fetches.size(), static_cast<size_t>(1));
+  EXPECT_TRUE(fs::exists(optimized_model_path));
+
+  fs::remove_all(tmp_dir, ec);
+}
+
 // Float zero point tests — directed QAD scenario (zp=1.5)
 void RunTest2BitsFloatZP(int64_t M, int64_t N, int64_t K, int64_t block_size, float zp_value) {
   RandomValueGenerator random{1234};
diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index aadbbab1c135b..dd5cfb73dfe31 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -22,10 +22,14 @@
 #include "test/unittest_util/graph_transform_test_builder.h"
 #include "test/util/include/default_providers.h"
 #include "test/util/include/scoped_env_vars.h"
+#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/ort_env.h"
 #include "core/util/qmath.h"
 #include "core/providers/webgpu/webgpu_provider_options.h"
+#include "core/framework/prepacked_weights_container.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "test/util/include/test/test_environment.h"
 
 extern std::unique_ptr<Ort::Env> ort_env;
 
@@ -87,6 +91,10 @@ struct TestOptions {
 
   bool legacy_shape{false};  // for backward compatibility
 
+  // When set, RunTest validates cross-session sharing of the pre-packed weights instead of doing a
+  // single run. The model is run in two sessions that use the same pre-packed weights container.
+  std::optional<PrepackSharingMode> prepack_sharing_mode{};
+
   std::optional<float> output_abs_error{};
   std::optional<float> output_rel_error{};
 };
@@ -269,6 +277,13 @@ void RunTest(const TestOptions& opts,
     test.SetOutputRelErr("Y", *opts.output_rel_error);
   }
 
+  if (opts.prepack_sharing_mode.has_value()) {
+    // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP
+    // in two sessions and validates the sharing counters.
+    CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode, {N, k_blocks, blob_size}, input1_vals);
+    return;
+  }
+
   if (!explicit_eps.empty()) {
     test.ConfigEps(std::move(explicit_eps));
   }
@@ -597,6 +612,55 @@ TEST(MatMulNBits, Float32_4b_Accuracy4_Batch) {
   RunTest<float>(opts);
 }
 
+#ifndef ENABLE_TRAINING
+// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full
+// training build, so there is nothing to exercise there.
+
+namespace {
+// Creates the MatMulNBits TestOptions used by the pre-packed weight sharing tests (M is fixed at 8).
+TestOptions MakeSharingTestOptions(int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level,
+                                   bool has_zero_point, bool has_bias, PrepackSharingMode mode) {
+  TestOptions sharing_opts{};
+  sharing_opts.prepack_sharing_mode = mode;
+  sharing_opts.M = 8;
+  sharing_opts.N = N;
+  sharing_opts.K = K;
+  sharing_opts.block_size = block_size;
+  sharing_opts.accuracy_level = accuracy_level;
+  sharing_opts.has_bias = has_bias;
+  sharing_opts.has_zero_point = has_zero_point;
+  sharing_opts.zp_is_4bit = true;
+  sharing_opts.output_rel_error = 0.02f;
+  sharing_opts.output_abs_error = 0.1f;
+  return sharing_opts;
+}
+}  // namespace
+
+// Legacy sharing path (B registered through SessionOptions::AddInitializer), exercised for every
+// combination of zero point and bias, each with float and then float16 activations.
+TEST(MatMulNBits, SharedPrepackedWeights_AddInitializer) {
+  for (bool with_zp : {false, true}) {
+    for (bool with_bias : {false, true}) {
+      const TestOptions opts = MakeSharingTestOptions(
+          32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, with_zp, with_bias, PrepackSharingMode::kAddInitializer);
+      RunTest<float>(opts);
+      RunTest<MLFloat16>(opts);
+    }
+  }
+}
+
+// Negative control: the shared container is present, but with neither opt-in mechanism enabled
+// pre-packed weights must not be shared across sessions.
+TEST(MatMulNBits, SharedPrepackedWeights_NotSharedWithoutOptIn) {
+  constexpr auto kMode = PrepackSharingMode::kNoSharing;
+  // float activations, has_zero_point = true, has_bias = true.
+  RunTest<float>(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, true, true, kMode));
+  // float16 activations, has_zero_point = false, has_bias = false.
+  RunTest<MLFloat16>(MakeSharingTestOptions(32, 256, /*block_size*/ 32, /*accuracy_level*/ 0, false, false, kMode));
+}
+
+#endif  // !ENABLE_TRAINING
+
 #endif
 #endif
 
diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc
index f99334c4f33ef..411e83536c190 100644
--- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc
@@ -21,6 +21,7 @@
 #include "test/unittest_util/graph_transform_test_builder.h"
 #include "test/util/include/default_providers.h"
 #include "test/util/include/scoped_env_vars.h"
+#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/ort_env.h"
 #include "core/util/qmath.h"
@@ -51,6 +52,10 @@ struct TestOptions8Bits {
   bool has_g_idx{false};
   bool has_bias{false};
 
+  // When set, RunTest8Bits validates cross-session sharing of the pre-packed weights instead of
+  // doing a single run. The model is run in two CPU sessions that use the same container.
+  std::optional<PrepackSharingMode> prepack_sharing_mode{};
+
   std::optional<float> output_abs_error{};
   std::optional<float> output_rel_error{};
 };
@@ -221,6 +226,14 @@ void RunTest8Bits(const TestOptions8Bits& opts) {
     test.SetOutputRelErr("Y", *opts.output_rel_error);
   }
 
+  if (opts.prepack_sharing_mode.has_value()) {
+    // Pre-packed weight sharing is a CPU-EP-only feature; the helper runs the model on the CPU EP
+    // in two sessions and validates the sharing counters.
+    CheckSharedPrepackedWeights(test, *opts.prepack_sharing_mode,
+                                {q_cols, k_blocks, q_rows / k_blocks}, input1_vals);
+    return;
+  }
+
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 #ifdef USE_CUDA
   execution_providers.emplace_back(DefaultCudaExecutionProvider());
@@ -671,6 +684,56 @@ TEST(MatMulNBits, BFloat16_Int8_Chunked_BFloat16ZeroPoint) {
 }
 #endif
 
+#if !defined(USE_CUDA) && !defined(USE_WEBGPU)
+#ifndef ENABLE_TRAINING
+// Pre-packing (and therefore cross-session sharing of pre-packed weights) is disabled in a full
+// training build and is only implemented for the CPU EP, so these tests are CPU-only.
+
+namespace {
+// Creates the 8-bit MatMulNBits TestOptions used by the pre-packed weight sharing tests (M=8, N=32,
+// K=256). accuracy_level is pinned to 4, which selects the int8 compute type (SQNBIT_CompInt8 /
+// HQNBIT_CompInt8): the 8-bit path that pre-packs the quantized B weight.
+TestOptions8Bits MakeSharingTestOptions8Bits(int64_t block_size, bool has_zero_point, bool has_bias,
+                                             PrepackSharingMode mode) {
+  TestOptions8Bits sharing_opts{};
+  sharing_opts.prepack_sharing_mode = mode;
+  sharing_opts.M = 8;
+  sharing_opts.N = 32;
+  sharing_opts.K = 256;
+  sharing_opts.block_size = block_size;
+  sharing_opts.accuracy_level = 4;
+  sharing_opts.has_bias = has_bias;
+  sharing_opts.has_zero_point = has_zero_point;
+  sharing_opts.output_rel_error = 0.02f;
+  sharing_opts.output_abs_error = 0.1f;
+  return sharing_opts;
+}
+}  // namespace
+
+// Legacy sharing path for 8-bit weights (B registered through SessionOptions::AddInitializer), run for
+// every combination of zero point and bias, each with float and then float16 activations.
+TEST(MatMulNBits, SharedPrepackedWeights_8b_AddInitializer) {
+  for (bool with_zp : {false, true}) {
+    for (bool with_bias : {false, true}) {
+      const TestOptions8Bits opts =
+          MakeSharingTestOptions8Bits(32, with_zp, with_bias, PrepackSharingMode::kAddInitializer);
+      RunTest8Bits<float>(opts);
+      RunTest8Bits<MLFloat16>(opts);
+    }
+  }
+}
+
+// Negative control for 8-bit weights: the shared container is present, but with neither opt-in
+// mechanism enabled pre-packed weights must not be shared across sessions.
+TEST(MatMulNBits, SharedPrepackedWeights_8b_NotSharedWithoutOptIn) {
+  constexpr auto kMode = PrepackSharingMode::kNoSharing;
+  // One float and one float16 configuration, both with block_size 32.
+  RunTest8Bits<float>(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ true, /*has_bias*/ true, kMode));
+  RunTest8Bits<MLFloat16>(MakeSharingTestOptions8Bits(32, /*has_zero_point*/ false, /*has_bias*/ false, kMode));
+}
+#endif  // !ENABLE_TRAINING
+#endif  // !USE_CUDA && !USE_WEBGPU
+
 }  // namespace test
 }  // namespace onnxruntime
 
diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc
new file mode 100644
index 0000000000000..97566afe02489
--- /dev/null
+++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.cc
@@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "core/framework/tensor.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "test/providers/provider_test_utils.h"
+#include "test/util/include/asserts.h"
+#include "test/util/include/default_providers.h"
+
+namespace onnxruntime {
+namespace test {
+
+void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode,
+                                 const std::vector<int64_t>& b_dims,
+                                 std::vector<uint8_t>& b_data) {
+  SessionOptions session_options;
+  OrtValue shared_b;
+
+  // Evaluated once; the negative-control mode is consulted at several points below.
+  const bool no_sharing = (mode == PrepackSharingMode::kNoSharing);
+
+  if (mode == PrepackSharingMode::kAddInitializer) {
+    // Opt in through the pre-existing mechanism: register B as an explicitly shared initializer.
+    const TensorShape b_shape(b_dims);
+    const OrtMemoryInfo cpu_info(CPU, OrtAllocatorType::OrtDeviceAllocator);
+    Tensor::InitOrtValue(DataTypeImpl::GetType<uint8_t>(), b_shape, b_data.data(), cpu_info, shared_b);
+    ASSERT_STATUS_OK(session_options.AddInitializer("B", &shared_b));
+  }
+  // In kNoSharing mode the session options stay untouched: neither opt-in mechanism is used.
+
+  // Make every session created by this OpTester use the same pre-packed weights container.
+  test.EnableSharingOfPrePackedWeightsAcrossSessions();
+
+  // Pre-packing is limited to the CPU EP, so each session gets a fresh CPU-only provider list.
+  auto make_cpu_eps = [] {
+    std::vector<std::unique_ptr<IExecutionProvider>> eps;
+    eps.push_back(DefaultCpuExecutionProvider());
+    return eps;
+  };
+
+  size_t prepacked_in_session_1 = 0;
+  size_t shared_count = 0;
+
+  // Session 1
+  {
+    auto eps = make_cpu_eps();
+    test.Run(session_options, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps, {},
+             &prepacked_in_session_1,
+             &shared_count);
+    // This is the first session, so there is nothing it could have shared yet.
+    ASSERT_EQ(shared_count, static_cast<size_t>(0));
+  }
+
+  const auto container_size = test.GetNumPrePackedWeightsShared();
+
+  if (no_sharing) {
+    // Without an opt-in, nothing may have been placed in the shared container.
+    ASSERT_EQ(container_size, static_cast<size_t>(0));
+  }
+
+  // MLAS may choose not to pre-pack on some platforms/architectures; then there is nothing to
+  // share and the remaining checks would be meaningless.
+  if (prepacked_in_session_1 == 0) {
+    return;
+  }
+
+  if (!no_sharing) {
+    // At least the quantized weight B is content-addressed into the shared container. Some
+    // architectures (e.g. ARM64 KleidiAI) additionally pre-pack scales, but in the AddInitializer
+    // mode only the explicitly-registered B participates, so the container can hold fewer elements
+    // than the total number of pre-packed weights.
+    ASSERT_GT(container_size, static_cast<size_t>(0));
+    ASSERT_LE(container_size, prepacked_in_session_1);
+  }
+
+  // Session 2
+  {
+    size_t prepacked_in_session_2 = 0;
+    auto eps = make_cpu_eps();
+    test.Run(session_options, OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps, {},
+             &prepacked_in_session_2,
+             &shared_count);
+
+    // Both sessions must pre-pack the same number of weights.
+    ASSERT_EQ(prepacked_in_session_1, prepacked_in_session_2);
+
+    // The second session must be served every weight held by the shared container: zero for the
+    // no-sharing control, the container size otherwise.
+    ASSERT_EQ(shared_count, container_size);
+
+    if (no_sharing) {
+      ASSERT_EQ(shared_count, static_cast<size_t>(0));
+    } else {
+      ASSERT_GT(shared_count, static_cast<size_t>(0));
+    }
+  }
+}
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h
new file mode 100644
index 0000000000000..1de0bbaa4bb85
--- /dev/null
+++ b/onnxruntime/test/contrib_ops/matmul_nbits_prepack_sharing_test_util.h
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace onnxruntime {
+namespace test {
+
+class OpTester;
+
+// Selects how the two sessions are configured to share the pre-packed weights of a MatMulNBits node.
+enum class PrepackSharingMode {
+  // Legacy path: the weight is explicitly registered as a shared initializer via
+  // SessionOptions::AddInitializer.
+  kAddInitializer,
+  // Negative control: the shared container exists but neither opt-in mechanism is used, so
+  // cross-session sharing must not happen.
+  kNoSharing,
+};
+
+// Runs the already-configured MatMulNBits OpTester in two CPU sessions that use the same pre-packed
+// weights container and asserts that the pre-packed weights are shared (or not) as `mode` requires.
+// The logic is independent of the weight bit width, so the 4-bit and 8-bit tests both call it.
+// `b_dims`/`b_data` describe the quantized B initializer and are only needed for the
+// PrepackSharingMode::kAddInitializer path (to register B as a shared initializer).
+void CheckSharedPrepackedWeights(OpTester& test, PrepackSharingMode mode,
+                                 const std::vector<int64_t>& b_dims,
+                                 std::vector<uint8_t>& b_data);
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc
index 8aa4c88052742..385aa7ffebc66 100644
--- a/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc
+++ b/onnxruntime/test/optimizer/dq_matmulnbits_fusion_test.cc
@@ -7,14 +7,20 @@
 
 #include "core/common/span_utils.h"
 #include "core/framework/int4.h"
+#include "core/framework/prepacked_weights_container.h"
+#include "core/graph/constants.h"
+#include "core/graph/model.h"
 #include "core/graph/node_attr_utils.h"
 #include "core/optimizer/dq_matmulnbits_fusion.h"
+#include "core/session/inference_session.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 #include "test/test_environment.h"
 #include "test/unittest_util/framework_test_utils.h"
 #include "test/unittest_util/graph_transform_test_builder.h"
 #include "test/optimizer/graph_transform_test_fixture.h"
 #include "test/util/include/asserts.h"
+#include "test/util/include/inference_session_wrapper.h"
 
 #include "gtest/gtest.h"
 
@@ -354,6 +360,152 @@ TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_NoZP) {
                                         TransformerLevel::Level1, 1, pre_check, post_check));
 }
 
+// Checks the cross-session-sharing tag that the fusion attaches to the generated B weight. The tag is
+// meant to be a stable, content-derived identity: the same source quantization group must map to the
+// SAME identity (so sessions optimizing the same model share the pre-packed B), whereas a semantic
+// difference -- different zero points here -- must map to a DIFFERENT identity (no false sharing).
+TEST_F(DQMatMulNBitsFusionTest, TagsGeneratedWeightWithStableContentIdentity) {
+  constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
+  const int64_t num_blocks = K / block_size;
+
+  std::vector<uint8_t> weight(static_cast<size_t>(N * num_blocks * block_size));
+  for (size_t w = 0; w != weight.size(); ++w) {
+    weight[w] = static_cast<uint8_t>(w % 16);
+  }
+  std::vector<float> scale(static_cast<size_t>(N * num_blocks));
+  for (size_t s = 0; s != scale.size(); ++s) {
+    scale[s] = 0.1f + 0.01f * static_cast<float>(s % 10);
+  }
+  // Zero points other than the default 8, so the fusion keeps them (it elides uniform-8 zero points).
+  const std::vector<uint8_t> zp_a(static_cast<size_t>(N * num_blocks), 3);
+  const std::vector<uint8_t> zp_b(zp_a.size(), 5);
+
+  // Fuses a Pattern-1 model built with the given zero points and hands back the sharing identity
+  // found on the B input of the resulting MatMulNBits node.
+  auto identity_of = [&](const std::vector<uint8_t>& zero_points) -> std::string {
+    std::string identity;
+    auto build_graph = [&](ModelTestBuilder& builder) {
+      BuildPattern1Graph(builder, M, N, K, block_size, /*with_zp*/ true, /*with_cast*/ false,
+                         /*use_gemm*/ false, &weight, &scale, &zero_points);
+    };
+    auto no_pre_check = [](Graph&) -> Status { return Status::OK(); };
+    auto capture_tag = [&](Graph& graph) -> Status {
+      int fused_nodes = 0;
+      for (const auto& node : graph.Nodes()) {
+        if (node.OpType() != "MatMulNBits") {
+          continue;
+        }
+        ++fused_nodes;
+        const std::string* tag = graph.GetSharedPrepackInitializerId(node.InputDefs()[1]->Name());  // input 1 == B
+        EXPECT_NE(tag, nullptr) << "generated B weight was not tagged for cross-session sharing";
+        if (tag != nullptr) {
+          identity = *tag;
+        }
+      }
+      EXPECT_EQ(fused_nodes, 1);
+      return Status::OK();
+    };
+    auto fusion = std::make_unique<DQMatMulNBitsFusion>(4);
+    const auto status = TestGraphTransformer(build_graph, 21, *logger_, std::move(fusion),
+                                             TransformerLevel::Level1, 1, no_pre_check, capture_tag);
+    EXPECT_TRUE(status.IsOK());
+    return identity;
+  };
+
+  const std::string first_a = identity_of(zp_a);
+  const std::string second_a = identity_of(zp_a);
+  const std::string only_b = identity_of(zp_b);
+
+  ASSERT_FALSE(first_a.empty());
+  EXPECT_EQ(first_a, second_a);  // stable: identical source quantization group -> identical identity
+  EXPECT_NE(first_a, only_b);    // collision-safe: different zero points -> different identity
+}
+
+// Serializes a Pattern-1 DQ->Reshape->Transpose->MatMul model (UINT4 constant weight) into `model_bytes`.
+// Once a session loads it with the DQ->MatMulNBits fusion enabled, the MatMul becomes a MatMulNBits
+// whose B is tagged for cross-session sharing.
+static void SerializeDQMatMulModel(int64_t M, int64_t N, int64_t K, int64_t block_size,
+                                   const std::vector<uint8_t>& weight, const std::vector<float>& scale,
+                                   const std::vector<uint8_t>& zp, std::string& model_bytes) {
+  const std::unordered_map<std::string, int> opset_imports{{"", 21}, {kMSDomain, 1}};
+  Model pattern1_model("dq_matmulnbits_share", false, ModelMetaData(), PathString(),
+                       IOnnxRuntimeOpSchemaRegistryList(), opset_imports,
+                       std::vector<ONNX_NAMESPACE::FunctionProto>(), DefaultLoggingManager().DefaultLogger());
+  ModelTestBuilder graph_builder(pattern1_model.MainGraph());
+  BuildPattern1Graph(graph_builder, M, N, K, block_size,
+                     /*with_zp*/ true, /*with_cast*/ false, /*use_gemm*/ false, &weight, &scale, &zp);
+  graph_builder.SetGraphOutputs();
+  ASSERT_STATUS_OK(pattern1_model.MainGraph().Resolve());
+  ASSERT_TRUE(pattern1_model.ToProto().SerializeToString(&model_bytes));
+}
+
+// Loads the serialized model on the CPU EP with the DQ->MatMulNBits fusion enabled and the supplied
+// shared container. Reports whether the fusion produced a MatMulNBits and how many pre-packed weights
+// this session served from the container; a fatal assertion failure leaves the outputs at {false, 0}.
+static void RunSharedFusionSession(const std::string& model_bytes, PrepackedWeightsContainer& container,
+                                   bool& produced_matmulnbits, size_t& used_shared_count) {
+  produced_matmulnbits = false;  // reset first: a failing ASSERT_* below returns from this helper early
+  used_shared_count = 0;
+
+  SessionOptions so;
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsEnableDQMatMulNBitsFusion, "1"));
+  InferenceSessionWrapper session{so, GetEnvironment()};
+  ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container));
+  ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast<int>(model_bytes.size())));
+  ASSERT_STATUS_OK(session.Initialize());
+  for (const auto& node : session.GetGraph().Nodes()) {
+    if (node.OpType() == "MatMulNBits") {
+      produced_matmulnbits = true;
+      break;
+    }
+  }
+  used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter();
+}
+
+// End-to-end check that no session option is needed: two sessions optimizing the same DQ+MatMul model
+// share the fused MatMulNBits B weight through a common container, because the fusion tags it and
+// SessionState enrolls it under that identity. A model that differs only in zero points must NOT share.
+TEST_F(DQMatMulNBitsFusionTest, SharesFusedWeightAcrossSessionsViaTag) {
+  constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
+  const int64_t num_blocks = K / block_size;
+
+  std::vector<uint8_t> weight(static_cast<size_t>(N * num_blocks * block_size));
+  for (size_t w = 0; w != weight.size(); ++w) {
+    weight[w] = static_cast<uint8_t>(w % 16);
+  }
+  std::vector<float> scale(static_cast<size_t>(N * num_blocks));
+  for (size_t s = 0; s != scale.size(); ++s) {
+    scale[s] = 0.1f + 0.01f * static_cast<float>(s % 10);
+  }
+  const std::vector<uint8_t> zp_a(static_cast<size_t>(N * num_blocks), 3);
+  const std::vector<uint8_t> zp_b(zp_a.size(), 5);  // the only difference between the two models
+
+  std::string bytes_a, bytes_b;
+  SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_a, bytes_a);
+  SerializeDQMatMulModel(M, N, K, block_size, weight, scale, zp_b, bytes_b);
+
+  PrepackedWeightsContainer shared_container;
+  bool fused_first = false, fused_second = false, fused_other = false;
+  size_t shared_first = 0, shared_second = 0, shared_other = 0;
+
+  RunSharedFusionSession(bytes_a, shared_container, fused_first, shared_first);
+  ASSERT_TRUE(fused_first) << "DQ -> MatMulNBits fusion did not run";
+  if (shared_container.GetNumberOfElements() == 0) {
+    GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform";
+  }
+  EXPECT_EQ(shared_first, size_t{0});  // the container starts empty, so the first session shares nothing
+
+  // Same model again: the tagged B must now be served from the container.
+  RunSharedFusionSession(bytes_a, shared_container, fused_second, shared_second);
+  ASSERT_TRUE(fused_second);
+  EXPECT_GT(shared_second, size_t{0});
+
+  // Different zero points => different identity => the pre-packed buffer must not be reused.
+  RunSharedFusionSession(bytes_b, shared_container, fused_other, shared_other);
+  ASSERT_TRUE(fused_other);
+  EXPECT_EQ(shared_other, size_t{0});
+}
+
 TEST_F(DQMatMulNBitsFusionTest, Pattern1_MatMul_WithDefaultZP8) {
   constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
 
diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc
index a1c0f8adfffb7..d16707e9a9ad4 100644
--- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc
@@ -2,10 +2,14 @@
 // Licensed under the MIT License.
 
 #include <type_traits>
+#include <unordered_map>
 
 #include "core/common/span_utils.h"
 #include "core/common/float16.h"
 #include "core/framework/int4.h"
+#include "core/framework/prepacked_weights_container.h"
+#include "core/graph/constants.h"
+#include "core/graph/model.h"
 #include "core/graph/node_attr_utils.h"
 #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
 #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
@@ -1462,6 +1466,219 @@ TEST(QDQTransformerTests, DQGemmNotConvertedToMatMulNBits_Alpha) {
                     1e-5, 2e-5);
 }
 
+// ---------------------------------------------------------------------------
+// Cross-session pre-pack sharing for the DEFAULT DQ->MatMulNBits path
+// ---------------------------------------------------------------------------
+// DQMatMulToMatMulNBitsAction (in the QDQ selector/action transformer) runs without the
+// session.enable_dq_matmulnbits_fusion flag and synthesizes the MatMulNBits B/scales/zp initializers
+// with names that are NOT stable across sessions. It tags the generated B weight with a stable,
+// content-derived identity that SessionState uses to share the pre-packed buffer across sessions.
+
+// Packs uint4 nibble values (row-major) into UInt4x2 storage, two consecutive values per element.
+static std::vector<UInt4x2> PackUint4Nibbles(const std::vector<uint8_t>& values) {
+  std::vector<UInt4x2> packed(UInt4x2::CalcNumInt4Pairs(values.size()));
+  for (size_t src = 0, dst = 0; src < values.size(); src += 2, ++dst) {
+    const bool has_high = src + 1 < values.size();  // false only for the last value of an odd-length input
+    const uint8_t low = values[src] & 0x0F;
+    const uint8_t high = has_high ? (values[src + 1] & 0x0F) : 0;
+    packed[dst] = UInt4x2(low, high);
+  }
+  return packed;
+}
+
+// Builds the default-path graph: a constant UINT4 weight of shape [K, N], block-quantized along axis 0,
+// goes through DequantizeLinear, and the result is the second input of a single MatMul. The QDQ
+// selector/action transformer turns this into a MatMulNBits. Explicit weight/scale/zp keep it deterministic.
+static void BuildDefaultPathDQMatMul(ModelTestBuilder& builder, int64_t M, int64_t N, int64_t K,
+                                     int64_t block_size, const std::vector<uint8_t>& weight,
+                                     const std::vector<float>& scale, const std::vector<uint8_t>& zp) {
+  const int64_t num_blocks = (K + block_size - 1) / block_size;  // ceil(K / block_size)
+
+  auto* activation = builder.MakeInput<float>({M, K}, -1.0f, 1.0f);
+  auto* matmul_out = builder.MakeOutput();
+
+  auto* quant_weight = builder.MakeInitializer<UInt4x2>({K, N}, PackUint4Nibbles(weight));
+  auto* quant_scale = builder.MakeInitializer<float>({num_blocks, N}, scale);
+  auto* quant_zp = builder.MakeInitializer<UInt4x2>({num_blocks, N}, PackUint4Nibbles(zp));
+  auto* dequantized = builder.MakeIntermediate();
+
+  NodeAttributes attrs;
+  utils::SetNodeAttribute(utils::MakeAttribute("axis", static_cast<int64_t>(0)), attrs);
+  utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs);
+  builder.AddNode("DequantizeLinear", {quant_weight, quant_scale, quant_zp}, {dequantized}, "", &attrs);
+
+  builder.AddNode("MatMul", {activation, dequantized}, {matmul_out});
+}
+
+// Builds the default-path DQ->MatMul model from explicit quantization data and serializes it to `model_bytes`.
+static void SerializeDefaultPathModel(int64_t M, int64_t N, int64_t K, int64_t block_size,
+                                      const std::vector<uint8_t>& weight, const std::vector<float>& scale,
+                                      const std::vector<uint8_t>& zp, std::string& model_bytes) {
+  const std::unordered_map<std::string, int> opset_imports{{"", 21}, {kMSDomain, 1}};
+  Model default_path_model("dq_matmul_default_share", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), opset_imports,
+                           std::vector<ONNX_NAMESPACE::FunctionProto>(), DefaultLoggingManager().DefaultLogger());
+  ModelTestBuilder graph_builder(default_path_model.MainGraph());
+  BuildDefaultPathDQMatMul(graph_builder, M, N, K, block_size, weight, scale, zp);
+  graph_builder.SetGraphOutputs();
+  ASSERT_STATUS_OK(default_path_model.MainGraph().Resolve());
+  ASSERT_TRUE(default_path_model.ToProto().SerializeToString(&model_bytes));
+}
+
+// Loads the model on the CPU EP with the given shared container and DEFAULT options (no fusion flag).
+// Reports whether a MatMulNBits was produced, the sharing identity tagged onto its B weight, and how
+// many pre-packed weights this session served from the container. All outputs are reset up front.
+static void RunDefaultPathSession(const std::string& model_bytes, PrepackedWeightsContainer& container,
+                                  bool& produced_matmulnbits, std::string& b_tag, size_t& used_shared_count,
+                                  int accuracy_level = -1) {
+  produced_matmulnbits = false;  // reset first: a failing ASSERT_* below returns from this helper early
+  b_tag.clear();
+  used_shared_count = 0;
+  SessionOptions so;
+  if (accuracy_level >= 0) {  // a negative level adds no config entry
+    ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel,
+                                                      std::to_string(accuracy_level).c_str()));
+  }
+  InferenceSessionWrapper session{so, GetEnvironment()};
+  ASSERT_STATUS_OK(session.AddPrePackedWeightsContainer(&container));
+  ASSERT_STATUS_OK(session.Load(model_bytes.data(), static_cast<int>(model_bytes.size())));
+  ASSERT_STATUS_OK(session.Initialize());
+  const Graph& graph = session.GetGraph();
+  for (const auto& node : graph.Nodes()) {
+    if (node.OpType() == "MatMulNBits") {
+      produced_matmulnbits = true;
+      const std::string& b_name = node.InputDefs()[1]->Name();  // input 1 == quantized B
+      if (const std::string* id = graph.GetSharedPrepackInitializerId(b_name); id != nullptr) {
+        b_tag = *id;
+      }
+      break;
+    }
+  }
+  used_shared_count = session.GetSessionState().GetUsedSharedPrePackedWeightCounter();
+}
+
+// Verifies the default DQ->MatMulNBits path tags its generated B weight with a stable, content-derived
+// identity: identical quantization data yields the SAME identity, while different zero points yield a
+// DIFFERENT identity (so two models differing only in zp must not falsely share a pre-packed buffer).
+TEST(QDQTransformerTests, DefaultPath_TagsGeneratedWeightWithStableContentIdentity) {
+  constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
+  const int64_t num_blocks = K / block_size;
+
+  std::vector<uint8_t> weight(static_cast<size_t>(K * N));
+  for (size_t i = 0; i < weight.size(); ++i) {
+    weight[i] = static_cast<uint8_t>(i % 16);
+  }
+  std::vector<float> scale(static_cast<size_t>(num_blocks * N));
+  for (size_t i = 0; i < scale.size(); ++i) {
+    scale[i] = 0.1f + 0.01f * static_cast<float>(i % 10);
+  }
+  std::vector<uint8_t> zp_a(static_cast<size_t>(num_blocks * N), 3);
+  std::vector<uint8_t> zp_b(zp_a.size(), 5);
+
+  auto tag_for = [&](const std::vector<uint8_t>& zp) -> std::string {
+    std::string model_bytes;
+    SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp, model_bytes);
+    PrepackedWeightsContainer container;
+    bool produced = false;
+    std::string tag;
+    size_t used = 0;
+    RunDefaultPathSession(model_bytes, container, produced, tag, used);
+    EXPECT_TRUE(produced) << "DQ -> MatMulNBits conversion did not run on the default path";
+    return tag;
+  };
+  const std::string id_a1 = tag_for(zp_a);
+  const std::string id_a2 = tag_for(zp_a);
+  const std::string id_b = tag_for(zp_b);
+
+  ASSERT_FALSE(id_a1.empty()) << "generated B weight was not tagged for cross-session sharing";
+  ASSERT_FALSE(id_b.empty()) << "zp_b model: generated B weight was not tagged";  // else EXPECT_NE is vacuous
+  EXPECT_EQ(id_a1, id_a2);  // stable: identical quantization data -> identical identity
+  EXPECT_NE(id_a1, id_b);   // collision-safe: different zero points -> different identity
+}
+
+// End-to-end on the default path (no session option): two sessions converting the same model share the
+// MatMulNBits B pre-packed buffer through a common container. A model that differs only in its zero
+// points has a different identity and must not reuse that buffer.
+TEST(QDQTransformerTests, DefaultPath_SharesWeightAcrossSessionsViaTag) {
+  constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
+  const int64_t num_blocks = K / block_size;
+
+  std::vector<uint8_t> weight(static_cast<size_t>(K * N));
+  for (size_t w = 0; w != weight.size(); ++w) {
+    weight[w] = static_cast<uint8_t>(w % 16);
+  }
+  std::vector<float> scale(static_cast<size_t>(num_blocks * N));
+  for (size_t s = 0; s != scale.size(); ++s) {
+    scale[s] = 0.1f + 0.01f * static_cast<float>(s % 10);
+  }
+  const std::vector<uint8_t> zp_a(static_cast<size_t>(num_blocks * N), 3);
+  const std::vector<uint8_t> zp_b(zp_a.size(), 5);  // the only difference between the two models
+
+  std::string bytes_a, bytes_b;
+  SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_a, bytes_a);
+  SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zp_b, bytes_b);
+
+  PrepackedWeightsContainer shared_container;
+  bool converted_first = false, converted_second = false, converted_other = false;
+  std::string tag_first, tag_second, tag_other;
+  size_t shared_first = 0, shared_second = 0, shared_other = 0;
+
+  RunDefaultPathSession(bytes_a, shared_container, converted_first, tag_first, shared_first);
+  ASSERT_TRUE(converted_first) << "DQ -> MatMulNBits conversion did not run on the default path";
+  if (shared_container.GetNumberOfElements() == 0) {
+    GTEST_SKIP() << "MatMulNBits B was not pre-packed on this platform";
+  }
+  EXPECT_EQ(shared_first, size_t{0});  // the container starts empty, so the first session shares nothing
+
+  // Same model again: the tagged B must now be served from the container.
+  RunDefaultPathSession(bytes_a, shared_container, converted_second, tag_second, shared_second);
+  ASSERT_TRUE(converted_second);
+  EXPECT_GT(shared_second, size_t{0});
+
+  // Different zero points: the pre-packed buffer must NOT be reused.
+  RunDefaultPathSession(bytes_b, shared_container, converted_other, tag_other, shared_other);
+  ASSERT_TRUE(converted_other);
+  EXPECT_EQ(shared_other, size_t{0});
+}
+
+// accuracy_level is part of the sharing identity, so identical weights compiled for different compute
+// types (e.g. CompFp32 at level 0 vs CompInt8 at level 4) get DIFFERENT identities and must not share
+// a pre-packed buffer whose layout depends on that compute type.
+TEST(QDQTransformerTests, DefaultPath_DifferentAccuracyLevelDoesNotShare) {
+  constexpr int64_t M = 4, N = 8, K = 32, block_size = 16;
+  const int64_t num_blocks = K / block_size;
+
+  std::vector<uint8_t> weight(static_cast<size_t>(K * N));
+  for (size_t w = 0; w != weight.size(); ++w) {
+    weight[w] = static_cast<uint8_t>(w % 16);
+  }
+  std::vector<float> scale(static_cast<size_t>(num_blocks * N));
+  for (size_t s = 0; s != scale.size(); ++s) {
+    scale[s] = 0.1f + 0.01f * static_cast<float>(s % 10);
+  }
+  const std::vector<uint8_t> zero_points(static_cast<size_t>(num_blocks * N), 3);
+
+  std::string serialized;
+  SerializeDefaultPathModel(M, N, K, block_size, weight, scale, zero_points, serialized);
+
+  PrepackedWeightsContainer shared_container;
+  bool converted_l0 = false, converted_l4 = false;
+  std::string tag_l0, tag_l4;
+  size_t shared_l0 = 0, shared_l4 = 0;
+
+  RunDefaultPathSession(serialized, shared_container, converted_l0, tag_l0, shared_l0, /*accuracy_level*/ 0);
+  ASSERT_TRUE(converted_l0) << "DQ -> MatMulNBits conversion did not run on the default path";
+
+  // Identical model and weights, same container; only the accuracy level changes.
+  RunDefaultPathSession(serialized, shared_container, converted_l4, tag_l4, shared_l4, /*accuracy_level*/ 4);
+  ASSERT_TRUE(converted_l4);
+
+  ASSERT_FALSE(tag_l0.empty());
+  ASSERT_FALSE(tag_l4.empty());
+  EXPECT_NE(tag_l0, tag_l4);           // accuracy_level participates in the identity
+  EXPECT_EQ(shared_l4, size_t{0});     // different identity => no cross-accuracy sharing
+}
+
 #endif  // !defined(DISABLE_CONTRIB_OPS)
 
 }  // namespace test