From d49467da68680474aef87378f46fcef4078d0863 Mon Sep 17 00:00:00 2001
From: neilmsft <neilmsft@hotmail.com>
Date: Thu, 18 Jun 2026 11:43:10 -0700
Subject: [PATCH 1/3] Fix memcpy over-copy of packed sub-byte tensors

---
 onnxruntime/core/session/onnxruntime_c_api.cc | 10 +++--
 .../test/shared_lib/test_nontensor_types.cc   | 45 +++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index a663d209cfa53..5c135b1dab7f1 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2018,14 +2018,16 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfMap(const OrtValue* p_ml_value, int in
 }
 #endif
 
-ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems,
-                                      size_t elem_size) {
+ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems) {
   auto len = narrow<size_t>(tensor.Shape().Size());
   if (num_elems < len) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array is too short");
   }
   if (!is_string) {
-    memcpy(tensor.MutableDataRaw(), data_elem, elem_size * num_elems);
+    // Use the tensor's actual storage size in bytes rather than elem_size * num_elems.
+    // For packed sub-byte types (e.g., int4/uint4) multiple elements share a storage byte,
+    // so the naive product over-counts and would over-read the source / overflow the destination.
+    memcpy(tensor.MutableDataRaw(), data_elem, tensor.SizeInBytes());
   } else {
     const std::string* strings = reinterpret_cast<const std::string*>(data_elem);
     auto str_span = gsl::make_span(strings, num_elems);
@@ -2039,7 +2041,7 @@ ORT_STATUS_PTR CreateTensorAndPopulate(MLDataType element_type, const int64_t* s
                                        const void* data, size_t num_elements, _Inout_ OrtAllocator* allocator, OrtValue& result) {
   ORT_API_RETURN_IF_ERROR(CreateTensorImpl(element_type, shape, shape_len, allocator, result));
   ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable<Tensor>(), utils::IsDataTypeString(element_type),
-                                                 data, num_elements, element_type->Size()));
+                                                 data, num_elements));
   return nullptr;
 }
 
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index 497298474b36a..7a81fc0e15e00 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -4,6 +4,7 @@
 #include <functional>
 #include <iostream>
 #include <set>
+#include <array>
 
 #include "core/common/common.h"
 #include "core/session/onnxruntime_cxx_api.h"
@@ -277,6 +278,50 @@ TEST(CApiTest, CreateGetSeqStringTensors) {
   ASSERT_EQ(string_set, std::set<std::string>(std::begin(string_input_data), std::end(string_input_data)));
 }
 
+// Regression test for MSRC 119491: GetValue() on a sequence of packed sub-byte tensors
+// (int4/uint4) must copy only the packed storage bytes. Previously it copied
+// element_type->Size() * logical_element_count bytes, which over-counts ~2x for sub-byte
+// types and caused a heap over-read of the source and overflow of the destination.
+// Uses an odd logical length (7 elements -> ceil(7/2) = 4 packed bytes) to exercise the
+// packing edge. Best run under AddressSanitizer, where the pre-fix overflow is detected.
+TEST(CApiTest, CreateGetSeqSubByteTensors) {
+  auto default_allocator = std::make_unique<MockedOrtAllocator>();
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  auto run_for_type = [&](ONNXTensorElementDataType elem_type, const std::array<uint8_t, 4>& packed) {
+    const std::vector<int64_t> dims{7};  // 7 4-bit elements -> 4 packed bytes
+    constexpr int N = 2;
+
+    std::vector<Ort::Value> in;
+    for (int i = 0; i < N; ++i) {
+      Ort::Value tensor = Ort::Value::CreateTensor(info, const_cast<uint8_t*>(packed.data()), packed.size(),
+                                                   dims.data(), dims.size(), elem_type);
+      in.push_back(std::move(tensor));
+    }
+
+    Ort::Value seq_ort = Ort::Value::CreateSequence(in);
+
+    for (int idx = 0; idx < N; ++idx) {
+      Ort::Value out = seq_ort.GetValue(idx, default_allocator.get());
+
+      auto type_info = out.GetTypeInfo();
+      auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
+      ASSERT_EQ(tensor_info.GetElementType(), elem_type);
+      ASSERT_EQ(tensor_info.GetShape(), dims);
+
+      const auto* ret = out.GetTensorData<uint8_t>();
+      for (size_t i = 0; i < packed.size(); ++i) {
+        ASSERT_EQ(ret[i], packed[i]);
+      }
+    }
+  };
+
+  // {0, 1, 2, 3, -8, 7, 6, pad_0}
+  run_for_type(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4, {0x10, 0x32, 0x78, 0x06});
+  // {0, 1, 2, 3, 4, 5, 15, pad_0}
+  run_for_type(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4, {0x10, 0x32, 0x54, 0x0F});
+}
+
 TEST(CApiTest, TypeInfoSequence) {
   // Creation
   auto default_allocator = std::make_unique<MockedOrtAllocator>();

From e05b6d9f82535021da5c569776d49e0486306854 Mon Sep 17 00:00:00 2001
From: neilmsft <neilmsft@hotmail.com>
Date: Thu, 18 Jun 2026 12:04:48 -0700
Subject: [PATCH 2/3] update comment

---
 onnxruntime/test/shared_lib/test_nontensor_types.cc | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index 7a81fc0e15e00..6b4ea27022cb4 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -278,12 +278,8 @@ TEST(CApiTest, CreateGetSeqStringTensors) {
   ASSERT_EQ(string_set, std::set<std::string>(std::begin(string_input_data), std::end(string_input_data)));
 }
 
-// Regression test for MSRC 119491: GetValue() on a sequence of packed sub-byte tensors
-// (int4/uint4) must copy only the packed storage bytes. Previously it copied
-// element_type->Size() * logical_element_count bytes, which over-counts ~2x for sub-byte
-// types and caused a heap over-read of the source and overflow of the destination.
-// Uses an odd logical length (7 elements -> ceil(7/2) = 4 packed bytes) to exercise the
-// packing edge. Best run under AddressSanitizer, where the pre-fix overflow is detected.
+// Test - GetValue() on a sequence of packed sub-byte tensors
+// (int4/uint4) must copy only the packed storage bytes.
 TEST(CApiTest, CreateGetSeqSubByteTensors) {
   auto default_allocator = std::make_unique<MockedOrtAllocator>();
   Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);

From 38461d009bf17d1874796eb5d099aecf8d7c78ea Mon Sep 17 00:00:00 2001
From: neilmsft <neilmsft@hotmail.com>
Date: Sat, 20 Jun 2026 16:19:51 -0700
Subject: [PATCH 3/3] address const and other update

---
 onnxruntime/core/session/onnxruntime_c_api.cc       | 11 ++++++-----
 onnxruntime/test/shared_lib/test_nontensor_types.cc | 12 ++++++++----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 5c135b1dab7f1..663aac12cf317 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2018,19 +2018,21 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfMap(const OrtValue* p_ml_value, int in
 }
 #endif
 
-ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems) {
+ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, _In_ const void* data_elem, size_t num_elems) {
   auto len = narrow<size_t>(tensor.Shape().Size());
   if (num_elems < len) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array is too short");
   }
-  if (!is_string) {
+  if (!tensor.IsDataTypeString()) {
     // Use the tensor's actual storage size in bytes rather than elem_size * num_elems.
     // For packed sub-byte types (e.g., int4/uint4) multiple elements share a storage byte,
     // so the naive product over-counts and would over-read the source / overflow the destination.
     memcpy(tensor.MutableDataRaw(), data_elem, tensor.SizeInBytes());
   } else {
     const std::string* strings = reinterpret_cast<const std::string*>(data_elem);
-    auto str_span = gsl::make_span(strings, num_elems);
+    // Copy exactly the tensor's element count (len), not num_elems, to avoid writing past
+    // the destination when the source is larger than the tensor.
+    auto str_span = gsl::make_span(strings, len);
     auto* dst = tensor.MutableData<std::string>();
     std::copy(str_span.begin(), str_span.end(), dst);
   }
@@ -2040,8 +2042,7 @@ ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const
 ORT_STATUS_PTR CreateTensorAndPopulate(MLDataType element_type, const int64_t* shape, size_t shape_len,
                                        const void* data, size_t num_elements, _Inout_ OrtAllocator* allocator, OrtValue& result) {
   ORT_API_RETURN_IF_ERROR(CreateTensorImpl(element_type, shape, shape_len, allocator, result));
-  ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable<Tensor>(), utils::IsDataTypeString(element_type),
-                                                 data, num_elements));
+  ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable<Tensor>(), data, num_elements));
   return nullptr;
 }
 
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index 6b4ea27022cb4..ef982f07f7782 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -284,13 +284,13 @@ TEST(CApiTest, CreateGetSeqSubByteTensors) {
   auto default_allocator = std::make_unique<MockedOrtAllocator>();
   Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
 
-  auto run_for_type = [&](ONNXTensorElementDataType elem_type, const std::array<uint8_t, 4>& packed) {
+  auto run_for_type = [&](ONNXTensorElementDataType elem_type, std::array<uint8_t, 4> packed) {
     const std::vector<int64_t> dims{7};  // 7 4-bit elements -> 4 packed bytes
     constexpr int N = 2;
 
     std::vector<Ort::Value> in;
     for (int i = 0; i < N; ++i) {
-      Ort::Value tensor = Ort::Value::CreateTensor(info, const_cast<uint8_t*>(packed.data()), packed.size(),
+      Ort::Value tensor = Ort::Value::CreateTensor(info, packed.data(), packed.size(),
                                                    dims.data(), dims.size(), elem_type);
       in.push_back(std::move(tensor));
     }
@@ -305,8 +305,12 @@ TEST(CApiTest, CreateGetSeqSubByteTensors) {
       ASSERT_EQ(tensor_info.GetElementType(), elem_type);
       ASSERT_EQ(tensor_info.GetShape(), dims);
 
-      const auto* ret = out.GetTensorData<uint8_t>();
-      for (size_t i = 0; i < packed.size(); ++i) {
+      // Compare the packed bytes directly. GetTensorData<T>() does not support sub-byte
+      // types, so use the raw pointer and the packing-aware byte size.
+      const size_t out_bytes = out.GetTensorSizeInBytes();
+      ASSERT_EQ(out_bytes, packed.size());
+      const auto* ret = static_cast<const uint8_t*>(out.GetTensorRawData());
+      for (size_t i = 0; i < out_bytes; ++i) {
         ASSERT_EQ(ret[i], packed[i]);
       }
     }