From d49467da68680474aef87378f46fcef4078d0863 Mon Sep 17 00:00:00 2001 From: neilmsft Date: Thu, 18 Jun 2026 11:43:10 -0700 Subject: [PATCH 1/3] Fix memcpy over-copy of packed sub-byte tensors --- onnxruntime/core/session/onnxruntime_c_api.cc | 10 +++-- .../test/shared_lib/test_nontensor_types.cc | 45 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index a663d209cfa53..5c135b1dab7f1 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2018,14 +2018,16 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfMap(const OrtValue* p_ml_value, int in } #endif -ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems, - size_t elem_size) { +ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems) { auto len = narrow(tensor.Shape().Size()); if (num_elems < len) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array is too short"); } if (!is_string) { - memcpy(tensor.MutableDataRaw(), data_elem, elem_size * num_elems); + // Use the tensor's actual storage size in bytes rather than elem_size * num_elems. + // For packed sub-byte types (e.g., int4/uint4) multiple elements share a storage byte, + // so the naive product over-counts and would over-read the source / overflow the destination. + memcpy(tensor.MutableDataRaw(), data_elem, tensor.SizeInBytes()); } else { const std::string* strings = reinterpret_cast(data_elem); auto str_span = gsl::make_span(strings, num_elems); @@ -2039,7 +2041,7 @@ ORT_STATUS_PTR CreateTensorAndPopulate(MLDataType element_type, const int64_t* s const void* data, size_t num_elements, _Inout_ OrtAllocator* allocator, OrtValue& result) { ORT_API_RETURN_IF_ERROR(CreateTensorImpl(element_type, shape, shape_len, allocator, result)); ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable(), utils::IsDataTypeString(element_type), - data, num_elements, element_type->Size())); + data, num_elements)); return nullptr; } diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc index 497298474b36a..7a81fc0e15e00 100644 --- a/onnxruntime/test/shared_lib/test_nontensor_types.cc +++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include "core/common/common.h" #include "core/session/onnxruntime_cxx_api.h" @@ -277,6 +278,50 @@ TEST(CApiTest, CreateGetSeqStringTensors) { ASSERT_EQ(string_set, std::set(std::begin(string_input_data), std::end(string_input_data))); } +// Regression test for MSRC 119491: GetValue() on a sequence of packed sub-byte tensors +// (int4/uint4) must copy only the packed storage bytes. Previously it copied +// element_type->Size() * logical_element_count bytes, which over-counts ~2x for sub-byte +// types and caused a heap over-read of the source and overflow of the destination. +// Uses an odd logical length (7 elements -> ceil(7/2) = 4 packed bytes) to exercise the +// packing edge. Best run under AddressSanitizer, where the pre-fix overflow is detected. +TEST(CApiTest, CreateGetSeqSubByteTensors) { + auto default_allocator = std::make_unique(); + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + auto run_for_type = [&](ONNXTensorElementDataType elem_type, const std::array& packed) { + const std::vector dims{7}; // 7 4-bit elements -> 4 packed bytes + constexpr int N = 2; + + std::vector in; + for (int i = 0; i < N; ++i) { + Ort::Value tensor = Ort::Value::CreateTensor(info, const_cast(packed.data()), packed.size(), + dims.data(), dims.size(), elem_type); + in.push_back(std::move(tensor)); + } + + Ort::Value seq_ort = Ort::Value::CreateSequence(in); + + for (int idx = 0; idx < N; ++idx) { + Ort::Value out = seq_ort.GetValue(idx, default_allocator.get()); + + auto type_info = out.GetTypeInfo(); + auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); + ASSERT_EQ(tensor_info.GetElementType(), elem_type); + ASSERT_EQ(tensor_info.GetShape(), dims); + + const auto* ret = out.GetTensorData(); + for (size_t i = 0; i < packed.size(); ++i) { + ASSERT_EQ(ret[i], packed[i]); + } + } + }; + + // {0, 1, 2, 3, -8, 7, 6, pad_0} + run_for_type(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4, {0x10, 0x32, 0x78, 0x06}); + // {0, 1, 2, 3, 4, 5, 15, pad_0} + run_for_type(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4, {0x10, 0x32, 0x54, 0x0F}); +} + TEST(CApiTest, TypeInfoSequence) { // Creation auto default_allocator = std::make_unique(); From e05b6d9f82535021da5c569776d49e0486306854 Mon Sep 17 00:00:00 2001 From: neilmsft Date: Thu, 18 Jun 2026 12:04:48 -0700 Subject: [PATCH 2/3] update comment --- onnxruntime/test/shared_lib/test_nontensor_types.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc index 7a81fc0e15e00..6b4ea27022cb4 100644 --- a/onnxruntime/test/shared_lib/test_nontensor_types.cc +++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc @@ -278,12 +278,8 @@ TEST(CApiTest, CreateGetSeqStringTensors) { ASSERT_EQ(string_set, std::set(std::begin(string_input_data), std::end(string_input_data))); } -// Regression test for MSRC 119491: GetValue() on a sequence of packed sub-byte tensors -// (int4/uint4) must copy only the packed storage bytes. Previously it copied -// element_type->Size() * logical_element_count bytes, which over-counts ~2x for sub-byte -// types and caused a heap over-read of the source and overflow of the destination. -// Uses an odd logical length (7 elements -> ceil(7/2) = 4 packed bytes) to exercise the -// packing edge. Best run under AddressSanitizer, where the pre-fix overflow is detected. +// Test - GetValue() on a sequence of packed sub-byte tensors +// (int4/uint4) must copy only the packed storage bytes. TEST(CApiTest, CreateGetSeqSubByteTensors) { auto default_allocator = std::make_unique(); Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); From 38461d009bf17d1874796eb5d099aecf8d7c78ea Mon Sep 17 00:00:00 2001 From: neilmsft Date: Sat, 20 Jun 2026 16:19:51 -0700 Subject: [PATCH 3/3] address const and other update --- onnxruntime/core/session/onnxruntime_c_api.cc | 11 ++++++----- onnxruntime/test/shared_lib/test_nontensor_types.cc | 12 ++++++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 5c135b1dab7f1..663aac12cf317 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2018,19 +2018,21 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfMap(const OrtValue* p_ml_value, int in } #endif -ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems) { +ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, _In_ const void* data_elem, size_t num_elems) { auto len = narrow(tensor.Shape().Size()); if (num_elems < len) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array is too short"); } - if (!is_string) { + if (!tensor.IsDataTypeString()) { // Use the tensor's actual storage size in bytes rather than elem_size * num_elems. // For packed sub-byte types (e.g., int4/uint4) multiple elements share a storage byte, // so the naive product over-counts and would over-read the source / overflow the destination. memcpy(tensor.MutableDataRaw(), data_elem, tensor.SizeInBytes()); } else { const std::string* strings = reinterpret_cast(data_elem); - auto str_span = gsl::make_span(strings, num_elems); + // Copy exactly the tensor's element count (len), not num_elems, to avoid writing past + // the destination when the source is larger than the tensor. + auto str_span = gsl::make_span(strings, len); auto* dst = tensor.MutableData(); std::copy(str_span.begin(), str_span.end(), dst); } @@ -2040,8 +2042,7 @@ ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const ORT_STATUS_PTR CreateTensorAndPopulate(MLDataType element_type, const int64_t* shape, size_t shape_len, const void* data, size_t num_elements, _Inout_ OrtAllocator* allocator, OrtValue& result) { ORT_API_RETURN_IF_ERROR(CreateTensorImpl(element_type, shape, shape_len, allocator, result)); - ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable(), utils::IsDataTypeString(element_type), - data, num_elements)); + ORT_API_RETURN_IF_ERROR(PopulateTensorWithData(*result.GetMutable(), data, num_elements)); return nullptr; } diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc index 6b4ea27022cb4..ef982f07f7782 100644 --- a/onnxruntime/test/shared_lib/test_nontensor_types.cc +++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc @@ -284,13 +284,13 @@ TEST(CApiTest, CreateGetSeqSubByteTensors) { auto default_allocator = std::make_unique(); Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); - auto run_for_type = [&](ONNXTensorElementDataType elem_type, const std::array& packed) { + auto run_for_type = [&](ONNXTensorElementDataType elem_type, std::array packed) { const std::vector dims{7}; // 7 4-bit elements -> 4 packed bytes constexpr int N = 2; std::vector in; for (int i = 0; i < N; ++i) { - Ort::Value tensor = Ort::Value::CreateTensor(info, const_cast(packed.data()), packed.size(), + Ort::Value tensor = Ort::Value::CreateTensor(info, packed.data(), packed.size(), dims.data(), dims.size(), elem_type); in.push_back(std::move(tensor)); } @@ -305,8 +305,12 @@ TEST(CApiTest, CreateGetSeqSubByteTensors) { ASSERT_EQ(tensor_info.GetElementType(), elem_type); ASSERT_EQ(tensor_info.GetShape(), dims); - const auto* ret = out.GetTensorData(); - for (size_t i = 0; i < packed.size(); ++i) { + // Compare the packed bytes directly. GetTensorData() does not support sub-byte + // types, so use the raw pointer and the packing-aware byte size. + const size_t out_bytes = out.GetTensorSizeInBytes(); + ASSERT_EQ(out_bytes, packed.size()); + const auto* ret = static_cast(out.GetTensorRawData()); + for (size_t i = 0; i < out_bytes; ++i) { ASSERT_EQ(ret[i], packed[i]); } }