1313#include < vector>
1414
1515#if defined(__APPLE__) || defined(__linux__)
16+
// Unicode constants for validation
constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;  // U+FFFD, substituted for invalid input
constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF;   // Upper bound accepted as a code point

// Constants for character encoding
const char* kOdbcEncoding = "utf-16-le";  // ODBC uses UTF-16LE for SQLWCHAR
const size_t kUcsLength = 2;              // SQLWCHAR is 2 bytes on all platforms
@@ -24,74 +29,113 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
2429 return std::wstring ();
2530 }
2631
32+ // Lambda to calculate string length using pointer arithmetic
33+ auto calculateLength = [](const SQLWCHAR* str) -> size_t {
34+ const SQLWCHAR* p = str;
35+ while (*p)
36+ ++p;
37+ return p - str;
38+ };
39+
2740 if (length == SQL_NTS) {
28- // Determine length if not provided
29- size_t i = 0 ;
30- while (sqlwStr[i] != 0 )
31- ++i;
32- length = i;
41+ length = calculateLength (sqlwStr);
3342 }
3443
35- // Create a UTF-16LE byte array from the SQLWCHAR array
36- std::vector<char > utf16Bytes (length * kUcsLength );
37- for (size_t i = 0 ; i < length; ++i) {
38- // Copy each SQLWCHAR (2 bytes) to the byte array
39- memcpy (&utf16Bytes[i * kUcsLength ], &sqlwStr[i], kUcsLength );
44+ if (length == 0 ) {
45+ return std::wstring ();
4046 }
4147
42- // Convert UTF-16LE to std::wstring (UTF-32 on macOS)
43- try {
44- // CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
45- // std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
46- // Each thread gets its own converter instance, eliminating race conditions
47- thread_local std::wstring_convert<
48- std::codecvt_utf8_utf16<wchar_t , 0x10ffff , std::little_endian>>
49- converter;
50-
51- std::wstring result = converter.from_bytes (
52- reinterpret_cast <const char *>(utf16Bytes.data ()),
53- reinterpret_cast <const char *>(utf16Bytes.data () + utf16Bytes.size ()));
54- return result;
55- } catch (const std::exception& e) {
56- // Fallback to character-by-character conversion if codecvt fails
57- std::wstring result;
58- result.reserve (length);
59- for (size_t i = 0 ; i < length; ++i) {
60- result.push_back (static_cast <wchar_t >(sqlwStr[i]));
48+ // Lambda to check if character is in Basic Multilingual Plane
49+ auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF ; };
50+
51+ // Lambda to decode surrogate pair into code point
52+ auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
53+ return 0x10000 + (static_cast <uint32_t >(high & 0x3FF ) << 10 ) + (low & 0x3FF );
54+ };
55+
56+ // Convert UTF-16 to UTF-32 directly without intermediate buffer
57+ std::wstring result;
58+ result.reserve (length); // Reserve assuming most chars are BMP
59+
60+ size_t i = 0 ;
61+ while (i < length) {
62+ uint16_t utf16Char = static_cast <uint16_t >(sqlwStr[i]);
63+
64+ // Fast path: BMP character (most common - ~99% of strings)
65+ if (isBMP (utf16Char)) {
66+ result.push_back (static_cast <wchar_t >(utf16Char));
67+ ++i;
68+ }
69+ // Handle surrogate pairs for characters outside BMP
70+ else if (utf16Char <= 0xDBFF ) { // High surrogate
71+ if (i + 1 < length) {
72+ uint16_t lowSurrogate = static_cast <uint16_t >(sqlwStr[i + 1 ]);
73+ if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF ) {
74+ uint32_t codePoint = decodeSurrogatePair (utf16Char, lowSurrogate);
75+ result.push_back (static_cast <wchar_t >(codePoint));
76+ i += 2 ;
77+ continue ;
78+ }
79+ }
80+ // Invalid surrogate - replace with Unicode replacement character
81+ result.push_back (static_cast <wchar_t >(kUnicodeReplacementChar ));
82+ ++i;
83+ } else { // Low surrogate without high - invalid, replace with replacement character
84+ result.push_back (static_cast <wchar_t >(kUnicodeReplacementChar ));
85+ ++i;
6186 }
62- return result;
6387 }
88+ return result;
6489}
6590
66- // Function to convert std::wstring to SQLWCHAR array on macOS
67- // THREAD-SAFE: Uses thread_local converter to avoid std::wstring_convert race conditions
91+ // Function to convert std::wstring to SQLWCHAR array on macOS/Linux
92+ // Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR)
93+ // Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD
6894std::vector<SQLWCHAR> WStringToSQLWCHAR (const std::wstring& str) {
69- try {
70- // CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
71- // std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
72- // Each thread gets its own converter instance, eliminating race conditions
73- thread_local std::wstring_convert<
74- std::codecvt_utf8_utf16<wchar_t , 0x10ffff , std::little_endian>>
75- converter;
76-
77- std::string utf16Bytes = converter.to_bytes (str);
78-
79- // Convert the bytes to SQLWCHAR array
80- std::vector<SQLWCHAR> result (utf16Bytes.size () / kUcsLength + 1 ,
81- 0 ); // +1 for null terminator
82- for (size_t i = 0 ; i < utf16Bytes.size () / kUcsLength ; ++i) {
83- memcpy (&result[i], &utf16Bytes[i * kUcsLength ], kUcsLength );
95+ if (str.empty ()) {
96+ return std::vector<SQLWCHAR>(1 , 0 ); // Just null terminator
97+ }
98+
99+ // Lambda to encode code point as surrogate pair and append to result
100+ auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
101+ cp -= 0x10000 ;
102+ vec.push_back (static_cast <SQLWCHAR>(0xD800 | ((cp >> 10 ) & 0x3FF )));
103+ vec.push_back (static_cast <SQLWCHAR>(0xDC00 | (cp & 0x3FF )));
104+ };
105+
106+ // Lambda to check if code point is a valid Unicode scalar value
107+ auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
108+ // Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
109+ return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF );
110+ };
111+
112+ // Convert wstring (UTF-32) to UTF-16
113+ std::vector<SQLWCHAR> result;
114+ result.reserve (str.size () + 1 ); // Most chars are BMP, so reserve exact size
115+
116+ for (wchar_t wc : str) {
117+ uint32_t codePoint = static_cast <uint32_t >(wc);
118+
119+ // Validate code point first
120+ if (!isValidUnicodeScalar (codePoint)) {
121+ codePoint = kUnicodeReplacementChar ;
84122 }
85- return result;
86- } catch (const std::exception& e) {
87- // Fallback to simple casting if codecvt fails
88- std::vector<SQLWCHAR> result (str.size () + 1 ,
89- 0 ); // +1 for null terminator
90- for (size_t i = 0 ; i < str.size (); ++i) {
91- result[i] = static_cast <SQLWCHAR>(str[i]);
123+
124+ // Fast path: BMP character (most common - ~99% of strings)
125+ // After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
126+ if (codePoint <= 0xFFFF ) {
127+ result.push_back (static_cast <SQLWCHAR>(codePoint));
92128 }
93- return result;
129+ // Encode as surrogate pair for characters outside BMP
130+ else if (codePoint <= kUnicodeMaxCodePoint ) {
131+ encodeSurrogatePair (result, codePoint);
132+ }
133+ // Note: Invalid code points (surrogates and > 0x10FFFF) already
134+ // replaced with replacement character (0xFFFD) at validation above
94135 }
136+
137+ result.push_back (0 ); // Null terminator
138+ return result;
95139}
96140
97141#endif
0 commit comments