Skip to content

Commit 1dcf9ce

Browse files
author
subrata-ms
committed
unix utility function fixes
1 parent 5c77e3f commit 1dcf9ce

File tree

1 file changed

+84
-47
lines changed

1 file changed

+84
-47
lines changed

mssql_python/pybind/unix_utils.cpp

Lines changed: 84 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -17,71 +17,108 @@
1717
const char* kOdbcEncoding = "utf-16-le"; // ODBC uses UTF-16LE for SQLWCHAR
1818
const size_t kUcsLength = 2; // SQLWCHAR is 2 bytes on all platforms
1919

20-
// Function to convert SQLWCHAR strings to std::wstring on macOS
20+
// Function to convert SQLWCHAR strings to std::wstring on macOS/Linux
21+
// Optimized version: direct conversion without intermediate buffer
2122
std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
2223
if (!sqlwStr) {
2324
return std::wstring();
2425
}
2526

27+
// Lambda to calculate string length using pointer arithmetic
28+
auto calculateLength = [](const SQLWCHAR* str) -> size_t {
29+
const SQLWCHAR* p = str;
30+
while (*p) ++p;
31+
return p - str;
32+
};
33+
2634
if (length == SQL_NTS) {
27-
// Determine length if not provided
28-
size_t i = 0;
29-
while (sqlwStr[i] != 0)
30-
++i;
31-
length = i;
35+
length = calculateLength(sqlwStr);
3236
}
3337

34-
// Create a UTF-16LE byte array from the SQLWCHAR array
35-
std::vector<char> utf16Bytes(length * kUcsLength);
36-
for (size_t i = 0; i < length; ++i) {
37-
// Copy each SQLWCHAR (2 bytes) to the byte array
38-
memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
38+
if (length == 0) {
39+
return std::wstring();
3940
}
4041

41-
// Convert UTF-16LE to std::wstring (UTF-32 on macOS)
42-
try {
43-
// Use C++11 codecvt to convert between UTF-16LE and wstring
44-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
45-
converter;
46-
std::wstring result = converter.from_bytes(
47-
reinterpret_cast<const char*>(utf16Bytes.data()),
48-
reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
49-
return result;
50-
} catch (const std::exception& e) {
51-
// Fallback to character-by-character conversion if codecvt fails
52-
std::wstring result;
53-
result.reserve(length);
54-
for (size_t i = 0; i < length; ++i) {
55-
result.push_back(static_cast<wchar_t>(sqlwStr[i]));
42+
// Lambda to check if character is in Basic Multilingual Plane
43+
auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF; };
44+
45+
// Lambda to decode surrogate pair into code point
46+
auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
47+
return 0x10000 +
48+
(static_cast<uint32_t>(high & 0x3FF) << 10) +
49+
(low & 0x3FF);
50+
};
51+
52+
// Convert UTF-16 to UTF-32 directly without intermediate buffer
53+
std::wstring result;
54+
result.reserve(length); // Reserve assuming most chars are BMP
55+
56+
size_t i = 0;
57+
while (i < length) {
58+
uint16_t utf16Char = static_cast<uint16_t>(sqlwStr[i]);
59+
60+
// Fast path: BMP character (most common - ~99% of strings)
61+
if (isBMP(utf16Char)) {
62+
result.push_back(static_cast<wchar_t>(utf16Char));
63+
++i;
64+
}
65+
// Handle surrogate pairs for characters outside BMP
66+
else if (utf16Char <= 0xDBFF) { // High surrogate
67+
if (i + 1 < length) {
68+
uint16_t lowSurrogate = static_cast<uint16_t>(sqlwStr[i + 1]);
69+
if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) {
70+
uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate);
71+
result.push_back(static_cast<wchar_t>(codePoint));
72+
i += 2;
73+
continue;
74+
}
75+
}
76+
// Invalid surrogate - push as-is
77+
result.push_back(static_cast<wchar_t>(utf16Char));
78+
++i;
79+
}
80+
else { // Low surrogate without high - invalid but push as-is
81+
result.push_back(static_cast<wchar_t>(utf16Char));
82+
++i;
5683
}
57-
return result;
5884
}
85+
return result;
5986
}
6087

61-
// Function to convert std::wstring to SQLWCHAR array on macOS
88+
// Function to convert std::wstring to SQLWCHAR array on macOS/Linux
89+
// Optimized version: streamlined conversion with better branch prediction
6290
std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
63-
try {
64-
// Convert wstring (UTF-32 on macOS) to UTF-16LE bytes
65-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
66-
converter;
67-
std::string utf16Bytes = converter.to_bytes(str);
91+
if (str.empty()) {
92+
return std::vector<SQLWCHAR>(1, 0); // Just null terminator
93+
}
6894

69-
// Convert the bytes to SQLWCHAR array
70-
std::vector<SQLWCHAR> result(utf16Bytes.size() / kUcsLength + 1,
71-
0); // +1 for null terminator
72-
for (size_t i = 0; i < utf16Bytes.size() / kUcsLength; ++i) {
73-
memcpy(&result[i], &utf16Bytes[i * kUcsLength], kUcsLength);
74-
}
75-
return result;
76-
} catch (const std::exception& e) {
77-
// Fallback to simple casting if codecvt fails
78-
std::vector<SQLWCHAR> result(str.size() + 1,
79-
0); // +1 for null terminator
80-
for (size_t i = 0; i < str.size(); ++i) {
81-
result[i] = static_cast<SQLWCHAR>(str[i]);
95+
// Lambda to encode code point as surrogate pair and append to result
96+
auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
97+
cp -= 0x10000;
98+
vec.push_back(static_cast<SQLWCHAR>(0xD800 | ((cp >> 10) & 0x3FF)));
99+
vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
100+
};
101+
102+
// Convert wstring (UTF-32) to UTF-16
103+
std::vector<SQLWCHAR> result;
104+
result.reserve(str.size() + 1); // Most chars are BMP, so reserve exact size
105+
106+
for (wchar_t wc : str) {
107+
uint32_t codePoint = static_cast<uint32_t>(wc);
108+
109+
// Fast path: BMP character (most common - ~99% of strings)
110+
if (codePoint <= 0xFFFF) {
111+
result.push_back(static_cast<SQLWCHAR>(codePoint));
112+
}
113+
// Encode as surrogate pair for characters outside BMP
114+
else if (codePoint <= 0x10FFFF) {
115+
encodeSurrogatePair(result, codePoint);
82116
}
83-
return result;
117+
// Invalid code points silently skipped
84118
}
119+
120+
result.push_back(0); // Null terminator
121+
return result;
85122
}
86123

87124
#endif

0 commit comments

Comments
 (0)