Skip to content

Commit c2ea17e

Browse files
committed
Merge branch 'jahnvi/250_encoding_decoding' of https://github.com/microsoft/mssql-python into jahnvi/timeout_291
2 parents f5079bb + 79706c7 commit c2ea17e

File tree

4 files changed

+965
-611
lines changed

4 files changed

+965
-611
lines changed

mssql_python/connection.py

Lines changed: 70 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,54 @@
5757
# Note: "utf-16" with BOM is NOT included as it's problematic for SQL_WCHAR
5858
UTF16_ENCODINGS: frozenset[str] = frozenset(["utf-16le", "utf-16be"])
5959

60-
# Valid encoding characters (alphanumeric, dash, underscore only)
61-
import string
6260

63-
VALID_ENCODING_CHARS: frozenset[str] = frozenset(string.ascii_letters + string.digits + "-_")
61+
def _validate_utf16_wchar_compatibility(
62+
encoding: str, wchar_type: int, context: str = "SQL_WCHAR"
63+
) -> None:
64+
"""
65+
Validates UTF-16 encoding compatibility with SQL_WCHAR.
66+
67+
Centralizes the validation logic to eliminate duplication across setencoding/setdecoding.
68+
69+
Args:
70+
encoding: The encoding string (already normalized to lowercase)
71+
wchar_type: The SQL_WCHAR constant value to check against
72+
context: Context string for error messages ('SQL_WCHAR', 'SQL_WCHAR ctype', etc.)
73+
74+
Raises:
75+
ProgrammingError: If encoding is incompatible with SQL_WCHAR
76+
"""
77+
if encoding == "utf-16":
78+
# UTF-16 with BOM is rejected due to byte order ambiguity
79+
logger.warning("utf-16 with BOM rejected for %s", context)
80+
raise ProgrammingError(
81+
driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
82+
ddbc_error=(
83+
"Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
84+
"Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
85+
),
86+
)
87+
elif encoding not in UTF16_ENCODINGS:
88+
# Non-UTF-16 encodings are not supported with SQL_WCHAR
89+
logger.warning(
90+
"Non-UTF-16 encoding %s attempted with %s", sanitize_user_input(encoding), context
91+
)
92+
93+
# Generate context-appropriate error messages
94+
if "ctype" in context:
95+
driver_error = f"SQL_WCHAR ctype only supports UTF-16 encodings"
96+
ddbc_context = "SQL_WCHAR ctype"
97+
else:
98+
driver_error = f"SQL_WCHAR only supports UTF-16 encodings"
99+
ddbc_context = "SQL_WCHAR"
100+
101+
raise ProgrammingError(
102+
driver_error=driver_error,
103+
ddbc_error=(
104+
f"Cannot use encoding '{encoding}' with {ddbc_context}. "
105+
f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
106+
),
107+
)
64108

65109

66110
def _validate_encoding(encoding: str) -> bool:
@@ -78,14 +122,18 @@ def _validate_encoding(encoding: str) -> bool:
78122
Cache size is limited to 128 entries which should cover most use cases.
79123
Also validates that encoding name only contains safe characters.
80124
"""
81-
# First check for dangerous characters (security validation)
82-
if not all(c in VALID_ENCODING_CHARS for c in encoding):
125+
# Basic security checks - prevent obvious attacks
126+
if not encoding or not isinstance(encoding, str):
83127
return False
84128

85129
# Check length limit (prevent DOS)
86130
if len(encoding) > 100:
87131
return False
88132

133+
# Prevent null bytes and control characters that could cause issues
134+
if "\x00" in encoding or any(ord(c) < 32 and c not in "\t\n\r" for c in encoding):
135+
return False
136+
89137
# Then check if it's a valid Python codec
90138
try:
91139
codecs.lookup(encoding)
@@ -437,8 +485,7 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
437485
# Validate encoding using cached validation for better performance
438486
if not _validate_encoding(encoding):
439487
# Log the sanitized encoding for security
440-
logger.debug(
441-
"warning",
488+
logger.warning(
442489
"Invalid encoding attempted: %s",
443490
sanitize_user_input(str(encoding)),
444491
)
@@ -451,19 +498,9 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
451498
encoding = encoding.casefold()
452499
logger.debug("setencoding: Encoding normalized to %s", encoding)
453500

454-
# Reject 'utf-16' with BOM for SQL_WCHAR (ambiguous byte order)
455-
if encoding == "utf-16" and ctype == ConstantsDDBC.SQL_WCHAR.value:
456-
logger.debug(
457-
"warning",
458-
"utf-16 with BOM rejected for SQL_WCHAR",
459-
)
460-
raise ProgrammingError(
461-
driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
462-
ddbc_error=(
463-
"Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
464-
"Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
465-
),
466-
)
501+
# Early validation if ctype is already specified as SQL_WCHAR
502+
if ctype == ConstantsDDBC.SQL_WCHAR.value:
503+
_validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR")
467504

468505
# Set default ctype based on encoding if not provided
469506
if ctype is None:
@@ -478,8 +515,7 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
478515
valid_ctypes = [ConstantsDDBC.SQL_CHAR.value, ConstantsDDBC.SQL_WCHAR.value]
479516
if ctype not in valid_ctypes:
480517
# Log the sanitized ctype for security
481-
logger.debug(
482-
"warning",
518+
logger.warning(
483519
"Invalid ctype attempted: %s",
484520
sanitize_user_input(str(ctype)),
485521
)
@@ -491,37 +527,16 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
491527
),
492528
)
493529

494-
# Validate that SQL_WCHAR ctype only used with UTF-16 encodings (not utf-16 with BOM)
530+
# Final validation: SQL_WCHAR ctype only supports UTF-16 encodings (without BOM)
495531
if ctype == ConstantsDDBC.SQL_WCHAR.value:
496-
if encoding == "utf-16":
497-
raise ProgrammingError(
498-
driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
499-
ddbc_error=(
500-
"Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
501-
"Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
502-
),
503-
)
504-
elif encoding not in UTF16_ENCODINGS:
505-
logger.debug(
506-
"warning",
507-
"Non-UTF-16 encoding %s attempted with SQL_WCHAR ctype",
508-
sanitize_user_input(encoding),
509-
)
510-
raise ProgrammingError(
511-
driver_error=f"SQL_WCHAR only supports UTF-16 encodings",
512-
ddbc_error=(
513-
f"Cannot use encoding '{encoding}' with SQL_WCHAR. "
514-
f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
515-
),
516-
)
532+
_validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR")
517533

518534
# Store the encoding settings (thread-safe with lock)
519535
with self._encoding_lock:
520536
self._encoding_settings = {"encoding": encoding, "ctype": ctype}
521537

522538
# Log with sanitized values for security
523-
logger.debug(
524-
"info",
539+
logger.info(
525540
"Text encoding set to %s with ctype %s",
526541
sanitize_user_input(encoding),
527542
sanitize_user_input(str(ctype)),
@@ -604,8 +619,7 @@ def setdecoding(
604619
SQL_WMETADATA,
605620
]
606621
if sqltype not in valid_sqltypes:
607-
logger.debug(
608-
"warning",
622+
logger.warning(
609623
"Invalid sqltype attempted: %s",
610624
sanitize_user_input(str(sqltype)),
611625
)
@@ -627,8 +641,7 @@ def setdecoding(
627641

628642
# Validate encoding using cached validation for better performance
629643
if not _validate_encoding(encoding):
630-
logger.debug(
631-
"warning",
644+
logger.warning(
632645
"Invalid encoding attempted: %s",
633646
sanitize_user_input(str(encoding)),
634647
)
@@ -640,34 +653,9 @@ def setdecoding(
640653
# Normalize encoding to lowercase for consistency
641654
encoding = encoding.lower()
642655

643-
# Reject 'utf-16' with BOM for SQL_WCHAR (ambiguous byte order)
644-
if sqltype == ConstantsDDBC.SQL_WCHAR.value and encoding == "utf-16":
645-
logger.debug(
646-
"warning",
647-
"utf-16 with BOM rejected for SQL_WCHAR",
648-
)
649-
raise ProgrammingError(
650-
driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
651-
ddbc_error=(
652-
"Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
653-
"Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
654-
),
655-
)
656-
657-
# Validate SQL_WCHAR only supports UTF-16 encodings (SQL_WMETADATA is more flexible)
658-
if sqltype == ConstantsDDBC.SQL_WCHAR.value and encoding not in UTF16_ENCODINGS:
659-
logger.debug(
660-
"warning",
661-
"Non-UTF-16 encoding %s attempted with SQL_WCHAR sqltype",
662-
sanitize_user_input(encoding),
663-
)
664-
raise ProgrammingError(
665-
driver_error=f"SQL_WCHAR only supports UTF-16 encodings",
666-
ddbc_error=(
667-
f"Cannot use encoding '{encoding}' with SQL_WCHAR. "
668-
f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
669-
),
670-
)
656+
# Validate SQL_WCHAR encoding compatibility
657+
if sqltype == ConstantsDDBC.SQL_WCHAR.value:
658+
_validate_utf16_wchar_compatibility(encoding, sqltype, "SQL_WCHAR sqltype")
671659

672660
# SQL_WMETADATA can use any valid encoding (UTF-8, UTF-16, etc.)
673661
# No restriction needed here - let users configure as needed
@@ -682,8 +670,7 @@ def setdecoding(
682670
# Validate ctype
683671
valid_ctypes = [ConstantsDDBC.SQL_CHAR.value, ConstantsDDBC.SQL_WCHAR.value]
684672
if ctype not in valid_ctypes:
685-
logger.debug(
686-
"warning",
673+
logger.warning(
687674
"Invalid ctype attempted: %s",
688675
sanitize_user_input(str(ctype)),
689676
)
@@ -695,29 +682,9 @@ def setdecoding(
695682
),
696683
)
697684

698-
# Validate that SQL_WCHAR ctype only used with UTF-16 encodings (not utf-16 with BOM)
685+
# Validate SQL_WCHAR ctype encoding compatibility
699686
if ctype == ConstantsDDBC.SQL_WCHAR.value:
700-
if encoding == "utf-16":
701-
raise ProgrammingError(
702-
driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
703-
ddbc_error=(
704-
"Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
705-
"Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
706-
),
707-
)
708-
elif encoding not in UTF16_ENCODINGS:
709-
logger.debug(
710-
"warning",
711-
"Non-UTF-16 encoding %s attempted with SQL_WCHAR ctype",
712-
sanitize_user_input(encoding),
713-
)
714-
raise ProgrammingError(
715-
driver_error=f"SQL_WCHAR ctype only supports UTF-16 encodings",
716-
ddbc_error=(
717-
f"Cannot use encoding '{encoding}' with SQL_WCHAR ctype. "
718-
f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
719-
),
720-
)
687+
_validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR ctype")
721688

722689
# Store the decoding settings for the specified sqltype (thread-safe with lock)
723690
with self._encoding_lock:
@@ -730,8 +697,7 @@ def setdecoding(
730697
SQL_WMETADATA: "SQL_WMETADATA",
731698
}.get(sqltype, str(sqltype))
732699

733-
logger.debug(
734-
"info",
700+
logger.info(
735701
"Text decoding set for %s to %s with ctype %s",
736702
sqltype_name,
737703
sanitize_user_input(encoding),

mssql_python/cursor.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -297,21 +297,33 @@ def _get_encoding_settings(self):
297297
298298
Returns:
299299
dict: A dictionary with 'encoding' and 'ctype' keys, or default settings if not available
300+
301+
Raises:
302+
OperationalError, DatabaseError: If there are unexpected database connection issues
303+
that indicate a broken connection state. These should not be silently ignored
304+
as they can lead to data corruption or inconsistent behavior.
300305
"""
301306
if hasattr(self._connection, "getencoding"):
302307
try:
303308
return self._connection.getencoding()
304309
except (OperationalError, DatabaseError) as db_error:
305-
# Only catch database-related errors, not programming errors
306-
from mssql_python.helpers import log
307-
308-
log(
309-
"warning",
310-
f"Failed to get encoding settings from connection due to database error: {db_error}",
310+
# Log the error for debugging but re-raise for fail-fast behavior
311+
# Silently returning defaults can lead to data corruption and hard-to-debug issues
312+
logger.error(
313+
"Failed to get encoding settings from connection due to database error: %s. "
314+
"This indicates a broken connection state that should not be ignored.",
315+
db_error,
311316
)
312-
return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
317+
# Re-raise to fail fast - users should know their connection is broken
318+
raise
319+
except Exception as unexpected_error:
320+
# Handle other unexpected errors (connection closed, programming errors, etc.)
321+
logger.error("Unexpected error getting encoding settings: %s", unexpected_error)
322+
# Re-raise unexpected errors as well
323+
raise
313324

314325
# Return default encoding settings if getencoding is not available
326+
# This is the only case where defaults are appropriate (method doesn't exist)
315327
return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
316328

317329
def _get_decoding_settings(self, sql_type):
@@ -323,22 +335,35 @@ def _get_decoding_settings(self, sql_type):
323335
324336
Returns:
325337
Dictionary containing the decoding settings.
338+
339+
Raises:
340+
OperationalError, DatabaseError: If there are unexpected database connection issues
341+
that indicate a broken connection state. These should not be silently ignored
342+
as they can lead to data corruption or inconsistent behavior.
326343
"""
327344
try:
328345
# Get decoding settings from connection for this SQL type
329346
return self._connection.getdecoding(sql_type)
330347
except (OperationalError, DatabaseError) as db_error:
331-
# Only handle expected database-related errors
332-
from mssql_python.helpers import log
333-
334-
log(
335-
"warning",
336-
f"Failed to get decoding settings for SQL type {sql_type} due to database error: {db_error}",
348+
# Log the error for debugging but re-raise for fail-fast behavior
349+
# Silently returning defaults can lead to data corruption and hard-to-debug issues
350+
logger.error(
351+
"Failed to get decoding settings for SQL type %s due to database error: %s. "
352+
"This indicates a broken connection state that should not be ignored.",
353+
sql_type,
354+
db_error,
337355
)
338-
if sql_type == ddbc_sql_const.SQL_WCHAR.value:
339-
return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
340-
else:
341-
return {"encoding": "utf-8", "ctype": ddbc_sql_const.SQL_CHAR.value}
356+
# Re-raise to fail fast - users should know their connection is broken
357+
raise
358+
except Exception as unexpected_error:
359+
# Handle other unexpected errors (connection closed, programming errors, etc.)
360+
logger.error(
361+
"Unexpected error getting decoding settings for SQL type %s: %s",
362+
sql_type,
363+
unexpected_error,
364+
)
365+
# Re-raise unexpected errors as well
366+
raise
342367

343368
def _map_sql_type( # pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals,too-many-return-statements,too-many-branches
344369
self,

0 commit comments

Comments
 (0)