Resolving comments

jahnvi480 · jahnvi480 · commit 4ad47cae9ed8 · 2025-11-28T15:48:57.000+05:30
diff --git a/mssql_python/connection.py b/mssql_python/connection.py
@@ -57,10 +57,54 @@
 # Note: "utf-16" with BOM is NOT included as it's problematic for SQL_WCHAR
 UTF16_ENCODINGS: frozenset[str] = frozenset(["utf-16le", "utf-16be"])
 
-# Valid encoding characters (alphanumeric, dash, underscore only)
-import string
 
-VALID_ENCODING_CHARS: frozenset[str] = frozenset(string.ascii_letters + string.digits + "-_")
+def _validate_utf16_wchar_compatibility(
+    encoding: str, wchar_type: int, context: str = "SQL_WCHAR"
+) -> None:
+    """
+    Validates UTF-16 encoding compatibility with SQL_WCHAR.
+
+    Centralizes the validation logic to eliminate duplication across setencoding/setdecoding.
+
+    Args:
+        encoding: The encoding string (already normalized to lowercase)
+        wchar_type: Not read by this function; callers compare it to SQL_WCHAR before calling
+        context: Label for log messages; a value containing 'ctype' selects the ctype wording
+
+    Raises:
+        ProgrammingError: If encoding is 'utf-16' (BOM) or is not one of UTF16_ENCODINGS
+    """
+    if encoding == "utf-16":
+        # UTF-16 with BOM is rejected due to byte order ambiguity
+        logger.warning("utf-16 with BOM rejected for %s", context)
+        raise ProgrammingError(
+            driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
+            ddbc_error=(
+                "Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
+                "Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
+            ),
+        )
+    elif encoding not in UTF16_ENCODINGS:
+        # Non-UTF-16 encodings are not supported with SQL_WCHAR
+        logger.warning(
+            "Non-UTF-16 encoding %s attempted with %s", sanitize_user_input(encoding), context
+        )
+
+        # Generate context-appropriate error messages
+        if "ctype" in context:
+            driver_error = "SQL_WCHAR ctype only supports UTF-16 encodings"
+            ddbc_context = "SQL_WCHAR ctype"
+        else:
+            driver_error = "SQL_WCHAR only supports UTF-16 encodings"
+            ddbc_context = "SQL_WCHAR"
+
+        raise ProgrammingError(
+            driver_error=driver_error,
+            ddbc_error=(
+                f"Cannot use encoding '{encoding}' with {ddbc_context}. "
+                "SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
+            ),
+        )
 
 
 def _validate_encoding(encoding: str) -> bool:
@@ -78,14 +122,18 @@ def _validate_encoding(encoding: str) -> bool:
         Cache size is limited to 128 entries which should cover most use cases.
         Also validates that encoding name only contains safe characters.
     """
-    # First check for dangerous characters (security validation)
-    if not all(c in VALID_ENCODING_CHARS for c in encoding):
+    # Basic security checks - prevent obvious attacks
+    if not encoding or not isinstance(encoding, str):
         return False
 
     # Check length limit (prevent DOS)
     if len(encoding) > 100:
         return False
 
+    # Reject NUL and other control characters (tab, LF and CR are exempted by the check below)
+    if "\x00" in encoding or any(ord(c) < 32 and c not in "\t\n\r" for c in encoding):
+        return False
+
     # Then check if it's a valid Python codec
     try:
         codecs.lookup(encoding)
@@ -450,18 +498,9 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
         encoding = encoding.casefold()
         logger.debug("setencoding: Encoding normalized to %s", encoding)
 
-        # Reject 'utf-16' with BOM for SQL_WCHAR (ambiguous byte order)
-        if encoding == "utf-16" and ctype == ConstantsDDBC.SQL_WCHAR.value:
-            logger.warning(
-                "utf-16 with BOM rejected for SQL_WCHAR",
-            )
-            raise ProgrammingError(
-                driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
-                ddbc_error=(
-                    "Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
-                    "Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
-                ),
-            )
+        # Early validation if ctype is already specified as SQL_WCHAR
+        if ctype == ConstantsDDBC.SQL_WCHAR.value:
+            _validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR")
 
         # Set default ctype based on encoding if not provided
         if ctype is None:
@@ -488,28 +527,9 @@ def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = Non
                 ),
             )
 
-        # Validate that SQL_WCHAR ctype only used with UTF-16 encodings (not utf-16 with BOM)
+        # Final validation: SQL_WCHAR ctype only supports UTF-16 encodings (without BOM)
         if ctype == ConstantsDDBC.SQL_WCHAR.value:
-            if encoding == "utf-16":
-                raise ProgrammingError(
-                    driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
-                    ddbc_error=(
-                        "Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
-                        "Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
-                    ),
-                )
-            elif encoding not in UTF16_ENCODINGS:
-                logger.warning(
-                    "Non-UTF-16 encoding %s attempted with SQL_WCHAR ctype",
-                    sanitize_user_input(encoding),
-                )
-                raise ProgrammingError(
-                    driver_error=f"SQL_WCHAR only supports UTF-16 encodings",
-                    ddbc_error=(
-                        f"Cannot use encoding '{encoding}' with SQL_WCHAR. "
-                        f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
-                    ),
-                )
+            _validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR")
 
         # Store the encoding settings (thread-safe with lock)
         with self._encoding_lock:
@@ -633,32 +653,9 @@ def setdecoding(
         # Normalize encoding to lowercase for consistency
         encoding = encoding.lower()
 
-        # Reject 'utf-16' with BOM for SQL_WCHAR (ambiguous byte order)
-        if sqltype == ConstantsDDBC.SQL_WCHAR.value and encoding == "utf-16":
-            logger.warning(
-                "utf-16 with BOM rejected for SQL_WCHAR",
-            )
-            raise ProgrammingError(
-                driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
-                ddbc_error=(
-                    "Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
-                    "Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
-                ),
-            )
-
-        # Validate SQL_WCHAR only supports UTF-16 encodings (SQL_WMETADATA is more flexible)
-        if sqltype == ConstantsDDBC.SQL_WCHAR.value and encoding not in UTF16_ENCODINGS:
-            logger.warning(
-                "Non-UTF-16 encoding %s attempted with SQL_WCHAR sqltype",
-                sanitize_user_input(encoding),
-            )
-            raise ProgrammingError(
-                driver_error=f"SQL_WCHAR only supports UTF-16 encodings",
-                ddbc_error=(
-                    f"Cannot use encoding '{encoding}' with SQL_WCHAR. "
-                    f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
-                ),
-            )
+        # Validate SQL_WCHAR encoding compatibility
+        if sqltype == ConstantsDDBC.SQL_WCHAR.value:
+            _validate_utf16_wchar_compatibility(encoding, sqltype, "SQL_WCHAR sqltype")
 
         # SQL_WMETADATA can use any valid encoding (UTF-8, UTF-16, etc.)
         # No restriction needed here - let users configure as needed
@@ -685,28 +682,9 @@ def setdecoding(
                 ),
             )
 
-        # Validate that SQL_WCHAR ctype only used with UTF-16 encodings (not utf-16 with BOM)
+        # Validate SQL_WCHAR ctype encoding compatibility
         if ctype == ConstantsDDBC.SQL_WCHAR.value:
-            if encoding == "utf-16":
-                raise ProgrammingError(
-                    driver_error="UTF-16 with Byte Order Mark not supported for SQL_WCHAR",
-                    ddbc_error=(
-                        "Cannot use 'utf-16' encoding with SQL_WCHAR due to Byte Order Mark ambiguity. "
-                        "Use 'utf-16le' or 'utf-16be' instead for explicit byte order."
-                    ),
-                )
-            elif encoding not in UTF16_ENCODINGS:
-                logger.warning(
-                    "Non-UTF-16 encoding %s attempted with SQL_WCHAR ctype",
-                    sanitize_user_input(encoding),
-                )
-                raise ProgrammingError(
-                    driver_error=f"SQL_WCHAR ctype only supports UTF-16 encodings",
-                    ddbc_error=(
-                        f"Cannot use encoding '{encoding}' with SQL_WCHAR ctype. "
-                        f"SQL_WCHAR requires UTF-16 encodings (utf-16le, utf-16be)"
-                    ),
-                )
+            _validate_utf16_wchar_compatibility(encoding, ctype, "SQL_WCHAR ctype")
 
         # Store the decoding settings for the specified sqltype (thread-safe with lock)
         with self._encoding_lock:
diff --git a/mssql_python/cursor.py b/mssql_python/cursor.py
@@ -297,21 +297,33 @@ def _get_encoding_settings(self):
 
         Returns:
             dict: A dictionary with 'encoding' and 'ctype' keys, or default settings if not available
+
+        Raises:
+            OperationalError, DatabaseError: On database connection issues that indicate a broken
+            connection state; not silently ignored, as that can corrupt data or cause inconsistency.
+            Exception: Any other error raised by getencoding() is also logged and re-raised.
         """
         if hasattr(self._connection, "getencoding"):
             try:
                 return self._connection.getencoding()
             except (OperationalError, DatabaseError) as db_error:
-                # Only catch database-related errors, not programming errors
-                from mssql_python.helpers import log
-
-                log(
-                    "warning",
-                    f"Failed to get encoding settings from connection due to database error: {db_error}",
+                # Log the error for debugging but re-raise for fail-fast behavior
+                # Silently returning defaults can lead to data corruption and hard-to-debug issues
+                logger.error(
+                    "Failed to get encoding settings from connection due to database error: %s. "
+                    "This indicates a broken connection state that should not be ignored.",
+                    db_error,
                 )
-                return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
+                # Re-raise to fail fast - users should know their connection is broken
+                raise
+            except Exception as unexpected_error:
+                # Handle other unexpected errors (connection closed, programming errors, etc.)
+                logger.error("Unexpected error getting encoding settings: %s", unexpected_error)
+                # Re-raise unexpected errors as well
+                raise
 
         # Return default encoding settings if getencoding is not available
+        # This is the only case where defaults are appropriate (method doesn't exist)
         return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
 
     def _get_decoding_settings(self, sql_type):
@@ -323,22 +335,35 @@ def _get_decoding_settings(self, sql_type):
 
         Returns:
             Dictionary containing the decoding settings.
+
+        Raises:
+            OperationalError, DatabaseError: On database connection issues that indicate a broken
+            connection state; not silently ignored, as that can corrupt data or cause inconsistency.
+            Exception: Any other error raised by getdecoding() is also logged and re-raised.
         """
         try:
             # Get decoding settings from connection for this SQL type
             return self._connection.getdecoding(sql_type)
         except (OperationalError, DatabaseError) as db_error:
-            # Only handle expected database-related errors
-            from mssql_python.helpers import log
-
-            log(
-                "warning",
-                f"Failed to get decoding settings for SQL type {sql_type} due to database error: {db_error}",
+            # Log the error for debugging but re-raise for fail-fast behavior
+            # Silently returning defaults can lead to data corruption and hard-to-debug issues
+            logger.error(
+                "Failed to get decoding settings for SQL type %s due to database error: %s. "
+                "This indicates a broken connection state that should not be ignored.",
+                sql_type,
+                db_error,
             )
-            if sql_type == ddbc_sql_const.SQL_WCHAR.value:
-                return {"encoding": "utf-16le", "ctype": ddbc_sql_const.SQL_WCHAR.value}
-            else:
-                return {"encoding": "utf-8", "ctype": ddbc_sql_const.SQL_CHAR.value}
+            # Re-raise to fail fast - users should know their connection is broken
+            raise
+        except Exception as unexpected_error:
+            # Handle other unexpected errors (connection closed, programming errors, etc.)
+            logger.error(
+                "Unexpected error getting decoding settings for SQL type %s: %s",
+                sql_type,
+                unexpected_error,
+            )
+            # Re-raise unexpected errors as well
+            raise
 
     def _map_sql_type(  # pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals,too-many-return-statements,too-many-branches
         self,
diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp
@@ -1811,7 +1811,8 @@ SQLRETURN SQLExecute_wrap(const SqlHandlePtr statementHandle,
 
 SQLRETURN BindParameterArray(SQLHANDLE hStmt, const py::list& columnwise_params,
                              const std::vector<ParamInfo>& paramInfos, size_t paramSetSize,
-                             std::vector<std::shared_ptr<void>>& paramBuffers) {
+                             std::vector<std::shared_ptr<void>>& paramBuffers,
+                             const std::string& charEncoding = "utf-8") {
     LOG("BindParameterArray: Starting column-wise array binding - "
         "param_count=%zu, param_set_size=%zu",
         columnwise_params.size(), paramSetSize);
@@ -2013,8 +2014,8 @@ SQLRETURN BindParameterArray(SQLHANDLE hStmt, const py::list& columnwise_params,
                 case SQL_C_CHAR:
                 case SQL_C_BINARY: {
                     LOG("BindParameterArray: Binding SQL_C_CHAR/BINARY array - "
-                        "param_index=%d, count=%zu, column_size=%zu",
-                        paramIndex, paramSetSize, info.columnSize);
+                        "param_index=%d, count=%zu, column_size=%zu, encoding='%s'",
+                        paramIndex, paramSetSize, info.columnSize, charEncoding.c_str());
                     char* charArray = AllocateParamBufferArray<char>(
                         tempBuffers, paramSetSize * (info.columnSize + 1));
                     strLenOrIndArray = AllocateParamBufferArray<SQLLEN>(tempBuffers, paramSetSize);
@@ -2024,18 +2025,45 @@ SQLRETURN BindParameterArray(SQLHANDLE hStmt, const py::list& columnwise_params,
                             std::memset(charArray + i * (info.columnSize + 1), 0,
                                         info.columnSize + 1);
                         } else {
-                            std::string str = columnValues[i].cast<std::string>();
-                            if (str.size() > info.columnSize) {
+                            std::string encodedStr;
+
+                            if (py::isinstance<py::str>(columnValues[i])) {
+                                // Use Python's codec system to encode the string with specified
+                                // encoding (like pyodbc does)
+                                try {
+                                    py::object encoded =
+                                        columnValues[i].attr("encode")(charEncoding, "strict");
+                                    encodedStr = encoded.cast<std::string>();
+                                    LOG("BindParameterArray: param[%d] row[%zu] SQL_C_CHAR - "
+                                        "Encoded with '%s', "
+                                        "size=%zu bytes",
+                                        paramIndex, i, charEncoding.c_str(), encodedStr.size());
+                                } catch (const py::error_already_set& e) {
+                                    LOG_ERROR("BindParameterArray: param[%d] row[%zu] SQL_C_CHAR - "
+                                              "Failed to encode "
+                                              "with '%s': %s",
+                                              paramIndex, i, charEncoding.c_str(), e.what());
+                                    throw std::runtime_error(
+                                        std::string("Failed to encode parameter ") +
+                                        std::to_string(paramIndex) + " row " + std::to_string(i) +
+                                        " with encoding '" + charEncoding + "': " + e.what());
+                                }
+                            } else {
+                                // bytes/bytearray - use as-is (already encoded)
+                                encodedStr = columnValues[i].cast<std::string>();
+                            }
+
+                            if (encodedStr.size() > info.columnSize) {
                                 LOG("BindParameterArray: String/binary too "
                                     "long - param_index=%d, row=%zu, size=%zu, "
                                     "max=%zu",
-                                    paramIndex, i, str.size(), info.columnSize);
+                                    paramIndex, i, encodedStr.size(), info.columnSize);
                                 ThrowStdException("Input exceeds column size at index " +
                                                   std::to_string(i));
                             }
-                            std::memcpy(charArray + i * (info.columnSize + 1), str.c_str(),
-                                        str.size());
-                            strLenOrIndArray[i] = static_cast<SQLLEN>(str.size());
+                            std::memcpy(charArray + i * (info.columnSize + 1), encodedStr.c_str(),
+                                        encodedStr.size());
+                            strLenOrIndArray[i] = static_cast<SQLLEN>(encodedStr.size());
                         }
                     }
                     LOG("BindParameterArray: SQL_C_CHAR/BINARY bound - "
@@ -2471,10 +2499,11 @@ SQLRETURN SQLExecuteMany_wrap(const SqlHandlePtr statementHandle, const std::wst
 
     if (!hasDAE) {
         LOG("SQLExecuteMany: Using array binding (non-DAE) - calling "
-            "BindParameterArray");
+            "BindParameterArray with encoding '%s'",
+            charEncoding.c_str());
         std::vector<std::shared_ptr<void>> paramBuffers;
-        // TODO: Pass charEncoding to BindParameterArray when it's updated to support encoding
-        rc = BindParameterArray(hStmt, columnwise_params, paramInfos, paramSetSize, paramBuffers);
+        rc = BindParameterArray(hStmt, columnwise_params, paramInfos, paramSetSize, paramBuffers,
+                                charEncoding);
         if (!SQL_SUCCEEDED(rc)) {
             LOG("SQLExecuteMany: BindParameterArray failed - rc=%d", rc);
             return rc;
@@ -2500,7 +2529,7 @@ SQLRETURN SQLExecuteMany_wrap(const SqlHandlePtr statementHandle, const std::wst
 
             std::vector<std::shared_ptr<void>> paramBuffers;
             rc = BindParameters(hStmt, rowParams, const_cast<std::vector<ParamInfo>&>(paramInfos),
-                                paramBuffers);
+                                paramBuffers, charEncoding);
             if (!SQL_SUCCEEDED(rc)) {
                 LOG("SQLExecuteMany: BindParameters failed for row %zu - rc=%d", rowIndex, rc);
                 return rc;
diff --git a/tests/test_013_encoding_decoding.py b/tests/test_013_encoding_decoding.py