@@ -383,6 +383,13 @@ def __init__(self, value, metadata):
383383 self .value = value
384384 self .metadata = metadata
385385
def __eq__(self, other):
    # type: (Any) -> bool
    """
    Compare two AnnotatedValue instances by their ``value`` and ``metadata``.

    For an operand of any other type, ``NotImplemented`` is returned instead
    of ``False`` so that Python can try the other operand's reflected
    ``__eq__`` before falling back to its default comparison.
    """
    if not isinstance(other, AnnotatedValue):
        return NotImplemented

    return self.value == other.value and self.metadata == other.metadata
392+
386393 @classmethod
387394 def removed_because_raw_data (cls ):
388395 # type: () -> AnnotatedValue
@@ -1119,6 +1126,39 @@ def _is_in_project_root(abs_path, project_root):
11191126 return False
11201127
11211128
def _truncate_by_bytes(string, max_bytes):
    # type: (str, int) -> str
    """
    Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.

    The returned value always ends with a three-byte "..." marker, and the
    marker counts toward max_bytes. If max_bytes is smaller than the marker
    itself, only the marker is returned (which then exceeds max_bytes).

    :param string: The text to truncate. Bytes are accepted only for
        Python 2 compat and are returned as bytes.
    :param max_bytes: The maximum size of the result in UTF-8 bytes.
    :return: The truncated value with "..." appended.
    """
    # Reserve room for the ellipsis. Clamp at zero: a negative slice bound
    # would trim from the end instead and keep almost the whole input.
    keep = max(max_bytes - 3, 0)

    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(string, bytes):
        # Append a bytes marker: on Python 3, bytes + str raises TypeError.
        return string[:keep] + b"..."

    # Cutting the encoded form may split a multi-byte code point;
    # errors="ignore" drops that dangling partial sequence when decoding.
    truncated = string.encode("utf-8")[:keep].decode("utf-8", errors="ignore")
    return truncated + "..."
1144+
1145+
def _get_size_in_bytes(value):
    # type: (str) -> Optional[int]
    """
    Return the size of value in bytes, or None if it cannot be determined.

    Bytes are measured directly and text is measured by its UTF-8 encoding.
    None is returned for any other type, and for text that fails to encode.
    """
    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(value, bytes):
        return len(value)

    if not isinstance(value, text_type):
        return None

    try:
        encoded = value.encode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return None

    return len(encoded)
1160+
1161+
11221162def strip_string (value , max_length = None ):
11231163 # type: (str, Optional[int]) -> Union[AnnotatedValue, str]
11241164 if not value :
@@ -1127,17 +1167,27 @@ def strip_string(value, max_length=None):
11271167 if max_length is None :
11281168 max_length = DEFAULT_MAX_VALUE_LENGTH
11291169
1130- length = len (value .encode ("utf-8" ))
1170+ byte_size = _get_size_in_bytes (value )
1171+ text_size = None
1172+ if isinstance (value , text_type ):
1173+ text_size = len (value )
1174+
1175+ if byte_size is not None and byte_size > max_length :
1176+ # truncate to max_length bytes, preserving code points
1177+ truncated_value = _truncate_by_bytes (value , max_length )
1178+ elif text_size is not None and text_size > max_length :
1179+ # fallback to truncating by string length
1180+ truncated_value = value [: max_length - 3 ] + "..."
1181+ else :
1182+ return value
11311183
1132- if length > max_length :
1133- return AnnotatedValue (
1134- value = value [: max_length - 3 ] + "..." ,
1135- metadata = {
1136- "len" : length ,
1137- "rem" : [["!limit" , "x" , max_length - 3 , max_length ]],
1138- },
1139- )
1140- return value
1184+ return AnnotatedValue (
1185+ value = truncated_value ,
1186+ metadata = {
1187+ "len" : byte_size or text_size ,
1188+ "rem" : [["!limit" , "x" , max_length - 3 , max_length ]],
1189+ },
1190+ )
11411191
11421192
11431193def parse_version (version ):
0 commit comments