diff --git a/.changeset/brave-otters-mask.md b/.changeset/brave-otters-mask.md new file mode 100644 index 00000000..7f7b6646 --- /dev/null +++ b/.changeset/brave-otters-mask.md @@ -0,0 +1,5 @@ +--- +'pypi/posthog': patch +--- + +Mask sensitive data held inside objects and in URL/DSN credentials when capturing exception code variables. Custom objects are now traversed so fields like `password` are redacted by attribute name instead of leaking via `repr()`, and credentials embedded in connection strings are scrubbed. Adds the `code_variables_mask_url_credentials` option (default `True`). diff --git a/posthog/__init__.py b/posthog/__init__.py index 4f719c77..dc191fec 100644 --- a/posthog/__init__.py +++ b/posthog/__init__.py @@ -24,6 +24,9 @@ from posthog.contexts import ( set_code_variables_mask_patterns_context as inner_set_code_variables_mask_patterns_context, ) +from posthog.contexts import ( + set_code_variables_mask_url_credentials_context as inner_set_code_variables_mask_url_credentials_context, +) from posthog.contexts import ( set_context_device_id as inner_set_context_device_id, ) @@ -39,6 +42,7 @@ from posthog.exception_utils import ( DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS, ) from posthog.feature_flag_evaluations import ( FeatureFlagEvaluations as FeatureFlagEvaluations, @@ -226,6 +230,14 @@ def set_code_variables_ignore_patterns_context(ignore_patterns: list): return inner_set_code_variables_ignore_patterns_context(ignore_patterns) +def set_code_variables_mask_url_credentials_context(enabled: bool): + """ + Whether to scrub credentials embedded in URLs/DSNs (e.g. user:pass@host) from + captured code variables for the current context. + """ + return inner_set_code_variables_mask_url_credentials_context(enabled) + + def tag(name: str, value: Any): """ Add a tag to the current context. @@ -346,6 +358,7 @@ def get_tags() -> Dict[str, Any]: capture_exception_code_variables = False code_variables_mask_patterns = DEFAULT_CODE_VARIABLES_MASK_PATTERNS code_variables_ignore_patterns = DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS +code_variables_mask_url_credentials = DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS in_app_modules = None # type: Optional[list[str]] enable_exception_autocapture_rate_limiting = False # type: bool exception_autocapture_bucket_size = ExceptionCapture.DEFAULT_BUCKET_SIZE # type: int @@ -1124,6 +1137,7 @@ def setup() -> Client: capture_exception_code_variables=capture_exception_code_variables, code_variables_mask_patterns=code_variables_mask_patterns, code_variables_ignore_patterns=code_variables_ignore_patterns, + code_variables_mask_url_credentials=code_variables_mask_url_credentials, in_app_modules=in_app_modules, enable_exception_autocapture_rate_limiting=enable_exception_autocapture_rate_limiting, exception_autocapture_bucket_size=exception_autocapture_bucket_size, diff --git a/posthog/client.py b/posthog/client.py index 379200d0..845b14e4 100644 --- a/posthog/client.py +++ b/posthog/client.py @@ -21,6 +21,7 @@ get_capture_exception_code_variables_context, get_code_variables_ignore_patterns_context, get_code_variables_mask_patterns_context, + get_code_variables_mask_url_credentials_context, get_context_device_id, get_context_distinct_id, get_context_session_id, @@ -37,6 +38,7 @@ from posthog.exception_utils import ( DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS, exc_info_from_error, exception_is_already_captured, exceptions_from_error_tuple, @@ -239,6 +241,7 @@ def __init__( capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, + code_variables_mask_url_credentials=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, @@ -304,6 +307,9 @@ def __init__( capturing code variables. code_variables_ignore_patterns: Variable-name patterns to omit when capturing code variables. + code_variables_mask_url_credentials: Scrub credentials embedded in + URLs/DSNs (e.g. ``user:pass@host``) from captured code variables, + regardless of the surrounding variable name. Defaults to True. in_app_modules: Module/package prefixes treated as in-app frames in captured exceptions. enable_exception_autocapture_rate_limiting: Rate limit @@ -396,6 +402,11 @@ def __init__( if code_variables_ignore_patterns is not None else DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS ) + self.code_variables_mask_url_credentials = ( + code_variables_mask_url_credentials + if code_variables_mask_url_credentials is not None + else DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS + ) self.in_app_modules = in_app_modules if project_root is None: @@ -1327,6 +1338,9 @@ def capture_exception( context_enabled = get_capture_exception_code_variables_context() context_mask = get_code_variables_mask_patterns_context() context_ignore = get_code_variables_ignore_patterns_context() + context_mask_url_credentials = ( + get_code_variables_mask_url_credentials_context() + ) enabled = ( context_enabled @@ -1343,6 +1357,11 @@ def capture_exception( if context_ignore is not None else self.code_variables_ignore_patterns ) + mask_url_credentials = ( + context_mask_url_credentials + if context_mask_url_credentials is not None + else self.code_variables_mask_url_credentials + ) if enabled: try_attach_code_variables_to_frames( @@ -1350,6 +1369,7 @@ def capture_exception( exc_info, mask_patterns=mask_patterns, ignore_patterns=ignore_patterns, + mask_url_credentials=mask_url_credentials, ) if self.log_captured_exceptions: diff --git a/posthog/contexts.py b/posthog/contexts.py index 5a31f80a..79323ede 100644 --- a/posthog/contexts.py +++ b/posthog/contexts.py @@ -26,6 +26,7 @@ def __init__( self.capture_exception_code_variables: Optional[bool] = None self.code_variables_mask_patterns: Optional[list] = None self.code_variables_ignore_patterns: Optional[list] = None + self.code_variables_mask_url_credentials: Optional[bool] = None def set_session_id(self, session_id: str): self.session_id = session_id @@ -48,6 +49,9 @@ def set_code_variables_mask_patterns(self, mask_patterns: list): def set_code_variables_ignore_patterns(self, ignore_patterns: list): self.code_variables_ignore_patterns = ignore_patterns + def set_code_variables_mask_url_credentials(self, enabled: bool): + self.code_variables_mask_url_credentials = enabled + def get_parent(self): return self.parent @@ -102,6 +106,13 @@ def get_code_variables_ignore_patterns(self) -> Optional[list]: return self.parent.get_code_variables_ignore_patterns() return None + def get_code_variables_mask_url_credentials(self) -> Optional[bool]: + if self.code_variables_mask_url_credentials is not None: + return self.code_variables_mask_url_credentials + if self.parent is not None and not self.fresh: + return self.parent.get_code_variables_mask_url_credentials() + return None + _context_stack: contextvars.ContextVar[Optional[ContextScope]] = contextvars.ContextVar( "posthog_context_stack", default=None @@ -369,6 +380,16 @@ def set_code_variables_ignore_patterns_context(ignore_patterns: list) -> None: current_context.set_code_variables_ignore_patterns(ignore_patterns) +def set_code_variables_mask_url_credentials_context(enabled: bool) -> None: + """ + Whether to scrub credentials embedded in URLs/DSNs (e.g. user:pass@host) from + captured code variables for the current context. + """ + current_context = _get_current_context() + if current_context: + current_context.set_code_variables_mask_url_credentials(enabled) + + def get_capture_exception_code_variables_context() -> Optional[bool]: current_context = _get_current_context() if current_context: @@ -390,6 +411,13 @@ def get_code_variables_ignore_patterns_context() -> Optional[list]: return None +def get_code_variables_mask_url_credentials_context() -> Optional[bool]: + current_context = _get_current_context() + if current_context: + return current_context.get_code_variables_mask_url_credentials() + return None + + F = TypeVar("F", bound=Callable[..., Any]) diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index 0f24818c..5828e68b 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -5,8 +5,11 @@ # 💖open source (under MIT License) # We want to keep payloads as similar to Sentry as possible for easy interoperability +import dataclasses +import functools import json import linecache +import math import os import re import sys @@ -58,10 +61,17 @@ r"(?i)_pass", r"(?i)sk_", r"(?i)jwt", + r"(?i)connection_string", + r"(?i)connectionstring", + r"(?i)conn_str", + r"(?i)connstr", + r"(?i)dsn", ] DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS = [r"^__.*"] +DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS = True + CODE_VARIABLES_REDACTED_VALUE = "$$_posthog_redacted_based_on_masking_rules_$$" CODE_VARIABLES_TOO_LONG_VALUE = "$$_posthog_value_too_long_$$" @@ -69,6 +79,28 @@ _MAX_COLLECTION_ITEMS_TO_SCAN = 100 _REGEX_METACHARACTERS = frozenset(r"\.^$*+?{}[]|()") +# Max recursion depth into nested structures while masking (cycles are guarded separately). +_MAX_MASK_DEPTH = 25 + +# Cap on total non-scalar nodes traversed per top-level value; the depth/collection caps +# don't bound aggregate work, so this stops a wide-and-deep graph from fanning out. +_MAX_TOTAL_NODES_TO_MASK = 200 + +# Matches `user:pass` credentials in URLs/DSNs (e.g. `postgresql://user:pass@host`); the +# bounded scheme length avoids catastrophic backtracking. +_URL_CREDENTIALS_RE = re.compile( + r"([a-z][a-z0-9+.\-]{0,30}://)(?=[^/@\s]*:)[^/\s]*@", re.IGNORECASE +) + + +def _redact_url_credentials(value): + if "://" not in value: + return value + return _URL_CREDENTIALS_RE.sub( + r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value + ) + + DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 @@ -945,7 +977,7 @@ def _extract_plain_substring(pattern): return remainder.lower() -def _compile_patterns(patterns): +def _compile_patterns_impl(patterns): if not patterns: return None substrings = [] @@ -964,6 +996,20 @@ def _compile_patterns(patterns): return (substrings, regexes) +@functools.lru_cache(maxsize=256) +def _compile_patterns_cached(patterns_tuple): + return _compile_patterns_impl(patterns_tuple) + + +def _compile_patterns(patterns): + # Cache by content so the default pattern set compiles once; fall back to an uncached + # compile for exotic, unhashable custom input. + try: + return _compile_patterns_cached(tuple(patterns)) + except TypeError: + return _compile_patterns_impl(list(patterns)) + + def _pattern_matches(name, patterns): if patterns is None: return False @@ -979,179 +1025,420 @@ def _pattern_matches(name, patterns): return False -def _mask_sensitive_data(value, compiled_mask, _seen=None): - if not compiled_mask: +def _build_matcher(compiled): + """Collapse a compiled ``(substrings, regexes)`` pair into a fast single-call matcher + by compiling the literal substrings into one alternation regex. Returns ``None`` when + there is nothing to match.""" + if compiled is None: + return None + substrings, regexes = compiled + return (_compile_substring_alternation(tuple(substrings)), regexes) + + +@functools.lru_cache(maxsize=256) +def _compile_substring_alternation(substrings): + # Substrings are already lowercased and matched against the lowercased name, so a + # case-sensitive alternation suffices and avoids IGNORECASE's per-character case-folding. + if not substrings: + return None + return re.compile("|".join(re.escape(s) for s in substrings)) + + +def _matcher_matches(name, matcher): + """Hot-path equivalent of ``_pattern_matches`` for a ``_build_matcher`` matcher.""" + if matcher is None: + return False + substr_re, regexes = matcher + if substr_re is not None and substr_re.search(name.lower()): + return True + for pattern in regexes: + if pattern.search(name): + return True + return False + + +@dataclasses.dataclass(frozen=True) +class _MaskingConfig: + """Everything the masking pipeline needs, compiled once per capture and threaded + through instead of recompiling patterns per frame.""" + + mask: Optional[ + Tuple[Optional[Pattern], List[Pattern]] + ] # name/value redaction matcher + ignore: Optional[ + Tuple[Optional[Pattern], List[Pattern]] + ] # variable-name skip matcher + mask_url_credentials: bool + max_length: int = DEFAULT_MAX_VALUE_LENGTH + + @classmethod + def build( + cls, + mask_patterns=None, + ignore_patterns=None, + mask_url_credentials=True, + max_length=DEFAULT_MAX_VALUE_LENGTH, + ): + # type: (...) -> _MaskingConfig + return cls( + mask=_build_matcher(_compile_patterns(mask_patterns or [])), + ignore=_build_matcher(_compile_patterns(ignore_patterns or [])), + mask_url_credentials=mask_url_credentials, + max_length=max_length, + ) + + +def _mask_string(value, config): + """Apply the string masking policy: over-length cap, name/value patterns, then + embedded URL credentials.""" + if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + return CODE_VARIABLES_TOO_LONG_VALUE + if _matcher_matches(value, config.mask): + return CODE_VARIABLES_REDACTED_VALUE + if config.mask_url_credentials: + return _redact_url_credentials(value) + return value + + +def _safe_type_name(value): + try: + return type(value).__qualname__ + except Exception: + return "unknown" + + +def _safe_repr(value, config): + """Last-resort serialization for values we can't structurally decompose. Renders + ``repr(value)`` but fails closed: redact entirely on any mask match, over-length + repr, or a raising ``__repr__``.""" + try: + rendered = repr(value) + except Exception: + return "<" + _safe_type_name(value) + ">" + + # Too long to scan, so we can't vouch for it -> redact it all. + if len(rendered) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + return CODE_VARIABLES_REDACTED_VALUE + if _matcher_matches(rendered, config.mask): + return CODE_VARIABLES_REDACTED_VALUE + if config.mask_url_credentials: + return _redact_url_credentials(rendered) + return rendered + + +def _extract_object_attrs(value): + """Return a ``name -> value`` mapping of an object's attributes, or ``None`` for + values that should be treated as opaque leaves (built-ins, slotted objects, empty + ``__dict__``).""" + if isinstance(value, type): + # A class/type object itself, not an instance - nothing useful to traverse. + return None + try: + if dataclasses.is_dataclass(value): + return {f.name: getattr(value, f.name) for f in dataclasses.fields(value)} + instance_dict = getattr(value, "__dict__", None) + except Exception: + return None + if isinstance(instance_dict, dict) and instance_dict: + # Copy so we never mutate the live object; keys here are attribute names. + return dict(instance_dict) + return None + + +# Method/function descriptor types excluded when scanning a class for sensitively-named +# members, so an object isn't redacted merely for having e.g. an `authenticate()` method. +_METHOD_MEMBER_TYPES = ( + types.FunctionType, + types.BuiltinFunctionType, + types.MethodType, + types.MethodDescriptorType, + types.WrapperDescriptorType, + types.MethodWrapperType, + types.ClassMethodDescriptorType, + classmethod, + staticmethod, +) + + +def _is_data_member(attr): + """True for a class member that holds or produces a value (class attribute, + ``@property``, descriptor) as opposed to a method or nested class.""" + return not isinstance(attr, type) and not isinstance(attr, _METHOD_MEMBER_TYPES) + + +def _masked_type_members(value, config): + """Redact sensitively-named members declared on the object's type (class attributes, + ``@property``, ``__slots__`` entries) which live on the class, not instance + ``__dict__``. Only member *names* are read, never the getters.""" + if config.mask is None: + return {} + try: + mro = type(value).__mro__ + except Exception: + return {} + masked = {} + for klass in mro: + for name, attr in klass.__dict__.items(): + if ( + isinstance(name, str) + and name not in masked + and _is_data_member(attr) + and _matcher_matches(name, config.mask) + ): + masked[name] = CODE_VARIABLES_REDACTED_VALUE + return masked + + +def _mask_mapping(items, config, seen, depth): + """Mask a sequence of ``(key, value)`` pairs into a dict. A key matching the mask + redacts its value; surviving values recurse through ``_mask_value``. Keys are kept + JSON-serializable.""" + result = {} + for key, value in items: + if type(key) is str: + out_key = key_str = key + else: + key_str = key if isinstance(key, str) else str(key) + # json.dumps only accepts str/int/float/bool/None keys; coerce anything else to + # its string form so one exotic key can't make json.dumps fail. + key_is_json_safe = ( + key is None + or isinstance(key, (str, int)) # bool is an int subclass + or (isinstance(key, float) and math.isfinite(key)) + ) + out_key = key if key_is_json_safe else key_str + if len(key_str) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + result[out_key] = CODE_VARIABLES_TOO_LONG_VALUE + elif _matcher_matches(key_str, config.mask): + result[out_key] = CODE_VARIABLES_REDACTED_VALUE + else: + result[out_key] = _mask_value(value, config, seen, depth + 1) + return result + + +def _mask_value(value, config, seen=None, depth=0): + """Turn any Python value into a JSON-safe, masked value. Single source of truth for + what gets redacted; the result contains only JSON-native types so it can be handed + straight to ``json.dumps``.""" + # Name masking and URL scrubbing are independent toggles; skip only when both are off. + if config.mask is None and not config.mask_url_credentials: return value - if isinstance(value, (dict, list, tuple)): - if _seen is None: - _seen = set() - obj_id = id(value) - if obj_id in _seen: - return "" - _seen.add(obj_id) + # Exact-type fast paths for plain scalars, avoiding the isinstance ladder below. + # Subclasses fall through to the isinstance checks, so output is unchanged. + t = type(value) + if t is str: + return _mask_string(value, config) + if t is int or t is bool: + return value + if t is float: + # Non-finite floats (NaN/Infinity) are invalid JSON, so render them as strings. + return value if math.isfinite(value) else str(value) + if value is None: + return value + + if t is not dict and t is not list and t is not tuple: + # A scalar subclass (e.g. IntEnum, str subclass) - handle before traversing. + if isinstance(value, str): + return _mask_string(value, config) + if isinstance(value, float) and not math.isfinite(value): + return str(value) + if isinstance(value, (bool, int, float)): + return value + + if depth >= _MAX_MASK_DEPTH: + # Too deep to keep traversing; fail closed rather than repr (which could leak). + return CODE_VARIABLES_TOO_LONG_VALUE + + if seen is None: + seen = set() + obj_id = id(value) + if obj_id in seen: + return "" + seen.add(obj_id) - if isinstance(value, dict): + if len(seen) > _MAX_TOTAL_NODES_TO_MASK: + return CODE_VARIABLES_TOO_LONG_VALUE + + if t is dict or isinstance(value, dict): if len(value) > _MAX_COLLECTION_ITEMS_TO_SCAN: return CODE_VARIABLES_TOO_LONG_VALUE - result = {} - for k, v in value.items(): - key_str = str(k) if not isinstance(k, str) else k - if len(key_str) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - result[k] = CODE_VARIABLES_TOO_LONG_VALUE - elif _pattern_matches(key_str, compiled_mask): - result[k] = CODE_VARIABLES_REDACTED_VALUE - else: - result[k] = _mask_sensitive_data(v, compiled_mask, _seen) - return result - elif isinstance(value, (list, tuple)): + return _mask_mapping(value.items(), config, seen, depth) + + # namedtuples are tuples but their fields have names: traverse like an object (so a + # field named `password` is caught) and emit a dict the encoder can serialize directly. + if isinstance(value, tuple) and hasattr(value, "_fields"): + fields = value._fields + if len(fields) > _MAX_COLLECTION_ITEMS_TO_SCAN: + return CODE_VARIABLES_TOO_LONG_VALUE + masked = _mask_mapping(zip(fields, value), config, seen, depth) + masked["__class__"] = _safe_type_name(value) + return masked + + if isinstance(value, (list, tuple)): if len(value) > _MAX_COLLECTION_ITEMS_TO_SCAN: return CODE_VARIABLES_TOO_LONG_VALUE - masked_items = [ - _mask_sensitive_data(item, compiled_mask, _seen) for item in value - ] - return type(value)(masked_items) - elif isinstance(value, str): - if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + masked_items = [_mask_value(item, config, seen, depth + 1) for item in value] + try: + return type(value)(masked_items) + except Exception: + # A list/tuple subclass whose constructor rejects a single iterable; the items + # are already masked, so fall back to a plain list. + return masked_items + + # Custom objects: traverse their real attributes so a field named e.g. `password` is + # caught by name, rather than repr-scanning (which a custom __repr__ could relabel). + attrs = _extract_object_attrs(value) + if attrs is not None: + if len(attrs) > _MAX_COLLECTION_ITEMS_TO_SCAN: return CODE_VARIABLES_TOO_LONG_VALUE - if _pattern_matches(value, compiled_mask): - return CODE_VARIABLES_REDACTED_VALUE - return value - else: - return value + masked = _mask_mapping(attrs.items(), config, seen, depth) + masked["__class__"] = _safe_type_name(value) + return masked + + # A custom __repr__ can expose secrets held on the *class* (class attribute, @property, + # __slots__ entry) that attribute traversal never sees; redact those by name first. + masked_members = _masked_type_members(value, config) + if masked_members: + masked_members["__class__"] = _safe_type_name(value) + return masked_members + + # Opaque leaf (built-in/slotted/etc.): fall back to a fail-closed repr. + return _safe_repr(value, config) + + +def _finalize(result, limiter, max_length): + """Truncate to ``max_length`` and charge against the size budget; ``None`` when spent.""" + if len(result) > max_length: + result = result[: max_length - 3] + "..." + if not limiter.can_add(len(result)): + return None + limiter.add(len(result)) + return result -def _serialize_variable_value(value, limiter, max_length=1024, compiled_mask=None): +def _encode_variable(value, config, limiter): + """Format one already-masked variable for the wire: finite numbers stay raw JSON + numbers, everything else becomes a string. ``None`` when the size budget is spent.""" try: - if value is None: + safe = _mask_value(value, config) + + if safe is None: result = "None" - elif isinstance(value, bool): - result = str(value) - elif isinstance(value, (int, float)): - result_size = len(str(value)) + elif isinstance(safe, bool): + result = str(safe) + elif isinstance(safe, float) and not math.isfinite(safe): + # Only reachable when masking is disabled; keep NaN/Infinity out of the JSON. + result = str(safe) + elif isinstance(safe, (int, float)): + # Numbers are emitted as raw JSON numbers, so they skip string truncation. + result_size = len(str(safe)) if not limiter.can_add(result_size): return None limiter.add(result_size) - return value - elif isinstance(value, str): - if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - result = CODE_VARIABLES_TOO_LONG_VALUE - elif compiled_mask and _pattern_matches(value, compiled_mask): - result = CODE_VARIABLES_REDACTED_VALUE - else: - result = value + return safe + elif isinstance(safe, str): + result = safe else: - masked_value = _mask_sensitive_data(value, compiled_mask) - result = json.dumps(masked_value) - - if len(result) > max_length: - result = result[: max_length - 3] + "..." - - result_size = len(result) - if not limiter.can_add(result_size): - return None - limiter.add(result_size) + # `default` is a safety net for anything _mask_value left non-serializable. + result = json.dumps( + safe, + default=lambda o: _safe_repr(o, config), + allow_nan=False, + ) - return result + return _finalize(result, limiter, config.max_length) except Exception: + # Fail closed: even if json.dumps chokes, re-render through the masking-aware repr. try: - result = repr(value) - if len(result) > max_length: - result = result[: max_length - 3] + "..." - - result_size = len(result) - if not limiter.can_add(result_size): - return None - limiter.add(result_size) - return result + rendered = _safe_repr(value, config) except Exception: - try: - fallback = f"<{type(value).__name__}>" - fallback_size = len(fallback) - if not limiter.can_add(fallback_size): - return None - limiter.add(fallback_size) - return fallback - except Exception: - fallback = "" - fallback_size = len(fallback) - if not limiter.can_add(fallback_size): - return None - limiter.add(fallback_size) - return fallback + rendered = f"<{_safe_type_name(value)}>" + return _finalize(rendered, limiter, config.max_length) def _is_simple_type(value): return isinstance(value, (type(None), bool, int, float, str)) -def serialize_code_variables( - frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024 -): - if mask_patterns is None: - mask_patterns = [] - if ignore_patterns is None: - ignore_patterns = [] +def _add_variable(result, name, value, config, limiter): + """Add one masked variable to ``result``; returns False when the budget is spent. A + variable whose *name* matches the mask is redacted whole, value unread.""" + if _matcher_matches(name, config.mask): + if not limiter.can_add(len(CODE_VARIABLES_REDACTED_VALUE)): + return False + limiter.add(len(CODE_VARIABLES_REDACTED_VALUE)) + result[name] = CODE_VARIABLES_REDACTED_VALUE + return True + + encoded = _encode_variable(value, config, limiter) + if encoded is None: + return False + result[name] = encoded + return True - compiled_mask = _compile_patterns(mask_patterns) - compiled_ignore = _compile_patterns(ignore_patterns) +def _serialize_frame_variables(frame, limiter, config): + """Serialize one frame's locals into a ``name -> wire value`` dict. Scalars are + emitted before complex values so the cheapest context survives a tight size budget.""" try: local_vars = frame.f_locals.copy() except Exception: return {} - simple_vars = {} - complex_vars = {} - + simple: Dict[str, Any] = {} + complex_: Dict[str, Any] = {} for name, value in local_vars.items(): - if _pattern_matches(name, compiled_ignore): + if _matcher_matches(name, config.ignore): continue + (simple if _is_simple_type(value) else complex_)[name] = value - if _is_simple_type(value): - simple_vars[name] = value - else: - complex_vars[name] = value - - result = {} - - all_vars = {**simple_vars, **complex_vars} - ordered_names = list(sorted(simple_vars.keys())) + list(sorted(complex_vars.keys())) - - for name in ordered_names: - value = all_vars[name] + result: Dict[str, Any] = {} + for name in sorted(simple): + if not _add_variable(result, name, simple[name], config, limiter): + return result + for name in sorted(complex_): + if not _add_variable(result, name, complex_[name], config, limiter): + return result + return result - if _pattern_matches(name, compiled_mask): - redacted_value = CODE_VARIABLES_REDACTED_VALUE - redacted_size = len(redacted_value) - if not limiter.can_add(redacted_size): - break - limiter.add(redacted_size) - result[name] = redacted_value - else: - serialized = _serialize_variable_value( - value, limiter, max_length, compiled_mask - ) - if serialized is None: - break - result[name] = serialized - return result +def serialize_code_variables( + frame, + limiter, + mask_patterns=None, + ignore_patterns=None, + max_length=1024, + mask_url_credentials=True, +): + """Serialize a single frame's locals. Convenience wrapper that builds a one-off config; + the hot path builds the config once - see ``attach_code_variables_to_frames``.""" + config = _MaskingConfig.build( + mask_patterns=mask_patterns, + ignore_patterns=ignore_patterns, + mask_url_credentials=mask_url_credentials, + max_length=max_length, + ) + return _serialize_frame_variables(frame, limiter, config) def try_attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True ): try: attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, + exc_info, + mask_patterns, + ignore_patterns, + mask_url_credentials, ) except Exception: pass def attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True ): exc_type, exc_value, traceback = exc_info @@ -1163,6 +1450,12 @@ def attach_code_variables_to_frames( if not tb_frames: return + # Compile patterns once for the whole capture and share one budget across all frames. + config = _MaskingConfig.build( + mask_patterns=mask_patterns, + ignore_patterns=ignore_patterns, + mask_url_credentials=mask_url_credentials, + ) limiter = VariableSizeLimiter() for exception in all_exceptions: @@ -1170,19 +1463,10 @@ def attach_code_variables_to_frames( if not stacktrace or "frames" not in stacktrace: continue - serialized_frames = stacktrace["frames"] - - for serialized_frame, tb_item in zip(serialized_frames, tb_frames): + for serialized_frame, tb_item in zip(stacktrace["frames"], tb_frames): if not serialized_frame.get("in_app"): continue - variables = serialize_code_variables( - tb_item.tb_frame, - limiter, - mask_patterns=mask_patterns, - ignore_patterns=ignore_patterns, - max_length=1024, - ) - + variables = _serialize_frame_variables(tb_item.tb_frame, limiter, config) if variables: serialized_frame["code_variables"] = variables diff --git a/posthog/test/test_code_variables.py b/posthog/test/test_code_variables.py new file mode 100644 index 00000000..6e7d451b --- /dev/null +++ b/posthog/test/test_code_variables.py @@ -0,0 +1,910 @@ +import collections +import functools +import json +import os +import subprocess +import sys +import types +from dataclasses import dataclass +from textwrap import dedent + +import pytest + +from posthog.exception_utils import ( + DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + VariableSizeLimiter, + _MAX_COLLECTION_ITEMS_TO_SCAN, + _MAX_MASK_DEPTH, + _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH, + _MaskingConfig, + _compile_patterns, + _encode_variable, + _mask_value, + _pattern_matches, + _redact_url_credentials, + _safe_repr, + _serialize_frame_variables, + attach_code_variables_to_frames, + iter_stacks, +) +from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE as REDACTED, +) +from posthog.exception_utils import ( + CODE_VARIABLES_TOO_LONG_VALUE as TOO_LONG, +) + +# --- shared helpers ------------------------------------------------------------------ + + +def make_config( + *, patterns=DEFAULT_CODE_VARIABLES_MASK_PATTERNS, ignore=(), mask_urls=True +): + """Build a masking config, defaulting to the patterns the SDK ships with.""" + return _MaskingConfig.build( + list(patterns), list(ignore), mask_url_credentials=mask_urls + ) + + +def mask(value, **kwargs): + """Run one value through the recursive masker and return the masked result.""" + return _mask_value(value, make_config(**kwargs)) + + +def encode(value, *, limiter=None, **kwargs): + """Run one top-level variable through the full wire encoder (mask + format).""" + return _encode_variable( + value, make_config(**kwargs), limiter or VariableSizeLimiter() + ) + + +def extract( + *, + mask_patterns=DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + ignore_patterns=DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, + **local_vars, +): + """Serialize a frame's locals. Pass the locals you want as keyword arguments.""" + config = _MaskingConfig.build(list(mask_patterns), list(ignore_patterns), True) + frame = types.SimpleNamespace(f_locals=local_vars) + return _serialize_frame_variables(frame, VariableSizeLimiter(), config) + + +_APP_HEADER = """\ +import os +import posthog +from posthog import Posthog + + +def make_client(**options): + return Posthog( + "phc_x", + host="https://eu.i.posthog.com", + debug=True, + enable_exception_autocapture=True, + project_root=os.path.dirname(os.path.abspath(__file__)), + **options, + ) +""" + + +def run_app(tmpdir, body, *, env=None): + """Write and run a tiny PostHog app that raises an uncaught exception. + + The app already has ``make_client(**options)`` and ``posthog`` imported; the body + creates the client, defines the failing code, and triggers it. In debug mode the + autocaptured payload (including ``code_variables``) is printed, which is what the + end-to-end tests assert against. Returns the combined stdout/stderr. + """ + app = tmpdir.join("app.py") + app.write(_APP_HEADER + "\n" + dedent(body)) + run_env = {**os.environ, **(env or {})} + with pytest.raises(subprocess.CalledProcessError) as excinfo: + subprocess.check_output( + [sys.executable, str(app)], stderr=subprocess.STDOUT, env=run_env + ) + return excinfo.value.output.decode("utf-8") + + +# --- 1. pattern compilation ---------------------------------------------------------- + + +class TestPatternCompilation: + """Patterns compile into a (substrings, regexes) pair; simple ones take a fast path.""" + + def test_simple_case_insensitive_pattern_becomes_a_substring(self): + # "(?i)password" is just a case-insensitive contains-check -> plain substring + substrings, regexes = _compile_patterns([r"(?i)password"]) + assert substrings == ["password"] + assert regexes == [] + + def test_complex_pattern_stays_a_regex(self): + # "^sk_live_" needs anchoring -> kept as a compiled regex + substrings, regexes = _compile_patterns([r"^sk_live_"]) + assert substrings == [] + assert len(regexes) == 1 + + def test_patterns_can_mix_substrings_and_regexes(self): + substrings, regexes = _compile_patterns([r"(?i)secret", r"^__.*"]) + assert substrings == ["secret"] + assert len(regexes) == 1 + + def test_no_patterns_compiles_to_none(self): + assert _compile_patterns([]) is None + + def test_substring_match_is_case_insensitive(self): + patterns = _compile_patterns([r"(?i)password"]) + assert _pattern_matches("MY_PASSWORD", patterns) is True + assert _pattern_matches("safe_name", patterns) is False + + def test_regex_match_is_anchored_where_the_pattern_says(self): + patterns = _compile_patterns([r"^__"]) + assert _pattern_matches("__private", patterns) is True # starts with __ + assert _pattern_matches("trailing__", patterns) is False # __ not at start + + +# --- 2. URL credential scrubbing ----------------------------------------------------- + + +class TestUrlCredentialScrubbing: + """`scheme://user:pass@host` credentials are stripped from any string.""" + + @pytest.mark.parametrize( + "url, secret", + [ + ("postgresql://warehouse:topsecret@db:26257/x", "topsecret"), + ("redis://:p4ss@cache:6379", "p4ss"), # password-only userinfo + ("mongodb://admin:hush@mongo:27017", "hush"), + ("https://admin:hush@api.example.com/v1", "hush"), + ], + ) + def test_embedded_credentials_are_removed(self, url, secret): + # scheme://user:@host -> scheme://@host + result = _redact_url_credentials(url) + assert secret not in result + assert REDACTED in result + + def test_every_url_in_the_string_is_scrubbed(self): + # two DSNs in one string -> both credentials gone + result = _redact_url_credentials("a=postgres://u:p1@h1 b=redis://u:p2@h2") + assert "p1" not in result and "p2" not in result + + def test_ipv6_host_is_preserved(self): + # redis://user:@[::1]:6379 -> host kept, credential gone + result = _redact_url_credentials("redis://user:secret@[::1]:6379") + assert result == "redis://" + REDACTED + "@[::1]:6379" + + @pytest.mark.parametrize( + "value", + [ + "ssh://gituser@github.com/repo", # bare username, no password slot + "https://api.example.com:8080/v1", # port but no credentials + "just a plain string", # not a URL at all + ], + ) + def test_strings_without_credentials_are_left_untouched(self, value): + assert _redact_url_credentials(value) == value + + +# --- 3. scalar masking --------------------------------------------------------------- + + +class TestScalarMasking: + """Simple scalars pass through untouched; non-finite floats become strings.""" + + @pytest.mark.parametrize("value", [None, True, False, 42, -7, 3.14, "plain text"]) + def test_safe_scalars_pass_through_unchanged(self, value): + assert mask(value) == value + + @pytest.mark.parametrize( + "value, expected", + [(float("nan"), "nan"), (float("inf"), "inf"), (float("-inf"), "-inf")], + ) + def test_non_finite_floats_become_strings(self, value, expected): + # NaN/Infinity are invalid JSON, so they are rendered as strings instead + assert mask(value) == expected + + def test_non_finite_floats_are_converted_even_when_nested(self): + # [inf, 2.0] -> ["inf", 2.0] (so json.dumps never sees a NaN/Infinity token) + assert mask([float("inf"), 2.0]) == ["inf", 2.0] + + +# --- 4. string masking --------------------------------------------------------------- + + +class TestStringMasking: + """A string is redacted by content, capped by length, and scrubbed of URL creds.""" + + def test_plain_string_passes_through(self): + assert mask("hello world") == "hello world" + + def test_string_matching_a_pattern_is_redacted(self): + # the value itself contains "password" -> redact the whole string + assert mask("contains_password_here") == REDACTED + + def test_overly_long_string_is_replaced(self): + # too long to scan within budget -> placeholder, not the raw value + assert mask("x" * (_MAX_VALUE_LENGTH_FOR_PATTERN_MATCH + 1)) == TOO_LONG + + def test_url_credentials_are_scrubbed_even_with_no_mask_patterns(self): + # name masking off, URL scrubbing on (they are independent toggles) + result = mask("postgresql://user:p4ss@host/db", patterns=[]) + assert "p4ss" not in result + assert REDACTED in result + + def test_url_scrubbing_can_be_turned_off(self): + result = mask("postgresql://user:p4ss@host/db", patterns=[], mask_urls=False) + assert result == "postgresql://user:p4ss@host/db" + + +# --- 5. collection masking ----------------------------------------------------------- + + +class TestCollectionMasking: + """Dicts, lists and tuples are walked; size, depth and cycles are all bounded.""" + + def test_safe_dict_is_unchanged(self): + # {"name": "test", "value": 123} -> unchanged + assert mask({"name": "test", "value": 123}) == {"name": "test", "value": 123} + + def test_dict_key_matching_a_pattern_redacts_its_value(self): + # {"password": ...} -> value redacted on the strength of the key name alone + assert mask({"password": "anything"}) == {"password": REDACTED} + + def test_dict_value_matching_a_pattern_is_redacted(self): + # {"note": "...password..."} -> value redacted because the value matches + assert mask({"note": "contains_password_here"}) == {"note": REDACTED} + + def test_nested_dict_is_masked_at_every_depth(self): + # {"l1": {"l2": {"api_key": , "safe": "ok"}}} + out = mask({"l1": {"l2": {"api_key": "xyz", "safe": "ok"}}}) + assert out["l1"]["l2"]["api_key"] == REDACTED + assert out["l1"]["l2"]["safe"] == "ok" + + def test_list_items_are_masked_individually(self): + # ["safe", "...password...", "safe2"] -> only the middle item is redacted + assert mask(["safe", "contains_password_here", "safe2"]) == [ + "safe", + REDACTED, + "safe2", + ] + + def test_tuple_stays_a_tuple(self): + # ("a", "secret_token", "b") -> still a tuple, middle redacted by value + out = mask(("a", "secret_token", "b")) + assert isinstance(out, tuple) + assert out == ("a", REDACTED, "b") + + def test_list_of_dicts_is_masked(self): + # [{"id": 1, "password": }, {"id": 2, "value": "ok"}] + out = mask([{"id": 1, "password": "x"}, {"id": 2, "value": "ok"}]) + assert out == [{"id": 1, "password": REDACTED}, {"id": 2, "value": "ok"}] + + def test_overly_long_dict_key_replaces_only_that_entry(self): + # {"short": "ok", : ..., "password": ...} + long_key = "k" * 20000 + out = mask({"short": "ok", long_key: "v", "password": "x"}) + assert out["short"] == "ok" + assert out[long_key] == TOO_LONG + assert out["password"] == REDACTED + + @pytest.mark.parametrize( + "build", + [ + lambda n: {f"key{i}": i for i in range(n)}, + lambda n: list(range(n)), + lambda n: tuple(range(n)), + ], + ids=["dict", "list", "tuple"], + ) + def test_collection_with_too_many_items_is_replaced(self, build): + # over the item cap -> placeholder; comfortably under it -> masked normally + assert mask(build(_MAX_COLLECTION_ITEMS_TO_SCAN + 1)) == TOO_LONG + assert mask(build(2)) != TOO_LONG + + def test_circular_dict_reference_is_detected(self): + # d = {"key": "value", "self": d} + d = {"key": "value"} + d["self"] = d + out = mask(d) + assert out["key"] == "value" + assert out["self"] == "" + + def test_circular_list_reference_is_detected(self): + # items = ["item", items] + items = ["item"] + items.append(items) + out = mask(items) + assert out[0] == "item" + assert out[1] == "" + + def test_non_string_dict_key_is_coerced_to_stay_serializable(self): + # {(1, 2): "ok"} -> key stringified so the masked dict is always JSON-safe + assert mask({(1, 2): "ok"}) == {"(1, 2)": "ok"} + + def test_non_string_dict_key_still_redacts_on_its_name(self): + # a key whose text matches a pattern redacts its value, just like a string key + assert mask({("db", "password"): "x"}) == {"('db', 'password')": REDACTED} + + def test_non_string_dict_key_does_not_defeat_value_masking(self): + # a tuple key used to break json.dumps and fall back to a repr of the *original* + # dict, leaking everything - the value must still be masked structurally + class Conn: + def __init__(self): + self.password = "hunter2" + + def __repr__( + self, + ): # repr hides the field name, so only name-masking saves it + return f"Conn({self.password})" + + out = mask({(1, 2): Conn()}) + assert out["(1, 2)"]["password"] == REDACTED + assert "hunter2" not in str(out) + + def test_sequence_subclass_that_cannot_be_rebuilt_falls_back_to_a_list(self): + # a subclass whose __new__ won't take a single iterable must not raise out of + # masking (which would repr the original); its items are already masked + class Pair(tuple): + def __new__(cls, a, b): + return super().__new__(cls, (a, b)) + + assert mask(Pair("ok", "contains_password_here")) == ["ok", REDACTED] + + def test_total_node_budget_caps_runaway_structures(self): + # depth and per-collection caps are per-level; a moderately wide+deep tree still + # blows past the *total* node budget -> truncated, so masking stays cheap + def tree(width, depth): + if depth == 0: + return {} + return {f"k{i}": tree(width, depth - 1) for i in range(width)} + + assert TOO_LONG in json.dumps(mask(tree(25, 3))) # ~16k nodes, over the budget + assert TOO_LONG not in json.dumps(mask(tree(5, 3))) # ~150 nodes, well under + + +# --- 6. object traversal ------------------------------------------------------------- + + +class TestObjectTraversal: + """Custom objects are decomposed into their real fields, so a `password` attribute + is caught by name instead of leaking through repr().""" + + def test_dataclass_is_decomposed_into_its_fields(self): + # Config(host="db", user="wh", password=) -> dict; password redacted + @dataclass + class Config: + host: str + user: str + password: str + + out = mask(Config("db.example.com", "warehouse", "uHjH9secret")) + assert out["host"] == "db.example.com" + assert out["user"] == "warehouse" + assert out["password"] == REDACTED + assert "Config" in out["__class__"] + assert "uHjH9secret" not in str(out) + + def test_plain_object_is_decomposed_via_its_dict(self): + # obj.username="alice", obj.api_key= + class Credentials: + def __init__(self): + self.username = "alice" + self.api_key = "sk_live_abc123" + + out = mask(Credentials()) + assert out["username"] == "alice" + assert out["api_key"] == REDACTED + assert "sk_live_abc123" not in str(out) + + def test_object_nested_in_a_tuple_is_still_masked(self): + # (Config(password=), Inputs(schema_name="traffic")) + @dataclass + class Config: + host: str + password: str + + @dataclass + class Inputs: + schema_name: str + + out = mask((Config("db", "topsecret"), Inputs("traffic"))) + assert out[0]["host"] == "db" + assert out[0]["password"] == REDACTED + assert out[1]["schema_name"] == "traffic" + assert "topsecret" not in str(out) + + def test_object_with_too_many_attributes_is_replaced(self): + class Wide: + def __init__(self, n): + for i in range(n): + setattr(self, f"attr{i}", i) + + assert mask(Wide(_MAX_COLLECTION_ITEMS_TO_SCAN + 1)) == TOO_LONG + assert isinstance(mask(Wide(2)), dict) + + def test_shallow_object_secret_is_redacted_by_name(self): + # Box(password=) -> field redacted, secret absent + @dataclass + class Box: + password: str + + out = mask(Box("hunter2")) + assert out["password"] == REDACTED + assert "hunter2" not in str(out) + + @pytest.mark.parametrize("depth", [_MAX_MASK_DEPTH, _MAX_MASK_DEPTH + 5]) + def test_secret_past_the_depth_limit_does_not_leak(self, depth): + # Node(Node(...Node(Box(password=)))) nested past the depth cap. + # Box.__repr__ hides the field name, so only name-based traversal could catch + # it - past the cap we must fail closed and never emit the repr. + @dataclass + class Box: + password: str + + def __repr__(self): + return f"Box({self.password})" + + class Node: + def __init__(self, child): + self.child = child + + value = Box("hunter2") + for _ in range(depth): + value = Node(value) + assert "hunter2" not in str(mask(value)) + + def test_structure_past_the_depth_limit_degrades_to_a_placeholder(self): + # [[[ ... ["leaf"] ... ]]] nested past the cap -> placeholder, leaf dropped + value = "leaf" + for _ in range(_MAX_MASK_DEPTH + 1): + value = [value] + out = mask(value) + assert TOO_LONG in str(out) + assert "leaf" not in str(out) + + def test_namedtuple_is_decomposed_by_field_name(self): + # Creds(user="alice", password=) -> dict; password redacted by name + Creds = collections.namedtuple("Creds", ["user", "password"]) + out = mask(Creds("alice", "uHjH9secret")) + assert out["user"] == "alice" + assert out["password"] == REDACTED + assert "Creds" in out["__class__"] + assert "uHjH9secret" not in str(out) + + def test_namedtuple_field_is_caught_by_name_not_repr(self): + # a custom __repr__ that hides the field names can't relabel a sensitive field + # out of the mask: we traverse the real fields, not the repr + class Token(collections.namedtuple("Token", ["label", "secret"])): + def __repr__(self): + return f"<{self.label}>" + + out = mask(Token("prod", "sk_live_xyz")) + assert out["secret"] == REDACTED + assert "sk_live_xyz" not in str(out) + + def test_sensitively_named_property_is_not_leaked_through_repr(self): + # a @property lives on the class, not in __dict__, so attribute traversal never + # sees it - but a custom __repr__ can expose it. Catch it by name, don't repr. + class Config: + @property + def password(self): + return "hunter2" + + def __repr__(self): # hides the field name, so the repr scan can't catch it + return f"Config({self.password})" + + out = mask(Config()) + assert out["password"] == REDACTED + assert "Config" in out["__class__"] + assert "hunter2" not in str(out) + + def test_sensitively_named_cached_property_is_not_leaked(self): + # functools.cached_property is a class descriptor until first access, so an + # un-accessed one isn't in __dict__ either - catch it by name like @property + class Client: + @functools.cached_property + def api_key(self): + return "sk_live_xyz" + + def __repr__(self): + return f"Client({self.api_key})" + + out = mask(Client()) + assert out["api_key"] == REDACTED + assert "sk_live_xyz" not in str(out) + + def test_sensitively_named_slot_is_redacted_by_name(self): + # a __slots__ value lives outside __dict__; a custom __repr__ that hides the slot + # name would otherwise leak it, so catch the slot by name + class Session: + __slots__ = ("token",) + + def __init__(self, token): + self.token = token + + def __repr__(self): # hides the slot name + return f"Session({self.token})" + + out = mask(Session("sk_live_xyz")) + assert out["token"] == REDACTED + assert "sk_live_xyz" not in str(out) + + def test_sensitively_named_class_attribute_is_redacted_by_name(self): + # a class-level data attribute also lives outside instance __dict__; redact it by + # name so a custom __repr__ can't leak it + class Config: + password = "hunter2" + + def __repr__(self): + return f"Config({self.password})" + + out = mask(Config()) + assert out["password"] == REDACTED + assert "hunter2" not in str(out) + + def test_non_sensitive_members_are_left_to_the_normal_repr(self): + # only members whose *name* matches the mask are redacted; a plain property/slot + # must not trip redaction (and no getter is ever called) + class Box: + __slots__ = ("region",) + + def __init__(self): + self.region = "us-east-1" + + @property + def host(self): + return "db.example.com" + + def __repr__(self): + return f"Box({self.host}/{self.region})" + + assert mask(Box()) == "Box(db.example.com/us-east-1)" + + +# --- 7. opaque repr fallback --------------------------------------------------------- + + +class TestOpaqueReprFallback: + """Slotted objects have no __dict__ to walk, so they fall back to a fail-closed + repr: keep it only if nothing about it looks sensitive.""" + + def test_repr_is_kept_when_nothing_looks_sensitive(self): + class Point: + __slots__ = ("x", "y") + + def __repr__(self): + return "Point(x=1, y=2)" + + assert _safe_repr(Point(), make_config()) == "Point(x=1, y=2)" + + def test_whole_value_is_redacted_when_repr_mentions_a_secret(self): + # repr embeds the word "password" -> redact the entire repr, don't emit it. + # No maskable member here (empty slots), so this exercises the repr fallback. + class Creds: + __slots__ = () + + def __repr__(self): + return "Creds(password=s3cr3t)" + + assert _safe_repr(Creds(), make_config()) == REDACTED + assert mask(Creds()) == REDACTED # same outcome through the masker + + def test_broken_repr_yields_a_type_placeholder(self): + # a __repr__ that raises must neither crash capture nor leak its fields + class Boom: + __slots__ = ("secret",) + + def __init__(self): + self.secret = "leak123" + + def __repr__(self): + raise RuntimeError("boom") + + result = _safe_repr(Boom(), make_config()) + assert "leak123" not in result + assert result.startswith("<") and result.endswith(">") + assert "Boom" in result # a type-name placeholder, nothing from the instance + + def test_overly_long_repr_is_redacted(self): + # a repr too long to scan can't be vouched for -> redact it whole + class Huge: + __slots__ = ("payload",) + + def __init__(self, payload): + self.payload = payload + + def __repr__(self): + return self.payload + + payload = "topsecret" + "x" * (_MAX_VALUE_LENGTH_FOR_PATTERN_MATCH + 1) + assert _safe_repr(Huge(payload), make_config()) == REDACTED + + def test_url_credentials_in_repr_are_scrubbed(self): + # repr embeds a connection string -> credential scrubbed, rest of repr kept + class Conn: + __slots__ = () + + def __repr__(self): + return "Conn(url=postgresql://user:leakpw@db/app)" + + assert "leakpw" not in _safe_repr(Conn(), make_config()) + assert "leakpw" in _safe_repr(Conn(), make_config(mask_urls=False)) + + +# --- 8. variable encoding ------------------------------------------------------------ + + +class TestVariableEncoding: + """The top-level encoder decides the wire format: numbers stay raw, everything else + becomes a string, and the size budget is enforced.""" + + @pytest.mark.parametrize("value", [0, 42, -7, 3.14, 1.0]) + def test_numbers_stay_raw_json_numbers(self, value): + assert encode(value) == value + + def test_none_and_booleans_become_strings(self): + assert encode(None) == "None" + assert encode(True) == "True" + assert encode(False) == "False" + + @pytest.mark.parametrize( + "value, expected", + [(float("nan"), "nan"), (float("inf"), "inf"), (float("-inf"), "-inf")], + ) + def test_non_finite_floats_become_strings(self, value, expected): + assert encode(value) == expected + + def test_string_is_emitted_unchanged(self): + assert encode("hello world") == "hello world" + + def test_dict_becomes_a_json_string(self): + # {"name": "test", "value": 123} -> '{"name": "test", "value": 123}' + assert ( + encode({"name": "test", "value": 123}) == '{"name": "test", "value": 123}' + ) + + def test_nested_non_finite_floats_keep_the_json_strict_valid(self): + # {"ratio": inf, "ok": 1.5} -> inf becomes "inf", output parses as strict JSON + out = encode({"ratio": float("inf"), "ok": 1.5}) + assert "NaN" not in out and "Infinity" not in out + assert json.loads(out) == {"ratio": "inf", "ok": 1.5} + + def test_value_is_truncated_to_the_length_budget(self): + out = encode("a" * 2000) + assert len(out) == 1024 + assert out.endswith("...") + + def test_value_is_dropped_when_the_shared_budget_is_exhausted(self): + limiter = VariableSizeLimiter(max_size=4) + assert encode("ok", limiter=limiter) == "ok" # 2 bytes fit + assert encode("overflow", limiter=limiter) is None # budget already spent + + def test_dict_with_a_non_string_key_encodes_to_valid_json(self): + # the tuple key used to break json.dumps and leak a repr of the original dict; + # now it stays strict JSON with the sensitive value masked + out = encode({(1, 2): {"password": "hunter2"}}) + assert json.loads(out) == {"(1, 2)": {"password": REDACTED}} + assert "hunter2" not in out + + def test_namedtuple_encodes_to_valid_json(self): + # Creds(user, password) -> a JSON object keyed by field name, password masked + Creds = collections.namedtuple("Creds", ["user", "password"]) + out = encode(Creds("alice", "hunter2")) + assert json.loads(out)["password"] == REDACTED + assert "hunter2" not in out + + +# --- 9. frame variable extraction ---------------------------------------------------- + + +class TestFrameVariableExtraction: + """Reading a frame's locals: ignored names dropped, masked names redacted, scalars + ordered ahead of complex values.""" + + def test_simple_and_complex_locals_are_serialized(self): + # locals: count=3 (scalar), data={"a": 1} (complex) + out = extract(count=3, data={"a": 1}) + assert out == {"count": 3, "data": '{"a": 1}'} + + def test_ignored_names_are_skipped(self): + # locals: visible=1, __hidden=2 -> __hidden dropped by the ignore patterns + assert extract(visible=1, __hidden=2) == {"visible": 1} + + def test_variable_whose_name_matches_the_mask_is_redacted_whole(self): + # a local named "password" is redacted without inspecting its value at all + assert extract(password="anything") == {"password": REDACTED} + + def test_scalars_come_first_then_complex_values_each_group_sorted(self): + # scalars z, a + complex data, m -> a, z, then data, m + out = extract(z=1, a=2, m=[1], data={"k": 1}) + assert list(out) == ["a", "z", "data", "m"] + + def test_patterns_are_compiled_once_per_capture(self, monkeypatch): + # A multi-frame in-app stack must compile the mask/ignore patterns ONCE for the + # whole capture, not once per frame (the regression this refactor fixes). + from posthog import exception_utils + + # Each frame carries a local so it has something to serialize. + def deepest(note): + raise ValueError(note) + + def middle(note): + deepest(note) + + def outer(note): + middle(note) + + try: + outer("boom") + except ValueError: + exc_info = sys.exc_info() + + tb_frames = list(iter_stacks(exc_info[2])) + assert len(tb_frames) >= 3 # several in-app frames to process + frames = [{"in_app": True} for _ in tb_frames] + all_exceptions = [{"stacktrace": {"frames": frames}}] + + compile_calls = [] + real_compile = exception_utils._compile_patterns + monkeypatch.setattr( + exception_utils, + "_compile_patterns", + lambda patterns: compile_calls.append(patterns) or real_compile(patterns), + ) + + attach_code_variables_to_frames( + all_exceptions, + exc_info, + list(DEFAULT_CODE_VARIABLES_MASK_PATTERNS), + list(DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS), + ) + + # exactly two compiles (mask + ignore) regardless of how many frames there are + assert len(compile_calls) == 2 + # and multiple frames really were processed + assert sum("code_variables" in frame for frame in frames) >= 2 + + +# --- 10. end to end ------------------------------------------------------------------ + + +class TestEndToEnd: + """The whole code-variables pipeline through a real subprocess: an uncaught + exception is autocaptured and its serialized payload (with code_variables) is + printed in debug mode.""" + + def test_code_variables_are_captured_and_masked(self, tmpdir): + # the failing frame's locals: one safe dict, two secrets, one ignored dunder + # greeting="hello world" count=42 data={"name": "test", ...} + # password= note= __hidden= + output = run_app( + tmpdir, + """ + make_client(capture_exception_code_variables=True) + + def trigger_error(): + greeting = "hello world" + count = 42 + data = {"name": "test", "value": 123} + password = "secret123" # name matches -> redacted + note = "contains_password_here" # value matches -> redacted + __hidden = "ignored" # dunder -> skipped + 1 / 0 + + trigger_error() + """, + ) + assert "ZeroDivisionError" in output + assert "'code_variables':" in output # the key form, not the temp-path word + # scalars show up in the debug log in their repr form + assert "'greeting': 'hello world'" in output + assert "'count': 42" in output + # a dict variable is double-encoded as a JSON string (locks the wire format) + assert '"data": "{\\"name\\": \\"test\\", \\"value\\": 123}"' in output + # both secrets are redacted, and the ignored name never appears + assert "'password': '%s'" % REDACTED in output + assert "'note': '%s'" % REDACTED in output + assert "'__hidden'" not in output + + def test_code_variables_are_not_captured_when_disabled(self, tmpdir): + output = run_app( + tmpdir, + """ + make_client(capture_exception_code_variables=False) + + def trigger_error(): + value = "hello world" + 1 / 0 + + trigger_error() + """, + ) + assert "ZeroDivisionError" in output + # check the key forms (repr + JSON), not the bare word - the temp path that + # gets serialized into the payload happens to contain "code_variables" too. + assert "'code_variables':" not in output + assert '"code_variables":' not in output + + def test_a_context_can_enable_and_customize_masking(self, tmpdir): + # client has capture OFF; the context turns it ON with a custom "bank" pattern + output = run_app( + tmpdir, + """ + client = make_client(capture_exception_code_variables=False) + + def process_data(): + bank = "should_be_masked" # matched by the custom context pattern + account = "visible" + 1 / 0 + + with posthog.new_context(client=client): + posthog.set_capture_exception_code_variables_context(True) + posthog.set_code_variables_mask_patterns_context([r"(?i).*bank.*"]) + posthog.set_code_variables_ignore_patterns_context([]) + process_data() + """, + ) + assert "code_variables" in output + assert "'bank': '%s'" % REDACTED in output + assert "'account': 'visible'" in output + + def test_object_secret_is_never_emitted(self, tmpdir): + # args = (PostgresConfig(host=..., password=), Inputs(schema=...)) + # The secret is read from the environment so it is never a source literal. + output = run_app( + tmpdir, + """ + from dataclasses import dataclass + + @dataclass + class PostgresConfig: + host: str + password: str + + @dataclass + class Inputs: + schema_name: str + + make_client(capture_exception_code_variables=True) + + def trigger_error(): + secret = os.environ["TEST_DB_PASSWORD"] + args = ( + PostgresConfig(host="db.example.com", password=secret), + Inputs(schema_name="traffic_stats"), + ) + 1 / 0 + + trigger_error() + """, + env={"TEST_DB_PASSWORD": "uHjH9WJuEV0VT2NKoP7zpQ"}, + ) + assert "code_variables" in output + assert "uHjH9WJuEV0VT2NKoP7zpQ" not in output # the secret never leaks + assert REDACTED in output + assert "PostgresConfig" in output # surrounding context is kept + assert "traffic_stats" in output + + def test_url_credential_masking_can_be_disabled(self, tmpdir): + # db_uri = "postgresql://user:@host/db" with a neutral name and no + # masked keyword - so only the URL heuristic could scrub it, and it is off here. + output = run_app( + tmpdir, + """ + make_client( + capture_exception_code_variables=True, + code_variables_mask_url_credentials=False, + ) + + def trigger_error(): + db_uri = "postgresql://user:" + os.environ["TEST_DB_PASSWORD"] + "@host/db" + 1 / 0 + + trigger_error() + """, + env={"TEST_DB_PASSWORD": "p4ssRUNTIME"}, + ) + assert "code_variables" in output + assert "p4ssRUNTIME" in output # URL masking disabled -> credential retained diff --git a/posthog/test/test_exception_capture.py b/posthog/test/test_exception_capture.py index 155a144c..bc632ba0 100644 --- a/posthog/test/test_exception_capture.py +++ b/posthog/test/test_exception_capture.py @@ -154,658 +154,3 @@ def test_excepthook(tmpdir): b'"$exception_list": [{"mechanism": {"type": "generic", "handled": true}, "module": null, "type": "ZeroDivisionError", "value": "division by zero", "stacktrace": {"frames": [{"platform": "python", "filename": "app.py", "abs_path"' in output ) - - -def test_code_variables_capture(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - from posthog import Posthog - - class UnserializableObject: - pass - - posthog = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=True, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def trigger_error(): - my_string = "hello world" - my_number = 42 - my_bool = True - my_dict = {"name": "test", "value": 123} - my_sensitive_dict = { - "safe_key": "safe_value", - "password": "secret123", # key matches pattern -> should be masked - "other_key": "contains_password_here", # value matches pattern -> should be masked - } - my_nested_dict = { - "level1": { - "level2": { - "api_key": "nested_secret", # deeply nested key matches - "data": "contains_token_here", # deeply nested value matches - "safe": "visible", - } - } - } - my_list = ["safe_item", "has_password_inside", "another_safe"] - my_tuple = ("tuple_safe", "secret_in_value", "tuple_also_safe") - my_list_of_dicts = [ - {"id": 1, "password": "list_dict_secret"}, - {"id": 2, "value": "safe_value"}, - ] - my_obj = UnserializableObject() - my_password = "secret123" # Should be masked by default (name matches) - my_innocent_var = "contains_password_here" # Should be masked by default (value matches) - __should_be_ignored = "hidden" # Should be ignored by default - - 1/0 # Trigger exception - - def intermediate_function(): - request_id = "abc-123" - user_count = 100 - is_active = True - - trigger_error() - - def process_data(): - batch_size = 50 - retry_count = 3 - - intermediate_function() - - process_data() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output - - assert b"ZeroDivisionError" in output - assert b"code_variables" in output - - # Variables from trigger_error frame - assert b"'my_string': 'hello world'" in output - assert b"'my_number': 42" in output - assert b"'my_bool': 'True'" in output - assert b'"my_dict": "{\\"name\\": \\"test\\", \\"value\\": 123}"' in output - assert ( - b'{\\"safe_key\\": \\"safe_value\\", \\"password\\": \\"$$_posthog_redacted_based_on_masking_rules_$$\\", \\"other_key\\": \\"$$_posthog_redacted_based_on_masking_rules_$$\\"}' - in output - ) - assert ( - b'{\\"level1\\": {\\"level2\\": {\\"api_key\\": \\"$$_posthog_redacted_based_on_masking_rules_$$\\", \\"data\\": \\"$$_posthog_redacted_based_on_masking_rules_$$\\", \\"safe\\": \\"visible\\"}}}' - in output - ) - assert ( - b'[\\"safe_item\\", \\"$$_posthog_redacted_based_on_masking_rules_$$\\", \\"another_safe\\"]' - in output - ) - assert ( - b'[\\"tuple_safe\\", \\"$$_posthog_redacted_based_on_masking_rules_$$\\", \\"tuple_also_safe\\"]' - in output - ) - assert ( - b'[{\\"id\\": 1, \\"password\\": \\"$$_posthog_redacted_based_on_masking_rules_$$\\"}, {\\"id\\": 2, \\"value\\": \\"safe_value\\"}]' - in output - ) - assert b"<__main__.UnserializableObject object at" in output - assert b"'my_password': '$$_posthog_redacted_based_on_masking_rules_$$'" in output - assert ( - b"'my_innocent_var': '$$_posthog_redacted_based_on_masking_rules_$$'" in output - ) - assert b"'__should_be_ignored':" not in output - - # Variables from intermediate_function frame - assert b"'request_id': 'abc-123'" in output - assert b"'user_count': 100" in output - assert b"'is_active': 'True'" in output - - # Variables from process_data frame - assert b"'batch_size': 50" in output - assert b"'retry_count': 3" in output - - -def test_code_variables_context_override(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - import posthog - from posthog import Posthog - - posthog_client = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=False, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def process_data(): - bank = "should_be_masked" - __dunder_var = "should_be_visible" - - 1/0 - - with posthog.new_context(client=posthog_client): - posthog.set_capture_exception_code_variables_context(True) - posthog.set_code_variables_mask_patterns_context([r"(?i).*bank.*"]) - posthog.set_code_variables_ignore_patterns_context([]) - - process_data() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output - - assert b"ZeroDivisionError" in output - assert b"code_variables" in output - assert b"'bank': '$$_posthog_redacted_based_on_masking_rules_$$'" in output - assert b"'__dunder_var': 'should_be_visible'" in output - - -def test_code_variables_size_limiter(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - from posthog import Posthog - - posthog = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=True, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def trigger_error(): - var_a = "a" * 2000 - var_b = "b" * 2000 - var_c = "c" * 2000 - var_d = "d" * 2000 - var_e = "e" * 2000 - var_f = "f" * 2000 - var_g = "g" * 2000 - - 1/0 - - def intermediate_function(): - var_h = "h" * 2000 - var_i = "i" * 2000 - var_j = "j" * 2000 - var_k = "k" * 2000 - var_l = "l" * 2000 - var_m = "m" * 2000 - var_n = "n" * 2000 - - trigger_error() - - def process_data(): - var_o = "o" * 2000 - var_p = "p" * 2000 - var_q = "q" * 2000 - var_r = "r" * 2000 - var_s = "s" * 2000 - var_t = "t" * 2000 - var_u = "u" * 2000 - - intermediate_function() - - process_data() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output.decode("utf-8") - - assert "ZeroDivisionError" in output - assert "code_variables" in output - - captured_vars = [] - for var_name in [ - "var_a", - "var_b", - "var_c", - "var_d", - "var_e", - "var_f", - "var_g", - "var_h", - "var_i", - "var_j", - "var_k", - "var_l", - "var_m", - "var_n", - "var_o", - "var_p", - "var_q", - "var_r", - "var_s", - "var_t", - "var_u", - ]: - if f"'{var_name}'" in output: - captured_vars.append(var_name) - - assert len(captured_vars) > 0 - assert len(captured_vars) < 21 - - -def test_code_variables_disabled_capture(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - from posthog import Posthog - - posthog = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=False, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def trigger_error(): - my_string = "hello world" - my_number = 42 - my_bool = True - - 1/0 - - trigger_error() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output.decode("utf-8") - - assert "ZeroDivisionError" in output - assert "'code_variables':" not in output - assert '"code_variables":' not in output - assert "'my_string'" not in output - assert "'my_number'" not in output - - -def test_code_variables_enabled_then_disabled_in_context(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - import posthog - from posthog import Posthog - - posthog_client = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=True, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def process_data(): - my_var = "should not be captured" - important_value = 123 - - 1/0 - - with posthog.new_context(client=posthog_client): - posthog.set_capture_exception_code_variables_context(False) - - process_data() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output.decode("utf-8") - - assert "ZeroDivisionError" in output - assert "'code_variables':" not in output - assert '"code_variables":' not in output - assert "'my_var'" not in output - assert "'important_value'" not in output - - -def test_code_variables_repr_fallback(tmpdir): - app = tmpdir.join("app.py") - app.write( - dedent( - """ - import os - import re - from datetime import datetime, timedelta - from decimal import Decimal - from fractions import Fraction - from posthog import Posthog - - class CustomReprClass: - def __repr__(self): - return '' - - posthog = Posthog( - 'phc_x', - host='https://eu.i.posthog.com', - debug=True, - enable_exception_autocapture=True, - capture_exception_code_variables=True, - project_root=os.path.dirname(os.path.abspath(__file__)) - ) - - def trigger_error(): - my_regex = re.compile(r'\\d+') - my_datetime = datetime(2024, 1, 15, 10, 30, 45) - my_timedelta = timedelta(days=5, hours=3) - my_decimal = Decimal('123.456') - my_fraction = Fraction(3, 4) - my_set = {1, 2, 3} - my_frozenset = frozenset([4, 5, 6]) - my_bytes = b'hello bytes' - my_bytearray = bytearray(b'mutable bytes') - my_memoryview = memoryview(b'memory view') - my_complex = complex(3, 4) - my_range = range(10) - my_custom = CustomReprClass() - my_lambda = lambda x: x * 2 - my_function = trigger_error - - 1/0 - - trigger_error() - """ - ) - ) - - with pytest.raises(subprocess.CalledProcessError) as excinfo: - subprocess.check_output([sys.executable, str(app)], stderr=subprocess.STDOUT) - - output = excinfo.value.output.decode("utf-8") - - assert "ZeroDivisionError" in output - assert "code_variables" in output - - assert "re.compile(" in output and "\\\\d+" in output - assert "datetime.datetime(2024, 1, 15, 10, 30, 45)" in output - assert "datetime.timedelta(days=5, seconds=10800)" in output - assert "Decimal('123.456')" in output - assert "Fraction(3, 4)" in output - assert "{1, 2, 3}" in output - assert "frozenset({4, 5, 6})" in output - assert "b'hello bytes'" in output - assert "bytearray(b'mutable bytes')" in output - assert "" in output - assert "" in output - assert "" - - # Circular list - circular_list = ["item"] - circular_list.append(circular_list) - - result = _mask_sensitive_data(circular_list, compiled_mask) - assert result[0] == "item" - assert result[1] == "" - - -def test_compile_patterns_fast_path_and_regex_fallback(): - from posthog.exception_utils import _compile_patterns, _pattern_matches - - # Simple case-insensitive patterns should become substrings - simple_only = _compile_patterns([r"(?i)password", r"(?i)token", r"(?i)jwt"]) - substrings, regexes = simple_only - assert substrings == ["password", "token", "jwt"] - assert regexes == [] - - assert _pattern_matches("my_password_var", simple_only) is True - assert _pattern_matches("MY_TOKEN", simple_only) is True - assert _pattern_matches("safe_variable", simple_only) is False - - # Complex regex patterns should stay as compiled regexes - complex_only = _compile_patterns([r"^__.*", r"\d{3,}", r"^sk_live_"]) - substrings, regexes = complex_only - assert substrings == [] - assert len(regexes) == 3 - - assert _pattern_matches("__dunder", complex_only) is True - assert _pattern_matches("has_999_numbers", complex_only) is True - assert _pattern_matches("sk_live_abc123", complex_only) is True - assert _pattern_matches("normal_var", complex_only) is False - - # Mixed: simple substrings + complex regexes together - mixed = _compile_patterns( - [ - r"(?i)secret", # simple - r"(?i)api_key", # simple - r"^__.*", # regex - r"\btoken_\w+", # regex - ] - ) - substrings, regexes = mixed - assert substrings == ["secret", "api_key"] - assert len(regexes) == 2 - - # Substring matches - assert _pattern_matches("my_secret", mixed) is True - assert _pattern_matches("API_KEY_VALUE", mixed) is True - - # Regex matches - assert _pattern_matches("__private", mixed) is True - assert _pattern_matches("token_abc", mixed) is True - - # No match - assert _pattern_matches("safe_var", mixed) is False - - -def test_mask_sensitive_data_large_dict_replaced(): - from posthog.exception_utils import ( - CODE_VARIABLES_TOO_LONG_VALUE, - _compile_patterns, - _mask_sensitive_data, - ) - - compiled_mask = _compile_patterns([r"(?i)password"]) - - large_dict = {f"key_{i}": f"value_{i}" for i in range(300)} - - result = _mask_sensitive_data(large_dict, compiled_mask) - - assert result == CODE_VARIABLES_TOO_LONG_VALUE - - -def test_mask_sensitive_data_large_list_replaced(): - from posthog.exception_utils import ( - CODE_VARIABLES_TOO_LONG_VALUE, - _compile_patterns, - _mask_sensitive_data, - ) - - compiled_mask = _compile_patterns([r"(?i)password"]) - - large_list = [f"item_{i}" for i in range(300)] - - result = _mask_sensitive_data(large_list, compiled_mask) - - assert result == CODE_VARIABLES_TOO_LONG_VALUE - - -def test_mask_sensitive_data_large_tuple_replaced(): - from posthog.exception_utils import ( - CODE_VARIABLES_TOO_LONG_VALUE, - _compile_patterns, - _mask_sensitive_data, - ) - - compiled_mask = _compile_patterns([r"(?i)password"]) - - large_tuple = tuple(f"item_{i}" for i in range(300)) - - result = _mask_sensitive_data(large_tuple, compiled_mask) - - assert result == CODE_VARIABLES_TOO_LONG_VALUE diff --git a/references/public_api_snapshot.txt b/references/public_api_snapshot.txt index f8f1a031..e8c7d815 100644 --- a/references/public_api_snapshot.txt +++ b/references/public_api_snapshot.txt @@ -7,6 +7,7 @@ alias posthog.BeforeSendCallback -> posthog.types.BeforeSendCallback alias posthog.Client -> posthog.client.Client alias posthog.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS alias posthog.DEFAULT_CODE_VARIABLES_MASK_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS +alias posthog.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS alias posthog.ExceptionArg -> posthog.args.ExceptionArg alias posthog.ExceptionCapture -> posthog.exception_capture.ExceptionCapture alias posthog.FeatureFlag -> posthog.types.FeatureFlag @@ -215,6 +216,7 @@ alias posthog.client.APIError -> posthog.request.APIError alias posthog.client.Consumer -> posthog.consumer.Consumer alias posthog.client.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS alias posthog.client.DEFAULT_CODE_VARIABLES_MASK_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS +alias posthog.client.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS alias posthog.client.EVENTS_ENDPOINT -> posthog.request.EVENTS_ENDPOINT alias posthog.client.ExceptionArg -> posthog.args.ExceptionArg alias posthog.client.ExceptionCapture -> posthog.exception_capture.ExceptionCapture @@ -253,6 +255,7 @@ alias posthog.client.get -> posthog.request.get alias posthog.client.get_capture_exception_code_variables_context -> posthog.contexts.get_capture_exception_code_variables_context alias posthog.client.get_code_variables_ignore_patterns_context -> posthog.contexts.get_code_variables_ignore_patterns_context alias posthog.client.get_code_variables_mask_patterns_context -> posthog.contexts.get_code_variables_mask_patterns_context +alias posthog.client.get_code_variables_mask_url_credentials_context -> posthog.contexts.get_code_variables_mask_url_credentials_context alias posthog.client.get_context_device_id -> posthog.contexts.get_context_device_id alias posthog.client.get_context_distinct_id -> posthog.contexts.get_context_distinct_id alias posthog.client.get_context_session_id -> posthog.contexts.get_context_session_id @@ -298,6 +301,7 @@ alias posthog.inner_scoped -> posthog.contexts.scoped alias posthog.inner_set_capture_exception_code_variables_context -> posthog.contexts.set_capture_exception_code_variables_context alias posthog.inner_set_code_variables_ignore_patterns_context -> posthog.contexts.set_code_variables_ignore_patterns_context alias posthog.inner_set_code_variables_mask_patterns_context -> posthog.contexts.set_code_variables_mask_patterns_context +alias posthog.inner_set_code_variables_mask_url_credentials_context -> posthog.contexts.set_code_variables_mask_url_credentials_context alias posthog.inner_set_context_device_id -> posthog.contexts.set_context_device_id alias posthog.inner_set_context_session -> posthog.contexts.set_context_session alias posthog.inner_tag -> posthog.contexts.tag @@ -452,6 +456,7 @@ attribute posthog.client.Client.api_key = (project_api_key or '').strip() attribute posthog.client.Client.capture_exception_code_variables = capture_exception_code_variables attribute posthog.client.Client.code_variables_ignore_patterns = code_variables_ignore_patterns if code_variables_ignore_patterns is not None else DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS attribute posthog.client.Client.code_variables_mask_patterns = code_variables_mask_patterns if code_variables_mask_patterns is not None else DEFAULT_CODE_VARIABLES_MASK_PATTERNS +attribute posthog.client.Client.code_variables_mask_url_credentials = code_variables_mask_url_credentials if code_variables_mask_url_credentials is not None else DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS attribute posthog.client.Client.cohorts: Optional[dict[str, Any]] = None attribute posthog.client.Client.consumers = None attribute posthog.client.Client.debug = debug @@ -494,6 +499,7 @@ attribute posthog.client.Client.timeout = timeout attribute posthog.client.MAX_DICT_SIZE = 50000 attribute posthog.code_variables_ignore_patterns = DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS attribute posthog.code_variables_mask_patterns = DEFAULT_CODE_VARIABLES_MASK_PATTERNS +attribute posthog.code_variables_mask_url_credentials = DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS attribute posthog.consumer.AI_MAX_MSG_SIZE = 8 * 1024 * 1024 attribute posthog.consumer.BATCH_SIZE_LIMIT = 5 * 1024 * 1024 attribute posthog.consumer.Consumer.api_key = api_key @@ -516,6 +522,7 @@ attribute posthog.contexts.ContextScope.capture_exceptions = capture_exceptions attribute posthog.contexts.ContextScope.client: Optional[Client] = client attribute posthog.contexts.ContextScope.code_variables_ignore_patterns: Optional[list] = None attribute posthog.contexts.ContextScope.code_variables_mask_patterns: Optional[list] = None +attribute posthog.contexts.ContextScope.code_variables_mask_url_credentials: Optional[bool] = None attribute posthog.contexts.ContextScope.device_id: Optional[str] = None attribute posthog.contexts.ContextScope.distinct_id: Optional[str] = None attribute posthog.contexts.ContextScope.fresh = fresh @@ -546,7 +553,8 @@ attribute posthog.exception_utils.BASE64_ALPHABET = re.compile('^[a-zA-Z0-9/+=]* attribute posthog.exception_utils.CODE_VARIABLES_REDACTED_VALUE = '$$_posthog_redacted_based_on_masking_rules_$$' attribute posthog.exception_utils.CODE_VARIABLES_TOO_LONG_VALUE = '$$_posthog_value_too_long_$$' attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS = ['^__.*'] -attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS = ['(?i)password', '(?i)secret', '(?i)passwd', '(?i)pwd', '(?i)api_key', '(?i)apikey', '(?i)auth', '(?i)credentials', '(?i)privatekey', '(?i)private_key', '(?i)token', '(?i)aws_access_key_id', '(?i)_pass', '(?i)sk_', '(?i)jwt'] +attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS = ['(?i)password', '(?i)secret', '(?i)passwd', '(?i)pwd', '(?i)api_key', '(?i)apikey', '(?i)auth', '(?i)credentials', '(?i)privatekey', '(?i)private_key', '(?i)token', '(?i)aws_access_key_id', '(?i)_pass', '(?i)sk_', '(?i)jwt', '(?i)connection_string', '(?i)connectionstring', '(?i)conn_str', '(?i)connstr', '(?i)dsn'] +attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS = True attribute posthog.exception_utils.DEFAULT_MAX_VALUE_LENGTH = 1024 attribute posthog.exception_utils.DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 attribute posthog.exception_utils.Event = TypedDict('Event', {'breadcrumbs': Dict[Literal['values'], List[Dict[str, Any]]], 'check_in_id': str, 'contexts': Dict[str, Dict[str, object]], 'dist': str, 'duration': Optional[float], 'environment': str, 'errors': List[Dict[str, Any]], 'event_id': str, 'exception': Dict[Literal['values'], List[Dict[str, Any]]], 'level': LogLevelStr, 'logger': str, 'message': str, 'modules': Dict[str, str], 'monitor_slug': Optional[str], 'platform': Literal['python'], 'profile': object, 'release': str, 'request': Dict[str, object], 'server_name': str, 'spans': List[Dict[str, object]], 'stacktrace': Dict[str, object], 'start_timestamp': datetime, 'status': Optional[str], 'threads': Dict[Literal['values'], List[Dict[str, Any]]], 'timestamp': Optional[datetime], 'transaction': str, 'type': Literal['check_in', 'transaction'], 'user': Dict[str, object], '_metrics_summary': Dict[str, object]}, total=False) @@ -744,7 +752,7 @@ class posthog.ai.types.ToolInProgress class posthog.args.OptionalCaptureArgs class posthog.args.OptionalSetArgs class posthog.bucketed_rate_limiter.BucketedRateLimiter(bucket_size: Number, refill_rate: Number, refill_interval_seconds: Number, on_bucket_rate_limited: Optional[Callable[[Hashable], None]] = None, clock: Callable[[], float] = time.monotonic) -class posthog.client.Client(project_api_key: str, host=None, debug=False, max_queue_size=10000, send=True, on_error=None, flush_at=100, flush_interval=5.0, gzip=False, max_retries=3, sync_mode=False, timeout=15, thread=1, poll_interval=30, personal_api_key=None, disabled=False, disable_geoip=True, is_server=True, historical_migration=False, feature_flags_request_timeout_seconds=3, super_properties=None, enable_exception_autocapture=False, log_captured_exceptions=False, project_root=None, privacy_mode=False, before_send=None, flag_fallback_cache_url=None, enable_local_evaluation=True, flag_definition_cache_provider: Optional[FlagDefinitionCacheProvider] = None, capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, exception_autocapture_refill_rate=ExceptionCapture.DEFAULT_REFILL_RATE, exception_autocapture_refill_interval_seconds=ExceptionCapture.DEFAULT_REFILL_INTERVAL_SECONDS, _dedicated_ai_endpoint=False) +class posthog.client.Client(project_api_key: str, host=None, debug=False, max_queue_size=10000, send=True, on_error=None, flush_at=100, flush_interval=5.0, gzip=False, max_retries=3, sync_mode=False, timeout=15, thread=1, poll_interval=30, personal_api_key=None, disabled=False, disable_geoip=True, is_server=True, historical_migration=False, feature_flags_request_timeout_seconds=3, super_properties=None, enable_exception_autocapture=False, log_captured_exceptions=False, project_root=None, privacy_mode=False, before_send=None, flag_fallback_cache_url=None, enable_local_evaluation=True, flag_definition_cache_provider: Optional[FlagDefinitionCacheProvider] = None, capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, code_variables_mask_url_credentials=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, exception_autocapture_refill_rate=ExceptionCapture.DEFAULT_REFILL_RATE, exception_autocapture_refill_interval_seconds=ExceptionCapture.DEFAULT_REFILL_INTERVAL_SECONDS, _dedicated_ai_endpoint=False) class posthog.consumer.Consumer(queue, api_key, flush_at=100, host=None, on_error=None, flush_interval=5.0, gzip=False, retries=10, timeout=15, historical_migration=False, dedicated_ai_endpoint=False) class posthog.contexts.ContextScope(parent=None, fresh: bool = False, capture_exceptions: bool = True, client: Optional[Client] = None) class posthog.exception_capture.ExceptionCapture(client: Client, rate_limiting_enabled=False, bucket_size=DEFAULT_BUCKET_SIZE, refill_rate=DEFAULT_REFILL_RATE, refill_interval_seconds=DEFAULT_REFILL_INTERVAL_SECONDS) @@ -865,6 +873,7 @@ function posthog.client.stringify_id(val) function posthog.contexts.get_capture_exception_code_variables_context() -> Optional[bool] function posthog.contexts.get_code_variables_ignore_patterns_context() -> Optional[list] function posthog.contexts.get_code_variables_mask_patterns_context() -> Optional[list] +function posthog.contexts.get_code_variables_mask_url_credentials_context() -> Optional[bool] function posthog.contexts.get_context_device_id() -> Optional[str] function posthog.contexts.get_context_distinct_id() -> Optional[str] function posthog.contexts.get_context_session_id() -> Optional[str] @@ -875,11 +884,12 @@ function posthog.contexts.scoped(fresh: bool = False, capture_exceptions: Option function posthog.contexts.set_capture_exception_code_variables_context(enabled: bool) -> None function posthog.contexts.set_code_variables_ignore_patterns_context(ignore_patterns: list) -> None function posthog.contexts.set_code_variables_mask_patterns_context(mask_patterns: list) -> None +function posthog.contexts.set_code_variables_mask_url_credentials_context(enabled: bool) -> None function posthog.contexts.set_context_device_id(device_id: str) -> None function posthog.contexts.set_context_session(session_id: str) -> None function posthog.contexts.tag(key: str, value: Any) -> None function posthog.evaluate_flags(distinct_id=None, groups=None, person_properties=None, group_properties=None, only_evaluate_locally=False, disable_geoip=None, flag_keys=None, device_id=None) -> FeatureFlagEvaluations -function posthog.exception_utils.attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns) +function posthog.exception_utils.attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True) function posthog.exception_utils.construct_artificial_traceback(e) function posthog.exception_utils.event_hint_with_exc_info(exc_info=None) function posthog.exception_utils.exc_info_from_error(error) @@ -901,7 +911,7 @@ function posthog.exception_utils.iter_stacks(tb) function posthog.exception_utils.mark_exception_as_captured(error, uuid) function posthog.exception_utils.safe_repr(value) function posthog.exception_utils.safe_str(value) -function posthog.exception_utils.serialize_code_variables(frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024) +function posthog.exception_utils.serialize_code_variables(frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024, mask_url_credentials=True) function posthog.exception_utils.serialize_frame(frame, tb_lineno=None, max_value_length=None) function posthog.exception_utils.set_in_app_in_frames(frames, in_app_exclude, in_app_include, project_root=None) function posthog.exception_utils.should_hide_frame(frame: FrameType) -> bool @@ -909,7 +919,7 @@ function posthog.exception_utils.single_exception_from_error_tuple(exc_type, exc function posthog.exception_utils.strip_string(value, max_length=None) function posthog.exception_utils.to_string(value) function posthog.exception_utils.to_timestamp(value) -function posthog.exception_utils.try_attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns) +function posthog.exception_utils.try_attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True) function posthog.exception_utils.walk_exception_chain(exc_info) function posthog.feature_enabled(key, distinct_id, groups=None, person_properties=None, group_properties=None, only_evaluate_locally=False, send_feature_flag_events=True, disable_geoip=None, device_id=None) function posthog.feature_flag_definitions() @@ -957,6 +967,7 @@ function posthog.set(**kwargs: Unpack[OptionalSetArgs]) -> Optional[str] function posthog.set_capture_exception_code_variables_context(enabled: bool) function posthog.set_code_variables_ignore_patterns_context(ignore_patterns: list) function posthog.set_code_variables_mask_patterns_context(mask_patterns: list) +function posthog.set_code_variables_mask_url_credentials_context(enabled: bool) function posthog.set_context_device_id(device_id: str) function posthog.set_context_session(session_id: str) function posthog.set_once(**kwargs: Unpack[OptionalSetArgs]) -> Optional[str] @@ -1082,6 +1093,7 @@ method posthog.contexts.ContextScope.collect_tags() -> Dict[str, Any] method posthog.contexts.ContextScope.get_capture_exception_code_variables() -> Optional[bool] method posthog.contexts.ContextScope.get_code_variables_ignore_patterns() -> Optional[list] method posthog.contexts.ContextScope.get_code_variables_mask_patterns() -> Optional[list] +method posthog.contexts.ContextScope.get_code_variables_mask_url_credentials() -> Optional[bool] method posthog.contexts.ContextScope.get_device_id() -> Optional[str] method posthog.contexts.ContextScope.get_distinct_id() -> Optional[str] method posthog.contexts.ContextScope.get_parent() @@ -1089,6 +1101,7 @@ method posthog.contexts.ContextScope.get_session_id() -> Optional[str] method posthog.contexts.ContextScope.set_capture_exception_code_variables(enabled: bool) method posthog.contexts.ContextScope.set_code_variables_ignore_patterns(ignore_patterns: list) method posthog.contexts.ContextScope.set_code_variables_mask_patterns(mask_patterns: list) +method posthog.contexts.ContextScope.set_code_variables_mask_url_credentials(enabled: bool) method posthog.contexts.ContextScope.set_device_id(device_id: str) method posthog.contexts.ContextScope.set_distinct_id(distinct_id: str) method posthog.contexts.ContextScope.set_session_id(session_id: str)