diff --git a/CHANGELOG.md b/CHANGELOG.md index 30adc3fd..fea1158b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## Unreleased +- Do not trust inbound snapshot profiling baggage by default. Set `SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE=true` to honor upstream snapshot profiling decisions. ## 2.10.1 - 2026-04-23 - Restrict `wrapt` to `<2.0.0` to fix `TypeError: 'TracedCursorProxy' object is not iterable` when using DB instrumentation on fresh installs ([upstream issue](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4462)) diff --git a/docs/profiling.md b/docs/profiling.md index ca2b16f7..677ba595 100644 --- a/docs/profiling.md +++ b/docs/profiling.md @@ -60,15 +60,20 @@ opentelemetry-instrument python app.py | `SPLUNK_SNAPSHOT_PROFILER_ENABLED` | `false` | Set to `true` to enable call graph profiling. | | `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY` | `0.01` | Fraction of traces to profile, as a float between `0.0` and `1.0`. `0.01` means 1% of traces. | | `SPLUNK_SNAPSHOT_SAMPLING_INTERVAL` | `10` | How often (in milliseconds) to collect a stack sample during an active profiled trace. | +| `SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE` | `false` | Set to `true` to honor upstream snapshot profiling decisions from inbound baggage. | | `SPLUNK_PROFILER_LOGS_ENDPOINT` | _(uses `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`)_ | Override the endpoint where profiling data is sent. Applies to both profiling modes. | ### How it works -A trace is selected for profiling in one of two ways: the distro randomly selects it -based on `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY`, or an upstream service has already -selected it and propagated that decision via baggage. In the latter case this service -profiles the request regardless of the local probability setting. Either way, the -decision propagates to downstream services so the entire trace is profiled consistently. +A trace is selected for profiling based on `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY`. 
+By default, inbound snapshot profiling baggage from upstream services is ignored, +and this service applies its local selection policy instead. If +`SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE` is set to `true`, an upstream service can +propagate its decision via baggage and this service profiles or skips the request +according to that upstream decision instead of applying the local probability +setting. Only enable this when inbound baggage is set by trusted upstream services. +Either way, the resulting decision propagates to downstream services; those that trust +inbound baggage follow it, so the entire trace is profiled consistently. For each selected trace, the profiler collects stack traces from the active thread at the interval set by `SPLUNK_SNAPSHOT_SAMPLING_INTERVAL`, filtering out threads not executing spans from that trace. The profiler continues running for up to 60 seconds after the last selected diff --git a/src/splunk_otel/distro.py b/src/splunk_otel/distro.py index 80a6c59f..2289cebb 100644 --- a/src/splunk_otel/distro.py +++ b/src/splunk_otel/distro.py @@ -38,6 +38,7 @@ SPLUNK_REALM, SPLUNK_SNAPSHOT_PROFILER_ENABLED, SPLUNK_SNAPSHOT_SELECTION_PROBABILITY, + SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE, SPLUNK_TRACE_RESPONSE_HEADER_ENABLED, Env, ) @@ -137,6 +138,11 @@ def set_callgraphs_propagator(self): propagators = [current] if self.env.is_true(SPLUNK_SNAPSHOT_PROFILER_ENABLED, "false"): - propagators.append(CallgraphsPropagator(self.env.getfloat(SPLUNK_SNAPSHOT_SELECTION_PROBABILITY, 0.01))) + propagators.append( + CallgraphsPropagator( + self.env.getfloat(SPLUNK_SNAPSHOT_SELECTION_PROBABILITY, 0.01), + trust_inbound_baggage=self.env.is_true(SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE, "false"), + ) + ) set_global_textmap(CompositePropagator(propagators)) diff --git a/src/splunk_otel/env.py b/src/splunk_otel/env.py index 774bdeea..a85d98e7 100644 --- a/src/splunk_otel/env.py +++ b/src/splunk_otel/env.py @@ -51,6 +51,7 @@ SPLUNK_SNAPSHOT_PROFILER_ENABLED = "SPLUNK_SNAPSHOT_PROFILER_ENABLED" 
SPLUNK_SNAPSHOT_SAMPLING_INTERVAL = "SPLUNK_SNAPSHOT_SAMPLING_INTERVAL" SPLUNK_SNAPSHOT_SELECTION_PROBABILITY = "SPLUNK_SNAPSHOT_SELECTION_PROBABILITY" +SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE = "SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE" SPLUNK_REALM = "SPLUNK_REALM" _pylogger = logging.getLogger(__name__) diff --git a/src/splunk_otel/propagator.py b/src/splunk_otel/propagator.py index 32099b86..36f81896 100644 --- a/src/splunk_otel/propagator.py +++ b/src/splunk_otel/propagator.py @@ -71,9 +71,11 @@ def _with_volume_baggage(is_selected: bool, context: typing.Optional[Context]) - class CallgraphsPropagator(textmap.TextMapPropagator): selection_probability: float + trust_inbound_baggage: bool - def __init__(self, selection_probability: float = 0.01): + def __init__(self, selection_probability: float = 0.01, *, trust_inbound_baggage: bool = False): self.selection_probability = selection_probability + self.trust_inbound_baggage = trust_inbound_baggage self.sampler = TraceIdRatioBased(selection_probability) def extract(self, carrier, context, getter): @@ -82,7 +84,7 @@ def extract(self, carrier, context, getter): if volume_baggage is None: return self._attach_volume_baggage(context) - if volume_baggage in {"highest", "off"}: + if self.trust_inbound_baggage and volume_baggage in {"highest", "off"}: return context return self._attach_volume_baggage(context) diff --git a/tests/test_callgraphs_span_processor.py b/tests/test_callgraphs_span_processor.py index e4336ca1..a975b8c5 100644 --- a/tests/test_callgraphs_span_processor.py +++ b/tests/test_callgraphs_span_processor.py @@ -20,6 +20,7 @@ from opentelemetry.trace import SpanContext from splunk_otel.callgraphs.span_processor import CallgraphsSpanProcessor, _should_process_context +from splunk_otel.propagator import CallgraphsPropagator class TestShouldProcessContext: @@ -104,6 +105,19 @@ def test_on_start_activates_profiling_when_baggage_is_highest(self, mock_profili assert 456 in processor._span_id_to_trace_id # noqa SLF001 
assert processor._span_id_to_trace_id[456] == 123 # noqa SLF001 + @patch("splunk_otel.callgraphs.span_processor.ProfilingContext") + def test_on_start_does_not_trust_inbound_highest_baggage_by_default(self, mock_profiling_context): + processor = CallgraphsSpanProcessor("test-service") + span = MagicMock(spec=Span) + + ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context()) + ctx = CallgraphsPropagator(selection_probability=0.0).extract({}, ctx, None) + + processor.on_start(span, ctx) + + span.set_attribute.assert_not_called() + mock_profiling_context.return_value.start.assert_not_called() + @patch("splunk_otel.callgraphs.span_processor.ProfilingContext") def test_on_end_removes_span_from_tracking(self, mock_profiling_context): processor = CallgraphsSpanProcessor("test-service") diff --git a/tests/test_distro.py b/tests/test_distro.py index 8aea7527..eaae0280 100644 --- a/tests/test_distro.py +++ b/tests/test_distro.py @@ -152,6 +152,7 @@ def test_callgraphs_propagator_enabled(): propagators = textmap._propagators # noqa SLF001 callgraphs_propagators = [p for p in propagators if isinstance(p, CallgraphsPropagator)] assert len(callgraphs_propagators) == 1 + assert callgraphs_propagators[0].trust_inbound_baggage is False def test_callgraphs_propagator_selection_probability(): @@ -167,6 +168,19 @@ def test_callgraphs_propagator_selection_probability(): assert callgraphs_propagator.selection_probability == 0.5 +def test_callgraphs_propagator_trusts_inbound_baggage_when_enabled(): + env_store = { + "SPLUNK_SNAPSHOT_PROFILER_ENABLED": "true", + "SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE": "true", + } + configure_distro(env_store) + + textmap = get_global_textmap() + propagators = textmap._propagators # noqa SLF001 + callgraphs_propagator = next(p for p in propagators if isinstance(p, CallgraphsPropagator)) + assert callgraphs_propagator.trust_inbound_baggage is True + + def test_callgraphs_propagator_idempotent(): # Configuring twice with snapshot 
enabled should not accumulate propagators. env_store = {"SPLUNK_SNAPSHOT_PROFILER_ENABLED": "true"} diff --git a/tests/test_propagator.py b/tests/test_propagator.py index 6761480c..c7b20189 100644 --- a/tests/test_propagator.py +++ b/tests/test_propagator.py @@ -69,21 +69,39 @@ def test_extract_sets_off_when_not_trace_is_not_selected(self): volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx) assert volume == "off" - def test_extract_preserves_existing_highest_baggage(self): + def test_extract_overwrites_existing_highest_baggage_by_default(self): ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context()) prop = CallgraphsPropagator(selection_probability=0.0) result_ctx = prop.extract({}, ctx, None) volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx) - assert volume == "highest" + assert volume == "off" - def test_extract_preserves_existing_off_baggage(self): + def test_extract_overwrites_existing_off_baggage_by_default(self): ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "off", Context()) prop = CallgraphsPropagator(selection_probability=1.0) result_ctx = prop.extract({}, ctx, None) + volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx) + assert volume == "highest" + + def test_extract_preserves_existing_highest_baggage_when_trusted(self): + ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context()) + + prop = CallgraphsPropagator(selection_probability=0.0, trust_inbound_baggage=True) + result_ctx = prop.extract({}, ctx, None) + + volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx) + assert volume == "highest" + + def test_extract_preserves_existing_off_baggage_when_trusted(self): + ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "off", Context()) + + prop = CallgraphsPropagator(selection_probability=1.0, trust_inbound_baggage=True) + result_ctx = prop.extract({}, ctx, None) + volume = 
baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx) assert volume == "off"