Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## Unreleased
- Do not trust inbound snapshot profiling baggage by default. Set `SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE=true` to honor upstream snapshot profiling decisions.

## 2.10.1 - 2026-04-23
- Restrict `wrapt` to `<2.0.0` to fix `TypeError: 'TracedCursorProxy' object is not iterable` when using DB instrumentation on fresh installs ([upstream issue](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4462))
Expand Down
15 changes: 10 additions & 5 deletions docs/profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,20 @@ opentelemetry-instrument python app.py
| `SPLUNK_SNAPSHOT_PROFILER_ENABLED` | `false` | Set to `true` to enable call graph profiling. |
| `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY` | `0.01` | Fraction of traces to profile, as a float between `0.0` and `1.0`. `0.01` means 1% of traces. |
| `SPLUNK_SNAPSHOT_SAMPLING_INTERVAL` | `10` | How often (in milliseconds) to collect a stack sample during an active profiled trace. |
| `SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE` | `false` | Set to `true` to honor upstream snapshot profiling decisions from inbound baggage. |
| `SPLUNK_PROFILER_LOGS_ENDPOINT` | _(uses `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`)_ | Override the endpoint where profiling data is sent. Applies to both profiling modes. |

### How it works

A trace is selected for profiling in one of two ways: the distro randomly selects it
based on `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY`, or an upstream service has already
selected it and propagated that decision via baggage. In the latter case this service
profiles the request regardless of the local probability setting. Either way, the
decision propagates to downstream services so the entire trace is profiled consistently.
A trace is selected for profiling based on `SPLUNK_SNAPSHOT_SELECTION_PROBABILITY`.
By default, inbound snapshot profiling baggage from upstream services is ignored,
and this service applies its local selection policy instead. If
`SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE` is set to `true`, an upstream service can
propagate its decision via baggage and this service profiles or skips the request
according to that upstream decision instead of applying the local probability
setting. Only enable this when inbound baggage is set by trusted upstream services.
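Either way, the resulting decision is propagated to downstream services via baggage.
Downstream services running this distro honor that decision only if they also set
`SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE=true`; otherwise each one applies its own local
selection policy.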
For each selected trace, the profiler collects stack traces from the active thread at the
interval set by `SPLUNK_SNAPSHOT_SAMPLING_INTERVAL`, filtering out threads not executing spans
from that trace. The profiler continues running for up to 60 seconds after the last selected
Expand Down
8 changes: 7 additions & 1 deletion src/splunk_otel/distro.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SPLUNK_REALM,
SPLUNK_SNAPSHOT_PROFILER_ENABLED,
SPLUNK_SNAPSHOT_SELECTION_PROBABILITY,
SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE,
SPLUNK_TRACE_RESPONSE_HEADER_ENABLED,
Env,
)
Expand Down Expand Up @@ -137,6 +138,11 @@ def set_callgraphs_propagator(self):
propagators = [current]

if self.env.is_true(SPLUNK_SNAPSHOT_PROFILER_ENABLED, "false"):
propagators.append(CallgraphsPropagator(self.env.getfloat(SPLUNK_SNAPSHOT_SELECTION_PROBABILITY, 0.01)))
propagators.append(
CallgraphsPropagator(
self.env.getfloat(SPLUNK_SNAPSHOT_SELECTION_PROBABILITY, 0.01),
trust_inbound_baggage=self.env.is_true(SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE, "false"),
)
)

set_global_textmap(CompositePropagator(propagators))
1 change: 1 addition & 0 deletions src/splunk_otel/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
SPLUNK_SNAPSHOT_PROFILER_ENABLED = "SPLUNK_SNAPSHOT_PROFILER_ENABLED"
SPLUNK_SNAPSHOT_SAMPLING_INTERVAL = "SPLUNK_SNAPSHOT_SAMPLING_INTERVAL"
SPLUNK_SNAPSHOT_SELECTION_PROBABILITY = "SPLUNK_SNAPSHOT_SELECTION_PROBABILITY"
SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE = "SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE"
SPLUNK_REALM = "SPLUNK_REALM"

_pylogger = logging.getLogger(__name__)
Expand Down
6 changes: 4 additions & 2 deletions src/splunk_otel/propagator.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,11 @@ def _with_volume_baggage(is_selected: bool, context: typing.Optional[Context]) -

class CallgraphsPropagator(textmap.TextMapPropagator):
selection_probability: float
trust_inbound_baggage: bool

def __init__(self, selection_probability: float = 0.01):
def __init__(self, selection_probability: float = 0.01, *, trust_inbound_baggage: bool = False):
self.selection_probability = selection_probability
self.trust_inbound_baggage = trust_inbound_baggage
self.sampler = TraceIdRatioBased(selection_probability)

def extract(self, carrier, context, getter):
Expand All @@ -82,7 +84,7 @@ def extract(self, carrier, context, getter):
if volume_baggage is None:
return self._attach_volume_baggage(context)

if volume_baggage in {"highest", "off"}:
if self.trust_inbound_baggage and volume_baggage in {"highest", "off"}:
return context

return self._attach_volume_baggage(context)
Expand Down
14 changes: 14 additions & 0 deletions tests/test_callgraphs_span_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from opentelemetry.trace import SpanContext

from splunk_otel.callgraphs.span_processor import CallgraphsSpanProcessor, _should_process_context
from splunk_otel.propagator import CallgraphsPropagator


class TestShouldProcessContext:
Expand Down Expand Up @@ -104,6 +105,19 @@ def test_on_start_activates_profiling_when_baggage_is_highest(self, mock_profili
assert 456 in processor._span_id_to_trace_id # noqa SLF001
assert processor._span_id_to_trace_id[456] == 123 # noqa SLF001

@patch("splunk_otel.callgraphs.span_processor.ProfilingContext")
def test_on_start_does_not_trust_inbound_highest_baggage_by_default(self, mock_profiling_context):
    """Inbound "highest" baggage neither tags the span nor starts profiling by default."""
    # Simulate an upstream caller that already marked the trace as "highest".
    inbound_ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context())

    # Run the context through a propagator built with the default trust setting
    # and a zero selection probability before handing it to the span processor.
    propagator = CallgraphsPropagator(selection_probability=0.0)
    local_ctx = propagator.extract({}, inbound_ctx, None)

    span = MagicMock(spec=Span)
    processor = CallgraphsSpanProcessor("test-service")
    processor.on_start(span, local_ctx)

    mock_profiling_context.return_value.start.assert_not_called()
    span.set_attribute.assert_not_called()

@patch("splunk_otel.callgraphs.span_processor.ProfilingContext")
def test_on_end_removes_span_from_tracking(self, mock_profiling_context):
processor = CallgraphsSpanProcessor("test-service")
Expand Down
14 changes: 14 additions & 0 deletions tests/test_distro.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def test_callgraphs_propagator_enabled():
propagators = textmap._propagators # noqa SLF001
callgraphs_propagators = [p for p in propagators if isinstance(p, CallgraphsPropagator)]
assert len(callgraphs_propagators) == 1
assert callgraphs_propagators[0].trust_inbound_baggage is False


def test_callgraphs_propagator_selection_probability():
Expand All @@ -167,6 +168,19 @@ def test_callgraphs_propagator_selection_probability():
assert callgraphs_propagator.selection_probability == 0.5


def test_callgraphs_propagator_trusts_inbound_baggage_when_enabled():
    """SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE=true is passed through to the installed CallgraphsPropagator."""
    configure_distro(
        {
            "SPLUNK_SNAPSHOT_PROFILER_ENABLED": "true",
            "SPLUNK_SNAPSHOT_TRUST_INBOUND_BAGGAGE": "true",
        }
    )

    installed = get_global_textmap()._propagators  # noqa SLF001
    callgraphs_propagator = next(
        candidate for candidate in installed if isinstance(candidate, CallgraphsPropagator)
    )
    assert callgraphs_propagator.trust_inbound_baggage is True


def test_callgraphs_propagator_idempotent():
# Configuring twice with snapshot enabled should not accumulate propagators.
env_store = {"SPLUNK_SNAPSHOT_PROFILER_ENABLED": "true"}
Expand Down
24 changes: 21 additions & 3 deletions tests/test_propagator.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,39 @@ def test_extract_sets_off_when_not_trace_is_not_selected(self):
volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx)
assert volume == "off"

def test_extract_preserves_existing_highest_baggage(self):
def test_extract_overwrites_existing_highest_baggage_by_default(self):
ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context())

prop = CallgraphsPropagator(selection_probability=0.0)
result_ctx = prop.extract({}, ctx, None)

volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx)
assert volume == "highest"
assert volume == "off"

def test_extract_preserves_existing_off_baggage(self):
def test_extract_overwrites_existing_off_baggage_by_default(self):
ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "off", Context())

prop = CallgraphsPropagator(selection_probability=1.0)
result_ctx = prop.extract({}, ctx, None)

volume = baggage.get_baggage("splunk.trace.snapshot.volume", result_ctx)
assert volume == "highest"

def test_extract_preserves_existing_highest_baggage_when_trusted(self):
    """With trust enabled, an inbound "highest" volume is kept even at zero selection probability."""
    inbound_ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "highest", Context())
    trusting = CallgraphsPropagator(selection_probability=0.0, trust_inbound_baggage=True)

    extracted_ctx = trusting.extract({}, inbound_ctx, None)

    assert baggage.get_baggage("splunk.trace.snapshot.volume", extracted_ctx) == "highest"

def test_extract_preserves_existing_off_baggage_when_trusted(self):
    """With trust enabled, an inbound "off" volume is kept even at full selection probability."""
    inbound_ctx = baggage.set_baggage("splunk.trace.snapshot.volume", "off", Context())
    trusting = CallgraphsPropagator(selection_probability=1.0, trust_inbound_baggage=True)

    extracted_ctx = trusting.extract({}, inbound_ctx, None)

    assert baggage.get_baggage("splunk.trace.snapshot.volume", extracted_ctx) == "off"

Expand Down
Loading