diff --git a/docs/telemetry.md b/docs/telemetry.md
new file mode 100644
index 0000000..a3f54e9
--- /dev/null
+++ b/docs/telemetry.md
@@ -0,0 +1,90 @@
+# Client telemetry — opt-in
+
+The `layerlens` SDK can optionally emit a small set of usage counters back
+to LayerLens so we can measure SDK adoption and diagnose breakages. **It is
+off by default and never sends customer payloads.**
+
+## Quick start
+
+Enable per-process:
+
+```bash
+export LAYERLENS_TELEMETRY=on
+python my_agent.py
+```
+
+Or in code:
+
+```python
+import os
+os.environ["LAYERLENS_TELEMETRY"] = "on"
+from layerlens import Stratix
+client = Stratix(api_key="...")  # emits one `init` event
+```
+
+The switch is read once, when the first event is emitted, so set it before
+constructing the first `Stratix` client; changing it later in the same
+process has no effect.
+
+Disable (default — no telemetry, no network calls):
+
+```bash
+unset LAYERLENS_TELEMETRY
+```
+
+## What is sent
+
+When telemetry is on, the SDK emits the following metrics over OTLP/gRPC
+to the LayerLens collector (`https://otel.layerlens.ai:4317`):
+
+| Metric | Type | Labels |
+|---|---|---|
+| `atlas_sdk_events_total` | counter | `surface`, `event` (and optionally `command`, `outcome`, `status_code`, `resource`) |
+| `atlas_sdk_request_duration_seconds` | histogram | `surface`, `event` |
+
+Concrete events emitted by the SDK today:
+
+| Where | `surface` | `event` |
+|---|---|---|
+| `Stratix(...)` constructor | `sdk_python` | `init` |
+| `layerlens` CLI top-level invocation | `cli` | `cmd_run` (with `command` attribute = subcommand name) |
+
+Every export is also tagged with three OpenTelemetry resource attributes:
+`service.name` (always `atlas-sdk-python`), `service.version` (the installed
+SDK version) and `host.name` (the hostname of the machine running the SDK).
+
+## What is NOT sent
+
+- Your API key, your prompts, your traces, your evaluation data — none of
+  this is touched by the telemetry path.
+- Any attribute key not on the allowlist (`command`, `resource`,
+  `outcome`, `status_code`) is silently dropped, so no unlisted field
+  can ride along on an event.
+
+## How it fails
+
+The SDK treats telemetry as best-effort. If any of the following happens,
+the SDK silently disables telemetry for the rest of the process and
+continues serving customer requests:
+
+- The OpenTelemetry SDK is not installed (it's not a hard dep).
+- The collector endpoint is unreachable.
+- The exporter raises during init or send.
+
+In other words: **enabling telemetry can never break your application.**
+
+## Configuration
+
+| Env var | Purpose | Default |
+|---|---|---|
+| `LAYERLENS_TELEMETRY` | Master switch (`on` / `true` / `1` / `yes`, case-insensitive, to enable) | unset (off) |
+| `LAYERLENS_OTLP_ENDPOINT` | Collector endpoint | `https://otel.layerlens.ai:4317` |
+| `LAYERLENS_OTLP_INSECURE` | Set to `true` to use plaintext gRPC (for local dev) | `false` |
+
+## How it relates to the atlas-app server side
+
+This counter is the SDK-side mirror of the `atlas_sdk_events_total`
+metric the atlas-app backend exports
+(`apps/shared/observability/metrics.go`). Both appear under the same
+metric name in Grafana so SDK-emitted and server-emitted events line up
+on the same dashboards.
diff --git a/src/layerlens/_client.py b/src/layerlens/_client.py
index 33fae85..d898759 100644
--- a/src/layerlens/_client.py
+++ b/src/layerlens/_client.py
@@ -8,7 +8,7 @@
 
 import httpx
 
-from . import _exceptions
+from . import _exceptions, _telemetry
 from ._utils import is_mapping
 from .models import Organization, OrganizationResponse, OrganizationsListResponse
 from ._constants import DEFAULT_TIMEOUT, DEFAULT_BASE_URL, DIRTY_ROUTER_PREFIX
@@ -88,6 +88,10 @@ def __init__(
         )
         self.project_id = organization.projects[0].id
 
+        # Opt-in client-side telemetry. No-op unless LAYERLENS_TELEMETRY=on.
+        # Counts SDK initializations so atlas-app can compute SDK adoption.
+        _telemetry.event("sdk_python", "init")
+
     @cached_property
     def benchmarks(self) -> Benchmarks:
         from .resources.benchmarks import Benchmarks
@@ -311,6 +315,10 @@ def __init__(
         )
         self.project_id = organization.projects[0].id
 
+        # Opt-in client-side telemetry. No-op unless LAYERLENS_TELEMETRY=on.
+        # Counts SDK initializations so atlas-app can compute SDK adoption.
+        _telemetry.event("sdk_python", "init")
+
     @cached_property
     def benchmarks(self) -> AsyncBenchmarks:
         from .resources.benchmarks import AsyncBenchmarks
diff --git a/src/layerlens/_telemetry.py b/src/layerlens/_telemetry.py
new file mode 100644
index 0000000..17f0f4b
--- /dev/null
+++ b/src/layerlens/_telemetry.py
@@ -0,0 +1,223 @@
+"""Opt-in client telemetry for the layerlens SDK.
+
+Emits the same `atlas_sdk_events_total{surface, event}` and
+`atlas_sdk_request_duration_seconds` shapes that the atlas-app server
+side records (see `apps/shared/observability/metrics.go` in the
+metrics-analytics-dashboard branch of LayerLens/atlas-app).
+
+The contract:
+
+* Telemetry is **off by default**. The customer must set the env var
+  ``LAYERLENS_TELEMETRY=on`` to opt in. The variable is read once, on
+  the first telemetry call of the process.
+* When OFF, every helper here is a no-op — no OTel import, no network.
+* When ON, events are recorded on a private OTel ``MeterProvider`` and
+  exported every 30 seconds to the OTel Collector endpoint at
+  ``LAYERLENS_OTLP_ENDPOINT`` (default
+  ``https://otel.layerlens.ai:4317``). The provider is never installed
+  as the process-global one, so an application's own OpenTelemetry
+  setup is left alone.
+* Telemetry MUST NOT block or break customer work: failures are
+  swallowed and the affected events are silently dropped.
+* No customer payloads are transmitted. Events carry only the surface
+  name (sdk_python / cli / vscode), the event name (init / cmd_run /
+  trace_emit / etc.) and allowlisted attributes; the OTel resource adds
+  the SDK version and the machine hostname.
+
+Usage from internal SDK code::
+
+    from . import _telemetry
+    _telemetry.event("sdk_python", "init")
+    with _telemetry.timed("sdk_python", "trace_emit"):
+        ...
+
+Usage from the CLI::
+
+    from layerlens._telemetry import event
+    event("cli", "cmd_run", attributes={"command": "trace ls"})
+"""
+from __future__ import annotations
+
+import contextlib
+import os
+import time
+from typing import Any, Iterator, Mapping, Optional
+
+# Module-level singletons populated lazily on first use.
+_initialized: bool = False
+_provider: Any = None  # private MeterProvider; the only one shutdown() touches
+_meter: Any = None
+_counter_events: Any = None
+_hist_request_duration: Any = None
+
+# Env-var spellings (compared lower-cased) that switch a flag on.
+_TRUTHY = frozenset({"on", "true", "1", "yes"})
+
+# Caller-supplied attribute keys that may be exported; all others are dropped.
+_ALLOWED_ATTRIBUTE_KEYS = frozenset({"command", "resource", "outcome", "status_code"})
+
+
+def _env_flag(name: str) -> bool:
+    """Return True when env var ``name`` is set to a truthy spelling."""
+    return os.environ.get(name, "").strip().lower() in _TRUTHY
+
+
+def _enabled() -> bool:
+    """Return True when ``LAYERLENS_TELEMETRY`` opts this process in."""
+    return _env_flag("LAYERLENS_TELEMETRY")
+
+
+def _try_init() -> bool:
+    """Lazily build the OTel meter + instruments. Returns True on success.
+
+    Setup is attempted at most once per process; later calls only report
+    whether that attempt produced a meter. Any failure (telemetry off,
+    OTel SDK missing or broken, error while building the pipeline)
+    disables telemetry for the lifetime of the process — never raise
+    into customer code.
+    """
+    global _initialized, _provider, _meter, _counter_events, _hist_request_duration
+    if _initialized:
+        return _meter is not None
+    _initialized = True
+
+    if not _enabled():
+        return False
+
+    try:
+        import socket
+
+        from opentelemetry.sdk.metrics import MeterProvider
+        from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+        from opentelemetry.sdk.resources import Resource
+        from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
+            OTLPMetricExporter,
+        )
+    except Exception:
+        # OTel SDK absent (ImportError) or unusable (e.g. a dependency
+        # conflict raising at import time) — silent no-op either way.
+        return False
+
+    try:
+        from ._version import __version__
+    except ImportError:
+        __version__ = "unknown"
+
+    endpoint = os.environ.get(
+        "LAYERLENS_OTLP_ENDPOINT", "https://otel.layerlens.ai:4317"
+    )
+    insecure = _env_flag("LAYERLENS_OTLP_INSECURE")
+
+    provider: Any = None
+    try:
+        resource = Resource.create({
+            "service.name": "atlas-sdk-python",
+            "service.version": __version__,
+            # NOTE(review): a hostname can identify a machine or user —
+            # confirm it is acceptable under the "no PII" contract.
+            "host.name": socket.gethostname() or "unknown",
+        })
+        exporter = OTLPMetricExporter(endpoint=endpoint, insecure=insecure)
+        reader = PeriodicExportingMetricReader(
+            exporter, export_interval_millis=30_000
+        )
+        provider = MeterProvider(resource=resource, metric_readers=[reader])
+        # Use our own provider's meter rather than metrics.set_meter_provider():
+        # the process-global provider belongs to the application.
+        meter = provider.get_meter("layerlens.sdk")
+        counter = meter.create_counter(
+            "atlas_sdk_events_total",
+            description=(
+                "SDK + CLI + IDE events emitted by layerlens; "
+                "surface=sdk_python|cli|vscode|web, event=init|cmd_run|trace_emit|..."
+            ),
+        )
+        histogram = meter.create_histogram(
+            "atlas_sdk_request_duration_seconds",
+            description="HTTP request duration emitted by layerlens SDK clients.",
+            unit="s",
+        )
+    except Exception:
+        # Any failure during init means telemetry is disabled for this process.
+        if provider is not None:
+            # Stop the half-built pipeline so nothing keeps exporting.
+            with contextlib.suppress(Exception):
+                provider.shutdown()
+        return False
+
+    # Publish the singletons only once the whole pipeline exists.
+    _provider = provider
+    _meter = meter
+    _counter_events = counter
+    _hist_request_duration = histogram
+    return True
+
+
+def event(
+    surface: str,
+    event_name: str,
+    *,
+    attributes: Optional[Mapping[str, str]] = None,
+) -> None:
+    """Increment ``atlas_sdk_events_total{surface, event}`` by 1.
+
+    ``attributes`` may add labels: keys on the allowlist (``command``,
+    ``resource``, ``outcome``, ``status_code``) with str/int/bool/float
+    values are exported as strings; everything else is dropped.
+
+    No-op when telemetry is disabled or OTel SDK is absent.
+    """
+    if not _try_init() or _counter_events is None:
+        return
+    try:
+        attrs: dict[str, Any] = {"surface": surface, "event": event_name}
+        if attributes:
+            for key, value in attributes.items():
+                if key in _ALLOWED_ATTRIBUTE_KEYS and isinstance(
+                    value, (str, int, bool, float)
+                ):
+                    attrs[key] = str(value)
+        _counter_events.add(1, attributes=attrs)
+    except Exception:
+        # Never raise into customer code — not even for a malformed
+        # ``attributes`` argument.
+        pass
+
+
+@contextlib.contextmanager
+def timed(surface: str, event_name: str) -> Iterator[None]:
+    """Context manager: time the block, then emit one ``event`` and its duration.
+
+    The event and the duration are recorded on exit, whether the block
+    finished normally or raised; an exception from the block propagates.
+    """
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - start
+        event(surface, event_name)
+        if _hist_request_duration is not None:
+            with contextlib.suppress(Exception):
+                _hist_request_duration.record(
+                    elapsed,
+                    attributes={"surface": surface, "event": event_name},
+                )
+
+
+def shutdown() -> None:
+    """Flush + shut down the SDK's own provider. CLI calls this in atexit.
+
+    Does nothing (and imports nothing) when telemetry never initialized,
+    and never touches the application's global OTel provider.
+    """
+    global _provider, _meter, _counter_events, _hist_request_duration
+    provider = _provider
+    if provider is None:
+        return
+    # Drop the instruments first so later event()/timed() calls are no-ops.
+    _provider = _meter = _counter_events = _hist_request_duration = None
+    try:
+        provider.shutdown()
+    except Exception:
+        pass
diff --git a/src/layerlens/cli/_app.py b/src/layerlens/cli/_app.py
index fd58961..91c941a 100644
--- a/src/layerlens/cli/_app.py
+++ b/src/layerlens/cli/_app.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
+import atexit
+
 import click
 
+from .. import _telemetry
 from .._version import __version__
 from .commands.ci import ci
 from .commands.auth import login, logout, whoami
@@ -13,6 +16,9 @@
 from .commands.evaluate import evaluate
 from .commands.integration import integration
 
+# Flush telemetry on interpreter exit so the last `cmd_run` event isn't lost.
+atexit.register(_telemetry.shutdown)
+
 
 @click.group()
 @click.option(
@@ -56,6 +62,10 @@ def cli(
     ctx.obj["output_format"] = output_format
     ctx.obj["verbose"] = verbose
 
+    # Opt-in CLI command telemetry. No-op unless LAYERLENS_TELEMETRY=on.
+    invoked = ctx.invoked_subcommand or "no_subcommand"
+    _telemetry.event("cli", "cmd_run", attributes={"command": invoked})
+
     # Build base_url from --host / --port
     base_url = None
     if host is not None:
diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py
new file mode 100644
index 0000000..c96a37f
--- /dev/null
+++ b/tests/test_telemetry.py
@@ -0,0 +1,186 @@
+"""Unit tests for `layerlens._telemetry` — the opt-in client telemetry.
+
+Contract under test:
+
+1. **Off by default.** Without ``LAYERLENS_TELEMETRY=on`` set, every
+   helper is a no-op and never imports the OTel SDK.
+2. **Failure-isolated.** When OTel is missing or any init step raises,
+   subsequent calls quietly do nothing — telemetry MUST NOT break
+   customer code.
+3. **Allowlist on attributes.** Only `command`, `resource`, `outcome`,
+   `status_code` may be passed; other keys are silently dropped.
+4. **`shutdown()` is safe to call when telemetry was never enabled.**
+"""
+from __future__ import annotations
+
+import importlib
+
+import pytest
+
+
+def _scrub(t) -> None:
+    """Reset the lazily populated singletons of the telemetry module."""
+    t._initialized = False
+    t._meter = None
+    t._counter_events = None
+    t._hist_request_duration = None
+
+
+def _install_stubs(t, counter, histogram=None) -> None:
+    """Make the module look initialized, backed by the given stub instruments."""
+    t._initialized = True
+    t._meter = object()
+    t._counter_events = counter
+    t._hist_request_duration = histogram
+
+
+@pytest.fixture(autouse=True)
+def _reset_telemetry_module(monkeypatch):
+    """Yield the telemetry module with fresh singletons and the opt-in unset.
+
+    The module is not re-imported; its globals are scrubbed before and
+    after every test so no state leaks between tests.
+    """
+    import layerlens._telemetry as t
+
+    _scrub(t)
+    monkeypatch.delenv("LAYERLENS_TELEMETRY", raising=False)
+    yield t
+    _scrub(t)
+
+
+def test_disabled_by_default_is_silent_noop(_reset_telemetry_module):
+    t = _reset_telemetry_module
+    # Every helper must succeed and do nothing.
+    t.event("sdk_python", "init")
+    t.event("cli", "cmd_run", attributes={"command": "trace ls"})
+    with t.timed("sdk_python", "trace_emit"):
+        pass
+    t.shutdown()
+
+    assert t._meter is None
+    assert t._counter_events is None
+    assert t._initialized is True  # _try_init was called once and short-circuited
+
+
+def test_event_with_telemetry_off_doesnt_import_otel(monkeypatch, _reset_telemetry_module):
+    t = _reset_telemetry_module
+
+    # Even if OTel is technically importable in the test env, the OFF state
+    # must short-circuit BEFORE the import block.
+    import sys
+
+    # Evict cached OTel modules through monkeypatch so they are restored
+    # on teardown; a bare sys.modules.pop() would leak into later tests.
+    cached = [m for m in sys.modules if m.split(".")[0] == "opentelemetry"]
+    for name in cached:
+        monkeypatch.delitem(sys.modules, name)
+    t.event("sdk_python", "init")
+    assert "opentelemetry" not in sys.modules
+
+
+def test_attributes_allowlist(monkeypatch, _reset_telemetry_module):
+    """Disallowed attribute keys are silently dropped, not raised."""
+    t = _reset_telemetry_module
+    monkeypatch.setenv("LAYERLENS_TELEMETRY", "on")
+
+    # Stub OTel SDK so we can observe what reaches add().
+    seen_attrs: dict = {}
+
+    class _StubCounter:
+        def add(self, value, attributes=None):
+            seen_attrs.update(attributes or {})
+
+    _install_stubs(t, _StubCounter())
+
+    t.event(
+        "cli",
+        "cmd_run",
+        attributes={
+            "command": "trace ls",
+            "email": "user@example.com",  # MUST be dropped
+            "ip": "10.0.0.1",  # MUST be dropped
+            "outcome": "success",
+        },
+    )
+
+    assert seen_attrs.get("surface") == "cli"
+    assert seen_attrs.get("event") == "cmd_run"
+    assert seen_attrs.get("command") == "trace ls"
+    assert seen_attrs.get("outcome") == "success"
+    assert "email" not in seen_attrs
+    assert "ip" not in seen_attrs
+
+
+def test_event_swallows_counter_errors(monkeypatch, _reset_telemetry_module):
+    """If the underlying counter raises, telemetry MUST NOT propagate."""
+    t = _reset_telemetry_module
+    monkeypatch.setenv("LAYERLENS_TELEMETRY", "on")
+
+    class _Boom:
+        def add(self, *_a, **_kw):
+            raise RuntimeError("backend down")
+
+    _install_stubs(t, _Boom())
+
+    # No exception — silent failure is the contract.
+    t.event("sdk_python", "init")
+
+
+def test_timed_records_duration(monkeypatch, _reset_telemetry_module):
+    t = _reset_telemetry_module
+    monkeypatch.setenv("LAYERLENS_TELEMETRY", "on")
+
+    class _Hist:
+        def __init__(self):
+            self.seen: list = []
+
+        def record(self, value, attributes=None):
+            self.seen.append((value, attributes))
+
+    class _Stub:
+        def __init__(self):
+            self.seen: list = []
+
+        def add(self, value, attributes=None):
+            self.seen.append((value, attributes))
+
+    hist = _Hist()
+    counter = _Stub()
+    _install_stubs(t, counter, hist)
+
+    with t.timed("sdk_python", "trace_emit"):
+        pass
+
+    assert len(counter.seen) == 1
+    assert len(hist.seen) == 1
+    duration, attrs = hist.seen[0]
+    assert duration >= 0
+    assert attrs == {"surface": "sdk_python", "event": "trace_emit"}
+
+
+def test_shutdown_is_safe_when_never_enabled(_reset_telemetry_module):
+    """shutdown() must be safe even if telemetry never initialized."""
+    t = _reset_telemetry_module
+    t.shutdown()  # Must not raise.
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("on", True),
+        ("ON", True),
+        ("true", True),
+        ("True", True),
+        ("1", True),
+        ("yes", True),
+        ("off", False),
+        ("0", False),
+        ("", False),
+        ("nope", False),
+    ],
+)
+def test_enabled_recognizes_truthy_values(monkeypatch, _reset_telemetry_module, raw, expected):
+    t = _reset_telemetry_module
+    monkeypatch.setenv("LAYERLENS_TELEMETRY", raw)
+    assert t._enabled() is expected