From c866d66297d50560821a9047f4cce13550e8a640 Mon Sep 17 00:00:00 2001 From: Shreyas Nagaraj Date: Thu, 25 Jun 2026 17:21:22 +0530 Subject: [PATCH 1/4] CCM-33842: fix LiteLLM token usage capture Copy response usage metadata onto the SDK-owned LiteLLM span before ending it so OTLP exports include input and output token attributes. Co-authored-by: Cursor --- .../instrumentation/litellm/__init__.py | 66 ++++++++++++++++++- .../litellm/litellm_instrumentation_test.py | 8 +++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py index 5852c37..277680c 100644 --- a/src/harness_sdk/instrumentation/litellm/__init__.py +++ b/src/harness_sdk/instrumentation/litellm/__init__.py @@ -182,6 +182,64 @@ def _set_pre_call_request_attributes( logger.debug("LiteLLM: failed to set input attributes on span: %s", err) +def _get_value(obj: Any, key: str) -> Any: + if obj is None: + return None + if isinstance(obj, dict): + return obj.get(key) + return getattr(obj, key, None) + + +def _get_usage(response: Any) -> Any: + usage = _get_value(response, "usage") + if usage is not None: + return usage + if isinstance(response, dict): + return response.get("usage") + return None + + +def _set_if_present(otel_logger: Any, span: Any, key: str, value: Any) -> None: + if value is not None: + otel_logger.safe_set_attribute(span, key, value) + + +def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) -> None: + """Copy LiteLLM usage metadata before the wrapper-owned span ends.""" + usage = _get_usage(response) + if usage is None: + return + + input_tokens = _get_value(usage, "prompt_tokens") + if input_tokens is None: + input_tokens = _get_value(usage, "input_tokens") + + output_tokens = _get_value(usage, "completion_tokens") + if output_tokens is None: + output_tokens = _get_value(usage, "output_tokens") + + _set_if_present(otel_logger, span, "gen_ai.usage.input_tokens", input_tokens) + _set_if_present(otel_logger, span, "gen_ai.usage.output_tokens", output_tokens) + _set_if_present( + otel_logger, + span, + "gen_ai.usage.total_tokens", + _get_value(usage, "total_tokens"), + ) + _set_if_present( + otel_logger, + span, + "gen_ai.usage.cache_read_input_tokens", + _get_value(usage, "cache_read_input_tokens"), + ) + _set_if_present( + otel_logger, + span, + "gen_ai.usage.cache_creation_input_tokens", + _get_value(usage, "cache_creation_input_tokens"), + ) + + def _build_traceable_otel_class() -> type: from litellm.integrations.opentelemetry import ( # pylint: disable=import-outside-toplevel OpenTelemetry, @@ -304,7 +362,9 @@ def _sync_wrapper( span = _start_evaluated_span(otel_logger, func_name, args, kwargs) token = _activate_span(span) try: - return wrapped(*args, **kwargs) + response = wrapped(*args, **kwargs) + _set_response_usage_attributes(otel_logger, span, response) + return response except Exception as exc: # pylint: disable=broad-except span.record_exception(exc) span.set_status(Status(StatusCode.ERROR, str(exc))) @@ -324,7 +384,9 @@ async def _async_wrapper( span = _start_evaluated_span(otel_logger, func_name, args, kwargs) token = _activate_span(span) try: - return await wrapped(*args, **kwargs) + response = await wrapped(*args, **kwargs) + _set_response_usage_attributes(otel_logger, span, response) + return response except Exception as exc: # pylint: disable=broad-except span.record_exception(exc) span.set_status(Status(StatusCode.ERROR, str(exc))) diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py index ce4ca3f..d017a91 100644 --- a/test/instrumentation/litellm/litellm_instrumentation_test.py +++ b/test/instrumentation/litellm/litellm_instrumentation_test.py @@ -71,6 +71,9 @@ def test_litellm_completion_span_has_gen_ai_attributes(agent, exporter, litellm_ assert attrs.get("gen_ai.operation.name") == "chat" assert attrs.get("gen_ai.system") == "openai" assert attrs.get("gen_ai.framework") == "litellm" + assert attrs.get("gen_ai.usage.input_tokens") == 3 + assert attrs.get("gen_ai.usage.output_tokens") == 5 + assert attrs.get("gen_ai.usage.total_tokens") == 8 def test_litellm_evaluate_blocks_before_wrapped(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument @@ -109,6 +112,8 @@ def test_litellm_embedding_span_has_gen_ai_attributes(agent, exporter, litellm_i assert attrs.get("gen_ai.request.model") == "text-embedding-3-small" assert attrs.get("gen_ai.operation.name") == "embeddings" assert attrs.get("gen_ai.framework") == "litellm" + assert attrs.get("gen_ai.usage.input_tokens") == 4 + assert attrs.get("gen_ai.usage.total_tokens") == 4 @pytest.mark.asyncio @@ -127,6 +132,9 @@ async def _fake_async(*_args, **_kwargs): exporter.clear() assert len(spans) >= 1 assert spans[0].attributes.get("gen_ai.operation.name") == "chat" + assert spans[0].attributes.get("gen_ai.usage.input_tokens") == 3 + assert spans[0].attributes.get("gen_ai.usage.output_tokens") == 5 + assert spans[0].attributes.get("gen_ai.usage.total_tokens") == 8 def test_litellm_double_instrument_is_noop(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument From 60b68ddb64ad9dd93f38267940364d34e25e02d0 Mon Sep 17 00:00:00 2001 From: Shreyas Nagaraj Date: Thu, 25 Jun 2026 18:36:49 +0530 Subject: [PATCH 2/4] CCM-33842: normalize LiteLLM GenAI semconv fields Emit canonical provider, response metadata, and dotted cache/reasoning token attributes on LiteLLM spans. Co-authored-by: Cursor --- .../instrumentation/litellm/__init__.py | 79 ++++++++++++++++--- .../litellm/litellm_instrumentation_test.py | 44 ++++++++--- 2 files changed, 105 insertions(+), 18 deletions(-) diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py index 277680c..0fb9187 100644 --- a/src/harness_sdk/instrumentation/litellm/__init__.py +++ b/src/harness_sdk/instrumentation/litellm/__init__.py @@ -43,6 +43,19 @@ ("aembedding", True), ) +_PROVIDER_NAME_MAP = { + "azure": "azure.ai.openai", + "azure_ai": "azure.ai.openai", + "azure_ai_openai": "azure.ai.openai", + "azureopenai": "azure.ai.openai", + "bedrock": "aws.bedrock", + "bedrock_converse": "aws.bedrock", + "gemini": "gcp.gemini", + "google": "gcp.gemini", + "vertex_ai": "gcp.vertex_ai", + "vertexai": "gcp.vertex_ai", +} + def _evaluate_span(span: Any) -> None: """Run Traceable policy evaluation against the live span; raise if blocked.""" @@ -111,6 +124,11 @@ def _resolve_provider(model: Optional[str], kwargs: dict[str, Any]) -> str: return "Unknown" +def _canonical_provider_name(provider: str) -> str: + normalized = (provider or "unknown").strip().lower().replace("-", "_") + return _PROVIDER_NAME_MAP.get(normalized, normalized) + + def _operation_name(func_name: str) -> str: if func_name in ("embedding", "aembedding"): return "embeddings" @@ -138,7 +156,9 @@ def _set_pre_call_request_attributes( otel_logger.safe_set_attribute( span, "gen_ai.operation.name", _operation_name(pre_call.call_type) ) - otel_logger.safe_set_attribute(span, "gen_ai.system", provider) + otel_logger.safe_set_attribute( + span, "gen_ai.provider.name", _canonical_provider_name(provider) + ) otel_logger.safe_set_attribute(span, "gen_ai.framework", "litellm") otel_logger.safe_set_attribute( span, @@ -204,12 +224,45 @@ def _set_if_present(otel_logger: Any, span: Any, key: str, value: Any) -> None: otel_logger.safe_set_attribute(span, key, value) -def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) -> None: - """Copy LiteLLM usage metadata before the wrapper-owned span ends.""" +def _get_choices(response: Any) -> list[Any]: + choices = _get_value(response, "choices") + if choices is None: + return [] + return list(choices) + + +def _get_finish_reasons(response: Any) -> list[str]: + finish_reasons = [] + for choice in _get_choices(response): + finish_reason = _get_value(choice, "finish_reason") + if finish_reason: + finish_reasons.append(str(finish_reason)) + return finish_reasons + + +def _set_response_attributes(otel_logger: Any, span: Any, response: Any) -> None: + """Copy LiteLLM response metadata before the wrapper-owned span ends.""" + _set_if_present( + otel_logger, + span, + "gen_ai.response.model", + _get_value(response, "model"), + ) + _set_if_present(otel_logger, span, "gen_ai.response.id", _get_value(response, "id")) + + finish_reasons = _get_finish_reasons(response) + if finish_reasons: + otel_logger.safe_set_attribute( + span, "gen_ai.response.finish_reasons", finish_reasons + ) + usage = _get_usage(response) if usage is None: return + prompt_details = _get_value(usage, "prompt_tokens_details") + completion_details = _get_value(usage, "completion_tokens_details") + input_tokens = _get_value(usage, "prompt_tokens") if input_tokens is None: input_tokens = _get_value(usage, "input_tokens") @@ -229,14 +282,22 @@ def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) - _set_if_present( otel_logger, span, - "gen_ai.usage.cache_read_input_tokens", - _get_value(usage, "cache_read_input_tokens"), + "gen_ai.usage.cache_read.input_tokens", + _get_value(usage, "cache_read_input_tokens") + or _get_value(prompt_details, "cached_tokens"), + ) + _set_if_present( + otel_logger, + span, + "gen_ai.usage.cache_creation.input_tokens", + _get_value(usage, "cache_creation_input_tokens") + or _get_value(prompt_details, "cache_creation_tokens"), ) _set_if_present( otel_logger, span, - "gen_ai.usage.cache_creation_input_tokens", - _get_value(usage, "cache_creation_input_tokens"), + "gen_ai.usage.reasoning.output_tokens", + _get_value(completion_details, "reasoning_tokens"), ) @@ -363,7 +424,7 @@ def _sync_wrapper( token = _activate_span(span) try: response = wrapped(*args, **kwargs) - _set_response_usage_attributes(otel_logger, span, response) + _set_response_attributes(otel_logger, span, response) return response except Exception as exc: # pylint: disable=broad-except span.record_exception(exc) @@ -385,7 +446,7 @@ async def _async_wrapper( token = _activate_span(span) try: response = await wrapped(*args, **kwargs) - _set_response_usage_attributes(otel_logger, span, response) + _set_response_attributes(otel_logger, span, response) return response except Exception as exc: # pylint: disable=broad-except span.record_exception(exc) diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py index d017a91..5c3c568 100644 --- a/test/instrumentation/litellm/litellm_instrumentation_test.py +++ b/test/instrumentation/litellm/litellm_instrumentation_test.py @@ -35,7 +35,16 @@ def _fake_model_response(*_args, **_kwargs): } ], model="gpt-4o-mini", - usage={"prompt_tokens": 3, "completion_tokens": 5, "total_tokens": 8}, + usage={ + "prompt_tokens": 3, + "completion_tokens": 5, + "total_tokens": 8, + "prompt_tokens_details": { + "cached_tokens": 1, + "cache_creation_tokens": 2, + }, + "completion_tokens_details": {"reasoning_tokens": 1}, + }, ) @@ -66,14 +75,21 @@ def test_litellm_completion_span_has_gen_ai_attributes(agent, exporter, litellm_ spans = exporter.get_finished_spans() exporter.clear() assert len(spans) >= 1 - attrs = spans[0].attributes + attrs = _request_span(spans).attributes assert attrs.get("gen_ai.request.model") == "gpt-4o-mini" assert attrs.get("gen_ai.operation.name") == "chat" - assert attrs.get("gen_ai.system") == "openai" + assert attrs.get("gen_ai.provider.name") == "openai" + assert "gen_ai.system" not in attrs assert attrs.get("gen_ai.framework") == "litellm" + assert attrs.get("gen_ai.response.model") == "gpt-4o-mini" + assert attrs.get("gen_ai.response.id") == "chatcmpl-test" + assert attrs.get("gen_ai.response.finish_reasons") == "['stop']" assert attrs.get("gen_ai.usage.input_tokens") == 3 assert attrs.get("gen_ai.usage.output_tokens") == 5 assert attrs.get("gen_ai.usage.total_tokens") == 8 + assert attrs.get("gen_ai.usage.cache_read.input_tokens") == 1 + assert attrs.get("gen_ai.usage.cache_creation.input_tokens") == 2 + assert attrs.get("gen_ai.usage.reasoning.output_tokens") == 1 def test_litellm_evaluate_blocks_before_wrapped(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument @@ -108,10 +124,13 @@ def test_litellm_embedding_span_has_gen_ai_attributes(agent, exporter, litellm_i spans = exporter.get_finished_spans() exporter.clear() assert len(spans) >= 1 - attrs = spans[0].attributes + attrs = _request_span(spans).attributes assert attrs.get("gen_ai.request.model") == "text-embedding-3-small" assert attrs.get("gen_ai.operation.name") == "embeddings" + assert attrs.get("gen_ai.provider.name") == "openai" + assert "gen_ai.system" not in attrs assert attrs.get("gen_ai.framework") == "litellm" + assert attrs.get("gen_ai.response.model") == "text-embedding-3-small" assert attrs.get("gen_ai.usage.input_tokens") == 4 assert attrs.get("gen_ai.usage.total_tokens") == 4 @@ -131,10 +150,16 @@ async def _fake_async(*_args, **_kwargs): spans = exporter.get_finished_spans() exporter.clear() assert len(spans) >= 1 - assert spans[0].attributes.get("gen_ai.operation.name") == "chat" - assert spans[0].attributes.get("gen_ai.usage.input_tokens") == 3 - assert spans[0].attributes.get("gen_ai.usage.output_tokens") == 5 - assert spans[0].attributes.get("gen_ai.usage.total_tokens") == 8 + attrs = _request_span(spans).attributes + assert attrs.get("gen_ai.operation.name") == "chat" + assert attrs.get("gen_ai.provider.name") == "openai" + assert "gen_ai.system" not in attrs + assert attrs.get("gen_ai.response.model") == "gpt-4o-mini" + assert attrs.get("gen_ai.response.id") == "chatcmpl-test" + assert attrs.get("gen_ai.response.finish_reasons") == "['stop']" + assert attrs.get("gen_ai.usage.input_tokens") == 3 + assert attrs.get("gen_ai.usage.output_tokens") == 5 + assert attrs.get("gen_ai.usage.total_tokens") == 8 def test_litellm_double_instrument_is_noop(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument @@ -194,5 +219,6 @@ def test_litellm_mock_response_with_otel_callback(agent, exporter, litellm_instr attrs = _request_span(spans).attributes assert attrs.get("gen_ai.request.model") == "gpt-4o-mini" assert attrs.get("gen_ai.operation.name") == "chat" - assert attrs.get("gen_ai.system") == "openai" + assert attrs.get("gen_ai.provider.name") == "openai" + assert "gen_ai.system" not in attrs assert attrs.get("gen_ai.framework") == "litellm" From 44ff5ecaad65c10c50f670e523c23b02e25a6b93 Mon Sep 17 00:00:00 2001 From: Shreyas Nagaraj Date: Thu, 25 Jun 2026 19:21:12 +0530 Subject: [PATCH 3/4] CCM-33842: avoid LiteLLM callback duplicate attrs Rely on the SDK wrapper for LiteLLM response enrichment so exported spans do not retain the legacy gen_ai.system attribute. Co-authored-by: Cursor --- src/harness_sdk/instrumentation/litellm/__init__.py | 8 +++----- .../litellm/litellm_instrumentation_test.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py index 0fb9187..28a4b6a 100644 --- a/src/harness_sdk/instrumentation/litellm/__init__.py +++ b/src/harness_sdk/instrumentation/litellm/__init__.py @@ -6,10 +6,9 @@ - litellm.completion / litellm.acompletion - litellm.embedding / litellm.aembedding -Registers a ``TraceableLiteLLMOpenTelemetry`` callback (subclass of LiteLLM's -``OpenTelemetry``) and wraps the public entry points so evaluation runs on an -active span before the provider call. LiteLLM's OTEL callback enriches that -span on success when it is the active parent context. +Wraps the public entry points so evaluation runs on an active span before the +provider call. The wrapper enriches that span with response metadata before it +ends. Optional: ``pip install harness-sdk[litellm]`` """ @@ -477,7 +476,6 @@ def instrument(self, **_kwargs: Any) -> None: import litellm # pylint: disable=import-outside-toplevel otel_logger = _get_otel_logger() - _register_otel_callback(otel_logger) main_mod = __import__(_LITELLM_MAIN, fromlist=["*"]) for func_name, is_async in _WRAPPED_FUNCTIONS: wrapt.wrap_function_wrapper( diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py index 5c3c568..a3335a7 100644 --- a/test/instrumentation/litellm/litellm_instrumentation_test.py +++ b/test/instrumentation/litellm/litellm_instrumentation_test.py @@ -205,7 +205,7 @@ def counting_fake(*_a, **_k): assert len(spans) == 0 -def test_litellm_mock_response_with_otel_callback(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument +def test_litellm_mock_response_with_wrapper_enrichment(agent, exporter, litellm_instrumentor): # pylint: disable=unused-argument litellm_instrumentor.instrument() litellm.completion( model="gpt-4o-mini", From 1f96b2f8df4af72347ec1fbe4244152d17bc8087 Mon Sep 17 00:00:00 2001 From: Shreyas Nagaraj Date: Thu, 25 Jun 2026 20:14:37 +0530 Subject: [PATCH 4/4] CCM-33842: remove unused LiteLLM logger variable Drop the stale local variable left after removing LiteLLM callback registration. Co-authored-by: Cursor --- src/harness_sdk/instrumentation/litellm/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py index 28a4b6a..ecb514a 100644 --- a/src/harness_sdk/instrumentation/litellm/__init__.py +++ b/src/harness_sdk/instrumentation/litellm/__init__.py @@ -475,7 +475,6 @@ def instrument(self, **_kwargs: Any) -> None: try: import litellm # pylint: disable=import-outside-toplevel - otel_logger = _get_otel_logger() main_mod = __import__(_LITELLM_MAIN, fromlist=["*"]) for func_name, is_async in _WRAPPED_FUNCTIONS: wrapt.wrap_function_wrapper(