From c866d66297d50560821a9047f4cce13550e8a640 Mon Sep 17 00:00:00 2001
From: Shreyas Nagaraj <shreyas.nagaraj@harness.io>
Date: Thu, 25 Jun 2026 17:21:22 +0530
Subject: [PATCH 1/4] CCM-33842: fix LiteLLM token usage capture

Copy response usage metadata onto the SDK-owned LiteLLM span before ending it so OTLP exports include input, output, total, and cache token-count attributes.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../instrumentation/litellm/__init__.py       | 66 ++++++++++++++++++-
 .../litellm/litellm_instrumentation_test.py   |  8 +++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py
index 5852c37..277680c 100644
--- a/src/harness_sdk/instrumentation/litellm/__init__.py
+++ b/src/harness_sdk/instrumentation/litellm/__init__.py
@@ -182,6 +182,64 @@ def _set_pre_call_request_attributes(
         logger.debug("LiteLLM: failed to set input attributes on span: %s", err)
 
 
+def _get_value(obj: Any, key: str) -> Any:
+    if obj is None:
+        return None
+    if isinstance(obj, dict):
+        return obj.get(key)
+    return getattr(obj, key, None)
+
+
+def _get_usage(response: Any) -> Any:
+    usage = _get_value(response, "usage")
+    if usage is not None:
+        return usage
+    if isinstance(response, dict):
+        return response.get("usage")
+    return None
+
+
+def _set_if_present(otel_logger: Any, span: Any, key: str, value: Any) -> None:
+    if value is not None:
+        otel_logger.safe_set_attribute(span, key, value)
+
+
+def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) -> None:
+    """Copy LiteLLM usage metadata before the wrapper-owned span ends."""
+    usage = _get_usage(response)
+    if usage is None:
+        return
+
+    input_tokens = _get_value(usage, "prompt_tokens")
+    if input_tokens is None:
+        input_tokens = _get_value(usage, "input_tokens")
+
+    output_tokens = _get_value(usage, "completion_tokens")
+    if output_tokens is None:
+        output_tokens = _get_value(usage, "output_tokens")
+
+    _set_if_present(otel_logger, span, "gen_ai.usage.input_tokens", input_tokens)
+    _set_if_present(otel_logger, span, "gen_ai.usage.output_tokens", output_tokens)
+    _set_if_present(
+        otel_logger,
+        span,
+        "gen_ai.usage.total_tokens",
+        _get_value(usage, "total_tokens"),
+    )
+    _set_if_present(
+        otel_logger,
+        span,
+        "gen_ai.usage.cache_read_input_tokens",
+        _get_value(usage, "cache_read_input_tokens"),
+    )
+    _set_if_present(
+        otel_logger,
+        span,
+        "gen_ai.usage.cache_creation_input_tokens",
+        _get_value(usage, "cache_creation_input_tokens"),
+    )
+
+
 def _build_traceable_otel_class() -> type:
     from litellm.integrations.opentelemetry import (  # pylint: disable=import-outside-toplevel
         OpenTelemetry,
@@ -304,7 +362,9 @@ def _sync_wrapper(
         span = _start_evaluated_span(otel_logger, func_name, args, kwargs)
         token = _activate_span(span)
         try:
-            return wrapped(*args, **kwargs)
+            response = wrapped(*args, **kwargs)
+            _set_response_usage_attributes(otel_logger, span, response)
+            return response
         except Exception as exc:  # pylint: disable=broad-except
             span.record_exception(exc)
             span.set_status(Status(StatusCode.ERROR, str(exc)))
@@ -324,7 +384,9 @@ async def _async_wrapper(
         span = _start_evaluated_span(otel_logger, func_name, args, kwargs)
         token = _activate_span(span)
         try:
-            return await wrapped(*args, **kwargs)
+            response = await wrapped(*args, **kwargs)
+            _set_response_usage_attributes(otel_logger, span, response)
+            return response
         except Exception as exc:  # pylint: disable=broad-except
             span.record_exception(exc)
             span.set_status(Status(StatusCode.ERROR, str(exc)))
diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py
index ce4ca3f..d017a91 100644
--- a/test/instrumentation/litellm/litellm_instrumentation_test.py
+++ b/test/instrumentation/litellm/litellm_instrumentation_test.py
@@ -71,6 +71,9 @@ def test_litellm_completion_span_has_gen_ai_attributes(agent, exporter, litellm_
     assert attrs.get("gen_ai.operation.name") == "chat"
     assert attrs.get("gen_ai.system") == "openai"
     assert attrs.get("gen_ai.framework") == "litellm"
+    assert attrs.get("gen_ai.usage.input_tokens") == 3
+    assert attrs.get("gen_ai.usage.output_tokens") == 5
+    assert attrs.get("gen_ai.usage.total_tokens") == 8
 
 
 def test_litellm_evaluate_blocks_before_wrapped(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument
@@ -109,6 +112,8 @@ def test_litellm_embedding_span_has_gen_ai_attributes(agent, exporter, litellm_i
     assert attrs.get("gen_ai.request.model") == "text-embedding-3-small"
     assert attrs.get("gen_ai.operation.name") == "embeddings"
     assert attrs.get("gen_ai.framework") == "litellm"
+    assert attrs.get("gen_ai.usage.input_tokens") == 4
+    assert attrs.get("gen_ai.usage.total_tokens") == 4
 
 
 @pytest.mark.asyncio
@@ -127,6 +132,9 @@ async def _fake_async(*_args, **_kwargs):
     exporter.clear()
     assert len(spans) >= 1
     assert spans[0].attributes.get("gen_ai.operation.name") == "chat"
+    assert spans[0].attributes.get("gen_ai.usage.input_tokens") == 3
+    assert spans[0].attributes.get("gen_ai.usage.output_tokens") == 5
+    assert spans[0].attributes.get("gen_ai.usage.total_tokens") == 8
 
 
 def test_litellm_double_instrument_is_noop(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument

From 60b68ddb64ad9dd93f38267940364d34e25e02d0 Mon Sep 17 00:00:00 2001
From: Shreyas Nagaraj <shreyas.nagaraj@harness.io>
Date: Thu, 25 Jun 2026 18:36:49 +0530
Subject: [PATCH 2/4] CCM-33842: normalize LiteLLM GenAI semconv fields

Emit the canonical provider name as gen_ai.provider.name (replacing gen_ai.system), response metadata (model, id, finish reasons), and dotted cache/reasoning token attributes on LiteLLM spans.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../instrumentation/litellm/__init__.py       | 79 ++++++++++++++++---
 .../litellm/litellm_instrumentation_test.py   | 44 ++++++++---
 2 files changed, 105 insertions(+), 18 deletions(-)

diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py
index 277680c..0fb9187 100644
--- a/src/harness_sdk/instrumentation/litellm/__init__.py
+++ b/src/harness_sdk/instrumentation/litellm/__init__.py
@@ -43,6 +43,19 @@
     ("aembedding", True),
 )
 
+_PROVIDER_NAME_MAP = {
+    "azure": "azure.ai.openai",
+    "azure_ai": "azure.ai.openai",
+    "azure_ai_openai": "azure.ai.openai",
+    "azureopenai": "azure.ai.openai",
+    "bedrock": "aws.bedrock",
+    "bedrock_converse": "aws.bedrock",
+    "gemini": "gcp.gemini",
+    "google": "gcp.gemini",
+    "vertex_ai": "gcp.vertex_ai",
+    "vertexai": "gcp.vertex_ai",
+}
+
 
 def _evaluate_span(span: Any) -> None:
     """Run Traceable policy evaluation against the live span; raise if blocked."""
@@ -111,6 +124,11 @@ def _resolve_provider(model: Optional[str], kwargs: dict[str, Any]) -> str:
     return "Unknown"
 
 
+def _canonical_provider_name(provider: str) -> str:
+    normalized = (provider or "unknown").strip().lower().replace("-", "_")
+    return _PROVIDER_NAME_MAP.get(normalized, normalized)
+
+
 def _operation_name(func_name: str) -> str:
     if func_name in ("embedding", "aembedding"):
         return "embeddings"
@@ -138,7 +156,9 @@ def _set_pre_call_request_attributes(
     otel_logger.safe_set_attribute(
         span, "gen_ai.operation.name", _operation_name(pre_call.call_type)
     )
-    otel_logger.safe_set_attribute(span, "gen_ai.system", provider)
+    otel_logger.safe_set_attribute(
+        span, "gen_ai.provider.name", _canonical_provider_name(provider)
+    )
     otel_logger.safe_set_attribute(span, "gen_ai.framework", "litellm")
     otel_logger.safe_set_attribute(
         span,
@@ -204,12 +224,45 @@ def _set_if_present(otel_logger: Any, span: Any, key: str, value: Any) -> None:
         otel_logger.safe_set_attribute(span, key, value)
 
 
-def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) -> None:
-    """Copy LiteLLM usage metadata before the wrapper-owned span ends."""
+def _get_choices(response: Any) -> list[Any]:
+    choices = _get_value(response, "choices")
+    if choices is None:
+        return []
+    return list(choices)
+
+
+def _get_finish_reasons(response: Any) -> list[str]:
+    finish_reasons = []
+    for choice in _get_choices(response):
+        finish_reason = _get_value(choice, "finish_reason")
+        if finish_reason:
+            finish_reasons.append(str(finish_reason))
+    return finish_reasons
+
+
+def _set_response_attributes(otel_logger: Any, span: Any, response: Any) -> None:
+    """Copy LiteLLM response metadata before the wrapper-owned span ends."""
+    _set_if_present(
+        otel_logger,
+        span,
+        "gen_ai.response.model",
+        _get_value(response, "model"),
+    )
+    _set_if_present(otel_logger, span, "gen_ai.response.id", _get_value(response, "id"))
+
+    finish_reasons = _get_finish_reasons(response)
+    if finish_reasons:
+        otel_logger.safe_set_attribute(
+            span, "gen_ai.response.finish_reasons", finish_reasons
+        )
+
     usage = _get_usage(response)
     if usage is None:
         return
 
+    prompt_details = _get_value(usage, "prompt_tokens_details")
+    completion_details = _get_value(usage, "completion_tokens_details")
+
     input_tokens = _get_value(usage, "prompt_tokens")
     if input_tokens is None:
         input_tokens = _get_value(usage, "input_tokens")
@@ -229,14 +282,22 @@ def _set_response_usage_attributes(otel_logger: Any, span: Any, response: Any) -
     _set_if_present(
         otel_logger,
         span,
-        "gen_ai.usage.cache_read_input_tokens",
-        _get_value(usage, "cache_read_input_tokens"),
+        "gen_ai.usage.cache_read.input_tokens",
+        _get_value(usage, "cache_read_input_tokens")
+        or _get_value(prompt_details, "cached_tokens"),
+    )
+    _set_if_present(
+        otel_logger,
+        span,
+        "gen_ai.usage.cache_creation.input_tokens",
+        _get_value(usage, "cache_creation_input_tokens")
+        or _get_value(prompt_details, "cache_creation_tokens"),
     )
     _set_if_present(
         otel_logger,
         span,
-        "gen_ai.usage.cache_creation_input_tokens",
-        _get_value(usage, "cache_creation_input_tokens"),
+        "gen_ai.usage.reasoning.output_tokens",
+        _get_value(completion_details, "reasoning_tokens"),
     )
 
 
@@ -363,7 +424,7 @@ def _sync_wrapper(
         token = _activate_span(span)
         try:
             response = wrapped(*args, **kwargs)
-            _set_response_usage_attributes(otel_logger, span, response)
+            _set_response_attributes(otel_logger, span, response)
             return response
         except Exception as exc:  # pylint: disable=broad-except
             span.record_exception(exc)
@@ -385,7 +446,7 @@ async def _async_wrapper(
         token = _activate_span(span)
         try:
             response = await wrapped(*args, **kwargs)
-            _set_response_usage_attributes(otel_logger, span, response)
+            _set_response_attributes(otel_logger, span, response)
             return response
         except Exception as exc:  # pylint: disable=broad-except
             span.record_exception(exc)
diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py
index d017a91..5c3c568 100644
--- a/test/instrumentation/litellm/litellm_instrumentation_test.py
+++ b/test/instrumentation/litellm/litellm_instrumentation_test.py
@@ -35,7 +35,16 @@ def _fake_model_response(*_args, **_kwargs):
             }
         ],
         model="gpt-4o-mini",
-        usage={"prompt_tokens": 3, "completion_tokens": 5, "total_tokens": 8},
+        usage={
+            "prompt_tokens": 3,
+            "completion_tokens": 5,
+            "total_tokens": 8,
+            "prompt_tokens_details": {
+                "cached_tokens": 1,
+                "cache_creation_tokens": 2,
+            },
+            "completion_tokens_details": {"reasoning_tokens": 1},
+        },
     )
 
 
@@ -66,14 +75,21 @@ def test_litellm_completion_span_has_gen_ai_attributes(agent, exporter, litellm_
     spans = exporter.get_finished_spans()
     exporter.clear()
     assert len(spans) >= 1
-    attrs = spans[0].attributes
+    attrs = _request_span(spans).attributes
     assert attrs.get("gen_ai.request.model") == "gpt-4o-mini"
     assert attrs.get("gen_ai.operation.name") == "chat"
-    assert attrs.get("gen_ai.system") == "openai"
+    assert attrs.get("gen_ai.provider.name") == "openai"
+    assert "gen_ai.system" not in attrs
     assert attrs.get("gen_ai.framework") == "litellm"
+    assert attrs.get("gen_ai.response.model") == "gpt-4o-mini"
+    assert attrs.get("gen_ai.response.id") == "chatcmpl-test"
+    assert attrs.get("gen_ai.response.finish_reasons") == "['stop']"
     assert attrs.get("gen_ai.usage.input_tokens") == 3
     assert attrs.get("gen_ai.usage.output_tokens") == 5
     assert attrs.get("gen_ai.usage.total_tokens") == 8
+    assert attrs.get("gen_ai.usage.cache_read.input_tokens") == 1
+    assert attrs.get("gen_ai.usage.cache_creation.input_tokens") == 2
+    assert attrs.get("gen_ai.usage.reasoning.output_tokens") == 1
 
 
 def test_litellm_evaluate_blocks_before_wrapped(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument
@@ -108,10 +124,13 @@ def test_litellm_embedding_span_has_gen_ai_attributes(agent, exporter, litellm_i
     spans = exporter.get_finished_spans()
     exporter.clear()
     assert len(spans) >= 1
-    attrs = spans[0].attributes
+    attrs = _request_span(spans).attributes
     assert attrs.get("gen_ai.request.model") == "text-embedding-3-small"
     assert attrs.get("gen_ai.operation.name") == "embeddings"
+    assert attrs.get("gen_ai.provider.name") == "openai"
+    assert "gen_ai.system" not in attrs
     assert attrs.get("gen_ai.framework") == "litellm"
+    assert attrs.get("gen_ai.response.model") == "text-embedding-3-small"
     assert attrs.get("gen_ai.usage.input_tokens") == 4
     assert attrs.get("gen_ai.usage.total_tokens") == 4
 
@@ -131,10 +150,16 @@ async def _fake_async(*_args, **_kwargs):
     spans = exporter.get_finished_spans()
     exporter.clear()
     assert len(spans) >= 1
-    assert spans[0].attributes.get("gen_ai.operation.name") == "chat"
-    assert spans[0].attributes.get("gen_ai.usage.input_tokens") == 3
-    assert spans[0].attributes.get("gen_ai.usage.output_tokens") == 5
-    assert spans[0].attributes.get("gen_ai.usage.total_tokens") == 8
+    attrs = _request_span(spans).attributes
+    assert attrs.get("gen_ai.operation.name") == "chat"
+    assert attrs.get("gen_ai.provider.name") == "openai"
+    assert "gen_ai.system" not in attrs
+    assert attrs.get("gen_ai.response.model") == "gpt-4o-mini"
+    assert attrs.get("gen_ai.response.id") == "chatcmpl-test"
+    assert attrs.get("gen_ai.response.finish_reasons") == "['stop']"
+    assert attrs.get("gen_ai.usage.input_tokens") == 3
+    assert attrs.get("gen_ai.usage.output_tokens") == 5
+    assert attrs.get("gen_ai.usage.total_tokens") == 8
 
 
 def test_litellm_double_instrument_is_noop(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument
@@ -194,5 +219,6 @@ def test_litellm_mock_response_with_otel_callback(agent, exporter, litellm_instr
     attrs = _request_span(spans).attributes
     assert attrs.get("gen_ai.request.model") == "gpt-4o-mini"
     assert attrs.get("gen_ai.operation.name") == "chat"
-    assert attrs.get("gen_ai.system") == "openai"
+    assert attrs.get("gen_ai.provider.name") == "openai"
+    assert "gen_ai.system" not in attrs
     assert attrs.get("gen_ai.framework") == "litellm"

From 44ff5ecaad65c10c50f670e523c23b02e25a6b93 Mon Sep 17 00:00:00 2001
From: Shreyas Nagaraj <shreyas.nagaraj@harness.io>
Date: Thu, 25 Jun 2026 19:21:12 +0530
Subject: [PATCH 3/4] CCM-33842: avoid LiteLLM callback duplicate attrs

Stop registering the LiteLLM OpenTelemetry callback and rely on the SDK wrapper for response enrichment so exported spans do not retain the legacy gen_ai.system attribute.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/harness_sdk/instrumentation/litellm/__init__.py       | 8 +++-----
 .../litellm/litellm_instrumentation_test.py               | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py
index 0fb9187..28a4b6a 100644
--- a/src/harness_sdk/instrumentation/litellm/__init__.py
+++ b/src/harness_sdk/instrumentation/litellm/__init__.py
@@ -6,10 +6,9 @@
   - litellm.completion / litellm.acompletion
   - litellm.embedding / litellm.aembedding
 
-Registers a ``TraceableLiteLLMOpenTelemetry`` callback (subclass of LiteLLM's
-``OpenTelemetry``) and wraps the public entry points so evaluation runs on an
-active span before the provider call. LiteLLM's OTEL callback enriches that
-span on success when it is the active parent context.
+Wraps the public entry points so evaluation runs on an active span before the
+provider call. The wrapper enriches that span with response metadata before it
+ends.
 
 Optional: ``pip install harness-sdk[litellm]``
 """
@@ -477,7 +476,6 @@ def instrument(self, **_kwargs: Any) -> None:
             import litellm  # pylint: disable=import-outside-toplevel
 
             otel_logger = _get_otel_logger()
-            _register_otel_callback(otel_logger)
             main_mod = __import__(_LITELLM_MAIN, fromlist=["*"])
             for func_name, is_async in _WRAPPED_FUNCTIONS:
                 wrapt.wrap_function_wrapper(
diff --git a/test/instrumentation/litellm/litellm_instrumentation_test.py b/test/instrumentation/litellm/litellm_instrumentation_test.py
index 5c3c568..a3335a7 100644
--- a/test/instrumentation/litellm/litellm_instrumentation_test.py
+++ b/test/instrumentation/litellm/litellm_instrumentation_test.py
@@ -205,7 +205,7 @@ def counting_fake(*_a, **_k):
     assert len(spans) == 0
 
 
-def test_litellm_mock_response_with_otel_callback(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument
+def test_litellm_mock_response_with_wrapper_enrichment(agent, exporter, litellm_instrumentor):  # pylint: disable=unused-argument
     litellm_instrumentor.instrument()
     litellm.completion(
         model="gpt-4o-mini",

From 1f96b2f8df4af72347ec1fbe4244152d17bc8087 Mon Sep 17 00:00:00 2001
From: Shreyas Nagaraj <shreyas.nagaraj@harness.io>
Date: Thu, 25 Jun 2026 20:14:37 +0530
Subject: [PATCH 4/4] CCM-33842: remove unused LiteLLM logger variable

Drop the stale local variable left after removing LiteLLM callback registration.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/harness_sdk/instrumentation/litellm/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/harness_sdk/instrumentation/litellm/__init__.py b/src/harness_sdk/instrumentation/litellm/__init__.py
index 28a4b6a..ecb514a 100644
--- a/src/harness_sdk/instrumentation/litellm/__init__.py
+++ b/src/harness_sdk/instrumentation/litellm/__init__.py
@@ -475,7 +475,6 @@ def instrument(self, **_kwargs: Any) -> None:
         try:
             import litellm  # pylint: disable=import-outside-toplevel
 
-            otel_logger = _get_otel_logger()
             main_mod = __import__(_LITELLM_MAIN, fromlist=["*"])
             for func_name, is_async in _WRAPPED_FUNCTIONS:
                 wrapt.wrap_function_wrapper(