Skip to content

Commit 8bb3d8e

Browse files
author
Judgment Release Bot
committed
Release: Merge staging to main
2 parents e8df0ec + bfa92b1 commit 8bb3d8e

File tree

9 files changed

+783
-4
lines changed

9 files changed

+783
-4
lines changed

.github/workflows/ci.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ jobs:
3030
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
3131
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
3232
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
33+
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
3334
JUDGMENT_DEV: true
3435

3536
steps:

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
repos:
22
- repo: https://github.com/astral-sh/uv-pre-commit
3-
rev: 0.9.2
3+
rev: 0.9.7
44
hooks:
55
- id: uv-lock
66

77
- repo: https://github.com/astral-sh/ruff-pre-commit
8-
rev: v0.14.0
8+
rev: v0.14.3
99
hooks:
1010
- id: ruff
1111
name: ruff (linter)

src/judgeval/tracer/llm/llm_openai/chat_completions.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
immutable_wrap_sync_iterator,
2626
immutable_wrap_async_iterator,
2727
)
28-
from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter
28+
from judgeval.tracer.llm.llm_openai.utils import (
29+
openai_tokens_converter,
30+
set_cost_attribute,
31+
)
2932

3033
if TYPE_CHECKING:
3134
from judgeval.tracer import Tracer
@@ -90,6 +93,8 @@ def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
9093
if prompt_tokens_details:
9194
cache_read = prompt_tokens_details.cached_tokens or 0
9295

96+
set_cost_attribute(span, usage_data)
97+
9398
prompt_tokens, completion_tokens, cache_read, cache_creation = (
9499
openai_tokens_converter(
95100
prompt_tokens,
@@ -195,6 +200,8 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
195200
if chunk.usage.prompt_tokens_details:
196201
cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0
197202

203+
set_cost_attribute(span, chunk.usage)
204+
198205
prompt_tokens, completion_tokens, cache_read, cache_creation = (
199206
openai_tokens_converter(
200207
prompt_tokens,
@@ -312,6 +319,8 @@ def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
312319
if prompt_tokens_details:
313320
cache_read = prompt_tokens_details.cached_tokens or 0
314321

322+
set_cost_attribute(span, usage_data)
323+
315324
prompt_tokens, completion_tokens, cache_read, cache_creation = (
316325
openai_tokens_converter(
317326
prompt_tokens,
@@ -418,6 +427,8 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
418427
if chunk.usage.prompt_tokens_details:
419428
cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0
420429

430+
set_cost_attribute(span, chunk.usage)
431+
421432
prompt_tokens, completion_tokens, cache_read, cache_creation = (
422433
openai_tokens_converter(
423434
prompt_tokens,

src/judgeval/tracer/llm/llm_openai/responses.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
immutable_wrap_sync_iterator,
2525
immutable_wrap_async_iterator,
2626
)
27-
from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter
27+
from judgeval.tracer.llm.llm_openai.utils import (
28+
openai_tokens_converter,
29+
set_cost_attribute,
30+
)
2831

2932
if TYPE_CHECKING:
3033
from judgeval.tracer import Tracer
@@ -81,6 +84,7 @@ def post_hook(ctx: Dict[str, Any], result: Response) -> None:
8184
completion_tokens = usage_data.output_tokens or 0
8285
cache_read = usage_data.input_tokens_details.cached_tokens or 0
8386

87+
set_cost_attribute(span, usage_data)
8488
prompt_tokens, completion_tokens, cache_read, cache_creation = (
8589
openai_tokens_converter(
8690
prompt_tokens,
@@ -191,6 +195,7 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
191195
else 0
192196
)
193197

198+
set_cost_attribute(span, chunk.response.usage)
194199
prompt_tokens, completion_tokens, cache_read, cache_creation = (
195200
openai_tokens_converter(
196201
prompt_tokens,
@@ -312,6 +317,7 @@ def post_hook(ctx: Dict[str, Any], result: Response) -> None:
312317
completion_tokens = usage_data.output_tokens or 0
313318
cache_read = usage_data.input_tokens_details.cached_tokens or 0
314319

320+
set_cost_attribute(span, usage_data)
315321
prompt_tokens, completion_tokens, cache_read, cache_creation = (
316322
openai_tokens_converter(
317323
prompt_tokens,
@@ -424,6 +430,7 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
424430
else 0
425431
)
426432

433+
set_cost_attribute(span, chunk.response.usage)
427434
prompt_tokens, completion_tokens, cache_read, cache_creation = (
428435
openai_tokens_converter(
429436
prompt_tokens,

src/judgeval/tracer/llm/llm_openai/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
from typing import Any
2+
from opentelemetry.trace import Span
3+
from judgeval.tracer.keys import AttributeKeys
4+
from judgeval.tracer.utils import set_span_attribute
5+
from judgeval.utils.serialize import safe_serialize
6+
7+
18
def openai_tokens_converter(
29
prompt_tokens: int,
310
completion_tokens: int,
@@ -20,3 +27,16 @@ def openai_tokens_converter(
2027
return prompt_tokens - cache_read, completion_tokens, cache_read, cache_creation
2128
else:
2229
return prompt_tokens, completion_tokens, cache_read, cache_creation
30+
31+
32+
def set_cost_attribute(span: Span, usage_data: Any) -> None:
    """Record the provider-reported USD cost on the span, when present.

    OpenRouter includes a ``cost`` field in the usage payload when the
    request is made with ``extra_body={"usage": {"include": True}}``;
    other providers' usage objects simply lack the attribute.
    """
    cost = getattr(usage_data, "cost", None)
    if not cost:
        # Absent or falsy cost: nothing worth recording.
        return
    set_span_attribute(
        span,
        AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD,
        safe_serialize(cost),
    )
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""OpenRouter-specific fixtures for tests."""
2+
3+
import pytest
4+
import os
5+
6+
pytest.importorskip("openai")
7+
8+
from openai import OpenAI, AsyncOpenAI
9+
from judgeval.tracer.llm.llm_openai.wrapper import (
10+
wrap_openai_client_sync,
11+
wrap_openai_client_async,
12+
)
13+
14+
15+
@pytest.fixture
def openrouter_api_key():
    """Return the OpenRouter API key, skipping the test when it is unset."""
    key = os.getenv("OPENROUTER_API_KEY")
    if key:
        return key
    # pytest.skip raises, so tests never see a missing key.
    pytest.skip("OPENROUTER_API_KEY environment variable not set")
22+
23+
24+
@pytest.fixture
def sync_client(openrouter_api_key):
    """Plain (untraced) sync OpenRouter client built on the OpenAI SDK."""
    # OpenRouter attribution headers, overridable via environment.
    headers = {
        "HTTP-Referer": os.getenv("OPENROUTER_APP_URL", "https://judgmentlabs.ai"),
        "X-Title": os.getenv("OPENROUTER_APP_NAME", "Judgeval Tests"),
    }
    return OpenAI(
        api_key=openrouter_api_key,
        base_url="https://openrouter.ai/api/v1",
        default_headers=headers,
    )
35+
36+
37+
@pytest.fixture
def async_client(openrouter_api_key):
    """Plain (untraced) async OpenRouter client built on the OpenAI SDK."""
    # OpenRouter attribution headers, overridable via environment.
    headers = {
        "HTTP-Referer": os.getenv("OPENROUTER_APP_URL", "https://judgmentlabs.ai"),
        "X-Title": os.getenv("OPENROUTER_APP_NAME", "Judgeval Tests"),
    }
    return AsyncOpenAI(
        api_key=openrouter_api_key,
        base_url="https://openrouter.ai/api/v1",
        default_headers=headers,
    )
48+
49+
50+
@pytest.fixture
def wrapped_sync_client(tracer, sync_client):
    """Sync OpenRouter client instrumented with the test tracer."""
    instrumented = wrap_openai_client_sync(tracer, sync_client)
    return instrumented
54+
55+
56+
@pytest.fixture
def wrapped_async_client(tracer, async_client):
    """Async OpenRouter client instrumented with the test tracer."""
    instrumented = wrap_openai_client_async(tracer, async_client)
    return instrumented
60+
61+
62+
@pytest.fixture(params=["wrapped", "unwrapped"], ids=["with_tracer", "without_tracer"])
def sync_client_maybe_wrapped(request, tracer, sync_client):
    """Provide the sync client both traced and untraced, one per param."""
    wants_tracing = request.param == "wrapped"
    return wrap_openai_client_sync(tracer, sync_client) if wants_tracing else sync_client
68+
69+
70+
@pytest.fixture(params=["wrapped", "unwrapped"], ids=["with_tracer", "without_tracer"])
def async_client_maybe_wrapped(request, tracer, async_client):
    """Provide the async client both traced and untraced, one per param."""
    wants_tracing = request.param == "wrapped"
    return wrap_openai_client_async(tracer, async_client) if wants_tracing else async_client

0 commit comments

Comments
 (0)