
Commit 2ea855d

Merge pull request #17707 from raghav-stripe/raghav-fix-responsesapi-rl
fix: responses api not applying tpm rate limits on api keys
2 parents 9d7a255 + face817 commit 2ea855d
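
Why the fix is needed: the Responses API attaches `usage` to the response as a plain dict (inside `ResponsesAPIResponse`), while the rate limiter previously only counted tokens when `usage` was a `Usage` object. For Responses API calls the token count therefore stayed at 0 and the per-key TPM counters were never incremented. The following is a minimal standalone sketch, not litellm code, illustrating the before/after extraction for a dict-shaped usage payload (names and values here are illustrative only):

from typing import Any

# Dict-shaped usage, as the Responses API returns it (illustrative values)
usage: Any = {"prompt_tokens": 25, "completion_tokens": 35, "total_tokens": 60}

FIELD_FOR_TYPE = {
    "input": "prompt_tokens",
    "output": "completion_tokens",
    "total": "total_tokens",
}

def old_extract(usage: Any, rate_limit_type: str) -> int:
    # Old behavior: only `isinstance(usage, Usage)` was handled; a stand-in
    # attribute check is used here so the sketch runs without litellm installed.
    # A plain dict fails the check and contributes 0 tokens.
    if not isinstance(usage, dict) and hasattr(usage, "total_tokens"):
        return getattr(usage, FIELD_FOR_TYPE[rate_limit_type], 0)
    return 0

def new_extract(usage: Any, rate_limit_type: str) -> int:
    # New behavior (mirrors the added _get_total_tokens_from_usage helper):
    # dict-shaped usage is counted as well.
    if isinstance(usage, dict):
        return usage.get(FIELD_FOR_TYPE[rate_limit_type], 0)
    return getattr(usage, FIELD_FOR_TYPE[rate_limit_type], 0)

print(old_extract(usage, "total"))  # 0  -> TPM counter never incremented
print(new_extract(usage, "total"))  # 60 -> TPM counter incremented correctly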

File tree

3 files changed (+255, -15 lines changed)

litellm/proxy/hooks/parallel_request_limiter_v3.py

Lines changed: 26 additions & 11 deletions
@@ -29,6 +29,7 @@
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.proxy.auth.auth_utils import get_model_rate_limit_from_metadata
 from litellm.types.llms.openai import BaseLiteLLMOpenAIResponseObject
+from litellm.types.utils import ModelResponse, Usage

 if TYPE_CHECKING:
     from opentelemetry.trace import Span as _Span
@@ -1232,6 +1233,28 @@ def _create_pipeline_operations(

         return pipeline_operations

+    def _get_total_tokens_from_usage(self, usage: Any | None, rate_limit_type: Literal["output", "input", "total"]) -> int:
+        # Get total tokens from response
+        total_tokens = 0
+        # spot fix for /responses api
+        if usage:
+            if isinstance(usage, Usage):
+                if rate_limit_type == "output":
+                    total_tokens = usage.completion_tokens
+                elif rate_limit_type == "input":
+                    total_tokens = usage.prompt_tokens
+                elif rate_limit_type == "total":
+                    total_tokens = usage.total_tokens
+            elif isinstance(usage, dict):
+                # Responses API usage comes as a dict in ResponsesAPIResponse
+                if rate_limit_type == "output":
+                    total_tokens = usage.get("completion_tokens", 0)
+                elif rate_limit_type == "input":
+                    total_tokens = usage.get("prompt_tokens", 0)
+                elif rate_limit_type == "total":
+                    total_tokens = usage.get("total_tokens", 0)
+        return total_tokens
+
     async def _execute_token_increment_script(
         self,
         pipeline_operations: List["RedisPipelineIncrementOperation"],
@@ -1313,11 +1336,10 @@ async def async_increment_tokens_with_ttl_preservation(

     def get_rate_limit_type(self) -> Literal["output", "input", "total"]:
         from litellm.proxy.proxy_server import general_settings
-
         specified_rate_limit_type = general_settings.get(
-            "token_rate_limit_type", "output"
+            "token_rate_limit_type", "total"
         )
-        if not specified_rate_limit_type or specified_rate_limit_type not in [
+        if specified_rate_limit_type not in [
             "output",
             "input",
             "total",
@@ -1336,7 +1358,6 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti
             get_model_group_from_litellm_kwargs,
         )
         from litellm.types.caching import RedisPipelineIncrementOperation
-        from litellm.types.utils import ModelResponse, Usage

         rate_limit_type = self.get_rate_limit_type()

@@ -1372,13 +1393,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti
             response_obj, BaseLiteLLMOpenAIResponseObject
         ):
             _usage = getattr(response_obj, "usage", None)
-            if _usage and isinstance(_usage, Usage):
-                if rate_limit_type == "output":
-                    total_tokens = _usage.completion_tokens
-                elif rate_limit_type == "input":
-                    total_tokens = _usage.prompt_tokens
-                elif rate_limit_type == "total":
-                    total_tokens = _usage.total_tokens
+            total_tokens = self._get_total_tokens_from_usage(usage=_usage, rate_limit_type=rate_limit_type)

             # Create pipeline operations for TPM increments
             pipeline_operations: List[RedisPipelineIncrementOperation] = []
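
Note the second behavioral change in this file: `token_rate_limit_type`, read from `general_settings`, now defaults to "total" instead of "output", and (per the new tests added further down) unrecognized values also fall back to "total". A small standalone sketch of that resolution logic, using a plain dict in place of the proxy's `general_settings` (an illustration of the fallback, not the proxy's actual code path):

def resolve_rate_limit_type(general_settings: dict) -> str:
    # Mirrors the updated default: missing or invalid values resolve to "total".
    specified = general_settings.get("token_rate_limit_type", "total")
    if specified not in ["output", "input", "total"]:
        return "total"
    return specified

print(resolve_rate_limit_type({}))                                          # "total" (new default)
print(resolve_rate_limit_type({"token_rate_limit_type": "output"}))         # "output" (explicit opt-in)
print(resolve_rate_limit_type({"token_rate_limit_type": "invalid_type"}))   # "total" (fallback)

Deployments that relied on the previous output-token-only accounting will likely need to set token_rate_limit_type to "output" explicitly in general_settings to keep the old behavior.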

tests/test_litellm/proxy/hooks/test_dynamic_rate_limiter_v3.py

Lines changed: 4 additions & 4 deletions
@@ -1403,13 +1403,13 @@ async def mock_increment(pipeline_operations, parent_otel_span=None):
         end_time=None,
     )

-    # Verify increments happened with actual token count (50 completion tokens)
+    # Verify increments happened with actual token count (60 total tokens)
     assert len(increment_calls) == 2, f"Expected 2 increment calls, got {len(increment_calls)}"

-    # Both should increment by 50 (completion_tokens, since rate_limit_type defaults to 'output')
+    # Both should increment by 50 (total_tokens, since rate_limit_type defaults to 'total')
     for call in increment_calls:
-        assert call["increment_value"] == 50, (
-            f"Expected increment of 50 tokens, got {call['increment_value']} for key {call['key']}"
+        assert call["increment_value"] == 60, (
+            f"Expected increment of 60 tokens, got {call['increment_value']} for key {call['key']}"
         )

     # Verify correct keys were used

tests/test_litellm/proxy/hooks/test_parallel_request_limiter_v3.py

Lines changed: 225 additions & 0 deletions
@@ -1583,6 +1583,231 @@ async def mock_should_rate_limit(descriptors, **kwargs):
     assert "Current limit: 2" in exc_info.value.detail


+@pytest.mark.asyncio
+async def test_get_rate_limit_type_default_is_total(monkeypatch):
+    """
+    Test that get_rate_limit_type returns 'total' as the default when no setting is specified.
+
+    This verifies the change from 'output' to 'total' as the default value.
+    """
+    local_cache = DualCache()
+    parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
+        internal_usage_cache=InternalUsageCache(local_cache)
+    )
+
+    # Mock general_settings to return empty dict (no token_rate_limit_type set)
+    import litellm.proxy.proxy_server as proxy_server
+    original_settings = getattr(proxy_server, 'general_settings', {})
+    monkeypatch.setattr(proxy_server, 'general_settings', {})
+
+    try:
+        result = parallel_request_handler.get_rate_limit_type()
+        assert result == "total", f"Default rate limit type should be 'total', got '{result}'"
+    finally:
+        monkeypatch.setattr(proxy_server, 'general_settings', original_settings)
+
+
+@pytest.mark.asyncio
+async def test_get_rate_limit_type_invalid_falls_back_to_total(monkeypatch):
+    """
+    Test that get_rate_limit_type falls back to 'total' when an invalid value is specified.
+    """
+    local_cache = DualCache()
+    parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
+        internal_usage_cache=InternalUsageCache(local_cache)
+    )
+
+    # Mock general_settings to return an invalid token_rate_limit_type
+    import litellm.proxy.proxy_server as proxy_server
+    original_settings = getattr(proxy_server, 'general_settings', {})
+    monkeypatch.setattr(proxy_server, 'general_settings', {'token_rate_limit_type': 'invalid_type'})
+
+    try:
+        result = parallel_request_handler.get_rate_limit_type()
+        assert result == "total", f"Invalid rate limit type should fall back to 'total', got '{result}'"
+    finally:
+        monkeypatch.setattr(proxy_server, 'general_settings', original_settings)
+
+
+@pytest.mark.parametrize(
+    "token_rate_limit_type,expected_field",
+    [
+        ("input", "prompt_tokens"),
+        ("output", "completion_tokens"),
+        ("total", "total_tokens"),
+    ],
+)
+@pytest.mark.asyncio
+async def test_async_log_success_event_with_dict_usage(monkeypatch, token_rate_limit_type, expected_field):
+    """
+    Test that async_log_success_event correctly handles usage as a dict (Responses API format).
+
+    The Responses API returns usage as a dict in ResponsesAPIResponse instead of a Usage object.
+    This test verifies that token counting works correctly with dict-based usage.
+    """
+    from unittest.mock import MagicMock
+
+    _api_key = "sk-12345"
+    _api_key = hash_token(_api_key)
+    local_cache = DualCache()
+    parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
+        internal_usage_cache=InternalUsageCache(local_cache)
+    )
+
+    # Mock the get_rate_limit_type method
+    def mock_get_rate_limit_type():
+        return token_rate_limit_type
+
+    monkeypatch.setattr(
+        parallel_request_handler, "get_rate_limit_type", mock_get_rate_limit_type
+    )
+
+    # Create a mock response object with usage as a dict (Responses API format)
+    mock_response = MagicMock()
+    mock_response.usage = {
+        "prompt_tokens": 25,
+        "completion_tokens": 35,
+        "total_tokens": 60
+    }
+    # Make isinstance check for BaseLiteLLMOpenAIResponseObject return True
+    from litellm.types.utils import BaseLiteLLMOpenAIResponseObject
+    mock_response.__class__ = type('MockResponse', (BaseLiteLLMOpenAIResponseObject,), {})
+
+    # Create mock kwargs for the success event
+    mock_kwargs = {
+        "standard_logging_object": {
+            "metadata": {
+                "user_api_key_hash": _api_key,
+                "user_api_key_user_id": None,
+                "user_api_key_team_id": None,
+                "user_api_key_end_user_id": None,
+            }
+        },
+        "model": "gpt-3.5-turbo",
+    }
+
+    # Mock the pipeline increment method to capture the operations
+    captured_operations = []
+
+    async def mock_increment_pipeline(increment_list, **kwargs):
+        captured_operations.extend(increment_list)
+        return True
+
+    monkeypatch.setattr(
+        parallel_request_handler.internal_usage_cache.dual_cache,
+        "async_increment_cache_pipeline",
+        mock_increment_pipeline,
+    )
+
+    # Call the success event handler
+    await parallel_request_handler.async_log_success_event(
+        kwargs=mock_kwargs,
+        response_obj=mock_response,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+    )
+
+    # Find the TPM increment operation
+    tpm_operation = None
+    for op in captured_operations:
+        if op["key"].endswith(":tokens"):
+            tpm_operation = op
+            break
+
+    assert tpm_operation is not None, "Should have a TPM increment operation"
+
+    # Check that the correct token count was used based on the rate limit type
+    expected_tokens = {
+        "input": 25,  # prompt_tokens
+        "output": 35,  # completion_tokens
+        "total": 60,  # total_tokens
+    }
+
+    assert (
+        tpm_operation["increment_value"] == expected_tokens[token_rate_limit_type]
+    ), f"Expected {expected_tokens[token_rate_limit_type]} tokens for type '{token_rate_limit_type}', got {tpm_operation['increment_value']}"
+
+
+@pytest.mark.asyncio
+async def test_async_log_success_event_with_dict_usage_missing_fields(monkeypatch):
+    """
+    Test that async_log_success_event handles dict usage with missing fields gracefully.
+
+    When usage dict is missing expected fields, it should default to 0.
+    """
+    from unittest.mock import MagicMock
+
+    _api_key = "sk-12345"
+    _api_key = hash_token(_api_key)
+    local_cache = DualCache()
+    parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
+        internal_usage_cache=InternalUsageCache(local_cache)
+    )
+
+    # Mock the get_rate_limit_type method
+    def mock_get_rate_limit_type():
+        return "output"
+
+    monkeypatch.setattr(
+        parallel_request_handler, "get_rate_limit_type", mock_get_rate_limit_type
+    )
+
+    # Create a mock response object with usage as a dict missing some fields
+    mock_response = MagicMock()
+    mock_response.usage = {
+        "prompt_tokens": 25,
+        # completion_tokens is missing
+        # total_tokens is missing
+    }
+    from litellm.types.utils import BaseLiteLLMOpenAIResponseObject
+    mock_response.__class__ = type('MockResponse', (BaseLiteLLMOpenAIResponseObject,), {})
+
+    # Create mock kwargs for the success event
+    mock_kwargs = {
+        "standard_logging_object": {
+            "metadata": {
+                "user_api_key_hash": _api_key,
+                "user_api_key_user_id": None,
+                "user_api_key_team_id": None,
+                "user_api_key_end_user_id": None,
+            }
+        },
+        "model": "gpt-3.5-turbo",
+    }
+
+    # Mock the pipeline increment method to capture the operations
+    captured_operations = []
+
+    async def mock_increment_pipeline(increment_list, **kwargs):
+        captured_operations.extend(increment_list)
+        return True
+
+    monkeypatch.setattr(
+        parallel_request_handler.internal_usage_cache.dual_cache,
+        "async_increment_cache_pipeline",
+        mock_increment_pipeline,
+    )
+
+    # Call the success event handler - should not raise exception
+    await parallel_request_handler.async_log_success_event(
+        kwargs=mock_kwargs,
+        response_obj=mock_response,
+        start_time=datetime.now(),
+        end_time=datetime.now(),
+    )
+
+    # Find the TPM increment operation
+    tpm_operation = None
+    for op in captured_operations:
+        if op["key"].endswith(":tokens"):
+            tpm_operation = op
+            break
+
+    assert tpm_operation is not None, "Should have a TPM increment operation"
+    # Should default to 0 when field is missing
+    assert tpm_operation["increment_value"] == 0, "Should default to 0 when completion_tokens is missing"
+
+
 @pytest.mark.asyncio
 async def test_execute_token_increment_script_cluster_compatibility():
     """
