
Commit b83bc10

Merge pull request #16862 from xianzongxie-stripe/add_polling_via_cache_feature
Add polling via cache feature
2 parents 321ffd7 + 7c9b70b commit b83bc10

File tree

6 files changed: +2131 additions, -4 deletions

litellm/proxy/proxy_server.py

Lines changed: 11 additions & 0 deletions
@@ -1121,6 +1121,8 @@ def swagger_monkey_patch(*args, **kwargs):
 redis_usage_cache: Optional[RedisCache] = (
     None  # redis cache used for tracking spend, tpm/rpm limits
 )
+polling_via_cache_enabled: Union[Literal["all"], List[str], bool] = False
+polling_cache_ttl: int = 3600  # Default 1 hour TTL for polling cache
 user_custom_auth = None
 user_custom_key_generate = None
 user_custom_sso = None
@@ -2351,6 +2353,15 @@ async def load_config( # noqa: PLR0915
     # this is set in the cache branch
     # see usage here: https://docs.litellm.ai/docs/proxy/caching
     pass
+elif key == "responses":
+    # Initialize global polling via cache settings
+    global polling_via_cache_enabled, polling_cache_ttl
+    background_mode = value.get("background_mode", {})
+    polling_via_cache_enabled = background_mode.get("polling_via_cache", False)
+    polling_cache_ttl = background_mode.get("ttl", 3600)
+    verbose_proxy_logger.debug(
+        f"{blue_color_code} Initialized polling via cache: enabled={polling_via_cache_enabled}, ttl={polling_cache_ttl}{reset_color_code}"
+    )
 elif key == "default_team_settings":
     for idx, team_setting in enumerate(
         value
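
The new "responses" branch above reads its settings from the proxy config under responses.background_mode. A minimal sketch of the shape it expects, mirroring the .get() calls in the diff; the concrete values here are illustrative only, not taken from this commit:

```python
# Hypothetical config value for the "responses" key, shaped after the .get() calls above.
config_value = {
    "background_mode": {
        "polling_via_cache": True,  # may also be "all" or a list of model names, per the Union type above
        "ttl": 1800,  # seconds to keep polling state in the Redis cache
    }
}

# Same parsing the new load_config branch performs:
background_mode = config_value.get("background_mode", {})
polling_via_cache_enabled = background_mode.get("polling_via_cache", False)
polling_cache_ttl = background_mode.get("ttl", 3600)
print(polling_via_cache_enabled, polling_cache_ttl)  # -> True 1800
```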

litellm/proxy/response_api_endpoints/endpoints.py

Lines changed: 219 additions & 4 deletions
@@ -1,8 +1,12 @@
-from fastapi import APIRouter, Depends, Request, Response
+import asyncio
 
+from fastapi import APIRouter, Depends, HTTPException, Request, Response
+
+from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import UserAPIKeyAuth, user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
+from litellm.types.responses.main import DeleteResponseResult
 
 router = APIRouter()
 
@@ -30,22 +34,40 @@ async def responses_api(
     """
     Follows the OpenAI Responses API spec: https://platform.openai.com/docs/api-reference/responses
 
+    Supports background mode with polling_via_cache for partial response retrieval.
+    When background=true and polling_via_cache is enabled, returns a polling_id immediately
+    and streams the response in the background, updating Redis cache.
+
     ```bash
+    # Normal request
     curl -X POST http://localhost:4000/v1/responses \
     -H "Content-Type: application/json" \
     -H "Authorization: Bearer sk-1234" \
     -d '{
         "model": "gpt-4o",
         "input": "Tell me about AI"
     }'
+
+    # Background request with polling
+    curl -X POST http://localhost:4000/v1/responses \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer sk-1234" \
+    -d '{
+        "model": "gpt-4o",
+        "input": "Tell me about AI",
+        "background": true
+    }'
     ```
     """
     from litellm.proxy.proxy_server import (
         _read_request_body,
         general_settings,
         llm_router,
+        polling_cache_ttl,
+        polling_via_cache_enabled,
         proxy_config,
         proxy_logging_obj,
+        redis_usage_cache,
         select_data_generator,
         user_api_base,
         user_max_tokens,
@@ -56,6 +78,74 @@ async def responses_api(
     )
 
     data = await _read_request_body(request=request)
+
+    # Check if polling via cache should be used for this request
+    from litellm.proxy.response_polling.polling_handler import should_use_polling_for_request
+
+    should_use_polling = should_use_polling_for_request(
+        background_mode=data.get("background", False),
+        polling_via_cache_enabled=polling_via_cache_enabled,
+        redis_cache=redis_usage_cache,
+        model=data.get("model", ""),
+        llm_router=llm_router,
+    )
+
+    # If polling is enabled, use polling mode
+    if should_use_polling:
+        from litellm.proxy.response_polling.polling_handler import (
+            ResponsePollingHandler,
+        )
+        from litellm.proxy.response_polling.background_streaming import (
+            background_streaming_task,
+        )
+
+        verbose_proxy_logger.info(
+            f"Starting background response with polling for model={data.get('model')}"
+        )
+
+        # Initialize polling handler with configured TTL (from global config)
+        polling_handler = ResponsePollingHandler(
+            redis_cache=redis_usage_cache,
+            ttl=polling_cache_ttl  # Global var set at startup
+        )
+
+        # Generate polling ID
+        polling_id = ResponsePollingHandler.generate_polling_id()
+
+        # Create initial state in Redis
+        initial_state = await polling_handler.create_initial_state(
+            polling_id=polling_id,
+            request_data=data,
+        )
+
+        # Start background task to stream and update cache
+        asyncio.create_task(
+            background_streaming_task(
+                polling_id=polling_id,
+                data=data.copy(),
+                polling_handler=polling_handler,
+                request=request,
+                fastapi_response=fastapi_response,
+                user_api_key_dict=user_api_key_dict,
+                general_settings=general_settings,
+                llm_router=llm_router,
+                proxy_config=proxy_config,
+                proxy_logging_obj=proxy_logging_obj,
+                select_data_generator=select_data_generator,
+                user_model=user_model,
+                user_temperature=user_temperature,
+                user_request_timeout=user_request_timeout,
+                user_max_tokens=user_max_tokens,
+                user_api_base=user_api_base,
+                version=version,
+            )
+        )
+
+        # Return OpenAI Response object format (initial state)
+        # https://platform.openai.com/docs/api-reference/responses/object
+        return initial_state
+
+    # Normal response flow
     processor = ProxyBaseLLMRequestProcessing(data=data)
     try:
         return await processor.base_process_llm_request(
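
Taken together, the branch above turns a background request into an immediately returned polling object. A minimal client-side sketch of the round trip, assuming the same local proxy and test key as the docstring examples; the "id" and "status" fields follow the OpenAI Response object format and are assumptions here, not taken from this diff:

```python
import time

import requests

BASE = "http://localhost:4000/v1/responses"
HEADERS = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

# Submit a background request; with polling_via_cache enabled the proxy returns
# the initial cached state immediately instead of waiting for the provider.
initial = requests.post(
    BASE,
    headers=HEADERS,
    json={"model": "gpt-4o", "input": "Tell me about AI", "background": True},
).json()
polling_id = initial["id"]  # expected to look like litellm_poll_<...>

# Poll the cached state until the background stream finishes, is cancelled, or the TTL expires.
state = initial
while state.get("status") in ("queued", "in_progress"):
    time.sleep(2)
    state = requests.get(f"{BASE}/{polling_id}", headers=HEADERS).json()

print(state.get("status"), state.get("output"))
```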
@@ -253,9 +343,18 @@ async def get_response(
     """
     Get a response by ID.
 
+    Supports both:
+    - Polling IDs (litellm_poll_*): Returns cumulative cached content from background responses
+    - Provider response IDs: Passes through to provider API
+
     Follows the OpenAI Responses API spec: https://platform.openai.com/docs/api-reference/responses/get
 
     ```bash
+    # Get polling response
+    curl -X GET http://localhost:4000/v1/responses/litellm_poll_abc123 \
+    -H "Authorization: Bearer sk-1234"
+
+    # Get provider response
     curl -X GET http://localhost:4000/v1/responses/resp_abc123 \
     -H "Authorization: Bearer sk-1234"
     ```
@@ -266,6 +365,7 @@ async def get_response(
         llm_router,
         proxy_config,
         proxy_logging_obj,
+        redis_usage_cache,
         select_data_generator,
         user_api_base,
         user_max_tokens,
@@ -274,7 +374,33 @@
         user_temperature,
         version,
     )
-
+    from litellm.proxy.response_polling.polling_handler import ResponsePollingHandler
+
+    # Check if this is a polling ID
+    if ResponsePollingHandler.is_polling_id(response_id):
+        # Handle polling response
+        if not redis_usage_cache:
+            raise HTTPException(
+                status_code=500,
+                detail="Redis cache not configured. Polling requires Redis."
+            )
+
+        polling_handler = ResponsePollingHandler(redis_cache=redis_usage_cache)
+
+        # Get current state from cache
+        state = await polling_handler.get_state(response_id)
+
+        if not state:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Polling response {response_id} not found or expired"
+            )
+
+        # Return the whole state directly (OpenAI Response object format)
+        # https://platform.openai.com/docs/api-reference/responses/object
+        return state
+
+    # Normal provider response flow
     data = await _read_request_body(request=request)
     data["response_id"] = response_id
     processor = ProxyBaseLLMRequestProcessing(data=data)
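
For a polling ID, the branch above returns whatever state the background task has written so far, in the OpenAI Response object format it links to. A rough illustration of such a payload; the exact fields the handler stores are not visible in this diff and should be treated as assumptions:

```python
# Illustrative only; assumes the cached state mirrors the OpenAI Response object.
cached_state = {
    "id": "litellm_poll_abc123",
    "object": "response",
    "status": "in_progress",  # later "completed", or "cancelled" after the /cancel endpoint below
    "model": "gpt-4o",
    "output": [
        {
            "type": "message",
            "role": "assistant",
            "content": [
                {"type": "output_text", "text": "AI is a broad field of computer science..."}
            ],
        }
    ],
}
```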
@@ -330,6 +456,10 @@ async def delete_response(
     """
     Delete a response by ID.
 
+    Supports both:
+    - Polling IDs (litellm_poll_*): Deletes from Redis cache
+    - Provider response IDs: Passes through to provider API
+
     Follows the OpenAI Responses API spec: https://platform.openai.com/docs/api-reference/responses/delete
 
     ```bash
@@ -343,6 +473,7 @@
         llm_router,
         proxy_config,
         proxy_logging_obj,
+        redis_usage_cache,
         select_data_generator,
         user_api_base,
         user_max_tokens,
@@ -351,7 +482,44 @@
         user_temperature,
         version,
     )
-
+    from litellm.proxy.response_polling.polling_handler import ResponsePollingHandler
+
+    # Check if this is a polling ID
+    if ResponsePollingHandler.is_polling_id(response_id):
+        # Handle polling response deletion
+        if not redis_usage_cache:
+            raise HTTPException(
+                status_code=500,
+                detail="Redis cache not configured."
+            )
+
+        polling_handler = ResponsePollingHandler(redis_cache=redis_usage_cache)
+
+        # Get state to verify access
+        state = await polling_handler.get_state(response_id)
+
+        if not state:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Polling response {response_id} not found"
+            )
+
+        # Delete from cache
+        success = await polling_handler.delete_polling(response_id)
+
+        if success:
+            return DeleteResponseResult(
+                id=response_id,
+                object="response",
+                deleted=True
+            )
+        else:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to delete polling response"
+            )
+
+    # Normal provider response flow
     data = await _read_request_body(request=request)
     data["response_id"] = response_id
     processor = ProxyBaseLLMRequestProcessing(data=data)
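
Deleting a polling ID never reaches the provider; the branch above drops the Redis entry and answers with a DeleteResponseResult. A minimal client-side sketch, assuming the same local proxy and test key as the docstring examples:

```python
import requests

# Assumes the proxy from the examples above (http://localhost:4000, key sk-1234).
resp = requests.delete(
    "http://localhost:4000/v1/responses/litellm_poll_abc123",
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json())
# Expected shape, per the DeleteResponseResult constructed above:
# {"id": "litellm_poll_abc123", "object": "response", "deleted": true}
```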
@@ -475,9 +643,18 @@ async def cancel_response(
     """
     Cancel a response by ID.
 
+    Supports both:
+    - Polling IDs (litellm_poll_*): Cancels background response and updates status in Redis
+    - Provider response IDs: Passes through to provider API
+
     Follows the OpenAI Responses API spec: https://platform.openai.com/docs/api-reference/responses/cancel
 
     ```bash
+    # Cancel polling response
+    curl -X POST http://localhost:4000/v1/responses/litellm_poll_abc123/cancel \
+    -H "Authorization: Bearer sk-1234"
+
+    # Cancel provider response
     curl -X POST http://localhost:4000/v1/responses/resp_abc123/cancel \
     -H "Authorization: Bearer sk-1234"
     ```
@@ -488,6 +665,7 @@
         llm_router,
         proxy_config,
         proxy_logging_obj,
+        redis_usage_cache,
         select_data_generator,
         user_api_base,
         user_max_tokens,
@@ -496,7 +674,44 @@
         user_temperature,
         version,
     )
-
+    from litellm.proxy.response_polling.polling_handler import ResponsePollingHandler
+
+    # Check if this is a polling ID
+    if ResponsePollingHandler.is_polling_id(response_id):
+        # Handle polling response cancellation
+        if not redis_usage_cache:
+            raise HTTPException(
+                status_code=500,
+                detail="Redis cache not configured."
+            )
+
+        polling_handler = ResponsePollingHandler(redis_cache=redis_usage_cache)
+
+        # Get current state to verify it exists
+        state = await polling_handler.get_state(response_id)
+
+        if not state:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Polling response {response_id} not found"
+            )
+
+        # Cancel the polling response (sets status to "cancelled")
+        success = await polling_handler.cancel_polling(response_id)
+
+        if success:
+            # Fetch the updated state with cancelled status
+            updated_state = await polling_handler.get_state(response_id)
+
+            # Return the whole state directly (now with status="cancelled")
+            return updated_state
+        else:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to cancel polling response"
+            )
+
+    # Normal provider response flow
     data = await _read_request_body(request=request)
     data["response_id"] = response_id
     processor = ProxyBaseLLMRequestProcessing(data=data)
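
The endpoints above lean on a handful of ResponsePollingHandler methods defined in litellm/proxy/response_polling/polling_handler.py, which is part of this PR but not shown in this excerpt. A stub of the interface as inferred purely from the call sites above; the signatures, return types, and the litellm_poll_ prefix constant are assumptions:

```python
# Inferred interface sketch; only the method names and call-site arguments appear in the diff above.
from typing import Any, Dict, Optional


class ResponsePollingHandler:
    POLLING_ID_PREFIX = "litellm_poll_"  # assumption, based on the litellm_poll_* IDs in the docstrings

    def __init__(self, redis_cache: Any, ttl: int = 3600) -> None:
        self.redis_cache = redis_cache
        self.ttl = ttl

    @staticmethod
    def generate_polling_id() -> str:
        """Mint a new litellm_poll_* identifier for a background response."""
        ...

    @staticmethod
    def is_polling_id(response_id: str) -> bool:
        """True when the ID belongs to the polling cache rather than a provider."""
        ...

    async def create_initial_state(self, polling_id: str, request_data: dict) -> Dict[str, Any]:
        """Write the initial Response-object-shaped state to Redis and return it."""
        ...

    async def get_state(self, polling_id: str) -> Optional[Dict[str, Any]]:
        """Read the cumulative cached state; None when missing or expired."""
        ...

    async def delete_polling(self, polling_id: str) -> bool:
        """Remove the cached state; True on success."""
        ...

    async def cancel_polling(self, polling_id: str) -> bool:
        """Mark the cached state as cancelled; True on success."""
        ...
```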
litellm/proxy/response_polling/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+"""
+Response Polling Module for Background Responses with Cache
+"""
+from litellm.proxy.response_polling.background_streaming import (
+    background_streaming_task,
+)
+from litellm.proxy.response_polling.polling_handler import (
+    ResponsePollingHandler,
+    should_use_polling_for_request,
+)
+
+__all__ = [
+    "ResponsePollingHandler",
+    "background_streaming_task",
+    "should_use_polling_for_request",
+]
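
should_use_polling_for_request, exported above, gates the whole feature. Its implementation is likewise not shown in this excerpt; the sketch below only reflects what the call site in responses_api and the Union[Literal["all"], List[str], bool] config type imply, and may differ from the shipped helper:

```python
# Hypothetical sketch of the gating logic, based only on the call-site arguments above.
from typing import Any, List, Literal, Optional, Union


def should_use_polling_for_request(
    background_mode: bool,
    polling_via_cache_enabled: Union[Literal["all"], List[str], bool],
    redis_cache: Optional[Any],
    model: str,
    llm_router: Optional[Any] = None,  # passed at the call site; unused in this sketch
) -> bool:
    # Polling only applies to background requests and needs Redis for the shared state.
    if not background_mode or redis_cache is None:
        return False
    # Feature flag: True / "all" enable it everywhere, a list scopes it to specific models.
    if polling_via_cache_enabled is True or polling_via_cache_enabled == "all":
        return True
    if isinstance(polling_via_cache_enabled, list):
        return model in polling_via_cache_enabled
    return False
```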
