Skip to content

Commit e686e70

Browse files
authored
fix: Retry on 404 in run details to handle read replica lag (#440)
1 parent 36c6b0c commit e686e70

File tree

2 files changed: +95 additions, -11 deletions

src/aignostics/platform/resources/runs.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import Any, cast
1414

1515
from aignx.codegen.api.public_api import PublicApi
16+
from aignx.codegen.exceptions import NotFoundException
1617
from aignx.codegen.exceptions import ServiceException
1718
from aignx.codegen.models import (
1819
CustomMetadataUpdateRequest,
@@ -42,6 +43,7 @@
4243
Retrying,
4344
retry_if_exception_type,
4445
stop_after_attempt,
46+
stop_after_delay,
4547
wait_exponential_jitter,
4648
)
4749
from urllib3.exceptions import IncompleteRead, PoolError, ProtocolError, ProxyError
@@ -137,7 +139,8 @@ def for_run_id(cls, run_id: str, cache_token: bool = True) -> "Run":
137139
def details(self, nocache: bool = False, hide_platform_queue_position: bool = False) -> RunData:
138140
"""Retrieves the current status of the application run.
139141
140-
Retries on network and server errors.
142+
Retries on network and server errors. Additionally retries on
143+
NotFoundException for up to 5 seconds to handle read replica lag.
141144
142145
Args:
143146
nocache (bool): If True, skip reading from cache and fetch fresh data from the API.
@@ -149,24 +152,37 @@ def details(self, nocache: bool = False, hide_platform_queue_position: bool = Fa
149152
RunData: The run data.
150153
151154
Raises:
155+
NotFoundException: If the run is not found after retries.
152156
Exception: If the API request fails.
153157
"""
154158

155159
@cached_operation(ttl=settings().run_cache_ttl, use_token=True)
156160
def details_with_retry(run_id: str) -> RunData:
161+
def _fetch() -> RunData:
162+
return Retrying(
163+
retry=retry_if_exception_type(exception_types=RETRYABLE_EXCEPTIONS),
164+
stop=stop_after_attempt(settings().run_retry_attempts),
165+
wait=wait_exponential_jitter(
166+
initial=settings().run_retry_wait_min, max=settings().run_retry_wait_max
167+
),
168+
before_sleep=_log_retry_attempt,
169+
reraise=True,
170+
)(
171+
lambda: self._api.get_run_v1_runs_run_id_get(
172+
run_id,
173+
_request_timeout=settings().run_timeout,
174+
_headers={"User-Agent": user_agent()},
175+
)
176+
)
177+
178+
# NOTE(nahua): Outer retry handles only NotFoundException (read replica lag), for up to 5 seconds; the inner retry in _fetch handles RETRYABLE_EXCEPTIONS
157179
return Retrying(
158-
retry=retry_if_exception_type(exception_types=RETRYABLE_EXCEPTIONS),
159-
stop=stop_after_attempt(settings().run_retry_attempts),
160-
wait=wait_exponential_jitter(initial=settings().run_retry_wait_min, max=settings().run_retry_wait_max),
180+
retry=retry_if_exception_type(exception_types=(NotFoundException,)),
181+
stop=stop_after_delay(5),
182+
wait=wait_exponential_jitter(initial=0.5, max=3),
161183
before_sleep=_log_retry_attempt,
162184
reraise=True,
163-
)(
164-
lambda: self._api.get_run_v1_runs_run_id_get(
165-
run_id,
166-
_request_timeout=settings().run_timeout,
167-
_headers={"User-Agent": user_agent()},
168-
)
169-
)
185+
)(_fetch)
170186

171187
run_data: RunData = details_with_retry(self.run_id, nocache=nocache) # type: ignore[call-arg]
172188
if hide_platform_queue_position:

tests/aignostics/platform/resources/runs_test.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,3 +600,71 @@ def test_run_details_can_hide_platform_queue_position(
600600
result = app_run.details(hide_platform_queue_position=hide_platform_queue_position)
601601
assert result.num_preceding_items_org == run_data.num_preceding_items_org
602602
assert result.num_preceding_items_platform == expected_platform_queue_position
603+
604+
605+
@pytest.mark.unit
606+
def test_run_details_retries_on_not_found_then_succeeds(app_run, mock_api) -> None:
607+
"""Test that Run.details retries on NotFoundException and succeeds when the run becomes available.
608+
609+
This verifies the outer retry logic that handles read replica lag by retrying
610+
NotFoundException until the run is found.
611+
612+
Args:
613+
app_run: Run instance with mock API.
614+
mock_api: Mock ExternalsApi instance.
615+
"""
616+
from aignx.codegen.exceptions import NotFoundException
617+
618+
run_data = RunReadResponse.model_construct(run_id="test-run-id")
619+
mock_api.get_run_v1_runs_run_id_get.side_effect = [
620+
NotFoundException(),
621+
NotFoundException(),
622+
run_data,
623+
]
624+
625+
result = app_run.details()
626+
627+
assert result.run_id == "test-run-id"
628+
assert mock_api.get_run_v1_runs_run_id_get.call_count == 3
629+
630+
631+
@pytest.mark.unit
632+
def test_run_details_raises_not_found_after_timeout(app_run, mock_api) -> None:
633+
"""Test that Run.details re-raises NotFoundException after the retry timeout expires.
634+
635+
This verifies that the outer retry gives up after the configured delay and
636+
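surfaces the NotFoundException to the caller. NOTE(review): the outer retry uses a real stop_after_delay(5), so unless a fixture patches sleeping this test waits out the full window — confirm.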
637+
638+
Args:
639+
app_run: Run instance with mock API.
640+
mock_api: Mock ExternalsApi instance.
641+
"""
642+
from aignx.codegen.exceptions import NotFoundException
643+
644+
mock_api.get_run_v1_runs_run_id_get.side_effect = NotFoundException()
645+
646+
with pytest.raises(NotFoundException):
647+
app_run.details()
648+
649+
assert mock_api.get_run_v1_runs_run_id_get.call_count > 1
650+
651+
652+
@pytest.mark.unit
653+
def test_run_details_does_not_retry_other_exceptions(app_run, mock_api) -> None:
654+
"""Test that the outer retry does not catch non-NotFoundException errors.
655+
656+
This verifies that exceptions like ForbiddenException pass straight through
657+
the outer retry without being retried, so the API is called exactly once.
658+
659+
Args:
660+
app_run: Run instance with mock API.
661+
mock_api: Mock ExternalsApi instance.
662+
"""
663+
from aignx.codegen.exceptions import ForbiddenException
664+
665+
mock_api.get_run_v1_runs_run_id_get.side_effect = ForbiddenException()
666+
667+
with pytest.raises(ForbiddenException):
668+
app_run.details()
669+
670+
assert mock_api.get_run_v1_runs_run_id_get.call_count == 1

0 commit comments

Comments
 (0)