diff --git a/api/core/rag/extractor/watercrawl/exceptions.py b/api/core/rag/extractor/watercrawl/exceptions.py index fc457697a2221b..e786324233d7ca 100644 --- a/api/core/rag/extractor/watercrawl/exceptions.py +++ b/api/core/rag/extractor/watercrawl/exceptions.py @@ -10,8 +10,13 @@ class WaterCrawlBadRequestError(WaterCrawlError): def __init__(self, response): self.status_code = response.status_code self.response = response - data = response.json() - self.message = data.get("message", "Unknown error occurred") + try: + data = response.json() + except (ValueError, json.JSONDecodeError): + data = {} + self.message = data.get("message") or ( + response.text if hasattr(response, "text") and response.text else "Unknown error occurred" + ) self.errors = data.get("errors", {}) super().__init__(self.message) diff --git a/api/services/auth/jina/jina.py b/api/services/auth/jina/jina.py index 6260b6b48adcce..6a7aa41c926321 100644 --- a/api/services/auth/jina/jina.py +++ b/api/services/auth/jina/jina.py @@ -6,6 +6,8 @@ from core.helper.http_client_pooling import get_pooled_http_client from services.auth.api_key_auth_base import ApiKeyAuthBase, AuthCredentials +_CREDENTIAL_VALIDATION_TIMEOUT = httpx.Timeout(10.0, connect=3.0) + _http_client: httpx.Client = get_pooled_http_client( "auth:jina", lambda: httpx.Client(limits=httpx.Limits(max_keepalive_connections=50, max_connections=100)), @@ -39,7 +41,7 @@ def _prepare_headers(self): return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} def _post_request(self, url, data, headers): - return _http_client.post(url, headers=headers, json=data) + return _http_client.post(url, headers=headers, json=data, timeout=_CREDENTIAL_VALIDATION_TIMEOUT) def _handle_error(self, response): if response.status_code in {402, 409, 500}: diff --git a/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py b/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py index 35e581ccc15e78..a80d2b570ddb46 100644 --- a/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py +++ b/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py @@ -57,6 +57,48 @@ def test_permission_and_authentication_error_strings(self): assert "exceeding your WaterCrawl API limits" in str(permission) assert "API key is invalid or expired" in str(authentication) + def test_bad_request_error_with_non_json_body_falls_back_to_text(self): + """When the response body is not valid JSON, the exception must + fall back to ``response.text`` instead of leaking JSONDecodeError.""" + response = _response(400, text="Internal Server Error") + response.json.side_effect = ValueError("Not JSON") + + err = WaterCrawlBadRequestError(response) + + assert err.message == "Internal Server Error" + assert err.errors == {} + assert "Internal Server Error" in str(err) + + def test_bad_request_error_with_non_json_empty_body(self): + """When the response body is not JSON and text is empty, + a generic message should be used.""" + response = _response(400, text="") + response.json.side_effect = ValueError("Not JSON") + + err = WaterCrawlBadRequestError(response) + + assert err.message == "Unknown error occurred" + assert err.errors == {} + + def test_permission_error_with_non_json_body(self): + """WaterCrawlPermissionError must not raise JSONDecodeError on + non-JSON error responses.""" + response = _response(403, text="Rate limit exceeded") + response.json.side_effect = ValueError("Not JSON") + + err = WaterCrawlPermissionError(response) + assert "exceeding your WaterCrawl API limits" in str(err) + assert "Rate limit exceeded" in str(err) + + def test_authentication_error_with_non_json_body(self): + """WaterCrawlAuthenticationError must not raise JSONDecodeError on + non-JSON error responses.""" + response = _response(401, text="Unauthorized") + response.json.side_effect = ValueError("Not JSON") + + err = WaterCrawlAuthenticationError(response) + assert "API key is invalid or expired" in str(err) + class TestBaseAPIClient: def test_init_session_builds_expected_headers(self, monkeypatch: pytest.MonkeyPatch): diff --git a/api/tests/unit_tests/services/auth/test_jina_auth.py b/api/tests/unit_tests/services/auth/test_jina_auth.py index eb409c61d4e295..df4044e3ef1069 100644 --- a/api/tests/unit_tests/services/auth/test_jina_auth.py +++ b/api/tests/unit_tests/services/auth/test_jina_auth.py @@ -3,7 +3,7 @@ import httpx import pytest -from services.auth.jina.jina import JinaAuth +from services.auth.jina.jina import JinaAuth, _CREDENTIAL_VALIDATION_TIMEOUT class TestJinaAuth: @@ -51,6 +51,7 @@ def test_should_validate_valid_credentials_successfully(self, mock_post): "https://r.jina.ai", headers={"Content-Type": "application/json", "Authorization": "Bearer test_api_key_123"}, json={"url": "https://example.com"}, + timeout=_CREDENTIAL_VALIDATION_TIMEOUT, ) @patch("services.auth.jina.jina._http_client.post", autospec=True) @@ -185,3 +186,21 @@ def test_should_not_expose_api_key_in_error_messages(self): with pytest.raises(ValueError) as exc_info: JinaAuth({"auth_type": "basic", "config": {"api_key": "super_secret_key_12345"}}) assert "super_secret_key_12345" not in str(exc_info.value) + + @patch("services.auth.jina.jina._http_client.post", autospec=True) + def test_should_pass_bounded_timeout_to_credential_validation(self, mock_post): + """Credential validation must not use the default unbounded timeout.""" + mock_response = MagicMock() + mock_response.status_code = 200 + mock_post.return_value = mock_response + + credentials = {"auth_type": "bearer", "config": {"api_key": "test_api_key_123"}} + auth = JinaAuth(credentials) + auth.validate_credentials() + + call_kwargs = mock_post.call_args + assert "timeout" in call_kwargs.kwargs, "timeout keyword is missing from _post_request" + timeout = call_kwargs.kwargs["timeout"] + assert isinstance(timeout, httpx.Timeout) + assert timeout.connect == _CREDENTIAL_VALIDATION_TIMEOUT.connect + assert timeout.read == _CREDENTIAL_VALIDATION_TIMEOUT.read