Skip to content

Commit 97fa1c6

Browse files
committed
update of utils module
1 parent d2a98c0 commit 97fa1c6

File tree

11 files changed

+191
-164
lines changed

11 files changed

+191
-164
lines changed

src/apify_client/_utils.py

Lines changed: 134 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,11 @@
22

33
import asyncio
44
import base64
5-
import contextlib
65
import io
76
import json
8-
import json as jsonlib
97
import random
108
import re
119
import time
12-
from collections.abc import Callable
13-
from datetime import datetime, timezone
1410
from enum import Enum
1511
from http import HTTPStatus
1612
from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -20,141 +16,119 @@
2016
from apify_client.errors import InvalidResponseBodyError
2117

2218
if TYPE_CHECKING:
23-
from collections.abc import Awaitable
19+
from collections.abc import Awaitable, Callable
2420

2521
from impit import Response
2622

2723
from apify_client.errors import ApifyApiError
2824

29-
PARSE_DATE_FIELDS_MAX_DEPTH = 3
30-
PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
31-
RECORD_NOT_FOUND_EXCEPTION_TYPES = ['record-not-found', 'record-or-token-not-found']
32-
3325
T = TypeVar('T')
34-
StopRetryingType = Callable[[], None]
35-
ListOrDict = TypeVar('ListOrDict', list, dict)
36-
3726

38-
def filter_out_none_values_recursively(dictionary: dict) -> dict:
39-
"""Return copy of the dictionary, recursively omitting all keys for which values are None."""
40-
return cast('dict', filter_out_none_values_recursively_internal(dictionary))
4127

42-
43-
def filter_out_none_values_recursively_internal(
28+
def filter_out_none_values_recursively(
    dictionary: dict,
    *,
    remove_empty_dicts: bool | None = None,
) -> dict:
    """Return a copy of the dictionary with all None values recursively removed.

    Only nested dictionaries are descended into; lists and other containers are copied by reference
    and their contents are not filtered. The input dictionary is never mutated.

    Args:
        dictionary: The dictionary to filter.
        remove_empty_dicts: Controls what happens to nested dictionaries that end up empty after
            filtering. `None` (the default) and `True` drop them, which matches the behavior of this
            helper before it was merged with its internal counterpart. `False` keeps them as `{}`.

    Returns:
        A new dictionary without None values. The top-level result is always a dictionary, even when
        everything was filtered out.
    """
    # Same tri-state rule the pre-refactor helper applied when recursing: only an explicit `False`
    # keeps nested empty dictionaries.
    prune_nested = remove_empty_dicts is True or remove_empty_dicts is None

    def _filter(source: dict) -> dict:
        result = {}
        for key, val in source.items():
            if val is None:
                continue
            if isinstance(val, dict):
                filtered = _filter(val)
                if filtered or not prune_nested:
                    result[key] = filtered
            else:
                result[key] = val
        return result

    # The top level is returned as-is (possibly empty) so the `-> dict` contract always holds.
    return _filter(dictionary)
12055

12156

12257
def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
    """Extract the value from an Enum member, or return the input unchanged if not an Enum.

    Args:
        maybe_enum_member: Either an `Enum` member or any other value.

    Returns:
        The member's `.value` when given an `Enum` member, otherwise the argument itself.
    """
    if isinstance(maybe_enum_member, Enum):
        return maybe_enum_member.value
    return maybe_enum_member
12762

12863

12964
def to_safe_id(id: str) -> str:
    """Convert a resource ID to URL-safe format by replacing `/` with `~`.

    Resource identifiers are either `resource_id` or `username/resource_id`. Because `/` has a special
    meaning in URL paths, it is replaced with `~` so the API can parse the route; after parsing the URL
    the API replaces it back with `/`.

    Args:
        id: The resource identifier (format: `resource_id` or `username/resource_id`).

    Returns:
        The resource identifier with `/` replaced by `~`.
    """
    return id.replace('/', '~')
13474

13575

13676
def pluck_data(parsed_response: Any) -> dict:
    """Extract the "data" field from an API response.

    Args:
        parsed_response: The parsed API response.

    Returns:
        The value of the "data" field. It is only cast for the type checker; no runtime check
        verifies that it is actually a dictionary.

    Raises:
        ValueError: If the response is not a dictionary or has no "data" key.
    """
    if isinstance(parsed_response, dict) and 'data' in parsed_response:
        return cast('dict', parsed_response['data'])

    raise ValueError('The "data" property is missing in the response.')
14192

14293

14394
def pluck_data_as_list(parsed_response: Any) -> list:
    """Extract the "data" field from an API response as a list.

    Args:
        parsed_response: The parsed API response.

    Returns:
        The value of the "data" field. It is only cast for the type checker; no runtime check
        verifies that it is actually a list.

    Raises:
        ValueError: If the response is not a dictionary or has no "data" key.
    """
    if isinstance(parsed_response, dict) and 'data' in parsed_response:
        return cast('list', parsed_response['data'])

    raise ValueError('The "data" property is missing in the response.')
148110

149111

150112
def retry_with_exp_backoff(
151-
func: Callable[[StopRetryingType, int], T],
113+
func: Callable[[Callable[[], None], int], T],
152114
*,
153115
max_retries: int = 8,
154116
backoff_base_millis: int = 500,
155117
backoff_factor: float = 2,
156118
random_factor: float = 1,
157119
) -> T:
120+
"""Retry a function with exponential backoff.
121+
122+
Args:
123+
func: Function to retry. Receives a stop_retrying callback and attempt number.
124+
max_retries: Maximum number of retry attempts.
125+
backoff_base_millis: Base backoff delay in milliseconds.
126+
backoff_factor: Exponential backoff multiplier (1-10).
127+
random_factor: Random jitter factor (0-1).
128+
129+
Returns:
130+
The return value of the function.
131+
"""
158132
random_factor = min(max(0, random_factor), 1)
159133
backoff_factor = min(max(1, backoff_factor), 10)
160134
swallow = True
@@ -181,13 +155,25 @@ def stop_retrying() -> None:
181155

182156

183157
async def retry_with_exp_backoff_async(
184-
async_func: Callable[[StopRetryingType, int], Awaitable[T]],
158+
async_func: Callable[[Callable[[], None], int], Awaitable[T]],
185159
*,
186160
max_retries: int = 8,
187161
backoff_base_millis: int = 500,
188162
backoff_factor: float = 2,
189163
random_factor: float = 1,
190164
) -> T:
165+
"""Retry an async function with exponential backoff.
166+
167+
Args:
168+
async_func: Async function to retry. Receives a stop_retrying callback and attempt number.
169+
max_retries: Maximum number of retry attempts.
170+
backoff_base_millis: Base backoff delay in milliseconds.
171+
backoff_factor: Exponential backoff multiplier (1-10).
172+
random_factor: Random jitter factor (0-1).
173+
174+
Returns:
175+
The return value of the async function.
176+
"""
191177
random_factor = min(max(0, random_factor), 1)
192178
backoff_factor = min(max(1, backoff_factor), 10)
193179
swallow = True
@@ -214,14 +200,29 @@ def stop_retrying() -> None:
214200

215201

216202
def catch_not_found_or_throw(exc: ApifyApiError) -> None:
    """Suppress "record not found" API errors, re-raise all other exceptions.

    An error is suppressed only when BOTH conditions hold: its HTTP status is 404 Not Found and its
    `type` is `record-not-found` or `record-or-token-not-found`. A 404 with any other type is re-raised.

    Args:
        exc: The API error to check.

    Raises:
        ApifyApiError: The given error itself, if it is not a record-not-found 404.
    """
    is_not_found_status = exc.status_code == HTTPStatus.NOT_FOUND
    is_not_found_type = exc.type in ('record-not-found', 'record-or-token-not-found')
    if not (is_not_found_status and is_not_found_type):
        raise exc
221215

222216

223217
def encode_webhook_list_to_base64(webhooks: list[dict]) -> str:
224-
"""Encode a list of dictionaries representing webhooks to their base64-encoded representation for the API."""
218+
"""Encode a list of webhook dictionaries to base64 for API transmission.
219+
220+
Args:
221+
webhooks: List of webhook dictionaries with keys like "event_types", "request_url", etc.
222+
223+
Returns:
224+
Base64-encoded JSON string.
225+
"""
225226
data = []
226227
for webhook in webhooks:
227228
webhook_representation = {
@@ -234,25 +235,49 @@ def encode_webhook_list_to_base64(webhooks: list[dict]) -> str:
234235
webhook_representation['headersTemplate'] = webhook['headers_template']
235236
data.append(webhook_representation)
236237

237-
return base64.b64encode(jsonlib.dumps(data).encode('utf-8')).decode('ascii')
238+
return base64.b64encode(json.dumps(data).encode('utf-8')).decode('ascii')
238239

239240

240241
def encode_key_value_store_record_value(value: Any, content_type: str | None = None) -> tuple[Any, str]:
    """Encode a value for storage in a key-value store record.

    Args:
        value: The value to encode (can be dict, str, bytes, or file-like object).
        content_type: The content type. If None or empty, it's inferred from the value type:
            bytes, bytearray and `io.IOBase` objects map to `application/octet-stream`, strings to
            `text/plain; charset=utf-8`, and everything else to `application/json; charset=utf-8`.

    Returns:
        A tuple of (encoded_value, content_type). The value is serialized to UTF-8 encoded JSON bytes
        only when the content type contains `application/json` and the value is neither a string nor
        binary/file-like; otherwise it is returned unchanged.

    Raises:
        ValueError: If the value is serialized to JSON and contains NaN or infinite floats
            (`allow_nan=False`).
    """
    # Evaluated once and reused; the IOBase check is a pragmatic stand-in for file-like duck typing.
    is_binary_or_file = isinstance(value, (bytes, bytearray, io.IOBase))
    is_text = isinstance(value, str)

    if not content_type:
        if is_binary_or_file:
            content_type = 'application/octet-stream'
        elif is_text:
            content_type = 'text/plain; charset=utf-8'
        else:
            content_type = 'application/json; charset=utf-8'

    if 'application/json' in content_type and not is_binary_or_file and not is_text:
        # `default=str` stringifies objects json cannot serialize natively instead of raising.
        value = json.dumps(value, ensure_ascii=False, indent=2, allow_nan=False, default=str).encode('utf-8')

    return (value, content_type)
253267

254268

255269
def maybe_parse_response(response: Response) -> Any:
270+
"""Parse an HTTP response based on its content type.
271+
272+
Args:
273+
response: The HTTP response to parse.
274+
275+
Returns:
276+
Parsed response data (JSON dict/list, text string, or raw bytes).
277+
278+
Raises:
279+
InvalidResponseBodyError: If the response body cannot be parsed.
280+
"""
256281
if response.status_code == HTTPStatus.NO_CONTENT:
257282
return None
258283

@@ -261,9 +286,11 @@ def maybe_parse_response(response: Response) -> Any:
261286
content_type = response.headers['content-type'].split(';')[0].strip()
262287

263288
try:
264-
if is_content_type_json(content_type):
289+
if re.search(r'^application/json', content_type, flags=re.IGNORECASE):
265290
response_data = response.json()
266-
elif is_content_type_xml(content_type) or is_content_type_text(content_type):
291+
elif re.search(r'^application/.*xml$', content_type, flags=re.IGNORECASE) or re.search(
292+
r'^text/', content_type, flags=re.IGNORECASE
293+
):
267294
response_data = response.text
268295
else:
269296
response_data = response.content
@@ -274,7 +301,14 @@ def maybe_parse_response(response: Response) -> Any:
274301

275302

276303
def is_retryable_error(exc: Exception) -> bool:
277-
"""Check if the given error is retryable."""
304+
"""Check if an exception should be retried.
305+
306+
Args:
307+
exc: The exception to check.
308+
309+
Returns:
310+
True if the exception is retryable (network errors, timeouts, etc.).
311+
"""
278312
return isinstance(
279313
exc,
280314
(

0 commit comments

Comments
 (0)