From 8e6573326c9d01ef26aaffb64ee7efb4e06dcafa Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 8 Dec 2025 10:15:31 +0100 Subject: [PATCH 1/3] Fix short error summary --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- src/crawlee/crawlers/_basic/_logging_utils.py | 25 ++++++++++++++--- src/crawlee/router.py | 16 ++++++++--- .../crawlers/_basic/test_basic_crawler.py | 27 +++++++++++++++++++ 4 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 5b3e797d41..724e14ddc1 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1134,7 +1134,7 @@ async def _handle_request_retries( request.retry_count += 1 reduced_error = str(error).split('\n')[0] self.log.warning( - f'Retrying request to {context.request.url} due to: {reduced_error}' + f'Retrying request to {context.request.url} due to: {reduced_error}. ' f'{get_one_line_error_summary_if_possible(error)}' ) await self._statistics.error_tracker.add(error=error, context=context) diff --git a/src/crawlee/crawlers/_basic/_logging_utils.py b/src/crawlee/crawlers/_basic/_logging_utils.py index f9a8306458..0cb98a5a48 100644 --- a/src/crawlee/crawlers/_basic/_logging_utils.py +++ b/src/crawlee/crawlers/_basic/_logging_utils.py @@ -2,9 +2,21 @@ import re import traceback +import crawlee.router + def _get_only_innermost_exception(error: BaseException) -> BaseException: - """Get innermost exception by following __cause__ and __context__ attributes of exception.""" + """Get innermost exception by following __cause__ and __context__ attributes of exception. + + If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible. + """ + if type(error) is crawlee.router.UserHandlerTimeoutError: + if error.__cause__: + return error.__cause__ + if error.__context__: + return error.__context__ + return error + if error.__cause__: return _get_only_innermost_exception(error.__cause__) if error.__context__: @@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts( def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]: innermost_error = _get_only_innermost_exception(error) return traceback.format_exception( - type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True + type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False ) def get_one_line_error_summary_if_possible(error: Exception) -> str: if isinstance(error, asyncio.exceptions.TimeoutError): - most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1] + relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error) + most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else '' + elif isinstance(error, crawlee.router.UserHandlerTimeoutError): + # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee + # code and third line the topmost user error + traceback_parts = _get_traceback_parts_for_innermost_exception(error) + relevant_index_from_start = 3 + most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else '' elif 'playwright._impl._errors.Error' in str(error.__class__): # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway # point to deep internals. diff --git a/src/crawlee/router.py b/src/crawlee/router.py index a6278d98e1..169a807bed 100644 --- a/src/crawlee/router.py +++ b/src/crawlee/router.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio from collections.abc import Awaitable, Callable from typing import Generic, TypeVar @@ -95,7 +96,16 @@ async def __call__(self, context: TCrawlingContext) -> None: f'No handler matches label `{context.request.label}` and no default handler is configured' ) - return await self._default_handler(context) + user_defined_handler = self._default_handler + else: + user_defined_handler = self._handlers_by_label[context.request.label] - handler = self._handlers_by_label[context.request.label] - return await handler(context) + try: + return await user_defined_handler(context) + except asyncio.TimeoutError as e: + # Timeout in handler, but not timeout of handler. + raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e + + +class UserHandlerTimeoutError(Exception): + """Raised when a router fails due to user timeout.""" diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index e9606fb10c..6a86614bbd 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -6,8 +6,10 @@ import json import logging import os +import re import sys import time +from asyncio import Future from collections import Counter from dataclasses import dataclass from datetime import timedelta @@ -1565,6 +1567,31 @@ async def handler(context: BasicCrawlingContext) -> None: assert found_timeout_message, 'Expected log message about request handler error was not found.' +async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None: + crawler = BasicCrawler(configure_logging=False) + + @crawler.router.default_handler + async def default_handler(_: BasicCrawlingContext) -> None: + await asyncio.wait_for(Future(), timeout=1) + + # Capture all logs from the 'crawlee' logger at INFO level or higher + with caplog.at_level(logging.INFO, logger='crawlee'): + await crawler.run([Request.from_url('https://a.placeholder.com')]) + + # Check for 1 line summary message + found_timeout_message = False + for record in caplog.records: + if re.match( + r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,' + r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)', + record.message, + ): + found_timeout_message = True + break + + assert found_timeout_message, 'Expected log message about request handler error was not found.' + + async def test_status_message_callback() -> None: """Test that status message callback is called with the correct message.""" status_message_callback = AsyncMock() From 6580a0329b61d76e7aab4f2755274fa8e7595d56 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 8 Dec 2025 11:53:57 +0100 Subject: [PATCH 2/3] Finalize --- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 ++++-- src/crawlee/crawlers/_basic/_logging_utils.py | 7 ++++--- src/crawlee/errors.py | 4 ++++ src/crawlee/router.py | 6 ++---- tests/unit/crawlers/_basic/test_basic_crawler.py | 3 ++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 724e14ddc1..25f887ba4b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -59,6 +59,7 @@ RequestHandlerError, SessionError, UserDefinedErrorHandlerError, + UserHandlerTimeoutError, ) from crawlee.events._types import Event, EventCrawlerStatusData from crawlee.http_clients import ImpitHttpClient @@ -1222,10 +1223,11 @@ def _get_message_from_error(self, error: Exception) -> str: if ( isinstance(error, asyncio.exceptions.TimeoutError) + and traceback_parts and self._request_handler_timeout_text in traceback_parts[-1] - ): + ) or isinstance(error, UserHandlerTimeoutError): used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error) - used_traceback_parts.append(traceback_parts[-1]) + used_traceback_parts.extend(traceback_parts[-1:]) return ''.join(used_traceback_parts).strip('\n') diff --git a/src/crawlee/crawlers/_basic/_logging_utils.py b/src/crawlee/crawlers/_basic/_logging_utils.py index 0cb98a5a48..bc5bf85dcd 100644 --- a/src/crawlee/crawlers/_basic/_logging_utils.py +++ b/src/crawlee/crawlers/_basic/_logging_utils.py @@ -2,6 +2,7 @@ import re import traceback +import crawlee.errors import crawlee.router @@ -10,7 +11,7 @@ def _get_only_innermost_exception(error: BaseException) -> BaseException: If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible. """ - if type(error) is crawlee.router.UserHandlerTimeoutError: + if type(error) is crawlee.errors.UserHandlerTimeoutError: if error.__cause__: return error.__cause__ if error.__context__: @@ -46,7 +47,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str: def reduce_asyncio_timeout_error_to_relevant_traceback_parts( - timeout_error: asyncio.exceptions.TimeoutError, + timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError, ) -> list[str]: innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error) return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts) @@ -63,7 +64,7 @@ def get_one_line_error_summary_if_possible(error: Exception) -> str: if isinstance(error, asyncio.exceptions.TimeoutError): relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error) most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else '' - elif isinstance(error, crawlee.router.UserHandlerTimeoutError): + elif isinstance(error, crawlee.errors.UserHandlerTimeoutError): # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee # code and third line the topmost user error traceback_parts = _get_traceback_parts_for_innermost_exception(error) diff --git a/src/crawlee/errors.py b/src/crawlee/errors.py index 217ce31091..539bcf7711 100644 --- a/src/crawlee/errors.py +++ b/src/crawlee/errors.py @@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception): """Wraps an exception thrown from an user-defined error handler.""" +class UserHandlerTimeoutError(UserDefinedErrorHandlerError): + """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out.""" + + @docs_group('Errors') class SessionError(Exception): """Errors of `SessionError` type will trigger a session rotation. diff --git a/src/crawlee/router.py b/src/crawlee/router.py index 169a807bed..94312cb1d1 100644 --- a/src/crawlee/router.py +++ b/src/crawlee/router.py @@ -9,6 +9,8 @@ __all__ = ['Router'] +from crawlee.errors import UserHandlerTimeoutError + TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext) RequestHandler = Callable[[TCrawlingContext], Awaitable[None]] @@ -105,7 +107,3 @@ async def __call__(self, context: TCrawlingContext) -> None: except asyncio.TimeoutError as e: # Timeout in handler, but not timeout of handler. raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e - - -class UserHandlerTimeoutError(Exception): - """Raised when a router fails due to user timeout.""" diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 6a86614bbd..c72002e1c6 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1542,6 +1542,7 @@ async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCap caplog.set_level(logging.INFO) crawler = BasicCrawler( configure_logging=False, + max_request_retries=1, request_handler_timeout=timedelta(seconds=1), ) @@ -1568,7 +1569,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None: - crawler = BasicCrawler(configure_logging=False) + crawler = BasicCrawler(configure_logging=False, max_request_retries=1) @crawler.router.default_handler async def default_handler(_: BasicCrawlingContext) -> None: From 7f062f03341fdae5e1946cd12f9060582207efd9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 8 Dec 2025 15:55:49 +0100 Subject: [PATCH 3/3] Remove unused import --- src/crawlee/crawlers/_basic/_logging_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_logging_utils.py b/src/crawlee/crawlers/_basic/_logging_utils.py index bc5bf85dcd..528d213cd6 100644 --- a/src/crawlee/crawlers/_basic/_logging_utils.py +++ b/src/crawlee/crawlers/_basic/_logging_utils.py @@ -3,7 +3,6 @@ import traceback import crawlee.errors -import crawlee.router def _get_only_innermost_exception(error: BaseException) -> BaseException: