Skip to content

Commit a65f86f

Browse files
authored
Do not fail on exceptions when getting referer header (#225)
* Get header values safely
* Do not log referer header for responses
1 parent 396bd74 commit a65f86f

File tree

3 files changed: +53 -17 lines changed

scrapy_playwright/_utils.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import logging
2-
from typing import Awaitable, Iterator, Tuple
2+
from typing import Awaitable, Iterator, Optional, Tuple, Union
33

4-
from playwright.async_api import Error, Page
4+
from playwright.async_api import Error, Page, Request, Response
55
from scrapy import Spider
66
from scrapy.http.headers import Headers
77
from scrapy.utils.python import to_unicode
@@ -79,3 +79,13 @@ async def _get_page_content(
7979
)
8080
return await page.content()
8181
raise
82+
83+
84+
async def _get_header_value(
85+
resource: Union[Request, Response],
86+
header_name: str,
87+
) -> Optional[str]:
88+
try:
89+
return await resource.header_value(header_name)
90+
except Exception:
91+
return None

scrapy_playwright/handler.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
from scrapy_playwright.page import PageMethod
3333
from scrapy_playwright._utils import (
3434
_encode_body,
35+
_get_header_value,
3536
_get_page_content,
3637
_is_safe_close_error,
3738
_maybe_await,
@@ -632,19 +633,22 @@ async def _maybe_execute_page_init_callback(
632633

633634
def _make_request_logger(context_name: str, spider: Spider) -> Callable:
634635
async def _log_request(request: PlaywrightRequest) -> None:
635-
referrer = await request.header_value("referer")
636+
log_args = [context_name, request.method.upper(), request.url, request.resource_type]
637+
referrer = await _get_header_value(request, "referer")
638+
if referrer:
639+
log_args.append(referrer)
640+
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)"
641+
else:
642+
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)"
636643
logger.debug(
637-
"[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)",
638-
context_name,
639-
request.method.upper(),
640-
request.url,
641-
request.resource_type,
642-
referrer,
644+
log_msg,
645+
*log_args,
643646
extra={
644647
"spider": spider,
645648
"context_name": context_name,
646649
"playwright_request_url": request.url,
647650
"playwright_request_method": request.method,
651+
"playwright_resource_type": request.resource_type,
648652
},
649653
)
650654

@@ -653,16 +657,15 @@ async def _log_request(request: PlaywrightRequest) -> None:
653657

654658
def _make_response_logger(context_name: str, spider: Spider) -> Callable:
655659
async def _log_response(response: PlaywrightResponse) -> None:
656-
referrer = await response.header_value("referer")
657-
log_args = [context_name, response.status, response.url, referrer]
658-
if 300 <= response.status < 400:
659-
location = await response.header_value("location")
660+
log_args = [context_name, response.status, response.url]
661+
location = await _get_header_value(response, "location")
662+
if location:
660663
log_args.append(location)
661-
msg = "[Context=%s] Response: <%i %s> (referrer: %s, location: %s)"
664+
log_msg = "[Context=%s] Response: <%i %s> (location: %s)"
662665
else:
663-
msg = "[Context=%s] Response: <%i %s> (referrer: %s)"
666+
log_msg = "[Context=%s] Response: <%i %s>"
664667
logger.debug(
665-
msg,
668+
log_msg,
666669
*log_args,
667670
extra={
668671
"spider": spider,

tests/tests_asyncio/test_utils.py

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,12 @@
66
from playwright.async_api import Error as PlaywrightError
77
from scrapy import Spider
88
from scrapy.http.headers import Headers
9-
from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG, _encode_body
9+
from scrapy_playwright._utils import (
10+
_NAVIGATION_ERROR_MSG,
11+
_encode_body,
12+
_get_header_value,
13+
_get_page_content,
14+
)
1015

1116

1217
class TestPageContent(IsolatedAsyncioTestCase):
@@ -119,3 +124,21 @@ async def test_encode_mismatch(self):
119124
)
120125
assert encoding == "gb18030"
121126
assert body == text.encode(encoding)
127+
128+
129+
class TestHeaderValue(IsolatedAsyncioTestCase):
130+
@pytest.mark.asyncio
131+
async def test_get_header_ok(self):
132+
async def _identity(x):
133+
return x
134+
135+
resource = AsyncMock()
136+
resource.header_value = _identity
137+
assert "asdf" == await _get_header_value(resource, "asdf")
138+
assert "qwerty" == await _get_header_value(resource, "qwerty")
139+
140+
async def test_get_header_exception(self):
141+
resource = AsyncMock()
142+
resource.header_value.side_effect = Exception("nope")
143+
assert await _get_header_value(resource, "asdf") is None
144+
assert await _get_header_value(resource, "qwerty") is None

0 commit comments

Comments
 (0)