Skip to content

Commit d97c6f3

Browse files
committed
Merge branch 'main' into close-inactive-contexts
2 parents 65104db + 05648c6 commit d97c6f3

File tree

8 files changed

+81
-46
lines changed

8 files changed

+81
-46
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.0.29
2+
current_version = 0.0.30
33
commit = True
44
tag = True
55

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ to integrate `asyncio`-based projects such as `Playwright`.
2222

2323
### Minimum required versions
2424

25-
* Python >= 3.7
25+
* Python >= 3.8
2626
* Scrapy >= 2.0 (!= 2.4.0)
2727
* Playwright >= 1.15
2828

docs/changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# scrapy-playwright changelog
22

33

4+
### [v0.0.30](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.30) (2023-08-17)
5+
6+
* Fix page_init_callback duplication (#222)
7+
* Bump minimum Python version from 3.7 to 3.8 (#223)
8+
9+
410
### [v0.0.29](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.29) (2023-08-11)
511

612
* Set exc_info=True for warning log records (#219)

scrapy_playwright/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.29"
1+
__version__ = "0.0.30"

scrapy_playwright/_utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
2-
from typing import Awaitable, Iterator, Optional, Tuple
2+
from typing import Awaitable, Iterator, Optional, Tuple, Union
33

4-
from playwright.async_api import Error, Page
4+
from playwright.async_api import Error, Page, Request, Response
55
from scrapy import Spider
66
from scrapy.http.headers import Headers
77
from scrapy.settings import Settings
@@ -88,3 +88,13 @@ def _read_float_setting(settings: Settings, key: str) -> Optional[float]:
8888
except (KeyError, TypeError, ValueError):
8989
pass
9090
return None
91+
92+
93+
async def _get_header_value(
    resource: Union[Request, Response],
    header_name: str,
) -> Optional[str]:
    """Look up a single header on a Playwright request or response.

    Awaits ``resource.header_value(header_name)`` and returns its result.
    Any exception raised by that call is swallowed and ``None`` is returned
    instead, so the lookup is best-effort and never propagates an error.
    """
    try:
        value = await resource.header_value(header_name)
    except Exception:  # intentionally broad: any failure maps to None
        return None
    else:
        return value

scrapy_playwright/handler.py

Lines changed: 21 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from scrapy_playwright.page import PageMethod
3333
from scrapy_playwright._utils import (
3434
_encode_body,
35+
_get_header_value,
3536
_get_page_content,
3637
_is_safe_close_error,
3738
_maybe_await,
@@ -239,27 +240,6 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
239240
self._set_max_concurrent_page_count()
240241
if self.default_navigation_timeout is not None:
241242
page.set_default_navigation_timeout(self.default_navigation_timeout)
242-
page_init_callback = request.meta.get("playwright_page_init_callback")
243-
if page_init_callback:
244-
try:
245-
page_init_callback = load_object(page_init_callback)
246-
await page_init_callback(page, request)
247-
except Exception as ex:
248-
logger.warning(
249-
"[Context=%s] Page init callback exception for %s exc_type=%s exc_msg=%s",
250-
context_name,
251-
repr(request),
252-
type(ex),
253-
str(ex),
254-
extra={
255-
"spider": spider,
256-
"context_name": context_name,
257-
"scrapy_request_url": request.url,
258-
"scrapy_request_method": request.method,
259-
"exception": ex,
260-
},
261-
exc_info=True,
262-
)
263243

264244
page.on("close", self._make_close_page_callback(context_name))
265245
page.on("crash", self._make_close_page_callback(context_name))
@@ -399,10 +379,6 @@ async def _download_request_with_page(
399379
)
400380
request.meta["download_latency"] = time() - start_time
401381

402-
if not request.meta.get("playwright_include_page"):
403-
await page.close()
404-
self.stats.inc_value("playwright/page_count/closed")
405-
406382
server_ip_address = None
407383
with suppress(AttributeError, KeyError, TypeError, ValueError):
408384
server_addr = await response.server_addr()
@@ -411,6 +387,10 @@ async def _download_request_with_page(
411387
with suppress(AttributeError):
412388
request.meta["playwright_security_details"] = await response.security_details()
413389

390+
if not request.meta.get("playwright_include_page"):
391+
await page.close()
392+
self.stats.inc_value("playwright/page_count/closed")
393+
414394
body, encoding = _encode_body(headers=headers, text=body_str)
415395
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
416396
return respcls(
@@ -683,19 +663,22 @@ async def _maybe_execute_page_init_callback(
683663

684664
def _make_request_logger(context_name: str, spider: Spider) -> Callable:
685665
async def _log_request(request: PlaywrightRequest) -> None:
686-
referrer = await request.header_value("referer")
666+
log_args = [context_name, request.method.upper(), request.url, request.resource_type]
667+
referrer = await _get_header_value(request, "referer")
668+
if referrer:
669+
log_args.append(referrer)
670+
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)"
671+
else:
672+
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)"
687673
logger.debug(
688-
"[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)",
689-
context_name,
690-
request.method.upper(),
691-
request.url,
692-
request.resource_type,
693-
referrer,
674+
log_msg,
675+
*log_args,
694676
extra={
695677
"spider": spider,
696678
"context_name": context_name,
697679
"playwright_request_url": request.url,
698680
"playwright_request_method": request.method,
681+
"playwright_resource_type": request.resource_type,
699682
},
700683
)
701684

@@ -704,16 +687,15 @@ async def _log_request(request: PlaywrightRequest) -> None:
704687

705688
def _make_response_logger(context_name: str, spider: Spider) -> Callable:
706689
async def _log_response(response: PlaywrightResponse) -> None:
707-
referrer = await response.header_value("referer")
708-
log_args = [context_name, response.status, response.url, referrer]
709-
if 300 <= response.status < 400:
710-
location = await response.header_value("location")
690+
log_args = [context_name, response.status, response.url]
691+
location = await _get_header_value(response, "location")
692+
if location:
711693
log_args.append(location)
712-
msg = "[Context=%s] Response: <%i %s> (referrer: %s, location: %s)"
694+
log_msg = "[Context=%s] Response: <%i %s> (location: %s)"
713695
else:
714-
msg = "[Context=%s] Response: <%i %s> (referrer: %s)"
696+
log_msg = "[Context=%s] Response: <%i %s>"
715697
logger.debug(
716-
msg,
698+
log_msg,
717699
*log_args,
718700
extra={
719701
"spider": spider,

tests/tests_asyncio/test_utils.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,13 @@
66
from playwright.async_api import Error as PlaywrightError
77
from scrapy import Spider
88
from scrapy.http.headers import Headers
9-
from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG, _encode_body
9+
from scrapy_playwright._utils import (
10+
_NAVIGATION_ERROR_MSG,
11+
_encode_body,
12+
_get_header_value,
13+
_get_page_content,
14+
_maybe_await,
15+
)
1016

1117

1218
class TestPageContent(IsolatedAsyncioTestCase):
@@ -119,3 +125,35 @@ async def test_encode_mismatch(self):
119125
)
120126
assert encoding == "gb18030"
121127
assert body == text.encode(encoding)
128+
129+
130+
class TestHeaderValue(IsolatedAsyncioTestCase):
    """Checks for the ``_get_header_value`` helper."""

    @pytest.mark.asyncio
    async def test_get_header_ok(self):
        """The value produced by ``header_value`` is returned unchanged."""

        async def _echo(name):
            return name

        mocked = AsyncMock()
        # Replace the auto-generated mock attribute with a coroutine that
        # simply hands back the header name it was asked for.
        mocked.header_value = _echo
        for name in ("asdf", "qwerty"):
            assert name == await _get_header_value(mocked, name)

    async def test_get_header_exception(self):
        """``None`` is returned when ``header_value`` raises."""
        mocked = AsyncMock()
        mocked.header_value.side_effect = Exception("nope")
        for name in ("asdf", "qwerty"):
            assert await _get_header_value(mocked, name) is None
146+
147+
148+
class TestMaybeAwait(IsolatedAsyncioTestCase):
    """Checks for the ``_maybe_await`` helper."""

    @pytest.mark.asyncio
    async def test_maybe_await(self):
        """Both coroutine results and plain values come back equal to the input."""

        async def _async_echo(value):
            return value

        # Coroutine objects: the awaited result must equal the wrapped value.
        for value in ("asdf", "qwerty", 1234):
            assert await _maybe_await(_async_echo(value)) == value

        # Non-awaitable inputs: the value itself must come back.
        for value in ("foo", "bar", 1234):
            assert await _maybe_await(value) == value

tox.ini

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ deps =
77
pytest_asyncio==0.21.1
88
pytest_cov==4.1.0
99
pytest_twisted==1.14
10-
playwright==1.36.0
1110
commands =
1211
playwright install
1312
py.test -vv --reactor=asyncio \

0 commit comments

Comments
 (0)