
Commit 8035e27

Merge remote-tracking branch 'origin/main' into close-inactive-contexts
2 parents: 5acff74 + 72060d2

File tree

15 files changed: +298 −110 lines changed


.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.26
+current_version = 0.0.28
 commit = True
 tag = True

README.md

Lines changed: 22 additions & 9 deletions
@@ -128,7 +128,8 @@ Type `dict`, default `{}`
 
 A dictionary with options to be passed as keyword arguments when launching the
 Browser. See the docs for
-[`BrowserType.launch`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).
+[`BrowserType.launch`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch)
+for a list of supported keyword arguments.
 
 ```python
 PLAYWRIGHT_LAUNCH_OPTIONS = {
@@ -524,7 +525,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):
 
 Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts)
 to be launched at startup can be defined via the
-[`PLAYWRIGHT_CONTEXTS`](#PLAYWRIGHT_CONTEXTS) setting.
+[`PLAYWRIGHT_CONTEXTS`](#playwright_contexts) setting.
 
 ### Choosing a specific context for a request
 
@@ -548,7 +549,12 @@ context can also be customized on startup via the `PLAYWRIGHT_CONTEXTS` setting.
 Pass a value for the `user_data_dir` keyword argument to launch a context as
 persistent. See also [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context).
 
-### Creating a context during a crawl
+Note that persistent contexts are launched independently from the main browser
+instance, hence keyword arguments passed in the
+[`PLAYWRIGHT_LAUNCH_OPTIONS`](#playwright_launch_options)
+setting do not apply.
+
+### Creating contexts while crawling
 
 If the context specified in the `playwright_context` meta key does not exist, it will be created.
 You can specify keyword arguments to be passed to
@@ -583,7 +589,7 @@ Specifying a non-negative integer value for the
 [`PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`](#playwright_close_context_interval)
 setting enables closing browser contexts which have no active pages.
 
-### Closing a context during a crawl
+### Closing contexts while crawling
 
 After [receiving the Page object in your callback](#receiving-page-objects-in-callbacks),
 you can access a context though the corresponding [`Page.context`](https://playwright.dev/python/docs/api/class-page#page-context)
@@ -605,21 +611,28 @@ def parse(self, response):
 async def parse_in_new_context(self, response):
     page = response.meta["playwright_page"]
     title = await page.title()
+    await page.close()
     await page.context.close()
     return {"title": title}
 
 async def close_context_on_error(self, failure):
     page = failure.request.meta["playwright_page"]
+    await page.close()
     await page.context.close()
 ```
 
+### Avoid race conditions & memory leaks when closing contexts
+Make sure to close the page before closing the context. See
+[this comment](https://github.com/scrapy-plugins/scrapy-playwright/issues/191#issuecomment-1548097114)
+in [#191](https://github.com/scrapy-plugins/scrapy-playwright/issues/191)
+for more information.
+
 ### Maximum concurrent context count
 
 Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the amount
 of concurent contexts. Use with caution: it's possible to block the whole crawl
-if contexts are not closed after they are no longer used (refer to the above
-section to dinamically close contexts, see also the section about
-[automatically closing inactive contexts](#automatically-closing-inactive-contexts)).
+if contexts are not closed after they are no longer used (refer to
+[this section](#closing-contexts-while-crawling) to dinamically close contexts).
 Make sure to define an errback to still close contexts even if there are errors.
 
 
@@ -636,7 +649,7 @@ class ProxySpider(Spider):
     custom_settings = {
         "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "proxy": {
-                "server": "http://myproxy.com:3128"
+                "server": "http://myproxy.com:3128",
                 "username": "user",
                 "password": "pass",
             },
@@ -671,7 +684,7 @@ PLAYWRIGHT_CONTEXTS = {
 }
 ```
 
-Or passing a `proxy` key when [creating a context during a crawl](#creating-a-context-during-a-crawl).
+Or passing a `proxy` key when [creating contexts while crawling](#creating-contexts-while-crawling).
 
 See also:
 * [`zyte-smartproxy-playwright`](https://github.com/zytedata/zyte-smartproxy-playwright):
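The documentation changes above converge on one pattern: create contexts on demand via the `playwright_context` meta key, and always close the page before its context, both in the callback and in an errback. A minimal spider sketch assembled from the README snippets in this diff (spider name and URL are placeholders):

```python
# Minimal sketch assembled from the README snippets above; the spider
# name, start URL and context name are placeholders.
import scrapy


class ContextSpider(scrapy.Spider):
    name = "context_example"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            callback=self.parse_in_new_context,
            errback=self.close_context_on_error,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": "new",  # created on the fly if missing
            },
        )

    async def parse_in_new_context(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        # Close the page *before* the context to avoid the race condition
        # described in #191.
        await page.close()
        await page.context.close()
        return {"title": title}

    async def close_context_on_error(self, failure):
        # Errback: release resources even on error, so PLAYWRIGHT_MAX_CONTEXTS
        # cannot block the crawl.
        page = failure.request.meta["playwright_page"]
        await page.close()
        await page.context.close()
```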

docs/changelog.md

Lines changed: 13 additions & 0 deletions
@@ -1,5 +1,18 @@
 # scrapy-playwright changelog
 
+
+### [v0.0.28](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.28) (2023-08-05)
+
+* Retry page.content if necessary (#218)
+
+
+### [v0.0.27](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.27) (2023-07-24)
+
+* Override method only for navigation requests (#177)
+* Pass spider argument to _create_browser_context (#212)
+* await AsyncPlaywright.stop on close (#214)
+
+
 ### [v0.0.26](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.26) (2023-02-01)
 
 * Fix logging (pass extra args instead of updating log record factory)

examples/contexts.py

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ async def parse(self, response):
        page = response.meta["playwright_page"]
        context_name = response.meta["playwright_context"]
        storage_state = await page.context.storage_state()
+       await page.close()
        await page.context.close()
        return {
            "url": response.url,

scrapy_playwright/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.0.26"
+__version__ = "0.0.28"

scrapy_playwright/_utils.py

Lines changed: 43 additions & 2 deletions
@@ -1,11 +1,17 @@
+import logging
 from typing import Awaitable, Iterator, Optional, Tuple
 
+from playwright.async_api import Error, Page
+from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.settings import Settings
 from scrapy.utils.python import to_unicode
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 
 
+logger = logging.getLogger("scrapy-playwright")
+
+
 async def _maybe_await(obj):
     if isinstance(obj, Awaitable):
         return await obj
@@ -30,9 +36,9 @@ def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
     return text.encode("utf-8"), "utf-8"  # fallback
 
 
-def _is_safe_close_error(error: Exception) -> bool:
+def _is_safe_close_error(error: Error) -> bool:
     """
-    Taken verbatim from
+    Taken almost verbatim from
     https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238
     """
     message = str(error)
@@ -41,6 +47,41 @@ def _is_safe_close_error(error: Exception) -> bool:
     )
 
 
+_NAVIGATION_ERROR_MSG = (
+    "Unable to retrieve content because the page is navigating and changing the content."
+)
+
+
+async def _get_page_content(
+    page: Page,
+    spider: Spider,
+    context_name: str,
+    scrapy_request_url: str,
+    scrapy_request_method: str,
+) -> str:
+    """Wrapper around Page.content to retry if necessary.
+    Arguments other than the page are only for logging.
+    """
+    try:
+        return await page.content()
+    except Error as err:
+        if err.message == _NAVIGATION_ERROR_MSG:
+            logger.debug(
+                "Retrying to get content from page '%s', error: '%s'",
+                page.url,
+                _NAVIGATION_ERROR_MSG,
+                extra={
+                    "spider": spider,
+                    "context_name": context_name,
+                    "scrapy_request_url": scrapy_request_url,
+                    "scrapy_request_method": scrapy_request_method,
+                    "playwright_page_url": page.url,
+                },
+            )
+            return await page.content()
+        raise
+
+
 def _read_float_setting(settings: Settings, key: str) -> Optional[float]:
     try:
         return float(settings[key])
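The new `_get_page_content` helper retries `Page.content()` exactly once when it fails with Playwright's "page is navigating" error, and re-raises anything else. A sketch of that behavior using a mocked page; it assumes `playwright.async_api.Error` can be constructed directly with a message string (normally the driver raises it):

```python
# Sketch (not part of the commit): exercising the retry with a mocked page.
# Assumes playwright.async_api.Error accepts a message argument.
import asyncio
from unittest.mock import AsyncMock

from playwright.async_api import Error

from scrapy_playwright._utils import _NAVIGATION_ERROR_MSG, _get_page_content


async def main():
    page = AsyncMock()
    page.url = "https://example.org"
    # First call raises the navigation error, second call succeeds.
    page.content.side_effect = [Error(_NAVIGATION_ERROR_MSG), "<html></html>"]
    html = await _get_page_content(
        page=page,
        spider=None,  # passed through for logging only
        context_name="default",
        scrapy_request_url="https://example.org",
        scrapy_request_method="GET",
    )
    assert html == "<html></html>"
    assert page.content.await_count == 2  # retried exactly once


asyncio.run(main())
```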

scrapy_playwright/handler.py

Lines changed: 20 additions & 6 deletions
@@ -10,6 +10,7 @@
     Browser,
     BrowserContext,
     BrowserType,
+    Error as PlaywrightError,
     Page,
     PlaywrightContextManager,
     Request as PlaywrightRequest,
@@ -31,6 +32,7 @@
 from scrapy_playwright.page import PageMethod
 from scrapy_playwright._utils import (
     _encode_body,
+    _get_page_content,
     _is_safe_close_error,
     _maybe_await,
     _read_float_setting,
@@ -119,8 +121,8 @@ async def _launch(self) -> None:
         """Launch Playwright manager and configured startup context(s)."""
         logger.info("Starting download handler")
         self.playwright_context_manager = PlaywrightContextManager()
-        playwright_instance = await self.playwright_context_manager.start()
-        self.browser_type: BrowserType = getattr(playwright_instance, self.browser_type_name)
+        self.playwright = await self.playwright_context_manager.start()
+        self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
         if self.startup_context_kwargs:
             logger.info("Launching %i startup context(s)", len(self.startup_context_kwargs))
             await asyncio.gather(
@@ -209,7 +211,9 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
         ctx_wrapper = self.context_wrappers.get(context_name)
         if ctx_wrapper is None:
             ctx_wrapper = await self._create_browser_context(
-                name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
+                name=context_name,
+                context_kwargs=request.meta.get("playwright_context_kwargs"),
+                spider=spider,
             )
 
         await ctx_wrapper.semaphore.acquire()
@@ -295,6 +299,7 @@ async def _close(self) -> None:
         logger.info("Closing browser")
         await self.browser.close()
         await self.playwright_context_manager.__aexit__()
+        await self.playwright.stop()
 
     def download_request(self, request: Request, spider: Spider) -> Deferred:
         if request.meta.get("playwright"):
@@ -379,7 +384,13 @@ async def _download_request_with_page(
         headers = Headers(await response.all_headers())
         headers.pop("Content-Encoding", None)
         await self._apply_page_methods(page, request, spider)
-        body_str = await page.content()
+        body_str = await _get_page_content(
+            page=page,
+            spider=spider,
+            context_name=context_name,
+            scrapy_request_url=request.url,
+            scrapy_request_method=request.method,
+        )
         request.meta["download_latency"] = time() - start_time
 
         if not request.meta.get("playwright_include_page"):
@@ -533,7 +544,10 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
 
            # if the request is triggered by scrapy, not playwright
            original_playwright_method: str = playwright_request.method
-           if playwright_request.url == url:
+           if (
+               playwright_request.url.rstrip("/") == url.rstrip("/")
+               and playwright_request.is_navigation_request()
+           ):
                if method.upper() != playwright_request.method.upper():
                    overrides["method"] = method
                if body:
@@ -559,7 +573,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
                        "playwright_request_method_new": overrides["method"],
                    },
                )
-        except Exception as ex:
+        except PlaywrightError as ex:
             if _is_safe_close_error(ex):
                 logger.warning(
                     "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s",

tests/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -1,5 +1,7 @@
 from contextlib import asynccontextmanager
 
+from scrapy import Request
+from scrapy.http.response.html import HtmlResponse
 from scrapy.utils.test import get_crawler
 
 
@@ -8,6 +10,7 @@ async def make_handler(settings_dict: dict):
     """Convenience function to obtain an initialized handler and close it gracefully"""
     from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
 
+    settings_dict.setdefault("TELNETCONSOLE_ENABLED", False)
     crawler = get_crawler(settings_dict=settings_dict)
     handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
     try:
@@ -18,3 +21,11 @@ async def make_handler(settings_dict: dict):
         yield handler
     finally:
         await handler._close()
+
+
+def assert_correct_response(response: HtmlResponse, request: Request) -> None:
+    assert isinstance(response, HtmlResponse)
+    assert response.request is request
+    assert response.url == request.url
+    assert response.status == 200
+    assert "playwright" in response.flags

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def _is_coroutine(obj):
     return asyncio.iscoroutinefunction(obj) or inspect.isgeneratorfunction(obj)
 
 
-@pytest.mark.tryfirst
+@pytest.hookimpl(tryfirst=True)
 def pytest_pycollect_makeitem(collector, name, obj):
     """A pytest hook to collect asyncio coroutines."""
     if collector.funcnamefilter(name) and _is_coroutine(obj):
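For context: applying marks such as `tryfirst` to hook functions was deprecated by pytest; `pytest.hookimpl(tryfirst=True)` is the supported way to ask for this hook to run before other implementations, so this is a modernization with no behavioral difference.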

tests/site/redirect.html

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <title>Page should redirect</title>
+  <link rel="canonical" href="index.html">
+  <meta name="robots" content="noindex">
+  <meta charset="utf-8">
+  <meta http-equiv="refresh" content="0; url=index.html">
+</head>
+<body>
+  <p>You should not see this because you are immediately redirected.</p>
+</body>
+</html>
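This new fixture page presumably supports the navigation-related changes above: the `meta http-equiv="refresh"` tag triggers an immediate client-side redirect to `index.html`. A standalone sketch of the mechanism; the server, port, and directory are assumptions, not part of the commit:

```python
# Hypothetical sketch (not from the commit): a browser that loads
# redirect.html ends up on index.html via the meta refresh.
import asyncio
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler
from threading import Thread

from playwright.async_api import async_playwright


async def main():
    # Serve tests/site on a local port (port and directory are assumptions).
    handler_cls = partial(SimpleHTTPRequestHandler, directory="tests/site")
    server = HTTPServer(("127.0.0.1", 8000), handler_cls)
    Thread(target=server.serve_forever, daemon=True).start()
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto("http://127.0.0.1:8000/redirect.html")
        await page.wait_for_url("**/index.html")  # the meta refresh fired
        await browser.close()
    server.shutdown()


asyncio.run(main())
```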
