Skip to content

Commit baf4f57

Browse files
committed
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
2 parents a1a040d + 5b254e4 commit baf4f57

28 files changed

+522
-177
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.0.31
2+
current_version = 0.0.33
33
commit = True
44
tag = True
55

.github/workflows/checks.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ jobs:
2626
TOXENV: pylint
2727

2828
steps:
29-
- uses: actions/checkout@v2
29+
- uses: actions/checkout@v4
3030

3131
- name: Set up Python ${{ matrix.python-version }}
32-
uses: actions/setup-python@v2
32+
uses: actions/setup-python@v4
3333
with:
3434
python-version: ${{ matrix.python-version }}
3535

.github/workflows/publish.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ jobs:
88
runs-on: ubuntu-latest
99

1010
steps:
11-
- uses: actions/checkout@v2
11+
- uses: actions/checkout@v4
1212

1313
- name: Set up Python 3
14-
uses: actions/setup-python@v2
14+
uses: actions/setup-python@v4
1515
with:
1616
python-version: 3
1717

.github/workflows/tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ jobs:
1212
python-version: ["3.8", "3.9", "3.10", "3.11"]
1313

1414
steps:
15-
- uses: actions/checkout@v2
15+
- uses: actions/checkout@v4
1616

1717
- name: Set up Python ${{ matrix.python-version }}
18-
uses: actions/setup-python@v2
18+
uses: actions/setup-python@v4
1919
with:
2020
python-version: ${{ matrix.python-version }}
2121

README.md

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ class AwesomeSpider(scrapy.Spider):
9898
meta={"playwright": True},
9999
)
100100

101-
def parse(self, response):
101+
def parse(self, response, **kwargs):
102102
# 'response' contains the page as seen by the browser
103103
return {"url": response.url}
104104
```
@@ -138,6 +138,37 @@ PLAYWRIGHT_LAUNCH_OPTIONS = {
138138
}
139139
```
140140

141+
### `PLAYWRIGHT_CDP_URL`
142+
Type `Optional[str]`, default `None`
143+
144+
The endpoint of a remote Chromium browser to connect to using the
145+
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
146+
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).
147+
If this setting is used:
148+
* all non-persistent contexts will be created on the connected remote browser
149+
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
150+
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value other than "chromium"
151+
152+
```python
153+
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
154+
```
155+
156+
### `PLAYWRIGHT_CDP_KWARGS`
157+
Type `dict[str, Any]`, default `{}`
158+
159+
Additional keyword arguments to be passed to
160+
[`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp)
161+
when using `PLAYWRIGHT_CDP_URL`. The `endpoint_url` key is always ignored;
162+
`PLAYWRIGHT_CDP_URL` is used instead.
163+
164+
```python
165+
PLAYWRIGHT_CDP_KWARGS = {
166+
"slow_mo": 1000,
167+
"timeout": 10 * 1000
168+
}
169+
```
170+
171+
141172
### `PLAYWRIGHT_CONTEXTS`
142173
Type `dict[str, dict]`, default `{}`
143174

@@ -412,7 +443,7 @@ def start_requests(self):
412443
meta={"playwright": True, "playwright_include_page": True},
413444
)
414445

415-
def parse(self, response):
446+
def parse(self, response, **kwargs):
416447
page = response.meta["playwright_page"]
417448
yield scrapy.Request(
418449
url="https://httpbin.org/headers",
@@ -449,7 +480,7 @@ about the given response. Only available for HTTPS requests. Could be accessed
449480
in the callback via `response.meta['playwright_security_details']`
450481

451482
```python
452-
def parse(self, response):
483+
def parse(self, response, **kwargs):
453484
print(response.meta["playwright_security_details"])
454485
# {'issuer': 'DigiCert TLS RSA SHA256 2020 CA1', 'protocol': 'TLS 1.3', 'subjectName': 'www.example.org', 'validFrom': 1647216000, 'validTo': 1678838399}
455486
```
@@ -597,7 +628,7 @@ you can access a context through the corresponding [`Page.context`](https://playw
597628
attribute, and await [`close`](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-close) on it.
598629

599630
```python
600-
def parse(self, response):
631+
def parse(self, response, **kwargs):
601632
yield scrapy.Request(
602633
url="https://example.org",
603634
callback=self.parse_in_new_context,
@@ -660,7 +691,7 @@ class ProxySpider(Spider):
660691
def start_requests(self):
661692
yield Request("http://httpbin.org/get", meta={"playwright": True})
662693

663-
def parse(self, response):
694+
def parse(self, response, **kwargs):
664695
print(response.text)
665696
```
666697

@@ -729,7 +760,7 @@ def start_requests(self):
729760
},
730761
)
731762

732-
def parse(self, response):
763+
def parse(self, response, **kwargs):
733764
screenshot = response.meta["playwright_page_methods"][0]
734765
# screenshot.result contains the image's bytes
735766
```
@@ -742,7 +773,7 @@ def start_requests(self):
742773
meta={"playwright": True, "playwright_include_page": True},
743774
)
744775

745-
async def parse(self, response):
776+
async def parse(self, response, **kwargs):
746777
page = response.meta["playwright_page"]
747778
screenshot = await page.screenshot(path="example.png", full_page=True)
748779
# screenshot contains the image's bytes
@@ -834,7 +865,7 @@ class ClickAndSavePdfSpider(scrapy.Spider):
834865
),
835866
)
836867

837-
def parse(self, response):
868+
def parse(self, response, **kwargs):
838869
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
839870
with open("iana.pdf", "wb") as fp:
840871
fp.write(pdf_bytes)
@@ -861,7 +892,7 @@ class ScrollSpider(scrapy.Spider):
861892
),
862893
)
863894

864-
async def parse(self, response):
895+
async def parse(self, response, **kwargs):
865896
page = response.meta["playwright_page"]
866897
await page.screenshot(path="quotes.png", full_page=True)
867898
await page.close()

docs/changelog.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
# scrapy-playwright changelog
22

33

4+
### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)
5+
6+
* Handle downloads as binary responses (#228)
7+
8+
9+
### [v0.0.32](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.32) (2023-09-04)
10+
11+
* Connect to browser using CDP (#227)
12+
13+
414
### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28)
515

616
* Do not fail when getting referer header for debug log messages (#225)

examples/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*.png
2+
*.pdf

examples/contexts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def start_requests(self):
9595
dont_filter=True,
9696
)
9797

98-
async def parse(self, response):
98+
async def parse(self, response, **kwargs):
9999
page = response.meta["playwright_page"]
100100
context_name = response.meta["playwright_context"]
101101
storage_state = await page.context.storage_state()

examples/download.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from pathlib import Path
2+
3+
from scrapy import Spider, Request
4+
5+
6+
class DownloadSpider(Spider):
    """Example spider requesting a regular page and a PDF with Playwright enabled.

    Every response yields one item describing it. When the response meta
    carries a ``playwright_suggested_filename`` value, the response body is
    also written to a file of that name in the directory of this module.
    """

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        """Yield one Playwright-enabled request per target URL, in order."""
        targets = (
            "https://example.org",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        )
        for target in targets:
            yield Request(url=target, meta={"playwright": True})

    def parse(self, response, **kwargs):
        """Save the body when a filename is suggested, then yield a summary item."""
        filename = response.meta.get("playwright_suggested_filename")
        if filename:
            # Store the file alongside this example script.
            destination = Path(__file__).parent / filename
            destination.write_bytes(response.body)
        item = {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": filename,
        }
        yield item

examples/events.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,5 @@ async def handle_dialog(self, dialog: Dialog) -> None:
3737
async def handle_response(self, response: PlaywrightResponse) -> None:
3838
self.logger.info(f"Received response with URL {response.url}")
3939

40-
def parse(self, response):
40+
def parse(self, response, **kwargs):
4141
return {"url": response.url}

0 commit comments

Comments
 (0)