33from contextlib import suppress
44from dataclasses import dataclass
55from ipaddress import ip_address
6+ from tempfile import NamedTemporaryFile
67from time import time
7- from typing import Awaitable , Callable , Dict , Optional , Type , TypeVar , Union
8+ from typing import Awaitable , Callable , Dict , Optional , Tuple , Type , TypeVar , Union
89
910from playwright .async_api import (
1011 BrowserContext ,
1112 BrowserType ,
13+ Download ,
1214 Error as PlaywrightError ,
1315 Page ,
1416 PlaywrightContextManager ,
@@ -319,7 +321,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
319321 )
320322
321323 try :
322- result = await self ._download_request_with_page (request , page , spider )
324+ return await self ._download_request_with_page (request , page , spider )
323325 except Exception as ex :
324326 if not request .meta .get ("playwright_include_page" ) and not page .is_closed ():
325327 logger .warning (
@@ -339,8 +341,6 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
339341 await page .close ()
340342 self .stats .inc_value ("playwright/page_count/closed" )
341343 raise
342- else :
343- return result
344344
345345 async def _download_request_with_page (
346346 self , request : Request , page : Page , spider : Spider
@@ -349,51 +349,61 @@ async def _download_request_with_page(
349349 if request .meta .get ("playwright_include_page" ):
350350 request .meta ["playwright_page" ] = page
351351
352- context_name = request .meta .setdefault ("playwright_context" , DEFAULT_CONTEXT_NAME )
353-
354352 start_time = time ()
355- page_goto_kwargs = request .meta .get ("playwright_page_goto_kwargs" ) or {}
356- page_goto_kwargs .pop ("url" , None )
357- response = await page .goto (url = request .url , ** page_goto_kwargs )
358- if response is None :
353+ response , download = await self ._get_response_and_download (request = request , page = page )
354+ if isinstance (response , PlaywrightResponse ):
355+ await _set_redirect_meta (request = request , response = response )
356+ headers = Headers (await response .all_headers ())
357+ headers .pop ("Content-Encoding" , None )
358+ else :
359359 logger .warning (
360360 "Navigating to %s returned None, the response"
361361 " will have empty headers and status 200" ,
362362 request ,
363363 extra = {
364364 "spider" : spider ,
365- "context_name" : context_name ,
365+ "context_name" : request . meta . get ( "playwright_context" ) ,
366366 "scrapy_request_url" : request .url ,
367367 "scrapy_request_method" : request .method ,
368368 },
369369 )
370370 headers = Headers ()
371- else :
372- await _set_redirect_meta (request = request , response = response )
373- headers = Headers (await response .all_headers ())
374- headers .pop ("Content-Encoding" , None )
371+
375372 await self ._apply_page_methods (page , request , spider )
376373 body_str = await _get_page_content (
377374 page = page ,
378375 spider = spider ,
379- context_name = context_name ,
376+ context_name = request . meta . get ( "playwright_context" ) ,
380377 scrapy_request_url = request .url ,
381378 scrapy_request_method = request .method ,
382379 )
383380 request .meta ["download_latency" ] = time () - start_time
384381
385382 server_ip_address = None
386- with suppress (AttributeError , KeyError , TypeError , ValueError ):
387- server_addr = await response .server_addr ()
388- server_ip_address = ip_address (server_addr ["ipAddress" ])
389-
390- with suppress (AttributeError ):
383+ if response is not None :
391384 request .meta ["playwright_security_details" ] = await response .security_details ()
385+ with suppress (KeyError , TypeError , ValueError ):
386+ server_addr = await response .server_addr ()
387+ server_ip_address = ip_address (server_addr ["ipAddress" ])
388+
389+ if download .get ("exception" ):
390+ raise download ["exception" ]
392391
393392 if not request .meta .get ("playwright_include_page" ):
394393 await page .close ()
395394 self .stats .inc_value ("playwright/page_count/closed" )
396395
396+ if download :
397+ request .meta ["playwright_suggested_filename" ] = download .get ("suggested_filename" )
398+ respcls = responsetypes .from_args (url = download ["url" ], body = download ["bytes" ])
399+ return respcls (
400+ url = download ["url" ],
401+ status = 200 ,
402+ body = download ["bytes" ],
403+ request = request ,
404+ flags = ["playwright" ],
405+ )
406+
397407 body , encoding = _encode_body (headers = headers , text = body_str )
398408 respcls = responsetypes .from_args (headers = headers , url = page .url , body = body )
399409 return respcls (
@@ -407,6 +417,48 @@ async def _download_request_with_page(
407417 ip_address = server_ip_address ,
408418 )
409419
async def _get_response_and_download(
    self, request: Request, page: Page
) -> Tuple[Optional[PlaywrightResponse], dict]:
    """Navigate ``page`` to ``request.url`` and capture a download if one starts.

    A "download" listener is attached to the page only for the duration of the
    ``page.goto`` call and is always removed afterwards.

    Args:
        request: The request being processed. ``request.url`` is the navigation
            target and ``request.meta["playwright_page_goto_kwargs"]`` (if set)
            supplies extra keyword arguments for ``page.goto``; a "url" entry
            in that mapping is ignored.
        page: The page used for the navigation.

    Returns:
        A ``(response, download)`` tuple. ``response`` is whatever ``page.goto``
        returned, or ``None`` when ``page.goto`` raised one of the recognised
        "download started" errors. ``download`` is an empty dict when the
        download handler did not run; otherwise it holds the keys "bytes",
        "url" and "suggested_filename" on success, or "exception" when handling
        the download failed.

    Raises:
        PlaywrightError: If ``page.goto`` fails with an error that is not one of
            the recognised "download started" messages for the current browser.
    """
    # Imported locally so this method does not depend on module-level imports
    # beyond the ones the file already declares.
    from pathlib import Path
    from tempfile import TemporaryDirectory

    response: Optional[PlaywrightResponse] = None
    download: dict = {}  # updated in-place in _handle_download
    download_ready = asyncio.Event()

    async def _handle_download(dwnld: Download) -> None:
        self.stats.inc_value("playwright/download_count")
        try:
            if failure := await dwnld.failure():
                raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
            # Save into a private temporary directory and read the file back by
            # path. Writing to the name of a still-open NamedTemporaryFile is
            # not portable: on Windows the file cannot be opened a second time
            # while the original handle is open.
            with TemporaryDirectory() as temp_dir:
                target = Path(temp_dir) / "download"
                await dwnld.save_as(target)
                download["bytes"] = target.read_bytes()
            download["url"] = dwnld.url
            download["suggested_filename"] = dwnld.suggested_filename
        except Exception as ex:
            # Stored rather than raised: this runs as an event callback, so the
            # caller is expected to inspect download["exception"].
            download["exception"] = ex
        finally:
            download_ready.set()

    # Work on a shallow copy so the dict stored in request.meta is not mutated
    # when the "url" entry is discarded.
    page_goto_kwargs = dict(request.meta.get("playwright_page_goto_kwargs") or {})
    page_goto_kwargs.pop("url", None)
    page.on("download", _handle_download)
    try:
        response = await page.goto(url=request.url, **page_goto_kwargs)
    except PlaywrightError as err:
        # Browsers report a navigation that turned into a download with
        # different error messages; anything else is a real failure.
        download_started = (
            self.browser_type_name in ("firefox", "webkit")
            and "Download is starting" in err.message
        ) or (
            self.browser_type_name == "chromium" and "net::ERR_ABORTED" in err.message
        )
        if not download_started:
            raise
        await download_ready.wait()
    finally:
        page.remove_listener("download", _handle_download)

    return response, download
461+
410462 async def _apply_page_methods (self , page : Page , request : Request , spider : Spider ) -> None :
411463 context_name = request .meta .get ("playwright_context" )
412464 page_methods = request .meta .get ("playwright_page_methods" ) or ()
0 commit comments