From 3d2274bb2cbaea6894579dff7589387cc2a23864 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 28 Feb 2026 09:32:37 +0100 Subject: [PATCH] feat: Update Scrapy template and wrapper to use SDK 3.3.0 Use Scrapy's native AsyncCrawlerRunner (requires Scrapy >= 2.14.0) instead of CrawlerRunner + deferred_to_future. Let run_scrapy_actor() handle reactor installation internally, removing manual install_reactor() boilerplate from __main__.py. Add HTTP cache settings to settings.py. Co-Authored-By: Claude Opus 4.6 --- templates/python-scrapy/requirements.txt | 4 ++-- templates/python-scrapy/src/__main__.py | 7 ------- templates/python-scrapy/src/main.py | 10 ++++------ templates/python-scrapy/src/settings.py | 2 ++ wrappers/python-scrapy/requirements_apify.txt | 4 ++-- .../python-scrapy/{projectFolder}/__main__.template.py | 7 ------- .../python-scrapy/{projectFolder}/main.template.py | 10 ++++------ 7 files changed, 14 insertions(+), 30 deletions(-) diff --git a/templates/python-scrapy/requirements.txt b/templates/python-scrapy/requirements.txt index d0e8c0f69..3113b42ed 100644 --- a/templates/python-scrapy/requirements.txt +++ b/templates/python-scrapy/requirements.txt @@ -1,5 +1,5 @@ # Feel free to add your Python dependencies below. For formatting guidelines, see: # https://pip.pypa.io/en/latest/reference/requirements-file-format/ -apify[scrapy] < 4.0.0 -scrapy < 3.0.0 +apify[scrapy] >= 3.3.0, < 4.0.0 +scrapy >= 2.14.0, < 3.0.0 diff --git a/templates/python-scrapy/src/__main__.py b/templates/python-scrapy/src/__main__.py index 25a28e8ae..a4ce7c281 100644 --- a/templates/python-scrapy/src/__main__.py +++ b/templates/python-scrapy/src/__main__.py @@ -10,15 +10,8 @@ We recommend you do not modify this file unless you really know what you are doing. """ -# ruff: noqa: E402 from __future__ import annotations -from scrapy.utils.reactor import install_reactor - -# Install Twisted's asyncio reactor before importing any other Twisted or -# Scrapy components. -install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') - import os from apify.scrapy import initialize_logging, run_scrapy_actor diff --git a/templates/python-scrapy/src/main.py b/templates/python-scrapy/src/main.py index 090e7dcd0..8efb5496a 100644 --- a/templates/python-scrapy/src/main.py +++ b/templates/python-scrapy/src/main.py @@ -23,8 +23,7 @@ from apify import Actor from apify.scrapy import apply_apify_settings -from scrapy.crawler import CrawlerRunner -from scrapy.utils.defer import deferred_to_future +from scrapy.crawler import AsyncCrawlerRunner # Import your Scrapy spider here. from .spiders import TitleSpider as Spider @@ -42,11 +41,10 @@ async def main() -> None: # Apply Apify settings, which will override the Scrapy project settings. settings = apply_apify_settings(proxy_config=proxy_config) - # Create CrawlerRunner and execute the Scrapy spider. - crawler_runner = CrawlerRunner(settings) - crawl_deferred = crawler_runner.crawl( + # Create AsyncCrawlerRunner and execute the Scrapy spider. + crawler_runner = AsyncCrawlerRunner(settings) + await crawler_runner.crawl( Spider, start_urls=start_urls, allowed_domains=allowed_domains, ) - await deferred_to_future(crawl_deferred) diff --git a/templates/python-scrapy/src/settings.py b/templates/python-scrapy/src/settings.py index de2710401..e516d1c8a 100644 --- a/templates/python-scrapy/src/settings.py +++ b/templates/python-scrapy/src/settings.py @@ -15,6 +15,8 @@ TELNETCONSOLE_ENABLED = False # Do not change the Twisted reactor unless you really know what you are doing. TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' +HTTPCACHE_ENABLED = True +HTTPCACHE_EXPIRATION_SECS = 7200 ITEM_PIPELINES = { 'src.pipelines.TitleItemPipeline': 123, } diff --git a/wrappers/python-scrapy/requirements_apify.txt b/wrappers/python-scrapy/requirements_apify.txt index 0a2f0f629..71177265e 100644 --- a/wrappers/python-scrapy/requirements_apify.txt +++ b/wrappers/python-scrapy/requirements_apify.txt @@ -1,5 +1,5 @@ # Add your dependencies here. # See https://pip.pypa.io/en/latest/reference/requirements-file-format/ # for how to format them -apify[scrapy] < 3.0 -scrapy < 3.0 +apify[scrapy] >= 3.3.0, < 4.0.0 +scrapy >= 2.14.0, < 3.0.0 diff --git a/wrappers/python-scrapy/{projectFolder}/__main__.template.py b/wrappers/python-scrapy/{projectFolder}/__main__.template.py index 7069990c6..a4ce7c281 100644 --- a/wrappers/python-scrapy/{projectFolder}/__main__.template.py +++ b/wrappers/python-scrapy/{projectFolder}/__main__.template.py @@ -9,16 +9,9 @@ We recommend you do not modify this file unless you really know what you are doing. """ -# ruff: noqa: E402 from __future__ import annotations -from scrapy.utils.reactor import install_reactor - -# Install Twisted's asyncio reactor before importing any other Twisted or -# Scrapy components. -install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') - import os from apify.scrapy import initialize_logging, run_scrapy_actor diff --git a/wrappers/python-scrapy/{projectFolder}/main.template.py b/wrappers/python-scrapy/{projectFolder}/main.template.py index c78990897..e5d26f29e 100644 --- a/wrappers/python-scrapy/{projectFolder}/main.template.py +++ b/wrappers/python-scrapy/{projectFolder}/main.template.py @@ -22,8 +22,7 @@ from apify import Actor from apify.scrapy import apply_apify_settings -from scrapy.crawler import CrawlerRunner -from scrapy.utils.defer import deferred_to_future +from scrapy.crawler import AsyncCrawlerRunner # Import your Scrapy spider here. from {{spider_module_name}} import {{spider_class_name}} as Spider @@ -44,11 +43,10 @@ async def main() -> None: # Apply Apify settings, which will override the Scrapy project settings. settings = apply_apify_settings(proxy_config=proxy_config) - # Create CrawlerRunner and execute the Scrapy spider. - crawler_runner = CrawlerRunner(settings) - crawl_deferred = crawler_runner.crawl( + # Create AsyncCrawlerRunner and execute the Scrapy spider. + crawler_runner = AsyncCrawlerRunner(settings) + await crawler_runner.crawl( Spider, start_urls=start_urls, allowed_domains=allowed_domains, ) - await deferred_to_future(crawl_deferred)