From f6fe8e37d638c212a675079c29975e764e6f50c3 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Wed, 1 Apr 2026 12:13:28 -0600 Subject: [PATCH 1/3] Add Apify Actor for Wick marketplace distribution Apify Actor that brings Wick's browser-grade content extraction to Apify's 22K+ marketplace. Two modes: bundled Wick engine (runs on Apify infra with Chrome TLS fingerprint) or tunnel to local Wick Pro instance (residential IP + full anti-detection). Supports fetch, crawl, and map operations with clean JSON output via Wick's HTTP API server running as a sidecar process. Co-Authored-By: Claude Opus 4.6 (1M context) --- apify/.actor/ACTOR.md | 60 ++++++++++++++ apify/.actor/actor.json | 15 ++++ apify/.actor/dataset_schema.json | 51 ++++++++++++ apify/.actor/input_schema.json | 57 +++++++++++++ apify/Dockerfile | 21 +++++ apify/package.json | 8 ++ apify/src/main.js | 135 +++++++++++++++++++++++++++++++ 7 files changed, 347 insertions(+) create mode 100644 apify/.actor/ACTOR.md create mode 100644 apify/.actor/actor.json create mode 100644 apify/.actor/dataset_schema.json create mode 100644 apify/.actor/input_schema.json create mode 100644 apify/Dockerfile create mode 100644 apify/package.json create mode 100644 apify/src/main.js diff --git a/apify/.actor/ACTOR.md b/apify/.actor/ACTOR.md new file mode 100644 index 0000000..37d43b6 --- /dev/null +++ b/apify/.actor/ACTOR.md @@ -0,0 +1,60 @@ +# Wick Web Fetcher + +Getting 403 Forbidden? Your scraper's TLS fingerprint is giving it away. + +**Wick uses Chrome's real network stack** (BoringSSL, HTTP/2, QUIC) to fetch web pages. The TLS handshake is identical to a real Chrome browser -- not headless Chromium, not Playwright, not Puppeteer. Sites that block every other scraper return 200 OK to Wick. 
+ +## What it does + +- **Fetch** any URL and get clean markdown, HTML, or plain text +- **Crawl** a site following links, up to 50 pages deep +- **Map** a site to discover all URLs via sitemap.xml + link following +- Returns LLM-ready markdown -- perfect for RAG pipelines, AI training, and content analysis + +## Why not Website Content Crawler? + +| | Wick | Website Content Crawler | +|---|---|---| +| TLS fingerprint | Real Chrome (Cronet) | Headless Chromium | +| Anti-bot bypass | High -- real browser signature | Medium -- detectable fingerprint | +| Output | Clean markdown | Clean markdown | +| Speed | Fast (no browser startup) | Slower (full browser launch) | +| Memory | 256 MB | 1-4 GB | +| Compute cost | ~4x cheaper per run | Standard | + +## Modes + +### Fetch (default) + +Fetches one or more URLs and returns clean content. Each URL becomes one row in the output dataset. + +### Crawl + +Starts from a URL and follows same-domain links. Returns content for every page discovered, each as a separate dataset row. Control depth (1-5) and max pages (1-50). + +### Map + +Discovers all URLs on a site by checking sitemap.xml and following links. Returns a list of URLs without fetching their content -- useful for planning a targeted crawl. + +## Residential IP mode (optional) + +For maximum anti-detection, connect this Actor to your own **Wick Pro** instance running on your machine. Requests route through your residential IP -- no datacenter fingerprint, no proxy costs. + +1. Install Wick Pro: `wick install pro` (see [getwick.dev](https://getwick.dev)) +2. Start the API server: `wick serve --api` +3. Expose via tunnel (Cloudflare Tunnel, ngrok, etc.) +4. Paste the tunnel URL in the **Wick Tunnel URL** input field + +This gives you Apify's scheduling and monitoring with Wick's anti-detection and your residential IP. + +## Pricing + +This Actor is **free to use** -- you only pay for Apify compute units. The bundled Wick engine is open source (MIT license). 
+ +For residential IP routing, you need [Wick Pro](https://getwick.dev) ($20/month). + +## Links + +- [Wick website](https://getwick.dev) +- [GitHub](https://github.com/wickproject/wick) +- [Documentation](https://getwick.dev/docs.html) diff --git a/apify/.actor/actor.json b/apify/.actor/actor.json new file mode 100644 index 0000000..12ccb10 --- /dev/null +++ b/apify/.actor/actor.json @@ -0,0 +1,15 @@ +{ + "actorSpecification": 1, + "name": "wick-web-fetcher", + "title": "Wick Web Fetcher — Browser-Grade Content Extraction", + "version": "1.0", + "buildTag": "latest", + "minMemoryMbytes": 256, + "maxMemoryMbytes": 1024, + "dockerfile": "../Dockerfile", + "readme": "./ACTOR.md", + "input": "./input_schema.json", + "storages": { + "dataset": "./dataset_schema.json" + } +} diff --git a/apify/.actor/dataset_schema.json b/apify/.actor/dataset_schema.json new file mode 100644 index 0000000..0a7ca88 --- /dev/null +++ b/apify/.actor/dataset_schema.json @@ -0,0 +1,51 @@ +{ + "actorSpecification": 1, + "fields": { + "url": { + "type": "string", + "description": "The URL that was fetched" + }, + "title": { + "type": "string", + "description": "Page title extracted from HTML", + "nullable": true + }, + "content": { + "type": "string", + "description": "Page content in the requested format (markdown, html, or text)", + "nullable": true + }, + "urls": { + "type": "array", + "description": "Discovered URLs (map mode only)", + "nullable": true + }, + "format": { + "type": "string", + "description": "Output format of the row: markdown, html, or text; 'urls' for map mode" + }, + "statusCode": { + "type": "integer", + "description": "HTTP status code from the fetch", + "nullable": true + }, + "timingMs": { + "type": "integer", + "description": "Time to fetch in milliseconds", + "nullable": true + }, + "engine": { + "type": "string", + "description": "wick-local (bundled engine) or wick-tunnel (residential IP)" + }, + "error": { + "type": "string", + "description": "Error message if the fetch failed", + "nullable": true + }, + 
"fetchedAt": { + "type": "string", + "description": "ISO 8601 timestamp of when the page was fetched" + } + } +} diff --git a/apify/.actor/input_schema.json b/apify/.actor/input_schema.json new file mode 100644 index 0000000..7454009 --- /dev/null +++ b/apify/.actor/input_schema.json @@ -0,0 +1,57 @@ +{ + "title": "Wick Web Fetcher Input", + "type": "object", + "schemaVersion": 1, + "properties": { + "urls": { + "title": "URLs", + "type": "array", + "description": "List of URLs to fetch", + "editor": "stringList", + "prefill": ["https://www.nytimes.com"] + }, + "mode": { + "title": "Mode", + "type": "string", + "description": "fetch = single pages, crawl = follow links, map = discover URLs", + "enum": ["fetch", "crawl", "map"], + "default": "fetch" + }, + "format": { + "title": "Output Format", + "type": "string", + "enum": ["markdown", "html", "text"], + "default": "markdown" + }, + "maxPages": { + "title": "Max Pages (crawl mode)", + "type": "integer", + "default": 10, + "minimum": 1, + "maximum": 50 + }, + "maxDepth": { + "title": "Max Depth (crawl mode)", + "type": "integer", + "default": 2, + "minimum": 1, + "maximum": 5 + }, + "wickTunnelUrl": { + "title": "Wick Tunnel URL (optional)", + "type": "string", + "description": "URL of your local Wick instance for residential IP routing. Leave blank to use Wick's built-in engine on Apify's infrastructure.", + "editor": "textfield", + "sectionCaption": "Advanced — Residential IP", + "sectionDescription": "Connect to your own Wick Pro instance to route requests through your residential IP instead of Apify's datacenter." 
+ }, + "wickApiKey": { + "title": "Wick API Key (optional)", + "type": "string", + "description": "API key for your Wick tunnel endpoint", + "editor": "textfield", + "isSecret": true + } + }, + "required": ["urls"] +} diff --git a/apify/Dockerfile b/apify/Dockerfile new file mode 100644 index 0000000..0153e7f --- /dev/null +++ b/apify/Dockerfile @@ -0,0 +1,21 @@ +FROM node:20-slim + +# Install Wick binary + libcronet.so from GitHub release +ARG WICK_VERSION=0.7.0 +ADD https://github.com/wickproject/wick/releases/download/v${WICK_VERSION}/wick-linux-amd64.tar.gz /tmp/wick.tar.gz +RUN cd /tmp && tar xzf wick.tar.gz \ + && mv wick /usr/local/bin/wick \ + && mv libcronet.so /usr/local/lib/libcronet.so \ + && chmod +x /usr/local/bin/wick \ + && ldconfig \ + && rm wick.tar.gz + +# Verify wick runs +RUN wick version + +WORKDIR /app +COPY package.json . +RUN npm install --production +COPY . . + +CMD ["node", "src/main.js"] diff --git a/apify/package.json b/apify/package.json new file mode 100644 index 0000000..85fe409 --- /dev/null +++ b/apify/package.json @@ -0,0 +1,8 @@ +{ + "name": "wick-web-fetcher", + "version": "1.0.0", + "type": "module", + "dependencies": { + "apify": "^3.2.0" + } +} diff --git a/apify/src/main.js b/apify/src/main.js new file mode 100644 index 0000000..bbc5975 --- /dev/null +++ b/apify/src/main.js @@ -0,0 +1,135 @@ +import { Actor } from 'apify'; +import { spawn } from 'child_process'; + +const WICK_PORT = 18090; +const WICK_BASE = `http://127.0.0.1:${WICK_PORT}`; + +await Actor.init(); + +const input = await Actor.getInput(); +const { + urls, + mode = 'fetch', + format = 'markdown', + maxPages = 10, + maxDepth = 2, + wickTunnelUrl, + wickApiKey, +} = input; + +const dataset = await Actor.openDataset(); +const useTunnel = !!wickTunnelUrl; +const baseUrl = useTunnel ? wickTunnelUrl.replace(/\/$/, '') : WICK_BASE; +const headers = wickApiKey ? 
{ Authorization: `Bearer ${wickApiKey}` } : {}; + +// Start the bundled Wick API server if not using a tunnel +let wickProcess; +if (!useTunnel) { + Actor.log.info('Starting Wick API server...'); + wickProcess = spawn('/usr/local/bin/wick', ['serve', '--api', '--port', String(WICK_PORT)], { + env: { ...process.env, LD_LIBRARY_PATH: '/usr/local/lib' }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + // Wait for server to be ready + let ready = false; + for (let i = 0; i < 30; i++) { + try { + const resp = await fetch(`${WICK_BASE}/health`); + if (resp.ok) { ready = true; break; } + } catch { /* not ready yet */ } + await new Promise(r => setTimeout(r, 500)); + } + + if (!ready) { + Actor.log.error('Wick API server failed to start'); + await Actor.exit({ exitCode: 1 }); + } + Actor.log.info('Wick API server ready'); +} else { + Actor.log.info(`Using Wick tunnel at ${wickTunnelUrl}`); +} + +async function wickFetch(url) { + const params = new URLSearchParams({ url, format }); + const resp = await fetch(`${baseUrl}/v1/fetch?${params}`, { headers }); + if (!resp.ok) throw new Error(`Wick returned ${resp.status}: ${await resp.text()}`); + return resp.json(); +} + +async function wickCrawl(url) { + const params = new URLSearchParams({ + url, format, max_pages: String(maxPages), max_depth: String(maxDepth), + }); + const resp = await fetch(`${baseUrl}/v1/crawl?${params}`, { headers }); + if (!resp.ok) throw new Error(`Wick returned ${resp.status}: ${await resp.text()}`); + return resp.json(); +} + +async function wickMap(url) { + const params = new URLSearchParams({ url, limit: '100' }); + const resp = await fetch(`${baseUrl}/v1/map?${params}`, { headers }); + if (!resp.ok) throw new Error(`Wick returned ${resp.status}: ${await resp.text()}`); + return resp.json(); +} + +const engine = useTunnel ? 
'wick-tunnel' : 'wick-local'; + +for (const url of urls) { + try { + Actor.log.info(`${mode}: ${url}`); + + if (mode === 'crawl') { + const result = await wickCrawl(url); + for (const page of result.pages || []) { + await dataset.pushData({ + url: page.url, + title: page.title || null, + content: page.content, + format, + fetchedAt: new Date().toISOString(), + engine, + }); + } + Actor.log.info(`Crawled ${result.pages?.length || 0} pages from ${url}`); + } else if (mode === 'map') { + const result = await wickMap(url); + await dataset.pushData({ + url, + urls: result.urls, + format: 'urls', + timingMs: result.timing_ms, + fetchedAt: new Date().toISOString(), + engine, + }); + Actor.log.info(`Mapped ${result.count} URLs from ${url}`); + } else { + const result = await wickFetch(url); + await dataset.pushData({ + url, + title: result.title || null, + content: result.content, + statusCode: result.status, + timingMs: result.timing_ms, + format, + fetchedAt: new Date().toISOString(), + engine, + }); + } + } catch (err) { + Actor.log.error(`Failed: ${url}: ${err.message}`); + await dataset.pushData({ + url, + error: err.message, + fetchedAt: new Date().toISOString(), + engine, + }); + } +} + +// Clean up +if (wickProcess) { + wickProcess.kill('SIGTERM'); +} + +await Actor.exit(); From 59bcfd18d33ab40e39b246464fc84d80dd94b4ba Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Wed, 1 Apr 2026 12:19:52 -0600 Subject: [PATCH 2/3] Rewrite Actor README for marketplace approval Position Wick as complementary to the Apify ecosystem rather than competitive. Removed the comparison table vs Website Content Crawler, added honest limitations section, and framed the Actor as a lightweight option that pairs well with browser-based Actors. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- apify/.actor/ACTOR.md | 68 +++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/apify/.actor/ACTOR.md b/apify/.actor/ACTOR.md index 37d43b6..31041bc 100644 --- a/apify/.actor/ACTOR.md +++ b/apify/.actor/ACTOR.md @@ -1,32 +1,25 @@ # Wick Web Fetcher -Getting 403 Forbidden? Your scraper's TLS fingerprint is giving it away. +A lightweight content extraction Actor powered by [Wick](https://getwick.dev), an open-source tool that uses Chrome's real network stack (Cronet) to fetch web pages. Because requests go through the same TLS implementation as a real Chrome browser (BoringSSL, HTTP/2, QUIC), Wick reaches sites that block raw HTTP clients. -**Wick uses Chrome's real network stack** (BoringSSL, HTTP/2, QUIC) to fetch web pages. The TLS handshake is identical to a real Chrome browser -- not headless Chromium, not Playwright, not Puppeteer. Sites that block every other scraper return 200 OK to Wick. +## When to use this Actor -## What it does +- **Quick single-page fetches** where spinning up a full browser is overkill +- **LLM and RAG pipelines** that need clean markdown from web pages +- **Lightweight content extraction** at low memory cost (256 MB) +- **Complement to browser-based Actors** -- use Wick for the pages that don't need JS rendering, save browser compute for the pages that do -- **Fetch** any URL and get clean markdown, HTML, or plain text -- **Crawl** a site following links, up to 50 pages deep -- **Map** a site to discover all URLs via sitemap.xml + link following -- Returns LLM-ready markdown -- perfect for RAG pipelines, AI training, and content analysis +## How it works -## Why not Website Content Crawler? +Under the hood, this Actor runs the Wick binary as a local HTTP API server inside the container. 
Wick makes requests using [Cronet](https://chromium.googlesource.com/chromium/src/+/main/components/cronet/) -- Chrome's network stack extracted as a standalone library. The response HTML is converted to clean markdown, stripping navigation, ads, and boilerplate. -| | Wick | Website Content Crawler | -|---|---|---| -| TLS fingerprint | Real Chrome (Cronet) | Headless Chromium | -| Anti-bot bypass | High -- real browser signature | Medium -- detectable fingerprint | -| Output | Clean markdown | Clean markdown | -| Speed | Fast (no browser startup) | Slower (full browser launch) | -| Memory | 256 MB | 1-4 GB | -| Compute cost | ~4x cheaper per run | Standard | +No headless browser is launched. This makes it fast (~1-3s per page) and lightweight (256 MB vs typical 1-4 GB for browser-based Actors). ## Modes ### Fetch (default) -Fetches one or more URLs and returns clean content. Each URL becomes one row in the output dataset. +Fetches one or more URLs and returns clean content. Each URL becomes one row in the output dataset with title, content, status code, and timing. ### Crawl @@ -34,27 +27,44 @@ Starts from a URL and follows same-domain links. Returns content for every page ### Map -Discovers all URLs on a site by checking sitemap.xml and following links. Returns a list of URLs without fetching their content -- useful for planning a targeted crawl. +Discovers all URLs on a site by checking sitemap.xml and following links. Returns a URL list without fetching content -- useful for planning a targeted crawl or building a sitemap. 
+ +## Output + +A dataset row contains the fields below that apply to its mode; map rows carry a `urls` list instead of page content, failed URLs carry an `error` message, and every row records the `engine` used: + +| Field | Description | +|-------|-------------| +| `url` | The URL that was fetched | +| `title` | Page title | +| `content` | Page content in markdown, HTML, or plain text | +| `statusCode` | HTTP response status | +| `timingMs` | Fetch duration in milliseconds | +| `format` | Output format used | +| `fetchedAt` | ISO 8601 timestamp | ## Residential IP mode (optional) -For maximum anti-detection, connect this Actor to your own **Wick Pro** instance running on your machine. Requests route through your residential IP -- no datacenter fingerprint, no proxy costs. +For additional anti-detection, you can connect this Actor to your own Wick instance running on your machine. Requests then route through your residential IP, combining Apify's scheduling and monitoring with your own network. -1. Install Wick Pro: `wick install pro` (see [getwick.dev](https://getwick.dev)) +1. Install [Wick Pro](https://getwick.dev) on your machine 2. Start the API server: `wick serve --api` -3. Expose via tunnel (Cloudflare Tunnel, ngrok, etc.) -4. Paste the tunnel URL in the **Wick Tunnel URL** input field +3. Expose it via a tunnel (Cloudflare Tunnel, ngrok, etc.) +4. Enter the tunnel URL in the **Wick Tunnel URL** input field + +## Limitations -This gives you Apify's scheduling and monitoring with Wick's anti-detection and your residential IP. +- **No JavaScript rendering** in the bundled engine. For JS-heavy SPAs, pair this Actor with a browser-based Actor like [Website Content Crawler](https://apify.com/apify/website-content-crawler) or use Wick's tunnel mode with a Pro instance that includes JS rendering. +- **Best for content pages.** Wick excels at articles, documentation, blogs, and product pages. For structured data extraction (e.g., specific fields from a listing), consider combining Wick's output with an LLM or a purpose-built scraper. ## Pricing -This Actor is **free to use** -- you only pay for Apify compute units. 
The bundled Wick engine is open source (MIT license). +This Actor is **free** -- you only pay for Apify compute units. The Wick engine is open source ([MIT license](https://github.com/wickproject/wick)). -For residential IP routing, you need [Wick Pro](https://getwick.dev) ($20/month). +Residential IP mode requires [Wick Pro](https://getwick.dev) ($20/month). -## Links +## Resources -- [Wick website](https://getwick.dev) -- [GitHub](https://github.com/wickproject/wick) -- [Documentation](https://getwick.dev/docs.html) +- [Wick documentation](https://getwick.dev/docs.html) +- [GitHub repository](https://github.com/wickproject/wick) +- [How Wick's TLS fingerprinting works](https://getwick.dev/blog/why-your-ai-agent-cant-read-the-web.html) From a8add93e13d2dd652bb269a35fd5e7ff34fdf584 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Wed, 1 Apr 2026 12:40:27 -0600 Subject: [PATCH 3/3] Address PR review: input validation, process safety, checksum, mapLimit - Null-safe Actor.getInput() with validation for urls array - Error handler on spawned wick process - Drain stdout/stderr pipes to prevent buffer blocking - Dockerfile uses curl + sha256 checksum instead of ADD - Map limit is now configurable via mapLimit input (default 100, max 5000) Co-Authored-By: Claude Opus 4.6 (1M context) --- apify/.actor/input_schema.json | 8 ++++++++ apify/Dockerfile | 12 ++++++++++-- apify/src/main.js | 26 +++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/apify/.actor/input_schema.json b/apify/.actor/input_schema.json index 7454009..02fa152 100644 --- a/apify/.actor/input_schema.json +++ b/apify/.actor/input_schema.json @@ -37,6 +37,14 @@ "minimum": 1, "maximum": 5 }, + "mapLimit": { + "title": "Max URLs (map mode)", + "type": "integer", + "description": "Maximum number of URLs to discover in map mode", + "default": 100, + "minimum": 1, + "maximum": 5000 + }, "wickTunnelUrl": { "title": "Wick Tunnel URL (optional)", "type": "string", diff --git 
a/apify/Dockerfile b/apify/Dockerfile index 0153e7f..7614cd4 100644 --- a/apify/Dockerfile +++ b/apify/Dockerfile @@ -2,8 +2,16 @@ FROM node:20-slim # Install Wick binary + libcronet.so from GitHub release ARG WICK_VERSION=0.7.0 -ADD https://github.com/wickproject/wick/releases/download/v${WICK_VERSION}/wick-linux-amd64.tar.gz /tmp/wick.tar.gz -RUN cd /tmp && tar xzf wick.tar.gz \ +ARG WICK_SHA256=110d074072ff5fb334ca3d0123def3f9463d5298f9c6a48fa727a03d21f08ea9 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN cd /tmp \ + && curl -fsSL "https://github.com/wickproject/wick/releases/download/v${WICK_VERSION}/wick-linux-amd64.tar.gz" -o wick.tar.gz \ + && echo "${WICK_SHA256} wick.tar.gz" | sha256sum -c - \ + && tar xzf wick.tar.gz \ && mv wick /usr/local/bin/wick \ && mv libcronet.so /usr/local/lib/libcronet.so \ && chmod +x /usr/local/bin/wick \ diff --git a/apify/src/main.js b/apify/src/main.js index bbc5975..07b47e1 100644 --- a/apify/src/main.js +++ b/apify/src/main.js @@ -6,17 +6,23 @@ const WICK_BASE = `http://127.0.0.1:${WICK_PORT}`; await Actor.init(); -const input = await Actor.getInput(); +const input = (await Actor.getInput()) ?? {}; const { urls, mode = 'fetch', format = 'markdown', maxPages = 10, maxDepth = 2, + mapLimit = 100, wickTunnelUrl, wickApiKey, } = input; +if (!Array.isArray(urls) || urls.length === 0) { + Actor.log.error('Input must include a non-empty "urls" array.'); + await Actor.exit({ exitCode: 1 }); +} + const dataset = await Actor.openDataset(); const useTunnel = !!wickTunnelUrl; const baseUrl = useTunnel ? 
wickTunnelUrl.replace(/\/$/, '') : WICK_BASE; @@ -31,6 +37,20 @@ if (!useTunnel) { stdio: ['ignore', 'pipe', 'pipe'], }); + wickProcess.on('error', (err) => { + Actor.log.error(`Failed to start Wick: ${err.message}`); + Actor.exit({ exitCode: 1 }); + }); + + wickProcess.stdout.on('data', (chunk) => { + const msg = chunk.toString().trimEnd(); + if (msg) Actor.log.info(`[wick] ${msg}`); + }); + wickProcess.stderr.on('data', (chunk) => { + const msg = chunk.toString().trimEnd(); + if (msg) Actor.log.warning(`[wick] ${msg}`); + }); + // Wait for server to be ready let ready = false; for (let i = 0; i < 30; i++) { @@ -42,7 +62,7 @@ if (!useTunnel) { } if (!ready) { - Actor.log.error('Wick API server failed to start'); + Actor.log.error('Wick API server failed to start within 15s'); await Actor.exit({ exitCode: 1 }); } Actor.log.info('Wick API server ready'); @@ -67,7 +87,7 @@ async function wickCrawl(url) { } async function wickMap(url) { - const params = new URLSearchParams({ url, limit: '100' }); + const params = new URLSearchParams({ url, limit: String(mapLimit) }); const resp = await fetch(`${baseUrl}/v1/map?${params}`, { headers }); if (!resp.ok) throw new Error(`Wick returned ${resp.status}: ${await resp.text()}`); return resp.json();