diff --git a/apify/.actor/ACTOR.md b/apify/.actor/ACTOR.md new file mode 100644 index 0000000..31041bc --- /dev/null +++ b/apify/.actor/ACTOR.md @@ -0,0 +1,70 @@ +# Wick Web Fetcher + +A lightweight content extraction Actor powered by [Wick](https://getwick.dev), an open-source tool that uses Chrome's real network stack (Cronet) to fetch web pages. Because requests go through the same TLS implementation as a real Chrome browser (BoringSSL, HTTP/2, QUIC), Wick reaches sites that block raw HTTP clients. + +## When to use this Actor + +- **Quick single-page fetches** where spinning up a full browser is overkill +- **LLM and RAG pipelines** that need clean markdown from web pages +- **Lightweight content extraction** at low memory cost (256 MB) +- **Complement to browser-based Actors** -- use Wick for the pages that don't need JS rendering, save browser compute for the pages that do + +## How it works + +Under the hood, this Actor runs the Wick binary as a local HTTP API server inside the container. Wick makes requests using [Cronet](https://chromium.googlesource.com/chromium/src/+/master/components/cronet/) -- Chrome's network stack extracted as a standalone library. The response HTML is converted to clean markdown, stripping navigation, ads, and boilerplate. + +No headless browser is launched. This makes it fast (~1-3s per page) and lightweight (256 MB vs typical 1-4 GB for browser-based Actors). + +## Modes + +### Fetch (default) + +Fetches one or more URLs and returns clean content. Each URL becomes one row in the output dataset with title, content, status code, and timing. + +### Crawl + +Starts from a URL and follows same-domain links. Returns content for every page discovered, each as a separate dataset row. Control depth (1-5) and max pages (1-50). + +### Map + +Discovers all URLs on a site by checking sitemap.xml and following links. Returns a URL list without fetching content -- useful for planning a targeted crawl or building a sitemap. 
+ +## Output + +Each dataset row contains: + +| Field | Description | +|-------|-------------| +| `url` | The URL that was fetched | +| `title` | Page title | +| `content` | Page content in markdown, HTML, or plain text | +| `statusCode` | HTTP response status | +| `timingMs` | Fetch duration in milliseconds | +| `format` | Output format used | +| `fetchedAt` | ISO 8601 timestamp | + +## Residential IP mode (optional) + +For additional anti-detection, you can connect this Actor to your own Wick instance running on your machine. Requests then route through your residential IP, combining Apify's scheduling and monitoring with your own network. + +1. Install [Wick Pro](https://getwick.dev) on your machine +2. Start the API server: `wick serve --api` +3. Expose it via a tunnel (Cloudflare Tunnel, ngrok, etc.) +4. Enter the tunnel URL in the **Wick Tunnel URL** input field + +## Limitations + +- **No JavaScript rendering** in the bundled engine. For JS-heavy SPAs, pair this Actor with a browser-based Actor like [Website Content Crawler](https://apify.com/apify/website-content-crawler) or use Wick's tunnel mode with a Pro instance that includes JS rendering. +- **Best for content pages.** Wick excels at articles, documentation, blogs, and product pages. For structured data extraction (e.g., specific fields from a listing), consider combining Wick's output with an LLM or a purpose-built scraper. + +## Pricing + +This Actor is **free** -- you only pay for Apify compute units. The Wick engine is open source ([MIT license](https://github.com/wickproject/wick)). + +Residential IP mode requires [Wick Pro](https://getwick.dev) ($20/month). 
+ +## Resources + +- [Wick documentation](https://getwick.dev/docs.html) +- [GitHub repository](https://github.com/wickproject/wick) +- [How Wick's TLS fingerprinting works](https://getwick.dev/blog/why-your-ai-agent-cant-read-the-web.html) diff --git a/apify/.actor/actor.json b/apify/.actor/actor.json new file mode 100644 index 0000000..12ccb10 --- /dev/null +++ b/apify/.actor/actor.json @@ -0,0 +1,15 @@ +{ + "actorSpecification": 1, + "name": "wick-web-fetcher", + "title": "Wick Web Fetcher — Browser-Grade Content Extraction", + "version": "1.0", + "buildTag": "latest", + "minMemoryMbytes": 256, + "maxMemoryMbytes": 1024, + "dockerfile": "../Dockerfile", + "readme": "./ACTOR.md", + "input": "./input_schema.json", + "storages": { + "dataset": "./dataset_schema.json" + } +} diff --git a/apify/.actor/dataset_schema.json b/apify/.actor/dataset_schema.json new file mode 100644 index 0000000..0a7ca88 --- /dev/null +++ b/apify/.actor/dataset_schema.json @@ -0,0 +1,51 @@ +{ + "actorSpecification": 1, + "fields": { + "url": { + "type": "string", + "description": "The URL that was fetched" + }, + "title": { + "type": "string", + "description": "Page title extracted from HTML", + "nullable": true + }, + "content": { + "type": "string", + "description": "Page content in the requested format (markdown, html, or text)", + "nullable": true + }, + "urls": { + "type": "array", + "description": "Discovered URLs (map mode only)", + "nullable": true + }, + "format": { + "type": "string", + "description": "Output format used" + }, + "statusCode": { + "type": "integer", + "description": "HTTP status code from the fetch", + "nullable": true + }, + "timingMs": { + "type": "integer", + "description": "Time to fetch in milliseconds", + "nullable": true + }, + "engine": { + "type": "string", + "description": "wick-local (bundled engine) or wick-tunnel (residential IP)" + }, + "error": { + "type": "string", + "description": "Error message if the fetch failed", + "nullable": true + }, + 
"fetchedAt": { + "type": "string", + "description": "ISO 8601 timestamp of when the page was fetched" + } + } +} diff --git a/apify/.actor/input_schema.json b/apify/.actor/input_schema.json new file mode 100644 index 0000000..02fa152 --- /dev/null +++ b/apify/.actor/input_schema.json @@ -0,0 +1,65 @@ +{ + "title": "Wick Web Fetcher Input", + "type": "object", + "schemaVersion": 1, + "properties": { + "urls": { + "title": "URLs", + "type": "array", + "description": "List of URLs to fetch", + "editor": "stringList", + "prefill": ["https://www.nytimes.com"] + }, + "mode": { + "title": "Mode", + "type": "string", + "description": "fetch = single pages, crawl = follow links, map = discover URLs", + "enum": ["fetch", "crawl", "map"], + "default": "fetch" + }, + "format": { + "title": "Output Format", + "type": "string", + "enum": ["markdown", "html", "text"], + "default": "markdown" + }, + "maxPages": { + "title": "Max Pages (crawl mode)", + "type": "integer", + "default": 10, + "minimum": 1, + "maximum": 50 + }, + "maxDepth": { + "title": "Max Depth (crawl mode)", + "type": "integer", + "default": 2, + "minimum": 1, + "maximum": 5 + }, + "mapLimit": { + "title": "Max URLs (map mode)", + "type": "integer", + "description": "Maximum number of URLs to discover in map mode", + "default": 100, + "minimum": 1, + "maximum": 5000 + }, + "wickTunnelUrl": { + "title": "Wick Tunnel URL (optional)", + "type": "string", + "description": "URL of your local Wick instance for residential IP routing. Leave blank to use Wick's built-in engine on Apify's infrastructure.", + "editor": "textfield", + "sectionCaption": "Advanced — Residential IP", + "sectionDescription": "Connect to your own Wick Pro instance to route requests through your residential IP instead of Apify's datacenter." 
+ }, + "wickApiKey": { + "title": "Wick API Key (optional)", + "type": "string", + "description": "API key for your Wick tunnel endpoint", + "editor": "textfield", + "isSecret": true + } + }, + "required": ["urls"] +} diff --git a/apify/Dockerfile b/apify/Dockerfile new file mode 100644 index 0000000..7614cd4 --- /dev/null +++ b/apify/Dockerfile @@ -0,0 +1,29 @@ +FROM node:20-slim + +# Install Wick binary + libcronet.so from GitHub release +ARG WICK_VERSION=0.7.0 +ARG WICK_SHA256=110d074072ff5fb334ca3d0123def3f9463d5298f9c6a48fa727a03d21f08ea9 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN cd /tmp \ + && curl -fsSL "https://github.com/wickproject/wick/releases/download/v${WICK_VERSION}/wick-linux-amd64.tar.gz" -o wick.tar.gz \ + && echo "${WICK_SHA256} wick.tar.gz" | sha256sum -c - \ + && tar xzf wick.tar.gz \ + && mv wick /usr/local/bin/wick \ + && mv libcronet.so /usr/local/lib/libcronet.so \ + && chmod +x /usr/local/bin/wick \ + && ldconfig \ + && rm wick.tar.gz + +# Verify wick runs +RUN wick version + +WORKDIR /app +COPY package.json . +RUN npm install --production +COPY . . + +CMD ["node", "src/main.js"] diff --git a/apify/package.json b/apify/package.json new file mode 100644 index 0000000..85fe409 --- /dev/null +++ b/apify/package.json @@ -0,0 +1,8 @@ +{ + "name": "wick-web-fetcher", + "version": "1.0.0", + "type": "module", + "dependencies": { + "apify": "^3.2.0" + } +} diff --git a/apify/src/main.js b/apify/src/main.js new file mode 100644 index 0000000..07b47e1 --- /dev/null +++ b/apify/src/main.js @@ -0,0 +1,155 @@ +import { Actor } from 'apify'; +import { spawn } from 'child_process'; + +const WICK_PORT = 18090; +const WICK_BASE = `http://127.0.0.1:${WICK_PORT}`; + +await Actor.init(); + +const input = (await Actor.getInput()) ?? 
{};

// Input fields with defaults mirroring .actor/input_schema.json.
const {
  urls,
  mode = 'fetch',
  format = 'markdown',
  maxPages = 10,
  maxDepth = 2,
  mapLimit = 100,
  wickTunnelUrl,
  wickApiKey,
} = input;

if (!Array.isArray(urls) || urls.length === 0) {
  Actor.log.error('Input must include a non-empty "urls" array.');
  await Actor.exit({ exitCode: 1 });
}

const dataset = await Actor.openDataset();

// Tunnel mode routes every request through the user's own Wick instance
// (residential IP). Otherwise we talk to the bundled binary on localhost.
const useTunnel = !!wickTunnelUrl;
const baseUrl = useTunnel ? wickTunnelUrl.replace(/\/$/, '') : WICK_BASE;
const headers = wickApiKey ? { Authorization: `Bearer ${wickApiKey}` } : {};

// Start the bundled Wick API server if not using a tunnel.
let wickProcess;
if (!useTunnel) {
  Actor.log.info('Starting Wick API server...');
  wickProcess = spawn('/usr/local/bin/wick', ['serve', '--api', '--port', String(WICK_PORT)], {
    env: { ...process.env, LD_LIBRARY_PATH: '/usr/local/lib' },
    stdio: ['ignore', 'pipe', 'pipe'],
  });

  wickProcess.on('error', (err) => {
    Actor.log.error(`Failed to start Wick: ${err.message}`);
    // Event handlers cannot await; explicitly discard the exit promise so
    // it is not left floating.
    void Actor.exit({ exitCode: 1 });
  });

  // Mirror the child's output into the Actor log.
  wickProcess.stdout.on('data', (chunk) => {
    const msg = chunk.toString().trimEnd();
    if (msg) Actor.log.info(`[wick] ${msg}`);
  });
  wickProcess.stderr.on('data', (chunk) => {
    const msg = chunk.toString().trimEnd();
    if (msg) Actor.log.warning(`[wick] ${msg}`);
  });

  // Poll /health until the server answers (30 attempts x 500 ms = 15 s).
  let ready = false;
  for (let attempt = 0; attempt < 30; attempt++) {
    try {
      const resp = await fetch(`${WICK_BASE}/health`);
      if (resp.ok) {
        ready = true;
        break;
      }
    } catch {
      /* not ready yet */
    }
    await new Promise((resolve) => setTimeout(resolve, 500));
  }

  if (!ready) {
    Actor.log.error('Wick API server failed to start within 15s');
    // Don't leave an orphaned child process behind on the failure path.
    wickProcess.kill('SIGTERM');
    await Actor.exit({ exitCode: 1 });
  }
  Actor.log.info('Wick API server ready');
} else {
  Actor.log.info(`Using Wick tunnel at ${wickTunnelUrl}`);
}

/**
 * Call a Wick API endpoint and return its parsed JSON body.
 *
 * Shared by the fetch/crawl/map wrappers below so the status check and
 * error formatting live in one place.
 *
 * @param {string} path - API path, e.g. '/v1/fetch'.
 * @param {Record<string, string>} query - Query-string parameters.
 * @returns {Promise<object>} Parsed JSON response.
 * @throws {Error} When Wick responds with a non-2xx status.
 */
async function wickGet(path, query) {
  const params = new URLSearchParams(query);
  const resp = await fetch(`${baseUrl}${path}?${params}`, { headers });
  if (!resp.ok) throw new Error(`Wick returned ${resp.status}: ${await resp.text()}`);
  return resp.json();
}

const wickFetch = (url) => wickGet('/v1/fetch', { url, format });
const wickCrawl = (url) =>
  wickGet('/v1/crawl', { url, format, max_pages: String(maxPages), max_depth: String(maxDepth) });
const wickMap = (url) => wickGet('/v1/map', { url, limit: String(mapLimit) });

const engine = useTunnel ? 'wick-tunnel' : 'wick-local';

try {
  for (const url of urls) {
    try {
      Actor.log.info(`${mode}: ${url}`);

      if (mode === 'crawl') {
        // One dataset row per crawled page.
        const result = await wickCrawl(url);
        for (const page of result.pages || []) {
          await dataset.pushData({
            url: page.url,
            title: page.title || null,
            content: page.content,
            format,
            fetchedAt: new Date().toISOString(),
            engine,
          });
        }
        Actor.log.info(`Crawled ${result.pages?.length || 0} pages from ${url}`);
      } else if (mode === 'map') {
        // Single row carrying the discovered URL list; no content fetched.
        const result = await wickMap(url);
        await dataset.pushData({
          url,
          urls: result.urls,
          format: 'urls',
          timingMs: result.timing_ms,
          fetchedAt: new Date().toISOString(),
          engine,
        });
        Actor.log.info(`Mapped ${result.count} URLs from ${url}`);
      } else {
        // Default: fetch a single page.
        const result = await wickFetch(url);
        await dataset.pushData({
          url,
          title: result.title || null,
          content: result.content,
          statusCode: result.status,
          timingMs: result.timing_ms,
          format,
          fetchedAt: new Date().toISOString(),
          engine,
        });
      }
    } catch (err) {
      // Per-URL failures are recorded as error rows; the run continues.
      Actor.log.error(`Failed: ${url}: ${err.message}`);
      await dataset.pushData({
        url,
        error: err.message,
        fetchedAt: new Date().toISOString(),
        engine,
      });
    }
  }
} finally {
  // Always stop the bundled server, even if pushing data threw.
  if (wickProcess) {
    wickProcess.kill('SIGTERM');
  }
}

await Actor.exit();