From 4c1bbecb5c56edcbae9c043489ad22ed3d4548b2 Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Thu, 7 May 2026 13:05:10 -0700 Subject: [PATCH 1/2] Bundle Browser Harness JS for desktop agents Agents need a local CDP runtime that does not depend on legacy helper tools or global skill installation. This vendors the upstream browser-harness-js runtime and interaction recipes, while excluding the bundled SDK from app TypeScript and ESLint passes because it is a runtime asset rather than app source. Constraint: App-spawned agents run from userData/harness and need the CLI available relative to that harness. Rejected: Install the upstream skill globally at task time | mutates the user's toolchain and makes launches depend on external setup. Confidence: high Scope-risk: moderate Directive: Treat browser-harness-js/sdk as vendored runtime; update from upstream intentionally, not via incidental formatting or lint fixes. Tested: cd app && npm run typecheck; targeted Vitest harness and adapter tests. Not-tested: Fresh packaged production app launch. 
--- app/eslint.config.js | 1 + .../main/hl/stock/browser-harness-js/SKILL.md | 248 + .../browser-harness-js/sdk/browser-harness-js | 137 + .../stock/browser-harness-js/sdk/generated.ts | 15160 ++++++++++++++++ .../hl/stock/browser-harness-js/sdk/repl.ts | 140 + .../stock/browser-harness-js/sdk/session.ts | 430 + .../hl/stock/interaction-skills/connection.md | 98 + .../hl/stock/interaction-skills/cookies.md | 61 + .../cross-origin-iframes.md | 67 + .../hl/stock/interaction-skills/dialogs.md | 75 + .../hl/stock/interaction-skills/downloads.md | 77 + .../stock/interaction-skills/drag-and-drop.md | 56 + .../hl/stock/interaction-skills/dropdowns.md | 79 + .../hl/stock/interaction-skills/iframes.md | 69 + .../interaction-skills/network-requests.md | 109 + .../stock/interaction-skills/print-as-pdf.md | 69 + .../stock/interaction-skills/screenshots.md | 54 + .../hl/stock/interaction-skills/scrolling.md | 75 + .../hl/stock/interaction-skills/shadow-dom.md | 80 + .../main/hl/stock/interaction-skills/tabs.md | 75 + .../hl/stock/interaction-skills/uploads.md | 65 + .../hl/stock/interaction-skills/viewport.md | 70 + app/tsconfig.json | 2 +- 23 files changed, 17296 insertions(+), 1 deletion(-) create mode 100644 app/src/main/hl/stock/browser-harness-js/SKILL.md create mode 100755 app/src/main/hl/stock/browser-harness-js/sdk/browser-harness-js create mode 100644 app/src/main/hl/stock/browser-harness-js/sdk/generated.ts create mode 100644 app/src/main/hl/stock/browser-harness-js/sdk/repl.ts create mode 100644 app/src/main/hl/stock/browser-harness-js/sdk/session.ts create mode 100644 app/src/main/hl/stock/interaction-skills/connection.md create mode 100644 app/src/main/hl/stock/interaction-skills/cookies.md create mode 100644 app/src/main/hl/stock/interaction-skills/cross-origin-iframes.md create mode 100644 app/src/main/hl/stock/interaction-skills/dialogs.md create mode 100644 app/src/main/hl/stock/interaction-skills/downloads.md create mode 100644 
app/src/main/hl/stock/interaction-skills/drag-and-drop.md create mode 100644 app/src/main/hl/stock/interaction-skills/dropdowns.md create mode 100644 app/src/main/hl/stock/interaction-skills/iframes.md create mode 100644 app/src/main/hl/stock/interaction-skills/network-requests.md create mode 100644 app/src/main/hl/stock/interaction-skills/print-as-pdf.md create mode 100644 app/src/main/hl/stock/interaction-skills/screenshots.md create mode 100644 app/src/main/hl/stock/interaction-skills/scrolling.md create mode 100644 app/src/main/hl/stock/interaction-skills/shadow-dom.md create mode 100644 app/src/main/hl/stock/interaction-skills/tabs.md create mode 100644 app/src/main/hl/stock/interaction-skills/uploads.md create mode 100644 app/src/main/hl/stock/interaction-skills/viewport.md diff --git a/app/eslint.config.js b/app/eslint.config.js index ed4602cd..d8afb13d 100644 --- a/app/eslint.config.js +++ b/app/eslint.config.js @@ -17,6 +17,7 @@ module.exports = [ 'dist/**', 'node_modules/**', 'docker/agent/dist/**', + 'src/main/hl/stock/browser-harness-js/sdk/**', // JS files were never linted under the old --ext .ts,.tsx flag '**/*.js', '**/*.mjs', diff --git a/app/src/main/hl/stock/browser-harness-js/SKILL.md b/app/src/main/hl/stock/browser-harness-js/SKILL.md new file mode 100644 index 00000000..3da37b5c --- /dev/null +++ b/app/src/main/hl/stock/browser-harness-js/SKILL.md @@ -0,0 +1,248 @@ +--- +name: cdp +description: Drive Browser Use Desktop's assigned Chromium target via the DevTools Protocol from JavaScript. Run snippets through the bundled `browser-harness-js` CLI; it auto-spawns a long-lived Bun HTTP server holding a CDP `Session`, and every call executes against the same persistent connection. +--- + +# CDP — `browser-harness-js` skill + +Custom codegen'd CDP SDK (every method from browser_protocol.json + js_protocol.json gets a typed wrapper) plus a tiny HTTP server that holds one persistent CDP `Session`. 
The `browser-harness-js` CLI auto-starts the server on first use and forwards JS snippets to it. + +Browser Use Desktop bundles the runtime under `./browser-harness-js/sdk/` and puts that directory on PATH for you. Do not run `npx skills add` or create global symlinks from inside the desktop harness. + +## First use in Browser Use Desktop + +Connect to the app-assigned target before page-level calls: + +```bash +browser-harness-js 'await connectToAssignedTarget()' +``` + +`connectToAssignedTarget()` reads `BU_TARGET_ID` and `BU_CDP_PORT`, attaches the assigned target when possible, and enables the common Page/DOM/Runtime/Network domains. The CLI auto-installs `bun` on first run if it is missing. Set `BROWSER_HARNESS_SKIP_BUN_INSTALL=1` to opt out. + +## How to use + +Just run `browser-harness-js '<js>'`. The first call spawns the server in the background; subsequent calls hit the same process and so reuse the same `session`, the same WebSocket to Chrome, and any globals you set. + +```bash +browser-harness-js 'await connectToAssignedTarget()' +browser-harness-js 'await session.Page.navigate({url:"https://example.com"})' +browser-harness-js '(await session.Runtime.evaluate({expression:"document.title",returnByValue:true})).result.value' +``` + +Output is the **raw result content** — no `{ok,result}` envelope. + +| Result type | stdout | +|---|---| +| string | bare text, no JSON quotes (e.g. `Example Domain`) | +| number / boolean | `42`, `true` | +| object / array (non-empty) | compact JSON (e.g. `{"frameId":"..."}`, `[1,2,3]`) | +| `undefined` / `null` / `""` / `{}` / `[]` | empty (no output) | + +**Errors** go to **stderr**, exit code `1`. The CDP error message and JS stack are printed verbatim, e.g.: +``` +Error: CDP -32602: invalid params + at _call (.../session.ts:117:33) + ... +``` +Detect failure with `if browser-harness-js '...'; then ...; else handle_error; fi` or by checking `$?`.
+ +**Multi-line snippets via stdin (heredoc).** Important: a multi-statement snippet does NOT auto-return the last expression — write `return X` explicitly. Single-expression snippets passed as the first argument DO auto-return. + +```bash +browser-harness-js <<'EOF' +const tabs = await listPageTargets(); +globalThis.tid = tabs[0].targetId; +await session.use(globalThis.tid); +return globalThis.tid; +EOF +``` + +## CLI commands + +| Command | Behavior | +|---|---| +| `browser-harness-js '<js>'` | Auto-start server if needed, eval the JS, print result. | +| `browser-harness-js <<'EOF' … EOF` | Same, but the snippet is read from stdin. | +| `browser-harness-js --status` | Print the server's health JSON; exit `1` if it is down. | +| `browser-harness-js --start` | Explicit start (no-op if already running). | +| `browser-harness-js --stop` | Gracefully shut the server down. | +| `browser-harness-js --restart` | Stop + start fresh (drops session state). | +| `browser-harness-js --logs` | Tail the server log. | + +## API + +### Connecting + +`session.connect()` accepts: + +| Argument | When to use | +|---|---| +| (none) | Auto-detect a running Chromium-based browser via its `DevToolsActivePort` file. | +| `{ profileDir }` | You want one specific browser; reads `<profileDir>/DevToolsActivePort` directly. | +| `{ wsUrl }` | You already have `ws://…/devtools/browser/<id>` (e.g. piped from elsewhere). | + +```js +await session.connect({ profileDir: '/Users/<user>/Library/Application Support/Google/Chrome' }) +await session.connect({ wsUrl: 'ws://127.0.0.1:9222/devtools/browser/<id>' }) +``` + +Profile paths by OS — use these with `{ profileDir }`: +- macOS: `~/Library/Application Support/<browser>` (e.g. `Google/Chrome`, `Comet`, `BraveSoftware/Brave-Browser`, `Arc/User Data`) +- Linux: `~/.config/<browser>` (e.g. `google-chrome`, `chromium`, `BraveSoftware/Brave-Browser`) +- Windows: `%LOCALAPPDATA%\<browser>\User Data` (e.g. `Google\Chrome`, `Microsoft\Edge`, `BraveSoftware\Brave-Browser`) + +Per-candidate WS-open timeout defaults to **5s** — live browsers answer with open/close within ~100ms, so 5s is already generous. The only case where 5s is too short is when Chrome is showing the **Allow** popup and waiting on the user to click. If you expect that, pass `timeoutMs: 30000`: + +```js +await session.connect({ profileDir: '/Users/<user>/Library/Application Support/Google/Chrome', timeoutMs: 30_000 }) +``` + +**If you see `No detected browser accepted a connection`** — the browsers have `DevToolsActivePort` files but none are currently serving WS. Most common cause: remote-debugging is enabled but the user hasn't clicked **Allow** on the prompt yet. Tell them to click Allow, then retry (or bump `timeoutMs`).
+ +### Picking a target (tab) + +After `connect()`, call `session.use(targetId)` once; subsequent page-level calls (Page/DOM/Runtime/Network/etc.) auto-route to that target's sessionId. `Browser.*` and `Target.*` calls always hit the browser endpoint. + +```js +const tabs = await listPageTargets() // no args; uses the connected session +const sid = await session.use(tabs[0].targetId) +await session.Page.enable() +await session.Page.navigate({ url: 'https://example.com' }) +``` + +`listPageTargets()` uses CDP's `Target.getTargets` (not `/json`), so it works on Chrome 144+ too. It already filters out `chrome://` and `devtools://` URLs. Equivalent raw call: + +```js +const { targetInfos } = await session.Target.getTargets({}) +const tabs = targetInfos.filter(t => t.type === 'page' && !t.url.startsWith('chrome://') && !t.url.startsWith('devtools://')) +``` + +To switch tabs: `session.use(otherTargetId)`. To detach: `session.setActiveSession(undefined)`. + +### Events + +```js +// Subscribe (returns an unsubscribe fn) +const off = session.onEvent((method, params, sessionId) => { ... }) + +// Or wait for a single matching event with optional predicate + timeout +await session.Network.enable() +const ev = await session.waitFor( + 'Page.frameNavigated', + (p) => p.frame.url.includes('example.com'), + 10_000 +) +``` + +### Persisting state across calls + +Each snippet runs inside its own async wrapper, so its `let`/`const` declarations vanish when it returns. To carry data forward, attach to `globalThis`: + +```bash +browser-harness-js '(await listPageTargets()).forEach((t,i)=>globalThis["tab"+i]=t.targetId)' +browser-harness-js 'await session.use(globalThis.tab0)' +browser-harness-js 'await session.Page.navigate({url:"https://example.com"})' +``` + +`session` itself, the active sessionId, and event subscribers are already preserved by the server — globals are only needed for ad-hoc data. 
+ +## Connecting to a running Chrome (chrome://inspect flow) + +When attaching to the user's already-running browser: + +1. **Try `await session.connect()` first** (no args) — auto-detect handles every Chromium-based browser via `DevToolsActivePort`. If it returns, you're done. +2. **If auto-detect fails** with `No running browser with remote debugging detected`, the user needs to turn it on. Open the inspect page: + ```bash + # macOS — prefer AppleScript over `open -a` (reuses current profile, avoids the profile picker) + osascript -e 'open location "chrome://inspect/#remote-debugging"' + + # Linux + google-chrome 'chrome://inspect/#remote-debugging' # or: chromium, google-chrome-stable + + # Windows (PowerShell) + Start-Process chrome 'chrome://inspect/#remote-debugging' + ``` + Only macOS's AppleScript path avoids the profile picker; Linux/Windows may prompt the user to pick a profile first. +3. **Tick "Discover network targets"** in chrome://inspect, then click **Allow** when Chrome prompts. +4. **If auto-detect picks the wrong browser** (multiple running, you want a specific one): list them with `await detectBrowsers()`, then `await session.connect({ profileDir: '<profileDir>' })`. +5. **If `session.connect()` fails with `No detected browser accepted a connection`**, the user has remote-debugging on but hasn't clicked **Allow** yet. Tell them to click it and retry, or pass `timeoutMs: 30000` to wait for the click. + +## Working with targets (tabs) + +- **Filter Chrome internals.** `listPageTargets()` already drops `chrome://` and `devtools://` URLs. If you call `Target.getTargets()` directly, filter manually. +- **CDP target order ≠ visible tab-strip order.** When the user says "the first tab I can see", use a screenshot or page title to identify it — `Target.activateTarget` only switches to a known targetId. + +## Looking up a method + +The full typed surface is in `./browser-harness-js/sdk/generated.ts` (~655 KB, only loaded if you read it).
Each method has its CDP description as a JSDoc comment plus typed `*Params` / `*Return` interfaces in per-domain namespaces. + +```bash +grep -n "navigate" ./browser-harness-js/sdk/generated.ts | head +``` + +## Regenerating the SDK + +This is a maintenance-only workflow, not a normal task step. Browser Use +Desktop already bundles the generated SDK. Do not regenerate or patch it during +ordinary browser tasks unless the user explicitly asks, or a confirmed bundled +runtime defect blocks the task. + +When the upstream protocol JSONs change, replace `sdk/browser_protocol.json` and/or `sdk/js_protocol.json` and re-run: + +```bash +cd ./browser-harness-js/sdk && bun gen.ts +browser-harness-js --restart # pick up the new bindings +``` + +## Files + +All paths are relative to `./browser-harness-js/` (the bundled skill directory — see top of this doc). + +- `sdk/browser-harness-js` — the CLI (Browser Use Desktop puts `sdk/` on PATH; standalone installs symlink it from `/usr/local/bin/browser-harness-js`) +- `sdk/repl.ts` — HTTP server (`Bun.serve` on `127.0.0.1:9876`) +- `sdk/session.ts` — `Session` class (transport, connect, target routing, events) +- `sdk/generated.ts` — codegen output: every CDP method as a typed wrapper +- `sdk/gen.ts` — codegen script +- `sdk/{browser,js}_protocol.json` — upstream protocol (vendored) diff --git a/app/src/main/hl/stock/browser-harness-js/sdk/browser-harness-js b/app/src/main/hl/stock/browser-harness-js/sdk/browser-harness-js new file mode 100755 index 00000000..778dcc51 --- /dev/null +++ b/app/src/main/hl/stock/browser-harness-js/sdk/browser-harness-js @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# browser-harness-js — eval JS in the persistent CDP REPL. Auto-starts the REPL on first use. +# +# Usage: +# browser-harness-js 'await session.connect({port:9222})' +# browser-harness-js 'await session.Page.navigate({url:"https://example.com"})' +# browser-harness-js <<'EOF' +# const t = await listPageTargets("localhost", 9222); +# globalThis.tid = t[0].targetId; +# await session.use(globalThis.tid); +# globalThis.tid +# EOF +# +# browser-harness-js --status # is the REPL running?
prints health JSON +# browser-harness-js --stop # gracefully shut it down +# browser-harness-js --logs # tail the REPL log +# browser-harness-js --restart # stop + start fresh (drops session state) +# browser-harness-js --start # explicit start (no-op if already running) + +set -euo pipefail + +PORT="${CDP_REPL_PORT:-9876}" +HOST="127.0.0.1" +URL="http://$HOST:$PORT" + +# Resolve repl.ts alongside this script, following symlinks (e.g. /usr/local/bin/browser-harness-js → /sdk/browser-harness-js). +SCRIPT_PATH="${BASH_SOURCE[0]}" +while [ -L "$SCRIPT_PATH" ]; do + SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)" + SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" + [[ "$SCRIPT_PATH" != /* ]] && SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH" +done +REPL="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)/repl.ts" + +LOG="${CDP_REPL_LOG:-/tmp/browser-harness-js.log}" + +# Bootstrap bun if missing — the REPL server is Bun-native. +ensure_bun() { + if command -v bun >/dev/null 2>&1; then return 0; fi + # Handle fresh install: bun's install script drops the binary here but + # PATH isn't updated until the next login shell. + if [ -x "$HOME/.bun/bin/bun" ]; then + export PATH="$HOME/.bun/bin:$PATH" + return 0 + fi + if [ -n "${BROWSER_HARNESS_SKIP_BUN_INSTALL:-}" ]; then + echo "browser-harness-js: bun not found and BROWSER_HARNESS_SKIP_BUN_INSTALL is set." >&2 + echo " Install manually: curl -fsSL https://bun.sh/install | bash" >&2 + return 1 + fi + echo "browser-harness-js: installing bun (one-time, from https://bun.sh/install)..." >&2 + if ! curl -fsSL https://bun.sh/install | bash >&2; then + echo "browser-harness-js: bun install failed. Install manually from https://bun.sh, or set BROWSER_HARNESS_SKIP_BUN_INSTALL=1 to suppress this prompt." >&2 + return 1 + fi + export PATH="$HOME/.bun/bin:$PATH" + command -v bun >/dev/null 2>&1 || { + echo "browser-harness-js: bun installed but not found at \$HOME/.bun/bin/bun." 
>&2 + return 1 + } +} + +is_up() { + curl -fsS --max-time 1 "$URL/health" >/dev/null 2>&1 +} + +start_repl() { + is_up && return 0 + ensure_bun || return 1 + CDP_REPL_PORT="$PORT" nohup bun "$REPL" >"$LOG" 2>&1 & + for _ in $(seq 1 100); do + sleep 0.1 + is_up && return 0 + done + echo "browser-harness-js: REPL failed to start on $URL (see $LOG)" >&2 + return 1 +} + +post_eval() { + # Capture body + status separately. Body goes to stdout (only if non-empty) + # on 200; otherwise to stderr with non-zero exit. + local out status body + out=$(curl -sS -w '\n___STATUS___%{http_code}' --data-binary "$1" "$URL/eval") + status="${out##*___STATUS___}" + body="${out%$'\n'___STATUS___*}" + if [ "$status" = "200" ]; then + [ -n "$body" ] && printf '%s\n' "$body" + return 0 + else + [ -n "$body" ] && printf '%s\n' "$body" >&2 + return 1 + fi +} + +case "${1:-}" in + --status) + if is_up; then + curl -sS "$URL/health"; echo + else + echo '{"ok":false,"error":"down"}' + exit 1 + fi + ;; + --start) + start_repl + curl -sS "$URL/health"; echo + ;; + --stop) + if is_up; then + curl -s -X POST "$URL/quit" >/dev/null || true + echo '{"ok":true,"stopped":true}' + else + echo '{"ok":true,"stopped":false,"note":"already down"}' + fi + ;; + --restart) + is_up && curl -s -X POST "$URL/quit" >/dev/null 2>&1 || true + sleep 0.2 + start_repl + curl -sS "$URL/health"; echo + ;; + --logs) + exec tail -f "$LOG" + ;; + --help|-h) + sed -n '2,/^set -euo/p' "$0" | sed 's/^#//; s/^ //; /^set -euo/d' + ;; + "") + start_repl + code="$(cat)" + post_eval "$code" + ;; + *) + start_repl + post_eval "$1" + ;; +esac diff --git a/app/src/main/hl/stock/browser-harness-js/sdk/generated.ts b/app/src/main/hl/stock/browser-harness-js/sdk/generated.ts new file mode 100644 index 00000000..30f31c18 --- /dev/null +++ b/app/src/main/hl/stock/browser-harness-js/sdk/generated.ts @@ -0,0 +1,15160 @@ +/* eslint-disable */ +// AUTO-GENERATED by gen.ts. Do not edit by hand. 
+// Run `bun gen.ts` to regenerate from browser_protocol.json + js_protocol.json. + +export interface Transport { + _call(method: string, params?: unknown): Promise; +} + + +export namespace Accessibility { + + /** Unique accessibility node identifier. */ + export type AXNodeId = string; + + /** Enum of possible property types. */ + export type AXValueType = "boolean" | "tristate" | "booleanOrUndefined" | "idref" | "idrefList" | "integer" | "node" | "nodeList" | "number" | "string" | "computedString" | "token" | "tokenList" | "domRelation" | "role" | "internalRole" | "valueUndefined"; + + /** Enum of possible property sources. */ + export type AXValueSourceType = "attribute" | "implicit" | "style" | "contents" | "placeholder" | "relatedElement"; + + /** Enum of possible native property sources (as a subtype of a particular AXValueSourceType). */ + export type AXValueNativeSourceType = "description" | "figcaption" | "label" | "labelfor" | "labelwrapped" | "legend" | "rubyannotation" | "tablecaption" | "title" | "other"; + + /** A single source for a computed AX property. */ + export interface AXValueSource { + /** What type of source this is. */ + type: Accessibility.AXValueSourceType; + /** The value of this property source. */ + value?: Accessibility.AXValue; + /** The name of the relevant attribute, if any. */ + attribute?: string; + /** The value of the relevant attribute, if any. */ + attributeValue?: Accessibility.AXValue; + /** Whether this source is superseded by a higher priority source. */ + superseded?: boolean; + /** The native markup source for this value, e.g. a `