From 0c4d24b9986b09f5de85d3f0e49673c77d3b2235 Mon Sep 17 00:00:00 2001 From: bcode Date: Fri, 8 May 2026 19:29:03 +0000 Subject: [PATCH] Option A: cloud browser via snippet, drop browser_open_cloud tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The local-browser connect path is already snippet-side (the agent calls `await session.connect(...)` itself in a snippet, picks Way 1 / Way 2). `browser_open_cloud` was the only opaque path — it wrapped one HTTP call and hid the Browser Use API surface from the agent. That asymmetry was the root of the cubic P2 lifecycle finding (auto-stop wired but never invoked) and the broader "two patterns mixed" review feedback on PR #41. This commit makes cloud match local. The agent provisions, connects, stops, and swaps cloud browsers from inside `browser_execute` snippets using `fetch` against `https://api.browser-use.com/api/v3/browsers`. Removed: - packages/bcode-browser/src/cloud-browser.ts (-109) - packages/opencode/src/tool/browser-open-cloud.{ts,txt} (-47) - registry hookup for BrowserOpenCloudTool - SessionStore.onEvict + cleanup-callback machinery (~ -25 LOC, including the Entry wrapper type). evict() now just closes + deletes; kept because the test file uses it for cleanup between cases. Added: - packages/bcode-browser/skills/cloud-browser.md (~150 LOC). Way 3 documentation: provision/connect/stop/swap, plus a recommended reusable workspace helper (.bcode/agent-workspace/cloud.ts) for projects that use cloud browsers more than once. BU API auth via process.env.BROWSER_USE_API_KEY (stays as an env var; opencode's auth.json was considered and rejected — see decisions.md §3.10). Updated: - BROWSER.md restructured around Way 1 / Way 2 / Way 3 (real Chrome popup-gated / isolated debug-port / cloud). New "Switching browsers mid-session" section. Listability hint added (`read {{SKILLS_DIR}}/interaction-skills/` to enumerate without committing). 
Removed the misleading "first call connects automatically" line — agent always calls connect() explicitly, just sometimes with no args. - browser-execute.txt mentions cloud-browser.md and notes process.env is in snippet scope. - bcode-browser README + package.json + index.ts: cloud-browser.ts references removed; brief note that cloud is intentionally not its own Level-1 surface. - comments in browser-execute.ts: no more `browser_open_cloud` cross-references. Net LOC: -200 deletions vs +150 markdown additions = ~-50 net hand-written. Bigger win is one fewer tool surface (back to the original Phase H §3.2 single-tool target) and zero wrapper-PR treadmill as the BU API surface grows (profile sync, custom proxies, regional pools, recording, etc. all reachable from a snippet without bcode changes). Verification: - bun typecheck clean across all browsercode packages. - bun test from packages/bcode-browser/: 4 pass + 5 skip (chrome-gated). - embed-skills.ts now embeds 18 files (was 17, +1 for cloud-browser.md). Cubic P2 finding closed: no tool claims cloud auto-stop, so there is no false claim. Stop is the agent's responsibility, documented in cloud-browser.md "Stop". Stacked on feat/phase-h-ts-harness; PR-A (#41) lands first, then this rebases onto main and lands separately. v0.1.0 tag waits for both. 
--- packages/bcode-browser/README.md | 5 +- packages/bcode-browser/package.json | 2 +- packages/bcode-browser/skills/BROWSER.md | 68 ++++++-- .../bcode-browser/skills/cloud-browser.md | 145 ++++++++++++++++++ packages/bcode-browser/src/browser-execute.ts | 10 +- packages/bcode-browser/src/cloud-browser.ts | 109 ------------- packages/bcode-browser/src/index.ts | 9 +- packages/bcode-browser/src/session-store.ts | 47 ++---- .../opencode/src/tool/browser-execute.txt | 10 +- .../opencode/src/tool/browser-open-cloud.ts | 35 ----- .../opencode/src/tool/browser-open-cloud.txt | 12 -- packages/opencode/src/tool/registry.ts | 4 - 12 files changed, 239 insertions(+), 217 deletions(-) create mode 100644 packages/bcode-browser/skills/cloud-browser.md delete mode 100644 packages/bcode-browser/src/cloud-browser.ts delete mode 100644 packages/opencode/src/tool/browser-open-cloud.ts delete mode 100644 packages/opencode/src/tool/browser-open-cloud.txt diff --git a/packages/bcode-browser/README.md b/packages/bcode-browser/README.md index dbb4a92d0..e79eecdf0 100644 --- a/packages/bcode-browser/README.md +++ b/packages/bcode-browser/README.md @@ -10,10 +10,9 @@ See `decisions.md §1c` (three-level model) and `§1d` (this package) in the Bro |---|---| | `src/cdp/` | Vendored CDP layer (`session.ts`, `gen.ts`, `generated.ts`, protocol JSONs). Initial copy from `browser-use/browser-harness-js`; ours after — see `src/cdp/PROVENANCE.md`. | | `src/browser-execute.ts` | In-process JS-eval `browser_execute` body. | -| `src/cloud-browser.ts` | Browser Use cloud-browser provision + attach. | -| `src/session-store.ts` | Per-opencode-session CDP `Session` map shared by both browser tools. | +| `src/session-store.ts` | Per-opencode-session CDP `Session` map. The agent calls `session.connect(...)` from a snippet; subsequent snippets find the same Session. | | `src/skills.ts` | Runtime resolver for embedded skills (extract on first call in compiled mode; in-tree path in dev). 
| -| `skills/` | `BROWSER.md` (the agent's prompt for `browser_execute`) plus `interaction-skills/*.md` (UI mechanic reference docs). Embedded into the binary by `script/embed-skills.ts`. | +| `skills/` | `BROWSER.md` (the agent's prompt for `browser_execute`), `cloud-browser.md` (Way 3 — provision/stop a Browser Use cloud browser via raw HTTP from inside a snippet), and `interaction-skills/*.md` (UI mechanic reference docs). Embedded into the binary by `script/embed-skills.ts`. | | `script/embed-skills.ts` | Build-time embed; emits `bcode-skills.gen.ts` consumed by the compiled binary. | | `test/` | `bun test` smoke coverage for the workspace dynamic-import pattern. | diff --git a/packages/bcode-browser/package.json b/packages/bcode-browser/package.json index a268236c6..67d80eefc 100644 --- a/packages/bcode-browser/package.json +++ b/packages/bcode-browser/package.json @@ -2,7 +2,7 @@ "$schema": "https://json.schemastore.org/package.json", "version": "0.0.0", "name": "@browser-use/bcode-browser", - "description": "BrowserCode Level-1 code: in-process CDP harness, browser_execute, cloud-browser attach, embedded skills", + "description": "BrowserCode Level-1 code: in-process CDP harness, browser_execute, embedded skills", "type": "module", "license": "MIT", "private": true, diff --git a/packages/bcode-browser/skills/BROWSER.md b/packages/bcode-browser/skills/BROWSER.md index 46e024116..76b87dfe7 100644 --- a/packages/bcode-browser/skills/BROWSER.md +++ b/packages/bcode-browser/skills/BROWSER.md @@ -1,28 +1,62 @@ # BROWSER.md — driving a real browser with `browser_execute` -Use the `browser_execute` tool to run JavaScript against a connected browser via the Chrome DevTools Protocol. The snippet runs in-process; `session` is bound to a long-lived CDP `Session` that survives across calls within the same bcode session. +Use the `browser_execute` tool to run JavaScript against a connected browser via the Chrome DevTools Protocol. 
The snippet runs in-process; `session` is bound to a long-lived CDP `Session` that persists across calls within the same bcode session. You connect once, drive many. **Locations:** - Workspace (read/write your reusable scripts): `<project-root>/.bcode/agent-workspace/`. The bcode CLI runs from the project root, so `./.bcode/agent-workspace/foo.ts` works directly with the `read`/`write`/`edit` tools. -- Skills (read-only reference docs): `{{SKILLS_DIR}}/interaction-skills/` +- Skills (read-only reference docs): `{{SKILLS_DIR}}/`. Run `read {{SKILLS_DIR}}/interaction-skills/` to list every available interaction skill before reading any one of them. ## The model in one paragraph -`browser_execute` evaluates whatever JS you write against `session`. There is no auto-loaded library, no privileged file, no helper namespace — just `session` and standard JS globals. To reuse code from a previous snippet, save it as a `.ts` file under `./.bcode/agent-workspace/` (using the `write` tool) and `await import("/abs/path?t=" + Date.now())` it from a later snippet. The import takes an **absolute** path — construct it from `process.cwd()` inside the snippet, or shell out via the `bash` tool to get the project root. Same mechanism for a 5-line wrapper and a 500-line script. Skills under `{{SKILLS_DIR}}/interaction-skills/` are documentation you `read`, not modules you `import` — they teach you the CDP patterns; you write the code. +`browser_execute` evaluates whatever JS you write against `session`. There is no auto-loaded library, no privileged file, no helper namespace — just `session` and standard JS globals. To reuse code from a previous snippet, save it as a `.ts` file under `./.bcode/agent-workspace/` (using the `write` tool) and `await import("/abs/path?t=" + Date.now())` it from a later snippet. The import takes an **absolute** path — construct it from `process.cwd()` inside the snippet. Same mechanism for a 5-line wrapper and a 500-line script. 
Skills under `{{SKILLS_DIR}}/` are documentation you `read`, not modules you `import` — they teach you the CDP patterns; you write the code. ## Connecting -The first `browser_execute` call connects automatically by scanning OS-typical Chrome profile dirs for a `DevToolsActivePort` file (Chrome must be running with `--remote-debugging-port`). To attach explicitly: +You always call `session.connect(...)` once at the start of your work. The `Session` is fresh on the first `browser_execute` call of an opencode session; subsequent calls reuse it. Three connection methods, in order of preference for typical tasks: + +**Way 1 — connect to the user's running Chrome (real profile, popup-gated).** Best when the task involves the user's actual logged-in sites. + +```js +// Auto-detect the most-recently-launched Chrome with remote debugging enabled. +await session.connect() +``` + +The user must have ticked "Allow remote debugging for this browser instance" once at `chrome://inspect/#remote-debugging` (sticky per-profile), and on Chrome 144+ click "Allow" on the in-browser popup at first attach. If `connect()` fails with a 403/permission message, ask the user to do this. To wait for the click instead of erroring fast, pass `{ profileDir: "/abs/path", timeoutMs: 30000 }`. + +**Way 2 — connect to a Chrome you (or the user) launched with a debug port (isolated profile, no popups).** Best for unattended automation. 
+ +```bash +# User runs this once (or you run it via the `bash` tool): +google-chrome --remote-debugging-port=9222 --user-data-dir=/tmp/bcode-chrome +``` + +```js +await session.connect({ wsUrl: "ws://127.0.0.1:9222/devtools/browser" }) +// or, if you know the profile dir: +await session.connect({ profileDir: "/tmp/bcode-chrome" }) +``` + +The `--user-data-dir` must NOT be Chrome's platform default (`%LOCALAPPDATA%\Google\Chrome\User Data` on Windows, `~/Library/Application Support/Google/Chrome` on macOS, `~/.config/google-chrome` on Linux) — Chrome 136+ silently no-ops the port flag in that case. + +**Way 3 — provision and connect to a Browser Use cloud browser.** Best when the user can't see the browser, you need a clean profile, geo-located proxy, or fingerprint isolation. Read `{{SKILLS_DIR}}/cloud-browser.md` for the full pattern (provision, stop, swap profile/proxy). Briefly: ```js -await session.connect({ profileDir: "/abs/path/to/Chrome/Default" }) -// or -await session.connect({ wsUrl: "ws://127.0.0.1:9222/devtools/browser/" }) -// or for a Browser Use cloud browser, call the `browser_open_cloud` tool first. +const r = await fetch("https://api.browser-use.com/api/v3/browsers", { + method: "POST", + headers: { "X-Browser-Use-API-Key": process.env.BROWSER_USE_API_KEY, "Content-Type": "application/json" }, + body: "{}", +}) +const { id, cdp_url, live_url } = await r.json() +await session.connect({ wsUrl: cdp_url }) +console.log("liveUrl for the user to watch:", live_url) ``` -After connect, attach to a page target: +Requires `BROWSER_USE_API_KEY` in the environment (the user should have set this before launching bcode). If absent, tell the user to get a key at https://browser-use.com and `export BROWSER_USE_API_KEY=...`. 
+ +## Attaching to a target + +After `connect()`, attach to a page target before driving the browser: ```js const targets = (await session.Target.getTargets({})).targetInfos @@ -65,7 +99,18 @@ const { data } = await session.Page.captureScreenshot({ format: "png" }) // data is base64; write with the `write` tool or process in JS. ``` -For the full menu of UI mechanics — dropdowns, dialogs, iframes, shadow DOM, uploads, scrolling, screenshots-with-highlights — read the relevant skill: `{{SKILLS_DIR}}/interaction-skills/.md`. +For the full menu of UI mechanics — dropdowns, dialogs, iframes, shadow DOM, uploads, scrolling, screenshots-with-highlights — list `{{SKILLS_DIR}}/interaction-skills/` to see all available topics, then read the relevant one. + +## Switching browsers mid-session + +You own the connection. To swap: + +```js +await session.close() +await session.connect({ /* new opts */ }) +``` + +Cloud cleanup is your responsibility — if you're done with a cloud browser, stop it explicitly (see `{{SKILLS_DIR}}/cloud-browser.md` for the PATCH call). Otherwise it persists until your API quota or BU's idle timer reclaims it. ## Reusing code: write to the workspace, import from snippet @@ -110,4 +155,5 @@ Cache-bust (`?t=${Date.now()}`) is your responsibility: without it, edits to the - **`session.Page.navigate` hangs forever** → the page is showing a native dialog. Use `session.Page.handleJavaScriptDialog({ accept: true })` to dismiss. - **Selectors don't find elements that you can see** → likely an iframe or shadow DOM. Read `{{SKILLS_DIR}}/interaction-skills/iframes.md` or `shadow-dom.md`. - **Actions silently no-op** → the page is mid-load. After `Page.navigate`, await `session.waitFor("Page.loadEventFired")` before driving inputs. -- **Connection refused or 403 on connect()** → Chrome wasn't started with `--remote-debugging-port`, or the user hasn't clicked "Allow" on the remote-debugging prompt. Pass `{ timeoutMs: 30000 }` to wait for the click. 
+- **Connection refused or 403 on connect()** → Chrome wasn't started with `--remote-debugging-port`, or the user hasn't clicked "Allow" on the remote-debugging prompt. Pass `{ profileDir, timeoutMs: 30000 }` to wait for the click, or fall back to Way 2. +- **Cloud `connect()` fails after a successful provision** → check that `cdp_url` came back in the POST response; some BU regions return `cdpUrl` (camelCase) — accept both. See `{{SKILLS_DIR}}/cloud-browser.md`. diff --git a/packages/bcode-browser/skills/cloud-browser.md b/packages/bcode-browser/skills/cloud-browser.md new file mode 100644 index 000000000..6b1806f50 --- /dev/null +++ b/packages/bcode-browser/skills/cloud-browser.md @@ -0,0 +1,145 @@ +# cloud-browser.md — Browser Use cloud browser via raw HTTP + +When BROWSER.md sent you here, the user wants a Browser Use cloud browser (Way 3): a clean isolated Chrome on BU's infrastructure, optionally with a geo-located proxy or a synced profile, with a `liveUrl` the user can open to watch you work. + +There is no `browser_open_cloud` tool. You write the HTTP calls yourself in a `browser_execute` snippet. This keeps the connection model symmetric (you also call `session.connect()` for local browsers in Way 1 and Way 2) and gives you full control over the BU API surface — provision, stop, swap profiles, change proxies, anything BU exposes. + +## Authentication + +Every call to `https://api.browser-use.com/...` requires an API key in the `X-Browser-Use-API-Key` header. The key lives in the environment as `BROWSER_USE_API_KEY` (the user is expected to `export` it before launching bcode, the same way they'd set `AWS_BEDROCK_ACCESS_KEY_ID` for an LLM provider). + +Read it once, fail clearly if missing: + +```js +const apiKey = process.env.BROWSER_USE_API_KEY +if (!apiKey) { + throw new Error("BROWSER_USE_API_KEY is not set. 
Get a key at https://browser-use.com and re-launch bcode with the key exported.") +} +``` + +## Provision + +```js +const r = await fetch("https://api.browser-use.com/api/v3/browsers", { + method: "POST", + headers: { "X-Browser-Use-API-Key": apiKey, "Content-Type": "application/json" }, + body: JSON.stringify({ + // All optional — omit for an ephemeral fresh-profile browser with no proxy. + // profile_id: "<profile-id>", // attach an existing BU profile + // proxy_country_code: "us", // geo-located proxy + }), +}) +if (!r.ok) throw new Error(`provision failed: ${r.status} ${await r.text()}`) +const body = await r.json() +// Some BU regions return camelCase, others snake_case. Accept both. +const id = body.id +const cdpUrl = body.cdp_url ?? body.cdpUrl +const liveUrl = body.live_url ?? body.liveUrl +``` + +The `liveUrl` is a viewer URL the user can open in their own browser to watch the cloud browser's pixels. **Print it to console** so the user can click it: + +```js +console.log("Cloud browser ready. Live view:", liveUrl) +``` + +Stash `id` somewhere (a `globalThis.cloudBrowserId = id` is fine, or the snippet's return value) — you need it to stop the browser later. + +## Connect + +```js +await session.connect({ wsUrl: cdpUrl }) +const targets = (await session.Target.getTargets({})).targetInfos +const page = targets.find(t => t.type === "page") +await session.use(page.targetId) +``` + +From here on `session.<Domain>.<method>(...)` drives the cloud browser exactly like a local Chrome. + +## Stop + +When you're done, stop the browser. 
BU's quotas and idle reclaim will eventually clean it up if you forget, but explicit stop is faster and frees the slot: + +```js +await fetch(`https://api.browser-use.com/api/v3/browsers/${id}`, { + method: "PATCH", + headers: { "X-Browser-Use-API-Key": apiKey, "Content-Type": "application/json" }, + body: JSON.stringify({ state: "stop" }), +}) +``` + +If you'll do this often within one project, save it as `./.bcode/agent-workspace/cloud.ts` (see BROWSER.md "Reusing code") and import it from later snippets. + +## Swap + +To switch from one cloud browser to another (e.g. different proxy country) within the same opencode session: + +```js +// Stop the old one first. +await fetch(`https://api.browser-use.com/api/v3/browsers/${oldId}`, { + method: "PATCH", + headers: { "X-Browser-Use-API-Key": apiKey, "Content-Type": "application/json" }, + body: JSON.stringify({ state: "stop" }), +}) + +// Close the local Session's WS so connect() opens a fresh one. +await session.close() + +// Provision and connect to the new one (provision block above, with new params). +``` + +## A reusable workspace helper + +Recommended pattern for any project that uses cloud browsers more than once: + +```ts +// ./.bcode/agent-workspace/cloud.ts +const API = "https://api.browser-use.com/api/v3/browsers" +const key = () => { + const k = process.env.BROWSER_USE_API_KEY + if (!k) throw new Error("BROWSER_USE_API_KEY is not set.") + return k +} + +export async function provision(opts: { profileId?: string; proxyCountryCode?: string } = {}) { + const r = await fetch(API, { + method: "POST", + headers: { "X-Browser-Use-API-Key": key(), "Content-Type": "application/json" }, + body: JSON.stringify({ + profile_id: opts.profileId, + proxy_country_code: opts.proxyCountryCode, + }), + }) + if (!r.ok) throw new Error(`provision failed: ${r.status} ${await r.text()}`) + const body = await r.json() + return { + id: body.id as string, + cdpUrl: (body.cdp_url ?? 
body.cdpUrl) as string, + liveUrl: (body.live_url ?? body.liveUrl) as string, + } +} + +export async function stop(id: string) { + const r = await fetch(`${API}/${id}`, { + method: "PATCH", + headers: { "X-Browser-Use-API-Key": key(), "Content-Type": "application/json" }, + body: JSON.stringify({ state: "stop" }), + }) + if (!r.ok) throw new Error(`stop failed: ${r.status} ${await r.text()}`) +} +``` + +Then any snippet does: + +```js +const { provision, stop } = await import(`${process.cwd()}/.bcode/agent-workspace/cloud.ts?t=${Date.now()}`) +const { id, cdpUrl, liveUrl } = await provision({ proxyCountryCode: "us" }) +console.log("Live view:", liveUrl) +await session.connect({ wsUrl: cdpUrl }) +// ... do work ... +await stop(id) +``` + +## Other BU API endpoints + +The full BU cloud API (profile sync, profile list, custom proxies, recording on/off, etc.) is documented at https://browser-use.com — `read` the docs and write the matching `fetch` call. Anything BU's API exposes is reachable from a snippet without bcode-side wrapper code. diff --git a/packages/bcode-browser/src/browser-execute.ts b/packages/bcode-browser/src/browser-execute.ts index 63694e85b..458e82713 100644 --- a/packages/bcode-browser/src/browser-execute.ts +++ b/packages/bcode-browser/src/browser-execute.ts @@ -55,8 +55,9 @@ export type Parameters = Schema.Schema.Type export interface ExecuteContext { // Identifies the per-opencode-session CDP Session to bind into the snippet. - // Shared with `browser_open_cloud` via the SessionStore so a cloud-attach - // call's Session is driven by subsequent `browser_execute` calls. + // The same Session is reused across calls — the agent calls + // `session.connect(...)` in one snippet and subsequent snippets find the + // already-connected Session. readonly sessionID: string // Per-project workspace dir: /.bcode/agent-workspace/. Created // on first call. 
The agent reads/writes/edits .ts files here via the @@ -97,8 +98,9 @@ const serialize = (v: unknown): string => { } // Snippet executor. The CDP Session is resolved per-call from `SessionStore` -// keyed on `ctx.sessionID` so a Session attached via `browser_open_cloud` is -// the same one a follow-up `browser_execute` drives. +// keyed on `ctx.sessionID`. The agent connects with `await session.connect(...)` +// in one snippet (Way 1 / Way 2 / Way 3 in BROWSER.md); the Session persists +// for follow-up snippets in the same opencode session. // // `dataDir` is opencode's XDG_DATA_HOME for bcode (~/.local/share/bcode/ on // Linux/Mac). Compiled-mode skills are extracted to `/skills/` once diff --git a/packages/bcode-browser/src/cloud-browser.ts b/packages/bcode-browser/src/cloud-browser.ts deleted file mode 100644 index e0fcfc024..000000000 --- a/packages/bcode-browser/src/cloud-browser.ts +++ /dev/null @@ -1,109 +0,0 @@ -// Cloud-browser attach (decisions §3.3 / §6 — single API key, BU cloud -// surfaces). Provisions a Browser Use cloud browser via the public -// /api/v3/browsers REST surface and connects a `Session` to its `cdpUrl`. -// -// Three calls per attach: -// 1. POST /api/v3/browsers → { id, cdpUrl, liveUrl } -// 2. (caller) session.connect({ wsUrl: cdpUrl }) -// 3. PATCH /api/v3/browsers/ → { state: "stop" } (finalizer) -// -// `BROWSER_USE_API_KEY` must be set; we fail fast if absent so the Level-2 -// wrapper can render a one-line error pointing at our docs without -// constructing a bad request. - -import { Effect, Schema } from "effect" -import { SessionStore } from "./session-store" - -const API_BASE = "https://api.browser-use.com/api/v3/browsers" - -export const provisionParameters = Schema.Struct({ - profileId: Schema.optional(Schema.String).annotate({ - description: "Existing BU cloud profile id to attach to. 
Omit for a fresh ephemeral profile.", - }), - proxyCountryCode: Schema.optional(Schema.String).annotate({ - description: "ISO-2 country code for the proxy pool (e.g. \"us\", \"de\").", - }), -}) - -export type ProvisionParameters = Schema.Schema.Type - -export interface ProvisionResult { - readonly id: string - readonly cdpUrl: string - readonly liveUrl: string -} - -const apiKey = () => { - const k = process.env.BROWSER_USE_API_KEY - if (!k) { - throw new Error( - "BROWSER_USE_API_KEY is not set. Cloud browsers require a Browser Use API key — get one at https://browser-use.com.", - ) - } - return k -} - -const provision = (args: ProvisionParameters) => - Effect.tryPromise({ - try: async () => { - const res = await fetch(API_BASE, { - method: "POST", - headers: { - "X-Browser-Use-API-Key": apiKey(), - "Content-Type": "application/json", - }, - body: JSON.stringify({ - profile_id: args.profileId, - proxy_country_code: args.proxyCountryCode, - }), - }) - if (!res.ok) throw new Error(`provision failed: ${res.status} ${await res.text()}`) - const body = (await res.json()) as { id: string; cdp_url?: string; cdpUrl?: string; live_url?: string; liveUrl?: string } - const cdpUrl = body.cdpUrl ?? body.cdp_url - const liveUrl = body.liveUrl ?? body.live_url - if (!cdpUrl || !liveUrl) throw new Error(`provision response missing cdpUrl/liveUrl: ${JSON.stringify(body)}`) - return { id: body.id, cdpUrl, liveUrl } satisfies ProvisionResult - }, - catch: (err) => (err instanceof Error ? err : new Error(String(err))), - }) - -const stop = (id: string) => - Effect.tryPromise({ - try: async () => { - const res = await fetch(`${API_BASE}/${id}`, { - method: "PATCH", - headers: { - "X-Browser-Use-API-Key": apiKey(), - "Content-Type": "application/json", - }, - body: JSON.stringify({ state: "stop" }), - }) - if (!res.ok) throw new Error(`stop failed: ${res.status} ${await res.text()}`) - }, - catch: (err) => (err instanceof Error ? 
err : new Error(String(err))), - }) - -// Provisions a cloud browser, connects the per-opencode-session `Session` to -// it, and registers a stop-callback with `SessionStore` so the browser is -// torn down when the session is evicted (or, in practice, at process exit -// since opencode doesn't currently call evict — that's a known gap matching -// today's `uv run` subprocess shape, where a stuck Python interpreter also -// outlives the bcode session). The Session is shared with `browser_execute` -// via `SessionStore`. Returns the public bits the agent needs. -export const open = Effect.fn("CloudBrowser.open")(function* ( - sessionID: string, - args: ProvisionParameters, -) { - const { id, cdpUrl, liveUrl } = yield* provision(args) - const session = SessionStore.get(sessionID) - SessionStore.onEvict(sessionID, () => - Effect.runPromise(stop(id).pipe(Effect.ignore)), - ) - yield* Effect.tryPromise({ - try: () => session.connect({ wsUrl: cdpUrl }), - catch: (err) => (err instanceof Error ? err : new Error(String(err))), - }) - return { id, liveUrl } as const -}) - -export * as CloudBrowser from "./cloud-browser" diff --git a/packages/bcode-browser/src/index.ts b/packages/bcode-browser/src/index.ts index 4772ba071..578823db8 100644 --- a/packages/bcode-browser/src/index.ts +++ b/packages/bcode-browser/src/index.ts @@ -9,10 +9,15 @@ // Contents: // src/cdp/ — vendored CDP layer (session.ts, generated.ts, codegen) // src/browser-execute.ts — in-process JS-eval browser_execute body -// src/cloud-browser.ts — Browser Use cloud-browser provision + attach // src/session-store.ts — per-opencode-session CDP Session map // src/skills.ts — runtime resolver for embedded skills -// skills/ — BROWSER.md + interaction-skills/*.md (embedded into binary) +// skills/ — BROWSER.md + interaction-skills/*.md + cloud-browser.md (embedded into binary) +// +// Cloud browser provisioning is intentionally NOT a separate Level-1 +// surface. 
The agent reads `skills/cloud-browser.md` and writes the +// fetch+connect snippet itself, matching how local-browser connect works +// (snippet-side, not tool-side). Decisions trail in +// `memory/browsercode/decisions.md` §3.4. // // Planned (per ROADMAP phase): // src/fetch-use/ — FetchUse.Service implementation (ROADMAP B1) diff --git a/packages/bcode-browser/src/session-store.ts b/packages/bcode-browser/src/session-store.ts index 815d076c8..8be05462c 100644 --- a/packages/bcode-browser/src/session-store.ts +++ b/packages/bcode-browser/src/session-store.ts @@ -1,52 +1,37 @@ // Process-scope per-opencode-session CDP Session map. // -// Both `browser_execute` and `browser_open_cloud` look up the same `Session` -// by `sessionID` so a snippet that follows a `browser_open_cloud` call drives -// the cloud browser, not a freshly-auto-detected local one. +// `browser_execute` looks up a `Session` keyed by `sessionID` so that calls +// to `session.connect(...)` made inside one snippet persist across later +// snippets in the same opencode session — the agent connects once, drives +// many. The Session is a single CDP transport (one WebSocket); the agent +// is the source of truth for which browser is on the other end. // -// Lifetime: Sessions persist for the life of the opencode process (or until -// the underlying WebSocket closes). We don't have a clean session-end hook -// in opencode's tool layer; the WS closes naturally on browser exit and the -// agent can call `session.close()` from a snippet if needed. Sessions held in -// the map after their browser exits become unusable but are cheap (just an -// idle WS reference until the next snippet replaces them). +// Lifetime: Sessions live for the life of the opencode process. The +// underlying WebSocket closes naturally when the browser exits. The agent +// can also close explicitly from a snippet (`await session.close()`) — for +// instance, before reconnecting to a different browser. 
// -// Evicted via `evict(sessionID)` if a future hook is added. +// `evict(sessionID)` is exposed for tests to clean up between cases. It +// closes the Session and removes the entry. Production code does not need +// to call it; sessions are cheap and the process will exit eventually. import { Session } from "./cdp/session" -interface Entry { - readonly session: Session - // Cleanup callbacks registered alongside the Session — e.g. cloud-browser - // stop calls. Run sequentially on `evict`. Each is fire-and-forget and - // must not throw outwards (errors are logged, not propagated). - readonly cleanup: Array<() => Promise<void>> -} - -const sessions = new Map<string, Entry>() +const sessions = new Map<string, Session>() export const get = (sessionID: string): Session => { const existing = sessions.get(sessionID) - if (existing) return existing.session + if (existing) return existing const fresh = new Session() - sessions.set(sessionID, { session: fresh, cleanup: [] }) + sessions.set(sessionID, fresh) return fresh } -export const onEvict = (sessionID: string, fn: () => Promise<void>): void => { - const entry = sessions.get(sessionID) - if (!entry) throw new Error(`SessionStore.onEvict: no session ${sessionID}`) - entry.cleanup.push(fn) -} - export const evict = async (sessionID: string): Promise<void> => { const entry = sessions.get(sessionID) if (!entry) return sessions.delete(sessionID) - for (const fn of entry.cleanup) { - try { await fn() } catch (err) { console.error(`SessionStore evict cleanup failed for ${sessionID}:`, err) } - } - entry.session.close() + entry.close() } export * as SessionStore from "./session-store" diff --git a/packages/opencode/src/tool/browser-execute.txt b/packages/opencode/src/tool/browser-execute.txt index 6658e7a63..62a73ec53 100644 --- a/packages/opencode/src/tool/browser-execute.txt +++ b/packages/opencode/src/tool/browser-execute.txt @@ -1,14 +1,14 @@ Execute JavaScript against a connected browser via the BrowserCode CDP harness. 
-Use this tool whenever the task requires driving a real browser — automation, scraping, end-to-end testing, or interactive exploration. The snippet runs in-process with one persistent CDP `Session` object that survives across calls in the same opencode session. +Use this tool whenever the task requires driving a real browser — automation, scraping, end-to-end testing, or interactive exploration. The snippet runs in-process with one persistent CDP `Session` object that survives across calls in the same opencode session. You connect once and drive many. -Before the first `browser_execute` call of a session, you MUST read `{{SKILLS_DIR}}/BROWSER.md`. It defines the snippet model, the workspace pattern, the `session` API surface, and gotchas. +Before the first `browser_execute` call of a session, you MUST read `{{SKILLS_DIR}}/BROWSER.md`. It defines the snippet model, the three connection methods (local user Chrome, isolated debug-port Chrome, Browser Use cloud browser), the workspace pattern, the `session` API surface, and gotchas. For cloud-browser specifics, also read `{{SKILLS_DIR}}/cloud-browser.md`. Snippet scope: -- `session` — the live CDP `Session`. Domain methods follow `session.<Domain>.<method>(params)` and return Promises. -- standard JS globals (`console.log` etc. stream back to the user). +- `session` — the live CDP `Session`. You call `session.connect(...)` once at the start of your work; subsequent snippets reuse the same connection. Domain methods follow `session.<Domain>.<method>(params)` and return Promises. +- standard JS globals (`console.log` etc. stream back to the user; `process.env` is available for reading `BROWSER_USE_API_KEY` etc.). Top-level `import` is not allowed inside a snippet. To reuse code across calls, save it as a `.ts` file under `./.bcode/agent-workspace/` (per-project, tracked-by-default in git) and `await import("/abs/path?t=" + Date.now())` it from a later snippet. 
-For UI-mechanic recipes (dialogs, dropdowns, iframes, shadow DOM, uploads, scrolling, screenshots, …), `read` files under `{{SKILLS_DIR}}/interaction-skills/`. Skills are documentation; they document what to write, never what to import. +For UI-mechanic recipes, list `{{SKILLS_DIR}}/interaction-skills/` to see all available topics, then `read` the relevant ones. Skills are documentation; they document what to write, never what to import. diff --git a/packages/opencode/src/tool/browser-open-cloud.ts b/packages/opencode/src/tool/browser-open-cloud.ts deleted file mode 100644 index 1b565133d..000000000 --- a/packages/opencode/src/tool/browser-open-cloud.ts +++ /dev/null @@ -1,35 +0,0 @@ -// browser_open_cloud — Level-2 hook (decisions.md §1c, §3.3, §6). -// -// Provisions a Browser Use cloud browser and binds the per-opencode-session -// CDP Session to it. After this tool returns, `browser_execute` snippets -// drive the cloud browser instead of any local Chrome. - -import { Effect, Schema } from "effect" -import { CloudBrowser } from "@browser-use/bcode-browser/cloud-browser" -import * as Tool from "./tool" -import DESCRIPTION from "./browser-open-cloud.txt" - -export const BrowserOpenCloudTool = Tool.define( - "browser_open_cloud", - Effect.gen(function* () { - return { - description: DESCRIPTION, - parameters: CloudBrowser.provisionParameters, - execute: (args: Schema.Schema.Type<typeof CloudBrowser.provisionParameters>, ctx: Tool.Context) => - Effect.gen(function* () { - yield* ctx.ask({ - permission: "browser_open_cloud", - patterns: ["*"], - always: ["*"], - metadata: {}, - }) - const { id, liveUrl } = yield* CloudBrowser.open(ctx.sessionID, args) - return { - title: "browser_open_cloud", - output: `Cloud browser ready.\nbrowserId: ${id}\nliveUrl: ${liveUrl}`, - metadata: { browserId: id, liveUrl }, - } - }).pipe(Effect.orDie), - } - }), -) diff --git a/packages/opencode/src/tool/browser-open-cloud.txt b/packages/opencode/src/tool/browser-open-cloud.txt deleted file mode 100644 index 
e34365697..000000000 --- a/packages/opencode/src/tool/browser-open-cloud.txt +++ /dev/null @@ -1,12 +0,0 @@ -Provision a Browser Use cloud browser and bind it to this session. - -After this tool returns, every subsequent `browser_execute` snippet drives the cloud browser via the same `session` object — there is no per-snippet attach step. The cloud browser is automatically stopped when the bcode session ends. - -Use this when: -- You need a browser the user cannot see (autonomous, long-running, or background tasks). -- You need a clean profile, geo-located proxy, or cookie isolation. -- You need to scrape sites that fingerprint local Chrome installs. - -Requires `BROWSER_USE_API_KEY`. Without it the call fails with a one-line error pointing at https://browser-use.com. - -Output includes a `liveUrl` — a Browser Use URL that streams the live browser pixels to a viewer; share it with the user when they want to watch the agent work. diff --git a/packages/opencode/src/tool/registry.ts b/packages/opencode/src/tool/registry.ts index cda2b9302..eb1d0391e 100644 --- a/packages/opencode/src/tool/registry.ts +++ b/packages/opencode/src/tool/registry.ts @@ -11,7 +11,6 @@ import { TodoWriteTool } from "./todo" import { WebFetchTool } from "./webfetch" import { WriteTool } from "./write" import { BrowserExecuteTool } from "./browser-execute" -import { BrowserOpenCloudTool } from "./browser-open-cloud" import { InvalidTool } from "./invalid" import { SkillTool } from "./skill" import * as Tool from "./tool" @@ -116,7 +115,6 @@ export const layer: Layer.Layer< const patchtool = yield* ApplyPatchTool const skilltool = yield* SkillTool const browserExecute = yield* BrowserExecuteTool - const browserOpenCloud = yield* BrowserOpenCloudTool const agent = yield* Agent.Service const state = yield* InstanceState.make( @@ -211,7 +209,6 @@ export const layer: Layer.Layer< search: Tool.init(websearch), skill: Tool.init(skilltool), browserExecute: Tool.init(browserExecute), - 
browserOpenCloud: Tool.init(browserOpenCloud), patch: Tool.init(patchtool), question: Tool.init(question), lsp: Tool.init(lsptool), @@ -235,7 +232,6 @@ export const layer: Layer.Layer< tool.search, tool.skill, tool.browserExecute, - tool.browserOpenCloud, tool.patch, ...(Flag.OPENCODE_EXPERIMENTAL_LSP_TOOL ? [tool.lsp] : []), ...(Flag.OPENCODE_EXPERIMENTAL_PLAN_MODE && Flag.OPENCODE_CLIENT === "cli" ? [tool.plan] : []),