From 75036875a1d4227d8f65c4651562211573d752da Mon Sep 17 00:00:00 2001 From: yawbtng <154343001+yawbtng@users.noreply.github.com> Date: Thu, 2 Jul 2026 11:41:12 -0700 Subject: [PATCH 1/5] fix(core): declare CUA screenshot media type at capture boundary (#2300) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## why Closes #2046. This is the reshaped version of #2159, following the approach @seanmcguire12 outlined when closing that PR. The callback passed to `setScreenshotProvider` returned a bare base64 string, so every CUA client had to independently infer or hardcode the media type — all four assumed `image/png`. A non-PNG screenshot (e.g. a JPEG from a custom provider) was then mislabeled as PNG in the provider function-response payload, which is the root of #2046. Clients also stripped a hardcoded `data:image/png;base64,` prefix by regex, so any other prefix silently broke. ## what changed Move the media-type declaration to the capture boundary. The provider callback passed to `setScreenshotProvider` now returns an explicit payload: ```ts export interface ScreenshotProviderResult { base64: string; mediaType: "image/png" | "image/jpeg"; } ``` - **Default handler** (`v3CuaAgentHandler`) captures PNG explicitly (`type: "png"`) and returns `{ base64, mediaType: "image/png" }`, so the default is unchanged. - **Anthropic**: `media_type: screenshot.mediaType`, `data: screenshot.base64` (drops the `.replace(/^data:image\/png;base64,/, "")`). - **Google**: `mimeType: screenshot.mediaType` (drops the PNG-only prefix strip). - **OpenAI / Microsoft**: build `data:${screenshot.mediaType};base64,${screenshot.base64}`. - `options.base64Image` (caller-supplied) still defaults to `image/png`, preserving existing behavior. `ScreenshotProviderResult` is exported from the public entrypoint. 
## testing - New `cua-screenshot-mediatype.test.ts`: asserts a non-PNG (`image/jpeg`) media type is honored by all four clients' `captureScreenshot()`, and that the `options.base64Image` path still defaults to `image/png`. - Updated the public API type test for `setScreenshotProvider(...)` and the Anthropic/Microsoft CUA client tests to the new provider shape. - `pnpm --filter @browserbasehq/stagehand run typecheck` passes; the CUA + public-API unit suites are green (55 tests). --- ## Summary by cubic Declare the screenshot media type at the capture boundary and pass it through all CUA clients. Fixes non‑PNG screenshots being mislabeled as PNG and removes PNG-only prefix stripping. - **Bug Fixes** - The callback passed to `setScreenshotProvider` now returns `{ base64, mediaType }` (`ScreenshotProviderResult`) instead of a bare base64 string. - Default handler explicitly captures PNG and returns `image/png`. - Clients: Anthropic/Google pass `mediaType` through; OpenAI/Microsoft build `data:${mediaType};base64,${base64}`; removed PNG-only prefix regex. - `options.base64Image` still defaults to `image/png`. - Added tests validating JPEG flows through all clients; updated public API type tests. - **Migration** - If you pass a custom provider to `setScreenshotProvider`, it must return `{ base64, mediaType: "image/png" | "image/jpeg" }` instead of a base64 string. - No changes needed if you use the built-in handler. Written for commit affd2ad7b58d71294886b12765661472139c7089. Summary will update on new commits. 
Review in cubic --- .../fix-screenshot-provider-mediatype.md | 5 ++ packages/core/lib/v3/agent/AgentClient.ts | 5 +- .../core/lib/v3/agent/AnthropicCUAClient.ts | 24 +++--- packages/core/lib/v3/agent/GoogleCUAClient.ts | 21 +++-- .../core/lib/v3/agent/MicrosoftCUAClient.ts | 11 ++- packages/core/lib/v3/agent/OpenAICUAClient.ts | 29 ++++--- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 10 ++- packages/core/lib/v3/types/public/agent.ts | 10 +++ .../unit/anthropic-cua-triple-click.test.ts | 5 +- .../unit/cua-screenshot-mediatype.test.ts | 78 +++++++++++++++++++ .../tests/unit/microsoft-cua-client.test.ts | 5 +- .../unit/public-api/llm-and-agents.test.ts | 4 +- 12 files changed, 165 insertions(+), 42 deletions(-) create mode 100644 .changeset/fix-screenshot-provider-mediatype.md create mode 100644 packages/core/tests/unit/cua-screenshot-mediatype.test.ts diff --git a/.changeset/fix-screenshot-provider-mediatype.md b/.changeset/fix-screenshot-provider-mediatype.md new file mode 100644 index 0000000000..4636b71ca3 --- /dev/null +++ b/.changeset/fix-screenshot-provider-mediatype.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Declare the CUA screenshot media type at the capture boundary instead of hardcoding `image/png` in each computer-use client. `setScreenshotProvider` now returns `{ base64, mediaType }` (`ScreenshotProviderResult`) rather than a bare base64 string, and the Anthropic, Google, OpenAI, and Microsoft clients pass the media type through to their function-response payloads. This fixes non-PNG screenshots being mislabeled as PNG (closes #2046) and removes the per-client PNG-only data-URL parsing. 
diff --git a/packages/core/lib/v3/agent/AgentClient.ts b/packages/core/lib/v3/agent/AgentClient.ts index 294afe0230..fed1a02788 100644 --- a/packages/core/lib/v3/agent/AgentClient.ts +++ b/packages/core/lib/v3/agent/AgentClient.ts @@ -3,6 +3,7 @@ import { AgentResult, AgentType, AgentExecutionOptions, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; @@ -37,7 +38,9 @@ export abstract class AgentClient { abstract setCurrentUrl(url: string): void; - abstract setScreenshotProvider(provider: () => Promise): void; + abstract setScreenshotProvider( + provider: () => Promise, + ): void; abstract setActionHandler( handler: (action: AgentAction) => Promise, diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 9590daccd4..a7d5b2d8a9 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -8,6 +8,7 @@ import { AnthropicToolResult, AgentExecutionOptions, ToolUseItem, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; import { ClientOptions, ThinkingEffort } from "../types/public/model.js"; @@ -46,7 +47,7 @@ export class AnthropicCUAClient extends AgentClient { public lastMessageId?: string; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private thinkingBudget: number | null = null; private thinkingEffort: ThinkingEffort | null = null; @@ -106,7 +107,9 @@ export class AnthropicCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -664,7 +667,7 @@ export class 
AnthropicCUAClient extends AgentClient { const screenshot = await this.captureScreenshot(); logger({ category: "agent", - message: `Screenshot captured, length: ${screenshot.length}`, + message: `Screenshot captured, length: ${screenshot.base64.length}`, level: 2, }); @@ -674,8 +677,8 @@ export class AnthropicCUAClient extends AgentClient { type: "image", source: { type: "base64", - media_type: "image/png", - data: screenshot.replace(/^data:image\/png;base64,/, ""), + media_type: screenshot.mediaType, + data: screenshot.base64, }, }, ]; @@ -785,8 +788,8 @@ export class AnthropicCUAClient extends AgentClient { type: "image", source: { type: "base64", - media_type: "image/png", - data: screenshot.replace(/^data:image\/png;base64,/, ""), + media_type: screenshot.mediaType, + data: screenshot.base64, }, }, { @@ -1039,17 +1042,16 @@ export class AnthropicCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; currentUrl?: string; - }): Promise { + }): Promise { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { base64: options.base64Image, mediaType: "image/png" }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index b49503f799..f26aab2e30 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -16,6 +16,7 @@ import { AgentExecutionOptions, SafetyCheck, SafetyConfirmationHandler, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from 
"./AgentClient.js"; @@ -49,7 +50,7 @@ export class GoogleCUAClient extends AgentClient { private client: GoogleGenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private history: Content[] = []; private environment: "ENVIRONMENT_BROWSER" | "ENVIRONMENT_DESKTOP" = @@ -129,7 +130,9 @@ export class GoogleCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -599,10 +602,7 @@ export class GoogleCUAClient extends AgentClient { }); const screenshot = await this.captureScreenshot(); - const base64Data = screenshot.replace( - /^data:image\/png;base64,/, - "", - ); + const base64Data = screenshot.base64; // Create one function response for each computer use function call // Following Python SDK pattern: FunctionResponse with parts containing inline_data @@ -629,7 +629,7 @@ export class GoogleCUAClient extends AgentClient { parts: [ { inlineData: { - mimeType: "image/png", + mimeType: screenshot.mediaType, data: base64Data, }, }, @@ -1161,7 +1161,7 @@ export class GoogleCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; currentUrl?: string; - }): Promise { + }): Promise { // Update current URL if provided if (options?.currentUrl) { this.currentUrl = options.currentUrl; @@ -1169,14 +1169,13 @@ export class GoogleCUAClient extends AgentClient { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { base64: options.base64Image, mediaType: "image/png" }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return 
`data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; diff --git a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts index cb4c212389..3d9f42660c 100644 --- a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts +++ b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts @@ -5,6 +5,7 @@ import { AgentResult, AgentType, AgentExecutionOptions, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; @@ -56,7 +57,7 @@ export class MicrosoftCUAClient extends AgentClient { private client: OpenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; // Dual history system @@ -138,7 +139,9 @@ export class MicrosoftCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -532,8 +535,8 @@ For each function call, return a json object with function name and arguments wi throw new AgentScreenshotProviderError("Screenshot provider not set"); } - const base64Screenshot = await this.screenshotProvider(); - return `data:image/png;base64,${base64Screenshot}`; + const screenshot = await this.screenshotProvider(); + return `data:${screenshot.mediaType};base64,${screenshot.base64}`; } /** diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index 1843463810..6e05e903ed 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -16,6 +16,7 @@ import { FunctionCallItem, SafetyCheck, 
SafetyConfirmationHandler, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; @@ -49,7 +50,7 @@ export class OpenAICUAClient extends AgentClient { public lastResponseId?: string; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private reasoningItems: Map = new Map(); private environment: string = "browser"; // "browser", "mac", "windows", or "ubuntu" @@ -107,7 +108,9 @@ export class OpenAICUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -467,7 +470,7 @@ export class OpenAICUAClient extends AgentClient { if (initialScreenshot) { const screenshotInput: ResponseInputImage = { type: "input_image", - image_url: initialScreenshot, + image_url: this.toDataUrl(initialScreenshot), detail: "high", }; userContent.push(screenshotInput); @@ -635,7 +638,7 @@ export class OpenAICUAClient extends AgentClient { call_id: item.call_id, output: { type: outputType, - image_url: screenshot, + image_url: this.toDataUrl(screenshot), ...(this.usesNewComputerTool ? { detail: "original" as const } : {}), @@ -713,7 +716,7 @@ export class OpenAICUAClient extends AgentClient { call_id: item.call_id, output: { type: outputType, - image_url: screenshot, + image_url: this.toDataUrl(screenshot), error: errorMessage, ...(this.usesNewComputerTool ? 
{ detail: "original" as const } @@ -899,7 +902,9 @@ export class OpenAICUAClient extends AgentClient { return notes; } - private async captureInitialScreenshot(): Promise { + private async captureInitialScreenshot(): Promise< + ScreenshotProviderResult | undefined + > { if (!this.screenshotProvider) { return undefined; } @@ -942,17 +947,16 @@ export class OpenAICUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; currentUrl?: string; - }): Promise { + }): Promise { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { base64: options.base64Image, mediaType: "image/png" }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; @@ -964,4 +968,9 @@ export class OpenAICUAClient extends AgentClient { "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", ); } + + /** Build the `data:` URL the OpenAI image payload expects. 
*/ + private toDataUrl(screenshot: ScreenshotProviderResult): string { + return `data:${screenshot.mediaType};base64,${screenshot.base64}`; + } } diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 9dc3901bf1..c4fee1f16b 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -87,11 +87,17 @@ export class V3CuaAgentHandler { this.agentClient.setScreenshotProvider(async () => { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); - const screenshotBuffer = await page.screenshot({ fullPage: false }); + const screenshotBuffer = await page.screenshot({ + fullPage: false, + type: "png", + }); await this.emitCuaScreenshot(screenshotBuffer, page.url()); - return screenshotBuffer.toString("base64"); // base64 png + return { + base64: screenshotBuffer.toString("base64"), + mediaType: "image/png", + }; }); // Provide action executor diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts index 90deb3be8e..391154dc1d 100644 --- a/packages/core/lib/v3/types/public/agent.ts +++ b/packages/core/lib/v3/types/public/agent.ts @@ -27,6 +27,16 @@ import { Page as PuppeteerPage } from "puppeteer-core"; import { Page as PatchrightPage } from "patchright-core"; import { Page } from "../../understudy/page.js"; +/** + * Result of a screenshot provider: the base64-encoded image bytes plus an + * explicit media type. Declaring the media type at the capture boundary lets + * every CUA client pass it through instead of hardcoding or inferring it. 
+ */ +export interface ScreenshotProviderResult { + base64: string; + mediaType: "image/png" | "image/jpeg"; +} + // ============================================================================= // Variable Types // ============================================================================= diff --git a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts index fe07561da6..7620f9a7f8 100644 --- a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts +++ b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts @@ -35,7 +35,10 @@ describe("AnthropicCUAClient triple_click handling", () => { }, ); client.setViewport(1280, 720); - client.setScreenshotProvider(async () => "fake-base64-screenshot"); + client.setScreenshotProvider(async () => ({ + base64: "fake-base64-screenshot", + mediaType: "image/png", + })); executedActions = []; client.setActionHandler(async (action) => { diff --git a/packages/core/tests/unit/cua-screenshot-mediatype.test.ts b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts new file mode 100644 index 0000000000..5507712c15 --- /dev/null +++ b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts @@ -0,0 +1,78 @@ +import { describe, it, expect, beforeEach } from "vitest"; +import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js"; +import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js"; +import { OpenAICUAClient } from "../../lib/v3/agent/OpenAICUAClient.js"; +import { MicrosoftCUAClient } from "../../lib/v3/agent/MicrosoftCUAClient.js"; +import type { ScreenshotProviderResult } from "../../lib/v3/types/public/agent.js"; + +/** + * Regression coverage for #2159 / #2046. + * + * The screenshot provider now declares the media type at the capture boundary + * (`{ base64, mediaType }`) instead of each CUA client hardcoding or inferring + * "image/png". 
These tests assert every client's `captureScreenshot()` honors a + * non-PNG media type rather than silently mislabeling it — the failure mode that + * broke non-PNG function responses. + */ +describe("CUA clients thread screenshot mediaType through captureScreenshot", () => { + const jpeg: ScreenshotProviderResult = { + base64: "jpeg-bytes", + mediaType: "image/jpeg", + }; + + let google: GoogleCUAClient; + let anthropic: AnthropicCUAClient; + let openai: OpenAICUAClient; + let microsoft: MicrosoftCUAClient; + + beforeEach(() => { + google = new GoogleCUAClient( + "google", + "gemini-2.5-computer-use-preview-10-2025", + undefined, + { + apiKey: "test", + }, + ); + anthropic = new AnthropicCUAClient( + "anthropic", + "claude-sonnet-4-5-20250929", + undefined, + { + apiKey: "test", + }, + ); + openai = new OpenAICUAClient("openai", "computer-use-preview", undefined, { + apiKey: "test", + }); + microsoft = new MicrosoftCUAClient("microsoft", "fara-7b", undefined, { + apiKey: "test", + baseURL: "https://example.com", + }); + }); + + it("Anthropic/Google return the provider's mediaType verbatim", async () => { + google.setScreenshotProvider(async () => jpeg); + anthropic.setScreenshotProvider(async () => jpeg); + + expect(await google.captureScreenshot()).toEqual(jpeg); + expect(await anthropic.captureScreenshot()).toEqual(jpeg); + }); + + it("OpenAI/Microsoft build the data URL with the provider's mediaType", async () => { + openai.setScreenshotProvider(async () => jpeg); + microsoft.setScreenshotProvider(async () => jpeg); + + // OpenAI returns the structured result; Microsoft returns a data URL string. 
+ expect(await openai.captureScreenshot()).toEqual(jpeg); + expect(await microsoft.captureScreenshot()).toBe( + "data:image/jpeg;base64,jpeg-bytes", + ); + }); + + it("options.base64Image still defaults to image/png", async () => { + google.setScreenshotProvider(async () => jpeg); + const result = await google.captureScreenshot({ base64Image: "png-bytes" }); + expect(result).toEqual({ base64: "png-bytes", mediaType: "image/png" }); + }); +}); diff --git a/packages/core/tests/unit/microsoft-cua-client.test.ts b/packages/core/tests/unit/microsoft-cua-client.test.ts index 83496deb65..61e9c8e45b 100644 --- a/packages/core/tests/unit/microsoft-cua-client.test.ts +++ b/packages/core/tests/unit/microsoft-cua-client.test.ts @@ -7,7 +7,10 @@ function createClient() { apiKey: "test-key", baseURL: "https://example.com", }); - client.setScreenshotProvider(async () => "mock-base64-screenshot"); + client.setScreenshotProvider(async () => ({ + base64: "mock-base64-screenshot", + mediaType: "image/png", + })); return client; } diff --git a/packages/core/tests/unit/public-api/llm-and-agents.test.ts b/packages/core/tests/unit/public-api/llm-and-agents.test.ts index 277858f244..fa6f5c49be 100644 --- a/packages/core/tests/unit/public-api/llm-and-agents.test.ts +++ b/packages/core/tests/unit/public-api/llm-and-agents.test.ts @@ -171,7 +171,9 @@ describe("LLM and Agents public API types", () => { ) => Promise; setViewport: (width: number, height: number) => void; setCurrentUrl: (url: string) => void; - setScreenshotProvider: (provider: () => Promise) => void; + setScreenshotProvider: ( + provider: () => Promise, + ) => void; setActionHandler: ( handler: (action: Stagehand.AgentAction) => Promise, ) => void; From 9921041e10bf0ca8ef7f1e69566609612bddd38c Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 2 Jul 2026 12:42:47 -0700 Subject: [PATCH 2/5] update test with new type --- .../tests/unit/agent-captcha-hooks.test.ts | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 
deletions(-) diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index 62b8d38246..7d70c48cec 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -60,7 +60,10 @@ class FakeCuaClient { public contextNotes: string[] = []; public preStepHook?: () => Promise; public actionHandler?: (action: Record) => Promise; - public screenshotProvider?: () => Promise; + public screenshotProvider?: () => Promise<{ + base64: string; + mediaType: "image/png" | "image/jpeg"; + }>; public executeImpl = vi.fn(async (options: unknown) => { void options; return { @@ -73,9 +76,16 @@ class FakeCuaClient { public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); - public setScreenshotProvider = vi.fn((provider: () => Promise) => { - this.screenshotProvider = provider; - }); + public setScreenshotProvider = vi.fn( + ( + provider: () => Promise<{ + base64: string; + mediaType: "image/png" | "image/jpeg"; + }>, + ) => { + this.screenshotProvider = provider; + }, + ); public setSafetyConfirmationHandler = vi.fn(); setActionHandler( @@ -513,9 +523,10 @@ describe("v3 cua handler screenshot behavior", () => { }); fakeCuaClient.executeImpl = vi.fn(async () => { - await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe( - screenshotBase64, - ); + await expect(fakeCuaClient.screenshotProvider?.()).resolves.toEqual({ + base64: screenshotBase64, + mediaType: "image/png", + }); return { success: true, message: "ok", From 54c59c081d198cbf77da9512b2ae69bb92895d37 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 2 Jul 2026 12:46:11 -0700 Subject: [PATCH 3/5] specify png in cua clients --- packages/core/lib/v3/agent/AnthropicCUAClient.ts | 6 +++++- packages/core/lib/v3/agent/GoogleCUAClient.ts | 6 +++++- packages/core/lib/v3/agent/OpenAICUAClient.ts | 6 +++++- 
packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 6 +++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index a7d5b2d8a9..6067b81b38 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -1041,11 +1041,15 @@ export class AnthropicCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; }): Promise { // Use provided options if available if (options?.base64Image) { - return { base64: options.base64Image, mediaType: "image/png" }; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? "image/png", + }; } // Use the screenshot provider if available diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index f26aab2e30..a1cb8323ba 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -1160,6 +1160,7 @@ export class GoogleCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; }): Promise { // Update current URL if provided @@ -1169,7 +1170,10 @@ export class GoogleCUAClient extends AgentClient { // Use provided options if available if (options?.base64Image) { - return { base64: options.base64Image, mediaType: "image/png" }; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? 
"image/png", + }; } // Use the screenshot provider if available diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index 6e05e903ed..ea43abe2a1 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -946,11 +946,15 @@ export class OpenAICUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; }): Promise { // Use provided options if available if (options?.base64Image) { - return { base64: options.base64Image, mediaType: "image/png" }; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? "image/png", + }; } // Use the screenshot provider if available diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index c4fee1f16b..1367775fb5 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -738,7 +738,10 @@ export class V3CuaAgentHandler { }); try { const page = await this.v3.context.awaitActivePage(); - const screenshotBuffer = await page.screenshot({ fullPage: false }); + const screenshotBuffer = await page.screenshot({ + fullPage: false, + type: "png", + }); const currentUrl = page.url(); @@ -747,6 +750,7 @@ export class V3CuaAgentHandler { return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), + mediaType: "image/png", currentUrl, }); } catch (e) { From b74e39e6ae53db0dd49f36fa81659aa3f79877f1 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 2 Jul 2026 12:47:12 -0700 Subject: [PATCH 4/5] update error message --- packages/core/lib/v3/agent/AnthropicCUAClient.ts | 2 +- packages/core/lib/v3/agent/GoogleCUAClient.ts | 2 +- packages/core/lib/v3/agent/OpenAICUAClient.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 6067b81b38..b1e08b3aaa 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -1064,7 +1064,7 @@ export class AnthropicCUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } } diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index a1cb8323ba..d9b0a38346 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -1188,7 +1188,7 @@ export class GoogleCUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } } diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index ea43abe2a1..6ff1a47141 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -969,7 +969,7 @@ export class OpenAICUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. 
" + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } From 42fb0853c468d906d312085360d3359738ba13e6 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 2 Jul 2026 13:00:36 -0700 Subject: [PATCH 5/5] update changeset --- .changeset/fix-screenshot-provider-mediatype.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/fix-screenshot-provider-mediatype.md b/.changeset/fix-screenshot-provider-mediatype.md index 4636b71ca3..4e77988f2a 100644 --- a/.changeset/fix-screenshot-provider-mediatype.md +++ b/.changeset/fix-screenshot-provider-mediatype.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Declare the CUA screenshot media type at the capture boundary instead of hardcoding `image/png` in each computer-use client. `setScreenshotProvider` now returns `{ base64, mediaType }` (`ScreenshotProviderResult`) rather than a bare base64 string, and the Anthropic, Google, OpenAI, and Microsoft clients pass the media type through to their function-response payloads. This fixes non-PNG screenshots being mislabeled as PNG (closes #2046) and removes the per-client PNG-only data-URL parsing. +Use the screenshot provider's declared media type when sending CUA image payloads. The `setScreenshotProvider` callback now returns `ScreenshotProviderResult` (`{ base64, mediaType }`) instead of a bare base64 string.