diff --git a/.changeset/fix-screenshot-provider-mediatype.md b/.changeset/fix-screenshot-provider-mediatype.md new file mode 100644 index 000000000..4e77988f2 --- /dev/null +++ b/.changeset/fix-screenshot-provider-mediatype.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Use the screenshot provider's declared media type when sending CUA image payloads. The `setScreenshotProvider` callback now returns `ScreenshotProviderResult` (`{ base64, mediaType }`) instead of a bare base64 string. diff --git a/packages/core/lib/v3/agent/AgentClient.ts b/packages/core/lib/v3/agent/AgentClient.ts index 294afe023..fed1a0278 100644 --- a/packages/core/lib/v3/agent/AgentClient.ts +++ b/packages/core/lib/v3/agent/AgentClient.ts @@ -3,6 +3,7 @@ import { AgentResult, AgentType, AgentExecutionOptions, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; @@ -37,7 +38,9 @@ export abstract class AgentClient { abstract setCurrentUrl(url: string): void; - abstract setScreenshotProvider(provider: () => Promise): void; + abstract setScreenshotProvider( + provider: () => Promise, + ): void; abstract setActionHandler( handler: (action: AgentAction) => Promise, diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 9590daccd..b1e08b3aa 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -8,6 +8,7 @@ import { AnthropicToolResult, AgentExecutionOptions, ToolUseItem, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; import { ClientOptions, ThinkingEffort } from "../types/public/model.js"; @@ -46,7 +47,7 @@ export class AnthropicCUAClient extends AgentClient { public lastMessageId?: string; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private thinkingBudget: number | null = null; private thinkingEffort: ThinkingEffort | null = null; @@ -106,7 +107,9 @@ export class AnthropicCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -664,7 +667,7 @@ export class AnthropicCUAClient extends AgentClient { const screenshot = await this.captureScreenshot(); logger({ category: "agent", - message: `Screenshot captured, length: ${screenshot.length}`, + message: `Screenshot captured, length: ${screenshot.base64.length}`, level: 2, }); @@ -674,8 +677,8 @@ export class AnthropicCUAClient extends AgentClient { type: "image", source: { type: "base64", - media_type: "image/png", - data: screenshot.replace(/^data:image\/png;base64,/, ""), + media_type: screenshot.mediaType, + data: screenshot.base64, }, }, ]; @@ -785,8 +788,8 @@ export class AnthropicCUAClient extends AgentClient { type: "image", source: { type: "base64", - media_type: "image/png", - data: screenshot.replace(/^data:image\/png;base64,/, ""), + media_type: screenshot.mediaType, + data: screenshot.base64, }, }, { @@ -1038,18 +1041,21 @@ export class AnthropicCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; - }): Promise { + }): Promise { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? "image/png", + }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; @@ -1058,7 +1064,7 @@ export class AnthropicCUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } } diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index b49503f79..d9b0a3834 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -16,6 +16,7 @@ import { AgentExecutionOptions, SafetyCheck, SafetyConfirmationHandler, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; @@ -49,7 +50,7 @@ export class GoogleCUAClient extends AgentClient { private client: GoogleGenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private history: Content[] = []; private environment: "ENVIRONMENT_BROWSER" | "ENVIRONMENT_DESKTOP" = @@ -129,7 +130,9 @@ export class GoogleCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -599,10 +602,7 @@ export class GoogleCUAClient extends AgentClient { }); const screenshot = await this.captureScreenshot(); - const base64Data = screenshot.replace( - /^data:image\/png;base64,/, - "", - ); + const base64Data = screenshot.base64; // Create one function response for each computer use function call // Following Python SDK pattern: FunctionResponse with parts containing inline_data @@ -629,7 +629,7 @@ export class GoogleCUAClient extends AgentClient { parts: [ { inlineData: { - mimeType: "image/png", + mimeType: screenshot.mediaType, data: base64Data, }, }, @@ -1160,8 +1160,9 @@ export class GoogleCUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; - }): Promise { + }): Promise { // Update current URL if provided if (options?.currentUrl) { this.currentUrl = options.currentUrl; @@ -1169,14 +1170,16 @@ export class GoogleCUAClient extends AgentClient { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? "image/png", + }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; @@ -1185,7 +1188,7 @@ export class GoogleCUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } } diff --git a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts index cb4c21238..3d9f42660 100644 --- a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts +++ b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts @@ -5,6 +5,7 @@ import { AgentResult, AgentType, AgentExecutionOptions, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; @@ -56,7 +57,7 @@ export class MicrosoftCUAClient extends AgentClient { private client: OpenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; // Dual history system @@ -138,7 +139,9 @@ export class MicrosoftCUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -532,8 +535,8 @@ For each function call, return a json object with function name and arguments wi throw new AgentScreenshotProviderError("Screenshot provider not set"); } - const base64Screenshot = await this.screenshotProvider(); - return `data:image/png;base64,${base64Screenshot}`; + const screenshot = await this.screenshotProvider(); + return `data:${screenshot.mediaType};base64,${screenshot.base64}`; } /** diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index 184346381..6ff1a4714 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -16,6 +16,7 @@ import { FunctionCallItem, SafetyCheck, SafetyConfirmationHandler, + ScreenshotProviderResult, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; @@ -49,7 +50,7 @@ export class OpenAICUAClient extends AgentClient { public lastResponseId?: string; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; - private screenshotProvider?: () => Promise; + private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private reasoningItems: Map = new Map(); private environment: string = "browser"; // "browser", "mac", "windows", or "ubuntu" @@ -107,7 +108,9 @@ export class OpenAICUAClient extends AgentClient { this.currentUrl = url; } - setScreenshotProvider(provider: () => Promise): void { + setScreenshotProvider( + provider: () => Promise, + ): void { this.screenshotProvider = provider; } @@ -467,7 +470,7 @@ export class OpenAICUAClient extends AgentClient { if (initialScreenshot) { const screenshotInput: ResponseInputImage = { type: "input_image", - image_url: initialScreenshot, + image_url: this.toDataUrl(initialScreenshot), detail: "high", }; userContent.push(screenshotInput); @@ -635,7 +638,7 @@ export class OpenAICUAClient extends AgentClient { call_id: item.call_id, output: { type: outputType, - image_url: screenshot, + image_url: this.toDataUrl(screenshot), ...(this.usesNewComputerTool ? { detail: "original" as const } : {}), @@ -713,7 +716,7 @@ export class OpenAICUAClient extends AgentClient { call_id: item.call_id, output: { type: outputType, - image_url: screenshot, + image_url: this.toDataUrl(screenshot), error: errorMessage, ...(this.usesNewComputerTool ? { detail: "original" as const } @@ -899,7 +902,9 @@ export class OpenAICUAClient extends AgentClient { return notes; } - private async captureInitialScreenshot(): Promise { + private async captureInitialScreenshot(): Promise< + ScreenshotProviderResult | undefined + > { if (!this.screenshotProvider) { return undefined; } @@ -941,18 +946,21 @@ export class OpenAICUAClient extends AgentClient { async captureScreenshot(options?: { base64Image?: string; + mediaType?: "image/png" | "image/jpeg"; currentUrl?: string; - }): Promise { + }): Promise { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return { + base64: options.base64Image, + mediaType: options.mediaType ?? "image/png", + }; } // Use the screenshot provider if available if (this.screenshotProvider) { try { - const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return await this.screenshotProvider(); } catch (error) { console.error("Error capturing screenshot:", error); throw error; @@ -961,7 +969,12 @@ export class OpenAICUAClient extends AgentClient { throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + - "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type", ); } + + /** Build the `data:` URL the OpenAI image payload expects. */ + private toDataUrl(screenshot: ScreenshotProviderResult): string { + return `data:${screenshot.mediaType};base64,${screenshot.base64}`; + } } diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 9dc3901bf..1367775fb 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -87,11 +87,17 @@ export class V3CuaAgentHandler { this.agentClient.setScreenshotProvider(async () => { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); - const screenshotBuffer = await page.screenshot({ fullPage: false }); + const screenshotBuffer = await page.screenshot({ + fullPage: false, + type: "png", + }); await this.emitCuaScreenshot(screenshotBuffer, page.url()); - return screenshotBuffer.toString("base64"); // base64 png + return { + base64: screenshotBuffer.toString("base64"), + mediaType: "image/png", + }; }); // Provide action executor @@ -732,7 +738,10 @@ export class V3CuaAgentHandler { }); try { const page = await this.v3.context.awaitActivePage(); - const screenshotBuffer = await page.screenshot({ fullPage: false }); + const screenshotBuffer = await page.screenshot({ + fullPage: false, + type: "png", + }); const currentUrl = page.url(); @@ -741,6 +750,7 @@ export class V3CuaAgentHandler { return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), + mediaType: "image/png", currentUrl, }); } catch (e) { diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts index 90deb3be8..391154dc1 100644 --- a/packages/core/lib/v3/types/public/agent.ts +++ b/packages/core/lib/v3/types/public/agent.ts @@ -27,6 +27,16 @@ import { Page as PuppeteerPage } from "puppeteer-core"; import { Page as PatchrightPage } from "patchright-core"; import { Page } from "../../understudy/page.js"; +/** + * Result of a screenshot provider: the base64-encoded image bytes plus an + * explicit media type. Declaring the media type at the capture boundary lets + * every CUA client pass it through instead of hardcoding or inferring it. + */ +export interface ScreenshotProviderResult { + base64: string; + mediaType: "image/png" | "image/jpeg"; +} + // ============================================================================= // Variable Types // ============================================================================= diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index 62b8d3824..7d70c48ce 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -60,7 +60,10 @@ class FakeCuaClient { public contextNotes: string[] = []; public preStepHook?: () => Promise; public actionHandler?: (action: Record) => Promise; - public screenshotProvider?: () => Promise; + public screenshotProvider?: () => Promise<{ + base64: string; + mediaType: "image/png" | "image/jpeg"; + }>; public executeImpl = vi.fn(async (options: unknown) => { void options; return { @@ -73,9 +76,16 @@ class FakeCuaClient { public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); - public setScreenshotProvider = vi.fn((provider: () => Promise) => { - this.screenshotProvider = provider; - }); + public setScreenshotProvider = vi.fn( + ( + provider: () => Promise<{ + base64: string; + mediaType: "image/png" | "image/jpeg"; + }>, + ) => { + this.screenshotProvider = provider; + }, + ); public setSafetyConfirmationHandler = vi.fn(); setActionHandler( @@ -513,9 +523,10 @@ describe("v3 cua handler screenshot behavior", () => { }); fakeCuaClient.executeImpl = vi.fn(async () => { - await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe( - screenshotBase64, - ); + await expect(fakeCuaClient.screenshotProvider?.()).resolves.toEqual({ + base64: screenshotBase64, + mediaType: "image/png", + }); return { success: true, message: "ok", diff --git a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts index fe07561da..7620f9a7f 100644 --- a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts +++ b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts @@ -35,7 +35,10 @@ describe("AnthropicCUAClient triple_click handling", () => { }, ); client.setViewport(1280, 720); - client.setScreenshotProvider(async () => "fake-base64-screenshot"); + client.setScreenshotProvider(async () => ({ + base64: "fake-base64-screenshot", + mediaType: "image/png", + })); executedActions = []; client.setActionHandler(async (action) => { diff --git a/packages/core/tests/unit/cua-screenshot-mediatype.test.ts b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts new file mode 100644 index 000000000..5507712c1 --- /dev/null +++ b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts @@ -0,0 +1,78 @@ +import { describe, it, expect, beforeEach } from "vitest"; +import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js"; +import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js"; +import { OpenAICUAClient } from "../../lib/v3/agent/OpenAICUAClient.js"; +import { MicrosoftCUAClient } from "../../lib/v3/agent/MicrosoftCUAClient.js"; +import type { ScreenshotProviderResult } from "../../lib/v3/types/public/agent.js"; + +/** + * Regression coverage for #2159 / #2046. + * + * The screenshot provider now declares the media type at the capture boundary + * (`{ base64, mediaType }`) instead of each CUA client hardcoding or inferring + * "image/png". These tests assert every client's `captureScreenshot()` honors a + * non-PNG media type rather than silently mislabeling it — the failure mode that + * broke non-PNG function responses. + */ +describe("CUA clients thread screenshot mediaType through captureScreenshot", () => { + const jpeg: ScreenshotProviderResult = { + base64: "jpeg-bytes", + mediaType: "image/jpeg", + }; + + let google: GoogleCUAClient; + let anthropic: AnthropicCUAClient; + let openai: OpenAICUAClient; + let microsoft: MicrosoftCUAClient; + + beforeEach(() => { + google = new GoogleCUAClient( + "google", + "gemini-2.5-computer-use-preview-10-2025", + undefined, + { + apiKey: "test", + }, + ); + anthropic = new AnthropicCUAClient( + "anthropic", + "claude-sonnet-4-5-20250929", + undefined, + { + apiKey: "test", + }, + ); + openai = new OpenAICUAClient("openai", "computer-use-preview", undefined, { + apiKey: "test", + }); + microsoft = new MicrosoftCUAClient("microsoft", "fara-7b", undefined, { + apiKey: "test", + baseURL: "https://example.com", + }); + }); + + it("Anthropic/Google return the provider's mediaType verbatim", async () => { + google.setScreenshotProvider(async () => jpeg); + anthropic.setScreenshotProvider(async () => jpeg); + + expect(await google.captureScreenshot()).toEqual(jpeg); + expect(await anthropic.captureScreenshot()).toEqual(jpeg); + }); + + it("OpenAI/Microsoft build the data URL with the provider's mediaType", async () => { + openai.setScreenshotProvider(async () => jpeg); + microsoft.setScreenshotProvider(async () => jpeg); + + // OpenAI returns the structured result; Microsoft returns a data URL string. + expect(await openai.captureScreenshot()).toEqual(jpeg); + expect(await microsoft.captureScreenshot()).toBe( + "data:image/jpeg;base64,jpeg-bytes", + ); + }); + + it("options.base64Image still defaults to image/png", async () => { + google.setScreenshotProvider(async () => jpeg); + const result = await google.captureScreenshot({ base64Image: "png-bytes" }); + expect(result).toEqual({ base64: "png-bytes", mediaType: "image/png" }); + }); +}); diff --git a/packages/core/tests/unit/microsoft-cua-client.test.ts b/packages/core/tests/unit/microsoft-cua-client.test.ts index 83496deb6..61e9c8e45 100644 --- a/packages/core/tests/unit/microsoft-cua-client.test.ts +++ b/packages/core/tests/unit/microsoft-cua-client.test.ts @@ -7,7 +7,10 @@ function createClient() { apiKey: "test-key", baseURL: "https://example.com", }); - client.setScreenshotProvider(async () => "mock-base64-screenshot"); + client.setScreenshotProvider(async () => ({ + base64: "mock-base64-screenshot", + mediaType: "image/png", + })); return client; } diff --git a/packages/core/tests/unit/public-api/llm-and-agents.test.ts b/packages/core/tests/unit/public-api/llm-and-agents.test.ts index 277858f24..fa6f5c49b 100644 --- a/packages/core/tests/unit/public-api/llm-and-agents.test.ts +++ b/packages/core/tests/unit/public-api/llm-and-agents.test.ts @@ -171,7 +171,9 @@ describe("LLM and Agents public API types", () => { ) => Promise; setViewport: (width: number, height: number) => void; setCurrentUrl: (url: string) => void; - setScreenshotProvider: (provider: () => Promise) => void; + setScreenshotProvider: ( + provider: () => Promise, + ) => void; setActionHandler: ( handler: (action: Stagehand.AgentAction) => Promise, ) => void;