From 75036875a1d4227d8f65c4651562211573d752da Mon Sep 17 00:00:00 2001
From: yawbtng <154343001+yawbtng@users.noreply.github.com>
Date: Thu, 2 Jul 2026 11:41:12 -0700
Subject: [PATCH 1/5] fix(core): declare CUA screenshot media type at capture
boundary (#2300)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## why
Closes #2046. This is the reshaped version of #2159, following the
approach @seanmcguire12 outlined when closing that PR.
`setScreenshotProvider` returned a bare base64 string, so every CUA
client had to independently infer or hardcode the media type — all four
assumed `image/png`. A non-PNG screenshot (e.g. a JPEG from a custom
provider) was then mislabeled as PNG in the provider function-response
payload, which is the root of #2046. Clients also stripped a hardcoded
`data:image/png;base64,` prefix by regex, so any other prefix silently
broke.
## what changed
Move the media-type declaration to the capture boundary.
`setScreenshotProvider` now returns an explicit payload:
```ts
export interface ScreenshotProviderResult {
  base64: string;
  mediaType: "image/png" | "image/jpeg";
}
```
- **Default handler** (`v3CuaAgentHandler`) captures PNG explicitly
(`type: "png"`) and returns `{ base64, mediaType: "image/png" }`, so the
default is unchanged.
- **Anthropic**: `media_type: screenshot.mediaType`, `data:
screenshot.base64` (drops the `.replace(/^data:image\/png;base64,/,
"")`).
- **Google**: `mimeType: screenshot.mediaType` (drops the PNG-only
prefix strip).
- **OpenAI / Microsoft**: build
`data:${screenshot.mediaType};base64,${screenshot.base64}`.
- `options.base64Image` (caller-supplied) still defaults to `image/png`,
preserving existing behavior.
`ScreenshotProviderResult` is exported from the public entrypoint.
## testing
- New `cua-screenshot-mediatype.test.ts`: asserts a non-PNG
(`image/jpeg`) media type is honored by all four clients'
`captureScreenshot()`, and that the `options.base64Image` path still
defaults to `image/png` (checked on the Google client).
- Updated the public API type test for `setScreenshotProvider(...)` and
the Anthropic/Microsoft CUA client tests to the new provider shape.
- `pnpm --filter @browserbasehq/stagehand run typecheck` passes; the CUA
+ public-API unit suites are green (55 tests).
---
## Summary by cubic
Declare the screenshot media type at the capture boundary and pass it
through all CUA clients. Fixes non‑PNG screenshots being mislabeled as
PNG and removes PNG-only prefix stripping.
- **Bug Fixes**
  - `setScreenshotProvider` now returns `{ base64, mediaType }`
    (`ScreenshotProviderResult`) instead of a string.
  - Default handler explicitly captures PNG and returns `image/png`.
  - Clients: Anthropic/Google pass `mediaType` through; OpenAI/Microsoft
    build `data:${mediaType};base64,${base64}`; removed PNG-only prefix
    regex.
  - `options.base64Image` still defaults to `image/png`.
  - Added tests validating JPEG flows through all clients; updated public
    API type tests.
- **Migration**
  - If you provide a custom `setScreenshotProvider`, return `{ base64,
    mediaType: "image/png" | "image/jpeg" }` instead of a base64 string.
  - No changes needed if you use the built-in handler.
Written for commit affd2ad7b58d71294886b12765661472139c7089.
Summary will update on new commits.
---
.../fix-screenshot-provider-mediatype.md | 5 ++
packages/core/lib/v3/agent/AgentClient.ts | 5 +-
.../core/lib/v3/agent/AnthropicCUAClient.ts | 24 +++---
packages/core/lib/v3/agent/GoogleCUAClient.ts | 21 +++--
.../core/lib/v3/agent/MicrosoftCUAClient.ts | 11 ++-
packages/core/lib/v3/agent/OpenAICUAClient.ts | 29 ++++---
.../core/lib/v3/handlers/v3CuaAgentHandler.ts | 10 ++-
packages/core/lib/v3/types/public/agent.ts | 10 +++
.../unit/anthropic-cua-triple-click.test.ts | 5 +-
.../unit/cua-screenshot-mediatype.test.ts | 78 +++++++++++++++++++
.../tests/unit/microsoft-cua-client.test.ts | 5 +-
.../unit/public-api/llm-and-agents.test.ts | 4 +-
12 files changed, 165 insertions(+), 42 deletions(-)
create mode 100644 .changeset/fix-screenshot-provider-mediatype.md
create mode 100644 packages/core/tests/unit/cua-screenshot-mediatype.test.ts
diff --git a/.changeset/fix-screenshot-provider-mediatype.md b/.changeset/fix-screenshot-provider-mediatype.md
new file mode 100644
index 0000000000..4636b71ca3
--- /dev/null
+++ b/.changeset/fix-screenshot-provider-mediatype.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Declare the CUA screenshot media type at the capture boundary instead of hardcoding `image/png` in each computer-use client. `setScreenshotProvider` now returns `{ base64, mediaType }` (`ScreenshotProviderResult`) rather than a bare base64 string, and the Anthropic, Google, OpenAI, and Microsoft clients pass the media type through to their function-response payloads. This fixes non-PNG screenshots being mislabeled as PNG (closes #2046) and removes the per-client PNG-only data-URL parsing.
diff --git a/packages/core/lib/v3/agent/AgentClient.ts b/packages/core/lib/v3/agent/AgentClient.ts
index 294afe0230..fed1a02788 100644
--- a/packages/core/lib/v3/agent/AgentClient.ts
+++ b/packages/core/lib/v3/agent/AgentClient.ts
@@ -3,6 +3,7 @@ import {
AgentResult,
AgentType,
AgentExecutionOptions,
+ ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
@@ -37,7 +38,9 @@ export abstract class AgentClient {
abstract setCurrentUrl(url: string): void;
- abstract setScreenshotProvider(provider: () => Promise<string>): void;
+ abstract setScreenshotProvider(
+ provider: () => Promise<ScreenshotProviderResult>,
+ ): void;
abstract setActionHandler(
handler: (action: AgentAction) => Promise,
diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
index 9590daccd4..a7d5b2d8a9 100644
--- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts
+++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
@@ -8,6 +8,7 @@ import {
AnthropicToolResult,
AgentExecutionOptions,
ToolUseItem,
+ ScreenshotProviderResult,
} from "../types/public/agent.js";
import { LogLine } from "../types/public/logs.js";
import { ClientOptions, ThinkingEffort } from "../types/public/model.js";
@@ -46,7 +47,7 @@ export class AnthropicCUAClient extends AgentClient {
public lastMessageId?: string;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
- private screenshotProvider?: () => Promise<string>;
+ private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise;
private thinkingBudget: number | null = null;
private thinkingEffort: ThinkingEffort | null = null;
@@ -106,7 +107,9 @@ export class AnthropicCUAClient extends AgentClient {
this.currentUrl = url;
}
- setScreenshotProvider(provider: () => Promise<string>): void {
+ setScreenshotProvider(
+ provider: () => Promise<ScreenshotProviderResult>,
+ ): void {
this.screenshotProvider = provider;
}
@@ -664,7 +667,7 @@ export class AnthropicCUAClient extends AgentClient {
const screenshot = await this.captureScreenshot();
logger({
category: "agent",
- message: `Screenshot captured, length: ${screenshot.length}`,
+ message: `Screenshot captured, length: ${screenshot.base64.length}`,
level: 2,
});
@@ -674,8 +677,8 @@ export class AnthropicCUAClient extends AgentClient {
type: "image",
source: {
type: "base64",
- media_type: "image/png",
- data: screenshot.replace(/^data:image\/png;base64,/, ""),
+ media_type: screenshot.mediaType,
+ data: screenshot.base64,
},
},
];
@@ -785,8 +788,8 @@ export class AnthropicCUAClient extends AgentClient {
type: "image",
source: {
type: "base64",
- media_type: "image/png",
- data: screenshot.replace(/^data:image\/png;base64,/, ""),
+ media_type: screenshot.mediaType,
+ data: screenshot.base64,
},
},
{
@@ -1039,17 +1042,16 @@ export class AnthropicCUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
currentUrl?: string;
- }): Promise<string> {
+ }): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
- return `data:image/png;base64,${options.base64Image}`;
+ return { base64: options.base64Image, mediaType: "image/png" };
}
// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
- const base64Image = await this.screenshotProvider();
- return `data:image/png;base64,${base64Image}`;
+ return await this.screenshotProvider();
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts
index b49503f799..f26aab2e30 100644
--- a/packages/core/lib/v3/agent/GoogleCUAClient.ts
+++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts
@@ -16,6 +16,7 @@ import {
AgentExecutionOptions,
SafetyCheck,
SafetyConfirmationHandler,
+ ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
@@ -49,7 +50,7 @@ export class GoogleCUAClient extends AgentClient {
private client: GoogleGenAI;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
- private screenshotProvider?: () => Promise<string>;
+ private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise;
private history: Content[] = [];
private environment: "ENVIRONMENT_BROWSER" | "ENVIRONMENT_DESKTOP" =
@@ -129,7 +130,9 @@ export class GoogleCUAClient extends AgentClient {
this.currentUrl = url;
}
- setScreenshotProvider(provider: () => Promise<string>): void {
+ setScreenshotProvider(
+ provider: () => Promise<ScreenshotProviderResult>,
+ ): void {
this.screenshotProvider = provider;
}
@@ -599,10 +602,7 @@ export class GoogleCUAClient extends AgentClient {
});
const screenshot = await this.captureScreenshot();
- const base64Data = screenshot.replace(
- /^data:image\/png;base64,/,
- "",
- );
+ const base64Data = screenshot.base64;
// Create one function response for each computer use function call
// Following Python SDK pattern: FunctionResponse with parts containing inline_data
@@ -629,7 +629,7 @@ export class GoogleCUAClient extends AgentClient {
parts: [
{
inlineData: {
- mimeType: "image/png",
+ mimeType: screenshot.mediaType,
data: base64Data,
},
},
@@ -1161,7 +1161,7 @@ export class GoogleCUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
currentUrl?: string;
- }): Promise<string> {
+ }): Promise<ScreenshotProviderResult> {
// Update current URL if provided
if (options?.currentUrl) {
this.currentUrl = options.currentUrl;
@@ -1169,14 +1169,13 @@ export class GoogleCUAClient extends AgentClient {
// Use provided options if available
if (options?.base64Image) {
- return `data:image/png;base64,${options.base64Image}`;
+ return { base64: options.base64Image, mediaType: "image/png" };
}
// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
- const base64Image = await this.screenshotProvider();
- return `data:image/png;base64,${base64Image}`;
+ return await this.screenshotProvider();
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
diff --git a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts
index cb4c212389..3d9f42660c 100644
--- a/packages/core/lib/v3/agent/MicrosoftCUAClient.ts
+++ b/packages/core/lib/v3/agent/MicrosoftCUAClient.ts
@@ -5,6 +5,7 @@ import {
AgentResult,
AgentType,
AgentExecutionOptions,
+ ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
@@ -56,7 +57,7 @@ export class MicrosoftCUAClient extends AgentClient {
private client: OpenAI;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
- private screenshotProvider?: () => Promise<string>;
+ private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise;
// Dual history system
@@ -138,7 +139,9 @@ export class MicrosoftCUAClient extends AgentClient {
this.currentUrl = url;
}
- setScreenshotProvider(provider: () => Promise<string>): void {
+ setScreenshotProvider(
+ provider: () => Promise<ScreenshotProviderResult>,
+ ): void {
this.screenshotProvider = provider;
}
@@ -532,8 +535,8 @@ For each function call, return a json object with function name and arguments wi
throw new AgentScreenshotProviderError("Screenshot provider not set");
}
- const base64Screenshot = await this.screenshotProvider();
- return `data:image/png;base64,${base64Screenshot}`;
+ const screenshot = await this.screenshotProvider();
+ return `data:${screenshot.mediaType};base64,${screenshot.base64}`;
}
/**
diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts
index 1843463810..6e05e903ed 100644
--- a/packages/core/lib/v3/agent/OpenAICUAClient.ts
+++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts
@@ -16,6 +16,7 @@ import {
FunctionCallItem,
SafetyCheck,
SafetyConfirmationHandler,
+ ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
@@ -49,7 +50,7 @@ export class OpenAICUAClient extends AgentClient {
public lastResponseId?: string;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
- private screenshotProvider?: () => Promise<string>;
+ private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise;
private reasoningItems: Map = new Map();
private environment: string = "browser"; // "browser", "mac", "windows", or "ubuntu"
@@ -107,7 +108,9 @@ export class OpenAICUAClient extends AgentClient {
this.currentUrl = url;
}
- setScreenshotProvider(provider: () => Promise<string>): void {
+ setScreenshotProvider(
+ provider: () => Promise<ScreenshotProviderResult>,
+ ): void {
this.screenshotProvider = provider;
}
@@ -467,7 +470,7 @@ export class OpenAICUAClient extends AgentClient {
if (initialScreenshot) {
const screenshotInput: ResponseInputImage = {
type: "input_image",
- image_url: initialScreenshot,
+ image_url: this.toDataUrl(initialScreenshot),
detail: "high",
};
userContent.push(screenshotInput);
@@ -635,7 +638,7 @@ export class OpenAICUAClient extends AgentClient {
call_id: item.call_id,
output: {
type: outputType,
- image_url: screenshot,
+ image_url: this.toDataUrl(screenshot),
...(this.usesNewComputerTool
? { detail: "original" as const }
: {}),
@@ -713,7 +716,7 @@ export class OpenAICUAClient extends AgentClient {
call_id: item.call_id,
output: {
type: outputType,
- image_url: screenshot,
+ image_url: this.toDataUrl(screenshot),
error: errorMessage,
...(this.usesNewComputerTool
? { detail: "original" as const }
@@ -899,7 +902,9 @@ export class OpenAICUAClient extends AgentClient {
return notes;
}
- private async captureInitialScreenshot(): Promise<string | undefined> {
+ private async captureInitialScreenshot(): Promise<
+ ScreenshotProviderResult | undefined
+ > {
if (!this.screenshotProvider) {
return undefined;
}
@@ -942,17 +947,16 @@ export class OpenAICUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
currentUrl?: string;
- }): Promise<string> {
+ }): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
- return `data:image/png;base64,${options.base64Image}`;
+ return { base64: options.base64Image, mediaType: "image/png" };
}
// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
- const base64Image = await this.screenshotProvider();
- return `data:image/png;base64,${base64Image}`;
+ return await this.screenshotProvider();
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
@@ -964,4 +968,9 @@ export class OpenAICUAClient extends AgentClient {
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
}
+
+ /** Build the `data:` URL the OpenAI image payload expects. */
+ private toDataUrl(screenshot: ScreenshotProviderResult): string {
+ return `data:${screenshot.mediaType};base64,${screenshot.base64}`;
+ }
}
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 9dc3901bf1..c4fee1f16b 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -87,11 +87,17 @@ export class V3CuaAgentHandler {
this.agentClient.setScreenshotProvider(async () => {
this.ensureNotClosed();
const page = await this.v3.context.awaitActivePage();
- const screenshotBuffer = await page.screenshot({ fullPage: false });
+ const screenshotBuffer = await page.screenshot({
+ fullPage: false,
+ type: "png",
+ });
await this.emitCuaScreenshot(screenshotBuffer, page.url());
- return screenshotBuffer.toString("base64"); // base64 png
+ return {
+ base64: screenshotBuffer.toString("base64"),
+ mediaType: "image/png",
+ };
});
// Provide action executor
diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts
index 90deb3be8e..391154dc1d 100644
--- a/packages/core/lib/v3/types/public/agent.ts
+++ b/packages/core/lib/v3/types/public/agent.ts
@@ -27,6 +27,16 @@ import { Page as PuppeteerPage } from "puppeteer-core";
import { Page as PatchrightPage } from "patchright-core";
import { Page } from "../../understudy/page.js";
+/**
+ * Result of a screenshot provider: the base64-encoded image bytes plus an
+ * explicit media type. Declaring the media type at the capture boundary lets
+ * every CUA client pass it through instead of hardcoding or inferring it.
+ */
+export interface ScreenshotProviderResult {
+ base64: string;
+ mediaType: "image/png" | "image/jpeg";
+}
+
// =============================================================================
// Variable Types
// =============================================================================
diff --git a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts
index fe07561da6..7620f9a7f8 100644
--- a/packages/core/tests/unit/anthropic-cua-triple-click.test.ts
+++ b/packages/core/tests/unit/anthropic-cua-triple-click.test.ts
@@ -35,7 +35,10 @@ describe("AnthropicCUAClient triple_click handling", () => {
},
);
client.setViewport(1280, 720);
- client.setScreenshotProvider(async () => "fake-base64-screenshot");
+ client.setScreenshotProvider(async () => ({
+ base64: "fake-base64-screenshot",
+ mediaType: "image/png",
+ }));
executedActions = [];
client.setActionHandler(async (action) => {
diff --git a/packages/core/tests/unit/cua-screenshot-mediatype.test.ts b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts
new file mode 100644
index 0000000000..5507712c15
--- /dev/null
+++ b/packages/core/tests/unit/cua-screenshot-mediatype.test.ts
@@ -0,0 +1,78 @@
+import { describe, it, expect, beforeEach } from "vitest";
+import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js";
+import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js";
+import { OpenAICUAClient } from "../../lib/v3/agent/OpenAICUAClient.js";
+import { MicrosoftCUAClient } from "../../lib/v3/agent/MicrosoftCUAClient.js";
+import type { ScreenshotProviderResult } from "../../lib/v3/types/public/agent.js";
+
+/**
+ * Regression coverage for #2159 / #2046.
+ *
+ * The screenshot provider now declares the media type at the capture boundary
+ * (`{ base64, mediaType }`) instead of each CUA client hardcoding or inferring
+ * "image/png". These tests assert every client's `captureScreenshot()` honors a
+ * non-PNG media type rather than silently mislabeling it — the failure mode that
+ * broke non-PNG function responses.
+ */
+describe("CUA clients thread screenshot mediaType through captureScreenshot", () => {
+ const jpeg: ScreenshotProviderResult = {
+ base64: "jpeg-bytes",
+ mediaType: "image/jpeg",
+ };
+
+ let google: GoogleCUAClient;
+ let anthropic: AnthropicCUAClient;
+ let openai: OpenAICUAClient;
+ let microsoft: MicrosoftCUAClient;
+
+ beforeEach(() => {
+ google = new GoogleCUAClient(
+ "google",
+ "gemini-2.5-computer-use-preview-10-2025",
+ undefined,
+ {
+ apiKey: "test",
+ },
+ );
+ anthropic = new AnthropicCUAClient(
+ "anthropic",
+ "claude-sonnet-4-5-20250929",
+ undefined,
+ {
+ apiKey: "test",
+ },
+ );
+ openai = new OpenAICUAClient("openai", "computer-use-preview", undefined, {
+ apiKey: "test",
+ });
+ microsoft = new MicrosoftCUAClient("microsoft", "fara-7b", undefined, {
+ apiKey: "test",
+ baseURL: "https://example.com",
+ });
+ });
+
+ it("Anthropic/Google return the provider's mediaType verbatim", async () => {
+ google.setScreenshotProvider(async () => jpeg);
+ anthropic.setScreenshotProvider(async () => jpeg);
+
+ expect(await google.captureScreenshot()).toEqual(jpeg);
+ expect(await anthropic.captureScreenshot()).toEqual(jpeg);
+ });
+
+ it("OpenAI/Microsoft build the data URL with the provider's mediaType", async () => {
+ openai.setScreenshotProvider(async () => jpeg);
+ microsoft.setScreenshotProvider(async () => jpeg);
+
+ // OpenAI returns the structured result; Microsoft returns a data URL string.
+ expect(await openai.captureScreenshot()).toEqual(jpeg);
+ expect(await microsoft.captureScreenshot()).toBe(
+ "data:image/jpeg;base64,jpeg-bytes",
+ );
+ });
+
+ it("options.base64Image still defaults to image/png", async () => {
+ google.setScreenshotProvider(async () => jpeg);
+ const result = await google.captureScreenshot({ base64Image: "png-bytes" });
+ expect(result).toEqual({ base64: "png-bytes", mediaType: "image/png" });
+ });
+});
diff --git a/packages/core/tests/unit/microsoft-cua-client.test.ts b/packages/core/tests/unit/microsoft-cua-client.test.ts
index 83496deb65..61e9c8e45b 100644
--- a/packages/core/tests/unit/microsoft-cua-client.test.ts
+++ b/packages/core/tests/unit/microsoft-cua-client.test.ts
@@ -7,7 +7,10 @@ function createClient() {
apiKey: "test-key",
baseURL: "https://example.com",
});
- client.setScreenshotProvider(async () => "mock-base64-screenshot");
+ client.setScreenshotProvider(async () => ({
+ base64: "mock-base64-screenshot",
+ mediaType: "image/png",
+ }));
return client;
}
diff --git a/packages/core/tests/unit/public-api/llm-and-agents.test.ts b/packages/core/tests/unit/public-api/llm-and-agents.test.ts
index 277858f244..fa6f5c49be 100644
--- a/packages/core/tests/unit/public-api/llm-and-agents.test.ts
+++ b/packages/core/tests/unit/public-api/llm-and-agents.test.ts
@@ -171,7 +171,9 @@ describe("LLM and Agents public API types", () => {
) => Promise;
setViewport: (width: number, height: number) => void;
setCurrentUrl: (url: string) => void;
- setScreenshotProvider: (provider: () => Promise<string>) => void;
+ setScreenshotProvider: (
+ provider: () => Promise<Stagehand.ScreenshotProviderResult>,
+ ) => void;
setActionHandler: (
handler: (action: Stagehand.AgentAction) => Promise,
) => void;
From 9921041e10bf0ca8ef7f1e69566609612bddd38c Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Thu, 2 Jul 2026 12:42:47 -0700
Subject: [PATCH 2/5] update test with new type
---
.../tests/unit/agent-captcha-hooks.test.ts | 25 +++++++++++++------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts
index 62b8d38246..7d70c48cec 100644
--- a/packages/core/tests/unit/agent-captcha-hooks.test.ts
+++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts
@@ -60,7 +60,10 @@ class FakeCuaClient {
public contextNotes: string[] = [];
public preStepHook?: () => Promise;
public actionHandler?: (action: Record) => Promise;
- public screenshotProvider?: () => Promise<string>;
+ public screenshotProvider?: () => Promise<{
+ base64: string;
+ mediaType: "image/png" | "image/jpeg";
+ }>;
public executeImpl = vi.fn(async (options: unknown) => {
void options;
return {
@@ -73,9 +76,16 @@ class FakeCuaClient {
public captureScreenshot = vi.fn(async () => null);
public setViewport = vi.fn();
public setCurrentUrl = vi.fn();
- public setScreenshotProvider = vi.fn((provider: () => Promise<string>) => {
- this.screenshotProvider = provider;
- });
+ public setScreenshotProvider = vi.fn(
+ (
+ provider: () => Promise<{
+ base64: string;
+ mediaType: "image/png" | "image/jpeg";
+ }>,
+ ) => {
+ this.screenshotProvider = provider;
+ },
+ );
public setSafetyConfirmationHandler = vi.fn();
setActionHandler(
@@ -513,9 +523,10 @@ describe("v3 cua handler screenshot behavior", () => {
});
fakeCuaClient.executeImpl = vi.fn(async () => {
- await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe(
- screenshotBase64,
- );
+ await expect(fakeCuaClient.screenshotProvider?.()).resolves.toEqual({
+ base64: screenshotBase64,
+ mediaType: "image/png",
+ });
return {
success: true,
message: "ok",
From 54c59c081d198cbf77da9512b2ae69bb92895d37 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Thu, 2 Jul 2026 12:46:11 -0700
Subject: [PATCH 3/5] specify png in cua clients
---
packages/core/lib/v3/agent/AnthropicCUAClient.ts | 6 +++++-
packages/core/lib/v3/agent/GoogleCUAClient.ts | 6 +++++-
packages/core/lib/v3/agent/OpenAICUAClient.ts | 6 +++++-
packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 6 +++++-
4 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
index a7d5b2d8a9..6067b81b38 100644
--- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts
+++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
@@ -1041,11 +1041,15 @@ export class AnthropicCUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
+ mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
- return { base64: options.base64Image, mediaType: "image/png" };
+ return {
+ base64: options.base64Image,
+ mediaType: options.mediaType ?? "image/png",
+ };
}
// Use the screenshot provider if available
diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts
index f26aab2e30..a1cb8323ba 100644
--- a/packages/core/lib/v3/agent/GoogleCUAClient.ts
+++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts
@@ -1160,6 +1160,7 @@ export class GoogleCUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
+ mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<ScreenshotProviderResult> {
// Update current URL if provided
@@ -1169,7 +1170,10 @@ export class GoogleCUAClient extends AgentClient {
// Use provided options if available
if (options?.base64Image) {
- return { base64: options.base64Image, mediaType: "image/png" };
+ return {
+ base64: options.base64Image,
+ mediaType: options.mediaType ?? "image/png",
+ };
}
// Use the screenshot provider if available
diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts
index 6e05e903ed..ea43abe2a1 100644
--- a/packages/core/lib/v3/agent/OpenAICUAClient.ts
+++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts
@@ -946,11 +946,15 @@ export class OpenAICUAClient extends AgentClient {
async captureScreenshot(options?: {
base64Image?: string;
+ mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
- return { base64: options.base64Image, mediaType: "image/png" };
+ return {
+ base64: options.base64Image,
+ mediaType: options.mediaType ?? "image/png",
+ };
}
// Use the screenshot provider if available
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index c4fee1f16b..1367775fb5 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -738,7 +738,10 @@ export class V3CuaAgentHandler {
});
try {
const page = await this.v3.context.awaitActivePage();
- const screenshotBuffer = await page.screenshot({ fullPage: false });
+ const screenshotBuffer = await page.screenshot({
+ fullPage: false,
+ type: "png",
+ });
const currentUrl = page.url();
@@ -747,6 +750,7 @@ export class V3CuaAgentHandler {
return await this.agentClient.captureScreenshot({
base64Image: screenshotBuffer.toString("base64"),
+ mediaType: "image/png",
currentUrl,
});
} catch (e) {
From b74e39e6ae53db0dd49f36fa81659aa3f79877f1 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Thu, 2 Jul 2026 12:47:12 -0700
Subject: [PATCH 4/5] update error message
---
packages/core/lib/v3/agent/AnthropicCUAClient.ts | 2 +-
packages/core/lib/v3/agent/GoogleCUAClient.ts | 2 +-
packages/core/lib/v3/agent/OpenAICUAClient.ts | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
index 6067b81b38..b1e08b3aaa 100644
--- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts
+++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
@@ -1064,7 +1064,7 @@ export class AnthropicCUAClient extends AgentClient {
throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
- "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
+ "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}
}
diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts
index a1cb8323ba..d9b0a38346 100644
--- a/packages/core/lib/v3/agent/GoogleCUAClient.ts
+++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts
@@ -1188,7 +1188,7 @@ export class GoogleCUAClient extends AgentClient {
throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
- "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
+ "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}
}
diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts
index ea43abe2a1..6ff1a47141 100644
--- a/packages/core/lib/v3/agent/OpenAICUAClient.ts
+++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts
@@ -969,7 +969,7 @@ export class OpenAICUAClient extends AgentClient {
throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
- "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
+ "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}
From 42fb0853c468d906d312085360d3359738ba13e6 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Thu, 2 Jul 2026 13:00:36 -0700
Subject: [PATCH 5/5] update changeset
---
.changeset/fix-screenshot-provider-mediatype.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.changeset/fix-screenshot-provider-mediatype.md b/.changeset/fix-screenshot-provider-mediatype.md
index 4636b71ca3..4e77988f2a 100644
--- a/.changeset/fix-screenshot-provider-mediatype.md
+++ b/.changeset/fix-screenshot-provider-mediatype.md
@@ -2,4 +2,4 @@
"@browserbasehq/stagehand": patch
---
-Declare the CUA screenshot media type at the capture boundary instead of hardcoding `image/png` in each computer-use client. `setScreenshotProvider` now returns `{ base64, mediaType }` (`ScreenshotProviderResult`) rather than a bare base64 string, and the Anthropic, Google, OpenAI, and Microsoft clients pass the media type through to their function-response payloads. This fixes non-PNG screenshots being mislabeled as PNG (closes #2046) and removes the per-client PNG-only data-URL parsing.
+Use the screenshot provider's declared media type when sending CUA image payloads. The `setScreenshotProvider` callback now returns `ScreenshotProviderResult` (`{ base64, mediaType }`) instead of a bare base64 string.