Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/fix-screenshot-provider-mediatype.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
Comment thread
seanmcguire12 marked this conversation as resolved.
---

Use the screenshot provider's declared media type when sending CUA image payloads. The `setScreenshotProvider` callback now returns `ScreenshotProviderResult` (`{ base64, mediaType }`) instead of a bare base64 string.
5 changes: 4 additions & 1 deletion packages/core/lib/v3/agent/AgentClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
AgentResult,
AgentType,
AgentExecutionOptions,
ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";

Expand Down Expand Up @@ -37,7 +38,9 @@ export abstract class AgentClient {

abstract setCurrentUrl(url: string): void;

abstract setScreenshotProvider(provider: () => Promise<string>): void;
abstract setScreenshotProvider(
provider: () => Promise<ScreenshotProviderResult>,
): void;

abstract setActionHandler(
handler: (action: AgentAction) => Promise<void>,
Expand Down
30 changes: 18 additions & 12 deletions packages/core/lib/v3/agent/AnthropicCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
AnthropicToolResult,
AgentExecutionOptions,
ToolUseItem,
ScreenshotProviderResult,
} from "../types/public/agent.js";
import { LogLine } from "../types/public/logs.js";
import { ClientOptions, ThinkingEffort } from "../types/public/model.js";
Expand Down Expand Up @@ -46,7 +47,7 @@ export class AnthropicCUAClient extends AgentClient {
public lastMessageId?: string;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
private screenshotProvider?: () => Promise<string>;
private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise<void>;
private thinkingBudget: number | null = null;
private thinkingEffort: ThinkingEffort | null = null;
Expand Down Expand Up @@ -106,7 +107,9 @@ export class AnthropicCUAClient extends AgentClient {
this.currentUrl = url;
}

setScreenshotProvider(provider: () => Promise<string>): void {
setScreenshotProvider(
provider: () => Promise<ScreenshotProviderResult>,
): void {
this.screenshotProvider = provider;
}

Expand Down Expand Up @@ -664,7 +667,7 @@ export class AnthropicCUAClient extends AgentClient {
const screenshot = await this.captureScreenshot();
logger({
category: "agent",
message: `Screenshot captured, length: ${screenshot.length}`,
message: `Screenshot captured, length: ${screenshot.base64.length}`,
level: 2,
});

Expand All @@ -674,8 +677,8 @@ export class AnthropicCUAClient extends AgentClient {
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: screenshot.replace(/^data:image\/png;base64,/, ""),
media_type: screenshot.mediaType,
data: screenshot.base64,
},
},
];
Expand Down Expand Up @@ -785,8 +788,8 @@ export class AnthropicCUAClient extends AgentClient {
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: screenshot.replace(/^data:image\/png;base64,/, ""),
media_type: screenshot.mediaType,
data: screenshot.base64,
},
},
{
Expand Down Expand Up @@ -1038,18 +1041,21 @@ export class AnthropicCUAClient extends AgentClient {

async captureScreenshot(options?: {
base64Image?: string;
mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<string> {
}): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
return `data:image/png;base64,${options.base64Image}`;
return {
base64: options.base64Image,
mediaType: options.mediaType ?? "image/png",
};
}

// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
const base64Image = await this.screenshotProvider();
return `data:image/png;base64,${base64Image}`;
return await this.screenshotProvider();
Comment thread
seanmcguire12 marked this conversation as resolved.
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
Expand All @@ -1058,7 +1064,7 @@ export class AnthropicCUAClient extends AgentClient {

throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}
}
27 changes: 15 additions & 12 deletions packages/core/lib/v3/agent/GoogleCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
AgentExecutionOptions,
SafetyCheck,
SafetyConfirmationHandler,
ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
Expand Down Expand Up @@ -49,7 +50,7 @@ export class GoogleCUAClient extends AgentClient {
private client: GoogleGenAI;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
private screenshotProvider?: () => Promise<string>;
private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise<void>;
private history: Content[] = [];
private environment: "ENVIRONMENT_BROWSER" | "ENVIRONMENT_DESKTOP" =
Expand Down Expand Up @@ -129,7 +130,9 @@ export class GoogleCUAClient extends AgentClient {
this.currentUrl = url;
}

setScreenshotProvider(provider: () => Promise<string>): void {
setScreenshotProvider(
provider: () => Promise<ScreenshotProviderResult>,
): void {
this.screenshotProvider = provider;
}

Expand Down Expand Up @@ -599,10 +602,7 @@ export class GoogleCUAClient extends AgentClient {
});

const screenshot = await this.captureScreenshot();
const base64Data = screenshot.replace(
/^data:image\/png;base64,/,
"",
);
const base64Data = screenshot.base64;

// Create one function response for each computer use function call
// Following Python SDK pattern: FunctionResponse with parts containing inline_data
Expand All @@ -629,7 +629,7 @@ export class GoogleCUAClient extends AgentClient {
parts: [
{
inlineData: {
mimeType: "image/png",
mimeType: screenshot.mediaType,
data: base64Data,
},
},
Expand Down Expand Up @@ -1160,23 +1160,26 @@ export class GoogleCUAClient extends AgentClient {

async captureScreenshot(options?: {
base64Image?: string;
mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<string> {
}): Promise<ScreenshotProviderResult> {
// Update current URL if provided
if (options?.currentUrl) {
this.currentUrl = options.currentUrl;
}

// Use provided options if available
if (options?.base64Image) {
return `data:image/png;base64,${options.base64Image}`;
return {
base64: options.base64Image,
mediaType: options.mediaType ?? "image/png",
};
}

// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
const base64Image = await this.screenshotProvider();
return `data:image/png;base64,${base64Image}`;
return await this.screenshotProvider();
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
Expand All @@ -1185,7 +1188,7 @@ export class GoogleCUAClient extends AgentClient {

throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}
}
11 changes: 7 additions & 4 deletions packages/core/lib/v3/agent/MicrosoftCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
AgentResult,
AgentType,
AgentExecutionOptions,
ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
Expand Down Expand Up @@ -56,7 +57,7 @@ export class MicrosoftCUAClient extends AgentClient {
private client: OpenAI;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
private screenshotProvider?: () => Promise<string>;
private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise<void>;

// Dual history system
Expand Down Expand Up @@ -138,7 +139,9 @@ export class MicrosoftCUAClient extends AgentClient {
this.currentUrl = url;
}

setScreenshotProvider(provider: () => Promise<string>): void {
setScreenshotProvider(
provider: () => Promise<ScreenshotProviderResult>,
): void {
this.screenshotProvider = provider;
}

Expand Down Expand Up @@ -532,8 +535,8 @@ For each function call, return a json object with function name and arguments wi
throw new AgentScreenshotProviderError("Screenshot provider not set");
}

const base64Screenshot = await this.screenshotProvider();
return `data:image/png;base64,${base64Screenshot}`;
const screenshot = await this.screenshotProvider();
return `data:${screenshot.mediaType};base64,${screenshot.base64}`;
}

/**
Expand Down
35 changes: 24 additions & 11 deletions packages/core/lib/v3/agent/OpenAICUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
FunctionCallItem,
SafetyCheck,
SafetyConfirmationHandler,
ScreenshotProviderResult,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
Expand Down Expand Up @@ -49,7 +50,7 @@ export class OpenAICUAClient extends AgentClient {
public lastResponseId?: string;
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
private screenshotProvider?: () => Promise<string>;
private screenshotProvider?: () => Promise<ScreenshotProviderResult>;
private actionHandler?: (action: AgentAction) => Promise<void>;
private reasoningItems: Map<string, ResponseItem> = new Map();
private environment: string = "browser"; // "browser", "mac", "windows", or "ubuntu"
Expand Down Expand Up @@ -107,7 +108,9 @@ export class OpenAICUAClient extends AgentClient {
this.currentUrl = url;
}

setScreenshotProvider(provider: () => Promise<string>): void {
setScreenshotProvider(
provider: () => Promise<ScreenshotProviderResult>,
): void {
this.screenshotProvider = provider;
}

Expand Down Expand Up @@ -467,7 +470,7 @@ export class OpenAICUAClient extends AgentClient {
if (initialScreenshot) {
const screenshotInput: ResponseInputImage = {
type: "input_image",
image_url: initialScreenshot,
image_url: this.toDataUrl(initialScreenshot),
detail: "high",
};
userContent.push(screenshotInput);
Expand Down Expand Up @@ -635,7 +638,7 @@ export class OpenAICUAClient extends AgentClient {
call_id: item.call_id,
output: {
type: outputType,
image_url: screenshot,
image_url: this.toDataUrl(screenshot),
...(this.usesNewComputerTool
? { detail: "original" as const }
: {}),
Expand Down Expand Up @@ -713,7 +716,7 @@ export class OpenAICUAClient extends AgentClient {
call_id: item.call_id,
output: {
type: outputType,
image_url: screenshot,
image_url: this.toDataUrl(screenshot),
error: errorMessage,
...(this.usesNewComputerTool
? { detail: "original" as const }
Expand Down Expand Up @@ -899,7 +902,9 @@ export class OpenAICUAClient extends AgentClient {
return notes;
}

private async captureInitialScreenshot(): Promise<string | undefined> {
private async captureInitialScreenshot(): Promise<
ScreenshotProviderResult | undefined
> {
if (!this.screenshotProvider) {
return undefined;
}
Expand Down Expand Up @@ -941,18 +946,21 @@ export class OpenAICUAClient extends AgentClient {

async captureScreenshot(options?: {
base64Image?: string;
mediaType?: "image/png" | "image/jpeg";
currentUrl?: string;
}): Promise<string> {
}): Promise<ScreenshotProviderResult> {
// Use provided options if available
if (options?.base64Image) {
return `data:image/png;base64,${options.base64Image}`;
return {
base64: options.base64Image,
mediaType: options.mediaType ?? "image/png",
};
}

// Use the screenshot provider if available
if (this.screenshotProvider) {
try {
const base64Image = await this.screenshotProvider();
return `data:image/png;base64,${base64Image}`;
return await this.screenshotProvider();
Comment thread
seanmcguire12 marked this conversation as resolved.
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
Expand All @@ -961,7 +969,12 @@ export class OpenAICUAClient extends AgentClient {

throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image and media type",
);
}

/**
 * Format a screenshot as a `data:` URL of the form
 * `data:<mediaType>;base64,<base64>`.
 *
 * This client passes the result as the `image_url` of the image payloads it
 * sends to OpenAI.
 *
 * @param screenshot - The screenshot's base64 payload and its media type.
 * @returns The `data:` URL string built from those two fields.
 */
private toDataUrl(screenshot: ScreenshotProviderResult): string {
  const { mediaType, base64 } = screenshot;
  return `data:${mediaType};base64,${base64}`;
}
}
16 changes: 13 additions & 3 deletions packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,17 @@ export class V3CuaAgentHandler {
this.agentClient.setScreenshotProvider(async () => {
this.ensureNotClosed();
const page = await this.v3.context.awaitActivePage();
const screenshotBuffer = await page.screenshot({ fullPage: false });
const screenshotBuffer = await page.screenshot({
fullPage: false,
type: "png",
});

await this.emitCuaScreenshot(screenshotBuffer, page.url());

return screenshotBuffer.toString("base64"); // base64 png
return {
base64: screenshotBuffer.toString("base64"),
mediaType: "image/png",
};
});

// Provide action executor
Expand Down Expand Up @@ -732,7 +738,10 @@ export class V3CuaAgentHandler {
});
try {
const page = await this.v3.context.awaitActivePage();
const screenshotBuffer = await page.screenshot({ fullPage: false });
const screenshotBuffer = await page.screenshot({
fullPage: false,
type: "png",
});

const currentUrl = page.url();

Expand All @@ -741,6 +750,7 @@ export class V3CuaAgentHandler {

return await this.agentClient.captureScreenshot({
base64Image: screenshotBuffer.toString("base64"),
mediaType: "image/png",
currentUrl,
});
} catch (e) {
Expand Down
Loading
Loading