Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions packages/evals/core/tools/browse_cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,30 @@ type BrowseCliPagesResult = {
}>;
};

// Subcommands that must NOT receive the --local/--remote mode flag.
//
// The mode flag picks the environment when the daemon is first started, and it
// is passed explicitly so that a set BROWSERBASE_API_KEY does not silently
// auto-select remote. Only the driver commands accept it; the subcommands
// listed here reject it, so it is left off for them. The session name, by
// contrast, is safe to pass on every command.
const BROWSE_MODELESS_COMMANDS = new Set<string>(["stop", "status"]);

class BrowseCliRuntime {
constructor(private readonly session: string) {}
constructor(
private readonly session: string,
private readonly modeFlag: "--local" | "--remote",
) {}

async runJson<T>(args: string[]): Promise<T> {
const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0])
? []
: [this.modeFlag];
const { stdout, stderr } = await execFileAsync(
process.execPath,
[
resolveBrowseCliEntrypoint(),
"--json",
...args,
...modeArgs,
"--session",
this.session,
...args,
],
{
cwd: getRepoRootDir(),
Expand Down Expand Up @@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession {
private activePageId: string | null = null;
private closed = false;

constructor(private readonly sessionName: string) {
this.runtime = new BrowseCliRuntime(sessionName);
constructor(
private readonly sessionName: string,
modeFlag: "--local" | "--remote",
) {
this.runtime = new BrowseCliRuntime(sessionName, modeFlag);
}

private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle {
Expand Down Expand Up @@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool {
);
}

const session = new BrowseCliSession(createSessionName());
await session.runtime.runJson([
"env",
input.environment === "BROWSERBASE" ? "remote" : "local",
]);
const session = new BrowseCliSession(
createSessionName(),
input.environment === "BROWSERBASE" ? "--remote" : "--local",
);

return {
session,
Expand Down
193 changes: 191 additions & 2 deletions packages/evals/framework/benchHarness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,25 @@ import {
AgentProvider,
getAISDKLanguageModel,
loadApiKeyFromEnv,
providerEnvVarMap,
V3,
type AgentInstance,
type AvailableModel,
type LLMClient,
type LogLine,
type V3,
type TaskSpec,
} from "@browserbasehq/stagehand";
import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
import { endBrowserbaseSession } from "../browserbaseCleanup.js";
import { EvalsError } from "../errors.js";
import type { EvalLogger } from "../logger.js";
import type { V3InitResult } from "../initV3.js";
import type { EvalInput } from "../types/evals.js";
import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
import {
runClaudeCodeAgent,
type ClaudeCodeVerifierConfig,
} from "./claudeCodeRunner.js";
import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
import { runCodexAgent } from "./codexRunner.js";
import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
Expand Down Expand Up @@ -181,6 +187,184 @@ export const stagehandHarness: BenchHarness = {
},
};

/**
* Default judge model for the claude_code rubric verifier — used for both rubric
* generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned
* default and reliably emits the verifier's structured-output schema; smaller
* models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused
* judgment call ("response did not match schema"), which the verifier reports as
* evidenceInsufficient → spurious outcome=false. Override with
* EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved).
* Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default.
*/
const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";

/**
* The Vercel AI Gateway provider (`gateway/...`) authenticates against
* AI_GATEWAY_API_KEY, but `gateway` is NOT in the SDK's providerEnvVarMap, so
* loadApiKeyFromEnv treats it like a keyless provider and returns undefined.
* A `gateway/` judge override would therefore silently skip its credential and
* downgrade the verifier. Resolve it explicitly here so a gateway judge sends
* the right key and still fails fast when the key is missing.
*/
const GATEWAY_JUDGE_PROVIDER = "gateway";
const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";

/**
 * Look up the API key to send for a judge provider.
 *
 * - No provider (undefined or empty): returns undefined.
 * - `gateway`: read straight from AI_GATEWAY_API_KEY, because the SDK's
 *   providerEnvVarMap has no entry for it and loadApiKeyFromEnv would treat it
 *   as keyless. An unset or empty variable yields undefined.
 * - Anything else: delegated to loadApiKeyFromEnv, with its log lines
 *   forwarded to `logger`.
 *
 * @param provider - Provider prefix of the judge model, if any.
 * @param logger - Receives log lines emitted by loadApiKeyFromEnv.
 * @returns The resolved key, or undefined when none is available.
 */
function resolveJudgeApiKey(
  provider: string | undefined,
  logger: EvalLogger,
): string | undefined {
  if (!provider) {
    return undefined;
  }

  if (provider !== GATEWAY_JUDGE_PROVIDER) {
    return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
  }

  // Gateway is resolved by hand; treat an empty value the same as unset.
  const gatewayKey = process.env[GATEWAY_JUDGE_API_KEY_ENV];
  if (typeof gatewayKey !== "string" || gatewayKey.length === 0) {
    return undefined;
  }
  return gatewayKey;
}

/**
 * Whether a judge provider genuinely requires an API key (so a missing key is a
 * misconfiguration, not a keyless provider).
 *
 * True for `gateway` (which the SDK's providerEnvVarMap omits but which needs
 * AI_GATEWAY_API_KEY) and for any provider that is an own key of
 * providerEnvVarMap. False when no provider is given and for providers absent
 * from the map.
 *
 * @param provider - Provider prefix of the judge model, if any.
 * @returns true when a key must be resolvable for this provider.
 */
function judgeProviderRequiresKey(provider: string | undefined): boolean {
  if (provider === undefined) return false;
  if (provider === GATEWAY_JUDGE_PROVIDER) return true;
  // Own-property check on purpose: the `in` operator also walks the prototype
  // chain, so names like "constructor" or "toString" would be misreported as
  // key-requiring providers and trigger a bogus missing-key error.
  return Object.prototype.hasOwnProperty.call(providerEnvVarMap, provider);
}

/**
 * Decide whether the rubric verifier runs for claude_code.
 *
 * The verifier is ON by default so browse runs get ground-truth scoring. It is
 * turned off only when EVAL_CLAUDE_CODE_VERIFIER is set to 0, false, off, or no
 * (compared case-insensitively, ignoring surrounding whitespace), in which case
 * scoring falls back to the agent's self-reported EVAL_RESULT line. Any other
 * value, including an empty string, leaves it enabled.
 *
 * @returns true when the verifier should run.
 */
function isClaudeCodeVerifierEnabled(): boolean {
  const setting = process.env.EVAL_CLAUDE_CODE_VERIFIER;
  if (setting === undefined) {
    return true;
  }

  const disablingValues = ["0", "false", "off", "no"];
  return !disablingValues.includes(setting.trim().toLowerCase());
}

/**
 * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
 * into the claude_code runner.
 *
 * Returns undefined (→ self-report fallback) when the verifier is disabled or
 * when constructing the V3 carrier throws — those cases never crash the run.
 * Exception: an explicit judge override (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose
 * provider requires a key that can't be resolved throws a config error rather
 * than silently downgrading to self-report.
 *
 * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
 * ClaudeCodeVerifierConfig it does NOT need init(). It is a browser-free V3
 * (env LOCAL, disableAPI) configured with the resolved judge model — the
 * built-in default or the EVAL_CLAUDE_CODE_VERIFIER_MODEL override — together
 * with that judge provider's API key when one was resolved.
 *
 * @param plan - Task plan supplying the task id, dataset, instruction, start
 *   URL and optional precomputed rubric / expected answer for the TaskSpec.
 * @param logger - Receives SDK log lines and the setup-skipped warning.
 * @returns The verifier config, or undefined to fall back to self-report.
 * @throws EvalsError when a judge override names a key-requiring provider and
 *   no API key could be resolved for it.
 */
export function buildClaudeCodeVerifierConfig(
  plan: ExternalHarnessTaskPlan,
  logger: EvalLogger,
): ClaudeCodeVerifierConfig | undefined {
  if (!isClaudeCodeVerifierEnabled()) return undefined;

  const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL;
  const judgeModel = (judgeModelOverride ||
    CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;

  // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
  // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
  // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
  const judgeProvider = judgeModel.includes("/")
    ? judgeModel.slice(0, judgeModel.indexOf("/"))
    : undefined;
  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
  const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;

  // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
  // instead of being swallowed into the self-report fallback. Otherwise
  // V3Evaluator backfills modelClientOptions with the Gemini key, hands the
  // wrong provider its credential, verify() throws, and the run silently
  // downgrades to legacy self-report. Surface the misconfiguration instead.
  //
  // Only providers that genuinely require a key qualify (see
  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
  // stay exempt: keyless judges proceed with no explicit apiKey, and the
  // default degrades gracefully to V3Evaluator's own key resolution.
  if (
    judgeModelOverride &&
    judgeProviderRequiresKey(judgeProvider) &&
    !judgeApiKey
  ) {
    throw new EvalsError(
      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
    );
  }

  try {
    // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
    const v3 = new V3({
      env: "LOCAL",
      verbose: 0,
      disableAPI: true,
      model: judgeClientOptions
        ? { modelName: judgeModel, ...judgeClientOptions }
        : judgeModel,
      logger: (line: LogLine) => logger.log(line),
    });

    const taskSpec: TaskSpec = {
      // Fallback id feeds the trajectory dir path, so sanitize the
      // instruction-derived segment — raw instruction text can contain `/`,
      // `..`, or other path-unsafe characters that would fork the output dir.
      id:
        plan.taskId ??
        `${plan.dataset}/${plan.instruction
          .slice(0, 40)
          .replace(/[^A-Za-z0-9_-]/g, "_")}`,
      instruction: plan.instruction,
      initUrl: plan.startUrl,
      ...(plan.precomputedRubric && {
        precomputedRubric: plan.precomputedRubric,
      }),
      ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }),
    };

    return {
      v3,
      taskSpec,
      dataset: plan.dataset,
      judgeModel,
      judgeClientOptions,
      // NOTE(review): passed through unvalidated — a value outside the three
      // modes is forwarded as-is; confirm the runner rejects or ignores it.
      successMode: process.env.EVAL_SUCCESS_MODE as
        | "outcome"
        | "process"
        | "both"
        | undefined,
    };
  } catch (error) {
    // Best-effort by design: a broken verifier setup degrades to self-report
    // instead of failing the whole run.
    logger.warn({
      category: "claude_code",
      message: `verifier setup skipped (falling back to self-report): ${
        error instanceof Error ? error.message : String(error)
      }`,
      level: 0,
    });
    return undefined;
  }
}

export const claudeCodeHarness: BenchHarness = {
harness: "claude_code",
supportedTaskKinds: ["agent", "suite"],
Expand All @@ -205,12 +389,17 @@ export const claudeCodeHarness: BenchHarness = {
logger,
});
try {
// Built inside the try so a fail-fast verifier-config error (e.g. an
// override judge whose key can't be resolved) still runs the finally that
// owns the prepared tool adapter, instead of leaking it.
const verifier = buildClaudeCodeVerifierConfig(plan, logger);
Comment thread
shrey150 marked this conversation as resolved.
return await runClaudeCodeAgent({
plan,
model: input.modelName,
logger,
toolAdapter,
signal,
verifier,
});
} finally {
await toolAdapter.cleanup();
Expand Down
29 changes: 27 additions & 2 deletions packages/evals/framework/claudeCodeRunner.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
import type {
AvailableModel,
ClientOptions,
TaskSpec,
V3,
} from "@browserbasehq/stagehand";
import { EvalsError } from "../errors.js";
import type { EvalLogger } from "../logger.js";
import type { TaskResult } from "./types.js";
Expand Down Expand Up @@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig {
taskSpec: TaskSpec;
/** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
dataset: string;
/**
* Judge model for V3Evaluator (scoring + rubric generation). When omitted the
* evaluator falls back to its own default (google/gemini-2.5-flash). Pass an
* Anthropic model here to score against ANTHROPIC_API_KEY.
*/
judgeModel?: AvailableModel;
/**
* Client options (API key) for the judge model. Required alongside judgeModel
* when the judge's provider differs from the evaluator's own default —
* otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key,
* which is sent as the wrong provider's credential (e.g. an Anthropic judge
* receives the Gemini key and fails with "invalid x-api-key").
*/
judgeClientOptions?: ClientOptions;
/** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
successMode?: "outcome" | "process" | "both";
/** Override trajectory persistence root. */
Expand Down Expand Up @@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({

const { V3Evaluator } = await import("@browserbasehq/stagehand");
const { RubricCache } = await import("./rubricCache.js");
const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
const evaluator = new V3Evaluator(verifier.v3, {
backend: "verifier",
...(verifier.judgeModel && { modelName: verifier.judgeModel }),
Comment thread
shrey150 marked this conversation as resolved.
...(verifier.judgeClientOptions && {
modelClientOptions: verifier.judgeClientOptions,
}),
});

// Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
let rubric = verifier.taskSpec.precomputedRubric;
Expand Down
31 changes: 12 additions & 19 deletions packages/evals/framework/claudeCodeToolAdapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter(
PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`,
} as Record<string, string>;

const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local";
Comment thread
shrey150 marked this conversation as resolved.
await fsp.writeFile(
wrapperPath,
[
"#!/usr/bin/env bash",
"set -euo pipefail",
`exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`,
// The mode flag (--local/--remote) selects the environment when the daemon
// is first started and must be explicit so a set BROWSERBASE_API_KEY does
// not silently auto-select remote. It is only accepted by the driver
// commands, so skip it for the few subcommands that reject it (stop,
// status). The session name is safe on every command.
"cmd=${1:-}",
"mode=()",
'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then',
` mode=(${JSON.stringify(modeFlag)})`,
"fi",
`exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`,
"",
].join("\n"),
{ mode: 0o755 },
);

await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd);

return {
toolSurface: "browse_cli",
startupProfile: input.startupProfile,
Expand Down Expand Up @@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string {
].join("\n");
}

/**
 * Run the browse CLI's `env` subcommand through the generated wrapper script,
 * passing `remote` for the BROWSERBASE environment and `local` otherwise.
 *
 * @param wrapperPath - Path to the wrapper script to invoke.
 * @param environment - Target environment for the run.
 * @param logger - Logger handed through to runBrowseCommand.
 * @param env - Environment variables handed through to runBrowseCommand.
 * @param cwd - Working directory handed through to runBrowseCommand.
 */
async function runBrowseSetup(
  wrapperPath: string,
  environment: "LOCAL" | "BROWSERBASE",
  logger: EvalLogger,
  env: Record<string, string>,
  cwd: string,
): Promise<void> {
  const target = environment === "BROWSERBASE" ? "remote" : "local";
  const setupArgs = ["env", target];
  await runBrowseCommand(wrapperPath, setupArgs, logger, env, cwd);
}

function buildBrowseCliPromptInstructions(
plan: ExternalHarnessTaskPlan,
): string {
Expand Down
Loading
Loading