browserbase · shrey150 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 2, 2026
diff --git a/packages/evals/core/tools/browse_cli.ts b/packages/evals/core/tools/browse_cli.ts
@@ -106,18 +106,30 @@ type BrowseCliPagesResult = {
   }>;
 };
 
+// The mode flag selects the environment when the daemon is first started and
+// must be explicit so a set BROWSERBASE_API_KEY does not silently auto-select
+// remote. It is only accepted by the driver commands, so it is skipped for the
+// subcommands that reject it. The session name is safe on every command.
+const BROWSE_MODELESS_COMMANDS = new Set(["stop", "status"]);
+
 class BrowseCliRuntime {
-  constructor(private readonly session: string) {}
+  constructor(
+    private readonly session: string,
+    private readonly modeFlag: "--local" | "--remote",
+  ) {}
 
   async runJson<T>(args: string[]): Promise<T> {
+    const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0])
+      ? []
+      : [this.modeFlag];
     const { stdout, stderr } = await execFileAsync(
       process.execPath,
       [
         resolveBrowseCliEntrypoint(),
-        "--json",
+        ...args,
+        ...modeArgs,
         "--session",
         this.session,
-        ...args,
       ],
       {
         cwd: getRepoRootDir(),
@@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession {
   private activePageId: string | null = null;
   private closed = false;
 
-  constructor(private readonly sessionName: string) {
-    this.runtime = new BrowseCliRuntime(sessionName);
+  constructor(
+    private readonly sessionName: string,
+    modeFlag: "--local" | "--remote",
+  ) {
+    this.runtime = new BrowseCliRuntime(sessionName, modeFlag);
   }
 
   private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle {
@@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool {
       );
     }
 
-    const session = new BrowseCliSession(createSessionName());
-    await session.runtime.runJson([
-      "env",
-      input.environment === "BROWSERBASE" ? "remote" : "local",
-    ]);
+    const session = new BrowseCliSession(
+      createSessionName(),
+      input.environment === "BROWSERBASE" ? "--remote" : "--local",
+    );
 
     return {
       session,

diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
@@ -2,19 +2,25 @@ import {
   AgentProvider,
   getAISDKLanguageModel,
   loadApiKeyFromEnv,
+  providerEnvVarMap,
+  V3,
   type AgentInstance,
   type AvailableModel,
   type LLMClient,
   type LogLine,
-  type V3,
+  type TaskSpec,
 } from "@browserbasehq/stagehand";
 import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
 import { endBrowserbaseSession } from "../browserbaseCleanup.js";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { V3InitResult } from "../initV3.js";
 import type { EvalInput } from "../types/evals.js";
-import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
+import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import {
+  runClaudeCodeAgent,
+  type ClaudeCodeVerifierConfig,
+} from "./claudeCodeRunner.js";
 import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
 import { runCodexAgent } from "./codexRunner.js";
 import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
@@ -181,6 +187,184 @@ export const stagehandHarness: BenchHarness = {
   },
 };
 
+/**
+ * Default judge model for the claude_code rubric verifier — used for both rubric
+ * generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned
+ * default and reliably emits the verifier's structured-output schema; smaller
+ * models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused
+ * judgment call ("response did not match schema"), which the verifier reports as
+ * evidenceInsufficient → spurious outcome=false. Override with
+ * EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved).
+ * Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default.
+ */
+const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";
+
+/**
+ * The Vercel AI Gateway provider (`gateway/...`) authenticates against
+ * AI_GATEWAY_API_KEY, but `gateway` is NOT in the SDK's providerEnvVarMap, so
+ * loadApiKeyFromEnv treats it like a keyless provider and returns undefined.
+ * A `gateway/` judge override would therefore silently skip its credential and
+ * downgrade the verifier. Resolve it explicitly here so a gateway judge sends
+ * the right key and still fail-fasts when the key is missing.
+ */
+const GATEWAY_JUDGE_PROVIDER = "gateway";
+const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";
+
+/**
+ * Resolve the API key for a judge provider. Mirrors loadApiKeyFromEnv for
+ * providers in providerEnvVarMap, but also handles `gateway` (which the SDK map
+ * omits) via AI_GATEWAY_API_KEY so a gateway judge isn't mistaken for keyless.
+ */
+function resolveJudgeApiKey(
+  provider: string | undefined,
+  logger: EvalLogger,
+): string | undefined {
+  if (!provider) return undefined;
+  if (provider === GATEWAY_JUDGE_PROVIDER) {
+    const key = process.env[GATEWAY_JUDGE_API_KEY_ENV];
+    return typeof key === "string" && key.length > 0 ? key : undefined;
+  }
+  return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
+}
+
+/**
+ * Whether a judge provider genuinely requires an API key (so a missing key is a
+ * misconfiguration, not a keyless provider). True for anything in the SDK's
+ * providerEnvVarMap plus `gateway` (which the map omits but which needs
+ * AI_GATEWAY_API_KEY). Genuinely-keyless providers (ollama/bedrock) and the
+ * built-in default stay exempt.
+ */
+function judgeProviderRequiresKey(provider: string | undefined): boolean {
+  if (provider === undefined) return false;
+  return provider === GATEWAY_JUDGE_PROVIDER || provider in providerEnvVarMap;
+}
+
+/**
+ * Whether the rubric verifier should run for claude_code. Default ON so browse
+ * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off to
+ * fall back to the agent's self-reported EVAL_RESULT line.
+ */
+function isClaudeCodeVerifierEnabled(): boolean {
+  const raw = process.env.EVAL_CLAUDE_CODE_VERIFIER;
+  if (raw === undefined) return true;
+  const normalized = raw.trim().toLowerCase();
+  return !(
+    normalized === "0" ||
+    normalized === "false" ||
+    normalized === "off" ||
+    normalized === "no"
+  );
+}
+
+/**
+ * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
+ * into the claude_code runner. Returns undefined (→ self-report fallback) when
+ * the verifier is disabled or when constructing the V3 carrier throws — never
+ * crashes the run. Exception: an explicit judge override
+ * (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose provider key can't be resolved throws
+ * a config error rather than silently downgrading to self-report.
+ *
+ * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
+ * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify`
+ * (tui/commands/verify.ts): a browser-free V3 with disableAPI + an Anthropic
+ * model so the verifier's LLMProvider resolves against ANTHROPIC_API_KEY.
+ */
+export function buildClaudeCodeVerifierConfig(
+  plan: ExternalHarnessTaskPlan,
+  logger: EvalLogger,
+): ClaudeCodeVerifierConfig | undefined {
+  if (!isClaudeCodeVerifierEnabled()) return undefined;
+
+  const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL;
+  const judgeModel = (judgeModelOverride ||
+    CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+
+  // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
+  // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
+  // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
+  const judgeProvider = judgeModel.includes("/")
+    ? judgeModel.slice(0, judgeModel.indexOf("/"))
+    : undefined;
+  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
+  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
+  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
+  const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
+
+  // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
+  // instead of being swallowed into the self-report fallback. Otherwise
+  // V3Evaluator backfills modelClientOptions with the Gemini key, hands the
+  // wrong provider its credential, verify() throws, and the run silently
+  // downgrades to legacy self-report. Surface the misconfiguration instead.
+  //
+  // Only providers that genuinely require a key qualify (see
+  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
+  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
+  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
+  // stay exempt: keyless judges proceed with no explicit apiKey, and the
+  // default degrades gracefully to V3Evaluator's own key resolution.
+  if (
+    judgeModelOverride &&
+    judgeProviderRequiresKey(judgeProvider) &&
+    !judgeApiKey
+  ) {
+    throw new EvalsError(
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+    );
+  }
+
+  try {
+    // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
+    const v3 = new V3({
+      env: "LOCAL",
+      verbose: 0,
+      disableAPI: true,
+      model: judgeClientOptions
+        ? { modelName: judgeModel, ...judgeClientOptions }
+        : judgeModel,
+      logger: (line: LogLine) => logger.log(line),
+    });
+
+    const taskSpec: TaskSpec = {
+      // Fallback id feeds the trajectory dir path, so sanitize the
+      // instruction-derived segment — raw instruction text can contain `/`,
+      // `..`, or other path-unsafe characters that would fork the output dir.
+      id:
+        plan.taskId ??
+        `${plan.dataset}/${plan.instruction
+          .slice(0, 40)
+          .replace(/[^A-Za-z0-9_-]/g, "_")}`,
+      instruction: plan.instruction,
+      initUrl: plan.startUrl,
+      ...(plan.precomputedRubric && {
+        precomputedRubric: plan.precomputedRubric,
+      }),
+      ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }),
+    };
+
+    return {
+      v3,
+      taskSpec,
+      dataset: plan.dataset,
+      judgeModel,
+      judgeClientOptions,
+      successMode: process.env.EVAL_SUCCESS_MODE as
+        | "outcome"
+        | "process"
+        | "both"
+        | undefined,
+    };
+  } catch (error) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier setup skipped (falling back to self-report): ${
+        error instanceof Error ? error.message : String(error)
+      }`,
+      level: 0,
+    });
+    return undefined;
+  }
+}
+
 export const claudeCodeHarness: BenchHarness = {
   harness: "claude_code",
   supportedTaskKinds: ["agent", "suite"],
@@ -205,12 +389,17 @@ export const claudeCodeHarness: BenchHarness = {
       logger,
     });
     try {
+      // Built inside the try so a fail-fast verifier-config error (e.g. an
+      // override judge whose key can't be resolved) still runs the finally that
+      // owns the prepared tool adapter, instead of leaking it.
+      const verifier = buildClaudeCodeVerifierConfig(plan, logger);
       return await runClaudeCodeAgent({
         plan,
         model: input.modelName,
         logger,
         toolAdapter,
         signal,
+        verifier,
       });
     } finally {
       await toolAdapter.cleanup();

diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
@@ -1,4 +1,9 @@
-import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
+import type {
+  AvailableModel,
+  ClientOptions,
+  TaskSpec,
+  V3,
+} from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
@@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig {
   taskSpec: TaskSpec;
   /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
   dataset: string;
+  /**
+   * Judge model for V3Evaluator (scoring + rubric generation). When omitted the
+   * evaluator falls back to its own default (google/gemini-2.5-flash). Pass an
+   * Anthropic model here to score against ANTHROPIC_API_KEY.
+   */
+  judgeModel?: AvailableModel;
+  /**
+   * Client options (API key) for the judge model. Required alongside judgeModel
+   * when the judge's provider differs from the evaluator's own default —
+   * otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key,
+   * which is sent as the wrong provider's credential (e.g. an Anthropic judge
+   * receives the Gemini key and fails with "invalid x-api-key").
+   */
+  judgeClientOptions?: ClientOptions;
   /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
   successMode?: "outcome" | "process" | "both";
   /** Override trajectory persistence root. */
@@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({
 
     const { V3Evaluator } = await import("@browserbasehq/stagehand");
     const { RubricCache } = await import("./rubricCache.js");
-    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
+    const evaluator = new V3Evaluator(verifier.v3, {
+      backend: "verifier",
+      ...(verifier.judgeModel && { modelName: verifier.judgeModel }),
+      ...(verifier.judgeClientOptions && {
+        modelClientOptions: verifier.judgeClientOptions,
+      }),
+    });
 
     // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
     let rubric = verifier.taskSpec.precomputedRubric;

diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts
@@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter(
     PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`,
   } as Record<string, string>;
 
+  const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local";
   await fsp.writeFile(
     wrapperPath,
     [
       "#!/usr/bin/env bash",
       "set -euo pipefail",
-      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`,
+      // The mode flag (--local/--remote) selects the environment when the daemon
+      // is first started and must be explicit so a set BROWSERBASE_API_KEY does
+      // not silently auto-select remote. It is only accepted by the driver
+      // commands, so skip it for the few subcommands that reject it (stop,
+      // status). The session name is safe on every command.
+      "cmd=${1:-}",
+      "mode=()",
+      'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then',
+      `  mode=(${JSON.stringify(modeFlag)})`,
+      "fi",
+      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`,
       "",
     ].join("\n"),
     { mode: 0o755 },
   );
 
-  await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd);
-
   return {
     toolSurface: "browse_cli",
     startupProfile: input.startupProfile,
@@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string {
   ].join("\n");
 }
 
-async function runBrowseSetup(
-  wrapperPath: string,
-  environment: "LOCAL" | "BROWSERBASE",
-  logger: EvalLogger,
-  env: Record<string, string>,
-  cwd: string,
-): Promise<void> {
-  await runBrowseCommand(
-    wrapperPath,
-    ["env", environment === "BROWSERBASE" ? "remote" : "local"],
-    logger,
-    env,
-    cwd,
-  );
-}
-
 function buildBrowseCliPromptInstructions(
   plan: ExternalHarnessTaskPlan,
 ): string {