From a0cf4856840a2e3399be70cd73654a2559f8c209 Mon Sep 17 00:00:00 2001
From: Shrey Pandya <shrey@browserbase.com>
Date: Wed, 1 Jul 2026 18:19:25 -0400
Subject: [PATCH 1/4] fix(evals): repair claude_code browse harness +
 rubric-score via V3Evaluator

Two fixes to the claude_code/browse eval harness:

Contract fix: the browse harness drove the CLI with a stale contract
(`browse --json ... env local`). browse CLI v0.9.1 dropped the `env`
subcommand and the global `--json` flag. Switch to per-command
`--local`/`--remote` mode selection plus `--session`, and rely on the
CLI's JSON-by-default output. The mode flag is passed only to the driver
commands that accept it (it is skipped for `stop`/`status`), and it is always
given explicitly so that a BROWSERBASE_API_KEY present in the environment
cannot silently auto-select remote.
(claudeCodeToolAdapter.ts, browse_cli.ts)

Verifier wiring: the claude_code path scored solely off the agent's
self-reported EVAL_RESULT line. The V3Evaluator rubric verifier already
existed in claudeCodeRunner but no caller ever constructed or passed a
ClaudeCodeVerifierConfig (unfinished migration from #2137). benchHarness
now builds that config -- a browser-free V3 (disableAPI) as the LLM-client
carrier for V3Evaluator, judge model defaulting to google/gemini-2.5-flash,
rubric taken from the row's precomputed_rubric or generated + cached -- and
threads it into runClaudeCodeAgent. Default ON; disable with
EVAL_CLAUDE_CODE_VERIFIER=0/false/off/no to fall back to self-report.
externalHarnessPlan threads precomputed_rubric/expectedAnswer into the
TaskSpec, and claudeCodeRunner gains judge-model + judge-key plumbing so a
non-default (e.g. Anthropic) judge receives its own provider credential
instead of the Gemini key.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/evals/core/tools/browse_cli.ts       |  34 +++--
 packages/evals/framework/benchHarness.ts      | 119 +++++++++++++++++-
 packages/evals/framework/claudeCodeRunner.ts  |  29 ++++-
 .../evals/framework/claudeCodeToolAdapter.ts  |  31 ++---
 .../evals/framework/externalHarnessPlan.ts    |  27 ++++
 5 files changed, 207 insertions(+), 33 deletions(-)
diff --git a/packages/evals/core/tools/browse_cli.ts b/packages/evals/core/tools/browse_cli.ts
index 0e8ec5c9a4..ef96790d40 100644
--- a/packages/evals/core/tools/browse_cli.ts
+++ b/packages/evals/core/tools/browse_cli.ts
@@ -106,18 +106,30 @@ type BrowseCliPagesResult = {
   }>;
 };
 
+// The mode flag selects the environment when the daemon is first started and
+// must be explicit so a set BROWSERBASE_API_KEY does not silently auto-select
+// remote. It is only accepted by the driver commands, so it is skipped for the
+// subcommands that reject it. The session name is safe on every command.
+const BROWSE_MODELESS_COMMANDS = new Set(["stop", "status"]);
+
 class BrowseCliRuntime {
-  constructor(private readonly session: string) {}
+  constructor(
+    private readonly session: string,
+    private readonly modeFlag: "--local" | "--remote",
+  ) {}
 
   async runJson<T>(args: string[]): Promise<T> {
+    const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0])
+      ? []
+      : [this.modeFlag];
     const { stdout, stderr } = await execFileAsync(
       process.execPath,
       [
         resolveBrowseCliEntrypoint(),
-        "--json",
+        ...args,
+        ...modeArgs,
         "--session",
         this.session,
-        ...args,
       ],
       {
         cwd: getRepoRootDir(),
@@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession {
   private activePageId: string | null = null;
   private closed = false;
 
-  constructor(private readonly sessionName: string) {
-    this.runtime = new BrowseCliRuntime(sessionName);
+  constructor(
+    private readonly sessionName: string,
+    modeFlag: "--local" | "--remote",
+  ) {
+    this.runtime = new BrowseCliRuntime(sessionName, modeFlag);
   }
 
   private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle {
@@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool {
       );
     }
 
-    const session = new BrowseCliSession(createSessionName());
-    await session.runtime.runJson([
-      "env",
-      input.environment === "BROWSERBASE" ? "remote" : "local",
-    ]);
+    const session = new BrowseCliSession(
+      createSessionName(),
+      input.environment === "BROWSERBASE" ? "--remote" : "--local",
+    );
 
     return {
       session,
diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index c2277ea360..3cfd1fcfa8 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -2,11 +2,12 @@ import {
   AgentProvider,
   getAISDKLanguageModel,
   loadApiKeyFromEnv,
+  V3,
   type AgentInstance,
   type AvailableModel,
   type LLMClient,
   type LogLine,
-  type V3,
+  type TaskSpec,
 } from "@browserbasehq/stagehand";
 import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
 import { endBrowserbaseSession } from "../browserbaseCleanup.js";
@@ -14,7 +15,11 @@ import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { V3InitResult } from "../initV3.js";
 import type { EvalInput } from "../types/evals.js";
-import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
+import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import {
+  runClaudeCodeAgent,
+  type ClaudeCodeVerifierConfig,
+} from "./claudeCodeRunner.js";
 import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
 import { runCodexAgent } from "./codexRunner.js";
 import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
@@ -181,6 +186,114 @@ export const stagehandHarness: BenchHarness = {
   },
 };
 
+/**
+ * Default judge model for the claude_code rubric verifier — used for both rubric
+ * generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned
+ * default and reliably emits the verifier's structured-output schema; smaller
+ * models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused
+ * judgment call ("response did not match schema"), which the verifier reports as
+ * evidenceInsufficient → spurious outcome=false. Override with
+ * EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved).
+ * Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default.
+ */
+const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";
+
+/**
+ * Whether the rubric verifier should run for claude_code. Default ON so browse
+ * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off to
+ * fall back to the agent's self-reported EVAL_RESULT line.
+ */
+function isClaudeCodeVerifierEnabled(): boolean {
+  const raw = process.env.EVAL_CLAUDE_CODE_VERIFIER;
+  if (raw === undefined) return true;
+  const normalized = raw.trim().toLowerCase();
+  return !(
+    normalized === "0" ||
+    normalized === "false" ||
+    normalized === "off" ||
+    normalized === "no"
+  );
+}
+
+/**
+ * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
+ * into the claude_code runner. Returns undefined (→ self-report fallback) when
+ * the verifier is disabled or when constructing the V3 carrier throws — never
+ * crashes the run.
+ *
+ * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
+ * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify`
+ * (tui/commands/verify.ts): a browser-free V3 with disableAPI + an Anthropic
+ * model so the verifier's LLMProvider resolves against ANTHROPIC_API_KEY.
+ */
+function buildClaudeCodeVerifierConfig(
+  plan: ExternalHarnessTaskPlan,
+  logger: EvalLogger,
+): ClaudeCodeVerifierConfig | undefined {
+  if (!isClaudeCodeVerifierEnabled()) return undefined;
+
+  try {
+    const judgeModel = (process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL ||
+      CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+
+    // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
+    // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
+    // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
+    const judgeProvider = judgeModel.includes("/")
+      ? judgeModel.slice(0, judgeModel.indexOf("/"))
+      : undefined;
+    const judgeApiKey = judgeProvider
+      ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line))
+      : undefined;
+    const judgeClientOptions = judgeApiKey
+      ? { apiKey: judgeApiKey }
+      : undefined;
+
+    // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
+    const v3 = new V3({
+      env: "LOCAL",
+      verbose: 0,
+      disableAPI: true,
+      model: judgeClientOptions
+        ? { modelName: judgeModel, ...judgeClientOptions }
+        : judgeModel,
+      logger: (line: LogLine) => logger.log(line),
+    });
+
+    const taskSpec: TaskSpec = {
+      id: plan.taskId ?? `${plan.dataset}/${plan.instruction.slice(0, 40)}`,
+      instruction: plan.instruction,
+      initUrl: plan.startUrl,
+      ...(plan.precomputedRubric && {
+        precomputedRubric: plan.precomputedRubric,
+      }),
+      ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }),
+    };
+
+    return {
+      v3,
+      taskSpec,
+      dataset: plan.dataset,
+      judgeModel,
+      judgeClientOptions,
+      successMode: process.env.EVAL_SUCCESS_MODE as
+        | "outcome"
+        | "process"
+        | "both"
+        | undefined,
+    };
+  } catch (error) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier setup skipped (falling back to self-report): ${
+        error instanceof Error ? error.message : String(error)
+      }`,
+      level: 0,
+    });
+    return undefined;
+  }
+}
+
 export const claudeCodeHarness: BenchHarness = {
   harness: "claude_code",
   supportedTaskKinds: ["agent", "suite"],
@@ -204,6 +317,7 @@ export const claudeCodeHarness: BenchHarness = {
       plan,
       logger,
     });
+    const verifier = buildClaudeCodeVerifierConfig(plan, logger);
     try {
       return await runClaudeCodeAgent({
         plan,
@@ -211,6 +325,7 @@ export const claudeCodeHarness: BenchHarness = {
         logger,
         toolAdapter,
         signal,
+        verifier,
       });
     } finally {
       await toolAdapter.cleanup();
diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index da68dc929f..70462aa8fa 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -1,4 +1,9 @@
-import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
+import type {
+  AvailableModel,
+  ClientOptions,
+  TaskSpec,
+  V3,
+} from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
@@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig {
   taskSpec: TaskSpec;
   /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
   dataset: string;
+  /**
+   * Judge model for V3Evaluator (scoring + rubric generation). When omitted the
+   * evaluator falls back to its own default (google/gemini-2.5-flash). Pass an
+   * Anthropic model here to score against ANTHROPIC_API_KEY.
+   */
+  judgeModel?: AvailableModel;
+  /**
+   * Client options (API key) for the judge model. Required alongside judgeModel
+   * when the judge's provider differs from the evaluator's own default —
+   * otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key,
+   * which is sent as the wrong provider's credential (e.g. an Anthropic judge
+   * receives the Gemini key and fails with "invalid x-api-key").
+   */
+  judgeClientOptions?: ClientOptions;
   /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
   successMode?: "outcome" | "process" | "both";
   /** Override trajectory persistence root. */
@@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({
 
     const { V3Evaluator } = await import("@browserbasehq/stagehand");
     const { RubricCache } = await import("./rubricCache.js");
-    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
+    const evaluator = new V3Evaluator(verifier.v3, {
+      backend: "verifier",
+      ...(verifier.judgeModel && { modelName: verifier.judgeModel }),
+      ...(verifier.judgeClientOptions && {
+        modelClientOptions: verifier.judgeClientOptions,
+      }),
+    });
 
     // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
     let rubric = verifier.taskSpec.precomputedRubric;
diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts
index 8fdc14182c..63050253f6 100644
--- a/packages/evals/framework/claudeCodeToolAdapter.ts
+++ b/packages/evals/framework/claudeCodeToolAdapter.ts
@@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter(
     PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`,
   } as Record<string, string>;
 
+  const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local";
   await fsp.writeFile(
     wrapperPath,
     [
       "#!/usr/bin/env bash",
       "set -euo pipefail",
-      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`,
+      // The mode flag (--local/--remote) selects the environment when the daemon
+      // is first started and must be explicit so a set BROWSERBASE_API_KEY does
+      // not silently auto-select remote. It is only accepted by the driver
+      // commands, so skip it for the few subcommands that reject it (stop,
+      // status). The session name is safe on every command.
+      "cmd=${1:-}",
+      "mode=()",
+      'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then',
+      `  mode=(${JSON.stringify(modeFlag)})`,
+      "fi",
+      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`,
       "",
     ].join("\n"),
     { mode: 0o755 },
   );
 
-  await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd);
-
   return {
     toolSurface: "browse_cli",
     startupProfile: input.startupProfile,
@@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string {
   ].join("\n");
 }
 
-async function runBrowseSetup(
-  wrapperPath: string,
-  environment: "LOCAL" | "BROWSERBASE",
-  logger: EvalLogger,
-  env: Record<string, string>,
-  cwd: string,
-): Promise<void> {
-  await runBrowseCommand(
-    wrapperPath,
-    ["env", environment === "BROWSERBASE" ? "remote" : "local"],
-    logger,
-    env,
-    cwd,
-  );
-}
-
 function buildBrowseCliPromptInstructions(
   plan: ExternalHarnessTaskPlan,
 ): string {
diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts
index fa23bf99e6..6dae10775d 100644
--- a/packages/evals/framework/externalHarnessPlan.ts
+++ b/packages/evals/framework/externalHarnessPlan.ts
@@ -1,3 +1,4 @@
+import { normalizeRubric, type Rubric } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalInput } from "../types/evals.js";
 
@@ -6,6 +7,14 @@ export interface ExternalHarnessTaskPlan {
   taskId?: string;
   startUrl: string;
   instruction: string;
+  /**
+   * Precomputed rubric carried by the dataset row (`precomputed_rubric`), if
+   * present. Threaded into the verifier's TaskSpec so it doesn't regenerate.
+   * Undefined when the row ships no rubric — the verifier generates one.
+   */
+  precomputedRubric?: Rubric;
+  /** Reference answer carried by the dataset row (`expectedAnswer`), if present. */
+  expectedAnswer?: string;
 }
 
 function readString(
@@ -16,6 +25,21 @@ function readString(
   return typeof value === "string" && value.length > 0 ? value : undefined;
 }
 
+/**
+ * Rubric + reference answer a dataset row may ship. Rows without a
+ * `precomputed_rubric` leave `precomputedRubric` undefined so the verifier
+ * generates (and caches) one from the instruction.
+ */
+function readVerifierFields(params: Record<string, unknown>): {
+  precomputedRubric?: Rubric;
+  expectedAnswer?: string;
+} {
+  return {
+    precomputedRubric: normalizeRubric(params.precomputed_rubric) ?? undefined,
+    expectedAnswer: readString(params, "expectedAnswer"),
+  };
+}
+
 export function buildExternalHarnessTaskPlan(
   input: EvalInput,
 ): ExternalHarnessTaskPlan {
@@ -34,6 +58,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "id"),
       startUrl,
       instruction,
+      ...readVerifierFields(params),
     };
   }
 
@@ -50,6 +75,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "task_id"),
       startUrl,
       instruction,
+      ...readVerifierFields(params),
     };
   }
 
@@ -65,6 +91,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "id"),
       startUrl: readString(params, "web") ?? "https://www.google.com",
       instruction,
+      ...readVerifierFields(params),
     };
   }
 

From f0f65090d25de4b588a76dd8052d36ad822342ac Mon Sep 17 00:00:00 2001
From: Shrey Pandya <shrey@browserbase.com>
Date: Wed, 1 Jul 2026 18:33:00 -0400
Subject: [PATCH 2/4] fix(evals): sanitize verifier fallback task id + fail
 fast on unresolved judge key

Address cubic review on #2299:
- Sanitize the instruction-derived fallback segment of the verifier TaskSpec id
  (replace non [A-Za-z0-9_-] with _) so it can't inject `/` or `..` into the
  persisted trajectory directory path.
- Move the judge-key check ahead of the try/catch and throw a clear config
  error when EVAL_CLAUDE_CODE_VERIFIER_MODEL is set but its provider key can't
  be resolved, instead of silently downgrading the run to legacy self-report.
  The built-in gemini default stays graceful.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/evals/framework/benchHarness.ts | 55 ++++++++++++++++--------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index 3cfd1fcfa8..d70ea94032 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -219,7 +219,9 @@ function isClaudeCodeVerifierEnabled(): boolean {
  * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
  * into the claude_code runner. Returns undefined (→ self-report fallback) when
  * the verifier is disabled or when constructing the V3 carrier throws — never
- * crashes the run.
+ * crashes the run. Exception: an explicit judge override
+ * (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose provider key can't be resolved throws
+ * a config error rather than silently downgrading to self-report.
  *
  * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
  * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify`
@@ -232,23 +234,35 @@ function buildClaudeCodeVerifierConfig(
 ): ClaudeCodeVerifierConfig | undefined {
   if (!isClaudeCodeVerifierEnabled()) return undefined;
 
-  try {
-    const judgeModel = (process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL ||
-      CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+  const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL;
+  const judgeModel = (judgeModelOverride ||
+    CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+
+  // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
+  // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
+  // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
+  const judgeProvider = judgeModel.includes("/")
+    ? judgeModel.slice(0, judgeModel.indexOf("/"))
+    : undefined;
+  const judgeApiKey = judgeProvider
+    ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line))
+    : undefined;
+  const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
 
-    // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
-    // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
-    // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
-    const judgeProvider = judgeModel.includes("/")
-      ? judgeModel.slice(0, judgeModel.indexOf("/"))
-      : undefined;
-    const judgeApiKey = judgeProvider
-      ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line))
-      : undefined;
-    const judgeClientOptions = judgeApiKey
-      ? { apiKey: judgeApiKey }
-      : undefined;
+  // Fail fast on a judge OVERRIDE whose key we can't resolve — do this before
+  // the try/catch so it propagates instead of being swallowed into the
+  // self-report fallback. Otherwise V3Evaluator backfills modelClientOptions
+  // with the Gemini key, hands the wrong provider its credential, verify()
+  // throws, and the run silently downgrades to legacy self-report. Surface the
+  // misconfiguration instead. The built-in default (gemini) is exempt: it
+  // degrades gracefully to V3Evaluator's own key resolution.
+  if (judgeModelOverride && judgeProvider && !judgeApiKey) {
+    throw new EvalsError(
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+    );
+  }
 
+  try {
     // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
     const v3 = new V3({
       env: "LOCAL",
@@ -261,7 +275,14 @@ function buildClaudeCodeVerifierConfig(
     });
 
     const taskSpec: TaskSpec = {
-      id: plan.taskId ?? `${plan.dataset}/${plan.instruction.slice(0, 40)}`,
+      // Fallback id feeds the trajectory dir path, so sanitize the
+      // instruction-derived segment — raw instruction text can contain `/`,
+      // `..`, or other path-unsafe characters that would fork the output dir.
+      id:
+        plan.taskId ??
+        `${plan.dataset}/${plan.instruction
+          .slice(0, 40)
+          .replace(/[^A-Za-z0-9_-]/g, "_")}`,
       instruction: plan.instruction,
       initUrl: plan.startUrl,
       ...(plan.precomputedRubric && {

From 923eaaa075d4c304fb5900f8ebeee0b356c881dd Mon Sep 17 00:00:00 2001
From: Shrey Pandya <shrey@browserbase.com>
Date: Wed, 1 Jul 2026 19:40:55 -0400
Subject: [PATCH 3/4] fix(evals): exempt keyless judge providers + no adapter
 leak on verifier throw
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address two Cubic findings on the claude_code verifier judge-key guard:

1. Exempt API-keyless providers. loadApiKeyFromEnv returns undefined for
   keyless providers (ollama, bedrock — absent from providerEnvVarMap) by
   design, but the fail-fast guard treated that as a config error and
   rejected them. Only throw when the judge provider genuinely requires a
   key (present in providerEnvVarMap) and it is missing; keyless judges now
   proceed with no explicit apiKey. Key-requiring providers with a missing
   key still fail fast, keeping the silent-Gemini-key bug fixed.

2. No tool-adapter leak on verifier throw. buildClaudeCodeVerifierConfig
   was called before the try/finally that owns the prepared tool adapter, so
   a fail-fast throw skipped toolAdapter.cleanup(). Moved the call inside the
   try so cleanup runs on a verifier-config throw. Fail-fast behavior is
   preserved.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/evals/framework/benchHarness.ts | 30 +++++++++++++++++-------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index d70ea94032..d0d50d1851 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -2,6 +2,7 @@ import {
   AgentProvider,
   getAISDKLanguageModel,
   loadApiKeyFromEnv,
+  providerEnvVarMap,
   V3,
   type AgentInstance,
   type AvailableModel,
@@ -249,14 +250,22 @@ function buildClaudeCodeVerifierConfig(
     : undefined;
   const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
 
-  // Fail fast on a judge OVERRIDE whose key we can't resolve — do this before
-  // the try/catch so it propagates instead of being swallowed into the
-  // self-report fallback. Otherwise V3Evaluator backfills modelClientOptions
-  // with the Gemini key, hands the wrong provider its credential, verify()
-  // throws, and the run silently downgrades to legacy self-report. Surface the
-  // misconfiguration instead. The built-in default (gemini) is exempt: it
-  // degrades gracefully to V3Evaluator's own key resolution.
-  if (judgeModelOverride && judgeProvider && !judgeApiKey) {
+  // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
+  // instead of being swallowed into the self-report fallback. Otherwise
+  // V3Evaluator backfills modelClientOptions with the Gemini key, hands the
+  // wrong provider its credential, verify() throws, and the run silently
+  // downgrades to legacy self-report. Surface the misconfiguration instead.
+  //
+  // Only providers that genuinely require a key qualify: `loadApiKeyFromEnv`
+  // returns undefined for key-requiring providers (missing key) AND for
+  // API-keyless providers (ollama, bedrock — no entry in providerEnvVarMap) by
+  // design. Mirror that set via providerEnvVarMap so keyless judges proceed
+  // with no explicit apiKey instead of being rejected as misconfigured. The
+  // built-in default (gemini) is also exempt: it degrades gracefully to
+  // V3Evaluator's own key resolution.
+  const judgeProviderRequiresKey =
+    judgeProvider !== undefined && judgeProvider in providerEnvVarMap;
+  if (judgeModelOverride && judgeProviderRequiresKey && !judgeApiKey) {
     throw new EvalsError(
       `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
     );
@@ -338,8 +347,11 @@ export const claudeCodeHarness: BenchHarness = {
       plan,
       logger,
     });
-    const verifier = buildClaudeCodeVerifierConfig(plan, logger);
     try {
+      // Built inside the try so a fail-fast verifier-config error (e.g. an
+      // override judge whose key can't be resolved) still runs the finally that
+      // owns the prepared tool adapter, instead of leaking it.
+      const verifier = buildClaudeCodeVerifierConfig(plan, logger);
       return await runClaudeCodeAgent({
         plan,
         model: input.modelName,

From 821d4970c26f476b4c30ff4b03bee05e6c88ffaa Mon Sep 17 00:00:00 2001
From: Shrey Pandya <shrey@browserbase.com>
Date: Wed, 1 Jul 2026 20:06:38 -0400
Subject: [PATCH 4/4] fix(evals): resolve gateway judge credential + add
 verifier-config tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address cubic review nits on the claude_code rubric verifier config.

FIX 1 (gateway judge credential): the keyless-provider exemption treated any
provider absent from the SDK's providerEnvVarMap as keyless. But `gateway/...`
(Vercel AI Gateway) is not in the map yet needs AI_GATEWAY_API_KEY, so a
`gateway/` judge override would silently proceed without its credential and
downgrade the verifier to self-report. Add resolveJudgeApiKey (maps `gateway` →
AI_GATEWAY_API_KEY, else loadApiKeyFromEnv) and judgeProviderRequiresKey (true
for providerEnvVarMap entries plus `gateway`) so a gateway judge resolves its
key and still fails fast when it is missing; ollama/bedrock and the default
gemini judge stay exempt.

FIX 2 (regression tests): add packages/evals/tests/framework/verifierConfig.test.ts
covering (a) a keyless override (ollama) builds a config without an apiKey,
(b) an anthropic override with the key unset throws the config error while
toolAdapter.cleanup() still runs (fail-fast inside try/finally, via
claudeCodeHarness.execute), and (c) a gateway/ override resolves
AI_GATEWAY_API_KEY (plus a missing-gateway-key fail-fast case). Exported
buildClaudeCodeVerifierConfig for direct unit testing.

Unit tests: 349 pass (was 345, +4 new). Build + typecheck + lint clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/evals/framework/benchHarness.ts      |  71 ++++++--
 .../tests/framework/verifierConfig.test.ts    | 160 ++++++++++++++++++
 2 files changed, 216 insertions(+), 15 deletions(-)
 create mode 100644 packages/evals/tests/framework/verifierConfig.test.ts

diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index d0d50d1851..ed4638eb58 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -199,6 +199,46 @@ export const stagehandHarness: BenchHarness = {
  */
 const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";
 
+/**
+ * The Vercel AI Gateway provider (`gateway/...`) authenticates against
+ * AI_GATEWAY_API_KEY, but `gateway` is NOT in the SDK's providerEnvVarMap, so
+ * loadApiKeyFromEnv treats it like a keyless provider and returns undefined.
+ * A `gateway/` judge override would therefore silently skip its credential and
+ * downgrade the verifier. Resolve it explicitly here so a gateway judge sends
+ * the right key and still fails fast when the key is missing.
+ */
+const GATEWAY_JUDGE_PROVIDER = "gateway";
+const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";
+
+/**
+ * Resolve the API key for a judge provider. Mirrors loadApiKeyFromEnv for
+ * providers in providerEnvVarMap, but also handles `gateway` (which the SDK map
+ * omits) via AI_GATEWAY_API_KEY so a gateway judge isn't mistaken for keyless.
+ */
+function resolveJudgeApiKey(
+  provider: string | undefined,
+  logger: EvalLogger,
+): string | undefined {
+  if (!provider) return undefined;
+  if (provider === GATEWAY_JUDGE_PROVIDER) {
+    const key = process.env[GATEWAY_JUDGE_API_KEY_ENV];
+    return typeof key === "string" && key.length > 0 ? key : undefined;
+  }
+  return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
+}
+
+/**
+ * Whether a judge provider genuinely requires an API key (so a missing key is a
+ * misconfiguration, not a keyless provider). True for anything in the SDK's
+ * providerEnvVarMap plus `gateway` (which the map omits but which needs
+ * AI_GATEWAY_API_KEY). Genuinely-keyless providers (ollama/bedrock) return
+ * false; the built-in default judge is exempted by the caller, not here.
+ */
+function judgeProviderRequiresKey(provider: string | undefined): boolean {
+  if (provider === undefined) return false;
+  return provider === GATEWAY_JUDGE_PROVIDER || provider in providerEnvVarMap;
+}
+
 /**
  * Whether the rubric verifier should run for claude_code. Default ON so browse
  * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off to
@@ -229,7 +269,7 @@ function isClaudeCodeVerifierEnabled(): boolean {
  * (tui/commands/verify.ts): a browser-free V3 with disableAPI + an Anthropic
  * model so the verifier's LLMProvider resolves against ANTHROPIC_API_KEY.
  */
-function buildClaudeCodeVerifierConfig(
+export function buildClaudeCodeVerifierConfig(
   plan: ExternalHarnessTaskPlan,
   logger: EvalLogger,
 ): ClaudeCodeVerifierConfig | undefined {
@@ -245,9 +285,9 @@ function buildClaudeCodeVerifierConfig(
   const judgeProvider = judgeModel.includes("/")
     ? judgeModel.slice(0, judgeModel.indexOf("/"))
     : undefined;
-  const judgeApiKey = judgeProvider
-    ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line))
-    : undefined;
+  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
+  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
+  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
   const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
 
   // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
@@ -256,18 +296,19 @@ function buildClaudeCodeVerifierConfig(
   // wrong provider its credential, verify() throws, and the run silently
   // downgrades to legacy self-report. Surface the misconfiguration instead.
   //
-  // Only providers that genuinely require a key qualify: `loadApiKeyFromEnv`
-  // returns undefined for key-requiring providers (missing key) AND for
-  // API-keyless providers (ollama, bedrock — no entry in providerEnvVarMap) by
-  // design. Mirror that set via providerEnvVarMap so keyless judges proceed
-  // with no explicit apiKey instead of being rejected as misconfigured. The
-  // built-in default (gemini) is also exempt: it degrades gracefully to
-  // V3Evaluator's own key resolution.
-  const judgeProviderRequiresKey =
-    judgeProvider !== undefined && judgeProvider in providerEnvVarMap;
-  if (judgeModelOverride && judgeProviderRequiresKey && !judgeApiKey) {
+  // Only providers that genuinely require a key qualify (see
+  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
+  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
+  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
+  // stay exempt: keyless judges proceed with no explicit apiKey, and the
+  // default degrades gracefully to V3Evaluator's own key resolution.
+  if (
+    judgeModelOverride &&
+    judgeProviderRequiresKey(judgeProvider) &&
+    !judgeApiKey
+  ) {
     throw new EvalsError(
-      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
     );
   }
 
diff --git a/packages/evals/tests/framework/verifierConfig.test.ts b/packages/evals/tests/framework/verifierConfig.test.ts
new file mode 100644
index 0000000000..9f55978601
--- /dev/null
+++ b/packages/evals/tests/framework/verifierConfig.test.ts
@@ -0,0 +1,160 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { ExternalHarnessTaskPlan } from "../../framework/externalHarnessPlan.js";
+
+// Partial mock of @browserbasehq/stagehand: every real export is spread back in
+// (the credential resolution under test relies on loadApiKeyFromEnv and
+// providerEnvVarMap), and only V3 is swapped for a stub that just stores its
+// constructor options. The intent is that building the verifier config never
+// reaches real LLM-provider/browser internals.
+vi.mock("@browserbasehq/stagehand", async () => {
+  const actual = await vi.importActual<
+    typeof import("@browserbasehq/stagehand")
+  >("@browserbasehq/stagehand");
+  return {
+    ...actual,
+    V3: class {
+      opts: unknown;
+      constructor(opts: unknown) {
+        this.opts = opts;
+      }
+    },
+  };
+});
+
+// Stand-ins for the execute() path: the adapter mock resolves to an object
+// exposing cleanupMock, and the runner/plan modules are replaced outright.
+// vi.hoisted creates the spies early enough for the vi.mock factories to use.
+const { cleanupMock, runClaudeCodeAgentMock } = vi.hoisted(() => ({
+  cleanupMock: vi.fn(async () => {}),
+  runClaudeCodeAgentMock: vi.fn(async () => ({}) as never),
+}));
+
+vi.mock("../../framework/claudeCodeToolAdapter.js", () => ({
+  prepareClaudeCodeToolAdapter: vi.fn(async () => ({ cleanup: cleanupMock })),
+}));
+
+vi.mock("../../framework/claudeCodeRunner.js", () => ({
+  runClaudeCodeAgent: runClaudeCodeAgentMock,
+}));
+
+vi.mock("../../framework/externalHarnessPlan.js", () => ({
+  buildExternalHarnessTaskPlan: vi.fn(
+    (): ExternalHarnessTaskPlan => ({
+      dataset: "webvoyager",
+      taskId: "wv-1",
+      startUrl: "https://example.com",
+      instruction: "Find the checkout button",
+    }),
+  ),
+}));
+
+import {
+  buildClaudeCodeVerifierConfig,
+  claudeCodeHarness,
+} from "../../framework/benchHarness.js";
+import { EvalLogger } from "../../logger.js";
+import { EvalsError } from "../../errors.js";
+
+// Plan handed directly to buildClaudeCodeVerifierConfig in the unit tests.
+const plan: ExternalHarnessTaskPlan = {
+  dataset: "webvoyager",
+  taskId: "wv-1",
+  startUrl: "https://example.com",
+  instruction: "Find the checkout button",
+};
+
+// Env vars these tests set or rely on being unset. They are snapshotted and
+// restored around every test so no state leaks between tests or suites.
+const MANAGED_ENV = [
+  "EVAL_CLAUDE_CODE_VERIFIER",
+  "EVAL_CLAUDE_CODE_VERIFIER_MODEL",
+  "AI_GATEWAY_API_KEY",
+  "ANTHROPIC_API_KEY",
+  "OLLAMA_API_KEY",
+] as const;
+
+let savedEnv: Record<string, string | undefined>;
+
+// Snapshot every managed variable, unset it, and reset the shared spies.
+beforeEach(() => {
+  savedEnv = Object.fromEntries(MANAGED_ENV.map((k) => [k, process.env[k]]));
+  for (const name of MANAGED_ENV) delete process.env[name];
+  cleanupMock.mockClear();
+  runClaudeCodeAgentMock.mockClear();
+});
+
+// Restore the snapshot: re-set saved values, unset those originally absent.
+afterEach(() => {
+  for (const [name, previous] of Object.entries(savedEnv)) {
+    if (previous === undefined) delete process.env[name];
+    else process.env[name] = previous;
+  }
+});
+
+describe("buildClaudeCodeVerifierConfig judge credentials", () => {
+  it("builds a config for a keyless provider override (ollama) without an apiKey", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "ollama/llama3";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe("ollama/llama3");
+    // Keyless provider → no explicit apiKey is threaded through.
+    expect(config?.judgeClientOptions).toBeUndefined();
+  });
+
+  it("resolves AI_GATEWAY_API_KEY for a gateway/ judge override", () => {
+    process.env.AI_GATEWAY_API_KEY = "gw-test-key";
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL =
+      "gateway/anthropic/claude-sonnet-4-20250514";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe(
+      "gateway/anthropic/claude-sonnet-4-20250514",
+    );
+    expect(config?.judgeClientOptions).toEqual({ apiKey: "gw-test-key" });
+  });
+
+  it("fail-fasts when a gateway/ judge override is missing AI_GATEWAY_API_KEY", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "gateway/some-model";
+    // Pin the provider: the hint text names AI_GATEWAY_API_KEY for any provider.
+    expect(() =>
+      buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)),
+    ).toThrow(/no API key resolved for provider "gateway"/);
+  });
+});
+
+describe("claudeCodeHarness.execute verifier fail-fast", () => {
+  // One model id serves as both the agent model and the judge override.
+  const ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-20250514";
+  const buildExecuteArgs = () => ({
+    task: {} as never,
+    input: { modelName: ANTHROPIC_MODEL } as never,
+    row: {
+      config: {
+        harness: "claude_code" as const,
+        model: ANTHROPIC_MODEL as never,
+        environment: "LOCAL" as const,
+        useApi: false,
+      },
+    } as never,
+    logger: new EvalLogger(false),
+  });
+
+  it("throws the config error but still runs toolAdapter.cleanup() (fail-fast inside try/finally)", async () => {
+    // The judge is overridden to an Anthropic model while ANTHROPIC_API_KEY is
+    // unset (beforeEach clears it), so building the verifier config must throw.
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = ANTHROPIC_MODEL;
+
+    const run = claudeCodeHarness.execute(buildExecuteArgs());
+    await expect(run).rejects.toThrow(EvalsError);
+
+    // The prepared adapter's cleanup() must have run exactly once even though
+    // config resolution threw.
+    expect(cleanupMock).toHaveBeenCalledTimes(1);
+    // The agent must NOT have run — we failed fast before executing.
+    expect(runClaudeCodeAgentMock).not.toHaveBeenCalled();
+  });
+});