From a0cf4856840a2e3399be70cd73654a2559f8c209 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 1 Jul 2026 18:19:25 -0400 Subject: [PATCH 1/4] fix(evals): repair claude_code browse harness + rubric-score via V3Evaluator Two fixes to the claude_code/browse eval harness: Contract fix: the browse harness drove the CLI with a stale contract (`browse --json ... env local`). browse CLI v0.9.1 dropped the `env` subcommand and the global `--json` flag. Switch to per-command `--local`/`--remote` mode selection plus `--session`, and rely on the CLI's JSON-by-default output. The mode flag is only passed to the driver commands that accept it (skipped for `stop`/`status`) and is explicit so a set BROWSERBASE_API_KEY cannot silently auto-select remote. (claudeCodeToolAdapter.ts, browse_cli.ts) Verifier wiring: the claude_code path scored solely off the agent's self-reported EVAL_RESULT line. The V3Evaluator rubric verifier already existed in claudeCodeRunner but no caller ever constructed or passed a ClaudeCodeVerifierConfig (unfinished migration from #2137). benchHarness now builds that config -- a browser-free V3 (disableAPI) as the LLM-client carrier for V3Evaluator, judge model defaulting to google/gemini-2.5-flash, rubric taken from the row's precomputed_rubric or generated + cached -- and threads it into runClaudeCodeAgent. Default ON; disable with EVAL_CLAUDE_CODE_VERIFIER=0/false/off/no to fall back to self-report. externalHarnessPlan threads precomputed_rubric/expectedAnswer into the task plan that feeds the verifier's TaskSpec, and claudeCodeRunner gains judge-model + judge-key plumbing so a non-default (e.g. Anthropic) judge receives its own provider credential instead of the Gemini key. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/evals/core/tools/browse_cli.ts | 34 +++-- packages/evals/framework/benchHarness.ts | 119 +++++++++++++++++- packages/evals/framework/claudeCodeRunner.ts | 29 ++++- .../evals/framework/claudeCodeToolAdapter.ts | 31 ++--- .../evals/framework/externalHarnessPlan.ts | 27 ++++ 5 files changed, 207 insertions(+), 33 deletions(-) diff --git a/packages/evals/core/tools/browse_cli.ts b/packages/evals/core/tools/browse_cli.ts index 0e8ec5c9a4..ef96790d40 100644 --- a/packages/evals/core/tools/browse_cli.ts +++ b/packages/evals/core/tools/browse_cli.ts @@ -106,18 +106,30 @@ type BrowseCliPagesResult = { }>; }; +// The mode flag selects the environment when the daemon is first started and +// must be explicit so a set BROWSERBASE_API_KEY does not silently auto-select +// remote. It is only accepted by the driver commands, so it is skipped for the +// subcommands that reject it. The session name is safe on every command. +const BROWSE_MODELESS_COMMANDS = new Set(["stop", "status"]); + class BrowseCliRuntime { - constructor(private readonly session: string) {} + constructor( + private readonly session: string, + private readonly modeFlag: "--local" | "--remote", + ) {} async runJson(args: string[]): Promise { + const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0]) + ? 
[] + : [this.modeFlag]; const { stdout, stderr } = await execFileAsync( process.execPath, [ resolveBrowseCliEntrypoint(), - "--json", + ...args, + ...modeArgs, "--session", this.session, - ...args, ], { cwd: getRepoRootDir(), @@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession { private activePageId: string | null = null; private closed = false; - constructor(private readonly sessionName: string) { - this.runtime = new BrowseCliRuntime(sessionName); + constructor( + private readonly sessionName: string, + modeFlag: "--local" | "--remote", + ) { + this.runtime = new BrowseCliRuntime(sessionName, modeFlag); } private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle { @@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool { ); } - const session = new BrowseCliSession(createSessionName()); - await session.runtime.runJson([ - "env", - input.environment === "BROWSERBASE" ? "remote" : "local", - ]); + const session = new BrowseCliSession( + createSessionName(), + input.environment === "BROWSERBASE" ? 
"--remote" : "--local", + ); return { session, diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index c2277ea360..3cfd1fcfa8 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -2,11 +2,12 @@ import { AgentProvider, getAISDKLanguageModel, loadApiKeyFromEnv, + V3, type AgentInstance, type AvailableModel, type LLMClient, type LogLine, - type V3, + type TaskSpec, } from "@browserbasehq/stagehand"; import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; import { endBrowserbaseSession } from "../browserbaseCleanup.js"; @@ -14,7 +15,11 @@ import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { V3InitResult } from "../initV3.js"; import type { EvalInput } from "../types/evals.js"; -import { runClaudeCodeAgent } from "./claudeCodeRunner.js"; +import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import { + runClaudeCodeAgent, + type ClaudeCodeVerifierConfig, +} from "./claudeCodeRunner.js"; import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; import { runCodexAgent } from "./codexRunner.js"; import { prepareCodexToolAdapter } from "./codexToolAdapter.js"; @@ -181,6 +186,114 @@ export const stagehandHarness: BenchHarness = { }, }; +/** + * Default judge model for the claude_code rubric verifier — used for both rubric + * generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned + * default and reliably emits the verifier's structured-output schema; smaller + * models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused + * judgment call ("response did not match schema"), which the verifier reports as + * evidenceInsufficient → spurious outcome=false. Override with + * EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved). + * Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default. 
+ */ +const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash"; + +/** + * Whether the rubric verifier should run for claude_code. Default ON so browse + * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off to + * fall back to the agent's self-reported EVAL_RESULT line. + */ +function isClaudeCodeVerifierEnabled(): boolean { + const raw = process.env.EVAL_CLAUDE_CODE_VERIFIER; + if (raw === undefined) return true; + const normalized = raw.trim().toLowerCase(); + return !( + normalized === "0" || + normalized === "false" || + normalized === "off" || + normalized === "no" + ); +} + +/** + * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier + * into the claude_code runner. Returns undefined (→ self-report fallback) when + * the verifier is disabled or when constructing the V3 carrier throws — never + * crashes the run. + * + * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per + * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify` + * (tui/commands/verify.ts): a browser-free V3 with disableAPI + an Anthropic + * model so the verifier's LLMProvider resolves against ANTHROPIC_API_KEY. + */ +function buildClaudeCodeVerifierConfig( + plan: ExternalHarnessTaskPlan, + logger: EvalLogger, +): ClaudeCodeVerifierConfig | undefined { + if (!isClaudeCodeVerifierEnabled()) return undefined; + + try { + const judgeModel = (process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL || + CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel; + + // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential. + // Without this it defaults modelClientOptions.apiKey to the Gemini key, which + // an Anthropic judge would receive as x-api-key → "invalid x-api-key". + const judgeProvider = judgeModel.includes("/") + ? judgeModel.slice(0, judgeModel.indexOf("/")) + : undefined; + const judgeApiKey = judgeProvider + ? 
loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line)) + : undefined; + const judgeClientOptions = judgeApiKey + ? { apiKey: judgeApiKey } + : undefined; + + // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator. + const v3 = new V3({ + env: "LOCAL", + verbose: 0, + disableAPI: true, + model: judgeClientOptions + ? { modelName: judgeModel, ...judgeClientOptions } + : judgeModel, + logger: (line: LogLine) => logger.log(line), + }); + + const taskSpec: TaskSpec = { + id: plan.taskId ?? `${plan.dataset}/${plan.instruction.slice(0, 40)}`, + instruction: plan.instruction, + initUrl: plan.startUrl, + ...(plan.precomputedRubric && { + precomputedRubric: plan.precomputedRubric, + }), + ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }), + }; + + return { + v3, + taskSpec, + dataset: plan.dataset, + judgeModel, + judgeClientOptions, + successMode: process.env.EVAL_SUCCESS_MODE as + | "outcome" + | "process" + | "both" + | undefined, + }; + } catch (error) { + logger.warn({ + category: "claude_code", + message: `verifier setup skipped (falling back to self-report): ${ + error instanceof Error ? 
error.message : String(error) + }`, + level: 0, + }); + return undefined; + } +} + export const claudeCodeHarness: BenchHarness = { harness: "claude_code", supportedTaskKinds: ["agent", "suite"], @@ -204,6 +317,7 @@ export const claudeCodeHarness: BenchHarness = { plan, logger, }); + const verifier = buildClaudeCodeVerifierConfig(plan, logger); try { return await runClaudeCodeAgent({ plan, @@ -211,6 +325,7 @@ export const claudeCodeHarness: BenchHarness = { logger, toolAdapter, signal, + verifier, }); } finally { await toolAdapter.cleanup(); diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index da68dc929f..70462aa8fa 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -1,4 +1,9 @@ -import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; +import type { + AvailableModel, + ClientOptions, + TaskSpec, + V3, +} from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; @@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig { taskSpec: TaskSpec; /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ dataset: string; + /** + * Judge model for V3Evaluator (scoring + rubric generation). When omitted the + * evaluator falls back to its own default (google/gemini-2.5-flash). Pass an + * Anthropic model here to score against ANTHROPIC_API_KEY. + */ + judgeModel?: AvailableModel; + /** + * Client options (API key) for the judge model. Required alongside judgeModel + * when the judge's provider differs from the evaluator's own default — + * otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key, + * which is sent as the wrong provider's credential (e.g. an Anthropic judge + * receives the Gemini key and fails with "invalid x-api-key"). 
+ */ + judgeClientOptions?: ClientOptions; /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ successMode?: "outcome" | "process" | "both"; /** Override trajectory persistence root. */ @@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({ const { V3Evaluator } = await import("@browserbasehq/stagehand"); const { RubricCache } = await import("./rubricCache.js"); - const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); + const evaluator = new V3Evaluator(verifier.v3, { + backend: "verifier", + ...(verifier.judgeModel && { modelName: verifier.judgeModel }), + ...(verifier.judgeClientOptions && { + modelClientOptions: verifier.judgeClientOptions, + }), + }); // Hydrate rubric — use precomputed if present, otherwise cache-or-generate. let rubric = verifier.taskSpec.precomputedRubric; diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts index 8fdc14182c..63050253f6 100644 --- a/packages/evals/framework/claudeCodeToolAdapter.ts +++ b/packages/evals/framework/claudeCodeToolAdapter.ts @@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter( PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`, } as Record; + const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local"; await fsp.writeFile( wrapperPath, [ "#!/usr/bin/env bash", "set -euo pipefail", - `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`, + // The mode flag (--local/--remote) selects the environment when the daemon + // is first started and must be explicit so a set BROWSERBASE_API_KEY does + // not silently auto-select remote. It is only accepted by the driver + // commands, so skip it for the few subcommands that reject it (stop, + // status). The session name is safe on every command. 
+ "cmd=${1:-}", + "mode=()", + 'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then', + ` mode=(${JSON.stringify(modeFlag)})`, + "fi", + `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`, "", ].join("\n"), { mode: 0o755 }, ); - await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd); - return { toolSurface: "browse_cli", startupProfile: input.startupProfile, @@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string { ].join("\n"); } -async function runBrowseSetup( - wrapperPath: string, - environment: "LOCAL" | "BROWSERBASE", - logger: EvalLogger, - env: Record, - cwd: string, -): Promise { - await runBrowseCommand( - wrapperPath, - ["env", environment === "BROWSERBASE" ? "remote" : "local"], - logger, - env, - cwd, - ); -} - function buildBrowseCliPromptInstructions( plan: ExternalHarnessTaskPlan, ): string { diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts index fa23bf99e6..6dae10775d 100644 --- a/packages/evals/framework/externalHarnessPlan.ts +++ b/packages/evals/framework/externalHarnessPlan.ts @@ -1,3 +1,4 @@ +import { normalizeRubric, type Rubric } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalInput } from "../types/evals.js"; @@ -6,6 +7,14 @@ export interface ExternalHarnessTaskPlan { taskId?: string; startUrl: string; instruction: string; + /** + * Precomputed rubric carried by the dataset row (`precomputed_rubric`), if + * present. Threaded into the verifier's TaskSpec so it doesn't regenerate. + * Undefined when the row ships no rubric — the verifier generates one. + */ + precomputedRubric?: Rubric; + /** Reference answer carried by the dataset row (`expectedAnswer`), if present. 
*/ + expectedAnswer?: string; } function readString( @@ -16,6 +25,21 @@ function readString( return typeof value === "string" && value.length > 0 ? value : undefined; } +/** + * Rubric + reference answer a dataset row may ship. Rows without a + * `precomputed_rubric` leave `precomputedRubric` undefined so the verifier + * generates (and caches) one from the instruction. + */ +function readVerifierFields(params: Record): { + precomputedRubric?: Rubric; + expectedAnswer?: string; +} { + return { + precomputedRubric: normalizeRubric(params.precomputed_rubric) ?? undefined, + expectedAnswer: readString(params, "expectedAnswer"), + }; +} + export function buildExternalHarnessTaskPlan( input: EvalInput, ): ExternalHarnessTaskPlan { @@ -34,6 +58,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "id"), startUrl, instruction, + ...readVerifierFields(params), }; } @@ -50,6 +75,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "task_id"), startUrl, instruction, + ...readVerifierFields(params), }; } @@ -65,6 +91,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "id"), startUrl: readString(params, "web") ?? "https://www.google.com", instruction, + ...readVerifierFields(params), }; } From f0f65090d25de4b588a76dd8052d36ad822342ac Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 1 Jul 2026 18:33:00 -0400 Subject: [PATCH 2/4] fix(evals): sanitize verifier fallback task id + fail fast on unresolved judge key Address cubic review on #2299: - Sanitize the instruction-derived fallback segment of the verifier TaskSpec id (replace non [A-Za-z0-9_-] with _) so it can't inject `/` or `..` into the persisted trajectory directory path. - Move the judge-key check ahead of the try/catch and throw a clear config error when EVAL_CLAUDE_CODE_VERIFIER_MODEL is set but its provider key can't be resolved, instead of silently downgrading the run to legacy self-report. The built-in gemini default stays graceful. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/evals/framework/benchHarness.ts | 55 ++++++++++++++++-------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index 3cfd1fcfa8..d70ea94032 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -219,7 +219,9 @@ function isClaudeCodeVerifierEnabled(): boolean { * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier * into the claude_code runner. Returns undefined (→ self-report fallback) when * the verifier is disabled or when constructing the V3 carrier throws — never - * crashes the run. + * crashes the run. Exception: an explicit judge override + * (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose provider key can't be resolved throws + * a config error rather than silently downgrading to self-report. * * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify` @@ -232,23 +234,35 @@ function buildClaudeCodeVerifierConfig( ): ClaudeCodeVerifierConfig | undefined { if (!isClaudeCodeVerifierEnabled()) return undefined; - try { - const judgeModel = (process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL || - CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel; + const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL; + const judgeModel = (judgeModelOverride || + CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel; + + // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential. + // Without this it defaults modelClientOptions.apiKey to the Gemini key, which + // an Anthropic judge would receive as x-api-key → "invalid x-api-key". + const judgeProvider = judgeModel.includes("/") + ? judgeModel.slice(0, judgeModel.indexOf("/")) + : undefined; + const judgeApiKey = judgeProvider + ? 
loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line)) + : undefined; + const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined; - // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential. - // Without this it defaults modelClientOptions.apiKey to the Gemini key, which - // an Anthropic judge would receive as x-api-key → "invalid x-api-key". - const judgeProvider = judgeModel.includes("/") - ? judgeModel.slice(0, judgeModel.indexOf("/")) - : undefined; - const judgeApiKey = judgeProvider - ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line)) - : undefined; - const judgeClientOptions = judgeApiKey - ? { apiKey: judgeApiKey } - : undefined; + // Fail fast on a judge OVERRIDE whose key we can't resolve — do this before + // the try/catch so it propagates instead of being swallowed into the + // self-report fallback. Otherwise V3Evaluator backfills modelClientOptions + // with the Gemini key, hands the wrong provider its credential, verify() + // throws, and the run silently downgrades to legacy self-report. Surface the + // misconfiguration instead. The built-in default (gemini) is exempt: it + // degrades gracefully to V3Evaluator's own key resolution. + if (judgeModelOverride && judgeProvider && !judgeApiKey) { + throw new EvalsError( + `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`, + ); + } + try { // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator. const v3 = new V3({ env: "LOCAL", @@ -261,7 +275,14 @@ function buildClaudeCodeVerifierConfig( }); const taskSpec: TaskSpec = { - id: plan.taskId ?? 
`${plan.dataset}/${plan.instruction.slice(0, 40)}`, + // Fallback id feeds the trajectory dir path, so sanitize the + // instruction-derived segment — raw instruction text can contain `/`, + // `..`, or other path-unsafe characters that would fork the output dir. + id: + plan.taskId ?? + `${plan.dataset}/${plan.instruction + .slice(0, 40) + .replace(/[^A-Za-z0-9_-]/g, "_")}`, instruction: plan.instruction, initUrl: plan.startUrl, ...(plan.precomputedRubric && { From 923eaaa075d4c304fb5900f8ebeee0b356c881dd Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 1 Jul 2026 19:40:55 -0400 Subject: [PATCH 3/4] fix(evals): exempt keyless judge providers + no adapter leak on verifier throw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address two Cubic findings on the claude_code verifier judge-key guard: 1. Exempt API-keyless providers. loadApiKeyFromEnv returns undefined for keyless providers (ollama, bedrock — absent from providerEnvVarMap) by design, but the fail-fast guard treated that as a config error and rejected them. Only throw when the judge provider genuinely requires a key (present in providerEnvVarMap) and it is missing; keyless judges now proceed with no explicit apiKey. Key-requiring providers with a missing key still fail fast, keeping the silent-Gemini-key bug fixed. 2. No tool-adapter leak on verifier throw. buildClaudeCodeVerifierConfig was called before the try/finally that owns the prepared tool adapter, so a fail-fast throw skipped toolAdapter.cleanup(). Moved the call inside the try so cleanup runs on a verifier-config throw. Fail-fast behavior is preserved. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/evals/framework/benchHarness.ts | 30 +++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index d70ea94032..d0d50d1851 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -2,6 +2,7 @@ import { AgentProvider, getAISDKLanguageModel, loadApiKeyFromEnv, + providerEnvVarMap, V3, type AgentInstance, type AvailableModel, @@ -249,14 +250,22 @@ function buildClaudeCodeVerifierConfig( : undefined; const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined; - // Fail fast on a judge OVERRIDE whose key we can't resolve — do this before - // the try/catch so it propagates instead of being swallowed into the - // self-report fallback. Otherwise V3Evaluator backfills modelClientOptions - // with the Gemini key, hands the wrong provider its credential, verify() - // throws, and the run silently downgrades to legacy self-report. Surface the - // misconfiguration instead. The built-in default (gemini) is exempt: it - // degrades gracefully to V3Evaluator's own key resolution. - if (judgeModelOverride && judgeProvider && !judgeApiKey) { + // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates + // instead of being swallowed into the self-report fallback. Otherwise + // V3Evaluator backfills modelClientOptions with the Gemini key, hands the + // wrong provider its credential, verify() throws, and the run silently + // downgrades to legacy self-report. Surface the misconfiguration instead. + // + // Only providers that genuinely require a key qualify: `loadApiKeyFromEnv` + // returns undefined for key-requiring providers (missing key) AND for + // API-keyless providers (ollama, bedrock — no entry in providerEnvVarMap) by + // design. 
Mirror that set via providerEnvVarMap so keyless judges proceed + // with no explicit apiKey instead of being rejected as misconfigured. The + // built-in default (gemini) is also exempt: it degrades gracefully to + // V3Evaluator's own key resolution. + const judgeProviderRequiresKey = + judgeProvider !== undefined && judgeProvider in providerEnvVarMap; + if (judgeModelOverride && judgeProviderRequiresKey && !judgeApiKey) { throw new EvalsError( `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`, ); @@ -338,8 +347,11 @@ export const claudeCodeHarness: BenchHarness = { plan, logger, }); - const verifier = buildClaudeCodeVerifierConfig(plan, logger); try { + // Built inside the try so a fail-fast verifier-config error (e.g. an + // override judge whose key can't be resolved) still runs the finally that + // owns the prepared tool adapter, instead of leaking it. + const verifier = buildClaudeCodeVerifierConfig(plan, logger); return await runClaudeCodeAgent({ plan, model: input.modelName, From 821d4970c26f476b4c30ff4b03bee05e6c88ffaa Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 1 Jul 2026 20:06:38 -0400 Subject: [PATCH 4/4] fix(evals): resolve gateway judge credential + add verifier-config tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address cubic review nits on the claude_code rubric verifier config. FIX 1 (gateway judge credential): the keyless-provider exemption treated any provider absent from the SDK's providerEnvVarMap as keyless. But `gateway/...` (Vercel AI Gateway) is not in the map yet needs AI_GATEWAY_API_KEY, so a `gateway/` judge override would silently proceed without its credential and downgrade the verifier to self-report. 
Add resolveJudgeApiKey (maps `gateway` → AI_GATEWAY_API_KEY, else loadApiKeyFromEnv) and judgeProviderRequiresKey (true for providerEnvVarMap entries plus `gateway`) so a gateway judge resolves its key and still fail-fasts when it is missing; ollama/bedrock and the default gemini judge stay exempt. FIX 2 (regression tests): add packages/evals/tests/framework/verifierConfig.test.ts covering (a) a keyless override (ollama) builds a config without an apiKey, (b) an anthropic override with the key unset throws the config error while toolAdapter.cleanup() still runs (fail-fast inside try/finally, via claudeCodeHarness.execute), and (c) a gateway/ override resolves AI_GATEWAY_API_KEY (plus a missing-gateway-key fail-fast case). Exported buildClaudeCodeVerifierConfig for direct unit testing. Unit tests: 349 pass (was 345, +4 new). Build + typecheck + lint clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/evals/framework/benchHarness.ts | 71 ++++++-- .../tests/framework/verifierConfig.test.ts | 160 ++++++++++++++++++ 2 files changed, 216 insertions(+), 15 deletions(-) create mode 100644 packages/evals/tests/framework/verifierConfig.test.ts diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index d0d50d1851..ed4638eb58 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -199,6 +199,46 @@ export const stagehandHarness: BenchHarness = { */ const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash"; +/** + * The Vercel AI Gateway provider (`gateway/...`) authenticates against + * AI_GATEWAY_API_KEY, but `gateway` is NOT in the SDK's providerEnvVarMap, so + * loadApiKeyFromEnv treats it like a keyless provider and returns undefined. + * A `gateway/` judge override would therefore silently skip its credential and + * downgrade the verifier. Resolve it explicitly here so a gateway judge sends + * the right key and still fail-fasts when the key is missing. 
+ */
+const GATEWAY_JUDGE_PROVIDER = "gateway";
+const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";
+
+/**
+ * Resolve the API key for a judge provider. Mirrors loadApiKeyFromEnv for
+ * providers in providerEnvVarMap, but also handles `gateway` (which the SDK map
+ * omits) via AI_GATEWAY_API_KEY so a gateway judge isn't mistaken for keyless.
+ */
+function resolveJudgeApiKey(
+  provider: string | undefined,
+  logger: EvalLogger,
+): string | undefined {
+  if (!provider) return undefined;
+  if (provider === GATEWAY_JUDGE_PROVIDER) {
+    const key = process.env[GATEWAY_JUDGE_API_KEY_ENV];
+    return typeof key === "string" && key.length > 0 ? key : undefined;
+  }
+  return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
+}
+
+/**
+ * Whether a judge provider requires an API key, so a missing key signals a
+ * misconfiguration: `gateway` (needs AI_GATEWAY_API_KEY) or an OWN key of the
+ * SDK's providerEnvVarMap — `in` would also match inherited names such as
+ * `toString`. Keyless providers (ollama/bedrock) and `undefined` stay exempt.
+ */
+function judgeProviderRequiresKey(provider: string | undefined): boolean {
+  if (provider === undefined) return false;
+  if (provider === GATEWAY_JUDGE_PROVIDER) return true;
+  return Object.prototype.hasOwnProperty.call(providerEnvVarMap, provider);
+}
+
 /**
  * Whether the rubric verifier should run for claude_code. Default ON so browse
  * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off to
@@ -229,7 +269,7 @@ function isClaudeCodeVerifierEnabled(): boolean {
  * (tui/commands/verify.ts): a browser-free V3 with disableAPI + an Anthropic
  * model so the verifier's LLMProvider resolves against ANTHROPIC_API_KEY.
  */
-function buildClaudeCodeVerifierConfig(
+export function buildClaudeCodeVerifierConfig(
   plan: ExternalHarnessTaskPlan,
   logger: EvalLogger,
 ): ClaudeCodeVerifierConfig | undefined {
@@ -245,9 +285,9 @@ function buildClaudeCodeVerifierConfig(
   const judgeProvider = judgeModel.includes("/")
     ? judgeModel.slice(0, judgeModel.indexOf("/"))
     : undefined;
-  const judgeApiKey = judgeProvider
-    ? loadApiKeyFromEnv(judgeProvider, (line: LogLine) => logger.log(line))
-    : undefined;
+  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
+  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
+  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
   const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
 
   // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
@@ -256,18 +296,19 @@ function buildClaudeCodeVerifierConfig(
   // wrong provider its credential, verify() throws, and the run silently
   // downgrades to legacy self-report. Surface the misconfiguration instead.
   //
-  // Only providers that genuinely require a key qualify: `loadApiKeyFromEnv`
-  // returns undefined for key-requiring providers (missing key) AND for
-  // API-keyless providers (ollama, bedrock — no entry in providerEnvVarMap) by
-  // design. Mirror that set via providerEnvVarMap so keyless judges proceed
-  // with no explicit apiKey instead of being rejected as misconfigured. The
-  // built-in default (gemini) is also exempt: it degrades gracefully to
-  // V3Evaluator's own key resolution.
-  const judgeProviderRequiresKey =
-    judgeProvider !== undefined && judgeProvider in providerEnvVarMap;
-  if (judgeModelOverride && judgeProviderRequiresKey && !judgeApiKey) {
+  // Only providers that genuinely require a key qualify (see
+  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
+  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
+  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
+  // stay exempt: keyless judges proceed with no explicit apiKey, and the
+  // default degrades gracefully to V3Evaluator's own key resolution.
+  if (
+    judgeModelOverride &&
+    judgeProviderRequiresKey(judgeProvider) &&
+    !judgeApiKey
+  ) {
     throw new EvalsError(
-      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
     );
   }
 
diff --git a/packages/evals/tests/framework/verifierConfig.test.ts b/packages/evals/tests/framework/verifierConfig.test.ts
new file mode 100644
index 0000000000..9f55978601
--- /dev/null
+++ b/packages/evals/tests/framework/verifierConfig.test.ts
@@ -0,0 +1,160 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { ExternalHarnessTaskPlan } from "../../framework/externalHarnessPlan.js";
+
+// Keep the real @browserbasehq/stagehand surface (loadApiKeyFromEnv,
+// providerEnvVarMap, etc. — the credential-resolution logic under test depends
+// on them) but replace V3 with a lightweight stub so the config-building path
+// never touches real LLM-provider/browser internals. buildClaudeCodeVerifierConfig
+// only uses the V3 instance as an inert LLM-client carrier.
+vi.mock("@browserbasehq/stagehand", async () => {
+  const actual = await vi.importActual<
+    typeof import("@browserbasehq/stagehand")
+  >("@browserbasehq/stagehand");
+  return {
+    ...actual,
+    V3: class {
+      opts: unknown;
+      constructor(opts: unknown) {
+        this.opts = opts;
+      }
+    },
+  };
+});
+
+// Mock the execute()-path collaborators so we can assert the fail-fast +
+// finally(cleanup) contract without spawning a real Claude Code agent.
+// vi.hoisted keeps these usable inside the hoisted vi.mock factories below.
+const { cleanupMock, runClaudeCodeAgentMock } = vi.hoisted(() => ({
+  cleanupMock: vi.fn(async () => {}),
+  runClaudeCodeAgentMock: vi.fn(async () => ({}) as never),
+}));
+
+// Adapter preparation resolves to an object whose cleanup() is cleanupMock, so
+// tests can count how often the harness cleaned up.
+vi.mock("../../framework/claudeCodeToolAdapter.js", () => ({
+  prepareClaudeCodeToolAdapter: vi.fn(async () => ({ cleanup: cleanupMock })),
+}));
+
+// The agent runner is replaced by the spy so tests can assert whether it ran.
+vi.mock("../../framework/claudeCodeRunner.js", () => ({
+  runClaudeCodeAgent: runClaudeCodeAgentMock,
+}));
+
+// Plan building always returns this fixed webvoyager plan, whatever it is given.
+vi.mock("../../framework/externalHarnessPlan.js", () => ({
+  buildExternalHarnessTaskPlan: vi.fn(
+    (): ExternalHarnessTaskPlan => ({
+      dataset: "webvoyager",
+      taskId: "wv-1",
+      startUrl: "https://example.com",
+      instruction: "Find the checkout button",
+    }),
+  ),
+}));
+
+// NOTE(review): these imports sit after the vi.mock calls, presumably so the
+// module under test is loaded with the mocks in place — confirm against
+// vitest's hoisting rules.
+import {
+  buildClaudeCodeVerifierConfig,
+  claudeCodeHarness,
+} from "../../framework/benchHarness.js";
+import { EvalLogger } from "../../logger.js";
+import { EvalsError } from "../../errors.js";
+
+// Same field values as the mocked plan above; handed directly to
+// buildClaudeCodeVerifierConfig by the config tests in this file.
+const plan: ExternalHarnessTaskPlan = {
+  dataset: "webvoyager",
+  taskId: "wv-1",
+  startUrl: "https://example.com",
+  instruction: "Find the checkout button",
+};
+
+// Env keys the verifier-config credential resolution reads. Snapshot + restore
+// so tests don't leak state into each other or the rest of the suite.
+const MANAGED_ENV = [
+  "EVAL_CLAUDE_CODE_VERIFIER",
+  "EVAL_CLAUDE_CODE_VERIFIER_MODEL",
+  "AI_GATEWAY_API_KEY",
+  "ANTHROPIC_API_KEY",
+  "OLLAMA_API_KEY",
+] as const;
+
+// Pre-test value of each managed key; `undefined` records that it was unset.
+let savedEnv: Record<string, string | undefined>;
+
+// Snapshot then clear every managed key so each test starts from a clean env,
+// and reset the call history of the shared mocks.
+beforeEach(() => {
+  savedEnv = {};
+  for (const key of MANAGED_ENV) {
+    savedEnv[key] = process.env[key];
+    delete process.env[key];
+  }
+  cleanupMock.mockClear();
+  runClaudeCodeAgentMock.mockClear();
+});
+
+// Restore the snapshot. Keys that were originally unset are deleted rather than
+// assigned, because process.env coerces assigned values to strings.
+afterEach(() => {
+  for (const key of MANAGED_ENV) {
+    if (savedEnv[key] === undefined) delete process.env[key];
+    else process.env[key] = savedEnv[key];
+  }
+});
+
+describe("buildClaudeCodeVerifierConfig judge credentials", () => {
+  it("builds a config for a keyless provider override (ollama) without an apiKey", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "ollama/llama3";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe("ollama/llama3");
+    // Keyless provider → no explicit apiKey is threaded through.
+    expect(config?.judgeClientOptions).toBeUndefined();
+  });
+
+  it("resolves AI_GATEWAY_API_KEY for a gateway/ judge override", () => {
+    process.env.AI_GATEWAY_API_KEY = "gw-test-key";
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL =
+      "gateway/anthropic/claude-sonnet-4-20250514";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe(
+      "gateway/anthropic/claude-sonnet-4-20250514",
+    );
+    expect(config?.judgeClientOptions).toEqual({ apiKey: "gw-test-key" });
+  });
+
+  it("fail-fasts when a gateway/ judge override is missing AI_GATEWAY_API_KEY", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "gateway/some-model";
+
+    expect(() =>
+      buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)),
+    ).toThrow(/AI_GATEWAY_API_KEY|no API key resolved/);
+  });
+});
+
+describe("claudeCodeHarness.execute verifier fail-fast", () => {
+  // Minimal execute() input: only the row config is filled in; the remaining
+  // fields are cast to `never` placeholders.
+  const makeExecuteInput = () => ({
+    task: {} as never,
+    input: { modelName: "anthropic/claude-sonnet-4-20250514" } as never,
+    row: {
+      config: {
+        harness: "claude_code" as const,
+        model: "anthropic/claude-sonnet-4-20250514" as never,
+        environment: "LOCAL" as const,
+        useApi: false,
+      },
+    } as never,
+    logger: new EvalLogger(false),
+  });
+
+  it("throws the config error but still runs toolAdapter.cleanup() (fail-fast inside try/finally)", async () => {
+    // Anthropic judge override with ANTHROPIC_API_KEY unset (cleared in beforeEach)
+    // → verifier config must throw, and the prepared adapter must still be cleaned up.
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL =
+      "anthropic/claude-sonnet-4-20250514";
+
+    await expect(claudeCodeHarness.execute(makeExecuteInput())).rejects.toThrow(
+      EvalsError,
+    );
+
+    // The verifier construction was moved inside the try, so the finally that
+    // owns the adapter runs even when config resolution throws.
+    expect(cleanupMock).toHaveBeenCalledTimes(1);
+    // The agent must NOT have run — we failed fast before executing.
+    expect(runClaudeCodeAgentMock).not.toHaveBeenCalled();
+  });
+});