diff --git a/packages/evals/core/tools/browse_cli.ts b/packages/evals/core/tools/browse_cli.ts index 0e8ec5c9a..ef96790d4 100644 --- a/packages/evals/core/tools/browse_cli.ts +++ b/packages/evals/core/tools/browse_cli.ts @@ -106,18 +106,30 @@ type BrowseCliPagesResult = { }>; }; +// The mode flag selects the environment when the daemon is first started and +// must be explicit so a set BROWSERBASE_API_KEY does not silently auto-select +// remote. It is only accepted by the driver commands, so it is skipped for the +// subcommands that reject it. The session name is safe on every command. +const BROWSE_MODELESS_COMMANDS = new Set(["stop", "status"]); + class BrowseCliRuntime { - constructor(private readonly session: string) {} + constructor( + private readonly session: string, + private readonly modeFlag: "--local" | "--remote", + ) {} async runJson(args: string[]): Promise { + const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0]) + ? [] + : [this.modeFlag]; const { stdout, stderr } = await execFileAsync( process.execPath, [ resolveBrowseCliEntrypoint(), - "--json", + ...args, + ...modeArgs, "--session", this.session, - ...args, ], { cwd: getRepoRootDir(), @@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession { private activePageId: string | null = null; private closed = false; - constructor(private readonly sessionName: string) { - this.runtime = new BrowseCliRuntime(sessionName); + constructor( + private readonly sessionName: string, + modeFlag: "--local" | "--remote", + ) { + this.runtime = new BrowseCliRuntime(sessionName, modeFlag); } private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle { @@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool { ); } - const session = new BrowseCliSession(createSessionName()); - await session.runtime.runJson([ - "env", - input.environment === "BROWSERBASE" ? 
"remote" : "local", - ]); + const session = new BrowseCliSession( + createSessionName(), + input.environment === "BROWSERBASE" ? "--remote" : "--local", + ); return { session, diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index c2277ea36..ed4638eb5 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -2,11 +2,13 @@ import { AgentProvider, getAISDKLanguageModel, loadApiKeyFromEnv, + providerEnvVarMap, + V3, type AgentInstance, type AvailableModel, type LLMClient, type LogLine, - type V3, + type TaskSpec, } from "@browserbasehq/stagehand"; import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; import { endBrowserbaseSession } from "../browserbaseCleanup.js"; @@ -14,7 +16,11 @@ import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { V3InitResult } from "../initV3.js"; import type { EvalInput } from "../types/evals.js"; -import { runClaudeCodeAgent } from "./claudeCodeRunner.js"; +import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import { + runClaudeCodeAgent, + type ClaudeCodeVerifierConfig, +} from "./claudeCodeRunner.js"; import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; import { runCodexAgent } from "./codexRunner.js"; import { prepareCodexToolAdapter } from "./codexToolAdapter.js"; @@ -181,6 +187,184 @@ export const stagehandHarness: BenchHarness = { }, }; +/** + * Default judge model for the claude_code rubric verifier — used for both rubric + * generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned + * default and reliably emits the verifier's structured-output schema; smaller + * models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused + * judgment call ("response did not match schema"), which the verifier reports as + * evidenceInsufficient → spurious outcome=false. 
Override with
+ * EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved).
+ * Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default.
+ */
+const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";
+
+/**
+ * Provider prefix and env var for judges routed through the Vercel AI Gateway
+ * (`gateway/...`). The gateway authenticates with AI_GATEWAY_API_KEY, yet the
+ * SDK's providerEnvVarMap has no `gateway` entry, so loadApiKeyFromEnv returns
+ * undefined for it exactly as it would for a keyless provider. Handling it
+ * explicitly keeps a `gateway/` judge override from silently dropping its
+ * credential, and lets a missing key fail fast instead of being ignored.
+ */
+const GATEWAY_JUDGE_PROVIDER = "gateway";
+const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";
+
+/**
+ * Look up the API key for the judge's provider. `gateway` is read straight
+ * from AI_GATEWAY_API_KEY (empty counts as missing); every other provider is
+ * delegated to loadApiKeyFromEnv. No provider → undefined.
+ */
+function resolveJudgeApiKey(
+  provider: string | undefined,
+  logger: EvalLogger,
+): string | undefined {
+  if (!provider) return undefined;
+  if (provider !== GATEWAY_JUDGE_PROVIDER) {
+    return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
+  }
+  const gatewayKey = process.env[GATEWAY_JUDGE_API_KEY_ENV];
+  return gatewayKey ? gatewayKey : undefined;
+}
+
+/**
+ * Whether a judge provider genuinely requires an API key (so a missing key is a
+ * misconfiguration, not a keyless provider). True for anything in the SDK's
+ * providerEnvVarMap plus `gateway` (which the map omits but which needs
+ * AI_GATEWAY_API_KEY). Genuinely-keyless providers (ollama/bedrock) and the
+ * built-in default stay exempt.
+ */
+function judgeProviderRequiresKey(provider: string | undefined): boolean {
+  if (provider === undefined) return false;
+  if (provider === GATEWAY_JUDGE_PROVIDER) return true;
+  return Object.prototype.hasOwnProperty.call(providerEnvVarMap, provider);
+}
+
+/**
+ * Whether the rubric verifier should run for claude_code. Default ON so browse
+ * runs get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0, false,
+ * off, or no to fall back to the agent's self-reported EVAL_RESULT line.
+ */
+function isClaudeCodeVerifierEnabled(): boolean {
+  const raw = process.env.EVAL_CLAUDE_CODE_VERIFIER;
+  if (raw === undefined) return true;
+  const normalized = raw.trim().toLowerCase();
+  return !(
+    normalized === "0" ||
+    normalized === "false" ||
+    normalized === "off" ||
+    normalized === "no"
+  );
+}
+
+/**
+ * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
+ * into the claude_code runner. Returns undefined (→ self-report fallback) when
+ * the verifier is disabled or when constructing the V3 carrier throws — never
+ * crashes the run. Exception: an explicit judge override
+ * (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose provider key can't be resolved throws
+ * a config error rather than silently downgrading to self-report.
+ *
+ * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
+ * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify`
+ * (tui/commands/verify.ts): a browser-free V3 with disableAPI, built with the
+ * judge model plus that provider's resolved API key (when one is found).
+ */
+export function buildClaudeCodeVerifierConfig(
+  plan: ExternalHarnessTaskPlan,
+  logger: EvalLogger,
+): ClaudeCodeVerifierConfig | undefined {
+  if (!isClaudeCodeVerifierEnabled()) return undefined;
+
+  const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL;
+  const judgeModel = (judgeModelOverride ||
+    CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+
+  // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
+  // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
+  // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
+  const judgeProvider = judgeModel.includes("/")
+    ? judgeModel.slice(0, judgeModel.indexOf("/"))
+    : undefined;
+  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
+  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
+  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
+  const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
+
+  // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
+  // instead of being swallowed into the self-report fallback. Otherwise
+  // V3Evaluator backfills modelClientOptions with the Gemini key, hands the
+  // wrong provider its credential, verify() throws, and the run silently
+  // downgrades to legacy self-report. Surface the misconfiguration instead.
+  //
+  // Only providers that genuinely require a key qualify (see
+  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
+  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
+  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
+  // stay exempt: keyless judges proceed with no explicit apiKey, and the
+  // default degrades gracefully to V3Evaluator's own key resolution.
+  if (
+    judgeModelOverride &&
+    judgeProviderRequiresKey(judgeProvider) &&
+    !judgeApiKey
+  ) {
+    throw new EvalsError(
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+    );
+  }
+
+  try {
+    // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
+    const v3 = new V3({
+      env: "LOCAL",
+      verbose: 0,
+      disableAPI: true,
+      model: judgeClientOptions
+        ? { modelName: judgeModel, ...judgeClientOptions }
+        : judgeModel,
+      logger: (line: LogLine) => logger.log(line),
+    });
+
+    const taskSpec: TaskSpec = {
+      // Fallback id feeds the trajectory dir path, so sanitize the
+      // instruction-derived segment — raw instruction text can contain `/`,
+      // `..`, or other path-unsafe characters that would fork the output dir.
+      id:
+        plan.taskId ??
+        `${plan.dataset}/${plan.instruction
+          .slice(0, 40)
+          .replace(/[^A-Za-z0-9_-]/g, "_")}`,
+      instruction: plan.instruction,
+      initUrl: plan.startUrl,
+      ...(plan.precomputedRubric && {
+        precomputedRubric: plan.precomputedRubric,
+      }),
+      ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }),
+    };
+
+    return {
+      v3,
+      taskSpec,
+      dataset: plan.dataset,
+      judgeModel,
+      judgeClientOptions,
+      // Validated, not cast: unknown EVAL_SUCCESS_MODE values become undefined.
+      successMode: (["outcome", "process", "both"] as const).find(
+        (mode) => mode === process.env.EVAL_SUCCESS_MODE,
+      ),
+    };
+  } catch (error) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier setup skipped (falling back to self-report): ${
+        error instanceof Error ? error.message : String(error)
+      }`,
+      level: 0,
+    });
+    return undefined;
+  }
+}
+
 export const claudeCodeHarness: BenchHarness = {
   harness: "claude_code",
   supportedTaskKinds: ["agent", "suite"],
@@ -205,12 +389,17 @@ export const claudeCodeHarness: BenchHarness = {
       logger,
     });
     try {
+      // Built inside the try so a fail-fast verifier-config error (e.g. an
+      // override judge whose key can't be resolved) still runs the finally that
+      // owns the prepared tool adapter, instead of leaking it.
+ const verifier = buildClaudeCodeVerifierConfig(plan, logger); return await runClaudeCodeAgent({ plan, model: input.modelName, logger, toolAdapter, signal, + verifier, }); } finally { await toolAdapter.cleanup(); diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index da68dc929..70462aa8f 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -1,4 +1,9 @@ -import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; +import type { + AvailableModel, + ClientOptions, + TaskSpec, + V3, +} from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; @@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig { taskSpec: TaskSpec; /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ dataset: string; + /** + * Judge model for V3Evaluator (scoring + rubric generation). When omitted the + * evaluator falls back to its own default (google/gemini-2.5-flash). Pass an + * Anthropic model here to score against ANTHROPIC_API_KEY. + */ + judgeModel?: AvailableModel; + /** + * Client options (API key) for the judge model. Required alongside judgeModel + * when the judge's provider differs from the evaluator's own default — + * otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key, + * which is sent as the wrong provider's credential (e.g. an Anthropic judge + * receives the Gemini key and fails with "invalid x-api-key"). + */ + judgeClientOptions?: ClientOptions; /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ successMode?: "outcome" | "process" | "both"; /** Override trajectory persistence root. 
*/ @@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({ const { V3Evaluator } = await import("@browserbasehq/stagehand"); const { RubricCache } = await import("./rubricCache.js"); - const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); + const evaluator = new V3Evaluator(verifier.v3, { + backend: "verifier", + ...(verifier.judgeModel && { modelName: verifier.judgeModel }), + ...(verifier.judgeClientOptions && { + modelClientOptions: verifier.judgeClientOptions, + }), + }); // Hydrate rubric — use precomputed if present, otherwise cache-or-generate. let rubric = verifier.taskSpec.precomputedRubric; diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts index 8fdc14182..63050253f 100644 --- a/packages/evals/framework/claudeCodeToolAdapter.ts +++ b/packages/evals/framework/claudeCodeToolAdapter.ts @@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter( PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`, } as Record; + const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local"; await fsp.writeFile( wrapperPath, [ "#!/usr/bin/env bash", "set -euo pipefail", - `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`, + // The mode flag (--local/--remote) selects the environment when the daemon + // is first started and must be explicit so a set BROWSERBASE_API_KEY does + // not silently auto-select remote. It is only accepted by the driver + // commands, so skip it for the few subcommands that reject it (stop, + // status). The session name is safe on every command. 
+ "cmd=${1:-}", + "mode=()", + 'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then', + ` mode=(${JSON.stringify(modeFlag)})`, + "fi", + `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`, "", ].join("\n"), { mode: 0o755 }, ); - await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd); - return { toolSurface: "browse_cli", startupProfile: input.startupProfile, @@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string { ].join("\n"); } -async function runBrowseSetup( - wrapperPath: string, - environment: "LOCAL" | "BROWSERBASE", - logger: EvalLogger, - env: Record, - cwd: string, -): Promise { - await runBrowseCommand( - wrapperPath, - ["env", environment === "BROWSERBASE" ? "remote" : "local"], - logger, - env, - cwd, - ); -} - function buildBrowseCliPromptInstructions( plan: ExternalHarnessTaskPlan, ): string { diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts index fa23bf99e..6dae10775 100644 --- a/packages/evals/framework/externalHarnessPlan.ts +++ b/packages/evals/framework/externalHarnessPlan.ts @@ -1,3 +1,4 @@ +import { normalizeRubric, type Rubric } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalInput } from "../types/evals.js"; @@ -6,6 +7,14 @@ export interface ExternalHarnessTaskPlan { taskId?: string; startUrl: string; instruction: string; + /** + * Precomputed rubric carried by the dataset row (`precomputed_rubric`), if + * present. Threaded into the verifier's TaskSpec so it doesn't regenerate. + * Undefined when the row ships no rubric — the verifier generates one. + */ + precomputedRubric?: Rubric; + /** Reference answer carried by the dataset row (`expectedAnswer`), if present. 
*/ + expectedAnswer?: string; } function readString( @@ -16,6 +25,21 @@ function readString( return typeof value === "string" && value.length > 0 ? value : undefined; } +/** + * Rubric + reference answer a dataset row may ship. Rows without a + * `precomputed_rubric` leave `precomputedRubric` undefined so the verifier + * generates (and caches) one from the instruction. + */ +function readVerifierFields(params: Record): { + precomputedRubric?: Rubric; + expectedAnswer?: string; +} { + return { + precomputedRubric: normalizeRubric(params.precomputed_rubric) ?? undefined, + expectedAnswer: readString(params, "expectedAnswer"), + }; +} + export function buildExternalHarnessTaskPlan( input: EvalInput, ): ExternalHarnessTaskPlan { @@ -34,6 +58,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "id"), startUrl, instruction, + ...readVerifierFields(params), }; } @@ -50,6 +75,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "task_id"), startUrl, instruction, + ...readVerifierFields(params), }; } @@ -65,6 +91,7 @@ export function buildExternalHarnessTaskPlan( taskId: readString(params, "id"), startUrl: readString(params, "web") ?? "https://www.google.com", instruction, + ...readVerifierFields(params), }; } diff --git a/packages/evals/tests/framework/verifierConfig.test.ts b/packages/evals/tests/framework/verifierConfig.test.ts new file mode 100644 index 000000000..9f5597860 --- /dev/null +++ b/packages/evals/tests/framework/verifierConfig.test.ts @@ -0,0 +1,160 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { ExternalHarnessTaskPlan } from "../../framework/externalHarnessPlan.js"; + +// Keep the real @browserbasehq/stagehand surface (loadApiKeyFromEnv, +// providerEnvVarMap, etc. — the credential-resolution logic under test depends +// on them) but replace V3 with a lightweight stub so the config-building path +// never touches real LLM-provider/browser internals. 
buildClaudeCodeVerifierConfig +// only uses the V3 instance as an inert LLM-client carrier. +vi.mock("@browserbasehq/stagehand", async () => { + const actual = await vi.importActual< + typeof import("@browserbasehq/stagehand") + >("@browserbasehq/stagehand"); + return { + ...actual, + V3: class { + opts: unknown; + constructor(opts: unknown) { + this.opts = opts; + } + }, + }; +}); + +// Mock the execute()-path collaborators so we can assert the fail-fast + +// finally(cleanup) contract without spawning a real Claude Code agent. +// vi.hoisted keeps these usable inside the hoisted vi.mock factories below. +const { cleanupMock, runClaudeCodeAgentMock } = vi.hoisted(() => ({ + cleanupMock: vi.fn(async () => {}), + runClaudeCodeAgentMock: vi.fn(async () => ({}) as never), +})); + +vi.mock("../../framework/claudeCodeToolAdapter.js", () => ({ + prepareClaudeCodeToolAdapter: vi.fn(async () => ({ cleanup: cleanupMock })), +})); + +vi.mock("../../framework/claudeCodeRunner.js", () => ({ + runClaudeCodeAgent: runClaudeCodeAgentMock, +})); + +vi.mock("../../framework/externalHarnessPlan.js", () => ({ + buildExternalHarnessTaskPlan: vi.fn( + (): ExternalHarnessTaskPlan => ({ + dataset: "webvoyager", + taskId: "wv-1", + startUrl: "https://example.com", + instruction: "Find the checkout button", + }), + ), +})); + +import { + buildClaudeCodeVerifierConfig, + claudeCodeHarness, +} from "../../framework/benchHarness.js"; +import { EvalLogger } from "../../logger.js"; +import { EvalsError } from "../../errors.js"; + +const plan: ExternalHarnessTaskPlan = { + dataset: "webvoyager", + taskId: "wv-1", + startUrl: "https://example.com", + instruction: "Find the checkout button", +}; + +// Env keys the verifier-config credential resolution reads. Snapshot + restore +// so tests don't leak state into each other or the rest of the suite. 
+const MANAGED_ENV = [ + "EVAL_CLAUDE_CODE_VERIFIER", + "EVAL_CLAUDE_CODE_VERIFIER_MODEL", + "AI_GATEWAY_API_KEY", + "ANTHROPIC_API_KEY", + "OLLAMA_API_KEY", +] as const; + +let savedEnv: Record; + +beforeEach(() => { + savedEnv = {}; + for (const key of MANAGED_ENV) { + savedEnv[key] = process.env[key]; + delete process.env[key]; + } + cleanupMock.mockClear(); + runClaudeCodeAgentMock.mockClear(); +}); + +afterEach(() => { + for (const key of MANAGED_ENV) { + if (savedEnv[key] === undefined) delete process.env[key]; + else process.env[key] = savedEnv[key]; + } +}); + +describe("buildClaudeCodeVerifierConfig judge credentials", () => { + it("builds a config for a keyless provider override (ollama) without an apiKey", () => { + process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "ollama/llama3"; + + const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)); + + expect(config).toBeDefined(); + expect(config?.judgeModel).toBe("ollama/llama3"); + // Keyless provider → no explicit apiKey is threaded through. 
+ expect(config?.judgeClientOptions).toBeUndefined(); + }); + + it("resolves AI_GATEWAY_API_KEY for a gateway/ judge override", () => { + process.env.AI_GATEWAY_API_KEY = "gw-test-key"; + process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = + "gateway/anthropic/claude-sonnet-4-20250514"; + + const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)); + + expect(config).toBeDefined(); + expect(config?.judgeModel).toBe( + "gateway/anthropic/claude-sonnet-4-20250514", + ); + expect(config?.judgeClientOptions).toEqual({ apiKey: "gw-test-key" }); + }); + + it("fail-fasts when a gateway/ judge override is missing AI_GATEWAY_API_KEY", () => { + process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "gateway/some-model"; + + expect(() => + buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)), + ).toThrow(/AI_GATEWAY_API_KEY|no API key resolved/); + }); +}); + +describe("claudeCodeHarness.execute verifier fail-fast", () => { + const makeExecuteInput = () => ({ + task: {} as never, + input: { modelName: "anthropic/claude-sonnet-4-20250514" } as never, + row: { + config: { + harness: "claude_code" as const, + model: "anthropic/claude-sonnet-4-20250514" as never, + environment: "LOCAL" as const, + useApi: false, + }, + } as never, + logger: new EvalLogger(false), + }); + + it("throws the config error but still runs toolAdapter.cleanup() (fail-fast inside try/finally)", async () => { + // Anthropic judge override with ANTHROPIC_API_KEY unset (cleared in beforeEach) + // → verifier config must throw, and the prepared adapter must still be cleaned up. + process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = + "anthropic/claude-sonnet-4-20250514"; + + await expect(claudeCodeHarness.execute(makeExecuteInput())).rejects.toThrow( + EvalsError, + ); + + // The verifier construction was moved inside the try, so the finally that + // owns the adapter runs even when config resolution throws. 
+ expect(cleanupMock).toHaveBeenCalledTimes(1); + // The agent must NOT have run — we failed fast before executing. + expect(runClaudeCodeAgentMock).not.toHaveBeenCalled(); + }); +});