diff --git a/packages/evals/core/tools/browse_cli.ts b/packages/evals/core/tools/browse_cli.ts
index 0e8ec5c9a..ef96790d4 100644
--- a/packages/evals/core/tools/browse_cli.ts
+++ b/packages/evals/core/tools/browse_cli.ts
@@ -106,18 +106,30 @@ type BrowseCliPagesResult = {
   }>;
 };
 
+// The mode flag selects the environment when the daemon is first started and
+// must be explicit so a set BROWSERBASE_API_KEY does not silently auto-select
+// remote. It is only accepted by the driver commands, so it is skipped for the
+// subcommands that reject it. The session name is safe on every command.
+const BROWSE_MODELESS_COMMANDS = new Set(["stop", "status"]);
+
 class BrowseCliRuntime {
-  constructor(private readonly session: string) {}
+  constructor(
+    private readonly session: string,
+    private readonly modeFlag: "--local" | "--remote",
+  ) {}
 
   async runJson<T>(args: string[]): Promise<T> {
+    const modeArgs = BROWSE_MODELESS_COMMANDS.has(args[0])
+      ? []
+      : [this.modeFlag];
     const { stdout, stderr } = await execFileAsync(
       process.execPath,
       [
         resolveBrowseCliEntrypoint(),
-        "--json",
+        ...args,
+        ...modeArgs,
         "--session",
         this.session,
-        ...args,
       ],
       {
         cwd: getRepoRootDir(),
@@ -645,8 +657,11 @@ class BrowseCliSession implements CoreSession {
   private activePageId: string | null = null;
   private closed = false;
 
-  constructor(private readonly sessionName: string) {
-    this.runtime = new BrowseCliRuntime(sessionName);
+  constructor(
+    private readonly sessionName: string,
+    modeFlag: "--local" | "--remote",
+  ) {
+    this.runtime = new BrowseCliRuntime(sessionName, modeFlag);
   }
 
   private wrap(page: { targetId: string; url: string }): BrowseCliPageHandle {
@@ -823,11 +838,10 @@ export class BrowseCliTool implements CoreTool {
       );
     }
 
-    const session = new BrowseCliSession(createSessionName());
-    await session.runtime.runJson([
-      "env",
-      input.environment === "BROWSERBASE" ? "remote" : "local",
-    ]);
+    const session = new BrowseCliSession(
+      createSessionName(),
+      input.environment === "BROWSERBASE" ? "--remote" : "--local",
+    );
 
     return {
       session,
diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index c2277ea36..ed4638eb5 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -2,11 +2,13 @@ import {
   AgentProvider,
   getAISDKLanguageModel,
   loadApiKeyFromEnv,
+  providerEnvVarMap,
+  V3,
   type AgentInstance,
   type AvailableModel,
   type LLMClient,
   type LogLine,
-  type V3,
+  type TaskSpec,
 } from "@browserbasehq/stagehand";
 import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
 import { endBrowserbaseSession } from "../browserbaseCleanup.js";
@@ -14,7 +16,11 @@ import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { V3InitResult } from "../initV3.js";
 import type { EvalInput } from "../types/evals.js";
-import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
+import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import {
+  runClaudeCodeAgent,
+  type ClaudeCodeVerifierConfig,
+} from "./claudeCodeRunner.js";
 import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
 import { runCodexAgent } from "./codexRunner.js";
 import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
@@ -181,6 +187,184 @@ export const stagehandHarness: BenchHarness = {
   },
 };
 
+/**
+ * Default judge model for the claude_code rubric verifier — used for both rubric
+ * generation and scoring. google/gemini-2.5-flash is V3Evaluator's own tuned
+ * default and reliably emits the verifier's structured-output schema; smaller
+ * models (e.g. anthropic/claude-haiku-4-5) intermittently fail the fused
+ * judgment call ("response did not match schema"), which the verifier reports as
+ * evidenceInsufficient → spurious outcome=false. Override with
+ * EVAL_CLAUDE_CODE_VERIFIER_MODEL (the judge's provider key is auto-resolved).
+ * Requires GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY for the default.
+ */
+const CLAUDE_CODE_VERIFIER_JUDGE_MODEL = "google/gemini-2.5-flash";
+
+/**
+ * The Vercel AI Gateway provider (`gateway/...`) authenticates against
+ * AI_GATEWAY_API_KEY, but `gateway` is NOT in the SDK's providerEnvVarMap, so
+ * loadApiKeyFromEnv treats it like a keyless provider and returns undefined.
+ * A `gateway/` judge override would therefore silently skip its credential and
+ * downgrade the verifier. Resolve it explicitly here so a gateway judge sends
+ * the right key and still fail-fasts when the key is missing.
+ */
+const GATEWAY_JUDGE_PROVIDER = "gateway";
+const GATEWAY_JUDGE_API_KEY_ENV = "AI_GATEWAY_API_KEY";
+
+/**
+ * Looks up the credential for a judge provider. `gateway` is absent from the
+ * SDK's providerEnvVarMap, so it is read from AI_GATEWAY_API_KEY directly (an
+ * empty value counts as missing); other providers go to loadApiKeyFromEnv.
+ */
+function resolveJudgeApiKey(
+  provider: string | undefined,
+  logger: EvalLogger,
+): string | undefined {
+  if (!provider) return undefined;
+  if (provider !== GATEWAY_JUDGE_PROVIDER) {
+    return loadApiKeyFromEnv(provider, (line: LogLine) => logger.log(line));
+  }
+  const gatewayKey = process.env[GATEWAY_JUDGE_API_KEY_ENV];
+  return gatewayKey ? gatewayKey : undefined;
+}
+
+/**
+ * Whether a judge provider genuinely requires an API key, so a missing key is a
+ * misconfiguration rather than a keyless provider. True for `gateway` and for
+ * own keys of the SDK's providerEnvVarMap; anything else is treated as keyless.
+ */
+function judgeProviderRequiresKey(provider: string | undefined): boolean {
+  if (provider === undefined) return false;
+  if (provider === GATEWAY_JUDGE_PROVIDER) return true;
+  // Own-property check: `in` also matches inherited keys such as "constructor".
+  return Object.prototype.hasOwnProperty.call(providerEnvVarMap, provider);
+}
+
+/**
+ * Whether the rubric verifier runs for claude_code. Default ON so browse runs
+ * get ground-truth scoring; set EVAL_CLAUDE_CODE_VERIFIER to 0/false/off/no
+ * (any case) to fall back to the agent's self-reported EVAL_RESULT line.
+ */
+function isClaudeCodeVerifierEnabled(): boolean {
+  const raw = process.env.EVAL_CLAUDE_CODE_VERIFIER;
+  // An unset variable leaves the verifier on.
+  if (raw === undefined) {
+    return true;
+  }
+  // Only a recognized opt-out token (trimmed, lowercased) turns it off.
+  const optOutTokens = ["0", "false", "off", "no"];
+  const token = raw.trim().toLowerCase();
+  return !optOutTokens.includes(token);
+}
+
+/**
+ * Build the ClaudeCodeVerifierConfig that wires V3Evaluator's rubric verifier
+ * into the claude_code runner. Returns undefined (→ self-report fallback) when
+ * the verifier is disabled or when constructing the V3 carrier throws — never
+ * crashes the run. Exception: an explicit judge override
+ * (EVAL_CLAUDE_CODE_VERIFIER_MODEL) whose provider key can't be resolved throws
+ * a config error rather than silently downgrading to self-report.
+ *
+ * The V3 instance is used ONLY as the LLM-client carrier for V3Evaluator; per
+ * ClaudeCodeVerifierConfig it does NOT need init(). We mirror `evals verify`
+ * (tui/commands/verify.ts): a browser-free V3 with disableAPI, configured with
+ * the judge model (default google/gemini-2.5-flash) and its resolved API key.
+ */
+export function buildClaudeCodeVerifierConfig(
+  plan: ExternalHarnessTaskPlan,
+  logger: EvalLogger,
+): ClaudeCodeVerifierConfig | undefined {
+  if (!isClaudeCodeVerifierEnabled()) return undefined;
+
+  const judgeModelOverride = process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL;
+  const judgeModel = (judgeModelOverride ||
+    CLAUDE_CODE_VERIFIER_JUDGE_MODEL) as AvailableModel;
+
+  // Resolve the judge provider's key so V3Evaluator sends the RIGHT credential.
+  // Without this it defaults modelClientOptions.apiKey to the Gemini key, which
+  // an Anthropic judge would receive as x-api-key → "invalid x-api-key".
+  const judgeProvider = judgeModel.includes("/")
+    ? judgeModel.slice(0, judgeModel.indexOf("/"))
+    : undefined;
+  // resolveJudgeApiKey mirrors loadApiKeyFromEnv but also maps `gateway` →
+  // AI_GATEWAY_API_KEY (the SDK's providerEnvVarMap omits gateway).
+  const judgeApiKey = resolveJudgeApiKey(judgeProvider, logger);
+  const judgeClientOptions = judgeApiKey ? { apiKey: judgeApiKey } : undefined;
+
+  // Fail fast on a judge OVERRIDE whose key we can't resolve, so it propagates
+  // instead of being swallowed into the self-report fallback. Otherwise
+  // V3Evaluator backfills modelClientOptions with the Gemini key, hands the
+  // wrong provider its credential, verify() throws, and the run silently
+  // downgrades to legacy self-report. Surface the misconfiguration instead.
+  //
+  // Only providers that genuinely require a key qualify (see
+  // judgeProviderRequiresKey): anything in the SDK's providerEnvVarMap plus
+  // `gateway` (which needs AI_GATEWAY_API_KEY but the map omits). Genuinely
+  // API-keyless providers (ollama, bedrock) and the built-in default (gemini)
+  // stay exempt: keyless judges proceed with no explicit apiKey, and the
+  // default degrades gracefully to V3Evaluator's own key resolution.
+  if (
+    judgeModelOverride &&
+    judgeProviderRequiresKey(judgeProvider) &&
+    !judgeApiKey
+  ) {
+    throw new EvalsError(
+      `EVAL_CLAUDE_CODE_VERIFIER_MODEL="${judgeModel}" was set but no API key resolved for provider "${judgeProvider}". Set that provider's key (e.g. ANTHROPIC_API_KEY / OPENAI_API_KEY / AI_GATEWAY_API_KEY) or unset EVAL_CLAUDE_CODE_VERIFIER_MODEL to use the default judge.`,
+    );
+  }
+
+  try {
+    // Browser-free carrier — no init(). Only v3.logger is read by V3Evaluator.
+    const v3 = new V3({
+      env: "LOCAL",
+      verbose: 0,
+      disableAPI: true,
+      model: judgeClientOptions
+        ? { modelName: judgeModel, ...judgeClientOptions }
+        : judgeModel,
+      logger: (line: LogLine) => logger.log(line),
+    });
+
+    const taskSpec: TaskSpec = {
+      // Fallback id feeds the trajectory dir path, so sanitize the
+      // instruction-derived segment — raw instruction text can contain `/`,
+      // `..`, or other path-unsafe characters that would fork the output dir.
+      id:
+        plan.taskId ??
+        `${plan.dataset}/${plan.instruction
+          .slice(0, 40)
+          .replace(/[^A-Za-z0-9_-]/g, "_")}`,
+      instruction: plan.instruction,
+      initUrl: plan.startUrl,
+      ...(plan.precomputedRubric && {
+        precomputedRubric: plan.precomputedRubric,
+      }),
+      ...(plan.expectedAnswer && { expectedAnswer: plan.expectedAnswer }),
+    };
+
+    return {
+      v3,
+      taskSpec,
+      dataset: plan.dataset,
+      judgeModel,
+      judgeClientOptions,
+      successMode: process.env.EVAL_SUCCESS_MODE as
+        | "outcome"
+        | "process"
+        | "both"
+        | undefined,
+    };
+  } catch (error) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier setup skipped (falling back to self-report): ${
+        error instanceof Error ? error.message : String(error)
+      }`,
+      level: 0,
+    });
+    return undefined;
+  }
+}
+
 export const claudeCodeHarness: BenchHarness = {
   harness: "claude_code",
   supportedTaskKinds: ["agent", "suite"],
@@ -205,12 +389,17 @@ export const claudeCodeHarness: BenchHarness = {
       logger,
     });
     try {
+      // Built inside the try so a fail-fast verifier-config error (e.g. an
+      // override judge whose key can't be resolved) still runs the finally that
+      // owns the prepared tool adapter, instead of leaking it.
+      const verifier = buildClaudeCodeVerifierConfig(plan, logger);
       return await runClaudeCodeAgent({
         plan,
         model: input.modelName,
         logger,
         toolAdapter,
         signal,
+        verifier,
       });
     } finally {
       await toolAdapter.cleanup();
diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index da68dc929..70462aa8f 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -1,4 +1,9 @@
-import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
+import type {
+  AvailableModel,
+  ClientOptions,
+  TaskSpec,
+  V3,
+} from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
@@ -30,6 +35,20 @@ export interface ClaudeCodeVerifierConfig {
   taskSpec: TaskSpec;
   /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
   dataset: string;
+  /**
+   * Judge model for V3Evaluator (scoring + rubric generation). When omitted the
+   * evaluator falls back to its own default (google/gemini-2.5-flash). Pass an
+   * Anthropic model here to score against ANTHROPIC_API_KEY.
+   */
+  judgeModel?: AvailableModel;
+  /**
+   * Client options (API key) for the judge model. Required alongside judgeModel
+   * when the judge's provider differs from the evaluator's own default —
+   * otherwise V3Evaluator defaults modelClientOptions.apiKey to the Gemini key,
+   * which is sent as the wrong provider's credential (e.g. an Anthropic judge
+   * receives the Gemini key and fails with "invalid x-api-key").
+   */
+  judgeClientOptions?: ClientOptions;
   /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
   successMode?: "outcome" | "process" | "both";
   /** Override trajectory persistence root. */
@@ -289,7 +308,13 @@ export async function runClaudeCodeAgent({
 
     const { V3Evaluator } = await import("@browserbasehq/stagehand");
     const { RubricCache } = await import("./rubricCache.js");
-    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
+    const evaluator = new V3Evaluator(verifier.v3, {
+      backend: "verifier",
+      ...(verifier.judgeModel && { modelName: verifier.judgeModel }),
+      ...(verifier.judgeClientOptions && {
+        modelClientOptions: verifier.judgeClientOptions,
+      }),
+    });
 
     // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
     let rubric = verifier.taskSpec.precomputedRubric;
diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts
index 8fdc14182..63050253f 100644
--- a/packages/evals/framework/claudeCodeToolAdapter.ts
+++ b/packages/evals/framework/claudeCodeToolAdapter.ts
@@ -326,19 +326,28 @@ export async function prepareBrowseCliHarnessAdapter(
     PATH: `${cwd}${path.delimiter}${process.env.PATH ?? ""}`,
   } as Record<string, string>;
 
+  const modeFlag = input.environment === "BROWSERBASE" ? "--remote" : "--local";
   await fsp.writeFile(
     wrapperPath,
     [
       "#!/usr/bin/env bash",
       "set -euo pipefail",
-      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} --json --session ${JSON.stringify(session)} "$@"`,
+      // The mode flag (--local/--remote) selects the environment when the daemon
+      // is first started and must be explicit so a set BROWSERBASE_API_KEY does
+      // not silently auto-select remote. It is only accepted by the driver
+      // commands, so skip it for the few subcommands that reject it (stop,
+      // status). The session name is safe on every command.
+      "cmd=${1:-}",
+      "mode=()",
+      'if [[ "$cmd" != "stop" && "$cmd" != "status" ]]; then',
+      `  mode=(${JSON.stringify(modeFlag)})`,
+      "fi",
+      `exec ${JSON.stringify(process.execPath)} ${JSON.stringify(BROWSE_CLI_ENTRYPOINT)} "$@" "\${mode[@]+\${mode[@]}}" --session ${JSON.stringify(session)}`,
       "",
     ].join("\n"),
     { mode: 0o755 },
   );
 
-  await runBrowseSetup(wrapperPath, input.environment, input.logger, env, cwd);
-
   return {
     toolSurface: "browse_cli",
     startupProfile: input.startupProfile,
@@ -1070,22 +1079,6 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string {
   ].join("\n");
 }
 
-async function runBrowseSetup(
-  wrapperPath: string,
-  environment: "LOCAL" | "BROWSERBASE",
-  logger: EvalLogger,
-  env: Record<string, string>,
-  cwd: string,
-): Promise<void> {
-  await runBrowseCommand(
-    wrapperPath,
-    ["env", environment === "BROWSERBASE" ? "remote" : "local"],
-    logger,
-    env,
-    cwd,
-  );
-}
-
 function buildBrowseCliPromptInstructions(
   plan: ExternalHarnessTaskPlan,
 ): string {
diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts
index fa23bf99e..6dae10775 100644
--- a/packages/evals/framework/externalHarnessPlan.ts
+++ b/packages/evals/framework/externalHarnessPlan.ts
@@ -1,3 +1,4 @@
+import { normalizeRubric, type Rubric } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalInput } from "../types/evals.js";
 
@@ -6,6 +7,14 @@ export interface ExternalHarnessTaskPlan {
   taskId?: string;
   startUrl: string;
   instruction: string;
+  /**
+   * Precomputed rubric carried by the dataset row (`precomputed_rubric`), if
+   * present. Threaded into the verifier's TaskSpec so it doesn't regenerate.
+   * Undefined when the row ships no rubric — the verifier generates one.
+   */
+  precomputedRubric?: Rubric;
+  /** Reference answer carried by the dataset row (`expectedAnswer`), if present. */
+  expectedAnswer?: string;
 }
 
 function readString(
@@ -16,6 +25,21 @@ function readString(
   return typeof value === "string" && value.length > 0 ? value : undefined;
 }
 
+/**
+ * Extracts the optional verifier inputs from a dataset row: the rubric stored
+ * under `precomputed_rubric` (passed through normalizeRubric, nullish results
+ * become undefined) and the non-empty string stored under `expectedAnswer`.
+ */
+function readVerifierFields(params: Record<string, unknown>): {
+  precomputedRubric?: Rubric;
+  expectedAnswer?: string;
+} {
+  const precomputedRubric =
+    normalizeRubric(params.precomputed_rubric) ?? undefined;
+  const expectedAnswer = readString(params, "expectedAnswer");
+  return { precomputedRubric, expectedAnswer };
+}
+
 export function buildExternalHarnessTaskPlan(
   input: EvalInput,
 ): ExternalHarnessTaskPlan {
@@ -34,6 +58,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "id"),
       startUrl,
       instruction,
+      ...readVerifierFields(params),
     };
   }
 
@@ -50,6 +75,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "task_id"),
       startUrl,
       instruction,
+      ...readVerifierFields(params),
     };
   }
 
@@ -65,6 +91,7 @@ export function buildExternalHarnessTaskPlan(
       taskId: readString(params, "id"),
       startUrl: readString(params, "web") ?? "https://www.google.com",
       instruction,
+      ...readVerifierFields(params),
     };
   }
 
diff --git a/packages/evals/tests/framework/verifierConfig.test.ts b/packages/evals/tests/framework/verifierConfig.test.ts
new file mode 100644
index 000000000..9f5597860
--- /dev/null
+++ b/packages/evals/tests/framework/verifierConfig.test.ts
@@ -0,0 +1,160 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { ExternalHarnessTaskPlan } from "../../framework/externalHarnessPlan.js";
+
+// Keep the real @browserbasehq/stagehand surface (loadApiKeyFromEnv,
+// providerEnvVarMap, etc. — the credential-resolution logic under test depends
+// on them) but replace V3 with a lightweight stub so the config-building path
+// never touches real LLM-provider/browser internals. buildClaudeCodeVerifierConfig
+// only uses the V3 instance as an inert LLM-client carrier.
+vi.mock("@browserbasehq/stagehand", async () => {
+  const actual = await vi.importActual<
+    typeof import("@browserbasehq/stagehand")
+  >("@browserbasehq/stagehand");
+  return {
+    ...actual,
+    V3: class {
+      opts: unknown;
+      constructor(opts: unknown) {
+        this.opts = opts;
+      }
+    },
+  };
+});
+
+// Mock the execute()-path collaborators so we can assert the fail-fast +
+// finally(cleanup) contract without spawning a real Claude Code agent.
+// vi.hoisted keeps these usable inside the hoisted vi.mock factories below.
+const { cleanupMock, runClaudeCodeAgentMock } = vi.hoisted(() => ({
+  cleanupMock: vi.fn(async () => {}),
+  runClaudeCodeAgentMock: vi.fn(async () => ({}) as never),
+}));
+
+vi.mock("../../framework/claudeCodeToolAdapter.js", () => ({
+  prepareClaudeCodeToolAdapter: vi.fn(async () => ({ cleanup: cleanupMock })),
+}));
+
+vi.mock("../../framework/claudeCodeRunner.js", () => ({
+  runClaudeCodeAgent: runClaudeCodeAgentMock,
+}));
+
+vi.mock("../../framework/externalHarnessPlan.js", () => ({
+  buildExternalHarnessTaskPlan: vi.fn(
+    (): ExternalHarnessTaskPlan => ({
+      dataset: "webvoyager",
+      taskId: "wv-1",
+      startUrl: "https://example.com",
+      instruction: "Find the checkout button",
+    }),
+  ),
+}));
+
+import {
+  buildClaudeCodeVerifierConfig,
+  claudeCodeHarness,
+} from "../../framework/benchHarness.js";
+import { EvalLogger } from "../../logger.js";
+import { EvalsError } from "../../errors.js";
+
+const plan: ExternalHarnessTaskPlan = {
+  dataset: "webvoyager",
+  taskId: "wv-1",
+  startUrl: "https://example.com",
+  instruction: "Find the checkout button",
+};
+
+// Env keys the verifier-config credential resolution reads. Snapshot + restore
+// so tests don't leak state into each other or the rest of the suite.
+const MANAGED_ENV = [
+  "EVAL_CLAUDE_CODE_VERIFIER",
+  "EVAL_CLAUDE_CODE_VERIFIER_MODEL",
+  "AI_GATEWAY_API_KEY",
+  "ANTHROPIC_API_KEY",
+  "OLLAMA_API_KEY",
+] as const;
+
+let savedEnv: Record<string, string | undefined>;
+
+beforeEach(() => {
+  savedEnv = {};
+  for (const key of MANAGED_ENV) {
+    savedEnv[key] = process.env[key];
+    delete process.env[key];
+  }
+  cleanupMock.mockClear();
+  runClaudeCodeAgentMock.mockClear();
+});
+
+afterEach(() => {
+  for (const key of MANAGED_ENV) {
+    if (savedEnv[key] === undefined) delete process.env[key];
+    else process.env[key] = savedEnv[key];
+  }
+});
+
+describe("buildClaudeCodeVerifierConfig judge credentials", () => {
+  it("builds a config for a keyless provider override (ollama) without an apiKey", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "ollama/llama3";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe("ollama/llama3");
+    // Keyless provider → no explicit apiKey is threaded through.
+    expect(config?.judgeClientOptions).toBeUndefined();
+  });
+
+  it("resolves AI_GATEWAY_API_KEY for a gateway/ judge override", () => {
+    process.env.AI_GATEWAY_API_KEY = "gw-test-key";
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL =
+      "gateway/anthropic/claude-sonnet-4-20250514";
+
+    const config = buildClaudeCodeVerifierConfig(plan, new EvalLogger(false));
+
+    expect(config).toBeDefined();
+    expect(config?.judgeModel).toBe(
+      "gateway/anthropic/claude-sonnet-4-20250514",
+    );
+    expect(config?.judgeClientOptions).toEqual({ apiKey: "gw-test-key" });
+  });
+
+  it("fail-fasts when a gateway/ judge override is missing AI_GATEWAY_API_KEY", () => {
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL = "gateway/some-model";
+
+    expect(() =>
+      buildClaudeCodeVerifierConfig(plan, new EvalLogger(false)),
+    ).toThrow(/AI_GATEWAY_API_KEY|no API key resolved/);
+  });
+});
+
+describe("claudeCodeHarness.execute verifier fail-fast", () => {
+  const makeExecuteInput = () => ({
+    task: {} as never,
+    input: { modelName: "anthropic/claude-sonnet-4-20250514" } as never,
+    row: {
+      config: {
+        harness: "claude_code" as const,
+        model: "anthropic/claude-sonnet-4-20250514" as never,
+        environment: "LOCAL" as const,
+        useApi: false,
+      },
+    } as never,
+    logger: new EvalLogger(false),
+  });
+
+  it("throws the config error but still runs toolAdapter.cleanup() (fail-fast inside try/finally)", async () => {
+    // Anthropic judge override with ANTHROPIC_API_KEY unset (cleared in beforeEach)
+    // → verifier config must throw, and the prepared adapter must still be cleaned up.
+    process.env.EVAL_CLAUDE_CODE_VERIFIER_MODEL =
+      "anthropic/claude-sonnet-4-20250514";
+
+    await expect(claudeCodeHarness.execute(makeExecuteInput())).rejects.toThrow(
+      EvalsError,
+    );
+
+    // The verifier construction was moved inside the try, so the finally that
+    // owns the adapter runs even when config resolution throws.
+    expect(cleanupMock).toHaveBeenCalledTimes(1);
+    // The agent must NOT have run — we failed fast before executing.
+    expect(runClaudeCodeAgentMock).not.toHaveBeenCalled();
+  });
+});