browserbase · miguelg719 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.changeset/odysseysbench-eval-suite.md b/.changeset/odysseysbench-eval-suite.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand-evals": minor
+---
+
+Add OdysseysBench as a supported agent benchmark in the evals CLI. OdysseysBench is a 200-task web-agent benchmark (45 easy / 46 medium / 109 hard); each task ships a weighted rubric that is baked into the verifier's `precomputed_rubric` format so process + outcome are scored against the published criteria. Run with `--eval-name agent/odysseysbench` (or the `external_agent_benchmarks` category); supports `EVAL_ODYSSEYSBENCH_LIMIT`, `EVAL_ODYSSEYSBENCH_SAMPLE`, `EVAL_ODYSSEYSBENCH_LEVEL`, and `EVAL_ODYSSEYSBENCH_IDS`.
diff --git a/packages/evals/cli-legacy.ts b/packages/evals/cli-legacy.ts
@@ -100,6 +100,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
   "agent/webvoyager": ["external_agent_benchmarks"],
   "agent/onlineMind2Web": ["external_agent_benchmarks"],
   "agent/webtailbench": ["external_agent_benchmarks"],
+  "agent/odysseysbench": ["external_agent_benchmarks"],
 };
 
 /**
@@ -681,6 +682,7 @@ function handleRun(args: string[]): void {
         osworld: "agent/osworld",
         onlineMind2Web: "agent/onlineMind2Web",
         webtailbench: "agent/webtailbench",
+        odysseysbench: "agent/odysseysbench",
       };
 
       evalName = benchmarkMap[benchmarkName];

diff --git a/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl b/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl
diff --git a/packages/evals/datasets/odysseysbench/source/tasks.json b/packages/evals/datasets/odysseysbench/source/tasks.json
diff --git a/packages/evals/evals.config.json b/packages/evals/evals.config.json
@@ -17,6 +17,9 @@
     },
     "webtailbench": {
       "limit": 25
+    },
+    "odysseysbench": {
+      "limit": 25
     }
   }
 }
diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts
@@ -3,6 +3,7 @@ import { EvalsError } from "../errors.js";
 import { buildOnlineMind2WebTestcases } from "../suites/onlineMind2Web.js";
 import { buildWebTailBenchTestcases } from "../suites/webtailbench.js";
 import { buildWebVoyagerTestcases } from "../suites/webvoyager.js";
+import { buildOdysseysBenchTestcases } from "../suites/odysseysbench.js";
 import {
   getAgentModelEntries,
   getModelList,
@@ -513,6 +514,7 @@ export function generateSuiteTestcases(
     "agent/webvoyager": (models) => buildWebVoyagerTestcases(models),
     "agent/onlineMind2Web": (models) => buildOnlineMind2WebTestcases(models),
     "agent/webtailbench": (models) => buildWebTailBenchTestcases(models),
+    "agent/odysseysbench": (models) => buildOdysseysBenchTestcases(models),
   };
   const legacyOnlySuites = new Set(["agent/gaia"]);
 

diff --git a/packages/evals/framework/discovery.ts b/packages/evals/framework/discovery.ts
@@ -63,6 +63,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
   "agent/webvoyager": ["external_agent_benchmarks"],
   "agent/onlineMind2Web": ["external_agent_benchmarks"],
   "agent/webtailbench": ["external_agent_benchmarks"],
+  "agent/odysseysbench": ["external_agent_benchmarks"],
 };
 
 function getTaskBasename(taskName: string): string {

diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts
@@ -2,7 +2,7 @@ import { EvalsError } from "../errors.js";
 import type { EvalInput } from "../types/evals.js";
 
 export interface ExternalHarnessTaskPlan {
-  dataset: "webvoyager" | "onlineMind2Web" | "webtailbench";
+  dataset: "webvoyager" | "onlineMind2Web" | "webtailbench" | "odysseysbench";
   taskId?: string;
   startUrl: string;
   instruction: string;
@@ -68,7 +68,22 @@ export function buildExternalHarnessTaskPlan(
     };
   }
 
+  if (input.name === "agent/odysseysbench") {
+    const instruction = readString(params, "confirmed_task");
+    if (!instruction) {
+      throw new EvalsError(
+        `Missing OdysseysBench params for external harness: expected confirmed_task.`,
+      );
+    }
+    return {
+      dataset: "odysseysbench",
+      taskId: readString(params, "task_id"),
+      startUrl: readString(params, "website") ?? "https://www.google.com",
+      instruction,
+    };
+  }
+
   throw new EvalsError(
-    `External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench.`,
+    `External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench, agent/odysseysbench.`,
   );
 }
diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts
@@ -52,6 +52,7 @@ import { buildWebVoyagerTestcases } from "./suites/webvoyager.js";
 import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web.js";
 import { endBrowserbaseSession } from "./browserbaseCleanup.js";
 import { buildWebTailBenchTestcases } from "./suites/webtailbench.js";
+import { buildOdysseysBenchTestcases } from "./suites/odysseysbench.js";
 import { getCurrentDirPath } from "./runtimePaths.js";
 
 import dotenv from "dotenv";
@@ -252,6 +253,25 @@ const generateFilteredTestcases = (): Testcase[] => {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webtailbench");
   }
 
+  // Special handling: fan out OdysseysBench dataset for agent/odysseysbench
+  const isOdysseysBenchTaskIncluded = taskNamesToRun.includes(
+    "agent/odysseysbench",
+  );
+
+  if (
+    isOdysseysBenchTaskIncluded &&
+    (!datasetFilter || datasetFilter === "odysseysbench")
+  ) {
+    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
+    allTestcases.push(...buildOdysseysBenchTestcases(currentModels));
+  } else if (
+    isOdysseysBenchTaskIncluded &&
+    datasetFilter &&
+    datasetFilter !== "odysseysbench"
+  ) {
+    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
+  }
+
   // Create a list of all remaining testcases using the determined task names and models
   const isAgentCategory =
     effectiveCategory === "agent" ||

diff --git a/packages/evals/scripts/build-odysseysbench-dataset.ts b/packages/evals/scripts/build-odysseysbench-dataset.ts
@@ -0,0 +1,209 @@
+/**
+ * Build packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl from the
+ * published OdysseysBench task set.
+ *
+ * OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark
+ * (45 easy / 46 medium / 109 hard). Every task ships a weighted rubric whose
+ * weights sum to 1.0. This script converts each task's `rubrics` map into the
+ * verifier's `precomputed_rubric` shape ({ items: [{ criterion, description,
+ * max_points }] }) so the suite can hand it straight to V3Evaluator.verify()
+ * without generating a rubric.
+ *
+ * Source of truth is the committed snapshot at
+ *   packages/evals/datasets/odysseysbench/source/tasks.json
+ * (mirrored from https://odysseysbench.com/assets/data/tasks.json). Re-fetch
+ * with `--fetch` to refresh that snapshot before rebuilding.
+ *
+ * Run after pulling the branch (or whenever the source snapshot changes):
+ *   pnpm tsx packages/evals/scripts/build-odysseysbench-dataset.ts
+ *
+ * Idempotent — regenerates the JSONL deterministically from the snapshot.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+/** Upstream location of the published task set; only requested when the script is run with `--fetch`. */
+const SOURCE_URL = "https://odysseysbench.com/assets/data/tasks.json";
+
+/** `datasets/odysseysbench` directory, resolved relative to the parent of this script's own directory. */
+const DATASET_DIR = path.join(
+  path.resolve(import.meta.dirname, ".."),
+  "datasets",
+  "odysseysbench",
+);
+/** Local snapshot of the upstream tasks.json that the build reads from. */
+const SOURCE_PATH = path.join(DATASET_DIR, "source", "tasks.json");
+/** Generated JSONL file this script writes (one task per line). */
+const JSONL_PATH = path.join(DATASET_DIR, "OdysseysBench_data.jsonl");
+
+/** One rubric entry as it appears under a task's `rubrics` map in the source tasks.json. */
+interface SourceRubric {
+  // Becomes the output item's `criterion` and the first part of its `description`.
+  requirement: string;
+  // Appended to the output item's `description` as the grader's verification hint.
+  verification: string;
+  // Relative weight; the build rejects values outside (0, 1] and scales the rest to integer points.
+  weight: number;
+}
+
+/** One task as parsed from the source tasks.json. */
+interface SourceTask {
+  task_id: string;
+  confirmed_task: string;
+  website: string;
+  reference_length: number;
+  level: "easy" | "medium" | "hard";
+  // Keyed by rubric id (e.g. R1, R2, … R10); entries are emitted in numeric key order.
+  rubrics: Record<string, SourceRubric>;
+  // Copied to the output row only when it is a non-empty array.
+  categories?: string[];
+  // Not copied to the output row.
+  num_categories?: number;
+}
+
+/** One entry of the `precomputed_rubric.items` array written to the JSONL. */
+interface RubricItem {
+  criterion: string;
+  description: string;
+  // Integer points derived from the source weight (never below 1).
+  max_points: number;
+}
+
+/** Shape of a single serialized line in OdysseysBench_data.jsonl. */
+interface OutputRow {
+  task_id: string;
+  confirmed_task: string;
+  website: string;
+  level: "easy" | "medium" | "hard";
+  reference_length: number;
+  categories?: string[];
+  precomputed_rubric: { items: RubricItem[] };
+}
+
+/**
+ * Return a sorted copy of the rubric keys (the input array is left untouched).
+ *
+ * Keys are compared by the integer that follows their leading non-digit
+ * prefix, so R2 sorts before R10. Whenever either key has no such integer, or
+ * both integers are equal, the comparison falls back to `localeCompare`.
+ */
+function sortRubricKeys(keys: string[]): string[] {
+  const numericPart = (key: string): number =>
+    Number.parseInt(key.replace(/^\D+/, ""), 10);
+
+  const compareKeys = (left: string, right: string): number => {
+    const leftNum = numericPart(left);
+    const rightNum = numericPart(right);
+    const bothNumeric = Number.isFinite(leftNum) && Number.isFinite(rightNum);
+    if (bothNumeric && leftNum !== rightNum) {
+      return leftNum - rightNum;
+    }
+    return left.localeCompare(right);
+  };
+
+  return keys.slice().sort(compareKeys);
+}
+
+/** Multiplier that turns a fractional rubric weight into integer points. */
+const POINT_SCALE = 1000;
+
+/**
+ * Map a single OdysseysBench rubric entry onto the verifier's rubric-item shape.
+ *
+ * The entry's `weight` is multiplied by POINT_SCALE and rounded to the nearest
+ * integer. Rounding does not scale every weight uniformly, so a finer scale
+ * keeps the relative weighting of small criteria closer to the published
+ * values than a coarse one (e.g. ×100) would. The result is floored at 1 point
+ * as a defensive measure.
+ *
+ * The description carries the requirement followed by the grader's
+ * verification note.
+ */
+function toRubricItem(r: SourceRubric): RubricItem {
+  const { requirement, verification, weight } = r;
+  const scaledPoints = Math.round(weight * POINT_SCALE);
+  const description = `${requirement}\n\nHow a grader verifies this: ${verification}`;
+  return {
+    criterion: requirement,
+    description,
+    max_points: Math.max(1, scaledPoints),
+  };
+}
+
+/**
+ * Load the OdysseysBench source tasks.
+ *
+ * Without `--fetch` on the command line, reads and parses the local snapshot
+ * at SOURCE_PATH. With `--fetch`, downloads SOURCE_URL, and only after the
+ * payload has parsed as a non-empty JSON array overwrites the snapshot with
+ * it. Parsing before writing means a bad response (an HTML error page served
+ * with a 200, a truncated body, an empty list) cannot clobber the committed
+ * snapshot.
+ *
+ * @returns The parsed task list. Individual tasks are NOT validated here; the
+ *   result is only cast to `SourceTask[]`.
+ * @throws Error when the HTTP response is not OK, or when the fetched payload
+ *   is empty or not an array. JSON syntax errors and filesystem errors
+ *   propagate unchanged.
+ */
+async function loadSource(): Promise<SourceTask[]> {
+  if (!process.argv.includes("--fetch")) {
+    const snapshot = await fs.readFile(SOURCE_PATH, "utf8");
+    return JSON.parse(snapshot) as SourceTask[];
+  }
+
+  const res = await fetch(SOURCE_URL);
+  if (!res.ok) {
+    throw new Error(`Failed to fetch ${SOURCE_URL}: ${res.status}`);
+  }
+  const text = await res.text();
+
+  // Validate BEFORE touching the snapshot so a failed refresh is side-effect free.
+  const parsed: unknown = JSON.parse(text);
+  if (!Array.isArray(parsed) || parsed.length === 0) {
+    throw new Error(
+      `Fetched ${SOURCE_URL} is empty or not a JSON array; snapshot left untouched`,
+    );
+  }
+
+  await fs.mkdir(path.dirname(SOURCE_PATH), { recursive: true });
+  await fs.writeFile(SOURCE_PATH, text);
+  console.log(`Refreshed snapshot: ${SOURCE_PATH}`);
+  return parsed as SourceTask[];
+}
+
+/** Difficulty tiers a task is allowed to declare. */
+const VALID_LEVELS: ReadonlySet<string> = new Set(["easy", "medium", "hard"]);
+
+/**
+ * Validate one source task and convert it into the row written to the JSONL.
+ *
+ * Fails loud on upstream schema drift rather than silently emitting a row the
+ * suite validator would later drop (shrinking the benchmark).
+ *
+ * @param task - One entry parsed from the source tasks.json.
+ * @returns The output row, with rubric items in numeric rubric-key order.
+ * @throws Error when `task_id` or `confirmed_task` is missing, `level` is not
+ *   one of easy/medium/hard, the task has no rubrics, any rubric entry has an
+ *   empty requirement/verification or a weight outside (0, 1], or the weights
+ *   do not sum to ~1.0 (±0.02).
+ */
+function toOutputRow(task: SourceTask): OutputRow {
+  if (typeof task.task_id !== "string" || !task.task_id) {
+    throw new Error(
+      `Task is missing a string task_id: ${JSON.stringify(task).slice(0, 200)}`,
+    );
+  }
+  if (typeof task.confirmed_task !== "string" || !task.confirmed_task) {
+    throw new Error(`Task ${task.task_id} is missing confirmed_task`);
+  }
+  // `level` is what the per-level summary and level filtering key on; an
+  // unexpected value would otherwise pass straight through into the dataset.
+  if (!VALID_LEVELS.has(task.level)) {
+    throw new Error(
+      `Task ${task.task_id} has invalid level ${JSON.stringify(task.level)}; expected easy, medium, or hard`,
+    );
+  }
+
+  const rubrics = task.rubrics ?? {};
+  const rubricKeys = sortRubricKeys(Object.keys(rubrics));
+  if (rubricKeys.length === 0) {
+    throw new Error(`Task ${task.task_id} has no rubrics`);
+  }
+
+  // Validate each rubric entry individually — an aggregate sum check alone
+  // lets a bad weight (e.g. negative, offset by a larger one) or an empty
+  // requirement/verification slip through and produce a mis-weighted or
+  // malformed item.
+  let weightSum = 0;
+  for (const k of rubricKeys) {
+    const r = rubrics[k];
+    if (typeof r?.requirement !== "string" || !r.requirement.trim()) {
+      throw new Error(
+        `Task ${task.task_id} rubric ${k} has an empty requirement`,
+      );
+    }
+    if (typeof r.verification !== "string" || !r.verification.trim()) {
+      throw new Error(
+        `Task ${task.task_id} rubric ${k} has an empty verification`,
+      );
+    }
+    if (
+      typeof r.weight !== "number" ||
+      !Number.isFinite(r.weight) ||
+      r.weight <= 0 ||
+      r.weight > 1
+    ) {
+      throw new Error(
+        `Task ${task.task_id} rubric ${k} has invalid weight ${r.weight}; expected a number in (0, 1]`,
+      );
+    }
+    weightSum += r.weight;
+  }
+
+  // The published weights are a normalized distribution; a re-fetched snapshot
+  // that breaks that convention would silently mis-weight the rubric.
+  if (Math.abs(weightSum - 1) > 0.02) {
+    throw new Error(
+      `Task ${task.task_id} rubric weights sum to ${weightSum}, expected ~1.0`,
+    );
+  }
+
+  return {
+    task_id: task.task_id,
+    confirmed_task: task.confirmed_task,
+    website: task.website,
+    level: task.level,
+    reference_length: task.reference_length,
+    // Omit `categories` entirely unless the source provides a non-empty array.
+    ...(Array.isArray(task.categories) && task.categories.length > 0
+      ? { categories: task.categories }
+      : {}),
+    precomputed_rubric: { items: rubricKeys.map((k) => toRubricItem(rubrics[k])) },
+  };
+}
+
+/**
+ * Script entry point: load the source tasks, validate and convert every one,
+ * write them to JSONL_PATH as one JSON object per line (with a trailing
+ * newline), and log the row count together with a per-level breakdown.
+ *
+ * Every task maps to exactly one row or aborts the run, so the output can
+ * never contain fewer rows than the source has tasks.
+ *
+ * @throws Error when the source is empty or not an array, or when any task
+ *   fails validation; nothing is written in that case.
+ */
+async function main(): Promise<void> {
+  const tasks = await loadSource();
+  if (!Array.isArray(tasks) || tasks.length === 0) {
+    throw new Error("Source tasks.json is empty or not an array");
+  }
+
+  const lines = tasks.map((task) => JSON.stringify(toOutputRow(task)));
+
+  await fs.writeFile(JSONL_PATH, lines.join("\n") + "\n");
+  const byLevel = tasks.reduce<Record<string, number>>((acc, t) => {
+    acc[t.level] = (acc[t.level] ?? 0) + 1;
+    return acc;
+  }, {});
+  console.log(
+    `Wrote ${lines.length} rows to ${JSONL_PATH} (${JSON.stringify(byLevel)})`,
+  );
+}
+
+// Surface any failure and exit non-zero so callers (CI, pnpm) see the build failed.
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});