Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/odysseysbench-eval-suite.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand-evals": minor
---

Add OdysseysBench as a supported agent benchmark in the evals CLI. OdysseysBench is a 200-task web-agent benchmark (45 easy / 46 medium / 109 hard); each task ships a weighted rubric that is baked into the verifier's `precomputed_rubric` format so process + outcome are scored against the published criteria. Run with `--eval-name agent/odysseysbench` (or the `external_agent_benchmarks` category); supports `EVAL_ODYSSEYSBENCH_LIMIT`, `EVAL_ODYSSEYSBENCH_SAMPLE`, `EVAL_ODYSSEYSBENCH_LEVEL`, and `EVAL_ODYSSEYSBENCH_IDS`.
2 changes: 2 additions & 0 deletions packages/evals/cli-legacy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
"agent/webvoyager": ["external_agent_benchmarks"],
"agent/onlineMind2Web": ["external_agent_benchmarks"],
"agent/webtailbench": ["external_agent_benchmarks"],
"agent/odysseysbench": ["external_agent_benchmarks"],
};

/**
Expand Down Expand Up @@ -681,6 +682,7 @@ function handleRun(args: string[]): void {
osworld: "agent/osworld",
onlineMind2Web: "agent/onlineMind2Web",
webtailbench: "agent/webtailbench",
odysseysbench: "agent/odysseysbench",
};

evalName = benchmarkMap[benchmarkName];
Expand Down
200 changes: 200 additions & 0 deletions packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl

Large diffs are not rendered by default.

8,532 changes: 8,532 additions & 0 deletions packages/evals/datasets/odysseysbench/source/tasks.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions packages/evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
},
"webtailbench": {
"limit": 25
},
"odysseysbench": {
"limit": 25
}
}
}
2 changes: 2 additions & 0 deletions packages/evals/framework/benchPlanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { EvalsError } from "../errors.js";
import { buildOnlineMind2WebTestcases } from "../suites/onlineMind2Web.js";
import { buildWebTailBenchTestcases } from "../suites/webtailbench.js";
import { buildWebVoyagerTestcases } from "../suites/webvoyager.js";
import { buildOdysseysBenchTestcases } from "../suites/odysseysbench.js";
import {
getAgentModelEntries,
getModelList,
Expand Down Expand Up @@ -513,6 +514,7 @@ export function generateSuiteTestcases(
"agent/webvoyager": (models) => buildWebVoyagerTestcases(models),
"agent/onlineMind2Web": (models) => buildOnlineMind2WebTestcases(models),
"agent/webtailbench": (models) => buildWebTailBenchTestcases(models),
"agent/odysseysbench": (models) => buildOdysseysBenchTestcases(models),
};
const legacyOnlySuites = new Set(["agent/gaia"]);

Expand Down
1 change: 1 addition & 0 deletions packages/evals/framework/discovery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
"agent/webvoyager": ["external_agent_benchmarks"],
"agent/onlineMind2Web": ["external_agent_benchmarks"],
"agent/webtailbench": ["external_agent_benchmarks"],
"agent/odysseysbench": ["external_agent_benchmarks"],
};

function getTaskBasename(taskName: string): string {
Expand Down
19 changes: 17 additions & 2 deletions packages/evals/framework/externalHarnessPlan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { EvalsError } from "../errors.js";
import type { EvalInput } from "../types/evals.js";

export interface ExternalHarnessTaskPlan {
dataset: "webvoyager" | "onlineMind2Web" | "webtailbench";
dataset: "webvoyager" | "onlineMind2Web" | "webtailbench" | "odysseysbench";
taskId?: string;
startUrl: string;
instruction: string;
Expand Down Expand Up @@ -68,7 +68,22 @@ export function buildExternalHarnessTaskPlan(
};
}

if (input.name === "agent/odysseysbench") {

@cubic-dev-ai cubic-dev-ai Bot Jun 25, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P3: New OdysseysBench planner path lacks focused unit coverage. Add tests for success mapping and missing confirmed_task failure to lock behavior.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At packages/evals/framework/externalHarnessPlan.ts, line 71:

<comment>New OdysseysBench planner path lacks focused unit coverage. Add tests for success mapping and missing confirmed_task failure to lock behavior.</comment>

<file context>
@@ -68,7 +68,22 @@ export function buildExternalHarnessTaskPlan(
     };
   }
 
+  if (input.name === "agent/odysseysbench") {
+    const instruction = readString(params, "confirmed_task");
+    if (!instruction) {
</file context>
Fix with cubic

const instruction = readString(params, "confirmed_task");
if (!instruction) {
throw new EvalsError(
`Missing OdysseysBench params for external harness: expected confirmed_task.`,
);
}
return {
dataset: "odysseysbench",
taskId: readString(params, "task_id"),
startUrl: readString(params, "website") ?? "https://www.google.com",
instruction,
};
}

throw new EvalsError(
`External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench.`,
`External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench, agent/odysseysbench.`,
);
}
20 changes: 20 additions & 0 deletions packages/evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import { buildWebVoyagerTestcases } from "./suites/webvoyager.js";
import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web.js";
import { endBrowserbaseSession } from "./browserbaseCleanup.js";
import { buildWebTailBenchTestcases } from "./suites/webtailbench.js";
import { buildOdysseysBenchTestcases } from "./suites/odysseysbench.js";
import { getCurrentDirPath } from "./runtimePaths.js";

import dotenv from "dotenv";
Expand Down Expand Up @@ -252,6 +253,25 @@ const generateFilteredTestcases = (): Testcase[] => {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webtailbench");
}

// Special handling: fan out OdysseysBench dataset for agent/odysseysbench
const isOdysseysBenchTaskIncluded = taskNamesToRun.includes(
"agent/odysseysbench",
);

if (
isOdysseysBenchTaskIncluded &&
(!datasetFilter || datasetFilter === "odysseysbench")
) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
allTestcases.push(...buildOdysseysBenchTestcases(currentModels));
} else if (
isOdysseysBenchTaskIncluded &&
datasetFilter &&
datasetFilter !== "odysseysbench"
) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
}

// Create a list of all remaining testcases using the determined task names and models
const isAgentCategory =
effectiveCategory === "agent" ||
Expand Down
209 changes: 209 additions & 0 deletions packages/evals/scripts/build-odysseysbench-dataset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
/**
* Build packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl from the
* published OdysseysBench task set.
*
* OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark
* (45 easy / 46 medium / 109 hard). Every task ships a weighted rubric whose
* weights sum to 1.0. This script converts each task's `rubrics` map into the
* verifier's `precomputed_rubric` shape ({ items: [{ criterion, description,
* max_points }] }) so the suite can hand it straight to V3Evaluator.verify()
* without generating a rubric.
*
* Source of truth is the committed snapshot at
* packages/evals/datasets/odysseysbench/source/tasks.json
* (mirrored from https://odysseysbench.com/assets/data/tasks.json). Re-fetch
* with `--fetch` to refresh that snapshot before rebuilding.
*
* Run after pulling the branch (or whenever the source snapshot changes):
* pnpm tsx packages/evals/scripts/build-odysseysbench-dataset.ts
*
* Idempotent — regenerates the JSONL deterministically from the snapshot.
*/
import fs from "node:fs/promises";
import path from "node:path";

// Upstream location of the published task list; only contacted when the script is run with `--fetch`.
const SOURCE_URL = "https://odysseysbench.com/assets/data/tasks.json";

const DATASET_DIR = path.join(

@cubic-dev-ai cubic-dev-ai Bot Jun 25, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P3: Uses path.join for repo-internal dataset paths; this emits backslashes on Windows and violates the repo’s '/' path convention.

(Based on your team's feedback about forward-slash path separators.)

View Feedback

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At packages/evals/scripts/build-odysseysbench-dataset.ts, line 27:

<comment>Uses path.join for repo-internal dataset paths; this emits backslashes on Windows and violates the repo’s '/' path convention.

(Based on your team's feedback about forward-slash path separators.) .</comment>

<file context>
@@ -0,0 +1,151 @@
+
+const SOURCE_URL = "https://odysseysbench.com/assets/data/tasks.json";
+
+const DATASET_DIR = path.join(
+  path.resolve(import.meta.dirname, ".."),
+  "datasets",
</file context>
Fix with cubic

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Declining this one: path.join is the correct choice for runtime filesystem paths (it emits the OS-native separator, which is what fs wants on Windows), and it matches the sibling dev script scripts/backfill-webtailbench-rubrics.ts, which also uses path.join for its dataset paths. The forward-slash convention applies to in-code/URL/import paths; the suite loader that builds an embedded dataset path does use /. Keeping path.join here for consistency with the existing converter script.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback. This comment was influenced by this learning. Open the link to edit it, or reply here to edit or delete it.

path.resolve(import.meta.dirname, ".."),
"datasets",
"odysseysbench",
);
// Committed snapshot of the upstream task list; read by loadSource() and overwritten when run with `--fetch`.
const SOURCE_PATH = path.join(DATASET_DIR, "source", "tasks.json");
// Generated dataset written by main(): one JSON object per line, one line per task.
const JSONL_PATH = path.join(DATASET_DIR, "OdysseysBench_data.jsonl");

/** One entry of a task's `rubrics` map as it appears in the source tasks.json. */
interface SourceRubric {
/** Criterion text; toRubricItem() uses it as `criterion` and as the first part of `description`. */
requirement: string;
/** Grader guidance; toRubricItem() appends it to `description`. */
verification: string;
/** Relative weight of this criterion; main() rejects values outside (0, 1]. */
weight: number;
}

/** Shape of one task record in the source snapshot (tasks.json). */
interface SourceTask {
/** Task identifier; main() aborts when it is missing or not a non-empty string. */
task_id: string;
/** Task instruction text; main() aborts when it is missing or empty. */
confirmed_task: string;
/** Copied to the output row unchanged; not validated by this script. */
website: string;
/** Copied to the output row unchanged; not validated by this script. */
reference_length: number;
/** Difficulty bucket; copied to the output row and tallied in main()'s final log line. */
level: "easy" | "medium" | "hard";
/** Rubric entries keyed by id (e.g. R1, R2, … R10); main() requires at least one. */
rubrics: Record<string, SourceRubric>;
/** Copied to the output row only when it is a non-empty array. */
categories?: string[];
/** Present in the source but not carried into the output row. */
num_categories?: number;
}

/** One element of `precomputed_rubric.items`, produced by toRubricItem(). */
interface RubricItem {
/** The source rubric's `requirement` text. */
criterion: string;
/** The requirement followed by the source rubric's verification guidance. */
description: string;
/** Source weight scaled by POINT_SCALE and rounded, never less than 1. */
max_points: number;
}

/** One line of OdysseysBench_data.jsonl as serialized by main(). */
interface OutputRow {
task_id: string;
confirmed_task: string;
website: string;
level: "easy" | "medium" | "hard";
reference_length: number;
/** Omitted entirely when the source task has no (or an empty) `categories` array. */
categories?: string[];
/** Rubric items in the order returned by sortRubricKeys(). */
precomputed_rubric: { items: RubricItem[] };
}

/**
 * Return a sorted copy of `keys` in which ids such as R1, R2, … R10 are
 * ordered by their numeric part instead of character by character.
 *
 * Two keys are compared numerically only when both yield a finite number after
 * their leading non-digit prefix is removed and those numbers differ; every
 * other pair falls back to `localeCompare`. The input array is not mutated.
 */
function sortRubricKeys(keys: string[]): string[] {
  // Strip the leading non-digit prefix ("R" in "R10") and parse what is left.
  const numericPart = (key: string): number =>
    Number.parseInt(key.replace(/^\D+/, ""), 10);

  const ordered = keys.slice();
  ordered.sort((left, right) => {
    const leftNum = numericPart(left);
    const rightNum = numericPart(right);
    const comparableAsNumbers =
      Number.isFinite(leftNum) && Number.isFinite(rightNum);
    if (comparableAsNumbers && leftNum !== rightNum) {
      return leftNum - rightNum;
    }
    return left.localeCompare(right);
  });
  return ordered;
}

const POINT_SCALE = 1000;

/**
 * Map a single OdysseysBench rubric entry onto the verifier's rubric-item shape.
 *
 * The source `weight` (a fraction of the task total) becomes an integer point
 * value by multiplying with POINT_SCALE and rounding. Only the ratio between
 * items matters for scoring, but rounding does not scale uniformly, so a small
 * multiplier would skew the relative weight of low-weight criteria; ×1000 keeps
 * that distortion small. The result is clamped to at least 1 as a safety net
 * that valid weights are not expected to hit.
 *
 * @param r source rubric entry (requirement, verification, weight)
 * @returns item with `criterion`, `description` and `max_points`, in that key order
 */
function toRubricItem(r: SourceRubric): RubricItem {
  const { requirement, verification, weight } = r;
  const scaledPoints = Math.round(weight * POINT_SCALE);
  const item: RubricItem = {
    criterion: requirement,
    description: `${requirement}\n\nHow a grader verifies this: ${verification}`,
    max_points: Math.max(1, scaledPoints),
  };
  return item;
}

/**
 * Load the OdysseysBench task list.
 *
 * Without `--fetch` on the command line, reads and parses the committed
 * snapshot at SOURCE_PATH. With `--fetch`, downloads SOURCE_URL, and only after
 * the body has parsed as a non-empty JSON array writes it to SOURCE_PATH —
 * so an error page or truncated response served with a 2xx status cannot
 * overwrite the committed snapshot with unusable content.
 *
 * @returns the parsed JSON cast to `SourceTask[]`; per-task fields are not
 *   validated here (main() performs those checks).
 * @throws Error when the fetch response is not OK or the fetched body is not a
 *   non-empty array; SyntaxError when the text is not valid JSON; file-system
 *   errors from reading or writing the snapshot propagate unchanged.
 */
async function loadSource(): Promise<SourceTask[]> {
  if (!process.argv.includes("--fetch")) {
    const snapshot = await fs.readFile(SOURCE_PATH, "utf8");
    return JSON.parse(snapshot) as SourceTask[];
  }

  const res = await fetch(SOURCE_URL);
  if (!res.ok) {
    throw new Error(`Failed to fetch ${SOURCE_URL}: ${res.status}`);
  }
  const text = await res.text();

  // Parse and shape-check BEFORE touching the snapshot on disk.
  const parsed: unknown = JSON.parse(text);
  if (!Array.isArray(parsed) || parsed.length === 0) {
    throw new Error(
      `Fetched ${SOURCE_URL} is empty or not an array; snapshot left unchanged`,
    );
  }

  await fs.mkdir(path.dirname(SOURCE_PATH), { recursive: true });
  await fs.writeFile(SOURCE_PATH, text);
  console.log(`Refreshed snapshot: ${SOURCE_PATH}`);
  return parsed as SourceTask[];
}

/**
 * Rebuild OdysseysBench_data.jsonl from the source task list.
 *
 * Loads the tasks through loadSource(), validates every task and every rubric
 * entry (throwing on the first violation), converts each task's rubric map into
 * `precomputed_rubric.items` ordered by sortRubricKeys(), and writes one JSON
 * object per line to JSONL_PATH. Finishes by logging the row count and a
 * per-level tally.
 *
 * @throws Error when the source is empty or not an array; when a task lacks a
 *   string task_id, a confirmed_task, or any rubrics; when a rubric entry has
 *   an empty requirement/verification or a weight outside (0, 1]; or when a
 *   task's weights do not sum to 1.0 within ±0.02.
 */
async function main(): Promise<void> {
  const tasks = await loadSource();
  if (!Array.isArray(tasks) || tasks.length === 0) {
    throw new Error("Source tasks.json is empty or not an array");
  }

  const lines: string[] = [];
  for (const task of tasks) {
    // Fail loud on an upstream schema change rather than silently emitting a
    // row the suite validator would later drop (shrinking the benchmark).
    if (typeof task.task_id !== "string" || !task.task_id) {
      throw new Error(
        `Task is missing a string task_id: ${JSON.stringify(task).slice(0, 200)}`,
      );
    }
    if (typeof task.confirmed_task !== "string" || !task.confirmed_task) {
      throw new Error(`Task ${task.task_id} is missing confirmed_task`);
    }
    const rubricKeys = sortRubricKeys(Object.keys(task.rubrics ?? {}));
    if (rubricKeys.length === 0) {
      throw new Error(`Task ${task.task_id} has no rubrics`);
    }
    // Validate each rubric entry individually — an aggregate sum check alone
    // lets a bad weight (e.g. negative, offset by a larger one) or an empty
    // requirement/verification slip through and produce a mis-weighted or
    // malformed item. Fail loud on source schema drift instead.
    for (const k of rubricKeys) {
      const r = task.rubrics[k];
      if (typeof r?.requirement !== "string" || !r.requirement.trim()) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has an empty requirement`,
        );
      }
      if (typeof r.verification !== "string" || !r.verification.trim()) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has an empty verification`,
        );
      }
      if (
        typeof r.weight !== "number" ||
        !Number.isFinite(r.weight) ||
        r.weight <= 0 ||
        r.weight > 1
      ) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has invalid weight ${r.weight}; expected a number in (0, 1]`,
        );
      }
    }
    // The published weights are a normalized distribution; a re-fetched snapshot
    // that breaks that convention would silently mis-weight the rubric.
    const weightSum = rubricKeys.reduce(
      (acc, k) => acc + task.rubrics[k].weight,
      0,
    );
    // Tolerance of 0.02 absorbs floating-point drift in the published weights.
    if (Math.abs(weightSum - 1) > 0.02) {
      throw new Error(
        `Task ${task.task_id} rubric weights sum to ${weightSum}, expected ~1.0`,
      );
    }
    const items = rubricKeys.map((k) => toRubricItem(task.rubrics[k]));

    // Key order here is the key order of every serialized JSONL row.
    const row: OutputRow = {
      task_id: task.task_id,
      confirmed_task: task.confirmed_task,
      website: task.website,
      level: task.level,
      reference_length: task.reference_length,
      ...(Array.isArray(task.categories) && task.categories.length > 0
        ? { categories: task.categories }
        : {}),
      precomputed_rubric: { items },
    };
    lines.push(JSON.stringify(row));
  }

  // Defensive invariant: every task either produced a row or threw above.
  if (lines.length !== tasks.length) {
    throw new Error(
      `Expected ${tasks.length} rows, produced ${lines.length} — a task was dropped`,
    );
  }

  await fs.writeFile(JSONL_PATH, lines.join("\n") + "\n");
  const byLevel = tasks.reduce<Record<string, number>>((acc, t) => {
    acc[t.level] = (acc[t.level] ?? 0) + 1;
    return acc;
  }, {});
  console.log(
    `Wrote ${lines.length} rows to ${JSONL_PATH} (${JSON.stringify(byLevel)})`,
  );
}

// Script entry point: run the build; on any rejection print the error and
// terminate the process with exit status 1.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
Loading
Loading