From e96c92ac871c1abde32a8df56f51bb2c5235069f Mon Sep 17 00:00:00 2001
From: Ipseeta
Date: Tue, 12 May 2026 18:50:16 +0530
Subject: [PATCH] Add consensusPolicy: fail-on-disagreement option for assertions

---
Reviewer notes (this region is ignored by `git am`; it is not part of the commit):

* What the patch does: adds `assertions.consensusPolicy` to the config, with
  `getConsensusPolicy()` defaulting to "consult-arbiter-on-disagreement". When
  set to "fail-on-disagreement", the disagreement branch in src/assertion.ts
  returns a failed result (lower of the two confidence scores, both models'
  reasoning) without calling the arbiter.
* Line structure was restored from a copy in which the whole patch had been
  collapsed onto a few lines. Hunk line counts were re-checked against every
  `@@` header and against the diffstat below. The added line
  `geminiResult.confidenceScore,` had lost its leading `+` and has it back.
* Leading indentation and blank context lines were reconstructed (2-space
  indent assumed). If a hunk's context does not match the tree, apply with
  `git am --ignore-whitespace` / `git apply --ignore-whitespace`.
* Behaviour note: src/assertion.ts only tests for the exact string
  "fail-on-disagreement"; any other value falls through to the arbiter path.
  A misspelt policy from an untyped (plain JS) caller therefore behaves like
  the default rather than raising an error.

 README.md                       | 21 ++++++++-
 src/__tests__/assertion.test.ts | 78 +++++++++++++++++++++++++++++++++
 src/assertion.ts                | 22 +++++++++-
 src/config.ts                   | 31 +++++++++++++
 4 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d27e120..dc7d863 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,7 @@ const result = await runUserFlow({
 
 ### `assert(options: AssertionOptions)`
 
-Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates.
+Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates (configurable — see [Consensus Policy](#consensus-policy)).
 
 ```typescript
 const result = await assert({
@@ -226,6 +226,25 @@ const result = await assert({
 });
 ```
 
+### Consensus Policy
+
+When the primary (Claude) and secondary (Gemini) assertion models reach the same verdict, the result is used directly. When they **disagree**, you choose how Passmark resolves it:
+
+| Policy | Behavior |
+|---|---|
+| `consult-arbiter-on-disagreement` *(default)* | Calls the arbiter model (Gemini 3.1 Pro) to break the tie. |
+| `fail-on-disagreement` | Treats any disagreement as a failure immediately — no arbiter call. The returned reasoning includes both models' takes so you can inspect what they saw differently. |
+
+Pick `fail-on-disagreement` when you'd rather surface ambiguity/flakiness in the UI under test than let a single model swing the result. Pick the default when you trust the arbiter to make the final call.
+
+```typescript
+configure({
+  assertions: {
+    consensusPolicy: "fail-on-disagreement",
+  },
+});
+```
+
 ### Video Assertions
 
 For UI that's only visible for a second or two — toast messages, snackbar confirmations, transient banners — a single end-of-flow screenshot often misses the evidence. Set `video: true` on an assertion inside `runSteps` and Passmark will record the entire step run with `page.screencast`, upload the resulting `.webm` to Gemini's Files API, and evaluate the assertion against the full video:
diff --git a/src/__tests__/assertion.test.ts b/src/__tests__/assertion.test.ts
index 636b1e8..583add3 100644
--- a/src/__tests__/assertion.test.ts
+++ b/src/__tests__/assertion.test.ts
@@ -36,6 +36,7 @@ vi.mock("../utils", () => ({
 }));
 
 import { assert } from "../assertion";
+import { configure, resetConfig } from "../config";
 import { withTimeout } from "../utils";
 import { generateText } from "ai";
 
@@ -85,6 +86,7 @@ function makeGenerateTextImpl(opts: {
 
 beforeEach(() => {
   vi.clearAllMocks();
+  resetConfig();
 });
 
 describe("assert consensus logic", () => {
@@ -233,3 +235,79 @@ describe("assert consensus logic", () => {
     expect(res).toContain("✅ passed");
   });
 });
+
+describe("consensusPolicy", () => {
+  it('fails on disagreement when policy is "fail-on-disagreement" and skips the arbiter', async () => {
+    configure({ assertions: { consensusPolicy: "fail-on-disagreement" } });
+
+    const page = createMockPage();
+    let arbiterCalled = false;
+
+    vi.mocked(generateText).mockImplementation((async (args: any) => {
+      const model = String(args.model ?? "");
+      const wantsStructured = Boolean(args.output);
+      if (!wantsStructured) return { text: "claude text" } as any;
+      if (model.includes("anthropic")) {
+        return { output: { assertionPassed: true, confidenceScore: 90, reasoning: "Claude says pass" } } as any;
+      }
+      if (model.includes("3.1-pro-preview")) {
+        arbiterCalled = true;
+        return { output: { assertionPassed: true, confidenceScore: 80, reasoning: "Arbiter should NOT be called" } } as any;
+      }
+      if (model.includes("gemini-3-flash")) {
+        return { output: { assertionPassed: false, confidenceScore: 70, reasoning: "Gemini says fail" } } as any;
+      }
+      return { output: { assertionPassed: false, confidenceScore: 0, reasoning: "unknown" } } as any;
+    }) as any);
+
+    const res = await assert({
+      page,
+      assertion: "The page shows 3 items",
+      test: mockTest,
+      expect: ((a: unknown, _m?: string) => ({ toBe: (_v: unknown) => {} })) as any,
+      failSilently: true,
+      maxRetries: 0, // skip the outer retry loop so we observe a single attempt
+    });
+
+    expect(arbiterCalled).toBe(false);
+    expect(res).toContain("❌ failed");
+    expect(res).toContain("Claude says pass");
+    expect(res).toContain("Gemini says fail");
+    expect(res).toContain("fail-on-disagreement");
+  });
+
+  it("still consults the arbiter on disagreement when policy is the default", async () => {
+    // No configure() — should use default "consult-arbiter-on-disagreement"
+    const page = createMockPage();
+    let arbiterCalled = false;
+
+    vi.mocked(generateText).mockImplementation((async (args: any) => {
+      const model = String(args.model ?? "");
+      const wantsStructured = Boolean(args.output);
+      if (!wantsStructured) return { text: "claude text" } as any;
+      if (model.includes("anthropic")) {
+        return { output: { assertionPassed: true, confidenceScore: 90, reasoning: "Claude says pass" } } as any;
+      }
+      if (model.includes("3.1-pro-preview")) {
+        arbiterCalled = true;
+        return { output: { assertionPassed: true, confidenceScore: 75, reasoning: "Arbiter: pass" } } as any;
+      }
+      if (model.includes("gemini-3-flash")) {
+        return { output: { assertionPassed: false, confidenceScore: 70, reasoning: "Gemini says fail" } } as any;
+      }
+      return { output: { assertionPassed: false, confidenceScore: 0, reasoning: "unknown" } } as any;
+    }) as any);
+
+    const res = await assert({
+      page,
+      assertion: "The page shows 3 items",
+      test: mockTest,
+      expect: ((a: unknown, _m?: string) => ({ toBe: (_v: unknown) => {} })) as any,
+      failSilently: true,
+    });
+
+    expect(arbiterCalled).toBe(true);
+    expect(res).toContain("✅ passed");
+    expect(res).toContain("Arbiter: pass");
+  });
+});
diff --git a/src/assertion.ts b/src/assertion.ts
index 0173b4a..b83152b 100644
--- a/src/assertion.ts
+++ b/src/assertion.ts
@@ -1,6 +1,6 @@
 import { generateText, ModelMessage, Output } from "ai";
 import { z } from "zod";
-import { getModelId } from "./config";
+import { getConsensusPolicy, getModelId } from "./config";
 import { ASSERTION_MODEL_TIMEOUT, THINKING_BUDGET_DEFAULT } from "./constants";
 import { logger } from "./logger";
 import { resolveModel } from "./models";
@@ -312,6 +312,26 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
 
   // Check if models disagree on assertionPassed
   if (claudeResult.assertionPassed !== geminiResult.assertionPassed) {
+    const policy = getConsensusPolicy();
+
+    if (policy === "fail-on-disagreement") {
+      logger.debug(
+        "Models disagree on assertion result; failing per consensusPolicy=fail-on-disagreement.",
+      );
+      const lower = Math.min(
+        claudeResult.confidenceScore,
+        geminiResult.confidenceScore,
+      );
+      return {
+        assertionPassed: false,
+        confidenceScore: Math.round(lower),
+        reasoning:
+          `Assertion failed: models disagreed and consensusPolicy is "fail-on-disagreement".\n` +
+          `Claude (passed=${claudeResult.assertionPassed}, ${claudeResult.confidenceScore}%): ${claudeResult.reasoning}\n` +
+          `Gemini (passed=${geminiResult.assertionPassed}, ${geminiResult.confidenceScore}%): ${geminiResult.reasoning}`,
+      };
+    }
+
     logger.debug("Models disagree on assertion result, consulting arbiter...");
     const arbiterResult = await withTimeout(
       getArbiterDecision(claudeResult, geminiResult),
diff --git a/src/config.ts b/src/config.ts
index dffa078..f47cc98 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -76,6 +76,27 @@ export type RedisConfig = {
   url?: string;
 };
 
+/**
+ * Policy for resolving disagreements between the primary and secondary
+ * assertion models.
+ * - "consult-arbiter-on-disagreement" (default): a third arbiter model
+ *   makes the final call. Best when you trust the arbiter to break ties.
+ * - "fail-on-disagreement": any disagreement fails the assertion
+ *   immediately. Strictest possible setting — useful when you'd rather
+ *   surface flakiness/ambiguity than risk a single model being wrong.
+ */
+export type ConsensusPolicy =
+  | "consult-arbiter-on-disagreement"
+  | "fail-on-disagreement";
+
+export type AssertionsConfig = {
+  /**
+   * How to resolve disagreements between the primary and secondary
+   * assertion models. Defaults to "consult-arbiter-on-disagreement".
+   */
+  consensusPolicy?: ConsensusPolicy;
+};
+
 export type TelemetryConfig = {
   /**
    * Axiom API token for OpenTelemetry tracing of AI calls.
@@ -98,6 +119,8 @@ type Config = {
   redis?: RedisConfig;
   /** Telemetry (Axiom) connection. When omitted, falls back to `AXIOM_TOKEN`/`AXIOM_DATASET` env vars. */
   telemetry?: TelemetryConfig;
+  /** Behavior of the multi-model assertion consensus engine. */
+  assertions?: AssertionsConfig;
   /**
    * Directory used to temporarily store video recordings for video-flagged
    * assertions. Defaults to `/tmp/passmark-recordings`. Files are deleted
@@ -161,6 +184,14 @@ export function getMode(): AIMode {
   return getConfig().ai?.mode ?? "snapshot";
 }
 
+/**
+ * Returns the effective consensus policy. Defaults to
+ * "consult-arbiter-on-disagreement" so existing users see no change.
+ */
+export function getConsensusPolicy(): ConsensusPolicy {
+  return getConfig().assertions?.consensusPolicy ?? "consult-arbiter-on-disagreement";
+}
+
 /**
  * Effective AI config for a single step / call after merging overrides with
  * the global config. `getModelId` looks up a model with the same precedence