From e96c92ac871c1abde32a8df56f51bb2c5235069f Mon Sep 17 00:00:00 2001
From: Ipseeta <ipseeta.pkar@gmail.com>
Date: Tue, 12 May 2026 18:50:16 +0530
Subject: [PATCH] Add consensusPolicy: fail-on-disagreement option for
 assertions

---
 README.md                       | 21 ++++++++-
 src/__tests__/assertion.test.ts | 78 +++++++++++++++++++++++++++++++++
 src/assertion.ts                | 22 +++++++++-
 src/config.ts                   | 31 +++++++++++++
 4 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d27e120..dc7d863 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,7 @@ const result = await runUserFlow({
 
 ### `assert(options: AssertionOptions)`
 
-Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates.
+Multi-model consensus assertion. Runs Claude and Gemini in parallel; if they disagree, a third model arbitrates (configurable — see [Consensus Policy](#consensus-policy)).
 
 ```typescript
 const result = await assert({
@@ -226,6 +226,25 @@ const result = await assert({
 });
 ```
 
+### Consensus Policy
+
+When the primary (Claude) and secondary (Gemini) assertion models reach the same verdict, the result is used directly. When they **disagree**, you choose how Passmark resolves it:
+
+| Policy | Behavior |
+|---|---|
+| `consult-arbiter-on-disagreement` *(default)* | Calls the arbiter model (Gemini 3.1 Pro) to break the tie. |
+| `fail-on-disagreement` | Treats any disagreement as a failure immediately — no arbiter call. The returned reasoning includes each model's verdict, confidence score, and reasoning so you can inspect what they saw differently; the reported confidence is the lower of the two scores. |
+
+Pick `fail-on-disagreement` when you'd rather surface ambiguity/flakiness in the UI under test than let a single model swing the result. Pick the default when you trust the arbiter to make the final call.
+
+```typescript
+configure({
+  assertions: {
+    consensusPolicy: "fail-on-disagreement",
+  },
+});
+```
+
 ### Video Assertions
 
 For UI that's only visible for a second or two — toast messages, snackbar confirmations, transient banners — a single end-of-flow screenshot often misses the evidence. Set `video: true` on an assertion inside `runSteps` and Passmark will record the entire step run with `page.screencast`, upload the resulting `.webm` to Gemini's Files API, and evaluate the assertion against the full video:
diff --git a/src/__tests__/assertion.test.ts b/src/__tests__/assertion.test.ts
index 636b1e8..583add3 100644
--- a/src/__tests__/assertion.test.ts
+++ b/src/__tests__/assertion.test.ts
@@ -36,6 +36,7 @@ vi.mock("../utils", () => ({
 }));
 
 import { assert } from "../assertion";
+import { configure, resetConfig } from "../config";
 import { withTimeout } from "../utils";
 import { generateText } from "ai";
 
@@ -85,6 +86,7 @@ function makeGenerateTextImpl(opts: {
 
 beforeEach(() => {
   vi.clearAllMocks();
+  resetConfig();
 });
 
 describe("assert consensus logic", () => {
@@ -233,3 +235,79 @@ describe("assert consensus logic", () => {
     expect(res).toContain("✅ passed");
   });
 });
+
+describe("consensusPolicy", () => {
+  it('fails on disagreement when policy is "fail-on-disagreement" and skips the arbiter', async () => {
+    configure({ assertions: { consensusPolicy: "fail-on-disagreement" } });
+
+    const page = createMockPage();
+    let arbiterCalled = false;
+
+    vi.mocked(generateText).mockImplementation((async (args: any) => {
+      const model = String(args.model ?? "");
+      const wantsStructured = Boolean(args.output);
+      if (!wantsStructured) return { text: "claude text" } as any;
+      if (model.includes("anthropic")) {
+        return { output: { assertionPassed: true, confidenceScore: 90, reasoning: "Claude says pass" } } as any;
+      }
+      if (model.includes("3.1-pro-preview")) {
+        arbiterCalled = true;
+        return { output: { assertionPassed: true, confidenceScore: 80, reasoning: "Arbiter should NOT be called" } } as any;
+      }
+      if (model.includes("gemini-3-flash")) {
+        return { output: { assertionPassed: false, confidenceScore: 70, reasoning: "Gemini says fail" } } as any;
+      }
+      return { output: { assertionPassed: false, confidenceScore: 0, reasoning: "unknown" } } as any;
+    }) as any);
+
+    const res = await assert({
+      page,
+      assertion: "The page shows 3 items",
+      test: mockTest,
+      expect: ((a: unknown, _m?: string) => ({ toBe: (_v: unknown) => {} })) as any,
+      failSilently: true,
+      maxRetries: 0, // skip the outer retry loop so we observe a single attempt
+    });
+
+    expect(arbiterCalled).toBe(false);
+    expect(res).toContain("❌ failed");
+    expect(res).toContain("Claude says pass");
+    expect(res).toContain("Gemini says fail");
+    expect(res).toContain("fail-on-disagreement");
+  });
+
+  it("still consults the arbiter on disagreement when policy is the default", async () => {
+    // No configure() call here: relies on resetConfig() in beforeEach to clear the previous test's policy, so the default "consult-arbiter-on-disagreement" applies.
+    const page = createMockPage();
+    let arbiterCalled = false;
+
+    vi.mocked(generateText).mockImplementation((async (args: any) => {
+      const model = String(args.model ?? "");
+      const wantsStructured = Boolean(args.output);
+      if (!wantsStructured) return { text: "claude text" } as any;
+      if (model.includes("anthropic")) {
+        return { output: { assertionPassed: true, confidenceScore: 90, reasoning: "Claude says pass" } } as any;
+      }
+      if (model.includes("3.1-pro-preview")) {
+        arbiterCalled = true;
+        return { output: { assertionPassed: true, confidenceScore: 75, reasoning: "Arbiter: pass" } } as any;
+      }
+      if (model.includes("gemini-3-flash")) {
+        return { output: { assertionPassed: false, confidenceScore: 70, reasoning: "Gemini says fail" } } as any;
+      }
+      return { output: { assertionPassed: false, confidenceScore: 0, reasoning: "unknown" } } as any;
+    }) as any);
+
+    const res = await assert({
+      page,
+      assertion: "The page shows 3 items",
+      test: mockTest,
+      expect: ((a: unknown, _m?: string) => ({ toBe: (_v: unknown) => {} })) as any,
+      failSilently: true,
+    });
+
+    expect(arbiterCalled).toBe(true);
+    expect(res).toContain("✅ passed");
+    expect(res).toContain("Arbiter: pass");
+  });
+});
diff --git a/src/assertion.ts b/src/assertion.ts
index 0173b4a..b83152b 100644
--- a/src/assertion.ts
+++ b/src/assertion.ts
@@ -1,6 +1,6 @@
 import { generateText, ModelMessage, Output } from "ai";
 import { z } from "zod";
-import { getModelId } from "./config";
+import { getConsensusPolicy, getModelId } from "./config";
 import { ASSERTION_MODEL_TIMEOUT, THINKING_BUDGET_DEFAULT } from "./constants";
 import { logger } from "./logger";
 import { resolveModel } from "./models";
@@ -312,6 +312,26 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
 
         // Check if models disagree on assertionPassed
         if (claudeResult.assertionPassed !== geminiResult.assertionPassed) {
+          const policy = getConsensusPolicy();
+
+          if (policy === "fail-on-disagreement") {
+            logger.debug(
+              "Models disagree on assertion result; failing per consensusPolicy=fail-on-disagreement.",
+            );
+            const lower = Math.min(
+              claudeResult.confidenceScore,
+              geminiResult.confidenceScore,
+            );
+            return {
+              assertionPassed: false,
+              confidenceScore: Math.round(lower),
+              reasoning:
+                `Assertion failed: models disagreed and consensusPolicy is "fail-on-disagreement".\n` +
+                `Claude (passed=${claudeResult.assertionPassed}, ${claudeResult.confidenceScore}%): ${claudeResult.reasoning}\n` +
+                `Gemini (passed=${geminiResult.assertionPassed}, ${geminiResult.confidenceScore}%): ${geminiResult.reasoning}`,
+            };
+          }
+
           logger.debug("Models disagree on assertion result, consulting arbiter...");
           const arbiterResult = await withTimeout(
             getArbiterDecision(claudeResult, geminiResult),
diff --git a/src/config.ts b/src/config.ts
index dffa078..f47cc98 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -76,6 +76,27 @@ export type RedisConfig = {
   url?: string;
 };
 
+/**
+ * Policy for resolving disagreements between the primary and secondary
+ * assertion models.
+ * - "consult-arbiter-on-disagreement" (default): a third arbiter model
+ *   makes the final call. Best when you trust the arbiter to break ties.
+ * - "fail-on-disagreement": any disagreement fails the assertion
+ *   immediately, without calling the arbiter. Useful when you'd rather
+ *   surface flakiness/ambiguity than let a single model swing the result.
+ */
+export type ConsensusPolicy =
+  | "consult-arbiter-on-disagreement"
+  | "fail-on-disagreement";
+
+export type AssertionsConfig = {
+  /**
+   * How to resolve disagreements between the primary and secondary
+   * assertion models. Defaults to "consult-arbiter-on-disagreement".
+   */
+  consensusPolicy?: ConsensusPolicy;
+};
+
 export type TelemetryConfig = {
   /**
    * Axiom API token for OpenTelemetry tracing of AI calls.
@@ -98,6 +119,8 @@ type Config = {
   redis?: RedisConfig;
   /** Telemetry (Axiom) connection. When omitted, falls back to `AXIOM_TOKEN`/`AXIOM_DATASET` env vars. */
   telemetry?: TelemetryConfig;
+  /** Behavior of the multi-model assertion consensus engine. */
+  assertions?: AssertionsConfig;
   /**
    * Directory used to temporarily store video recordings for video-flagged
    * assertions. Defaults to `/tmp/passmark-recordings`. Files are deleted
@@ -161,6 +184,14 @@ export function getMode(): AIMode {
   return getConfig().ai?.mode ?? "snapshot";
 }
 
+/**
+ * Returns the effective consensus policy. Falls back to "consult-arbiter-on-disagreement"
+ * when `assertions.consensusPolicy` is unset, so existing users see no change.
+ */
+export function getConsensusPolicy(): ConsensusPolicy {
+  return getConfig().assertions?.consensusPolicy ?? "consult-arbiter-on-disagreement";
+}
+
 /**
  * Effective AI config for a single step / call after merging overrides with
  * the global config. `getModelId` looks up a model with the same precedence