From 27a0863209c9ad2e8303f10687474e4043577423 Mon Sep 17 00:00:00 2001
From: Madhusudhanan <smkp84@gmail.com>
Date: Thu, 16 Apr 2026 06:34:53 +0530
Subject: [PATCH 1/3] feat: add support for multiple assertion models via
 assertionModels array

- Add assertionModels array option to ModelConfig for flexible model configuration
- Maintain backward compatibility with assertionPrimary/assertionSecondary
- Implement majority voting for 3+ models
- Add getAssertionModelsList() helper to resolve configured models
- Update assertion engine to support dynamic number of models
- Update config test to account for new assertionModels key

Closes #25
---
 src/__tests__/config.test.ts |   4 +-
 src/assertion.ts             | 427 +++++++++++++++++++++--------------
 src/config.ts                |  81 ++++++-
 3 files changed, 335 insertions(+), 177 deletions(-)
diff --git a/src/__tests__/config.test.ts b/src/__tests__/config.test.ts
index 5a6f22c..2bed72e 100644
--- a/src/__tests__/config.test.ts
+++ b/src/__tests__/config.test.ts
@@ -44,9 +44,9 @@ describe("config", () => {
     expect(getConfig().uploadBasePath).toBe("./second");
   });
 
-  it("getModelId returns default for each of the 7 keys", () => {
+  it("getModelId returns default for each of the 8 keys", () => {
     const keys = Object.keys(DEFAULT_MODELS) as Array<keyof typeof DEFAULT_MODELS>;
-    expect(keys).toHaveLength(7);
+    expect(keys).toHaveLength(8);
 
     for (const key of keys) {
       expect(getModelId(key)).toBe(DEFAULT_MODELS[key]);
diff --git a/src/assertion.ts b/src/assertion.ts
index 5188ea7..4917755 100644
--- a/src/assertion.ts
+++ b/src/assertion.ts
@@ -1,6 +1,6 @@
 import { generateText, ModelMessage, Output } from "ai";
 import { z } from "zod";
-import { getModelId } from "./config";
+import { getModelId, getAssertionModelsList } from "./config";
 import { ASSERTION_MODEL_TIMEOUT, THINKING_BUDGET_DEFAULT } from "./constants";
 import { logger } from "./logger";
 import { resolveModel } from "./models";
@@ -19,10 +19,199 @@ const assertionSchema = z.object({
     ),
 });
 
+type ModelAssertionFunction = () => Promise<AssertionResult>;
+
+/**
+ * Creates an assertion function for a specific model
+ */
+function createModelAssertionFunction(
+  modelId: string,
+  messages: ModelMessage[],
+  thinkingEnabled: boolean
+): ModelAssertionFunction {
+  return async (): Promise<AssertionResult> => {
+    // Check if it's a Google model (for provider-specific options)
+    const isGoogleModel = modelId.toLowerCase().includes('google') || modelId.toLowerCase().includes('gemini');
+    const isAnthropicModel = modelId.toLowerCase().includes('anthropic') || modelId.toLowerCase().includes('claude');
+    
+    const providerOptions: Record<string, any> = {};
+    
+    if (thinkingEnabled) {
+      if (isAnthropicModel) {
+        providerOptions.anthropic = {
+          thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT },
+        };
+      } else if (isGoogleModel) {
+        providerOptions.google = {
+          thinkingConfig: {
+            thinkingBudget: THINKING_BUDGET_DEFAULT,
+          },
+        };
+      }
+      
+      // Always include openrouter reasoning option as fallback
+      providerOptions.openrouter = {
+        reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
+      };
+    }
+    
+    const { output } = await generateText({
+      model: resolveModel(modelId),
+      temperature: 0,
+      providerOptions: Object.keys(providerOptions).length > 0 ? providerOptions : undefined,
+      messages,
+      output: Output.object({ schema: assertionSchema }),
+    });
+    
+    return output;
+  };
+}
+
+/**
+ * Creates an arbiter function to resolve disagreements between multiple model results
+ */
+function createArbiterFunction(
+  modelResults: Array<{ modelId: string; result: AssertionResult }>,
+  messages: ModelMessage[],
+  imageContent: Array<{ type: "image"; image: string }>,
+  assertion: string,
+  snapshot: string,
+  images?: string[]
+): ModelAssertionFunction {
+  return async (): Promise<AssertionResult> => {
+    const resultsSummary = modelResults
+      .map(({ modelId, result }) => `
+Model: ${modelId}
+- Assertion Passed: ${result.assertionPassed}
+- Confidence: ${result.confidenceScore}%
+- Reasoning: ${result.reasoning}
+`)
+      .join("\n");
+
+    const arbiterPrompt = `
+You are an AI arbiter tasked with resolving a disagreement between multiple AI models about an assertion.
+
+The following models evaluated the assertion and reached different conclusions:
+
+${resultsSummary}
+
+${!images
+        ? `
+<Snapshot>
+${snapshot}
+</Snapshot>
+`
+        : ""
+      }
+
+<Assertion>
+${assertion}
+</Assertion>
+
+Please carefully review the evidence (screenshot and accessibility snapshot when provided) and make the final determination. Consider all models' reasoning but make your own independent assessment.
+
+<Rules>
+- Make your own independent evaluation based on the evidence
+- Don't simply pick one model's answer - analyze the situation yourself
+- Provide clear reasoning for your decision
+- Be decisive - this is the final answer
+- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion.
+- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion.
+- Don't create additional assertion conditions on your own - only consider the exact assertion provided above.
+- The assertion should pass if either the screenshot or the accessibility snapshot supports it.
+- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching.
+- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ.
+</Rules>
+`;
+
+    const arbiterMessages: ModelMessage[] = [
+      {
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text: arbiterPrompt,
+          },
+          ...imageContent,
+        ],
+      },
+    ];
+
+    const arbiterModelId = getModelId("assertionArbiter");
+    
+    const { output } = await generateText({
+      model: resolveModel(arbiterModelId),
+      temperature: 0,
+      providerOptions: {
+        openrouter: {
+          reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
+        },
+      },
+      messages: arbiterMessages,
+      output: Output.object({ schema: assertionSchema }),
+    });
+
+    return output;
+  };
+}
+
+/**
+ * Checks if all models agree on the assertion result
+ */
+function checkConsensus(results: AssertionResult[]): {
+  hasConsensus: boolean;
+  consensusResult?: AssertionResult;
+  majorityResult?: AssertionResult;
+  passedCount: number;
+  failedCount: number;
+} {
+  const passedCount = results.filter(r => r.assertionPassed).length;
+  const failedCount = results.length - passedCount;
+  
+  // Unanimous consensus
+  if (passedCount === results.length || failedCount === results.length) {
+    const avgConfidence = results.reduce((sum, r) => sum + r.confidenceScore, 0) / results.length;
+    return {
+      hasConsensus: true,
+      consensusResult: {
+        assertionPassed: passedCount === results.length,
+        confidenceScore: Math.round(avgConfidence),
+        reasoning: results[0].reasoning, // Use first model's reasoning
+      },
+      passedCount,
+      failedCount,
+    };
+  }
+  
+  // Majority vote (for 3+ models)
+  if (results.length >= 3) {
+    const majorityPassed = passedCount > failedCount;
+    const majorityResults = results.filter(r => r.assertionPassed === majorityPassed);
+    const avgConfidence = majorityResults.reduce((sum, r) => sum + r.confidenceScore, 0) / majorityResults.length;
+    
+    return {
+      hasConsensus: false,
+      majorityResult: {
+        assertionPassed: majorityPassed,
+        confidenceScore: Math.round(avgConfidence),
+        reasoning: majorityResults[0].reasoning,
+      },
+      passedCount,
+      failedCount,
+    };
+  }
+  
+  return {
+    hasConsensus: false,
+    passedCount,
+    failedCount,
+  };
+}
+
 /**
  * Multi-model consensus assertion engine.
- * Runs Claude and Gemini in parallel; if they disagree, a third model (arbiter) makes the final call.
- * An assertion passes only if both models agree (or the arbiter decides).
+ * Runs multiple AI models in parallel; if they disagree, an arbiter model makes the final call.
+ * Supports both legacy primary/secondary and new assertionModels array configuration.
  * Automatically retries failed assertions once with a fresh page snapshot.
  *
  * @param options - Assertion configuration
@@ -38,6 +227,7 @@ const assertionSchema = z.object({
  *
  * @example
  * ```typescript
+ * // Using multiple models (new approach)
  * await assert({
  *   page,
  *   assertion: "The dashboard shows 3 active projects",
@@ -57,6 +247,13 @@ export const assert = async ({
   failSilently,
 }: AssertionOptions): Promise<string> => {
   const thinkingEnabled = effort === "high";
+  
+  // Get the list of models to use for assertions
+  const assertionModels = getAssertionModelsList();
+  
+  if (assertionModels.length === 0) {
+    throw new Error("No assertion models configured. Please configure at least one model for assertions.");
+  }
 
   const runFullAssertion = async (): Promise<AssertionResult> => {
     const snapshot = await safeSnapshot(page);
@@ -125,177 +322,67 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
       },
     ];
 
-    // Claude assertion function
-    const getClaudeAssertion = async (): Promise<AssertionResult> => {
-      // First get Claude's text response with thinking if enabled
-      const { text } = await generateText({
-        model: resolveModel(getModelId("assertionPrimary")),
-        temperature: 0,
-        providerOptions: thinkingEnabled
-          ? {
-            anthropic: {
-              thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT },
-            },
-            openrouter: {
-              reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
-            },
-          }
-          : undefined,
-        messages,
-      });
-
-      // Convert Claude's response to structured format using Haiku
-      const { output } = await generateText({
-        model: resolveModel(getModelId("assertionPrimary")),
-        temperature: 0.1,
-        prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
-        output: Output.object({ schema: assertionSchema }),
-      });
-
-      return output;
-    };
-
-    // Gemini assertion function
-    const getGeminiAssertion = async (): Promise<AssertionResult> => {
-      const { output } = await generateText({
-        model: resolveModel(getModelId("assertionSecondary")),
-        temperature: 0,
-        providerOptions: thinkingEnabled
-          ? {
-            google: {
-              thinkingConfig: {
-                thinkingBudget: THINKING_BUDGET_DEFAULT,
-              },
-            },
-            openrouter: {
-              reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
-            },
-          }
-          : undefined,
-        messages,
-        output: Output.object({ schema: assertionSchema }),
-      });
-
-      return output;
-    };
-
-    // Arbiter function using Gemini 2.5 Pro with thinking enabled
-    const getArbiterDecision = async (
-      claudeResult: AssertionResult,
-      geminiResult: AssertionResult,
-    ): Promise<AssertionResult> => {
-      const arbiterPrompt = `
-You are an AI arbiter tasked with resolving a disagreement between two AI models about an assertion.
-
-Claude's Assessment:
-- Assertion Passed: ${claudeResult.assertionPassed}
-- Confidence: ${claudeResult.confidenceScore}%
-- Reasoning: ${claudeResult.reasoning}
-
-Gemini's Assessment:
-- Assertion Passed: ${geminiResult.assertionPassed}
-- Confidence: ${geminiResult.confidenceScore}%
-- Reasoning: ${geminiResult.reasoning}
-
-${!images
-          ? `
-<Snapshot>
-${snapshot}
-</Snapshot>
-`
-          : ""
-        }
-
-<Assertion>
-${assertion}
-</Assertion>
-
-Please carefully review the evidence (screenshot and accessibility snapshot (when provided)) and make the final determination. Consider both models' reasoning but make your own independent assessment.
-
-<Rules>
-- Make your own independent evaluation based on the evidence
-- Don't simply pick one model's answer - analyze the situation yourself
-- Provide clear reasoning for your decision
-- Be decisive - this is the final answer
-- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion.
-- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion.
-- Don't create additional assertion conditions on your own - only consider the exact assertion provided above.
-- The assertion should pass if either the screenshot or the accessibility snapshot supports it.
-- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching.
-- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ.
-</Rules>
-`;
-
-      const arbiterMessages: ModelMessage[] = [
-        {
-          role: "user",
-          content: [
-            {
-              type: "text",
-              text: arbiterPrompt,
-            },
-            ...imageContent,
-          ],
-        },
-      ];
-
-      const { output } = await generateText({
-        model: resolveModel(getModelId("assertionArbiter")),
-        temperature: 0,
-        providerOptions: {
-          google: {
-            thinkingConfig: {
-              thinkingBudget: THINKING_BUDGET_DEFAULT,
-            },
-          },
-          openrouter: {
-            reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
-          },
-        },
-        messages: arbiterMessages,
-        output: Output.object({ schema: assertionSchema }),
-      });
-
-      return output;
-    };
-
     const runAssertion = async (attempt = 0): Promise<AssertionResult> => {
       try {
-        // Run both models in parallel for speed optimization
-        const [claudeResult, geminiResult] = await Promise.all([
-          withTimeout(getClaudeAssertion(), ASSERTION_MODEL_TIMEOUT),
-          withTimeout(getGeminiAssertion(), ASSERTION_MODEL_TIMEOUT),
-        ]);
-
-        // Check if models disagree on assertionPassed
-        if (claudeResult.assertionPassed !== geminiResult.assertionPassed) {
-          logger.debug("Models disagree on assertion result, consulting arbiter...");
-          const arbiterResult = await withTimeout(
-            getArbiterDecision(claudeResult, geminiResult),
-            ASSERTION_MODEL_TIMEOUT,
+        // Create assertion functions for all configured models
+        const assertionFunctions = assertionModels.map(modelId => 
+          createModelAssertionFunction(modelId, messages, thinkingEnabled)
+        );
+        
+        // Run all models in parallel
+        const results = await Promise.all(
+          assertionFunctions.map(fn => withTimeout(fn(), ASSERTION_MODEL_TIMEOUT))
+        );
+        
+        // Pair results with model IDs for potential arbiter use
+        const modelResults = assertionModels.map((modelId, index) => ({
+          modelId,
+          result: results[index],
+        }));
+        
+        // Check for consensus
+        const consensus = checkConsensus(results);
+        
+        if (consensus.hasConsensus && consensus.consensusResult) {
+          logger.debug(`All ${results.length} models agreed on assertion result`);
+          return consensus.consensusResult;
+        }
+        
+        // For 2 models with disagreement, use arbiter
+        if (results.length === 2 && results[0].assertionPassed !== results[1].assertionPassed) {
+          logger.debug("Two models disagree on assertion result, consulting arbiter...");
+          const arbiterFn = createArbiterFunction(
+            modelResults,
+            messages,
+            imageContent,
+            assertion,
+            snapshot,
+            images
           );
-
-          return {
-            assertionPassed: arbiterResult.assertionPassed,
-            confidenceScore: arbiterResult.confidenceScore,
-            reasoning: arbiterResult.reasoning,
-          };
+          return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT);
         }
-
-        // Assertion passes only if both models agree it should pass
-        const assertionPassed = claudeResult.assertionPassed && geminiResult.assertionPassed;
-
-        // Calculate average confidence score
-        const confidenceScore = (claudeResult.confidenceScore + geminiResult.confidenceScore) / 2;
-
-        // For now take Gemini's reasoning for simplicity
-        const reasoning = geminiResult.reasoning;
-
-        return {
-          assertionPassed,
-          confidenceScore: Math.round(confidenceScore),
-          reasoning,
-        };
+        
+        // For 3+ models with majority but not unanimous
+        if (results.length >= 3 && consensus.majorityResult) {
+          logger.debug(
+            `Majority vote: ${consensus.passedCount} passed, ${consensus.failedCount} failed. ` +
+            `Using majority result.`
+          );
+          return consensus.majorityResult;
+        }
+        
+        // Fallback: Use arbiter for any unresolved cases
+        logger.debug("Consulting arbiter for final decision...");
+        const arbiterFn = createArbiterFunction(
+          modelResults,
+          messages,
+          imageContent,
+          assertion,
+          snapshot,
+          images
+        );
+        return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT);
+        
       } catch (error) {
         if (attempt < 1) {
           logger.debug("Retrying assertion due to error...");
diff --git a/src/config.ts b/src/config.ts
index d53e6eb..fdd3b2b 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -22,18 +22,21 @@ export type ModelConfig = {
   assertionPrimary?: string;
   /** Model for assertions (secondary). Default: google/gemini-3-flash */
   assertionSecondary?: string;
+  /** Array of models to use for consensus assertions. When provided, overrides assertionPrimary/assertionSecondary */
+  assertionModels?: string[];
   /** Model for assertion arbiter. Default: google/gemini-3.1-pro-preview */
   assertionArbiter?: string;
   /** Model for data extraction, wait conditions, and lightweight tasks. Default: google/gemini-2.5-flash */
   utility?: string;
 };
 
-export const DEFAULT_MODELS: Required<ModelConfig> = {
+export const DEFAULT_MODELS: Required<Omit<ModelConfig, 'assertionModels'>> & Pick<ModelConfig, 'assertionModels'> = {
   stepExecution: "google/gemini-3-flash",
   userFlowLow: "google/gemini-3-flash",
   userFlowHigh: "google/gemini-3.1-pro-preview",
   assertionPrimary: "anthropic/claude-haiku-4.5",
   assertionSecondary: "google/gemini-3-flash",
+  assertionModels: undefined,
   assertionArbiter: "google/gemini-3.1-pro-preview",
   utility: "google/gemini-2.5-flash",
 };
@@ -58,14 +61,46 @@ let globalConfig: Config = {};
  *
  * @example
  * ```typescript
+ * // Using primary/secondary models (backward compatible)
  * configure({
- *   ai: { gateway: "none", models: { stepExecution: "google/gemini-3-flash" } },
- *   email: { domain: "test.com", extractContent: async ({ email, prompt }) => "..." },
+ *   ai: { 
+ *     gateway: "none", 
+ *     models: { 
+ *       assertionPrimary: "anthropic/claude-haiku-4.5",
+ *       assertionSecondary: "google/gemini-3-flash"
+ *     } 
+ *   },
+ * });
+ * 
+ * // Using multiple models array (new flexible approach)
+ * configure({
+ *   ai: { 
+ *     gateway: "openrouter", 
+ *     models: { 
+ *       assertionModels: [
+ *         "anthropic/claude-haiku-4.5",
+ *         "google/gemini-3-flash",
+ *         "meta-llama/llama-3.1-8b-instruct"
+ *       ],
+ *       assertionArbiter: "google/gemini-3.1-pro-preview"
+ *     } 
+ *   },
  * });
  * ```
  */
 export function configure(config: Config) {
   globalConfig = { ...globalConfig, ...config };
+  
+  // Validate assertion model configuration
+  const models = globalConfig.ai?.models;
+  if (models) {
+    const assertionModels = getAssertionModelsList(models);
+    if (assertionModels.length < 2) {
+      console.warn(
+        'Passmark: At least 2 assertion models are recommended for reliable consensus validation.'
+      );
+    }
+  }
 }
 
 /**
@@ -81,11 +116,47 @@ export function getConfig(): Config {
  * @param key - The model use case key (e.g. "stepExecution", "utility")
  * @returns The model identifier string (e.g. "google/gemini-3-flash")
  */
-export function getModelId(key: keyof ModelConfig): string {
+export function getModelId(key: keyof Omit<ModelConfig, 'assertionModels'>): string {
   return getConfig().ai?.models?.[key] ?? DEFAULT_MODELS[key];
 }
 
+/**
+ * Returns the list of assertion models from configuration.
+ * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary].
+ *
+ * @param models - The model configuration
+ * @returns Array of model identifiers for assertions
+ */
+export function getAssertionModelsList(models?: ModelConfig): string[] {
+  const configModels = models ?? getConfig().ai?.models;
+  
+  if (!configModels) {
+    return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary];
+  }
+  
+  // Prefer the new assertionModels array if provided
+  if (configModels.assertionModels && configModels.assertionModels.length > 0) {
+    return configModels.assertionModels;
+  }
+  
+  // Fall back to primary/secondary for backward compatibility
+  const models_list: string[] = [];
+  if (configModels.assertionPrimary) {
+    models_list.push(configModels.assertionPrimary);
+  }
+  if (configModels.assertionSecondary) {
+    models_list.push(configModels.assertionSecondary);
+  }
+  
+  // If nothing configured, use defaults
+  if (models_list.length === 0) {
+    return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary];
+  }
+  
+  return models_list;
+}
+
 /** @internal Reset config to empty state. Used for testing only. */
 export function resetConfig() {
   globalConfig = {};
-}
+}
\ No newline at end of file

From 4b327847773ef73d14f9653d756b3c8eaed0035d Mon Sep 17 00:00:00 2001
From: Madhusudhanan <smkp84@gmail.com>
Date: Thu, 16 Apr 2026 11:20:36 +0530
Subject: [PATCH 2/3] fix: address code review feedback for assertionModels PR

- Fix tie handling in majority voting (consult arbiter on ties)
- Preserve backward-compatible reasoning selection (with 2 models, use the secondary model's reasoning)
- Remove unreachable empty models check
- Use logger instead of console.warn
- Fix snake_case variable naming to camelCase
- Pass thinkingEnabled to arbiter for proper provider options
---
 src/assertion.ts | 51 +++++++++++++++++++++++++++++++++++-------------
 src/config.ts    | 14 +++++++------
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/src/assertion.ts b/src/assertion.ts
index 4917755..5b134de 100644
--- a/src/assertion.ts
+++ b/src/assertion.ts
@@ -76,7 +76,8 @@ function createArbiterFunction(
   imageContent: Array<{ type: "image"; image: string }>,
   assertion: string,
   snapshot: string,
-  images?: string[]
+  images: string[] | undefined,
+  thinkingEnabled: boolean = false
 ): ModelAssertionFunction {
   return async (): Promise<AssertionResult> => {
     const resultsSummary = modelResults
@@ -138,15 +139,26 @@ Please carefully review the evidence (screenshot and accessibility snapshot when
     ];
 
     const arbiterModelId = getModelId("assertionArbiter");
+    const isGoogleModel = arbiterModelId.toLowerCase().includes('google') || arbiterModelId.toLowerCase().includes('gemini');
+    
+    const providerOptions: Record<string, any> = {
+      openrouter: {
+        reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
+      },
+    };
+    
+    if (thinkingEnabled && isGoogleModel) {
+      providerOptions.google = {
+        thinkingConfig: {
+          thinkingBudget: THINKING_BUDGET_DEFAULT,
+        },
+      };
+    }
     
     const { output } = await generateText({
       model: resolveModel(arbiterModelId),
       temperature: 0,
-      providerOptions: {
-        openrouter: {
-          reasoning: { max_tokens: THINKING_BUDGET_DEFAULT },
-        },
-      },
+      providerOptions,
       messages: arbiterMessages,
       output: Output.object({ schema: assertionSchema }),
     });
@@ -171,12 +183,15 @@ function checkConsensus(results: AssertionResult[]): {
   // Unanimous consensus
   if (passedCount === results.length || failedCount === results.length) {
     const avgConfidence = results.reduce((sum, r) => sum + r.confidenceScore, 0) / results.length;
+    // For backward compatibility: 2 models use secondary (index 1), 3+ use first
+    const reasoningIndex = results.length === 2 ? 1 : 0;
+    
     return {
       hasConsensus: true,
       consensusResult: {
         assertionPassed: passedCount === results.length,
         confidenceScore: Math.round(avgConfidence),
-        reasoning: results[0].reasoning, // Use first model's reasoning
+        reasoning: results[reasoningIndex].reasoning,
       },
       passedCount,
       failedCount,
@@ -185,6 +200,16 @@ function checkConsensus(results: AssertionResult[]): {
   
   // Majority vote (for 3+ models)
   if (results.length >= 3) {
+    // Check for tie - if tie, return no majorityResult so arbiter is consulted
+    if (passedCount === failedCount) {
+      return {
+        hasConsensus: false,
+        passedCount,
+        failedCount,
+        // No majorityResult - this will trigger arbiter
+      };
+    }
+    
     const majorityPassed = passedCount > failedCount;
     const majorityResults = results.filter(r => r.assertionPassed === majorityPassed);
     const avgConfidence = majorityResults.reduce((sum, r) => sum + r.confidenceScore, 0) / majorityResults.length;
@@ -250,10 +275,6 @@ export const assert = async ({
   
   // Get the list of models to use for assertions
   const assertionModels = getAssertionModelsList();
-  
-  if (assertionModels.length === 0) {
-    throw new Error("No assertion models configured. Please configure at least one model for assertions.");
-  }
 
   const runFullAssertion = async (): Promise<AssertionResult> => {
     const snapshot = await safeSnapshot(page);
@@ -357,7 +378,8 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
             imageContent,
             assertion,
             snapshot,
-            images
+            images,
+            thinkingEnabled
           );
           return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT);
         }
@@ -371,7 +393,7 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
           return consensus.majorityResult;
         }
         
-        // Fallback: Use arbiter for any unresolved cases
+        // Fallback: Use arbiter for any unresolved cases (including ties)
         logger.debug("Consulting arbiter for final decision...");
         const arbiterFn = createArbiterFunction(
           modelResults,
@@ -379,7 +401,8 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
           imageContent,
           assertion,
           snapshot,
-          images
+          images,
+          thinkingEnabled
         );
         return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT);
         
diff --git a/src/config.ts b/src/config.ts
index fdd3b2b..a67b023 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -1,3 +1,5 @@
+import { logger } from "./logger";
+
 export type EmailProvider = {
   /** Domain for generating test emails (e.g. "emailsink.dev") */
   domain: string;
@@ -96,7 +98,7 @@ export function configure(config: Config) {
   if (models) {
     const assertionModels = getAssertionModelsList(models);
     if (assertionModels.length < 2) {
-      console.warn(
+      logger.warn(
         'Passmark: At least 2 assertion models are recommended for reliable consensus validation.'
       );
     }
@@ -140,20 +142,20 @@ export function getAssertionModelsList(models?: ModelConfig): string[] {
   }
   
   // Fall back to primary/secondary for backward compatibility
-  const models_list: string[] = [];
+  const modelsList: string[] = [];
   if (configModels.assertionPrimary) {
-    models_list.push(configModels.assertionPrimary);
+    modelsList.push(configModels.assertionPrimary);
   }
   if (configModels.assertionSecondary) {
-    models_list.push(configModels.assertionSecondary);
+    modelsList.push(configModels.assertionSecondary);
   }
   
   // If nothing configured, use defaults
-  if (models_list.length === 0) {
+  if (modelsList.length === 0) {
     return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary];
   }
   
-  return models_list;
+  return modelsList;
 }
 
 /** @internal Reset config to empty state. Used for testing only. */

From 9ee9892dcd2e87ecbf70b3d259e6d3ee4daa6225 Mon Sep 17 00:00:00 2001
From: Madhusudhanan <smkp84@gmail.com>
Date: Wed, 13 May 2026 18:50:57 +0530
Subject: [PATCH 3/3] fix: throw on insufficient assertion models, ensure
 backward compat

- Throw ConfigurationError instead of calling logger.warn when fewer than 2 assertion models are configured
- getAssertionModelsList now always returns 2 models when using primary/secondary,
  applying defaults for unset fields to prevent regressions
---
 src/config.ts | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/src/config.ts b/src/config.ts
index 5d6b7e8..b5910c4 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -1,5 +1,5 @@
 import { logger } from "./logger";
-
+import { ConfigurationError } from "./errors";
 import { initTelemetry } from "./instrumentation";
 
 export type EmailProvider = {
@@ -162,8 +162,8 @@ export function configure(config: Config) {
   if (models) {
     const assertionModels = getAssertionModelsList(models);
     if (assertionModels.length < 2) {
-      logger.warn(
-        'Passmark: At least 2 assertion models are recommended for reliable consensus validation.'
+      throw new ConfigurationError(
+        `Passmark: assertion consensus requires at least 2 models, got ${assertionModels.length}.`
       );
     }
   }
@@ -192,10 +192,10 @@ export function getModelId(key: keyof Omit<ModelConfig, 'assertionModels'>): str
 
 /**
  * Returns the list of assertion models from configuration.
- * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary].
+ * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary] with defaults.
  *
  * @param models - The model configuration
- * @returns Array of model identifiers for assertions
+ * @returns Array of model identifiers for assertions (exactly 2 when falling back to primary/secondary; an explicitly provided assertionModels array is returned as-is, even if empty)
  */
 export function getAssertionModelsList(models?: ModelConfig): string[] {
   const configModels = models ?? getConfig().ai?.models;
@@ -204,26 +204,15 @@ export function getAssertionModelsList(models?: ModelConfig): string[] {
     return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary];
   }
   
-  // Prefer the new assertionModels array if provided
-  if (configModels.assertionModels && configModels.assertionModels.length > 0) {
-    return configModels.assertionModels;
-  }
-  
-  // Fall back to primary/secondary for backward compatibility
-  const modelsList: string[] = [];
-  if (configModels.assertionPrimary) {
-    modelsList.push(configModels.assertionPrimary);
-  }
-  if (configModels.assertionSecondary) {
-    modelsList.push(configModels.assertionSecondary);
-  }
-  
-  // If nothing configured, use defaults
-  if (modelsList.length === 0) {
-    return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary];
+  // Prefer the new assertionModels array if explicitly provided (even if empty)
+  if (configModels.assertionModels !== undefined) {
+    return configModels.assertionModels; // caller will validate length
   }
   
-  return modelsList;
+  // Backward compatible: build from primary/secondary, applying defaults for unset ones
+  const primary = configModels.assertionPrimary ?? DEFAULT_MODELS.assertionPrimary;
+  const secondary = configModels.assertionSecondary ?? DEFAULT_MODELS.assertionSecondary;
+  return [primary, secondary];
 }
 
 /**