From 27a0863209c9ad2e8303f10687474e4043577423 Mon Sep 17 00:00:00 2001 From: Madhusudhanan Date: Thu, 16 Apr 2026 06:34:53 +0530 Subject: [PATCH 1/3] feat: add support for multiple assertion models via assertionModels array - Add assertionModels array option to ModelConfig for flexible model configuration - Maintain backward compatibility with assertionPrimary/assertionSecondary - Implement majority voting for 3+ models - Add getAssertionModelsList() helper to resolve configured models - Update assertion engine to support dynamic number of models - Update config test to account for new assertionModels key Closes #25 --- src/__tests__/config.test.ts | 4 +- src/assertion.ts | 427 +++++++++++++++++++++-------------- src/config.ts | 81 ++++++- 3 files changed, 335 insertions(+), 177 deletions(-) diff --git a/src/__tests__/config.test.ts b/src/__tests__/config.test.ts index 5a6f22c..2bed72e 100644 --- a/src/__tests__/config.test.ts +++ b/src/__tests__/config.test.ts @@ -44,9 +44,9 @@ describe("config", () => { expect(getConfig().uploadBasePath).toBe("./second"); }); - it("getModelId returns default for each of the 7 keys", () => { + it("getModelId returns default for each of the 8 keys", () => { const keys = Object.keys(DEFAULT_MODELS) as Array; - expect(keys).toHaveLength(7); + expect(keys).toHaveLength(8); for (const key of keys) { expect(getModelId(key)).toBe(DEFAULT_MODELS[key]); diff --git a/src/assertion.ts b/src/assertion.ts index 5188ea7..4917755 100644 --- a/src/assertion.ts +++ b/src/assertion.ts @@ -1,6 +1,6 @@ import { generateText, ModelMessage, Output } from "ai"; import { z } from "zod"; -import { getModelId } from "./config"; +import { getModelId, getAssertionModelsList } from "./config"; import { ASSERTION_MODEL_TIMEOUT, THINKING_BUDGET_DEFAULT } from "./constants"; import { logger } from "./logger"; import { resolveModel } from "./models"; @@ -19,10 +19,199 @@ const assertionSchema = z.object({ ), }); +type ModelAssertionFunction = () => Promise; + +/** + * Creates an assertion function for a specific model + */ +function createModelAssertionFunction( + modelId: string, + messages: ModelMessage[], + thinkingEnabled: boolean +): ModelAssertionFunction { + return async (): Promise => { + // Check if it's a Google model (for provider-specific options) + const isGoogleModel = modelId.toLowerCase().includes('google') || modelId.toLowerCase().includes('gemini'); + const isAnthropicModel = modelId.toLowerCase().includes('anthropic') || modelId.toLowerCase().includes('claude'); + + const providerOptions: Record = {}; + + if (thinkingEnabled) { + if (isAnthropicModel) { + providerOptions.anthropic = { + thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT }, + }; + } else if (isGoogleModel) { + providerOptions.google = { + thinkingConfig: { + thinkingBudget: THINKING_BUDGET_DEFAULT, + }, + }; + } + + // Always include openrouter reasoning option as fallback + providerOptions.openrouter = { + reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, + }; + } + + const { output } = await generateText({ + model: resolveModel(modelId), + temperature: 0, + providerOptions: Object.keys(providerOptions).length > 0 ? providerOptions : undefined, + messages, + output: Output.object({ schema: assertionSchema }), + }); + + return output; + }; +} + +/** + * Creates an arbiter function to resolve disagreements between multiple model results + */ +function createArbiterFunction( + modelResults: Array<{ modelId: string; result: AssertionResult }>, + messages: ModelMessage[], + imageContent: Array<{ type: "image"; image: string }>, + assertion: string, + snapshot: string, + images?: string[] +): ModelAssertionFunction { + return async (): Promise => { + const resultsSummary = modelResults + .map(({ modelId, result }) => ` +Model: ${modelId} +- Assertion Passed: ${result.assertionPassed} +- Confidence: ${result.confidenceScore}% +- Reasoning: ${result.reasoning} +`) + .join("\n"); + + const arbiterPrompt = ` +You are an AI arbiter tasked with resolving a disagreement between multiple AI models about an assertion. + +The following models evaluated the assertion and reached different conclusions: + +${resultsSummary} + +${!images + ? ` + +${snapshot} + +` + : "" + } + + +${assertion} + + +Please carefully review the evidence (screenshot and accessibility snapshot when provided) and make the final determination. Consider all models' reasoning but make your own independent assessment. + + +- Make your own independent evaluation based on the evidence +- Don't simply pick one model's answer - analyze the situation yourself +- Provide clear reasoning for your decision +- Be decisive - this is the final answer +- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion. +- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion. +- Don't create additional assertion conditions on your own - only consider the exact assertion provided above. +- The assertion should pass if either the screenshot or the accessibility snapshot supports it. +- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching. +- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ. + +`; + + const arbiterMessages: ModelMessage[] = [ + { + role: "user", + content: [ + { + type: "text", + text: arbiterPrompt, + }, + ...imageContent, + ], + }, + ]; + + const arbiterModelId = getModelId("assertionArbiter"); + + const { output } = await generateText({ + model: resolveModel(arbiterModelId), + temperature: 0, + providerOptions: { + openrouter: { + reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, + }, + }, + messages: arbiterMessages, + output: Output.object({ schema: assertionSchema }), + }); + + return output; + }; +} + +/** + * Checks if all models agree on the assertion result + */ +function checkConsensus(results: AssertionResult[]): { + hasConsensus: boolean; + consensusResult?: AssertionResult; + majorityResult?: AssertionResult; + passedCount: number; + failedCount: number; +} { + const passedCount = results.filter(r => r.assertionPassed).length; + const failedCount = results.length - passedCount; + + // Unanimous consensus + if (passedCount === results.length || failedCount === results.length) { + const avgConfidence = results.reduce((sum, r) => sum + r.confidenceScore, 0) / results.length; + return { + hasConsensus: true, + consensusResult: { + assertionPassed: passedCount === results.length, + confidenceScore: Math.round(avgConfidence), + reasoning: results[0].reasoning, // Use first model's reasoning + }, + passedCount, + failedCount, + }; + } + + // Majority vote (for 3+ models) + if (results.length >= 3) { + const majorityPassed = passedCount > failedCount; + const majorityResults = results.filter(r => r.assertionPassed === majorityPassed); + const avgConfidence = majorityResults.reduce((sum, r) => sum + r.confidenceScore, 0) / majorityResults.length; + + return { + hasConsensus: false, + majorityResult: { + assertionPassed: majorityPassed, + confidenceScore: Math.round(avgConfidence), + reasoning: majorityResults[0].reasoning, + }, + passedCount, + failedCount, + }; + } + + return { + hasConsensus: false, + passedCount, + failedCount, + }; +} + /** * Multi-model consensus assertion engine. - * Runs Claude and Gemini in parallel; if they disagree, a third model (arbiter) makes the final call. - * An assertion passes only if both models agree (or the arbiter decides). + * Runs multiple AI models in parallel; if they disagree, an arbiter model makes the final call. + * Supports both legacy primary/secondary and new assertionModels array configuration. * Automatically retries failed assertions once with a fresh page snapshot. * * @param options - Assertion configuration @@ -38,6 +227,7 @@ const assertionSchema = z.object({ * * @example * ```typescript + * // Using multiple models (new approach) * await assert({ * page, * assertion: "The dashboard shows 3 active projects", @@ -57,6 +247,13 @@ export const assert = async ({ failSilently, }: AssertionOptions): Promise => { const thinkingEnabled = effort === "high"; + + // Get the list of models to use for assertions + const assertionModels = getAssertionModelsList(); + + if (assertionModels.length === 0) { + throw new Error("No assertion models configured. Please configure at least one model for assertions."); + } const runFullAssertion = async (): Promise => { const snapshot = await safeSnapshot(page); @@ -125,177 +322,67 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco }, ]; - // Claude assertion function - const getClaudeAssertion = async (): Promise => { - // First get Claude's text response with thinking if enabled - const { text } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), - temperature: 0, - providerOptions: thinkingEnabled - ? { - anthropic: { - thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - } - : undefined, - messages, - }); - - // Convert Claude's response to structured format using Haiku - const { output } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), - temperature: 0.1, - prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - - // Gemini assertion function - const getGeminiAssertion = async (): Promise => { - const { output } = await generateText({ - model: resolveModel(getModelId("assertionSecondary")), - temperature: 0, - providerOptions: thinkingEnabled - ? { - google: { - thinkingConfig: { - thinkingBudget: THINKING_BUDGET_DEFAULT, - }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - } - : undefined, - messages, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - - // Arbiter function using Gemini 2.5 Pro with thinking enabled - const getArbiterDecision = async ( - claudeResult: AssertionResult, - geminiResult: AssertionResult, - ): Promise => { - const arbiterPrompt = ` -You are an AI arbiter tasked with resolving a disagreement between two AI models about an assertion. - -Claude's Assessment: -- Assertion Passed: ${claudeResult.assertionPassed} -- Confidence: ${claudeResult.confidenceScore}% -- Reasoning: ${claudeResult.reasoning} - -Gemini's Assessment: -- Assertion Passed: ${geminiResult.assertionPassed} -- Confidence: ${geminiResult.confidenceScore}% -- Reasoning: ${geminiResult.reasoning} - -${!images - ? ` - -${snapshot} - -` - : "" - } - - -${assertion} - - -Please carefully review the evidence (screenshot and accessibility snapshot (when provided)) and make the final determination. Consider both models' reasoning but make your own independent assessment. - - -- Make your own independent evaluation based on the evidence -- Don't simply pick one model's answer - analyze the situation yourself -- Provide clear reasoning for your decision -- Be decisive - this is the final answer -- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion. -- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion. -- Don't create additional assertion conditions on your own - only consider the exact assertion provided above. -- The assertion should pass if either the screenshot or the accessibility snapshot supports it. -- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching. -- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ. - -`; - - const arbiterMessages: ModelMessage[] = [ - { - role: "user", - content: [ - { - type: "text", - text: arbiterPrompt, - }, - ...imageContent, - ], - }, - ]; - - const { output } = await generateText({ - model: resolveModel(getModelId("assertionArbiter")), - temperature: 0, - providerOptions: { - google: { - thinkingConfig: { - thinkingBudget: THINKING_BUDGET_DEFAULT, - }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - }, - messages: arbiterMessages, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - const runAssertion = async (attempt = 0): Promise => { try { - // Run both models in parallel for speed optimization - const [claudeResult, geminiResult] = await Promise.all([ - withTimeout(getClaudeAssertion(), ASSERTION_MODEL_TIMEOUT), - withTimeout(getGeminiAssertion(), ASSERTION_MODEL_TIMEOUT), - ]); - - // Check if models disagree on assertionPassed - if (claudeResult.assertionPassed !== geminiResult.assertionPassed) { - logger.debug("Models disagree on assertion result, consulting arbiter..."); - const arbiterResult = await withTimeout( - getArbiterDecision(claudeResult, geminiResult), - ASSERTION_MODEL_TIMEOUT, + // Create assertion functions for all configured models + const assertionFunctions = assertionModels.map(modelId => + createModelAssertionFunction(modelId, messages, thinkingEnabled) + ); + + // Run all models in parallel + const results = await Promise.all( + assertionFunctions.map(fn => withTimeout(fn(), ASSERTION_MODEL_TIMEOUT)) + ); + + // Pair results with model IDs for potential arbiter use + const modelResults = assertionModels.map((modelId, index) => ({ + modelId, + result: results[index], + })); + + // Check for consensus + const consensus = checkConsensus(results); + + if (consensus.hasConsensus && consensus.consensusResult) { + logger.debug(`All ${results.length} models agreed on assertion result`); + return consensus.consensusResult; + } + + // For 2 models with disagreement, use arbiter + if (results.length === 2 && results[0].assertionPassed !== results[1].assertionPassed) { + logger.debug("Two models disagree on assertion result, consulting arbiter..."); + const arbiterFn = createArbiterFunction( + modelResults, + messages, + imageContent, + assertion, + snapshot, + images ); - - return { - assertionPassed: arbiterResult.assertionPassed, - confidenceScore: arbiterResult.confidenceScore, - reasoning: arbiterResult.reasoning, - }; + return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); } - - // Assertion passes only if both models agree it should pass - const assertionPassed = claudeResult.assertionPassed && geminiResult.assertionPassed; - - // Calculate average confidence score - const confidenceScore = (claudeResult.confidenceScore + geminiResult.confidenceScore) / 2; - - // For now take Gemini's reasoning for simplicity - const reasoning = geminiResult.reasoning; - - return { - assertionPassed, - confidenceScore: Math.round(confidenceScore), - reasoning, - }; + + // For 3+ models with majority but not unanimous + if (results.length >= 3 && consensus.majorityResult) { + logger.debug( + `Majority vote: ${consensus.passedCount} passed, ${consensus.failedCount} failed. ` + + `Using majority result.` + ); + return consensus.majorityResult; + } + + // Fallback: Use arbiter for any unresolved cases + logger.debug("Consulting arbiter for final decision..."); + const arbiterFn = createArbiterFunction( + modelResults, + messages, + imageContent, + assertion, + snapshot, + images + ); + return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); + } catch (error) { if (attempt < 1) { logger.debug("Retrying assertion due to error..."); diff --git a/src/config.ts b/src/config.ts index d53e6eb..fdd3b2b 100644 --- a/src/config.ts +++ b/src/config.ts @@ -22,18 +22,21 @@ export type ModelConfig = { assertionPrimary?: string; /** Model for assertions (secondary). Default: google/gemini-3-flash */ assertionSecondary?: string; + /** Array of models to use for consensus assertions. When provided, overrides assertionPrimary/assertionSecondary */ + assertionModels?: string[]; /** Model for assertion arbiter. Default: google/gemini-3.1-pro-preview */ assertionArbiter?: string; /** Model for data extraction, wait conditions, and lightweight tasks. Default: google/gemini-2.5-flash */ utility?: string; }; -export const DEFAULT_MODELS: Required = { +export const DEFAULT_MODELS: Required> & Pick = { stepExecution: "google/gemini-3-flash", userFlowLow: "google/gemini-3-flash", userFlowHigh: "google/gemini-3.1-pro-preview", assertionPrimary: "anthropic/claude-haiku-4.5", assertionSecondary: "google/gemini-3-flash", + assertionModels: undefined, assertionArbiter: "google/gemini-3.1-pro-preview", utility: "google/gemini-2.5-flash", }; @@ -58,14 +61,46 @@ let globalConfig: Config = {}; * * @example * ```typescript + * // Using primary/secondary models (backward compatible) * configure({ - * ai: { gateway: "none", models: { stepExecution: "google/gemini-3-flash" } }, - * email: { domain: "test.com", extractContent: async ({ email, prompt }) => "..." }, + * ai: { + * gateway: "none", + * models: { + * assertionPrimary: "anthropic/claude-haiku-4.5", + * assertionSecondary: "google/gemini-3-flash" + * } + * }, + * }); + * + * // Using multiple models array (new flexible approach) + * configure({ + * ai: { + * gateway: "openrouter", + * models: { + * assertionModels: [ + * "anthropic/claude-haiku-4.5", + * "google/gemini-3-flash", + * "meta-llama/llama-3.1-8b-instruct" + * ], + * assertionArbiter: "google/gemini-3.1-pro-preview" + * } + * }, * }); * ``` */ export function configure(config: Config) { globalConfig = { ...globalConfig, ...config }; + + // Validate assertion model configuration + const models = globalConfig.ai?.models; + if (models) { + const assertionModels = getAssertionModelsList(models); + if (assertionModels.length < 2) { + console.warn( + 'Passmark: At least 2 assertion models are recommended for reliable consensus validation.' + ); + } + } } /** @@ -81,11 +116,47 @@ export function getConfig(): Config { * @param key - The model use case key (e.g. "stepExecution", "utility") * @returns The model identifier string (e.g. "google/gemini-3-flash") */ -export function getModelId(key: keyof ModelConfig): string { +export function getModelId(key: keyof Omit): string { return getConfig().ai?.models?.[key] ?? DEFAULT_MODELS[key]; } +/** + * Returns the list of assertion models from configuration. + * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary]. + * + * @param models - The model configuration + * @returns Array of model identifiers for assertions + */ +export function getAssertionModelsList(models?: ModelConfig): string[] { + const configModels = models ?? getConfig().ai?.models; + + if (!configModels) { + return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary]; + } + + // Prefer the new assertionModels array if provided + if (configModels.assertionModels && configModels.assertionModels.length > 0) { + return configModels.assertionModels; + } + + // Fall back to primary/secondary for backward compatibility + const models_list: string[] = []; + if (configModels.assertionPrimary) { + models_list.push(configModels.assertionPrimary); + } + if (configModels.assertionSecondary) { + models_list.push(configModels.assertionSecondary); + } + + // If nothing configured, use defaults + if (models_list.length === 0) { + return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary]; + } + + return models_list; +} + /** @internal Reset config to empty state. Used for testing only. */ export function resetConfig() { globalConfig = {}; -} +} \ No newline at end of file From 4b327847773ef73d14f9653d756b3c8eaed0035d Mon Sep 17 00:00:00 2001 From: Madhusudhanan Date: Thu, 16 Apr 2026 11:20:36 +0530 Subject: [PATCH 2/3] fix: address code review feedback for assertionModels PR - Fix tie handling in majority voting (consult arbiter on ties) - Preserve backward compatible reasoning selection (2 models use secondary) - Remove unreachable empty models check - Use logger instead of console.warn - Fix snake_case variable naming to camelCase - Pass thinkingEnabled to arbiter for proper provider options --- src/assertion.ts | 51 +++++++++++++++++++++++++++++++++++------------- src/config.ts | 14 +++++++------ 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/assertion.ts b/src/assertion.ts index 4917755..5b134de 100644 --- a/src/assertion.ts +++ b/src/assertion.ts @@ -76,7 +76,8 @@ function createArbiterFunction( imageContent: Array<{ type: "image"; image: string }>, assertion: string, snapshot: string, - images?: string[] + images: string[] | undefined, + thinkingEnabled: boolean = false ): ModelAssertionFunction { return async (): Promise => { const resultsSummary = modelResults @@ -138,15 +139,26 @@ Please carefully review the evidence (screenshot and accessibility snapshot when ]; const arbiterModelId = getModelId("assertionArbiter"); + const isGoogleModel = arbiterModelId.toLowerCase().includes('google') || arbiterModelId.toLowerCase().includes('gemini'); + + const providerOptions: Record = { + openrouter: { + reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, + }, + }; + + if (thinkingEnabled && isGoogleModel) { + providerOptions.google = { + thinkingConfig: { + thinkingBudget: THINKING_BUDGET_DEFAULT, + }, + }; + } const { output } = await generateText({ model: resolveModel(arbiterModelId), temperature: 0, - providerOptions: { - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - }, + providerOptions, messages: arbiterMessages, output: Output.object({ schema: assertionSchema }), }); @@ -171,12 +183,15 @@ function checkConsensus(results: AssertionResult[]): { // Unanimous consensus if (passedCount === results.length || failedCount === results.length) { const avgConfidence = results.reduce((sum, r) => sum + r.confidenceScore, 0) / results.length; + // For backward compatibility: 2 models use secondary (index 1), 3+ use first + const reasoningIndex = results.length === 2 ? 1 : 0; + return { hasConsensus: true, consensusResult: { assertionPassed: passedCount === results.length, confidenceScore: Math.round(avgConfidence), - reasoning: results[0].reasoning, // Use first model's reasoning + reasoning: results[reasoningIndex].reasoning, }, passedCount, failedCount, @@ -185,6 +200,16 @@ function checkConsensus(results: AssertionResult[]): { // Majority vote (for 3+ models) if (results.length >= 3) { + // Check for tie - if tie, return no majorityResult so arbiter is consulted + if (passedCount === failedCount) { + return { + hasConsensus: false, + passedCount, + failedCount, + // No majorityResult - this will trigger arbiter + }; + } + const majorityPassed = passedCount > failedCount; const majorityResults = results.filter(r => r.assertionPassed === majorityPassed); const avgConfidence = majorityResults.reduce((sum, r) => sum + r.confidenceScore, 0) / majorityResults.length; @@ -250,10 +275,6 @@ export const assert = async ({ // Get the list of models to use for assertions const assertionModels = getAssertionModelsList(); - - if (assertionModels.length === 0) { - throw new Error("No assertion models configured. Please configure at least one model for assertions."); - } const runFullAssertion = async (): Promise => { const snapshot = await safeSnapshot(page); @@ -357,7 +378,8 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco imageContent, assertion, snapshot, - images + images, + thinkingEnabled ); return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); } @@ -371,7 +393,7 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco return consensus.majorityResult; } - // Fallback: Use arbiter for any unresolved cases + // Fallback: Use arbiter for any unresolved cases (including ties) logger.debug("Consulting arbiter for final decision..."); const arbiterFn = createArbiterFunction( modelResults, @@ -379,7 +401,8 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco imageContent, assertion, snapshot, - images + images, + thinkingEnabled ); return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); diff --git a/src/config.ts b/src/config.ts index fdd3b2b..a67b023 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,3 +1,5 @@ +import { logger } from "./logger"; + export type EmailProvider = { /** Domain for generating test emails (e.g. "emailsink.dev") */ domain: string; @@ -96,7 +98,7 @@ export function configure(config: Config) { if (models) { const assertionModels = getAssertionModelsList(models); if (assertionModels.length < 2) { - console.warn( + logger.warn( 'Passmark: At least 2 assertion models are recommended for reliable consensus validation.' ); } @@ -140,20 +142,20 @@ export function getAssertionModelsList(models?: ModelConfig): string[] { } // Fall back to primary/secondary for backward compatibility - const models_list: string[] = []; + const modelsList: string[] = []; if (configModels.assertionPrimary) { - models_list.push(configModels.assertionPrimary); + modelsList.push(configModels.assertionPrimary); } if (configModels.assertionSecondary) { - models_list.push(configModels.assertionSecondary); + modelsList.push(configModels.assertionSecondary); } // If nothing configured, use defaults - if (models_list.length === 0) { + if (modelsList.length === 0) { return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary]; } - return models_list; + return modelsList; } /** @internal Reset config to empty state. Used for testing only. */ From 9ee9892dcd2e87ecbf70b3d259e6d3ee4daa6225 Mon Sep 17 00:00:00 2001 From: Madhusudhanan Date: Wed, 13 May 2026 18:50:57 +0530 Subject: [PATCH 3/3] fix: throw on insufficient assertion models, ensure backward compat - Use ConfigurationError instead of logger.warn for <2 assertion models - getAssertionModelsList now always returns 2 models when using primary/secondary, applying defaults for unset fields to prevent regressions --- src/config.ts | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/src/config.ts b/src/config.ts index 5d6b7e8..b5910c4 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,5 +1,5 @@ import { logger } from "./logger"; - +import { ConfigurationError } from "./errors"; import { initTelemetry } from "./instrumentation"; export type EmailProvider = { @@ -162,8 +162,8 @@ export function configure(config: Config) { if (models) { const assertionModels = getAssertionModelsList(models); if (assertionModels.length < 2) { - logger.warn( - 'Passmark: At least 2 assertion models are recommended for reliable consensus validation.' + throw new ConfigurationError( + `Passmark: assertion consensus requires at least 2 models, got ${assertionModels.length}.` ); } } @@ -192,10 +192,10 @@ export function getModelId(key: keyof Omit): str /** * Returns the list of assertion models from configuration. - * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary]. + * Prioritizes assertionModels array, falls back to [assertionPrimary, assertionSecondary] with defaults. * * @param models - The model configuration - * @returns Array of model identifiers for assertions + * @returns Array of model identifiers for assertions (always at least 2 when using primary/secondary) */ export function getAssertionModelsList(models?: ModelConfig): string[] { const configModels = models ?? getConfig().ai?.models; @@ -204,26 +204,15 @@ export function getAssertionModelsList(models?: ModelConfig): string[] { return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary]; } - // Prefer the new assertionModels array if provided - if (configModels.assertionModels && configModels.assertionModels.length > 0) { - return configModels.assertionModels; - } - - // Fall back to primary/secondary for backward compatibility - const modelsList: string[] = []; - if (configModels.assertionPrimary) { - modelsList.push(configModels.assertionPrimary); - } - if (configModels.assertionSecondary) { - modelsList.push(configModels.assertionSecondary); - } - - // If nothing configured, use defaults - if (modelsList.length === 0) { - return [DEFAULT_MODELS.assertionPrimary, DEFAULT_MODELS.assertionSecondary]; + // Prefer the new assertionModels array if explicitly provided (even if empty) + if (configModels.assertionModels !== undefined) { + return configModels.assertionModels; // caller will validate length } - return modelsList; + // Backward compatible: build from primary/secondary, applying defaults for unset ones + const primary = configModels.assertionPrimary ?? DEFAULT_MODELS.assertionPrimary; + const secondary = configModels.assertionSecondary ?? DEFAULT_MODELS.assertionSecondary; + return [primary, secondary]; } /**