/**
 * A deferred assertion call: invoking it sends one request to a model and
 * resolves with that model's structured verdict.
 */
type ModelAssertionFunction = () => Promise<AssertionResult>;

/**
 * Builds the provider options that switch on extended reasoning for a model.
 *
 * The provider is guessed from substrings of the model id ("anthropic"/"claude",
 * then "google"/"gemini"); Anthropic wins when both match. The OpenRouter
 * reasoning budget is included for every model.
 *
 * @param modelId - Model identifier, e.g. "google/gemini-3-flash".
 * @returns Provider options keyed by provider name.
 */
function buildThinkingProviderOptions(modelId: string) {
  const normalizedId = modelId.toLowerCase();
  const openrouter = { reasoning: { max_tokens: THINKING_BUDGET_DEFAULT } };

  if (normalizedId.includes("anthropic") || normalizedId.includes("claude")) {
    return {
      anthropic: {
        thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT },
      },
      openrouter,
    };
  }

  if (normalizedId.includes("google") || normalizedId.includes("gemini")) {
    return {
      google: {
        thinkingConfig: { thinkingBudget: THINKING_BUDGET_DEFAULT },
      },
      openrouter,
    };
  }

  return { openrouter };
}

/**
 * Creates an assertion function for a specific model.
 *
 * Nothing is sent until the returned function is called, so callers can build
 * one function per configured model and run them in parallel.
 *
 * @param modelId - Identifier passed to `resolveModel`.
 * @param messages - Prompt messages shared by every model in the run.
 * @param thinkingEnabled - When true, reasoning options are attached to the request;
 *   when false, no provider options are sent at all.
 * @returns A function resolving with the model's structured assertion result.
 */
function createModelAssertionFunction(
  modelId: string,
  messages: ModelMessage[],
  thinkingEnabled: boolean
): ModelAssertionFunction {
  return async (): Promise<AssertionResult> => {
    // NOTE(review): this asks for structured output and (optionally) extended
    // thinking in one request with temperature 0. Confirm every configured
    // provider accepts that combination, Anthropic in particular.
    const { output } = await generateText({
      model: resolveModel(modelId),
      temperature: 0,
      providerOptions: thinkingEnabled ? buildThinkingProviderOptions(modelId) : undefined,
      messages,
      output: Output.object({ schema: assertionSchema }),
    });

    return output;
  };
}
` + +${snapshot} + +` + : "" + } + + +${assertion} + + +Please carefully review the evidence (screenshot and accessibility snapshot when provided) and make the final determination. Consider all models' reasoning but make your own independent assessment. + + +- Make your own independent evaluation based on the evidence +- Don't simply pick one model's answer - analyze the situation yourself +- Provide clear reasoning for your decision +- Be decisive - this is the final answer +- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion. +- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion. +- Don't create additional assertion conditions on your own - only consider the exact assertion provided above. +- The assertion should pass if either the screenshot or the accessibility snapshot supports it. +- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching. +- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ. 
/**
 * Tallies the pass/fail verdicts of the assertion models and reports whether
 * they settle the outcome on their own.
 *
 * Outcomes:
 * - Unanimous: `hasConsensus` is true and `consensusResult` holds the shared
 *   verdict with the rounded mean confidence of all results.
 * - Split with a strict majority: `majorityResult` holds the majority verdict
 *   with the rounded mean confidence of the majority voters only.
 * - Tie (including a 1-vs-1 split) or empty input: only the counts are
 *   returned, which tells the caller an arbiter is needed.
 *
 * @param results - One verdict per model; position matters for which reasoning is reused.
 * @returns The tallies plus at most one of `consensusResult` / `majorityResult`.
 */
function checkConsensus(results: AssertionResult[]): {
  hasConsensus: boolean;
  consensusResult?: AssertionResult;
  majorityResult?: AssertionResult;
  passedCount: number;
  failedCount: number;
} {
  const passing = results.filter((verdict) => verdict.assertionPassed);
  const failing = results.filter((verdict) => !verdict.assertionPassed);
  const passedCount = passing.length;
  const failedCount = failing.length;

  // Without this guard an empty list would count as "unanimous", produce a NaN
  // confidence and crash when reading the reasoning of a missing entry.
  if (results.length === 0) {
    return { hasConsensus: false, passedCount, failedCount };
  }

  const meanConfidence = (group: AssertionResult[]): number =>
    Math.round(group.reduce((total, verdict) => total + verdict.confidenceScore, 0) / group.length);

  // Unanimous: every model landed on the same side.
  if (passedCount === 0 || failedCount === 0) {
    // For backward compatibility: 2 models reuse the second model's reasoning,
    // any other count reuses the first.
    const reasoningSource = results.length === 2 ? results[1] : results[0];

    return {
      hasConsensus: true,
      consensusResult: {
        assertionPassed: failedCount === 0,
        confidenceScore: meanConfidence(results),
        reasoning: reasoningSource.reasoning,
      },
      passedCount,
      failedCount,
    };
  }

  // A tie has no majority. A two-model disagreement is always a 1-vs-1 tie,
  // so this also covers the "fewer than three models" case.
  if (passedCount === failedCount) {
    return { hasConsensus: false, passedCount, failedCount };
  }

  // Strict majority among three or more models.
  const majorityPassed = passedCount > failedCount;
  const majority = majorityPassed ? passing : failing;

  return {
    hasConsensus: false,
    majorityResult: {
      assertionPassed: majorityPassed,
      confidenceScore: meanConfidence(majority),
      reasoning: majority[0].reasoning,
    },
    passedCount,
    failedCount,
  };
}
* * @param options - Assertion configuration @@ -39,6 +253,7 @@ const assertionSchema = z.object({ * * @example * ```typescript + * // Using multiple models (new approach) * await assert({ * page, * assertion: "The dashboard shows 3 active projects", @@ -62,6 +277,9 @@ export const assert = async ({ videoFilePath, }: AssertionOptions): Promise => { const thinkingEnabled = effort === "high"; + + // Get the list of models to use for assertions + const assertionModels = getAssertionModelsList(); // Video assertion path: when a recorded video is provided, evaluate the // assertion against the full video using a video-capable Gemini model. @@ -168,177 +386,69 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco }, ]; - // Claude assertion function - const getClaudeAssertion = async (): Promise => { - // First get Claude's text response with thinking if enabled - const { text } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), - temperature: 0, - providerOptions: thinkingEnabled - ? { - anthropic: { - thinking: { type: "enabled", budgetTokens: THINKING_BUDGET_DEFAULT }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - } - : undefined, - messages, - }); - - // Convert Claude's response to structured format using Haiku - const { output } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), - temperature: 0.1, - prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - - // Gemini assertion function - const getGeminiAssertion = async (): Promise => { - const { output } = await generateText({ - model: resolveModel(getModelId("assertionSecondary")), - temperature: 0, - providerOptions: thinkingEnabled - ? 
{ - google: { - thinkingConfig: { - thinkingBudget: THINKING_BUDGET_DEFAULT, - }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - } - : undefined, - messages, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - - // Arbiter function using Gemini 2.5 Pro with thinking enabled - const getArbiterDecision = async ( - claudeResult: AssertionResult, - geminiResult: AssertionResult, - ): Promise => { - const arbiterPrompt = ` -You are an AI arbiter tasked with resolving a disagreement between two AI models about an assertion. - -Claude's Assessment: -- Assertion Passed: ${claudeResult.assertionPassed} -- Confidence: ${claudeResult.confidenceScore}% -- Reasoning: ${claudeResult.reasoning} - -Gemini's Assessment: -- Assertion Passed: ${geminiResult.assertionPassed} -- Confidence: ${geminiResult.confidenceScore}% -- Reasoning: ${geminiResult.reasoning} - -${!images - ? ` - -${snapshot} - -` - : "" - } - - -${assertion} - - -Please carefully review the evidence (screenshot and accessibility snapshot (when provided)) and make the final determination. Consider both models' reasoning but make your own independent assessment. - - -- Make your own independent evaluation based on the evidence -- Don't simply pick one model's answer - analyze the situation yourself -- Provide clear reasoning for your decision -- Be decisive - this is the final answer -- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion. -- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion. -- Don't create additional assertion conditions on your own - only consider the exact assertion provided above. -- The assertion should pass if either the screenshot or the accessibility snapshot supports it. -- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching. 
-- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ. - -`; - - const arbiterMessages: ModelMessage[] = [ - { - role: "user", - content: [ - { - type: "text", - text: arbiterPrompt, - }, - ...imageContent, - ], - }, - ]; - - const { output } = await generateText({ - model: resolveModel(getModelId("assertionArbiter")), - temperature: 0, - providerOptions: { - google: { - thinkingConfig: { - thinkingBudget: THINKING_BUDGET_DEFAULT, - }, - }, - openrouter: { - reasoning: { max_tokens: THINKING_BUDGET_DEFAULT }, - }, - }, - messages: arbiterMessages, - output: Output.object({ schema: assertionSchema }), - }); - - return output; - }; - const runAssertion = async (attempt = 0): Promise => { try { - // Run both models in parallel for speed optimization - const [claudeResult, geminiResult] = await Promise.all([ - withTimeout(getClaudeAssertion(), ASSERTION_MODEL_TIMEOUT), - withTimeout(getGeminiAssertion(), ASSERTION_MODEL_TIMEOUT), - ]); - - // Check if models disagree on assertionPassed - if (claudeResult.assertionPassed !== geminiResult.assertionPassed) { - logger.debug("Models disagree on assertion result, consulting arbiter..."); - const arbiterResult = await withTimeout( - getArbiterDecision(claudeResult, geminiResult), - ASSERTION_MODEL_TIMEOUT, + // Create assertion functions for all configured models + const assertionFunctions = assertionModels.map(modelId => + createModelAssertionFunction(modelId, messages, thinkingEnabled) + ); + + // Run all models in parallel + const results = await Promise.all( + assertionFunctions.map(fn => withTimeout(fn(), ASSERTION_MODEL_TIMEOUT)) + ); + + // Pair results with model IDs for potential arbiter use + const modelResults = assertionModels.map((modelId, index) => ({ + modelId, + result: results[index], + })); + + // Check for consensus + const consensus = checkConsensus(results); + + if (consensus.hasConsensus && 
consensus.consensusResult) { + logger.debug(`All ${results.length} models agreed on assertion result`); + return consensus.consensusResult; + } + + // For 2 models with disagreement, use arbiter + if (results.length === 2 && results[0].assertionPassed !== results[1].assertionPassed) { + logger.debug("Two models disagree on assertion result, consulting arbiter..."); + const arbiterFn = createArbiterFunction( + modelResults, + messages, + imageContent, + assertion, + snapshot, + images, + thinkingEnabled ); - - return { - assertionPassed: arbiterResult.assertionPassed, - confidenceScore: arbiterResult.confidenceScore, - reasoning: arbiterResult.reasoning, - }; + return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); } - - // Assertion passes only if both models agree it should pass - const assertionPassed = claudeResult.assertionPassed && geminiResult.assertionPassed; - - // Calculate average confidence score - const confidenceScore = (claudeResult.confidenceScore + geminiResult.confidenceScore) / 2; - - // For now take Gemini's reasoning for simplicity - const reasoning = geminiResult.reasoning; - - return { - assertionPassed, - confidenceScore: Math.round(confidenceScore), - reasoning, - }; + + // For 3+ models with majority but not unanimous + if (results.length >= 3 && consensus.majorityResult) { + logger.debug( + `Majority vote: ${consensus.passedCount} passed, ${consensus.failedCount} failed. 
` + + `Using majority result.` + ); + return consensus.majorityResult; + } + + // Fallback: Use arbiter for any unresolved cases (including ties) + logger.debug("Consulting arbiter for final decision..."); + const arbiterFn = createArbiterFunction( + modelResults, + messages, + imageContent, + assertion, + snapshot, + images, + thinkingEnabled + ); + return await withTimeout(arbiterFn(), ASSERTION_MODEL_TIMEOUT); + } catch (error) { if (attempt < 1) { logger.debug("Retrying assertion due to error..."); diff --git a/src/config.ts b/src/config.ts index dffa078..b5910c4 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,3 +1,5 @@ +import { logger } from "./logger"; +import { ConfigurationError } from "./errors"; import { initTelemetry } from "./instrumentation"; export type EmailProvider = { @@ -32,6 +34,8 @@ export type ModelConfig = { assertionPrimary?: string; /** Model for assertions (secondary). Default: google/gemini-3-flash */ assertionSecondary?: string; + /** Array of models to use for consensus assertions. When provided, overrides assertionPrimary/assertionSecondary */ + assertionModels?: string[]; /** Model for assertion arbiter. Default: google/gemini-3.1-pro-preview */ assertionArbiter?: string; /** Model for data extraction, wait conditions, and lightweight tasks. 
/**
 * Returns the model ID for a single-model use case, preferring the user's
 * configuration and falling back to `DEFAULT_MODELS`.
 *
 * `assertionModels` is excluded from the accepted keys because it is a list,
 * not a single identifier; use `getAssertionModelsList` for it.
 *
 * @param key - The model use case key (e.g. "stepExecution", "utility")
 * @returns The model identifier string (e.g. "google/gemini-3-flash")
 */
export function getModelId(key: keyof Omit<ModelConfig, "assertionModels">): string {
  return getConfig().ai?.models?.[key] ?? DEFAULT_MODELS[key];
}

/**
 * Returns the list of models that take part in a consensus assertion.
 *
 * An explicitly configured `assertionModels` array takes priority and is
 * returned as a copy, so callers cannot mutate the stored configuration. It is
 * returned as configured (it may be empty or hold a single entry), so length
 * validation is the caller's job. Otherwise the list is
 * `[assertionPrimary, assertionSecondary]`, each falling back to its default.
 *
 * @param models - Model configuration to read; defaults to the global configuration.
 * @returns Array of model identifiers for assertions.
 */
export function getAssertionModelsList(models?: ModelConfig): string[] {
  const configModels = models ?? getConfig().ai?.models;

  // Prefer the assertionModels array whenever it was provided (even if empty).
  if (configModels?.assertionModels !== undefined) {
    return [...configModels.assertionModels];
  }

  // Backward compatible: build from primary/secondary, applying defaults for unset ones.
  return [
    configModels?.assertionPrimary ?? DEFAULT_MODELS.assertionPrimary,
    configModels?.assertionSecondary ?? DEFAULT_MODELS.assertionSecondary,
  ];
}

/** @internal Reset config to empty state. Used for testing only. */
export function resetConfig() {
  globalConfig = {};
}