diff --git a/src/assertion.ts b/src/assertion.ts index f20b765..712ca08 100644 --- a/src/assertion.ts +++ b/src/assertion.ts @@ -6,6 +6,7 @@ import { logger } from "./logger"; import { resolveModel } from "./models"; import { AssertionResult, AssertionOptions } from "./types"; import { resolvePage, safeSnapshot, withTimeout } from "./utils"; +import { trackUsage } from "./cost"; const assertionSchema = z.object({ assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."), @@ -130,8 +131,9 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco // Claude assertion function const getClaudeAssertion = async (): Promise => { // First get Claude's text response with thinking if enabled - const { text } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), + const modelId = getModelId("assertionPrimary"); + const result = await generateText({ + model: resolveModel(modelId), temperature: 0, providerOptions: thinkingEnabled ? { @@ -146,21 +148,30 @@ Never hallucinate. 
Be truthful and if you are not sure, use a low confidence sco messages, }); + if (result.usage) { + await trackUsage(modelId, result.usage); + } + // Convert Claude's response to structured format using Haiku - const { output } = await generateText({ - model: resolveModel(getModelId("assertionPrimary")), + const haikuResult = await generateText({ + model: resolveModel(modelId), temperature: 0.1, - prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`, + prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`, output: Output.object({ schema: assertionSchema }), }); - return output; + if (haikuResult.usage) { + await trackUsage(modelId, haikuResult.usage); + } + + return haikuResult.output; }; // Gemini assertion function const getGeminiAssertion = async (): Promise => { - const { output } = await generateText({ - model: resolveModel(getModelId("assertionSecondary")), + const modelId = getModelId("assertionSecondary"); + const result = await generateText({ + model: resolveModel(modelId), temperature: 0, providerOptions: thinkingEnabled ? { @@ -178,7 +189,11 @@ Never hallucinate. 
Be truthful and if you are not sure, use a low confidence sco output: Output.object({ schema: assertionSchema }), }); - return output; + if (result.usage) { + await trackUsage(modelId, result.usage); + } + + return result.output; }; // Arbiter function using Gemini 2.5 Pro with thinking enabled @@ -241,8 +256,9 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe }, ]; - const { output } = await generateText({ - model: resolveModel(getModelId("assertionArbiter")), + const modelId = getModelId("assertionArbiter"); + const result = await generateText({ + model: resolveModel(modelId), temperature: 0, providerOptions: { google: { @@ -258,7 +274,11 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe output: Output.object({ schema: assertionSchema }), }); - return output; + if (result.usage) { + await trackUsage(modelId, result.usage); + } + + return result.output; }; const runAssertion = async (attempt = 0): Promise => { diff --git a/src/config.ts b/src/config.ts index 82c6974..ec0de2a 100644 --- a/src/config.ts +++ b/src/config.ts @@ -40,6 +40,8 @@ export type ModelConfig = { * Override may be re-enabled in a future release. */ cua?: string; + /** Model for explaining visual regression failures. 
Default: google/gemini-3-flash */
+  visualRegressionExplanation?: string;
 };
 
 export const DEFAULT_MODELS: Required<ModelConfig> = {
@@ -51,6 +53,7 @@ export const DEFAULT_MODELS: Required<ModelConfig> = {
   assertionArbiter: "google/gemini-3.1-pro-preview",
   utility: "google/gemini-2.5-flash",
   cua: "gpt-5.5",
+  visualRegressionExplanation: "google/gemini-3-flash",
 };
 
 /**
diff --git a/src/constants.ts b/src/constants.ts
index 195521f..732b1d9 100644
--- a/src/constants.ts
+++ b/src/constants.ts
@@ -9,6 +9,7 @@ export const INITIAL_DOM_STABILIZATION_IDLE = 3000;
 export const ASSERTION_MODEL_TIMEOUT = 35000;
 export const STEP_EXECUTION_TIMEOUT = 180000;
 export const WAIT_CONDITION_TIMEOUT = 120000;
+export const VISUAL_DIFF_EXPLANATION_TIMEOUT = 45000;
 export const WAIT_CONDITION_INITIAL_INTERVAL = 1000;
 export const WAIT_CONDITION_MAX_INTERVAL = 10000;
 export const EMAIL_INITIAL_WAIT = 5000;
diff --git a/src/cost.ts b/src/cost.ts
new file mode 100644
index 0000000..143607f
--- /dev/null
+++ b/src/cost.ts
@@ -0,0 +1,133 @@
+import { logger } from "./logger";
+import { redis } from "./redis";
+
+/** Token prices for one model, in USD per 1 million tokens. */
+export interface ModelPricing {
+  promptTokenPricePerMillion: number;
+  completionTokenPricePerMillion: number;
+}
+
+/**
+ * Default pricing for models used in Passmark.
+ * Prices are in USD per 1 million tokens.
+ * Data sourced from provider documentation as of April 2026.
+ */
+export const DEFAULT_PRICING: Record<string, ModelPricing> = {
+  "google/gemini-3-flash": {
+    promptTokenPricePerMillion: 0.1,
+    completionTokenPricePerMillion: 0.4,
+  },
+  "google/gemini-3.1-pro-preview": {
+    promptTokenPricePerMillion: 1.25,
+    completionTokenPricePerMillion: 5.0,
+  },
+  "anthropic/claude-haiku-4.5": {
+    promptTokenPricePerMillion: 0.25,
+    completionTokenPricePerMillion: 1.25,
+  },
+  "google/gemini-2.5-flash": {
+    promptTokenPricePerMillion: 0.1,
+    completionTokenPricePerMillion: 0.4,
+  },
+  "gpt-5.5": {
+    promptTokenPricePerMillion: 2.5,
+    completionTokenPricePerMillion: 10.0,
+  },
+};
+
+/** Model whose rates are used as an estimate when a model has no entry in DEFAULT_PRICING. */
+const FALLBACK_PRICING_MODEL_ID = "google/gemini-3-flash";
+
+/** Models already reported as unpriced, so the warning is emitted once per model. */
+const unpricedModelsWarned = new Set<string>();
+
+/**
+ * Token usage as reported by the AI SDK.
+ * `inputTokens` / `outputTokens` are the field names used by newer AI SDK releases (v5+);
+ * `promptTokens` / `completionTokens` are the older names. Both spellings are accepted.
+ */
+export interface Usage {
+  inputTokens?: number;
+  outputTokens?: number;
+  promptTokens?: number;
+  completionTokens?: number;
+  totalTokens?: number;
+}
+
+/** Returns the first finite, non-negative count among the candidates, or 0 if there is none. */
+function firstTokenCount(...candidates: Array<number | undefined>): number {
+  const found = candidates.find(
+    (value): value is number => typeof value === "number" && Number.isFinite(value) && value >= 0,
+  );
+  return found ?? 0;
+}
+
+/** Redis key suffix that scopes the cost counters to the current execution. */
+function executionScope(): string {
+  return process.env.executionId || "default";
+}
+
+/**
+ * Tracks LLM usage and calculates the cost of the call.
+ * Updates a global cost counter in Redis if available for cross-worker synchronization.
+ * Redis failures are logged and never propagated to the caller.
+ *
+ * @param modelId - The canonical model ID (e.g. "google/gemini-3-flash")
+ * @param usage - Token usage data from the AI SDK
+ */
+export async function trackUsage(modelId: string, usage: Usage): Promise<void> {
+  const knownPricing = DEFAULT_PRICING[modelId];
+  if (!knownPricing && !unpricedModelsWarned.has(modelId)) {
+    unpricedModelsWarned.add(modelId);
+    logger.warn(
+      `[Cost] No pricing configured for model ${modelId}; estimating with ${FALLBACK_PRICING_MODEL_ID} rates`,
+    );
+  }
+  const pricing = knownPricing ?? DEFAULT_PRICING[FALLBACK_PRICING_MODEL_ID];
+
+  // The SDK's usage object may use either naming scheme; reading only the older
+  // names would silently record every call as zero tokens.
+  const promptTokens = firstTokenCount(usage.inputTokens, usage.promptTokens);
+  const completionTokens = firstTokenCount(usage.outputTokens, usage.completionTokens);
+
+  const promptCost = (promptTokens / 1_000_000) * pricing.promptTokenPricePerMillion;
+  const completionCost = (completionTokens / 1_000_000) * pricing.completionTokenPricePerMillion;
+  const totalCost = promptCost + completionCost;
+
+  logger.debug(
+    `[Cost] Model: ${modelId} | Prompt: ${promptTokens} | Completion: ${completionTokens} | Cost: $${totalCost.toFixed(6)}`
+  );
+
+  // Nothing to accumulate, or no shared store: skip the Redis round-trip.
+  if (!redis || totalCost <= 0) return;
+
+  try {
+    // Use a global key to track cumulative cost across all test workers
+    const scope = executionScope();
+    await Promise.all([
+      redis.incrbyfloat(`cost:total:${scope}`, totalCost),
+      redis.incrbyfloat(`cost:model:${modelId}:${scope}`, totalCost),
+    ]);
+  } catch (err) {
+    logger.warn(`Failed to update cost in Redis: ${err}`);
+  }
+}
+
+/**
+ * Retrieves the total estimated cost for the current execution.
+ * Returns 0 when Redis is unavailable, when the counter is missing or unparsable,
+ * or when the read fails (the failure is logged).
+ * @returns Total cost in USD
+ */
+export async function getTotalCost(): Promise<number> {
+  if (!redis) return 0;
+
+  try {
+    const raw = await redis.get(`cost:total:${executionScope()}`);
+    const parsed = raw ? parseFloat(raw) : 0;
+    return Number.isFinite(parsed) ? parsed : 0;
+  } catch (err) {
+    logger.warn(`Failed to read cost from Redis: ${err}`);
+    return 0;
+  }
+}
diff --git a/src/errors.ts b/src/errors.ts
index f10ced4..de7f174 100644
--- a/src/errors.ts
+++ b/src/errors.ts
@@ -89,4 +89,13 @@ export class ValidationError extends PassmarkError {
   constructor(message: string) {
     super(message, "VALIDATION_ERROR");
   }
+}
+
+/**
+ * Thrown when visual regression explanation fails.
+ */ +export class VisualRegressionError extends PassmarkError { + constructor(message: string) { + super(message, "VISUAL_REGRESSION_ERROR"); + } } \ No newline at end of file diff --git a/src/extract.ts b/src/extract.ts index abff50d..1d0b677 100644 --- a/src/extract.ts +++ b/src/extract.ts @@ -2,6 +2,7 @@ import { generateText, Output } from "ai"; import { z } from "zod"; import { getModelId } from "./config"; import { resolveModel } from "./models"; +import { trackUsage } from "./cost"; const extractionSchema = z.object({ extractedValue: z.string().describe("The extracted value based on the prompt"), @@ -35,8 +36,9 @@ export async function extractDataWithAI({ url: string; prompt: string; }): Promise { - const { output } = await generateText({ - model: resolveModel(getModelId("utility")), + const modelId = getModelId("utility"); + const result = await generateText({ + model: resolveModel(modelId), temperature: 0, output: Output.object({ schema: extractionSchema }), prompt: `You are an AI assistant that extracts specific data from web pages. @@ -66,5 +68,9 @@ ${prompt} Return the extracted value.`, }); - return output.extractedValue; + if (result.usage) { + await trackUsage(modelId, result.usage); + } + + return result.output.extractedValue; } diff --git a/src/index.ts b/src/index.ts index c069c9e..c76e27e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -13,13 +13,7 @@ import { withSpan } from "axiom/ai"; import shortid from "shortid"; import { axiomEnabled } from "./instrumentation"; -// Only use withSpan when Axiom is configured, otherwise just execute the function directly -async function maybeWithSpan( - meta: { capability: string; step: string }, - fn: () => Promise, -): Promise { - return axiomEnabled ? 
withSpan(meta, async () => fn()) : fn(); -} +import { maybeWithSpan } from "./utils/telemetry"; import { z } from "zod"; import { buildRunStepsPrompt, buildRunUserFlowPrompt } from "./prompts"; import { redis } from "./redis"; @@ -45,6 +39,8 @@ import { runCUALoop, buildRunStepsPromptCUA, buildRunUserFlowPromptCUA } from ". import { extractDataWithAI } from "./extract"; import { logger } from "./logger"; import { resolveModel } from "./models"; +export * from "./visual"; +import { trackUsage } from "./cost"; import { runSecureScript } from "./utils/secure-script-runner"; import { createTabManager } from "./utils/tab-manager"; import { @@ -460,10 +456,10 @@ export const runSteps = async ({ ); } - const stepModelId = effectiveAi.getModelId("stepExecution"); - const model = resolveModel(stepModelId, effectiveAi.gateway); + const modelId = getModelId("stepExecution"); + const model = resolveModel(modelId); logger.debug( - `Using model: ${stepModelId} for step execution / gateway: ${effectiveAi.gateway}`, + `Using model: ${modelId} for step execution / gateway: ${getConfig().ai?.gateway ?? "none"}`, ); try { @@ -514,6 +510,10 @@ export const runSteps = async ({ }), ); + if (result.usage) { + await trackUsage(modelId, result.usage); + } + // Cache the step action only if it was a single tool call (simple, deterministic action). // Multi-step actions are not cached as they may be non-deterministic. 
const allToolCalls = result.steps @@ -680,8 +680,9 @@ export const runUserFlow = async ({ ); if (assertion) { - const { output } = await generateText({ - model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway), + const utilityModelId = getModelId("utility"); + const { output, usage } = await generateText({ + model: resolveModel(utilityModelId), prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`, output: Output.object({ schema: z.object({ @@ -695,6 +696,11 @@ export const runUserFlow = async ({ }), }), }); + + if (usage) { + await trackUsage(utilityModelId, usage); + } + return output; } @@ -715,7 +721,7 @@ export const runUserFlow = async ({ }); try { - const { text } = await maybeWithSpan( + const result = await maybeWithSpan( { capability: "user_flow_execution", step: "agentic_tool_calling" }, async () => { return generateText({ @@ -758,10 +764,18 @@ export const runUserFlow = async ({ }, ); + if (result.usage) { + await trackUsage( + effort === "low" ? 
getModelId("userFlowLow") : getModelId("userFlowHigh"), + result.usage, + ); + } + if (assertion) { - const { output } = await generateText({ - model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway), - prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`, + const utilityModelId = getModelId("utility"); + const { output, usage } = await generateText({ + model: resolveModel(utilityModelId), + prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`, output: Output.object({ schema: z.object({ assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."), @@ -775,10 +789,14 @@ export const runUserFlow = async ({ }), }); + if (usage) { + await trackUsage(utilityModelId, usage); + } + return output; } - return text; + return result.text; } catch (error: unknown) { logger.error({ err: error }, "Error during user flow execution"); } diff --git a/src/types.ts b/src/types.ts index e4ff76e..041c1cd 100644 --- a/src/types.ts +++ b/src/types.ts @@ -141,3 +141,20 @@ export type RunStepsOptions = { } | { assertions?: never; expect?: never } ); + +export type VisualExplanationResult = { + explanation: string; + isBug: boolean; + confidence: number; + diffAreas?: string[]; +}; + +export type VisualDiffOptions = { + page: Page; + expectedImage?: string | Buffer; + actualImage?: string | Buffer; + test?: TestType< + PlaywrightTestArgs & PlaywrightTestOptions, + PlaywrightWorkerArgs & PlaywrightWorkerOptions + >; +}; diff --git a/src/utils/telemetry.ts b/src/utils/telemetry.ts new file mode 100644 index 0000000..6ef7ef9 --- /dev/null +++ b/src/utils/telemetry.ts @@ -0,0 +1,16 @@ +import { withSpan } from "axiom/ai"; +import { axiomEnabled } from "../instrumentation"; + +/** + * Executes a function within an Axiom span if instrumentation is enabled. + * If Axiom is not configured, simply executes the function directly. 
+ *
+ * @param meta - Span metadata including capability and step name
+ * @param fn - The function to execute
+ */
+export async function maybeWithSpan<T>(
+  meta: { capability: string; step: string },
+  fn: () => Promise<T>,
+): Promise<T> {
+  return axiomEnabled ? withSpan(meta, async () => fn()) : fn();
+}
diff --git a/src/visual.ts b/src/visual.ts
new file mode 100644
index 0000000..0556c99
--- /dev/null
+++ b/src/visual.ts
@@ -0,0 +1,239 @@
+import { generateText, Output } from "ai";
+import { resolveModel } from "./models";
+import { getModelId } from "./config";
+import { logger } from "./logger";
+import { trackUsage } from "./cost";
+import fs from "fs";
+import { z } from "zod";
+import { VisualDiffOptions, VisualExplanationResult } from "./types";
+import { maybeWithSpan } from "./utils/telemetry";
+import { withTimeout } from "./utils";
+import { VISUAL_DIFF_EXPLANATION_TIMEOUT } from "./constants";
+import { VisualRegressionError } from "./errors";
+
+const visualExplanationSchema = z.object({
+  explanation: z
+    .string()
+    .describe("A human-readable explanation of the visual differences between the two images."),
+  isBug: z
+    .boolean()
+    .describe("Whether the change appears to be a functional bug or a legitimate UI tweak."),
+  confidence: z.number().min(0).max(100).describe("Confidence score of the explanation (0-100)."),
+  diffAreas: z
+    .array(z.string())
+    .optional()
+    .describe("List of specific areas or elements that changed."),
+});
+
+/** Non-empty standard or URL-safe base64 alphabet with optional trailing padding. */
+const BASE64_PATTERN = /^[A-Za-z0-9+/_-]+={0,2}$/;
+
+/**
+ * Decodes a base64 payload, throwing `VisualRegressionError` with `failureMessage`
+ * when the payload is not well-formed base64.
+ *
+ * `Buffer.from(value, "base64")` does not throw on malformed input; it decodes leniently,
+ * so arbitrary text (for example a mistyped file path) would otherwise be turned into
+ * garbage bytes. The payload is therefore validated before decoding.
+ */
+function decodeBase64(payload: string, failureMessage: string): Buffer {
+  const compact = payload.replace(/\s+/g, "");
+  const unpaddedLength = compact.replace(/=+$/, "").length;
+  // A base64 body can never have a length of 4k + 1 characters.
+  if (!BASE64_PATTERN.test(compact) || unpaddedLength % 4 === 1) {
+    throw new VisualRegressionError(failureMessage);
+  }
+  return Buffer.from(compact, "base64");
+}
+
+/**
+ * Resolves various image input formats into a Buffer.
+ * Supports file paths, base64 data URLs, raw base64 strings, and Buffers.
+ *
+ * @throws VisualRegressionError if the input cannot be read or decoded.
+ */
+async function resolveImageBuffer(input: string | Buffer): Promise<Buffer> {
+  if (Buffer.isBuffer(input)) {
+    return input;
+  }
+
+  // Handle data URL (e.g. "data:image/png;base64,...")
+  if (input.startsWith("data:")) {
+    const commaIndex = input.indexOf(",");
+    const header = commaIndex === -1 ? "" : input.slice(0, commaIndex);
+    // Only base64-encoded data URLs are supported; a missing comma leaves the header empty.
+    if (!/;base64$/i.test(header)) {
+      throw new VisualRegressionError("Invalid base64 data URL provided as image input.");
+    }
+    return decodeBase64(
+      input.slice(commaIndex + 1),
+      "Invalid base64 data URL provided as image input.",
+    );
+  }
+
+  // Handle file path
+  if (fs.existsSync(input)) {
+    try {
+      return await fs.promises.readFile(input);
+    } catch (err) {
+      throw new VisualRegressionError(`Failed to read image from path: ${input}. ${err}`);
+    }
+  }
+
+  // Not an existing path and not a data URL: accept it only if it really is raw base64.
+  return decodeBase64(
+    input,
+    "Unable to resolve image input. It is not a valid path, Buffer, or base64 string.",
+  );
+}
+
+/**
+ * Explains the visual differences between an "Expected" screenshot and the "Actual" page state.
+ * Uses a Vision-capable AI model to generate a human-readable explanation, helping QA teams
+ * distinguish between minor styling tweaks and breaking visual bugs.
+ *
+ * @param options - Configuration for the visual diff explanation
+ * @param options.page - The Playwright page instance
+ * @param options.expectedImage - The baseline image (path, Buffer, or base64). If omitted, the model is told that only the actual image is attached.
+ * @param options.actualImage - The failed state image (path, Buffer, or base64). If omitted, takes a fresh screenshot.
+ * @param options.test - Playwright test instance for attaching rich annotations to the report.
+ * @returns A structured explanation of the visual differences.
+ * @throws VisualRegressionError if an image cannot be resolved, or the model call fails or times out.
+ *
+ * @example
+ * ```typescript
+ * try {
+ *   await expect(page).toHaveScreenshot('landing.png');
+ * } catch (error) {
+ *   await explainVisualDiff({
+ *     page,
+ *     expectedImage: 'test-snapshots/landing-linux.png',
+ *     test,
+ *   });
+ *   throw error;
+ * }
+ * ```
+ */
+export async function explainVisualDiff({
+  page,
+  expectedImage,
+  actualImage,
+  test,
+}: VisualDiffOptions): Promise<VisualExplanationResult> {
+  return maybeWithSpan({ capability: "visual_regression", step: "explain_diff" }, async () => {
+    logger.info("Generating AI explanation for visual regression failure...");
+
+    try {
+      // 1. Resolve images to Buffers
+      let expectedBuffer: Buffer | undefined;
+      if (expectedImage) {
+        expectedBuffer = await resolveImageBuffer(expectedImage);
+      }
+
+      let actualBuffer: Buffer;
+      if (actualImage) {
+        actualBuffer = await resolveImageBuffer(actualImage);
+      } else {
+        logger.debug("No actual image provided, taking fresh screenshot...");
+        actualBuffer = await page.screenshot({ fullPage: false });
+      }
+
+      // 2. Prepare model
+      const modelId = getModelId("visualRegressionExplanation");
+      const model = resolveModel(modelId);
+
+      // Describe exactly what is attached: claiming two images when the baseline is
+      // missing would invite the model to invent a comparison.
+      const attachments = expectedBuffer
+        ? `Attached are two images:
+1. **Expected Image (Baseline)**: The reference point that represents the "correct" UI state.
+2. **Actual Image (Current)**: The current state of the application which failed the pixel-diff check.`
+        : `Attached is one image, the **Actual Image (Current)**: the current state of the application which failed the pixel-diff check.
+No baseline image was provided, so describe only problems visible in this image and lower your confidence accordingly.`;
+
+      const prompt = `
+You are an elite QA Automation Engineer and Visual UX Expert.
+You have been tasked with explaining why a visual regression test failed.
+
+${attachments}
+
+### Objective
+Provide a precise, human-readable explanation of the differences. Your goal is to help a developer or product manager quickly understand if this is a regression bug or a planned UI update.
+
+### Analysis Requirements
+- **Spatial Changes**: Note if elements moved, swapped positions, or if margins/padding changed.
+- **Visual Styles**: Identify changes in fonts, colors, border-radius, or shadows.
+- **Content Changes**: Note if text changed, icons were swapped, or images are missing.
+- **Layout Integrity**: Identify if the layout broke or if elements are overlapping unexpectedly.
+
+### Output Format
+Be concise but technical. Avoid fluff.
+
+
+- \`explanation\`: A 2-3 sentence summary of the core differences.
+- \`isBug\`: Boolean (true if it looks like a broken layout/missing asset, false if it looks like a clean styling update).
+- \`confidence\`: 0-100 score of your assessment.
+- \`diffAreas\`: Optional list of specific components or selectors that appear to have changed.
+
+`;
+
+      // 3. Execute model call with timeout
+      const result = await withTimeout(
+        generateText({
+          model,
+          messages: [
+            {
+              role: "user",
+              content: [
+                { type: "text", text: prompt },
+                ...(expectedBuffer ? [{ type: "image" as const, image: expectedBuffer }] : []),
+                { type: "image" as const, image: actualBuffer },
+              ],
+            },
+          ],
+          output: Output.object({ schema: visualExplanationSchema }),
+        }),
+        VISUAL_DIFF_EXPLANATION_TIMEOUT,
+      );
+
+      if (result.usage) {
+        await trackUsage(modelId, result.usage);
+      }
+
+      const { explanation, isBug, confidence, diffAreas } = result.output;
+
+      // 4. Format for Playwright Report
+      const areasStr =
+        diffAreas && diffAreas.length > 0
+          ? `\n\n**Impacted Areas:**\n${diffAreas.map((a) => `- ${a}`).join("\n")}`
+          : "";
+
+      const judgmentEmoji = isBug ? "🚨 **Likely a BUG**" : "✨ **Likely a STYLING TWEAK**";
+      const summary = `### 🔍 AI Visual Diff Analysis\n\n**Confidence:** ${confidence}%\n\n**Explanation:**\n${explanation}\n\n**Judgment:** ${judgmentEmoji}${areasStr}\n\n---\n*Analysis generated by Passmark AI Regression Engine*`;
+
+      if (test) {
+        try {
+          test.info().annotations.push({
+            type: "Visual Diff Analysis",
+            description: summary,
+          });
+        } catch (annotationError) {
+          // test.info() is only valid while a test is running; a reporting failure
+          // should not discard an explanation that was generated successfully.
+          logger.warn(`Failed to attach visual diff analysis to the report: ${annotationError}`);
+        }
+      }
+
+      logger.info(`Successfully generated visual diff explanation (${confidence}% confidence).`);
+      return result.output;
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      logger.error(`Failed to generate visual diff explanation: ${message}`);
+
+      if (error instanceof VisualRegressionError) {
+        throw error;
+      }
+      throw new VisualRegressionError(`AI visual analysis failed: ${message}`);
+    }
+  });
+}