Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions src/assertion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { logger } from "./logger";
import { resolveModel } from "./models";
import { AssertionResult, AssertionOptions } from "./types";
import { resolvePage, safeSnapshot, withTimeout } from "./utils";
import { trackUsage } from "./cost";

const assertionSchema = z.object({
assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."),
Expand Down Expand Up @@ -130,8 +131,9 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
// Claude assertion function
const getClaudeAssertion = async (): Promise<AssertionResult> => {
// First get Claude's text response with thinking if enabled
const { text } = await generateText({
model: resolveModel(getModelId("assertionPrimary")),
const modelId = getModelId("assertionPrimary");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: thinkingEnabled
? {
Expand All @@ -146,21 +148,30 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
messages,
});

if (result.usage) {
await trackUsage(modelId, result.usage);
}

// Convert Claude's response to structured format using Haiku
const { output } = await generateText({
model: resolveModel(getModelId("assertionPrimary")),
const haikuResult = await generateText({
model: resolveModel(modelId),
temperature: 0.1,
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`,
output: Output.object({ schema: assertionSchema }),
});

return output;
if (haikuResult.usage) {
await trackUsage(modelId, haikuResult.usage);
}

return haikuResult.output;
};

// Gemini assertion function
const getGeminiAssertion = async (): Promise<AssertionResult> => {
const { output } = await generateText({
model: resolveModel(getModelId("assertionSecondary")),
const modelId = getModelId("assertionSecondary");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: thinkingEnabled
? {
Expand All @@ -178,7 +189,11 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
output: Output.object({ schema: assertionSchema }),
});

return output;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output;
};

// Arbiter function using Gemini 2.5 Pro with thinking enabled
Expand Down Expand Up @@ -241,8 +256,9 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
},
];

const { output } = await generateText({
model: resolveModel(getModelId("assertionArbiter")),
const modelId = getModelId("assertionArbiter");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: {
google: {
Expand All @@ -258,7 +274,11 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
output: Output.object({ schema: assertionSchema }),
});

return output;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output;
};

const runAssertion = async (attempt = 0): Promise<AssertionResult> => {
Expand Down
3 changes: 3 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ export type ModelConfig = {
* Override may be re-enabled in a future release.
*/
cua?: string;
/** Model for explaining visual regression failures. Default: google/gemini-3-flash */
visualRegressionExplanation?: string;
};

export const DEFAULT_MODELS: Required<ModelConfig> = {
Expand All @@ -51,6 +53,7 @@ export const DEFAULT_MODELS: Required<ModelConfig> = {
assertionArbiter: "google/gemini-3.1-pro-preview",
utility: "google/gemini-2.5-flash",
cua: "gpt-5.5",
visualRegressionExplanation: "google/gemini-3-flash",
};

/**
Expand Down
1 change: 1 addition & 0 deletions src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export const INITIAL_DOM_STABILIZATION_IDLE = 3000;
export const ASSERTION_MODEL_TIMEOUT = 35000;
export const STEP_EXECUTION_TIMEOUT = 180000;
export const WAIT_CONDITION_TIMEOUT = 120000;
export const VISUAL_DIFF_EXPLANATION_TIMEOUT = 45000;
export const WAIT_CONDITION_INITIAL_INTERVAL = 1000;
export const WAIT_CONDITION_MAX_INTERVAL = 10000;
export const EMAIL_INITIAL_WAIT = 5000;
Expand Down
91 changes: 91 additions & 0 deletions src/cost.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import { logger } from "./logger";
import { redis } from "./redis";

/** Per-token pricing for a single model, expressed in USD per 1 million tokens. */
export interface ModelPricing {
  /** USD per 1M prompt (input) tokens. */
  promptTokenPricePerMillion: number;
  /** USD per 1M completion (output) tokens. */
  completionTokenPricePerMillion: number;
}

/**
 * Default pricing for models used in Passmark.
 * Prices are in USD per 1 million tokens.
 * Data sourced from provider documentation as of April 2026.
 *
 * Keys are canonical model IDs; trackUsage falls back to the
 * "google/gemini-3-flash" entry when a model is missing here, so keep that
 * entry present when editing this table.
 */
export const DEFAULT_PRICING: Record<string, ModelPricing> = {
  "google/gemini-3-flash": {
    promptTokenPricePerMillion: 0.1,
    completionTokenPricePerMillion: 0.4,
  },
  "google/gemini-3.1-pro-preview": {
    promptTokenPricePerMillion: 1.25,
    completionTokenPricePerMillion: 5.0,
  },
  "anthropic/claude-haiku-4.5": {
    promptTokenPricePerMillion: 0.25,
    completionTokenPricePerMillion: 1.25,
  },
  "google/gemini-2.5-flash": {
    promptTokenPricePerMillion: 0.1,
    completionTokenPricePerMillion: 0.4,
  },
  "gpt-5.5": {
    promptTokenPricePerMillion: 2.5,
    completionTokenPricePerMillion: 10.0,
  },
};

/** Token usage reported by the AI SDK for one call; all fields may be absent. */
export interface Usage {
  /** Tokens consumed by the prompt (input). */
  promptTokens?: number;
  /** Tokens generated in the completion (output). */
  completionTokens?: number;
  /** Combined total; not used for pricing (prompt/completion are priced separately). */
  totalTokens?: number;
}

/**
 * Tracks LLM usage and calculates the cost of the call.
 * Updates a global cost counter in Redis if available for cross-worker synchronization.
 *
 * Cost tracking is best-effort: Redis failures are logged and swallowed so a
 * metrics outage never fails a test run.
 *
 * @param modelId - The canonical model ID (e.g. "google/gemini-3-flash")
 * @param usage - Token usage data from the AI SDK
 */
export async function trackUsage(modelId: string, usage: Usage) {
  let pricing = DEFAULT_PRICING[modelId];
  if (!pricing) {
    // Previously unknown models were silently billed at gemini-3-flash rates,
    // which hides gaps in the pricing table. Keep the fallback (an estimate
    // beats dropping the usage entirely) but surface the mismatch.
    logger.warn(
      `[Cost] No pricing entry for model "${modelId}"; estimating with google/gemini-3-flash rates`
    );
    pricing = DEFAULT_PRICING["google/gemini-3-flash"];
  }

  const promptTokens = usage.promptTokens ?? 0;
  const completionTokens = usage.completionTokens ?? 0;

  const promptCost = (promptTokens / 1_000_000) * pricing.promptTokenPricePerMillion;
  const completionCost = (completionTokens / 1_000_000) * pricing.completionTokenPricePerMillion;
  const totalCost = promptCost + completionCost;

  logger.debug(
    `[Cost] Model: ${modelId} | Prompt: ${promptTokens} | Completion: ${completionTokens} | Cost: $${totalCost.toFixed(6)}`
  );

  if (redis) {
    try {
      // Use a global key to track cumulative cost across all test workers
      const executionId = process.env.executionId || "default";
      const costKey = `cost:total:${executionId}`;
      const modelCostKey = `cost:model:${modelId}:${executionId}`;

      // Per-execution total plus a per-model breakdown, updated atomically
      // on the Redis side via INCRBYFLOAT.
      await Promise.all([
        redis.incrbyfloat(costKey, totalCost),
        redis.incrbyfloat(modelCostKey, totalCost),
      ]);
    } catch (err) {
      logger.warn(`Failed to update cost in Redis: ${err}`);
    }
  }
}

/**
 * Retrieves the total estimated cost for the current execution.
 *
 * Reads the cumulative per-execution counter from Redis; when Redis is not
 * configured, or nothing has been recorded yet, the cost is reported as 0.
 *
 * @returns Total cost in USD
 */
export async function getTotalCost(): Promise<number> {
  if (!redis) return 0;

  const executionId = process.env.executionId || "default";
  const stored = await redis.get(`cost:total:${executionId}`);
  if (!stored) return 0;
  return parseFloat(stored);
}
9 changes: 9 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,13 @@ export class ValidationError extends PassmarkError {
constructor(message: string) {
super(message, "VALIDATION_ERROR");
}
}

/**
 * Thrown when visual regression explanation fails.
 * Tags the error with the "VISUAL_REGRESSION_ERROR" code via PassmarkError
 * so callers can distinguish it from other Passmark failures.
 */
export class VisualRegressionError extends PassmarkError {
  constructor(message: string) {
    super(message, "VISUAL_REGRESSION_ERROR");
  }
}
12 changes: 9 additions & 3 deletions src/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { generateText, Output } from "ai";
import { z } from "zod";
import { getModelId } from "./config";
import { resolveModel } from "./models";
import { trackUsage } from "./cost";

const extractionSchema = z.object({
extractedValue: z.string().describe("The extracted value based on the prompt"),
Expand Down Expand Up @@ -35,8 +36,9 @@ export async function extractDataWithAI({
url: string;
prompt: string;
}): Promise<string> {
const { output } = await generateText({
model: resolveModel(getModelId("utility")),
const modelId = getModelId("utility");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
output: Output.object({ schema: extractionSchema }),
prompt: `You are an AI assistant that extracts specific data from web pages.
Expand Down Expand Up @@ -66,5 +68,9 @@ ${prompt}
Return the extracted value.`,
});

return output.extractedValue;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output.extractedValue;
}
52 changes: 35 additions & 17 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,7 @@ import { withSpan } from "axiom/ai";
import shortid from "shortid";
import { axiomEnabled } from "./instrumentation";

// Only use withSpan when Axiom is configured, otherwise just execute the function directly
async function maybeWithSpan<T>(
meta: { capability: string; step: string },
fn: () => Promise<T>,
): Promise<T> {
return axiomEnabled ? withSpan(meta, async () => fn()) : fn();
}
import { maybeWithSpan } from "./utils/telemetry";
import { z } from "zod";
import { buildRunStepsPrompt, buildRunUserFlowPrompt } from "./prompts";
import { redis } from "./redis";
Expand All @@ -45,6 +39,8 @@ import { runCUALoop, buildRunStepsPromptCUA, buildRunUserFlowPromptCUA } from ".
import { extractDataWithAI } from "./extract";
import { logger } from "./logger";
import { resolveModel } from "./models";
export * from "./visual";
import { trackUsage } from "./cost";
import { runSecureScript } from "./utils/secure-script-runner";
import { createTabManager } from "./utils/tab-manager";
import {
Expand Down Expand Up @@ -460,10 +456,10 @@ export const runSteps = async ({
);
}

const stepModelId = effectiveAi.getModelId("stepExecution");
const model = resolveModel(stepModelId, effectiveAi.gateway);
const modelId = getModelId("stepExecution");
const model = resolveModel(modelId);
logger.debug(
`Using model: ${stepModelId} for step execution / gateway: ${effectiveAi.gateway}`,
`Using model: ${modelId} for step execution / gateway: ${getConfig().ai?.gateway ?? "none"}`,
);

try {
Expand Down Expand Up @@ -514,6 +510,10 @@ export const runSteps = async ({
}),
);

if (result.usage) {
await trackUsage(modelId, result.usage);
}

// Cache the step action only if it was a single tool call (simple, deterministic action).
// Multi-step actions are not cached as they may be non-deterministic.
const allToolCalls = result.steps
Expand Down Expand Up @@ -680,8 +680,9 @@ export const runUserFlow = async ({
);

if (assertion) {
const { output } = await generateText({
model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway),
const utilityModelId = getModelId("utility");
const { output, usage } = await generateText({
model: resolveModel(utilityModelId),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
output: Output.object({
schema: z.object({
Expand All @@ -695,6 +696,11 @@ export const runUserFlow = async ({
}),
}),
});

if (usage) {
await trackUsage(utilityModelId, usage);
}

return output;
}

Expand All @@ -715,7 +721,7 @@ export const runUserFlow = async ({
});

try {
const { text } = await maybeWithSpan(
const result = await maybeWithSpan(
{ capability: "user_flow_execution", step: "agentic_tool_calling" },
async () => {
return generateText({
Expand Down Expand Up @@ -758,10 +764,18 @@ export const runUserFlow = async ({
},
);

if (result.usage) {
await trackUsage(
effort === "low" ? getModelId("userFlowLow") : getModelId("userFlowHigh"),
result.usage,
);
}

if (assertion) {
const { output } = await generateText({
model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
const utilityModelId = getModelId("utility");
const { output, usage } = await generateText({
model: resolveModel(utilityModelId),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`,
output: Output.object({
schema: z.object({
assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."),
Expand All @@ -775,10 +789,14 @@ export const runUserFlow = async ({
}),
});

if (usage) {
await trackUsage(utilityModelId, usage);
}

return output;
}

return text;
return result.text;
} catch (error: unknown) {
logger.error({ err: error }, "Error during user flow execution");
}
Expand Down
17 changes: 17 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,20 @@ export type RunStepsOptions = {
}
| { assertions?: never; expect?: never }
);

/** Structured result of an AI explanation of a visual regression diff. */
export type VisualExplanationResult = {
  /** Human-readable explanation of the visual difference. */
  explanation: string;
  /** Whether the difference looks like a genuine bug rather than benign variation. */
  isBug: boolean;
  /** Confidence in the verdict. Range not enforced here — presumably 0-1; confirm against the producer. */
  confidence: number;
  /** Optional descriptions of the regions where differences were found. */
  diffAreas?: string[];
};

/** Inputs for explaining a visual diff between an expected and an actual screenshot. */
export type VisualDiffOptions = {
  /** Playwright page under test. */
  page: Page;
  /** Baseline screenshot — string is presumably a file path; raw bytes as Buffer. TODO confirm. */
  expectedImage?: string | Buffer;
  /** Newly captured screenshot, same accepted forms as expectedImage. */
  actualImage?: string | Buffer;
  /** Optional Playwright test object — NOTE(review): its usage is not visible in this view; verify against the consumer. */
  test?: TestType<
    PlaywrightTestArgs & PlaywrightTestOptions,
    PlaywrightWorkerArgs & PlaywrightWorkerOptions
  >;
};
Loading