Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions src/assertion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { logger } from "./logger";
import { resolveModel } from "./models";
import { AssertionResult, AssertionOptions } from "./types";
import { resolvePage, safeSnapshot, withTimeout } from "./utils";
import { trackUsage } from "./cost";

const assertionSchema = z.object({
assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."),
Expand Down Expand Up @@ -130,8 +131,9 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
// Claude assertion function
const getClaudeAssertion = async (): Promise<AssertionResult> => {
// First get Claude's text response with thinking if enabled
const { text } = await generateText({
model: resolveModel(getModelId("assertionPrimary")),
const modelId = getModelId("assertionPrimary");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: thinkingEnabled
? {
Expand All @@ -146,21 +148,30 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
messages,
});

if (result.usage) {
await trackUsage(modelId, result.usage);
}

// Convert Claude's response to structured format using Haiku
const { output } = await generateText({
model: resolveModel(getModelId("assertionPrimary")),
const haikuResult = await generateText({
model: resolveModel(modelId),
temperature: 0.1,
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`,
output: Output.object({ schema: assertionSchema }),
});

return output;
if (haikuResult.usage) {
await trackUsage(modelId, haikuResult.usage);
}

return haikuResult.output;
};

// Gemini assertion function
const getGeminiAssertion = async (): Promise<AssertionResult> => {
const { output } = await generateText({
model: resolveModel(getModelId("assertionSecondary")),
const modelId = getModelId("assertionSecondary");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: thinkingEnabled
? {
Expand All @@ -178,7 +189,11 @@ Never hallucinate. Be truthful and if you are not sure, use a low confidence sco
output: Output.object({ schema: assertionSchema }),
});

return output;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output;
};

// Arbiter function using Gemini 2.5 Pro with thinking enabled
Expand Down Expand Up @@ -241,8 +256,9 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
},
];

const { output } = await generateText({
model: resolveModel(getModelId("assertionArbiter")),
const modelId = getModelId("assertionArbiter");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
providerOptions: {
google: {
Expand All @@ -258,7 +274,11 @@ Please carefully review the evidence (screenshot and accessibility snapshot (whe
output: Output.object({ schema: assertionSchema }),
});

return output;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output;
};

const runAssertion = async (attempt = 0): Promise<AssertionResult> => {
Expand Down
3 changes: 3 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ export type ModelConfig = {
* Override may be re-enabled in a future release.
*/
cua?: string;
/** Model for explaining visual regression failures. Default: google/gemini-3-flash */
visualRegressionExplanation?: string;
};

export const DEFAULT_MODELS: Required<ModelConfig> = {
Expand All @@ -51,6 +53,7 @@ export const DEFAULT_MODELS: Required<ModelConfig> = {
assertionArbiter: "google/gemini-3.1-pro-preview",
utility: "google/gemini-2.5-flash",
cua: "gpt-5.5",
visualRegressionExplanation: "google/gemini-3-flash",
};

/**
Expand Down
1 change: 1 addition & 0 deletions src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export const INITIAL_DOM_STABILIZATION_IDLE = 3000;
export const ASSERTION_MODEL_TIMEOUT = 35000;
export const STEP_EXECUTION_TIMEOUT = 180000;
export const WAIT_CONDITION_TIMEOUT = 120000;
export const VISUAL_DIFF_EXPLANATION_TIMEOUT = 45000;
export const WAIT_CONDITION_INITIAL_INTERVAL = 1000;
export const WAIT_CONDITION_MAX_INTERVAL = 10000;
export const EMAIL_INITIAL_WAIT = 5000;
Expand Down
91 changes: 91 additions & 0 deletions src/cost.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import { logger } from "./logger";
import { redis } from "./redis";

/** Per-token pricing for a single model, expressed in USD per 1 million tokens. */
export interface ModelPricing {
  /** USD per 1M prompt (input) tokens. */
  promptTokenPricePerMillion: number;
  /** USD per 1M completion (output) tokens. */
  completionTokenPricePerMillion: number;
}

/**
 * Default pricing for models used in Passmark.
 * Prices are in USD per 1 million tokens.
 * Data sourced from provider documentation as of April 2026.
 *
 * Keys are canonical model IDs; trackUsage falls back to the
 * "google/gemini-3-flash" entry when a model is missing here, so keep that
 * entry present when editing this table.
 */
export const DEFAULT_PRICING: Record<string, ModelPricing> = {
  "google/gemini-3-flash": {
    promptTokenPricePerMillion: 0.1,
    completionTokenPricePerMillion: 0.4,
  },
  "google/gemini-3.1-pro-preview": {
    promptTokenPricePerMillion: 1.25,
    completionTokenPricePerMillion: 5.0,
  },
  "anthropic/claude-haiku-4.5": {
    promptTokenPricePerMillion: 0.25,
    completionTokenPricePerMillion: 1.25,
  },
  "google/gemini-2.5-flash": {
    promptTokenPricePerMillion: 0.1,
    completionTokenPricePerMillion: 0.4,
  },
  "gpt-5.5": {
    promptTokenPricePerMillion: 2.5,
    completionTokenPricePerMillion: 10.0,
  },
};

/** Token usage reported by the AI SDK for one call; all fields may be absent. */
export interface Usage {
  /** Tokens consumed by the prompt (input). */
  promptTokens?: number;
  /** Tokens generated in the completion (output). */
  completionTokens?: number;
  /** Combined total; not used for pricing (prompt/completion are priced separately). */
  totalTokens?: number;
}

/**
 * Tracks LLM usage and calculates the cost of the call.
 * Updates a global cost counter in Redis if available for cross-worker synchronization.
 *
 * Cost tracking is best-effort: Redis failures are logged and swallowed so a
 * metrics outage never fails a test run.
 *
 * @param modelId - The canonical model ID (e.g. "google/gemini-3-flash")
 * @param usage - Token usage data from the AI SDK
 */
export async function trackUsage(modelId: string, usage: Usage) {
  let pricing = DEFAULT_PRICING[modelId];
  if (!pricing) {
    // Previously unknown models were silently billed at gemini-3-flash rates,
    // which hides gaps in the pricing table. Keep the fallback (an estimate
    // beats dropping the usage entirely) but surface the mismatch.
    logger.warn(
      `[Cost] No pricing entry for model "${modelId}"; estimating with google/gemini-3-flash rates`
    );
    pricing = DEFAULT_PRICING["google/gemini-3-flash"];
  }

  const promptTokens = usage.promptTokens ?? 0;
  const completionTokens = usage.completionTokens ?? 0;

  const promptCost = (promptTokens / 1_000_000) * pricing.promptTokenPricePerMillion;
  const completionCost = (completionTokens / 1_000_000) * pricing.completionTokenPricePerMillion;
  const totalCost = promptCost + completionCost;

  logger.debug(
    `[Cost] Model: ${modelId} | Prompt: ${promptTokens} | Completion: ${completionTokens} | Cost: $${totalCost.toFixed(6)}`
  );

  if (redis) {
    try {
      // Use a global key to track cumulative cost across all test workers
      const executionId = process.env.executionId || "default";
      const costKey = `cost:total:${executionId}`;
      const modelCostKey = `cost:model:${modelId}:${executionId}`;

      // Per-execution total plus a per-model breakdown, updated atomically
      // on the Redis side via INCRBYFLOAT.
      await Promise.all([
        redis.incrbyfloat(costKey, totalCost),
        redis.incrbyfloat(modelCostKey, totalCost),
      ]);
    } catch (err) {
      logger.warn(`Failed to update cost in Redis: ${err}`);
    }
  }
}

/**
 * Retrieves the total estimated cost for the current execution.
 *
 * Reads the cumulative per-execution counter from Redis; when Redis is not
 * configured, or nothing has been recorded yet, the cost is reported as 0.
 *
 * @returns Total cost in USD
 */
export async function getTotalCost(): Promise<number> {
  if (!redis) return 0;

  const executionId = process.env.executionId || "default";
  const stored = await redis.get(`cost:total:${executionId}`);
  if (!stored) return 0;
  return parseFloat(stored);
}
9 changes: 9 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,13 @@ export class ValidationError extends PassmarkError {
constructor(message: string) {
super(message, "VALIDATION_ERROR");
}
}

/**
 * Thrown when visual regression explanation fails.
 * Tags the error with the "VISUAL_REGRESSION_ERROR" code via PassmarkError
 * so callers can distinguish it from other Passmark failures.
 */
export class VisualRegressionError extends PassmarkError {
  constructor(message: string) {
    super(message, "VISUAL_REGRESSION_ERROR");
  }
}
12 changes: 9 additions & 3 deletions src/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { generateText, Output } from "ai";
import { z } from "zod";
import { getModelId } from "./config";
import { resolveModel } from "./models";
import { trackUsage } from "./cost";

const extractionSchema = z.object({
extractedValue: z.string().describe("The extracted value based on the prompt"),
Expand Down Expand Up @@ -35,8 +36,9 @@ export async function extractDataWithAI({
url: string;
prompt: string;
}): Promise<string> {
const { output } = await generateText({
model: resolveModel(getModelId("utility")),
const modelId = getModelId("utility");
const result = await generateText({
model: resolveModel(modelId),
temperature: 0,
output: Output.object({ schema: extractionSchema }),
prompt: `You are an AI assistant that extracts specific data from web pages.
Expand Down Expand Up @@ -66,5 +68,9 @@ ${prompt}
Return the extracted value.`,
});

return output.extractedValue;
if (result.usage) {
await trackUsage(modelId, result.usage);
}

return result.output.extractedValue;
}
52 changes: 35 additions & 17 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,7 @@ import { withSpan } from "axiom/ai";
import shortid from "shortid";
import { axiomEnabled } from "./instrumentation";

// Only use withSpan when Axiom is configured, otherwise just execute the function directly
async function maybeWithSpan<T>(
meta: { capability: string; step: string },
fn: () => Promise<T>,
): Promise<T> {
return axiomEnabled ? withSpan(meta, async () => fn()) : fn();
}
import { maybeWithSpan } from "./utils/telemetry";
import { z } from "zod";
import { buildRunStepsPrompt, buildRunUserFlowPrompt } from "./prompts";
import { redis } from "./redis";
Expand All @@ -45,6 +39,8 @@ import { runCUALoop, buildRunStepsPromptCUA, buildRunUserFlowPromptCUA } from ".
import { extractDataWithAI } from "./extract";
import { logger } from "./logger";
import { resolveModel } from "./models";
export * from "./visual";
import { trackUsage } from "./cost";
import { runSecureScript } from "./utils/secure-script-runner";
import { createTabManager } from "./utils/tab-manager";
import {
Expand Down Expand Up @@ -460,10 +456,10 @@ export const runSteps = async ({
);
}

const stepModelId = effectiveAi.getModelId("stepExecution");
const model = resolveModel(stepModelId, effectiveAi.gateway);
const modelId = getModelId("stepExecution");
const model = resolveModel(modelId);
logger.debug(
`Using model: ${stepModelId} for step execution / gateway: ${effectiveAi.gateway}`,
`Using model: ${modelId} for step execution / gateway: ${getConfig().ai?.gateway ?? "none"}`,
);

try {
Expand Down Expand Up @@ -514,6 +510,10 @@ export const runSteps = async ({
}),
);

if (result.usage) {
await trackUsage(modelId, result.usage);
}

// Cache the step action only if it was a single tool call (simple, deterministic action).
// Multi-step actions are not cached as they may be non-deterministic.
const allToolCalls = result.steps
Expand Down Expand Up @@ -680,8 +680,9 @@ export const runUserFlow = async ({
);

if (assertion) {
const { output } = await generateText({
model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway),
const utilityModelId = getModelId("utility");
const { output, usage } = await generateText({
model: resolveModel(utilityModelId),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
output: Output.object({
schema: z.object({
Expand All @@ -695,6 +696,11 @@ export const runUserFlow = async ({
}),
}),
});

if (usage) {
await trackUsage(utilityModelId, usage);
}

return output;
}

Expand All @@ -715,7 +721,7 @@ export const runUserFlow = async ({
});

try {
const { text } = await maybeWithSpan(
const result = await maybeWithSpan(
{ capability: "user_flow_execution", step: "agentic_tool_calling" },
async () => {
return generateText({
Expand Down Expand Up @@ -758,10 +764,18 @@ export const runUserFlow = async ({
},
);

if (result.usage) {
await trackUsage(
effort === "low" ? getModelId("userFlowLow") : getModelId("userFlowHigh"),
result.usage,
);
}

if (assertion) {
const { output } = await generateText({
model: resolveModel(effectiveAi.getModelId("utility"), effectiveAi.gateway),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
const utilityModelId = getModelId("utility");
const { output, usage } = await generateText({
model: resolveModel(utilityModelId),
prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${result.text}`,
output: Output.object({
schema: z.object({
assertionPassed: z.boolean().describe("Indicates whether the assertion passed or not."),
Expand All @@ -775,10 +789,14 @@ export const runUserFlow = async ({
}),
});

if (usage) {
await trackUsage(utilityModelId, usage);
}

return output;
}

return text;
return result.text;
} catch (error: unknown) {
logger.error({ err: error }, "Error during user flow execution");
}
Expand Down
17 changes: 17 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,20 @@ export type RunStepsOptions = {
}
| { assertions?: never; expect?: never }
);

/** Structured result of an AI explanation of a visual regression diff. */
export type VisualExplanationResult = {
  /** Human-readable explanation of the visual difference. */
  explanation: string;
  /** Whether the difference looks like a genuine bug rather than benign variation. */
  isBug: boolean;
  /** Confidence in the verdict. Range not enforced here — presumably 0-1; confirm against the producer. */
  confidence: number;
  /** Optional descriptions of the regions where differences were found. */
  diffAreas?: string[];
};

/** Inputs for explaining a visual diff between an expected and an actual screenshot. */
export type VisualDiffOptions = {
  /** Playwright page under test. */
  page: Page;
  /** Baseline screenshot — string is presumably a file path; raw bytes as Buffer. TODO confirm. */
  expectedImage?: string | Buffer;
  /** Newly captured screenshot, same accepted forms as expectedImage. */
  actualImage?: string | Buffer;
  /** Optional Playwright test object — NOTE(review): its usage is not visible in this view; verify against the consumer. */
  test?: TestType<
    PlaywrightTestArgs & PlaywrightTestOptions,
    PlaywrightWorkerArgs & PlaywrightWorkerOptions
  >;
};
Loading