Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/lib/__tests__/agent-interface.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,49 @@ describe('runAgent', () => {
expect(result.message).toContain('API Error');
});

it('should classify a transient Bedrock-fallback billing 403 as PROVISIONING_ERROR', async () => {
  // Scenario: the gateway's Bedrock fallback answers with a 403 carrying
  // INVALID_PAYMENT_INSTRUMENT. The test pins that runAgent reports this as
  // WIZARD_PROVISIONING_ERROR (with the upstream text in the message) rather
  // than as a generic API error.
  const billing403Text =
    'API Error: 403 Model access is denied due to INVALID_PAYMENT_INSTRUMENT. ' +
    'Your AWS Marketplace subscription for this model cannot be completed';

  // Mocked SDK stream: an init message, then an error result, then a throw
  // once the generator is advanced past the second message.
  function* provisioning403Stream() {
    yield {
      type: 'system',
      subtype: 'init',
      model: 'claude-opus-4-5-20251101',
      tools: [],
      mcp_servers: [],
    };

    yield {
      type: 'result',
      subtype: 'success',
      is_error: true,
      result: billing403Text,
    };

    throw new Error('Process exited with code 1');
  }

  mockQuery.mockReturnValue(provisioning403Stream());

  const spinner = mockSpinner as unknown as SpinnerHandle;
  const result = await runAgent(
    defaultAgentConfig,
    'test prompt',
    defaultOptions,
    spinner,
    { successMessage: 'Test success', errorMessage: 'Test error' },
  );

  expect(result.error).toBe('WIZARD_PROVISIONING_ERROR');
  expect(result.message).toContain('INVALID_PAYMENT_INSTRUMENT');
});

it('should suppress user-facing errors when SDK yields error result after success', async () => {
// This test models actual SDK behavior where the SDK emits TWO result messages:
// 1. SDK yields success result (num_turns: 105, is_error: false)
Expand Down
25 changes: 25 additions & 0 deletions src/lib/agent/__tests__/output-signals.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,31 @@ describe('AgentOutputSignals', () => {
expect(signals.hasApiErrorStatus(500)).toBe(false);
});

it('detects a transient provisioning 403 from the Bedrock fallback', () => {
  // Each billing marker is exercised on its own fresh instance so that one
  // marker cannot satisfy the check on behalf of the other.
  const billing403Lines = [
    'API Error: 403 Model access is denied due to INVALID_PAYMENT_INSTRUMENT',
    'API Error: 403 Your AWS Marketplace subscription for this model cannot be completed',
  ];

  for (const line of billing403Lines) {
    const signals = new AgentOutputSignals();
    signals.push(line);
    expect(signals.hasProvisioningError()).toBe(true);
  }
});

it('does not treat other 403s or other statuses as provisioning errors', () => {
  const nonProvisioningLines = [
    // A 403 that carries no billing marker.
    'API Error: 403 Forbidden: insufficient permissions',
    // The billing keywords on a non-403 status must not match either.
    'API Error: 500 INVALID_PAYMENT_INSTRUMENT',
  ];

  for (const line of nonProvisioningLines) {
    const signals = new AgentOutputSignals();
    signals.push(line);
    expect(signals.hasProvisioningError()).toBe(false);
  }
});

it('detects YARA violations from either marker', () => {
const critical = new AgentOutputSignals();
critical.push('[YARA CRITICAL] prompt injection detected');
Expand Down
18 changes: 18 additions & 0 deletions src/lib/agent/agent-interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,15 @@ export async function runAgent(
return { error: AgentErrorType.RATE_LIMIT, message: apiErrorMessage };
}

if (signals.hasProvisioningError()) {
logToFile('Agent error: PROVISIONING_ERROR');
spinner.stop('Model service temporarily unavailable');
return {
error: AgentErrorType.PROVISIONING_ERROR,
message: apiErrorMessage,
};
}

if (signals.hasApiError()) {
logToFile('Agent error: API_ERROR');
spinner.stop('API error occurred');
Expand Down Expand Up @@ -1176,6 +1185,15 @@ export async function runAgent(
return { error: AgentErrorType.RATE_LIMIT, message: apiErrorMessage };
}

if (signals.hasProvisioningError()) {
logToFile('Agent error (caught): PROVISIONING_ERROR');
spinner.stop('Model service temporarily unavailable');
return {
error: AgentErrorType.PROVISIONING_ERROR,
message: apiErrorMessage,
};
}

if (signals.hasApiError()) {
logToFile('Agent error (caught): API_ERROR');
spinner.stop('API error occurred');
Expand Down
15 changes: 15 additions & 0 deletions src/lib/agent/output-signals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,21 @@ export class AgentOutputSignals {
return this.text.includes(`${OUTPUT_SIGNALS.API_ERROR} ${code}`);
}

/**
* True for a transient provisioning/billing 403 from the gateway's Bedrock
* fallback: a PostHog-side AWS Marketplace condition (`INVALID_PAYMENT_INSTRUMENT`)
* that clears on its own, whose own message advises retrying after ~2 minutes.
* Distinct from a generic API error so the runner can retry instead of aborting
* with the terminal "report this" message.
*/
hasProvisioningError(): boolean {
  // Billing markers only count when a 403 API error has also been recorded.
  // NOTE(review): the 403 check and the marker check each look at `this.text`
  // independently, so the marker does not have to sit in the same message as
  // the 403 — confirm that is acceptable for long agent transcripts.
  const billingMarkers = ['INVALID_PAYMENT_INSTRUMENT', 'AWS Marketplace'];
  return (
    this.hasApiErrorStatus(403) &&
    billingMarkers.some((marker) => this.text.includes(marker))
  );
}

hasYaraViolation(): boolean {
return this.has('YARA_CRITICAL') || this.has('YARA_SCANNER_ERROR');
}
Expand Down
74 changes: 60 additions & 14 deletions src/lib/agent/runner/linear.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import { assemblePrompt } from '../agent-prompt';
import type { ProgramRun, BootstrapResult } from './shared/types';
import { abortOnInstallFailure } from './shared/errors';
import { shouldDisableAsk, sessionToOptions } from './shared/bootstrap';
import { runWithProvisioningRetry } from './shared/provisioning-retry';

export async function runLinearProgram(
session: WizardSession,
Expand Down Expand Up @@ -150,21 +151,40 @@ export async function runLinearProgram(
});
logToFile(`[agent-runner] prompt assembled (${prompt.length} chars)`);

// 8. Run agent
const agentResult = await executeAgent(
agent,
prompt,
sessionToOptions(session),
spinner,
{
estimatedDurationMinutes: config.estimatedDurationMinutes,
spinnerMessage: config.spinnerMessage,
successMessage: config.successMessage,
errorMessage: config.errorMessage ?? `${config.integrationLabel} failed`,
additionalFeatureQueue: config.additionalFeatureQueue ?? [],
abortCases: config.abortCases,
// 8. Run agent. A transient provisioning/billing 403 from the gateway's
// Bedrock fallback (PostHog-side AWS Marketplace billing, self-resolving)
// is retried with backoff rather than aborting — see provisioning-retry.ts.
const agentResult = await runWithProvisioningRetry(
() =>
executeAgent(
agent,
prompt,
sessionToOptions(session),
spinner,
{
estimatedDurationMinutes: config.estimatedDurationMinutes,
spinnerMessage: config.spinnerMessage,
successMessage: config.successMessage,
errorMessage:
config.errorMessage ?? `${config.integrationLabel} failed`,
additionalFeatureQueue: config.additionalFeatureQueue ?? [],
abortCases: config.abortCases,
},
middleware,
),
({ attempt, total, delayMs }) => {
analytics.wizardCapture('agent provisioning retry', {
integration: config.integrationLabel,
attempt,
total,
delay_ms: delayMs,
});
getUI().log.warn(
`PostHog's model service is temporarily unavailable. ` +
`Retrying in ${Math.round(delayMs / 1000)}s ` +
`(attempt ${attempt} of ${total})...`,
);
},
middleware,
);

// 9. Error handling (full set from both runners)
Expand Down Expand Up @@ -239,6 +259,32 @@ export async function runLinearProgram(
});
}

if (agentResult.error === AgentErrorType.PROVISIONING_ERROR) {
// Retries (step 8) were exhausted and the gateway is still returning the
// transient billing 403. This is a PostHog-side condition that clears on
// its own, so surface a friendly "try again shortly" message rather than
// the terminal "report this to wizard@posthog.com" abort.
analytics.wizardCapture('agent provisioning error', {
integration: config.integrationLabel,
error_type: agentResult.error,
error_message: agentResult.message,
});

await wizardAbort({
message:
"PostHog's model service is temporarily unavailable\n\n" +
'This is a transient issue on our side, not a problem with your\n' +
'project. Please wait a few minutes and run the wizard again.',
error: new WizardError(
`Provisioning error: ${agentResult.message ?? 'unknown'}`,
{
integration: config.integrationLabel,
error_type: agentResult.error,
},
),
});
}

if (
agentResult.error === AgentErrorType.RATE_LIMIT ||
agentResult.error === AgentErrorType.API_ERROR
Expand Down
84 changes: 84 additions & 0 deletions src/lib/agent/runner/shared/__tests__/provisioning-retry.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import {
runWithProvisioningRetry,
type AgentRunResult,
} from '@lib/agent/runner/shared/provisioning-retry';
import { AgentErrorType } from '@lib/agent/signals';

describe('runWithProvisioningRetry', () => {
  // Canonical outcomes shared by the cases below.
  const provisioningFailure: AgentRunResult = {
    error: AgentErrorType.PROVISIONING_ERROR,
    message: 'API Error: 403 INVALID_PAYMENT_INSTRUMENT',
  };
  const cleanRun: AgentRunResult = {};

  // Injected in place of the real timer so no test actually sleeps.
  const skipWait = () => Promise.resolve();

  /** Builds a mocked `runOnce` that resolves with `outcome` on every call. */
  const alwaysResolving = (outcome: AgentRunResult) =>
    jest.fn<Promise<AgentRunResult>, []>(() => Promise.resolve(outcome));

  it('does not retry when the first run succeeds', async () => {
    const runOnce = alwaysResolving(cleanRun);
    const onRetry = jest.fn();

    const outcome = await runWithProvisioningRetry(runOnce, onRetry, {
      wait: skipWait,
    });

    expect(outcome).toEqual(cleanRun);
    expect(runOnce).toHaveBeenCalledTimes(1);
    expect(onRetry).not.toHaveBeenCalled();
  });

  it('retries on a provisioning error and returns the first non-provisioning result', async () => {
    // First call fails with the billing 403, second call succeeds.
    const runOnce = jest.fn<Promise<AgentRunResult>, []>();
    runOnce.mockResolvedValueOnce(provisioningFailure);
    runOnce.mockResolvedValueOnce(cleanRun);
    const onRetry = jest.fn();

    const outcome = await runWithProvisioningRetry(runOnce, onRetry, {
      delays: [10, 10],
      wait: skipWait,
    });

    expect(outcome).toEqual(cleanRun);
    expect(runOnce).toHaveBeenCalledTimes(2);
    expect(onRetry).toHaveBeenCalledTimes(1);
    expect(onRetry).toHaveBeenCalledWith({ attempt: 1, total: 2, delayMs: 10 });
  });

  it('gives up after exhausting the backoff schedule and surfaces the provisioning error', async () => {
    const runOnce = alwaysResolving(provisioningFailure);
    const onRetry = jest.fn();

    const outcome = await runWithProvisioningRetry(runOnce, onRetry, {
      delays: [10, 10],
      wait: skipWait,
    });

    // One initial run plus one run per scheduled delay: 3 attempts, all failing.
    expect(outcome.error).toBe(AgentErrorType.PROVISIONING_ERROR);
    expect(runOnce).toHaveBeenCalledTimes(3);
    expect(onRetry).toHaveBeenCalledTimes(2);
  });

  it('does not retry a different error type', async () => {
    const genericFailure: AgentRunResult = {
      error: AgentErrorType.API_ERROR,
      message: 'boom',
    };
    const runOnce = alwaysResolving(genericFailure);
    const onRetry = jest.fn();

    const outcome = await runWithProvisioningRetry(runOnce, onRetry, {
      wait: skipWait,
    });

    expect(outcome).toEqual(genericFailure);
    expect(runOnce).toHaveBeenCalledTimes(1);
    expect(onRetry).not.toHaveBeenCalled();
  });
});
57 changes: 57 additions & 0 deletions src/lib/agent/runner/shared/provisioning-retry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
* Retry helper for transient provisioning/billing failures from the LLM
* gateway's Bedrock fallback.
*
* The wizard forces `x-posthog-use-bedrock-fallback: true`, so a failed
* Anthropic call is re-routed to AWS Bedrock. Bedrock can answer with a 403
* `INVALID_PAYMENT_INSTRUMENT` / AWS Marketplace subscription error — a
* PostHog-side billing condition that clears on its own and whose own message
* advises retrying after ~2 minutes. Rather than aborting the run with the
* terminal "report this" message, we wait and retry a couple of times.
*/

import { AgentErrorType } from '../../signals';

/** Minimal shape of an agent run outcome that the retry helper inspects. */
export type AgentRunResult = { error?: AgentErrorType; message?: string };

/**
 * Backoff schedule (ms) for retrying after a transient provisioning 403. The
 * upstream error advises retrying after ~2 minutes, so we wait roughly that
 * long, twice, before giving up.
 */
export const PROVISIONING_RETRY_DELAYS_MS = [120_000, 120_000];

/** Default `wait` implementation: resolves after `ms` milliseconds. */
const sleep = (ms: number): Promise<void> =>
  new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Run `runOnce`; while it keeps returning a `PROVISIONING_ERROR` and the
 * backoff schedule has entries left, notify the caller, wait, and run again.
 *
 * The helper is generic over the result type so a caller whose `runOnce`
 * returns a richer object than `AgentRunResult` gets that same type back.
 *
 * @param runOnce Performs one agent run and resolves with its result.
 * @param onRetry Called before each wait with the 1-based retry number, the
 *   number of scheduled retries, and the upcoming delay, so the caller can
 *   tell the user what is happening.
 * @param opts.delays Backoff schedule in ms, one entry per retry; defaults to
 *   `PROVISIONING_RETRY_DELAYS_MS`. Read-only arrays are accepted.
 * @param opts.wait Function used to wait; injectable so tests don't sleep.
 * @returns The last result obtained. It may still be a provisioning error if
 *   every attempt failed, leaving the caller to surface a friendly message.
 */
export async function runWithProvisioningRetry<T extends AgentRunResult>(
  runOnce: () => Promise<T>,
  onRetry: (info: { attempt: number; total: number; delayMs: number }) => void,
  opts: {
    delays?: readonly number[];
    wait?: (ms: number) => Promise<void>;
  } = {},
): Promise<T> {
  const delays = opts.delays ?? PROVISIONING_RETRY_DELAYS_MS;
  const wait = opts.wait ?? sleep;

  let result = await runOnce();
  // Iterating entries (instead of indexing by a counter) keeps `delayMs`
  // typed as `number` even under `noUncheckedIndexedAccess`.
  for (const [index, delayMs] of delays.entries()) {
    // Any outcome other than a provisioning error ends the retry loop.
    if (result.error !== AgentErrorType.PROVISIONING_ERROR) break;
    onRetry({ attempt: index + 1, total: delays.length, delayMs });
    await wait(delayMs);
    result = await runOnce();
  }
  return result;
}
7 changes: 7 additions & 0 deletions src/lib/agent/signals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ export enum AgentErrorType {
RESOURCE_MISSING = 'WIZARD_RESOURCE_MISSING',
/** API rate limit exceeded */
RATE_LIMIT = 'WIZARD_RATE_LIMIT',
/**
* Transient provisioning/billing failure from the LLM gateway's Bedrock
* fallback (e.g. a 403 `INVALID_PAYMENT_INSTRUMENT` / AWS Marketplace
* subscription error). PostHog-side and self-resolving — retryable, not a
* "report this" terminal error.
*/
PROVISIONING_ERROR = 'WIZARD_PROVISIONING_ERROR',
/** Generic API error */
API_ERROR = 'WIZARD_API_ERROR',
/** YARA scanner detected a security violation */
Expand Down
Loading