From 198455b186a0ba12c465c1ceb30019afbef1c4ff Mon Sep 17 00:00:00 2001 From: "posthog[bot]" <206114724+posthog[bot]@users.noreply.github.com> Date: Mon, 22 Jun 2026 19:30:26 +0000 Subject: [PATCH] fix: retry transient Bedrock-fallback billing 403s instead of aborting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway routes failed Anthropic calls to AWS Bedrock (the wizard forces `x-posthog-use-bedrock-fallback: true`). Bedrock can answer with a 403 `INVALID_PAYMENT_INSTRUMENT` / AWS Marketplace subscription error — a transient, PostHog-side billing condition that clears on its own and whose own message advises retrying after ~2 minutes. Previously `output-signals.ts` only special-cased 401/429, so this 403 fell through to a generic API_ERROR and the linear runner escalated it to a fatal "report this to wizard@posthog.com" abort, breaking the whole integration flow. Classify the billing 403 as a new `PROVISIONING_ERROR`, retry it with backoff in the linear runner, and if retries are exhausted surface a friendly "temporarily unavailable, try again shortly" message instead of the terminal abort. 
Generated-By: PostHog Code Task-Id: 089e04c7-5d59-41f2-a9ab-8119a97a1c3a --- src/lib/__tests__/agent-interface.test.ts | 43 ++++++++++ .../agent/__tests__/output-signals.test.ts | 25 ++++++ src/lib/agent/agent-interface.ts | 18 ++++ src/lib/agent/output-signals.ts | 15 ++++ src/lib/agent/runner/linear.ts | 74 ++++++++++++---- .../__tests__/provisioning-retry.test.ts | 84 +++++++++++++++++++ .../agent/runner/shared/provisioning-retry.ts | 57 +++++++++++++ src/lib/agent/signals.ts | 7 ++ 8 files changed, 309 insertions(+), 14 deletions(-) create mode 100644 src/lib/agent/runner/shared/__tests__/provisioning-retry.test.ts create mode 100644 src/lib/agent/runner/shared/provisioning-retry.ts diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts index b390fccb..f0c6a64d 100644 --- a/src/lib/__tests__/agent-interface.test.ts +++ b/src/lib/__tests__/agent-interface.test.ts @@ -211,6 +211,49 @@ describe('runAgent', () => { expect(result.message).toContain('API Error'); }); + it('should classify a transient Bedrock-fallback billing 403 as PROVISIONING_ERROR', async () => { + // The gateway's Bedrock fallback can return a 403 with + // INVALID_PAYMENT_INSTRUMENT (a PostHog-side AWS Marketplace billing + // condition that clears on its own). This must be distinguished from a + // generic API error so the runner retries instead of aborting fatally. + function* mockGeneratorWithProvisioning403() { + yield { + type: 'system', + subtype: 'init', + model: 'claude-opus-4-5-20251101', + tools: [], + mcp_servers: [], + }; + + yield { + type: 'result', + subtype: 'success', + is_error: true, + result: + 'API Error: 403 Model access is denied due to INVALID_PAYMENT_INSTRUMENT. 
' + + 'Your AWS Marketplace subscription for this model cannot be completed', + }; + + throw new Error('Process exited with code 1'); + } + + mockQuery.mockReturnValue(mockGeneratorWithProvisioning403()); + + const result = await runAgent( + defaultAgentConfig, + 'test prompt', + defaultOptions, + mockSpinner as unknown as SpinnerHandle, + { + successMessage: 'Test success', + errorMessage: 'Test error', + }, + ); + + expect(result.error).toBe('WIZARD_PROVISIONING_ERROR'); + expect(result.message).toContain('INVALID_PAYMENT_INSTRUMENT'); + }); + it('should suppress user-facing errors when SDK yields error result after success', async () => { // This test models actual SDK behavior where the SDK emits TWO result messages: // 1. SDK yields success result (num_turns: 105, is_error: false) diff --git a/src/lib/agent/__tests__/output-signals.test.ts b/src/lib/agent/__tests__/output-signals.test.ts index 48ceda77..73500b82 100644 --- a/src/lib/agent/__tests__/output-signals.test.ts +++ b/src/lib/agent/__tests__/output-signals.test.ts @@ -26,6 +26,31 @@ describe('AgentOutputSignals', () => { expect(signals.hasApiErrorStatus(500)).toBe(false); }); + it('detects a transient provisioning 403 from the Bedrock fallback', () => { + const invalidPayment = new AgentOutputSignals(); + invalidPayment.push( + 'API Error: 403 Model access is denied due to INVALID_PAYMENT_INSTRUMENT', + ); + expect(invalidPayment.hasProvisioningError()).toBe(true); + + const marketplace = new AgentOutputSignals(); + marketplace.push( + 'API Error: 403 Your AWS Marketplace subscription for this model cannot be completed', + ); + expect(marketplace.hasProvisioningError()).toBe(true); + }); + + it('does not treat other 403s or other statuses as provisioning errors', () => { + const forbidden = new AgentOutputSignals(); + forbidden.push('API Error: 403 Forbidden: insufficient permissions'); + expect(forbidden.hasProvisioningError()).toBe(false); + + // The billing keywords on a non-403 status must not 
match either. + const wrongStatus = new AgentOutputSignals(); + wrongStatus.push('API Error: 500 INVALID_PAYMENT_INSTRUMENT'); + expect(wrongStatus.hasProvisioningError()).toBe(false); + }); + it('detects YARA violations from either marker', () => { const critical = new AgentOutputSignals(); critical.push('[YARA CRITICAL] prompt injection detected'); diff --git a/src/lib/agent/agent-interface.ts b/src/lib/agent/agent-interface.ts index 9276665d..079704de 100644 --- a/src/lib/agent/agent-interface.ts +++ b/src/lib/agent/agent-interface.ts @@ -1135,6 +1135,15 @@ export async function runAgent( return { error: AgentErrorType.RATE_LIMIT, message: apiErrorMessage }; } + if (signals.hasProvisioningError()) { + logToFile('Agent error: PROVISIONING_ERROR'); + spinner.stop('Model service temporarily unavailable'); + return { + error: AgentErrorType.PROVISIONING_ERROR, + message: apiErrorMessage, + }; + } + if (signals.hasApiError()) { logToFile('Agent error: API_ERROR'); spinner.stop('API error occurred'); @@ -1176,6 +1185,15 @@ export async function runAgent( return { error: AgentErrorType.RATE_LIMIT, message: apiErrorMessage }; } + if (signals.hasProvisioningError()) { + logToFile('Agent error (caught): PROVISIONING_ERROR'); + spinner.stop('Model service temporarily unavailable'); + return { + error: AgentErrorType.PROVISIONING_ERROR, + message: apiErrorMessage, + }; + } + if (signals.hasApiError()) { logToFile('Agent error (caught): API_ERROR'); spinner.stop('API error occurred'); diff --git a/src/lib/agent/output-signals.ts b/src/lib/agent/output-signals.ts index ed0b098d..e0c0ce41 100644 --- a/src/lib/agent/output-signals.ts +++ b/src/lib/agent/output-signals.ts @@ -54,6 +54,21 @@ export class AgentOutputSignals { return this.text.includes(`${OUTPUT_SIGNALS.API_ERROR} ${code}`); } + /** + * True for a transient provisioning/billing 403 from the gateway's Bedrock + * fallback: a PostHog-side AWS Marketplace condition (`INVALID_PAYMENT_INSTRUMENT`) + * that clears on its 
/**
 * Reports whether the captured agent output shows the transient
 * provisioning/billing 403 produced by the gateway's Bedrock fallback.
 *
 * Both conditions must hold:
 *  1. an API error with status 403 is present (`hasApiErrorStatus(403)`), and
 *  2. the captured text mentions `INVALID_PAYMENT_INSTRUMENT` or
 *     `AWS Marketplace`.
 *
 * The marker search covers the whole captured text; it is not restricted to
 * the message that carried the 403.
 *
 * Kept separate from the generic API-error check so the runner can retry
 * rather than abort with the terminal "report this" message.
 *
 * @returns `true` when a 403 and at least one billing marker are both present.
 */
hasProvisioningError(): boolean {
  const billingMarkers = ['INVALID_PAYMENT_INSTRUMENT', 'AWS Marketplace'];
  return (
    this.hasApiErrorStatus(403) &&
    billingMarkers.some((marker) => this.text.includes(marker))
  );
}
+ const agentResult = await runWithProvisioningRetry( + () => + executeAgent( + agent, + prompt, + sessionToOptions(session), + spinner, + { + estimatedDurationMinutes: config.estimatedDurationMinutes, + spinnerMessage: config.spinnerMessage, + successMessage: config.successMessage, + errorMessage: + config.errorMessage ?? `${config.integrationLabel} failed`, + additionalFeatureQueue: config.additionalFeatureQueue ?? [], + abortCases: config.abortCases, + }, + middleware, + ), + ({ attempt, total, delayMs }) => { + analytics.wizardCapture('agent provisioning retry', { + integration: config.integrationLabel, + attempt, + total, + delay_ms: delayMs, + }); + getUI().log.warn( + `PostHog's model service is temporarily unavailable. ` + + `Retrying in ${Math.round(delayMs / 1000)}s ` + + `(attempt ${attempt} of ${total})...`, + ); }, - middleware, ); // 9. Error handling (full set from both runners) @@ -239,6 +259,32 @@ export async function runLinearProgram( }); } + if (agentResult.error === AgentErrorType.PROVISIONING_ERROR) { + // Retries (step 8) were exhausted and the gateway is still returning the + // transient billing 403. This is a PostHog-side condition that clears on + // its own, so surface a friendly "try again shortly" message rather than + // the terminal "report this to wizard@posthog.com" abort. + analytics.wizardCapture('agent provisioning error', { + integration: config.integrationLabel, + error_type: agentResult.error, + error_message: agentResult.message, + }); + + await wizardAbort({ + message: + "PostHog's model service is temporarily unavailable\n\n" + + 'This is a transient issue on our side, not a problem with your\n' + + 'project. Please wait a few minutes and run the wizard again.', + error: new WizardError( + `Provisioning error: ${agentResult.message ?? 
'unknown'}`, + { + integration: config.integrationLabel, + error_type: agentResult.error, + }, + ), + }); + } + if ( agentResult.error === AgentErrorType.RATE_LIMIT || agentResult.error === AgentErrorType.API_ERROR diff --git a/src/lib/agent/runner/shared/__tests__/provisioning-retry.test.ts b/src/lib/agent/runner/shared/__tests__/provisioning-retry.test.ts new file mode 100644 index 00000000..687d80ac --- /dev/null +++ b/src/lib/agent/runner/shared/__tests__/provisioning-retry.test.ts @@ -0,0 +1,84 @@ +import { + runWithProvisioningRetry, + type AgentRunResult, +} from '@lib/agent/runner/shared/provisioning-retry'; +import { AgentErrorType } from '@lib/agent/signals'; + +describe('runWithProvisioningRetry', () => { + const provisioning: AgentRunResult = { + error: AgentErrorType.PROVISIONING_ERROR, + message: 'API Error: 403 INVALID_PAYMENT_INSTRUMENT', + }; + const success: AgentRunResult = {}; + + const noWait = () => Promise.resolve(); + + it('does not retry when the first run succeeds', async () => { + const runOnce = jest.fn, []>(() => + Promise.resolve(success), + ); + const onRetry = jest.fn(); + + const result = await runWithProvisioningRetry(runOnce, onRetry, { + wait: noWait, + }); + + expect(result).toEqual(success); + expect(runOnce).toHaveBeenCalledTimes(1); + expect(onRetry).not.toHaveBeenCalled(); + }); + + it('retries on a provisioning error and returns the first non-provisioning result', async () => { + const runOnce = jest + .fn, []>() + .mockResolvedValueOnce(provisioning) + .mockResolvedValueOnce(success); + const onRetry = jest.fn(); + + const result = await runWithProvisioningRetry(runOnce, onRetry, { + delays: [10, 10], + wait: noWait, + }); + + expect(result).toEqual(success); + expect(runOnce).toHaveBeenCalledTimes(2); + expect(onRetry).toHaveBeenCalledTimes(1); + expect(onRetry).toHaveBeenCalledWith({ attempt: 1, total: 2, delayMs: 10 }); + }); + + it('gives up after exhausting the backoff schedule and surfaces the provisioning error', 
async () => { + const runOnce = jest.fn, []>(() => + Promise.resolve(provisioning), + ); + const onRetry = jest.fn(); + + const result = await runWithProvisioningRetry(runOnce, onRetry, { + delays: [10, 10], + wait: noWait, + }); + + // First run + 2 retries = 3 attempts, all still failing. + expect(result.error).toBe(AgentErrorType.PROVISIONING_ERROR); + expect(runOnce).toHaveBeenCalledTimes(3); + expect(onRetry).toHaveBeenCalledTimes(2); + }); + + it('does not retry a different error type', async () => { + const apiError: AgentRunResult = { + error: AgentErrorType.API_ERROR, + message: 'boom', + }; + const runOnce = jest.fn, []>(() => + Promise.resolve(apiError), + ); + const onRetry = jest.fn(); + + const result = await runWithProvisioningRetry(runOnce, onRetry, { + wait: noWait, + }); + + expect(result).toEqual(apiError); + expect(runOnce).toHaveBeenCalledTimes(1); + expect(onRetry).not.toHaveBeenCalled(); + }); +}); diff --git a/src/lib/agent/runner/shared/provisioning-retry.ts b/src/lib/agent/runner/shared/provisioning-retry.ts new file mode 100644 index 00000000..e8a2a50b --- /dev/null +++ b/src/lib/agent/runner/shared/provisioning-retry.ts @@ -0,0 +1,57 @@ +/** + * Retry helper for transient provisioning/billing failures from the LLM + * gateway's Bedrock fallback. + * + * The wizard forces `x-posthog-use-bedrock-fallback: true`, so a failed + * Anthropic call is re-routed to AWS Bedrock. Bedrock can answer with a 403 + * `INVALID_PAYMENT_INSTRUMENT` / AWS Marketplace subscription error — a + * PostHog-side billing condition that clears on its own and whose own message + * advises retrying after ~2 minutes. Rather than aborting the run with the + * terminal "report this" message, we wait and retry a couple of times. + */ + +import { AgentErrorType } from '../../signals'; + +export type AgentRunResult = { error?: AgentErrorType; message?: string }; + +/** + * Backoff schedule (ms) for retrying after a transient provisioning 403. 
/**
 * Backoff schedule (ms) for retrying after a transient provisioning 403. The
 * upstream error advises retrying after ~2 minutes, so we wait roughly that
 * long, twice, before giving up.
 */
export const PROVISIONING_RETRY_DELAYS_MS = [120_000, 120_000];

/** Default `wait` implementation: resolves after `ms` milliseconds. */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => setTimeout(resolve, ms));

/**
 * Run `runOnce`; while it returns a `PROVISIONING_ERROR` and backoff entries
 * remain, wait for the next delay and run it again.
 *
 * @param runOnce Performs a single agent run. It is invoked once up front and
 *   then once more per retry, so at most `delays.length + 1` times. Rejections
 *   are not caught here and propagate to the caller.
 * @param onRetry Called synchronously before each wait so the caller can tell
 *   the user what is happening. `attempt` is the 1-based retry number, `total`
 *   is the number of entries in the backoff schedule, and `delayMs` is the
 *   upcoming wait.
 * @param opts Injection points so tests don't actually sleep:
 *   `delays` overrides {@link PROVISIONING_RETRY_DELAYS_MS} and `wait`
 *   overrides the `setTimeout`-based sleep.
 * @returns The last result obtained. It may still be a provisioning error if
 *   every attempt failed, leaving the caller to surface a friendly
 *   transient-error message. Any other result (success or a different error
 *   type) is returned immediately without further retries.
 */
export async function runWithProvisioningRetry(
  runOnce: () => Promise<AgentRunResult>,
  onRetry: (info: { attempt: number; total: number; delayMs: number }) => void,
  opts: {
    delays?: readonly number[];
    wait?: (ms: number) => Promise<void>;
  } = {},
): Promise<AgentRunResult> {
  const delays = opts.delays ?? PROVISIONING_RETRY_DELAYS_MS;
  const wait = opts.wait ?? sleep;

  let result = await runOnce();
  // Iterating the schedule itself (rather than indexing into it) keeps
  // `delayMs` a definite number and bounds the retries by the schedule length.
  for (const [index, delayMs] of delays.entries()) {
    // Only the transient provisioning error is retryable; anything else is final.
    if (result.error !== AgentErrorType.PROVISIONING_ERROR) break;
    onRetry({ attempt: index + 1, total: delays.length, delayMs });
    await wait(delayMs);
    result = await runOnce();
  }
  return result;
}
+ */ + PROVISIONING_ERROR = 'WIZARD_PROVISIONING_ERROR', /** Generic API error */ API_ERROR = 'WIZARD_API_ERROR', /** YARA scanner detected a security violation */