From e7057e7ace1ccdc8990ea5135a0fc8f4f5b02e78 Mon Sep 17 00:00:00 2001 From: "posthog[bot]" <206114724+posthog[bot]@users.noreply.github.com> Date: Wed, 1 Jul 2026 23:32:47 +0000 Subject: [PATCH] fix(api): make Slack connection check resilient to transient failures `fetchSlackConnected` issued a bare `axios.get` with no timeout or retry, so any transient network blip against `/integrations/` threw and the callers captured it as an exception. On a non-critical post-install poll that already degrades gracefully to the connect nudge, this produced steadily accumulating error-tracking noise across several transient failure modes (gateway 504s, socket resets, TLS-handshake disconnects). Add a short timeout plus a small retry/backoff for gateway 5xx and retryable socket/TLS errors. Exhausted transient failures now degrade quietly to "not connected" without calling `captureException`; only genuine, non-transient errors (401/403, malformed responses) still throw for the caller to report. Generated-By: PostHog Code Task-Id: 54188ec4-3681-47ed-9228-a0628b42af27 --- .../__tests__/fetch-slack-connected.test.ts | 164 ++++++++++++++++++ src/lib/api.ts | 125 +++++++++++-- src/ui/tui/screens/SlackConnectScreen.tsx | 12 +- 3 files changed, 281 insertions(+), 20 deletions(-) create mode 100644 src/lib/__tests__/fetch-slack-connected.test.ts diff --git a/src/lib/__tests__/fetch-slack-connected.test.ts b/src/lib/__tests__/fetch-slack-connected.test.ts new file mode 100644 index 00000000..1504cef6 --- /dev/null +++ b/src/lib/__tests__/fetch-slack-connected.test.ts @@ -0,0 +1,164 @@ +import axios, { AxiosError, CanceledError } from 'axios'; +import { fetchSlackConnected, ApiError } from '@lib/api'; + +// Partially mock axios: stub `get`, keep the real `isAxiosError` / `isCancel` +// type guards and the real `AxiosError` / `CanceledError` classes so the +// transient-vs-genuine classifier behaves as it does in production. +vi.mock('axios', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + default: { ...actual.default, get: vi.fn() }, + }; +}); + +// Avoid real backoff delays. +vi.mock('@lib/helper-functions', () => ({ + sleep: vi.fn(() => Promise.resolve()), +})); + +vi.mock('@utils/analytics', () => ({ + analytics: { captureException: vi.fn() }, +})); + +const mockedGet = axios.get as unknown as ReturnType; + +const ACCESS_TOKEN = 'phx_test'; +const PROJECT_ID = 42; +const BASE_URL = 'https://us.posthog.com'; + +function ok(kinds: string[]) { + return { data: { results: kinds.map((kind) => ({ kind })) } }; +} + +/** A gateway 5xx AxiosError (has an HTTP response). */ +function httpError(status: number): AxiosError { + return new AxiosError( + `Request failed with status code ${status}`, + 'ERR_BAD_RESPONSE', + undefined, + {}, + { status, data: {}, statusText: '', headers: {}, config: {} as never }, + ); +} + +/** A transport-layer AxiosError (no HTTP response) carrying a code. */ +function networkError(message: string, code?: string): AxiosError { + return new AxiosError(message, code); +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe('fetchSlackConnected', () => { + it('returns true when a slack integration is present', async () => { + mockedGet.mockResolvedValueOnce(ok(['slack', 'github'])); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).resolves.toBe(true); + expect(mockedGet).toHaveBeenCalledTimes(1); + }); + + it('returns false when no slack integration is present', async () => { + mockedGet.mockResolvedValueOnce(ok(['github'])); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).resolves.toBe(false); + }); + + it('sends a timeout on the request', async () => { + mockedGet.mockResolvedValueOnce(ok([])); + await fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL); + expect(mockedGet.mock.calls[0][1]).toMatchObject({ + timeout: expect.any(Number), + }); + }); + + it('retries a 504 and succeeds on a later attempt', async () => { + mockedGet + .mockRejectedValueOnce(httpError(504)) + .mockResolvedValueOnce(ok(['slack'])); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).resolves.toBe(true); + expect(mockedGet).toHaveBeenCalledTimes(2); + }); + + it.each([ + ['504 gateway timeout', () => httpError(504)], + ['502 bad gateway', () => httpError(502)], + ['503 service unavailable', () => httpError(503)], + [ + 'read EADDRNOTAVAIL', + () => networkError('read EADDRNOTAVAIL', 'EADDRNOTAVAIL'), + ], + ['ECONNRESET', () => networkError('socket hang up', 'ECONNRESET')], + [ + 'axios timeout', + () => networkError('timeout of 5000ms exceeded', 'ECONNABORTED'), + ], + [ + 'TLS handshake disconnect (no code)', + () => + networkError( + 'Client network socket disconnected before secure TLS connection was established', + ), + ], + ])( + 'degrades to false (no throw) after exhausting retries on %s', + async (_label, makeError) => { + mockedGet.mockRejectedValue(makeError()); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).resolves.toBe(false); + // 1 initial attempt + 2 retries. + expect(mockedGet).toHaveBeenCalledTimes(3); + }, + ); + + it.each([ + ['401 unauthorized', () => httpError(401)], + ['403 forbidden', () => httpError(403)], + ['404 not found', () => httpError(404)], + ])( + 'throws on genuine, non-transient %s without retrying', + async (_l, makeError) => { + mockedGet.mockRejectedValue(makeError()); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).rejects.toBeInstanceOf(ApiError); + expect(mockedGet).toHaveBeenCalledTimes(1); + }, + ); + + it('throws an ApiError on a malformed 200 response', async () => { + mockedGet.mockResolvedValueOnce({ data: { unexpected: true } }); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).rejects.toBeInstanceOf(ApiError); + expect(mockedGet).toHaveBeenCalledTimes(1); + }); + + it('propagates a caller-initiated cancellation without retrying', async () => { + mockedGet.mockRejectedValue(new CanceledError('canceled')); + await expect( + fetchSlackConnected(ACCESS_TOKEN, PROJECT_ID, BASE_URL), + ).rejects.toBeInstanceOf(CanceledError); + expect(mockedGet).toHaveBeenCalledTimes(1); + }); + + it('throws without a request when the signal is already aborted', async () => { + const controller = new AbortController(); + controller.abort(); + await expect( + fetchSlackConnected( + ACCESS_TOKEN, + PROJECT_ID, + BASE_URL, + controller.signal, + ), + ).rejects.toBeInstanceOf(ApiError); + expect(mockedGet).not.toHaveBeenCalled(); + }); +}); diff --git a/src/lib/api.ts b/src/lib/api.ts index ba478a82..8c36fc51 100644 --- a/src/lib/api.ts +++ b/src/lib/api.ts @@ -2,6 +2,7 @@ import axios, { AxiosError } from 'axios'; import { z } from 'zod'; import { analytics } from '@utils/analytics'; import { WIZARD_USER_AGENT } from './constants'; +import { sleep } from './helper-functions'; /** * User payload from `/api/users/@me/`. Schema typed for the fields the @@ -234,11 +235,72 @@ const IntegrationsResponseSchema = z.object({ results: z.array(z.object({ kind: z.string().nullish() }).passthrough()), }); +// The Slack check runs on a non-critical post-install step and against a +// flaky-under-load gateway, so transient failures are expected. Ride them +// out with a short timeout plus a couple of quick retries rather than +// surfacing every network blip. +const SLACK_CHECK_TIMEOUT_MS = 5000; +// One quick retry, then a slightly longer one — enough to clear a single +// gateway hiccup or socket reset without visibly stalling the 3s poll +// (worst case ~1.3s of added backoff on top of request time). +const SLACK_CHECK_BACKOFFS_MS = [300, 1000]; + +// Gateway-class statuses worth retrying. A 401/403/404 (or anything else) +// is a genuine error the caller should report, not a transient blip. +const TRANSIENT_STATUSES = new Set([502, 503, 504]); + +// Socket/transport error codes that routinely recover on a retry. Includes +// the axios client-side timeout (ECONNABORTED) and the DNS-level EAI_AGAIN. +const TRANSIENT_NETWORK_CODES = new Set([ + 'ECONNRESET', + 'ECONNABORTED', + 'ETIMEDOUT', + 'EADDRNOTAVAIL', + 'ENETUNREACH', + 'ENETDOWN', + 'EPIPE', + 'EAI_AGAIN', +]); + +// Some transport failures arrive as a bare Error with a missing or +// unhelpful `code` — most notably the TLS-handshake disconnect ("Client +// network socket disconnected before secure TLS connection was +// established"). Fall back to matching those by message. +const TRANSIENT_MESSAGE_RE = + /socket disconnected|before secure tls|network socket|EADDRNOTAVAIL|ECONNRESET|ETIMEDOUT/i; + +/** + * Classify an error from the Slack check as a transient network failure + * (retry / degrade quietly) vs. a genuine error (report). Only axios + * errors are considered; anything else is treated as genuine. + */ +function isTransientSlackCheckError(error: unknown): boolean { + if (!axios.isAxiosError(error)) return false; + const status = error.response?.status; + // Got an HTTP response → only gateway-class 5xx are transient. + if (status !== undefined) return TRANSIENT_STATUSES.has(status); + // No response → transport/network-layer failure. + if (error.code && TRANSIENT_NETWORK_CODES.has(error.code)) return true; + const cause = + (error.cause as { message?: string } | undefined)?.message ?? ''; + return TRANSIENT_MESSAGE_RE.test(`${error.message} ${cause}`); +} + /** * Check whether the project already has a Slack integration connected. - * Requires the `integration:read` scope. Throws on failure — callers - * (including the SlackConnectScreen poll) decide how to degrade and - * are responsible for capturing the error exactly once. + * Requires the `integration:read` scope. + * + * Resilient to transient failures: a short timeout plus a small + * retry/backoff for gateway 5xx and retryable socket/TLS errors. When the + * transient budget is exhausted, degrades to "not connected" (returns + * `false`) WITHOUT throwing — this runs on a non-critical post-install + * step where the caller already falls back to the connect nudge, so a + * flaky network shouldn't generate error-tracking noise. + * + * Throws only on genuine, non-transient errors (401/403, malformed + * response, etc.). The caller (the SlackConnectScreen poll) is responsible + * for capturing those exactly once. A caller-initiated abort propagates as + * a cancellation for the caller's teardown guard to swallow. */ export async function fetchSlackConnected( accessToken: string, @@ -246,19 +308,50 @@ export async function fetchSlackConnected( baseUrl: string, signal?: AbortSignal, ): Promise { - const response = await axios.get( - `${baseUrl}/api/projects/${projectId}/integrations/`, - { - headers: { - Authorization: `Bearer ${accessToken}`, - 'User-Agent': WIZARD_USER_AGENT, - }, - signal, - }, - ); - const parsed = IntegrationsResponseSchema.safeParse(response.data); - if (!parsed.success) return false; - return parsed.data.results.some((i) => i.kind === 'slack'); + for (let attempt = 0; attempt <= SLACK_CHECK_BACKOFFS_MS.length; attempt++) { + if (attempt > 0) await sleep(SLACK_CHECK_BACKOFFS_MS[attempt - 1]); + // Don't spend another attempt if the caller tore the screen down. + if (signal?.aborted) throw new ApiError('Slack connection check aborted'); + try { + const response = await axios.get( + `${baseUrl}/api/projects/${projectId}/integrations/`, + { + headers: { + Authorization: `Bearer ${accessToken}`, + 'User-Agent': WIZARD_USER_AGENT, + }, + signal, + timeout: SLACK_CHECK_TIMEOUT_MS, + }, + ); + const parsed = IntegrationsResponseSchema.safeParse(response.data); + if (!parsed.success) { + // A 200 with an unexpected shape means the API contract changed + // under us — a genuine bug, not a network blip. Report it. + throw new ApiError( + 'Invalid response format while checking Slack connection', + ); + } + return parsed.data.results.some((i) => i.kind === 'slack'); + } catch (error) { + // Caller-initiated abort (screen teardown): propagate without + // retrying. The poll's `cancelled` guard swallows it so it never + // reaches captureException. + if (axios.isCancel(error) || signal?.aborted) throw error; + + const transient = isTransientSlackCheckError(error); + // Retry transient failures until the backoff budget is spent. + if (transient && attempt < SLACK_CHECK_BACKOFFS_MS.length) continue; + // Exhausted transient → degrade quietly to "not connected". + if (transient) return false; + // Genuine, non-transient error → let the caller report it. + throw error instanceof ApiError + ? error + : handleApiError(error, 'check Slack connection'); + } + } + // Unreachable: the loop always returns or throws on the final attempt. + return false; } export function handleApiError(error: unknown, operation: string): ApiError { diff --git a/src/ui/tui/screens/SlackConnectScreen.tsx b/src/ui/tui/screens/SlackConnectScreen.tsx index 9db8f74d..a049525a 100644 --- a/src/ui/tui/screens/SlackConnectScreen.tsx +++ b/src/ui/tui/screens/SlackConnectScreen.tsx @@ -155,10 +155,14 @@ export const SlackConnectScreen = ({ store }: SlackConnectScreenProps) => { }) .catch((err: unknown) => { if (cancelled) return; - // Capture once and stop polling — repeating a failing call - // every tick would spam error tracking. The nudge copy is - // the fallback either way; a failed check counts as not - // connected so the screen doesn't sit on the loading state. + // Only genuine, non-transient errors reach here now — + // `fetchSlackConnected` rides out flaky-network blips itself and + // degrades to "not connected" without throwing. So capture once + // and stop polling: a persistent 401/403/malformed response + // won't fix itself, and repeating it every tick would spam error + // tracking. The nudge copy is the fallback either way; a failed + // check counts as not connected so the screen doesn't sit on the + // loading state. if (store.session.slackConnected === null) { store.setSlackConnected(false); }