diff --git a/AGENTS.md b/AGENTS.md index 3a8b08cc9..1cd5c66e4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -15,6 +15,14 @@ - Prefer composable primitives (signals, hooks, utilities) over deep inheritance or implicit global state. - When adding platform integrations (SSE, IPC, SDK), isolate them in thin adapters that surface typed events/actions. +## PR Review Principles +- **Check for regressions first.** Before approving any change, verify the existing behavior still works — run the test suite, test manually on both mobile and desktop, and confirm no unintended side effects in related subsystems. +- **Look for better possible implementations.** Don't settle for the first working approach. Ask: is there a simpler way? Does the codebase already have a pattern for this? Would a different abstraction reduce future maintenance cost? +- **Be the PR gatekeeper.** Every line merged becomes technical debt someone else will read. If it's unclear, fragile, or lacks tests, push back. The reviewer's job is to protect the codebase, not to be nice. +- **Be ruthless about code quality.** Surface-level "LGTM" is negligence. Inspect: naming, error handling, edge cases, type safety, logging (is it useful or just noise?), performance (any unnecessary allocations or re-renders?), and whether the change respects existing architectural boundaries. +- **Test before responding to review comments.** Never reply "works for me" or "this fixes it" without deploying the exact commit and verifying the behavior. Untested responses waste reviewer time and erode trust. +- **UI and server must be built from the same version.** Version mismatches between UI and server cause subtle bugs (e.g., sessions disappearing). Always build both from the same commit before testing. + ## Multi-Language Support (i18n) The UI uses a small custom i18n layer (no ICU/messageformat). When building features, never hardcode user-visible strings. diff --git a/MOBILE-SSE-FIX-IMPLEMENTATION-GUIDE.md b/MOBILE-SSE-FIX-IMPLEMENTATION-GUIDE.md new file mode 100644 index 000000000..17d20e71c --- /dev/null +++ b/MOBILE-SSE-FIX-IMPLEMENTATION-GUIDE.md @@ -0,0 +1,289 @@ +# Mobile SSE Resilience Fix - Implementation Guide + +**Reference:** See `MOBILE-SSE-RESILIENCE-ANALYSIS.md` for root cause details + +## Implementation: Option A (Pong Retry Logic) + +This guide provides step-by-step instructions for implementing retry logic on the pong HTTP POST. + +### Changes Required + +#### 1. Create Utility: Retry Helper with Exponential Backoff + +**File:** `packages/ui/src/lib/retry-utils.ts` (new) + +```typescript +interface RetryOptions { + maxAttempts?: number + initialDelayMs?: number + maxDelayMs?: number + backoffMultiplier?: number +} + +export async function retryWithBackoff( + fn: () => Promise, + options: RetryOptions = {}, +): Promise { + const { + maxAttempts = 3, + initialDelayMs = 100, + maxDelayMs = 5000, + backoffMultiplier = 2, + } = options + + let lastError: Error | null = null + let delayMs = initialDelayMs + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + return await fn() + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)) + + if (attempt < maxAttempts) { + await new Promise((resolve) => setTimeout(resolve, delayMs)) + delayMs = Math.min(delayMs * backoffMultiplier, maxDelayMs) + } + } + } + + throw new Error( + `Failed after ${maxAttempts} attempts: ${lastError?.message || "Unknown error"}`, + ) +} +``` + +#### 2. Update Server Events Pong Handler + +**File:** `packages/ui/src/lib/server-events.ts` + +**Change at line 47-56:** + +```typescript +// BEFORE +(payload) => { + void serverApi + .sendClientConnectionPong({ + ...getClientIdentity(), + pingTs: payload.ts, + }) + .catch(() => { + debugWarn("sse", "Pong failed (connection already closed)") + }) +} + +// AFTER +(payload) => { + const identity = getClientIdentity() + const pongPayload = { ...identity, pingTs: payload.ts } + + void retryWithBackoff( + () => serverApi.sendClientConnectionPong(pongPayload), + { + maxAttempts: 3, + initialDelayMs: 100, + maxDelayMs: 2000, + } + ) + .catch((error) => { + debugWarn("sse", `Pong failed after retries: ${error.message}`) + }) +} +``` + +**Also add import at top:** +```typescript +import { retryWithBackoff } from "./retry-utils" +``` + +#### 3. Add Logging for Observability + +**File:** `packages/ui/src/lib/server-events.ts` + +Enhance the pong handler to log retry attempts: + +```typescript +(payload) => { + const identity = getClientIdentity() + const pongPayload = { ...identity, pingTs: payload.ts } + let attemptCount = 0 + + void retryWithBackoff( + async () => { + attemptCount++ + if (attemptCount > 1) { + logSse(`Pong retry attempt ${attemptCount}`) + } + return serverApi.sendClientConnectionPong(pongPayload) + }, + { + maxAttempts: 3, + initialDelayMs: 100, + maxDelayMs: 2000, + } + ) + .catch((error) => { + debugWarn("sse", `Pong failed: ${error.message}`) + // Optionally trigger reconnect if pong fails completely + // scheduleReconnect() + }) +} +``` + +### Testing Checklist + +#### Unit Tests + +**File:** `packages/ui/src/lib/retry-utils.test.ts` (new) + +```typescript +import { describe, it, expect, vi } from "vitest" +import { retryWithBackoff } from "./retry-utils" + +describe("retryWithBackoff", () => { + it("succeeds on first attempt", async () => { + const fn = vi.fn().mockResolvedValue("success") + const result = await retryWithBackoff(fn) + expect(result).toBe("success") + expect(fn).toHaveBeenCalledTimes(1) + }) + + it("retries on failure and succeeds", async () => { + const fn = vi + .fn() + .mockRejectedValueOnce(new Error("fail")) + .mockResolvedValue("success") + const result = await retryWithBackoff(fn) + expect(result).toBe("success") + expect(fn).toHaveBeenCalledTimes(2) + }) + + it("fails after max attempts", async () => { + const fn = vi.fn().mockRejectedValue(new Error("always fails")) + await expect(retryWithBackoff(fn, { maxAttempts: 2 })).rejects.toThrow() + expect(fn).toHaveBeenCalledTimes(2) + }) + + it("respects backoff delays", async () => { + const fn = vi.fn().mockRejectedValue(new Error("fail")) + const startTime = Date.now() + + await expect( + retryWithBackoff(fn, { + maxAttempts: 3, + initialDelayMs: 50, + backoffMultiplier: 2, + }) + ).rejects.toThrow() + + const elapsed = Date.now() - startTime + // 50ms + 100ms = 150ms minimum (plus jitter) + expect(elapsed).toBeGreaterThanOrEqual(150) + }) +}) +``` + +#### Integration Tests + +**File:** `packages/ui/src/lib/server-events.test.ts` (update existing) + +Add test for pong retry on network failure: + +```typescript +it("retries pong on transient network failure", async () => { + const mockApi = { + sendClientConnectionPong: vi + .fn() + .mockRejectedValueOnce(new Error("Network timeout")) + .mockResolvedValue(undefined), + } + + // Trigger ping handler + const pingPayload = { ts: Date.now() } + await pingHandler(pingPayload) // Assuming handler is extracted + + // Verify retry logic was invoked + await vi.advanceTimersByTimeAsync(100) + expect(mockApi.sendClientConnectionPong).toHaveBeenCalledTimes(2) +}) +``` + +#### Manual Testing + +1. **Throttle Network (DevTools → Network → 3G)** + - Send message + - Verify response arrives (no "Abort" button needed) + - Check console for `Pong retry attempt` logs + +2. **Intermittent Connectivity** + - Use network simulator + - Create 500ms packet loss window during pong + - Verify retry succeeds after loss ends + +3. **Multiple Messages** + - Send 5 messages rapidly + - Verify all receive responses (with retries in logs) + - No dropped messages + +### Metrics to Monitor + +Add to server logs for post-deployment monitoring: + +**File:** `packages/server/src/server/routes/events.ts` + +Track pong success/failure rate: + +```typescript +interface PongStats { + totalPings: number + successfulPongs: number + failedPongs: number +} + +const stats = new Map() + +app.post("/api/client-connections/pong", (request, reply) => { + const body = PongBodySchema.parse(request.body ?? {}) + if (!deps.connectionManager.pong(body)) { + // Track failed pongs + stats.set(body.clientId, { + ...stats.get(body.clientId), + failedPongs: (stats.get(body.clientId)?.failedPongs ?? 0) + 1, + }) + reply.code(404).send({ error: "Client connection not found" }) + return + } + + // Track successful pongs + stats.set(body.clientId, { + ...stats.get(body.clientId), + successfulPongs: (stats.get(body.clientId)?.successfulPongs ?? 0) + 1, + }) + reply.code(204).send() +}) +``` + +### Success Criteria + +- ✓ No console errors for pong failures on 3G network +- ✓ Messages received responses within 5 seconds (including retry time) +- ✓ No "Abort" button needed for messages sent on poor networks +- ✓ Retry logs show 1-2 retries on slow connections +- ✓ Unit tests pass (100% coverage of retry-utils) +- ✓ Integration tests pass (pong retry scenarios) + +### Rollback Plan + +If issues appear post-deployment: +1. Reduce `maxAttempts` from 3 to 2 +2. Increase `initialDelayMs` from 100ms to 200ms +3. Revert to previous version if retry logic causes new issues + +### Next Steps After Option A + +If retry logic doesn't fully solve mobile issues: +1. Monitor metrics for pong failure patterns +2. Identify if failures are permanent or transient +3. Consider escalating to Option B (SSE-based pong) + +See `MOBILE-SSE-RESILIENCE-ANALYSIS.md` for Option B details. diff --git a/MOBILE-SSE-RESILIENCE-ANALYSIS.md b/MOBILE-SSE-RESILIENCE-ANALYSIS.md new file mode 100644 index 000000000..d36775fc8 --- /dev/null +++ b/MOBILE-SSE-RESILIENCE-ANALYSIS.md @@ -0,0 +1,178 @@ +# Mobile SSE Connection Resilience - Root Cause Analysis + +**Date:** May 28, 2026 +**Branch:** `fix/mobile-network-resilience` +**Status:** Root cause identified, fix pending implementation + +## Problem Statement + +On mobile networks with poor connectivity or high latency, users experience: + +1. Message sent but no response received +2. "Abort" button pressed to cancel the waiting request +3. Response only arrives after: + - User sends another message (triggers reconnection), OR + - Page is manually refreshed + +**Expected behavior:** Server processes message and sends response immediately + +## Detailed Analysis + +### Server-Side Flow (events.ts, connection-manager.ts) + +``` +Server → Client: ping (every 15 seconds, line 53 in events.ts) + heartbeat = setInterval(() => { reply.raw.write(...) }, 15000) + +Client → Server: pong (HTTP POST to /api/client-connections/pong) + +Server waiting: 45 second timeout without pong (STALE_CONNECTION_TIMEOUT_MS, line 3 in connection-manager.ts) + If no pong received → connection marked as stale + Sweep interval: 5 seconds (STALE_SWEEP_INTERVAL_MS) +``` + +### Client-Side Flow (server-events.ts) + +```typescript +// Line 47-56: When ping received +(payload) => { + void serverApi + .sendClientConnectionPong({ + ...getClientIdentity(), + pingTs: payload.ts, + }) + .catch(() => { + debugWarn("sse", "Pong failed (connection already closed)") + }) +} +``` + +The pong is sent via **separate HTTP POST** (api-client.ts:548-552): +```typescript +sendClientConnectionPong(payload): Promise { + return request("/api/client-connections/pong", { + method: "POST", + body: JSON.stringify(payload), + }) +} +``` + +## Root Cause + +**On mobile networks with poor connectivity:** + +1. **User sends message** → arrives at server OK, processing begins +2. **Server sends ping** via established SSE connection ✓ (works fine) +3. **Client receives ping**, attempts to respond with **HTTP POST pong** +4. **POST fails** due to: + - Network timeout (mobile latency) + - Network switch (WiFi ↔ cellular) + - High packet loss + - Result: `Client connection not found` error + +5. **Server never receives pong** → assumes client is dead +6. **Server closes SSE connection after 45s timeout** +7. **Message responses get stuck in event queue** (unable to send on closed connection) +8. **Client shows "Abort" error** to user + +**Why it works after sending another message:** +- New message triggers reconnection +- New SSE connection established +- Queued events from previous message now deliverable +- Both old and new responses arrive together + +**Evidence from logs:** +``` +[20:53:14.814] ERROR actions: sendMessage failed: No network connection +[20:53:43.500] ERROR api: Request failed: POST /api/client-connections/pong - Client connection not found +[20:53:43.501] WARN sse: Pong failed (connection already closed) +``` + +Timeline: +- Message sent (14.8s) +- ~29 seconds later: Pong POST fails → server closes connection + +## Critical Path + +1. **events.ts:27-79** — SSE connection setup with heartbeat ping +2. **connection-manager.ts:73-96** — Pong reception and timeout sweep +3. **server-events.ts:47-56** — Client pong response (HTTP POST) +4. **api-client.ts:548-552** — HTTP POST implementation (no retry logic) + +## Proposed Solutions + +### Option A: Add Retry Logic to Pong POST (Quick Win) +- **File:** `packages/ui/src/lib/server-events.ts` +- **Change:** Implement exponential backoff retry for pong +- **Risk:** Low +- **Benefit:** Improves reliability for transient failures +- **Limitation:** Still uses separate HTTP request + +**Implementation:** +```typescript +// Retry up to 3 times with exponential backoff +sendPongWithRetry(payload, maxRetries = 3, delayMs = 100) +``` + +### Option B: Send Pong via SSE Channel (Robust Solution) +- **File:** `packages/server/src/server/routes/events.ts` +- **Change:** Allow client to send messages through SSE (not just receive) +- **Risk:** Medium (protocol change) +- **Benefit:** No separate HTTP request, reuses working connection +- **Limitation:** Requires client/server coordination + +**Implementation:** +``` +Client → Server: Can send special "codenomad.client.pong" event through SSE + Uses existing EventSource connection (no new HTTP request) +``` + +## Recommendation + +**Start with Option A** (retry logic) because: +1. ✓ Low risk, easy to verify +2. ✓ Handles transient network issues +3. ✓ Quick to implement and deploy +4. ✗ Still has fundamental limitation (separate HTTP request) + +**If Option A insufficient, move to Option B** for comprehensive solution. + +## Test Scenarios + +To verify fix effectiveness: + +1. **Slow network (throttle to 3G):** + - Send message, verify response arrives + - No "Abort" button needed + +2. **Network switch (WiFi → mobile):** + - Send message on WiFi + - Switch to mobile mid-transmission + - Verify response still arrives (with delay if needed) + +3. **High packet loss:** + - Use network simulator + - 10-20% packet loss + - Verify pong retries succeed + +4. **Intermittent connectivity:** + - Brief disconnects between ping/pong + - Verify graceful recovery + +## Files Involved + +**Server:** +- `packages/server/src/server/routes/events.ts` — SSE setup, ping interval +- `packages/server/src/clients/connection-manager.ts` — Pong timeout logic + +**Client:** +- `packages/ui/src/lib/server-events.ts` — Pong response handler +- `packages/ui/src/lib/api-client.ts` — HTTP POST implementation +- `packages/ui/src/lib/event-source-handlers.ts` — SSE event listeners + +## Next Steps + +1. Implement Option A (pong retry logic) +2. Test on mobile networks with poor connectivity +3. Monitor logs for retry patterns +4. If needed, escalate to Option B (SSE-based pong) diff --git a/packages/server/scripts/copy-ui-dist.mjs b/packages/server/scripts/copy-ui-dist.mjs index fdbe4818b..8448382a2 100644 --- a/packages/server/scripts/copy-ui-dist.mjs +++ b/packages/server/scripts/copy-ui-dist.mjs @@ -1,5 +1,5 @@ #!/usr/bin/env node -import { cpSync, existsSync, mkdirSync, rmSync } from "fs" +import { cpSync, existsSync, mkdirSync, rmSync, readFileSync, writeFileSync } from "fs" import path from "path" import { fileURLToPath } from "url" @@ -18,4 +18,16 @@ rmSync(targetDir, { recursive: true, force: true }) mkdirSync(targetDir, { recursive: true }) cpSync(uiDistDir, targetDir, { recursive: true }) +// Patch index.html to unregister any existing service worker +const htmlPath = path.join(targetDir, "index.html") +if (existsSync(htmlPath)) { + let html = readFileSync(htmlPath, "utf-8") + const script = '' + if (!html.includes(script)) { + html = html.replace("", script + "") + writeFileSync(htmlPath, html, "utf-8") + console.log("[copy-ui-dist] Injected SW unregister script") + } +} + console.log(`[copy-ui-dist] Copied UI bundle from ${uiDistDir} -> ${targetDir}`) diff --git a/packages/server/src/server/http-server.ts b/packages/server/src/server/http-server.ts index 71a325438..4b9851a42 100644 --- a/packages/server/src/server/http-server.ts +++ b/packages/server/src/server/http-server.ts @@ -35,6 +35,7 @@ import { InstanceStore } from "../storage/instance-store" import { BackgroundProcessManager } from "../background-processes/manager" import type { AuthManager } from "../auth/manager" import { registerAuthRoutes } from "./routes/auth" +import { registerDebugLogRoutes } from "./routes/debug-log" import { sendUnauthorized, wantsHtml } from "../auth/http-auth" import type { SpeechService } from "../speech/service" import { ClientConnectionManager } from "../clients/connection-manager" @@ -274,6 +275,7 @@ export function createHttpServer(deps: HttpServerDeps) { registerFilesystemRoutes(app, { fileSystemBrowser: deps.fileSystemBrowser }) registerConfigFileRoutes(app) registerMetaRoutes(app, { serverMeta: deps.serverMeta }) + registerDebugLogRoutes(app) registerEventRoutes(app, { eventBus: deps.eventBus, registerClient: registerSseClient, diff --git a/packages/ui/src/App.tsx b/packages/ui/src/App.tsx index 2aa430fcd..e979b9d65 100644 --- a/packages/ui/src/App.tsx +++ b/packages/ui/src/App.tsx @@ -1,5 +1,7 @@ import { Component, For, Show, createMemo, createEffect, createSignal, onMount, onCleanup } from "solid-js" import { Dialog } from "@kobalte/core/dialog" +import NetworkStatusBanner from "./components/network-status-banner" +import DebugOverlay from "./components/debug-overlay" import { Toaster } from "solid-toast" import useMediaQuery from "@suid/material/useMediaQuery" import { Minimize2 } from "lucide-solid" @@ -555,6 +557,7 @@ const App: Component = () => {
+
) diff --git a/packages/ui/src/components/debug-overlay.tsx b/packages/ui/src/components/debug-overlay.tsx new file mode 100644 index 000000000..aab9167af --- /dev/null +++ b/packages/ui/src/components/debug-overlay.tsx @@ -0,0 +1,113 @@ +import { For, Show, createEffect, createSignal } from "solid-js" +import { entries, paused, visible, clearLog, toggleVisibility, togglePause, exportLog } from "../stores/debug-log" +import { serverEvents } from "../lib/server-events" +import { CODENOMAD_API_BASE } from "../lib/api-client" + +export default function DebugOverlay() { + let listRef: HTMLDivElement | undefined + const [sending, setSending] = createSignal(false) + const [copied, setCopied] = createSignal(false) + + createEffect(() => { + if (!paused() && listRef) { + queueMicrotask(() => { + listRef?.scrollTo({ top: listRef.scrollHeight, behavior: "smooth" }) + }) + } + }) + + async function copyLog() { + try { + const text = entries().map((e) => `[${e.ts}] ${e.level.toUpperCase()} ${e.source}: ${e.message}`).join("\n") + await navigator.clipboard.writeText(text) + setCopied(true) + setTimeout(() => setCopied(false), 2000) + } catch { + // fallback for insecure contexts + } + } + + async function sendToServer() { + setSending(true) + try { + const data = entries() + await fetch(`${CODENOMAD_API_BASE || ""}/api/debug-log`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ logs: data }), + }) + } catch { + // Silently fail — the user can use export instead + } finally { + setSending(false) + } + } + + function downloadLog() { + const blob = new Blob([exportLog()], { type: "application/json" }) + const url = URL.createObjectURL(blob) + const a = document.createElement("a") + a.href = url + a.download = `codenomad-debug-${Date.now()}.json` + a.click() + URL.revokeObjectURL(url) + } + + return ( + <> + +
+
+ Debug Log ({entries().length}){serverEvents.connected ? "" : " ⚠ disconnected"} +
+ + + + + + +
+
+
+ + {(entry) => ( +
+ [{entry.ts}] + {" "} + + {entry.level.toUpperCase()} + + {" "} + {entry.source}: + {" "} + {entry.message} +
+ )} +
+
+
+
+ + ) +} diff --git a/packages/ui/src/components/message-item.tsx b/packages/ui/src/components/message-item.tsx index 00785441f..e8e8ac58d 100644 --- a/packages/ui/src/components/message-item.tsx +++ b/packages/ui/src/components/message-item.tsx @@ -8,7 +8,7 @@ import MessagePart from "./message-part" import { copyToClipboard } from "../lib/clipboard" import { useI18n } from "../lib/i18n" import { showAlertDialog } from "../stores/alerts" -import { deleteMessage } from "../stores/session-actions" +import { deleteMessage, retrySendMessage } from "../stores/session-actions" import { isTauriHost } from "../lib/runtime-env" import type { DeleteHoverState } from "../types/delete-hover" import { useSpeech } from "../lib/hooks/use-speech" @@ -46,6 +46,30 @@ export default function MessageItem(props: MessageItemProps) { const [copied, setCopied] = createSignal(false) const [deletingMessage, setDeletingMessage] = createSignal(false) const [deletingUpTo, setDeletingUpTo] = createSignal(false) + const [showSending, setShowSending] = createSignal(false) + const [showGenerating, setShowGenerating] = createSignal(false) + + createEffect(() => { + if (props.record.status === "sending") { + setShowSending(true) + } else if (showSending()) { + const timer = setTimeout(() => setShowSending(false), 1500) + onCleanup(() => clearTimeout(timer)) + } else if (!showSending() && isUser() && props.record.status !== "error" && Date.now() - props.record.createdAt < 1500) { + setShowSending(true) + const timer = setTimeout(() => setShowSending(false), 1500) + onCleanup(() => clearTimeout(timer)) + } + }) + + createEffect(() => { + if (isGenerating()) { + setShowGenerating(true) + } else if (showGenerating()) { + const timer = setTimeout(() => setShowGenerating(false), 1500) + onCleanup(() => clearTimeout(timer)) + } + }) type ImagePreviewState = { url: string @@ -664,7 +688,7 @@ export default function MessageItem(props: MessageItemProps) {
⚠️ {errorMessage()}
- +
{t("messageItem.status.generating")}
@@ -763,14 +787,23 @@ export default function MessageItem(props: MessageItemProps) { }}
- +
{t("messageItem.status.sending")}
-
⚠ {t("messageItem.status.failedToSend")}
+
+ ⚠ {t("messageItem.status.failedToSend")} + +
diff --git a/packages/ui/src/components/message-section.tsx b/packages/ui/src/components/message-section.tsx index ea9fac707..2406677ff 100644 --- a/packages/ui/src/components/message-section.tsx +++ b/packages/ui/src/components/message-section.tsx @@ -15,6 +15,7 @@ import { copyToClipboard } from "../lib/clipboard" import { showToastNotification } from "../lib/notifications" import { showAlertDialog } from "../stores/alerts" import { deleteMessage, deleteMessagePart } from "../stores/session-actions" +import { visible as debugVisible, toggleVisibility as toggleDebug } from "../stores/debug-log" import type { InstanceMessageStore } from "../stores/message-v2/instance-store" import type { DeleteHoverState } from "../types/delete-hover" import { partHasRenderableText } from "../types/message" @@ -728,7 +729,9 @@ export default function MessageSection(props: MessageSectionProps) { const api = listApi() if (!api) return if (props.registerScrollToBottom) { - props.registerScrollToBottom(() => api.scrollToBottom({ immediate: true })) + props.registerScrollToBottom(() => { + api.scrollToBottom({ immediate: true }) + }) } }) @@ -1295,6 +1298,14 @@ export default function MessageSection(props: MessageSectionProps) { registerState={(state) => setListState(state)} renderControls={(state, api) => (
+