From 29a7b1b8bdbb509c784a305b55176e1e20537083 Mon Sep 17 00:00:00 2001 From: Vivek Raghunathan Date: Tue, 5 May 2026 21:32:37 -0700 Subject: [PATCH 1/2] fix(daemon): skip unresponsive targets on attach + per-call CDP timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs caused the daemon to hang silently on real-world setups (multiple heavy SPA tabs open, sleeping tabs, etc.): 1. attach_first_page picked targets[0] unconditionally and silently swallowed Page/DOM/Runtime/Network.enable timeouts. When the first real page was a wedged renderer (sleeping tab, busy SPA, frozen iframe), the daemon happily reported "attached" with a session ID, but every subsequent CDP call against that session hung forever. Now iterates real pages, abandons (and detaches) any target whose enables time out within 4s, and falls back to a fresh about:blank if every existing page is unresponsive. 2. handle() wrapped cdp.send_raw with no per-call timeout. The IPC layer's 5s read timeout protects the *calling* client, but the daemon's own event loop stayed jammed while await self.cdp.send_raw sat forever, starving every other connected client. Now bounded by BU_CDP_TIMEOUT (default 45s); times out individual calls cleanly with a recovery hint. Also wraps the stale-session re-attach path in try/except so re-attach failures surface as errors rather than propagating out of the handler. The override env var lets users bump it for legitimately slow workloads (large captureBeyondViewport screenshots, long Runtime.evaluate scripts). Repro before fix: open Chrome with heavy SPA tabs (Gmail, Cloudflare dashboard, BBC article, etc.) such that targets[0] is unresponsive, then run any helper. The CLI hung indefinitely with no output, error, or diagnostic. The daemon log only showed empty-message TimeoutErrors from the four .enable calls and then went silent. After fix, the daemon log clearly shows the iteration: enable Page on D58089BF...: TimeoutError: ... target 8F5C7B... (https://dit.example.com/) unresponsive, trying next enable Page on 2417ABE3...: TimeoutError: ... target 784EE4CB... (https://notarycouncil.org/) unresponsive, trying next attached 723889E3... (https://mail.google.com/...) session=4E670D2D... Tests: - test_attach_first_page_skips_unresponsive_targets — wedged target with hanging Page.enable; verifies the next live target wins. - test_attach_first_page_creates_blank_when_all_pages_unresponsive — every real page wedged; verifies fallback to fresh about:blank. - test_handle_times_out_wedged_cdp_call — Runtime.evaluate hangs forever; verifies BU_CDP_TIMEOUT bounds it with a clean error containing the method name and a recovery hint. All pre-existing tests still pass. --- src/browser_harness/daemon.py | 96 ++++++++++++++++++++++++++++------- tests/unit/test_daemon.py | 91 +++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 19 deletions(-) diff --git a/src/browser_harness/daemon.py b/src/browser_harness/daemon.py index 3410876d..e0091e5a 100644 --- a/src/browser_harness/daemon.py +++ b/src/browser_harness/daemon.py @@ -187,23 +187,54 @@ def __init__(self): self.stop = None # asyncio.Event, set inside start() async def attach_first_page(self): - """Attach to a real page (or any page). Sets self.session. Returns attached target or None.""" + """Attach to a real page (or any page). Sets self.session. Returns attached target or None. + + Iterates real pages so a single wedged tab (busy renderer, sleeping tab, + unresponsive page) does not pin the daemon to a dead session — previously + the daemon picked pages[0] unconditionally, silently swallowed Page.enable + timeouts, and every later CDP call against that session hung forever. + Falls back to a fresh about:blank if every existing page is unresponsive. + """ targets = (await self.cdp.send_raw("Target.getTargets"))["targetInfos"] pages = [t for t in targets if is_real_page(t)] - if not pages: - # No real pages — create one instead of attaching to omnibox popup - tid = (await self.cdp.send_raw("Target.createTarget", {"url": "about:blank"}))["targetId"] - log(f"no real pages found, created about:blank ({tid})") - pages = [{"targetId": tid, "url": "about:blank", "type": "page"}] - self.session = (await self.cdp.send_raw( - "Target.attachToTarget", {"targetId": pages[0]["targetId"], "flatten": True} - ))["sessionId"] - self.target_id = pages[0]["targetId"] - log(f"attached {pages[0]['targetId']} ({pages[0].get('url','')[:80]}) session={self.session}") - await self._enable_default_domains(self.session) - return pages[0] - - async def _enable_default_domains(self, session_id): + for p in pages: + sid = await self._try_attach(p) + if sid: + self.session = sid + self.target_id = p["targetId"] + log(f"attached {p['targetId']} ({p.get('url','')[:80]}) session={sid}") + return p + log(f"target {p['targetId']} ({p.get('url','')[:80]}) unresponsive, trying next") + # Either no real pages, or every real page was unresponsive — create a fresh tab. + tid = (await self.cdp.send_raw("Target.createTarget", {"url": "about:blank"}))["targetId"] + fresh = {"targetId": tid, "url": "about:blank", "type": "page"} + sid = await self._try_attach(fresh) + if not sid: + raise RuntimeError(f"could not attach to any page (created about:blank {tid} but it also failed to enable)") + self.session = sid + self.target_id = tid + log(f"no responsive pages found, created+attached about:blank ({tid}) session={sid}") + return fresh + + async def _try_attach(self, target): + """Attach to a single target and enable Page/DOM/Runtime/Network. + Returns sessionId on success, None if attach or any enable hangs/fails + (target is unresponsive — caller should try the next one).""" + try: + sid = (await asyncio.wait_for( + self.cdp.send_raw("Target.attachToTarget", {"targetId": target["targetId"], "flatten": True}), + timeout=5, + ))["sessionId"] + except Exception as e: + log(f"attach {target['targetId']} failed: {type(e).__name__}: {e}") + return None + if not await self._enable_default_domains(sid, strict=True): + try: await asyncio.wait_for(self.cdp.send_raw("Target.detachFromTarget", {"sessionId": sid}), timeout=2) + except Exception: pass + return None + return sid + + async def _enable_default_domains(self, session_id, strict=False): """Enable Page/DOM/Runtime/Network on a CDP session. Used by both initial attach and set_session (called after switch_tab/ @@ -216,16 +247,25 @@ async def _enable_default_domains(self, session_id): bounded by a single CDP round trip rather than four sequential ones — important on the set_session path, where the helper's IPC socket has a 5s read timeout. + + With strict=True (used during initial attach), returns False if any + enable failed so the caller can abandon the unresponsive target. + Without strict, returns True regardless — set_session has already + committed to the new target and partial enable is better than nothing. """ + results = [] async def enable_one(d): try: await asyncio.wait_for( self.cdp.send_raw(f"{d}.enable", session_id=session_id), timeout=4, ) + results.append(True) except Exception as e: - log(f"enable {d} on {session_id}: {e}") + log(f"enable {d} on {session_id}: {type(e).__name__}: {e}") + results.append(False) await asyncio.gather(*(enable_one(d) for d in ("Page", "DOM", "Runtime", "Network"))) + return all(results) if strict else True async def start(self): self.stop = asyncio.Event() @@ -331,14 +371,32 @@ async def disable_old(): # Browser-level Target.* calls must not use a session (stale or otherwise). # For everything else, explicit session in req wins; else default. sid = None if method.startswith("Target.") else (req.get("session_id") or self.session) + # Per-call CDP timeout. A wedged renderer (sleeping tab, infinite JS, + # frozen iframe) would otherwise block the daemon's event loop forever, + # starving every other connected client. The IPC layer's 5s read timeout + # protects the *calling* client, but the daemon itself stays jammed + # without this. Override via BU_CDP_TIMEOUT for legitimately slow calls + # (large captureBeyondViewport screenshots, long Runtime.evaluate scripts). + cdp_timeout = float(os.environ.get("BU_CDP_TIMEOUT", "45")) try: - return {"result": await self.cdp.send_raw(method, params, session_id=sid)} + return {"result": await asyncio.wait_for( + self.cdp.send_raw(method, params, session_id=sid), + timeout=cdp_timeout, + )} + except asyncio.TimeoutError: + return {"error": f"CDP {method} timed out after {cdp_timeout}s (session={sid}) — tab may be unresponsive; try ensure_real_tab() or restart_daemon()"} except Exception as e: msg = str(e) if "Session with given id not found" in msg and sid == self.session and sid: log(f"stale session {sid}, re-attaching") - if await self.attach_first_page(): - return {"result": await self.cdp.send_raw(method, params, session_id=self.session)} + try: + if await self.attach_first_page(): + return {"result": await asyncio.wait_for( + self.cdp.send_raw(method, params, session_id=self.session), + timeout=cdp_timeout, + )} + except Exception as e2: + return {"error": f"re-attach failed: {e2}"} return {"error": msg} diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py index d0c8c82e..251f7f75 100644 --- a/tests/unit/test_daemon.py +++ b/tests/unit/test_daemon.py @@ -243,3 +243,94 @@ async def run(): f"first set_session must run 4 enables concurrently " f"(observed peak = {peak}). No Network.disable should fire." ) + + +def test_attach_first_page_skips_unresponsive_targets(): + """If the first real page is wedged (Page.enable hangs forever), the + daemon must abandon it and try the next one — previously it pinned + self.session to the dead target and every later CDP call hung.""" + class _SelectivelyWedgedCDP: + """Page.enable hangs forever for target-WEDGED, succeeds for target-LIVE.""" + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "target-WEDGED", "type": "page", "url": "https://wedged.example/"}, + {"targetId": "target-LIVE", "type": "page", "url": "https://live.example/"}, + ]} + if method == "Target.attachToTarget": + tid = params["targetId"] + return {"sessionId": f"session-{tid}"} + if method == "Target.detachFromTarget": + return {} + if method.endswith(".enable") and session_id == "session-target-WEDGED": + # Hang forever — caller must use asyncio.wait_for to escape. + await asyncio.Event().wait() + return {} + + async def run(): + d = daemon.Daemon() + d.cdp = _SelectivelyWedgedCDP() + return await d.attach_first_page() + + attached = asyncio.run(run()) + assert attached["targetId"] == "target-LIVE", ( + f"daemon must skip the wedged target and attach to the live one. " + f"Got: {attached}" + ) + + +def test_attach_first_page_creates_blank_when_all_pages_unresponsive(): + """If every real page is wedged, fall back to a fresh about:blank rather + than hanging or dying. about:blank is guaranteed responsive.""" + class _AllWedgedExceptBlankCDP: + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "wedged-1", "type": "page", "url": "https://a.example/"}, + ]} + if method == "Target.createTarget": + return {"targetId": "target-BLANK"} + if method == "Target.attachToTarget": + return {"sessionId": f"session-{params['targetId']}"} + if method == "Target.detachFromTarget": + return {} + if method.endswith(".enable") and session_id == "session-wedged-1": + await asyncio.Event().wait() # hang forever + return {} + + async def run(): + d = daemon.Daemon() + d.cdp = _AllWedgedExceptBlankCDP() + return await d.attach_first_page() + + attached = asyncio.run(run()) + assert attached["targetId"] == "target-BLANK" + assert attached["url"] == "about:blank" + + +def test_handle_times_out_wedged_cdp_call(monkeypatch): + """A wedged CDP call must not pin the daemon's event loop forever. + BU_CDP_TIMEOUT (default 45s, lowered here for test speed) bounds it. + Returns a clean error with a recovery hint instead of hanging.""" + monkeypatch.setenv("BU_CDP_TIMEOUT", "0.1") + + class _WedgedCDP: + async def send_raw(self, method, params=None, session_id=None): + await asyncio.Event().wait() # hang forever + + d = daemon.Daemon() + d.cdp = _WedgedCDP() + d.session = "s1" + + resp = asyncio.run(d.handle({"method": "Runtime.evaluate", "params": {"expression": "1+1"}})) + assert "error" in resp + assert "timed out" in resp["error"] + assert "Runtime.evaluate" in resp["error"] From ed94dc2dd71502944235be9921145125e3ef020f Mon Sep 17 00:00:00 2001 From: Vivek Raghunathan Date: Tue, 5 May 2026 21:40:12 -0700 Subject: [PATCH 2/2] fix(daemon): bound stale-session recovery by BU_CDP_TIMEOUT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit attach_first_page() iterates pages with per-call timeouts that stack (5s attach + 4×4s enables per page). On the stale-session recovery path (Session with given id not found), a browser full of wedged tabs could keep one client waiting well past the BU_CDP_TIMEOUT contract. Wrap the whole recovery in asyncio.wait_for(cdp_timeout) so it surfaces a clean error with a restart_daemon() hint instead of accumulating per-tab waits. Adds a regression test covering the multi-tab wedged recovery case. Identified by cubic-dev-ai on PR #307. --- src/browser_harness/daemon.py | 8 ++++++- tests/unit/test_daemon.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/browser_harness/daemon.py b/src/browser_harness/daemon.py index e0091e5a..8562ad20 100644 --- a/src/browser_harness/daemon.py +++ b/src/browser_harness/daemon.py @@ -390,11 +390,17 @@ async def disable_old(): if "Session with given id not found" in msg and sid == self.session and sid: log(f"stale session {sid}, re-attaching") try: - if await self.attach_first_page(): + # attach_first_page() iterates pages with per-call timeouts + # that stack — bound the whole recovery by cdp_timeout so a + # browser full of wedged tabs can't keep one client waiting + # past the contract. + if await asyncio.wait_for(self.attach_first_page(), timeout=cdp_timeout): return {"result": await asyncio.wait_for( self.cdp.send_raw(method, params, session_id=self.session), timeout=cdp_timeout, )} + except asyncio.TimeoutError: + return {"error": f"re-attach timed out after {cdp_timeout}s — every tab unresponsive; try restart_daemon()"} except Exception as e2: return {"error": f"re-attach failed: {e2}"} return {"error": msg} diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py index 251f7f75..6076c009 100644 --- a/tests/unit/test_daemon.py +++ b/tests/unit/test_daemon.py @@ -334,3 +334,44 @@ async def send_raw(self, method, params=None, session_id=None): assert "error" in resp assert "timed out" in resp["error"] assert "Runtime.evaluate" in resp["error"] + + +def test_handle_bounds_stale_session_recovery_by_cdp_timeout(monkeypatch): + """Stale-session recovery (re-attach path) iterates pages with per-call + timeouts that stack — a browser full of wedged tabs could otherwise pin + one client past BU_CDP_TIMEOUT. The whole recovery must be bounded.""" + monkeypatch.setenv("BU_CDP_TIMEOUT", "0.2") + + class _StaleThenWedgedCDP: + """First call raises 'Session with given id not found'; reattach path + then sees one real page whose Page.enable hangs forever.""" + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Runtime.evaluate": + raise RuntimeError("Session with given id not found") + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "wedged", "type": "page", "url": "https://wedged.example/"}, + ]} + if method == "Target.attachToTarget": + return {"sessionId": "session-wedged"} + if method == "Target.createTarget": + # If recovery is unbounded, it would fall through to creating + # about:blank (also wedged below) and keep hanging. + return {"targetId": "blank"} + if method.endswith(".enable"): + await asyncio.Event().wait() # every enable hangs + return {} + + d = daemon.Daemon() + d.cdp = _StaleThenWedgedCDP() + d.session = "stale-session" + + resp = asyncio.run(d.handle({"method": "Runtime.evaluate", "params": {"expression": "1+1"}})) + assert "error" in resp + assert "re-attach timed out" in resp["error"], ( + f"recovery must surface a clean re-attach timeout, got: {resp['error']}" + )