diff --git a/src/browser_harness/daemon.py b/src/browser_harness/daemon.py index 3410876d..8562ad20 100644 --- a/src/browser_harness/daemon.py +++ b/src/browser_harness/daemon.py @@ -187,23 +187,54 @@ def __init__(self): self.stop = None # asyncio.Event, set inside start() async def attach_first_page(self): - """Attach to a real page (or any page). Sets self.session. Returns attached target or None.""" + """Attach to a real page (or any page). Sets self.session. Returns attached target or None. + + Iterates real pages so a single wedged tab (busy renderer, sleeping tab, + unresponsive page) does not pin the daemon to a dead session — previously + the daemon picked pages[0] unconditionally, silently swallowed Page.enable + timeouts, and every later CDP call against that session hung forever. + Falls back to a fresh about:blank if every existing page is unresponsive. + """ targets = (await self.cdp.send_raw("Target.getTargets"))["targetInfos"] pages = [t for t in targets if is_real_page(t)] - if not pages: - # No real pages — create one instead of attaching to omnibox popup - tid = (await self.cdp.send_raw("Target.createTarget", {"url": "about:blank"}))["targetId"] - log(f"no real pages found, created about:blank ({tid})") - pages = [{"targetId": tid, "url": "about:blank", "type": "page"}] - self.session = (await self.cdp.send_raw( - "Target.attachToTarget", {"targetId": pages[0]["targetId"], "flatten": True} - ))["sessionId"] - self.target_id = pages[0]["targetId"] - log(f"attached {pages[0]['targetId']} ({pages[0].get('url','')[:80]}) session={self.session}") - await self._enable_default_domains(self.session) - return pages[0] - - async def _enable_default_domains(self, session_id): + for p in pages: + sid = await self._try_attach(p) + if sid: + self.session = sid + self.target_id = p["targetId"] + log(f"attached {p['targetId']} ({p.get('url','')[:80]}) session={sid}") + return p + log(f"target {p['targetId']} ({p.get('url','')[:80]}) unresponsive, trying next") + # Either no real pages, or every real page was unresponsive — create a fresh tab. + tid = (await self.cdp.send_raw("Target.createTarget", {"url": "about:blank"}))["targetId"] + fresh = {"targetId": tid, "url": "about:blank", "type": "page"} + sid = await self._try_attach(fresh) + if not sid: + raise RuntimeError(f"could not attach to any page (created about:blank {tid} but it also failed to enable)") + self.session = sid + self.target_id = tid + log(f"no responsive pages found, created+attached about:blank ({tid}) session={sid}") + return fresh + + async def _try_attach(self, target): + """Attach to a single target and enable Page/DOM/Runtime/Network. + Returns sessionId on success, None if attach or any enable hangs/fails + (target is unresponsive — caller should try the next one).""" + try: + sid = (await asyncio.wait_for( + self.cdp.send_raw("Target.attachToTarget", {"targetId": target["targetId"], "flatten": True}), + timeout=5, + ))["sessionId"] + except Exception as e: + log(f"attach {target['targetId']} failed: {type(e).__name__}: {e}") + return None + if not await self._enable_default_domains(sid, strict=True): + try: await asyncio.wait_for(self.cdp.send_raw("Target.detachFromTarget", {"sessionId": sid}), timeout=2) + except Exception: pass + return None + return sid + + async def _enable_default_domains(self, session_id, strict=False): """Enable Page/DOM/Runtime/Network on a CDP session. Used by both initial attach and set_session (called after switch_tab/ @@ -216,16 +247,25 @@ async def _enable_default_domains(self, session_id): bounded by a single CDP round trip rather than four sequential ones — important on the set_session path, where the helper's IPC socket has a 5s read timeout. + + With strict=True (used during initial attach), returns False if any + enable failed so the caller can abandon the unresponsive target. + Without strict, returns True regardless — set_session has already + committed to the new target and partial enable is better than nothing. """ + results = [] async def enable_one(d): try: await asyncio.wait_for( self.cdp.send_raw(f"{d}.enable", session_id=session_id), timeout=4, ) + results.append(True) except Exception as e: - log(f"enable {d} on {session_id}: {e}") + log(f"enable {d} on {session_id}: {type(e).__name__}: {e}") + results.append(False) await asyncio.gather(*(enable_one(d) for d in ("Page", "DOM", "Runtime", "Network"))) + return all(results) if strict else True async def start(self): self.stop = asyncio.Event() @@ -331,14 +371,38 @@ async def disable_old(): # Browser-level Target.* calls must not use a session (stale or otherwise). # For everything else, explicit session in req wins; else default. sid = None if method.startswith("Target.") else (req.get("session_id") or self.session) + # Per-call CDP timeout. A wedged renderer (sleeping tab, infinite JS, + # frozen iframe) would otherwise block the daemon's event loop forever, + # starving every other connected client. The IPC layer's 5s read timeout + # protects the *calling* client, but the daemon itself stays jammed + # without this. Override via BU_CDP_TIMEOUT for legitimately slow calls + # (large captureBeyondViewport screenshots, long Runtime.evaluate scripts). + cdp_timeout = float(os.environ.get("BU_CDP_TIMEOUT", "45")) try: - return {"result": await self.cdp.send_raw(method, params, session_id=sid)} + return {"result": await asyncio.wait_for( + self.cdp.send_raw(method, params, session_id=sid), + timeout=cdp_timeout, + )} + except asyncio.TimeoutError: + return {"error": f"CDP {method} timed out after {cdp_timeout}s (session={sid}) — tab may be unresponsive; try ensure_real_tab() or restart_daemon()"} except Exception as e: msg = str(e) if "Session with given id not found" in msg and sid == self.session and sid: log(f"stale session {sid}, re-attaching") - if await self.attach_first_page(): - return {"result": await self.cdp.send_raw(method, params, session_id=self.session)} + try: + # attach_first_page() iterates pages with per-call timeouts + # that stack — bound the whole recovery by cdp_timeout so a + # browser full of wedged tabs can't keep one client waiting + # past the contract. + if await asyncio.wait_for(self.attach_first_page(), timeout=cdp_timeout): + return {"result": await asyncio.wait_for( + self.cdp.send_raw(method, params, session_id=self.session), + timeout=cdp_timeout, + )} + except asyncio.TimeoutError: + return {"error": f"re-attach timed out after {cdp_timeout}s — every tab unresponsive; try restart_daemon()"} + except Exception as e2: + return {"error": f"re-attach failed: {e2}"} return {"error": msg} diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py index d0c8c82e..6076c009 100644 --- a/tests/unit/test_daemon.py +++ b/tests/unit/test_daemon.py @@ -243,3 +243,135 @@ async def run(): f"first set_session must run 4 enables concurrently " f"(observed peak = {peak}). No Network.disable should fire." ) + + +def test_attach_first_page_skips_unresponsive_targets(): + """If the first real page is wedged (Page.enable hangs forever), the + daemon must abandon it and try the next one — previously it pinned + self.session to the dead target and every later CDP call hung.""" + class _SelectivelyWedgedCDP: + """Page.enable hangs forever for target-WEDGED, succeeds for target-LIVE.""" + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "target-WEDGED", "type": "page", "url": "https://wedged.example/"}, + {"targetId": "target-LIVE", "type": "page", "url": "https://live.example/"}, + ]} + if method == "Target.attachToTarget": + tid = params["targetId"] + return {"sessionId": f"session-{tid}"} + if method == "Target.detachFromTarget": + return {} + if method.endswith(".enable") and session_id == "session-target-WEDGED": + # Hang forever — caller must use asyncio.wait_for to escape. + await asyncio.Event().wait() + return {} + + async def run(): + d = daemon.Daemon() + d.cdp = _SelectivelyWedgedCDP() + return await d.attach_first_page() + + attached = asyncio.run(run()) + assert attached["targetId"] == "target-LIVE", ( + f"daemon must skip the wedged target and attach to the live one. " + f"Got: {attached}" + ) + + +def test_attach_first_page_creates_blank_when_all_pages_unresponsive(): + """If every real page is wedged, fall back to a fresh about:blank rather + than hanging or dying. about:blank is guaranteed responsive.""" + class _AllWedgedExceptBlankCDP: + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "wedged-1", "type": "page", "url": "https://a.example/"}, + ]} + if method == "Target.createTarget": + return {"targetId": "target-BLANK"} + if method == "Target.attachToTarget": + return {"sessionId": f"session-{params['targetId']}"} + if method == "Target.detachFromTarget": + return {} + if method.endswith(".enable") and session_id == "session-wedged-1": + await asyncio.Event().wait() # hang forever + return {} + + async def run(): + d = daemon.Daemon() + d.cdp = _AllWedgedExceptBlankCDP() + return await d.attach_first_page() + + attached = asyncio.run(run()) + assert attached["targetId"] == "target-BLANK" + assert attached["url"] == "about:blank" + + +def test_handle_times_out_wedged_cdp_call(monkeypatch): + """A wedged CDP call must not pin the daemon's event loop forever. + BU_CDP_TIMEOUT (default 45s, lowered here for test speed) bounds it. + Returns a clean error with a recovery hint instead of hanging.""" + monkeypatch.setenv("BU_CDP_TIMEOUT", "0.1") + + class _WedgedCDP: + async def send_raw(self, method, params=None, session_id=None): + await asyncio.Event().wait() # hang forever + + d = daemon.Daemon() + d.cdp = _WedgedCDP() + d.session = "s1" + + resp = asyncio.run(d.handle({"method": "Runtime.evaluate", "params": {"expression": "1+1"}})) + assert "error" in resp + assert "timed out" in resp["error"] + assert "Runtime.evaluate" in resp["error"] + + +def test_handle_bounds_stale_session_recovery_by_cdp_timeout(monkeypatch): + """Stale-session recovery (re-attach path) iterates pages with per-call + timeouts that stack — a browser full of wedged tabs could otherwise pin + one client past BU_CDP_TIMEOUT. The whole recovery must be bounded.""" + monkeypatch.setenv("BU_CDP_TIMEOUT", "0.2") + + class _StaleThenWedgedCDP: + """First call raises 'Session with given id not found'; reattach path + then sees one real page whose Page.enable hangs forever.""" + def __init__(self): + self.calls = [] + + async def send_raw(self, method, params=None, session_id=None): + self.calls.append((method, params, session_id)) + if method == "Runtime.evaluate": + raise RuntimeError("Session with given id not found") + if method == "Target.getTargets": + return {"targetInfos": [ + {"targetId": "wedged", "type": "page", "url": "https://wedged.example/"}, + ]} + if method == "Target.attachToTarget": + return {"sessionId": "session-wedged"} + if method == "Target.createTarget": + # If recovery is unbounded, it would fall through to creating + # about:blank (also wedged below) and keep hanging. + return {"targetId": "blank"} + if method.endswith(".enable"): + await asyncio.Event().wait() # every enable hangs + return {} + + d = daemon.Daemon() + d.cdp = _StaleThenWedgedCDP() + d.session = "stale-session" + + resp = asyncio.run(d.handle({"method": "Runtime.evaluate", "params": {"expression": "1+1"}})) + assert "error" in resp + assert "re-attach timed out" in resp["error"], ( + f"recovery must surface a clean re-attach timeout, got: {resp['error']}" + )