Harden extract tool schema and output serialization

cursoragent · shrisukhani · cursoragent · commit cf17fc03cfad · 2026-02-13T22:04:00.000Z
Co-authored-by: Shri Sukhani <shrisukhani@users.noreply.github.com>
diff --git a/hyperbrowser/tools/__init__.py b/hyperbrowser/tools/__init__.py
@@ -48,7 +48,9 @@ def _prepare_extract_tool_params(params: Mapping[str, Any]) -> Dict[str, Any]:
     if isinstance(schema_value, str):
         try:
             normalized_params["schema"] = json.loads(schema_value)
-        except json.JSONDecodeError as exc:
+        except HyperbrowserError:
+            raise
+        except Exception as exc:
             raise HyperbrowserError(
                 "Invalid JSON string provided for `schema` in extract tool params",
                 original_error=exc,
@@ -87,6 +89,20 @@ def _to_param_dict(params: Mapping[str, Any]) -> Dict[str, Any]:
     return normalized_params
 
 
+def _serialize_extract_tool_data(data: Any) -> str:  # JSON-encode extract response data for tool output
+    if data is None:
+        return ""  # only None maps to ""; every other value goes through json.dumps
+    try:
+        return json.dumps(data)
+    except HyperbrowserError:
+        raise  # pass library errors through without re-wrapping them
+    except Exception as exc:
+        raise HyperbrowserError(  # wrap any other failure; the cause is kept in original_error
+            "Failed to serialize extract tool response data",
+            original_error=exc,
+        ) from exc
+
+
 class WebsiteScrapeTool:
     openai_tool_definition = SCRAPE_TOOL_OPENAI
     anthropic_tool_definition = SCRAPE_TOOL_ANTHROPIC
@@ -168,15 +184,15 @@ def runnable(hb: Hyperbrowser, params: Mapping[str, Any]) -> str:
         resp = hb.extract.start_and_wait(
             params=StartExtractJobParams(**normalized_params)
         )
-        return json.dumps(resp.data) if resp.data else ""
+        return _serialize_extract_tool_data(resp.data)
 
     @staticmethod
     async def async_runnable(hb: AsyncHyperbrowser, params: Mapping[str, Any]) -> str:
         normalized_params = _prepare_extract_tool_params(params)
         resp = await hb.extract.start_and_wait(
             params=StartExtractJobParams(**normalized_params)
         )
-        return json.dumps(resp.data) if resp.data else ""
+        return _serialize_extract_tool_data(resp.data)  # replaces the truthiness check that turned {} / [] into ""
 
 
 class BrowserUseTool:
diff --git a/tests/test_tools_extract.py b/tests/test_tools_extract.py
@@ -2,42 +2,47 @@
 
 import pytest
 
+import hyperbrowser.tools as tools_module
 from hyperbrowser.exceptions import HyperbrowserError
 from hyperbrowser.models.extract import StartExtractJobParams
 from hyperbrowser.tools import WebsiteExtractTool
 
+_UNSET = object()
+
 
 class _Response:
     def __init__(self, data):
         self.data = data
 
 
 class _SyncExtractManager:
-    def __init__(self):
+    def __init__(self, response_data=_UNSET):  # _UNSET (not None) means "no override", so None can be passed as data
         self.last_params = None
+        self._response_data = {"ok": True} if response_data is _UNSET else response_data  # default payload: {"ok": True}
 
     def start_and_wait(self, params: StartExtractJobParams):
         self.last_params = params
-        return _Response({"ok": True})
+        return _Response(self._response_data)
 
 
 class _AsyncExtractManager:
-    def __init__(self):
+    def __init__(self, response_data=_UNSET):  # _UNSET (not None) means "no override", so None can be passed as data
         self.last_params = None
+        self._response_data = {"ok": True} if response_data is _UNSET else response_data  # default payload: {"ok": True}
 
     async def start_and_wait(self, params: StartExtractJobParams):
         self.last_params = params
-        return _Response({"ok": True})
+        return _Response(self._response_data)
 
 
 class _SyncClient:
-    def __init__(self):
-        self.extract = _SyncExtractManager()
+    def __init__(self, response_data=_UNSET):  # optional canned response, forwarded to the fake sync extract manager
+        self.extract = _SyncExtractManager(response_data=response_data)
 
 
 class _AsyncClient:
-    def __init__(self):
-        self.extract = _AsyncExtractManager()
+    def __init__(self, response_data=_UNSET):  # optional canned response, forwarded to the fake async extract manager
+        self.extract = _AsyncExtractManager(response_data=response_data)
 
 
 def test_extract_tool_runnable_does_not_mutate_input_params():
@@ -104,3 +109,85 @@ async def run():
         HyperbrowserError, match="Invalid JSON string provided for `schema`"
     ):
         asyncio.run(run())
+
+
+def test_extract_tool_runnable_serializes_empty_object_data():  # an empty dict response must come back as "{}"
+    client = _SyncClient(response_data={})
+
+    output = WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})
+
+    assert output == "{}"  # not "", even though {} is falsy
+
+
+def test_extract_tool_async_runnable_serializes_empty_list_data():  # an empty list response must come back as "[]"
+    client = _AsyncClient(response_data=[])
+
+    async def run():
+        return await WebsiteExtractTool.async_runnable(
+            client, {"urls": ["https://example.com"]}
+        )
+
+    output = asyncio.run(run())
+
+    assert output == "[]"  # not "", even though [] is falsy
+
+
+def test_extract_tool_runnable_returns_empty_string_for_none_data():  # None is the one value that yields ""
+    client = _SyncClient(response_data=None)  # explicit None; the _UNSET default would substitute {"ok": True}
+
+    output = WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})
+
+    assert output == ""
+
+
+def test_extract_tool_runnable_wraps_serialization_failures():  # json.dumps failures surface as HyperbrowserError
+    client = _SyncClient(response_data={1, 2})  # a set is not JSON-serializable, so json.dumps raises
+
+    with pytest.raises(
+        HyperbrowserError, match="Failed to serialize extract tool response data"
+    ) as exc_info:
+        WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})
+
+    assert exc_info.value.original_error is not None  # the underlying exception is kept on the wrapper
+
+
+def test_extract_tool_runnable_wraps_unexpected_schema_parse_failures(  # non-JSONDecodeError parse failures get wrapped too
+    monkeypatch: pytest.MonkeyPatch,
+):
+    def _raise_recursion_error(_: str):
+        raise RecursionError("schema parsing recursion overflow")
+
+    monkeypatch.setattr(tools_module.json, "loads", _raise_recursion_error)  # presumably the stdlib json module; undone after the test
+
+    with pytest.raises(
+        HyperbrowserError, match="Invalid JSON string provided for `schema`"
+    ) as exc_info:
+        WebsiteExtractTool.runnable(
+            _SyncClient(),
+            {
+                "urls": ["https://example.com"],
+                "schema": '{"type":"object"}',
+            },
+        )
+
+    assert exc_info.value.original_error is not None  # the RecursionError is kept on the wrapper
+
+
+def test_extract_tool_runnable_preserves_hyperbrowser_schema_parse_errors(  # HyperbrowserError from parsing is not re-wrapped
+    monkeypatch: pytest.MonkeyPatch,
+):
+    def _raise_hyperbrowser_error(_: str):
+        raise HyperbrowserError("custom schema parse failure")
+
+    monkeypatch.setattr(tools_module.json, "loads", _raise_hyperbrowser_error)  # presumably the stdlib json module; undone after the test
+
+    with pytest.raises(HyperbrowserError, match="custom schema parse failure") as exc_info:
+        WebsiteExtractTool.runnable(
+            _SyncClient(),
+            {
+                "urls": ["https://example.com"],
+                "schema": '{"type":"object"}',
+            },
+        )
+
+    assert exc_info.value.original_error is None  # same error re-raised as-is, so no wrapped cause