Skip to content

Commit cf17fc0

Browse files
Harden extract tool schema and output serialization
Co-authored-by: Shri Sukhani <shrisukhani@users.noreply.github.com>
1 parent cced4f3 commit cf17fc0

File tree

2 files changed

+114
-11
lines changed

2 files changed

+114
-11
lines changed

hyperbrowser/tools/__init__.py

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,9 @@ def _prepare_extract_tool_params(params: Mapping[str, Any]) -> Dict[str, Any]:
4848
if isinstance(schema_value, str):
4949
try:
5050
normalized_params["schema"] = json.loads(schema_value)
51-
except json.JSONDecodeError as exc:
51+
except HyperbrowserError:
52+
raise
53+
except Exception as exc:
5254
raise HyperbrowserError(
5355
"Invalid JSON string provided for `schema` in extract tool params",
5456
original_error=exc,
@@ -87,6 +89,20 @@ def _to_param_dict(params: Mapping[str, Any]) -> Dict[str, Any]:
8789
return normalized_params
8890

8991

92+
def _serialize_extract_tool_data(data: Any) -> str:
93+
if data is None:
94+
return ""
95+
try:
96+
return json.dumps(data)
97+
except HyperbrowserError:
98+
raise
99+
except Exception as exc:
100+
raise HyperbrowserError(
101+
"Failed to serialize extract tool response data",
102+
original_error=exc,
103+
) from exc
104+
105+
90106
class WebsiteScrapeTool:
91107
openai_tool_definition = SCRAPE_TOOL_OPENAI
92108
anthropic_tool_definition = SCRAPE_TOOL_ANTHROPIC
@@ -168,15 +184,15 @@ def runnable(hb: Hyperbrowser, params: Mapping[str, Any]) -> str:
168184
resp = hb.extract.start_and_wait(
169185
params=StartExtractJobParams(**normalized_params)
170186
)
171-
return json.dumps(resp.data) if resp.data else ""
187+
return _serialize_extract_tool_data(resp.data)
172188

173189
@staticmethod
async def async_runnable(hb: AsyncHyperbrowser, params: Mapping[str, Any]) -> str:
    """Run an extract job with the async client and return its data as JSON.

    Args:
        hb: Async client whose ``extract.start_and_wait`` is awaited.
        params: Raw tool parameters; they are normalized by
            ``_prepare_extract_tool_params`` before building
            ``StartExtractJobParams``.

    Returns:
        The string produced by ``_serialize_extract_tool_data`` for the
        response's ``data`` attribute.
    """
    normalized_params = _prepare_extract_tool_params(params)
    resp = await hb.extract.start_and_wait(
        params=StartExtractJobParams(**normalized_params)
    )
    return _serialize_extract_tool_data(resp.data)
180196

181197

182198
class BrowserUseTool:

tests/test_tools_extract.py

Lines changed: 95 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,42 +2,47 @@
22

33
import pytest
44

5+
import hyperbrowser.tools as tools_module
56
from hyperbrowser.exceptions import HyperbrowserError
67
from hyperbrowser.models.extract import StartExtractJobParams
78
from hyperbrowser.tools import WebsiteExtractTool
89

10+
# Sentinel meaning "no response_data argument was supplied", so that an
# explicit ``None`` can still be passed through as real response data.
_UNSET = object()
11+
912

1013
class _Response:
1114
def __init__(self, data):
1215
self.data = data
1316

1417

1518
class _SyncExtractManager:
    """Fake synchronous extract manager used by the tool tests.

    It records the params passed to ``start_and_wait`` in ``last_params``
    and answers with a ``_Response`` wrapping a configurable payload.
    """

    def __init__(self, response_data=_UNSET):
        self.last_params = None
        # Fall back to the default payload only when nothing was supplied;
        # an explicit None (or any other value) is kept as-is.
        self._response_data = {"ok": True} if response_data is _UNSET else response_data

    def start_and_wait(self, params: StartExtractJobParams):
        self.last_params = params
        return _Response(self._response_data)
2226

2327

2428
class _AsyncExtractManager:
    """Fake asynchronous extract manager used by the tool tests.

    It records the params passed to ``start_and_wait`` in ``last_params``
    and answers with a ``_Response`` wrapping a configurable payload.
    """

    def __init__(self, response_data=_UNSET):
        self.last_params = None
        # Fall back to the default payload only when nothing was supplied;
        # an explicit None (or any other value) is kept as-is.
        self._response_data = {"ok": True} if response_data is _UNSET else response_data

    async def start_and_wait(self, params: StartExtractJobParams):
        self.last_params = params
        return _Response(self._response_data)
3136

3237

3338
class _SyncClient:
    """Fake sync client exposing only the ``extract`` manager."""

    def __init__(self, response_data=_UNSET):
        # Forward the (possibly unset) payload so the manager decides the default.
        self.extract = _SyncExtractManager(response_data=response_data)
3641

3742

3843
class _AsyncClient:
    """Fake async client exposing only the ``extract`` manager."""

    def __init__(self, response_data=_UNSET):
        # Forward the (possibly unset) payload so the manager decides the default.
        self.extract = _AsyncExtractManager(response_data=response_data)
4146

4247

4348
def test_extract_tool_runnable_does_not_mutate_input_params():
@@ -104,3 +109,85 @@ async def run():
104109
HyperbrowserError, match="Invalid JSON string provided for `schema`"
105110
):
106111
asyncio.run(run())
112+
113+
114+
def test_extract_tool_runnable_serializes_empty_object_data():
    """An empty dict payload is serialized as ``"{}"``, not dropped."""
    client = _SyncClient(response_data={})

    output = WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})

    assert output == "{}"
120+
121+
122+
def test_extract_tool_async_runnable_serializes_empty_list_data():
    """An empty list payload is serialized as ``"[]"`` on the async path."""
    client = _AsyncClient(response_data=[])

    async def run():
        return await WebsiteExtractTool.async_runnable(
            client, {"urls": ["https://example.com"]}
        )

    output = asyncio.run(run())

    assert output == "[]"
133+
134+
135+
def test_extract_tool_runnable_returns_empty_string_for_none_data():
    """A ``None`` payload yields an empty string."""
    client = _SyncClient(response_data=None)

    output = WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})

    assert output == ""
141+
142+
143+
def test_extract_tool_runnable_wraps_serialization_failures():
    """A payload ``json.dumps`` cannot encode surfaces as ``HyperbrowserError``."""
    # A set is not JSON-serializable, so json.dumps raises TypeError.
    client = _SyncClient(response_data={1, 2})

    with pytest.raises(
        HyperbrowserError, match="Failed to serialize extract tool response data"
    ) as exc_info:
        WebsiteExtractTool.runnable(client, {"urls": ["https://example.com"]})

    assert exc_info.value.original_error is not None
    # The wrapper is raised ``from exc``, so the TypeError must be the cause.
    assert isinstance(exc_info.value.__cause__, TypeError)
152+
153+
154+
def test_extract_tool_runnable_wraps_unexpected_schema_parse_failures(
    monkeypatch: pytest.MonkeyPatch,
):
    """A non-``JSONDecodeError`` failure while parsing `schema` is still wrapped."""

    def _raise_recursion_error(_: str):
        raise RecursionError("schema parsing recursion overflow")

    # NOTE(review): ``tools_module.json`` is presumably the stdlib ``json``
    # module itself, so this replaces ``json.loads`` for the whole process
    # until monkeypatch restores it at test teardown.
    monkeypatch.setattr(tools_module.json, "loads", _raise_recursion_error)

    with pytest.raises(
        HyperbrowserError, match="Invalid JSON string provided for `schema`"
    ) as exc_info:
        WebsiteExtractTool.runnable(
            _SyncClient(),
            {
                "urls": ["https://example.com"],
                "schema": '{"type":"object"}',
            },
        )

    assert exc_info.value.original_error is not None
174+
175+
176+
def test_extract_tool_runnable_preserves_hyperbrowser_schema_parse_errors(
    monkeypatch: pytest.MonkeyPatch,
):
    """A ``HyperbrowserError`` raised during schema parsing is not re-wrapped."""

    def _raise_hyperbrowser_error(_: str):
        raise HyperbrowserError("custom schema parse failure")

    # NOTE(review): ``tools_module.json`` is presumably the stdlib ``json``
    # module itself, so this replaces ``json.loads`` for the whole process
    # until monkeypatch restores it at test teardown.
    monkeypatch.setattr(tools_module.json, "loads", _raise_hyperbrowser_error)

    with pytest.raises(HyperbrowserError, match="custom schema parse failure") as exc_info:
        WebsiteExtractTool.runnable(
            _SyncClient(),
            {
                "urls": ["https://example.com"],
                "schema": '{"type":"object"}',
            },
        )

    # The original message survives and no wrapper attached an original_error.
    assert exc_info.value.original_error is None

0 commit comments

Comments
 (0)