Commit c00a15f

Share scrape/crawl pagination helpers across managers
Co-authored-by: Shri Sukhani <shrisukhani@users.noreply.github.com>
1 parent: ddd7691

9 files changed: +190 additions, −56 deletions

CONTRIBUTING.md (1 addition, 0 deletions)

@@ -89,6 +89,7 @@ This runs lint, format checks, compile checks, tests, and package build.
 - `tests/test_examples_naming_convention.py` (example sync/async prefix naming enforcement),
 - `tests/test_examples_syntax.py` (example script syntax guardrail),
 - `tests/test_guardrail_ast_utils.py` (shared AST guard utility contract),
+- `tests/test_job_pagination_helper_usage.py` (shared scrape/crawl pagination helper usage enforcement),
 - `tests/test_makefile_quality_targets.py` (Makefile quality-gate target enforcement),
 - `tests/test_manager_model_dump_usage.py` (manager serialization centralization),
 - `tests/test_mapping_keys_access_usage.py` (centralized key-iteration boundaries),

hyperbrowser/client/managers/async_manager/crawl.py (13 additions, 14 deletions)

@@ -9,6 +9,10 @@
     poll_until_terminal_status_async,
     retry_operation_async,
 )
+from ..job_pagination_utils import (
+    initialize_job_paginated_response,
+    merge_job_paginated_page_response,
+)
 from ..serialization_utils import (
     serialize_model_dump_or_default,
     serialize_model_dump_to_dict,
@@ -102,24 +106,19 @@ async def start_and_wait(
             retry_delay_seconds=0.5,
         )

-        job_response = CrawlJobResponse(
-            jobId=job_id,
+        job_response = initialize_job_paginated_response(
+            model=CrawlJobResponse,
+            job_id=job_id,
             status=job_status,
-            data=[],
-            currentPageBatch=0,
-            totalPageBatches=0,
-            totalCrawledPages=0,
-            batchSize=100,
+            total_counter_alias="totalCrawledPages",
         )

         def merge_page_response(page_response: CrawlJobResponse) -> None:
-            if page_response.data:
-                job_response.data.extend(page_response.data)
-            job_response.current_page_batch = page_response.current_page_batch
-            job_response.total_crawled_pages = page_response.total_crawled_pages
-            job_response.total_page_batches = page_response.total_page_batches
-            job_response.batch_size = page_response.batch_size
-            job_response.error = page_response.error
+            merge_job_paginated_page_response(
+                job_response,
+                page_response,
+                total_counter_attr="total_crawled_pages",
+            )

         await collect_paginated_results_async(
             operation_name=operation_name,

hyperbrowser/client/managers/async_manager/scrape.py (13 additions, 14 deletions)

@@ -10,6 +10,10 @@
     retry_operation_async,
     wait_for_job_result_async,
 )
+from ..job_pagination_utils import (
+    initialize_job_paginated_response,
+    merge_job_paginated_page_response,
+)
 from ..serialization_utils import (
     serialize_model_dump_or_default,
     serialize_model_dump_to_dict,
@@ -109,24 +113,19 @@ async def start_and_wait(
             retry_delay_seconds=0.5,
         )

-        job_response = BatchScrapeJobResponse(
-            jobId=job_id,
+        job_response = initialize_job_paginated_response(
+            model=BatchScrapeJobResponse,
+            job_id=job_id,
             status=job_status,
-            data=[],
-            currentPageBatch=0,
-            totalPageBatches=0,
-            totalScrapedPages=0,
-            batchSize=100,
+            total_counter_alias="totalScrapedPages",
        )

         def merge_page_response(page_response: BatchScrapeJobResponse) -> None:
-            if page_response.data:
-                job_response.data.extend(page_response.data)
-            job_response.current_page_batch = page_response.current_page_batch
-            job_response.total_scraped_pages = page_response.total_scraped_pages
-            job_response.total_page_batches = page_response.total_page_batches
-            job_response.batch_size = page_response.batch_size
-            job_response.error = page_response.error
+            merge_job_paginated_page_response(
+                job_response,
+                page_response,
+                total_counter_attr="total_scraped_pages",
+            )

         await collect_paginated_results_async(
             operation_name=operation_name,
hyperbrowser/client/managers/job_pagination_utils.py (new file: 41 additions, 0 deletions)

@@ -0,0 +1,41 @@
+from typing import Any, Type, TypeVar
+
+T = TypeVar("T")
+
+
+def initialize_job_paginated_response(
+    *,
+    model: Type[T],
+    job_id: str,
+    status: str,
+    total_counter_alias: str,
+    batch_size: int = 100,
+) -> T:
+    return model(
+        jobId=job_id,
+        status=status,
+        data=[],
+        currentPageBatch=0,
+        totalPageBatches=0,
+        batchSize=batch_size,
+        **{total_counter_alias: 0},
+    )
+
+
+def merge_job_paginated_page_response(
+    job_response: Any,
+    page_response: Any,
+    *,
+    total_counter_attr: str,
+) -> None:
+    if page_response.data:
+        job_response.data.extend(page_response.data)
+    job_response.current_page_batch = page_response.current_page_batch
+    setattr(
+        job_response,
+        total_counter_attr,
+        getattr(page_response, total_counter_attr),
+    )
+    job_response.total_page_batches = page_response.total_page_batches
+    job_response.batch_size = page_response.batch_size
+    job_response.error = page_response.error
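
For orientation, here is a minimal sketch of how the two helpers compose outside the managers. `DemoJobResponse` is a hypothetical stand-in for `CrawlJobResponse`/`BatchScrapeJobResponse`, written as a Pydantic v2 model with camelCase aliases; that is an assumption, but one the diff itself suggests, since initialization passes camelCase constructor kwargs (`jobId=`, `totalCrawledPages=0`) while merging reads and writes snake_case attributes (`total_crawled_pages`). This asymmetry is exactly why the helpers take both a `total_counter_alias` and a `total_counter_attr`.

from typing import Any, List, Optional

from pydantic import BaseModel, Field

from hyperbrowser.client.managers.job_pagination_utils import (
    initialize_job_paginated_response,
    merge_job_paginated_page_response,
)


class DemoJobResponse(BaseModel):
    # Hypothetical stand-in for CrawlJobResponse / BatchScrapeJobResponse:
    # constructed via camelCase aliases, read and mutated via snake_case names.
    job_id: str = Field(alias="jobId")
    status: str
    data: List[Any] = Field(default_factory=list)
    current_page_batch: int = Field(alias="currentPageBatch")
    total_page_batches: int = Field(alias="totalPageBatches")
    total_crawled_pages: int = Field(alias="totalCrawledPages")
    batch_size: int = Field(alias="batchSize")
    error: Optional[str] = None


# initialize_job_paginated_response forwards the camelCase alias as a kwarg,
# so the accumulator starts with its total counter zeroed.
job_response = initialize_job_paginated_response(
    model=DemoJobResponse,
    job_id="job-1",
    status="running",
    total_counter_alias="totalCrawledPages",
)

# merge_job_paginated_page_response folds one page into the accumulator,
# using getattr/setattr with the snake_case attribute name.
page = DemoJobResponse(
    jobId="job-1",
    status="running",
    data=[{"url": "https://example.com"}],
    currentPageBatch=1,
    totalPageBatches=2,
    totalCrawledPages=5,
    batchSize=100,
)
merge_job_paginated_page_response(
    job_response,
    page,
    total_counter_attr="total_crawled_pages",
)

assert job_response.total_crawled_pages == 5
assert len(job_response.data) == 1

The unit tests added below (tests/test_job_pagination_utils.py) exercise the same pairing against the real response models.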

hyperbrowser/client/managers/sync_manager/crawl.py (13 additions, 14 deletions)

@@ -9,6 +9,10 @@
     poll_until_terminal_status,
     retry_operation,
 )
+from ..job_pagination_utils import (
+    initialize_job_paginated_response,
+    merge_job_paginated_page_response,
+)
 from ..serialization_utils import (
     serialize_model_dump_or_default,
     serialize_model_dump_to_dict,
@@ -102,24 +106,19 @@ def start_and_wait(
             retry_delay_seconds=0.5,
         )

-        job_response = CrawlJobResponse(
-            jobId=job_id,
+        job_response = initialize_job_paginated_response(
+            model=CrawlJobResponse,
+            job_id=job_id,
             status=job_status,
-            data=[],
-            currentPageBatch=0,
-            totalPageBatches=0,
-            totalCrawledPages=0,
-            batchSize=100,
+            total_counter_alias="totalCrawledPages",
         )

         def merge_page_response(page_response: CrawlJobResponse) -> None:
-            if page_response.data:
-                job_response.data.extend(page_response.data)
-            job_response.current_page_batch = page_response.current_page_batch
-            job_response.total_crawled_pages = page_response.total_crawled_pages
-            job_response.total_page_batches = page_response.total_page_batches
-            job_response.batch_size = page_response.batch_size
-            job_response.error = page_response.error
+            merge_job_paginated_page_response(
+                job_response,
+                page_response,
+                total_counter_attr="total_crawled_pages",
+            )

         collect_paginated_results(
             operation_name=operation_name,

hyperbrowser/client/managers/sync_manager/scrape.py (13 additions, 14 deletions)

@@ -10,6 +10,10 @@
     retry_operation,
     wait_for_job_result,
 )
+from ..job_pagination_utils import (
+    initialize_job_paginated_response,
+    merge_job_paginated_page_response,
+)
 from ..serialization_utils import (
     serialize_model_dump_or_default,
     serialize_model_dump_to_dict,
@@ -107,24 +111,19 @@ def start_and_wait(
             retry_delay_seconds=0.5,
         )

-        job_response = BatchScrapeJobResponse(
-            jobId=job_id,
+        job_response = initialize_job_paginated_response(
+            model=BatchScrapeJobResponse,
+            job_id=job_id,
             status=job_status,
-            data=[],
-            currentPageBatch=0,
-            totalPageBatches=0,
-            totalScrapedPages=0,
-            batchSize=100,
+            total_counter_alias="totalScrapedPages",
         )

         def merge_page_response(page_response: BatchScrapeJobResponse) -> None:
-            if page_response.data:
-                job_response.data.extend(page_response.data)
-            job_response.current_page_batch = page_response.current_page_batch
-            job_response.total_scraped_pages = page_response.total_scraped_pages
-            job_response.total_page_batches = page_response.total_page_batches
-            job_response.batch_size = page_response.batch_size
-            job_response.error = page_response.error
+            merge_job_paginated_page_response(
+                job_response,
+                page_response,
+                total_counter_attr="total_scraped_pages",
+            )

         collect_paginated_results(
             operation_name=operation_name,

tests/test_architecture_marker_usage.py (1 addition, 0 deletions)

@@ -29,6 +29,7 @@
     "tests/test_examples_syntax.py",
     "tests/test_docs_python3_commands.py",
    "tests/test_examples_naming_convention.py",
+    "tests/test_job_pagination_helper_usage.py",
     "tests/test_example_sync_async_parity.py",
     "tests/test_example_run_instructions.py",
     "tests/test_computer_action_endpoint_helper_usage.py",
tests/test_job_pagination_helper_usage.py (new file: 23 additions, 0 deletions)

@@ -0,0 +1,23 @@
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.architecture
+
+
+BATCH_JOB_MANAGER_MODULES = (
+    "hyperbrowser/client/managers/sync_manager/scrape.py",
+    "hyperbrowser/client/managers/async_manager/scrape.py",
+    "hyperbrowser/client/managers/sync_manager/crawl.py",
+    "hyperbrowser/client/managers/async_manager/crawl.py",
+)
+
+
+def test_job_managers_use_shared_pagination_helpers():
+    for module_path in BATCH_JOB_MANAGER_MODULES:
+        module_text = Path(module_path).read_text(encoding="utf-8")
+        assert "initialize_job_paginated_response(" in module_text
+        assert "merge_job_paginated_page_response(" in module_text
+        assert "total_page_batches = page_response.total_page_batches" not in module_text
+        assert "job_response = BatchScrapeJobResponse(" not in module_text
+        assert "job_response = CrawlJobResponse(" not in module_text

tests/test_job_pagination_utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from hyperbrowser.client.managers.job_pagination_utils import (
2+
initialize_job_paginated_response,
3+
merge_job_paginated_page_response,
4+
)
5+
from hyperbrowser.models.crawl import CrawlJobResponse
6+
from hyperbrowser.models.scrape import BatchScrapeJobResponse
7+
8+
9+
def test_initialize_job_paginated_response_for_batch_scrape():
10+
response = initialize_job_paginated_response(
11+
model=BatchScrapeJobResponse,
12+
job_id="job-1",
13+
status="completed",
14+
total_counter_alias="totalScrapedPages",
15+
)
16+
17+
assert response.job_id == "job-1"
18+
assert response.status == "completed"
19+
assert response.data == []
20+
assert response.current_page_batch == 0
21+
assert response.total_page_batches == 0
22+
assert response.total_scraped_pages == 0
23+
assert response.batch_size == 100
24+
25+
26+
def test_initialize_job_paginated_response_for_crawl_with_custom_batch_size():
27+
response = initialize_job_paginated_response(
28+
model=CrawlJobResponse,
29+
job_id="job-2",
30+
status="running",
31+
total_counter_alias="totalCrawledPages",
32+
batch_size=25,
33+
)
34+
35+
assert response.job_id == "job-2"
36+
assert response.status == "running"
37+
assert response.data == []
38+
assert response.current_page_batch == 0
39+
assert response.total_page_batches == 0
40+
assert response.total_crawled_pages == 0
41+
assert response.batch_size == 25
42+
43+
44+
def test_merge_job_paginated_page_response_updates_totals_and_error():
45+
job_response = initialize_job_paginated_response(
46+
model=CrawlJobResponse,
47+
job_id="job-2",
48+
status="running",
49+
total_counter_alias="totalCrawledPages",
50+
)
51+
page_response = CrawlJobResponse(
52+
jobId="job-2",
53+
status="running",
54+
data=[],
55+
currentPageBatch=3,
56+
totalPageBatches=9,
57+
totalCrawledPages=21,
58+
batchSize=50,
59+
error="partial failure",
60+
)
61+
62+
merge_job_paginated_page_response(
63+
job_response,
64+
page_response,
65+
total_counter_attr="total_crawled_pages",
66+
)
67+
68+
assert job_response.current_page_batch == 3
69+
assert job_response.total_page_batches == 9
70+
assert job_response.total_crawled_pages == 21
71+
assert job_response.batch_size == 50
72+
assert job_response.error == "partial failure"
