diff --git a/tools/firecrawl/manifest.yaml b/tools/firecrawl/manifest.yaml index 1e4fee805..93f78bfec 100644 --- a/tools/firecrawl/manifest.yaml +++ b/tools/firecrawl/manifest.yaml @@ -32,4 +32,4 @@ tags: - search - utilities type: plugin -version: 0.0.9 +version: 0.1.0 diff --git a/tools/firecrawl/tools/crawl.py b/tools/firecrawl/tools/crawl.py index a6e912e12..4d106ed6a 100644 --- a/tools/firecrawl/tools/crawl.py +++ b/tools/firecrawl/tools/crawl.py @@ -20,7 +20,8 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag payload["excludePaths"] = get_array_params(tool_parameters, "excludePaths") payload["includePaths"] = get_array_params(tool_parameters, "includePaths") payload["maxDepth"] = tool_parameters.get("maxDepth") - payload["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", False) + if tool_parameters.get("ignoreSitemap", False): + payload["sitemap"] = "skip" payload["limit"] = tool_parameters.get("limit", 5) payload["allowBackwardLinks"] = tool_parameters.get("allowBackwardLinks", False) payload["allowExternalLinks"] = tool_parameters.get("allowExternalLinks", False) diff --git a/tools/firecrawl/tools/firecrawl_appx.py b/tools/firecrawl/tools/firecrawl_appx.py index 0d19180a8..0f1fe419e 100644 --- a/tools/firecrawl/tools/firecrawl_appx.py +++ b/tools/firecrawl/tools/firecrawl_appx.py @@ -46,7 +46,7 @@ def _request( return None def scrape_url(self, url: str, **kwargs): - endpoint = f"{self.base_url}/v1/scrape" + endpoint = f"{self.base_url}/v2/scrape" data = {"url": url, **kwargs} logger.debug(f"Sent request to {endpoint=} body={data}") response = self._request("POST", endpoint, data) @@ -55,7 +55,7 @@ def scrape_url(self, url: str, **kwargs): return response def map(self, url: str, **kwargs): - endpoint = f"{self.base_url}/v1/map" + endpoint = f"{self.base_url}/v2/map" data = {"url": url, **kwargs} logger.debug(f"Sent request to {endpoint=} body={data}") response = self._request("POST", endpoint, data) @@ -66,7 +66,7 @@ def map(self, url: str, **kwargs): def crawl_url( self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs ): - endpoint = f"{self.base_url}/v1/crawl" + endpoint = f"{self.base_url}/v2/crawl" headers = self._prepare_headers(idempotency_key) data = {"url": url, **kwargs} logger.debug(f"Sent request to {endpoint=} body={data}") @@ -81,14 +81,14 @@ def crawl_url( return response def check_crawl_status(self, job_id: str): - endpoint = f"{self.base_url}/v1/crawl/{job_id}" + endpoint = f"{self.base_url}/v2/crawl/{job_id}" response = self._request("GET", endpoint) if response is None: raise HTTPError(f"Failed to check status for job {job_id} after multiple retries") return response def cancel_crawl_job(self, job_id: str): - endpoint = f"{self.base_url}/v1/crawl/{job_id}" + endpoint = f"{self.base_url}/v2/crawl/{job_id}" response = self._request("DELETE", endpoint) if response is None: raise HTTPError(f"Failed to cancel job {job_id} after multiple retries") diff --git a/tools/firecrawl/tools/map.py b/tools/firecrawl/tools/map.py index 199280f5b..cd5baccf6 100644 --- a/tools/firecrawl/tools/map.py +++ b/tools/firecrawl/tools/map.py @@ -16,7 +16,8 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag ) payload = {} payload["search"] = tool_parameters.get("search") - payload["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", True) + if tool_parameters.get("ignoreSitemap", True): + payload["sitemap"] = "skip" payload["includeSubdomains"] = tool_parameters.get("includeSubdomains", False) payload["limit"] = tool_parameters.get("limit", 5000) map_result = app.map(url=tool_parameters["url"], **payload) diff --git a/tools/firecrawl/tools/scrape.py b/tools/firecrawl/tools/scrape.py index 49c57768e..8dc2d8cdb 100644 --- a/tools/firecrawl/tools/scrape.py +++ b/tools/firecrawl/tools/scrape.py @@ -14,19 +14,29 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag api_key=self.runtime.credentials.get("firecrawl_api_key"), base_url=self.runtime.credentials.get("base_url") ) payload = {} - extract = {} - payload["formats"] = get_array_params(tool_parameters, "formats") + formats = get_array_params(tool_parameters, "formats") or [] payload["onlyMainContent"] = tool_parameters.get("onlyMainContent", True) payload["includeTags"] = get_array_params(tool_parameters, "includeTags") payload["excludeTags"] = get_array_params(tool_parameters, "excludeTags") payload["headers"] = get_json_params(tool_parameters, "headers") payload["waitFor"] = tool_parameters.get("waitFor", 0) payload["timeout"] = tool_parameters.get("timeout", 30000) - extract["schema"] = get_json_params(tool_parameters, "schema") - extract["systemPrompt"] = tool_parameters.get("systemPrompt") - extract["prompt"] = tool_parameters.get("prompt") - extract = {k: v for (k, v) in extract.items() if v not in (None, "")} - payload["extract"] = extract or None + # v2: structured/LLM extraction is expressed as a "json" format object inside + # the formats array (the v1 top-level "extract" field was removed). + json_format = {"type": "json"} + json_format["schema"] = get_json_params(tool_parameters, "schema") + # v2 removed the json format's "systemPrompt" field (only "prompt"/"schema" remain), + # so fold any provided system prompt into the single "prompt" field. + system_prompt = tool_parameters.get("systemPrompt") + prompt = tool_parameters.get("prompt") + if system_prompt: + prompt = f"{system_prompt}\n\n{prompt}" if prompt else system_prompt + json_format["prompt"] = prompt + json_format = {k: v for (k, v) in json_format.items() if v not in (None, "")} + if len(json_format) > 1: # has more than just {"type": "json"} + formats = [f for f in formats if f != "extract"] + formats.append(json_format) + payload["formats"] = formats or None payload = {k: v for (k, v) in payload.items() if v not in (None, "")} crawl_result = app.scrape_url(url=tool_parameters["url"], **payload) markdown_result = crawl_result.get("data", {}).get("markdown", "")