diff --git a/tests/tools/paddleocr/test_file_input.py b/tests/tools/paddleocr/test_file_input.py index af56d3789..f57580465 100644 --- a/tests/tools/paddleocr/test_file_input.py +++ b/tests/tools/paddleocr/test_file_input.py @@ -1,10 +1,38 @@ import base64 import os import sys +from unittest.mock import MagicMock import pytest import yaml +# Mock paddleocr module before any imports +mock_paddleocr = MagicMock() + +# Mock public API classes +mock_paddleocr.PaddleOCRClient = MagicMock +mock_paddleocr.OCROptions = lambda **kw: MagicMock() +mock_paddleocr.PPStructureV3Options = lambda **kw: MagicMock() +mock_paddleocr.PaddleOCRVLOptions = lambda **kw: MagicMock() +mock_paddleocr.AuthError = Exception +mock_paddleocr.PaddleOCRAPIError = Exception + +# Mock internal modules for backward compatibility +mock_paddleocr._api_client = MagicMock() +mock_paddleocr._api_client.PaddleOCRClient = mock_paddleocr.PaddleOCRClient +mock_paddleocr._api_client.models = MagicMock() +mock_paddleocr._api_client.models.OCROptions = mock_paddleocr.OCROptions +mock_paddleocr._api_client.models.PPStructureV3Options = mock_paddleocr.PPStructureV3Options +mock_paddleocr._api_client.models.PaddleOCRVLOptions = mock_paddleocr.PaddleOCRVLOptions +mock_paddleocr._api_client.errors = MagicMock() +mock_paddleocr._api_client.errors.AuthError = mock_paddleocr.AuthError +mock_paddleocr._api_client.errors.PaddleOCRAPIError = mock_paddleocr.PaddleOCRAPIError + +sys.modules["paddleocr"] = mock_paddleocr +sys.modules["paddleocr._api_client"] = mock_paddleocr._api_client +sys.modules["paddleocr._api_client.models"] = mock_paddleocr._api_client.models +sys.modules["paddleocr._api_client.errors"] = mock_paddleocr._api_client.errors + REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) PLUGIN_DIR = os.path.join(REPO_ROOT, "tools", "paddleocr") if PLUGIN_DIR not in sys.path: @@ -45,10 +73,14 @@ def test_file_upload_is_base64_encoded(): file_type=FileType.IMAGE, ) - payload, normalized_file_type = normalize_file_input(file, "auto") + input_value, is_temp_file, file_type_code = normalize_file_input(file, "auto") - assert payload == base64.b64encode(b"image-bytes").decode("utf-8") - assert normalized_file_type == 1 + # New implementation saves to temp file for SDK + assert os.path.exists(input_value) + assert is_temp_file is True + assert file_type_code == 1 + # Clean up + os.unlink(input_value) def test_pdf_file_upload_infers_file_type(): @@ -56,10 +88,14 @@ def test_pdf_file_upload_infers_file_type(): b"%PDF-1.7", filename="invoice.pdf", mime_type="application/pdf", extension=".pdf" ) - payload, normalized_file_type = normalize_file_input(file, "auto") + input_value, is_temp_file, file_type_code = normalize_file_input(file, "auto") - assert payload == base64.b64encode(b"%PDF-1.7").decode("utf-8") - assert normalized_file_type == 0 + # New implementation saves to temp file for SDK + assert os.path.exists(input_value) + assert is_temp_file is True + assert file_type_code == 0 + # Clean up + os.unlink(input_value) def test_image_file_upload_infers_file_type_from_filename_when_mime_type_missing(): @@ -71,10 +107,14 @@ def test_image_file_upload_infers_file_type_from_filename_when_mime_type_missing file_type=FileType.IMAGE, ) - payload, normalized_file_type = normalize_file_input(file, None) + input_value, is_temp_file, file_type_code = normalize_file_input(file, None) - assert payload == base64.b64encode(b"image-bytes").decode("utf-8") - assert normalized_file_type == 1 + # New implementation saves to temp file for SDK + assert os.path.exists(input_value) + assert is_temp_file is True + assert file_type_code == 1 + # Clean up + os.unlink(input_value) def test_explicit_file_type_overrides_inference(): @@ -86,17 +126,22 @@ def test_explicit_file_type_overrides_inference(): file_type=FileType.IMAGE, ) - payload, normalized_file_type = normalize_file_input(file, "pdf") + input_value, is_temp_file, file_type_code = normalize_file_input(file, "pdf") - assert payload == base64.b64encode(b"image-bytes").decode("utf-8") - assert normalized_file_type == 0 + # New implementation saves to temp file for SDK + assert os.path.exists(input_value) + assert is_temp_file is True + assert file_type_code == 0 + # Clean up + os.unlink(input_value) def test_legacy_file_string_is_passed_through(): - payload, normalized_file_type = normalize_file_input("https://example.com/scan.pdf", "auto") + input_value, is_temp_file, file_type_code = normalize_file_input("https://example.com/scan.pdf", "auto") - assert payload == "https://example.com/scan.pdf" - assert normalized_file_type is None + assert input_value == "https://example.com/scan.pdf" + assert is_temp_file is False + assert file_type_code is None def test_missing_file_input_raises_clear_error(): @@ -106,21 +151,44 @@ def test_missing_file_input_raises_clear_error(): def invoke_tool_with_mocked_api(monkeypatch, tool_cls, credentials, parameters): captured = {} - module_name = tool_cls.__module__.split(".")[-1] - - def fake_api_request(api_url, params, access_token): - captured["api_url"] = api_url - captured["params"] = params - captured["access_token"] = access_token - return { - "errorCode": 0, - "result": { - "ocrResults": [{"prunedResult": {"rec_texts": ["hello", "world"]}}], - "layoutParsingResults": [{"markdown": {"text": "# Parsed", "images": {}}}], - }, - } - - monkeypatch.setattr(f"tools.{module_name}.make_paddleocr_api_request", fake_api_request) + + def fake_sdk_call(**kwargs): + captured["kwargs"] = kwargs + # Return mock result - use simple dict instead of SDK classes + if tool_cls == TextRecognitionTool: + return type("OCRResult", (), {"job_id": "test-job", "pages": [ + type("OCRPage", (), {"pruned_result": {"rec_texts": ["hello", "world"]}, "ocr_image_url": None})() + ]})() + else: + return type("DocParsingResult", (), {"job_id": "test-job", "pages": [ + type("DocParsingPage", (), {"markdown_text": "# Parsed", "markdown_images": {}, "output_images": {}})() + ]})() + + # Mock the entire SDK module and client + fake_client = MagicMock() + fake_client.ocr = fake_sdk_call + fake_client.parse_document = fake_sdk_call + + # Mock utils module functions + import tools.utils as utils_module + monkeypatch.setattr(utils_module, "get_sdk_client", lambda *args: fake_client) + monkeypatch.setattr(utils_module, "base64_to_temp_file", lambda *args: "temp_file.png") + monkeypatch.setattr(utils_module, "cleanup_temp_file", lambda *args: None) + + # Mock in the specific tool module (they import these directly from utils) + if tool_cls == TextRecognitionTool: + import tools.text_recognition as tr_module + monkeypatch.setattr(tr_module, "get_sdk_client", lambda *args: fake_client) + monkeypatch.setattr(tr_module, "cleanup_temp_file", lambda *args: None) + elif tool_cls == DocumentParsingTool: + import tools.document_parsing as dp_module + monkeypatch.setattr(dp_module, "get_sdk_client", lambda *args: fake_client) + monkeypatch.setattr(dp_module, "cleanup_temp_file", lambda *args: None) + else: + import tools.document_parsing_vl as dpv_module + monkeypatch.setattr(dpv_module, "get_sdk_client", lambda *args: fake_client) + monkeypatch.setattr(dpv_module, "cleanup_temp_file", lambda *args: None) + tool = tool_cls.from_credentials(credentials) list(tool._invoke(parameters)) return captured @@ -145,11 +213,11 @@ def test_text_recognition_sends_normalized_file_to_api(monkeypatch): {"file": file, "fileType": "auto", "visualize": False}, ) - assert captured["api_url"] == "https://example.com/text-recognition" - assert captured["access_token"] == "token" - assert captured["params"]["file"] == base64.b64encode(b"image-bytes").decode("utf-8") - assert captured["params"]["fileType"] == 1 - assert captured["params"]["visualize"] is False + # SDK receives file_path (temp file), not base64 directly + assert "file_path" in captured["kwargs"] + assert captured["kwargs"]["file_path"] == "temp_file.png" + assert captured["kwargs"]["options"] is not None + assert hasattr(captured["kwargs"]["options"], "visualize") def test_document_parsing_sends_normalized_file_to_api(monkeypatch): @@ -167,10 +235,9 @@ def test_document_parsing_sends_normalized_file_to_api(monkeypatch): {"file": file, "fileType": "auto", "markdownIgnoreLabels": "header, footer"}, ) - assert captured["api_url"] == "https://example.com/document-parsing" - assert captured["params"]["file"] == base64.b64encode(b"%PDF-1.7").decode("utf-8") - assert captured["params"]["fileType"] == 0 - assert captured["params"]["markdownIgnoreLabels"] == ["header", "footer"] + assert "file_path" in captured["kwargs"] + assert captured["kwargs"]["file_path"] == "temp_file.png" + assert captured["kwargs"]["options"] is not None def test_document_parsing_vl_sends_normalized_file_to_api(monkeypatch): @@ -192,10 +259,8 @@ def test_document_parsing_vl_sends_normalized_file_to_api(monkeypatch): {"file": file, "fileType": "auto", "promptLabel": "undefined"}, ) - assert captured["api_url"] == "https://example.com/document-parsing-vl" - assert captured["params"]["file"] == base64.b64encode(b"image-bytes").decode("utf-8") - assert captured["params"]["fileType"] == 1 - assert "promptLabel" not in captured["params"] + assert "file_path" in captured["kwargs"] + assert captured["kwargs"]["file_path"] == "temp_file.png" def load_tool_yaml(tool_name: str) -> dict: diff --git a/tools/paddleocr/manifest.yaml b/tools/paddleocr/manifest.yaml index 1f29e9fac..872e9f7fd 100644 --- a/tools/paddleocr/manifest.yaml +++ b/tools/paddleocr/manifest.yaml @@ -1,4 +1,4 @@ -version: 0.2.6 +version: 0.2.7 type: plugin author: langgenius name: paddleocr diff --git a/tools/paddleocr/provider/paddleocr.py b/tools/paddleocr/provider/paddleocr.py index 98c6ebfd2..eb1d3b4a6 100644 --- a/tools/paddleocr/provider/paddleocr.py +++ b/tools/paddleocr/provider/paddleocr.py @@ -6,6 +6,7 @@ from tools.document_parsing import DocumentParsingTool from tools.document_parsing_vl import DocumentParsingVlTool from tools.text_recognition import TextRecognitionTool +from tools.utils import call_paddleocr_api, get_sdk_client class PaddleocrProvider(ToolProvider): @@ -15,36 +16,26 @@ def _validate_credentials(self, credentials: dict[str, Any]) -> None: "AI Studio access token must be provided" ) - api_url_keys = ( - "text_recognition_api_url", - "document_parsing_api_url", - "document_parsing_vl_api_url", - ) - tool_classes = ( - TextRecognitionTool, - DocumentParsingTool, - DocumentParsingVlTool, - ) + # Get base_url (optional, uses SDK default if not provided) + base_url = credentials.get("base_url") + + # Test with OCR (works for all models) test_file = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png" - if not any(key in credentials for key in api_url_keys): - raise ToolProviderCredentialValidationError( - "You should provide at least one API URL" + try: + client_config = get_sdk_client( + access_token=credentials["aistudio_access_token"], + base_url=base_url, ) - - for api_url_key, tool_cls in zip(api_url_keys, tool_classes): - if api_url_key in credentials: - try: - self._test_tool_validation(tool_cls, credentials, test_file) - except Exception as e: - raise ToolProviderCredentialValidationError( - f"Invalid credentials for {tool_cls.__name__}" - ) from e - - def _test_tool_validation( - self, tool_cls, credentials: dict[str, Any], test_file: str - ) -> None: - tool = tool_cls.from_credentials(credentials) - - for _ in tool.invoke(tool_parameters={"file": test_file}): - break + call_paddleocr_api( + model="PP-OCRv5", + file_url=test_file, + file_path=None, + options={}, + client_config=client_config, + is_document_parsing=False, + ) + except Exception as e: + raise ToolProviderCredentialValidationError( + f"Validation failed: {e}" + ) from e \ No newline at end of file diff --git a/tools/paddleocr/provider/paddleocr.yaml b/tools/paddleocr/provider/paddleocr.yaml index 21bd58e6a..04cac9ff5 100644 --- a/tools/paddleocr/provider/paddleocr.yaml +++ b/tools/paddleocr/provider/paddleocr.yaml @@ -34,39 +34,15 @@ credentials_for_provider: en_US: Get your AI Studio access token zh_Hans: 获取星河社区访问令牌 url: https://aistudio.baidu.com/index/accessToken - text_recognition_api_url: + base_url: type: text-input + required: false label: - en_US: Text Recognition API URL - zh_Hans: 文字识别 API URL + en_US: Base URL (Optional) + zh_Hans: Base URL(可选) placeholder: - en_US: Text Recognition API URL - zh_Hans: 文字识别 API URL + en_US: https://paddleocr.aistudio-app.com + zh_Hans: https://paddleocr.aistudio-app.com help: - en_US: Click the "API" button in the upper-left corner, select "Text recognition(PP-OCRv5)", and copy the `API_URL`. - zh_Hans: 点击左上角的“API”,选择“文字识别(PP-OCRv5)”并复制 `API_URL` - url: https://aistudio.baidu.com/paddleocr/task - document_parsing_api_url: - type: text-input - label: - en_US: Document Parsing API URL - zh_Hans: 文档解析 API URL - placeholder: - en_US: Document Parsing API URL - zh_Hans: 文档解析 API URL - help: - en_US: Click the "API" button in the upper-left corner, select "Document parsing(PP-StructureV3)", and copy the `API_URL`. - zh_Hans: 点击左上角的“API”,选择“文档解析(PP-StructureV3)”并复制 `API_URL` - url: https://aistudio.baidu.com/paddleocr/task - document_parsing_vl_api_url: - type: text-input - label: - en_US: Large Model Document Parsing API URL - zh_Hans: 大模型文档解析 API URL - placeholder: - en_US: Large Model Document Parsing API URL - zh_Hans: 大模型文档解析 API URL - help: - en_US: Click the "API" button in the upper-left corner, select "Large Model document parsing(PaddleOCR-VL)", and copy the `API_URL`. - zh_Hans: 点击左上角的“API”,选择“大模型文档解析(PaddleOCR-VL)”并复制 `API_URL` - url: https://aistudio.baidu.com/paddleocr/task + en_US: Leave empty to use the default PaddleOCR service. Only needed for self-hosted deployments. + zh_Hans: 留空则使用默认的 PaddleOCR 服务。仅自建服务时需要填写。 \ No newline at end of file diff --git a/tools/paddleocr/pyproject.toml b/tools/paddleocr/pyproject.toml index 8d92d7a7a..4213efd5d 100644 --- a/tools/paddleocr/pyproject.toml +++ b/tools/paddleocr/pyproject.toml @@ -1,10 +1,11 @@ [project] -name = "paddleocr" +name = "paddleocr-dify" version = "0.1.0" description = "Add your description here" readme = "README.md" requires-python = ">=3.12" +# Managed with uv; refresh the lockfile with `uv lock`. dependencies = [ "dify_plugin>=0.9.0", "requests>=2.34.2", @@ -12,4 +13,4 @@ dependencies = [ # uv run black . -C -l 100 && uv run ruff check --fix [dependency-groups] -dev = [] +dev = [] \ No newline at end of file diff --git a/tools/paddleocr/tools/document_parsing.py b/tools/paddleocr/tools/document_parsing.py index 90bc01817..ad6bad353 100644 --- a/tools/paddleocr/tools/document_parsing.py +++ b/tools/paddleocr/tools/document_parsing.py @@ -5,10 +5,11 @@ from dify_plugin.entities.tool import ToolInvokeMessage from tools.utils import ( - get_markdown_from_result, - make_paddleocr_api_request, + build_pp_structure_v3_options, + call_paddleocr_api, + cleanup_temp_file, + get_sdk_client, normalize_file_input, - process_images_from_result, ) @@ -21,79 +22,100 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag ) access_token = self.runtime.credentials["aistudio_access_token"] - if "document_parsing_api_url" not in self.runtime.credentials: - raise RuntimeError( - "The document parsing API URL is not configured or invalid. Please provide it in the plugin settings." - ) - api_url = self.runtime.credentials["document_parsing_api_url"] + # Get base_url (optional, uses SDK default if not provided) + base_url = self.runtime.credentials.get("base_url") - file_payload, file_type = normalize_file_input( + # Normalize file input - returns (input_value, is_temp_file, file_type_code) + file_input, is_temp_file, file_type_code = normalize_file_input( tool_parameters.get("file"), tool_parameters.get("fileType") ) - params: dict[str, Any] = {"file": file_payload} - if file_type is not None: - params["fileType"] = file_type - for optional_param_name in [ - "fileType", - "useDocOrientationClassify", - "useDocUnwarping", - "useTextlineOrientation", - "useSealRecognition", - "useTableRecognition", - "useFormulaRecognition", - "useChartRecognition", - "useRegionDetection", - "formatBlockContent", - "layoutThreshold", - "layoutNms", - "layoutUnclipRatio", - "layoutMergeBboxesMode", - "textDetLimitSideLen", - "textDetLimitType", - "textDetThresh", - "textDetBoxThresh", - "textDetUnclipRatio", - "textRecScoreThresh", - "sealDetLimitSideLen", - "sealDetLimitType", - "sealDetThresh", - "sealDetBoxThresh", - "sealDetUnclipRatio", - "sealRecScoreThresh", - "useWiredTableCellsTransToHtml", - "useWirelessTableCellsTransToHtml", - "useTableOrientationClassify", - "useOcrResultsWithTableCells", - "useE2eWiredTableRecModel", - "useE2eWirelessTableRecModel", - "markdownIgnoreLabels", - "prettifyMarkdown", - "showFormulaNumber", - "visualize", - ]: - if optional_param_name in tool_parameters and optional_param_name != "fileType": - params[optional_param_name] = tool_parameters[optional_param_name] + try: + # Build options from parameters + options = build_pp_structure_v3_options(tool_parameters) - # Convert markdownIgnoreLabels from comma-separated string to list - if "markdownIgnoreLabels" in params and isinstance(params["markdownIgnoreLabels"], str): - params["markdownIgnoreLabels"] = [ - label.strip() - for label in params["markdownIgnoreLabels"].split(",") - if label.strip() - ] + # Get API client config + client_config = get_sdk_client(access_token, base_url) - result = make_paddleocr_api_request(api_url, params, access_token) + # Call API with PP-StructureV3 model + if file_input.startswith(("http://", "https://")): + result = call_paddleocr_api( + model="PP-StructureV3", + file_url=file_input, + file_path=None, + options=options, + client_config=client_config, + is_document_parsing=True, + ) + else: + result = call_paddleocr_api( + model="PP-StructureV3", + file_url=None, + file_path=file_input, + options=options, + client_config=client_config, + is_document_parsing=True, + ) - images, image_path_map, failed_images, blob_messages = process_images_from_result( - result, self - ) + # Process images from result + images = [] + image_path_map = {} + failed_images = [] + + for page in result["pages"]: + if page["markdown_images"]: + image_dict = page["markdown_images"] + if image_dict: + for image_path, image_url in image_dict.items(): + if image_path in image_path_map: + continue + try: + import requests + image_bytes = requests.get(image_url, timeout=(10, 600)).content + file_name = f"paddleocr_image_{len(images)}.jpg" + upload_response = self.session.file.upload( + file_name, image_bytes, "image/jpeg" + ) + images.append(upload_response) + image_path_map[image_path] = upload_response + if not upload_response.preview_url: + failed_images.append(image_path) + except Exception as e: + self.runtime.logger.warning(f"Failed to process image {image_path}: {e}") + failed_images.append(image_path) - markdown = get_markdown_from_result(result, image_path_map, failed_images) + # Build markdown with image replacement + markdown_text_list = [] + for page in result["pages"]: + markdown_text = page["markdown_text"] + if markdown_text is not None: + # Replace image paths with uploaded URLs + for image_path, upload_response in image_path_map.items(): + if upload_response.preview_url: + markdown_text = markdown_text.replace( + f'src="{image_path}"', + f'src="{upload_response.preview_url}"' + ) + else: + markdown_text = markdown_text.replace( + f'src="{image_path}"', + 'src="[Image unavailable]"' + ) + markdown_text_list.append(markdown_text) - for blob_data, blob_meta in blob_messages: - yield self.create_blob_message(blob_data, meta=blob_meta) + # Return raw result as JSON + yield self.create_json_message({ + "job_id": result["job_id"], + "pages": [ + { + "markdown_text": page["markdown_text"], + "markdown_images": page["markdown_images"], + "output_images": page["output_images"], + } + for page in result["pages"] + ] + }) - yield self.create_variable_message("images", images) - yield self.create_text_message(markdown) - yield self.create_json_message(result) + finally: + # Clean up temporary file if created + cleanup_temp_file(file_input, is_temp_file) \ No newline at end of file diff --git a/tools/paddleocr/tools/document_parsing_vl.py b/tools/paddleocr/tools/document_parsing_vl.py index 9406af801..f3cce642b 100644 --- a/tools/paddleocr/tools/document_parsing_vl.py +++ b/tools/paddleocr/tools/document_parsing_vl.py @@ -5,10 +5,11 @@ from dify_plugin.entities.tool import ToolInvokeMessage from tools.utils import ( - get_markdown_from_result, - make_paddleocr_api_request, + build_paddleocr_vl_options, + call_paddleocr_api, + cleanup_temp_file, + get_sdk_client, normalize_file_input, - process_images_from_result, ) @@ -21,76 +22,102 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag ) access_token = self.runtime.credentials["aistudio_access_token"] - if "document_parsing_vl_api_url" not in self.runtime.credentials: - raise RuntimeError( - "The large model document parsing API URL is not configured or invalid. Please provide it in the plugin settings." - ) - api_url = self.runtime.credentials["document_parsing_vl_api_url"] + # Get base_url (optional, uses SDK default if not provided) + base_url = self.runtime.credentials.get("base_url") - file_payload, file_type = normalize_file_input( + # Normalize file input - returns (input_value, is_temp_file, file_type_code) + file_input, is_temp_file, file_type_code = normalize_file_input( tool_parameters.get("file"), tool_parameters.get("fileType") ) - params: dict[str, Any] = {"file": file_payload} - if file_type is not None: - params["fileType"] = file_type - for optional_param_name in [ - "fileType", - "useDocOrientationClassify", - "useDocUnwarping", - "useLayoutDetection", - "useChartRecognition", - "useSealRecognition", - "useOcrForImageBlock", - "layoutThreshold", - "layoutNms", - "layoutUnclipRatio", - "layoutMergeBboxesMode", - "layoutShapeMode", - "promptLabel", - "formatBlockContent", - "repetitionPenalty", - "temperature", - "topP", - "minPixels", - "maxPixels", - "maxNewTokens", - "mergeLayoutBlocks", - "markdownIgnoreLabels", - "vlmExtraArgs", - "prettifyMarkdown", - "showFormulaNumber", - "restructurePages", - "mergeTables", - "relevelTitles", - "visualize", - ]: - if optional_param_name in tool_parameters and optional_param_name != "fileType": - params[optional_param_name] = tool_parameters[optional_param_name] + try: + # Build options from parameters + options = build_paddleocr_vl_options(tool_parameters) - # Convert promptLabel parameter - if "promptLabel" in params and params["promptLabel"] == "undefined": - params.pop("promptLabel") + # Get API client config + client_config = get_sdk_client(access_token, base_url) - # Convert markdownIgnoreLabels from comma-separated string to list - if "markdownIgnoreLabels" in params and isinstance(params["markdownIgnoreLabels"], str): - params["markdownIgnoreLabels"] = [ - label.strip() - for label in params["markdownIgnoreLabels"].split(",") - if label.strip() - ] + # Call API with PaddleOCR-VL-1.6 model + if file_input.startswith(("http://", "https://")): + result = call_paddleocr_api( + model="PaddleOCR-VL-1.6", + file_url=file_input, + file_path=None, + options=options, + client_config=client_config, + is_document_parsing=True, + ) + else: + result = call_paddleocr_api( + model="PaddleOCR-VL-1.6", + file_url=None, + file_path=file_input, + options=options, + client_config=client_config, + is_document_parsing=True, + ) - result = make_paddleocr_api_request(api_url, params, access_token) + # Process images from result + images = [] + image_path_map = {} + failed_images = [] - images, image_path_map, failed_images, blob_messages = process_images_from_result( - result, self - ) + for page in result["pages"]: + if page["markdown_images"]: + image_dict = page["markdown_images"] + if image_dict: + for image_path, image_url in image_dict.items(): + if image_path in image_path_map: + continue + try: + import requests + image_bytes = requests.get(image_url, timeout=(10, 600)).content + file_name = f"paddleocr_vl_image_{len(images)}.jpg" + upload_response = self.session.file.upload( + file_name, image_bytes, "image/jpeg" + ) + images.append(upload_response) + image_path_map[image_path] = upload_response + if not upload_response.preview_url: + failed_images.append(image_path) + except Exception as e: + self.runtime.logger.warning(f"Failed to process image {image_path}: {e}") + failed_images.append(image_path) + + # Build markdown with image replacement + markdown_text_list = [] + for page in result["pages"]: + markdown_text = page["markdown_text"] + if markdown_text is not None: + # Replace image paths with uploaded URLs + for image_path, upload_response in image_path_map.items(): + if upload_response.preview_url: + markdown_text = markdown_text.replace( + f'src="{image_path}"', + f'src="{upload_response.preview_url}"' + ) + else: + markdown_text = markdown_text.replace( + f'src="{image_path}"', + 'src="[Image unavailable]"' + ) + markdown_text_list.append(markdown_text) - markdown = get_markdown_from_result(result, image_path_map, failed_images) + yield self.create_text_message("\n\n".join(markdown_text_list)) - for blob_data, blob_meta in blob_messages: - yield self.create_blob_message(blob_data, meta=blob_meta) + # Return raw result as JSON + yield self.create_json_message({ + "job_id": result["job_id"], + "pages": [ + { + "markdown_text": page["markdown_text"], + "markdown_images": page["markdown_images"], + "output_images": page["output_images"], + } + for page in result["pages"] + ] + }) - yield self.create_variable_message("images", images) - yield self.create_text_message(markdown) - yield self.create_json_message(result) + finally: + # Clean up temporary file if created + cleanup_temp_file(file_input, is_temp_file) \ No newline at end of file diff --git a/tools/paddleocr/tools/text_recognition.py b/tools/paddleocr/tools/text_recognition.py index 128236c45..a6da52f27 100644 --- a/tools/paddleocr/tools/text_recognition.py +++ b/tools/paddleocr/tools/text_recognition.py @@ -4,7 +4,13 @@ from dify_plugin import Tool from dify_plugin.entities.tool import ToolInvokeMessage -from tools.utils import make_paddleocr_api_request, normalize_file_input +from tools.utils import ( + build_ocr_options, + call_paddleocr_api, + cleanup_temp_file, + get_sdk_client, + normalize_file_input, +) class TextRecognitionTool(Tool): @@ -16,42 +22,64 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag ) access_token = self.runtime.credentials["aistudio_access_token"] - if "text_recognition_api_url" not in self.runtime.credentials: - raise RuntimeError( - "The text recognition API URL is not configured or invalid. Please provide it in the plugin settings." - ) - api_url = self.runtime.credentials["text_recognition_api_url"] + # Get base_url (optional, uses SDK default if not provided) + base_url = self.runtime.credentials.get("base_url") - file_payload, file_type = normalize_file_input( + # Normalize file input - returns (input_value, is_temp_file, file_type_code) + file_input, is_temp_file, file_type_code = normalize_file_input( tool_parameters.get("file"), tool_parameters.get("fileType") ) - params: dict[str, Any] = {"file": file_payload} - if file_type is not None: - params["fileType"] = file_type - for optional_param_name in [ - "fileType", - "useDocOrientationClassify", - "useDocUnwarping", - "useTextlineOrientation", - "textDetLimitSideLen", - "textDetLimitType", - "textDetThresh", - "textDetBoxThresh", - "textDetUnclipRatio", - "textRecScoreThresh", - "returnWordBox", - "visualize", - ]: - if optional_param_name in tool_parameters and optional_param_name != "fileType": - params[optional_param_name] = tool_parameters[optional_param_name] - - result = make_paddleocr_api_request(api_url, params, access_token) - - all_text = [] - for item in result.get("result", {}).get("ocrResults", []): - text_list = item.get("prunedResult", {}).get("rec_texts") - if text_list is not None: - all_text.append("\n".join(text_list)) - yield self.create_text_message("\n\n".join(all_text)) - yield self.create_json_message(result) + try: + # Build OCR options from parameters + options = build_ocr_options(tool_parameters) + + # Get API client config + client_config = get_sdk_client(access_token, base_url) + + # Call API + if file_input.startswith(("http://", "https://")): + result = call_paddleocr_api( + model="PP-OCRv5", + file_url=file_input, + file_path=None, + options=options, + client_config=client_config, + is_document_parsing=False, + ) + else: + result = call_paddleocr_api( + model="PP-OCRv5", + file_url=None, + file_path=file_input, + options=options, + client_config=client_config, + is_document_parsing=False, + ) + + # Extract text for output + all_text = [] + for page in result["pages"]: + pruned = page["pruned_result"] + if pruned and "rec_texts" in pruned: + text_list = pruned["rec_texts"] + if text_list is not None: + all_text.append("\n".join(text_list)) + + yield self.create_text_message("\n\n".join(all_text)) + + # Return raw result as JSON + yield self.create_json_message({ + "job_id": result["job_id"], + "pages": [ + { + "pruned_result": page["pruned_result"], + "ocr_image_url": page["ocr_image_url"], + } + for page in result["pages"] + ] + }) + + finally: + # Clean up temporary file if created + cleanup_temp_file(file_input, is_temp_file) \ No newline at end of file diff --git a/tools/paddleocr/tools/utils.py b/tools/paddleocr/tools/utils.py index ff93254d7..36367d299 100644 --- a/tools/paddleocr/tools/utils.py +++ b/tools/paddleocr/tools/utils.py @@ -1,19 +1,18 @@ import base64 +import json import logging import os import re +import time +import tempfile from typing import Any, List, Optional, Tuple +from urllib.parse import urlparse -import requests from dify_plugin.file.file import File from dify_plugin.invocations.file import UploadFileResponse -REQUEST_TIMEOUT = (10, 600) - # Pre-compiled regex patterns for performance HTML_IMG_PATTERN = re.compile(r'(]*src=")([^"]+)(")') - -# Template for failed image replacement pattern FAILED_IMG_TAG_TEMPLATE = r']*src="[^"]*{escaped_path}[^"]*"[^>]*>' @@ -22,6 +21,42 @@ IMAGE_EXTENSIONS = {".bmp", ".jpeg", ".jpg", ".png", ".tif", ".tiff", ".webp"} +def camel_to_snake(name: str) -> str: + """Convert camelCase or PascalCase to snake_case. + + Args: + name: camelCase or PascalCase string + + Returns: + snake_case string + """ + # Handle camelCase + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + # Handle PascalCase + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def extract_base_url(api_url: str) -> str: + """Extract base URL from full API URL. + + The SDK requires a base URL (e.g., https://example.com) + but users provide the full API URL (e.g., https://example.com/ocr). + This function extracts the base URL by removing the endpoint path. + + Args: + api_url: Full API URL + + Returns: + Base URL without endpoint path + """ + parsed = urlparse(api_url) + # Remove common PaddleOCR endpoints + path = parsed.path.rstrip("/") + if path in ("", "/ocr", "/layout-parsing", "/paddleocr"): + path = "" + return f"{parsed.scheme}://{parsed.netloc}{path}" + + def convert_file_type(file_type: str | None) -> int | None: """Convert file type string to API parameter value. @@ -35,16 +70,18 @@ def convert_file_type(file_type: str | None) -> int | None: return 0 elif file_type == "image": return 1 - else: # "auto" or None + else: return None -def normalize_file_input(file_value: Any, file_type: str | None) -> tuple[str, int | None]: - """Normalize PaddleOCR file input for API payloads. +def normalize_file_input(file_value: Any, file_type: str | None) -> Tuple[str, bool, int | None]: + """Normalize PaddleOCR file input. - Uploaded Dify files are converted to base64 content because the PaddleOCR - API accepts either a URL or base64-encoded file content in the `file` field. - Legacy string values are kept unchanged for URL/base64 compatibility. + Returns: + A tuple of (input_value, is_temp_file, file_type_code): + - input_value: URL, file path (temp or regular), or base64 string + - is_temp_file: True if the value is a temporary file path that should be deleted + - file_type_code: 0 for PDF, 1 for image, None for auto """ if file_value is None or (isinstance(file_value, str) and file_value == ""): raise RuntimeError("File is not provided.") @@ -53,12 +90,23 @@ def normalize_file_input(file_value: Any, file_type: str | None) -> tuple[str, i if isinstance(file_value, File): encoded_file = base64.b64encode(file_value.blob).decode("utf-8") - if explicit_file_type is not None: - return encoded_file, explicit_file_type - return encoded_file, infer_file_type(file_value) + temp_file = base64_to_temp_file(encoded_file, infer_file_extension(file_value)) + file_type_code = explicit_file_type if explicit_file_type is not None else infer_file_type(file_value) + return temp_file, True, file_type_code if isinstance(file_value, str): - return file_value, explicit_file_type + # Check if it's a URL + if file_value.startswith(("http://", "https://")): + return file_value, False, explicit_file_type + # Check if it's a file path (AI reviewer suggestion: check file path before base64 validation) + if os.path.exists(file_value): + return file_value, False, explicit_file_type + # Check if it's base64 (data URL or raw) + if file_value.startswith("data:") or is_likely_base64(file_value): + temp_file = base64_to_temp_file(extract_base64(file_value)) + return temp_file, True, explicit_file_type + # It's a file path (doesn't exist, but could be relative path) + return file_value, False, explicit_file_type raise RuntimeError("File must be a Dify file, URL, or base64-encoded string.") @@ -82,6 +130,21 @@ def infer_file_type(file_value: File) -> int | None: return None +def infer_file_extension(file_value: File) -> str: + mime_type = (file_value.mime_type or "").lower() + if mime_type == "application/pdf": + return ".pdf" + if mime_type.startswith("image/"): + ext = mime_type.split("/")[-1] + return f".{ext}" + + extension = normalize_extension(file_value.extension) + if extension is None: + extension = normalize_extension(os.path.splitext(file_value.filename or "")[1]) + + return extension if extension else ".png" + + def normalize_extension(extension: str | None) -> str | None: if not extension: return None @@ -89,36 +152,73 @@ def normalize_extension(extension: str | None) -> str | None: return extension if extension.startswith(".") else f".{extension}" +def extract_base64(data_url: str) -> str: + if data_url.startswith("data:"): + return data_url.split(",", 1)[1] + return data_url + + +def is_likely_base64(s: str) -> bool: + if len(s) < 32: + return False + try: + base64.b64decode(s, validate=True) + return True + except Exception: + return False + + +def base64_to_temp_file(base64_str: str, suffix: str = ".png") -> str: + """Save base64 string to a temporary file. + + Args: + base64_str: Base64 encoded string + suffix: File extension suffix + + Returns: + Path to the temporary file + """ + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f: + f.write(base64.b64decode(base64_str)) + return f.name + + +def bytes_to_temp_file(data: bytes, suffix: str = ".png") -> str: + """Save bytes directly to a temporary file (AI reviewer suggestion). + + Args: + data: Raw bytes data + suffix: File extension suffix + + Returns: + Path to the temporary file + """ + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f: + f.write(data) + return f.name + + +def cleanup_temp_file(file_path: str, is_temp: bool) -> None: + """Clean up temporary file if it exists and is marked as temporary. + + Args: + file_path: Path to the file + is_temp: True if the file is a temporary file that should be deleted + """ + if is_temp and file_path and os.path.exists(file_path): + try: + os.unlink(file_path) + except Exception as e: + logger.warning(f"Failed to clean up temporary file {file_path}: {e}") + + def extract_image_urls_from_markdown(markdown: str) -> List[str]: """Extract image URLs from markdown""" - # Match various image URL formats, including relative and absolute paths image_pattern = re.compile(r']*src="([^"]*)"[^>]*>', re.IGNORECASE) matches = image_pattern.findall(markdown) return matches -def download_image_from_url(image_url: str) -> bytes: - """Download image from URL and return image data and MIME type""" - try: - logger.debug(f"Downloading image from URL: {image_url}") - resp = requests.get(image_url, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - - logger.debug( - f"Successfully downloaded image from {image_url}, size: {len(resp.content)} bytes" - ) - return resp.content - except requests.exceptions.Timeout as e: - logger.error(f"Timeout downloading image from {image_url}: {e}") - raise RuntimeError(f"Failed to download image from {image_url}: timeout") from e - except requests.exceptions.RequestException as e: - logger.error(f"Network error downloading image from {image_url}: {e}") - raise RuntimeError(f"Failed to download image from {image_url}: network error") from e - except Exception as e: - logger.error(f"Unexpected error downloading image from {image_url}: {e}") - raise RuntimeError(f"Failed to download image from {image_url}: {e}") from e - - def replace_markdown_image_paths( markdown: str, image_path_map: dict[str, UploadFileResponse], @@ -157,7 +257,6 @@ def replace_markdown_image_paths( placeholder_count = 0 for failed_path in failed_images: escaped_path = re.escape(failed_path) - # Remove entire img tags for failed images using template pattern pattern = FAILED_IMG_TAG_TEMPLATE.format(escaped_path=escaped_path) original_markdown = markdown markdown = re.sub(pattern, "[Image unavailable]", markdown) @@ -183,9 +282,9 @@ def process_images_from_result( tool_instance: Tool instance for file operations """ images = [] - image_path_map = {} # key: image path, value: UploadFileResponse - failed_images = [] # images that failed to process - blob_messages = [] # blob messages to yield: [(data, meta), ...] + image_path_map = {} + failed_images = [] + blob_messages = [] image_counter = 0 logger.debug("Processing images from API result") @@ -193,18 +292,14 @@ def process_images_from_result( for item in result.get("result", {}).get("layoutParsingResults", []): markdown_data = item.get("markdown", {}) if markdown_data: - # Get image dictionary {path: url} from markdown image_dict = markdown_data.get("images", {}) if image_dict: - logger.debug( - f"Found {len(image_dict)} images to process: {list(image_dict.keys())}" - ) + logger.debug(f"Found {len(image_dict)} images to process: {list(image_dict.keys())}") else: logger.debug("No images found in this markdown item") for image_path, image_url in image_dict.items(): if image_path in image_path_map: - # Already processed this path logger.debug(f"Skipping already processed image: {image_path}") continue @@ -213,18 +308,15 @@ def process_images_from_result( image_processed_successfully = False try: - # Download image first try: image_bytes = download_image_from_url(image_url) except Exception as download_error: logger.warning( f"Failed to download image {image_path} from {image_url}: {download_error}" ) - # Cannot download - cannot create blob message, mark as failed for markdown failed_images.append(image_path) continue - # Upload image to dify with error handling file_name = f"paddleocr_image_{image_counter}.jpg" logger.debug(f"Uploading image {image_path} as {file_name}") @@ -240,7 +332,6 @@ def process_images_from_result( f"Successfully uploaded image {image_path}, preview_url: {upload_response.preview_url}" ) - # Check if upload was successful but no preview URL if not upload_response.preview_url: logger.warning( f"No preview URL for uploaded image {image_path}, creating blob message as fallback" @@ -254,7 +345,6 @@ def process_images_from_result( except Exception as upload_error: logger.error(f"Failed to upload image {image_path} to dify: {upload_error}") - # Create blob message as fallback when upload fails logger.info( f"Creating blob message as fallback for failed upload of {image_path}" ) @@ -298,68 +388,398 @@ def get_markdown_from_result( return "\n\n".join(markdown_text_list) -def make_paddleocr_api_request(api_url: str, params: dict, access_token: str) -> dict: +def download_image_from_url(image_url: str) -> bytes: + """Download image from URL and return image data and MIME type""" + import requests + try: - logger.debug(f"Making PaddleOCR API request to {api_url}") - resp = requests.post( - api_url, - headers={"Client-Platform": "dify", "Authorization": f"token {access_token}"}, - json=params, - timeout=REQUEST_TIMEOUT, + logger.debug(f"Downloading image from URL: {image_url}") + resp = requests.get(image_url, timeout=(10, 600)) + resp.raise_for_status() + + logger.debug( + f"Successfully downloaded image from {image_url}, size: {len(resp.content)} bytes" ) - logger.debug(f"PaddleOCR API request completed with status {resp.status_code}") + return resp.content except requests.exceptions.Timeout as e: - logger.error(f"PaddleOCR API request timed out: {e}") - raise RuntimeError("PaddleOCR API request timed out") from e + logger.error(f"Timeout downloading image from {image_url}: {e}") + raise RuntimeError(f"Failed to download image from {image_url}: timeout") from e except requests.exceptions.RequestException as e: - logger.error(f"PaddleOCR API request failed (network error): {e}") - raise RuntimeError("PaddleOCR API request failed (network error)") from e + logger.error(f"Network error downloading image from {image_url}: {e}") + raise RuntimeError(f"Failed to download image from {image_url}: network error") from e + except Exception as e: + logger.error(f"Unexpected error downloading image from {image_url}: {e}") + raise RuntimeError(f"Failed to download image from {image_url}: {e}") from e + + +# ==================== HTTP Async Job API Implementation ==================== + +DEFAULT_BASE_URL = "https://paddleocr.aistudio-app.com" +API_PATH = "/api/v2/ocr/jobs" +DEFAULT_REQUEST_TIMEOUT = 300.0 +DEFAULT_POLL_TIMEOUT = 600.0 +DEFAULT_INITIAL_INTERVAL = 3.0 +DEFAULT_MULTIPLIER = 1.5 +DEFAULT_MAX_INTERVAL = 15.0 + + +def get_sdk_client(access_token: str, base_url: str | None = None) -> dict[str, Any]: + """Get PaddleOCR API client configuration. + + Args: + access_token: AI Studio access token + base_url: Base URL (optional, uses SDK default if not provided) + + Returns: + Configuration dict with token, base_url, headers + """ + # If base_url is provided, extract it (in case user passed full API URL) + if base_url: + base_url = extract_base_url(base_url) + else: + base_url = DEFAULT_BASE_URL + + return { + "token": access_token, + "base_url": base_url.rstrip("/"), + "headers": { + "Authorization": f"Bearer {access_token}", + "Client-Platform": "dify", + }, + } + + +def build_ocr_options(params: dict[str, Any]) -> dict[str, Any]: + """Build OCR options dict from parameters using dynamic conversion. + + Args: + params: Tool parameters + + Returns: + Options dict with snake_case keys + """ + options_dict = {} + for api_name, value in params.items(): + if value is None: + continue + # Convert camelCase to snake_case + option_name = camel_to_snake(api_name) + options_dict[option_name] = value + return options_dict + + +def build_pp_structure_v3_options(params: dict[str, Any]) -> dict[str, Any]: + """Build PPStructureV3 options dict from parameters using dynamic conversion. + + Args: + params: Tool parameters + + Returns: + Options dict with snake_case keys + """ + options_dict = {} + for api_name, value in params.items(): + if value is None: + continue + # Convert camelCase to snake_case + option_name = camel_to_snake(api_name) + # Handle markdownIgnoreLabels conversion + if api_name == "markdownIgnoreLabels" and isinstance(value, str): + value = [label.strip() for label in value.split(",") if label.strip()] + options_dict[option_name] = value + return options_dict + + +def build_paddleocr_vl_options(params: dict[str, Any]) -> dict[str, Any]: + """Build PaddleOCRVLOptions dict from parameters using dynamic conversion. + + Args: + params: Tool parameters + + Returns: + Options dict with snake_case keys + """ + options_dict = {} + for api_name, value in params.items(): + if value is None: + continue + # Handle promptLabel conversion - skip if "undefined" + if api_name == "promptLabel" and value == "undefined": + continue + # Convert camelCase to snake_case + option_name = camel_to_snake(api_name) + # Handle markdownIgnoreLabels conversion + if api_name == "markdownIgnoreLabels" and isinstance(value, str): + value = [label.strip() for label in value.split(",") if label.strip()] + options_dict[option_name] = value + return options_dict + + +def _submit_job( + model: str, + file_url: str | None, + file_path: str | None, + options: dict[str, Any], + base_url: str, + headers: dict[str, str], +) -> str: + """Submit job and return job_id. + + Args: + model: Model name (e.g., "PP-OCRv5", "PP-StructureV3", "PaddleOCR-VL-1.6") + file_url: URL of the file (if using URL input) + file_path: Path to the file (if using file input) + options: Optional payload parameters + base_url: Base API URL + headers: Request headers + + Returns: + job_id string + + Raises: + RuntimeError: If submission fails + """ + import requests + + jobs_url = f"{base_url}{API_PATH}" try: - resp.raise_for_status() - except requests.exceptions.HTTPError as e: - status = resp.status_code + if file_url: + # Submit with URL + body = { + "fileUrl": file_url, + "model": model, + "optionalPayload": options, + } + resp = requests.post(jobs_url, json=body, headers=headers, timeout=DEFAULT_REQUEST_TIMEOUT) + else: + # Submit with file + data = { + "model": model, + "optionalPayload": json.dumps(options), + } + with open(file_path, "rb") as f: + resp = requests.post( + jobs_url, data=data, files={"file": f}, headers=headers, timeout=DEFAULT_REQUEST_TIMEOUT + ) + except requests.Timeout as e: + raise RuntimeError(f"Request timed out: {e}") from e + except requests.ConnectionError as e: + raise RuntimeError(f"Connection failed: {e}") from e + + if not 200 <= resp.status_code < 300: + try: + payload = resp.json() + msg = payload.get("msg") or payload.get("message") or payload.get("error") or resp.text + except ValueError: + msg = resp.text + raise RuntimeError(f"Job submission failed (HTTP {resp.status_code}): {msg}") + + try: + payload = resp.json() + job_id = payload.get("data", {}).get("jobId") or payload.get("jobId") + if not job_id: + raise RuntimeError(f"Job ID not found in response: {payload}") + return job_id + except (ValueError, KeyError) as e: + raise RuntimeError(f"Failed to parse job submission response: {e}") from e + + +def _poll_job( + job_id: str, + base_url: str, + headers: dict[str, str], + max_wait_time: float = DEFAULT_POLL_TIMEOUT, +) -> tuple[list[dict[str, Any]], dict[str, Any]]: + """Poll job until done, return (jsonl_data, status_data). + + Args: + job_id: Job ID + base_url: Base API URL + headers: Request headers + max_wait_time: Maximum wait time in seconds + + Returns: + Tuple of (jsonl_data list, status_data dict) + + Raises: + RuntimeError: If polling fails or job fails + """ + import requests + + jobs_url = f"{base_url}{API_PATH}" + status_url = f"{jobs_url}/{job_id}" - if status in (400, 422): + interval = DEFAULT_INITIAL_INTERVAL + start = time.monotonic() + deadline = start + max_wait_time + + while True: + now = time.monotonic() + if now >= deadline: + raise RuntimeError(f"Job {job_id} timed out after {max_wait_time:.1f} seconds") + + try: + resp = requests.get(status_url, headers=headers, timeout=DEFAULT_REQUEST_TIMEOUT) + except requests.Timeout as e: + raise RuntimeError(f"Request timed out: {e}") from e + except requests.ConnectionError as e: + raise RuntimeError(f"Connection failed: {e}") from e + + if not 200 <= resp.status_code < 300: try: - result = resp.json() - err_code = result.get("errorCode") - err_msg = result.get("errorMsg") + payload = resp.json() + msg = payload.get("msg") or payload.get("message") or payload.get("error") or resp.text except ValueError: - err_code = None - err_msg = resp.text or "Bad Request" + msg = resp.text + raise RuntimeError(f"Poll failed (HTTP {resp.status_code}): {msg}") + + try: + data = resp.json() + state = data.get("data", {}).get("state") or data.get("state") + except (ValueError, KeyError) as e: + raise RuntimeError(f"Failed to parse poll response: {e}") from e + + if state == "done": + # Get result URL + result_json_url = data.get("data", {}).get("resultJsonUrl") or data.get("resultJsonUrl") + if not result_json_url: + raise RuntimeError(f"Result URL not found in response: {data}") + + # Fetch JSONL result + try: + resp = requests.get(result_json_url, timeout=DEFAULT_REQUEST_TIMEOUT) + resp.raise_for_status() + except requests.Timeout as e: + raise RuntimeError(f"Result download timed out: {e}") from e + except requests.ConnectionError as e: + raise RuntimeError(f"Result download failed: {e}") from e + + # Parse JSONL + lines = resp.text.strip().split("\n") + jsonl_data = [] + for line in lines: + line = line.strip() + if line: + try: + jsonl_data.append(json.loads(line)) + except json.JSONDecodeError as e: + raise RuntimeError(f"Malformed JSONL result: {e}") from e + + return jsonl_data, data + + if state == "failed": + error_msg = data.get("data", {}).get("errorMsg") or data.get("errorMsg") or "Unknown error" + raise RuntimeError(f"Job {job_id} failed: {error_msg}") + + # Continue polling + remaining = deadline - time.monotonic() + if remaining <= 0: + raise RuntimeError(f"Job {job_id} timed out after {max_wait_time:.1f} seconds") + + sleep_time = min(interval, remaining) + time.sleep(sleep_time) + interval = min(interval * DEFAULT_MULTIPLIER, DEFAULT_MAX_INTERVAL) + - logger.error(f"PaddleOCR API returned {status}: code={err_code}, msg={err_msg}") - raise RuntimeError( - f"PaddleOCR API returned {status}: code={err_code}, msg={err_msg}" - ) from e +def _parse_ocr_result(job_id: str, jsonl_data: list[dict[str, Any]]) -> dict[str, Any]: + """Parse OCR result into compatible format. - if status in (401, 403): - logger.error(f"PaddleOCR API authorization failed ({status})") - raise RuntimeError(f"PaddleOCR API authorization failed ({status})") from e + Args: + job_id: Job ID + jsonl_data: JSONL data list + + Returns: + Dict with job_id and pages list - if status == 429: - logger.warning("PaddleOCR API rate limit exceeded (429)") - raise RuntimeError("PaddleOCR API rate limit exceeded (429)") from e + Raises: + RuntimeError: If parsing fails + """ + try: + pages = [] + for line_obj in jsonl_data: + result = line_obj["result"] + for item in result["ocrResults"]: + pages.append( + { + "pruned_result": item["prunedResult"], + "ocr_image_url": item.get("ocrImage"), + } + ) + return { + "job_id": job_id, + "pages": pages, + } + except (KeyError, TypeError) as e: + raise RuntimeError(f"Malformed OCR result payload: {e}") from e - if status in (500, 502, 503, 504): - logger.error(f"PaddleOCR API service unavailable ({status})") - raise RuntimeError(f"PaddleOCR API service unavailable ({status})") from e - logger.error(f"PaddleOCR API returned HTTP {status}: {resp.text}") - raise RuntimeError(f"PaddleOCR API returned HTTP {status}: {resp.text}") from e +def _parse_doc_parsing_result(job_id: str, jsonl_data: list[dict[str, Any]]) -> dict[str, Any]: + """Parse doc parsing result into compatible format. + Args: + job_id: Job ID + jsonl_data: JSONL data list + + Returns: + Dict with job_id and pages list + + Raises: + RuntimeError: If parsing fails + """ try: - result = resp.json() - logger.debug("Successfully parsed PaddleOCR API response") - except ValueError as e: - logger.error(f"Failed to decode JSON response from PaddleOCR API: {resp.text}") - raise RuntimeError(f"Failed to decode JSON response from PaddleOCR API: {resp.text}") from e - - err_code = result.get("errorCode") - err_msg = result.get("errorMsg") - if err_code != 0: - logger.error(f"PaddleOCR API returned error: code={err_code}, msg={err_msg}") - raise RuntimeError(f"PaddleOCR API returned error: code={err_code}, msg={err_msg}") - - return result + pages = [] + for line_obj in jsonl_data: + result = line_obj["result"] + for item in result["layoutParsingResults"]: + markdown = item["markdown"] + pages.append( + { + "markdown_text": markdown["text"], + "markdown_images": markdown.get("images", {}), + "output_images": item.get("outputImages", {}), + } + ) + return { + "job_id": job_id, + "pages": pages, + } + except (KeyError, TypeError) as e: + raise RuntimeError(f"Malformed document parsing result payload: {e}") from e + + +def call_paddleocr_api( + model: str, + file_url: str | None, + file_path: str | None, + options: dict[str, Any], + client_config: dict[str, Any], + is_document_parsing: bool = False, +) -> dict[str, Any]: + """Call PaddleOCR API using async job pattern. + + Args: + model: Model name (e.g., "PP-OCRv5", "PP-StructureV3", "PaddleOCR-VL-1.6") + file_url: URL of the file (if using URL input) + file_path: Path to the file (if using file input) + options: Optional payload parameters + client_config: Client config from get_sdk_client() + is_document_parsing: True for doc parsing, False for OCR + + Returns: + Parsed result dict with job_id and pages + + Raises: + RuntimeError: If API call fails + """ + job_id = _submit_job( + model, file_url, file_path, options, client_config["base_url"], client_config["headers"] + ) + jsonl_data, status_data = _poll_job( + job_id, client_config["base_url"], client_config["headers"] + ) + + if is_document_parsing: + return _parse_doc_parsing_result(job_id, jsonl_data) + else: + return _parse_ocr_result(job_id, jsonl_data) \ No newline at end of file diff --git a/tools/paddleocr/uv.lock b/tools/paddleocr/uv.lock index dda9a8ba2..8ba50b5bb 100644 --- a/tools/paddleocr/uv.lock +++ b/tools/paddleocr/uv.lock @@ -365,11 +365,11 @@ wheels = [ [[package]] name = "idna" -version = "3.16" +version = "3.18" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/88/bcf9709822fe69d02c2a6a77956c98ce6ea8ca8767a9aadcedc7eb6a2390/idna-3.16.tar.gz", hash = "sha256:d7a6da03db833450fca25d2358ac9ff06cd624577a4aea3a596d5c0f77b8e03d", size = 203770, upload-time = "2026-05-22T00:16:18.781Z" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/63/9496c57188a2ee585e0f1db071d75089a11e98aa86eb99d9d7618fc1edce/idna-3.18.tar.gz", hash = "sha256:ffb385a7e039654cef1ab9ef32c6fafe283c0c0467bba1d9029738ce4a14a848", size = 196711, upload-time = "2026-06-02T14:34:07.794Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/94/16/70255075a9859a0e3adb789b68ceb0e210dec03934245fd98d248226572f/idna-3.16-py3-none-any.whl", hash = "sha256:cc246e3a3f89580c3a951b5ad298ca4638078b2cdd4f115654332b5c26daded5", size = 74165, upload-time = "2026-05-22T00:16:16.698Z" }, + { url = "https://files.pythonhosted.org/packages/1e/5e/d4e9f1a599fb8e573b7b87160658329fbf28d19eac2718f51fc3def3aa5a/idna-3.18-py3-none-any.whl", hash = "sha256:7f952cbe720b688055e3f87de14f5c3e5fdaa8bc3928985c4077ca689de849a2", size = 65455, upload-time = "2026-06-02T14:34:06.319Z" }, ] [[package]] @@ -565,7 +565,7 @@ wheels = [ ] [[package]] -name = "paddleocr" +name = "paddleocr-dify" version = "0.1.0" source = { virtual = "." } dependencies = [ @@ -1138,29 +1138,35 @@ wheels = [ [[package]] name = "zope-interface" -version = "8.4" +version = "8.5" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9f/65/34a6e6e4dfa260c4c55ee02bb2fc53625e126ff0181485286cf0c9d453d6/zope_interface-8.4.tar.gz", hash = "sha256:9dbee7925a23aa6349738892c911019d4095a96cff487b743482073ecbc174a8", size = 257736, upload-time = "2026-04-25T07:22:10.439Z" } +sdist = { url = "https://files.pythonhosted.org/packages/08/dc/50550cfcbb2ea3cbca5f1d7ed05c8aa840f831a0f2d63aec0a953f7c590e/zope_interface-8.5.tar.gz", hash = "sha256:7a3ba1c5877f0f3e3906b02ddf793abed2becc2948116414ce0e1dd820b68d6d", size = 257957, upload-time = "2026-05-26T06:50:14.574Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/96/0017b980424125cf98a9851d8fd3e24939818b7a82ecdd19ae672bb2413f/zope_interface-8.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84064876ed96ddd0744e3ad5d37134c758d77885e54113567792671405a02bac", size = 211604, upload-time = "2026-04-25T07:28:08.13Z" }, - { url = "https://files.pythonhosted.org/packages/59/4c/2cf5c45477fdd58a2c786d0c0d1817cbaaff8743d98ae72c643c4fe3be7b/zope_interface-8.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:81ed23698bfb588c48b1756129814b890febac971ff6c8a414f82601773145bb", size = 211783, upload-time = "2026-04-25T07:28:10.028Z" }, - { url = "https://files.pythonhosted.org/packages/fa/8c/efabdafc25ed44ef9c1084aad9870bb6c2c9b78e542684efe6865c0f0067/zope_interface-8.4-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:e0b9d7e958657fad414f8272afcdf0b8a873fbbb2bb6a6287232d2f11a232bf8", size = 264752, upload-time = "2026-04-25T07:28:11.773Z" }, - { url = "https://files.pythonhosted.org/packages/53/5a/c4d52c58d5fee4ff67cc02f0dec24d0e84428520f67a52f1e4086f0e7779/zope_interface-8.4-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:eef0a49e041f4dc4d2a6ab894b4fd0c5354e0e8037e731fb953531e59b0d3d33", size = 269829, upload-time = "2026-04-25T07:28:13.988Z" }, - { url = "https://files.pythonhosted.org/packages/16/d2/df8f339c93bb5adee695546ba90d0daa2917338a4792281f6b8e652a9328/zope_interface-8.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b302f955c36e924e1f4fe70dd9105ff06235857861c6ae72c3b10b016aeee99", size = 269452, upload-time = "2026-04-25T07:28:16.403Z" }, - { url = "https://files.pythonhosted.org/packages/17/4b/bd97b1a21bb2c16d66a42f6c7a43c0a5afcfaf14c68d3b7d2ee6afb28e52/zope_interface-8.4-cp312-cp312-win_amd64.whl", hash = "sha256:4ae6a1e111642dbf724f635424dcaf5a5c8abbde49eac3f452f5323ffaa10232", size = 214420, upload-time = "2026-04-25T07:28:18.405Z" }, - { url = "https://files.pythonhosted.org/packages/7d/85/1477f23cf3b0476608ca987b4338f91439abb5b96564ac26b26d2cde38fd/zope_interface-8.4-cp312-cp312-win_arm64.whl", hash = "sha256:2e9e4aa33b76877af903d5532545e64d24ade0f6f80d9d1a31e6efcea76a60bc", size = 212992, upload-time = "2026-04-25T07:28:20.48Z" }, - { url = "https://files.pythonhosted.org/packages/8e/6a/a08c62bc1fa0e34fe7b8b401646cba4817427c716bfbef6cc88937cd327f/zope_interface-8.4-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:cd55965d715413038774aead54851bc3dbdd74a69f3ce30252182a94407b9905", size = 211924, upload-time = "2026-04-25T07:28:22.219Z" }, - { url = "https://files.pythonhosted.org/packages/50/30/2011f17e00ff078658bc317e1f7eccd7843fc1ce60695b665b0a52c45c1b/zope_interface-8.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0d88c1f106a4f06e074a3ada2d20f4a602e3f2871c4f55726ed5d91e94ec19b1", size = 211995, upload-time = "2026-04-25T07:28:24.107Z" }, - { url = "https://files.pythonhosted.org/packages/25/f3/a16fe884571cfa89271412dbb40def6d6865824428d1e14785a82795100c/zope_interface-8.4-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:36c575356732d59ffd3279ad67e302a6fe517e67db5b061b36b377ee0fa016c4", size = 264443, upload-time = "2026-04-25T07:28:26.401Z" }, - { url = "https://files.pythonhosted.org/packages/83/88/e08923fcd8a8c8704af05a90418b07cd897ac90865925b37d7ad8139adfa/zope_interface-8.4-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:29f09ec8bda65f7b30294328070070a2590b90f252f834ee0817cdb0e2c35f6a", size = 269626, upload-time = "2026-04-25T07:28:28.423Z" }, - { url = "https://files.pythonhosted.org/packages/27/67/96c94cd307f9946d0b0f03402a335f7aae7b4f0b129b5734cc56cc78cb65/zope_interface-8.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2bc388cebcb753d21eaf2a0481fd6f0ce6840a47300a40dcec0b56bac27d0f97", size = 269583, upload-time = "2026-04-25T07:28:30.434Z" }, - { url = "https://files.pythonhosted.org/packages/e2/d4/7e9fcc8bb0dba5d023b9fca92035d68c018457cc550e9d51746670b76a6b/zope_interface-8.4-cp313-cp313-win_amd64.whl", hash = "sha256:3e5866917ccb57d929e515a1136d729bd3fa4f367965fb16e38a4bc72cb05521", size = 214422, upload-time = "2026-04-25T07:28:32.201Z" }, - { url = "https://files.pythonhosted.org/packages/16/26/b0bcde302f6a4c155d047a8ab5cba1003363031919d6e8f3bcdc139c28a6/zope_interface-8.4-cp313-cp313-win_arm64.whl", hash = "sha256:f1f854bef8bc137519e4413bcc1322d55faad28b20b3ca39f7bec49d2f1b26df", size = 213029, upload-time = "2026-04-25T07:28:34.677Z" }, - { url = "https://files.pythonhosted.org/packages/f6/d5/ca60c8b404b303d9490e1417430a5198a77557dbeb17c1cb31616e432318/zope_interface-8.4-cp314-cp314-macosx_10_9_x86_64.whl", hash = "sha256:7cbb887fdbfaacb4c362dbb487033551646e28013ad5ffe72e96eb260003a1a1", size = 212012, upload-time = "2026-04-25T07:28:36.88Z" }, - { url = "https://files.pythonhosted.org/packages/83/64/6bb9f54250c817e24b39e986f173b6cd21ff658bec6c6cc0baad05d761e4/zope_interface-8.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a5638c6be715116d3453e6d099c299c6844d54810de7445ce116424e905ede06", size = 212071, upload-time = "2026-04-25T07:28:38.742Z" }, - { url = "https://files.pythonhosted.org/packages/c6/cf/42851262e102723058019dc7d0b48210b85a935f79ae32ce60ddccc2e8fb/zope_interface-8.4-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:b8147b40bfcd53803870a9519e0879ff066aeecc2fcff8295663c1b17fc38dc2", size = 266075, upload-time = "2026-04-25T07:28:41.084Z" }, - { url = "https://files.pythonhosted.org/packages/d2/a7/e48c79b836f6f0a2c219288e2ec343517f90e95c93de5435a8a23918bf20/zope_interface-8.4-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:049ba3c7b38cc400ae08e011617635706e0f442e1d075db1b015246fcbf6091e", size = 269127, upload-time = "2026-04-25T07:28:42.868Z" }, - { url = "https://files.pythonhosted.org/packages/6a/40/0e26f24d3a2f34f0de2cfeaab6458a865284d9d1fa317ab78913aa1f7322/zope_interface-8.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9c4ac009c2c8e43283842f80387c4d4b41bcbc293391c3b9ab71532ae1ccc301", size = 269446, upload-time = "2026-04-25T07:28:44.97Z" }, - { url = "https://files.pythonhosted.org/packages/91/d5/20310601450367fc35fa28b0544c98d0347b8cc25eaf106a2c4cc36841e1/zope_interface-8.4-cp314-cp314-win_amd64.whl", hash = "sha256:4713bf651ec36e7eea49d2ace4f0e89bec2b33a339674874b1121f2537edc62a", size = 215199, upload-time = "2026-04-25T07:28:47.146Z" }, - { url = "https://files.pythonhosted.org/packages/5b/00/0d22ce75126e31f81baa5889e2a40aad37c8e34d1220cf8b18d744f2b5d9/zope_interface-8.4-cp314-cp314-win_arm64.whl", hash = "sha256:d934497c4b72d5f528d2b5ebe9b8b5a7004b5877948ebd4ea00c2432fb27178f", size = 213178, upload-time = "2026-04-25T07:28:48.868Z" }, + { url = "https://files.pythonhosted.org/packages/97/cc/b84123a948f3162a34623e188922827cd845244fdd043ed20f8d02228caa/zope_interface-8.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8e6ee90c2e6de7c37058d5fa41f123c8b13a312db8d1e0fb5840d7f4bcdff9c9", size = 212165, upload-time = "2026-05-26T06:49:26.566Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/cbceec44f1b27208a76c1a688c131302685852406a23df5aab68324109cc/zope_interface-8.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c1adc90d3576b3b4c4de4953e6002c37bef28b78d7fa54c1bbfd0c50f022fe7c", size = 212341, upload-time = "2026-05-26T06:49:28.182Z" }, + { url = "https://files.pythonhosted.org/packages/e1/c3/005032195ff3b210c139b7c560ed5c534e844b0907d8e44d2b3d8919305e/zope_interface-8.5-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:e6347b8d8d12c5eca6502450a92be30079b7acfade2c4f693efa0deb8871b06e", size = 265296, upload-time = "2026-05-26T06:49:29.741Z" }, + { url = "https://files.pythonhosted.org/packages/c5/66/1036543d6a66bc04c19df3cf650f3ad938a002ab0a443c24e23e8de5e8b9/zope_interface-8.5-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5e970dabea777a24b0b0bbf9dae3ab75ce8b2d8e948edf4875627034b21f3560", size = 270689, upload-time = "2026-05-26T06:49:31.767Z" }, + { url = "https://files.pythonhosted.org/packages/30/4c/8b56259558cace4414e753ca6740396a1f59d4a95ddb55b4658600408670/zope_interface-8.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f0b48ccadaa9839e09ff81e969703cecb3f402c813bfe8b958652e699bea69f5", size = 270280, upload-time = "2026-05-26T06:49:33.489Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ea/649908c83aa8fdb7faf2ddca4d3cf6fb8f2157121267dc56e8f72681e26c/zope_interface-8.5-cp312-cp312-win_amd64.whl", hash = "sha256:e0e311f1277468c08fd59a2b41f71b43d25dff639789d364747acd1705c0df6e", size = 215019, upload-time = "2026-05-26T06:49:35.607Z" }, + { url = "https://files.pythonhosted.org/packages/9f/97/da13037b4c563e4df32eedbc819f8c00b754af494f68211e3dffd48d52da/zope_interface-8.5-cp312-cp312-win_arm64.whl", hash = "sha256:652b73107a04159ec6c020db6c1543d4f1e8f4d069bd2aac88a947820923517b", size = 213569, upload-time = "2026-05-26T06:49:37.317Z" }, + { url = "https://files.pythonhosted.org/packages/f4/8c/4c15755d701f2ec0e80d64a18e1ebaf5be2c584c0ec153fd516f5d13eada/zope_interface-8.5-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:28e80457c134d1fa57a7d758004dece348654e1b1467ac22dcdc20fc1d127c52", size = 212512, upload-time = "2026-05-26T06:49:38.996Z" }, + { url = "https://files.pythonhosted.org/packages/9a/2e/4360c54c465db042cc8fbeeec92abac28b4cedbf6ba63c1f092fd08a190f/zope_interface-8.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:09495ce9d559c06b70f2d4855b3e4f48a822a9ddc8be1d30c5b4e5be14ae1ace", size = 212541, upload-time = "2026-05-26T06:49:41.186Z" }, + { url = "https://files.pythonhosted.org/packages/aa/a5/692a2b8d70f78e848793231d5fae5fecbf8d0cccd73430fdc34802a6d3c1/zope_interface-8.5-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:7849ad8fa90763cc1087f4dda78ca3a233e950b3e08fac7079297c9cafbbd7bb", size = 265191, upload-time = "2026-05-26T06:49:43.449Z" }, + { url = "https://files.pythonhosted.org/packages/70/8d/454a9cfc7a050c394ab4f11b3371f7897828b7415e096afff724637e65e0/zope_interface-8.5-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5578c9421ca409a1f39f153d6f7803e4cde01da592ec75a9ac5e1b777d18d33b", size = 270626, upload-time = "2026-05-26T06:49:45.425Z" }, + { url = "https://files.pythonhosted.org/packages/51/8c/db8409cfa3575b8e9b4800babd7d49f8228433cd1f0c56814bd0ada49c33/zope_interface-8.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e1bd7d96b4ca5fa311f54c9eac16dce4886b428c1531dbe06067763ccdf123b4", size = 270444, upload-time = "2026-05-26T06:49:47.025Z" }, + { url = "https://files.pythonhosted.org/packages/4a/df/a386940e41469ef615e100a216d8b386521e9e598817147f87932ca203c4/zope_interface-8.5-cp313-cp313-win_amd64.whl", hash = "sha256:0c8123d2a4dfde2a613c7cb772605477724782c20bc2e0ad1d9435376a6a44a3", size = 215021, upload-time = "2026-05-26T06:49:48.478Z" }, + { url = "https://files.pythonhosted.org/packages/89/75/477eb5669b6b2a7a843decd1a075e9b1971a8720017654143a7183abd3d9/zope_interface-8.5-cp313-cp313-win_arm64.whl", hash = "sha256:6d02be14f3173c6c7288bc2fdf530090c01c3cf8764ad46c68024686f364278e", size = 213610, upload-time = "2026-05-26T06:49:50.01Z" }, + { url = "https://files.pythonhosted.org/packages/d4/19/5032e954827fdf02db2d2f49737ac4378bb9cfc2cd95a8f2e2a5ae2ec01a/zope_interface-8.5-cp314-cp314-macosx_10_9_x86_64.whl", hash = "sha256:ffaecf013251a89d0de6feb49a46eba48ad8cbbf8a40aeb6045e459e7bec6784", size = 212597, upload-time = "2026-05-26T06:49:51.63Z" }, + { url = "https://files.pythonhosted.org/packages/f1/53/3ef644012cf8a6a234a2d6134aab5a5c65ac5467c86296865501d4fbc406/zope_interface-8.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:126fa9d1c52295ae076d4cf968634f0a1826afa408a20808b57ff72877b8f69f", size = 212626, upload-time = "2026-05-26T06:49:53.236Z" }, + { url = "https://files.pythonhosted.org/packages/32/67/bc8b4f465d388039255003e230c284a175cedf1203c692f23cb7bff64efe/zope_interface-8.5-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:3090e3a663d20194756a59a272e0c8508b889341e31d5894223331fe6b4f9b21", size = 266827, upload-time = "2026-05-26T06:49:54.873Z" }, + { url = "https://files.pythonhosted.org/packages/a7/eb/37d05b935ede53d79690fecc8d201440084418e590bcfc05f384451c7593/zope_interface-8.5-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9342fb74e2afefdb081bf1df727d209ea56995c6e13f5a0540e6d7aff4beafb8", size = 270139, upload-time = "2026-05-26T06:49:57.116Z" }, + { url = "https://files.pythonhosted.org/packages/8b/0b/fd0c54579e2ce8dc6cf1a757903f3374bc6fbda929a46af9e0f53cb0e5f0/zope_interface-8.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6c54725d818f1b57a7efb8b16528326e1f3c257b602b32393fd255c45af8799d", size = 270338, upload-time = "2026-05-26T06:49:58.698Z" }, + { url = "https://files.pythonhosted.org/packages/c1/1d/c420dcd777bb761067ea92879ac766694a5ca78608185f1aecea64cbfc11/zope_interface-8.5-cp314-cp314-win_amd64.whl", hash = "sha256:29d74febbae1afeb6834c4ccbf42e242a673c860060f09e53142825270456140", size = 215789, upload-time = "2026-05-26T06:50:00.405Z" }, + { url = "https://files.pythonhosted.org/packages/62/94/50b5eb8f94e527edceac14f9955e58917424ea79bb572ddc18548561cbc2/zope_interface-8.5-cp314-cp314-win_arm64.whl", hash = "sha256:633c8c49396f38df030340797c533e9fe460d1b5d1e42d88e55e938e525f548c", size = 213757, upload-time = "2026-05-26T06:50:01.973Z" }, + { url = "https://files.pythonhosted.org/packages/17/6f/5d5f32c4dfcdb16ce2ec5363da686840f13c13e1a1214cb70b49e1cd6d9f/zope_interface-8.5-cp314-cp314t-macosx_10_9_x86_64.whl", hash = "sha256:133999820fdbae513c36c03d6f29ef87317aaa3edef39112222b155083664714", size = 213591, upload-time = "2026-05-26T06:50:03.529Z" }, + { url = "https://files.pythonhosted.org/packages/f3/55/de0c3459ff717fce3342f9a29464c281fdeb0d36c3171ee88d119d5f0650/zope_interface-8.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8bd75c96966e573232f0599deaff717564828031c7f05563ccc1ac35c5ee0304", size = 213733, upload-time = "2026-05-26T06:50:05.101Z" }, + { url = "https://files.pythonhosted.org/packages/c2/95/d97430abd5ae9677e8b9295b58720c0064a5b557dbb6b8bf5928484cf0d8/zope_interface-8.5-cp314-cp314t-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:14b0e9799351d4c34fe99afd67f0cdd76e55ba15c66a98699d5fc22ea8241e08", size = 294905, upload-time = "2026-05-26T06:50:07.384Z" }, + { url = "https://files.pythonhosted.org/packages/41/ec/a0f8f3dad6e74992f4654bdd94802be0929eabca7b871cac3b6fbb5e961b/zope_interface-8.5-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0cd6a732ac84b94eb1ef9222a117347a27efd294ee16810ffdf7ecd307677ed5", size = 300885, upload-time = "2026-05-26T06:50:08.997Z" }, + { url = "https://files.pythonhosted.org/packages/0f/da/6881b48803a0ee8d23eb5efa30fce3ed218a2bd9de5758ce489d224fee81/zope_interface-8.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:798b7c87d0e59a7d5d086d642208d0d8700ff0d55c4029134b3c479c3bfb110f", size = 304672, upload-time = "2026-05-26T06:50:10.563Z" }, + { url = "https://files.pythonhosted.org/packages/2e/0e/b4c01320859ff1d585438bc231fd60bd258d096359bccf6654fecdf0cffb/zope_interface-8.5-cp314-cp314t-win_amd64.whl", hash = "sha256:0fc3a9d45f114d27eaa1e53beeb144533689edca8a9f66505b1e8e8b3f075e42", size = 217241, upload-time = "2026-05-26T06:50:12.171Z" }, ]