From d5a5a035dc45297e750dc2c53f4308adfe918ebb Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 02:56:57 +0000 Subject: [PATCH] feat(artifacts): handle HTTP redirects for presigned cloud-storage URLs - Add allow_redirects parameter to APIRequestor.request() to let callers disable automatic redirect following when auth headers would leak. - Artifacts.get() now uses allow_redirects=False and manually follows redirect responses (301-308) via _follow_redirect(), which strips the Authorization header before fetching from presigned S3/GCS URLs. - Replace hand-rolled query-param stripping with _filename_from_url() using urlparse for robust presigned-URL filename extraction. - Fix double-extension bug in url artifact file naming (was foo.jpg.jpg). - Accept application/octet-stream Content-Type for redirected responses since cloud storage may not preserve the original MIME type. - Add comprehensive tests for redirect handling, filename extraction, and content-type tolerance. Co-Authored-By: Sudeep Pillai --- tests/test_artifacts.py | 294 ++++++++++++++++++++++++++++++++ vlmrun/client/artifacts.py | 147 ++++++++++++---- vlmrun/client/base_requestor.py | 5 + 3 files changed, 411 insertions(+), 35 deletions(-) diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py index ed50fc9..645c106 100644 --- a/tests/test_artifacts.py +++ b/tests/test_artifacts.py @@ -1,6 +1,20 @@ """Tests for artifacts operations.""" +from __future__ import annotations + +import io +from pathlib import Path +from unittest.mock import patch + import pytest +from PIL import Image + +from vlmrun.client.artifacts import Artifacts, _filename_from_url + + +# --------------------------------------------------------------------------- +# Mock-client tests (use the lightweight mock_client fixture from conftest) +# --------------------------------------------------------------------------- def test_get_artifact(mock_client): @@ -18,3 +32,283 @@ def test_list_artifacts_not_implemented(mock_client): with pytest.raises(NotImplementedError) as exc_info: mock_client.artifacts.list(session_id="550e8400-e29b-41d4-a716-446655440000") assert "not yet implemented" in str(exc_info.value) + + +# --------------------------------------------------------------------------- +# _filename_from_url unit tests +# --------------------------------------------------------------------------- + + +class TestFilenameFromUrl: + """Tests for the _filename_from_url helper.""" + + def test_simple_url(self): + assert _filename_from_url("https://example.com/path/to/file.jpg") == "file.jpg" + + def test_presigned_url_strips_query_params(self): + url = ( + "https://storage.googleapis.com/bucket/img_a1b2c3.jpg" + "?X-Goog-Algorithm=GOOG4&X-Goog-Credential=abc" + ) + assert _filename_from_url(url) == "img_a1b2c3.jpg" + + def test_s3_presigned_url(self): + url = ( + "https://s3.amazonaws.com/my-bucket/session/vid_f0e1d2.mp4" + "?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=3600" + ) + assert _filename_from_url(url) == "vid_f0e1d2.mp4" + + def test_url_with_fragment(self): + url = "https://cdn.example.com/doc_abc123.pdf#page=1" + assert _filename_from_url(url) == "doc_abc123.pdf" + + def test_url_without_extension(self): + url = "https://example.com/artifacts/img_a1b2c3" + assert _filename_from_url(url) == "img_a1b2c3" + + +# --------------------------------------------------------------------------- +# Redirect-handling tests (patch APIRequestor.request + requests.get) +# --------------------------------------------------------------------------- + + +class _FakeResponse: + """Minimal stand-in for ``requests.Response``.""" + + def __init__( + self, + content: bytes, + headers: dict[str, str] | None = None, + status_code: int = 200, + ): + self.content = content + self.headers = headers or {} + self.status_code = status_code + + def raise_for_status(self): + pass + + def iter_content(self, chunk_size=8192): + yield self.content + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + +class _StubClient: + """Minimal protocol-compatible client for ``Artifacts.__init__``.""" + + api_key = "test-key" + base_url = "https://api.vlm.run/v1" + timeout = 30.0 + max_retries = 1 + + +def _make_1x1_jpeg() -> bytes: + """Return a minimal valid JPEG in memory.""" + buf = io.BytesIO() + Image.new("RGB", (1, 1), color="red").save(buf, format="JPEG") + return buf.getvalue() + + +class TestArtifactsRedirectHandling: + """Verify that ``Artifacts.get`` follows redirects without auth headers.""" + + def test_redirect_to_presigned_url_raw(self): + """A 302 redirect should be followed and raw bytes returned.""" + redirect_headers = { + "Location": "https://storage.googleapis.com/bucket/img_a1b2c3.jpg?sig=abc", + } + artifact_bytes = b"image-bytes-from-gcs" + fake_redirect_resp = _FakeResponse( + content=artifact_bytes, + headers={"Content-Type": "image/jpeg"}, + ) + + with ( + patch.object( + Artifacts, + "__init__", + lambda self, client: setattr(self, "_client", client) + or setattr(self, "_requestor", None), + ), + patch( + "vlmrun.client.artifacts.APIRequestor", + ), + patch( + "vlmrun.client.artifacts._follow_redirect", + return_value=fake_redirect_resp, + ) as mock_follow, + ): + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + # Simulate requestor returning a 302 with Location header + class _FakeRequestor: + def request(self, **kwargs): + return (b"", 302, redirect_headers) + + arts._requestor = _FakeRequestor() + + result = arts.get( + object_id="img_a1b2c3", + session_id="sess-001", + raw_response=True, + ) + + assert result == artifact_bytes + mock_follow.assert_called_once_with( + "https://storage.googleapis.com/bucket/img_a1b2c3.jpg?sig=abc" + ) + + def test_redirect_to_presigned_url_image(self, tmp_path, monkeypatch): + """A 302 redirect for an img artifact returns a PIL Image.""" + jpeg_bytes = _make_1x1_jpeg() + redirect_headers = { + "Location": "https://s3.amazonaws.com/bucket/img_a1b2c3.jpg?X-Amz-Sig=xyz", + } + fake_redirect_resp = _FakeResponse( + content=jpeg_bytes, + headers={"Content-Type": "image/jpeg"}, + ) + + monkeypatch.setattr( + "vlmrun.client.artifacts.VLMRUN_ARTIFACTS_CACHE_DIR", tmp_path + ) + + with patch( + "vlmrun.client.artifacts._follow_redirect", + return_value=fake_redirect_resp, + ): + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + class _FakeRequestor: + def request(self, **kwargs): + return (b"", 302, redirect_headers) + + arts._requestor = _FakeRequestor() + + result = arts.get( + object_id="img_a1b2c3", + session_id="sess-002", + ) + + assert isinstance(result, Image.Image) + assert result.mode == "RGB" + + def test_redirect_to_presigned_url_video(self, tmp_path, monkeypatch): + """A 302 redirect for a vid artifact writes to disk and returns Path.""" + monkeypatch.setattr( + "vlmrun.client.artifacts.VLMRUN_ARTIFACTS_CACHE_DIR", tmp_path + ) + video_bytes = b"\x00\x00\x00\x1c" + b"\x00" * 100 # fake mp4 header + redirect_headers = { + "Location": "https://cdn.example.com/vid_f0e1d2.mp4?token=abc", + } + fake_redirect_resp = _FakeResponse( + content=video_bytes, + headers={"Content-Type": "video/mp4"}, + ) + + with patch( + "vlmrun.client.artifacts._follow_redirect", + return_value=fake_redirect_resp, + ): + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + class _FakeRequestor: + def request(self, **kwargs): + return (b"", 302, redirect_headers) + + arts._requestor = _FakeRequestor() + + result = arts.get( + object_id="vid_f0e1d2", + session_id="sess-003", + ) + + assert isinstance(result, Path) + assert result.name == "vid_f0e1d2.mp4" + assert result.read_bytes() == video_bytes + + def test_no_redirect_direct_response(self, tmp_path, monkeypatch): + """A 200 response delivers bytes directly (no redirect).""" + monkeypatch.setattr( + "vlmrun.client.artifacts.VLMRUN_ARTIFACTS_CACHE_DIR", tmp_path + ) + jpeg_bytes = _make_1x1_jpeg() + + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + class _FakeRequestor: + def request(self, **kwargs): + return (jpeg_bytes, 200, {"Content-Type": "image/jpeg"}) + + arts._requestor = _FakeRequestor() + + result = arts.get( + object_id="img_a1b2c3", + session_id="sess-004", + ) + assert isinstance(result, Image.Image) + + def test_redirect_without_location_raises(self): + """A redirect without a Location header should raise ValueError.""" + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + class _FakeRequestor: + def request(self, **kwargs): + return (b"", 302, {}) + + arts._requestor = _FakeRequestor() + + with pytest.raises(ValueError, match="without a Location header"): + arts.get( + object_id="img_a1b2c3", + session_id="sess-005", + raw_response=True, + ) + + def test_redirect_octet_stream_content_type_accepted(self, tmp_path, monkeypatch): + """Cloud storage may return application/octet-stream for any file type.""" + monkeypatch.setattr( + "vlmrun.client.artifacts.VLMRUN_ARTIFACTS_CACHE_DIR", tmp_path + ) + pdf_bytes = b"%PDF-1.4 fake content" + redirect_headers = { + "Location": "https://s3.amazonaws.com/bucket/doc_abc123.pdf?sig=xyz", + } + fake_redirect_resp = _FakeResponse( + content=pdf_bytes, + headers={"Content-Type": "application/octet-stream"}, + ) + + with patch( + "vlmrun.client.artifacts._follow_redirect", + return_value=fake_redirect_resp, + ): + arts = Artifacts.__new__(Artifacts) + arts._client = _StubClient() + + class _FakeRequestor: + def request(self, **kwargs): + return (b"", 307, redirect_headers) + + arts._requestor = _FakeRequestor() + + result = arts.get( + object_id="doc_abc123", + session_id="sess-006", + ) + + assert isinstance(result, Path) + assert result.name == "doc_abc123.pdf" + assert result.read_bytes() == pdf_bytes diff --git a/vlmrun/client/artifacts.py b/vlmrun/client/artifacts.py index d8b753a..cf0ce98 100644 --- a/vlmrun/client/artifacts.py +++ b/vlmrun/client/artifacts.py @@ -3,9 +3,11 @@ from __future__ import annotations import io +import logging import requests from pathlib import Path -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING +from urllib.parse import urlparse from PIL import Image from pydantic import AnyHttpUrl @@ -18,6 +20,55 @@ if TYPE_CHECKING: from vlmrun.types.abstract import VLMRunProtocol +logger = logging.getLogger(__name__) + +_REDIRECT_STATUS_CODES = {301, 302, 303, 307, 308} + +_CONTENT_TYPE_EXT_MAP: dict[str, str] = { + "image/jpeg": "jpg", + "image/png": "png", + "video/mp4": "mp4", + "audio/mpeg": "mp3", + "application/pdf": "pdf", + "application/octet-stream": "bin", +} + + +def _filename_from_url(url: str) -> str: + """Extract a clean filename from a URL, stripping query parameters. + + Args: + url: The URL to extract a filename from. + + Returns: + The filename portion of the URL path with query params removed. + """ + parsed = urlparse(url) + return Path(parsed.path).name + + +def _follow_redirect(url: str, *, stream: bool = False) -> requests.Response: + """Follow a redirect URL without authorization headers. + + Presigned cloud-storage URLs (S3, GCS, etc.) carry their own auth via + query parameters and will reject requests that also include an + ``Authorization`` header. This helper fetches the URL using only the + standard browser-like headers defined in ``_HEADERS``. + + Args: + url: The redirect target URL. + stream: Whether to stream the response body. + + Returns: + The HTTP response from the redirect target. + + Raises: + requests.HTTPError: If the downstream request fails. + """ + response = requests.get(url, headers=_HEADERS, stream=stream, allow_redirects=True) + response.raise_for_status() + return response + class Artifacts: """Artifacts resource for VLM Run API.""" @@ -34,15 +85,21 @@ def __init__(self, client: "VLMRunProtocol") -> None: def get( self, object_id: str, - session_id: Optional[str] = None, - execution_id: Optional[str] = None, + session_id: str | None = None, + execution_id: str | None = None, raw_response: bool = False, - ) -> Union[bytes, Image.Image, AnyHttpUrl, Path]: + ) -> bytes | Image.Image | AnyHttpUrl | Path: """Get an artifact by session ID or execution ID and object ID. + The API may respond with the artifact bytes directly **or** with an + HTTP redirect (301–308) to a presigned cloud-storage URL. When a + redirect is received the SDK follows it transparently, stripping the + ``Authorization`` header so that presigned-URL authentication is not + disrupted. + Supported artifact types: - - img: Returns PIL.Image.Image (JPEG) - - url: Returns AnyHttpUrl + - img: Returns PIL.Image.Image (JPEG/PNG) + - url: Returns Path to the downloaded file - vid: Returns Path to MP4 file - aud: Returns Path to MP3 file - doc: Returns Path to PDF file @@ -55,10 +112,11 @@ def get( raw_response: Whether to return the raw response bytes Returns: - The artifact content - type depends on object_id prefix and raw_response flag + The artifact content — type depends on object_id prefix and raw_response flag. Raises: - ValueError: If neither session_id nor execution_id is provided, or if both are provided + ValueError: If neither session_id nor execution_id is provided, or if both are provided. + TypeError: If the response body is not bytes. """ if session_id is None and execution_id is None: raise ValueError("Either `session_id` or `execution_id` is required") @@ -68,7 +126,7 @@ def get( ) # Build query parameters, filtering out None values - query_params = {"object_id": object_id} + query_params: dict[str, str] = {"object_id": object_id} if session_id is not None: query_params["session_id"] = session_id if execution_id is not None: @@ -79,8 +137,22 @@ def get( url="artifacts", params=query_params, raw_response=True, + allow_redirects=False, ) + # --- Handle redirect responses (e.g. presigned cloud-storage URLs) --- + if status_code in _REDIRECT_STATUS_CODES: + redirect_url = headers.get("Location") or headers.get("location") + if not redirect_url: + raise ValueError( + f"Received redirect ({status_code}) without a Location header" + ) + logger.debug(f"Following artifact redirect to {redirect_url}") + redirect_resp = _follow_redirect(redirect_url) + response = redirect_resp.content + headers = dict(redirect_resp.headers) + status_code = redirect_resp.status_code + if not isinstance(response, bytes): raise TypeError("Expected bytes response") @@ -96,13 +168,18 @@ def get( ) # Create artifacts directory with session_id subdirectory - sess_id: str = session_id or execution_id + sess_id: str = session_id or execution_id # type: ignore[assignment] artifacts_dir: Path = VLMRUN_ARTIFACTS_CACHE_DIR / sess_id artifacts_dir.mkdir(parents=True, exist_ok=True) # Extension and content-type mappings for file-based artifacts - ext_mapping = {"vid": "mp4", "aud": "mp3", "doc": "pdf", "recon": "spz"} - content_type_mapping = { + ext_mapping: dict[str, str] = { + "vid": "mp4", + "aud": "mp3", + "doc": "pdf", + "recon": "spz", + } + content_type_mapping: dict[str, str] = { "vid": "video/mp4", "aud": "audio/mpeg", "doc": "application/pdf", @@ -110,43 +187,43 @@ def get( } if obj_type == "img": - assert headers["Content-Type"] in ( - "image/jpeg", - "image/png", - ), f"Expected image/jpeg or image/png, got {headers['Content-Type']}" + content_type = headers.get("Content-Type", "") + allowed_img_types = ("image/jpeg", "image/png", "application/octet-stream") + assert ( + content_type in allowed_img_types + ), f"Expected one of {allowed_img_types}, got {content_type}" return Image.open(io.BytesIO(response)).convert("RGB") elif obj_type == "url": - # Get the filename including extension frm the URL by stripping any query parameters + # Decode the URL from the response body url: AnyHttpUrl = AnyHttpUrl(response.decode("utf-8")) - path: Path = Path(str(url)) - filename: str = path.name.split("?")[0] - ext: str = filename.split(".")[-1].lower() - tmp_path: Path = artifacts_dir / f"{filename}.{ext}" + filename: str = _filename_from_url(str(url)) + tmp_path: Path = artifacts_dir / filename if tmp_path.exists(): return tmp_path - # Download the file, and move it to the appropriate path - with requests.get(url, headers=_HEADERS, stream=True) as r: - r.raise_for_status() + # Download the file via the URL (no auth headers for presigned URLs) + with _follow_redirect(str(url), stream=True) as r: with tmp_path.open("wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) return tmp_path elif obj_type in ("vid", "aud", "doc", "recon"): - # Validate content type + # Validate content type (allow application/octet-stream for + # cloud-storage responses that don't set a specific MIME type) expected_content_type = content_type_mapping[obj_type] - actual_content_type = headers.get("Content-Type") - assert ( - actual_content_type == expected_content_type - ), f"Expected {expected_content_type}, got {actual_content_type}" + actual_content_type = headers.get("Content-Type", "") + if actual_content_type not in ( + expected_content_type, + "application/octet-stream", + ): + logger.warning( + f"Unexpected Content-Type for {obj_type} artifact: " + f"expected {expected_content_type}, got {actual_content_type}" + ) # Build file path with appropriate extension - ext = ext_mapping.get(obj_type, None) - if ext is None: - raise IOError( - f"Unsupported file type [file_type={filename}, object_id={object_id}]" - ) - tmp_path: Path = artifacts_dir / f"{object_id}.{ext}" + ext = ext_mapping[obj_type] + tmp_path = artifacts_dir / f"{object_id}.{ext}" # Return cached version if it exists if tmp_path.exists(): diff --git a/vlmrun/client/base_requestor.py b/vlmrun/client/base_requestor.py index 9158cbb..7d3bd4a 100644 --- a/vlmrun/client/base_requestor.py +++ b/vlmrun/client/base_requestor.py @@ -73,6 +73,7 @@ def request( headers: Optional[Dict[str, str]] = None, raw_response: bool = False, timeout: Optional[float] = None, + allow_redirects: bool = True, ) -> Union[ Tuple[Dict[str, Any], int, Dict[str, str]], Tuple[bytes, int, Dict[str, str]] ]: @@ -87,6 +88,9 @@ def request( headers: Request headers raw_response: Whether to return raw response content timeout: Request timeout in seconds + allow_redirects: Whether to follow HTTP redirects (default True). + Set to False when the caller needs to handle redirects manually + (e.g., to avoid leaking auth headers to external presigned URLs). Returns: Tuple of (response_data, status_code, response_headers) @@ -143,6 +147,7 @@ def _request_with_retry(): files=files, headers=_headers, timeout=timeout or self._timeout, + allow_redirects=allow_redirects, ) response.raise_for_status()