From 28172fcd38c01894edb6631324799b7043755e15 Mon Sep 17 00:00:00 2001 From: behnazh-w Date: Tue, 31 Mar 2026 12:48:12 +1000 Subject: [PATCH] fix: improve URL validation to avoid unexpected redirects Signed-off-by: behnazh-w --- .../slsa_analyzer/provenance/loader.py | 12 +--- src/macaron/util.py | 62 ++++++++++++++++++- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/src/macaron/slsa_analyzer/provenance/loader.py b/src/macaron/slsa_analyzer/provenance/loader.py index 3e9d9b1b0..0b7b1352b 100644 --- a/src/macaron/slsa_analyzer/provenance/loader.py +++ b/src/macaron/slsa_analyzer/provenance/loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the loaders for SLSA provenances.""" @@ -9,7 +9,6 @@ import json import logging import zlib -from urllib.parse import urlparse from cryptography import x509 from cryptography.x509 import DuplicateExtension, UnsupportedGeneralNameType @@ -19,7 +18,7 @@ from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, validate_intoto_payload from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError, ValidateInTotoPayloadError from macaron.slsa_analyzer.specs.pypi_certificate_predicate import PyPICertificatePredicate -from macaron.util import send_get_http_raw +from macaron.util import send_get_http_raw, url_is_safe logger: logging.Logger = logging.getLogger(__name__) @@ -43,13 +42,8 @@ def _try_read_url_link_file(file_content: bytes) -> str | None: def _download_url_file_content(url: str, url_link_hostname_allowlist: list[str]) -> bytes: - hostname = urlparse(url).hostname - if hostname is None or hostname == "": + if not url_is_safe(url, allow_list=url_link_hostname_allowlist): raise LoadIntotoAttestationError("Cannot resolve URL link file: invalid URL") - if hostname not in url_link_hostname_allowlist: - raise LoadIntotoAttestationError( - "Cannot resolve URL link file: target hostname '" + hostname + "' is not in allowed hostnames." - ) # TODO download size limit? timeout = defaults.getint("downloads", "timeout", fallback=120) diff --git a/src/macaron/util.py b/src/macaron/util.py index 6509e2f67..b6f789493 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2026, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module includes utilities functions for Macaron.""" @@ -21,6 +21,66 @@ logger: logging.Logger = logging.getLogger(__name__) +def url_is_safe(url: str, allow_list: list[str] | None = None, allow_login: bool = False) -> bool: + r"""Validate that a URL has an acceptable host and login component. + + Parameters + ---------- + url : str + URL string to validate. + allow_list : list[str] | None + Allowed hostnames. When provided, the parsed hostname must be in this list. + If ``None``, any non-empty hostname is accepted. + allow_login : bool, default=False + Whether username/password URL components are permitted. + + Returns + ------- + bool + ``True`` when the URL passes safety checks, otherwise ``False``. + + Examples + -------- + >>> url_is_safe("https://example.com") + True + >>> url_is_safe("https://example.com", allow_list=["example.com"]) + True + >>> url_is_safe("https://example.com", allow_list=["oracle.com"]) + False + >>> url_is_safe("https://user:test@example.com") + False + >>> url_is_safe("https://user:test@example.com", allow_login=True) + True + >>> url_is_safe("not-a-url") + False + >>> url_is_safe("127.0.0.1:6666\\@allowlist.com", ["allowlist.com"]) + False + >>> url_is_safe("https://attacker.com:6666\\@allowlist.com", ["allowlist.com"]) + False + >>> url_is_safe("https://username:attacker.com\\@allowlist.com", ["allowlist.com"]) + False + >>> url_is_safe("https://username:test@allowlist.com", ["allowlist.com"], allow_login = True) + True + """ + try: + parsed_url = urllib.parse.urlparse(url) + except ValueError: + return False + if not allow_login: + if parsed_url.username or parsed_url.password: + logger.debug("Potential attempt to redirect to an invalid URL: hostname %s", parsed_url.hostname) + return False + + hostname = parsed_url.hostname + if hostname is None or hostname == "": + return False + if allow_list and (hostname not in allow_list): + logger.debug("URL %s is not in allowed hostnames.", url) + return False + + return True + + def send_get_http(url: str, headers: dict) -> dict: """Send the GET HTTP request with the given url and headers.