From dcd07d82e6349c2c87cbe9eab6c66ecc36697973 Mon Sep 17 00:00:00 2001 From: telday Date: Fri, 18 Jul 2025 14:10:24 -0600 Subject: [PATCH 1/2] Add development requirements --- requirements-dev.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..35aaeb5 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +pytest +pyyaml From bdd1416b56c831f407caa495e08b5f84fd6f3700 Mon Sep 17 00:00:00 2001 From: telday Date: Fri, 18 Jul 2025 14:10:43 -0600 Subject: [PATCH 2/2] Update URL data section regex & add regex tests --- .github/workflows/test.yml | 3 +- data_url/__init__.py | 36 +++++++- requirements-dev.txt | 1 + test/data_url_test_cases.yaml | 165 ++++++++++++++++++++++++++++++++++ test/test_regex.py | 21 +++++ 5 files changed, 222 insertions(+), 4 deletions(-) create mode 100644 test/data_url_test_cases.yaml create mode 100644 test/test_regex.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d0ab7aa..5d8f3a6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,8 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -r requirements-dev.txt - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/data_url/__init__.py b/data_url/__init__.py index 2e0082f..9e21cf3 100644 --- a/data_url/__init__.py +++ b/data_url/__init__.py @@ -8,7 +8,7 @@ (?P[a-z][a-z0-9\-]+/[a-z][\w\-\.\+]+)? 
# Compiled once at import time; both patterns are applied with ``fullmatch`` so
# that nothing (not even a trailing newline) may follow the allowed characters.

# Base64 alphabet, optionally followed by at most two "=" padding characters.
_BASE64_DATA_RE = re.compile(r"[A-Za-z0-9+/]*={0,2}")

# RFC 2396 "unreserved" characters, or a percent-encoded octet (%XX).
_URL_DATA_RE = re.compile(r"(?:[A-Za-z0-9\-_.!~*'()]|%[0-9A-Fa-f]{2})*")


def _validate_data_section(data, is_base64=False):
    """
    Validate that the data section contains only allowed characters.

    For base64 data the section must consist of characters from the base64
    alphabet (``A-Z a-z 0-9 + /``) with ``=`` permitted only as trailing
    padding (at most two). This is a character-level check only: it does not
    verify that the length is a multiple of four, so decoding can still fail.

    For non-base64 data every character must be either an unreserved
    character (``A-Z a-z 0-9 - _ . ! ~ * ' ( )``) or part of a
    percent-encoded sequence (``%`` followed by two hex digits).

    An empty string is valid in both modes.

    Args:
        data (str): The data section to validate
        is_base64 (bool): Whether this is base64 encoded data

    Returns:
        bool: True if valid, False otherwise
    """
    pattern = _BASE64_DATA_RE if is_base64 else _URL_DATA_RE
    # fullmatch (rather than match with ^...$) because "$" also matches just
    # before a trailing newline, which would let "abc\n" through.
    return pattern.fullmatch(data) is not None
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==" + valid: true + description: "Base64 encoded PNG image" + + - url: "data:application/json;charset=utf-8,Hello%20World" + valid: true + description: "JSON MIME type with charset parameter" + + - url: "data:text/plain,Hello%7BWorld%7D" + valid: true + description: "Text with URL-encoded curly braces" + + - url: "data:text/plain,Hello%5BWorld%5D" + valid: true + description: "Text with URL-encoded square brackets" + + - url: "data:text/plain,Hello%3AWorld" + valid: true + description: "Text with URL-encoded colon" + + - url: "data:text/plain,Hello%40World" + valid: true + description: "Text with URL-encoded at symbol" + + - url: "data:text/plain,Hello%2FWorld" + valid: true + description: "Text with URL-encoded forward slash" + + - url: "data:text/plain,Hello%23World" + valid: true + description: "Text with URL-encoded hash" + + - url: "data:text/plain,Hello%22World%22" + valid: true + description: "Text with URL-encoded double quotes" + + - url: "data:text/plain," + valid: true + description: "Empty data section" + + - url: "data:,Hello%20World" + valid: true + description: "Missing MIME type" + + + # Invalid test cases (20 failing) + + - url: "data:text/plain,Hello World" + valid: false + description: "Unencoded space character" + + - url: "data:text/plain,Hello{World}" + valid: false + description: "Unencoded curly braces" + + - url: "data:text/plain,Hello[World]" + valid: false + description: "Unencoded square brackets" + + - url: "data:text/plain,Hello:World" + valid: false + description: "Unencoded colon" + + - url: "data:text/plain,Hello@World" + valid: false + description: "Unencoded at symbol" + + - url: "data:text/plain,Hello/World" + valid: false + description: "Unencoded forward slash" + + - url: "data:text/plain,Hello#World" + valid: false + description: "Unencoded hash" + + - url: "data:text/plain,Hello\"World\"" + valid: false + 
class TestDataURLRegex(unittest.TestCase):
    """Data-driven validity checks for ``DataURL.from_url``.

    Each entry of ``data_url_test_cases.yaml`` (located next to this file)
    supplies a ``url``, the expected ``valid`` flag and a ``description``.
    """

    def setUp(self):
        # Load test cases from YAML file
        test_file = os.path.join(os.path.dirname(__file__), 'data_url_test_cases.yaml')
        # Explicit encoding so the fixture is read the same way on every platform.
        with open(test_file, 'r', encoding='utf-8') as f:
            self.test_cases = yaml.safe_load(f)['test_cases']

    def test_data_url_regex(self):
        """Check every fixture URL is accepted or rejected as expected.

        A URL counts as valid when ``DataURL.from_url`` returns something
        other than ``None``.
        """
        for case in self.test_cases:
            url = case['url']
            description = case['description']
            expected_valid = case['valid']

            with self.subTest(url=url, description=description):
                # Parse inside the subTest so that an exception raised for one
                # URL is reported against that case and does not abort the
                # remaining cases.
                data_url = DataURL.from_url(url)
                actual_valid = data_url is not None
                self.assertEqual(actual_valid, expected_valid, f"URL: {url}\nDescription: {description}")