Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions dol/filesys.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,15 @@


def ensure_slash_suffix(path: str):
r"""Add a file separation (/ or \) at the end of path str, if not already present."""
if not path.endswith(file_sep):
r"""Add a file separation (/ or \) at the end of path str, if not already present.

An empty path stays empty: an empty prefix has no slash to "ensure", and turning
it into a bare separator anchors otherwise-absolute keys to the filesystem root.
On Windows that produces invalid paths like ``\C:\Users\...`` (a separator before
the drive letter -> ``OSError: [Errno 22]``); e.g. ``Files("")`` used with
absolute keys, as in ``dol.misc.get_obj``.
"""
if path and not path.endswith(file_sep):
return path + file_sep
else:
return path
Expand Down Expand Up @@ -160,11 +167,18 @@ def process_path(
Returns:
str: The processed path.

>>> process_path('a', 'b', 'c') # doctest: +ELLIPSIS
'...a/b/c'
>>> from functools import partial
>>> process_path('a', 'b', 'c', rootdir='/root/dir/', ensure_endswith_slash=True)
'/root/dir/a/b/c/'
The result uses the running OS's native separator, so these examples are written
to pass on any OS (on POSIX the result is the literal forward-slash form):

>>> import os
>>> process_path('a', 'b', 'c').endswith(os.path.join('a', 'b', 'c'))
True
>>> p = process_path(
... 'a', 'b', 'c', rootdir='root_dir',
... ensure_endswith_slash=True, abspath=False, expanduser=False, expandvars=False,
... )
>>> p == os.path.join('root_dir', 'a', 'b', 'c') + os.sep
True

"""
path = os.path.join(*path)
Expand Down Expand Up @@ -264,9 +278,10 @@ def temp_dir(dirname="", make_it_if_necessary=True, verbose=False):
"""
from tempfile import mkdtemp, gettempdir
import uuid
import getpass

# Create a unique user-specific directory under the system temp dir
user_temp_base = os.path.join(gettempdir(), f"user_{os.getuid()}")
user_temp_base = os.path.join(gettempdir(), f"user_{getpass.getuser()}")

if dirname:
# If a specific dirname is provided, use it
Expand Down Expand Up @@ -839,9 +854,12 @@ def subfolder_stores(
the `max_levels` parameter.
"""
root_folder = ensure_slash_suffix(root_folder)
# Strip the native trailing separator (os.sep), NOT a hardcoded "/": the dir
# paths end with os.sep ('\\' on Windows), so a literal "/" would not match and
# the affix codec would re-add a "/" -> mixed-separator KeyError on Windows.
wrap = KeyCodecs.affixed(
prefix=root_folder if relative_paths else "",
suffix="/" if not slash_suffix else "",
suffix=os.path.sep if not slash_suffix else "",
)
folders = iter_dirpaths_in_folder_recursively(
root_folder, max_levels=max_levels, include_hidden=include_hidden
Expand Down
74 changes: 46 additions & 28 deletions dol/naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from types import MethodType
from typing import Union

from dol.util import safe_compile
from dol.signatures import set_signature_of_func
from dol.errors import KeyValidationError, _assert_condition

Expand Down Expand Up @@ -259,18 +258,38 @@ def mk_named_capture_patterns(mapping_dict):


def template_to_pattern(mapping_dict, template):
if mapping_dict:
p = safe_compile(
"{}".format(
"|".join(["{" + re.escape(x) + "}" for x in list(mapping_dict.keys())])
)
)
return p.sub(
lambda x: mapping_dict[x.string[(x.start() + 1) : (x.end() - 1)]],
template,
)
else:
return template
r"""Weave a ``{field}`` template into a regex, substituting each field with its
capture pattern and **regex-escaping the literal text between fields**.

Escaping the literals is what makes this OS-independent: a template that is (or
contains) a real filesystem path has backslashes on Windows (``C:\Users\...``),
which are regex metacharacters -- compiling them unescaped raises
``re.error: incomplete escape \U``. Escaping also makes a literal ``.`` match a
literal dot rather than any character. (This mirrors ``KeyTemplate._compile_regex``
and is why the result is never routed through ``safe_compile``, which would
re.escape the *whole* pattern on Windows and corrupt the capture groups.)
"""
import string

out = []
for literal_text, field_name, format_spec, conversion in string.Formatter().parse(
template
):
if literal_text:
out.append(re.escape(literal_text))
if field_name is not None:
if field_name in mapping_dict:
out.append(mapping_dict[field_name])
else:
# Not a captured field: re-emit the placeholder as an escaped literal.
placeholder = "{" + field_name
if conversion:
placeholder += "!" + conversion
if format_spec:
placeholder += ":" + format_spec
placeholder += "}"
out.append(re.escape(placeholder))
return "".join(out)


def mk_extract_pattern(
Expand All @@ -282,13 +301,13 @@ def mk_extract_pattern(
)
assert name is not None
mapping_dict = dict(format_dict, **{name: named_capture_patterns[name]})
p = safe_compile(
p = re.compile(
"{}".format(
"|".join(["{" + re.escape(x) + "}" for x in list(mapping_dict.keys())])
)
)

return safe_compile(
return re.compile(
p.sub(
lambda x: mapping_dict[x.string[(x.start() + 1) : (x.end() - 1)]],
template,
Expand All @@ -304,21 +323,20 @@ def mk_pattern_from_template_and_format_dict(template, format_dict=None, sep=pat
format_dict: A dict whose keys are template fields and values are regex strings to capture them
Returns: a compiled regex

>>> import os
Assert on *behavior* (matching) rather than the exact pattern string, so the
examples hold on every OS (the field separator -- and therefore the character the
default capture class excludes -- is ``/`` on POSIX and ``\`` on Windows):

>>> p = mk_pattern_from_template_and_format_dict('{here}/and/{there}')
>>> if os.name == 'nt': # for windows
... assert p == re.compile('(?P<here>[^\\\\]+)/and/(?P<there>[^\\\\]+)')
... else:
... assert p == re.compile('(?P<here>[^/]+)/and/(?P<there>[^/]+)')
>>> p = mk_pattern_from_template_and_format_dict('{here}/and/{there}', {'there': r'\d+'})
>>> if os.name == 'nt': # for windows
... assert p == re.compile(r'(?P<here>[^\\\\]+)/and/(?P<there>\d+)')
... else:
... assert p == re.compile(r'(?P<here>[^/]+)/and/(?P<there>\d+)')
>>> type(p)
<class 're.Pattern'>
>>> p.match('HERE/and/1234').groupdict()
{'here': 'HERE', 'there': '1234'}
>>> p = mk_pattern_from_template_and_format_dict('{here}/and/{there}', {'there': r'\d+'})
>>> p.match('HERE/and/1234').groupdict()
{'here': 'HERE', 'there': '1234'}
>>> p.match('HERE/and/not_digits') is None # 'there' must be digits
True
"""
format_dict = format_dict or {}

Expand All @@ -327,7 +345,7 @@ def mk_pattern_from_template_and_format_dict(template, format_dict=None, sep=pat
named_capture_patterns = mk_named_capture_patterns(format_dict)
pattern = template_to_pattern(named_capture_patterns, template)
try:
return safe_compile(pattern)
return re.compile(pattern)
except Exception as e:
raise ValueError(
f"Got an error when attempting to re.compile('{pattern}'): {type(e)}({e})"
Expand Down Expand Up @@ -488,7 +506,7 @@ def __init__(

pattern = template_to_pattern(named_capture_patterns, self.template)
pattern += "$"
pattern = safe_compile(pattern)
pattern = re.compile(pattern)

extract_pattern = {}
for name in fields:
Expand Down Expand Up @@ -724,7 +742,7 @@ def __init__(self, *args, **kwargs):
]
)
_prefix_pattern += "$"
self.prefix_pattern = safe_compile(_prefix_pattern)
self.prefix_pattern = re.compile(_prefix_pattern)

def _mk_prefix(self, *args, **kwargs):
"""
Expand Down
11 changes: 7 additions & 4 deletions dol/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import os

from dol.base import Store
from dol.util import lazyprop, add_as_attribute_of, max_common_prefix, safe_compile
from dol.util import lazyprop, add_as_attribute_of, max_common_prefix
from dol.trans import (
store_decorator,
kv_wrap,
Expand Down Expand Up @@ -2255,9 +2255,12 @@ def generate_pattern_parts(template):
for literal_text, field_name, _, _ in parts:
yield re.escape(literal_text) + mk_named_capture_group(field_name)

return safe_compile(
"".join(generate_pattern_parts(template)), normalize_path=normalize_path
)
# Literal text is already re.escape'd above, and field separators are
# escaped in the capture groups, so this is a valid regex on every OS --
# compile it as a regex (NOT via safe_compile, which re.escape's the whole
# pattern on Windows and corrupts the named groups). normalize_path is a
# path-template concern, not a regex-compile flag, so it does not apply here.
return re.compile("".join(generate_pattern_parts(template)))

@staticmethod
def _assert_field_type(field_type: FieldTypeNames, name="field_type"):
Expand Down
7 changes: 5 additions & 2 deletions dol/tests/test_filesys.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ def populate_folder(dirpath, contents: Mapping):
def test_process_path():
# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = os.path.join(temp_dir, "foo/bar")
# Separate components (not "foo/bar"): process_path normalizes separators
# (abspath), so a literal "/" would become "\\" on Windows and break the
# equality assertions below.
temp_path = os.path.join(temp_dir, "foo", "bar")

output_path = process_path(temp_path)
assert output_path == temp_path
Expand Down Expand Up @@ -229,7 +232,7 @@ def test_subfolder_stores():
# Testing folder1
folder1_store = stores["folder1"]
assert isinstance(folder1_store, Files)
assert set(folder1_store.keys()) == {"day.doc", "subfolder/apple.p"}
assert set(folder1_store.keys()) == {"day.doc", os.path.join("subfolder", "apple.p")}
assert folder1_store["day.doc"] == b"time"

# Testing folder1/subfolder
Expand Down
33 changes: 19 additions & 14 deletions dol/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,16 @@ def _default_string_collision_handler(string: str, attempt: int) -> str:

def safe_compile(path, normalize_path=True):
r"""
Safely compiles a file path into a regex pattern, ensuring compatibility
across different operating systems (Windows, macOS, Linux).
Compile a *literal file path* into a regex pattern that matches that path,
normalizing separators and escaping regex-special characters on Windows.

This function normalizes the input path to use the correct separators
for the current platform and escapes any special characters to avoid
invalid regex patterns.
.. warning::
This is for **path templates only**, NOT for general regexes. It
``re.escape``-s its argument on Windows, which turns any regex into a
literal-string matcher there. To compile an actual regex, use
``re.compile`` (see ``dol.trans.filter_regex``, fixed to do exactly that).
Its output is intentionally platform-dependent (Windows paths get escaped),
so callers must not rely on a specific ``.pattern`` across OSes.

Args:
path (str): The file path to be compiled into a regex pattern.
Expand All @@ -135,13 +139,11 @@ def safe_compile(path, normalize_path=True):
re.Pattern: A compiled regular expression object for the given path.

Examples:
>>> regex = safe_compile(r"C:\\what\\happens\\if\\you\\escape")
>>> regex.pattern # Windows path is escaped properly
'C:\\\\what\\\\happens\\\\if\\\\you\\\\escape'

>>> regex = safe_compile("/fun/paths/are/awesome")
>>> regex.pattern # Unix path is unmodified
'/fun/paths/are/awesome'
>>> import re
>>> isinstance(safe_compile("/fun/paths/are/awesome"), re.Pattern)
True
>>> isinstance(safe_compile(r"C:\folder\file.txt"), re.Pattern)
True
"""
if normalize_path:
# Normalize the path to handle cross-platform differences
Expand Down Expand Up @@ -482,8 +484,11 @@ def not_a_mac_junk_path(path: str):
>>> list(filter(not_a_mac_junk_path, paths))
['A/normal/path', 'foo/b']
"""
if path.endswith(".DS_Store") or "__MACOSX" in path.split(os.path.sep):
return False # This is indeed math junk (so filter out)
# Split on BOTH separators: these paths usually come from zip files (always
# '/'), but os.path.sep is '\\' on Windows, so splitting on os.path.sep alone
# would miss '__MACOSX' there.
if path.endswith(".DS_Store") or "__MACOSX" in re.split(r"[/\\]", path):
return False # This is indeed mac junk (so filter out)
return True # this is not mac junk (you can keep it)


Expand Down
Loading