From a7e2f00f23eabdc0cd7daba0e187b769939e55d0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 24 Mar 2025 13:35:54 -0400 Subject: [PATCH 1/3] Add initial support for download of subpath within zarr implemented by adding "subpath" within an Asset record and adding support to assess it for .zarrs. Note also that it adjusts logic from try/except to explicit checking -- much better since avoids swallowing unrelated exceptions. Tried with dandi -l debug download -f debug --preserve-tree dandi://dandi/000108/sub-MITU01/ses-20210521h17m17s06/micr/sub-MITU01_ses-20210521h17m17s06_sample-178_stain-LEC_run-1_chunk-1_SPIM.ome.zarr/0/0/0/0/0 which seems succeeded downloading, we seems still need to - [ ] disable checksumming for such partial downloads (since there is no partial checksums ATM at API level) - [ ] add tests and verify that works on full file paths --- dandi/dandiapi.py | 35 +++++++++++++++++++++++++---------- dandi/download.py | 14 ++++++++++++-- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 553ec672c..562cf2713 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1240,16 +1240,29 @@ def get_asset_by_path(self, path: str) -> RemoteAsset: paths (e.g., ``../``) within it. """ path = self._normalize_path(path) - try: - # Weed out any assets that happen to have the given path as a - # proper prefix: - (asset,) = ( - a for a in self.get_assets_with_path_prefix(path) if a.path == path - ) - except ValueError: - raise NotFoundError(f"No asset at path {path!r}") - else: - return asset + # Weed out any assets that happen to have the given path as a + # proper prefix: + assets = [a for a in self.get_assets_with_path_prefix(path) if a.path == path] + if assets: + if len(assets) > 1: + lgr.warning( + "Multiple assets found at path %r; returning the first one", + path, + ) + asset = assets[0] + elif not assets: + zarr_suf = ".zarr/" + if (zarr_suf in path) and (zarr_suffix_idx := path.index(zarr_suf)): + # If path ends with .zarr/, we might have a zarr asset without + # a trailing slash in the path. Try again: + zarr_path_len = zarr_suffix_idx + len(zarr_suf) + zarr_path = path[: zarr_path_len - 1] # -1 for trailing / + subpath = path[len(zarr_path) :] + asset = self.get_asset_by_path(zarr_path) + asset.subpath = subpath + else: + raise NotFoundError(f"No asset at path {path!r}") + return asset def download_directory( self, @@ -1378,6 +1391,8 @@ class BaseRemoteAsset(ABC, APIBase): identifier: str = Field(alias="asset_id") #: The asset's (forward-slash-separated) path path: str + #: The path within the asset, as e.g. path in a zarr/ + subpath: str | None = Field(default=None) #: The size of the asset in bytes size: int #: The date at which the asset was created diff --git a/dandi/download.py b/dandi/download.py index 3edf495ed..c40c8e52d 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -226,6 +226,7 @@ class Downloader: url: ParsedDandiURL output_dir: InitVar[str | Path] output_prefix: Path = field(init=False) + #: just a convenience combination of output_dir and output_prefix output_path: Path = field(init=False) existing: DownloadExisting get_metadata: bool @@ -333,6 +334,12 @@ def download_generator(self) -> Iterator[dict]: asset.path, ) mtime = asset.modified + if asset.subpath: + lgr.warning( + "No downloading of subpaths within blobs yet. Got %s for %s", + asset.subpath, + asset.path, + ) _download_generator = _download_file( asset.get_download_file_iter(), download_path, @@ -352,7 +359,8 @@ def download_generator(self) -> Iterator[dict]: ), f"Asset {asset.path} is neither blob nor Zarr" _download_generator = _download_zarr( asset, - download_path, + prefix=asset.subpath, + download_path=download_path, toplevel_path=self.output_path, existing=self.existing, jobs=self.jobs_per_zarr, @@ -812,6 +820,7 @@ def _download_file( lgr.warning("downloader logic: We should not be here!") final_digest = None + if downloaded_digest and not resuming: assert downloaded_digest is not None final_digest = downloaded_digest.hexdigest() # we care only about hex @@ -977,6 +986,7 @@ def _download_zarr( toplevel_path: str | Path, existing: DownloadExisting, lock: Lock, + prefix: str | None = None, jobs: int | None = None, ) -> Iterator[dict]: # Avoid heavy import by importing within function: @@ -993,7 +1003,7 @@ def digest_callback(path: str, algoname: str, d: str) -> None: digests[path] = d def downloads_gen(): - for entry in asset.iterfiles(): + for entry in asset.iterfiles(prefix=prefix): entries.append(entry) etag = entry.digest assert etag.algorithm is DigestType.md5 From 3849b7daebd2a9e876b7934a110f52f4c86ad6a6 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 24 Mar 2025 15:04:53 -0400 Subject: [PATCH 2/3] use Optional instead of | None Co-authored-by: John T. Wodder II --- dandi/dandiapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 562cf2713..41a1e3e7f 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1392,7 +1392,7 @@ class BaseRemoteAsset(ABC, APIBase): #: The asset's (forward-slash-separated) path path: str #: The path within the asset, as e.g. path in a zarr/ - subpath: str | None = Field(default=None) + subpath: Optional[str] = None #: The size of the asset in bytes size: int #: The date at which the asset was created From 0f8d6432dde90467a48cb91ae3511ca2e32cec44 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 25 Mar 2025 08:51:01 -0400 Subject: [PATCH 3/3] Partial improvements --- dandi/dandiapi.py | 71 ++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 41a1e3e7f..052b5d6d4 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -31,6 +31,7 @@ REQUEST_RETRIES, RETRY_STATUSES, ZARR_DELETE_BATCH_SIZE, + ZARR_EXTENSIONS, DandiInstance, EmbargoStatus, ) @@ -1240,29 +1241,45 @@ def get_asset_by_path(self, path: str) -> RemoteAsset: paths (e.g., ``../``) within it. """ path = self._normalize_path(path) - # Weed out any assets that happen to have the given path as a - # proper prefix: - assets = [a for a in self.get_assets_with_path_prefix(path) if a.path == path] - if assets: - if len(assets) > 1: - lgr.warning( - "Multiple assets found at path %r; returning the first one", - path, - ) - asset = assets[0] - elif not assets: - zarr_suf = ".zarr/" - if (zarr_suf in path) and (zarr_suffix_idx := path.index(zarr_suf)): - # If path ends with .zarr/, we might have a zarr asset without - # a trailing slash in the path. Try again: - zarr_path_len = zarr_suffix_idx + len(zarr_suf) - zarr_path = path[: zarr_path_len - 1] # -1 for trailing / - subpath = path[len(zarr_path) :] - asset = self.get_asset_by_path(zarr_path) - asset.subpath = subpath - else: - raise NotFoundError(f"No asset at path {path!r}") - return asset + try: + # Weed out any assets that happen to have the given path as a + # proper prefix: + (asset,) = ( + a for a in self.get_assets_with_path_prefix(path) if a.path == path + ) + except ValueError: + raise NotFoundError(f"No asset at path {path!r}") + else: + return asset + + def get_asset_with_subpath(self, path: str) -> RemoteAsset | ZarrWithPrefix: + def is_zarr_part(part: str) -> bool: + for ext in ZARR_EXTENSIONS: + if part.endswith(ext) and part != ext: + return True + return False + + full_path = PurePosixPath(path) + asset_path = PurePosixPath() + for i, p in enumerate(full_path.parts): + asset_path /= p + if is_zarr_part(p) and i < len(full_path.parts) - 1: + try: + asset = self.get_asset_by_path(str(asset_path)) + except NotFoundError: + pass + else: + if isinstance(asset, RemoteZarrAsset): + return ZarrWithPrefix( + zarr=asset, prefix="/".join(full_path.parts[i + 1 :]) + ) + else: + # We found a blob, which is not a folder, so no Zarr + # path can exist under it. + raise NotFoundError( + f"{path!r} is not a Zarr path with entry prefix" + ) + return self.get_asset_by_path(path) def download_directory( self, @@ -1391,8 +1408,6 @@ class BaseRemoteAsset(ABC, APIBase): identifier: str = Field(alias="asset_id") #: The asset's (forward-slash-separated) path path: str - #: The path within the asset, as e.g. path in a zarr/ - subpath: Optional[str] = None #: The size of the asset in bytes size: int #: The date at which the asset was created @@ -1949,6 +1964,12 @@ def set_raw_metadata(self, metadata: dict[str, Any]) -> None: self._metadata = data["metadata"] +@dataclass +class ZarrWithPrefix: + zarr: RemoteZarrAsset + prefix: str + + @dataclass class RemoteZarrEntry: """