From fc508801ef6a5649fd96920b32dbdff93da8091a Mon Sep 17 00:00:00 2001 From: mpeex Date: Mon, 8 Dec 2025 19:45:50 +0100 Subject: [PATCH 1/2] support revision/branches and transformation can return None to skip sample --- src/litdata/streaming/dataset.py | 4 +++- src/litdata/streaming/downloader.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/litdata/streaming/dataset.py b/src/litdata/streaming/dataset.py index 1dca6c5b..b3fa1f1b 100644 --- a/src/litdata/streaming/dataset.py +++ b/src/litdata/streaming/dataset.py @@ -441,10 +441,12 @@ def __getitem__(self, index: Union[ChunkedIndex, int, slice]) -> Any: if isinstance(self.transform, list): for transform_fn in self.transform: item = transform_fn(item) + if item is None: + break else: item = self.transform(item) - return item + return item if item else self.__next__() def __next__(self) -> Any: # check if we have reached the end of the dataset (i.e., all the chunks have been processed) diff --git a/src/litdata/streaming/downloader.py b/src/litdata/streaming/downloader.py index c213e097..9addd424 100644 --- a/src/litdata/streaming/downloader.py +++ b/src/litdata/streaming/downloader.py @@ -591,11 +591,13 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None: FileLock(local_filepath + ".lock", timeout=0), tempfile.TemporaryDirectory() as tmpdir, ): - _, _, _, repo_org, repo_name, path = remote_filepath.split("/", 5) - repo_id = f"{repo_org}/{repo_name}" + _, _, _, repo_org, repo_name_revision, path = remote_filepath.split("/", 5) + splits = repo_name_revision.split("@", 2) + repo_id = f"{repo_org}/{splits[0]}" downloaded_path = hf_hub_download( repo_id, path, + revision=splits[1] if len(splits)==2 else None, cache_dir=tmpdir, repo_type="dataset", **self._storage_options, From c2d45bc17b7d606af30d7c28411683618bb95efc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Dec 2025 18:53:28 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/litdata/streaming/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/litdata/streaming/downloader.py b/src/litdata/streaming/downloader.py index 9addd424..7b44bcd6 100644 --- a/src/litdata/streaming/downloader.py +++ b/src/litdata/streaming/downloader.py @@ -597,7 +597,7 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None: downloaded_path = hf_hub_download( repo_id, path, - revision=splits[1] if len(splits)==2 else None, + revision=splits[1] if len(splits) == 2 else None, cache_dir=tmpdir, repo_type="dataset", **self._storage_options,