Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/litdata/streaming/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,10 +464,12 @@ def __getitem__(self, index: ChunkedIndex | int | slice) -> Any:
if isinstance(self.transform, list):
for transform_fn in self.transform:
item = transform_fn(item)
if item is None:
break
else:
item = self.transform(item)

return item
return item if item else self.__next__()
Copy link
Collaborator

@deependujha deependujha Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes more sense to have this in the `__next__` method rather than in `__getitem__`.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion.


def __next__(self) -> Any:
# check if we have reached the end of the dataset (i.e., all the chunks have been processed)
Expand Down
6 changes: 4 additions & 2 deletions src/litdata/streaming/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,11 +591,13 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
FileLock(local_filepath + ".lock", timeout=0),
tempfile.TemporaryDirectory() as tmpdir,
):
_, _, _, repo_org, repo_name, path = remote_filepath.split("/", 5)
repo_id = f"{repo_org}/{repo_name}"
_, _, _, repo_org, repo_name_revision, path = remote_filepath.split("/", 5)
splits = repo_name_revision.split("@", 2)
repo_id = f"{repo_org}/{splits[0]}"
downloaded_path = hf_hub_download(
repo_id,
path,
revision=splits[1] if len(splits) == 2 else None,
cache_dir=tmpdir,
repo_type="dataset",
**self._storage_options,
Expand Down