diff --git a/mlenergy_data/records/runs.py b/mlenergy_data/records/runs.py index b1dbe38..d9d66f7 100644 --- a/mlenergy_data/records/runs.py +++ b/mlenergy_data/records/runs.py @@ -951,7 +951,7 @@ def _ensure_downloaded(self, path_field: str) -> None: if not path_val: return root = self.__dict__["_hf_snapshot_root"] - rel = str(Path(path_val).relative_to(root)) + rel = Path(path_val).relative_to(root).as_posix() download_file(repo_id, rel, revision=self.__dict__.get("_hf_revision")) def read_results_json(self) -> dict[str, Any]: @@ -1143,7 +1143,7 @@ def _ensure_downloaded(self, path_field: str) -> None: if not path_val: return root = self.__dict__["_hf_snapshot_root"] - rel = str(Path(path_val).relative_to(root)) + rel = Path(path_val).relative_to(root).as_posix() download_file(repo_id, rel, revision=self.__dict__.get("_hf_revision")) def read_results_json(self) -> dict[str, Any]: @@ -1608,7 +1608,7 @@ def output_lengths(self, *, include_unsuccessful: bool = False) -> pd.DataFrame: """ cols = ["task", "model_id", "num_gpus", "max_num_seqs", "output_len"] if not self._runs: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) rows: list[dict[str, Any]] = [] for run in self._runs: @@ -1622,7 +1622,7 @@ def output_lengths(self, *, include_unsuccessful: bool = False) -> pd.DataFrame: "output_len": length, } ) - return pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols) + return pd.DataFrame(rows) if rows else pd.DataFrame(columns=pd.Index(cols)) def inter_token_latencies(self) -> pd.DataFrame: """Extract per-token inter-token latency samples. @@ -1642,7 +1642,7 @@ def inter_token_latencies(self) -> pd.DataFrame: """ cols = ["task", "model_id", "num_gpus", "max_num_seqs", "itl_s"] if not self._runs: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) rows: list[dict[str, Any]] = [] for run in self._runs: @@ -1656,7 +1656,7 @@ def inter_token_latencies(self) -> pd.DataFrame: "itl_s": v, } ) - return pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols) + return pd.DataFrame(rows) if rows else pd.DataFrame(columns=pd.Index(cols)) def timelines( self, @@ -1690,7 +1690,7 @@ def timelines( "metric", ] if not self._runs: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) frames: list[pd.DataFrame] = [] for run in self._runs: @@ -1701,7 +1701,7 @@ def timelines( tl["max_num_seqs"] = run.max_num_seqs frames.append(tl) if not frames: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) out = pd.concat(frames, ignore_index=True) return out.sort_values(["task", "model_id", "relative_time_s"]).reset_index(drop=True) @@ -2044,7 +2044,7 @@ def timelines( "metric", ] if not self._runs: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) frames: list[pd.DataFrame] = [] for run in self._runs: @@ -2055,6 +2055,6 @@ def timelines( tl["batch_size"] = run.batch_size frames.append(tl) if not frames: - return pd.DataFrame(columns=cols) + return pd.DataFrame(columns=pd.Index(cols)) out = pd.concat(frames, ignore_index=True) return out.sort_values(["task", "model_id", "relative_time_s"]).reset_index(drop=True) diff --git a/mlenergy_data/sources.py b/mlenergy_data/sources.py index dc677c9..4cfd325 100644 --- a/mlenergy_data/sources.py +++ b/mlenergy_data/sources.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import sys from dataclasses import dataclass from pathlib import Path from typing import Protocol @@ -12,6 +13,48 @@ logger = logging.getLogger(__name__) +_long_paths_checked = False + + +def _ensure_windows_long_paths() -> None: + """Verify that long path support is enabled on Windows. + + On Windows, the default MAX_PATH of 260 characters is too short for the + deeply nested HuggingFace Hub cache paths used by this dataset. This + function checks whether long path support is enabled and raises a clear + error with instructions if it is not. + + On non-Windows platforms this is a no-op. + """ + global _long_paths_checked + if _long_paths_checked or sys.platform != "win32": + return + _long_paths_checked = True + + import ctypes + + try: + ntdll = ctypes.WinDLL("ntdll") + ntdll.RtlAreLongPathsEnabled.restype = ctypes.c_ubyte + ntdll.RtlAreLongPathsEnabled.argtypes = [] + if ntdll.RtlAreLongPathsEnabled(): + return + except (OSError, AttributeError): + # Cannot determine status (old Windows version); skip check. + return + + raise RuntimeError( + "Windows long path support is not enabled. " + "This dataset contains deeply nested file paths that exceed the " + "default 260-character MAX_PATH limit.\n\n" + "To enable long path support, run the following in an elevated " + "PowerShell (Run as Administrator):\n\n" + ' Set-ItemProperty -Path "HKLM:\\SYSTEM\\CurrentControlSet\\Control' + '\\FileSystem" -Name "LongPathsEnabled" -Value 1\n\n' + "Then restart your Python process." + ) + + _GATED_DATASET_MSG = ( "Failed to download dataset '{repo_id}'. " "This is a gated dataset. Please ensure you have done the following:\n" @@ -55,6 +98,7 @@ class HFDatasetSource: allow_patterns: list[str] | None = None def local_root(self) -> Path: + _ensure_windows_long_paths() try: local = snapshot_download( repo_id=self.repo_id, @@ -86,6 +130,7 @@ def download_file( Returns: Local path to the downloaded file. """ + _ensure_windows_long_paths() try: local = hf_hub_download( repo_id=repo_id,