Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions mlenergy_data/records/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ def _ensure_downloaded(self, path_field: str) -> None:
if not path_val:
return
root = self.__dict__["_hf_snapshot_root"]
rel = str(Path(path_val).relative_to(root))
rel = Path(path_val).relative_to(root).as_posix()
download_file(repo_id, rel, revision=self.__dict__.get("_hf_revision"))

def read_results_json(self) -> dict[str, Any]:
Expand Down Expand Up @@ -1143,7 +1143,7 @@ def _ensure_downloaded(self, path_field: str) -> None:
if not path_val:
return
root = self.__dict__["_hf_snapshot_root"]
rel = str(Path(path_val).relative_to(root))
rel = Path(path_val).relative_to(root).as_posix()
download_file(repo_id, rel, revision=self.__dict__.get("_hf_revision"))

def read_results_json(self) -> dict[str, Any]:
Expand Down Expand Up @@ -1608,7 +1608,7 @@ def output_lengths(self, *, include_unsuccessful: bool = False) -> pd.DataFrame:
"""
cols = ["task", "model_id", "num_gpus", "max_num_seqs", "output_len"]
if not self._runs:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))

rows: list[dict[str, Any]] = []
for run in self._runs:
Expand All @@ -1622,7 +1622,7 @@ def output_lengths(self, *, include_unsuccessful: bool = False) -> pd.DataFrame:
"output_len": length,
}
)
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols)
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=pd.Index(cols))

def inter_token_latencies(self) -> pd.DataFrame:
"""Extract per-token inter-token latency samples.
Expand All @@ -1642,7 +1642,7 @@ def inter_token_latencies(self) -> pd.DataFrame:
"""
cols = ["task", "model_id", "num_gpus", "max_num_seqs", "itl_s"]
if not self._runs:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))

rows: list[dict[str, Any]] = []
for run in self._runs:
Expand All @@ -1656,7 +1656,7 @@ def inter_token_latencies(self) -> pd.DataFrame:
"itl_s": v,
}
)
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols)
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=pd.Index(cols))

def timelines(
self,
Expand Down Expand Up @@ -1690,7 +1690,7 @@ def timelines(
"metric",
]
if not self._runs:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))

frames: list[pd.DataFrame] = []
for run in self._runs:
Expand All @@ -1701,7 +1701,7 @@ def timelines(
tl["max_num_seqs"] = run.max_num_seqs
frames.append(tl)
if not frames:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))
out = pd.concat(frames, ignore_index=True)
return out.sort_values(["task", "model_id", "relative_time_s"]).reset_index(drop=True)

Expand Down Expand Up @@ -2044,7 +2044,7 @@ def timelines(
"metric",
]
if not self._runs:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))

frames: list[pd.DataFrame] = []
for run in self._runs:
Expand All @@ -2055,6 +2055,6 @@ def timelines(
tl["batch_size"] = run.batch_size
frames.append(tl)
if not frames:
return pd.DataFrame(columns=cols)
return pd.DataFrame(columns=pd.Index(cols))
out = pd.concat(frames, ignore_index=True)
return out.sort_values(["task", "model_id", "relative_time_s"]).reset_index(drop=True)
45 changes: 45 additions & 0 deletions mlenergy_data/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import logging
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol
Expand All @@ -12,6 +13,48 @@

logger = logging.getLogger(__name__)

# Set to True once the long-path check has passed (or could not be performed),
# so later calls skip the ntdll query. It deliberately stays False after a
# failed check so that every subsequent call keeps raising the clear error.
_long_paths_checked = False


def _ensure_windows_long_paths() -> None:
    """Verify that long path support is enabled on Windows.

    On Windows, the default MAX_PATH of 260 characters is too short for the
    deeply nested HuggingFace Hub cache paths used by this dataset. This
    function checks whether long path support is enabled and raises a clear
    error with instructions if it is not.

    A successful (or undeterminable) check is remembered in the module-level
    ``_long_paths_checked`` flag so the query runs at most once per process.
    A failed check is not remembered: the error is raised again on every
    call, so a caller that swallowed the first error still gets the
    actionable message instead of proceeding to the download.

    On non-Windows platforms this is a no-op.

    Raises:
        RuntimeError: If running on Windows and ``RtlAreLongPathsEnabled``
            reports that long path support is disabled.
    """
    global _long_paths_checked
    if _long_paths_checked or sys.platform != "win32":
        return

    # Imported lazily: only needed on the Windows code path.
    import ctypes

    try:
        ntdll = ctypes.WinDLL("ntdll")
        ntdll.RtlAreLongPathsEnabled.restype = ctypes.c_ubyte
        ntdll.RtlAreLongPathsEnabled.argtypes = []
        enabled = bool(ntdll.RtlAreLongPathsEnabled())
    except (OSError, AttributeError):
        # Cannot determine status (old Windows version); skip check.
        enabled = True

    if enabled:
        # Only cache the positive/undeterminable outcome.
        _long_paths_checked = True
        return

    raise RuntimeError(
        "Windows long path support is not enabled. "
        "This dataset contains deeply nested file paths that exceed the "
        "default 260-character MAX_PATH limit.\n\n"
        "To enable long path support, run the following in an elevated "
        "PowerShell (Run as Administrator):\n\n"
        '    Set-ItemProperty -Path "HKLM:\\SYSTEM\\CurrentControlSet\\Control'
        '\\FileSystem" -Name "LongPathsEnabled" -Value 1\n\n'
        "Then restart your Python process."
    )


_GATED_DATASET_MSG = (
"Failed to download dataset '{repo_id}'. "
"This is a gated dataset. Please ensure you have done the following:\n"
Expand Down Expand Up @@ -55,6 +98,7 @@ class HFDatasetSource:
allow_patterns: list[str] | None = None

def local_root(self) -> Path:
_ensure_windows_long_paths()
try:
local = snapshot_download(
repo_id=self.repo_id,
Expand Down Expand Up @@ -86,6 +130,7 @@ def download_file(
Returns:
Local path to the downloaded file.
"""
_ensure_windows_long_paths()
try:
local = hf_hub_download(
repo_id=repo_id,
Expand Down