Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 17 additions & 15 deletions cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@
target_compile_definitions(onnxruntime_providers_cuda PRIVATE FILE_NAME=\"onnxruntime_providers_cuda.dll\")
endif()
# Work around a CUDA 13.x cudafe++ (EDG front-end) regression that mis-parses CCCL's
# Work around a CUDA 13.3 cudafe++ (EDG front-end) regression that mis-parses CCCL's
# global-qualified partial specializations, e.g. in <cub/device/device_transform.cuh>:
# template <typename T>
# struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {};
Expand All @@ -218,7 +218,7 @@
# corrected copies of the affected headers into the build tree and place that directory
# ahead of the toolkit cccl include path. This is a no-op on toolkits whose headers do not
# contain the offending pattern (e.g. once NVIDIA fixes it), so it is safe to keep enabled.
function(ort_cuda13_patch_cccl_header src dst)
function(ort_cuda133_patch_cccl_header src dst)
if (NOT EXISTS "${src}")
return()
endif()
Expand Down Expand Up @@ -412,19 +412,21 @@
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
foreach(inc_dir ${CUDAToolkit_INCLUDE_DIRS})
if (EXISTS "${inc_dir}/cccl")
# Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified
# partial specializations (see ort_cuda13_patch_cccl_header above) and put the fixed
# directory ahead of the toolkit cccl include so the corrected headers win.
set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix")
ort_cuda13_patch_cccl_header(
"${inc_dir}/cccl/cub/device/device_transform.cuh"
"${_ort_cccl_fix_dir}/cub/device/device_transform.cuh")
ort_cuda13_patch_cccl_header(
"${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh"
"${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR
EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}")
if (UNIX AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.3 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.4)
# Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified
# partial specializations (see ort_cuda133_patch_cccl_header above) and put the fixed
# directory ahead of the toolkit cccl include so the corrected headers win.
set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix")
ort_cuda133_patch_cccl_header(
"${inc_dir}/cccl/cub/device/device_transform.cuh"
"${_ort_cccl_fix_dir}/cub/device/device_transform.cuh")
ort_cuda133_patch_cccl_header(
"${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh"
"${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR
EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}")
endif()
endif()
# Add the cccl subdirectory to the include path so <cuda/std/utility> can be found
Expand Down
81 changes: 60 additions & 21 deletions onnxruntime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,42 +150,64 @@ def _extract_cuda_major_version(version_str: str) -> str:
return version_str.split(".", maxsplit=1)[0] if version_str else "12"


def _get_cufft_version(cuda_major: str) -> str:
def _get_cufft_version(cuda_major_version: str) -> str:
"""Get cufft library version based on CUDA major version.

Args:
cuda_major: CUDA major version as string (e.g., "12", "13")
cuda_major_version: CUDA major version as string (e.g., "12", "13")

Returns:
cufft version as string
"""
# cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12
return "12" if cuda_major == "13" else "11"
return "12" if int(cuda_major_version) >= 13 else "11"


def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
# Dynamically determine CUDA major version from build info
# Dynamically determine CUDA major version from build info.
# build_cuda_version defaults to the version this package was built with; it is a parameter for testability.
cuda_major_version = _extract_cuda_major_version(cuda_version)
Comment thread
tianleiwu marked this conversation as resolved.
cufft_version = _get_cufft_version(cuda_major_version)

if is_windows:
# Starting with CUDA 13, NVIDIA consolidated the per-component CUDA Toolkit wheels
# (cublas, cufft, cuda_runtime, cuda_nvrtc, curand, ...) into a single "nvidia/cu{major}"
# package and dropped the "-cuNN" suffix from those package names. On Windows the DLLs
# moved into an architecture sub-folder ("bin/<arch>", e.g. "bin/x86_64"); on Linux the
# libraries are placed directly in "lib" (the wheel itself is architecture specific, so
# there is no arch sub-folder). cuDNN keeps its own "nvidia/cudnn" package and layout.
use_consolidated_layout = cuda_major_version.isdigit() and int(cuda_major_version) >= 13

if use_consolidated_layout:
cuda_dir = f"cu{cuda_major_version}"
if is_windows:
import platform # noqa: PLC0415
Comment thread
tianleiwu marked this conversation as resolved.

arch = "arm64" if platform.machine().lower() in ("arm64", "aarch64") else "x86_64"
cuda_dll_paths = [
("nvidia", cuda_dir, "bin", arch, f"cublasLt64_{cuda_major_version}.dll"),
("nvidia", cuda_dir, "bin", arch, f"cublas64_{cuda_major_version}.dll"),
("nvidia", cuda_dir, "bin", arch, f"cufft64_{cufft_version}.dll"),
("nvidia", cuda_dir, "bin", arch, f"cudart64_{cuda_major_version}.dll"),
]
else: # Linux
# cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
cuda_dll_paths = [
("nvidia", cuda_dir, "lib", f"libcublasLt.so.{cuda_major_version}"),
("nvidia", cuda_dir, "lib", f"libcublas.so.{cuda_major_version}"),
("nvidia", cuda_dir, "lib", f"libnvrtc.so.{cuda_major_version}"),
("nvidia", cuda_dir, "lib", "libcurand.so.10"),
("nvidia", cuda_dir, "lib", f"libcufft.so.{cufft_version}"),
("nvidia", cuda_dir, "lib", f"libcudart.so.{cuda_major_version}"),
]
elif is_windows:
# CUDA 12 and earlier: each component ships its own "nvidia/<component>" package.
# Path is relative to site-packages directory.
cuda_dll_paths = [
("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"),
("nvidia", "cublas", "bin", f"cublas64_{cuda_major_version}.dll"),
("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"),
("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"),
]
cudnn_dll_paths = [
("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"),
]
else: # Linux
# cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
cuda_dll_paths = [
Expand All @@ -197,6 +219,19 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru
("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"),
]

# cuDNN keeps its own "nvidia/cudnn" package layout in both old and consolidated schemes.
if is_windows:
cudnn_dll_paths = [
("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"),
]
else: # Linux
# Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
cudnn_dll_paths = [
("nvidia", "cudnn", "lib", "libcudnn.so.9"),
Expand Down Expand Up @@ -238,15 +273,19 @@ def print_debug_info():
# Print version of installed packages that is related to CUDA or cuDNN DLLs.
cuda_major = _extract_cuda_major_version(cuda_version)

# Starting with CUDA 13, NVIDIA dropped the "-cuNN" suffix from the per-component
# CUDA Toolkit packages (cuDNN keeps its suffixed package name).
cuda_pkg_suffix = "" if (cuda_major.isdigit() and int(cuda_major) >= 13) else f"-cu{cuda_major}"

packages = [
"torch",
f"nvidia-cuda-runtime-cu{cuda_major}",
f"nvidia-cuda-runtime{cuda_pkg_suffix}",
f"nvidia-cudnn-cu{cuda_major}",
f"nvidia-cublas-cu{cuda_major}",
f"nvidia-cufft-cu{cuda_major}",
f"nvidia-curand-cu{cuda_major}",
f"nvidia-cuda-nvrtc-cu{cuda_major}",
f"nvidia-nvjitlink-cu{cuda_major}",
f"nvidia-cublas{cuda_pkg_suffix}",
f"nvidia-cufft{cuda_pkg_suffix}",
f"nvidia-curand{cuda_pkg_suffix}",
f"nvidia-cuda-nvrtc{cuda_pkg_suffix}",
f"nvidia-nvjitlink{cuda_pkg_suffix}",
]
for package in packages:
directory_name = "nvidia" if package.startswith("nvidia-") else None
Expand Down
79 changes: 79 additions & 0 deletions onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# pylint: disable=C0114,C0115,C0116,W0212
import unittest

import onnxruntime


class TestGetNvidiaDllPaths(unittest.TestCase):
    """Unit tests for the private _get_nvidia_dll_paths helper that locates CUDA/cuDNN
    libraries inside the NVIDIA site-packages folders.

    NVIDIA restructured the CUDA Python wheels starting with CUDA 13: the per-component
    packages (cublas, cufft, cuda_runtime, ...) were consolidated into a single
    "nvidia/cu{major}" tree. These tests pin down the expected relative paths for the
    old (CUDA 12) and new (CUDA 13) layouts on both Windows and Linux.
    """

    def _paths(self, **kwargs):
        """Call the helper under test, forwarding all keyword arguments unchanged."""
        return onnxruntime._get_nvidia_dll_paths(**kwargs)

    def _assert_contains_all(self, paths, expected):
        """Assert that every tuple in `expected` is present in `paths`.

        Each entry is checked with its own assertIn so a failure message names the
        exact missing path.
        """
        for entry in expected:
            self.assertIn(entry, paths)

    # ---- CUDA 12 (legacy per-component layout) --------------------------------------
    def test_cuda12_windows(self):
        paths = self._paths(is_windows=True, build_cuda_version="12.4", cudnn=False)
        self._assert_contains_all(
            paths,
            [
                ("nvidia", "cublas", "bin", "cublasLt64_12.dll"),
                ("nvidia", "cublas", "bin", "cublas64_12.dll"),
                ("nvidia", "cufft", "bin", "cufft64_11.dll"),
                ("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"),
            ],
        )

    def test_cuda12_linux(self):
        paths = self._paths(is_windows=False, build_cuda_version="12.4", cudnn=False)
        self._assert_contains_all(
            paths,
            [
                ("nvidia", "cublas", "lib", "libcublasLt.so.12"),
                ("nvidia", "cublas", "lib", "libcublas.so.12"),
                ("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"),
                ("nvidia", "curand", "lib", "libcurand.so.10"),
                ("nvidia", "cufft", "lib", "libcufft.so.11"),
                ("nvidia", "cuda_runtime", "lib", "libcudart.so.12"),
            ],
        )

    # ---- CUDA 13 (consolidated "cu13" layout) ---------------------------------------
    def test_cuda13_windows_x86_64(self):
        paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="x86_64")
        self._assert_contains_all(
            paths,
            [
                ("nvidia", "cu13", "bin", "x86_64", "cublasLt64_13.dll"),
                ("nvidia", "cu13", "bin", "x86_64", "cublas64_13.dll"),
                ("nvidia", "cu13", "bin", "x86_64", "cufft64_12.dll"),
                ("nvidia", "cu13", "bin", "x86_64", "cudart64_13.dll"),
            ],
        )

    def test_cuda13_windows_arch_override(self):
        paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="arm64")
        self.assertIn(("nvidia", "cu13", "bin", "arm64", "cudart64_13.dll"), paths)

    def test_cuda13_linux_is_flat(self):
        paths = self._paths(is_windows=False, build_cuda_version="13.2", cudnn=False)
        # Linux consolidated layout has no architecture sub-folder (flat "lib").
        self._assert_contains_all(
            paths,
            [
                ("nvidia", "cu13", "lib", "libcublasLt.so.13"),
                ("nvidia", "cu13", "lib", "libcublas.so.13"),
                ("nvidia", "cu13", "lib", "libnvrtc.so.13"),
                ("nvidia", "cu13", "lib", "libcurand.so.10"),
                ("nvidia", "cu13", "lib", "libcufft.so.12"),
                ("nvidia", "cu13", "lib", "libcudart.so.13"),
            ],
        )

    # ---- cuDNN keeps its own package/layout in both schemes -------------------------
    def test_cudnn_layout_unchanged(self):
        for build_cuda_version in ("12.4", "13.2"):
            # subTest reports which CUDA version failed and lets the other one still run.
            with self.subTest(build_cuda_version=build_cuda_version):
                win = self._paths(is_windows=True, build_cuda_version=build_cuda_version, cuda=False)
                self.assertIn(("nvidia", "cudnn", "bin", "cudnn64_9.dll"), win)

                linux = self._paths(is_windows=False, build_cuda_version=build_cuda_version, cuda=False)
                self.assertEqual(linux, [("nvidia", "cudnn", "lib", "libcudnn.so.9")])

    # ---- toggles --------------------------------------------------------------------
    def test_cuda_and_cudnn_toggles(self):
        self.assertEqual(self._paths(is_windows=False, build_cuda_version="13.2", cuda=False, cudnn=False), [])

        cuda_only = self._paths(is_windows=False, build_cuda_version="13.2", cuda=True, cudnn=False)
        # Guard against a vacuous pass: all() over an empty list is True, so an
        # implementation that returned no CUDA paths would otherwise go unnoticed.
        self.assertTrue(cuda_only, "expected at least one CUDA library path when cuda=True")
        self.assertTrue(all(p[1] == "cu13" for p in cuda_only))

if __name__ == "__main__":
unittest.main()
17 changes: 12 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,9 @@ def reformat_run_count(count_str):
# Adding CUDA Runtime as dependency for NV TensorRT RTX python wheel
if package_name == "onnxruntime-trt-rtx":
major = cuda_major_version or "12" # Default to CUDA 12
install_requires.append(f"nvidia-cuda-runtime-cu{major}~={major}.0")
# CUDA 13 dropped the "-cuNN" suffix from the CUDA Runtime package name.
runtime_pkg = "nvidia-cuda-runtime" if int(major) >= 13 else f"nvidia-cuda-runtime-cu{major}"
install_requires.append(f"{runtime_pkg}~={major}.0")


def save_build_and_package_info(package_name, version_number, cuda_version, qnn_version):
Expand Down Expand Up @@ -862,13 +864,18 @@ def save_build_and_package_info(package_name, version_number, cuda_version, qnn_
if package_name == "onnxruntime-gpu" and cuda_major_version:
# Determine cufft version: CUDA 13 uses cufft 12, CUDA 12 uses cufft 11
cufft_version = "12.0" if cuda_major_version == "13" else "11.0"
Comment thread
tianleiwu marked this conversation as resolved.

# Starting with CUDA 13, NVIDIA renamed the per-component CUDA Toolkit packages by
# dropping the "-cuNN" suffix (e.g. "nvidia-cuda-runtime-cu12" -> "nvidia-cuda-runtime").
# cuDNN keeps the suffixed package name ("nvidia-cudnn-cu13").
cuda_pkg_suffix = "" if int(cuda_major_version) >= 13 else f"-cu{cuda_major_version}"
extras_require.update(
{
"cuda": [
f"nvidia-cuda-nvrtc-cu{cuda_major_version}~={cuda_major_version}.0",
f"nvidia-cuda-runtime-cu{cuda_major_version}~={cuda_major_version}.0",
f"nvidia-cufft-cu{cuda_major_version}~={cufft_version}",
f"nvidia-curand-cu{cuda_major_version}~=10.0",
f"nvidia-cuda-nvrtc{cuda_pkg_suffix}~={cuda_major_version}.0",
f"nvidia-cuda-runtime{cuda_pkg_suffix}~={cuda_major_version}.0",
f"nvidia-cufft{cuda_pkg_suffix}~={cufft_version}",
f"nvidia-curand{cuda_pkg_suffix}~=10.0",
],
"cudnn": [
f"nvidia-cudnn-cu{cuda_major_version}~=9.0",
Expand Down
Loading