diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index f692f1f5e0a57..2aa31276cc395 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -208,7 +208,7 @@ target_compile_definitions(onnxruntime_providers_cuda PRIVATE FILE_NAME=\"onnxruntime_providers_cuda.dll\") endif() - # Work around a CUDA 13.x cudafe++ (EDG front-end) regression that mis-parses CCCL's + # Work around a CUDA 13.3 cudafe++ (EDG front-end) regression that mis-parses CCCL's # global-qualified partial specializations, e.g. in : # template # struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {}; @@ -218,7 +218,7 @@ # corrected copies of the affected headers into the build tree and place that directory # ahead of the toolkit cccl include path. This is a no-op on toolkits whose headers do not # contain the offending pattern (e.g. once NVIDIA fixes it), so it is safe to keep enabled. - function(ort_cuda13_patch_cccl_header src dst) + function(ort_cuda133_patch_cccl_header src dst) if (NOT EXISTS "${src}") return() endif() @@ -412,19 +412,21 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) foreach(inc_dir ${CUDAToolkit_INCLUDE_DIRS}) if (EXISTS "${inc_dir}/cccl") - # Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified - # partial specializations (see ort_cuda13_patch_cccl_header above) and put the fixed - # directory ahead of the toolkit cccl include so the corrected headers win. 
- set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix") - ort_cuda13_patch_cccl_header( - "${inc_dir}/cccl/cub/device/device_transform.cuh" - "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh") - ort_cuda13_patch_cccl_header( - "${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh" - "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh") - if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR - EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh") - target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}") + if (UNIX AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.3 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.4) + # Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified + # partial specializations (see ort_cuda133_patch_cccl_header above) and put the fixed + # directory ahead of the toolkit cccl include so the corrected headers win. + set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix") + ort_cuda133_patch_cccl_header( + "${inc_dir}/cccl/cub/device/device_transform.cuh" + "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh") + ort_cuda133_patch_cccl_header( + "${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh" + "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh") + if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR + EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh") + target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}") + endif() endif() # Add the cccl subdirectory to the include path so can be found diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index df14bc8c57f24..0a06156fe78d8 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -150,25 +150,57 @@ def _extract_cuda_major_version(version_str: str) -> str: return version_str.split(".", maxsplit=1)[0] if version_str 
else "12" -def _get_cufft_version(cuda_major: str) -> str: +def _get_cufft_version(cuda_major_version: str) -> str: """Get cufft library version based on CUDA major version. Args: - cuda_major: CUDA major version as string (e.g., "12", "13") + cuda_major_version: CUDA major version as string (e.g., "12", "13") Returns: cufft version as string """ # cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12 - return "12" if cuda_major == "13" else "11" + return "12" if int(cuda_major_version) >= 13 else "11" def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True): - # Dynamically determine CUDA major version from build info + # Dynamically determine CUDA major version from build info. + # build_cuda_version defaults to the version this package was built with; it is a parameter for testability. cuda_major_version = _extract_cuda_major_version(cuda_version) cufft_version = _get_cufft_version(cuda_major_version) - if is_windows: + # Starting with CUDA 13, NVIDIA consolidated the per-component CUDA Toolkit wheels + # (cublas, cufft, cuda_runtime, cuda_nvrtc, curand, ...) into a single "nvidia/cu{major}" + # package and dropped the "-cuNN" suffix from those package names. On Windows the DLLs + # moved into an architecture sub-folder ("bin/<arch>", e.g. "bin/x86_64"); on Linux the + # libraries are placed directly in "lib" (the wheel itself is architecture specific, so + # there is no arch sub-folder). cuDNN keeps its own "nvidia/cudnn" package and layout. 
+ use_consolidated_layout = cuda_major_version.isdigit() and int(cuda_major_version) >= 13 + + if use_consolidated_layout: + cuda_dir = f"cu{cuda_major_version}" + if is_windows: + import platform # noqa: PLC0415 + + arch = "arm64" if platform.machine().lower() in ("arm64", "aarch64") else "x86_64" + cuda_dll_paths = [ + ("nvidia", cuda_dir, "bin", arch, f"cublasLt64_{cuda_major_version}.dll"), + ("nvidia", cuda_dir, "bin", arch, f"cublas64_{cuda_major_version}.dll"), + ("nvidia", cuda_dir, "bin", arch, f"cufft64_{cufft_version}.dll"), + ("nvidia", cuda_dir, "bin", arch, f"cudart64_{cuda_major_version}.dll"), + ] + else: # Linux + # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first. + cuda_dll_paths = [ + ("nvidia", cuda_dir, "lib", f"libcublasLt.so.{cuda_major_version}"), + ("nvidia", cuda_dir, "lib", f"libcublas.so.{cuda_major_version}"), + ("nvidia", cuda_dir, "lib", f"libnvrtc.so.{cuda_major_version}"), + ("nvidia", cuda_dir, "lib", "libcurand.so.10"), + ("nvidia", cuda_dir, "lib", f"libcufft.so.{cufft_version}"), + ("nvidia", cuda_dir, "lib", f"libcudart.so.{cuda_major_version}"), + ] + elif is_windows: + # CUDA 12 and earlier: each component ships its own "nvidia/<component>" package. # Path is relative to site-packages directory. 
cuda_dll_paths = [ ("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"), @@ -176,16 +208,6 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru ("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"), ("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"), ] - cudnn_dll_paths = [ - ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn64_9.dll"), - ("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"), - ] else: # Linux # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first. cuda_dll_paths = [ @@ -197,6 +219,19 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru ("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"), ] + # cuDNN keeps its own "nvidia/cudnn" package layout in both old and consolidated schemes. + if is_windows: + cudnn_dll_paths = [ + ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn64_9.dll"), + ("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"), + ] + else: # Linux # Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux. 
cudnn_dll_paths = [ ("nvidia", "cudnn", "lib", "libcudnn.so.9"), @@ -238,15 +273,19 @@ def print_debug_info(): # Print version of installed packages that is related to CUDA or cuDNN DLLs. cuda_major = _extract_cuda_major_version(cuda_version) + # Starting with CUDA 13, NVIDIA dropped the "-cuNN" suffix from the per-component + # CUDA Toolkit packages (cuDNN keeps its suffixed package name). + cuda_pkg_suffix = "" if (cuda_major.isdigit() and int(cuda_major) >= 13) else f"-cu{cuda_major}" + packages = [ "torch", - f"nvidia-cuda-runtime-cu{cuda_major}", + f"nvidia-cuda-runtime{cuda_pkg_suffix}", f"nvidia-cudnn-cu{cuda_major}", - f"nvidia-cublas-cu{cuda_major}", - f"nvidia-cufft-cu{cuda_major}", - f"nvidia-curand-cu{cuda_major}", - f"nvidia-cuda-nvrtc-cu{cuda_major}", - f"nvidia-nvjitlink-cu{cuda_major}", + f"nvidia-cublas{cuda_pkg_suffix}", + f"nvidia-cufft{cuda_pkg_suffix}", + f"nvidia-curand{cuda_pkg_suffix}", + f"nvidia-cuda-nvrtc{cuda_pkg_suffix}", + f"nvidia-nvjitlink{cuda_pkg_suffix}", ] for package in packages: directory_name = "nvidia" if package.startswith("nvidia-") else None diff --git a/onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py b/onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py new file mode 100644 index 0000000000000..a8ce794f5fdd3 --- /dev/null +++ b/onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py @@ -0,0 +1,79 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# pylint: disable=C0114,C0115,C0116,W0212 +import unittest + +import onnxruntime + + +class TestGetNvidiaDllPaths(unittest.TestCase): + """Unit tests for the private _get_nvidia_dll_paths helper that locates CUDA/cuDNN + libraries inside the NVIDIA site-packages folders. + + NVIDIA restructured the CUDA Python wheels starting with CUDA 13: the per-component + packages (cublas, cufft, cuda_runtime, ...) were consolidated into a single + "nvidia/cu{major}" tree. 
These tests pin down the expected relative paths for the + old (CUDA 12) and new (CUDA 13) layouts on both Windows and Linux. + """ + + def _paths(self, **kwargs): + return onnxruntime._get_nvidia_dll_paths(**kwargs) + + # ---- CUDA 12 (legacy per-component layout) -------------------------------------- + def test_cuda12_windows(self): + paths = self._paths(is_windows=True, build_cuda_version="12.4", cudnn=False) + self.assertIn(("nvidia", "cublas", "bin", "cublasLt64_12.dll"), paths) + self.assertIn(("nvidia", "cublas", "bin", "cublas64_12.dll"), paths) + self.assertIn(("nvidia", "cufft", "bin", "cufft64_11.dll"), paths) + self.assertIn(("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"), paths) + + def test_cuda12_linux(self): + paths = self._paths(is_windows=False, build_cuda_version="12.4", cudnn=False) + self.assertIn(("nvidia", "cublas", "lib", "libcublasLt.so.12"), paths) + self.assertIn(("nvidia", "cublas", "lib", "libcublas.so.12"), paths) + self.assertIn(("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"), paths) + self.assertIn(("nvidia", "curand", "lib", "libcurand.so.10"), paths) + self.assertIn(("nvidia", "cufft", "lib", "libcufft.so.11"), paths) + self.assertIn(("nvidia", "cuda_runtime", "lib", "libcudart.so.12"), paths) + + # ---- CUDA 13 (consolidated "cu13" layout) --------------------------------------- + def test_cuda13_windows_x86_64(self): + paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="x86_64") + self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cublasLt64_13.dll"), paths) + self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cublas64_13.dll"), paths) + self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cufft64_12.dll"), paths) + self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cudart64_13.dll"), paths) + + def test_cuda13_windows_arch_override(self): + paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="arm64") + self.assertIn(("nvidia", "cu13", "bin", "arm64", 
"cudart64_13.dll"), paths) + + def test_cuda13_linux_is_flat(self): + paths = self._paths(is_windows=False, build_cuda_version="13.2", cudnn=False) + # Linux consolidated layout has no architecture sub-folder (flat "lib"). + self.assertIn(("nvidia", "cu13", "lib", "libcublasLt.so.13"), paths) + self.assertIn(("nvidia", "cu13", "lib", "libcublas.so.13"), paths) + self.assertIn(("nvidia", "cu13", "lib", "libnvrtc.so.13"), paths) + self.assertIn(("nvidia", "cu13", "lib", "libcurand.so.10"), paths) + self.assertIn(("nvidia", "cu13", "lib", "libcufft.so.12"), paths) + self.assertIn(("nvidia", "cu13", "lib", "libcudart.so.13"), paths) + + # ---- cuDNN keeps its own package/layout in both schemes ------------------------- + def test_cudnn_layout_unchanged(self): + for build_cuda_version in ("12.4", "13.2"): + win = self._paths(is_windows=True, build_cuda_version=build_cuda_version, cuda=False) + self.assertIn(("nvidia", "cudnn", "bin", "cudnn64_9.dll"), win) + + linux = self._paths(is_windows=False, build_cuda_version=build_cuda_version, cuda=False) + self.assertEqual(linux, [("nvidia", "cudnn", "lib", "libcudnn.so.9")]) + + # ---- toggles -------------------------------------------------------------------- + def test_cuda_and_cudnn_toggles(self): + self.assertEqual(self._paths(is_windows=False, build_cuda_version="13.2", cuda=False, cudnn=False), []) + + cuda_only = self._paths(is_windows=False, build_cuda_version="13.2", cuda=True, cudnn=False) + self.assertTrue(all(p[1] == "cu13" for p in cuda_only)) + + +if __name__ == "__main__": + unittest.main() diff --git a/setup.py b/setup.py index 3b8bb9b81d20a..62ced38819f2c 100644 --- a/setup.py +++ b/setup.py @@ -817,7 +817,9 @@ def reformat_run_count(count_str): # Adding CUDA Runtime as dependency for NV TensorRT RTX python wheel if package_name == "onnxruntime-trt-rtx": major = cuda_major_version or "12" # Default to CUDA 12 - install_requires.append(f"nvidia-cuda-runtime-cu{major}~={major}.0") + # CUDA 13 dropped the 
"-cuNN" suffix from the CUDA Runtime package name. + runtime_pkg = "nvidia-cuda-runtime" if int(major) >= 13 else f"nvidia-cuda-runtime-cu{major}" + install_requires.append(f"{runtime_pkg}~={major}.0") def save_build_and_package_info(package_name, version_number, cuda_version, qnn_version): @@ -862,13 +864,18 @@ def save_build_and_package_info(package_name, version_number, cuda_version, qnn_ if package_name == "onnxruntime-gpu" and cuda_major_version: # Determine cufft version: CUDA 13 uses cufft 12, CUDA 12 uses cufft 11 cufft_version = "12.0" if cuda_major_version == "13" else "11.0" + + # Starting with CUDA 13, NVIDIA renamed the per-component CUDA Toolkit packages by + # dropping the "-cuNN" suffix (e.g. "nvidia-cuda-runtime-cu12" -> "nvidia-cuda-runtime"). + # cuDNN keeps the suffixed package name ("nvidia-cudnn-cu13"). + cuda_pkg_suffix = "" if int(cuda_major_version) >= 13 else f"-cu{cuda_major_version}" extras_require.update( { "cuda": [ - f"nvidia-cuda-nvrtc-cu{cuda_major_version}~={cuda_major_version}.0", - f"nvidia-cuda-runtime-cu{cuda_major_version}~={cuda_major_version}.0", - f"nvidia-cufft-cu{cuda_major_version}~={cufft_version}", - f"nvidia-curand-cu{cuda_major_version}~=10.0", + f"nvidia-cuda-nvrtc{cuda_pkg_suffix}~={cuda_major_version}.0", + f"nvidia-cuda-runtime{cuda_pkg_suffix}~={cuda_major_version}.0", + f"nvidia-cufft{cuda_pkg_suffix}~={cufft_version}", + f"nvidia-curand{cuda_pkg_suffix}~=10.0", ], "cudnn": [ f"nvidia-cudnn-cu{cuda_major_version}~=9.0",