microsoft · tianleiwu · Jun 26, 2026 · Jun 21, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -208,7 +208,7 @@
     target_compile_definitions(onnxruntime_providers_cuda PRIVATE FILE_NAME=\"onnxruntime_providers_cuda.dll\")
   endif()
 
-  # Work around a CUDA 13.x cudafe++ (EDG front-end) regression that mis-parses CCCL's
+  # Work around a CUDA 13.3 cudafe++ (EDG front-end) regression that mis-parses CCCL's
   # global-qualified partial specializations, e.g. in <cub/device/device_transform.cuh>:
   #   template <typename T>
   #   struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {};
@@ -218,7 +218,7 @@
   # corrected copies of the affected headers into the build tree and place that directory
   # ahead of the toolkit cccl include path. This is a no-op on toolkits whose headers do not
   # contain the offending pattern (e.g. once NVIDIA fixes it), so it is safe to keep enabled.
-  function(ort_cuda13_patch_cccl_header src dst)
+  function(ort_cuda133_patch_cccl_header src dst)
     if (NOT EXISTS "${src}")
       return()
     endif()
@@ -412,19 +412,21 @@
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
       foreach(inc_dir ${CUDAToolkit_INCLUDE_DIRS})
         if (EXISTS "${inc_dir}/cccl")
-          # Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified
-          # partial specializations (see ort_cuda13_patch_cccl_header above) and put the fixed
-          # directory ahead of the toolkit cccl include so the corrected headers win.
-          set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix")
-          ort_cuda13_patch_cccl_header(
-            "${inc_dir}/cccl/cub/device/device_transform.cuh"
-            "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh")
-          ort_cuda13_patch_cccl_header(
-            "${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh"
-            "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
-          if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR
-              EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
-            target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}")
+          if (UNIX AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.3 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.4)
+            # Generate cudafe++-parseable copies of the CCCL headers that contain global-qualified
+            # partial specializations (see ort_cuda133_patch_cccl_header above) and put the fixed
+            # directory ahead of the toolkit cccl include so the corrected headers win.
+            set(_ort_cccl_fix_dir "${CMAKE_CURRENT_BINARY_DIR}/cccl_cuda13_fix")
+            ort_cuda133_patch_cccl_header(
+              "${inc_dir}/cccl/cub/device/device_transform.cuh"
+              "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh")
+            ort_cuda133_patch_cccl_header(
+              "${inc_dir}/cccl/cub/device/dispatch/tuning/tuning_transform.cuh"
+              "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
+            if (EXISTS "${_ort_cccl_fix_dir}/cub/device/device_transform.cuh" OR
+                EXISTS "${_ort_cccl_fix_dir}/cub/device/dispatch/tuning/tuning_transform.cuh")
+              target_include_directories(${target} BEFORE PRIVATE "${_ort_cccl_fix_dir}")
+            endif()
           endif()
 
           # Add the cccl subdirectory to the include path so <cuda/std/utility> can be found

diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py
@@ -150,42 +150,64 @@ def _extract_cuda_major_version(version_str: str) -> str:
     return version_str.split(".", maxsplit=1)[0] if version_str else "12"
 
 
-def _get_cufft_version(cuda_major: str) -> str:
+def _get_cufft_version(cuda_major_version: str) -> str:
     """Get cufft library version based on CUDA major version.
 
     Args:
-        cuda_major: CUDA major version as string (e.g., "12", "13")
+        cuda_major_version: CUDA major version as string (e.g., "12", "13")
 
     Returns:
         cufft version as string
     """
-    # cufft versions: CUDA 12.x -> 11, CUDA 13.x -> 12
-    return "12" if cuda_major == "13" else "11"
+    # cufft versions: CUDA 12.x -> 11, CUDA 13+ -> 12. A non-numeric major falls back to 11 instead of raising.
+    return "12" if cuda_major_version.isdigit() and int(cuda_major_version) >= 13 else "11"
 
 
 def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
-    # Dynamically determine CUDA major version from build info
+    # Dynamically determine CUDA major version from build info.
+    # build_cuda_version defaults to the version this package was built with; it is a parameter for testability.
     cuda_major_version = _extract_cuda_major_version(cuda_version)
     cufft_version = _get_cufft_version(cuda_major_version)
 
-    if is_windows:
+    # Starting with CUDA 13, NVIDIA consolidated the per-component CUDA Toolkit wheels
+    # (cublas, cufft, cuda_runtime, cuda_nvrtc, curand, ...) into a single "nvidia/cu{major}"
+    # package and dropped the "-cuNN" suffix from those package names. On Windows the DLLs
+    # moved into an architecture sub-folder ("bin/<arch>", e.g. "bin/x86_64"); on Linux the
+    # libraries are placed directly in "lib" (the wheel itself is architecture specific, so
+    # there is no arch sub-folder). cuDNN keeps its own "nvidia/cudnn" package and layout.
+    use_consolidated_layout = cuda_major_version.isdigit() and int(cuda_major_version) >= 13
+
+    if use_consolidated_layout:
+        cuda_dir = f"cu{cuda_major_version}"
+        if is_windows:
+            import platform  # noqa: PLC0415
+
+            arch = "arm64" if platform.machine().lower() in ("arm64", "aarch64") else "x86_64"
+            cuda_dll_paths = [
+                ("nvidia", cuda_dir, "bin", arch, f"cublasLt64_{cuda_major_version}.dll"),
+                ("nvidia", cuda_dir, "bin", arch, f"cublas64_{cuda_major_version}.dll"),
+                ("nvidia", cuda_dir, "bin", arch, f"cufft64_{cufft_version}.dll"),
+                ("nvidia", cuda_dir, "bin", arch, f"cudart64_{cuda_major_version}.dll"),
+            ]
+        else:  # Linux
+            # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
+            cuda_dll_paths = [
+                ("nvidia", cuda_dir, "lib", f"libcublasLt.so.{cuda_major_version}"),
+                ("nvidia", cuda_dir, "lib", f"libcublas.so.{cuda_major_version}"),
+                ("nvidia", cuda_dir, "lib", f"libnvrtc.so.{cuda_major_version}"),
+                ("nvidia", cuda_dir, "lib", "libcurand.so.10"),
+                ("nvidia", cuda_dir, "lib", f"libcufft.so.{cufft_version}"),
+                ("nvidia", cuda_dir, "lib", f"libcudart.so.{cuda_major_version}"),
+            ]
+    elif is_windows:
+        # CUDA 12 and earlier: each component ships its own "nvidia/<component>" package.
         # Path is relative to site-packages directory.
         cuda_dll_paths = [
             ("nvidia", "cublas", "bin", f"cublasLt64_{cuda_major_version}.dll"),
             ("nvidia", "cublas", "bin", f"cublas64_{cuda_major_version}.dll"),
             ("nvidia", "cufft", "bin", f"cufft64_{cufft_version}.dll"),
             ("nvidia", "cuda_runtime", "bin", f"cudart64_{cuda_major_version}.dll"),
         ]
-        cudnn_dll_paths = [
-            ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn64_9.dll"),
-            ("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"),
-        ]
     else:  # Linux
         # cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
         cuda_dll_paths = [
@@ -197,6 +219,19 @@ def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = Tru
             ("nvidia", "cuda_runtime", "lib", f"libcudart.so.{cuda_major_version}"),
         ]
 
+    # cuDNN keeps its own "nvidia/cudnn" package layout in both old and consolidated schemes.
+    if is_windows:
+        cudnn_dll_paths = [
+            ("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn64_9.dll"),
+            ("nvidia", "cudnn", "bin", "cudnn_engines_tensor_ir64_9.dll"),
+        ]
+    else:  # Linux
         # Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
         cudnn_dll_paths = [
             ("nvidia", "cudnn", "lib", "libcudnn.so.9"),
@@ -238,15 +273,19 @@ def print_debug_info():
         # Print version of installed packages that is related to CUDA or cuDNN DLLs.
         cuda_major = _extract_cuda_major_version(cuda_version)
 
+        # Starting with CUDA 13, NVIDIA dropped the "-cuNN" suffix from the per-component
+        # CUDA Toolkit packages (cuDNN keeps its suffixed package name).
+        cuda_pkg_suffix = "" if (cuda_major.isdigit() and int(cuda_major) >= 13) else f"-cu{cuda_major}"
+
         packages = [
             "torch",
-            f"nvidia-cuda-runtime-cu{cuda_major}",
+            f"nvidia-cuda-runtime{cuda_pkg_suffix}",
             f"nvidia-cudnn-cu{cuda_major}",
-            f"nvidia-cublas-cu{cuda_major}",
-            f"nvidia-cufft-cu{cuda_major}",
-            f"nvidia-curand-cu{cuda_major}",
-            f"nvidia-cuda-nvrtc-cu{cuda_major}",
-            f"nvidia-nvjitlink-cu{cuda_major}",
+            f"nvidia-cublas{cuda_pkg_suffix}",
+            f"nvidia-cufft{cuda_pkg_suffix}",
+            f"nvidia-curand{cuda_pkg_suffix}",
+            f"nvidia-cuda-nvrtc{cuda_pkg_suffix}",
+            f"nvidia-nvjitlink{cuda_pkg_suffix}",
         ]
         for package in packages:
             directory_name = "nvidia" if package.startswith("nvidia-") else None

diff --git a/onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py b/onnxruntime/test/python/onnxruntime_test_python_preload_dlls.py
@@ -0,0 +1,79 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# pylint: disable=C0114,C0115,C0116,W0212
+import unittest
+
+import onnxruntime
+
+
+class TestGetNvidiaDllPaths(unittest.TestCase):
+    """Unit tests for the private _get_nvidia_dll_paths helper that locates CUDA/cuDNN
+    libraries inside the NVIDIA site-packages folders.
+
+    NVIDIA restructured the CUDA Python wheels starting with CUDA 13: the per-component
+    packages (cublas, cufft, cuda_runtime, ...) were consolidated into a single
+    "nvidia/cu{major}" tree. These tests pin down the expected relative paths for the
+    old (CUDA 12) and new (CUDA 13) layouts on both Windows and Linux.
+    """
+
+    def _paths(self, **kwargs):
+        return onnxruntime._get_nvidia_dll_paths(**kwargs)
+
+    # ---- CUDA 12 (legacy per-component layout) --------------------------------------
+    def test_cuda12_windows(self):
+        paths = self._paths(is_windows=True, build_cuda_version="12.4", cudnn=False)
+        self.assertIn(("nvidia", "cublas", "bin", "cublasLt64_12.dll"), paths)
+        self.assertIn(("nvidia", "cublas", "bin", "cublas64_12.dll"), paths)
+        self.assertIn(("nvidia", "cufft", "bin", "cufft64_11.dll"), paths)
+        self.assertIn(("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"), paths)
+
+    def test_cuda12_linux(self):
+        paths = self._paths(is_windows=False, build_cuda_version="12.4", cudnn=False)
+        self.assertIn(("nvidia", "cublas", "lib", "libcublasLt.so.12"), paths)
+        self.assertIn(("nvidia", "cublas", "lib", "libcublas.so.12"), paths)
+        self.assertIn(("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"), paths)
+        self.assertIn(("nvidia", "curand", "lib", "libcurand.so.10"), paths)
+        self.assertIn(("nvidia", "cufft", "lib", "libcufft.so.11"), paths)
+        self.assertIn(("nvidia", "cuda_runtime", "lib", "libcudart.so.12"), paths)
+
+    # ---- CUDA 13 (consolidated "cu13" layout) ---------------------------------------
+    def test_cuda13_windows_x86_64(self):
+        paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="x86_64")
+        self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cublasLt64_13.dll"), paths)
+        self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cublas64_13.dll"), paths)
+        self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cufft64_12.dll"), paths)
+        self.assertIn(("nvidia", "cu13", "bin", "x86_64", "cudart64_13.dll"), paths)
+
+    def test_cuda13_windows_arch_override(self):
+        paths = self._paths(is_windows=True, build_cuda_version="13.2", cudnn=False, arch="arm64")
+        self.assertIn(("nvidia", "cu13", "bin", "arm64", "cudart64_13.dll"), paths)
+
+    def test_cuda13_linux_is_flat(self):
+        paths = self._paths(is_windows=False, build_cuda_version="13.2", cudnn=False)
+        # Linux consolidated layout has no architecture sub-folder (flat "lib").
+        self.assertIn(("nvidia", "cu13", "lib", "libcublasLt.so.13"), paths)
+        self.assertIn(("nvidia", "cu13", "lib", "libcublas.so.13"), paths)
+        self.assertIn(("nvidia", "cu13", "lib", "libnvrtc.so.13"), paths)
+        self.assertIn(("nvidia", "cu13", "lib", "libcurand.so.10"), paths)
+        self.assertIn(("nvidia", "cu13", "lib", "libcufft.so.12"), paths)
+        self.assertIn(("nvidia", "cu13", "lib", "libcudart.so.13"), paths)
+
+    # ---- cuDNN keeps its own package/layout in both schemes -------------------------
+    def test_cudnn_layout_unchanged(self):
+        for build_cuda_version in ("12.4", "13.2"):
+            win = self._paths(is_windows=True, build_cuda_version=build_cuda_version, cuda=False)
+            self.assertIn(("nvidia", "cudnn", "bin", "cudnn64_9.dll"), win)
+
+            linux = self._paths(is_windows=False, build_cuda_version=build_cuda_version, cuda=False)
+            self.assertEqual(linux, [("nvidia", "cudnn", "lib", "libcudnn.so.9")])
+
+    # ---- toggles --------------------------------------------------------------------
+    def test_cuda_and_cudnn_toggles(self):
+        self.assertEqual(self._paths(is_windows=False, build_cuda_version="13.2", cuda=False, cudnn=False), [])
+
+        cuda_only = self._paths(is_windows=False, build_cuda_version="13.2", cuda=True, cudnn=False)
+        self.assertTrue(all(p[1] == "cu13" for p in cuda_only))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/setup.py b/setup.py
@@ -817,7 +817,9 @@ def reformat_run_count(count_str):
 # Adding CUDA Runtime as dependency for NV TensorRT RTX python wheel
 if package_name == "onnxruntime-trt-rtx":
     major = cuda_major_version or "12"  # Default to CUDA 12
-    install_requires.append(f"nvidia-cuda-runtime-cu{major}~={major}.0")
+    # CUDA 13 dropped the "-cuNN" suffix from the CUDA Runtime package name.
+    runtime_pkg = "nvidia-cuda-runtime" if int(major) >= 13 else f"nvidia-cuda-runtime-cu{major}"
+    install_requires.append(f"{runtime_pkg}~={major}.0")
 
 
 def save_build_and_package_info(package_name, version_number, cuda_version, qnn_version):
@@ -862,13 +864,18 @@ def save_build_and_package_info(package_name, version_number, cuda_version, qnn_
 if package_name == "onnxruntime-gpu" and cuda_major_version:
-    # Determine cufft version: CUDA 13 uses cufft 12, CUDA 12 uses cufft 11
-    cufft_version = "12.0" if cuda_major_version == "13" else "11.0"
+    # Determine cufft version: CUDA 13+ uses cufft 12, CUDA 12 uses cufft 11 (same threshold as the suffix below).
+    cufft_version = "12.0" if int(cuda_major_version) >= 13 else "11.0"
+
+    # Starting with CUDA 13, NVIDIA renamed the per-component CUDA Toolkit packages by
+    # dropping the "-cuNN" suffix (e.g. "nvidia-cuda-runtime-cu12" -> "nvidia-cuda-runtime").
+    # cuDNN keeps the suffixed package name ("nvidia-cudnn-cu13").
+    cuda_pkg_suffix = "" if int(cuda_major_version) >= 13 else f"-cu{cuda_major_version}"
     extras_require.update(
         {
             "cuda": [
-                f"nvidia-cuda-nvrtc-cu{cuda_major_version}~={cuda_major_version}.0",
-                f"nvidia-cuda-runtime-cu{cuda_major_version}~={cuda_major_version}.0",
-                f"nvidia-cufft-cu{cuda_major_version}~={cufft_version}",
-                f"nvidia-curand-cu{cuda_major_version}~=10.0",
+                f"nvidia-cuda-nvrtc{cuda_pkg_suffix}~={cuda_major_version}.0",
+                f"nvidia-cuda-runtime{cuda_pkg_suffix}~={cuda_major_version}.0",
+                f"nvidia-cufft{cuda_pkg_suffix}~={cufft_version}",
+                f"nvidia-curand{cuda_pkg_suffix}~=10.0",
             ],
             "cudnn": [
                 f"nvidia-cudnn-cu{cuda_major_version}~=9.0",