JuliaGPU
diff --git a/lib/cublas/CUBLAS.jl b/lib/cublas/CUBLAS.jl
Lines changed: 1 addition & 3 deletions
diff --git a/lib/cudadrv/context.jl b/lib/cudadrv/context.jl
Lines changed: 3 additions & 17 deletions
diff --git a/lib/cudadrv/devices.jl b/lib/cudadrv/devices.jl
Lines changed: 1 addition & 5 deletions
diff --git a/lib/cudadrv/graph.jl b/lib/cudadrv/graph.jl
Lines changed: 1 addition & 19 deletions
diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl
Lines changed: 2 additions & 10 deletions
diff --git a/lib/cupti/wrappers.jl b/lib/cupti/wrappers.jl
Lines changed: 4 additions & 34 deletions
diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
Lines changed: 7 additions & 37 deletions
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
Lines changed: 4 additions & 4 deletions
diff --git a/src/initialization.jl b/src/initialization.jl
Lines changed: 4 additions & 9 deletions
diff --git a/src/profile.jl b/src/profile.jl
Lines changed: 1 addition & 4 deletions
@@ -43,9 +43,7 @@ function math_mode!(handle, mode)
     flags = 0
 
     # https://github.com/facebookresearch/faiss/issues/1385
-    if version() > v"11"
-        flags = CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION
-    end
+    flags = CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION
 
     flags |= if mode == CUDA.PEDANTIC_MATH
         # prevent use of tensor cores
 
@@ -214,12 +214,7 @@ Lower the refcount of a context, possibly freeing up all resources associated wi
 does not respect any users of the context, and might make other objects unusable.
 """
 function unsafe_release!(pctx::CuPrimaryContext)
-    if driver_version() >= v"11"
-        cuDevicePrimaryCtxRelease_v2(pctx.dev)
-    else
-        cuDevicePrimaryCtxRelease(pctx.dev)
-    end
-
+    cuDevicePrimaryCtxRelease_v2(pctx.dev)
     return
 end
 
@@ -231,12 +226,7 @@ in the current process. Note that this forcibly invalidates all contexts derived
 primary context, and as a result outstanding resources might become invalid.
 """
 function unsafe_reset!(pctx::CuPrimaryContext)
-    if driver_version() >= v"11"
-        cuDevicePrimaryCtxReset_v2(pctx.dev)
-    else
-        cuDevicePrimaryCtxReset(pctx.dev)
-    end
-
+    cuDevicePrimaryCtxReset_v2(pctx.dev)
     return
 end
 
@@ -267,11 +257,7 @@ flags(pctx::CuPrimaryContext) = state(pctx)[1]
 Set the flags of a primary context.
 """
 function setflags!(pctx::CuPrimaryContext, flags)
-    if driver_version() >= v"11"
-        cuDevicePrimaryCtxSetFlags_v2(pctx.dev, flags)
-    else
-        cuDevicePrimaryCtxSetFlags(pctx.dev, flags)
-    end
+    cuDevicePrimaryCtxSetFlags_v2(pctx.dev, flags)
 end
 
 
 
@@ -80,8 +80,6 @@ corresponding to the device ID as known to CUDA.
 deviceid(dev::CuDevice) = Int(convert(CUdevice, dev))
 
 function uuid(dev::CuDevice)
-    driver_version() < v"11.4" && return parent_uuid(dev)
-
     # returns the MIG UUID if this is a compute instance
     uuid_ref = Ref{CUuuid}()
     cuDeviceGetUuid_v2(uuid_ref, dev)
@@ -186,9 +184,7 @@ function capability(dev::CuDevice)
                          attribute(dev, DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR))
 end
 
-memory_pools_supported(dev::CuDevice) =
-    CUDA.driver_version() >= v"11.2" &&
-    attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1
+memory_pools_supported(dev::CuDevice) = attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1
 @deprecate has_stream_ordered(dev::CuDevice) memory_pools_supported(dev)
 
 unified_addressing(dev::CuDevice) =
 
@@ -90,25 +90,7 @@ mutable struct CuGraphExec
     global function instantiate(graph::CuGraph, flags=0)
         handle_ref = Ref{CUgraphExec}()
 
-        if driver_version() >= v"12.0"
-            cuGraphInstantiateWithFlags(handle_ref, graph, flags)
-        else
-            flags == 0 || error("Flags are not supported on CUDA < 12.0")
-
-            error_node = Ref{CUgraphNode}()
-            buflen = 256
-            buf = Vector{UInt8}(undef, buflen)
-
-            GC.@preserve buf begin
-                if driver_version() >= v"11.0"
-                    cuGraphInstantiate_v2(handle_ref, graph, error_node, pointer(buf), buflen)
-                else
-                    cuGraphInstantiate(handle_ref, graph, error_node, pointer(buf), buflen)
-                end
-                diag = String(buf)
-                # TODO: how to use these?
-            end
-        end
+        cuGraphInstantiateWithFlags(handle_ref, graph, flags)
 
         ctx = current_context()
         obj = new(handle_ref[], graph, ctx)
 
@@ -565,16 +565,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
     srcPos = CuDim3(srcPos)
     dstPos = CuDim3(dstPos)
 
-    # JuliaGPU/CUDA.jl#863: cuMemcpy3DAsync calculates wrong offset
-    #                       when using the stream-ordered memory allocator
-    # NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
-    if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
-        srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
-        dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
-    else
-        srcOffset = 0
-        dstOffset = 0
-    end
+    srcOffset = 0
+    dstOffset = 0
 
     srcMemoryType, srcHost, srcDevice, srcArray = if srcTyp == HostMemory
         CU_MEMORYTYPE_HOST,
 
@@ -282,46 +282,16 @@ function process(f, cfg::ActivityConfig)
     cuda_version = CUDA.runtime_version()
     ## kernel activities
     activity_types[CUPTI_ACTIVITY_KIND_KERNEL] =
-        if cuda_version >= v"12.0"
-            CUpti_ActivityKernel9
-        elseif cuda_version >= v"11.8"
-            CUpti_ActivityKernel8
-        elseif cuda_version >= v"11.6"
-            CUpti_ActivityKernel7
-        elseif cuda_version >= v"11.2"
-            CUpti_ActivityKernel6
-        elseif cuda_version >= v"11.1"
-            CUpti_ActivityKernel5
-        else # v"11.0"
-            CUpti_ActivityKernel4
-        end
+        CUpti_ActivityKernel9
     activity_types[CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL] =
         activity_types[CUPTI_ACTIVITY_KIND_KERNEL]
     ## memcpy activities
     activity_types[CUPTI_ACTIVITY_KIND_MEMCPY] =
-        if cuda_version >= v"11.6"
-            CUpti_ActivityMemcpy5
-        elseif cuda_version >= v"11.1"
-            CUpti_ActivityMemcpy4
-        else # v"11.0"
-            CUpti_ActivityMemcpy3
-        end
+        CUpti_ActivityMemcpy5
     activity_types[CUPTI_ACTIVITY_KIND_MEMSET] =
-        if cuda_version >= v"11.6"
-            CUpti_ActivityMemset4
-        elseif cuda_version >= v"11.1"
-            CUpti_ActivityMemset3
-        else # v"11.0"
-            CUpti_ActivityMemset2
-        end
+        CUpti_ActivityMemset4
     activity_types[CUPTI_ACTIVITY_KIND_MEMORY2] =
-        if cuda_version >= v"11.6"
-            CUpti_ActivityMemory3
-        elseif cuda_version >= v"11.2"
-            CUpti_ActivityMemory2
-        else # v"9.0"
-            CUpti_ActivityMemory
-        end
+        CUpti_ActivityMemory3
 
     # extract typed activity records
     for (ctx_handle, stream_id, buf_ptr, sz, valid_sz) in cfg.results
 
@@ -158,12 +158,7 @@ function mv!(transa::SparseChar, alpha::Number, A::Union{CuSparseMatrixCSC{TA},C
     # Support transa = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
 
-    if CUSPARSE.version() < v"12.0" && isa(A, CuSparseMatrixCSC) && transa == 'C' && TA <: Complex
-        throw(ArgumentError("Matrix-vector multiplication with the adjoint of a complex CSC matrix" *
-                            " is not supported by the current CUDA version. Use a CSR or COO matrix instead."))
-    end
-
-    if CUSPARSE.version() < v"12.0" && isa(A, CuSparseMatrixCSC)
+    if isa(A, CuSparseMatrixCSC)
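         # cusparseSpMV completely supports CSC matrices with CUSPARSE.version() ≥ v"12.0".
         # We use Aᵀ to model them as CSR matrices; this was only needed for older versions of CUSPARSE.
         # NOTE(review): with the version check removed from the condition above, this
         #               workaround now runs for every CSC matrix, and the guard that rejected
         #               the adjoint of a complex CSC matrix was removed in the same hunk.
         #               The mm! hunk of this change drops its equivalent branch entirely —
         #               confirm whether this branch should be removed here as well.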
         descA = CuSparseMatrixDescriptor(A, index, transposed=true)
@@ -186,9 +181,9 @@ function mv!(transa::SparseChar, alpha::Number, A::Union{CuSparseMatrixCSC{TA},C
     # operations with 16-bit numbers always imply mixed-precision computation
     # TODO: we should better model the supported combinations here,
     #       and error if using an unsupported one (like with gemmEx!)
-    compute_type = if version() >= v"11.4" && T == Float16
+    compute_type = if T == Float16
         Float32
-    elseif version() >= v"11.7.2" && T == ComplexF16
+    elseif T == ComplexF16
         ComplexF32
     else
         T
@@ -216,21 +211,8 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseM
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
 
-    if CUSPARSE.version() < v"12.0" && isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex
-        throw(ArgumentError("Matrix-matrix multiplication with the adjoint of a complex CSC matrix" *
-                            " is not supported by the current CUDA version. Use a CSR or COO matrix instead."))
-    end
-
-    if CUSPARSE.version() < v"12.0" && isa(A, CuSparseMatrixCSC)
-        # cusparseSpMM completely supports CSC matrices with CUSPARSE.version() ≥ v"12.0".
-        # We use Aᵀ to model them as CSR matrices for older versions of CUSPARSE.
-        descA = CuSparseMatrixDescriptor(A, index, transposed=true)
-        k,m = size(A)
-        transa = transa == 'N' ? 'T' : 'N'
-    else
-        descA = CuSparseMatrixDescriptor(A, index)
-        m,k = size(A)
-    end
+    descA = CuSparseMatrixDescriptor(A, index)
+    m,k = size(A)
     n = size(C)[2]
 
     if transa == 'N' && transb == 'N'
@@ -288,10 +270,6 @@ end
 function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseArrayCSR{T,Ti,3},
               B::DenseCuArray{T,3}, beta::Number, C::DenseCuArray{T,3}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T,Ti}
 
-    if CUSPARSE.version() < v"11.7.2"
-        throw(ErrorException("Batched dense-matrix times batched sparse-matrix (bmm!) requires a CUSPARSE version ≥ 11.7.2 (yours: $(CUSPARSE.version()))."))
-    end
-
     # Support transa = 'C' and transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -341,7 +319,7 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
     end
     with_workspace(bufferSize) do buffer
         # We should find a way to reuse the buffer (issue #1362)
-        if !(A isa CuSparseMatrixCOO) && (CUSPARSE.version() ≥ v"11.7.2")
+        if !(A isa CuSparseMatrixCOO)
             cusparseSpMM_preprocess(
                 handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
                 descC, T, algo, buffer)
@@ -357,8 +335,6 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
              B::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}}, beta::Number,
              C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
 
-    CUSPARSE.version() < v"11.7.4" && throw(ErrorException("This operation is not supported by the current CUDA version."))
-
     # Support transa = 'C' and transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -369,11 +345,6 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
     # Cc = α * Ac * Bᴴ + β * Cc → α * B̅  * Ar + β * Cr
     # where B is a sparse matrix, Ac and Cc indicate column-major layout, while Ar and Cr refer to row-major layout.
 
-    if CUSPARSE.version() < v"12.0" && isa(B, CuSparseMatrixCSR) && transb == 'C' && T <: Complex
-        throw(ArgumentError("Matrix-matrix multiplication with the adjoint of a complex CSR matrix" *
-                            " is not supported by the current CUDA version. Use a CSC or COO matrix instead."))
-    end
-
     m,k = size(A)
     n = size(C)[2]
 
@@ -402,7 +373,7 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
     end
     with_workspace(bufferSize) do buffer
         # We should find a way to reuse the buffer (issue #1362)
-        if !(B isa CuSparseMatrixCOO) && (CUSPARSE.version() ≥ v"11.7.2")
+        if !(B isa CuSparseMatrixCOO)
             cusparseSpMM_preprocess(
                 handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
                 descC, T, algo, buffer)
@@ -824,7 +795,6 @@ end
 function sddmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T}, B::DenseCuMatrix{T},
                 beta::Number, C::Union{CuSparseMatrixCSR{T},CuSparseMatrixBSR{T}}, index::SparseChar, algo::cusparseSDDMMAlg_t=CUSPARSE_SDDMM_ALG_DEFAULT) where {T}
 
-    CUSPARSE.version() < v"11.4.1" && throw(ErrorException("This operation is not supported by the current CUDA version."))
     (C isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.1.0") && throw(ErrorException("This operation is not supported by the current CUDA version."))
 
     # Support transa = 'C' and transb = 'C' for real matrices
 
@@ -45,9 +45,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
     end
 
     # NVIDIA bug #3964667: CUPTI in CUDA 11.7+ broken for sm_35 devices
-    if runtime_version() >= v"11.7" && capability(device()) <= v"3.7"
+    if capability(device()) <= v"3.7"
         @error """SASS code generation is not supported on this device.
-                  Please downgrade to CUDA 11.6 or lower, or use a more recent device."""
+                  Please use a more recent device."""
         return
     end
 
@@ -82,9 +82,9 @@ end
 
 function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
     # NVIDIA bug #3964667: CUPTI in CUDA 11.7+ broken for sm_35 devices
-    if runtime_version() >= v"11.7" && capability(device()) <= v"3.7"
+    if capability(device()) <= v"3.7"
         @error """SASS code generation is not supported on this device.
-                  Please downgrade to CUDA 11.6 or lower, or use a more recent device."""
+                  Please use a more recent device."""
         return
     end
 
 
@@ -66,17 +66,12 @@ function __init__()
         return
     end
 
-    if !(v"11" <= driver < v"14-")
-        @error "This version of CUDA.jl only supports NVIDIA drivers for CUDA 11.x, 12.x or 13.x (yours is for CUDA $driver)"
+    if !(v"12" <= driver < v"14-")
+        @error "This version of CUDA.jl only supports NVIDIA drivers for CUDA 12.x or 13.x (yours is for CUDA $driver)"
         _initialization_error[] = "CUDA driver unsupported"
         return
     end
 
-    if driver < v"11.3"
-        @warn """The NVIDIA driver on this system only supports up to CUDA $driver.
-                 For performance reasons, it is recommended to upgrade to a driver that supports CUDA 11.3 or higher."""
-    end
-
     # check that we have a runtime
     if !CUDA_Runtime.is_available()
         # try to find out why
@@ -135,8 +130,8 @@ function __init__()
     end
 
     # ensure the loaded runtime is supported
-    if runtime < v"10.2"
-        @error "This version of CUDA.jl only supports CUDA 11 or higher (your toolkit provides CUDA $runtime)"
+    if runtime < v"12.0"
+        @error "This version of CUDA.jl only supports CUDA 12 or higher (your toolkit provides CUDA $runtime)"
     end
     if runtime.major > driver.major
         @warn """You are using CUDA $runtime with a driver that only supports up to $(driver.major).x.
 
@@ -23,9 +23,6 @@ slowest 25%, while entries colored in red are among the slowest 5% of all operat
 
 !!! compat "Julia 1.9"
     This functionality is only available on Julia 1.9 and later.
 
-!!! compat "CUDA 11.2" Older versions of CUDA, before 11.2, contain bugs that may prevent
-    the `CUDA.@profile` macro to work. It is recommended to use a newer runtime.
-
 ## External profilers (`external=true`, when an external profiler is detected)
 
 For more advanced profiling, it is possible to use an external profiling tool, such as
@@ -495,7 +492,7 @@ function capture(cfg)
                                    size=record.bytes); cols=:union)
 
         # memory allocations
-        elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2 && cuda_version >= v"11.2"
+        elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2
             # XXX: we'd prefer to postpone processing (i.e. calling format_bytes),
             #      but cannot realistically add a column for every API call