Skip to content

Commit aa7f12e

Browse files
Merge pull request #834 from hexaeder/hw/cusparse_defaults
fix default solver selection for CuSparseMatrix
2 parents 49d76ed + 3d0f557 commit aa7f12e

File tree

7 files changed

+93
-9
lines changed

7 files changed

+93
-9
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ ArrayInterface = "7.17"
8686
BandedMatrices = "1.8"
8787
BlockDiagonals = "0.2"
8888
CUDA = "5.5"
89-
CUDSS = "0.4, 0.6.1"
89+
CUDSS = "0.6.3"
9090
CUSOLVERRF = "0.2.6"
9191
ChainRulesCore = "1.25"
9292
CliqueTrees = "1.11.0"

ext/LinearSolveCUDAExt.jl

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@ function LinearSolve.is_cusparse(A::Union{
1717
CUDA.CUSPARSE.CuSparseMatrixCSR, CUDA.CUSPARSE.CuSparseMatrixCSC})
1818
true
1919
end
20+
LinearSolve.is_cusparse_csr(::CUDA.CUSPARSE.CuSparseMatrixCSR) = true
21+
LinearSolve.is_cusparse_csc(::CUDA.CUSPARSE.CuSparseMatrixCSC) = true
2022

2123
function LinearSolve.defaultalg(A::CUDA.CUSPARSE.CuSparseMatrixCSR{Tv, Ti}, b,
2224
assump::OperatorAssumptions{Bool}) where {Tv, Ti}
@@ -31,6 +33,16 @@ function LinearSolve.defaultalg(A::CUDA.CUSPARSE.CuSparseMatrixCSR{Tv, Ti}, b,
3133
end
3234
end
3335

36+
function LinearSolve.defaultalg(A::CUDA.CUSPARSE.CuSparseMatrixCSC, b,
37+
assump::OperatorAssumptions{Bool})
38+
if LinearSolve.cudss_loaded(A)
39+
@warn("CUDSS.jl does not support CuSparseMatrixCSC for LU Factorizations, consider using CuSparseMatrixCSR instead. Falling back to Krylov", maxlog=1)
40+
else
41+
@warn("CuSparseMatrixCSC does not support LU Factorization, falling back to Krylov. Consider using CUDSS.jl together with CuSparseMatrixCSR", maxlog=1)
42+
end
43+
LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES)
44+
end
45+
3446
function LinearSolve.error_no_cudss_lu(A::CUDA.CUSPARSE.CuSparseMatrixCSR)
3547
if !LinearSolve.cudss_loaded(A)
3648
error("CUDSS.jl is required for LU Factorizations on CuSparseMatrixCSR. Please load this library.")

ext/LinearSolveCUSOLVERRFExt.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
module LinearSolveCUSOLVERRFExt
22

3-
using LinearSolve: LinearSolve, @get_cacheval, pattern_changed, OperatorAssumptions
3+
using LinearSolve: LinearSolve, @get_cacheval, pattern_changed, OperatorAssumptions, LinearVerbosity
44
using CUSOLVERRF: CUSOLVERRF, RFLU, CUDA
55
using SparseArrays: SparseArrays, SparseMatrixCSC, nnz
66
using CUSOLVERRF.CUDA.CUSPARSE: CuSparseMatrixCSR

ext/LinearSolveSparseArraysExt.jl

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ function LinearSolve.init_cacheval(
129129
maxiters::Int, abstol, reltol,
130130
verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) where {T <: BLASELTYPES}
131131
if LinearSolve.is_cusparse(A)
132-
ArrayInterface.lu_instance(A)
132+
LinearSolve.cudss_loaded(A) ? ArrayInterface.lu_instance(A) : nothing
133133
else
134134
SparseArrays.UMFPACK.UmfpackLU(SparseMatrixCSC{T, Int64}(
135135
zero(Int64), zero(Int64), [Int64(1)], Int64[], T[]))
@@ -141,7 +141,7 @@ function LinearSolve.init_cacheval(
141141
maxiters::Int, abstol, reltol,
142142
verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) where {T <: BLASELTYPES}
143143
if LinearSolve.is_cusparse(A)
144-
ArrayInterface.lu_instance(A)
144+
LinearSolve.cudss_loaded(A) ? ArrayInterface.lu_instance(A) : nothing
145145
else
146146
SparseArrays.UMFPACK.UmfpackLU(SparseMatrixCSC{T, Int32}(
147147
zero(Int32), zero(Int32), [Int32(1)], Int32[], T[]))
@@ -344,7 +344,13 @@ function LinearSolve.init_cacheval(alg::NormalCholeskyFactorization,
344344
Symmetric{T, <:AbstractSparseArray{T}}}, b, u, Pl, Pr,
345345
maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool},
346346
assumptions::OperatorAssumptions) where {T <: BLASELTYPES}
347-
ArrayInterface.cholesky_instance(convert(AbstractMatrix, A))
347+
if LinearSolve.is_cusparse_csc(A)
348+
nothing
349+
elseif LinearSolve.is_cusparse_csr(A) && !LinearSolve.cudss_loaded(A)
350+
nothing
351+
else
352+
ArrayInterface.cholesky_instance(convert(AbstractMatrix, A))
353+
end
348354
end
349355

350356
# Specialize QR for the non-square case

src/LinearSolve.jl

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -478,6 +478,8 @@ ALREADY_WARNED_CUDSS = Ref{Bool}(false)
478478
error_no_cudss_lu(A) = nothing
479479
cudss_loaded(A) = false
480480
is_cusparse(A) = false
481+
is_cusparse_csr(A) = false
482+
is_cusparse_csc(A) = false
481483

482484
export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
483485
GenericLUFactorization, SimpleLUFactorization, RFLUFactorization, ButterflyFactorization,

src/factorization.jl

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -395,7 +395,13 @@ end
395395
function init_cacheval(
396396
alg::CholeskyFactorization, A::AbstractArray{<:BLASELTYPES}, b, u, Pl, Pr,
397397
maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions)
398-
ArrayInterface.cholesky_instance(convert(AbstractMatrix, A), alg.pivot)
398+
if LinearSolve.is_cusparse_csc(A)
399+
nothing
400+
elseif LinearSolve.is_cusparse_csr(A) && !LinearSolve.cudss_loaded(A)
401+
nothing
402+
else
403+
ArrayInterface.cholesky_instance(convert(AbstractMatrix, A), alg.pivot)
404+
end
399405
end
400406

401407
const PREALLOCATED_CHOLESKY = ArrayInterface.cholesky_instance(rand(1, 1), NoPivot())

test/gpu/cuda.jl

Lines changed: 61 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,65 @@
11
using LinearSolve, CUDA, LinearAlgebra, SparseArrays, StableRNGs
2-
using CUDA.CUSPARSE, CUDSS
2+
using CUDA.CUSPARSE
33
using Test
44

5+
@testset "Test default solver choice for CuSparse" begin
6+
b = Float64[1, 2, 3, 4]
7+
b_gpu = CUDA.adapt(CuArray, b)
8+
9+
A = Float64[1 1 0 0
10+
0 1 1 0
11+
0 0 3 1
12+
0 0 0 4]
13+
A_gpu_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(sparse(A))
14+
A_gpu_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(sparse(A))
15+
prob_csr = LinearProblem(A_gpu_csr, b_gpu)
16+
prob_csc = LinearProblem(A_gpu_csc, b_gpu)
17+
18+
A_sym = Float64[1 1 0 0
19+
1 0 0 2
20+
0 0 3 0
21+
0 2 0 0]
22+
A_gpu_sym_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(sparse(A_sym))
23+
A_gpu_sym_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(sparse(A_sym))
24+
prob_sym_csr = LinearProblem(A_gpu_sym_csr, b_gpu)
25+
prob_sym_csc = LinearProblem(A_gpu_sym_csc, b_gpu)
26+
27+
@testset "Test without CUDSS loaded" begin
28+
# assert CUDSS is not loaded yet
29+
@test !LinearSolve.cudss_loaded(A_gpu_csr)
30+
# csr fallback to krylov
31+
alg = solve(prob_csr).alg
32+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
33+
# csc fallback to krylov
34+
alg = solve(prob_csc).alg
35+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
36+
# csr symmetric fallback to krylov
37+
alg = solve(prob_sym_csr).alg
38+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
39+
# csc symmetric fallback to krylov
40+
alg = solve(prob_sym_csc).alg
41+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
42+
end
43+
44+
using CUDSS
45+
46+
@testset "Test with CUDSS loaded" begin
47+
@test LinearSolve.cudss_loaded(A_gpu_csr)
48+
# csr uses LU
49+
alg = solve(prob_csr).alg
50+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.LUFactorization
51+
# csc fallback to krylov
52+
alg = solve(prob_csc).alg
53+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
54+
# csr symmetric uses LU/cholesky
55+
alg = solve(prob_sym_csr).alg
56+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.LUFactorization
57+
# csc symmetric fallback to krylov
58+
alg = solve(prob_sym_csc).alg
59+
@test alg.alg == LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES
60+
end
61+
end
62+
563
CUDA.allowscalar(false)
664

765
n = 8
@@ -96,9 +154,9 @@ end
96154
@testset "CUDSS" begin
97155
T = Float32
98156
n = 100
99-
A_cpu = sprand(T, n, n, 0.05) + I
157+
A_cpu = sprand(rng, T, n, n, 0.05) + I
100158
x_cpu = zeros(T, n)
101-
b_cpu = rand(T, n)
159+
b_cpu = rand(rng, T, n)
102160

103161
A_gpu_csr = CuSparseMatrixCSR(A_cpu)
104162
b_gpu = CuVector(b_cpu)

0 commit comments

Comments (0)