diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 17a4d897..8a3fed24 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -20,6 +20,7 @@ using KernelAbstractions # device functionality include("device/abstractarray.jl") include("device/sparse.jl") +include("device/indexing.jl") # host abstractions include("host/abstractarray.jl") diff --git a/src/device/indexing.jl b/src/device/indexing.jl new file mode 100644 index 00000000..359d9fd4 --- /dev/null +++ b/src/device/indexing.jl @@ -0,0 +1,164 @@ +# device-level indexing +using SparseArrays: nonzeroinds, nonzeros, nnz, getcolptr +using Base: @propagate_inbounds + +Base.IndexStyle(::Type{GPUSparseDeviceVector}) = Base.IndexLinear() + +# Implementing only scalar indexing. Non-scalar indexing would allocate in device code + +## Adapted from SparseArrays.AbstractSparseVector +@propagate_inbounds function Base.getindex( + v::GPUSparseDeviceVector{Tv,Ti}, + i::Integer, +) where {Tv,Ti} + @boundscheck checkbounds(v, i) + m = nnz(v) + nzind = nonzeroinds(v) + nzval = nonzeros(v) + + ii = searchsortedfirst(nzind, convert(Ti, i)) + (ii <= m && nzind[ii] == i) ? nzval[ii] : zero(Tv) +end + +# Logical getindex +@propagate_inbounds function Base.getindex( + v::GPUSparseDeviceVector, + I::AbstractVector{Bool}, +) + isone(count(I)) ? v[findfirst(I)] : + error( + "Logical index contains more than one true value. Device arrays only support scalar indexing.", + ) +end + +@propagate_inbounds function Base.getindex( + A::AbstractGPUSparseDeviceMatrix, + i::Integer, + J::AbstractVector{Bool}, +) + isone(count(J)) ? A[i, findfirst(J)] : + error( + "Logical index contains more than one true value. Device arrays only support scalar indexing.", + ) +end + +@propagate_inbounds function Base.getindex( + A::AbstractGPUSparseDeviceMatrix, + I::AbstractVector{Bool}, + j::Integer, +) + isone(count(I)) ? A[findfirst(I), j] : + error( + "Logical index contains more than one true value. Device arrays only support scalar indexing.", + ) +end + +@propagate_inbounds function Base.getindex( + A::AbstractGPUSparseDeviceMatrix, + I::AbstractVector{Bool}, + J::AbstractVector{Bool}, +) + if (isone(count(I)) && isone(count(J))) + A[findfirst(I), findfirst(J)] + else + failing_len, failing_idx = findmax((count(I), count(J))) + error( + "Logical index $(failing_idx == 1 ? "I" : "J") contains $(failing_len) true " * + "values. Device arrays only support scalar indexing.", + ) + end +end + +@propagate_inbounds Base.getindex( + A::AbstractGPUSparseDeviceMatrix, + I::Tuple{Integer,Integer}, +) = getindex(A, I[1], I[2]) + +# Scalar getindex methods linear-scan the minor axis rather than binary-searching +# and sum across matching entries. cuSPARSE formats don't guarantee sorted indices +# within a major-axis slice (e.g. SpGEMM output may leave CSR columns unsorted +# within a row, and COO is only guaranteed row-sorted), nor uniqueness — duplicate +# (i, j) entries are permitted and their values sum, matching the convention of +# Julia's `sparse()` constructor and SciPy/CuPy. For Bool we OR instead of sum, +# also matching `sparse()`, since Bool + Bool doesn't stay Bool. +sum_duplicate(a, b) = a + b +sum_duplicate(a::Bool, b::Bool) = a | b + +## Adapted logic from SparseArrays.AbstractSparseMatrixCSC +@propagate_inbounds function Base.getindex( + A::GPUSparseDeviceMatrixCSC{Tv,Ti}, + i::Integer, + j::Integer, +) where {Tv,Ti} + @boundscheck checkbounds(A, i, j) + colPtr, rowVal, nzVal = getcolptr(A), rowvals(A), nonzeros(A) + + # Range of possible row indices + rl = convert(Ti, @inbounds colPtr[j]) + rr = convert(Ti, @inbounds colPtr[j+1] - 1) + (rl > rr) && return zero(Tv) + + ii = searchsortedfirst(rowVal, convert(Ti, i), rl, rr, Base.Order.Forward) + (ii <= nnz(A) && rowVal[ii] == i) ? nzVal[ii] : zero(Tv) +end + +@propagate_inbounds function Base.getindex( + A::GPUSparseDeviceMatrixCSR{Tv,Ti}, + i::Integer, + j::Integer, +) where {Tv,Ti} + @boundscheck checkbounds(A, i, j) + rowPtr, colVal, nzVal = A.rowPtr, A.colVal, A.nzVal + + # Range of possible col indices + rt = convert(Ti, @inbounds rowPtr[i]) + rb = convert(Ti, @inbounds rowPtr[i+1] - 1) + (rt > rb) && return zero(Tv) + + jj = searchsortedfirst(colVal, convert(Ti, j), rt, rb, Base.Order.Forward) + (jj <= nnz(A) && colVal[jj] == j) ? nzVal[jj] : zero(Tv) +end + +## Adapted from CUDA.jl/blob/lib/cusparse/src/array.jl#L490 +@propagate_inbounds function Base.getindex( + A::GPUSparseDeviceMatrixCOO{Tv,Ti}, + i::Integer, + j::Integer, +) where {Tv,Ti} + # COO in CUDA is assumed to be sorted by row (not col): + #https://docs.nvidia.com/cuda/cusparse/storage-formats.html?highlight=coo#coordinate-coo + @boundscheck checkbounds(A, i, j) + rowInd, colInd, nzVal = A.rowInd, A.colInd, A.nzVal + + # Looking for the range s.t. rowInd[r1:r2] .== i + rl = searchsortedfirst(rowInd, i, Base.Order.Forward) + (rl > nnz(A) || rowInd[rl] > i) && return zero(Tv) + rr = min(searchsortedfirst(rowInd, i+1, Base.Order.Forward), nnz(A)) + # Important to exclude rr, as including it un-sorts colInd[rl:rr] + # Column is not guaranteed to be sorted. We linear scan + result = zero(Tv) + for k in rl:rl + A.colInd[k] == i && (result = sum_duplicate(result, nonzeros(A)[k])) + end + return result +end + +## Adapted from CUDA.jl/blob/lib/cusparse/src/array.jl#L500 +@propagate_inbounds function Base.getindex( + A::GPUSparseDeviceMatrixBSR{Tv,Ti}, + i::Integer, + j::Integer, +) where {Tv,Ti} + @boundscheck checkbounds(A, i, j) + + i_block, i_idx = fldmod1(i, A.blockDim) + j_block, j_idx = fldmod1(j, A.blockDim) + block_idx = (i_idx-1) * A.blockDim + j_idx - 1 + c1 = convert(Ti, A.rowPtr[i_block]) + c2 = convert(Ti, A.rowPtr[i_block+1]-1) + result = zero(T) + for k in c1:c2 + A.colVal[k] == i_block && (result == sum_duplicate(result, nonzeros(a)[k+block_idx])) + end + return result +end diff --git a/src/device/sparse.jl b/src/device/sparse.jl index b8346eaf..d315ccf2 100644 --- a/src/device/sparse.jl +++ b/src/device/sparse.jl @@ -106,6 +106,7 @@ Base.length(g::AbstractGPUSparseDeviceMatrix) = prod(g.dims) Base.size(g::AbstractGPUSparseDeviceMatrix) = g.dims SparseArrays.nnz(g::AbstractGPUSparseDeviceMatrix) = g.nnz SparseArrays.getnzval(g::AbstractGPUSparseDeviceMatrix) = g.nzVal +# FIXME: Implement `rowvals` to explicitly say why it's not available for CSR format? struct GPUSparseDeviceArrayCSR{Tv, Ti, Vi, Vv, N, M, A} <: AbstractSparseArray{Tv, Ti, N} rowPtr::Vi diff --git a/test/runtests.jl b/test/runtests.jl index a91715b9..e9161675 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,7 +9,7 @@ const init_worker_code = quote include("testsuite.jl") TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR) - TestSuite.sparse_types(::Type{<:Array}) = (SparseVector, SparseMatrixCSC) + TestSuite.sparse_types(::Type{<:Array}) = (SparseVector, SparseMatrixCSC) # TODO: Add CSR, COO, etc formats # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved if isdefined(JLArrays.KernelAbstractions, :POCL) diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl index 3e0d92fe..753dc497 100644 --- a/test/testsuite/sparse.jl +++ b/test/testsuite/sparse.jl @@ -20,6 +20,48 @@ end using SparseArrays using SparseArrays: nonzeroinds, nonzeros, rowvals +@kernel function linearize_kernel!(dst, @Const(src)) + I = @index(Global, Linear) + @inbounds dst[I] = src[I] +end + +function linearize!(dst, src) + backend = get_backend(src) + @assert length(dst) == length(src) + + kernel = linearize_kernel!(backend) + kernel(dst, src, ndrange=length(dst)) + return +end + +@kernel function singleindex_kernel!(dst, @Const(src), i) + I = @index(Global, Linear) + @inbounds dst[1] = src[i] +end + +function singleindex!(dst, src::AbstractSparseVector, i::Integer) + backend = get_backend(src) + @assert length(dst) == 1 + checkbounds(src, i) + kernel = singleindex_kernel!(backend) + kernel(dst, src, i, ndrange=1) + return +end + +@kernel function singleindex_kernel!(dst, @Const(src), i, j) + I = @index(Global, Linear) + @inbounds dst[1] = src[i, j] +end + +function singleindex!(dst, src::AbstractSparseMatrix, i::Integer, j::Integer) + backend = get_backend(src) + @assert length(dst) == 1 + checkbounds(src, i) + kernel = singleindex_kernel!(backend) + kernel(dst, src, i, j, ndrange=1) + return +end + function vector(AT, eltypes) dense_AT = GPUArrays.dense_array_type(AT) for ET in eltypes @@ -38,6 +80,7 @@ function vector(AT, eltypes) @test ndims(d_x) == 1 dense_d_x = dense_AT(x) dense_d_x2 = dense_AT(d_x) + # Indexing @allowscalar begin @test Array(d_x[:]) == x[:] @test d_x[firstindex(d_x)] == x[firstindex(x)] @@ -54,6 +97,13 @@ function vector(AT, eltypes) @test Array(nonzeroinds(d_x)) == nonzeroinds(x) @test Array(rowvals(d_x)) == nonzeroinds(x) @test nnz(d_x) == length(nonzeros(d_x)) + # Device-side (scalar) indexing + dst = zeros(ET, size(d_x)) + linearize!(dst, d_x) + @test x == dst + dst_single = zeros(ET, 1) + singleindex!(dst_single, d_x, 17) + @test only(dst_single) == x[17] end end end @@ -77,7 +127,7 @@ end function matrix(AT, eltypes) dense_AT = GPUArrays.dense_array_type(AT) for ET in eltypes - @testset "Sparse matrix properties($ET)" begin + @testset "$(AT) properties($ET)" begin m = 25 n = 35 k = 10 @@ -98,6 +148,7 @@ function matrix(AT, eltypes) @test size(d_x, 2) == n @test size(d_x, 3) == 1 @test ndims(d_x) == 2 + # Indexing @allowscalar begin @test d_x[:, :] == x[:, :] @test d_tx[:, :] == transpose(x)[:, :] @@ -127,6 +178,16 @@ function matrix(AT, eltypes) @test nnz(d_x) == length(nonzeros(d_x)) @test !issymmetric(d_x) @test !ishermitian(d_x) + # Device-side (scalar) indexing + x = sprand(ET, m, n, 0.2) + d_x = AT(x) + dst = zeros(ET, length(d_x)) + linearize!(dst, d_x) + @test Array(x[:]) == dst + dst_single = zeros(ET, 1) + i, j = floor(Int, m * 0.5), floor(Int, n * 0.5) + singleindex!(dst_single, d_x, i, j) + @test only(dst_single) == x[i, j] end end end