diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 17a4d897..8a3fed24 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -20,6 +20,7 @@ using KernelAbstractions
 # device functionality
 include("device/abstractarray.jl")
 include("device/sparse.jl")
+include("device/indexing.jl")
 
 # host abstractions
 include("host/abstractarray.jl")
diff --git a/src/device/indexing.jl b/src/device/indexing.jl
new file mode 100644
index 00000000..359d9fd4
--- /dev/null
+++ b/src/device/indexing.jl
@@ -0,0 +1,164 @@
+# device-level indexing
+using SparseArrays: nonzeroinds, nonzeros, nnz, getcolptr
+using Base: @propagate_inbounds
+
+Base.IndexStyle(::Type{GPUSparseDeviceVector}) = Base.IndexLinear()
+
+# Implementing only scalar indexing. Non-scalar indexing would allocate in device code
+
+## Adapted from SparseArrays.AbstractSparseVector
+@propagate_inbounds function Base.getindex(
+    v::GPUSparseDeviceVector{Tv,Ti},
+    i::Integer,
+) where {Tv,Ti}
+    @boundscheck checkbounds(v, i)
+    m = nnz(v)
+    nzind = nonzeroinds(v)
+    nzval = nonzeros(v)
+
+    ii = searchsortedfirst(nzind, convert(Ti, i))
+    (ii <= m && nzind[ii] == i) ? nzval[ii] : zero(Tv)
+end
+
+# Logical getindex
+@propagate_inbounds function Base.getindex(
+    v::GPUSparseDeviceVector,
+    I::AbstractVector{Bool},
+)
+    isone(count(I)) ? v[findfirst(I)] :
+    error(
+        "Logical index contains more than one true value. Device arrays only support scalar indexing.",
+    )
+end
+
+@propagate_inbounds function Base.getindex(
+    A::AbstractGPUSparseDeviceMatrix,
+    i::Integer,
+    J::AbstractVector{Bool},
+)
+    isone(count(J)) ? A[i, findfirst(J)] :
+    error(
+        "Logical index contains more than one true value. Device arrays only support scalar indexing.",
+    )
+end
+
+@propagate_inbounds function Base.getindex(
+    A::AbstractGPUSparseDeviceMatrix,
+    I::AbstractVector{Bool},
+    j::Integer,
+)
+    isone(count(I)) ? A[findfirst(I), j] :
+    error(
+        "Logical index contains more than one true value. Device arrays only support scalar indexing.",
+    )
+end
+
+@propagate_inbounds function Base.getindex(
+    A::AbstractGPUSparseDeviceMatrix,
+    I::AbstractVector{Bool},
+    J::AbstractVector{Bool},
+)
+    if (isone(count(I)) && isone(count(J)))
+        A[findfirst(I), findfirst(J)]
+    else
+        failing_len, failing_idx = findmax((count(I), count(J)))
+        error(
+            "Logical index $(failing_idx == 1 ? "I" : "J") contains $(failing_len) true " *
+            "values. Device arrays only support scalar indexing.",
+        )
+    end
+end
+
+@propagate_inbounds Base.getindex(
+    A::AbstractGPUSparseDeviceMatrix,
+    I::Tuple{Integer,Integer},
+) = getindex(A, I[1], I[2])
+
+# Scalar getindex methods linear-scan the minor axis rather than binary-searching
+# and sum across matching entries. cuSPARSE formats don't guarantee sorted indices
+# within a major-axis slice (e.g. SpGEMM output may leave CSR columns unsorted
+# within a row, and COO is only guaranteed row-sorted), nor uniqueness — duplicate
+# (i, j) entries are permitted and their values sum, matching the convention of
+# Julia's `sparse()` constructor and SciPy/CuPy. For Bool we OR instead of sum,
+# also matching `sparse()`, since Bool + Bool doesn't stay Bool.
+sum_duplicate(a, b) = a + b
+sum_duplicate(a::Bool, b::Bool) = a | b
+
+## Adapted logic from SparseArrays.AbstractSparseMatrixCSC
+@propagate_inbounds function Base.getindex(
+    A::GPUSparseDeviceMatrixCSC{Tv,Ti},
+    i::Integer,
+    j::Integer,
+) where {Tv,Ti}
+    @boundscheck checkbounds(A, i, j)
+    colPtr, rowVal, nzVal = getcolptr(A), rowvals(A), nonzeros(A)
+
+    # Range of possible row indices
+    rl = convert(Ti, @inbounds colPtr[j])
+    rr = convert(Ti, @inbounds colPtr[j+1] - 1)
+    (rl > rr) && return zero(Tv)
+
+    ii = searchsortedfirst(rowVal, convert(Ti, i), rl, rr, Base.Order.Forward)
+    (ii <= nnz(A) && rowVal[ii] == i) ? nzVal[ii] : zero(Tv)
+end
+
+@propagate_inbounds function Base.getindex(
+    A::GPUSparseDeviceMatrixCSR{Tv,Ti},
+    i::Integer,
+    j::Integer,
+) where {Tv,Ti}
+    @boundscheck checkbounds(A, i, j)
+    rowPtr, colVal, nzVal = A.rowPtr, A.colVal, A.nzVal
+
+    # Range of possible col indices
+    rt = convert(Ti, @inbounds rowPtr[i])
+    rb = convert(Ti, @inbounds rowPtr[i+1] - 1)
+    (rt > rb) && return zero(Tv)
+
+    jj = searchsortedfirst(colVal, convert(Ti, j), rt, rb, Base.Order.Forward)
+    (jj <= nnz(A) && colVal[jj] == j) ? nzVal[jj] : zero(Tv)
+end
+
+## Adapted from CUDA.jl/blob/lib/cusparse/src/array.jl#L490
+@propagate_inbounds function Base.getindex(
+    A::GPUSparseDeviceMatrixCOO{Tv,Ti},
+    i::Integer,
+    j::Integer,
+) where {Tv,Ti}
+    # COO in CUDA is assumed to be sorted by row (not col):
+    #https://docs.nvidia.com/cuda/cusparse/storage-formats.html?highlight=coo#coordinate-coo
+    @boundscheck checkbounds(A, i, j)
+    rowInd, colInd, nzVal = A.rowInd, A.colInd, A.nzVal
+
+    # Looking for the range s.t. rowInd[r1:r2] .== i
+    rl = searchsortedfirst(rowInd, i, Base.Order.Forward)
+    (rl > nnz(A) || rowInd[rl] > i) && return zero(Tv)
+    rr = min(searchsortedfirst(rowInd, i+1, Base.Order.Forward), nnz(A))
+    # Important to exclude rr, as including it un-sorts colInd[rl:rr] 
+    # Column is not guaranteed to be sorted. We linear scan
+    result = zero(Tv)
+    for k in rl:rl
+        A.colInd[k] == i && (result = sum_duplicate(result, nonzeros(A)[k]))
+    end
+    return result
+end
+
+## Adapted from CUDA.jl/blob/lib/cusparse/src/array.jl#L500
+@propagate_inbounds function Base.getindex(
+    A::GPUSparseDeviceMatrixBSR{Tv,Ti},
+    i::Integer,
+    j::Integer,
+) where {Tv,Ti}
+    @boundscheck checkbounds(A, i, j)
+
+    i_block, i_idx = fldmod1(i, A.blockDim)
+    j_block, j_idx = fldmod1(j, A.blockDim)
+    block_idx = (i_idx-1) * A.blockDim + j_idx - 1
+    c1 = convert(Ti, A.rowPtr[i_block])
+    c2 = convert(Ti, A.rowPtr[i_block+1]-1)
+    result = zero(T)
+    for k in c1:c2
+        A.colVal[k] == i_block && (result == sum_duplicate(result, nonzeros(a)[k+block_idx]))
+    end
+    return result
+end
diff --git a/src/device/sparse.jl b/src/device/sparse.jl
index b8346eaf..d315ccf2 100644
--- a/src/device/sparse.jl
+++ b/src/device/sparse.jl
@@ -106,6 +106,7 @@ Base.length(g::AbstractGPUSparseDeviceMatrix) = prod(g.dims)
 Base.size(g::AbstractGPUSparseDeviceMatrix) = g.dims
 SparseArrays.nnz(g::AbstractGPUSparseDeviceMatrix) = g.nnz
 SparseArrays.getnzval(g::AbstractGPUSparseDeviceMatrix) = g.nzVal
+# FIXME: Implement `rowvals` to explicitly say why it's not available for CSR format?
 
 struct GPUSparseDeviceArrayCSR{Tv, Ti, Vi, Vv, N, M, A} <: AbstractSparseArray{Tv, Ti, N}
     rowPtr::Vi
diff --git a/test/runtests.jl b/test/runtests.jl
index a91715b9..e9161675 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,7 +9,7 @@ const init_worker_code = quote
     include("testsuite.jl")
 
     TestSuite.sparse_types(::Type{<:JLArray}) = (JLSparseVector, JLSparseMatrixCSC, JLSparseMatrixCSR)
-    TestSuite.sparse_types(::Type{<:Array})   = (SparseVector, SparseMatrixCSC)
+    TestSuite.sparse_types(::Type{<:Array})   = (SparseVector, SparseMatrixCSC) # TODO: Add CSR, COO, etc formats
 
     # Disable Float16-related tests until JuliaGPU/KernelAbstractions#600 is resolved
     if isdefined(JLArrays.KernelAbstractions, :POCL)
diff --git a/test/testsuite/sparse.jl b/test/testsuite/sparse.jl
index 3e0d92fe..753dc497 100644
--- a/test/testsuite/sparse.jl
+++ b/test/testsuite/sparse.jl
@@ -20,6 +20,48 @@ end
 using SparseArrays
 using SparseArrays: nonzeroinds, nonzeros, rowvals
 
+@kernel function linearize_kernel!(dst, @Const(src))
+    I = @index(Global, Linear)
+    @inbounds dst[I] = src[I]
+end
+
+function linearize!(dst, src)
+    backend = get_backend(src)
+    @assert length(dst) == length(src)
+
+    kernel = linearize_kernel!(backend)
+    kernel(dst, src, ndrange=length(dst))
+    return
+end
+
+@kernel function singleindex_kernel!(dst, @Const(src), i)
+    I = @index(Global, Linear)
+    @inbounds dst[1] = src[i]
+end
+
+function singleindex!(dst, src::AbstractSparseVector, i::Integer)
+    backend = get_backend(src)
+    @assert length(dst) == 1
+    checkbounds(src, i)
+    kernel = singleindex_kernel!(backend)
+    kernel(dst, src, i, ndrange=1)
+    return
+end
+
+@kernel function singleindex_kernel!(dst, @Const(src), i, j)
+    I = @index(Global, Linear)
+    @inbounds dst[1] = src[i, j]
+end
+
+function singleindex!(dst, src::AbstractSparseMatrix, i::Integer, j::Integer)
+    backend = get_backend(src)
+    @assert length(dst) == 1
+    checkbounds(src, i)
+    kernel = singleindex_kernel!(backend)
+    kernel(dst, src, i, j, ndrange=1)
+    return
+end
+
 function vector(AT, eltypes)
     dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
@@ -38,6 +80,7 @@ function vector(AT, eltypes)
             @test ndims(d_x)  == 1
             dense_d_x  = dense_AT(x)
             dense_d_x2 = dense_AT(d_x)
+            # Indexing
             @allowscalar begin
                 @test Array(d_x[:])        == x[:]
                 @test d_x[firstindex(d_x)] == x[firstindex(x)]
@@ -54,6 +97,13 @@ function vector(AT, eltypes)
             @test Array(nonzeroinds(d_x)) == nonzeroinds(x)
             @test Array(rowvals(d_x)) == nonzeroinds(x)
             @test nnz(d_x)    == length(nonzeros(d_x))
+            # Device-side (scalar) indexing
+            dst = zeros(ET, size(d_x))
+            linearize!(dst, d_x)
+            @test x == dst
+            dst_single = zeros(ET, 1)
+            singleindex!(dst_single, d_x, 17)
+            @test only(dst_single) == x[17]
         end
     end
 end
@@ -77,7 +127,7 @@ end
 function matrix(AT, eltypes)
     dense_AT = GPUArrays.dense_array_type(AT)
     for ET in eltypes
-        @testset "Sparse matrix properties($ET)" begin
+        @testset "$(AT) properties($ET)" begin
             m = 25
             n = 35
             k = 10
@@ -98,6 +148,7 @@ function matrix(AT, eltypes)
             @test size(d_x, 2) == n
             @test size(d_x, 3) == 1
             @test ndims(d_x)   == 2
+            # Indexing
             @allowscalar begin
                 @test d_x[:, :]            == x[:, :]
                 @test d_tx[:, :]           == transpose(x)[:, :]
@@ -127,6 +178,16 @@ function matrix(AT, eltypes)
             @test nnz(d_x)    == length(nonzeros(d_x))
             @test !issymmetric(d_x)
             @test !ishermitian(d_x)
+            # Device-side (scalar) indexing
+            x    = sprand(ET, m, n, 0.2)
+            d_x  = AT(x)
+            dst = zeros(ET, length(d_x))
+            linearize!(dst, d_x)
+            @test Array(x[:]) == dst
+            dst_single = zeros(ET, 1)
+            i, j = floor(Int, m * 0.5), floor(Int, n * 0.5)
+            singleindex!(dst_single, d_x, i, j)
+            @test only(dst_single) == x[i, j]
         end
     end
 end