Remove kron methods and use those in GPUArrays (#2643)

kshyatt · web-flow · commit 031d7b9a84b8 · 2025-02-06T06:59:50.000+01:00
diff --git a/lib/cublas/linalg.jl b/lib/cublas/linalg.jl
@@ -396,89 +396,3 @@ for op in (:(+), :(-))
         @eval Base.$op(A::$TypeA, B::$TypeB) where {T <: CublasFloat} = geam($transa(T), $transb(T), one(T), $(unwrapa(:A)), $(op)(one(T)), $(unwrapb(:B)))
     end
 end
-
-# Kronecker product
-function LinearAlgebra.kron!(C::CuMatrix{TC}, A::CuMatrix{TA}, B::CuMatrix{TB}) where {TA,TB,TC}
-
-    function _kron_mat_kernelA!(C, A, B, m, n, p, q)
-        index_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-        index_j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-
-        stride_i = blockDim().x * gridDim().x
-        stride_j = blockDim().y * gridDim().y
-
-        index_i > m && return
-        index_j > n && return
-
-        for i in index_i:stride_i:m
-            for j in index_j:stride_j:n
-                for k in 1:p
-                    for l in 1:q
-                        @inbounds C[(i-1)*p+k, (j-1)*q+l] = A[i,j] * B[k,l]
-                    end
-                end
-            end
-        end
-        return nothing
-    end
-
-    function _kron_mat_kernelB!(C, A, B, m, n, p, q)
-        index_p = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-        index_q = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-
-        stride_p = blockDim().x * gridDim().x
-        stride_q = blockDim().y * gridDim().y
-
-        index_p > p && return
-        index_q > q && return
-
-        for i in 1:m
-            for j in 1:n
-                for k in index_p:stride_p:p
-                    for l in index_q:stride_q:q
-                        @inbounds C[(i-1)*p+k, (j-1)*q+l] = A[i,j] * B[k,l]
-                    end
-                end
-            end
-        end
-        return nothing
-    end
-
-    m, n = size(A)
-    p, q = size(B)
-
-    # Use different kernels depending on the size of the matrices
-    # choosing to parallelize the matrix with the largest number of elements
-    m*n >= p*q ? (kernel = @cuda launch=false _kron_mat_kernelA!(C, A, B, m, n, p, q)) :
-                 (kernel = @cuda launch=false _kron_mat_kernelB!(C, A, B, m, n, p, q))
-
-    m*n >= p*q ? (sizes = (m, n)) : (sizes = (p, q))
-
-    config = launch_configuration(kernel.fun)
-    dim_ratio = sizes[1] / sizes[2]
-    max_threads_i = max(1, floor(Int, sqrt(config.threads * dim_ratio)))
-    max_threads_j = max(1, floor(Int, sqrt(config.threads / dim_ratio)))
-    max_blocks_i = max(1, floor(Int, sqrt(config.blocks * dim_ratio)))
-    max_blocks_j = max(1, floor(Int, sqrt(config.blocks / dim_ratio)))
-
-    threads_i = min(sizes[1], max_threads_i)
-    threads_j = min(sizes[2], max_threads_j)
-    threads = (threads_i, threads_j)
-    blocks_i = min(cld(sizes[1], threads_i), max_blocks_i)
-    blocks_j = min(cld(sizes[2], threads_j), max_blocks_j)
-    blocks = (blocks_i, blocks_j)
-
-    kernel(C, A, B, m, n, p, q; threads=threads, blocks=blocks)
-
-    return C
-end
-
-function LinearAlgebra.kron(A::CuMatrix{TA}, B::CuMatrix{TB}) where {TA,TB}
-    m, n = size(A)
-    p, q = size(B)
-
-    T = promote_type(TA, TB)
-    C = similar(A, T, m*p, n*q)
-
-    kron!(C, A, B)
-end