diff --git a/gen/hiptensor/generator.jl b/gen/hiptensor/generator.jl new file mode 100644 index 000000000..046fc1825 --- /dev/null +++ b/gen/hiptensor/generator.jl @@ -0,0 +1,21 @@ +using Clang.Generators +using JuliaFormatter + +include_dir = normpath("/opt/rocm/include") +rocblas_dir = joinpath(include_dir, "hiptensor") +options = load_options("hiptensor/hiptensor-generator.toml") + +args = get_default_args() +push!(args, "-I$include_dir") + +headers = [ + joinpath(rocblas_dir, header) + for header in readdir(rocblas_dir) + if endswith(header, ".h") +] + +ctx = create_context(headers, args, options) +build!(ctx) + +path = options["general"]["output_file_path"] +format_file(path, YASStyle()) diff --git a/gen/hiptensor/hiptensor-generator.toml b/gen/hiptensor/hiptensor-generator.toml new file mode 100644 index 000000000..d5118ced9 --- /dev/null +++ b/gen/hiptensor/hiptensor-generator.toml @@ -0,0 +1,8 @@ +[general] +library_name = "libhiptensor" +output_file_path = "./libhiptensor.jl" +export_symbol_prefixes = [] +print_using_CEnum = false + +[codegen] +use_ccall_macro = true diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 52059b8f1..32f88e1fc 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -4,6 +4,8 @@ using Adapt using BFloat16s using CEnum using GPUCompiler +using GPUToolbox +using GPUToolbox: @enum_without_prefix using GPUArrays using GPUArrays: allowscalar using Libdl @@ -114,6 +116,7 @@ include("kernels/reverse.jl") include("blas/rocBLAS.jl") include("solver/rocSOLVER.jl") include("sparse/rocSPARSE.jl") +include("tensor/hipTENSOR.jl") include("rand/rocRAND.jl") include("fft/rocFFT.jl") include("dnn/MIOpen.jl") diff --git a/src/discovery/discovery.jl b/src/discovery/discovery.jl index a7998b4dc..b879e9afa 100644 --- a/src/discovery/discovery.jl +++ b/src/discovery/discovery.jl @@ -3,6 +3,7 @@ module ROCmDiscovery export lld_artifact, lld_path, libhsaruntime, libdevice_libs, libhip export librocblas, librocsparse, librocsolver export librocrand, librocfft, 
libMIOpen_path +export libhiptensor using LLD_jll using ROCmDeviceLibs_jll @@ -55,6 +56,7 @@ global librocsparse::String = "" global librocsolver::String = "" global librocrand::String = "" global librocfft::String = "" +global libhiptensor::String = "" global libMIOpen_path::String = "" function __init__() @@ -96,6 +98,7 @@ function __init__() global librocsolver = find_rocm_library(lib_prefix * "rocsolver"; rocm_path) global librocrand = find_rocm_library(lib_prefix * "rocrand"; rocm_path) global librocfft = find_rocm_library(lib_prefix * "rocfft"; rocm_path) + global libhiptensor = find_rocm_library(lib_prefix * "hiptensor"; rocm_path) global libMIOpen_path = find_rocm_library(lib_prefix * "MIOpen"; rocm_path) catch err @error """ROCm discovery failed! diff --git a/src/tensor/error.jl b/src/tensor/error.jl new file mode 100644 index 000000000..b9df8278e --- /dev/null +++ b/src/tensor/error.jl @@ -0,0 +1,46 @@ +export hipTENSORError + +struct hipTENSORError <: Exception + code::hiptensorStatus_t +end + +Base.convert(::Type{hiptensorStatus_t}, err::hipTENSORError) = err.code + +Base.showerror(io::IO, err::hipTENSORError) = + print(io, "hipTENSORError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") + +name(err::hipTENSORError) = unsafe_string(hiptensorGetErrorString(err)) + +## COV_EXCL_START +function description(err::hipTENSORError) + if err.code == HIPTENSOR_STATUS_SUCCESS + "the operation completed successfully" + elseif err.code == HIPTENSOR_STATUS_NOT_INITIALIZED + "the library was not initialized" + elseif err.code == HIPTENSOR_STATUS_ALLOC_FAILED + "the resource allocation failed" + elseif err.code == HIPTENSOR_STATUS_INVALID_VALUE + "an invalid value was used as an argument" + elseif err.code == HIPTENSOR_STATUS_ARCH_MISMATCH + "an absent device architectural feature is required" + elseif err.code == HIPTENSOR_STATUS_EXECUTION_FAILED + "the GPU program failed to execute" + elseif err.code == HIPTENSOR_STATUS_INTERNAL_ERROR + 
"an internal operation failed" + elseif err.code == HIPTENSOR_STATUS_NOT_SUPPORTED + "operation not supported (yet)" + elseif err.code == HIPTENSOR_STATUS_CK_ERROR + "error reported by the Composable Kernel backend" + elseif err.code == HIPTENSOR_STATUS_HIP_ERROR + "error occurred during a HIP operation" + elseif err.code == HIPTENSOR_STATUS_INSUFFICIENT_WORKSPACE + "insufficient workspace memory for this operation" + elseif err.code == HIPTENSOR_STATUS_INSUFFICIENT_DRIVER + "insufficient driver version" + elseif err.code == HIPTENSOR_STATUS_IO_ERROR + "file not found" + else + "no description for this error" + end +end +## COV_EXCL_STOP diff --git a/src/tensor/hipTENSOR.jl b/src/tensor/hipTENSOR.jl new file mode 100644 index 000000000..2352acea7 --- /dev/null +++ b/src/tensor/hipTENSOR.jl @@ -0,0 +1,59 @@ +module hipTENSOR + +using AMDGPU +using AMDGPU: @gcsafe_ccall, @checked, @enum_without_prefix, @debug_ccall +using AMDGPU: Mem +import AMDGPU: libhiptensor, HandleCache, HIP, library_state +import AMDGPU.Mem: alloc_or_retry! 
+import .HIP: HIPContext, HIPStream, hipStream_t + +using CEnum: @cenum + +using Printf: @printf + +export has_hiptensor + +has_hiptensor() = AMDGPU.functional(:hiptensor) + +# core library +include("libhiptensor.jl") + +# low-level wrappers +include("error.jl") +#include("utils.jl") +include("types.jl") +include("operations.jl") + +# high-level integrations +include("interfaces.jl") + + +## handles + +function create_handle() + AMDGPU.functional(:hiptensor) || error("hipTENSOR is not available") + + handle_ref = Ref{hiptensorHandle_t}() + hiptensorCreate(handle_ref) + handle_ref[] +end + +const IDLE_HANDLES = HandleCache{HIPContext,hiptensorHandle_t}() + +lib_state() = library_state( + :hipTENSOR, hiptensorHandle_t, IDLE_HANDLES, + create_handle, hiptensorDestroy, (handle, stream) -> 0 ) + +handle() = lib_state().handle +stream() = lib_state().stream + +function version() + #=ver = Ref{Cint}() + hiptensor_get_version(handle(), ver) + major = ver[] ÷ 100000 + minor = (ver[] ÷ 100) % 1000 + patch = ver[] % 100=# + return VersionNumber(2, 2, 0) +end + +end diff --git a/src/tensor/interfaces.jl b/src/tensor/interfaces.jl new file mode 100644 index 000000000..47ec90bfd --- /dev/null +++ b/src/tensor/interfaces.jl @@ -0,0 +1,62 @@ +# interfacing with other packages + +## Base + +function Base.:(+)(A::hipTensor, B::hipTensor) + α = convert(eltype(A), 1.0) + γ = convert(eltype(B), 1.0) + C = similar(B) + elementwise_binary_execute!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + γ, B.data, B.inds, HIPTENSOR_OP_IDENTITY, + C.data, C.inds, HIPTENSOR_OP_ADD) + return C +end + +function Base.:(-)(A::hipTensor, B::hipTensor) + α = convert(eltype(A), 1.0) + γ = convert(eltype(B), -1.0) + C = similar(B) + elementwise_binary_execute!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + γ, B.data, B.inds, HIPTENSOR_OP_IDENTITY, + C.data, C.inds, HIPTENSOR_OP_ADD) + return C +end + +function Base.:(*)(A::hipTensor, B::hipTensor) + tC = promote_type(eltype(A), eltype(B)) + A_uniqs = [(idx, i) 
for (idx, i) in enumerate(A.inds) if !(i in B.inds)] + B_uniqs = [(idx, i) for (idx, i) in enumerate(B.inds) if !(i in A.inds)] + A_sizes = map(x->size(A,x[1]), A_uniqs) + B_sizes = map(x->size(B,x[1]), B_uniqs) + A_inds = map(x->x[2], A_uniqs) + B_inds = map(x->x[2], B_uniqs) + C = hipTensor(fill!(similar(B.data, tC, Dims(vcat(A_sizes, B_sizes))), zero(tC)), vcat(A_inds, B_inds)) + return mul!(C, A, B) +end + + +## LinearAlgebra + +using LinearAlgebra + +function LinearAlgebra.axpy!(a, X::hipTensor, Y::hipTensor) + elementwise_binary_execute!(a, X.data, X.inds, HIPTENSOR_OP_IDENTITY, + one(eltype(Y)), Y.data, Y.inds, HIPTENSOR_OP_IDENTITY, + Y.data, Y.inds, HIPTENSOR_OP_ADD) + return Y +end + +function LinearAlgebra.axpby!(a, X::hipTensor, b, Y::hipTensor) + elementwise_binary_execute!(a, X.data, X.inds, HIPTENSOR_OP_IDENTITY, + b, Y.data, Y.inds, HIPTENSOR_OP_IDENTITY, + Y.data, Y.inds, HIPTENSOR_OP_ADD) + return Y +end + +function LinearAlgebra.mul!(C::hipTensor, A::hipTensor, B::hipTensor, α::Number, β::Number) + contract!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + B.data, B.inds, HIPTENSOR_OP_IDENTITY, β, + C.data, C.inds, HIPTENSOR_OP_IDENTITY, HIPTENSOR_OP_IDENTITY; + jit=HIPTENSOR_JIT_MODE_DEFAULT) + return C +end diff --git a/src/tensor/libhiptensor.jl b/src/tensor/libhiptensor.jl new file mode 100644 index 000000000..d91f7e28e --- /dev/null +++ b/src/tensor/libhiptensor.jl @@ -0,0 +1,548 @@ +@inline function retry_reclaim(f, retry_if) + ret = f() + if retry_if(ret) + return alloc_or_retry!(f, retry_if; stream = stream()) + else + return ret + end +end + +@cenum hiptensorStatus_t::UInt32 begin + HIPTENSOR_STATUS_SUCCESS = 0 + HIPTENSOR_STATUS_NOT_INITIALIZED = 1 + HIPTENSOR_STATUS_ALLOC_FAILED = 3 + HIPTENSOR_STATUS_INVALID_VALUE = 7 + HIPTENSOR_STATUS_ARCH_MISMATCH = 8 + HIPTENSOR_STATUS_EXECUTION_FAILED = 13 + HIPTENSOR_STATUS_INTERNAL_ERROR = 14 + HIPTENSOR_STATUS_NOT_SUPPORTED = 15 + HIPTENSOR_STATUS_CK_ERROR = 17 + HIPTENSOR_STATUS_HIP_ERROR = 18 + 
HIPTENSOR_STATUS_INSUFFICIENT_WORKSPACE = 19 + HIPTENSOR_STATUS_INSUFFICIENT_DRIVER = 20 + HIPTENSOR_STATUS_IO_ERROR = 21 +end + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + if res == HIPTENSOR_STATUS_ALLOC_FAILED + throw(OutOfGPUMemoryError()) + else + throw(hipTENSORError(res)) + end +end + +@inline function check(f) + retry_if(res) = res in (HIPTENSOR_STATUS_NOT_INITIALIZED, + HIPTENSOR_STATUS_ALLOC_FAILED, + HIPTENSOR_STATUS_INTERNAL_ERROR) + res = retry_reclaim(f, retry_if) + + if res != HIPTENSOR_STATUS_SUCCESS + throw_api_error(res) + end +end + +function hiptensorGetErrorString(error) + @ccall libhiptensor.hiptensorGetErrorString(error::hiptensorStatus_t)::Ptr{Cchar} +end + +mutable struct hiptensorHandle end + +const hiptensorHandle_t = Ptr{hiptensorHandle} + +@checked function hiptensorCreate(handle) + AMDGPU.prepare_state() + @gcsafe_ccall libhiptensor.hiptensorCreate(handle::Ptr{hiptensorHandle_t})::hiptensorStatus_t +end + +@checked function hiptensorDestroy(handle) + @gcsafe_ccall libhiptensor.hiptensorDestroy(handle::hiptensorHandle_t)::hiptensorStatus_t +end + +@checked function hiptensorHandleResizePlanCache(handle, numEntries) + @ccall libhiptensor.hiptensorHandleResizePlanCache(handle::hiptensorHandle_t, + numEntries::UInt32)::hiptensorStatus_t +end + +@checked function hiptensorHandleWritePlanCacheToFile(handle, fileName) + @ccall libhiptensor.hiptensorHandleWritePlanCacheToFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar})::hiptensorStatus_t +end + +@checked function hiptensorHandleReadPlanCacheFromFile(handle, fileName, numCachelinesRead) + @ccall libhiptensor.hiptensorHandleReadPlanCacheFromFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar}, + numCachelinesRead::Ptr{UInt32})::hiptensorStatus_t +end + +@checked function hiptensorWriteKernelCacheToFile(handle, fileName) + @ccall libhiptensor.hiptensorWriteKernelCacheToFile(handle::hiptensorHandle_t, + 
fileName::Ptr{Cchar})::hiptensorStatus_t +end + +@checked function hiptensorReadKernelCacheFromFile(handle, fileName) + @ccall libhiptensor.hiptensorReadKernelCacheFromFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar})::hiptensorStatus_t +end + +mutable struct hiptensorTensorDescriptor end + +const hiptensorTensorDescriptor_t = Ptr{hiptensorTensorDescriptor} + +@cenum hiptensorDataType_t::UInt32 begin + HIPTENSOR_R_32F = 0 + HIPTENSOR_R_64F = 1 + HIPTENSOR_R_16F = 2 + HIPTENSOR_R_8I = 3 + HIPTENSOR_C_32F = 4 + HIPTENSOR_C_64F = 5 + HIPTENSOR_C_16F = 6 + HIPTENSOR_C_8I = 7 + HIPTENSOR_R_8U = 8 + HIPTENSOR_C_8U = 9 + HIPTENSOR_R_32I = 10 + HIPTENSOR_C_32I = 11 + HIPTENSOR_R_32U = 12 + HIPTENSOR_C_32U = 13 + HIPTENSOR_R_16BF = 14 + HIPTENSOR_C_16BF = 15 + HIPTENSOR_R_4I = 16 + HIPTENSOR_C_4I = 17 + HIPTENSOR_R_4U = 18 + HIPTENSOR_C_4U = 19 + HIPTENSOR_R_16I = 20 + HIPTENSOR_C_16I = 21 + HIPTENSOR_R_16U = 22 + HIPTENSOR_C_16U = 23 + HIPTENSOR_R_64I = 24 + HIPTENSOR_C_64I = 25 + HIPTENSOR_R_64U = 26 + HIPTENSOR_C_64U = 27 +end + +function hiptensorCreateTensorDescriptor(handle, desc, numModes, lens, strides, dataType, + alignmentRequirement) + @debug_ccall libhiptensor.hiptensorCreateTensorDescriptor(handle::hiptensorHandle_t, + desc::Ptr{hiptensorTensorDescriptor_t}, + numModes::UInt32, lens::Ptr{Int64}, + strides::Ptr{Int64}, + dataType::hiptensorDataType_t, + alignmentRequirement::UInt32)::hiptensorStatus_t +end + +@checked function hiptensorDestroyTensorDescriptor(desc) + @ccall libhiptensor.hiptensorDestroyTensorDescriptor(desc::hiptensorTensorDescriptor_t)::hiptensorStatus_t +end + +mutable struct hiptensorOperationDescriptor end + +const hiptensorOperationDescriptor_t = Ptr{hiptensorOperationDescriptor} + +@cenum hiptensorOperator_t::UInt32 begin + HIPTENSOR_OP_IDENTITY = 1 + HIPTENSOR_OP_SQRT = 2 + HIPTENSOR_OP_RELU = 8 + HIPTENSOR_OP_CONJ = 9 + HIPTENSOR_OP_RCP = 10 + HIPTENSOR_OP_SIGMOID = 11 + HIPTENSOR_OP_TANH = 12 + HIPTENSOR_OP_EXP = 22 + 
HIPTENSOR_OP_LOG = 23 + HIPTENSOR_OP_ABS = 24 + HIPTENSOR_OP_NEG = 25 + HIPTENSOR_OP_SIN = 26 + HIPTENSOR_OP_COS = 27 + HIPTENSOR_OP_TAN = 28 + HIPTENSOR_OP_SINH = 29 + HIPTENSOR_OP_COSH = 30 + HIPTENSOR_OP_ASIN = 31 + HIPTENSOR_OP_ACOS = 32 + HIPTENSOR_OP_ATAN = 33 + HIPTENSOR_OP_ASINH = 34 + HIPTENSOR_OP_ACOSH = 35 + HIPTENSOR_OP_ATANH = 36 + HIPTENSOR_OP_CEIL = 37 + HIPTENSOR_OP_FLOOR = 38 + HIPTENSOR_OP_ADD = 3 + HIPTENSOR_OP_MUL = 5 + HIPTENSOR_OP_MAX = 6 + HIPTENSOR_OP_MIN = 7 + HIPTENSOR_OP_UNKNOWN = 126 +end + +@cenum hiptensorComputeDescriptor_t::UInt32 begin + HIPTENSOR_COMPUTE_DESC_32F = 4 + HIPTENSOR_COMPUTE_DESC_64F = 16 + HIPTENSOR_COMPUTE_DESC_16F = 1 + HIPTENSOR_COMPUTE_DESC_16BF = 1024 + HIPTENSOR_COMPUTE_DESC_C32F = 2048 + HIPTENSOR_COMPUTE_DESC_C64F = 4096 + HIPTENSOR_COMPUTE_DESC_NONE = 0 + HIPTENSOR_COMPUTE_DESC_8U = 64 + HIPTENSOR_COMPUTE_DESC_8I = 256 + HIPTENSOR_COMPUTE_DESC_32U = 128 + HIPTENSOR_COMPUTE_DESC_32I = 512 +end + +@checked function hiptensorCreateContraction(handle, desc, descA, modeA, opA, descB, modeB, opB, + descC, modeC, opC, descD, modeD, descCompute) + @ccall libhiptensor.hiptensorCreateContraction(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + opB::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorDestroyOperationDescriptor(desc) + @ccall libhiptensor.hiptensorDestroyOperationDescriptor(desc::hiptensorOperationDescriptor_t)::hiptensorStatus_t +end + +@cenum hiptensorOperationDescriptorAttribute_t::UInt32 begin + HIPTENSOR_OPERATION_DESCRIPTOR_TAG = 0 + HIPTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE = 1 + HIPTENSOR_OPERATION_DESCRIPTOR_FLOPS = 2 
+ HIPTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES = 3 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT = 4 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT = 5 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE = 6 +end + +@checked function hiptensorOperationDescriptorSetAttribute(handle, desc, attr, buf, sizeInBytes) + @ccall libhiptensor.hiptensorOperationDescriptorSetAttribute(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + attr::hiptensorOperationDescriptorAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +function hiptensorOperationDescriptorGetAttribute(handle, desc, attr, buf, sizeInBytes) + @debug_ccall libhiptensor.hiptensorOperationDescriptorGetAttribute(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + attr::hiptensorOperationDescriptorAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +mutable struct hiptensorPlanPreference end + +const hiptensorPlanPreference_t = Ptr{hiptensorPlanPreference} + +@cenum hiptensorAlgo_t::Int32 begin + HIPTENSOR_ALGO_ACTOR_CRITIC = -8 + HIPTENSOR_ALGO_DEFAULT = -1 + HIPTENSOR_ALGO_DEFAULT_PATIENT = -6 +end + +@cenum hiptensorJitMode_t::UInt32 begin + HIPTENSOR_JIT_MODE_NONE = 0 + HIPTENSOR_JIT_MODE_DEFAULT = 1 +end + +@checked function hiptensorCreatePlanPreference(handle, pref, algo, jitMode) + @ccall libhiptensor.hiptensorCreatePlanPreference(handle::hiptensorHandle_t, + pref::Ptr{hiptensorPlanPreference_t}, + algo::hiptensorAlgo_t, + jitMode::hiptensorJitMode_t)::hiptensorStatus_t +end + +@checked function hiptensorDestroyPlanPreference(pref) + @ccall libhiptensor.hiptensorDestroyPlanPreference(pref::hiptensorPlanPreference_t)::hiptensorStatus_t +end + +@cenum hiptensorPlanPreferenceAttribute_t::UInt32 begin + HIPTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE = 0 + HIPTENSOR_PLAN_PREFERENCE_CACHE_MODE = 1 + HIPTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT = 2 + HIPTENSOR_PLAN_PREFERENCE_ALGO = 3 + HIPTENSOR_PLAN_PREFERENCE_KERNEL_RANK = 4 + 
HIPTENSOR_PLAN_PREFERENCE_JIT = 5 +end + +@checked function hiptensorPlanPreferenceSetAttribute(handle, pref, attr, buf, sizeInBytes) + @ccall libhiptensor.hiptensorPlanPreferenceSetAttribute(handle::hiptensorHandle_t, + pref::hiptensorPlanPreference_t, + attr::hiptensorPlanPreferenceAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +mutable struct hiptensorPlan end + +const hiptensorPlan_t = Ptr{hiptensorPlan} + +@cenum hiptensorPlanAttribute_t::UInt32 begin + HIPTENSOR_PLAN_REQUIRED_WORKSPACE = 0 +end + +function hiptensorPlanGetAttribute(handle, plan, attr, buf, sizeInBytes) + @debug_ccall libhiptensor.hiptensorPlanGetAttribute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + attr::hiptensorPlanAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +@cenum hiptensorWorksizePreference_t::UInt32 begin + HIPTENSOR_WORKSPACE_MIN = 1 + HIPTENSOR_WORKSPACE_DEFAULT = 2 + HIPTENSOR_WORKSPACE_MAX = 3 +end + +@checked function hiptensorEstimateWorkspaceSize(handle, desc, planPref, workspacePref, + workspaceSizeEstimate) + @ccall libhiptensor.hiptensorEstimateWorkspaceSize(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + planPref::hiptensorPlanPreference_t, + workspacePref::hiptensorWorksizePreference_t, + workspaceSizeEstimate::Ptr{UInt64})::hiptensorStatus_t +end + +@checked function hiptensorCreatePermutation(handle, desc, descA, modeA, opA, descB, modeB, + descCompute) + @ccall libhiptensor.hiptensorCreatePermutation(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +function hiptensorCreatePlan(handle, plan, desc, pref, workspaceSizeLimit) + @debug_ccall libhiptensor.hiptensorCreatePlan(handle::hiptensorHandle_t, + plan::Ptr{hiptensorPlan_t}, + 
desc::hiptensorOperationDescriptor_t, + pref::hiptensorPlanPreference_t, + workspaceSizeLimit::UInt64)::hiptensorStatus_t +end + +@checked function hiptensorDestroyPlan(plan) + @ccall libhiptensor.hiptensorDestroyPlan(plan::hiptensorPlan_t)::hiptensorStatus_t +end + +@checked function hiptensorContract(handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, + stream) + @ccall libhiptensor.hiptensorContract(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, B::Ptr{Cvoid}, + beta::Ptr{Cvoid}, C::Ptr{Cvoid}, D::Ptr{Cvoid}, + workspace::Ptr{Cvoid}, workspaceSize::UInt64, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorPermute(handle, plan, alpha, A, B, stream) + @ccall libhiptensor.hiptensorPermute(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, B::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorCreateElementwiseBinary(handle, desc, descA, modeA, opA, descC, modeC, + opC, descD, modeD, opAC, descCompute) + @ccall libhiptensor.hiptensorCreateElementwiseBinary(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opAC::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorElementwiseBinaryExecute(handle, plan, alpha, A, gamma, C, D, stream) + @ccall libhiptensor.hiptensorElementwiseBinaryExecute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, + gamma::Ptr{Cvoid}, C::Ptr{Cvoid}, + D::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorCreateElementwiseTrinary(handle, desc, descA, modeA, opA, descB, modeB, + opB, descC, modeC, opC, descD, modeD, opAB, + 
opABC, descCompute) + @ccall libhiptensor.hiptensorCreateElementwiseTrinary(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + opB::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorElementwiseTrinaryExecute(handle, plan, alpha, A, beta, B, gamma, C, D, + stream) + @ccall libhiptensor.hiptensorElementwiseTrinaryExecute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, + beta::Ptr{Cvoid}, B::Ptr{Cvoid}, + gamma::Ptr{Cvoid}, C::Ptr{Cvoid}, + D::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +function hiptensorCreateReduction(handle, desc, descA, modeA, opA, descC, modeC, opC, descD, + modeD, opReduce, descCompute) + @debug_ccall libhiptensor.hiptensorCreateReduction(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opReduce::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +function hiptensorReduce(handle, plan, alpha, A, beta, C, D, workspace, workspaceSize, + stream) + @debug_ccall libhiptensor.hiptensorReduce(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, beta::Ptr{Cvoid}, + C::Ptr{Cvoid}, D::Ptr{Cvoid}, workspace::Ptr{Cvoid}, + workspaceSize::UInt64, + stream::Cint)::hiptensorStatus_t +end + +# typedef void ( * hiptensorLoggerCallback_t ) ( int32_t 
logContext , const char * funcName , const char * msg ) +const hiptensorLoggerCallback_t = Ptr{Cvoid} + +function hiptensorLoggerSetCallback(callback) + @ccall libhiptensor.hiptensorLoggerSetCallback(callback::hiptensorLoggerCallback_t)::hiptensorStatus_t +end + +function hiptensorLoggerSetFile(file) + @ccall libhiptensor.hiptensorLoggerSetFile(file::Ptr{Libc.FILE})::hiptensorStatus_t +end + +function hiptensorLoggerOpenFile(logFile) + @ccall libhiptensor.hiptensorLoggerOpenFile(logFile::Ptr{Cchar})::hiptensorStatus_t +end + +@cenum hiptensorLogLevel_t::UInt32 begin + HIPTENSOR_LOG_LEVEL_OFF = 0 + HIPTENSOR_LOG_LEVEL_ERROR = 1 + HIPTENSOR_LOG_LEVEL_PERF_TRACE = 2 + HIPTENSOR_LOG_LEVEL_PERF_HINT = 4 + HIPTENSOR_LOG_LEVEL_HEURISTICS_TRACE = 8 + HIPTENSOR_LOG_LEVEL_API_TRACE = 16 +end + +function hiptensorLoggerSetLevel(level) + @ccall libhiptensor.hiptensorLoggerSetLevel(level::hiptensorLogLevel_t)::hiptensorStatus_t +end + +function hiptensorLoggerSetMask(mask) + @ccall libhiptensor.hiptensorLoggerSetMask(mask::Int32)::hiptensorStatus_t +end + +# no prototype is found for this function at hiptensor.h:578:19, please use with caution +function hiptensorLoggerForceDisable() + @ccall libhiptensor.hiptensorLoggerForceDisable()::hiptensorStatus_t +end + +# no prototype is found for this function at hiptensor.h:583:5, please use with caution +function hiptensorGetHiprtVersion() + @ccall libhiptensor.hiptensorGetHiprtVersion()::Cint +end + +# no prototype is found for this function at hiptensor.h:589:8, please use with caution +function hiptensorGetVersion() + @ccall libhiptensor.hiptensorGetVersion()::Csize_t +end + +@cenum hiptensorAutotuneMode_t::UInt32 begin + HIPTENSOR_AUTOTUNE_MODE_NONE = 0 + HIPTENSOR_AUTOTUNE_MODE_INCREMENTAL = 1 +end + +@cenum hiptensorCacheMode_t::UInt32 begin + HIPTENSOR_CACHE_MODE_NONE = 0 + HIPTENSOR_CACHE_MODE_PEDANTIC = 1 +end + +# Skipping MacroDefinition: HIP_PUBLIC_API __attribute__ ( ( visibility ( "default" ) ) ) + +# Skipping 
MacroDefinition: HIP_INTERNAL_EXPORTED_API __attribute__ ( ( visibility ( "default" ) ) ) + +const __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ = 0 + +const __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ = 0 + +const __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ = 0 + +const __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_DOUBLES__ = 0 + +const __HIP_ARCH_HAS_WARP_VOTE__ = 0 + +const __HIP_ARCH_HAS_WARP_BALLOT__ = 0 + +const __HIP_ARCH_HAS_WARP_SHUFFLE__ = 0 + +const __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ = 0 + +const __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ = 0 + +const __HIP_ARCH_HAS_SYNC_THREAD_EXT__ = 0 + +const __HIP_ARCH_HAS_SURFACE_FUNCS__ = 0 + +const __HIP_ARCH_HAS_3DGRID__ = 0 + +const __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ = 0 + +const HIPTENSOR_MAJOR_VERSION = 2 + +const HIPTENSOR_MINOR_VERSION = 2 + +const HIPTENSOR_PATCH_VERSION = 0 + +const HIP_VERSION_MAJOR = 7 + +const HIP_VERSION_MINOR = 2 + +const HIP_VERSION_PATCH = 26015 + +const HIP_VERSION_GITHASH = "fc0010cf6a" + +const HIP_VERSION_BUILD_ID = 0 + +const HIP_VERSION_BUILD_NAME = "" + +const HIP_VERSION = HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + + HIP_VERSION_PATCH + +const __HIP_HAS_GET_PCH = 1 + diff --git a/src/tensor/operations.jl b/src/tensor/operations.jl new file mode 100644 index 000000000..b3013639d --- /dev/null +++ b/src/tensor/operations.jl @@ -0,0 +1,429 @@ +const ModeType = AbstractVector{<:Union{Char, Integer}} + +# remove the HIPTENSOR_ prefix from some common enums, +# as they're namespaced to the hipTENSOR module anyway. 
+@enum_without_prefix hiptensorOperator_t HIPTENSOR_ +@enum_without_prefix hiptensorWorksizePreference_t HIPTENSOR_ +@enum_without_prefix hiptensorAlgo_t HIPTENSOR_ +@enum_without_prefix hiptensorJitMode_t HIPTENSOR_ + +is_unary(op::hiptensorOperator_t) = (op ∈ (OP_IDENTITY, OP_SQRT, OP_RELU, OP_CONJ, OP_RCP)) +is_binary(op::hiptensorOperator_t) = (op ∈ (OP_ADD, OP_MUL, OP_MAX, OP_MIN)) + +function elementwise_trinary_execute!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(beta::Number), + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(gamma::Number), + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + @nospecialize(D::AbstractArray), Dinds::ModeType, opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t; + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + + actual_plan = if plan === nothing + plan_elementwise_trinary(A, Ainds, opA, + B, Binds, opB, + C, Cinds, opC, + D, Dinds, opAB, opABC; + workspace, algo, compute_type) + else + plan + end + + elementwise_trinary_execute!(actual_plan, alpha, A, beta, B, gamma, C, D) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return D +end + +function elementwise_trinary_execute!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + @nospecialize(beta::Number), + @nospecialize(B::AbstractArray), + @nospecialize(gamma::Number), + @nospecialize(C::AbstractArray), + @nospecialize(D::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorElementwiseTrinaryExecute(handle(), plan, + Ref{scalar_type}(alpha), A, + Ref{scalar_type}(beta), B, + Ref{scalar_type}(gamma), C, D, + stream().stream) + return D +end + +function plan_elementwise_trinary( 
+ @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + @nospecialize(D::AbstractArray), Dinds::ModeType, opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + !is_unary(opA) && throw(ArgumentError("opA must be a unary op!")) + !is_unary(opB) && throw(ArgumentError("opB must be a unary op!")) + !is_unary(opC) && throw(ArgumentError("opC must be a unary op!")) + !is_binary(opAB) && throw(ArgumentError("opAB must be a binary op!")) + !is_binary(opABC) && throw(ArgumentError("opABC must be a binary op!")) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + descC = hipTensorDescriptor(C) + @assert size(C) == size(D) && strides(C) == strides(D) + descD = descC # must currently be identical + modeA = collect(Cint, Ainds) + modeB = collect(Cint, Binds) + modeC = collect(Cint, Cinds) + modeD = modeC + + actual_compute_type = if compute_type === nothing + elementwise_trinary_compute_types[(eltype(A), eltype(B), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateElementwiseTrinary(handle(), + desc, + descA, modeA, opA, + descB, modeB, opB, + descC, modeC, opC, + descD, modeD, + opAB, opABC, + actual_compute_type) + + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function elementwise_binary_execute!( + @nospecialize(alpha::Number), + 
@nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(gamma::Number),
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    @nospecialize(D::AbstractArray), Dinds::ModeType, opAC::hiptensorOperator_t;
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing,
    plan::Union{hipTensorPlan, Nothing}=nothing)
    # Reuse the caller's plan when given; otherwise build (and later free) one.
    actual_plan = if plan === nothing
        plan_elementwise_binary(A, Ainds, opA,
                                C, Cinds, opC,
                                D, Dinds, opAC;
                                workspace, algo, compute_type)
    else
        plan
    end
    elementwise_binary_execute!(actual_plan, alpha, A, gamma, C, D)
    if plan === nothing
        AMDGPU.unsafe_free!(actual_plan)
    end

    return D
end

# Plan-based variant: execute a previously created element-wise binary plan.
function elementwise_binary_execute!(plan::hipTensorPlan,
                                     @nospecialize(alpha::Number),
                                     @nospecialize(A::AbstractArray),
                                     @nospecialize(gamma::Number),
                                     @nospecialize(C::AbstractArray),
                                     @nospecialize(D::AbstractArray))
    # Scalars must be passed with the scalar type recorded in the plan.
    scalar_type = plan.scalar_type
    hiptensorElementwiseBinaryExecute(handle(), plan,
                                      Ref{scalar_type}(alpha), A,
                                      Ref{scalar_type}(gamma), C, D,
                                      stream().stream)
    return D
end

"""
    plan_elementwise_binary(A, Ainds, opA, C, Cinds, opC, D, Dinds, opAC; kwargs...)

Create a `hipTensorPlan` for an element-wise binary operation combining `A`
and `C` into `D`.

When `compute_type` is `nothing` (the default), a supported compute type is
looked up from `elementwise_binary_compute_types` based on `eltype(A)` and
`eltype(C)` — consistent with the other `plan_*` constructors. (The previous
default of `eltype(C)` bypassed that lookup table entirely.)
"""
function plan_elementwise_binary(
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    @nospecialize(D::AbstractArray), Dinds::ModeType, opAC::hiptensorOperator_t;
    jit::hiptensorJitMode_t=JIT_MODE_NONE,
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing)
    !is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
    !is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
    !is_binary(opAC) && throw(ArgumentError("opAC must be a binary op!"))
    descA = hipTensorDescriptor(A)
    descC = hipTensorDescriptor(C)
    @assert
size(C) == size(D) && strides(C) == strides(D) + descD = descC # must currently be identical + modeA = collect(Cint, Ainds) + modeC = collect(Cint, Cinds) + modeD = modeC + + actual_compute_type = if compute_type === nothing + elementwise_binary_compute_types[(eltype(A), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateElementwiseBinary(handle(), + desc, + descA, modeA, opA, + descC, modeC, opC, + descD, modeD, + opAC, + actual_compute_type) + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function permute!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType; + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + actual_plan = if plan === nothing + plan_permutation(A, Ainds, opA, + B, Binds; + workspace, algo, compute_type) + else + plan + end + permute!(actual_plan, alpha, A, B) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return B +end + +function permute!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + @nospecialize(B::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorPermute(handle(), plan, + Ref{scalar_type}(alpha), A, B, + stream().stream) + return B +end + +function plan_permutation( + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + 
workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + + modeA = collect(Cint, Ainds) + modeB = collect(Cint, Binds) + + actual_compute_type = if compute_type === nothing + permutation_compute_types[(eltype(A), eltype(B))] + else + compute_type + end + compute_desc = Base.cconvert(hiptensorComputeDescriptor_t, actual_compute_type) + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreatePermutation(handle(), desc, + descA, modeA, opA, + descB, modeB, + compute_desc) + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function contract!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(beta::Number), + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + opOut::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + actual_plan = if plan === nothing + plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut; + jit, workspace, algo, compute_type) + else + plan + end + + contract!(actual_plan, alpha, A, B, beta, C) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return C +end + +function contract!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + 
@nospecialize(B::AbstractArray), + @nospecialize(beta::Number), + @nospecialize(C::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorContract(handle(), plan, + Ref{scalar_type}(alpha), A, B, + Ref{scalar_type}(beta), C, C, + plan.workspace, sizeof(plan.workspace), stream().stream) + return C +end + +function plan_contraction( + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + opOut::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + !is_unary(opA) && throw(ArgumentError("opA must be a unary op!")) + !is_unary(opB) && throw(ArgumentError("opB must be a unary op!")) + !is_unary(opC) && throw(ArgumentError("opC must be a unary op!")) + !is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!")) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + descC = hipTensorDescriptor(C) + # for now, D must be identical to C (and thus, descD must be identical to descC) + modeA = collect(Cint, Ainds) + length(modeA) == ndims(A) || throw(ArgumentError("Ainds must match number of dimensions in A!")) + modeB = collect(Cint, Binds) + length(modeB) == ndims(B) || throw(ArgumentError("Binds must match number of dimensions in B!")) + modeC = collect(Cint, Cinds) + length(modeC) == ndims(C) || throw(ArgumentError("Cinds must match number of dimensions in C!")) + + actual_compute_type = if compute_type === nothing + contraction_compute_types[(eltype(A), eltype(B), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateContraction(handle(), + desc, + descA, modeA, opA, + descB, modeB, opB, + descC, modeC, opC, + descC, modeC, + 
actual_compute_type)

    plan_pref = Ref{hiptensorPlanPreference_t}()
    hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit)

    plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
    # The descriptor and preference are no longer needed once the plan exists.
    hiptensorDestroyOperationDescriptor(desc[])
    hiptensorDestroyPlanPreference(plan_pref[])
    return plan
end

"""
    reduce!(alpha, A, Ainds, opA, beta, C, Cinds, opC, opReduce; kwargs...)

Execute a tensor reduction of `A` into `C` using the reduction operator
`opReduce`, with unary operators `opA`/`opC` and scaling factors
`alpha`/`beta`.

A `plan` keyword may supply a previously created `hipTensorPlan`; otherwise a
temporary plan is created (honoring `workspace`, `algo` and `compute_type`)
and freed afterwards. Returns `C`.
"""
function reduce!(
    @nospecialize(alpha::Number),
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(beta::Number),
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    opReduce::hiptensorOperator_t;
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing,
    plan::Union{hipTensorPlan, Nothing}=nothing)
    actual_plan = if plan === nothing
        plan_reduction(A, Ainds, opA, C, Cinds, opC, opReduce; workspace, algo, compute_type)
    else
        plan
    end

    reduce!(actual_plan, alpha, A, beta, C)

    if plan === nothing
        AMDGPU.unsafe_free!(actual_plan)
    end

    return C
end

# Plan-based variant: execute a previously created reduction plan.
function reduce!(plan::hipTensorPlan,
                 @nospecialize(alpha::Number),
                 @nospecialize(A::AbstractArray),
                 @nospecialize(beta::Number),
                 @nospecialize(C::AbstractArray))
    scalar_type = plan.scalar_type
    # Execute on the current task's stream, consistent with `contract!` and
    # the element-wise executors. (Previously this passed `Cint(0)`, i.e. the
    # default stream, which could be mis-ordered relative to work queued on
    # the task-local stream.)
    hiptensorReduce(handle(), plan,
                    Ref{scalar_type}(alpha), A,
                    Ref{scalar_type}(beta), C, C,
                    plan.workspace, sizeof(plan.workspace), stream().stream)
    return C
end

function plan_reduction(
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    opReduce::hiptensorOperator_t;
    jit::hiptensorJitMode_t=JIT_MODE_NONE,
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing)
    !is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
    !is_unary(opC) &&
throw(ArgumentError("opC must be a unary op!")) + !is_binary(opReduce) && throw(ArgumentError("opReduce must be a binary op!")) + descA = hipTensorDescriptor(A) + descC = hipTensorDescriptor(C) + # for now, D must be identical to C (and thus, descD must be identical to descC) + modeA = collect(Cint, Ainds) + modeC = collect(Cint, Cinds) + + actual_compute_type = if compute_type === nothing + reduction_compute_types[(eltype(A), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateReduction(handle(), + desc, + descA, modeA, opA, + descC, modeC, opC, + descC, modeC, opReduce, + actual_compute_type) + + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end diff --git a/src/tensor/types.jl b/src/tensor/types.jl new file mode 100644 index 000000000..c8ae7edb6 --- /dev/null +++ b/src/tensor/types.jl @@ -0,0 +1,290 @@ +## data types + +@enum hiptensorComputeDescriptorEnum begin + COMPUTE_DESC_32F = 4 + COMPUTE_DESC_64F = 16 + COMPUTE_DESC_16F = 1 + COMPUTE_DESC_16BF = 1024 + COMPUTE_DESC_C32F = 2048 + COMPUTE_DESC_C64F = 4096 + COMPUTE_DESC_NONE = 0 + COMPUTE_DESC_8U = 64 + COMPUTE_DESC_8I = 256 + COMPUTE_DESC_32U = 128 + COMPUTE_DESC_32I = 512 +end + +const contraction_compute_types = Dict( + # typeA, typeB, typeC => typeCompute + (Float16, Float16, Float16) => Float32, + (Float32, Float32, Float32) => Float32, + (Float64, Float64, Float64) => Float64, + (ComplexF32, ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64, ComplexF64) => Float64, + (Float64, ComplexF64, ComplexF64) => Float64, + (ComplexF64, Float64, ComplexF64) => Float64) + +const elementwise_trinary_compute_types = Dict( + # typeA, typeB, typeC => typeCompute + (Float16, Float16, Float16) => Float16, + (Float32, 
Float32, Float32) => Float32, + (Float64, Float64, Float64) => Float64, + (ComplexF32, ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64, ComplexF64) => Float64, + (Float32, Float32, Float16) => Float32, + # (Float64, Float64, Float32) => Float32, + (ComplexF64, ComplexF64, ComplexF32) => Float64) + +const elementwise_binary_compute_types = Dict( + # typeA, typeC => typeCompute + (Float16, Float16) => Float16, + (Float16, Float32) => Float32, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64, + (ComplexF64, ComplexF32) => Float64, + (Float32, Float16) => Float32, + (Float64, Float32) => Float64) + +const permutation_compute_types = Dict( + # typeA, typeB => typeCompute + (Float16, Float16) => Float16, + (Float16, Float32) => Float32, + # (Float32, Float16) => Float32, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (Float32, Float64) => Float64, + # (Float64, Float32) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64, + (ComplexF32, ComplexF64) => Float64, + # (ComplexF64, ComplexF32) => Float64 + ) + +const reduction_compute_types = Dict( + # typeA, typeC => typeCompute + (Float16, Float16) => Float16, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64) + +# we have our own enum to represent hiptensorComputeDescriptor_t values +function Base.convert(::Type{hiptensorComputeDescriptorEnum}, T::DataType) + if T == Float16 + return COMPUTE_DESC_16F + elseif T == Float32 + return COMPUTE_DESC_32F + elseif T == ComplexF32 + return COMPUTE_DESC_C32F + elseif T == Float64 + return COMPUTE_DESC_64F + elseif T == ComplexF64 + return COMPUTE_DESC_C64F + elseif T == Int8 + return COMPUTE_DESC_8I + elseif T == UInt8 + return COMPUTE_DESC_8U + elseif T == Int32 + return COMPUTE_DESC_32I + elseif T == UInt32 + return 
COMPUTE_DESC_32U + else + throw(ArgumentError("hiptensorComputeDescriptor equivalent for input type $T does not exist!")) + end +end +Base.cconvert(::Type{hiptensorComputeDescriptor_t}, T::DataType) = + Base.cconvert(hiptensorComputeDescriptor_t, convert(hiptensorComputeDescriptorEnum, T)) + +function Base.cconvert(::Type{hiptensorComputeDescriptor_t}, T::hiptensorComputeDescriptorEnum) + if T == COMPUTE_DESC_16F + return HIPTENSOR_COMPUTE_DESC_16F + elseif T == COMPUTE_DESC_32F + return HIPTENSOR_COMPUTE_DESC_32F + elseif T == COMPUTE_DESC_64F + return HIPTENSOR_COMPUTE_DESC_64F + elseif T == COMPUTE_DESC_16BF + return HIPTENSOR_COMPUTE_DESC_16BF + elseif T == COMPUTE_DESC_C32F + return HIPTENSOR_COMPUTE_DESC_C32F + elseif T == COMPUTE_DESC_C64F + return HIPTENSOR_COMPUTE_DESC_C64F + elseif T == COMPUTE_DESC_8U + return HIPTENSOR_COMPUTE_DESC_8U + elseif T == COMPUTE_DESC_8I + return HIPTENSOR_COMPUTE_DESC_8I + elseif T == COMPUTE_DESC_32U + return HIPTENSOR_COMPUTE_DESC_32U + elseif T == COMPUTE_DESC_32I + return HIPTENSOR_COMPUTE_DESC_32I + else + throw(ArgumentError("hiptensorComputeDescriptor equivalent for input enum value $T does not exist!")) + end +end + + +function Base.convert(::Type{hiptensorDataType_t}, T::DataType) + if T == Float16 + return HIPTENSOR_R_16F + elseif T == ComplexF16 + return HIPTENSOR_C_16F + elseif T == Float32 + return HIPTENSOR_R_32F + elseif T == ComplexF32 + return HIPTENSOR_C_32F + elseif T == Float64 + return HIPTENSOR_R_64F + elseif T == ComplexF64 + return HIPTENSOR_C_64F + elseif T == Int8 + return HIPTENSOR_R_8I + elseif T == Int32 + return HIPTENSOR_R_32I + elseif T == UInt8 + return HIPTENSOR_R_8U + elseif T == UInt32 + return HIPTENSOR_R_32U + else + throw(ArgumentError("hiptensorDataType equivalent for input type $T does not exist!")) + end +end + +function Base.convert(::DataType, T::hiptensorDataType_t) + if T == HIPTENSOR_R_16F + return Float16 + elseif T == HIPTENSOR_R_32F + return Float32 + elseif T == 
HIPTENSOR_C_32F
        return ComplexF32
    elseif T == HIPTENSOR_R_64F
        return Float64
    elseif T == HIPTENSOR_C_64F
        return ComplexF64
    else
        throw(ArgumentError("Data type equivalent for hiptensorDataType type $T does not exist!"))
    end
end


## plan

# Wraps a hiptensorPlan_t together with its workspace buffer and the scalar
# type that alpha/beta/gamma must be passed as when executing the plan.
mutable struct hipTensorPlan
    handle::hiptensorPlan_t
    workspace::ROCVector{UInt8,Mem.HIPBuffer}
    scalar_type::DataType

    function hipTensorPlan(desc, pref; workspacePref=HIPTENSOR_WORKSPACE_DEFAULT)
        # Estimate the workspace size for this operation/preference pair.
        workspaceSizeEstimate = Ref{UInt64}(0)
        hiptensorEstimateWorkspaceSize(handle(), desc, pref, workspacePref, workspaceSizeEstimate)

        # Query the scalar type the operation expects for its scaling factors.
        required_scalar_type = Ref{hiptensorDataType_t}()
        hiptensorOperationDescriptorGetAttribute(handle(), desc,
            HIPTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
            required_scalar_type, sizeof(required_scalar_type))
        # (A leftover `@show` debug statement was removed here.)

        # Create the plan.
        plan_ref = Ref{hiptensorPlan_t}()
        hiptensorCreatePlan(handle(), plan_ref, desc, pref, workspaceSizeEstimate[])

        # Allocate the workspace the plan actually requires, which may differ
        # from the estimate above.
        actualWorkspaceSize = Ref{UInt64}(0)
        hiptensorPlanGetAttribute(handle(), plan_ref[], HIPTENSOR_PLAN_REQUIRED_WORKSPACE,
                                  actualWorkspaceSize, sizeof(actualWorkspaceSize))
        workspace = ROCVector{UInt8}(undef, actualWorkspaceSize[])

        # `scalar_type::DataType` is populated via the `convert` method above.
        obj = new(plan_ref[], workspace, required_scalar_type[])
        finalizer(AMDGPU.unsafe_free!, obj)
        return obj
    end
end

Base.show(io::IO, plan::hipTensorPlan) = @printf(io, "hipTensorPlan(%p)", plan.handle)

Base.unsafe_convert(::Type{hiptensorPlan_t}, plan::hipTensorPlan) = plan.handle

Base.:(==)(a::hipTensorPlan, b::hipTensorPlan) = a.handle == b.handle
Base.hash(plan::hipTensorPlan, h::UInt) = hash(plan.handle, h)

# destroying the plan
unsafe_destroy!(plan::hipTensorPlan) = hiptensorDestroyPlan(plan)

# freeing the plan and associated workspace
function AMDGPU.unsafe_free!(plan::hipTensorPlan)
    if plan.workspace != C_NULL
AMDGPU.unsafe_free!(plan.workspace) + end + if plan.handle != C_NULL + unsafe_destroy!(plan) + plan.handle = C_NULL + end +end + + +const HIPTENSOR_ALIGNMENT = UInt32(1) + +## descriptor + +mutable struct hipTensorDescriptor + handle::hiptensorTensorDescriptor_t + # inner constructor handles creation and finalizer of the descriptor + function hipTensorDescriptor(sz::Vector{Int64}, st::Vector{Int64}, eltype::DataType, + alignmentRequirement::UInt32=HIPTENSOR_ALIGNMENT) + desc = Ref{hiptensorTensorDescriptor_t}(C_NULL) + length(st) == (N = length(sz)) || throw(ArgumentError("size and stride vectors must have the same length")) + T = convert(hiptensorDataType_t, eltype) + hiptensorCreateTensorDescriptor(handle(), desc, UInt32(N), sz, st, T, alignmentRequirement) + obj = new(desc[]) + finalizer(unsafe_destroy!, obj) + return obj + end +end + +# outer constructor restricted to DenseROCArray, but could be extended +function hipTensorDescriptor(a::DenseROCArray; size=size(a), strides=strides(a), eltype=eltype(a)) + sz = collect(Int64, size) + st = collect(Int64, strides) + return hipTensorDescriptor(sz, st, eltype) +end + +Base.show(io::IO, desc::hipTensorDescriptor) = @printf(io, "hipTensorDescriptor(%p)", desc.handle) + +Base.unsafe_convert(::Type{hiptensorTensorDescriptor_t}, obj::hipTensorDescriptor) = obj.handle + +unsafe_destroy!(obj::hipTensorDescriptor) = hiptensorDestroyTensorDescriptor(obj) + + +## tensor + +export hipTensor + +mutable struct hipTensor{T, N} + data::ROCArray{T, N} + inds::Vector{Int32} + + function hipTensor{T, N}(data::ROCArray{T,N}, inds::Vector) where {T<:Number, N} + if length(inds) != N + throw(ArgumentError("The number of indices must match the number of dimensions of the data.")) + end + if !iszero(UInt(pointer(data)) % HIPTENSOR_ALIGNMENT) + @warn "The data for this hipTensor does not obey the hipTENSOR alignment requirement of $HIPTENSOR_ALIGNMENT. An explicit copy will be made to ensure the requirement is met." 
+ return new(copy(data), inds) + else + return new(data, inds) + end + end +end + +hipTensor(data::ROCArray{T,N}, inds::Vector) where {T<:Number, N} = + hipTensor{T,N}(data, inds) + +# array interface +Base.size(T::hipTensor) = size(T.data) +Base.size(T::hipTensor, i) = size(T.data, i) +Base.length(T::hipTensor) = length(T.data) +Base.ndims(T::hipTensor) = length(T.inds) +Base.strides(T::hipTensor) = strides(T.data) +Base.eltype(T::hipTensor) = eltype(T.data) +Base.similar(T::hipTensor{Tv, N}) where {Tv, N} = hipTensor{Tv, N}(similar(T.data), copy(T.inds)) +Base.copy(T::hipTensor{Tv, N}) where {Tv, N} = hipTensor{Tv, N}(copy(T.data), copy(T.inds)) +Base.collect(T::hipTensor) = (collect(T.data), T.inds) diff --git a/src/tensor/utils.jl b/src/tensor/utils.jl new file mode 100644 index 000000000..e69de29bb diff --git a/src/utils.jl b/src/utils.jl index fea65b01c..205f6a9e4 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -12,6 +12,7 @@ function versioninfo() _status(functional(:rocsparse)) "rocSPARSE" _ver(:rocsparse, rocSPARSE.version) _libpath(librocsparse); _status(functional(:rocrand)) "rocRAND" _ver(:rocrand, rocRAND.version) _libpath(librocrand); _status(functional(:rocfft)) "rocFFT" _ver(:rocfft, rocFFT.version) _libpath(librocfft); + _status(functional(:hiptensor)) "hipTENSOR" _ver(:hiptensor, hipTENSOR.version) _libpath(libhiptensor); _status(functional(:MIOpen)) "MIOpen" _ver(:MIOpen, MIOpen.version) _libpath(libMIOpen_path); ] @@ -64,6 +65,7 @@ function correctly. 
Available `component` values are: - `:rocsparse` - Queries rocSPARSE library availability - `:rocrand` - Queries rocRAND library availability - `:rocfft` - Queries rocFFT library availability +- `:hiptensor` - Queries hipTENSOR library availability - `:MIOpen` - Queries MIOpen library availability - `:all` - Queries all above components @@ -86,6 +88,8 @@ function functional(component::Symbol) return !isempty(librocrand) elseif component == :rocfft return !isempty(librocfft) + elseif component == :hiptensor + return !isempty(libhiptensor) elseif component == :MIOpen return !isempty(libMIOpen_path) elseif component == :all diff --git a/test/hiptensor/base.jl b/test/hiptensor/base.jl new file mode 100644 index 000000000..072bf8481 --- /dev/null +++ b/test/hiptensor/base.jl @@ -0,0 +1,29 @@ +@testset "base" begin + +using LinearAlgebra, Random, AMDGPU + +if AMDGPU.hipTENSOR.has_hiptensor() + @test AMDGPU.hipTENSOR.version() isa VersionNumber + + @testset "type basics" begin + N = 2 + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + dimsA = dims + A = rand(Float64, dimsA...) 
+ dA = ROCArray(A) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + ctA = AMDGPU.hipTENSOR.hipTensor(dA, indsA) + @test length(ctA) == length(A) + @test size(ctA) == size(A) + @test size(ctA, 1) == size(A, 1) + @test ndims(ctA) == ndims(A) + @test strides(ctA) == strides(A) + @test eltype(ctA) == eltype(A) + end +end + +end diff --git a/test/hiptensor/contractions.jl b/test/hiptensor/contractions.jl new file mode 100644 index 000000000..4b617eea2 --- /dev/null +++ b/test/hiptensor/contractions.jl @@ -0,0 +1,207 @@ +using Test, AMDGPU +using LinearAlgebra +using AMDGPU.hipTENSOR: contract!, plan_contraction, hipTensor + +if AMDGPU.hipTENSOR.has_hiptensor() + +@testset "contractions" verbose = true begin + +AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) +AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("contract.log") + + +eltypes = [(Float32, Float32, Float32, Float32), + (Float32, Float32, Float32, Float16), + (Float16, Float16, Float16, Float32), + (ComplexF32, ComplexF32, ComplexF32, ComplexF32), + (Float64, Float64, Float64, Float64), + (Float64, Float64, Float64, Float32), + (ComplexF64, ComplexF64, ComplexF64, ComplexF64), + ] + +@testset for NoA=1:2, NoB=1:2, Nc=1:2 + @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes + @show eltyA, eltyB, eltyC, NoA, NoB, Nc + flush(stdout) + # setup + #dmax = 2^div(12, max(NoA+Nc, NoB+Nc, NoA+NoB)) + dmax = 4 + dimsoA = rand(2:dmax, NoA) + loA = prod(dimsoA) + dimsoB = rand(2:dmax, NoB) + loB = prod(dimsoB) + dimsc = rand(2:dmax, Nc) + lc = prod(dimsc) + allinds = collect('a':'z') + indsoA = allinds[1:NoA] + indsoB = allinds[NoA .+ (1:NoB)] + indsc = allinds[NoA .+ NoB .+ (1:Nc)] + pA = randperm(NoA + Nc) + ipA = invperm(pA) + pB = randperm(Nc + NoB) + ipB = invperm(pB) + pC = randperm(NoA + NoB) + ipC = invperm(pC) + compute_rtol = (eltyCompute == Float16 || eltyC == Float16) ? 1e-2 : (eltyCompute == Float32 ? 
1e-4 : 1e-6) + dimsA = [dimsoA; dimsc][pA] + indsA = [indsoA; indsc][pA] + dimsB = [dimsc; dimsoB][pB] + indsB = [indsc; indsoB][pB] + dimsC = [dimsoA; dimsoB][pC] + indsC = [indsoA; indsoB][pC] + + A = rand(eltyA, (dimsA...,)) + mA = reshape(permutedims(A, ipA), (loA, lc)) + B = rand(eltyB, (dimsB...,)) + mB = reshape(permutedims(B, ipB), (lc, loB)) + C = zeros(eltyC, (dimsC...,)) + dA = ROCArray(A) + dB = ROCArray(B) + dC = ROCArray(C) + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + + @testset "simple case" begin + dC = contract!(1, dA, indsA, opA, dB, indsB, opB, 0, dC, indsC, opC, opOut, compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "simple case with plan storage" begin + plan = hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut) + dC = contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB + AMDGPU.synchronize() + end + + @testset "simple case with plan storage and compute type" begin + eltypComputeEnum = convert(hipTENSOR.hiptensorComputeDescriptorEnum, eltyCompute) + plan = hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut; compute_type=eltypComputeEnum) + dC = hipTENSOR.contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "simple case with plan storage and JIT compilation" begin + plan = AMDGPU.hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut; jit=hipTENSOR.JIT_MODE_DEFAULT) + dC = contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB + AMDGPU.synchronize() + end + + @testset "with 
non-trivial α" begin + α = rand(eltyCompute) + dC = contract!(α, dA, indsA, opA, dB, indsB, opB, zero(eltyCompute), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ α * mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "with non-trivial β" begin + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + α = rand(eltyCompute) + β = rand(eltyCompute) + copyto!(dC, C) + dD = contract!(α, dA, indsA, opA, dB, indsB, opB, β, dC, indsC, opC, opOut; compute_type=eltyCompute) + D = collect(dD) + mC = reshape(permutedims(C, ipC), (loA, loB)) + mD = reshape(permutedims(D, ipC), (loA, loB)) + @test mD ≈ α * mA * mB + β * mC rtol=compute_rtol + AMDGPU.synchronize() + end + + if eltyCompute != Float32 && eltyC != Float16 + @testset "with hipTensor objects" begin + ctA = hipTensor(dA, indsA) + ctB = hipTensor(dB, indsB) + ctC = hipTensor(dC, indsC) + ctC = LinearAlgebra.mul!(ctC, ctA, ctB) + C2, C2inds = collect(ctC) + mC = reshape(permutedims(C2, ipC), (loA, loB)) + @test mC ≈ mA * mB + ctC = ctA * ctB + C2, C2inds = collect(ctC) + pC2 = Int.(indexin(Char.(C2inds), [indsoA; indsoB])) + mC = reshape(permutedims(C2, invperm(pC2)), (loA, loB)) + @test mC ≈ mA * mB + end + AMDGPU.synchronize() + end + + # not supported for these specific cases for unknown reason + if !((NoA, NoB, Nc) in ((1,1,3), (1,2,3), (3,1,2))) + @testset "with conjugation flag for complex arguments" begin + if eltyA <: Complex + opA = AMDGPU.hipTENSOR.OP_CONJ + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = contract!(complex(1.0, 0.0), dA, indsA, opA, dB, indsB, opB, + 0, dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ conj(mA) * mB rtol=compute_rtol + end + if eltyB <: Complex + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_CONJ + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = 
contract!(complex(1.0, 0.0), dA, indsA, opA, dB, indsB, opB, + complex(0.0, 0.0), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA*conj(mB) rtol=compute_rtol + end + if eltyA <: Complex && eltyB <: Complex + opA = AMDGPU.hipTENSOR.OP_CONJ + opB = AMDGPU.hipTENSOR.OP_CONJ + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = contract!(one(eltyCompute), dA, indsA, opA, dB, indsB, opB, + zero(eltyCompute), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ conj(mA)*conj(mB) rtol=compute_rtol + end + end + AMDGPU.synchronize() + end + end +end + +@testset "contractions of views" begin + @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes + dimsA = (16,) + dimsB = (4,) + dimsC = (8,) + A = rand(eltyA, dimsA) + B = rand(eltyB, dimsB) + C = rand(eltyC, dimsC) + dA = ROCArray(A) + dB = ROCArray(B) + dC = ROCArray(C) + dD = ROCArray(C) + vA = @view dA[1:4] + vB = @view dB[4:4] + vC = @view dC[3:6] + vD = @view dD[3:6] + tA = hipTensor(reshape(vA, (4, 1)), [1, 2]) + tB = hipTensor(reshape(vB, (1, 1)), [3, 2]) + tC = hipTensor(reshape(vC, (1, 4)), [3, 1]) + mul!(tC, tA, tB) + end +end + +end + +end diff --git a/test/hiptensor/elementwise_binary.jl b/test/hiptensor/elementwise_binary.jl new file mode 100644 index 000000000..3cd01eebe --- /dev/null +++ b/test/hiptensor/elementwise_binary.jl @@ -0,0 +1,128 @@ +using Test, AMDGPU +using AMDGPU +using LinearAlgebra +using AMDGPU.hipTENSOR: elementwise_binary_execute!, hipTensor + +if AMDGPU.hipTENSOR.has_hiptensor() + + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("tensor.log") + + eltypes = [(Float16, Float16), + (Float16, Float32), + (Float32, Float32), + #(Float64, Float64), + #(ComplexF32, ComplexF32), + #(ComplexF64, ComplexF64), + #(ComplexF64, ComplexF32), + #(Float64, Float32), + 
] + + Ns = 2:5 + + @testset "elementwise binary ($eltyA, $eltyC), N $N" for (eltyA, eltyC) in eltypes, N in Ns + # setup + eltyD = eltyC + #dmax = 2^div(18,N) + #dims = rand(2:dmax, N) + dims = fill(4, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + indsC = indsA[p] + dimsA = dims + dimsC = dims[p] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + C = rand(eltyC, dimsC...) + dC = ROCArray(C) + AMDGPU.synchronize() + + @testset "simple case" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_binary_execute!(one(eltyA), dA, indsA, opA, one(eltyC), dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) + C + end + + #=@testset "using integers as indices" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_binary_execute!(1, dA, 1:N, opA, 1, dC, p, opC, dD, p, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) + C + end + + @testset "multiplication as binary operator" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_binary_execute!(1, dA, indsA, opA, 1, dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) .* C + end + + @testset "with non-trivial coefficients and conjugation" begin + dD = similar(dC, eltyD) + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opAC = AMDGPU.hipTENSOR.OP_ADD + α = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_binary_execute!(α, dA, indsA, opA, γ, dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, p)) + γ * C + end + + @testset "test in-place, and more complicated unary and binary operations" begin + opA = eltyA <: Complex ? 
AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + # because we use rand, entries of A will be positive when elty is real + opC = eltyC <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAC = eltyD <: Complex ? AMDGPU.hipTENSOR.OP_ADD : AMDGPU.hipTENSOR.OP_MAX + α = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_binary_execute!(α, dA, indsA, opA, γ, dC, indsC, opC, dC, indsC, opAC) + D = collect(dC) + if eltyD <: Complex + if eltyA <: Complex + @test D ≈ α * permutedims(A, p) + γ * conj.(C) + else + @test D ≈ α * sqrt.(eltyD.(permutedims(A, p))) + γ * conj.(C) + end + else + @test D ≈ max.(α * sqrt.(eltyD.(permutedims(A, p))), γ * C) + end + end + + # using hipTensor type + dA = ROCArray(A) + dC = ROCArray(C) + ctA = hipTensor(dA, indsA) + ctC = hipTensor(dC, indsC) + ctD = ctA + ctC + hD = collect(ctD.data) + @test hD ≈ permutedims(A, p) + C + ctD = ctA - ctC + hD = collect(ctD.data) + @test hD ≈ permutedims(A, p) - C + + α = rand(eltyD) + ctC_copy = copy(ctC) + ctD = LinearAlgebra.axpy!(α, ctA, ctC_copy) + @test ctD == ctC_copy + hD = collect(ctD.data) + @test hD ≈ α * permutedims(A, p) + C + + γ = rand(eltyD) + ctC_copy = copy(ctC) + ctD = LinearAlgebra.axpby!(α, ctA, γ, ctC_copy) + @test ctD == ctC_copy + hD = collect(ctD.data) + @test hD ≈ α * permutedims(A, p) + γ * C=# + end +end diff --git a/test/hiptensor/elementwise_trinary.jl b/test/hiptensor/elementwise_trinary.jl new file mode 100644 index 000000000..ef4e688a4 --- /dev/null +++ b/test/hiptensor/elementwise_trinary.jl @@ -0,0 +1,148 @@ +using Test, AMDGPU +using LinearAlgebra + +if AMDGPU.hipTENSOR.has_hiptensor() + @testset "elementwise trinary" begin + + using AMDGPU.hipTENSOR: elementwise_trinary_execute! 
+ + eltypes = [(Float16, Float16, Float16), + (Float32, Float32, Float32), + (Float64, Float64, Float32), + #=(Float64, Float64, Float64), + (ComplexF32, ComplexF32, ComplexF32), + (ComplexF64, ComplexF64, ComplexF64), + (Float32, Float32, Float16), + (ComplexF64, ComplexF64, ComplexF32)=# + ] + + @testset for N=2:5 + @testset for (eltyA, eltyB, eltyC) in eltypes + @show eltyA, eltyB, eltyC, N + flush(stdout) + # setup + eltyD = eltyC + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + pA = randperm(N) + ipA = invperm(pA) + pB = randperm(N) + ipB = invperm(pB) + indsC = collect(('a':'z')[1:N]) + dimsC = dims + indsA = indsC[ipA] + dimsA = dims[ipA] + indsB = indsC[ipB] + dimsB = dims[ipB] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + B = rand(eltyB, dimsB...) + dB = ROCArray(B) + C = rand(eltyC, dimsC...) + dC = ROCArray(C) + dD = similar(dC) + + # simple case + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ permutedims(A, pA) + permutedims(B, pB) + C + + # using integers as indices + dD = elementwise_trinary_execute!(1, dA, ipA, opA, 1, dB, ipB, opB, + 1, dC, 1:N, opC, dD, 1:N, opAB, opABC) + D = collect(dD) + @test D ≈ permutedims(A, pA) + permutedims(B, pB) + C + + # multiplication as binary operator + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ (eltyD.(permutedims(A, pA)) .* eltyD.(permutedims(B, pB))) + C + + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ 
(eltyD.(permutedims(A, pA)) + eltyD.(permutedims(B, pB))) .* C + + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ eltyD.(permutedims(A, pA)) .* eltyD.(permutedims(B, pB)) .* C + + # with non-trivial coefficients and conjugation + α = rand(eltyD) + β = rand(eltyD) + γ = rand(eltyD) + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : + AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, pA)) + β * permutedims(B, pB) + γ * C + + opB = eltyB <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, pA)) + β * conj.(permutedims(B, pB)) + γ * C + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ (α * permutedims(A, pA)) .* (β * conj.(permutedims(B, pB))) + γ * C + + # test in-place, and more complicated unary and binary operations + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + opB = eltyB <: Complex ? AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + # because we use rand, entries of A will be positive when elty is real + opC = eltyC <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAB = eltyD <: Complex ? 
AMDGPU.hipTENSOR.OP_MUL : AMDGPU.hipTENSOR.OP_MIN + opABC = eltyD <: Complex ? AMDGPU.hipTENSOR.OP_ADD : AMDGPU.hipTENSOR.OP_MAX + α = rand(eltyD) + β = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dC, indsC, opAB, opABC) + D = collect(dD) + if eltyD <: Complex + if eltyA <: Complex && eltyB <: Complex + @test D ≈ (α * permutedims(A, pA)) .* + (β * permutedims(B, pB)) + γ * conj.(C) + elseif eltyB <: Complex + @test D ≈ (α * sqrt.(eltyD.(permutedims(A, pA)))) .* + (β * permutedims(B, pB)) + γ * conj.(C) + elseif eltyA <: Complex + @test D ≈ (α * permutedims(A, pA)) .* + (β * sqrt.(eltyD.(permutedims(B, pB)))) + γ * conj.(C) + else + @test D ≈ (α * sqrt.(eltyD.(permutedims(A, pA)))) .* + (β * sqrt.(eltyD.(permutedims(B, pB)))) + γ * conj.(C) + end + else + @test D ≈ max.(min.(α * sqrt.(eltyD.(permutedims(A, pA))), + β * sqrt.(eltyD.(permutedims(B, pB)))), + γ * C) + end + end + end + + end +end diff --git a/test/hiptensor/permutations.jl b/test/hiptensor/permutations.jl new file mode 100644 index 000000000..2b57f9477 --- /dev/null +++ b/test/hiptensor/permutations.jl @@ -0,0 +1,58 @@ +using Test, AMDGPU +using LinearAlgebra, Random + +@show AMDGPU.hipTENSOR.has_hiptensor() +if AMDGPU.hipTENSOR.has_hiptensor() + + @testset "permutations" begin + + using AMDGPU.hipTENSOR: permute! 
+ + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("permute.log") + + eltypes = [(Float16, Float16), + (Float16, Float32), + #(Float32, Float16), + (Float32, Float32), + #=(Float64, Float64), + (Float32, Float64), + (Float64, Float32), + (ComplexF32, ComplexF32), + (ComplexF64, ComplexF64), + (ComplexF32, ComplexF64), + (ComplexF64, ComplexF32)=# + ] + + @testset for N=2#:5 + @testset for (eltyA, eltyC) in eltypes + # setup + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + indsC = indsA[p] + dimsA = dims + dimsC = dims[p] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + dC = similar(dA, eltyC, dimsC...) + opA = AMDGPU.hipTENSOR.OP_IDENTITY + + @testset "simple case" begin + dC = permute!(one(eltyA), dA, indsA, opA, dC, indsC) + C = collect(dC) + @test C == permutedims(A, p) # exact equality + end + + @testset "with scalar" begin + α = rand(eltyA) + dC = permute!(α, dA, indsA, opA, dC, indsC) + C = collect(dC) + @test C ≈ α * permutedims(A, p) # approximate, floating point rounding + end + end + end + + end +end diff --git a/test/hiptensor/reductions.jl b/test/hiptensor/reductions.jl new file mode 100644 index 000000000..f2a86134e --- /dev/null +++ b/test/hiptensor/reductions.jl @@ -0,0 +1,71 @@ +@testset "reductions" begin + +using AMDGPU +using AMDGPU.hipTENSOR: reduce! 
+ +using LinearAlgebra, Random + +eltypes = [(Float16, Float16, Float16), + (Float16, Float16, Float32), + (Float32, Float32, Float32), + (Float64, Float64, Float64), + ] + +if AMDGPU.hipTENSOR.has_hiptensor() + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("reduce.log") + + @testset for NA=2:5, NC = 1:NA-1 + @testset for (eltyA, eltyC, elty_compute) in eltypes + # setup + eltyD = eltyC + dmax = 2^div(18,NA) + dims = rand(2:dmax, NA) + p = randperm(NA) + indsA = collect(('a':'z')[1:NA]) + indsC = indsA[p][1:NC] + dimsA = dims + dimsC = dims[p][1:NC] + A = rand(eltyA, (dimsA...,)) + dA = ROCArray(A) + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opReduce = AMDGPU.hipTENSOR.OP_ADD + # simple case + dC = reduce!(1, dA, indsA, opA, 0, dC, indsC, opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + sum(permutedims(A, p); dims = ((NC+1:NA)...,)) + + # using integers as indices + dC = reduce!(1, dA, collect(1:NA), opA, 0, dC, p[1:NC], opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + sum(permutedims(A, p); dims = ((NC+1:NA)...,)) + + # multiplication as reduction operator + opReduce = AMDGPU.hipTENSOR.OP_MUL + dC = reduce!(1, dA, indsA, opA, 0, dC, indsC, opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + prod(permutedims(A, p); dims = ((NC+1:NA)...,)) atol=eps(Float16) rtol=Base.rtoldefault(Float16) + + # with non-trivial coefficients and conjugation + opA = eltyA <: Complex ? 
AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opReduce = AMDGPU.hipTENSOR.OP_ADD + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + α = rand(eltyC) + γ = rand(eltyC) + dC = reduce!(α, dA, indsA, opA, γ, dC, indsC, opC, opReduce; compute_type = elty_compute) + @test reshape(collect(dC), (dimsC..., ones(Int,NA-NC)...)) ≈ + α * conj.(sum(permutedims(A, p); dims = ((NC+1:NA)...,))) + γ * C + end + end + + end +end