diff --git a/gen/hiptensor/generator.jl b/gen/hiptensor/generator.jl new file mode 100644 index 000000000..046fc1825 --- /dev/null +++ b/gen/hiptensor/generator.jl @@ -0,0 +1,21 @@ +using Clang.Generators +using JuliaFormatter + +include_dir = normpath("/opt/rocm/include") +rocblas_dir = joinpath(include_dir, "hiptensor") +options = load_options("hiptensor/hiptensor-generator.toml") + +args = get_default_args() +push!(args, "-I$include_dir") + +headers = [ + joinpath(rocblas_dir, header) + for header in readdir(rocblas_dir) + if endswith(header, ".h") +] + +ctx = create_context(headers, args, options) +build!(ctx) + +path = options["general"]["output_file_path"] +format_file(path, YASStyle()) diff --git a/gen/hiptensor/hiptensor-generator.toml b/gen/hiptensor/hiptensor-generator.toml new file mode 100644 index 000000000..d5118ced9 --- /dev/null +++ b/gen/hiptensor/hiptensor-generator.toml @@ -0,0 +1,8 @@ +[general] +library_name = "libhiptensor" +output_file_path = "./libhiptensor.jl" +export_symbol_prefixes = [] +print_using_CEnum = false + +[codegen] +use_ccall_macro = true diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 52059b8f1..32f88e1fc 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -4,6 +4,8 @@ using Adapt using BFloat16s using CEnum using GPUCompiler +using GPUToolbox +using GPUToolbox: @enum_without_prefix using GPUArrays using GPUArrays: allowscalar using Libdl @@ -114,6 +116,7 @@ include("kernels/reverse.jl") include("blas/rocBLAS.jl") include("solver/rocSOLVER.jl") include("sparse/rocSPARSE.jl") +include("tensor/hipTENSOR.jl") include("rand/rocRAND.jl") include("fft/rocFFT.jl") include("dnn/MIOpen.jl") diff --git a/src/discovery/discovery.jl b/src/discovery/discovery.jl index a7998b4dc..b879e9afa 100644 --- a/src/discovery/discovery.jl +++ b/src/discovery/discovery.jl @@ -3,6 +3,7 @@ module ROCmDiscovery export lld_artifact, lld_path, libhsaruntime, libdevice_libs, libhip export librocblas, librocsparse, librocsolver export librocrand, librocfft, 
libMIOpen_path +export libhiptensor using LLD_jll using ROCmDeviceLibs_jll @@ -55,6 +56,7 @@ global librocsparse::String = "" global librocsolver::String = "" global librocrand::String = "" global librocfft::String = "" +global libhiptensor::String = "" global libMIOpen_path::String = "" function __init__() @@ -96,6 +98,7 @@ function __init__() global librocsolver = find_rocm_library(lib_prefix * "rocsolver"; rocm_path) global librocrand = find_rocm_library(lib_prefix * "rocrand"; rocm_path) global librocfft = find_rocm_library(lib_prefix * "rocfft"; rocm_path) + global libhiptensor = find_rocm_library(lib_prefix * "hiptensor"; rocm_path) global libMIOpen_path = find_rocm_library(lib_prefix * "MIOpen"; rocm_path) catch err @error """ROCm discovery failed! diff --git a/src/tensor/error.jl b/src/tensor/error.jl new file mode 100644 index 000000000..b9df8278e --- /dev/null +++ b/src/tensor/error.jl @@ -0,0 +1,46 @@ +export hipTENSORError + +struct hipTENSORError <: Exception + code::hiptensorStatus_t +end + +Base.convert(::Type{hiptensorStatus_t}, err::hipTENSORError) = err.code + +Base.showerror(io::IO, err::hipTENSORError) = + print(io, "hipTENSORError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") + +name(err::hipTENSORError) = unsafe_string(hiptensorGetErrorString(err)) + +## COV_EXCL_START +function description(err::hipTENSORError) + if err.code == HIPTENSOR_STATUS_SUCCESS + "the operation completed successfully" + elseif err.code == HIPTENSOR_STATUS_NOT_INITIALIZED + "the library was not initialized" + elseif err.code == HIPTENSOR_STATUS_ALLOC_FAILED + "the resource allocation failed" + elseif err.code == HIPTENSOR_STATUS_INVALID_VALUE + "an invalid value was used as an argument" + elseif err.code == HIPTENSOR_STATUS_ARCH_MISMATCH + "an absent device architectural feature is required" + elseif err.code == HIPTENSOR_STATUS_EXECUTION_FAILED + "the GPU program failed to execute" + elseif err.code == HIPTENSOR_STATUS_INTERNAL_ERROR + 
"an internal operation failed" + elseif err.code == HIPTENSOR_STATUS_NOT_SUPPORTED + "operation not supported (yet)" + elseif err.code == HIPTENSOR_STATUS_CK_ERROR + "error reported by the Composable Kernel backend" + elseif err.code == HIPTENSOR_STATUS_HIP_ERROR + "error occurred during a HIP operation" + elseif err.code == HIPTENSOR_STATUS_INSUFFICIENT_WORKSPACE + "insufficient workspace memory for this operation" + elseif err.code == HIPTENSOR_STATUS_INSUFFICIENT_DRIVER + "insufficient driver version" + elseif err.code == HIPTENSOR_STATUS_IO_ERROR + "file not found" + else + "no description for this error" + end +end +## COV_EXCL_STOP diff --git a/src/tensor/hipTENSOR.jl b/src/tensor/hipTENSOR.jl new file mode 100644 index 000000000..2352acea7 --- /dev/null +++ b/src/tensor/hipTENSOR.jl @@ -0,0 +1,59 @@ +module hipTENSOR + +using AMDGPU +using AMDGPU: @gcsafe_ccall, @checked, @enum_without_prefix, @debug_ccall +using AMDGPU: Mem +import AMDGPU: libhiptensor, HandleCache, HIP, library_state +import AMDGPU.Mem: alloc_or_retry! 
+import .HIP: HIPContext, HIPStream, hipStream_t + +using CEnum: @cenum + +using Printf: @printf + +export has_hiptensor + +has_hiptensor() = AMDGPU.functional(:hiptensor) + +# core library +include("libhiptensor.jl") + +# low-level wrappers +include("error.jl") +#include("utils.jl") +include("types.jl") +include("operations.jl") + +# high-level integrations +include("interfaces.jl") + + +## handles + +function create_handle() + AMDGPU.functional(:hiptensor) || error("hipTENSOR is not available") + + handle_ref = Ref{hiptensorHandle_t}() + hiptensorCreate(handle_ref) + handle_ref[] +end + +const IDLE_HANDLES = HandleCache{HIPContext,hiptensorHandle_t}() + +lib_state() = library_state( + :hipTENSOR, hiptensorHandle_t, IDLE_HANDLES, + create_handle, hiptensorDestroy, (handle, stream) -> 0 ) + +handle() = lib_state().handle +stream() = lib_state().stream + +function version() + #=ver = Ref{Cint}() + hiptensor_get_version(handle(), ver) + major = ver[] ÷ 100000 + minor = (ver[] ÷ 100) % 1000 + patch = ver[] % 100=# + return VersionNumber(2, 2, 0) +end + +end diff --git a/src/tensor/interfaces.jl b/src/tensor/interfaces.jl new file mode 100644 index 000000000..47ec90bfd --- /dev/null +++ b/src/tensor/interfaces.jl @@ -0,0 +1,62 @@ +# interfacing with other packages + +## Base + +function Base.:(+)(A::hipTensor, B::hipTensor) + α = convert(eltype(A), 1.0) + γ = convert(eltype(B), 1.0) + C = similar(B) + elementwise_binary_execute!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + γ, B.data, B.inds, HIPTENSOR_OP_IDENTITY, + C.data, C.inds, HIPTENSOR_OP_ADD) + return C +end + +function Base.:(-)(A::hipTensor, B::hipTensor) + α = convert(eltype(A), 1.0) + γ = convert(eltype(B), -1.0) + C = similar(B) + elementwise_binary_execute!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + γ, B.data, B.inds, HIPTENSOR_OP_IDENTITY, + C.data, C.inds, HIPTENSOR_OP_ADD) + return C +end + +function Base.:(*)(A::hipTensor, B::hipTensor) + tC = promote_type(eltype(A), eltype(B)) + A_uniqs = [(idx, i) 
for (idx, i) in enumerate(A.inds) if !(i in B.inds)] + B_uniqs = [(idx, i) for (idx, i) in enumerate(B.inds) if !(i in A.inds)] + A_sizes = map(x->size(A,x[1]), A_uniqs) + B_sizes = map(x->size(B,x[1]), B_uniqs) + A_inds = map(x->x[2], A_uniqs) + B_inds = map(x->x[2], B_uniqs) + C = hipTensor(fill!(similar(B.data, tC, Dims(vcat(A_sizes, B_sizes))), zero(tC)), vcat(A_inds, B_inds)) + return mul!(C, A, B) +end + + +## LinearAlgebra + +using LinearAlgebra + +function LinearAlgebra.axpy!(a, X::hipTensor, Y::hipTensor) + elementwise_binary_execute!(a, X.data, X.inds, HIPTENSOR_OP_IDENTITY, + one(eltype(Y)), Y.data, Y.inds, HIPTENSOR_OP_IDENTITY, + Y.data, Y.inds, HIPTENSOR_OP_ADD) + return Y +end + +function LinearAlgebra.axpby!(a, X::hipTensor, b, Y::hipTensor) + elementwise_binary_execute!(a, X.data, X.inds, HIPTENSOR_OP_IDENTITY, + b, Y.data, Y.inds, HIPTENSOR_OP_IDENTITY, + Y.data, Y.inds, HIPTENSOR_OP_ADD) + return Y +end + +function LinearAlgebra.mul!(C::hipTensor, A::hipTensor, B::hipTensor, α::Number, β::Number) + contract!(α, A.data, A.inds, HIPTENSOR_OP_IDENTITY, + B.data, B.inds, HIPTENSOR_OP_IDENTITY, β, + C.data, C.inds, HIPTENSOR_OP_IDENTITY, HIPTENSOR_OP_IDENTITY; + jit=HIPTENSOR_JIT_MODE_DEFAULT) + return C +end diff --git a/src/tensor/libhiptensor.jl b/src/tensor/libhiptensor.jl new file mode 100644 index 000000000..d91f7e28e --- /dev/null +++ b/src/tensor/libhiptensor.jl @@ -0,0 +1,548 @@ +@inline function retry_reclaim(f, retry_if) + ret = f() + if retry_if(ret) + return alloc_or_retry!(f, retry_if; stream = stream()) + else + return ret + end +end + +@cenum hiptensorStatus_t::UInt32 begin + HIPTENSOR_STATUS_SUCCESS = 0 + HIPTENSOR_STATUS_NOT_INITIALIZED = 1 + HIPTENSOR_STATUS_ALLOC_FAILED = 3 + HIPTENSOR_STATUS_INVALID_VALUE = 7 + HIPTENSOR_STATUS_ARCH_MISMATCH = 8 + HIPTENSOR_STATUS_EXECUTION_FAILED = 13 + HIPTENSOR_STATUS_INTERNAL_ERROR = 14 + HIPTENSOR_STATUS_NOT_SUPPORTED = 15 + HIPTENSOR_STATUS_CK_ERROR = 17 + HIPTENSOR_STATUS_HIP_ERROR = 18 + 
HIPTENSOR_STATUS_INSUFFICIENT_WORKSPACE = 19 + HIPTENSOR_STATUS_INSUFFICIENT_DRIVER = 20 + HIPTENSOR_STATUS_IO_ERROR = 21 +end + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + if res == HIPTENSOR_STATUS_ALLOC_FAILED + throw(OutOfGPUMemoryError()) + else + throw(hipTENSORError(res)) + end +end + +@inline function check(f) + retry_if(res) = res in (HIPTENSOR_STATUS_NOT_INITIALIZED, + HIPTENSOR_STATUS_ALLOC_FAILED, + HIPTENSOR_STATUS_INTERNAL_ERROR) + res = retry_reclaim(f, retry_if) + + if res != HIPTENSOR_STATUS_SUCCESS + throw_api_error(res) + end +end + +function hiptensorGetErrorString(error) + @ccall libhiptensor.hiptensorGetErrorString(error::hiptensorStatus_t)::Ptr{Cchar} +end + +mutable struct hiptensorHandle end + +const hiptensorHandle_t = Ptr{hiptensorHandle} + +@checked function hiptensorCreate(handle) + AMDGPU.prepare_state() + @gcsafe_ccall libhiptensor.hiptensorCreate(handle::Ptr{hiptensorHandle_t})::hiptensorStatus_t +end + +@checked function hiptensorDestroy(handle) + @gcsafe_ccall libhiptensor.hiptensorDestroy(handle::hiptensorHandle_t)::hiptensorStatus_t +end + +@checked function hiptensorHandleResizePlanCache(handle, numEntries) + @ccall libhiptensor.hiptensorHandleResizePlanCache(handle::hiptensorHandle_t, + numEntries::UInt32)::hiptensorStatus_t +end + +@checked function hiptensorHandleWritePlanCacheToFile(handle, fileName) + @ccall libhiptensor.hiptensorHandleWritePlanCacheToFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar})::hiptensorStatus_t +end + +@checked function hiptensorHandleReadPlanCacheFromFile(handle, fileName, numCachelinesRead) + @ccall libhiptensor.hiptensorHandleReadPlanCacheFromFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar}, + numCachelinesRead::Ptr{UInt32})::hiptensorStatus_t +end + +@checked function hiptensorWriteKernelCacheToFile(handle, fileName) + @ccall libhiptensor.hiptensorWriteKernelCacheToFile(handle::hiptensorHandle_t, + 
fileName::Ptr{Cchar})::hiptensorStatus_t +end + +@checked function hiptensorReadKernelCacheFromFile(handle, fileName) + @ccall libhiptensor.hiptensorReadKernelCacheFromFile(handle::hiptensorHandle_t, + fileName::Ptr{Cchar})::hiptensorStatus_t +end + +mutable struct hiptensorTensorDescriptor end + +const hiptensorTensorDescriptor_t = Ptr{hiptensorTensorDescriptor} + +@cenum hiptensorDataType_t::UInt32 begin + HIPTENSOR_R_32F = 0 + HIPTENSOR_R_64F = 1 + HIPTENSOR_R_16F = 2 + HIPTENSOR_R_8I = 3 + HIPTENSOR_C_32F = 4 + HIPTENSOR_C_64F = 5 + HIPTENSOR_C_16F = 6 + HIPTENSOR_C_8I = 7 + HIPTENSOR_R_8U = 8 + HIPTENSOR_C_8U = 9 + HIPTENSOR_R_32I = 10 + HIPTENSOR_C_32I = 11 + HIPTENSOR_R_32U = 12 + HIPTENSOR_C_32U = 13 + HIPTENSOR_R_16BF = 14 + HIPTENSOR_C_16BF = 15 + HIPTENSOR_R_4I = 16 + HIPTENSOR_C_4I = 17 + HIPTENSOR_R_4U = 18 + HIPTENSOR_C_4U = 19 + HIPTENSOR_R_16I = 20 + HIPTENSOR_C_16I = 21 + HIPTENSOR_R_16U = 22 + HIPTENSOR_C_16U = 23 + HIPTENSOR_R_64I = 24 + HIPTENSOR_C_64I = 25 + HIPTENSOR_R_64U = 26 + HIPTENSOR_C_64U = 27 +end + +function hiptensorCreateTensorDescriptor(handle, desc, numModes, lens, strides, dataType, + alignmentRequirement) + @debug_ccall libhiptensor.hiptensorCreateTensorDescriptor(handle::hiptensorHandle_t, + desc::Ptr{hiptensorTensorDescriptor_t}, + numModes::UInt32, lens::Ptr{Int64}, + strides::Ptr{Int64}, + dataType::hiptensorDataType_t, + alignmentRequirement::UInt32)::hiptensorStatus_t +end + +@checked function hiptensorDestroyTensorDescriptor(desc) + @ccall libhiptensor.hiptensorDestroyTensorDescriptor(desc::hiptensorTensorDescriptor_t)::hiptensorStatus_t +end + +mutable struct hiptensorOperationDescriptor end + +const hiptensorOperationDescriptor_t = Ptr{hiptensorOperationDescriptor} + +@cenum hiptensorOperator_t::UInt32 begin + HIPTENSOR_OP_IDENTITY = 1 + HIPTENSOR_OP_SQRT = 2 + HIPTENSOR_OP_RELU = 8 + HIPTENSOR_OP_CONJ = 9 + HIPTENSOR_OP_RCP = 10 + HIPTENSOR_OP_SIGMOID = 11 + HIPTENSOR_OP_TANH = 12 + HIPTENSOR_OP_EXP = 22 + 
HIPTENSOR_OP_LOG = 23 + HIPTENSOR_OP_ABS = 24 + HIPTENSOR_OP_NEG = 25 + HIPTENSOR_OP_SIN = 26 + HIPTENSOR_OP_COS = 27 + HIPTENSOR_OP_TAN = 28 + HIPTENSOR_OP_SINH = 29 + HIPTENSOR_OP_COSH = 30 + HIPTENSOR_OP_ASIN = 31 + HIPTENSOR_OP_ACOS = 32 + HIPTENSOR_OP_ATAN = 33 + HIPTENSOR_OP_ASINH = 34 + HIPTENSOR_OP_ACOSH = 35 + HIPTENSOR_OP_ATANH = 36 + HIPTENSOR_OP_CEIL = 37 + HIPTENSOR_OP_FLOOR = 38 + HIPTENSOR_OP_ADD = 3 + HIPTENSOR_OP_MUL = 5 + HIPTENSOR_OP_MAX = 6 + HIPTENSOR_OP_MIN = 7 + HIPTENSOR_OP_UNKNOWN = 126 +end + +@cenum hiptensorComputeDescriptor_t::UInt32 begin + HIPTENSOR_COMPUTE_DESC_32F = 4 + HIPTENSOR_COMPUTE_DESC_64F = 16 + HIPTENSOR_COMPUTE_DESC_16F = 1 + HIPTENSOR_COMPUTE_DESC_16BF = 1024 + HIPTENSOR_COMPUTE_DESC_C32F = 2048 + HIPTENSOR_COMPUTE_DESC_C64F = 4096 + HIPTENSOR_COMPUTE_DESC_NONE = 0 + HIPTENSOR_COMPUTE_DESC_8U = 64 + HIPTENSOR_COMPUTE_DESC_8I = 256 + HIPTENSOR_COMPUTE_DESC_32U = 128 + HIPTENSOR_COMPUTE_DESC_32I = 512 +end + +@checked function hiptensorCreateContraction(handle, desc, descA, modeA, opA, descB, modeB, opB, + descC, modeC, opC, descD, modeD, descCompute) + @ccall libhiptensor.hiptensorCreateContraction(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + opB::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorDestroyOperationDescriptor(desc) + @ccall libhiptensor.hiptensorDestroyOperationDescriptor(desc::hiptensorOperationDescriptor_t)::hiptensorStatus_t +end + +@cenum hiptensorOperationDescriptorAttribute_t::UInt32 begin + HIPTENSOR_OPERATION_DESCRIPTOR_TAG = 0 + HIPTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE = 1 + HIPTENSOR_OPERATION_DESCRIPTOR_FLOPS = 2 
+ HIPTENSOR_OPERATION_DESCRIPTOR_MOVED_BYTES = 3 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_LEFT = 4 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_RIGHT = 5 + HIPTENSOR_OPERATION_DESCRIPTOR_PADDING_VALUE = 6 +end + +@checked function hiptensorOperationDescriptorSetAttribute(handle, desc, attr, buf, sizeInBytes) + @ccall libhiptensor.hiptensorOperationDescriptorSetAttribute(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + attr::hiptensorOperationDescriptorAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +function hiptensorOperationDescriptorGetAttribute(handle, desc, attr, buf, sizeInBytes) + @debug_ccall libhiptensor.hiptensorOperationDescriptorGetAttribute(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + attr::hiptensorOperationDescriptorAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +mutable struct hiptensorPlanPreference end + +const hiptensorPlanPreference_t = Ptr{hiptensorPlanPreference} + +@cenum hiptensorAlgo_t::Int32 begin + HIPTENSOR_ALGO_ACTOR_CRITIC = -8 + HIPTENSOR_ALGO_DEFAULT = -1 + HIPTENSOR_ALGO_DEFAULT_PATIENT = -6 +end + +@cenum hiptensorJitMode_t::UInt32 begin + HIPTENSOR_JIT_MODE_NONE = 0 + HIPTENSOR_JIT_MODE_DEFAULT = 1 +end + +@checked function hiptensorCreatePlanPreference(handle, pref, algo, jitMode) + @ccall libhiptensor.hiptensorCreatePlanPreference(handle::hiptensorHandle_t, + pref::Ptr{hiptensorPlanPreference_t}, + algo::hiptensorAlgo_t, + jitMode::hiptensorJitMode_t)::hiptensorStatus_t +end + +@checked function hiptensorDestroyPlanPreference(pref) + @ccall libhiptensor.hiptensorDestroyPlanPreference(pref::hiptensorPlanPreference_t)::hiptensorStatus_t +end + +@cenum hiptensorPlanPreferenceAttribute_t::UInt32 begin + HIPTENSOR_PLAN_PREFERENCE_AUTOTUNE_MODE = 0 + HIPTENSOR_PLAN_PREFERENCE_CACHE_MODE = 1 + HIPTENSOR_PLAN_PREFERENCE_INCREMENTAL_COUNT = 2 + HIPTENSOR_PLAN_PREFERENCE_ALGO = 3 + HIPTENSOR_PLAN_PREFERENCE_KERNEL_RANK = 4 + 
HIPTENSOR_PLAN_PREFERENCE_JIT = 5 +end + +@checked function hiptensorPlanPreferenceSetAttribute(handle, pref, attr, buf, sizeInBytes) + @ccall libhiptensor.hiptensorPlanPreferenceSetAttribute(handle::hiptensorHandle_t, + pref::hiptensorPlanPreference_t, + attr::hiptensorPlanPreferenceAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +mutable struct hiptensorPlan end + +const hiptensorPlan_t = Ptr{hiptensorPlan} + +@cenum hiptensorPlanAttribute_t::UInt32 begin + HIPTENSOR_PLAN_REQUIRED_WORKSPACE = 0 +end + +function hiptensorPlanGetAttribute(handle, plan, attr, buf, sizeInBytes) + @debug_ccall libhiptensor.hiptensorPlanGetAttribute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + attr::hiptensorPlanAttribute_t, + buf::Ptr{Cvoid}, + sizeInBytes::Csize_t)::hiptensorStatus_t +end + +@cenum hiptensorWorksizePreference_t::UInt32 begin + HIPTENSOR_WORKSPACE_MIN = 1 + HIPTENSOR_WORKSPACE_DEFAULT = 2 + HIPTENSOR_WORKSPACE_MAX = 3 +end + +@checked function hiptensorEstimateWorkspaceSize(handle, desc, planPref, workspacePref, + workspaceSizeEstimate) + @ccall libhiptensor.hiptensorEstimateWorkspaceSize(handle::hiptensorHandle_t, + desc::hiptensorOperationDescriptor_t, + planPref::hiptensorPlanPreference_t, + workspacePref::hiptensorWorksizePreference_t, + workspaceSizeEstimate::Ptr{UInt64})::hiptensorStatus_t +end + +@checked function hiptensorCreatePermutation(handle, desc, descA, modeA, opA, descB, modeB, + descCompute) + @ccall libhiptensor.hiptensorCreatePermutation(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +function hiptensorCreatePlan(handle, plan, desc, pref, workspaceSizeLimit) + @debug_ccall libhiptensor.hiptensorCreatePlan(handle::hiptensorHandle_t, + plan::Ptr{hiptensorPlan_t}, + 
desc::hiptensorOperationDescriptor_t, + pref::hiptensorPlanPreference_t, + workspaceSizeLimit::UInt64)::hiptensorStatus_t +end + +@checked function hiptensorDestroyPlan(plan) + @ccall libhiptensor.hiptensorDestroyPlan(plan::hiptensorPlan_t)::hiptensorStatus_t +end + +@checked function hiptensorContract(handle, plan, alpha, A, B, beta, C, D, workspace, workspaceSize, + stream) + @ccall libhiptensor.hiptensorContract(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, B::Ptr{Cvoid}, + beta::Ptr{Cvoid}, C::Ptr{Cvoid}, D::Ptr{Cvoid}, + workspace::Ptr{Cvoid}, workspaceSize::UInt64, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorPermute(handle, plan, alpha, A, B, stream) + @ccall libhiptensor.hiptensorPermute(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, B::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorCreateElementwiseBinary(handle, desc, descA, modeA, opA, descC, modeC, + opC, descD, modeD, opAC, descCompute) + @ccall libhiptensor.hiptensorCreateElementwiseBinary(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opAC::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorElementwiseBinaryExecute(handle, plan, alpha, A, gamma, C, D, stream) + @ccall libhiptensor.hiptensorElementwiseBinaryExecute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, + gamma::Ptr{Cvoid}, C::Ptr{Cvoid}, + D::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +@checked function hiptensorCreateElementwiseTrinary(handle, desc, descA, modeA, opA, descB, modeB, + opB, descC, modeC, opC, descD, modeD, opAB, + 
opABC, descCompute) + @ccall libhiptensor.hiptensorCreateElementwiseTrinary(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descB::hiptensorTensorDescriptor_t, + modeB::Ptr{Int32}, + opB::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +@checked function hiptensorElementwiseTrinaryExecute(handle, plan, alpha, A, beta, B, gamma, C, D, + stream) + @ccall libhiptensor.hiptensorElementwiseTrinaryExecute(handle::hiptensorHandle_t, + plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, + beta::Ptr{Cvoid}, B::Ptr{Cvoid}, + gamma::Ptr{Cvoid}, C::Ptr{Cvoid}, + D::Ptr{Cvoid}, + stream::Cint)::hiptensorStatus_t +end + +function hiptensorCreateReduction(handle, desc, descA, modeA, opA, descC, modeC, opC, descD, + modeD, opReduce, descCompute) + @debug_ccall libhiptensor.hiptensorCreateReduction(handle::hiptensorHandle_t, + desc::Ptr{hiptensorOperationDescriptor_t}, + descA::hiptensorTensorDescriptor_t, + modeA::Ptr{Int32}, + opA::hiptensorOperator_t, + descC::hiptensorTensorDescriptor_t, + modeC::Ptr{Int32}, + opC::hiptensorOperator_t, + descD::hiptensorTensorDescriptor_t, + modeD::Ptr{Int32}, + opReduce::hiptensorOperator_t, + descCompute::hiptensorComputeDescriptor_t)::hiptensorStatus_t +end + +function hiptensorReduce(handle, plan, alpha, A, beta, C, D, workspace, workspaceSize, + stream) + @debug_ccall libhiptensor.hiptensorReduce(handle::hiptensorHandle_t, plan::hiptensorPlan_t, + alpha::Ptr{Cvoid}, A::Ptr{Cvoid}, beta::Ptr{Cvoid}, + C::Ptr{Cvoid}, D::Ptr{Cvoid}, workspace::Ptr{Cvoid}, + workspaceSize::UInt64, + stream::Cint)::hiptensorStatus_t +end + +# typedef void ( * hiptensorLoggerCallback_t ) ( int32_t 
logContext , const char * funcName , const char * msg ) +const hiptensorLoggerCallback_t = Ptr{Cvoid} + +function hiptensorLoggerSetCallback(callback) + @ccall libhiptensor.hiptensorLoggerSetCallback(callback::hiptensorLoggerCallback_t)::hiptensorStatus_t +end + +function hiptensorLoggerSetFile(file) + @ccall libhiptensor.hiptensorLoggerSetFile(file::Ptr{Libc.FILE})::hiptensorStatus_t +end + +function hiptensorLoggerOpenFile(logFile) + @ccall libhiptensor.hiptensorLoggerOpenFile(logFile::Ptr{Cchar})::hiptensorStatus_t +end + +@cenum hiptensorLogLevel_t::UInt32 begin + HIPTENSOR_LOG_LEVEL_OFF = 0 + HIPTENSOR_LOG_LEVEL_ERROR = 1 + HIPTENSOR_LOG_LEVEL_PERF_TRACE = 2 + HIPTENSOR_LOG_LEVEL_PERF_HINT = 4 + HIPTENSOR_LOG_LEVEL_HEURISTICS_TRACE = 8 + HIPTENSOR_LOG_LEVEL_API_TRACE = 16 +end + +function hiptensorLoggerSetLevel(level) + @ccall libhiptensor.hiptensorLoggerSetLevel(level::hiptensorLogLevel_t)::hiptensorStatus_t +end + +function hiptensorLoggerSetMask(mask) + @ccall libhiptensor.hiptensorLoggerSetMask(mask::Int32)::hiptensorStatus_t +end + +# no prototype is found for this function at hiptensor.h:578:19, please use with caution +function hiptensorLoggerForceDisable() + @ccall libhiptensor.hiptensorLoggerForceDisable()::hiptensorStatus_t +end + +# no prototype is found for this function at hiptensor.h:583:5, please use with caution +function hiptensorGetHiprtVersion() + @ccall libhiptensor.hiptensorGetHiprtVersion()::Cint +end + +# no prototype is found for this function at hiptensor.h:589:8, please use with caution +function hiptensorGetVersion() + @ccall libhiptensor.hiptensorGetVersion()::Csize_t +end + +@cenum hiptensorAutotuneMode_t::UInt32 begin + HIPTENSOR_AUTOTUNE_MODE_NONE = 0 + HIPTENSOR_AUTOTUNE_MODE_INCREMENTAL = 1 +end + +@cenum hiptensorCacheMode_t::UInt32 begin + HIPTENSOR_CACHE_MODE_NONE = 0 + HIPTENSOR_CACHE_MODE_PEDANTIC = 1 +end + +# Skipping MacroDefinition: HIP_PUBLIC_API __attribute__ ( ( visibility ( "default" ) ) ) + +# Skipping 
MacroDefinition: HIP_INTERNAL_EXPORTED_API __attribute__ ( ( visibility ( "default" ) ) ) + +const __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ = 0 + +const __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ = 0 + +const __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ = 0 + +const __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ = 0 + +const __HIP_ARCH_HAS_DOUBLES__ = 0 + +const __HIP_ARCH_HAS_WARP_VOTE__ = 0 + +const __HIP_ARCH_HAS_WARP_BALLOT__ = 0 + +const __HIP_ARCH_HAS_WARP_SHUFFLE__ = 0 + +const __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ = 0 + +const __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ = 0 + +const __HIP_ARCH_HAS_SYNC_THREAD_EXT__ = 0 + +const __HIP_ARCH_HAS_SURFACE_FUNCS__ = 0 + +const __HIP_ARCH_HAS_3DGRID__ = 0 + +const __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ = 0 + +const HIPTENSOR_MAJOR_VERSION = 2 + +const HIPTENSOR_MINOR_VERSION = 2 + +const HIPTENSOR_PATCH_VERSION = 0 + +const HIP_VERSION_MAJOR = 7 + +const HIP_VERSION_MINOR = 2 + +const HIP_VERSION_PATCH = 26015 + +const HIP_VERSION_GITHASH = "fc0010cf6a" + +const HIP_VERSION_BUILD_ID = 0 + +const HIP_VERSION_BUILD_NAME = "" + +const HIP_VERSION = HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + + HIP_VERSION_PATCH + +const __HIP_HAS_GET_PCH = 1 + diff --git a/src/tensor/operations.jl b/src/tensor/operations.jl new file mode 100644 index 000000000..b3013639d --- /dev/null +++ b/src/tensor/operations.jl @@ -0,0 +1,429 @@ +const ModeType = AbstractVector{<:Union{Char, Integer}} + +# remove the HIPTENSOR_ prefix from some common enums, +# as they're namespaced to the hipTENSOR module anyway. 
+@enum_without_prefix hiptensorOperator_t HIPTENSOR_ +@enum_without_prefix hiptensorWorksizePreference_t HIPTENSOR_ +@enum_without_prefix hiptensorAlgo_t HIPTENSOR_ +@enum_without_prefix hiptensorJitMode_t HIPTENSOR_ + +is_unary(op::hiptensorOperator_t) = (op ∈ (OP_IDENTITY, OP_SQRT, OP_RELU, OP_CONJ, OP_RCP)) +is_binary(op::hiptensorOperator_t) = (op ∈ (OP_ADD, OP_MUL, OP_MAX, OP_MIN)) + +function elementwise_trinary_execute!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(beta::Number), + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(gamma::Number), + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + @nospecialize(D::AbstractArray), Dinds::ModeType, opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t; + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + + actual_plan = if plan === nothing + plan_elementwise_trinary(A, Ainds, opA, + B, Binds, opB, + C, Cinds, opC, + D, Dinds, opAB, opABC; + workspace, algo, compute_type) + else + plan + end + + elementwise_trinary_execute!(actual_plan, alpha, A, beta, B, gamma, C, D) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return D +end + +function elementwise_trinary_execute!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + @nospecialize(beta::Number), + @nospecialize(B::AbstractArray), + @nospecialize(gamma::Number), + @nospecialize(C::AbstractArray), + @nospecialize(D::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorElementwiseTrinaryExecute(handle(), plan, + Ref{scalar_type}(alpha), A, + Ref{scalar_type}(beta), B, + Ref{scalar_type}(gamma), C, D, + stream().stream) + return D +end + +function plan_elementwise_trinary( 
+ @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + @nospecialize(D::AbstractArray), Dinds::ModeType, opAB::hiptensorOperator_t, + opABC::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + !is_unary(opA) && throw(ArgumentError("opA must be a unary op!")) + !is_unary(opB) && throw(ArgumentError("opB must be a unary op!")) + !is_unary(opC) && throw(ArgumentError("opC must be a unary op!")) + !is_binary(opAB) && throw(ArgumentError("opAB must be a binary op!")) + !is_binary(opABC) && throw(ArgumentError("opABC must be a binary op!")) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + descC = hipTensorDescriptor(C) + @assert size(C) == size(D) && strides(C) == strides(D) + descD = descC # must currently be identical + modeA = collect(Cint, Ainds) + modeB = collect(Cint, Binds) + modeC = collect(Cint, Cinds) + modeD = modeC + + actual_compute_type = if compute_type === nothing + elementwise_trinary_compute_types[(eltype(A), eltype(B), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateElementwiseTrinary(handle(), + desc, + descA, modeA, opA, + descB, modeB, opB, + descC, modeC, opC, + descD, modeD, + opAB, opABC, + actual_compute_type) + + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function elementwise_binary_execute!( + @nospecialize(alpha::Number), + 
@nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(gamma::Number),
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    @nospecialize(D::AbstractArray), Dinds::ModeType, opAC::hiptensorOperator_t;
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing,
    plan::Union{hipTensorPlan, Nothing}=nothing)
    # Reuse the caller's plan when given; otherwise build (and later free) one.
    actual_plan = if plan === nothing
        plan_elementwise_binary(A, Ainds, opA,
                                C, Cinds, opC,
                                D, Dinds, opAC;
                                workspace, algo, compute_type)
    else
        plan
    end
    elementwise_binary_execute!(actual_plan, alpha, A, gamma, C, D)
    if plan === nothing
        AMDGPU.unsafe_free!(actual_plan)
    end

    return D
end

# Plan-based variant: execute a previously created element-wise binary plan.
function elementwise_binary_execute!(plan::hipTensorPlan,
                                     @nospecialize(alpha::Number),
                                     @nospecialize(A::AbstractArray),
                                     @nospecialize(gamma::Number),
                                     @nospecialize(C::AbstractArray),
                                     @nospecialize(D::AbstractArray))
    # Scalars must be passed with the scalar type recorded in the plan.
    scalar_type = plan.scalar_type
    hiptensorElementwiseBinaryExecute(handle(), plan,
                                      Ref{scalar_type}(alpha), A,
                                      Ref{scalar_type}(gamma), C, D,
                                      stream().stream)
    return D
end

"""
    plan_elementwise_binary(A, Ainds, opA, C, Cinds, opC, D, Dinds, opAC; kwargs...)

Create a `hipTensorPlan` for an element-wise binary operation combining `A`
and `C` into `D`.

When `compute_type` is `nothing` (the default), a supported compute type is
looked up from `elementwise_binary_compute_types` based on `eltype(A)` and
`eltype(C)` — consistent with the other `plan_*` constructors. (The previous
default of `eltype(C)` bypassed that lookup table entirely.)
"""
function plan_elementwise_binary(
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    @nospecialize(D::AbstractArray), Dinds::ModeType, opAC::hiptensorOperator_t;
    jit::hiptensorJitMode_t=JIT_MODE_NONE,
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing)
    !is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
    !is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
    !is_binary(opAC) && throw(ArgumentError("opAC must be a binary op!"))
    descA = hipTensorDescriptor(A)
    descC = hipTensorDescriptor(C)
    @assert
size(C) == size(D) && strides(C) == strides(D) + descD = descC # must currently be identical + modeA = collect(Cint, Ainds) + modeC = collect(Cint, Cinds) + modeD = modeC + + actual_compute_type = if compute_type === nothing + elementwise_binary_compute_types[(eltype(A), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateElementwiseBinary(handle(), + desc, + descA, modeA, opA, + descC, modeC, opC, + descD, modeD, + opAC, + actual_compute_type) + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function permute!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType; + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + actual_plan = if plan === nothing + plan_permutation(A, Ainds, opA, + B, Binds; + workspace, algo, compute_type) + else + plan + end + permute!(actual_plan, alpha, A, B) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return B +end + +function permute!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + @nospecialize(B::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorPermute(handle(), plan, + Ref{scalar_type}(alpha), A, B, + stream().stream) + return B +end + +function plan_permutation( + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + 
workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + + modeA = collect(Cint, Ainds) + modeB = collect(Cint, Binds) + + actual_compute_type = if compute_type === nothing + permutation_compute_types[(eltype(A), eltype(B))] + else + compute_type + end + compute_desc = Base.cconvert(hiptensorComputeDescriptor_t, actual_compute_type) + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreatePermutation(handle(), desc, + descA, modeA, opA, + descB, modeB, + compute_desc) + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end + +function contract!( + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(beta::Number), + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + opOut::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing, + plan::Union{hipTensorPlan, Nothing}=nothing) + actual_plan = if plan === nothing + plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut; + jit, workspace, algo, compute_type) + else + plan + end + + contract!(actual_plan, alpha, A, B, beta, C) + + if plan === nothing + AMDGPU.unsafe_free!(actual_plan) + end + + return C +end + +function contract!(plan::hipTensorPlan, + @nospecialize(alpha::Number), + @nospecialize(A::AbstractArray), + 
@nospecialize(B::AbstractArray), + @nospecialize(beta::Number), + @nospecialize(C::AbstractArray)) + scalar_type = plan.scalar_type + hiptensorContract(handle(), plan, + Ref{scalar_type}(alpha), A, B, + Ref{scalar_type}(beta), C, C, + plan.workspace, sizeof(plan.workspace), stream().stream) + return C +end + +function plan_contraction( + @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t, + @nospecialize(B::AbstractArray), Binds::ModeType, opB::hiptensorOperator_t, + @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t, + opOut::hiptensorOperator_t; + jit::hiptensorJitMode_t=JIT_MODE_NONE, + workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT, + algo::hiptensorAlgo_t=ALGO_DEFAULT, + compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing) + !is_unary(opA) && throw(ArgumentError("opA must be a unary op!")) + !is_unary(opB) && throw(ArgumentError("opB must be a unary op!")) + !is_unary(opC) && throw(ArgumentError("opC must be a unary op!")) + !is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!")) + descA = hipTensorDescriptor(A) + descB = hipTensorDescriptor(B) + descC = hipTensorDescriptor(C) + # for now, D must be identical to C (and thus, descD must be identical to descC) + modeA = collect(Cint, Ainds) + length(modeA) == ndims(A) || throw(ArgumentError("Ainds must match number of dimensions in A!")) + modeB = collect(Cint, Binds) + length(modeB) == ndims(B) || throw(ArgumentError("Binds must match number of dimensions in B!")) + modeC = collect(Cint, Cinds) + length(modeC) == ndims(C) || throw(ArgumentError("Cinds must match number of dimensions in C!")) + + actual_compute_type = if compute_type === nothing + contraction_compute_types[(eltype(A), eltype(B), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateContraction(handle(), + desc, + descA, modeA, opA, + descB, modeB, opB, + descC, modeC, opC, + descC, modeC, + 
actual_compute_type)

    plan_pref = Ref{hiptensorPlanPreference_t}()
    hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit)

    plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
    # The descriptor and preference are no longer needed once the plan exists.
    hiptensorDestroyOperationDescriptor(desc[])
    hiptensorDestroyPlanPreference(plan_pref[])
    return plan
end

"""
    reduce!(alpha, A, Ainds, opA, beta, C, Cinds, opC, opReduce; kwargs...)

Execute a tensor reduction of `A` into `C` using the reduction operator
`opReduce`, with unary operators `opA`/`opC` and scaling factors
`alpha`/`beta`.

A `plan` keyword may supply a previously created `hipTensorPlan`; otherwise a
temporary plan is created (honoring `workspace`, `algo` and `compute_type`)
and freed afterwards. Returns `C`.
"""
function reduce!(
    @nospecialize(alpha::Number),
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(beta::Number),
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    opReduce::hiptensorOperator_t;
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing,
    plan::Union{hipTensorPlan, Nothing}=nothing)
    actual_plan = if plan === nothing
        plan_reduction(A, Ainds, opA, C, Cinds, opC, opReduce; workspace, algo, compute_type)
    else
        plan
    end

    reduce!(actual_plan, alpha, A, beta, C)

    if plan === nothing
        AMDGPU.unsafe_free!(actual_plan)
    end

    return C
end

# Plan-based variant: execute a previously created reduction plan.
function reduce!(plan::hipTensorPlan,
                 @nospecialize(alpha::Number),
                 @nospecialize(A::AbstractArray),
                 @nospecialize(beta::Number),
                 @nospecialize(C::AbstractArray))
    scalar_type = plan.scalar_type
    # Execute on the current task's stream, consistent with `contract!` and
    # the element-wise executors. (Previously this passed `Cint(0)`, i.e. the
    # default stream, which could be mis-ordered relative to work queued on
    # the task-local stream.)
    hiptensorReduce(handle(), plan,
                    Ref{scalar_type}(alpha), A,
                    Ref{scalar_type}(beta), C, C,
                    plan.workspace, sizeof(plan.workspace), stream().stream)
    return C
end

function plan_reduction(
    @nospecialize(A::AbstractArray), Ainds::ModeType, opA::hiptensorOperator_t,
    @nospecialize(C::AbstractArray), Cinds::ModeType, opC::hiptensorOperator_t,
    opReduce::hiptensorOperator_t;
    jit::hiptensorJitMode_t=JIT_MODE_NONE,
    workspace::hiptensorWorksizePreference_t=WORKSPACE_DEFAULT,
    algo::hiptensorAlgo_t=ALGO_DEFAULT,
    compute_type::Union{DataType, hiptensorComputeDescriptorEnum, Nothing}=nothing)
    !is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
    !is_unary(opC) &&
throw(ArgumentError("opC must be a unary op!")) + !is_binary(opReduce) && throw(ArgumentError("opReduce must be a binary op!")) + descA = hipTensorDescriptor(A) + descC = hipTensorDescriptor(C) + # for now, D must be identical to C (and thus, descD must be identical to descC) + modeA = collect(Cint, Ainds) + modeC = collect(Cint, Cinds) + + actual_compute_type = if compute_type === nothing + reduction_compute_types[(eltype(A), eltype(C))] + else + compute_type + end + + desc = Ref{hiptensorOperationDescriptor_t}() + hiptensorCreateReduction(handle(), + desc, + descA, modeA, opA, + descC, modeC, opC, + descC, modeC, opReduce, + actual_compute_type) + + plan_pref = Ref{hiptensorPlanPreference_t}() + hiptensorCreatePlanPreference(handle(), plan_pref, algo, jit) + + plan = hipTensorPlan(desc[], plan_pref[]; workspacePref=workspace) + hiptensorDestroyOperationDescriptor(desc[]) + hiptensorDestroyPlanPreference(plan_pref[]) + return plan +end diff --git a/src/tensor/types.jl b/src/tensor/types.jl new file mode 100644 index 000000000..c8ae7edb6 --- /dev/null +++ b/src/tensor/types.jl @@ -0,0 +1,290 @@ +## data types + +@enum hiptensorComputeDescriptorEnum begin + COMPUTE_DESC_32F = 4 + COMPUTE_DESC_64F = 16 + COMPUTE_DESC_16F = 1 + COMPUTE_DESC_16BF = 1024 + COMPUTE_DESC_C32F = 2048 + COMPUTE_DESC_C64F = 4096 + COMPUTE_DESC_NONE = 0 + COMPUTE_DESC_8U = 64 + COMPUTE_DESC_8I = 256 + COMPUTE_DESC_32U = 128 + COMPUTE_DESC_32I = 512 +end + +const contraction_compute_types = Dict( + # typeA, typeB, typeC => typeCompute + (Float16, Float16, Float16) => Float32, + (Float32, Float32, Float32) => Float32, + (Float64, Float64, Float64) => Float64, + (ComplexF32, ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64, ComplexF64) => Float64, + (Float64, ComplexF64, ComplexF64) => Float64, + (ComplexF64, Float64, ComplexF64) => Float64) + +const elementwise_trinary_compute_types = Dict( + # typeA, typeB, typeC => typeCompute + (Float16, Float16, Float16) => Float16, + (Float32, 
Float32, Float32) => Float32, + (Float64, Float64, Float64) => Float64, + (ComplexF32, ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64, ComplexF64) => Float64, + (Float32, Float32, Float16) => Float32, + # (Float64, Float64, Float32) => Float32, + (ComplexF64, ComplexF64, ComplexF32) => Float64) + +const elementwise_binary_compute_types = Dict( + # typeA, typeC => typeCompute + (Float16, Float16) => Float16, + (Float16, Float32) => Float32, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64, + (ComplexF64, ComplexF32) => Float64, + (Float32, Float16) => Float32, + (Float64, Float32) => Float64) + +const permutation_compute_types = Dict( + # typeA, typeB => typeCompute + (Float16, Float16) => Float16, + (Float16, Float32) => Float32, + # (Float32, Float16) => Float32, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (Float32, Float64) => Float64, + # (Float64, Float32) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64, + (ComplexF32, ComplexF64) => Float64, + # (ComplexF64, ComplexF32) => Float64 + ) + +const reduction_compute_types = Dict( + # typeA, typeC => typeCompute + (Float16, Float16) => Float16, + (Float32, Float32) => Float32, + (Float64, Float64) => Float64, + (ComplexF32, ComplexF32) => Float32, + (ComplexF64, ComplexF64) => Float64) + +# we have our own enum to represent hiptensorComputeDescriptor_t values +function Base.convert(::Type{hiptensorComputeDescriptorEnum}, T::DataType) + if T == Float16 + return COMPUTE_DESC_16F + elseif T == Float32 + return COMPUTE_DESC_32F + elseif T == ComplexF32 + return COMPUTE_DESC_C32F + elseif T == Float64 + return COMPUTE_DESC_64F + elseif T == ComplexF64 + return COMPUTE_DESC_C64F + elseif T == Int8 + return COMPUTE_DESC_8I + elseif T == UInt8 + return COMPUTE_DESC_8U + elseif T == Int32 + return COMPUTE_DESC_32I + elseif T == UInt32 + return 
COMPUTE_DESC_32U + else + throw(ArgumentError("hiptensorComputeDescriptor equivalent for input type $T does not exist!")) + end +end +Base.cconvert(::Type{hiptensorComputeDescriptor_t}, T::DataType) = + Base.cconvert(hiptensorComputeDescriptor_t, convert(hiptensorComputeDescriptorEnum, T)) + +function Base.cconvert(::Type{hiptensorComputeDescriptor_t}, T::hiptensorComputeDescriptorEnum) + if T == COMPUTE_DESC_16F + return HIPTENSOR_COMPUTE_DESC_16F + elseif T == COMPUTE_DESC_32F + return HIPTENSOR_COMPUTE_DESC_32F + elseif T == COMPUTE_DESC_64F + return HIPTENSOR_COMPUTE_DESC_64F + elseif T == COMPUTE_DESC_16BF + return HIPTENSOR_COMPUTE_DESC_16BF + elseif T == COMPUTE_DESC_C32F + return HIPTENSOR_COMPUTE_DESC_C32F + elseif T == COMPUTE_DESC_C64F + return HIPTENSOR_COMPUTE_DESC_C64F + elseif T == COMPUTE_DESC_8U + return HIPTENSOR_COMPUTE_DESC_8U + elseif T == COMPUTE_DESC_8I + return HIPTENSOR_COMPUTE_DESC_8I + elseif T == COMPUTE_DESC_32U + return HIPTENSOR_COMPUTE_DESC_32U + elseif T == COMPUTE_DESC_32I + return HIPTENSOR_COMPUTE_DESC_32I + else + throw(ArgumentError("hiptensorComputeDescriptor equivalent for input enum value $T does not exist!")) + end +end + + +function Base.convert(::Type{hiptensorDataType_t}, T::DataType) + if T == Float16 + return HIPTENSOR_R_16F + elseif T == ComplexF16 + return HIPTENSOR_C_16F + elseif T == Float32 + return HIPTENSOR_R_32F + elseif T == ComplexF32 + return HIPTENSOR_C_32F + elseif T == Float64 + return HIPTENSOR_R_64F + elseif T == ComplexF64 + return HIPTENSOR_C_64F + elseif T == Int8 + return HIPTENSOR_R_8I + elseif T == Int32 + return HIPTENSOR_R_32I + elseif T == UInt8 + return HIPTENSOR_R_8U + elseif T == UInt32 + return HIPTENSOR_R_32U + else + throw(ArgumentError("hiptensorDataType equivalent for input type $T does not exist!")) + end +end + +function Base.convert(::DataType, T::hiptensorDataType_t) + if T == HIPTENSOR_R_16F + return Float16 + elseif T == HIPTENSOR_R_32F + return Float32 + elseif T == 
HIPTENSOR_C_32F
        return ComplexF32
    elseif T == HIPTENSOR_R_64F
        return Float64
    elseif T == HIPTENSOR_C_64F
        return ComplexF64
    else
        throw(ArgumentError("Data type equivalent for hiptensorDataType type $T does not exist!"))
    end
end


## plan

# Wraps a hiptensorPlan_t together with its workspace buffer and the scalar
# type that alpha/beta/gamma must be passed as when executing the plan.
mutable struct hipTensorPlan
    handle::hiptensorPlan_t
    workspace::ROCVector{UInt8,Mem.HIPBuffer}
    scalar_type::DataType

    function hipTensorPlan(desc, pref; workspacePref=HIPTENSOR_WORKSPACE_DEFAULT)
        # Estimate the workspace size for this operation/preference pair.
        workspaceSizeEstimate = Ref{UInt64}(0)
        hiptensorEstimateWorkspaceSize(handle(), desc, pref, workspacePref, workspaceSizeEstimate)

        # Query the scalar type the operation expects for its scaling factors.
        required_scalar_type = Ref{hiptensorDataType_t}()
        hiptensorOperationDescriptorGetAttribute(handle(), desc,
            HIPTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
            required_scalar_type, sizeof(required_scalar_type))
        # (A leftover `@show` debug statement was removed here.)

        # Create the plan.
        plan_ref = Ref{hiptensorPlan_t}()
        hiptensorCreatePlan(handle(), plan_ref, desc, pref, workspaceSizeEstimate[])

        # Allocate the workspace the plan actually requires, which may differ
        # from the estimate above.
        actualWorkspaceSize = Ref{UInt64}(0)
        hiptensorPlanGetAttribute(handle(), plan_ref[], HIPTENSOR_PLAN_REQUIRED_WORKSPACE,
                                  actualWorkspaceSize, sizeof(actualWorkspaceSize))
        workspace = ROCVector{UInt8}(undef, actualWorkspaceSize[])

        # `scalar_type::DataType` is populated via the `convert` method above.
        obj = new(plan_ref[], workspace, required_scalar_type[])
        finalizer(AMDGPU.unsafe_free!, obj)
        return obj
    end
end

Base.show(io::IO, plan::hipTensorPlan) = @printf(io, "hipTensorPlan(%p)", plan.handle)

Base.unsafe_convert(::Type{hiptensorPlan_t}, plan::hipTensorPlan) = plan.handle

Base.:(==)(a::hipTensorPlan, b::hipTensorPlan) = a.handle == b.handle
Base.hash(plan::hipTensorPlan, h::UInt) = hash(plan.handle, h)

# destroying the plan
unsafe_destroy!(plan::hipTensorPlan) = hiptensorDestroyPlan(plan)

# freeing the plan and associated workspace
function AMDGPU.unsafe_free!(plan::hipTensorPlan)
    if plan.workspace != C_NULL
AMDGPU.unsafe_free!(plan.workspace) + end + if plan.handle != C_NULL + unsafe_destroy!(plan) + plan.handle = C_NULL + end +end + + +const HIPTENSOR_ALIGNMENT = UInt32(1) + +## descriptor + +mutable struct hipTensorDescriptor + handle::hiptensorTensorDescriptor_t + # inner constructor handles creation and finalizer of the descriptor + function hipTensorDescriptor(sz::Vector{Int64}, st::Vector{Int64}, eltype::DataType, + alignmentRequirement::UInt32=HIPTENSOR_ALIGNMENT) + desc = Ref{hiptensorTensorDescriptor_t}(C_NULL) + length(st) == (N = length(sz)) || throw(ArgumentError("size and stride vectors must have the same length")) + T = convert(hiptensorDataType_t, eltype) + hiptensorCreateTensorDescriptor(handle(), desc, UInt32(N), sz, st, T, alignmentRequirement) + obj = new(desc[]) + finalizer(unsafe_destroy!, obj) + return obj + end +end + +# outer constructor restricted to DenseROCArray, but could be extended +function hipTensorDescriptor(a::DenseROCArray; size=size(a), strides=strides(a), eltype=eltype(a)) + sz = collect(Int64, size) + st = collect(Int64, strides) + return hipTensorDescriptor(sz, st, eltype) +end + +Base.show(io::IO, desc::hipTensorDescriptor) = @printf(io, "hipTensorDescriptor(%p)", desc.handle) + +Base.unsafe_convert(::Type{hiptensorTensorDescriptor_t}, obj::hipTensorDescriptor) = obj.handle + +unsafe_destroy!(obj::hipTensorDescriptor) = hiptensorDestroyTensorDescriptor(obj) + + +## tensor + +export hipTensor + +mutable struct hipTensor{T, N} + data::ROCArray{T, N} + inds::Vector{Int32} + + function hipTensor{T, N}(data::ROCArray{T,N}, inds::Vector) where {T<:Number, N} + if length(inds) != N + throw(ArgumentError("The number of indices must match the number of dimensions of the data.")) + end + if !iszero(UInt(pointer(data)) % HIPTENSOR_ALIGNMENT) + @warn "The data for this hipTensor does not obey the hipTENSOR alignment requirement of $HIPTENSOR_ALIGNMENT. An explicit copy will be made to ensure the requirement is met." 
+ return new(copy(data), inds) + else + return new(data, inds) + end + end +end + +hipTensor(data::ROCArray{T,N}, inds::Vector) where {T<:Number, N} = + hipTensor{T,N}(data, inds) + +# array interface +Base.size(T::hipTensor) = size(T.data) +Base.size(T::hipTensor, i) = size(T.data, i) +Base.length(T::hipTensor) = length(T.data) +Base.ndims(T::hipTensor) = length(T.inds) +Base.strides(T::hipTensor) = strides(T.data) +Base.eltype(T::hipTensor) = eltype(T.data) +Base.similar(T::hipTensor{Tv, N}) where {Tv, N} = hipTensor{Tv, N}(similar(T.data), copy(T.inds)) +Base.copy(T::hipTensor{Tv, N}) where {Tv, N} = hipTensor{Tv, N}(copy(T.data), copy(T.inds)) +Base.collect(T::hipTensor) = (collect(T.data), T.inds) diff --git a/src/tensor/utils.jl b/src/tensor/utils.jl new file mode 100644 index 000000000..e69de29bb diff --git a/src/utils.jl b/src/utils.jl index fea65b01c..205f6a9e4 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -12,6 +12,7 @@ function versioninfo() _status(functional(:rocsparse)) "rocSPARSE" _ver(:rocsparse, rocSPARSE.version) _libpath(librocsparse); _status(functional(:rocrand)) "rocRAND" _ver(:rocrand, rocRAND.version) _libpath(librocrand); _status(functional(:rocfft)) "rocFFT" _ver(:rocfft, rocFFT.version) _libpath(librocfft); + _status(functional(:hiptensor)) "hipTENSOR" _ver(:hiptensor, hipTENSOR.version) _libpath(libhiptensor); _status(functional(:MIOpen)) "MIOpen" _ver(:MIOpen, MIOpen.version) _libpath(libMIOpen_path); ] @@ -64,6 +65,7 @@ function correctly. 
Available `component` values are: - `:rocsparse` - Queries rocSPARSE library availability - `:rocrand` - Queries rocRAND library availability - `:rocfft` - Queries rocFFT library availability +- `:hiptensor` - Queries hipTENSOR library availability - `:MIOpen` - Queries MIOpen library availability - `:all` - Queries all above components @@ -86,6 +88,8 @@ function functional(component::Symbol) return !isempty(librocrand) elseif component == :rocfft return !isempty(librocfft) + elseif component == :hiptensor + return !isempty(libhiptensor) elseif component == :MIOpen return !isempty(libMIOpen_path) elseif component == :all diff --git a/test/hiptensor/base.jl b/test/hiptensor/base.jl new file mode 100644 index 000000000..072bf8481 --- /dev/null +++ b/test/hiptensor/base.jl @@ -0,0 +1,29 @@ +@testset "base" begin + +using LinearAlgebra, Random, AMDGPU + +if AMDGPU.hipTENSOR.has_hiptensor() + @test AMDGPU.hipTENSOR.version() isa VersionNumber + + @testset "type basics" begin + N = 2 + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + dimsA = dims + A = rand(Float64, dimsA...) 
+ dA = ROCArray(A) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + ctA = AMDGPU.hipTENSOR.hipTensor(dA, indsA) + @test length(ctA) == length(A) + @test size(ctA) == size(A) + @test size(ctA, 1) == size(A, 1) + @test ndims(ctA) == ndims(A) + @test strides(ctA) == strides(A) + @test eltype(ctA) == eltype(A) + end +end + +end diff --git a/test/hiptensor/contractions.jl b/test/hiptensor/contractions.jl new file mode 100644 index 000000000..4b617eea2 --- /dev/null +++ b/test/hiptensor/contractions.jl @@ -0,0 +1,207 @@ +using Test, AMDGPU +using LinearAlgebra +using AMDGPU.hipTENSOR: contract!, plan_contraction, hipTensor + +if AMDGPU.hipTENSOR.has_hiptensor() + +@testset "contractions" verbose = true begin + +AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) +AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("contract.log") + + +eltypes = [(Float32, Float32, Float32, Float32), + (Float32, Float32, Float32, Float16), + (Float16, Float16, Float16, Float32), + (ComplexF32, ComplexF32, ComplexF32, ComplexF32), + (Float64, Float64, Float64, Float64), + (Float64, Float64, Float64, Float32), + (ComplexF64, ComplexF64, ComplexF64, ComplexF64), + ] + +@testset for NoA=1:2, NoB=1:2, Nc=1:2 + @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes + @show eltyA, eltyB, eltyC, NoA, NoB, Nc + flush(stdout) + # setup + #dmax = 2^div(12, max(NoA+Nc, NoB+Nc, NoA+NoB)) + dmax = 4 + dimsoA = rand(2:dmax, NoA) + loA = prod(dimsoA) + dimsoB = rand(2:dmax, NoB) + loB = prod(dimsoB) + dimsc = rand(2:dmax, Nc) + lc = prod(dimsc) + allinds = collect('a':'z') + indsoA = allinds[1:NoA] + indsoB = allinds[NoA .+ (1:NoB)] + indsc = allinds[NoA .+ NoB .+ (1:Nc)] + pA = randperm(NoA + Nc) + ipA = invperm(pA) + pB = randperm(Nc + NoB) + ipB = invperm(pB) + pC = randperm(NoA + NoB) + ipC = invperm(pC) + compute_rtol = (eltyCompute == Float16 || eltyC == Float16) ? 1e-2 : (eltyCompute == Float32 ? 
1e-4 : 1e-6) + dimsA = [dimsoA; dimsc][pA] + indsA = [indsoA; indsc][pA] + dimsB = [dimsc; dimsoB][pB] + indsB = [indsc; indsoB][pB] + dimsC = [dimsoA; dimsoB][pC] + indsC = [indsoA; indsoB][pC] + + A = rand(eltyA, (dimsA...,)) + mA = reshape(permutedims(A, ipA), (loA, lc)) + B = rand(eltyB, (dimsB...,)) + mB = reshape(permutedims(B, ipB), (lc, loB)) + C = zeros(eltyC, (dimsC...,)) + dA = ROCArray(A) + dB = ROCArray(B) + dC = ROCArray(C) + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + + @testset "simple case" begin + dC = contract!(1, dA, indsA, opA, dB, indsB, opB, 0, dC, indsC, opC, opOut, compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "simple case with plan storage" begin + plan = hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut) + dC = contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB + AMDGPU.synchronize() + end + + @testset "simple case with plan storage and compute type" begin + eltypComputeEnum = convert(hipTENSOR.hiptensorComputeDescriptorEnum, eltyCompute) + plan = hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut; compute_type=eltypComputeEnum) + dC = hipTENSOR.contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "simple case with plan storage and JIT compilation" begin + plan = AMDGPU.hipTENSOR.plan_contraction(dA, indsA, opA, dB, indsB, opB, dC, indsC, opC, opOut; jit=hipTENSOR.JIT_MODE_DEFAULT) + dC = contract!(plan, 1, dA, dB, 0, dC) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA * mB + AMDGPU.synchronize() + end + + @testset "with 
non-trivial α" begin + α = rand(eltyCompute) + dC = contract!(α, dA, indsA, opA, dB, indsB, opB, zero(eltyCompute), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ α * mA * mB rtol=compute_rtol + AMDGPU.synchronize() + end + + @testset "with non-trivial β" begin + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + α = rand(eltyCompute) + β = rand(eltyCompute) + copyto!(dC, C) + dD = contract!(α, dA, indsA, opA, dB, indsB, opB, β, dC, indsC, opC, opOut; compute_type=eltyCompute) + D = collect(dD) + mC = reshape(permutedims(C, ipC), (loA, loB)) + mD = reshape(permutedims(D, ipC), (loA, loB)) + @test mD ≈ α * mA * mB + β * mC rtol=compute_rtol + AMDGPU.synchronize() + end + + if eltyCompute != Float32 && eltyC != Float16 + @testset "with hipTensor objects" begin + ctA = hipTensor(dA, indsA) + ctB = hipTensor(dB, indsB) + ctC = hipTensor(dC, indsC) + ctC = LinearAlgebra.mul!(ctC, ctA, ctB) + C2, C2inds = collect(ctC) + mC = reshape(permutedims(C2, ipC), (loA, loB)) + @test mC ≈ mA * mB + ctC = ctA * ctB + C2, C2inds = collect(ctC) + pC2 = Int.(indexin(Char.(C2inds), [indsoA; indsoB])) + mC = reshape(permutedims(C2, invperm(pC2)), (loA, loB)) + @test mC ≈ mA * mB + end + AMDGPU.synchronize() + end + + # not supported for these specific cases for unknown reason + if !((NoA, NoB, Nc) in ((1,1,3), (1,2,3), (3,1,2))) + @testset "with conjugation flag for complex arguments" begin + if eltyA <: Complex + opA = AMDGPU.hipTENSOR.OP_CONJ + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = contract!(complex(1.0, 0.0), dA, indsA, opA, dB, indsB, opB, + 0, dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ conj(mA) * mB rtol=compute_rtol + end + if eltyB <: Complex + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_CONJ + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = 
contract!(complex(1.0, 0.0), dA, indsA, opA, dB, indsB, opB, + complex(0.0, 0.0), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ mA*conj(mB) rtol=compute_rtol + end + if eltyA <: Complex && eltyB <: Complex + opA = AMDGPU.hipTENSOR.OP_CONJ + opB = AMDGPU.hipTENSOR.OP_CONJ + opOut = AMDGPU.hipTENSOR.OP_IDENTITY + dC = contract!(one(eltyCompute), dA, indsA, opA, dB, indsB, opB, + zero(eltyCompute), dC, indsC, opC, opOut; compute_type=eltyCompute) + C = collect(dC) + mC = reshape(permutedims(C, ipC), (loA, loB)) + @test mC ≈ conj(mA)*conj(mB) rtol=compute_rtol + end + end + AMDGPU.synchronize() + end + end +end + +@testset "contractions of views" begin + @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes + dimsA = (16,) + dimsB = (4,) + dimsC = (8,) + A = rand(eltyA, dimsA) + B = rand(eltyB, dimsB) + C = rand(eltyC, dimsC) + dA = ROCArray(A) + dB = ROCArray(B) + dC = ROCArray(C) + dD = ROCArray(C) + vA = @view dA[1:4] + vB = @view dB[4:4] + vC = @view dC[3:6] + vD = @view dD[3:6] + tA = hipTensor(reshape(vA, (4, 1)), [1, 2]) + tB = hipTensor(reshape(vB, (1, 1)), [3, 2]) + tC = hipTensor(reshape(vC, (1, 4)), [3, 1]) + mul!(tC, tA, tB) + end +end + +end + +end diff --git a/test/hiptensor/elementwise_binary.jl b/test/hiptensor/elementwise_binary.jl new file mode 100644 index 000000000..3cd01eebe --- /dev/null +++ b/test/hiptensor/elementwise_binary.jl @@ -0,0 +1,128 @@ +using Test, AMDGPU +using AMDGPU +using LinearAlgebra +using AMDGPU.hipTENSOR: elementwise_binary_execute!, hipTensor + +if AMDGPU.hipTENSOR.has_hiptensor() + + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("tensor.log") + + eltypes = [(Float16, Float16), + (Float16, Float32), + (Float32, Float32), + #(Float64, Float64), + #(ComplexF32, ComplexF32), + #(ComplexF64, ComplexF64), + #(ComplexF64, ComplexF32), + #(Float64, Float32), + 
] + + Ns = 2:5 + + @testset "elementwise binary ($eltyA, $eltyC), N $N" for (eltyA, eltyC) in eltypes, N in Ns + # setup + eltyD = eltyC + #dmax = 2^div(18,N) + #dims = rand(2:dmax, N) + dims = fill(4, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + indsC = indsA[p] + dimsA = dims + dimsC = dims[p] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + C = rand(eltyC, dimsC...) + dC = ROCArray(C) + AMDGPU.synchronize() + + @testset "simple case" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_binary_execute!(one(eltyA), dA, indsA, opA, one(eltyC), dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) + C + end + + #=@testset "using integers as indices" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_binary_execute!(1, dA, 1:N, opA, 1, dC, p, opC, dD, p, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) + C + end + + @testset "multiplication as binary operator" begin + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + dD = similar(dC, eltyD) + opAC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_binary_execute!(1, dA, indsA, opA, 1, dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ permutedims(A, p) .* C + end + + @testset "with non-trivial coefficients and conjugation" begin + dD = similar(dC, eltyD) + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opAC = AMDGPU.hipTENSOR.OP_ADD + α = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_binary_execute!(α, dA, indsA, opA, γ, dC, indsC, opC, dD, indsC, opAC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, p)) + γ * C + end + + @testset "test in-place, and more complicated unary and binary operations" begin + opA = eltyA <: Complex ? 
AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + # because we use rand, entries of A will be positive when elty is real + opC = eltyC <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAC = eltyD <: Complex ? AMDGPU.hipTENSOR.OP_ADD : AMDGPU.hipTENSOR.OP_MAX + α = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_binary_execute!(α, dA, indsA, opA, γ, dC, indsC, opC, dC, indsC, opAC) + D = collect(dC) + if eltyD <: Complex + if eltyA <: Complex + @test D ≈ α * permutedims(A, p) + γ * conj.(C) + else + @test D ≈ α * sqrt.(eltyD.(permutedims(A, p))) + γ * conj.(C) + end + else + @test D ≈ max.(α * sqrt.(eltyD.(permutedims(A, p))), γ * C) + end + end + + # using hipTensor type + dA = ROCArray(A) + dC = ROCArray(C) + ctA = hipTensor(dA, indsA) + ctC = hipTensor(dC, indsC) + ctD = ctA + ctC + hD = collect(ctD.data) + @test hD ≈ permutedims(A, p) + C + ctD = ctA - ctC + hD = collect(ctD.data) + @test hD ≈ permutedims(A, p) - C + + α = rand(eltyD) + ctC_copy = copy(ctC) + ctD = LinearAlgebra.axpy!(α, ctA, ctC_copy) + @test ctD == ctC_copy + hD = collect(ctD.data) + @test hD ≈ α * permutedims(A, p) + C + + γ = rand(eltyD) + ctC_copy = copy(ctC) + ctD = LinearAlgebra.axpby!(α, ctA, γ, ctC_copy) + @test ctD == ctC_copy + hD = collect(ctD.data) + @test hD ≈ α * permutedims(A, p) + γ * C=# + end +end diff --git a/test/hiptensor/elementwise_trinary.jl b/test/hiptensor/elementwise_trinary.jl new file mode 100644 index 000000000..ef4e688a4 --- /dev/null +++ b/test/hiptensor/elementwise_trinary.jl @@ -0,0 +1,148 @@ +using Test, AMDGPU +using LinearAlgebra + +if AMDGPU.hipTENSOR.has_hiptensor() + @testset "elementwise trinary" begin + + using AMDGPU.hipTENSOR: elementwise_trinary_execute! 
+ + eltypes = [(Float16, Float16, Float16), + (Float32, Float32, Float32), + (Float64, Float64, Float32), + #=(Float64, Float64, Float64), + (ComplexF32, ComplexF32, ComplexF32), + (ComplexF64, ComplexF64, ComplexF64), + (Float32, Float32, Float16), + (ComplexF64, ComplexF64, ComplexF32)=# + ] + + @testset for N=2:5 + @testset for (eltyA, eltyB, eltyC) in eltypes + @show eltyA, eltyB, eltyC, N + flush(stdout) + # setup + eltyD = eltyC + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + pA = randperm(N) + ipA = invperm(pA) + pB = randperm(N) + ipB = invperm(pB) + indsC = collect(('a':'z')[1:N]) + dimsC = dims + indsA = indsC[ipA] + dimsA = dims[ipA] + indsB = indsC[ipB] + dimsB = dims[ipB] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + B = rand(eltyB, dimsB...) + dB = ROCArray(B) + C = rand(eltyC, dimsC...) + dC = ROCArray(C) + dD = similar(dC) + + # simple case + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opB = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ permutedims(A, pA) + permutedims(B, pB) + C + + # using integers as indices + dD = elementwise_trinary_execute!(1, dA, ipA, opA, 1, dB, ipB, opB, + 1, dC, 1:N, opC, dD, 1:N, opAB, opABC) + D = collect(dD) + @test D ≈ permutedims(A, pA) + permutedims(B, pB) + C + + # multiplication as binary operator + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ (eltyD.(permutedims(A, pA)) .* eltyD.(permutedims(B, pB))) + C + + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ 
(eltyD.(permutedims(A, pA)) + eltyD.(permutedims(B, pB))) .* C + + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_MUL + dD = elementwise_trinary_execute!(1, dA, indsA, opA, 1, dB, indsB, opB, + 1, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ eltyD.(permutedims(A, pA)) .* eltyD.(permutedims(B, pB)) .* C + + # with non-trivial coefficients and conjugation + α = rand(eltyD) + β = rand(eltyD) + γ = rand(eltyD) + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : + AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, pA)) + β * permutedims(B, pB) + γ * C + + opB = eltyB <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_ADD + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ α * conj.(permutedims(A, pA)) + β * conj.(permutedims(B, pB)) + γ * C + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opAB = AMDGPU.hipTENSOR.OP_MUL + opABC = AMDGPU.hipTENSOR.OP_ADD + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dD, indsC, opAB, opABC) + D = collect(dD) + @test D ≈ (α * permutedims(A, pA)) .* (β * conj.(permutedims(B, pB))) + γ * C + + # test in-place, and more complicated unary and binary operations + opA = eltyA <: Complex ? AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + opB = eltyB <: Complex ? AMDGPU.hipTENSOR.OP_IDENTITY : AMDGPU.hipTENSOR.OP_SQRT + # because we use rand, entries of A will be positive when elty is real + opC = eltyC <: Complex ? AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opAB = eltyD <: Complex ? 
AMDGPU.hipTENSOR.OP_MUL : AMDGPU.hipTENSOR.OP_MIN + opABC = eltyD <: Complex ? AMDGPU.hipTENSOR.OP_ADD : AMDGPU.hipTENSOR.OP_MAX + α = rand(eltyD) + β = rand(eltyD) + γ = rand(eltyD) + dD = elementwise_trinary_execute!(α, dA, indsA, opA, β, dB, indsB, opB, + γ, dC, indsC, opC, dC, indsC, opAB, opABC) + D = collect(dD) + if eltyD <: Complex + if eltyA <: Complex && eltyB <: Complex + @test D ≈ (α * permutedims(A, pA)) .* + (β * permutedims(B, pB)) + γ * conj.(C) + elseif eltyB <: Complex + @test D ≈ (α * sqrt.(eltyD.(permutedims(A, pA)))) .* + (β * permutedims(B, pB)) + γ * conj.(C) + elseif eltyA <: Complex + @test D ≈ (α * permutedims(A, pA)) .* + (β * sqrt.(eltyD.(permutedims(B, pB)))) + γ * conj.(C) + else + @test D ≈ (α * sqrt.(eltyD.(permutedims(A, pA)))) .* + (β * sqrt.(eltyD.(permutedims(B, pB)))) + γ * conj.(C) + end + else + @test D ≈ max.(min.(α * sqrt.(eltyD.(permutedims(A, pA))), + β * sqrt.(eltyD.(permutedims(B, pB)))), + γ * C) + end + end + end + + end +end diff --git a/test/hiptensor/permutations.jl b/test/hiptensor/permutations.jl new file mode 100644 index 000000000..2b57f9477 --- /dev/null +++ b/test/hiptensor/permutations.jl @@ -0,0 +1,58 @@ +using Test, AMDGPU +using LinearAlgebra, Random + +@show AMDGPU.hipTENSOR.has_hiptensor() +if AMDGPU.hipTENSOR.has_hiptensor() + + @testset "permutations" begin + + using AMDGPU.hipTENSOR: permute! 
+ + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("permute.log") + + eltypes = [(Float16, Float16), + (Float16, Float32), + #(Float32, Float16), + (Float32, Float32), + #=(Float64, Float64), + (Float32, Float64), + (Float64, Float32), + (ComplexF32, ComplexF32), + (ComplexF64, ComplexF64), + (ComplexF32, ComplexF64), + (ComplexF64, ComplexF32)=# + ] + + @testset for N=2#:5 + @testset for (eltyA, eltyC) in eltypes + # setup + dmax = 2^div(18,N) + dims = rand(2:dmax, N) + p = randperm(N) + indsA = collect(('a':'z')[1:N]) + indsC = indsA[p] + dimsA = dims + dimsC = dims[p] + A = rand(eltyA, dimsA...) + dA = ROCArray(A) + dC = similar(dA, eltyC, dimsC...) + opA = AMDGPU.hipTENSOR.OP_IDENTITY + + @testset "simple case" begin + dC = permute!(one(eltyA), dA, indsA, opA, dC, indsC) + C = collect(dC) + @test C == permutedims(A, p) # exact equality + end + + @testset "with scalar" begin + α = rand(eltyA) + dC = permute!(α, dA, indsA, opA, dC, indsC) + C = collect(dC) + @test C ≈ α * permutedims(A, p) # approximate, floating point rounding + end + end + end + + end +end diff --git a/test/hiptensor/reductions.jl b/test/hiptensor/reductions.jl new file mode 100644 index 000000000..f2a86134e --- /dev/null +++ b/test/hiptensor/reductions.jl @@ -0,0 +1,71 @@ +@testset "reductions" begin + +using AMDGPU +using AMDGPU.hipTENSOR: reduce! 
+ +using LinearAlgebra, Random + +eltypes = [(Float16, Float16, Float16), + (Float16, Float16, Float32), + (Float32, Float32, Float32), + (Float64, Float64, Float64), + ] + +if AMDGPU.hipTENSOR.has_hiptensor() + AMDGPU.hipTENSOR.hiptensorLoggerSetLevel(AMDGPU.hipTENSOR.hiptensorLogLevel_t(UInt32(16))) + AMDGPU.hipTENSOR.hiptensorLoggerOpenFile("reduce.log") + + @testset for NA=2:5, NC = 1:NA-1 + @testset for (eltyA, eltyC, elty_compute) in eltypes + # setup + eltyD = eltyC + dmax = 2^div(18,NA) + dims = rand(2:dmax, NA) + p = randperm(NA) + indsA = collect(('a':'z')[1:NA]) + indsC = indsA[p][1:NC] + dimsA = dims + dimsC = dims[p][1:NC] + A = rand(eltyA, (dimsA...,)) + dA = ROCArray(A) + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + + opA = AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opReduce = AMDGPU.hipTENSOR.OP_ADD + # simple case + dC = reduce!(1, dA, indsA, opA, 0, dC, indsC, opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + sum(permutedims(A, p); dims = ((NC+1:NA)...,)) + + # using integers as indices + dC = reduce!(1, dA, collect(1:NA), opA, 0, dC, p[1:NC], opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + sum(permutedims(A, p); dims = ((NC+1:NA)...,)) + + # multiplication as reduction operator + opReduce = AMDGPU.hipTENSOR.OP_MUL + dC = reduce!(1, dA, indsA, opA, 0, dC, indsC, opC, opReduce; compute_type = elty_compute) + C = collect(dC) + @test reshape(C, (dimsC..., ones(Int,NA-NC)...)) ≈ + prod(permutedims(A, p); dims = ((NC+1:NA)...,)) atol=eps(Float16) rtol=Base.rtoldefault(Float16) + + # with non-trivial coefficients and conjugation + opA = eltyA <: Complex ? 
AMDGPU.hipTENSOR.OP_CONJ : AMDGPU.hipTENSOR.OP_IDENTITY + opC = AMDGPU.hipTENSOR.OP_IDENTITY + opReduce = AMDGPU.hipTENSOR.OP_ADD + C = rand(eltyC, (dimsC...,)) + dC = ROCArray(C) + α = rand(eltyC) + γ = rand(eltyC) + dC = reduce!(α, dA, indsA, opA, γ, dC, indsC, opC, opReduce; compute_type = elty_compute) + @test reshape(collect(dC), (dimsC..., ones(Int,NA-NC)...)) ≈ + α * conj.(sum(permutedims(A, p); dims = ((NC+1:NA)...,))) + γ * C + end + end + + end +end