From 8b585757d6033e928d5b62c9e20374f462ae7206 Mon Sep 17 00:00:00 2001
From: guozhihao3
Date: Wed, 26 Nov 2025 22:21:41 +0800
Subject: [PATCH 01/10] feat: support sqrt-cpu

---
 include/infinicore/ops/sqrt.hpp          | 17 +++++
 include/infiniop.h                       |  1 +
 include/infiniop/ops/sqrt.h              | 26 +++++++
 python/infinicore/__init__.py            |  1 +
 python/infinicore/ops/sqrt.py            |  8 ++
 src/infinicore/ops/sqrt/sqrt.cc          | 24 ++++++
 src/infinicore/ops/sqrt/sqrt_infiniop.cc | 57 ++++++++++++++
 src/infinicore/pybind11/ops.hpp          |  2 +
 src/infinicore/pybind11/ops/sqrt.hpp     | 15 ++++
 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc    | 51 +++++++++++++
 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h     | 22 ++++++
 src/infiniop/ops/sqrt/operator.cc        | 97 ++++++++++++++++++++++++
 test/infinicore/ops/sqrt.py              |  6 +-
 13 files changed, 324 insertions(+), 3 deletions(-)
 create mode 100644 include/infinicore/ops/sqrt.hpp
 create mode 100644 include/infiniop/ops/sqrt.h
 create mode 100644 python/infinicore/ops/sqrt.py
 create mode 100644 src/infinicore/ops/sqrt/sqrt.cc
 create mode 100644 src/infinicore/ops/sqrt/sqrt_infiniop.cc
 create mode 100644 src/infinicore/pybind11/ops/sqrt.hpp
 create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
 create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
 create mode 100644 src/infiniop/ops/sqrt/operator.cc

diff --git a/include/infinicore/ops/sqrt.hpp b/include/infinicore/ops/sqrt.hpp
new file mode 100644
index 000000000..aa3ab89fd
--- /dev/null
+++ b/include/infinicore/ops/sqrt.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Sqrt {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor sqrt(Tensor input);
+void sqrt_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..d00306e14 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -24,6 +24,7 @@
 #include "infiniop/ops/silu.h"
 #include "infiniop/ops/softmax.h"
 #include "infiniop/ops/softplus.h"
+#include "infiniop/ops/sqrt.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/tanh.h"
diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h
new file mode 100644
index 000000000..78afce64e
--- /dev/null
+++ b/include/infiniop/ops/sqrt.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_SQRT_API_H__
+#define __INFINIOP_SQRT_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSqrtDescriptor(
+    infiniopHandle_t handle,
+    infiniopSqrtDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopSqrt(
+    infiniopSqrtDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 06294bf3e..fe61667f3 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -44,6 +44,7 @@
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.rearrange import rearrange
+from
.ops.sqrt import sqrt from infinicore.tensor import ( Tensor, empty, diff --git a/python/infinicore/ops/sqrt.py b/python/infinicore/ops/sqrt.py new file mode 100644 index 000000000..9395d1ef0 --- /dev/null +++ b/python/infinicore/ops/sqrt.py @@ -0,0 +1,8 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def sqrt(input, *, out=None): + if out is None: + return Tensor(_infinicore.sqrt(input._underlying)) + _infinicore.sqrt_(out._underlying, input._underlying) + return out \ No newline at end of file diff --git a/src/infinicore/ops/sqrt/sqrt.cc b/src/infinicore/ops/sqrt/sqrt.cc new file mode 100644 index 000000000..26b11c868 --- /dev/null +++ b/src/infinicore/ops/sqrt/sqrt.cc @@ -0,0 +1,24 @@ +#include "infinicore/ops/sqrt.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Sqrt::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Sqrt::execute(Tensor output, Tensor input) { + dispatcher().lookup(context::getDevice().getType())(output, input); +} + +Tensor sqrt(Tensor input) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + sqrt_(output, input); + return output; +} + +void sqrt_(Tensor output, Tensor input) { + Sqrt::execute(output, input); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/sqrt/sqrt_infiniop.cc b/src/infinicore/ops/sqrt/sqrt_infiniop.cc new file mode 100644 index 000000000..a7ab698d3 --- /dev/null +++ b/src/infinicore/ops/sqrt/sqrt_infiniop.cc @@ -0,0 +1,57 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/sqrt.hpp" +#include + +namespace infinicore::op::sqrt_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopSqrtDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroySqrtDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopSqrtDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateSqrtDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc())); + + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetSqrtWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopSqrt( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Sqrt::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::sqrt_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..df72d3ef6 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -14,6 +14,7 @@ #include "ops/rms_norm.hpp" #include "ops/rope.hpp" #include "ops/silu.hpp" +#include "ops/sqrt.hpp" #include "ops/swiglu.hpp" namespace py = pybind11; @@ -34,6 +35,7 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_sqrt(m); } } // 
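The sqrt_infiniop.cc backend above keeps a thread_local cache of up to 100 infiniop descriptors, keyed by a hash of the call's tensors, so repeated calls with the same layout do not recreate descriptors; entries are released through infiniopDestroySqrtDescriptor when dropped. A minimal Python sketch of that look-up-or-create pattern (the LRU-style eviction order and the helper names here are assumptions for illustration, not part of this patch):

    from collections import OrderedDict

    class DescriptorCache:
        """Sketch of the thread_local OpCache used by sqrt_infiniop.cc (assumed LRU-like)."""

        def __init__(self, capacity=100, destroy=lambda desc: None):
            self.capacity = capacity
            self.destroy = destroy            # stands in for infiniopDestroySqrtDescriptor
            self.entries = OrderedDict()      # hash key -> descriptor handle

        def get_or_create(self, key, create):
            if key in self.entries:
                self.entries.move_to_end(key)     # mark as most recently used
                return self.entries[key]
            desc = create()                       # stands in for infiniopCreateSqrtDescriptor
            self.entries[key] = desc
            if len(self.entries) > self.capacity:
                _, evicted = self.entries.popitem(last=False)
                self.destroy(evicted)             # cached descriptors are destroyed on eviction
            return desc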
namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/sqrt.hpp b/src/infinicore/pybind11/ops/sqrt.hpp new file mode 100644 index 000000000..d5abfa761 --- /dev/null +++ b/src/infinicore/pybind11/ops/sqrt.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "infinicore/ops/sqrt.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { +inline void bind_sqrt(py::module &m) { + m.def("sqrt", &op::sqrt, py::arg("input"), + R"doc(Element-wise square root.)doc"); + m.def("sqrt_", &op::sqrt_, py::arg("output"), py::arg("input"), + R"doc(In-place square root.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc new file mode 100644 index 000000000..03be87edc --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -0,0 +1,51 @@ +#include "sqrt_cpu.h" + +namespace op::sqrt::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::sqrt::cpu \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h new file mode 100644 index 000000000..c021f7977 --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h @@ -0,0 +1,22 @@ +#ifndef __SQRT_CPU_H__ +#define __SQRT_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sqrt, cpu) + +namespace op::sqrt::cpu { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::sqrt(x); + } +} SqrtOp; +} // namespace op::sqrt::cpu + +#endif // __SQRT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc new file mode 100644 index 000000000..82119e4f9 --- /dev/null +++ b/src/infiniop/ops/sqrt/operator.cc @@ -0,0 +1,97 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sqrt.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sqrt_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateSqrtDescriptor( + infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::sqrt::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetSqrtWorkspaceSize( + infiniopSqrtDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size \ + = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopSqrt( + infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/sqrt.py b/test/infinicore/ops/sqrt.py index df1de761f..bfcf028f5 100644 --- a/test/infinicore/ops/sqrt.py +++ b/test/infinicore/ops/sqrt.py @@ -87,9 +87,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.sqrt(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.sqrt(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.sqrt(*args, **kwargs) def main(): From 5ea9db3d373618b9d61e71a4cd49761c95136f03 Mon Sep 17 00:00:00 2001 From: guozhihao3 Date: Sat, 29 Nov 2025 17:47:45 +0800 Subject: [PATCH 02/10] sqrt: support cuda and nvidia --- src/infiniop/ops/sqrt/cuda/kernel.cuh | 36 ++++++++++++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 59 ++++++++++++++++++++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh | 8 +++ src/infiniop/ops/sqrt/operator.cc | 12 ++++ 4 files changed, 115 insertions(+) create mode 100644 src/infiniop/ops/sqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh new file mode 100644 index 000000000..df355ab1b --- /dev/null +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#ifndef __SQRT_CUDA_H__ +#define __SQRT_CUDA_H__ + +// #include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include +#include + +namespace op::sqrt::cuda { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float2 xf = __half22float2(x); + return __floats2half2_rn(sqrtf(xf.x), sqrtf(xf.y)); + } else if constexpr (std::is_same_v) { + // FP16, convert to 
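Every infiniopSqrt launch dispatched by operator.cc above follows the same four-step C API sequence declared in include/infiniop/ops/sqrt.h: create a descriptor for the output/input layout, query the workspace size, run the kernel, and destroy the descriptor (the cached path in sqrt_infiniop.cc defers the destroy step to cache eviction). A sketch of that per-call flow in Python pseudocode (the `lib` wrapper and its method names are hypothetical stand-ins for the C functions, shown only to make the ordering explicit):

    def run_sqrt(lib, handle, out, inp, stream):
        # 1. Describe this particular output/input layout.
        desc = lib.create_sqrt_descriptor(handle, out.desc, inp.desc)
        # 2. Ask how much scratch memory the kernel needs (may be zero).
        size = lib.get_sqrt_workspace_size(desc)
        workspace = lib.allocate(size) if size else None
        # 3. Launch: writes sqrt(input) element-wise into the output buffer.
        lib.sqrt(desc, workspace, size, out.data, inp.data, stream)
        # 4. Release the descriptor once it is no longer needed.
        lib.destroy_sqrt_descriptor(desc)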
float first + float xf = __half2float(x); + return __float2half(sqrtf(xf)); + } else if constexpr (std::is_same_v) { + // BF16, convert to float first + float xf = __bfloat162float(x); + return __float2bfloat16(sqrtf(xf)); + } else if constexpr (std::is_same_v) { + return sqrtf(x); + } else if constexpr (std::is_same_v) { + return sqrt(x); + } else { + return sqrt(x); + } + } +} SqrtOp; +} // namespace op::sqrt::cuda + +#endif // __SQRT_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu new file mode 100644 index 000000000..e27cce305 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sqrt_nvidia.cuh" + +namespace op::sqrt::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SqrtOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SqrtOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh new file mode 100644 index 000000000..8ea8466f6 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQRT_CUDA_API_H__ +#define __SQRT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sqrt, nvidia) + +#endif // __SQRT_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index 82119e4f9..f557b364e 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -24,6 +24,9 @@ __C infiniStatus_t infiniopCreateSqrtDescriptor( #ifdef ENABLE_CPU_API CREATE(INFINI_DEVICE_CPU, cpu); #endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -44,6 +47,9 @@ __C infiniStatus_t infiniopGetSqrtWorkspaceSize( #ifdef ENABLE_CPU_API 
GET(INFINI_DEVICE_CPU, cpu) #endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -68,6 +74,9 @@ __C infiniStatus_t infiniopSqrt( #ifdef ENABLE_CPU_API CALCULATE(INFINI_DEVICE_CPU, cpu); #endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -88,6 +97,9 @@ infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { #ifdef ENABLE_CPU_API DELETE(INFINI_DEVICE_CPU, cpu); #endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; From e609a8c255dc6f60e13954797d06dd4c73d62be0 Mon Sep 17 00:00:00 2001 From: guozhihao-224 Date: Sat, 29 Nov 2025 18:36:40 +0800 Subject: [PATCH 03/10] =?UTF-8?q?sqrt:=20=E5=AE=8C=E6=88=90cuda=E7=BC=96?= =?UTF-8?q?=E5=86=99=EF=BC=8C=E8=B7=91=E9=80=9A=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/infinicore/ops/sqrt/sqrt_infiniop.cc | 83 +++++++++++++++++++++++- src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 6 +- src/infiniop/ops/sqrt/cuda/kernel.cuh | 4 +- src/infiniop/ops/sqrt/operator.cc | 4 ++ 4 files changed, 89 insertions(+), 8 deletions(-) diff --git a/src/infinicore/ops/sqrt/sqrt_infiniop.cc b/src/infinicore/ops/sqrt/sqrt_infiniop.cc index a7ab698d3..032538625 100644 --- a/src/infinicore/ops/sqrt/sqrt_infiniop.cc +++ b/src/infinicore/ops/sqrt/sqrt_infiniop.cc @@ -1,3 +1,61 @@ +// #include "../../utils.hpp" +// #include "infinicore/common/hash.hpp" +// #include "infinicore/ops/common/cache.hpp" +// #include "infinicore/ops/sqrt.hpp" +// #include + +// namespace infinicore::op::sqrt_impl::infiniop { + +// thread_local common::OpCache caches( +// 100, +// [](infiniopSqrtDescriptor_t &desc) { +// if (desc != nullptr) { +// INFINICORE_CHECK_ERROR(infiniopDestroySqrtDescriptor(desc)); +// desc = nullptr; +// } +// }); + +// void calculate(Tensor output, Tensor input) { +// size_t seed = hash_combine(output, input); +// auto device_type = context::getDevice().getType(); +// auto device_index = context::getDevice().getIndex(); +// auto &cache = caches.getCache(device_type, device_index); + +// auto desc_opt = cache.get(seed); +// infiniopSqrtDescriptor_t desc = nullptr; + +// if (!desc_opt) { +// INFINICORE_CHECK_ERROR(infiniopCreateSqrtDescriptor( +// context::getInfiniopHandle(output->device()), +// &desc, +// output->desc(), +// input->desc())); + +// cache.put(seed, desc); +// } else { +// desc = *desc_opt; +// } + +// size_t workspace_size = 0; +// INFINICORE_CHECK_ERROR(infiniopGetSqrtWorkspaceSize(desc, &workspace_size)); +// std::shared_ptr workspace = context::allocateMemory(workspace_size); + +// INFINICORE_CHECK_ERROR(infiniopSqrt( +// desc, +// workspace->data(), +// workspace_size, +// output->data(), +// input->data(), +// context::getStream())); +// } + +// static bool registered = []() { +// Sqrt::dispatcher().registerAll(&calculate, false); +// return true; +// }(); + +// } // namespace infinicore::op::sqrt_impl::infiniop + #include "../../utils.hpp" #include "infinicore/common/hash.hpp" #include "infinicore/ops/common/cache.hpp" @@ -16,7 +74,19 @@ thread_local common::OpCache caches( }); void calculate(Tensor output, Tensor input) { - size_t seed = hash_combine(output, input); + // 优化:只使用 input 的特征作为缓存键,不依赖 output 指针 + // 这样即使 output 是新创建的,也能复用 descriptor + size_t seed = 0; + hash_combine(seed, static_cast(input->dtype())); + + // 手动遍历 
shape 和 strides(因为 hash_combine 不支持 std::vector) + for (Size shape_val : input->shape()) { + hash_combine(seed, shape_val); + } + for (Stride stride_val : input->strides()) { + hash_combine(seed, static_cast(stride_val)); + } + auto device_type = context::getDevice().getType(); auto device_index = context::getDevice().getIndex(); auto &cache = caches.getCache(device_type, device_index); @@ -38,11 +108,18 @@ void calculate(Tensor output, Tensor input) { size_t workspace_size = 0; INFINICORE_CHECK_ERROR(infiniopGetSqrtWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); + + // 优化:如果 workspace_size 为 0,跳过分配 + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size > 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } INFINICORE_CHECK_ERROR(infiniopSqrt( desc, - workspace->data(), + workspace_ptr, workspace_size, output->data(), input->data(), diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc index 03be87edc..8d30110cb 100644 --- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -8,19 +8,19 @@ infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { + std::vector input_descs) { auto handle = reinterpret_cast(handle_); auto dtype = out_desc->dtype(); - const auto &input_desc = input_desc_vec.at(0); + const auto &input_desc = input_descs.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); CHECK_SAME_SHAPE(output_shape, input_shape); - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_descs); return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh index df355ab1b..ff3347d11 100644 --- a/src/infiniop/ops/sqrt/cuda/kernel.cuh +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -25,9 +25,9 @@ public: } else if constexpr (std::is_same_v) { return sqrtf(x); } else if constexpr (std::is_same_v) { - return sqrt(x); + return sqrtf(x); } else { - return sqrt(x); + return sqrtf(x); } } } SqrtOp; diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index f557b364e..57b99f9bf 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -6,6 +6,10 @@ #include "cpu/sqrt_cpu.h" #endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sqrt_nvidia.cuh" +#endif + __C infiniStatus_t infiniopCreateSqrtDescriptor( infiniopHandle_t handle, infiniopSqrtDescriptor_t *desc_ptr, From cda315ff243964ababa32d5ae2afded9a727c90e Mon Sep 17 00:00:00 2001 From: guozhihao3 Date: Mon, 1 Dec 2025 13:32:28 +0800 Subject: [PATCH 04/10] =?UTF-8?q?elu:=20=E6=94=AF=E6=8C=81CPU=E7=AE=97?= =?UTF-8?q?=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infinicore/ops/elu.hpp | 17 ++++ include/infiniop.h | 1 + include/infiniop/ops/elu.h | 25 +++++ python/infinicore/nn/functional/__init__.py | 2 + python/infinicore/nn/functional/elu.py | 32 +++++++ src/infinicore/ops/elu/elu.cc | 24 +++++ src/infinicore/ops/elu/elu_infiniop.cc | 77 +++++++++++++++ src/infinicore/pybind11/ops.hpp | 2 + 
src/infinicore/pybind11/ops/elu.hpp | 29 ++++++ src/infiniop/ops/elu/cpu/elu_cpu.cc | 64 +++++++++++++ src/infiniop/ops/elu/cpu/elu_cpu.h | 66 +++++++++++++ src/infiniop/ops/elu/operator.cc | 100 ++++++++++++++++++++ test/infinicore/ops/elu.py | 8 +- 13 files changed, 443 insertions(+), 4 deletions(-) create mode 100644 include/infinicore/ops/elu.hpp create mode 100644 include/infiniop/ops/elu.h create mode 100644 python/infinicore/nn/functional/elu.py create mode 100644 src/infinicore/ops/elu/elu.cc create mode 100644 src/infinicore/ops/elu/elu_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/elu.hpp create mode 100644 src/infiniop/ops/elu/cpu/elu_cpu.cc create mode 100644 src/infiniop/ops/elu/cpu/elu_cpu.h create mode 100644 src/infiniop/ops/elu/operator.cc diff --git a/include/infinicore/ops/elu.hpp b/include/infinicore/ops/elu.hpp new file mode 100644 index 000000000..a93efd1b9 --- /dev/null +++ b/include/infinicore/ops/elu.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Elu { +public: + using schema = void (*)(Tensor, Tensor, float); + static void execute(Tensor output, Tensor input, float alpha); + static common::OpDispatcher &dispatcher(); +}; + +Tensor elu(Tensor input, float alpha = 1.0f); +void elu_(Tensor output, Tensor input, float alpha = 1.0f); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infiniop.h b/include/infiniop.h index d00306e14..f66f7d952 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -8,6 +8,7 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/elu.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" diff --git a/include/infiniop/ops/elu.h b/include/infiniop/ops/elu.h new file mode 100644 index 000000000..d23162151 --- /dev/null +++ b/include/infiniop/ops/elu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_ELU_API_H__ +#define __INFINIOP_ELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEluDescriptor(infiniopHandle_t handle, + infiniopEluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float alpha); + +__C __export infiniStatus_t infiniopGetEluWorkspaceSize(infiniopEluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopElu(infiniopEluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyEluDescriptor(infiniopEluDescriptor_t desc); + +#endif // INFINIOP_OPS_ELU_H \ No newline at end of file diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..cbfd7422e 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,4 +1,5 @@ from .causal_softmax import causal_softmax +from .elu import elu from .embedding import embedding from .linear import linear from .random_sample import random_sample @@ -9,6 +10,7 @@ __all__ = [ "causal_softmax", + "elu", "random_sample", "rms_norm", "silu", diff --git a/python/infinicore/nn/functional/elu.py b/python/infinicore/nn/functional/elu.py new file mode 100644 index 000000000..b45b332bc --- /dev/null +++ b/python/infinicore/nn/functional/elu.py @@ -0,0 +1,32 @@ +import infinicore +from 
infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def elu(input: Tensor, alpha: float = 1.0, inplace: bool = False, *, out=None) -> Tensor: + r"""Apply the Exponential Linear Unit (ELU) function, element-wise. + + ELU(x) = x if x >= 0 else alpha * (exp(x) - 1) + + Args: + input: Input tensor + alpha: ELU parameter (default: 1.0) + inplace: If True, performs the operation in-place (default: False) + out: Optional output tensor for in-place operation + + Returns: + Output tensor with ELU applied element-wise. + """ + if infinicore.use_ntops and input.device.type in ("cuda", "musa") and out is None: + return infinicore.ntops.torch.elu(input, alpha=alpha, inplace=inplace) + + if inplace: + _infinicore.elu_(input._underlying, input._underlying, alpha) + return input + + if out is None: + return Tensor(_infinicore.elu(input._underlying, alpha)) + + _infinicore.elu_(out._underlying, input._underlying, alpha) + return out + diff --git a/src/infinicore/ops/elu/elu.cc b/src/infinicore/ops/elu/elu.cc new file mode 100644 index 000000000..c332fbf88 --- /dev/null +++ b/src/infinicore/ops/elu/elu.cc @@ -0,0 +1,24 @@ +#include "infinicore/ops/elu.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Elu::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Elu::execute(Tensor output, Tensor input, float alpha) { + dispatcher().lookup(context::getDevice().getType())(output, input, alpha); +} + +Tensor elu(Tensor input, float alpha) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + elu_(output, input, alpha); + return output; +} + +void elu_(Tensor output, Tensor input, float alpha) { + Elu::execute(output, input, alpha); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/elu/elu_infiniop.cc b/src/infinicore/ops/elu/elu_infiniop.cc new file mode 100644 index 000000000..15e2d7e76 --- /dev/null +++ b/src/infinicore/ops/elu/elu_infiniop.cc @@ -0,0 +1,77 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/elu.hpp" +#include + +namespace infinicore::op::elu_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopEluDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyEluDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, float alpha) { + // 构建缓存键:包含 input 特征和 alpha 参数 + size_t seed = 0; + hash_combine(seed, static_cast(input->dtype())); + hash_combine(seed, static_cast(*reinterpret_cast(&alpha))); // 将 float 转换为 uint32_t 进行哈希 + + // 手动遍历 shape 和 strides + for (Size shape_val : input->shape()) { + hash_combine(seed, shape_val); + } + for (Stride stride_val : input->strides()) { + hash_combine(seed, static_cast(stride_val)); + } + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopEluDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateEluDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + alpha)); + + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetEluWorkspaceSize(desc, &workspace_size)); + + // 如果 workspace_size 为 0,跳过分配 + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + 
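For reference, the formula in the docstring above (and in the CPU functor added further down in this patch) is the standard ELU: identity for non-negative inputs, alpha * (exp(x) - 1) otherwise; the inplace=True path simply passes the input tensor as its own output. A self-contained scalar reference in Python:

    import math

    def elu_reference(x, alpha=1.0):
        # Same rule as the elu() docstring: x for x >= 0, alpha * (exp(x) - 1) otherwise.
        return x if x >= 0 else alpha * (math.exp(x) - 1.0)

    assert elu_reference(2.0) == 2.0
    assert abs(elu_reference(-1.0, alpha=0.5) - 0.5 * (math.exp(-1.0) - 1.0)) < 1e-12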
if (workspace_size > 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopElu( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Elu::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::elu_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index df72d3ef6..48d9637a7 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -5,6 +5,7 @@ #include "ops/add.hpp" #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" +#include "ops/elu.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" @@ -36,6 +37,7 @@ inline void bind(py::module &m) { bind_rope(m); bind_embedding(m); bind_sqrt(m); + bind_elu(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/elu.hpp b/src/infinicore/pybind11/ops/elu.hpp new file mode 100644 index 000000000..fc16c95b3 --- /dev/null +++ b/src/infinicore/pybind11/ops/elu.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/elu.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { +inline void bind_elu(py::module &m) { + m.def("elu", &op::elu, py::arg("input"), py::arg("alpha") = 1.0f, + R"doc(Element-wise ELU activation function. + +Args: + input: Input tensor + alpha: ELU parameter (default: 1.0) + +Returns: + Output tensor with ELU applied element-wise. +)doc"); + m.def("elu_", &op::elu_, py::arg("output"), py::arg("input"), py::arg("alpha") = 1.0f, + R"doc(In-place ELU activation function. + +Args: + output: Output tensor (modified in-place) + input: Input tensor + alpha: ELU parameter (default: 1.0) +)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/elu/cpu/elu_cpu.cc b/src/infiniop/ops/elu/cpu/elu_cpu.cc new file mode 100644 index 000000000..4d3743d65 --- /dev/null +++ b/src/infiniop/ops/elu/cpu/elu_cpu.cc @@ -0,0 +1,64 @@ +#include "elu_cpu.h" + +namespace op::elu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float alpha) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create elementwise info + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + // create descriptor + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + nullptr, + 0, + handle->device, + handle->device_id, + alpha); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, _alpha); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, _alpha); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, 
output, inputs, stream, _alpha); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, _alpha); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::elu::cpu \ No newline at end of file diff --git a/src/infiniop/ops/elu/cpu/elu_cpu.h b/src/infiniop/ops/elu/cpu/elu_cpu.h new file mode 100644 index 000000000..7eb0debf1 --- /dev/null +++ b/src/infiniop/ops/elu/cpu/elu_cpu.h @@ -0,0 +1,66 @@ +#ifndef __ELU_CPU_H__ +#define __ELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +// #include "../../../utils.h" + +namespace op::elu::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _alpha; // ELU parameter + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::cpu::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float alpha) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size), + _alpha(alpha) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float alpha); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; + +typedef struct Eluop { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x, float alpha) const { + if (x > T(0)) { + return x; + } else { + return T(alpha) * (std::exp(x) - T(1)); + } + } +} EluOp; +} // namespace op::elu::cpu + +#endif // __ELU_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/elu/operator.cc b/src/infiniop/ops/elu/operator.cc new file mode 100644 index 000000000..35a6bb065 --- /dev/null +++ b/src/infiniop/ops/elu/operator.cc @@ -0,0 +1,100 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/elu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/elu_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateEluDescriptor( + infiniopHandle_t handle, + infiniopEluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float alpha) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::elu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input}, \ + alpha); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEluWorkspaceSize( + infiniopEluDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size \ + = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopElu( + infiniopEluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return 
reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyEluDescriptor(infiniopEluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/elu.py b/test/infinicore/ops/elu.py index 3c5416cc5..f1e566dbb 100644 --- a/test/infinicore/ops/elu.py +++ b/test/infinicore/ops/elu.py @@ -136,10 +136,10 @@ def torch_operator(self, *args, **kwargs): """PyTorch ELU implementation""" return torch.nn.functional.elu(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore ELU implementation""" - # return infinicore.nn.functional.elu(*args, **kwargs) - # # return None + def infinicore_operator(self, *args, **kwargs): + """InfiniCore ELU implementation""" + return infinicore.nn.functional.elu(*args, **kwargs) + # return None def main(): From 62c861b4d6aed0a26bdf8893198bf08afbc1c2c5 Mon Sep 17 00:00:00 2001 From: guozhihao Date: Mon, 1 Dec 2025 22:21:06 +0800 Subject: [PATCH 05/10] =?UTF-8?q?diagflat:=20=E6=94=AF=E6=8C=81cpu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infinicore/ops/diagflat.hpp | 20 ++ include/infiniop/ops/diagflat.h | 32 +++ python/infinicore/__init__.py | 3 + python/infinicore/ops/diagflat.py | 11 ++ src/infinicore/ops/diagflat/diagflat.cc | 29 +++ .../ops/diagflat/diagflat_infiniop.cc | 77 ++++++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/diagflat.hpp | 28 +++ src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc | 185 ++++++++++++++++++ src/infiniop/ops/diagflat/cpu/diagflat_cpu.h | 52 +++++ src/infiniop/ops/diagflat/operator.cc | 99 ++++++++++ 11 files changed, 538 insertions(+) create mode 100644 include/infinicore/ops/diagflat.hpp create mode 100644 include/infiniop/ops/diagflat.h create mode 100644 python/infinicore/ops/diagflat.py create mode 100644 src/infinicore/ops/diagflat/diagflat.cc create mode 100644 src/infinicore/ops/diagflat/diagflat_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/diagflat.hpp create mode 100644 src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc create mode 100644 src/infiniop/ops/diagflat/cpu/diagflat_cpu.h create mode 100644 src/infiniop/ops/diagflat/operator.cc diff --git a/include/infinicore/ops/diagflat.hpp b/include/infinicore/ops/diagflat.hpp new file mode 100644 index 000000000..6e714babd --- /dev/null +++ b/include/infinicore/ops/diagflat.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Diagflat { +public: + using schema = void (*)(Tensor, Tensor, int64_t); + static void execute(Tensor output, Tensor input, int64_t offset); + static common::OpDispatcher &dispatcher(); +}; + +Tensor diagflat(Tensor input, int64_t offset = 0); +void diagflat_(Tensor output, Tensor input, int64_t offset = 0); + +} // namespace infinicore::op + + diff --git a/include/infiniop/ops/diagflat.h b/include/infiniop/ops/diagflat.h new file mode 100644 index 000000000..b6a9f13b2 --- /dev/null +++ b/include/infiniop/ops/diagflat.h @@ 
-0,0 +1,32 @@ +#pragma once + +#ifndef __INFINIOP_DIAGFLAT_API_H__ +#define __INFINIOP_DIAGFLAT_API_H__ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopDiagflatDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDiagflatDescriptor( + infiniopHandle_t handle, + infiniopDiagflatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + int64_t offset); + +__C __export infiniStatus_t +infiniopGetDiagflatWorkspaceSize(infiniopDiagflatDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiagflat( + infiniopDiagflatDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t +infiniopDestroyDiagflatDescriptor(infiniopDiagflatDescriptor_t desc); + +#endif // __INFINIOP_DIAGFLAT_API_H__ \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index fe61667f3..7684e7706 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -45,6 +45,7 @@ from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange from .ops.sqrt import sqrt +from .ops.diagflat import diagflat from infinicore.tensor import ( Tensor, empty, @@ -114,6 +115,8 @@ "strided_empty", "strided_from_blob", "zeros", + "sqrt", + "diagflat", ] use_ntops = False diff --git a/python/infinicore/ops/diagflat.py b/python/infinicore/ops/diagflat.py new file mode 100644 index 000000000..c60878fc4 --- /dev/null +++ b/python/infinicore/ops/diagflat.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def diagflat(input, *, offset=0, out=None): + if out is None: + return Tensor(_infinicore.diagflat(input._underlying, offset)) + _infinicore.diagflat_(out._underlying, input._underlying, offset) + return out + + diff --git a/src/infinicore/ops/diagflat/diagflat.cc b/src/infinicore/ops/diagflat/diagflat.cc new file mode 100644 index 000000000..0e676774e --- /dev/null +++ b/src/infinicore/ops/diagflat/diagflat.cc @@ -0,0 +1,29 @@ +#include "infinicore/ops/diagflat.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Diagflat::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Diagflat::execute(Tensor output, Tensor input, int64_t offset) { + dispatcher().lookup(context::getDevice().getType())(output, input, offset); +} + +Tensor diagflat(Tensor input, int64_t offset) { + // 输出 shape 由后端决定,这里直接让后端写入 output + // 先构造一个占位 Tensor(0-dim),再让实现自己 resize/allocate 也可以; + // 为简单起见,这里暂时只支持 out 版本:用户通过 diagflat_ 使用。 + auto flat = Tensor::empty({input->numel()}, input->dtype(), input->device()); + diagflat_(flat, input, offset); + return flat; +} + +void diagflat_(Tensor output, Tensor input, int64_t offset) { + Diagflat::execute(output, input, offset); +} + +} // namespace infinicore::op + + diff --git a/src/infinicore/ops/diagflat/diagflat_infiniop.cc b/src/infinicore/ops/diagflat/diagflat_infiniop.cc new file mode 100644 index 000000000..ba2f64475 --- /dev/null +++ b/src/infinicore/ops/diagflat/diagflat_infiniop.cc @@ -0,0 +1,77 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/diagflat.hpp" +#include "infiniop/ops/diagflat.h" +#include + + + +namespace infinicore::op::diagflat_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopDiagflatDescriptor_t &desc) { + if 
(desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyDiagflatDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, int64_t offset) { + size_t seed = 0; + hash_combine(seed, static_cast(input->dtype())); + for (Size s : input->shape()) { + hash_combine(seed, static_cast(s)); + } + for (Stride st : input->strides()) { + hash_combine(seed, static_cast(st)); + } + hash_combine(seed, static_cast(offset)); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopDiagflatDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateDiagflatDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + offset)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetDiagflatWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size > 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopDiagflat( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Diagflat::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::diagflat_impl::infiniop + + diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 48d9637a7..abf1baee9 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -6,6 +6,7 @@ #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/elu.hpp" +#include "ops/diagflat.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" @@ -38,6 +39,7 @@ inline void bind(py::module &m) { bind_embedding(m); bind_sqrt(m); bind_elu(m); + bind_diagflat(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/diagflat.hpp b/src/infinicore/pybind11/ops/diagflat.hpp new file mode 100644 index 000000000..3b53c43f2 --- /dev/null +++ b/src/infinicore/pybind11/ops/diagflat.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "infinicore/ops/diagflat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_diagflat(py::module &m) { + m.def( + "diagflat", + &op::diagflat, + py::arg("input"), + py::arg("offset") = 0, + R"doc(Create a 2D matrix with the input flattened into the diagonal.)doc"); + m.def( + "diagflat_", + &op::diagflat_, + py::arg("output"), + py::arg("input"), + py::arg("offset") = 0, + R"doc(In-place diagflat into the given output tensor.)doc"); +} + +} // namespace infinicore::ops + + diff --git a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc new file mode 100644 index 000000000..2f68917b6 --- /dev/null +++ b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc @@ -0,0 +1,185 @@ + #include "diagflat_cpu.h" + + namespace op::diagflat::cpu { + + Descriptor::~Descriptor() = default; + + infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_descs, + int64_t offset) { + + auto handle = reinterpret_cast(handle_); + + if (input_descs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + auto in_desc 
= input_descs[0]; + auto dtype = out_desc->dtype(); + auto in_shape = in_desc->shape(); + auto out_shape = out_desc->shape(); + + // 支持与 diagflat 测试一致的 dtype + CHECK_DTYPE( + dtype, + INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64); + + // 只支持 contiguous + if (!in_desc->isContiguous() || !out_desc->isContiguous()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // NOTE: 这里只做非常轻量的 shape 检查: + // 输入展平后长度 n,输出必须是 2D 或更高维最后两维组成的矩阵,至少能容纳 diag。 + size_t in_numel = in_desc->numel(); + if (out_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t n = out_shape[out_shape.size() - 2]; + size_t m = out_shape[out_shape.size() - 1]; + // 最长对角线长度 + size_t max_diag = 0; + if (offset >= 0) { + if (offset >= static_cast(m)) { + max_diag = 0; + } else { + max_diag = std::min(n, m - static_cast(offset)); + } + } else { // offset < 0 + if (-offset >= static_cast(n)) { + max_diag = 0; + } else { + max_diag = std::min(m, n - static_cast(-offset)); + } + } + if (in_numel > max_diag && max_diag > 0) { + // 输入比对角线可容纳的更长,按 torch.diagflat 语义会被截断; + // 这里允许这种情况(kernel 里会在越界前 break),不报错。 + (void)in_numel; + } + + *desc_ptr = new Descriptor( + dtype, + std::move(in_shape), + std::move(out_shape), + offset, + 0, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; + } + + template + static void diagflat_kernel( + const T *x, + T *y, + const std::vector &in_shape, + const std::vector &out_shape, + int64_t offset) { + + // 计算输出元素个数并清零 + size_t out_numel = 1; + for (auto d : out_shape) { + out_numel *= d; + } + for (size_t i = 0; i < out_numel; ++i) { + y[i] = T{}; + } + + // 输入展平 + size_t in_numel = 1; + for (auto d : in_shape) { + in_numel *= d; + } + + // 视输出为二维矩阵 (n, m) + size_t ndim = out_shape.size(); + size_t n = out_shape[ndim - 2]; + size_t m = out_shape[ndim - 1]; + + (void)ndim; + + size_t i0 = 0; + size_t j0 = 0; + if (offset >= 0) { + j0 = static_cast(offset); + } else { + i0 = static_cast(-offset); + } + + for (size_t k = 0; k < in_numel; ++k) { + size_t ii = i0 + k; + size_t jj = j0 + k; + if (ii >= n || jj >= m) { + break; + } + size_t idx = ii * m + jj; + y[idx] = x[k]; + } + } + + infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + const void *x = inputs[0]; + + switch (_dtype) { + case INFINI_DTYPE_F16: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _output_shape, + _offset); + break; + case INFINI_DTYPE_BF16: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _output_shape, + _offset); + break; + case INFINI_DTYPE_F32: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _output_shape, + _offset); + break; + case INFINI_DTYPE_F64: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _output_shape, + _offset); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + } // namespace op::diagflat::cpu \ No newline at end of file diff --git a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h new file mode 100644 index 000000000..489189540 --- /dev/null +++ b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h @@ -0,0 +1,52 @@ +#ifndef __DIAGFLAT_CPU_H__ +#define __DIAGFLAT_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" +#include + +namespace op::diagflat::cpu { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + std::vector _input_shape; + std::vector _output_shape; + int64_t _offset; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + std::vector input_shape, + std::vector output_shape, + int64_t offset, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_shape(std::move(input_shape)), + _output_shape(std::move(output_shape)), + _offset(offset), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_descs, + int64_t offset); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; + +} // namespace op::diagflat::cpu + +#endif // __DIAGFLAT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/diagflat/operator.cc b/src/infiniop/ops/diagflat/operator.cc new file mode 100644 index 000000000..5fba61898 --- /dev/null +++ b/src/infiniop/ops/diagflat/operator.cc @@ -0,0 +1,99 @@ + #include "../../operator.h" + #include "../../handle.h" + #include "infiniop/ops/diagflat.h" + + #ifdef ENABLE_CPU_API + #include "cpu/diagflat_cpu.h" + #endif + + __C infiniStatus_t infiniopCreateDiagflatDescriptor( + infiniopHandle_t handle, + infiniopDiagflatDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t offset) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::diagflat::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + std::vector{x_desc}, \ + offset) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CREATE + } + + __C infiniStatus_t + infiniopGetDiagflatWorkspaceSize( + infiniopDiagflatDescriptor_t desc, + size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc) \ + ->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef GET + } + + __C infiniStatus_t infiniopDiagflat( + infiniopDiagflatDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CALCULATE + } + + __C infiniStatus_t + infiniopDestroyDiagflatDescriptor(infiniopDiagflatDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef DELETE + } From 4843c3d09db4c264eafeff147006ac42252222af Mon Sep 17 
00:00:00 2001 From: guozhihao3 Date: Mon, 1 Dec 2025 22:35:42 +0800 Subject: [PATCH 06/10] =?UTF-8?q?diagflat:=20=E8=B7=91=E9=80=9Acpu?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/infinicore/ops/diagflat/diagflat.cc | 18 +- src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc | 400 ++++++++++-------- src/infiniop/ops/diagflat/cpu/diagflat_cpu.h | 11 +- test/infinicore/ops/diagflat.py | 6 +- 4 files changed, 238 insertions(+), 197 deletions(-) diff --git a/src/infinicore/ops/diagflat/diagflat.cc b/src/infinicore/ops/diagflat/diagflat.cc index 0e676774e..d63bbc603 100644 --- a/src/infinicore/ops/diagflat/diagflat.cc +++ b/src/infinicore/ops/diagflat/diagflat.cc @@ -12,12 +12,16 @@ void Diagflat::execute(Tensor output, Tensor input, int64_t offset) { } Tensor diagflat(Tensor input, int64_t offset) { - // 输出 shape 由后端决定,这里直接让后端写入 output - // 先构造一个占位 Tensor(0-dim),再让实现自己 resize/allocate 也可以; - // 为简单起见,这里暂时只支持 out 版本:用户通过 diagflat_ 使用。 - auto flat = Tensor::empty({input->numel()}, input->dtype(), input->device()); - diagflat_(flat, input, offset); - return flat; + // 根据 PyTorch 语义:先展平,长度 n = input.numel() + // 输出为 2D 矩阵 (n + abs(offset), n + abs(offset)) + auto n = input->numel(); + auto abs_off = offset >= 0 ? static_cast(offset) + : static_cast(-offset); + auto dim = static_cast(n + abs_off); + + auto out = Tensor::empty({dim, dim}, input->dtype(), input->device()); + diagflat_(out, input, offset); + return out; } void diagflat_(Tensor output, Tensor input, int64_t offset) { @@ -25,5 +29,3 @@ void diagflat_(Tensor output, Tensor input, int64_t offset) { } } // namespace infinicore::op - - diff --git a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc index 2f68917b6..aafc18827 100644 --- a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc +++ b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.cc @@ -1,185 +1,215 @@ - #include "diagflat_cpu.h" - - namespace op::diagflat::cpu { - - Descriptor::~Descriptor() = default; - - infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_descs, - int64_t offset) { - - auto handle = reinterpret_cast(handle_); - - if (input_descs.size() != 1) { - return INFINI_STATUS_BAD_PARAM; - } - - auto in_desc = input_descs[0]; - auto dtype = out_desc->dtype(); - auto in_shape = in_desc->shape(); - auto out_shape = out_desc->shape(); - - // 支持与 diagflat 测试一致的 dtype - CHECK_DTYPE( - dtype, - INFINI_DTYPE_BF16, - INFINI_DTYPE_F16, - INFINI_DTYPE_F32, - INFINI_DTYPE_F64); - - // 只支持 contiguous - if (!in_desc->isContiguous() || !out_desc->isContiguous()) { - return INFINI_STATUS_BAD_TENSOR_STRIDES; - } - - // NOTE: 这里只做非常轻量的 shape 检查: - // 输入展平后长度 n,输出必须是 2D 或更高维最后两维组成的矩阵,至少能容纳 diag。 - size_t in_numel = in_desc->numel(); - if (out_shape.size() < 2) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; - } - size_t n = out_shape[out_shape.size() - 2]; - size_t m = out_shape[out_shape.size() - 1]; - // 最长对角线长度 - size_t max_diag = 0; - if (offset >= 0) { - if (offset >= static_cast(m)) { - max_diag = 0; - } else { - max_diag = std::min(n, m - static_cast(offset)); - } - } else { // offset < 0 - if (-offset >= static_cast(n)) { - max_diag = 0; - } else { - max_diag = std::min(m, n - static_cast(-offset)); - } - } - if (in_numel > max_diag && max_diag > 0) { - // 输入比对角线可容纳的更长,按 torch.diagflat 语义会被截断; - // 这里允许这种情况(kernel 里会在越界前 break),不报错。 - (void)in_numel; - } - - 
*desc_ptr = new Descriptor( - dtype, - std::move(in_shape), - std::move(out_shape), - offset, - 0, - handle->device, - handle->device_id); - - return INFINI_STATUS_SUCCESS; - } - - template - static void diagflat_kernel( - const T *x, - T *y, - const std::vector &in_shape, - const std::vector &out_shape, - int64_t offset) { - - // 计算输出元素个数并清零 - size_t out_numel = 1; - for (auto d : out_shape) { - out_numel *= d; - } - for (size_t i = 0; i < out_numel; ++i) { - y[i] = T{}; - } - - // 输入展平 - size_t in_numel = 1; - for (auto d : in_shape) { - in_numel *= d; - } - - // 视输出为二维矩阵 (n, m) - size_t ndim = out_shape.size(); - size_t n = out_shape[ndim - 2]; - size_t m = out_shape[ndim - 1]; - - (void)ndim; - - size_t i0 = 0; - size_t j0 = 0; - if (offset >= 0) { - j0 = static_cast(offset); - } else { - i0 = static_cast(-offset); - } - - for (size_t k = 0; k < in_numel; ++k) { - size_t ii = i0 + k; - size_t jj = j0 + k; - if (ii >= n || jj >= m) { - break; - } - size_t idx = ii * m + jj; - y[idx] = x[k]; - } - } - - infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - (void)workspace; - (void)workspace_size; - (void)stream; - - if (inputs.size() != 1) { - return INFINI_STATUS_BAD_PARAM; - } - - const void *x = inputs[0]; - - switch (_dtype) { - case INFINI_DTYPE_F16: - diagflat_kernel( - static_cast(x), - static_cast(output), - _input_shape, - _output_shape, - _offset); - break; - case INFINI_DTYPE_BF16: - diagflat_kernel( - static_cast(x), - static_cast(output), - _input_shape, - _output_shape, - _offset); - break; - case INFINI_DTYPE_F32: - diagflat_kernel( - static_cast(x), - static_cast(output), - _input_shape, - _output_shape, - _offset); - break; - case INFINI_DTYPE_F64: - diagflat_kernel( - static_cast(x), - static_cast(output), - _input_shape, - _output_shape, - _offset); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; - } - - } // namespace op::diagflat::cpu \ No newline at end of file +#include "diagflat_cpu.h" + +namespace op::diagflat::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_descs, + int64_t offset) { + + auto handle = reinterpret_cast(handle_); + + if (input_descs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + auto in_desc = input_descs[0]; + auto dtype = out_desc->dtype(); + auto in_shape = in_desc->shape(); + auto in_strides = in_desc->strides(); + auto out_shape = out_desc->shape(); + auto ndim = in_desc->ndim(); + + // 支持与 diagflat 测试一致的 dtype + CHECK_DTYPE( + dtype, + INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64); + + // 只要求输出连续,输入可以非连续 + if (!out_desc->isContiguous()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // NOTE: 这里只做非常轻量的 shape 检查: + // 输入展平后长度 n,输出必须是 2D 或更高维最后两维组成的矩阵,至少能容纳 diag。 + size_t in_numel = in_desc->numel(); + if (out_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t n = out_shape[out_shape.size() - 2]; + size_t m = out_shape[out_shape.size() - 1]; + // 最长对角线长度 + size_t max_diag = 0; + if (offset >= 0) { + if (offset >= static_cast(m)) { + max_diag = 0; + } else { + max_diag = std::min(n, m - static_cast(offset)); + } + } else { // offset < 0 + if (-offset >= static_cast(n)) { + max_diag = 0; + } else { + max_diag = std::min(m, n - static_cast(-offset)); + } + } + if (in_numel > max_diag && 
max_diag > 0) { + // 输入比对角线可容纳的更长,按 torch.diagflat 语义会被截断; + // 这里允许这种情况(kernel 里会在越界前 break),不报错。 + (void)in_numel; + } + + bool input_contiguous = in_desc->isContiguous(); + + *desc_ptr = new Descriptor( + dtype, + std::move(in_shape), + std::move(in_strides), + std::move(out_shape), + offset, + 0, + input_contiguous, + ndim, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +static void diagflat_kernel( + const T *x, + T *y, + const std::vector &in_shape, + const std::vector &in_strides, + bool input_contiguous, + const std::vector &out_shape, + int64_t offset, + size_t ndim) { + + // 计算输出元素个数并清零 + size_t out_numel = 1; + for (auto d : out_shape) { + out_numel *= d; + } + for (size_t i = 0; i < out_numel; ++i) { + y[i] = T{}; + } + + // 输入展平 + size_t in_numel = 1; + for (auto d : in_shape) { + in_numel *= d; + } + + // 视输出为二维矩阵 (n, m) + size_t n = out_shape[out_shape.size() - 2]; + size_t m = out_shape[out_shape.size() - 1]; + + size_t i0 = 0; + size_t j0 = 0; + if (offset >= 0) { + j0 = static_cast(offset); + } else { + i0 = static_cast(-offset); + } + + for (size_t k = 0; k < in_numel; ++k) { + // 计算输入真实 index(支持非连续) + size_t src_idx; + if (input_contiguous) { + src_idx = k; + } else { + src_idx = op::common_cpu::indexToOffset( + k, + ndim, + in_shape.data(), + in_strides.data()); + } + size_t ii = i0 + k; + size_t jj = j0 + k; + if (ii >= n || jj >= m) { + break; + } + size_t idx = ii * m + jj; + y[idx] = x[src_idx]; + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + const void *x = inputs[0]; + + switch (_dtype) { + case INFINI_DTYPE_F16: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _input_strides, + _input_contiguous, + _output_shape, + _offset, + _ndim); + break; + case INFINI_DTYPE_BF16: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _input_strides, + _input_contiguous, + _output_shape, + _offset, + _ndim); + break; + case INFINI_DTYPE_F32: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _input_strides, + _input_contiguous, + _output_shape, + _offset, + _ndim); + break; + case INFINI_DTYPE_F64: + diagflat_kernel( + static_cast(x), + static_cast(output), + _input_shape, + _input_strides, + _input_contiguous, + _output_shape, + _offset, + _ndim); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::diagflat::cpu \ No newline at end of file diff --git a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h index 489189540..c741ffa5a 100644 --- a/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h +++ b/src/infiniop/ops/diagflat/cpu/diagflat_cpu.h @@ -8,24 +8,33 @@ namespace op::diagflat::cpu { class Descriptor final : public InfiniopDescriptor { infiniDtype_t _dtype; std::vector _input_shape; + std::vector _input_strides; std::vector _output_shape; int64_t _offset; size_t _workspace_size; + bool _input_contiguous; + size_t _ndim; Descriptor( infiniDtype_t dtype, std::vector input_shape, + std::vector input_strides, std::vector output_shape, int64_t offset, size_t workspace_size, + bool input_contiguous, + size_t ndim, infiniDevice_t device_type, int device_id) : InfiniopDescriptor{device_type, device_id}, _dtype(dtype), 
_input_shape(std::move(input_shape)), + _input_strides(std::move(input_strides)), _output_shape(std::move(output_shape)), _offset(offset), - _workspace_size(workspace_size) {} + _workspace_size(workspace_size), + _input_contiguous(input_contiguous), + _ndim(ndim) {} public: ~Descriptor(); diff --git a/test/infinicore/ops/diagflat.py b/test/infinicore/ops/diagflat.py index 37a17d119..d7a01df4b 100644 --- a/test/infinicore/ops/diagflat.py +++ b/test/infinicore/ops/diagflat.py @@ -64,9 +64,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.diagflat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.diagflat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.diagflat(*args, **kwargs) def main(): From 1b9409a8587ed0cbf45280e886c4d2032144684b Mon Sep 17 00:00:00 2001 From: guozhihao Date: Fri, 5 Dec 2025 00:38:03 +0800 Subject: [PATCH 07/10] =?UTF-8?q?aminmax=EF=BC=9A=20support=20cpu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infinicore/ops.hpp | 1 + include/infinicore/ops/aminmax.hpp | 25 ++ include/infiniop.h | 1 + include/infiniop/ops/aminmax.h | 31 ++ python/infinicore/__init__.py | 2 + python/infinicore/ops/aminmax.py | 16 + src/infinicore/ops/aminmax/aminmax.cc | 59 +++ .../ops/aminmax/aminmax_infiniop.cc | 74 ++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/aminmax.hpp | 43 +++ src/infiniop/ops/aminmax/cpu/aminmax_cpu.cc | 350 ++++++++++++++++++ src/infiniop/ops/aminmax/cpu/aminmax_cpu.h | 60 +++ src/infiniop/ops/aminmax/operator.cc | 93 +++++ test/infinicore/ops/aminmax.py | 2 + 14 files changed, 759 insertions(+) create mode 100644 include/infinicore/ops/aminmax.hpp create mode 100644 include/infiniop/ops/aminmax.h create mode 100644 python/infinicore/ops/aminmax.py create mode 100644 src/infinicore/ops/aminmax/aminmax.cc create mode 100644 src/infinicore/ops/aminmax/aminmax_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/aminmax.hpp create mode 100644 src/infiniop/ops/aminmax/cpu/aminmax_cpu.cc create mode 100644 src/infiniop/ops/aminmax/cpu/aminmax_cpu.h create mode 100644 src/infiniop/ops/aminmax/operator.cc diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 0937a4821..5984dcf78 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -1,6 +1,7 @@ #pragma once #include "ops/add.hpp" +#include "ops/aminmax.hpp" #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/matmul.hpp" diff --git a/include/infinicore/ops/aminmax.hpp b/include/infinicore/ops/aminmax.hpp new file mode 100644 index 000000000..c6c2451b5 --- /dev/null +++ b/include/infinicore/ops/aminmax.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" +#include +#include + +namespace infinicore::op { +class Aminmax { +public: + using schema = void (*)(Tensor, Tensor, Tensor, std::optional, bool); + static void execute(Tensor min_output, Tensor max_output, Tensor input, + std::optional dim, bool keepdim); + static common::OpDispatcher &dispatcher(); +}; + +// 返回 (min_tensor, max_tensor) 的 pair +std::pair aminmax(Tensor input, + std::optional dim = std::nullopt, + bool keepdim = false); + +void aminmax_(Tensor min_output, Tensor max_output, Tensor input, + std::optional dim = std::nullopt, + bool 
keepdim = false); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index f66f7d952..6d57794e3 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -3,6 +3,7 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" +#include "infiniop/ops/aminmax.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" diff --git a/include/infiniop/ops/aminmax.h b/include/infiniop/ops/aminmax.h new file mode 100644 index 000000000..9a3616144 --- /dev/null +++ b/include/infiniop/ops/aminmax.h @@ -0,0 +1,31 @@ +#pragma once + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopAminmaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAminmaxDescriptor( + infiniopHandle_t handle, + infiniopAminmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t min_output_desc, + infiniopTensorDescriptor_t max_output_desc, + infiniopTensorDescriptor_t input_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim); + +__C __export infiniStatus_t +infiniopGetAminmaxWorkspaceSize(infiniopAminmaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAminmax( + infiniopAminmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *min_output, + void *max_output, + const void *input, + void *stream); + +__C __export infiniStatus_t +infiniopDestroyAminmaxDescriptor(infiniopAminmaxDescriptor_t desc); diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 7684e7706..90b609d12 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -39,6 +39,7 @@ uint8, ) from infinicore.ops.add import add +from infinicore.ops.aminmax import aminmax from infinicore.ops.attention import attention from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul @@ -100,6 +101,7 @@ "uint8", # Operations. 
"add", + "aminmax", "attention", "matmul", "mul", diff --git a/python/infinicore/ops/aminmax.py b/python/infinicore/ops/aminmax.py new file mode 100644 index 000000000..9aa5e418e --- /dev/null +++ b/python/infinicore/ops/aminmax.py @@ -0,0 +1,16 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def aminmax(input, dim=None, keepdim=False, *, out=None): + if out is None: + min_tensor, max_tensor = _infinicore.aminmax(input._underlying, dim, keepdim) + return (Tensor(min_tensor), Tensor(max_tensor)) + + if not isinstance(out, tuple) or len(out) != 2: + raise ValueError("out must be a tuple of (min_tensor, max_tensor)") + + min_out, max_out = out + _infinicore.aminmax_(min_out._underlying, max_out._underlying, input._underlying, dim, keepdim) + return out + diff --git a/src/infinicore/ops/aminmax/aminmax.cc b/src/infinicore/ops/aminmax/aminmax.cc new file mode 100644 index 000000000..5e640e58f --- /dev/null +++ b/src/infinicore/ops/aminmax/aminmax.cc @@ -0,0 +1,59 @@ +#include "infinicore/ops/aminmax.hpp" +#include "infinicore/context/context.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Aminmax::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Aminmax::execute(Tensor min_output, Tensor max_output, Tensor input, + std::optional dim, bool keepdim) { + dispatcher().lookup(context::getDevice().getType())( + min_output, max_output, input, dim, keepdim); +} + +// 计算输出形状(与 any 类似) +static Shape compute_output_shape(const Shape &input_shape, + std::optional dim, + bool keepdim) { + if (!dim.has_value()) { + // 全局 reduce + if (keepdim) { + return Shape(input_shape.size(), 1); + } else { + return Shape{}; // 标量 + } + } else { + int64_t d = dim.value(); + if (d < 0) { + d += static_cast(input_shape.size()); + } + Shape output_shape = input_shape; + if (keepdim) { + output_shape[d] = 1; + } else { + output_shape.erase(output_shape.begin() + d); + } + return output_shape; + } +} + +std::pair aminmax(Tensor input, + std::optional dim, + bool keepdim) { + auto output_shape = compute_output_shape(input->shape(), dim, keepdim); + auto min_output = Tensor::empty(output_shape, input->dtype(), input->device()); + auto max_output = Tensor::empty(output_shape, input->dtype(), input->device()); + aminmax_(min_output, max_output, input, dim, keepdim); + return {min_output, max_output}; +} + +void aminmax_(Tensor min_output, Tensor max_output, Tensor input, + std::optional dim, bool keepdim) { + Aminmax::execute(min_output, max_output, input, dim, keepdim); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/aminmax/aminmax_infiniop.cc b/src/infinicore/ops/aminmax/aminmax_infiniop.cc new file mode 100644 index 000000000..6834fb706 --- /dev/null +++ b/src/infinicore/ops/aminmax/aminmax_infiniop.cc @@ -0,0 +1,74 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/aminmax.hpp" +#include "infiniop/ops/aminmax.h" +#include + +namespace infinicore::op::aminmax_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopAminmaxDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAminmaxDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor min_output, Tensor max_output, Tensor input, + std::optional dim, bool keepdim) { + size_t seed = hash_combine(min_output, max_output, input, dim.has_value() ? 
dim.value() : -1, keepdim); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAminmaxDescriptor_t desc = nullptr; + + if (!desc_opt) { + int32_t has_dim = dim.has_value() ? 1 : 0; + int64_t dim_val = dim.has_value() ? dim.value() : 0; + + INFINICORE_CHECK_ERROR(infiniopCreateAminmaxDescriptor( + context::getInfiniopHandle(min_output->device()), + &desc, + min_output->desc(), + max_output->desc(), + input->desc(), + dim_val, + keepdim ? 1 : 0, + has_dim)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAminmaxWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size > 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopAminmax( + desc, + workspace_ptr, + workspace_size, + min_output->data(), + max_output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Aminmax::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::aminmax_impl::infiniop + diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index abf1baee9..cc4365ec0 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -18,6 +18,7 @@ #include "ops/silu.hpp" #include "ops/sqrt.hpp" #include "ops/swiglu.hpp" +#include "ops/aminmax.hpp" namespace py = pybind11; @@ -40,6 +41,7 @@ inline void bind(py::module &m) { bind_sqrt(m); bind_elu(m); bind_diagflat(m); + bind_aminmax(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/aminmax.hpp b/src/infinicore/pybind11/ops/aminmax.hpp new file mode 100644 index 000000000..65c7eafcd --- /dev/null +++ b/src/infinicore/pybind11/ops/aminmax.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include "infinicore/ops/aminmax.hpp" +#include +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +std::pair py_aminmax(Tensor input, py::object dim, bool keepdim) { + std::optional dim_opt = std::nullopt; + if (!dim.is_none()) { + dim_opt = dim.cast(); + } + return op::aminmax(input, dim_opt, keepdim); +} + +void py_aminmax_(Tensor min_output, Tensor max_output, Tensor input, py::object dim, bool keepdim) { + std::optional dim_opt = std::nullopt; + if (!dim.is_none()) { + dim_opt = dim.cast(); + } + op::aminmax_(min_output, max_output, input, dim_opt, keepdim); +} + +inline void bind_aminmax(py::module &m) { + m.def("aminmax", &py_aminmax, + py::arg("input"), + py::arg("dim") = py::none(), + py::arg("keepdim") = false, + R"doc(Returns a tuple (min, max) of the minimum and maximum values of the input tensor.)doc"); + + m.def("aminmax_", &py_aminmax_, + py::arg("min_output"), + py::arg("max_output"), + py::arg("input"), + py::arg("dim") = py::none(), + py::arg("keepdim") = false, + R"doc(In-place version of aminmax.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/aminmax/cpu/aminmax_cpu.cc b/src/infiniop/ops/aminmax/cpu/aminmax_cpu.cc new file mode 100644 index 000000000..492d02e0b --- /dev/null +++ b/src/infiniop/ops/aminmax/cpu/aminmax_cpu.cc @@ -0,0 +1,350 @@ +#include "aminmax_cpu.h" +#include "../../../../utils.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../tensor.h" +#include +#include +#include + 
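+// Overview of the CPU aminmax implementation below:
+//   - global reduction (no dim given): one scan over every element produces
+//     a single min/max pair;
+//   - reduction over a given dim: for each output coordinate, the reduced
+//     dimension is walked with a small Cartesian-product iterator.
+// Input offsets are computed from the recorded strides, so non-contiguous
+// inputs are supported; fp16/bf16 values are accumulated in float and cast
+// back to the storage type when written out.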
+namespace op::aminmax::cpu { + +// 辅助函数:将线性索引转换为多维索引(与 any 相同) +namespace { +inline void linear_to_multi_index( + size_t linear_idx, + const std::vector &shape, + std::vector &indices) { + size_t temp = linear_idx; + for (size_t i = shape.size(); i-- > 0;) { + indices[i] = temp % shape[i]; + temp /= shape[i]; + } +} + +// 笛卡尔积迭代器(与 any 相同) +class CartesianProductIterator { +public: + CartesianProductIterator(const std::vector &dims) + : dims(dims), indices(dims.size(), 0), first_call(true) {} + + bool next() { + if (first_call) { + first_call = false; + return !dims.empty(); + } + + for (int i = dims.size() - 1; i >= 0; --i) { + indices[i]++; + if (indices[i] < dims[i]) { + return true; + } + indices[i] = 0; + } + return false; + } + + const std::vector &get() const { + return indices; + } + +private: + std::vector dims; + std::vector indices; + bool first_call; +}; +} // namespace + +// 全局 aminmax kernel +template +void aminmax_global_kernel( + const T *input_data, + T *min_output_data, + T *max_output_data, + const AminmaxInfo &info) { + + // 对于 fp16_t 和 bf16_t,使用 float 来计算以提高精度 + if constexpr (std::is_same_v || std::is_same_v) { + float min_val_f = std::numeric_limits::max(); + float max_val_f = std::numeric_limits::lowest(); + + size_t in_numel = 1; + for (auto s : info.in_shape) { + in_numel *= s; + } + + for (size_t i = 0; i < in_numel; ++i) { + std::vector in_indices(info.in_shape.size()); + linear_to_multi_index(i, info.in_shape, in_indices); + + size_t in_offset = 0; + for (size_t j = 0; j < in_indices.size(); ++j) { + in_offset += in_indices[j] * info.in_strides[j]; + } + + // 转换为 float 进行比较 + float val_f = utils::cast(input_data[in_offset]); + if (val_f < min_val_f) min_val_f = val_f; + if (val_f > max_val_f) max_val_f = val_f; + } + + min_output_data[0] = utils::cast(min_val_f); + max_output_data[0] = utils::cast(max_val_f); + } else { + T min_val = std::numeric_limits::max(); + T max_val = std::numeric_limits::lowest(); + + size_t in_numel = 1; + for (auto s : info.in_shape) { + in_numel *= s; + } + + for (size_t i = 0; i < in_numel; ++i) { + std::vector in_indices(info.in_shape.size()); + linear_to_multi_index(i, info.in_shape, in_indices); + + size_t in_offset = 0; + for (size_t j = 0; j < in_indices.size(); ++j) { + in_offset += in_indices[j] * info.in_strides[j]; + } + + T val = input_data[in_offset]; + if (val < min_val) min_val = val; + if (val > max_val) max_val = val; + } + + min_output_data[0] = min_val; + max_output_data[0] = max_val; + } +} + +// 通用 aminmax kernel(支持多维规约、非连续张量) +template +void aminmax_kernel( + const T *input_data, + T *min_output_data, + T *max_output_data, + const AminmaxInfo &info) { + + // 处理全局 aminmax + if (info.is_global) { + aminmax_global_kernel(input_data, min_output_data, max_output_data, info); + return; + } + + size_t out_numel = 1; + for (auto s : info.out_shape) { + out_numel *= s; + } + + for (size_t out_idx = 0; out_idx < out_numel; ++out_idx) { + // 1. 将 flat 输出索引转换为多维索引 + std::vector out_indices(info.out_shape.size()); + linear_to_multi_index(out_idx, info.out_shape, out_indices); + + // 2. 
构建完整的输入索引(根据 keepdim 和 reduce_dims) + std::vector in_indices(info.in_shape.size()); + if (info.keepdim) { + in_indices = out_indices; + } else { + size_t out_index = 0; + for (size_t i = 0; i < info.in_shape.size(); ++i) { + bool is_reduce_dim = false; + for (auto rd : info.reduce_dims) { + if (i == static_cast(rd)) { + is_reduce_dim = true; + break; + } + } + if (!is_reduce_dim) { + in_indices[i] = out_indices[out_index++]; + } + } + } + + // 3. 构建笛卡尔积:遍历所有 reduce_dims 的组合 + std::vector reduce_dims_sizes; + for (auto rd : info.reduce_dims) { + reduce_dims_sizes.push_back(info.in_shape[rd]); + } + + T min_val; + T max_val; + + // 对于 fp16_t 和 bf16_t,使用 float 来计算以提高精度 + if constexpr (std::is_same_v || std::is_same_v) { + float min_val_f = std::numeric_limits::max(); + float max_val_f = std::numeric_limits::lowest(); + + CartesianProductIterator iter(reduce_dims_sizes); + while (iter.next()) { + // 构建完整的输入索引 + std::vector full_in_indices = in_indices; + size_t reduce_index = 0; + for (auto rd : info.reduce_dims) { + full_in_indices[rd] = iter.get()[reduce_index++]; + } + + // 计算输入内存偏移 + size_t in_offset = 0; + for (size_t i = 0; i < full_in_indices.size(); ++i) { + in_offset += full_in_indices[i] * info.in_strides[i]; + } + + // 转换为 float 进行比较 + float val_f = utils::cast(input_data[in_offset]); + if (val_f < min_val_f) min_val_f = val_f; + if (val_f > max_val_f) max_val_f = val_f; + } + + // 转换回 T 类型 + min_val = utils::cast(min_val_f); + max_val = utils::cast(max_val_f); + } else { + min_val = std::numeric_limits::max(); + max_val = std::numeric_limits::lowest(); + + CartesianProductIterator iter(reduce_dims_sizes); + while (iter.next()) { + // 构建完整的输入索引 + std::vector full_in_indices = in_indices; + size_t reduce_index = 0; + for (auto rd : info.reduce_dims) { + full_in_indices[rd] = iter.get()[reduce_index++]; + } + + // 计算输入内存偏移 + size_t in_offset = 0; + for (size_t i = 0; i < full_in_indices.size(); ++i) { + in_offset += full_in_indices[i] * info.in_strides[i]; + } + + // 更新 min 和 max + T val = input_data[in_offset]; + if (val < min_val) min_val = val; + if (val > max_val) max_val = val; + } + } + + // 4. 
写入输出(考虑输出张量的 strides) + size_t min_out_offset = 0; + size_t max_out_offset = 0; + for (size_t i = 0; i < out_indices.size(); ++i) { + min_out_offset += out_indices[i] * info.out_strides[i]; + max_out_offset += out_indices[i] * info.out_strides[i]; + } + + min_output_data[min_out_offset] = min_val; + max_output_data[max_out_offset] = max_val; + } +} + +// 创建 AminmaxInfo(与 AnyInfo::create 类似) +utils::Result AminmaxInfo::create( + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim) { + + AminmaxInfo info; + info.keepdim = (keepdim != 0); + info.dtype = input_desc->dtype(); + + // 获取输入形状和 strides + size_t ndim = input_desc->ndim(); + info.in_shape.resize(ndim); + info.in_strides.resize(ndim); + for (size_t i = 0; i < ndim; ++i) { + info.in_shape[i] = input_desc->dim(i); + info.in_strides[i] = input_desc->stride(i); + } + + // 处理 reduce_dims + if (has_dim) { + int64_t d = dim; + if (d < 0) { + d += static_cast(ndim); + } + if (d < 0 || d >= static_cast(ndim)) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + info.reduce_dims.push_back(d); + } + + info.is_global = info.reduce_dims.empty(); + + // 获取输出形状和 strides + size_t out_ndim = output_desc->ndim(); + info.out_shape.resize(out_ndim); + info.out_strides.resize(out_ndim); + for (size_t i = 0; i < out_ndim; ++i) { + info.out_shape[i] = output_desc->dim(i); + info.out_strides[i] = output_desc->stride(i); + } + + return utils::Result(info); +} + +// 实际计算函数(根据 dtype 分发) +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *min_output, + void *max_output, + const void *input, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + aminmax_kernel(static_cast(input), + static_cast(min_output), + static_cast(max_output), _info); + break; + case INFINI_DTYPE_BF16: + aminmax_kernel(static_cast(input), + static_cast(min_output), + static_cast(max_output), _info); + break; + case INFINI_DTYPE_F32: + aminmax_kernel(static_cast(input), + static_cast(min_output), + static_cast(max_output), _info); + break; + case INFINI_DTYPE_F64: + aminmax_kernel(static_cast(input), + static_cast(min_output), + static_cast(max_output), _info); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t min_output_desc, + infiniopTensorDescriptor_t max_output_desc, + infiniopTensorDescriptor_t input_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim) { + + // 验证 min 和 max 输出形状相同 + if (min_output_desc->shape() != max_output_desc->shape()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto result = AminmaxInfo::create( + input_desc, min_output_desc, dim, keepdim, has_dim); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::aminmax::cpu + diff --git a/src/infiniop/ops/aminmax/cpu/aminmax_cpu.h b/src/infiniop/ops/aminmax/cpu/aminmax_cpu.h new file mode 100644 index 000000000..51c0db226 --- /dev/null +++ b/src/infiniop/ops/aminmax/cpu/aminmax_cpu.h @@ -0,0 +1,60 @@ +#ifndef __AMINMAX_CPU_H__ +#define __AMINMAX_CPU_H__ + +#include "../../../../utils.h" +#include "../../../operator.h" +#include + +namespace op::aminmax::cpu { + +struct AminmaxInfo { + std::vector in_shape; + std::vector in_strides; + std::vector out_shape; + std::vector 
out_strides; + std::vector reduce_dims; + bool keepdim; + bool is_global; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim); +}; + +class Descriptor : public InfiniopDescriptor { +public: + AminmaxInfo _info; + + Descriptor(AminmaxInfo info, infiniDevice_t device, int device_id) + : InfiniopDescriptor{device, device_id}, _info(std::move(info)) {} + + ~Descriptor() = default; + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t min_output_desc, + infiniopTensorDescriptor_t max_output_desc, + infiniopTensorDescriptor_t input_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *min_output, + void *max_output, + const void *input, + void *stream) const; +}; + +} // namespace op::aminmax::cpu + +#endif // __AMINMAX_CPU_H__ diff --git a/src/infiniop/ops/aminmax/operator.cc b/src/infiniop/ops/aminmax/operator.cc new file mode 100644 index 000000000..6f30c6380 --- /dev/null +++ b/src/infiniop/ops/aminmax/operator.cc @@ -0,0 +1,93 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/aminmax.h" + +#ifdef ENABLE_CPU_API +#include "cpu/aminmax_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateAminmaxDescriptor( + infiniopHandle_t handle, + infiniopAminmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t min_output_desc, + infiniopTensorDescriptor_t max_output_desc, + infiniopTensorDescriptor_t input_desc, + int64_t dim, + int32_t keepdim, + int32_t has_dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::aminmax::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + min_output_desc, \ + max_output_desc, \ + input_desc, \ + dim, keepdim, has_dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAminmaxWorkspaceSize(infiniopAminmaxDescriptor_t desc, + size_t *size) { + *size = 0; + return INFINI_STATUS_SUCCESS; +} + +__C infiniStatus_t infiniopAminmax(infiniopAminmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *min_output, + void *max_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, min_output, max_output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAminmaxDescriptor(infiniopAminmaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/aminmax.py b/test/infinicore/ops/aminmax.py index b10c1b511..cb85d9d2e 100644 --- a/test/infinicore/ops/aminmax.py +++ b/test/infinicore/ops/aminmax.py @@ -231,6 +231,8 @@ def get_test_cases(self): def torch_operator(self, x, dim=None, keepdim=False, 
out=None, **kwargs): return torch.aminmax(x, dim=dim, keepdim=keepdim, out=out) + def infinicore_operator(self, x, dim=None, keepdim=False, out=None, **kwargs): + return infinicore.aminmax(x, dim=dim, keepdim=keepdim, out=out) def main(): """Main entry point""" From 9fb99771f0856bcd7703ec810303fd39a5b71289 Mon Sep 17 00:00:00 2001 From: guozhihao-224 Date: Fri, 5 Dec 2025 13:24:22 +0800 Subject: [PATCH 08/10] aminmax: fix cpu --- python/infinicore/ops/aminmax.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/infinicore/ops/aminmax.py b/python/infinicore/ops/aminmax.py index 9aa5e418e..0287abca0 100644 --- a/python/infinicore/ops/aminmax.py +++ b/python/infinicore/ops/aminmax.py @@ -7,8 +7,13 @@ def aminmax(input, dim=None, keepdim=False, *, out=None): min_tensor, max_tensor = _infinicore.aminmax(input._underlying, dim, keepdim) return (Tensor(min_tensor), Tensor(max_tensor)) - if not isinstance(out, tuple) or len(out) != 2: - raise ValueError("out must be a tuple of (min_tensor, max_tensor)") + # if not isinstance(out, tuple) or len(out) != 2: + # raise ValueError("out must be a tuple of (min_tensor, max_tensor)") + + # 接受元组或列表 + if not isinstance(out, (tuple, list)) or len(out) != 2: + raise ValueError("out must be a tuple or list of (min_tensor, max_tensor)") + min_out, max_out = out _infinicore.aminmax_(min_out._underlying, max_out._underlying, input._underlying, dim, keepdim) From 685ddb9f0940f64714da4b484289835479353832 Mon Sep 17 00:00:00 2001 From: guozhihao-224 Date: Fri, 5 Dec 2025 19:34:33 +0800 Subject: [PATCH 09/10] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Delu-cpu=20?= =?UTF-8?q?=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/infinicore/ops/elu/elu_infiniop.cc | 37 +++++++++++++++++++++----- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/infinicore/ops/elu/elu_infiniop.cc b/src/infinicore/ops/elu/elu_infiniop.cc index 15e2d7e76..6c5be549c 100644 --- a/src/infinicore/ops/elu/elu_infiniop.cc +++ b/src/infinicore/ops/elu/elu_infiniop.cc @@ -16,17 +16,40 @@ thread_local common::OpCache caches( }); void calculate(Tensor output, Tensor input, float alpha) { - // 构建缓存键:包含 input 特征和 alpha 参数 + // 构建缓存键:需要同时考虑 output 和 input 的特征 + // 特别是要区分 inplace (output == input) 和 out-of-place 的情况 size_t seed = 0; hash_combine(seed, static_cast(input->dtype())); hash_combine(seed, static_cast(*reinterpret_cast(&alpha))); // 将 float 转换为 uint32_t 进行哈希 + + // 检查是否为 inplace 操作 + bool is_inplace = (output->data() == input->data()); + hash_combine(seed, static_cast(is_inplace ? 
1 : 0)); - // 手动遍历 shape 和 strides - for (Size shape_val : input->shape()) { - hash_combine(seed, shape_val); - } - for (Stride stride_val : input->strides()) { - hash_combine(seed, static_cast(stride_val)); + // 对于 inplace 操作,只需要 input 的特征(因为 output == input) + // 对于 out-of-place 操作,需要同时考虑 output 和 input 的特征 + if (is_inplace) { + // Inplace: 只使用 input 的特征 + for (Size shape_val : input->shape()) { + hash_combine(seed, shape_val); + } + for (Stride stride_val : input->strides()) { + hash_combine(seed, static_cast(stride_val)); + } + } else { + // Out-of-place: 需要同时考虑 output 和 input + for (Size shape_val : output->shape()) { + hash_combine(seed, shape_val); + } + for (Stride stride_val : output->strides()) { + hash_combine(seed, static_cast(stride_val)); + } + for (Size shape_val : input->shape()) { + hash_combine(seed, shape_val); + } + for (Stride stride_val : input->strides()) { + hash_combine(seed, static_cast(stride_val)); + } } auto device_type = context::getDevice().getType(); From f7f45c59df1f6f10844fe2c5b9ed797c7c0912fb Mon Sep 17 00:00:00 2001 From: guozhihao-224 Date: Wed, 10 Dec 2025 19:24:10 +0800 Subject: [PATCH 10/10] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=20aminmax?= =?UTF-8?q?=E3=80=81diagflat=E3=80=81sqrt=20=E7=9A=84=20ntops=20=E7=AE=97?= =?UTF-8?q?=E5=AD=90=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/infinicore/ops/aminmax.py | 4 ++++ python/infinicore/ops/diagflat.py | 11 ++++++++--- python/infinicore/ops/sqrt.py | 6 ++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/python/infinicore/ops/aminmax.py b/python/infinicore/ops/aminmax.py index 0287abca0..fdef5f8d3 100644 --- a/python/infinicore/ops/aminmax.py +++ b/python/infinicore/ops/aminmax.py @@ -1,8 +1,12 @@ from infinicore.lib import _infinicore from infinicore.tensor import Tensor +import infinicore def aminmax(input, dim=None, keepdim=False, *, out=None): + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.aminmax(input, dim=dim, keepdim=keepdim, out=out) + if out is None: min_tensor, max_tensor = _infinicore.aminmax(input._underlying, dim, keepdim) return (Tensor(min_tensor), Tensor(max_tensor)) diff --git a/python/infinicore/ops/diagflat.py b/python/infinicore/ops/diagflat.py index c60878fc4..d19fee475 100644 --- a/python/infinicore/ops/diagflat.py +++ b/python/infinicore/ops/diagflat.py @@ -1,10 +1,15 @@ from infinicore.lib import _infinicore from infinicore.tensor import Tensor +import infinicore -def diagflat(input, *, offset=0, out=None): - if out is None: - return Tensor(_infinicore.diagflat(input._underlying, offset)) +def diagflat(input, *, offset=0): + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.diagflat(input, offset=offset) + + + return Tensor(_infinicore.diagflat(input._underlying, offset)) + _infinicore.diagflat_(out._underlying, input._underlying, offset) return out diff --git a/python/infinicore/ops/sqrt.py b/python/infinicore/ops/sqrt.py index 9395d1ef0..555d8a40a 100644 --- a/python/infinicore/ops/sqrt.py +++ b/python/infinicore/ops/sqrt.py @@ -1,8 +1,14 @@ from infinicore.lib import _infinicore from infinicore.tensor import Tensor +import infinicore + def sqrt(input, *, out=None): + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.sqrt(input, out=out) + if out is None: return Tensor(_infinicore.sqrt(input._underlying)) + _infinicore.sqrt_(out._underlying, 
input._underlying) return out \ No newline at end of file
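
Note: a consolidated sketch of the three Python wrappers as they stand at the end of this series, for review convenience (not an additional patch). It combines the ntops fast path from PATCH 10 with the _infinicore fallback paths added earlier. The out= parameter of diagflat is reinstated here as an assumption: the final hunk removes it from the signature while the body still references out, so treat that branch as a suggested follow-up rather than what the series currently applies.

    import infinicore
    from infinicore.lib import _infinicore
    from infinicore.tensor import Tensor

    def sqrt(input, *, out=None):
        # Prefer the ntops kernel on cuda/musa devices when enabled.
        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
            return infinicore.ntops.torch.sqrt(input, out=out)
        if out is None:
            return Tensor(_infinicore.sqrt(input._underlying))
        _infinicore.sqrt_(out._underlying, input._underlying)
        return out

    def aminmax(input, dim=None, keepdim=False, *, out=None):
        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
            return infinicore.ntops.torch.aminmax(input, dim=dim, keepdim=keepdim, out=out)
        if out is None:
            min_t, max_t = _infinicore.aminmax(input._underlying, dim, keepdim)
            return (Tensor(min_t), Tensor(max_t))
        if not isinstance(out, (tuple, list)) or len(out) != 2:
            raise ValueError("out must be a tuple or list of (min_tensor, max_tensor)")
        min_out, max_out = out
        _infinicore.aminmax_(min_out._underlying, max_out._underlying, input._underlying, dim, keepdim)
        return out

    def diagflat(input, *, offset=0, out=None):
        # out= handling restored here (assumption, see note above).
        if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
            return infinicore.ntops.torch.diagflat(input, offset=offset)
        if out is None:
            return Tensor(_infinicore.diagflat(input._underlying, offset))
        _infinicore.diagflat_(out._underlying, input._underlying, offset)
        return out

Usage mirrors torch, e.g. min_t, max_t = infinicore.aminmax(x, dim=-1, keepdim=True) on a CPU tensor x.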