diff --git a/include/infinicore/nn.hpp b/include/infinicore/nn.hpp
index b927b294b..4fb2d55c3 100644
--- a/include/infinicore/nn.hpp
+++ b/include/infinicore/nn.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include "nn/embedding.hpp"
+#include "nn/layernorm.hpp"
 #include "nn/linear.hpp"
 #include "nn/rmsnorm.hpp"
diff --git a/include/infinicore/nn/layernorm.hpp b/include/infinicore/nn/layernorm.hpp
new file mode 100644
index 000000000..6dbc5d3ba
--- /dev/null
+++ b/include/infinicore/nn/layernorm.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+
+namespace infinicore::nn {
+
+/**
+ * @brief Layer Normalization
+ *
+ * Applies LayerNorm over the last dimension.
+ *
+ * Formula: y = (x - mean) / sqrt(var + eps) * weight + bias
+ */
+class LayerNorm : public Module {
+public:
+    /**
+     * @brief Construct a LayerNorm layer
+     *
+     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
+     * @param eps Small constant for numerical stability (default: 1e-5)
+     * @param dtype Data type for the weight/bias (default: DataType::F32)
+     * @param device Device to create the parameters on
+     */
+    LayerNorm(size_t normalized_shape,
+              double eps = 1e-5,
+              const DataType &dtype = DataType::F32,
+              const Device &device = Device());
+
+    /**
+     * @brief Forward pass: apply LayerNorm
+     *
+     * @param x Input tensor of shape (*, normalized_shape)
+     * @return Normalized tensor with same shape as input
+     */
+    Tensor forward(const Tensor &x) const;
+
+    // Module information
+    size_t normalized_shape() const { return normalized_shape_; }
+    double eps() const { return eps_; }
+    DataType dtype() const { return dtype_; }
+
+    // String representation
+    std::string extra_repr() const;
+
+    // Accessors for parameters
+    Tensor weight() const { return weight_; }
+    Tensor bias() const { return bias_; }
+
+protected:
+    INFINICORE_NN_PARAMETER(weight);
+    INFINICORE_NN_PARAMETER(bias);
+
+private:
+    size_t normalized_shape_;
+    double eps_;
+    DataType dtype_;
+};
+
+} // namespace infinicore::nn
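Reviewer note (not part of the patch): a minimal usage sketch of the new module. The include path, shapes, and the default-constructed Device are illustrative assumptions based on the header above.

    #include <infinicore/nn.hpp>

    using namespace infinicore;

    void run_layernorm(Tensor x) {          // x: (batch, seq, 4096), illustrative
        nn::LayerNorm ln(/*normalized_shape=*/4096, /*eps=*/1e-5,
                         DataType::F32, Device());
        Tensor y = ln.forward(x);           // same shape as x, normalized over the last dim
    }

The constructor initializes weight to ones and bias to zeros (see src/infinicore/nn/layernorm.cc below), matching PyTorch's nn.LayerNorm defaults.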
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index a7249ec9d..031ba92cc 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -4,14 +4,22 @@
 #include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
+#include "ops/conv2d.hpp"
+#include "ops/gelu.hpp"
+#include "ops/gelutanh.hpp"
+#include "ops/layer_norm.hpp"
+#include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
 #include "ops/paged_attention.hpp"
 #include "ops/paged_attention_prefill.hpp"
 #include "ops/paged_caching.hpp"
+#include "ops/quickgelu.hpp"
 #include "ops/random_sample.hpp"
 #include "ops/rearrange.hpp"
+#include "ops/relu.hpp"
 #include "ops/rms_norm.hpp"
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
+#include "ops/softmax.hpp"
 #include "ops/swiglu.hpp"
diff --git a/include/infinicore/ops/conv2d.hpp b/include/infinicore/ops/conv2d.hpp
new file mode 100644
index 000000000..f1dda90ac
--- /dev/null
+++ b/include/infinicore/ops/conv2d.hpp
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace infinicore::op {
+class Conv2d {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor,
+                            const size_t *, const size_t *, const size_t *, size_t);
+    static void execute(Tensor output,
+                        Tensor input,
+                        Tensor weight,
+                        Tensor bias,
+                        const size_t *pads,
+                        const size_t *strides,
+                        const size_t *dilations,
+                        size_t n);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor conv2d(Tensor input,
+              Tensor weight,
+              Tensor bias,
+              const std::vector<size_t> &pads,
+              const std::vector<size_t> &strides,
+              const std::vector<size_t> &dilations);
+void conv2d_(Tensor output,
+             Tensor input,
+             Tensor weight,
+             Tensor bias,
+             const std::vector<size_t> &pads,
+             const std::vector<size_t> &strides,
+             const std::vector<size_t> &dilations);
+} // namespace infinicore::op
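Reviewer note: the out-of-place conv2d() (see src/infinicore/ops/conv2d/conv2d.cc below) allocates the output with the input's shape, which is only correct when padding/stride/dilation preserve the spatial dims. For the general case, pre-size the output and use the in-place form; a sketch using the standard convolution shape formula (variable names are illustrative):

    // out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
    size_t out_h = (in_h + 2 * pads[0] - dilations[0] * (k_h - 1) - 1) / strides[0] + 1;
    size_t out_w = (in_w + 2 * pads[1] - dilations[1] * (k_w - 1) - 1) / strides[1] + 1;
    auto output = Tensor::empty({batch, out_channels, out_h, out_w},
                                input->dtype(), input->device());
    infinicore::op::conv2d_(output, input, weight, bias, pads, strides, dilations);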
diff --git a/include/infinicore/ops/gelu.hpp b/include/infinicore/ops/gelu.hpp
new file mode 100644
index 000000000..5e8c61347
--- /dev/null
+++ b/include/infinicore/ops/gelu.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Gelu {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor gelu(Tensor input);
+void gelu_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/gelutanh.hpp b/include/infinicore/ops/gelutanh.hpp
new file mode 100644
index 000000000..c968c81fd
--- /dev/null
+++ b/include/infinicore/ops/gelutanh.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class GeluTanh {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor gelu_tanh(Tensor input);
+void gelu_tanh_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/layer_norm.hpp b/include/infinicore/ops/layer_norm.hpp
new file mode 100644
index 000000000..ad8187bdc
--- /dev/null
+++ b/include/infinicore/ops/layer_norm.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class LayerNorm {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, float);
+    static void execute(Tensor output,
+                        Tensor input_standardization,
+                        Tensor input_std_deviation,
+                        Tensor input,
+                        Tensor weight,
+                        Tensor bias,
+                        float epsilon);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor layer_norm(Tensor input, Tensor weight, Tensor bias, float epsilon = 1e-5f);
+void layer_norm_(Tensor output,
+                 Tensor input_standardization,
+                 Tensor input_std_deviation,
+                 Tensor input,
+                 Tensor weight,
+                 Tensor bias,
+                 float epsilon = 1e-5f);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/quickgelu.hpp b/include/infinicore/ops/quickgelu.hpp
new file mode 100644
index 000000000..b67d7f71a
--- /dev/null
+++ b/include/infinicore/ops/quickgelu.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class QuickGelu {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor quick_gelu(Tensor input);
+void quick_gelu_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/relu.hpp b/include/infinicore/ops/relu.hpp
new file mode 100644
index 000000000..59f2b26f5
--- /dev/null
+++ b/include/infinicore/ops/relu.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Relu {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor relu(Tensor input);
+void relu_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/softmax.hpp b/include/infinicore/ops/softmax.hpp
new file mode 100644
index 000000000..c96ab1810
--- /dev/null
+++ b/include/infinicore/ops/softmax.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Softmax {
+public:
+    using schema = void (*)(Tensor, Tensor, int);
+    static void execute(Tensor output, Tensor input, int axis);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor softmax(Tensor input, int axis = -1);
+void softmax_(Tensor output, Tensor input, int axis = -1);
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..f0d682863 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -10,6 +10,7 @@
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/dequantize_awq.h"
 #include "infiniop/ops/gelu.h"
+#include "infiniop/ops/gelutanh.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/layer_norm.h"
 #include "infiniop/ops/logsoftmax.h"
@@ -20,6 +21,7 @@
 #include "infiniop/ops/paged_attention_prefill.h"
 #include "infiniop/ops/paged_caching.h"
+#include "infiniop/ops/quickgelu.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
 #include "infiniop/ops/rms_norm.h"
diff --git a/include/infiniop/ops/gelutanh.h b/include/infiniop/ops/gelutanh.h
new file mode 100644
index 000000000..5ff6dad23
--- /dev/null
+++ b/include/infiniop/ops/gelutanh.h
@@ -0,0 +1,43 @@
+#ifndef __INFINIOP_GELUTANH_API_H__
+#define __INFINIOP_GELUTANH_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopGeluTanhDescriptor_t;
+
+/**
+ * Create GELU-Tanh descriptor
+ *
+ * y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ */
+__C __export infiniStatus_t infiniopCreateGeluTanhDescriptor(
+    infiniopHandle_t handle,
+    infiniopGeluTanhDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x);
+
+/**
+ * Query workspace size
+ */
+__C __export infiniStatus_t infiniopGetGeluTanhWorkspaceSize(
+    infiniopGeluTanhDescriptor_t desc,
+    size_t *size);
+
+/**
+ * Launch GELU-Tanh operator
+ */
+__C __export infiniStatus_t infiniopGeluTanh(
+    infiniopGeluTanhDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
+
+/**
+ * Destroy descriptor
+ */
+__C __export infiniStatus_t infiniopDestroyGeluTanhDescriptor(
+    infiniopGeluTanhDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/quickgelu.h b/include/infiniop/ops/quickgelu.h
new file mode 100644
index 000000000..1ea19ccf1
--- /dev/null
+++ b/include/infiniop/ops/quickgelu.h
@@ -0,0 +1,42 @@
+#ifndef __INFINIOP_QUICKGELU_API_H__
+#define __INFINIOP_QUICKGELU_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopQuickGeluDescriptor_t;
+
+/**
+ * Create QuickGELU descriptor
+ * y = x * sigmoid(1.702 * x)
+ */
+__C __export infiniStatus_t infiniopCreateQuickGeluDescriptor(
+    infiniopHandle_t handle,
+    infiniopQuickGeluDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x);
+
+/**
+ * Query workspace size
+ */
+__C __export infiniStatus_t infiniopGetQuickGeluWorkspaceSize(
+    infiniopQuickGeluDescriptor_t desc,
+    size_t *size);
+
+/**
+ * Launch QuickGELU operator
+ */
+__C __export infiniStatus_t infiniopQuickGelu(
+    infiniopQuickGeluDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
+
+/**
+ * Destroy descriptor
+ */
+__C __export infiniStatus_t infiniopDestroyQuickGeluDescriptor(
+    infiniopQuickGeluDescriptor_t desc);
+
+#endif
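Reviewer note: the new C entry points follow the usual infiniop descriptor lifecycle. A sketch of the call sequence (error checks elided; device_malloc is a stand-in for the caller's allocator, not an infiniop API):

    infiniopQuickGeluDescriptor_t desc = nullptr;
    infiniopCreateQuickGeluDescriptor(handle, &desc, y_desc, x_desc);

    size_t workspace_size = 0;
    infiniopGetQuickGeluWorkspaceSize(desc, &workspace_size);
    void *workspace = device_malloc(workspace_size); // hypothetical allocator

    infiniopQuickGelu(desc, workspace, workspace_size, y_data, x_data, stream);
    infiniopDestroyQuickGeluDescriptor(desc);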
diff --git a/src/infinicore/nn/layernorm.cc b/src/infinicore/nn/layernorm.cc
new file mode 100644
index 000000000..ca72a452a
--- /dev/null
+++ b/src/infinicore/nn/layernorm.cc
@@ -0,0 +1,30 @@
+#include "infinicore/nn/layernorm.hpp"
+
+namespace infinicore::nn {
+
+LayerNorm::LayerNorm(size_t normalized_shape,
+                     double eps,
+                     const DataType &dtype,
+                     const Device &device)
+    : normalized_shape_(normalized_shape),
+      eps_(eps),
+      dtype_(dtype) {
+    INFINICORE_NN_PARAMETER_INIT(weight, ({normalized_shape_}, dtype_, device));
+    INFINICORE_NN_PARAMETER_INIT(bias, ({normalized_shape_}, dtype_, device));
+    auto weight_init = infinicore::Tensor::ones({normalized_shape_}, dtype_, device);
+    auto bias_init = infinicore::Tensor::zeros({normalized_shape_}, dtype_, device);
+    weight_->copy_from(weight_init);
+    bias_->copy_from(bias_init);
+}
+
+Tensor LayerNorm::forward(const Tensor &x) const {
+    return infinicore::op::layer_norm(x, weight_, bias_, static_cast<float>(eps_));
+}
+
+std::string LayerNorm::extra_repr() const {
+    return "normalized_shape=" + std::to_string(normalized_shape_) +
+           ", eps=" + std::to_string(eps_) +
+           ", dtype=" + infinicore::toString(dtype_);
+}
+
+} // namespace infinicore::nn
diff --git a/src/infinicore/ops/conv2d/conv2d.cc b/src/infinicore/ops/conv2d/conv2d.cc
new file mode 100644
index 000000000..6ea76087d
--- /dev/null
+++ b/src/infinicore/ops/conv2d/conv2d.cc
@@ -0,0 +1,67 @@
+#include "infinicore/ops/conv2d.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Conv2d::schema> &Conv2d::dispatcher() {
+    static common::OpDispatcher<Conv2d::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Conv2d::execute(Tensor output,
+                     Tensor input,
+                     Tensor weight,
+                     Tensor bias,
+                     const size_t *pads,
+                     const size_t *strides,
+                     const size_t *dilations,
+                     size_t n) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input, weight, bias);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Conv2d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, weight, bias, pads, strides, dilations, n);
+}
+
+Tensor conv2d(Tensor input,
+              Tensor weight,
+              Tensor bias,
+              const std::vector<size_t> &pads,
+              const std::vector<size_t> &strides,
+              const std::vector<size_t> &dilations) {
+    // Note: this allocates the output with the input's shape, which is only
+    // correct when pads/strides/dilations preserve the spatial dims. Callers
+    // needing the general case should pre-compute the output shape and use conv2d_.
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    conv2d_(output, input, weight, bias, pads, strides, dilations);
+    return output;
+}
+
+void conv2d_(Tensor output,
+             Tensor input,
+             Tensor weight,
+             Tensor bias,
+             const std::vector<size_t> &pads,
+             const std::vector<size_t> &strides,
+             const std::vector<size_t> &dilations) {
+    if (pads.size() != strides.size() || pads.size() != dilations.size()) {
+        throw std::runtime_error("conv2d_: pads/strides/dilations must have the same size");
+    }
+    Conv2d::execute(output,
+                    input,
+                    weight,
+                    bias,
+                    pads.data(),
+                    strides.data(),
+                    dilations.data(),
+                    pads.size());
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/conv2d/conv2d_infiniop.cc b/src/infinicore/ops/conv2d/conv2d_infiniop.cc
new file mode 100644
index 000000000..c15c14fdd
--- /dev/null
+++ b/src/infinicore/ops/conv2d/conv2d_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/conv2d.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::conv2d_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopConvDescriptor_t> caches(
+    100, // capacity
+    [](infiniopConvDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyConvDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output,
+               Tensor input,
+               Tensor weight,
+               Tensor bias,
+               const size_t *pads,
+               const size_t *strides,
+               const size_t *dilations,
+               size_t n) {
+    size_t seed = hash_combine(output, input, weight, bias, n);
+    for (size_t i = 0; i < n; ++i) {
+        hash_combine(seed, pads[i], strides[i], dilations[i]);
+    }
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopConvDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateConvDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc(), weight->desc(),
+            bias ? bias->desc() : nullptr,
+            const_cast<size_t *>(pads),
+            const_cast<size_t *>(strides),
+            const_cast<size_t *>(dilations),
+            n));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetConvWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopConv(
+        desc, workspace->data(), workspace_size,
+        output->data(),
+        input->data(),
+        weight->data(),
+        bias ? bias->data() : nullptr,
+        context::getStream()));
+}
+
+static bool registered = []() {
+    Conv2d::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::conv2d_impl::infiniop
diff --git a/src/infinicore/ops/gelu/gelu.cc b/src/infinicore/ops/gelu/gelu.cc
new file mode 100644
index 000000000..612a2ecee
--- /dev/null
+++ b/src/infinicore/ops/gelu/gelu.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/gelu.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Gelu::schema> &Gelu::dispatcher() {
+    static common::OpDispatcher<Gelu::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Gelu::execute(Tensor output, Tensor input) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Gelu implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor gelu(Tensor input) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    gelu_(output, input);
+    return output;
+}
+
+void gelu_(Tensor output, Tensor input) {
+    Gelu::execute(output, input);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/gelu/gelu_infiniop.cc b/src/infinicore/ops/gelu/gelu_infiniop.cc
new file mode 100644
index 000000000..6294a05c2
--- /dev/null
+++ b/src/infinicore/ops/gelu/gelu_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/gelu.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::gelu_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopGeluDescriptor_t> caches(
+    100, // capacity
+    [](infiniopGeluDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyGeluDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopGeluDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateGeluDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetGeluWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopGelu(
+        desc, workspace->data(), workspace_size,
+        output->data(), input->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Gelu::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::gelu_impl::infiniop
diff --git a/src/infinicore/ops/gelutanh/gelutanh.cc b/src/infinicore/ops/gelutanh/gelutanh.cc
new file mode 100644
index 000000000..b6bae39dd
--- /dev/null
+++ b/src/infinicore/ops/gelutanh/gelutanh.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/gelutanh.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<GeluTanh::schema> &GeluTanh::dispatcher() {
+    static common::OpDispatcher<GeluTanh::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void GeluTanh::execute(Tensor output, Tensor input) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No GeluTanh implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor gelu_tanh(Tensor input) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    gelu_tanh_(output, input);
+    return output;
+}
+
+void gelu_tanh_(Tensor output, Tensor input) {
+    GeluTanh::execute(output, input);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc b/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc
new file mode 100644
index 000000000..fb13ca98a
--- /dev/null
+++ b/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/gelutanh.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::gelutanh_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopGeluTanhDescriptor_t> caches(
+    100, // capacity
+    [](infiniopGeluTanhDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyGeluTanhDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopGeluTanhDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateGeluTanhDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetGeluTanhWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopGeluTanh(
+        desc, workspace->data(), workspace_size,
+        output->data(), input->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    GeluTanh::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::gelutanh_impl::infiniop
diff --git a/src/infinicore/ops/layer_norm/layer_norm.cc b/src/infinicore/ops/layer_norm/layer_norm.cc
new file mode 100644
index 000000000..55d2ccd73
--- /dev/null
+++ b/src/infinicore/ops/layer_norm/layer_norm.cc
@@ -0,0 +1,62 @@
+#include "infinicore/ops/layer_norm.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LayerNorm::schema> &LayerNorm::dispatcher() {
+    static common::OpDispatcher<LayerNorm::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void LayerNorm::execute(Tensor output,
+                        Tensor input_standardization,
+                        Tensor input_std_deviation,
+                        Tensor input,
+                        Tensor weight,
+                        Tensor bias,
+                        float epsilon) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input_standardization, input_std_deviation, input, weight, bias);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No LayerNorm implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input_standardization, input_std_deviation, input, weight, bias, epsilon);
+}
+
+Tensor layer_norm(Tensor input, Tensor weight, Tensor bias, float epsilon) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+
+    if (shape.empty()) {
+        throw std::runtime_error("layer_norm: input must have at least one dimension");
+    }
+
+    Shape std_shape = shape;
+    std_shape.pop_back();
+    if (std_shape.empty()) {
+        std_shape.push_back(1);
+    }
+
+    auto input_standardization = Tensor::empty(shape, input->dtype(), input->device());
+    auto input_std_deviation = Tensor::empty(std_shape, input->dtype(), input->device());
+    layer_norm_(output, input_standardization, input_std_deviation, input, weight, bias, epsilon);
+    return output;
+}
+
+void layer_norm_(Tensor output,
+                 Tensor input_standardization,
+                 Tensor input_std_deviation,
+                 Tensor input,
+                 Tensor weight,
+                 Tensor bias,
+                 float epsilon) {
+    LayerNorm::execute(output, input_standardization, input_std_deviation, input, weight, bias, epsilon);
+}
+} // namespace infinicore::op
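Reviewer note: the out-of-place layer_norm() returns only the normalized output, while the in-place form also exposes the intermediate statistics. A shape sketch (illustrative):

    // x:                      (batch, seq, hidden)
    // output:                 (batch, seq, hidden)   normalized + affine
    // input_standardization:  (batch, seq, hidden)   (x - mean) / std, before weight/bias
    // input_std_deviation:    (batch, seq)           per-row std deviation
    Tensor y = infinicore::op::layer_norm(x, weight, bias, /*epsilon=*/1e-5f);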
diff --git a/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc b/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc
new file mode 100644
index 000000000..6ef64ac90
--- /dev/null
+++ b/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc
@@ -0,0 +1,68 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/layer_norm.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::layer_norm_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopLayerNormDescriptor_t> caches(
+    100, // capacity
+    [](infiniopLayerNormDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyLayerNormDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output,
+               Tensor input_standardization,
+               Tensor input_std_deviation,
+               Tensor input,
+               Tensor weight,
+               Tensor bias,
+               float epsilon) {
+    size_t seed = hash_combine(output, input_standardization, input_std_deviation, input, weight, bias, epsilon);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopLayerNormDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateLayerNormDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(),
+            input_standardization->desc(),
+            input_std_deviation->desc(),
+            input->desc(),
+            weight ? weight->desc() : nullptr,
+            bias ? bias->desc() : nullptr,
+            epsilon));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetLayerNormWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopLayerNorm(
+        desc, workspace->data(), workspace_size,
+        output->data(),
+        input_standardization->data(),
+        input_std_deviation->data(),
+        input->data(),
+        weight ? weight->data() : nullptr,
+        bias ? bias->data() : nullptr,
+        context::getStream()));
+}
+
+static bool registered = []() {
+    LayerNorm::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::layer_norm_impl::infiniop
diff --git a/src/infinicore/ops/quickgelu/quickgelu.cc b/src/infinicore/ops/quickgelu/quickgelu.cc
new file mode 100644
index 000000000..4ab46c49a
--- /dev/null
+++ b/src/infinicore/ops/quickgelu/quickgelu.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/quickgelu.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<QuickGelu::schema> &QuickGelu::dispatcher() {
+    static common::OpDispatcher<QuickGelu::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void QuickGelu::execute(Tensor output, Tensor input) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No QuickGelu implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor quick_gelu(Tensor input) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    quick_gelu_(output, input);
+    return output;
+}
+
+void quick_gelu_(Tensor output, Tensor input) {
+    QuickGelu::execute(output, input);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc b/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc
new file mode 100644
index 000000000..3b18c315a
--- /dev/null
+++ b/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/quickgelu.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::quickgelu_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopQuickGeluDescriptor_t> caches(
+    100, // capacity
+    [](infiniopQuickGeluDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyQuickGeluDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopQuickGeluDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateQuickGeluDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetQuickGeluWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopQuickGelu(
+        desc, workspace->data(), workspace_size,
+        output->data(), input->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    QuickGelu::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::quickgelu_impl::infiniop
diff --git a/src/infinicore/ops/relu/relu.cc b/src/infinicore/ops/relu/relu.cc
new file mode 100644
index 000000000..dceb618b1
--- /dev/null
+++ b/src/infinicore/ops/relu/relu.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/relu.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Relu::schema> &Relu::dispatcher() {
+    static common::OpDispatcher<Relu::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Relu::execute(Tensor output, Tensor input) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Relu implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor relu(Tensor input) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    relu_(output, input);
+    return output;
+}
+
+void relu_(Tensor output, Tensor input) {
+    Relu::execute(output, input);
+}
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/relu/relu_infiniop.cc b/src/infinicore/ops/relu/relu_infiniop.cc
new file mode 100644
index 000000000..dc80535a8
--- /dev/null
+++ b/src/infinicore/ops/relu/relu_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/relu.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::relu_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopReluDescriptor_t> caches(
+    100, // capacity
+    [](infiniopReluDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyReluDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopReluDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateReluDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetReluWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopRelu(
+        desc, workspace->data(), workspace_size,
+        output->data(), input->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Relu::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::relu_impl::infiniop
diff --git a/src/infinicore/ops/softmax/softmax.cc b/src/infinicore/ops/softmax/softmax.cc
new file mode 100644
index 000000000..1856f61de
--- /dev/null
+++ b/src/infinicore/ops/softmax/softmax.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/softmax.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<Softmax::schema> &Softmax::dispatcher() {
+    static common::OpDispatcher<Softmax::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Softmax::execute(Tensor output, Tensor input, int axis) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No Softmax implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, axis);
+}
+
+Tensor softmax(Tensor input, int axis) {
+    Shape shape = input->shape();
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    softmax_(output, input, axis);
+    return output;
+}
+
+void softmax_(Tensor output, Tensor input, int axis) {
+    Softmax::execute(output, input, axis);
+}
+} // namespace infinicore::op
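Reviewer note: a usage sketch for the new softmax entry points (illustrative; assumes the backend resolves negative axes, per the -1 default in the header):

    using namespace infinicore;
    // Normalize over the last axis (default axis = -1):
    Tensor probs = op::softmax(logits);
    // In-place variant with an explicit axis:
    op::softmax_(probs, logits, /*axis=*/1);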
diff --git a/src/infinicore/ops/softmax/softmax_infiniop.cc b/src/infinicore/ops/softmax/softmax_infiniop.cc
new file mode 100644
index 000000000..2c1dfd6e5
--- /dev/null
+++ b/src/infinicore/ops/softmax/softmax_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/softmax.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::softmax_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopSoftmaxDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSoftmaxDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySoftmaxDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int axis) {
+    size_t seed = hash_combine(output, input, axis);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSoftmaxDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSoftmaxDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            output->desc(), input->desc(), axis));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSoftmaxWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopSoftmax(
+        desc, workspace->data(), workspace_size,
+        output->data(), input->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Softmax::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::softmax_impl::infiniop
diff --git a/src/infiniop/ops/add/operator.cc b/src/infiniop/ops/add/operator.cc
index eba226421..87528c4b3 100644
--- a/src/infiniop/ops/add/operator.cc
+++ b/src/infiniop/ops/add/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/add_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/add_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -91,6 +94,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -139,6 +145,9 @@ __C infiniStatus_t infiniopAdd(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -181,6 +190,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     DELETE(INFINI_DEVICE_METAX, metax);
 #endif
diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc
index 4c974febc..ed30767be 100644
--- a/src/infiniop/ops/conv/operator.cc
+++ b/src/infiniop/ops/conv/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/conv_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/conv_nvidia.cuh"
 #endif
@@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize(
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc
index 262808ff0..0d2ff00f5 100644
--- a/src/infiniop/ops/gelu/operator.cc
+++ b/src/infiniop/ops/gelu/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/gelu_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/gelu_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -43,6 +43,9 @@ __C infiniStatus_t infiniopCreateGeluDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -77,6 +80,9 @@ __C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, s
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -118,6 +124,9 @@ __C infiniStatus_t infiniopGelu(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -154,6 +163,9 @@ infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     DELETE(INFINI_DEVICE_METAX, metax);
 #endif
diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc
new file mode 100644
index 000000000..6d7631e91
--- /dev/null
+++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc
@@ -0,0 +1,53 @@
+#include "gelutanh_cpu.h"
+
+namespace op::gelutanh::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<GeluTanhOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<GeluTanhOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<GeluTanhOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<GeluTanhOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::gelutanh::cpu
+
diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h
new file mode 100644
index 000000000..713a2d0c5
--- /dev/null
+++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h
@@ -0,0 +1,27 @@
+#ifndef __GELUTANH_CPU_H__
+#define __GELUTANH_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(gelutanh, cpu)
+
+namespace op::gelutanh::cpu {
+typedef struct GeluTanhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        // y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+        constexpr T alpha = static_cast<T>(0.7978845608); // sqrt(2/pi)
+        constexpr T beta = static_cast<T>(0.044715);
+        T inner = alpha * (x + beta * x * x * x);
+        return x * static_cast<T>(0.5) * (static_cast<T>(1) + std::tanh(inner));
+    }
+} GeluTanhOp;
+} // namespace op::gelutanh::cpu
+
+#endif // __GELUTANH_CPU_H__
+
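Reviewer note: the tanh form approximates the exact GELU, x * Phi(x) = x * 0.5 * (1 + erf(x / sqrt(2))). A quick standalone cross-check (illustrative, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
        for (float x : {-3.0f, -1.0f, 0.0f, 1.0f, 3.0f}) {
            float exact = x * 0.5f * (1.0f + std::erf(x / std::sqrt(2.0f)));
            float inner = 0.7978845608f * (x + 0.044715f * x * x * x);
            float approx = x * 0.5f * (1.0f + std::tanh(inner));
            std::printf("x=%+.1f exact=%+.6f tanh=%+.6f\n", x, exact, approx);
        }
    }

The two agree to roughly 1e-3 absolute error over typical activation ranges, which is why the tanh variant is the common fast approximation.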
diff --git a/src/infiniop/ops/gelutanh/cuda/kernel.cuh b/src/infiniop/ops/gelutanh/cuda/kernel.cuh
new file mode 100644
index 000000000..d52c344fb
--- /dev/null
+++ b/src/infiniop/ops/gelutanh/cuda/kernel.cuh
@@ -0,0 +1,59 @@
+#ifndef __GELUTANH_CUDA_H__
+#define __GELUTANH_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::gelutanh::cuda {
+
+typedef struct GeluTanhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    // GELU-Tanh constants
+    // static constexpr float alpha = std::sqrt(2.0 / M_PI);
+    // static constexpr float beta = 0.044715f;
+    static constexpr float alpha = 0.7978845608f; // sqrt(2/pi)
+    static constexpr float beta = 0.044715f;
+    // f32 tanh helper
+    __device__ __forceinline__ float tanh_f32_func(float x) const {
+        return tanhf(x);
+    }
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // half2 -> float2
+            float2 vf = __half22float2(x);
+            float inner_x0 = alpha * (vf.x + beta * vf.x * vf.x * vf.x);
+            float inner_x1 = alpha * (vf.y + beta * vf.y * vf.y * vf.y);
+            float2 vr = make_float2(tanh_f32_func(inner_x0) * 0.5f + 0.5f,
+                                    tanh_f32_func(inner_x1) * 0.5f + 0.5f);
+            return __hmul2(x, __float22half2_rn(vr)); // y = x * 0.5 * (1 + tanh(...))
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            float inner = alpha * (xf + beta * xf * xf * xf);
+            float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner));
+            return __float2half_rn(yf);
+        } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float inner = alpha * (xf + beta * xf * xf * xf);
+            float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner));
+            return __float2bfloat16(yf);
+        } else if constexpr (std::is_same_v<T, float>) {
+            float inner = alpha * (x + beta * x * x * x);
+            return x * 0.5f * (1.0f + tanh_f32_func(inner));
+        } else { // double
+            double inner = alpha * (x + beta * x * x * x);
+            return x * 0.5 * (1.0 + std::tanh(inner));
+        }
+    }
+
+} GeluTanhOp;
+
+} // namespace op::gelutanh::cuda
+
+#endif // __GELUTANH_CUDA_H__
+
diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu
new file mode 100644
index 000000000..00f6cebcb
--- /dev/null
+++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu
@@ -0,0 +1,71 @@
+#include "../cuda/kernel.cuh"
+#include "gelutanh_nvidia.cuh"
+
+namespace op::gelutanh::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_F64,
+                INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::GeluTanhOp, half>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::GeluTanhOp, __nv_bfloat16>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::GeluTanhOp, float>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::GeluTanhOp, double>(
+            _info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::gelutanh::nvidia
+
diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh
new file mode 100644
index 000000000..e8d8d8c31
--- /dev/null
+++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh
@@ -0,0 +1,9 @@
+#ifndef __GELUTANH_CUDA_API_H__
+#define __GELUTANH_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(gelutanh, nvidia)
+
+#endif // __GELUTANH_CUDA_API_H__
+
"nvidia/gelutanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateGeluTanhDescriptor( + infiniopHandle_t handle, + infiniopGeluTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelutanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluTanhWorkspaceSize(infiniopGeluTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopGeluTanh( + infiniopGeluTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluTanhDescriptor(infiniopGeluTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc index 3dbbdcb21..594f942cd 100644 --- a/src/infiniop/ops/layer_norm/operator.cc +++ b/src/infiniop/ops/layer_norm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/layer_norm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || 
diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc
index 3dbbdcb21..594f942cd 100644
--- a/src/infiniop/ops/layer_norm/operator.cc
+++ b/src/infiniop/ops/layer_norm/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/layer_norm_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/layer_norm_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -49,6 +49,9 @@ __C infiniStatus_t infiniopCreateLayerNormDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -79,6 +82,9 @@ __C infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -129,6 +135,9 @@ __C infiniStatus_t infiniopLayerNorm(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -159,6 +168,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
     DELETE(INFINI_DEVICE_METAX, metax);
 #endif
diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc
new file mode 100644
index 000000000..dfb64da0a
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc
@@ -0,0 +1,53 @@
+#include "quickgelu_cpu.h"
+
+namespace op::quickgelu::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<QuickGeluOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<QuickGeluOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<QuickGeluOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<QuickGeluOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::quickgelu::cpu
+
diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h
new file mode 100644
index 000000000..3418bdd25
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h
@@ -0,0 +1,26 @@
+#ifndef __QUICKGELU_CPU_H__
+#define __QUICKGELU_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(quickgelu, cpu)
+
+namespace op::quickgelu::cpu {
+typedef struct QuickGeluOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        // quickgelu(x) = x * sigmoid(1.702 * x)
+        constexpr T alpha = static_cast<T>(1.702);
+        T ax = alpha * x;
+        return x / (static_cast<T>(1) + std::exp(-ax));
+    }
+} QuickGeluOp;
+} // namespace op::quickgelu::cpu
+
+#endif // __QUICKGELU_CPU_H__
+
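Reviewer note: the float path in the CUDA kernel below uses an overflow-safe two-branch sigmoid, while the CPU op above evaluates exp(-ax) directly. An equivalent scalar form of the stable variant (sketch, in case the CPU path ever needs it):

    // Overflow-safe sigmoid: never evaluates exp of a large positive argument.
    inline float stable_sigmoid(float t) {
        if (t >= 0.0f) {
            return 1.0f / (1.0f + std::exp(-t));
        }
        float z = std::exp(t);
        return z / (1.0f + z);
    }
    // quickgelu(x) = x * stable_sigmoid(1.702f * x)

For the direct form, exp(-ax) overflowing to infinity still yields the correct limit (x / inf -> 0), so the difference is mostly about avoiding FP exceptions and precision loss.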
diff --git a/src/infiniop/ops/quickgelu/cuda/kernel.cuh b/src/infiniop/ops/quickgelu/cuda/kernel.cuh
new file mode 100644
index 000000000..5d678a350
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/cuda/kernel.cuh
@@ -0,0 +1,61 @@
+#ifndef __QUICKGELU_CUDA_H__
+#define __QUICKGELU_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::quickgelu::cuda {
+
+typedef struct QuickGeluOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        // quickgelu(x) = x * sigmoid(1.702 * x)
+
+        constexpr float alpha = 1.702f;
+
+        if constexpr (std::is_same_v<T, half2>) {
+            half2 ax = __hmul2(make_half2(1, 1), x);
+            half2 denominator = __hadd2(make_half2(1, 1), h2exp(__hneg2(ax)));
+            half2 sigmoid = h2rcp(denominator);
+            return __hmul2(x, sigmoid);
+
+        } else if constexpr (std::is_same_v<T, half>) {
+            half ax = __hmul(__float2half(alpha), x);
+            half denominator = __hadd(__float2half(1.0f), hexp(__hneg(ax)));
+            half sigmoid = hrcp(denominator);
+            return __hmul(x, sigmoid);
+
+        } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float ax = alpha * xf;
+            float s = 1.0f / (1.0f + __expf(-ax));
+            return __float2bfloat16(xf * s);
+
+        } else if constexpr (std::is_same_v<T, float>) {
+            float ax = alpha * x;
+            float s;
+            if (ax >= 0.0f) {
+                float z = expf(-ax);
+                s = 1.0f / (1.0f + z);
+            } else {
+                float z = expf(ax);
+                s = z / (1.0f + z);
+            }
+            return x * s;
+
+        } else { // double
+            double ax = static_cast<double>(alpha) * x;
+            return x / (1.0 + exp(-ax));
+        }
+    }
+
+} QuickGeluOp;
+
+} // namespace op::quickgelu::cuda
+
+#endif // __QUICKGELU_CUDA_H__
+
diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu
new file mode 100644
index 000000000..e4bcae1a7
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu
@@ -0,0 +1,71 @@
+#include "../cuda/kernel.cuh"
+#include "quickgelu_nvidia.cuh"
+
+namespace op::quickgelu::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_F64,
+                INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::QuickGeluOp, half>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::QuickGeluOp, __nv_bfloat16>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::QuickGeluOp, float>(
+            _info, workspace, output, inputs, stream);
+
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::QuickGeluOp, double>(
+            _info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::quickgelu::nvidia
+
diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh
new file mode 100644
index 000000000..935c86758
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh
@@ -0,0 +1,9 @@
+#ifndef __QUICKGELU_CUDA_API_H__
+#define __QUICKGELU_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(quickgelu, nvidia)
+
+#endif // __QUICKGELU_CUDA_API_H__
+
+
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::QuickGeluOp, double>(
+            _info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::quickgelu::nvidia
+
diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh
new file mode 100644
index 000000000..935c86758
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh
@@ -0,0 +1,9 @@
+#ifndef __QUICKGELU_CUDA_API_H__
+#define __QUICKGELU_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(quickgelu, nvidia)
+
+#endif // __QUICKGELU_CUDA_API_H__
+
diff --git a/src/infiniop/ops/quickgelu/operator.cc b/src/infiniop/ops/quickgelu/operator.cc
new file mode 100644
index 000000000..158e21cf3
--- /dev/null
+++ b/src/infiniop/ops/quickgelu/operator.cc
@@ -0,0 +1,144 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/quickgelu.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/quickgelu_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#include "nvidia/quickgelu_nvidia.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateQuickGeluDescriptor(
+    infiniopHandle_t handle,
+    infiniopQuickGeluDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        return op::quickgelu::NAMESPACE::Descriptor::create(                      \
+            handle,                                                               \
+            reinterpret_cast<op::quickgelu::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                               \
+            {x_desc})
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+    CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+    CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+    CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetQuickGeluWorkspaceSize(infiniopQuickGeluDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                      \
+    case CASE:                                                                                    \
+        *size = reinterpret_cast<op::quickgelu::NAMESPACE::Descriptor *>(desc)->workspaceSize();  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+    GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+    GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+    GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+}
+
+__C infiniStatus_t infiniopQuickGelu(
+    infiniopQuickGeluDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                     \
+        return reinterpret_cast<const op::quickgelu::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+    CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+    CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+    CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t infiniopDestroyQuickGeluDescriptor(infiniopQuickGeluDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<op::quickgelu::NAMESPACE::Descriptor *>(desc);  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+    DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+    DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+    DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
+
diff --git a/src/infiniop/ops/relu/operator.cc b/src/infiniop/ops/relu/operator.cc
index 093674de6..8992ca56b 100644
--- a/src/infiniop/ops/relu/operator.cc
+++ b/src/infiniop/ops/relu/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/relu_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/relu_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateReluDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
 #ifdef ENABLE_NINETOOTHED
     CREATE(INFINI_DEVICE_METAX, metax);
@@ -75,6 +78,9 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia)
 #endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia)
+#endif
 #ifdef ENABLE_METAX_API
 #ifdef ENABLE_NINETOOTHED
     GET(INFINI_DEVICE_METAX, metax)
@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopRelu(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
 #ifdef ENABLE_NINETOOTHED
     CALCULATE(INFINI_DEVICE_METAX, metax);
@@ -150,6 +159,9 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
 #ifdef ENABLE_NINETOOTHED
     DELETE(INFINI_DEVICE_METAX, metax);
diff --git a/src/infiniop/ops/sigmoid/operator.cc b/src/infiniop/ops/sigmoid/operator.cc
index c86fc91d6..7bab01a49 100644
--- a/src/infiniop/ops/sigmoid/operator.cc
+++ b/src/infiniop/ops/sigmoid/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/sigmoid_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/sigmoid_nvidia.cuh"
 #endif
 
@@ -34,6 +34,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -58,6 +61,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
 #endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia)
 #endif
     default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -91,6 +97,9 @@ __C infiniStatus_t infiniopSigmoid(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -118,6 +127,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc
index 7dcc9b303..a727f2084 100644
--- a/src/infiniop/ops/tanh/operator.cc
+++ b/src/infiniop/ops/tanh/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/tanh_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 #include "nvidia/tanh_nvidia.cuh"
 #endif
 // #ifdef ENABLE_METAX_API
@@ -39,6 +39,9 @@ __C infiniStatus_t infiniopCreateTanhDescriptor(
 #endif
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    CREATE(INFINI_DEVICE_HYGON, nvidia);
 #endif
 // #ifdef ENABLE_METAX_API
 //     CREATE(INFINI_DEVICE_METAX, metax);
@@ -70,6 +73,9 @@ __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, s
 #endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    GET(INFINI_DEVICE_HYGON, nvidia);
 #endif
 // #ifdef ENABLE_METAX_API
 //     GET(INFINI_DEVICE_METAX, metax);
@@ -108,6 +114,9 @@ __C infiniStatus_t infiniopTanh(
 #endif
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    CALCULATE(INFINI_DEVICE_HYGON, nvidia);
 #endif
 // #ifdef ENABLE_METAX_API
 //     CALCULATE(INFINI_DEVICE_METAX, metax);
@@ -141,6 +150,9 @@ infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
 #endif
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+    DELETE(INFINI_DEVICE_HYGON, nvidia);
 #endif
 // #ifdef ENABLE_METAX_API
 //     DELETE(INFINI_DEVICE_METAX, metax);
diff --git a/xmake/hygon.lua b/xmake/hygon.lua
index ed4b91f0e..a5dc646c9 100644
--- a/xmake/hygon.lua
+++ b/xmake/hygon.lua
@@ -74,6 +74,16 @@ target("infiniop-hygon")
     add_files("../src/infiniop/ops/rearrange/nvidia/*.cu")
     add_files("../src/infiniop/ops/rms_norm/nvidia/*.cu")
     add_files("../src/infiniop/ops/swiglu/nvidia/*.cu")
+    add_files("../src/infiniop/ops/conv/nvidia/*.cu")
+    add_files("../src/infiniop/ops/add/nvidia/*.cu")
+    add_files("../src/infiniop/ops/layer_norm/nvidia/*.cu")
+    add_files("../src/infiniop/ops/relu/nvidia/*.cu")
+    add_files("../src/infiniop/ops/softmax/nvidia/*.cu")
+    add_files("../src/infiniop/ops/sigmoid/nvidia/*.cu")
+    add_files("../src/infiniop/ops/gelu/nvidia/*.cu")
+    add_files("../src/infiniop/ops/tanh/nvidia/*.cu")
+    add_files("../src/infiniop/ops/quickgelu/nvidia/*.cu")
+    add_files("../src/infiniop/ops/gelutanh/nvidia/*.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c", {cxflags = {"-Wno-return-type"}})
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index a86090776..5e292170b 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -45,9 +45,9 @@ target("infiniop-nvidia")
         end
     else
         add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
-        add_cuflags("-Xcompiler=-fPIC")
+        add_cuflags("-Xcompiler=-fPIC", {force = true})
add_cuflags("--extended-lambda") - add_culdflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxxflags("-fPIC") add_cuflags("--expt-relaxed-constexpr") if CUDNN_ROOT ~= nil then @@ -89,8 +89,8 @@ target("infinirt-nvidia") add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") add_cxxflags("/FS") else - add_cuflags("-Xcompiler=-fPIC") - add_culdflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxflags("-fPIC") end